libhevc: added BTI and PAC support in libhevc

Test: readelf -nw libhevcdec.a Test: readelf -nW libhevcenc.a Test: atest MctsMediaV2TestCases atest MctsMediaDecoderTestCases atest MctsMediaEncoderTestCases atest MctsMediaCodecTestCases Test: hevc_dec_fuzzer hevc_enc_fuzzer Bug: 485868924 Change-Id: I7eb3c915662654d649ba4aa5793f71afff8c47e7
Redefine hevc examples test modules as cc_binary
2026-04-03 04:40:49 +07:00 · 2026-04-02 08:34:07 -07:00 · 2026-04-01 15:33:37 -07:00 · 2026-04-01 15:33:37 -07:00 · 2026-04-01 15:33:37 -07:00 · 2026-03-16 10:03:26 -07:00
186 changed files with 11201 additions and 4338 deletions
--- a/.github/workflows/cifuzz.yml
+++ b/.github/workflows/cifuzz.yml
@ -20,7 +20,7 @@ jobs:
       language: c++
       fuzz-seconds: 600
   - name: Upload Crash
-     uses: actions/upload-artifact@v3
+     uses: actions/upload-artifact@v4
     if: failure() && steps.build.outcome == 'success'
     with:
       name: artifacts
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@ -2,22 +2,100 @@ name: CMake

 on:
  push:
-    branches: [ "main" ]
  pull_request:
-    branches: [ "main" ]

 env:
  BUILD_TYPE: Release

 jobs:
  build:
-    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        include:
+          - name: ubuntu-latest-gcc-cmake
+            os: ubuntu-latest
+            cc: gcc
+            cxx: g++
+            build-system: cmake
+            cmake-opts: ''
+
+          - name: ubuntu-latest-clang-cmake
+            os: ubuntu-latest
+            cc: clang
+            cxx: clang++
+            build-system: cmake
+            cmake-opts: ''
+
+          - name: ubuntu-24.04-arm-clang-cmake
+            os: ubuntu-24.04-arm
+            cc: clang
+            cxx: clang++
+            build-system: cmake
+            cmake-opts: ''
+
+          - name: ubuntu-latest-clang-cmake-asan-fuzzer
+            os: ubuntu-latest
+            cc: clang
+            cxx: clang++
+            build-system: cmake
+            cmake-opts: '-DSANITIZE=fuzzer-no-link,address'
+
+          - name: ubuntu-latest-clang-cmake-ninja
+            os: ubuntu-latest
+            cc: clang
+            cxx: clang++
+            build-system: cmake
+            cmake-opts: '-G Ninja'
+
+          - name: macos-latest-clang-cmake
+            os: macos-latest
+            cc: clang
+            cxx: clang++
+            build-system: cmake
+            cmake-opts: ''
+
+          - name: ubuntu-latest-cross-aarch64-cmake
+            os: ubuntu-latest
+            cc: aarch64-linux-gnu-gcc
+            cxx: aarch64-linux-gnu-g++
+            build-system: cmake
+            cmake-opts: '-DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/aarch64_toolchain.cmake'
+
+          - name: ubuntu-latest-cross-aarch32-cmake
+            os: ubuntu-latest
+            cc: arm-linux-gnueabihf-gcc
+            cxx: arm-linux-gnueabihf-g++
+            build-system: cmake
+            cmake-opts: '-DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/aarch32_toolchain.cmake'
+
+    runs-on: ${{ matrix.os }}

    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
+
+    - name: Install Linux dependencies
+      if: startsWith(matrix.os,'ubuntu') && contains(matrix.cmake-opts,'-G Ninja')
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y ninja-build
+
+    - name: Install cross-aarch64 dependencies
+      if: startsWith(matrix.os,'ubuntu') && contains(matrix.cmake-opts,'aarch64')
+      run: |
+        sudo apt-get update
+        sudo apt-get install gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
+
+    - name: Install cross-arm dependencies
+      if: startsWith(matrix.os,'ubuntu') && contains(matrix.cmake-opts,'aarch32')
+      run: |
+        sudo apt-get update
+        sudo apt-get install gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf

    - name: Configure CMake
-      run: cmake -B ${{github.workspace}}/out -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++
+      env:
+        CC: ${{ matrix.cc }}
+        CXX: ${{ matrix.cxx }}
+      run: cmake -B ${{github.workspace}}/out -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} ${{ matrix.cmake-opts }}

    - name: Build
      run: cmake --build ${{github.workspace}}/out --config ${{env.BUILD_TYPE}}
--- a/.vscode/c_cpp_properties.json
+++ b/.vscode/c_cpp_properties.json
@ -0,0 +1,18 @@
+{
+    "configurations": [
+        {
+            "name": "Native",
+            "includePath": [
+                "${workspaceFolder}/**",
+                "${workspaceFolder}/common",
+                "${workspaceFolder}/decoder",
+                "${workspaceFolder}/encoder"
+            ],
+            "defines": [],
+            "cStandard": "c17",
+            "cppStandard": "c++17",
+            "configurationProvider": "ms-vscode.cmake-tools"
+        }
+    ],
+    "version": 4
+}
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -0,0 +1,81 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Run hevcenc - Linux",
+            "type": "cppdbg",
+            "request": "launch",
+            "program": "${workspaceFolder}/build/hevcenc",
+            "args": ["../test/encoder/vid_enc_cfg.txt"],
+            "stopAtEntry": false,
+            "cwd": "${workspaceFolder}/build",
+            "environment": [],
+            "externalConsole": false,
+            "MIMode": "gdb",
+            "setupCommands": [
+                {
+                    "description": "Enable pretty-printing for gdb",
+                    "text": "-enable-pretty-printing",
+                    "ignoreFailures": true
+                }
+            ]
+        },
+        {
+            "name": "Run hevcdec - Linux",
+            "type": "cppdbg",
+            "request": "launch",
+            "program": "${workspaceFolder}/build/hevcdec",
+            "args": ["../test/decoder/test.cfg"],
+            "stopAtEntry": false,
+            "cwd": "${workspaceFolder}/build",
+            "environment": [],
+            "externalConsole": false,
+            "MIMode": "gdb",
+            "setupCommands": [
+                {
+                    "description": "Enable pretty-printing for gdb",
+                    "text": "-enable-pretty-printing",
+                    "ignoreFailures": true
+                }
+            ]
+        },
+        {
+            "name": "Run hevcenc - Mac",
+            "type": "cppdbg",
+            "request": "launch",
+            "program": "${workspaceFolder}/build/hevcenc",
+            "args": ["../test/encoder/vid_enc_cfg.txt"],
+            "stopAtEntry": false,
+            "cwd": "${workspaceFolder}/build",
+            "environment": [],
+            "externalConsole": false,
+            "MIMode": "lldb",
+            "setupCommands": [
+                {
+                    "description": "Enable pretty-printing for lldb",
+                    "text": "-enable-pretty-printing",
+                    "ignoreFailures": true
+                }
+            ]
+        },
+        {
+            "name": "Run hevcdec - Mac",
+            "type": "cppdbg",
+            "request": "launch",
+            "program": "${workspaceFolder}/build/hevcdec",
+            "args": ["../test/decoder/test.cfg"],
+            "stopAtEntry": false,
+            "cwd": "${workspaceFolder}/build",
+            "environment": [],
+            "externalConsole": false,
+            "MIMode": "lldb",
+            "setupCommands": [
+                {
+                    "description": "Enable pretty-printing for lldb",
+                    "text": "-enable-pretty-printing",
+                    "ignoreFailures": true
+                }
+            ]
+        }
+    ]
+}
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -0,0 +1,19 @@
+{
+    "cmake.buildDirectory": "${workspaceFolder}/build",
+    "cmake.sourceDirectory": "${workspaceFolder}",
+    "cmake.configureArgs": [
+        "-DENABLE_MVC=OFF",
+        "-DENABLE_SVC=OFF",
+        "-DENABLE_TESTS=OFF",
+        "-DCMAKE_C_COMPILER=clang",
+        "-DCMAKE_CXX_COMPILER=clang++"
+    ],
+    "cmake.preferredGenerators": [
+        "Unix Makefiles"
+    ],
+    "cmake.debugConfig": {
+        "hevcenc": "hevcenc",
+        "hevcdec": "hevcdec"
+    },
+    "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools"
+}
--- a/.vscode/tasks.json
+++ b/.vscode/tasks.json
@ -0,0 +1,29 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "type": "cmake",
+            "label": "Configure",
+            "command": "configure",
+            "problemMatcher": [
+                "$gcc"
+            ],
+            "group": "build"
+        },
+        {
+            "type": "cmake",
+            "label": "Build",
+            "command": "build",
+            "problemMatcher": [
+                "$gcc"
+            ],
+            "group": {
+                "kind": "build",
+                "isDefault": true
+            },
+            "dependsOn": [
+                "Configure"
+            ]
+        }
+    ]
+}
--- a/Android.bp
+++ b/Android.bp
@ -40,18 +40,17 @@ cc_library_static {

    cflags: [
        "-D_LIB",
-        "-DMULTICORE",
        "-fPIC",
+        "-DENABLE_MAIN_REXT_PROFILE",

        "-O3",
        "-DANDROID",
+        "-DDISABLE_SEI",
        "-Wall",
        "-Werror",
        // common/x86/ihevc_sao_ssse3_intr.c: implicit conversion from
        // 'int' to 'char' changes value from 128 to -128
        "-Wno-error=constant-conversion",
-        // #KEEP_THREAD_ACTIVE is experimental
-        "-UKEEP_THREADS_ACTIVE",
    ],

    export_include_dirs: [
@ -60,64 +59,66 @@ cc_library_static {
    ],

    srcs: [
-        "common/ihevc_quant_tables.c",
-        "common/ihevc_inter_pred_filters.c",
-        "common/ihevc_weighted_pred.c",
-        "common/ihevc_padding.c",
-        "common/ihevc_deblk_edge_filter.c",
-        "common/ihevc_deblk_tables.c",
+        "common/ihevc_buf_mgr.c",
        "common/ihevc_cabac_tables.c",
-        "common/ihevc_common_tables.c",
-        "common/ihevc_intra_pred_filters.c",
        "common/ihevc_chroma_intra_pred_filters.c",
-        "common/ihevc_mem_fns.c",
-        "common/ihevc_sao.c",
-        "common/ihevc_trans_tables.c",
-        "common/ihevc_recon.c",
-        "common/ihevc_itrans.c",
-        "common/ihevc_itrans_recon.c",
-        "common/ihevc_iquant_recon.c",
-        "common/ihevc_iquant_itrans_recon.c",
-        "common/ihevc_itrans_recon_32x32.c",
-        "common/ihevc_itrans_recon_16x16.c",
-        "common/ihevc_itrans_recon_8x8.c",
-        "common/ihevc_chroma_itrans_recon.c",
-        "common/ihevc_chroma_iquant_recon.c",
        "common/ihevc_chroma_iquant_itrans_recon.c",
-        "common/ihevc_chroma_recon.c",
+        "common/ihevc_chroma_iquant_recon.c",
+        "common/ihevc_chroma_itrans_recon.c",
+        "common/ihevc_chroma_itrans_recon_32x32.c",
        "common/ihevc_chroma_itrans_recon_16x16.c",
        "common/ihevc_chroma_itrans_recon_8x8.c",
-        "common/ihevc_buf_mgr.c",
+        "common/ihevc_chroma_recon.c",
+        "common/ihevc_common_tables.c",
+        "common/ihevc_deblk_edge_filter.c",
+        "common/ihevc_deblk_tables.c",
        "common/ihevc_disp_mgr.c",
        "common/ihevc_dpb_mgr.c",
+        "common/ihevc_inter_pred_filters.c",
+        "common/ihevc_intra_pred_filters.c",
+        "common/ihevc_iquant_itrans_recon.c",
+        "common/ihevc_iquant_recon.c",
+        "common/ihevc_itrans.c",
+        "common/ihevc_itrans_res.c",
+        "common/ihevc_itrans_recon.c",
+        "common/ihevc_itrans_recon_16x16.c",
+        "common/ihevc_itrans_recon_32x32.c",
+        "common/ihevc_itrans_recon_8x8.c",
+        "common/ihevc_mem_fns.c",
+        "common/ihevc_padding.c",
+        "common/ihevc_quant_tables.c",
+        "common/ihevc_recon.c",
+        "common/ihevc_sao.c",
+        "common/ihevc_trans_tables.c",
+        "common/ihevc_weighted_pred.c",
        "common/ithread.c",
-        "decoder/ihevcd_version.c",
        "decoder/ihevcd_api.c",
-        "decoder/ihevcd_decode.c",
-        "decoder/ihevcd_nal.c",
        "decoder/ihevcd_bitstream.c",
-        "decoder/ihevcd_parse_headers.c",
-        "decoder/ihevcd_parse_slice_header.c",
-        "decoder/ihevcd_parse_slice.c",
-        "decoder/ihevcd_parse_residual.c",
+        "decoder/ihevcd_boundary_strength.c",
        "decoder/ihevcd_cabac.c",
-        "decoder/ihevcd_intra_pred_mode_prediction.c",
-        "decoder/ihevcd_process_slice.c",
-        "decoder/ihevcd_utils.c",
-        "decoder/ihevcd_job_queue.c",
-        "decoder/ihevcd_ref_list.c",
+        "decoder/ihevcd_common_tables.c",
+        "decoder/ihevcd_deblk.c",
+        "decoder/ihevcd_decode.c",
+        "decoder/ihevcd_fmt_conv.c",
        "decoder/ihevcd_get_mv.c",
-        "decoder/ihevcd_mv_pred.c",
-        "decoder/ihevcd_mv_merge.c",
+        "decoder/ihevcd_ilf_padding.c",
+        "decoder/ihevcd_inter_pred.c",
+        "decoder/ihevcd_intra_pred_mode_prediction.c",
        "decoder/ihevcd_iquant_itrans_recon_ctb.c",
        "decoder/ihevcd_itrans_recon_dc.c",
-        "decoder/ihevcd_common_tables.c",
-        "decoder/ihevcd_boundary_strength.c",
-        "decoder/ihevcd_deblk.c",
-        "decoder/ihevcd_inter_pred.c",
+        "decoder/ihevcd_job_queue.c",
+        "decoder/ihevcd_mv_merge.c",
+        "decoder/ihevcd_mv_pred.c",
+        "decoder/ihevcd_nal.c",
+        "decoder/ihevcd_parse_headers.c",
+        "decoder/ihevcd_parse_residual.c",
+        "decoder/ihevcd_parse_slice.c",
+        "decoder/ihevcd_parse_slice_header.c",
+        "decoder/ihevcd_process_slice.c",
+        "decoder/ihevcd_ref_list.c",
        "decoder/ihevcd_sao.c",
-        "decoder/ihevcd_ilf_padding.c",
-        "decoder/ihevcd_fmt_conv.c",
+        "decoder/ihevcd_utils.c",
+        "decoder/ihevcd_version.c",
    ],

    arch: {
@ -131,29 +132,64 @@ cc_library_static {
                "-DDEFAULT_ARCH=D_ARCH_ARMV8_GENERIC",
            ],
            local_include_dirs: [
-                "decoder/arm",
                "common/arm",
-                "decoder/arm64",
                "common/arm64",
+                "decoder/arm",
+                "decoder/arm64",
            ],

            srcs: [
-                "decoder/arm/ihevcd_function_selector.c",
-                "decoder/arm/ihevcd_function_selector_noneon.c",
-                "decoder/arm64/ihevcd_function_selector_av8.c",
                "common/arm/ihevc_intra_pred_filters_neon_intr.c",
                "common/arm/ihevc_weighted_pred_neon_intr.c",
-                "common/arm64/ihevc_mem_fns.s",
-                "common/arm64/ihevc_itrans_recon_32x32.s",
-                "common/arm64/ihevc_weighted_pred_bi_default.s",
-                "common/arm64/ihevc_weighted_pred_bi.s",
-                "common/arm64/ihevc_weighted_pred_uni.s",
+                "common/arm64/ihevc_deblk_chroma_horz.s",
+                "common/arm64/ihevc_deblk_chroma_vert.s",
                "common/arm64/ihevc_deblk_luma_horz.s",
                "common/arm64/ihevc_deblk_luma_vert.s",
-                "common/arm64/ihevc_deblk_chroma_vert.s",
-                "common/arm64/ihevc_deblk_chroma_horz.s",
-                "common/arm64/ihevc_sao_band_offset_luma.s",
+                "common/arm64/ihevc_inter_pred_chroma_copy.s",
+                "common/arm64/ihevc_inter_pred_chroma_copy_w16out.s",
+                "common/arm64/ihevc_inter_pred_chroma_horz.s",
+                "common/arm64/ihevc_inter_pred_chroma_horz_w16out.s",
+                "common/arm64/ihevc_inter_pred_chroma_vert.s",
+                "common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s",
+                "common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s",
+                "common/arm64/ihevc_inter_pred_chroma_vert_w16out.s",
+                "common/arm64/ihevc_inter_pred_filters_luma_horz.s",
+                "common/arm64/ihevc_inter_pred_filters_luma_vert.s",
+                "common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s",
+                "common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s",
+                "common/arm64/ihevc_inter_pred_luma_copy.s",
+                "common/arm64/ihevc_inter_pred_luma_copy_w16out.s",
+                "common/arm64/ihevc_inter_pred_luma_horz_w16out.s",
+                "common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s",
+                "common/arm64/ihevc_intra_pred_chroma_dc.s",
+                "common/arm64/ihevc_intra_pred_chroma_horz.s",
+                "common/arm64/ihevc_intra_pred_chroma_mode2.s",
+                "common/arm64/ihevc_intra_pred_chroma_mode_18_34.s",
+                "common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s",
+                "common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s",
+                "common/arm64/ihevc_intra_pred_chroma_planar.s",
+                "common/arm64/ihevc_intra_pred_chroma_ver.s",
+                "common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s",
+                "common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s",
+                "common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s",
+                "common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s",
+                "common/arm64/ihevc_intra_pred_luma_dc.s",
+                "common/arm64/ihevc_intra_pred_luma_horz.s",
+                "common/arm64/ihevc_intra_pred_luma_mode2.s",
+                "common/arm64/ihevc_intra_pred_luma_mode_18_34.s",
+                "common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s",
+                "common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s",
+                "common/arm64/ihevc_intra_pred_luma_planar.s",
+                "common/arm64/ihevc_intra_pred_luma_vert.s",
+                "common/arm64/ihevc_itrans_recon_16x16.s",
+                "common/arm64/ihevc_itrans_recon_32x32.s",
+                "common/arm64/ihevc_itrans_recon_4x4.s",
+                "common/arm64/ihevc_itrans_recon_4x4_ttype1.s",
+                "common/arm64/ihevc_itrans_recon_8x8.s",
+                "common/arm64/ihevc_mem_fns.s",
+                "common/arm64/ihevc_padding.s",
                "common/arm64/ihevc_sao_band_offset_chroma.s",
+                "common/arm64/ihevc_sao_band_offset_luma.s",
                "common/arm64/ihevc_sao_edge_offset_class0.s",
                "common/arm64/ihevc_sao_edge_offset_class0_chroma.s",
                "common/arm64/ihevc_sao_edge_offset_class1.s",
@ -162,156 +198,107 @@ cc_library_static {
                "common/arm64/ihevc_sao_edge_offset_class2_chroma.s",
                "common/arm64/ihevc_sao_edge_offset_class3.s",
                "common/arm64/ihevc_sao_edge_offset_class3_chroma.s",
-                "common/arm64/ihevc_inter_pred_luma_horz_w16out.s",
-                "common/arm64/ihevc_inter_pred_filters_luma_horz.s",
-                "common/arm64/ihevc_inter_pred_filters_luma_vert.s",
-                "common/arm64/ihevc_inter_pred_chroma_horz.s",
-                "common/arm64/ihevc_inter_pred_chroma_horz_w16out.s",
-                "common/arm64/ihevc_inter_pred_chroma_vert.s",
-                "common/arm64/ihevc_inter_pred_chroma_vert_w16out.s",
-                "common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s",
-                "common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s",
-                "common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s",
-                "common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s",
-                "common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s",
-                "common/arm64/ihevc_inter_pred_luma_copy_w16out.s",
-                "common/arm64/ihevc_inter_pred_luma_copy.s",
-                "common/arm64/ihevc_inter_pred_chroma_copy.s",
-                "common/arm64/ihevc_inter_pred_chroma_copy_w16out.s",
-                "common/arm64/ihevc_itrans_recon_4x4_ttype1.s",
-                "common/arm64/ihevc_itrans_recon_4x4.s",
-                "common/arm64/ihevc_itrans_recon_8x8.s",
-                "common/arm64/ihevc_itrans_recon_16x16.s",
-                "common/arm64/ihevc_intra_pred_chroma_planar.s",
-                "common/arm64/ihevc_intra_pred_chroma_dc.s",
-                "common/arm64/ihevc_intra_pred_chroma_horz.s",
-                "common/arm64/ihevc_intra_pred_chroma_ver.s",
-                "common/arm64/ihevc_intra_pred_chroma_mode2.s",
-                "common/arm64/ihevc_intra_pred_chroma_mode_18_34.s",
-                "common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s",
-                "common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s",
-                "common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s",
-                "common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s",
-                "common/arm64/ihevc_intra_pred_luma_planar.s",
-                "common/arm64/ihevc_intra_pred_luma_horz.s",
-                "common/arm64/ihevc_intra_pred_luma_mode2.s",
-                "common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s",
-                "common/arm64/ihevc_intra_pred_luma_mode_18_34.s",
-                "common/arm64/ihevc_intra_pred_luma_vert.s",
-                "common/arm64/ihevc_intra_pred_luma_dc.s",
-                "common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s",
-                "common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s",
-                "common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s",
-                "common/arm64/ihevc_padding.s",
-                "decoder/arm64/ihevcd_itrans_recon_dc_luma.s",
-                "decoder/arm64/ihevcd_itrans_recon_dc_chroma.s",
+                "common/arm64/ihevc_weighted_pred_bi.s",
+                "common/arm64/ihevc_weighted_pred_bi_default.s",
+                "common/arm64/ihevc_weighted_pred_uni.s",
+                "decoder/arm/ihevcd_function_selector.c",
+                "decoder/arm/ihevcd_function_selector_noneon.c",
                "decoder/arm64/ihevcd_fmt_conv_420sp_to_420p.s",
                "decoder/arm64/ihevcd_fmt_conv_420sp_to_420sp.s",
-                "decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s",
+                "decoder/arm64/ihevcd_function_selector_av8.c",
+                "decoder/arm64/ihevcd_itrans_recon_dc_chroma.s",
+                "decoder/arm64/ihevcd_itrans_recon_dc_luma.s",
            ],
        },

        arm: {
            local_include_dirs: [
-                "decoder/arm",
                "common/arm",
+                "decoder/arm",
            ],

            srcs: [
                "decoder/arm/ihevcd_function_selector.c",
                "decoder/arm/ihevcd_function_selector_noneon.c",
+                "common/arm/ihevc_deblk_chroma_horz.s",
+                "common/arm/ihevc_deblk_chroma_vert.s",
+                "common/arm/ihevc_deblk_luma_horz.s",
+                "common/arm/ihevc_deblk_luma_vert.s",
+                "common/arm/ihevc_inter_pred_chroma_copy.s",
+                "common/arm/ihevc_inter_pred_chroma_copy_w16out.s",
+                "common/arm/ihevc_inter_pred_chroma_horz.s",
+                "common/arm/ihevc_inter_pred_chroma_horz_w16out.s",
+                "common/arm/ihevc_inter_pred_chroma_vert.s",
+                "common/arm/ihevc_inter_pred_chroma_vert_w16inp.s",
+                "common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s",
+                "common/arm/ihevc_inter_pred_chroma_vert_w16out.s",
+                "common/arm/ihevc_inter_pred_filters_luma_horz.s",
+                "common/arm/ihevc_inter_pred_filters_luma_vert.s",
+                "common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s",
+                "common/arm/ihevc_inter_pred_luma_copy.s",
+                "common/arm/ihevc_inter_pred_luma_copy_w16out.s",
+                "common/arm/ihevc_inter_pred_luma_horz_w16out.s",
+                "common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s",
+                "common/arm/ihevc_intra_pred_chroma_dc.s",
+                "common/arm/ihevc_intra_pred_chroma_horz.s",
+                "common/arm/ihevc_intra_pred_chroma_mode2.s",
+                "common/arm/ihevc_intra_pred_chroma_mode_18_34.s",
+                "common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s",
+                "common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s",
+                "common/arm/ihevc_intra_pred_chroma_planar.s",
+                "common/arm/ihevc_intra_pred_chroma_ver.s",
+                "common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s",
+                "common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s",
+                "common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s",
+                "common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s",
+                "common/arm/ihevc_intra_pred_filters_neon_intr.c",
+                "common/arm/ihevc_intra_pred_luma_dc.s",
+                "common/arm/ihevc_intra_pred_luma_horz.s",
+                "common/arm/ihevc_intra_pred_luma_mode2.s",
+                "common/arm/ihevc_intra_pred_luma_mode_18_34.s",
+                "common/arm/ihevc_intra_pred_luma_mode_27_to_33.s",
+                "common/arm/ihevc_intra_pred_luma_mode_3_to_9.s",
+                "common/arm/ihevc_intra_pred_luma_planar.s",
+                "common/arm/ihevc_intra_pred_luma_vert.s",
+                "common/arm/ihevc_intra_ref_substitution_a9q.c",
+                "common/arm/ihevc_itrans_recon_16x16.s",
+                "common/arm/ihevc_itrans_recon_32x32.s",
+                "common/arm/ihevc_itrans_recon_4x4.s",
+                "common/arm/ihevc_itrans_recon_4x4_ttype1.s",
+                "common/arm/ihevc_itrans_recon_8x8.s",
+                "common/arm/ihevc_mem_fns.s",
+                "common/arm/ihevc_padding.s",
+                "common/arm/ihevc_sao_band_offset_chroma.s",
+                "common/arm/ihevc_sao_band_offset_luma.s",
+                "common/arm/ihevc_sao_edge_offset_class0.s",
+                "common/arm/ihevc_sao_edge_offset_class0_chroma.s",
+                "common/arm/ihevc_sao_edge_offset_class1.s",
+                "common/arm/ihevc_sao_edge_offset_class1_chroma.s",
+                "common/arm/ihevc_sao_edge_offset_class2.s",
+                "common/arm/ihevc_sao_edge_offset_class2_chroma.s",
+                "common/arm/ihevc_sao_edge_offset_class3.s",
+                "common/arm/ihevc_sao_edge_offset_class3_chroma.s",
+                "common/arm/ihevc_weighted_pred_bi.s",
+                "common/arm/ihevc_weighted_pred_bi_default.s",
+                "common/arm/ihevc_weighted_pred_neon_intr.c",
+                "common/arm/ihevc_weighted_pred_uni.s",
+                "decoder/arm/ihevcd_fmt_conv_420sp_to_420p.s",
+                "decoder/arm/ihevcd_fmt_conv_420sp_to_420sp.s",
+                "decoder/arm/ihevcd_function_selector_a9q.c",
+                "decoder/arm/ihevcd_itrans_recon_dc_chroma.s",
+                "decoder/arm/ihevcd_itrans_recon_dc_luma.s",
            ],

            cflags: [
+                // Modules coded with neon intrinsics are not included in Android Build.
                "-DDISABLE_NEONINTR",
                "-DARM",
                "-DARMGCC",
                "-fno-tree-vectorize",
-
-                // These will be overriden by armv7_a_neon
-                "-DDISABLE_NEON",
-                "-DDEFAULT_ARCH=D_ARCH_ARM_NONEON",
+                "-DDEFAULT_ARCH=D_ARCH_ARM_A9Q",
            ],

            instruction_set: "arm",
-
-            neon: {
-                srcs: [
-                    "decoder/arm/ihevcd_function_selector_a9q.c",
-                    "common/arm/ihevc_intra_ref_substitution_a9q.c",
-                    "common/arm/ihevc_intra_pred_filters_neon_intr.c",
-                    "common/arm/ihevc_weighted_pred_neon_intr.c",
-                    "common/arm/ihevc_mem_fns.s",
-                    "common/arm/ihevc_itrans_recon_32x32.s",
-                    "common/arm/ihevc_weighted_pred_bi_default.s",
-                    "common/arm/ihevc_weighted_pred_bi.s",
-                    "common/arm/ihevc_weighted_pred_uni.s",
-                    "common/arm/ihevc_deblk_luma_horz.s",
-                    "common/arm/ihevc_deblk_luma_vert.s",
-                    "common/arm/ihevc_deblk_chroma_vert.s",
-                    "common/arm/ihevc_deblk_chroma_horz.s",
-                    "common/arm/ihevc_sao_band_offset_luma.s",
-                    "common/arm/ihevc_sao_band_offset_chroma.s",
-                    "common/arm/ihevc_sao_edge_offset_class0.s",
-                    "common/arm/ihevc_sao_edge_offset_class0_chroma.s",
-                    "common/arm/ihevc_sao_edge_offset_class1.s",
-                    "common/arm/ihevc_sao_edge_offset_class1_chroma.s",
-                    "common/arm/ihevc_sao_edge_offset_class2.s",
-                    "common/arm/ihevc_sao_edge_offset_class2_chroma.s",
-                    "common/arm/ihevc_sao_edge_offset_class3.s",
-                    "common/arm/ihevc_sao_edge_offset_class3_chroma.s",
-                    "common/arm/ihevc_inter_pred_luma_horz_w16out.s",
-                    "common/arm/ihevc_inter_pred_filters_luma_horz.s",
-                    "common/arm/ihevc_inter_pred_filters_luma_vert.s",
-                    "common/arm/ihevc_inter_pred_chroma_horz.s",
-                    "common/arm/ihevc_inter_pred_chroma_horz_w16out.s",
-                    "common/arm/ihevc_inter_pred_chroma_vert.s",
-                    "common/arm/ihevc_inter_pred_chroma_vert_w16out.s",
-                    "common/arm/ihevc_inter_pred_chroma_vert_w16inp.s",
-                    "common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s",
-                    "common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s",
-                    "common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s",
-                    "common/arm/ihevc_inter_pred_luma_copy_w16out.s",
-                    "common/arm/ihevc_inter_pred_luma_copy.s",
-                    "common/arm/ihevc_inter_pred_chroma_copy.s",
-                    "common/arm/ihevc_inter_pred_chroma_copy_w16out.s",
-                    "common/arm/ihevc_itrans_recon_4x4_ttype1.s",
-                    "common/arm/ihevc_itrans_recon_4x4.s",
-                    "common/arm/ihevc_itrans_recon_8x8.s",
-                    "common/arm/ihevc_itrans_recon_16x16.s",
-                    "common/arm/ihevc_intra_pred_chroma_planar.s",
-                    "common/arm/ihevc_intra_pred_chroma_dc.s",
-                    "common/arm/ihevc_intra_pred_chroma_horz.s",
-                    "common/arm/ihevc_intra_pred_chroma_ver.s",
-                    "common/arm/ihevc_intra_pred_chroma_mode2.s",
-                    "common/arm/ihevc_intra_pred_chroma_mode_18_34.s",
-                    "common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s",
-                    "common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s",
-                    "common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s",
-                    "common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s",
-                    "common/arm/ihevc_intra_pred_luma_planar.s",
-                    "common/arm/ihevc_intra_pred_luma_horz.s",
-                    "common/arm/ihevc_intra_pred_luma_mode2.s",
-                    "common/arm/ihevc_intra_pred_luma_mode_27_to_33.s",
-                    "common/arm/ihevc_intra_pred_luma_mode_18_34.s",
-                    "common/arm/ihevc_intra_pred_luma_vert.s",
-                    "common/arm/ihevc_intra_pred_luma_dc.s",
-                    "common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s",
-                    "common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s",
-                    "common/arm/ihevc_intra_pred_luma_mode_3_to_9.s",
-                    "common/arm/ihevc_padding.s",
-                    "decoder/arm/ihevcd_itrans_recon_dc_luma.s",
-                    "decoder/arm/ihevcd_itrans_recon_dc_chroma.s",
-                    "decoder/arm/ihevcd_fmt_conv_420sp_to_420p.s",
-                    "decoder/arm/ihevcd_fmt_conv_420sp_to_420sp.s",
-                    "decoder/arm/ihevcd_fmt_conv_420sp_to_rgba8888.s",
-                ],
-                cflags: [
-                    "-UDISABLE_NEON",
-                    "-UDEFAULT_ARCH",
-                    "-DDEFAULT_ARCH=D_ARCH_ARM_A9Q",
-                ],
-            },
        },

        x86_64: {
@ -324,37 +311,37 @@ cc_library_static {
            ],

            local_include_dirs: [
-                "decoder/x86",
                "common/x86",
+                "decoder/x86",
            ],

            srcs: [
-                "decoder/x86/ihevcd_function_selector.c",
-                "decoder/x86/ihevcd_function_selector_generic.c",
-                "decoder/x86/ihevcd_function_selector_ssse3.c",
-                "decoder/x86/ihevcd_function_selector_sse42.c",
-                "common/x86/ihevc_inter_pred_filters_ssse3_intr.c",
-                "common/x86/ihevc_weighted_pred_ssse3_intr.c",
-                "common/x86/ihevc_intra_pred_filters_ssse3_intr.c",
-                "common/x86/ihevc_chroma_intra_pred_filters_ssse3_intr.c",
-                "common/x86/ihevc_itrans_recon_ssse3_intr.c",
-                "common/x86/ihevc_itrans_recon_16x16_ssse3_intr.c",
-                "common/x86/ihevc_itrans_recon_32x32_ssse3_intr.c",
-                "common/x86/ihevc_sao_ssse3_intr.c",
-                "common/x86/ihevc_deblk_ssse3_intr.c",
-                "common/x86/ihevc_padding_ssse3_intr.c",
-                "common/x86/ihevc_mem_fns_ssse3_intr.c",
-                "decoder/x86/ihevcd_fmt_conv_ssse3_intr.c",
-                "decoder/x86/ihevcd_it_rec_dc_ssse3_intr.c",
-                "common/x86/ihevc_inter_pred_filters_sse42_intr.c",
-                "common/x86/ihevc_weighted_pred_sse42_intr.c",
-                "common/x86/ihevc_intra_pred_filters_sse42_intr.c",
-                "common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c",
-                "common/x86/ihevc_itrans_recon_sse42_intr.c",
                "common/x86/ihevc_16x16_itrans_recon_sse42_intr.c",
                "common/x86/ihevc_32x32_itrans_recon_sse42_intr.c",
-                "decoder/x86/ihevcd_it_rec_dc_sse42_intr.c",
+                "common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c",
+                "common/x86/ihevc_chroma_intra_pred_filters_ssse3_intr.c",
+                "common/x86/ihevc_deblk_ssse3_intr.c",
+                "common/x86/ihevc_inter_pred_filters_sse42_intr.c",
+                "common/x86/ihevc_inter_pred_filters_ssse3_intr.c",
+                "common/x86/ihevc_intra_pred_filters_sse42_intr.c",
+                "common/x86/ihevc_intra_pred_filters_ssse3_intr.c",
+                "common/x86/ihevc_itrans_recon_16x16_ssse3_intr.c",
+                "common/x86/ihevc_itrans_recon_32x32_ssse3_intr.c",
+                "common/x86/ihevc_itrans_recon_sse42_intr.c",
+                "common/x86/ihevc_itrans_recon_ssse3_intr.c",
+                "common/x86/ihevc_mem_fns_ssse3_intr.c",
+                "common/x86/ihevc_padding_ssse3_intr.c",
+                "common/x86/ihevc_sao_ssse3_intr.c",
                "common/x86/ihevc_tables_x86_intr.c",
+                "common/x86/ihevc_weighted_pred_sse42_intr.c",
+                "common/x86/ihevc_weighted_pred_ssse3_intr.c",
+                "decoder/x86/ihevcd_fmt_conv_ssse3_intr.c",
+                "decoder/x86/ihevcd_function_selector.c",
+                "decoder/x86/ihevcd_function_selector_generic.c",
+                "decoder/x86/ihevcd_function_selector_sse42.c",
+                "decoder/x86/ihevcd_function_selector_ssse3.c",
+                "decoder/x86/ihevcd_it_rec_dc_sse42_intr.c",
+                "decoder/x86/ihevcd_it_rec_dc_ssse3_intr.c",
            ],
        },

@ -368,37 +355,37 @@ cc_library_static {
            ],

            local_include_dirs: [
-                "decoder/x86",
                "common/x86",
+                "decoder/x86",
            ],

            srcs: [
-                "decoder/x86/ihevcd_function_selector.c",
-                "decoder/x86/ihevcd_function_selector_generic.c",
-                "decoder/x86/ihevcd_function_selector_ssse3.c",
-                "decoder/x86/ihevcd_function_selector_sse42.c",
-                "common/x86/ihevc_inter_pred_filters_ssse3_intr.c",
-                "common/x86/ihevc_weighted_pred_ssse3_intr.c",
-                "common/x86/ihevc_intra_pred_filters_ssse3_intr.c",
-                "common/x86/ihevc_chroma_intra_pred_filters_ssse3_intr.c",
-                "common/x86/ihevc_itrans_recon_ssse3_intr.c",
-                "common/x86/ihevc_itrans_recon_16x16_ssse3_intr.c",
-                "common/x86/ihevc_itrans_recon_32x32_ssse3_intr.c",
-                "common/x86/ihevc_sao_ssse3_intr.c",
-                "common/x86/ihevc_deblk_ssse3_intr.c",
-                "common/x86/ihevc_padding_ssse3_intr.c",
-                "common/x86/ihevc_mem_fns_ssse3_intr.c",
-                "decoder/x86/ihevcd_fmt_conv_ssse3_intr.c",
-                "decoder/x86/ihevcd_it_rec_dc_ssse3_intr.c",
-                "common/x86/ihevc_inter_pred_filters_sse42_intr.c",
-                "common/x86/ihevc_weighted_pred_sse42_intr.c",
-                "common/x86/ihevc_intra_pred_filters_sse42_intr.c",
-                "common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c",
-                "common/x86/ihevc_itrans_recon_sse42_intr.c",
                "common/x86/ihevc_16x16_itrans_recon_sse42_intr.c",
                "common/x86/ihevc_32x32_itrans_recon_sse42_intr.c",
-                "decoder/x86/ihevcd_it_rec_dc_sse42_intr.c",
+                "common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c",
+                "common/x86/ihevc_chroma_intra_pred_filters_ssse3_intr.c",
+                "common/x86/ihevc_deblk_ssse3_intr.c",
+                "common/x86/ihevc_inter_pred_filters_sse42_intr.c",
+                "common/x86/ihevc_inter_pred_filters_ssse3_intr.c",
+                "common/x86/ihevc_intra_pred_filters_sse42_intr.c",
+                "common/x86/ihevc_intra_pred_filters_ssse3_intr.c",
+                "common/x86/ihevc_itrans_recon_16x16_ssse3_intr.c",
+                "common/x86/ihevc_itrans_recon_32x32_ssse3_intr.c",
+                "common/x86/ihevc_itrans_recon_sse42_intr.c",
+                "common/x86/ihevc_itrans_recon_ssse3_intr.c",
+                "common/x86/ihevc_mem_fns_ssse3_intr.c",
+                "common/x86/ihevc_padding_ssse3_intr.c",
+                "common/x86/ihevc_sao_ssse3_intr.c",
                "common/x86/ihevc_tables_x86_intr.c",
+                "common/x86/ihevc_weighted_pred_sse42_intr.c",
+                "common/x86/ihevc_weighted_pred_ssse3_intr.c",
+                "decoder/x86/ihevcd_fmt_conv_ssse3_intr.c",
+                "decoder/x86/ihevcd_function_selector.c",
+                "decoder/x86/ihevcd_function_selector_generic.c",
+                "decoder/x86/ihevcd_function_selector_sse42.c",
+                "decoder/x86/ihevcd_function_selector_ssse3.c",
+                "decoder/x86/ihevcd_it_rec_dc_sse42_intr.c",
+                "decoder/x86/ihevcd_it_rec_dc_ssse3_intr.c",
            ],
        },
        riscv64: {
@ -418,7 +405,7 @@ cc_library_static {
        misc_undefined: ["bounds"],
        // Enable CFI if this becomes a shared library.
        cfi: true,
-	config: {
+        config: {
            cfi_assembly_support: true,
        },
        blocklist: "libhevc_blocklist.txt",
@ -431,7 +418,7 @@ cc_library_static {
    min_sdk_version: "29",
 }

-cc_test {
+cc_binary {
    name: "hevcdec",
    host_supported: true,
    cflags: [
@ -442,7 +429,6 @@ cc_test {
        "-Wall",
        "-Werror",
    ],
-    gtest: false,
    srcs: ["test/decoder/main.c"],
    static_libs: ["libhevcdec"],
    target: {
@ -458,12 +444,12 @@ cc_library_static {
    host_supported: true,
    cflags: [
        "-DENABLE_MAIN_REXT_PROFILE",
+        "-DDISABLE_SEI",
        "-fPIC",
        "-O3",
        "-Wall",
        "-Wno-unused-variable",
        "-Wno-unused-parameter",
-        "-Wno-switch",
    ],

    export_include_dirs: [
@ -475,6 +461,7 @@ cc_library_static {
        "common/ihevc_cabac_tables.c",
        "common/ihevc_chroma_intra_pred_filters.c",
        "common/ihevc_chroma_itrans_recon.c",
+        "common/ihevc_chroma_itrans_recon_32x32.c",
        "common/ihevc_chroma_itrans_recon_16x16.c",
        "common/ihevc_chroma_itrans_recon_8x8.c",
        "common/ihevc_common_tables.c",
@ -591,31 +578,18 @@ cc_library_static {
        arm64: {

            local_include_dirs: [
-                "encoder/arm",
                "common/arm",
                "common/arm64",
+                "encoder/arm",
            ],

            srcs: [
-                "encoder/arm/ihevce_coarse_layer_sad_neon.c",
-                "encoder/arm/ihevce_common_utils_neon.c",
-                "encoder/arm/ihevce_copy_neon.c",
-                "encoder/arm/ihevce_had_compute_neon.c",
-                "encoder/arm/ihevce_hme_utils_neon.c",
-                "encoder/arm/ihevce_itrans_recon_neon.c",
-                "encoder/arm/ihevce_me_neon.c",
-                "encoder/arm/ihevce_sad_compute_neon.c",
-                "encoder/arm/ihevce_scale_by_2_neon.c",
-                "encoder/arm/ihevce_scan_coeffs_neon.c",
-                "encoder/arm/ihevce_ssd_and_sad_calculator_neon.c",
-                "encoder/arm/ihevce_ssd_calculator_neon.c",
-                "encoder/arm/ihevce_subpel_neon.c",
+                "common/arm/ihevc_intra_pred_filters_neon_intr.c",
+                "common/arm/ihevc_intra_ref_substitution_a9q.c",
+                "common/arm/ihevc_quant_iquant_ssd_neon_intr.c",
                "common/arm/ihevc_resi_trans_neon.c",
                "common/arm/ihevc_resi_trans_neon_32x32.c",
-                "common/arm/ihevc_quant_iquant_ssd_neon_intr.c",
-                "common/arm/ihevc_intra_pred_filters_neon_intr.c",
                "common/arm/ihevc_weighted_pred_neon_intr.c",
-                "common/arm/ihevc_intra_ref_substitution_a9q.c",
                "common/arm64/ihevc_deblk_chroma_horz.s",
                "common/arm64/ihevc_deblk_chroma_vert.s",
                "common/arm64/ihevc_deblk_luma_horz.s",
@ -676,6 +650,19 @@ cc_library_static {
                "common/arm64/ihevc_weighted_pred_bi.s",
                "common/arm64/ihevc_weighted_pred_bi_default.s",
                "common/arm64/ihevc_weighted_pred_uni.s",
+                "encoder/arm/ihevce_coarse_layer_sad_neon.c",
+                "encoder/arm/ihevce_common_utils_neon.c",
+                "encoder/arm/ihevce_copy_neon.c",
+                "encoder/arm/ihevce_had_compute_neon.c",
+                "encoder/arm/ihevce_hme_utils_neon.c",
+                "encoder/arm/ihevce_itrans_recon_neon.c",
+                "encoder/arm/ihevce_me_neon.c",
+                "encoder/arm/ihevce_sad_compute_neon.c",
+                "encoder/arm/ihevce_scale_by_2_neon.c",
+                "encoder/arm/ihevce_scan_coeffs_neon.c",
+                "encoder/arm/ihevce_ssd_and_sad_calculator_neon.c",
+                "encoder/arm/ihevce_ssd_calculator_neon.c",
+                "encoder/arm/ihevce_subpel_neon.c",
            ],

            cflags: [
@ -687,101 +674,98 @@ cc_library_static {

        arm: {
            local_include_dirs: [
-                "encoder/arm",
                "common/arm",
+                "encoder/arm",
            ],

+            srcs: [
+                "common/arm/ihevc_deblk_chroma_horz.s",
+                "common/arm/ihevc_deblk_chroma_vert.s",
+                "common/arm/ihevc_deblk_luma_horz.s",
+                "common/arm/ihevc_deblk_luma_vert.s",
+                "common/arm/ihevc_inter_pred_chroma_copy.s",
+                "common/arm/ihevc_inter_pred_chroma_copy_w16out.s",
+                "common/arm/ihevc_inter_pred_chroma_horz.s",
+                "common/arm/ihevc_inter_pred_chroma_horz_w16out.s",
+                "common/arm/ihevc_inter_pred_chroma_vert.s",
+                "common/arm/ihevc_inter_pred_chroma_vert_w16inp.s",
+                "common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s",
+                "common/arm/ihevc_inter_pred_chroma_vert_w16out.s",
+                "common/arm/ihevc_inter_pred_filters_luma_horz.s",
+                "common/arm/ihevc_inter_pred_filters_luma_vert.s",
+                "common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s",
+                "common/arm/ihevc_inter_pred_luma_copy.s",
+                "common/arm/ihevc_inter_pred_luma_copy_w16out.s",
+                "common/arm/ihevc_inter_pred_luma_horz_w16out.s",
+                "common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s",
+                "common/arm/ihevc_intra_pred_chroma_dc.s",
+                "common/arm/ihevc_intra_pred_chroma_horz.s",
+                "common/arm/ihevc_intra_pred_chroma_mode2.s",
+                "common/arm/ihevc_intra_pred_chroma_mode_18_34.s",
+                "common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s",
+                "common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s",
+                "common/arm/ihevc_intra_pred_chroma_planar.s",
+                "common/arm/ihevc_intra_pred_chroma_ver.s",
+                "common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s",
+                "common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s",
+                "common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s",
+                "common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s",
+                "common/arm/ihevc_intra_pred_filters_neon_intr.c",
+                "common/arm/ihevc_intra_pred_luma_dc.s",
+                "common/arm/ihevc_intra_pred_luma_horz.s",
+                "common/arm/ihevc_intra_pred_luma_mode2.s",
+                "common/arm/ihevc_intra_pred_luma_mode_18_34.s",
+                "common/arm/ihevc_intra_pred_luma_mode_27_to_33.s",
+                "common/arm/ihevc_intra_pred_luma_mode_3_to_9.s",
+                "common/arm/ihevc_intra_pred_luma_planar.s",
+                "common/arm/ihevc_intra_pred_luma_vert.s",
+                "common/arm/ihevc_intra_ref_substitution_a9q.c",
+                "common/arm/ihevc_itrans_recon_16x16.s",
+                "common/arm/ihevc_itrans_recon_32x32.s",
+                "common/arm/ihevc_itrans_recon_4x4.s",
+                "common/arm/ihevc_itrans_recon_4x4_ttype1.s",
+                "common/arm/ihevc_itrans_recon_8x8.s",
+                "common/arm/ihevc_mem_fns.s",
+                "common/arm/ihevc_padding.s",
+                "common/arm/ihevc_quant_iquant_ssd_neon_intr.c",
+                "common/arm/ihevc_resi_trans.s",
+                "common/arm/ihevc_resi_trans_32x32_a9q.s",
+                "common/arm/ihevc_resi_trans_neon.c",
+                "common/arm/ihevc_resi_trans_neon_32x32.c",
+                "common/arm/ihevc_sao_band_offset_chroma.s",
+                "common/arm/ihevc_sao_band_offset_luma.s",
+                "common/arm/ihevc_sao_edge_offset_class0.s",
+                "common/arm/ihevc_sao_edge_offset_class0_chroma.s",
+                "common/arm/ihevc_sao_edge_offset_class1.s",
+                "common/arm/ihevc_sao_edge_offset_class1_chroma.s",
+                "common/arm/ihevc_sao_edge_offset_class2.s",
+                "common/arm/ihevc_sao_edge_offset_class2_chroma.s",
+                "common/arm/ihevc_sao_edge_offset_class3.s",
+                "common/arm/ihevc_sao_edge_offset_class3_chroma.s",
+                "common/arm/ihevc_weighted_pred_bi.s",
+                "common/arm/ihevc_weighted_pred_bi_default.s",
+                "common/arm/ihevc_weighted_pred_neon_intr.c",
+                "common/arm/ihevc_weighted_pred_uni.s",
+                "encoder/arm/ihevce_coarse_layer_sad_neon.c",
+                "encoder/arm/ihevce_common_utils_neon.c",
+                "encoder/arm/ihevce_copy_neon.c",
+                "encoder/arm/ihevce_had_compute_neon.c",
+                "encoder/arm/ihevce_hme_utils_neon.c",
+                "encoder/arm/ihevce_itrans_recon_neon.c",
+                "encoder/arm/ihevce_me_neon.c",
+                "encoder/arm/ihevce_sad_compute_neon.c",
+                "encoder/arm/ihevce_scale_by_2_neon.c",
+                "encoder/arm/ihevce_scan_coeffs_neon.c",
+                "encoder/arm/ihevce_ssd_and_sad_calculator_neon.c",
+                "encoder/arm/ihevce_ssd_calculator_neon.c",
+                "encoder/arm/ihevce_subpel_neon.c",
+            ],
+
+            cflags: [
+                "-DENABLE_NEON",
+                "-DARM",
+            ],
            instruction_set: "arm",
-
-            neon: {
-                srcs: [
-                    "encoder/arm/ihevce_coarse_layer_sad_neon.c",
-                    "encoder/arm/ihevce_common_utils_neon.c",
-                    "encoder/arm/ihevce_copy_neon.c",
-                    "encoder/arm/ihevce_had_compute_neon.c",
-                    "encoder/arm/ihevce_hme_utils_neon.c",
-                    "encoder/arm/ihevce_itrans_recon_neon.c",
-                    "encoder/arm/ihevce_me_neon.c",
-                    "encoder/arm/ihevce_sad_compute_neon.c",
-                    "encoder/arm/ihevce_scale_by_2_neon.c",
-                    "encoder/arm/ihevce_scan_coeffs_neon.c",
-                    "encoder/arm/ihevce_ssd_and_sad_calculator_neon.c",
-                    "encoder/arm/ihevce_ssd_calculator_neon.c",
-                    "encoder/arm/ihevce_subpel_neon.c",
-                    "common/arm/ihevc_resi_trans_neon.c",
-                    "common/arm/ihevc_resi_trans_neon_32x32.c",
-                    "common/arm/ihevc_quant_iquant_ssd_neon_intr.c",
-                    "common/arm/ihevc_intra_pred_filters_neon_intr.c",
-                    "common/arm/ihevc_weighted_pred_neon_intr.c",
-                    "common/arm/ihevc_intra_ref_substitution_a9q.c",
-                    "common/arm/ihevc_deblk_chroma_horz.s",
-                    "common/arm/ihevc_deblk_chroma_vert.s",
-                    "common/arm/ihevc_deblk_luma_horz.s",
-                    "common/arm/ihevc_deblk_luma_vert.s",
-                    "common/arm/ihevc_inter_pred_chroma_copy.s",
-                    "common/arm/ihevc_inter_pred_chroma_copy_w16out.s",
-                    "common/arm/ihevc_inter_pred_chroma_horz.s",
-                    "common/arm/ihevc_inter_pred_chroma_horz_w16out.s",
-                    "common/arm/ihevc_inter_pred_chroma_vert.s",
-                    "common/arm/ihevc_inter_pred_chroma_vert_w16inp.s",
-                    "common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s",
-                    "common/arm/ihevc_inter_pred_chroma_vert_w16out.s",
-                    "common/arm/ihevc_inter_pred_filters_luma_horz.s",
-                    "common/arm/ihevc_inter_pred_filters_luma_vert.s",
-                    "common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s",
-                    "common/arm/ihevc_inter_pred_luma_copy.s",
-                    "common/arm/ihevc_inter_pred_luma_copy_w16out.s",
-                    "common/arm/ihevc_inter_pred_luma_horz_w16out.s",
-                    "common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s",
-                    "common/arm/ihevc_intra_pred_chroma_dc.s",
-                    "common/arm/ihevc_intra_pred_chroma_horz.s",
-                    "common/arm/ihevc_intra_pred_chroma_mode2.s",
-                    "common/arm/ihevc_intra_pred_chroma_mode_18_34.s",
-                    "common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s",
-                    "common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s",
-                    "common/arm/ihevc_intra_pred_chroma_planar.s",
-                    "common/arm/ihevc_intra_pred_chroma_ver.s",
-                    "common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s",
-                    "common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s",
-                    "common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s",
-                    "common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s",
-                    "common/arm/ihevc_intra_pred_luma_dc.s",
-                    "common/arm/ihevc_intra_pred_luma_horz.s",
-                    "common/arm/ihevc_intra_pred_luma_mode2.s",
-                    "common/arm/ihevc_intra_pred_luma_mode_18_34.s",
-                    "common/arm/ihevc_intra_pred_luma_mode_27_to_33.s",
-                    "common/arm/ihevc_intra_pred_luma_mode_3_to_9.s",
-                    "common/arm/ihevc_intra_pred_luma_planar.s",
-                    "common/arm/ihevc_intra_pred_luma_vert.s",
-                    "common/arm/ihevc_itrans_recon_16x16.s",
-                    "common/arm/ihevc_itrans_recon_32x32.s",
-                    "common/arm/ihevc_itrans_recon_4x4.s",
-                    "common/arm/ihevc_itrans_recon_4x4_ttype1.s",
-                    "common/arm/ihevc_itrans_recon_8x8.s",
-                    "common/arm/ihevc_resi_trans.s",
-                    "common/arm/ihevc_resi_trans_32x32_a9q.s",
-                    "common/arm/ihevc_mem_fns.s",
-                    "common/arm/ihevc_padding.s",
-                    "common/arm/ihevc_sao_band_offset_chroma.s",
-                    "common/arm/ihevc_sao_band_offset_luma.s",
-                    "common/arm/ihevc_sao_edge_offset_class0.s",
-                    "common/arm/ihevc_sao_edge_offset_class0_chroma.s",
-                    "common/arm/ihevc_sao_edge_offset_class1.s",
-                    "common/arm/ihevc_sao_edge_offset_class1_chroma.s",
-                    "common/arm/ihevc_sao_edge_offset_class2.s",
-                    "common/arm/ihevc_sao_edge_offset_class2_chroma.s",
-                    "common/arm/ihevc_sao_edge_offset_class3.s",
-                    "common/arm/ihevc_sao_edge_offset_class3_chroma.s",
-                    "common/arm/ihevc_weighted_pred_bi_default.s",
-                    "common/arm/ihevc_weighted_pred_bi.s",
-                    "common/arm/ihevc_weighted_pred_uni.s",
-                ],
-
-                cflags: [
-                    "-DENABLE_NEON",
-                    "-DARM",
-                ],
-            },
        },

        x86_64: {
@ -807,7 +791,7 @@ cc_library_static {
        misc_undefined: ["bounds"],
        // Enable CFI if this becomes a shared library.
        cfi: true,
-	config: {
+        config: {
            cfi_assembly_support: true,
        },
        blocklist: "libhevc_blocklist.txt",
@ -820,7 +804,7 @@ cc_library_static {
    min_sdk_version: "29",
 }

-cc_test {
+cc_binary {
    name: "hevcenc",
    host_supported: true,
    cflags: [
@ -829,7 +813,6 @@ cc_test {
        "-Wall",
        "-Werror",
    ],
-    gtest: false,
    srcs: ["test/encoder/main.c"],
    static_libs: ["libhevcenc"],
    sanitize: {
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,6 +1,17 @@
 cmake_minimum_required(VERSION 3.9.1)
 project(libhevc C CXX)
-enable_language(ASM)
+
+if(NOT DEFINED SYSTEM_NAME)
+  set(SYSTEM_NAME ${CMAKE_HOST_SYSTEM_NAME})
+endif()
+
+if(NOT DEFINED SYSTEM_PROCESSOR)
+  set(SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR})
+endif()
+
+if(NOT "${SYSTEM_NAME}" STREQUAL "Darwin")
+  enable_language(ASM)
+endif()

 set(HEVC_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
 set(HEVC_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}")
@ -31,6 +42,7 @@ endif()
 include("${HEVC_ROOT}/cmake/utils.cmake")

 libhevc_add_compile_options()
+libhevc_add_gtest()
 libhevc_add_definitions()
 libhevc_set_link_libraries()

@ -43,3 +55,5 @@ include("${HEVC_ROOT}/test/encoder/hevcenc.cmake")

 include("${HEVC_ROOT}/fuzzer/hevc_dec_fuzzer.cmake")
 include("${HEVC_ROOT}/fuzzer/hevc_enc_fuzzer.cmake")
+
+include("${HEVC_ROOT}/tests/common/common.cmake")
--- a/OWNERS
+++ b/OWNERS
@ -1,3 +1,4 @@
 # owners for external/libhevc
+include platform/frameworks/av:/media/janitors/avic_OWNERS
 include platform/frameworks/av:/media/janitors/codec_OWNERS
 essick@google.com
--- a/README.md
+++ b/README.md
@ -7,6 +7,8 @@ Supports:
 - aarch32/aarch64 on Linux.
 - aarch32/aarch64 on Android.
 - x86_32/x86_64 on Linux.
+- aarch64 on Mac.
+- x86_64 on Mac.

 ## Native Builds
 Use the following commands for building on the target machine
@ -51,3 +53,29 @@ $ make
 $ cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/aarch32_toolchain.cmake
 $ make
 ```
+
+### Building for Android
+NOTE: This assumes that you are building on a machine that has the
+[Android NDK](https://developer.android.com/ndk/downloads) installed.
+
+```
+$ cd external/libhevc
+$ mkdir build
+$ cd build
+```
+
+#### Armv7 (32-bit)
+
+    cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/android_toolchain.cmake\
+        -DHEVC_ANDROID_NDK_PATH=/opt/android-ndk-r26d/\
+        -DANDROID_ABI=armeabi-v7a\
+        -DANDROID_PLATFORM=android-23 ../
+    make
+
+#### Armv8 (64-bit)
+
+    cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/android_toolchain.cmake\
+        -DHEVC_ANDROID_NDK_PATH=/opt/android-ndk-r26d/\
+        -DANDROID_ABI=arm64-v8a\
+        -DANDROID_PLATFORM=android-23 ../
+    make
--- a/cmake/toolchains/aarch32_toolchain.cmake
+++ b/cmake/toolchains/aarch32_toolchain.cmake
@ -1,7 +1,10 @@
-set(CMAKE_SYSTEM_NAME Linux)
-set(CMAKE_SYSTEM_PROCESSOR aarch32)
+set(SYSTEM_NAME Linux)
+set(SYSTEM_PROCESSOR aarch32)

 # Modify these variables with paths to appropriate compilers that can produce
 # armv7 targets
 set(CMAKE_C_COMPILER arm-linux-gnueabihf-gcc)
 set(CMAKE_CXX_COMPILER arm-linux-gnueabihf-g++)
+
+# Build all binaries as static, so that they can be run using qemu
+set(CMAKE_EXE_LINKER_FLAGS "-static")
--- a/cmake/toolchains/aarch64_toolchain.cmake
+++ b/cmake/toolchains/aarch64_toolchain.cmake
@ -1,5 +1,5 @@
-set(CMAKE_SYSTEM_NAME Linux)
-set(CMAKE_SYSTEM_PROCESSOR aarch64)
+set(SYSTEM_NAME Linux)
+set(SYSTEM_PROCESSOR aarch64)

 # Modify these variables with paths to appropriate compilers that can produce
 # armv8 targets
@ -11,3 +11,6 @@ set(CMAKE_C_COMPILER_AR
 set(CMAKE_CXX_COMPILER_AR
    aarch64-linux-gnu-gcc-ar
    CACHE FILEPATH "Archiver")
+
+# Build all binaries as static, so that they can be run using qemu
+set(CMAKE_EXE_LINKER_FLAGS "-static")
--- a/cmake/toolchains/android_toolchain.cmake
+++ b/cmake/toolchains/android_toolchain.cmake
@ -0,0 +1,44 @@
+# CMake toolchain file for cross-compiling libhevc with the Android NDK.
+# Requires HEVC_ANDROID_NDK_PATH (CMake variable or environment variable) to
+# be set; ANDROID_ABI and ANDROID_PLATFORM are optional and get defaults below.
+set(SYSTEM_NAME Android)
+set(CMAKE_SYSTEM_NAME Android)
+
+# Default API level when the caller does not pass -DANDROID_PLATFORM.
+if(NOT ANDROID_PLATFORM)
+  set(ANDROID_PLATFORM android-23)
+endif()
+
+# Choose target architecture with:
+# -DANDROID_ABI={armeabi-v7a, arm64-v8a, x86, x86_64}
+if(NOT ANDROID_ABI)
+  set(ANDROID_ABI arm64-v8a)
+endif()
+
+# Map the Android ABI to the SYSTEM_PROCESSOR name used by the libhevc build
+# scripts. x86/x86_64 must not be reported as aarch64, otherwise ARM-only
+# compiler flags and definitions are selected for an x86 target.
+if(ANDROID_ABI MATCHES "^armeabi")
+  set(SYSTEM_PROCESSOR aarch32)
+elseif(ANDROID_ABI MATCHES "^x86")
+  set(SYSTEM_PROCESSOR ${ANDROID_ABI})
+else()
+  set(SYSTEM_PROCESSOR aarch64)
+endif()
+
+# Toolchain files don't have access to cached variables:
+# https://gitlab.kitware.com/cmake/cmake/issues/16170. Set an intermediate
+# environment variable when loaded the first time.
+if(HEVC_ANDROID_NDK_PATH)
+  set(ENV{HEVC_ANDROID_NDK_PATH} "${HEVC_ANDROID_NDK_PATH}")
+else()
+  set(HEVC_ANDROID_NDK_PATH "$ENV{HEVC_ANDROID_NDK_PATH}")
+endif()
+
+if(NOT HEVC_ANDROID_NDK_PATH)
+  message(FATAL_ERROR "HEVC_ANDROID_NDK_PATH not set.")
+  return()
+endif()
+
+# Hand over to the toolchain file shipped inside the NDK.
+include("${HEVC_ANDROID_NDK_PATH}/build/cmake/android.toolchain.cmake")
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@ -2,9 +2,9 @@ include(CheckCXXCompilerFlag)

 # Adds compiler options for all targets
 function(libhevc_add_compile_options)
-  if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")
+  if("${SYSTEM_PROCESSOR}" STREQUAL "aarch64" OR "${SYSTEM_PROCESSOR}" STREQUAL "arm64")
    add_compile_options(-march=armv8-a)
-  elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch32")
+  elseif("${SYSTEM_PROCESSOR}" STREQUAL "aarch32")
    add_compile_options(-march=armv7-a -mfpu=neon)
  else()
    add_compile_options(-msse4.2 -mno-avx)
@ -32,9 +32,15 @@ endfunction()

 # Adds definitions for all targets
 function(libhevc_add_definitions)
-  if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")
+  if("${SYSTEM_NAME}" STREQUAL "Darwin")
+    if("${SYSTEM_PROCESSOR}" STREQUAL "arm64")
+      add_definitions(-DARMV8 -DDARWIN -DDEFAULT_ARCH=D_ARCH_ARMV8_GENERIC)
+    else()
+      add_definitions(-DX86 -DDARWIN -DDISABLE_AVX2 -DDEFAULT_ARCH=D_ARCH_X86_GENERIC)
+    endif()
+  elseif("${SYSTEM_PROCESSOR}" STREQUAL "aarch64")
    add_definitions(-DARMV8 -DDEFAULT_ARCH=D_ARCH_ARMV8_GENERIC -DENABLE_NEON)
-  elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch32")
+  elseif("${SYSTEM_PROCESSOR}" STREQUAL "aarch32")
    add_definitions(-DARMV7 -DDEFAULT_ARCH=D_ARCH_ARM_A9Q -DENABLE_NEON
                    -DDISABLE_NEONINTR)
  else()
@ -112,3 +118,43 @@ endfunction()
 function(libhevc_add_fuzzer NAME LIB)
  libhevc_add_executable(${NAME} ${LIB} FUZZER 1 ${ARGV})
 endfunction()
+
+# Fetches a pinned GoogleTest revision and makes its targets available
+function(libhevc_add_gtest)
+  # FetchContent_MakeAvailable() needs CMake 3.14 or newer; fail early with a
+  # clear message instead of an unknown-command error on older versions.
+  if(CMAKE_VERSION VERSION_LESS 3.14)
+    message(FATAL_ERROR "libhevc_add_gtest requires CMake 3.14 or newer")
+  endif()
+
+  include(FetchContent)
+  FetchContent_Declare(
+    googletest
+    URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
+  )
+  # For Windows: Prevent overriding the parent project's compiler/linker settings
+  set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+  FetchContent_MakeAvailable(googletest)
+endfunction()
+
+# cmake-format: off
+# Adds a target for a gtest executable. The common test sources
+# (func_selector.cc, tests_common.cc) are always compiled in; libhevcdec is
+# passed as the library and GTest::gtest_main is passed via LIBS.
+#
+# Arguments:
+# NAME: Name of the executable
+#
+# Optional Arguments:
+# SOURCES: Additional source files
+# cmake-format: on
+function(libhevc_add_gtest_executable NAME)
+  set(multi_value_args SOURCES)
+  cmake_parse_arguments(ARG "" "" "${multi_value_args}" ${ARGN})
+
+  libhevc_add_executable(
+    ${NAME} libhevcdec
+    SOURCES ${HEVC_ROOT}/tests/common/func_selector.cc
+            ${HEVC_ROOT}/tests/common/tests_common.cc ${ARG_SOURCES}
+    LIBS GTest::gtest_main)
+endfunction()
--- a/common/arm/ihevc_intra_pred_filters_neon_intr.c
+++ b/common/arm/ihevc_intra_pred_filters_neon_intr.c
@ -438,6 +438,11 @@ void ihevc_intra_pred_luma_ref_substitution_neonintr(UWORD8 *pu1_top_left,
 * @param[in] mode
 *  integer intraprediction mode
 *
+ * @param[in] intra_smoothing_flags
+ *  integer bit 3 indicates if intra smoothing is enabled/disabled
+ *          unconditionally. this is applicable to frext profiles only
+ *          bit 0 indicates strong intra smoothing enabled/disabled
+ *
 * @returns
 *
 * @remarks
@ -451,7 +456,7 @@ void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src,
                                             WORD32 nt,
                                             UWORD8 *pu1_dst,
                                             WORD32 mode,
-                                             WORD32 strong_intra_smoothing_enable_flag)
+                                             WORD32 intra_smoothing_flags)
 {
    WORD32 filter_flag;
    WORD32 i = 0;
@ -475,10 +480,12 @@ void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src,
    WORD32 abs_cond_left_flag = 0;
    WORD32 abs_cond_top_flag = 0;
    WORD32 dc_val = 1 << (BIT_DEPTH - 5);
+    WORD32 intra_smoothing_disabled = (intra_smoothing_flags >> 3);
+    WORD32 strong_intra_smoothing_enable_flag = intra_smoothing_flags & 1;
+
    shift_res = vdup_n_u8(0);
-
-    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
-
+    filter_flag = intra_smoothing_disabled ?
+                    0 : (gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2)));
    if(0 == filter_flag)
    {
        if(pu1_src == pu1_dst)
--- a/common/arm/ihevc_intra_ref_substitution_a9q.c
+++ b/common/arm/ihevc_intra_ref_substitution_a9q.c
@ -103,7 +103,8 @@ void ihevc_intra_pred_chroma_ref_substitution_a9q(UWORD8 *pu1_top_left,
                                                  WORD32 nt,
                                                  WORD32 nbr_flags,
                                                  UWORD8 *pu1_dst,
-                                                  WORD32 dst_strd)
+                                                  WORD32 dst_strd,
+                                                  WORD32 chroma_format_idc)
 {
    UWORD8 pu1_ref_u, pu1_ref_v;
    WORD32 dc_val, i, j;
@ -180,7 +181,7 @@ void ihevc_intra_pred_chroma_ref_substitution_a9q(UWORD8 *pu1_top_left,
            // U-V interleaved Top-top right samples
        }

-        if(nt == 4)
+        if(nt == 4 || (nt == 8 && chroma_format_idc == CHROMA_FMT_IDC_YUV444))
        {
            /* 1 bit extraction for all the neighboring blocks */
            tp_left = (nbr_flags & 0x10000) >> 16;
@ -248,8 +249,9 @@ void ihevc_intra_pred_chroma_ref_substitution_a9q(UWORD8 *pu1_top_left,

            }
        }
-        else if(nt == 8)
+        else if(nt == 8 || (nt == 16 && chroma_format_idc == CHROMA_FMT_IDC_YUV444))
        {
+            WORD32 sub_sample = chroma_format_idc == CHROMA_FMT_IDC_YUV444 ? 2 : 1;
            WORD32 nbr_flags_temp = 0;
            nbr_flags_temp = ((nbr_flags & 0xC) >> 2) + ((nbr_flags & 0xC0) >> 4)
                            + ((nbr_flags & 0x300) >> 4)
@ -259,16 +261,16 @@ void ihevc_intra_pred_chroma_ref_substitution_a9q(UWORD8 *pu1_top_left,
            /* compute trailing zeors based on nbr_flag for substitution process of below left see section .*/
            /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
            {
-                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 4; /* for bottom left and left */
-                if(nbr_id_from_bl == 32)
-                    nbr_id_from_bl = 16;
-                if(nbr_id_from_bl == 16)
+                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * (4 * sub_sample); /* for bottom left and left */
+                if(nbr_id_from_bl == 32 * sub_sample)
+                    nbr_id_from_bl = 16 * sub_sample;
+                if(nbr_id_from_bl == 16 * sub_sample)
                {
                    /* for top left : 1 pel per nbr bit */
                    if(!((nbr_flags_temp >> 8) & 0x1))
                    {
                        nbr_id_from_bl++;
-                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 4; /* top and top right;  8 pels per nbr bit */
+                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 4 * sub_sample; /* top and top right;  8 pels per nbr bit (16 pels when sub_sample is 2) */

                    }
                }
@ -287,14 +289,14 @@ void ihevc_intra_pred_chroma_ref_substitution_a9q(UWORD8 *pu1_top_left,
            }

            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
-            while(nbr_id_from_bl < ((T8C_4NT)+1))
+            while(nbr_id_from_bl < ((T8C_4NT * sub_sample)+1))
            {
                /* To Obtain the next unavailable idx flag after reverse neighbor substitution  */
                /* Divide by 8 to obtain the original index */
-                frwd_nbr_flag = (nbr_id_from_bl >> 2); /*+ (nbr_id_from_bl & 0x1);*/
+                frwd_nbr_flag = (nbr_id_from_bl >> (chroma_format_idc == CHROMA_FMT_IDC_YUV444 ? 3 : 2)); /*+ (nbr_id_from_bl & 0x1);*/

                /* The Top-left flag is at the last bit location of nbr_flags*/
-                if(nbr_id_from_bl == (T8C_4NT / 2))
+                if(nbr_id_from_bl == (T8C_4NT * sub_sample / 2))
                {
                    get_bits = GET_BIT(nbr_flags_temp, 8);

@ -313,22 +315,23 @@ void ihevc_intra_pred_chroma_ref_substitution_a9q(UWORD8 *pu1_top_left,
                        UWORD16 *pu2_dst;
                        /* 8 pel substitution (other than TL) */
                        pu2_dst = (UWORD16 *)&pu1_dst[(2 * nbr_id_from_bl) - 2];
-                        ihevc_memset_16bit_a9q((UWORD16 *)(pu1_dst + (2 * nbr_id_from_bl)), pu2_dst[0], 4);
+                        ihevc_memset_16bit_a9q((UWORD16 *)(pu1_dst + (2 * nbr_id_from_bl)), pu2_dst[0], 4 * sub_sample);
                    }

                }
-                nbr_id_from_bl += (nbr_id_from_bl == (T8C_4NT / 2)) ? 1 : 4;
+                nbr_id_from_bl += (nbr_id_from_bl == (T8C_4NT * sub_sample / 2)) ? 1 : 4 * sub_sample;
            }

        }
-        else if(nt == 16)
+        else if(nt == 16 || (nt == 32 && chroma_format_idc == CHROMA_FMT_IDC_YUV444))
        {
+            WORD32 sub_sample = chroma_format_idc == CHROMA_FMT_IDC_YUV444 ? 2 : 1;
            /* compute trailing ones based on mbr_flag for substitution process of below left see section .*/
            /* as each bit in nbr flags corresponds to 4 pels for bot_left, left, top and topright but 1 pel for topleft */
            {
-                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 4; /* for bottom left and left */
+                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 4 * sub_sample; /* for bottom left and left */

-                if(nbr_id_from_bl == 32)
+                if(nbr_id_from_bl == 32 * sub_sample)
                {
                    /* for top left : 1 pel per nbr bit */
                    if(!((nbr_flags >> 16) & 0x1))
@ -336,7 +339,7 @@ void ihevc_intra_pred_chroma_ref_substitution_a9q(UWORD8 *pu1_top_left,
                        /* top left not available */
                        nbr_id_from_bl++;
                        /* top and top right;  4 pels per nbr bit */
-                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 4;
+                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 4 * sub_sample;
                    }
                }
                /* Reverse Substitution Process*/
@ -354,14 +357,14 @@ void ihevc_intra_pred_chroma_ref_substitution_a9q(UWORD8 *pu1_top_left,
            }

            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
-            while(nbr_id_from_bl < ((T16C_4NT)+1))
+            while(nbr_id_from_bl < ((T16C_4NT * sub_sample)+1))
            {
                /* To Obtain the next unavailable idx flag after reverse neighbor substitution  */
                /* Devide by 4 to obtain the original index */
-                frwd_nbr_flag = (nbr_id_from_bl >> 2); /*+ (nbr_id_from_bl & 0x1);*/
+                frwd_nbr_flag = (nbr_id_from_bl >> (chroma_format_idc == CHROMA_FMT_IDC_YUV444 ? 3 : 2)); /*+ (nbr_id_from_bl & 0x1);*/

                /* The Top-left flag is at the last bit location of nbr_flags*/
-                if(nbr_id_from_bl == (T16C_4NT / 2))
+                if(nbr_id_from_bl == (T16C_4NT * sub_sample / 2))
                {
                    get_bits = GET_BIT(nbr_flags, 16);
                    /* only pel substitution for TL */
@ -379,11 +382,11 @@ void ihevc_intra_pred_chroma_ref_substitution_a9q(UWORD8 *pu1_top_left,
                        UWORD16 *pu2_dst;
                        /* 4 pel substitution (other than TL) */
                        pu2_dst = (UWORD16 *)&pu1_dst[(2 * nbr_id_from_bl) - 2];
-                        ihevc_memset_16bit_a9q((UWORD16 *)(pu1_dst + (2 * nbr_id_from_bl)), pu2_dst[0], 4);
+                        ihevc_memset_16bit_a9q((UWORD16 *)(pu1_dst + (2 * nbr_id_from_bl)), pu2_dst[0], 4 * sub_sample);
                    }

                }
-                nbr_id_from_bl += (nbr_id_from_bl == (T16C_4NT / 2)) ? 1 : 4;
+                nbr_id_from_bl += (nbr_id_from_bl == (T16C_4NT * sub_sample / 2)) ? 1 : 4 * sub_sample;
            }
        }
    }
--- a/common/arm64/ihevc_deblk_chroma_horz.s
+++ b/common/arm64/ihevc_deblk_chroma_horz.s
@ -45,9 +45,8 @@
 //                             WORD32 filter_flag_q)
 //

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -57,7 +56,7 @@

 .type ihevc_deblk_chroma_horz_av8, %function

-ihevc_deblk_chroma_horz_av8:
+ENTRY ihevc_deblk_chroma_horz_av8
    sxtw        x4,w4
    sxtw        x5,w5
    sxtw        x6,w6
@ -166,6 +165,7 @@ l1.3528:
 l1.3540:
    ldp         x19, x20,[sp],#16
    pop_v_regs
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_deblk_chroma_vert.s
+++ b/common/arm64/ihevc_deblk_chroma_vert.s
@ -46,9 +46,8 @@
 //                             WORD32 filter_flag_p,
 //                             WORD32 filter_flag_q)

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -58,7 +57,7 @@

 .type ihevc_deblk_chroma_vert_av8, %function

-ihevc_deblk_chroma_vert_av8:
+ENTRY ihevc_deblk_chroma_vert_av8
    sxtw        x4,w4
    sxtw        x5,w5
    sxtw        x6,w6
@ -205,6 +204,7 @@ l1.3204:
 l1.3228:
    ldp         x19, x20,[sp],#16
    pop_v_regs
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_deblk_luma_horz.s
+++ b/common/arm64/ihevc_deblk_luma_horz.s
@ -36,8 +36,8 @@
 //*
 //*******************************************************************************/

+.include "ihevc_neon_macros.s"
 .text
-.align 4


 .extern gai4_ihevc_tc_table
@ -46,7 +46,7 @@

 .type ihevc_deblk_luma_horz_av8, %function

-ihevc_deblk_luma_horz_av8:
+ENTRY ihevc_deblk_luma_horz_av8
    // stmfd sp!, {x3-x12,x14}
    sxtw        x5,w5
    sxtw        x6,w6
@ -434,6 +434,7 @@ l1.2404:
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16              // Loading d9 using { ldr d9,[sp]; add sp,sp,#8 } is giving bus error.
                                            // d8 is used as dummy register and loaded along with d9 using ldp. d8 is not used in the function.
+    EXIT_FUNC
    ret

    // x4=flag p
@ -584,6 +585,7 @@ l1.2852:
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16              // Loading d9 using { ldr d9,[sp]; add sp,sp,#8 } is giving bus error.
                                            // d8 is used as dummy register and loaded along with d9 using ldp. d8 is not used in the function.
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_deblk_luma_vert.s
+++ b/common/arm64/ihevc_deblk_luma_vert.s
@ -37,8 +37,8 @@
 //*
 //*******************************************************************************/

+.include "ihevc_neon_macros.s"
 .text
-.align 4



@ -49,7 +49,7 @@

 .type ihevc_deblk_luma_vert_av8, %function

-ihevc_deblk_luma_vert_av8:
+ENTRY ihevc_deblk_luma_vert_av8

    sxtw        x5,w5
    sxtw        x6,w6
@ -450,6 +450,7 @@ l1.964:
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16
+    EXIT_FUNC
    ret

 l1.968:
@ -630,6 +631,7 @@ l1.1412:
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_inter_pred_chroma_copy.s
+++ b/common/arm64/ihevc_inter_pred_chroma_copy.s
@ -91,14 +91,14 @@
 //x5 =>  ht
 //x6 =>  wd

+.include "ihevc_neon_macros.s"
 .text
-.align 4

 .globl ihevc_inter_pred_chroma_copy_av8

 .type ihevc_inter_pred_chroma_copy_av8, %function

-ihevc_inter_pred_chroma_copy_av8:
+ENTRY ihevc_inter_pred_chroma_copy_av8

    LSL         x12,x6,#1                   //wd << 1
    CMP         x5,#0                       //checks ht == 0
@ -142,7 +142,8 @@ END_INNER_LOOP_WD_4:
    BGT         OUTER_LOOP_WD_4_HT_2

 END_LOOPS:
-    RET
+    EXIT_FUNC
+    ret

 OUTER_LOOP_WD_4_HT_2:
    SUBS        x4,x12,#0                   //checks wd == 0
@ -251,6 +252,7 @@ INNER_LOOP_WD_16_HT_2:
    LD1         {v1.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
    ST1         {v1.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)

-    RET
+    EXIT_FUNC
+    ret


--- a/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
+++ b/common/arm64/ihevc_inter_pred_chroma_copy_w16out.s
@ -92,16 +92,14 @@
 //x5 =>  ht
 //x6 =>  wd

-.text
-.align 4
-
 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_inter_pred_chroma_copy_w16out_av8

 .type ihevc_inter_pred_chroma_copy_w16out_av8, %function

-ihevc_inter_pred_chroma_copy_w16out_av8:
+ENTRY ihevc_inter_pred_chroma_copy_w16out_av8

    // stmfd sp!, {x4-x12, x14}        //stack stores the values of the arguments

@ -173,6 +171,7 @@ end_loops:
    // ldmfd sp!,{x4-x12,x15}        //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


@ -339,6 +338,7 @@ core_loop_wd_8_ht_2:
    // ldmfd sp!,{x4-x12,x15}         //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_inter_pred_chroma_horz.s
+++ b/common/arm64/ihevc_inter_pred_chroma_horz.s
@ -93,16 +93,15 @@
 //x2 =>  src_strd
 //x3 =>  dst_strd

-.text
-.align 4

 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_inter_pred_chroma_horz_av8

 .type ihevc_inter_pred_chroma_horz_av8, %function

-ihevc_inter_pred_chroma_horz_av8:
+ENTRY ihevc_inter_pred_chroma_horz_av8

    // stmfd sp!, {x4-x12, x14}                    //stack stores the values of the arguments

@ -769,6 +768,7 @@ end_loops:
    ldp         d13,d14,[sp],#16
    ldp         d11,d12,[sp],#16
    ldp         d9,d10,[sp],#16
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s
+++ b/common/arm64/ihevc_inter_pred_chroma_horz_w16out.s
@ -91,17 +91,16 @@
 //x3 =>  dst_strd


-.text
-.align 4

 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_inter_pred_chroma_horz_w16out_av8


 .type ihevc_inter_pred_chroma_horz_w16out_av8, %function

-ihevc_inter_pred_chroma_horz_w16out_av8:
+ENTRY ihevc_inter_pred_chroma_horz_w16out_av8

    // stmfd sp!, {x4-x12, x14}                    //stack stores the values of the arguments

@ -794,6 +793,7 @@ end_loops:
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_inter_pred_chroma_vert.s
+++ b/common/arm64/ihevc_inter_pred_chroma_vert.s
@ -92,16 +92,15 @@
 //x1 => *pi2_dst
 //x2 =>  src_strd
 //x3 =>  dst_strd
-.text
-.align 4

 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_inter_pred_chroma_vert_av8

 .type ihevc_inter_pred_chroma_vert_av8, %function

-ihevc_inter_pred_chroma_vert_av8:
+ENTRY ihevc_inter_pred_chroma_vert_av8

    // stmfd sp!,{x4-x12,x14}        //stack stores the values of the arguments

@ -399,6 +398,7 @@ end_loops:
    // ldmfd sp!,{x4-x12,x15}        //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp.s
@ -92,16 +92,14 @@
 //x2 =>  src_strd
 //x3 =>  dst_strd

-.text
-.align 4
-
 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_inter_pred_chroma_vert_w16inp_av8

 .type ihevc_inter_pred_chroma_vert_w16inp_av8, %function

-ihevc_inter_pred_chroma_vert_w16inp_av8:
+ENTRY ihevc_inter_pred_chroma_vert_w16inp_av8

    // stmfd sp!, {x4-x12, x14}                    //stack stores the values of the arguments

@ -349,6 +347,7 @@ end_loops:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16inp_w16out.s
@ -92,16 +92,15 @@
 //x1 => *pi2_dst
 //x2 =>  src_strd
 //x3 =>  dst_strd
-.text
-.align 4

 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_inter_pred_chroma_vert_w16inp_w16out_av8

 .type ihevc_inter_pred_chroma_vert_w16inp_w16out_av8, %function

-ihevc_inter_pred_chroma_vert_w16inp_w16out_av8:
+ENTRY ihevc_inter_pred_chroma_vert_w16inp_w16out_av8

    // stmfd sp!, {x4-x12, x14}                    //stack stores the values of the arguments

@ -336,6 +335,7 @@ end_loops:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
+++ b/common/arm64/ihevc_inter_pred_chroma_vert_w16out.s
@ -93,16 +93,14 @@
 //x2 =>  src_strd
 //x3 =>  dst_strd

-.text
-.align 4
-
 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_inter_pred_chroma_vert_w16out_av8

 .type ihevc_inter_pred_chroma_vert_w16out_av8, %function

-ihevc_inter_pred_chroma_vert_w16out_av8:
+ENTRY ihevc_inter_pred_chroma_vert_w16out_av8

    // stmfd sp!,{x4-x12,x14}        //stack stores the values of the arguments

@ -386,6 +384,7 @@ end_loops:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_inter_pred_filters_luma_horz.s
+++ b/common/arm64/ihevc_inter_pred_filters_luma_horz.s
@ -103,16 +103,15 @@
 //    x5 =>  ht
 //    x6 =>  wd

-.text
-.align 4

 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_inter_pred_luma_horz_av8

 .type ihevc_inter_pred_luma_horz_av8, %function

-ihevc_inter_pred_luma_horz_av8:
+ENTRY ihevc_inter_pred_luma_horz_av8

    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
    push_v_regs
@ -286,6 +285,7 @@ end_loops:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp], #16
    pop_v_regs
+    EXIT_FUNC
    ret


@ -481,6 +481,7 @@ end_loops1:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp], #16
    pop_v_regs
+    EXIT_FUNC
    ret


@ -595,6 +596,7 @@ end_inner_loop_4:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp], #16
    pop_v_regs
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_inter_pred_filters_luma_vert.s
+++ b/common/arm64/ihevc_inter_pred_filters_luma_vert.s
@ -103,16 +103,15 @@
 //    x12 => *pi1_coeff
 //    x5 =>  ht
 //    x3 =>  wd
-.text
-.align 4

 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_inter_pred_luma_vert_av8

 .type ihevc_inter_pred_luma_vert_av8, %function

-ihevc_inter_pred_luma_vert_av8:
+ENTRY ihevc_inter_pred_luma_vert_av8

    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments

@ -428,6 +427,7 @@ end_loops:
    bne         lbl409
    ldp         x19, x20,[sp], #16

+    EXIT_FUNC
    ret
 lbl409:
    mov         x5, #4
@ -518,5 +518,6 @@ end_inner_loop_wd_4:
    // ldmfd sp!, {x4-x12, x15}    //reload the registers from sp
    ldp         x19, x20,[sp], #16

+    EXIT_FUNC
    ret

--- a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s
+++ b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16inp.s
@ -94,16 +94,15 @@
 //                                    word32 ht,
 //                                    word32 wd   )

-.text
-.align 4

 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_inter_pred_luma_vert_w16inp_av8

 .type ihevc_inter_pred_luma_vert_w16inp_av8, %function

-ihevc_inter_pred_luma_vert_w16inp_av8:
+ENTRY ihevc_inter_pred_luma_vert_w16inp_av8

    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments

@ -384,6 +383,7 @@ end_loops:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp], #16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s
+++ b/common/arm64/ihevc_inter_pred_filters_luma_vert_w16out.s
@ -62,12 +62,13 @@


 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_inter_pred_luma_vert_w16out_av8

 .type ihevc_inter_pred_luma_vert_w16out_av8, %function

-ihevc_inter_pred_luma_vert_w16out_av8:
+ENTRY ihevc_inter_pred_luma_vert_w16out_av8

    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments

@ -378,6 +379,7 @@ end_loops_16out:
    bne         lbl355
    ldp         x19, x20,[sp], #16

+    EXIT_FUNC
    ret
 lbl355:
    mov         x5, #4
@ -471,6 +473,7 @@ end_inner_loop_wd_4_16out:
    // ldmfd sp!, {x4-x12, x15}    //reload the registers from sp
    ldp         x19, x20,[sp], #16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_inter_pred_luma_copy.s
+++ b/common/arm64/ihevc_inter_pred_luma_copy.s
@ -71,16 +71,14 @@
 //    x11 =>  ht
 //    x16 => wd

-.text
-.align 4
-
 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_inter_pred_luma_copy_av8

 .type ihevc_inter_pred_luma_copy_av8, %function

-ihevc_inter_pred_luma_copy_av8:
+ENTRY ihevc_inter_pred_luma_copy_av8
    // stmfd sp!, {x8-x16, lr}                //stack stores the values of the arguments
    stp         x19,x20,[sp, #-16]!
    mov         x16,x6                      //loads wd
@ -125,6 +123,7 @@ end_loops:
 //  MRS x20,PMCCFILTR_EL0
    sub         x0,x20,x19
    ldp         x19,x20,[sp],#16
+    EXIT_FUNC
    ret


@ -159,6 +158,7 @@ end_inner_loop_wd_8:
 //  MRS x20,PMCCFILTR_EL0
    sub         x0,x20,x19
    ldp         x19,x20,[sp],#16
+    EXIT_FUNC
    ret

 core_loop_wd_16:
@ -192,6 +192,7 @@ end_inner_loop_wd_16:
 //  MRS x20,PMCCFILTR_EL0
    sub         x0,x20,x19
    ldp         x19,x20,[sp],#16
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_inter_pred_luma_copy_w16out.s
+++ b/common/arm64/ihevc_inter_pred_luma_copy_w16out.s
@ -72,16 +72,15 @@
 //    x7 =>  ht
 //    x12 => wd

-.text
-.align 4

 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_inter_pred_luma_copy_w16out_av8

 .type ihevc_inter_pred_luma_copy_w16out_av8, %function

-ihevc_inter_pred_luma_copy_w16out_av8:
+ENTRY ihevc_inter_pred_luma_copy_w16out_av8

    // stmfd sp!, {x4-x12, x14}        //stack stores the values of the arguments

@ -140,6 +139,7 @@ end_loops:
    ldp         x19, x20,[sp], #16


+    EXIT_FUNC
    ret


@ -265,6 +265,7 @@ epilog_end:
    // ldmfd sp!,{x4-x12,x15}        //reload the registers from sp
    ldp         x19, x20,[sp], #16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_inter_pred_luma_horz_w16out.s
+++ b/common/arm64/ihevc_inter_pred_luma_horz_w16out.s
@ -107,16 +107,15 @@
 //x15 - #1
 //x16 - src_ptx1
 //x19 - loop_counter
-.text
-.align 4

 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_inter_pred_luma_horz_w16out_av8

 .type ihevc_inter_pred_luma_horz_w16out_av8, %function

-ihevc_inter_pred_luma_horz_w16out_av8:
+ENTRY ihevc_inter_pred_luma_horz_w16out_av8

    // stmfd sp!, {x8-x16, x19}                //stack stores the values of the arguments
    push_v_regs
@ -305,6 +304,7 @@ height_residue_4:
    bne         lbl280
    ldp         x19, x20,[sp], #16
    pop_v_regs
+    EXIT_FUNC
    ret
 lbl280:

@ -365,6 +365,7 @@ end_inner_loop_height_residue_4:
    // ldmfd sp!,{x8-x16,pc}                  //reload the registers from sp
    ldp         x19, x20,[sp], #16
    pop_v_regs
+    EXIT_FUNC
    ret

 outer_loop8_residual:
@ -476,6 +477,7 @@ end_inner_loop_8:
    // ldmfd sp!,{x8-x16,pc}                  //reload the registers from sp
    ldp         x19, x20,[sp], #16
    pop_v_regs
+    EXIT_FUNC
    ret


@ -666,6 +668,7 @@ end_loops1:
    // ldmfd sp!,{x8-x16,pc}                  //reload the registers from sp
    ldp         x19, x20,[sp], #16
    pop_v_regs
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
+++ b/common/arm64/ihevc_inter_pred_luma_vert_w16inp_w16out.s
@ -102,16 +102,14 @@
 //  r5 =>  ht
 //  r6 =>  wd

-.text
-.align 4
-
 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_inter_pred_luma_vert_w16inp_w16out_av8

 .type ihevc_inter_pred_luma_vert_w16inp_w16out_av8, %function

-ihevc_inter_pred_luma_vert_w16inp_w16out_av8:
+ENTRY ihevc_inter_pred_luma_vert_w16inp_w16out_av8

    //stmfd     sp!, {r4-r12, r14}  //stack stores the values of the arguments

@ -408,6 +406,7 @@ end_loops:
    //ldmfd     sp!,{r4-r12,r15}            //reload the registers from sp
    ldp         x19, x20,[sp], #16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_chroma_dc.s
+++ b/common/arm64/ihevc_intra_pred_chroma_dc.s
@ -92,9 +92,8 @@
 //    mode
 //    pi1_coeff

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -102,7 +101,7 @@

 .type ihevc_intra_pred_chroma_dc_av8, %function

-ihevc_intra_pred_chroma_dc_av8:
+ENTRY ihevc_intra_pred_chroma_dc_av8

    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
    push_v_regs
@ -293,6 +292,7 @@ end_func:
    // ldmfd sp!,{x4-x12,x15}     //reload the registers from sp
    ldp         x19, x20,[sp],#16
    pop_v_regs
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_chroma_horz.s
+++ b/common/arm64/ihevc_intra_pred_chroma_horz.s
@ -84,16 +84,15 @@
 //x2 => *pu1_dst
 //x3 =>  dst_strd

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text


 .globl ihevc_intra_pred_chroma_horz_av8

 .type ihevc_intra_pred_chroma_horz_av8, %function

-ihevc_intra_pred_chroma_horz_av8:
+ENTRY ihevc_intra_pred_chroma_horz_av8

    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments

@ -189,6 +188,7 @@ core_loop_16:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret
    b           endloop

@ -270,6 +270,7 @@ core_loop_8:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret
    b           endloop

@ -318,6 +319,7 @@ core_loop_4:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret
    b           endloop

@ -353,6 +355,7 @@ core_loop_4:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret

 endloop:
--- a/common/arm64/ihevc_intra_pred_chroma_mode2.s
+++ b/common/arm64/ihevc_intra_pred_chroma_mode2.s
@ -92,9 +92,8 @@
 //    mode
 //    pi1_coeff

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -102,7 +101,7 @@

 .type ihevc_intra_pred_chroma_mode2_av8, %function

-ihevc_intra_pred_chroma_mode2_av8:
+ENTRY ihevc_intra_pred_chroma_mode2_av8

    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
    push_v_regs
@ -303,6 +302,7 @@ end_func:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16
    pop_v_regs
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s
+++ b/common/arm64/ihevc_intra_pred_chroma_mode_18_34.s
@ -92,9 +92,8 @@
 //    mode
 //    pi1_coeff

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -102,7 +101,7 @@

 .type ihevc_intra_pred_chroma_mode_18_34_av8, %function

-ihevc_intra_pred_chroma_mode_18_34_av8:
+ENTRY ihevc_intra_pred_chroma_mode_18_34_av8

    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments

@ -189,6 +188,7 @@ end_func:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
+++ b/common/arm64/ihevc_intra_pred_chroma_mode_27_to_33.s
@ -81,9 +81,8 @@
 //                                         word32 nt,
 //                                         word32 mode)

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text


 .globl ihevc_intra_pred_chroma_mode_27_to_33_av8
@ -92,7 +91,7 @@

 .type ihevc_intra_pred_chroma_mode_27_to_33_av8, %function

-ihevc_intra_pred_chroma_mode_27_to_33_av8:
+ENTRY ihevc_intra_pred_chroma_mode_27_to_33_av8

    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments

@ -549,6 +548,7 @@ end_loops:
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d9,d10,[sp],#16
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s
+++ b/common/arm64/ihevc_intra_pred_chroma_mode_3_to_9.s
@ -86,10 +86,8 @@
 //    nt
 //    mode

-.text
-.align 4
-
 .include "ihevc_neon_macros.s"
+.text



@ -101,7 +99,7 @@

 .type ihevc_intra_pred_chroma_mode_3_to_9_av8, %function

-ihevc_intra_pred_chroma_mode_3_to_9_av8:
+ENTRY ihevc_intra_pred_chroma_mode_3_to_9_av8

    // stmfd sp!, {x4-x12, x14}        //stack stores the values of the arguments

@ -489,6 +487,7 @@ end_func:
    ldp         d8,d15,[sp],#16             // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error.
                                            // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function.
    ldp         d13,d14,[sp],#16
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_chroma_planar.s
+++ b/common/arm64/ihevc_intra_pred_chroma_planar.s
@ -92,9 +92,8 @@
 //    mode
 //    pi1_coeff

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text


 .globl ihevc_intra_pred_chroma_planar_av8
@ -103,7 +102,7 @@

 .type ihevc_intra_pred_chroma_planar_av8, %function

-ihevc_intra_pred_chroma_planar_av8:
+ENTRY ihevc_intra_pred_chroma_planar_av8

    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments

@ -374,6 +373,7 @@ end_loop:
                                            // d8 is used as dummy register and loaded along with d14 using ldp. d8 is not used in the function.
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_chroma_ver.s
+++ b/common/arm64/ihevc_intra_pred_chroma_ver.s
@ -87,16 +87,15 @@
 //    nt
 //    mode

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text


 .globl ihevc_intra_pred_chroma_ver_av8

 .type ihevc_intra_pred_chroma_ver_av8, %function

-ihevc_intra_pred_chroma_ver_av8:
+ENTRY ihevc_intra_pred_chroma_ver_av8

    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
    push_v_regs
@ -226,6 +225,7 @@ end_func:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16
    pop_v_regs
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
+++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s
@ -88,9 +88,8 @@
 //    nt
 //    mode

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -102,7 +101,7 @@

 .type ihevc_intra_pred_chroma_mode_11_to_17_av8, %function

-ihevc_intra_pred_chroma_mode_11_to_17_av8:
+ENTRY ihevc_intra_pred_chroma_mode_11_to_17_av8

    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments

@ -617,6 +616,7 @@ end_func:
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
+++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_19_to_25.s
@ -88,9 +88,8 @@
 //    nt
 //    mode

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text


 .globl ihevc_intra_pred_chroma_mode_19_to_25_av8
@ -100,7 +99,7 @@

 .type ihevc_intra_pred_chroma_mode_19_to_25_av8, %function

-ihevc_intra_pred_chroma_mode_19_to_25_av8:
+ENTRY ihevc_intra_pred_chroma_mode_19_to_25_av8

    // stmfd sp!, {x4-x12, x14}             //stack stores the values of the arguments

@ -571,6 +570,7 @@ end_loops:
    ldp         d8,d14,[sp],#16             // Loading d14 using { ldr d14,[sp]; add sp,sp,#8 } is giving bus error.
                                            // d8 is used as dummy register and loaded along with d14 using ldp. d8 is not used in the function.
    ldp         d12,d13,[sp],#16
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
+++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_11_to_17.s
@ -88,9 +88,8 @@
 //    nt
 //    mode

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -102,7 +101,7 @@

 .type ihevc_intra_pred_luma_mode_11_to_17_av8, %function

-ihevc_intra_pred_luma_mode_11_to_17_av8:
+ENTRY ihevc_intra_pred_luma_mode_11_to_17_av8

    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments

@ -691,6 +690,7 @@ end_func:
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
+++ b/common/arm64/ihevc_intra_pred_filters_luma_mode_19_to_25.s
@ -88,9 +88,8 @@
 //    nt
 //    mode

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -101,7 +100,7 @@

 .type ihevc_intra_pred_luma_mode_19_to_25_av8, %function

-ihevc_intra_pred_luma_mode_19_to_25_av8:
+ENTRY ihevc_intra_pred_luma_mode_19_to_25_av8

    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments

@ -661,6 +660,7 @@ end_loops:
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d9,d10,[sp],#16
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_luma_dc.s
+++ b/common/arm64/ihevc_intra_pred_luma_dc.s
@ -92,16 +92,15 @@
 //    mode
 //    pi1_coeff

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text


 .globl ihevc_intra_pred_luma_dc_av8

 .type ihevc_intra_pred_luma_dc_av8, %function

-ihevc_intra_pred_luma_dc_av8:
+ENTRY ihevc_intra_pred_luma_dc_av8

    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments

@ -511,6 +510,7 @@ end_func:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_luma_horz.s
+++ b/common/arm64/ihevc_intra_pred_luma_horz.s
@ -84,9 +84,8 @@
 //x2 => *pu1_dst
 //x3 =>  dst_strd

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -94,7 +93,7 @@

 .type ihevc_intra_pred_luma_horz_av8, %function

-ihevc_intra_pred_luma_horz_av8:
+ENTRY ihevc_intra_pred_luma_horz_av8

    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments

@ -189,6 +188,7 @@ core_loop_32:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret
    b           end_func

@ -269,6 +269,7 @@ core_loop_16:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret
    b           end_func

@ -315,6 +316,7 @@ core_loop_8:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret
    b           end_func

@ -350,6 +352,7 @@ core_loop_4:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret
 end_func:

--- a/common/arm64/ihevc_intra_pred_luma_mode2.s
+++ b/common/arm64/ihevc_intra_pred_luma_mode2.s
@ -92,9 +92,8 @@
 //    mode
 //    pi1_coeff

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -102,7 +101,7 @@

 .type ihevc_intra_pred_luma_mode2_av8, %function

-ihevc_intra_pred_luma_mode2_av8:
+ENTRY ihevc_intra_pred_luma_mode2_av8

    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments

@ -270,6 +269,7 @@ end_func:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_luma_mode_18_34.s
+++ b/common/arm64/ihevc_intra_pred_luma_mode_18_34.s
@ -92,9 +92,8 @@
 //    mode
 //    pi1_coeff

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -102,7 +101,7 @@

 .type ihevc_intra_pred_luma_mode_18_34_av8, %function

-ihevc_intra_pred_luma_mode_18_34_av8:
+ENTRY ihevc_intra_pred_luma_mode_18_34_av8

    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
    push_v_regs
@ -278,6 +277,7 @@ end_func:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16
    pop_v_regs
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
+++ b/common/arm64/ihevc_intra_pred_luma_mode_27_to_33.s
@ -85,9 +85,8 @@
 //x2 => *pu1_dst
 //x3 =>  dst_strd

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -97,7 +96,7 @@

 .type ihevc_intra_pred_luma_mode_27_to_33_av8, %function

-ihevc_intra_pred_luma_mode_27_to_33_av8:
+ENTRY ihevc_intra_pred_luma_mode_27_to_33_av8

    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments

@ -554,6 +553,7 @@ end_loops:
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d9,d10,[sp],#16
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s
+++ b/common/arm64/ihevc_intra_pred_luma_mode_3_to_9.s
@ -88,9 +88,8 @@
 //    nt
 //    mode

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -103,7 +102,7 @@

 .type ihevc_intra_pred_luma_mode_3_to_9_av8, %function

-ihevc_intra_pred_luma_mode_3_to_9_av8:
+ENTRY ihevc_intra_pred_luma_mode_3_to_9_av8

    // stmfd sp!, {x4-x12, x14}        //stack stores the values of the arguments

@ -563,6 +562,7 @@ end_func:
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_luma_planar.s
+++ b/common/arm64/ihevc_intra_pred_luma_planar.s
@ -92,9 +92,8 @@
 //    mode
 //    pi1_coeff

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -104,7 +103,7 @@

 .type ihevc_intra_pred_luma_planar_av8, %function

-ihevc_intra_pred_luma_planar_av8:
+ENTRY ihevc_intra_pred_luma_planar_av8

    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments

@ -558,6 +557,7 @@ end_loop:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_intra_pred_luma_vert.s
+++ b/common/arm64/ihevc_intra_pred_luma_vert.s
@ -88,9 +88,8 @@
 //    nt
 //    mode

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -98,7 +97,7 @@

 .type ihevc_intra_pred_luma_ver_av8, %function

-ihevc_intra_pred_luma_ver_av8:
+ENTRY ihevc_intra_pred_luma_ver_av8

    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments

@ -424,6 +423,7 @@ end_func:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_itrans_recon_16x16.s
+++ b/common/arm64/ihevc_itrans_recon_16x16.s
@ -105,10 +105,8 @@
 //    x12
 //    x11

-.text
-.align 4
-
 .include "ihevc_neon_macros.s"
+.text



@ -123,7 +121,7 @@

 .type ihevc_itrans_recon_16x16_av8, %function

-ihevc_itrans_recon_16x16_av8:
+ENTRY ihevc_itrans_recon_16x16_av8

    ldr         w11, [sp]
    // stmfd sp!,{x4-x12,x14}
@ -1226,6 +1224,7 @@ skip_last8rows_stage2_kernel2:
    // ldmfd sp!,{x4-x12,pc}
    ldp         x19, x20,[sp],#16
    pop_v_regs
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_itrans_recon_32x32.s
+++ b/common/arm64/ihevc_itrans_recon_32x32.s
@ -124,9 +124,8 @@
 //d5[2]= 43        d7[2]=9
 //d5[3]= 38        d7[3]=4

-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -143,7 +142,7 @@

 .type ihevc_itrans_recon_32x32_av8, %function

-ihevc_itrans_recon_32x32_av8:
+ENTRY ihevc_itrans_recon_32x32_av8

    ldr         w11, [sp]

@ -3042,6 +3041,7 @@ prediction_buffer:
    // ldmfd sp!,{x0-x12,pc}
    ldp         x19, x20,[sp],#16
    pop_v_regs
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_itrans_recon_4x4.s
+++ b/common/arm64/ihevc_itrans_recon_4x4.s
@ -100,10 +100,8 @@
 //    x6 => dst_strd
 //    x7 => zero_cols

-.text
-.align 4
-
 .include "ihevc_neon_macros.s"
+.text

 .set shift_stage1_idct ,   7
 .set shift_stage2_idct ,   12
@ -116,7 +114,7 @@

 .type ihevc_itrans_recon_4x4_av8, %function

-ihevc_itrans_recon_4x4_av8:
+ENTRY ihevc_itrans_recon_4x4_av8

    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments

@ -229,6 +227,7 @@ ihevc_itrans_recon_4x4_av8:
    // ldmfd sp!,{x4-x12,x15}                //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_itrans_recon_4x4_ttype1.s
+++ b/common/arm64/ihevc_itrans_recon_4x4_ttype1.s
@ -103,10 +103,8 @@
 //    x6 => dst_strd
 //    x7 => zero_cols

-.text
-.align 4
-
 .include "ihevc_neon_macros.s"
+.text

 .set shift_stage1_idct ,   7
 .set shift_stage2_idct ,   12
@ -115,7 +113,7 @@

 .type ihevc_itrans_recon_4x4_ttype1_av8, %function

-ihevc_itrans_recon_4x4_ttype1_av8:
+ENTRY ihevc_itrans_recon_4x4_ttype1_av8

    // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments

@ -234,6 +232,7 @@ ihevc_itrans_recon_4x4_ttype1_av8:
    // ldmfd sp!,{x4-x12,x15}            //reload the registers from sp
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_itrans_recon_8x8.s
+++ b/common/arm64/ihevc_itrans_recon_8x8.s
@ -105,10 +105,8 @@
 //    zero_cols


-
-.text
-.align 4
 .include "ihevc_neon_macros.s"
+.text



@ -123,7 +121,7 @@

 .type ihevc_itrans_recon_8x8_av8, %function

-ihevc_itrans_recon_8x8_av8:
+ENTRY ihevc_itrans_recon_8x8_av8
 ////register usage.extern        - loading and until idct of columns
 ////    cosine constants     -     d0
 ////    sine constants         -     d1
@ -1030,6 +1028,7 @@ pred_buff_addition:
    // ldmfd sp!,{x4-x12,pc}
    ldp         x19, x20,[sp],#16
    pop_v_regs
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_mem_fns.s
+++ b/common/arm64/ihevc_mem_fns.s
@ -69,14 +69,14 @@
 //    x1 => *pu1_src
 //    x2 => num_bytes

+.include "ihevc_neon_macros.s"
 .text
-.p2align 2


    .global ihevc_memcpy_mul_8_av8
 .type ihevc_memcpy_mul_8_av8, %function

-ihevc_memcpy_mul_8_av8:
+ENTRY ihevc_memcpy_mul_8_av8

 LOOP_NEON_MEMCPY_MUL_8:
    // Memcpy 8 bytes
@ -85,6 +85,7 @@ LOOP_NEON_MEMCPY_MUL_8:

    SUBS        x2,x2,#8
    BNE         LOOP_NEON_MEMCPY_MUL_8
+    EXIT_FUNC
    ret


@ -104,7 +105,7 @@ LOOP_NEON_MEMCPY_MUL_8:
    .global ihevc_memcpy_av8
 .type ihevc_memcpy_av8, %function

-ihevc_memcpy_av8:
+ENTRY ihevc_memcpy_av8
    SUBS        x2,x2,#8
    BLT         ARM_MEMCPY
 LOOP_NEON_MEMCPY:
@ -126,6 +127,7 @@ LOOP_ARM_MEMCPY:
    SUBS        x2,x2,#1
    BNE         LOOP_ARM_MEMCPY
 MEMCPY_RETURN:
+    EXIT_FUNC
    ret


@ -140,14 +142,13 @@ MEMCPY_RETURN:
 //    x2 => num_bytes

 .text
-.p2align 2



    .global ihevc_memset_mul_8_av8
 .type ihevc_memset_mul_8_av8, %function

-ihevc_memset_mul_8_av8:
+ENTRY ihevc_memset_mul_8_av8

 // Assumptions: numbytes is either 8, 16 or 32
    dup         v0.8b,w1
@ -158,6 +159,7 @@ LOOP_MEMSET_MUL_8:
    SUBS        x2,x2,#8
    BNE         LOOP_MEMSET_MUL_8

+    EXIT_FUNC
    ret


@ -176,7 +178,7 @@ LOOP_MEMSET_MUL_8:
    .global ihevc_memset_av8
 .type ihevc_memset_av8, %function

-ihevc_memset_av8:
+ENTRY ihevc_memset_av8
    SUBS        x2,x2,#8
    BLT         ARM_MEMSET
    dup         v0.8b,w1
@ -198,6 +200,7 @@ LOOP_ARM_MEMSET:
    BNE         LOOP_ARM_MEMSET

 MEMSET_RETURN:
+    EXIT_FUNC
    ret


@ -212,14 +215,13 @@ MEMSET_RETURN:
 //    x2 => num_words

 .text
-.p2align 2



    .global ihevc_memset_16bit_mul_8_av8
 .type ihevc_memset_16bit_mul_8_av8, %function

-ihevc_memset_16bit_mul_8_av8:
+ENTRY ihevc_memset_16bit_mul_8_av8

 // Assumptions: num_words is either 8, 16 or 32

@ -231,6 +233,7 @@ LOOP_MEMSET_16BIT_MUL_8:
    SUBS        x2,x2,#8
    BNE         LOOP_MEMSET_16BIT_MUL_8

+    EXIT_FUNC
    ret


@ -249,7 +252,7 @@ LOOP_MEMSET_16BIT_MUL_8:
    .global ihevc_memset_16bit_av8
 .type ihevc_memset_16bit_av8, %function

-ihevc_memset_16bit_av8:
+ENTRY ihevc_memset_16bit_av8
    SUBS        x2,x2,#8
    BLT         ARM_MEMSET_16BIT
    dup         v0.8h,w1
@ -271,6 +274,7 @@ LOOP_ARM_MEMSET_16BIT:
    BNE         LOOP_ARM_MEMSET_16BIT

 MEMSET_16BIT_RETURN:
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_neon_macros.s
+++ b/common/arm64/ihevc_neon_macros.s
@ -47,3 +47,53 @@
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16
 .endm
+
+// --- Internal Security Dispatchers ---
+// BTI_ENABLE, PAC_ENTRY and PAC_EXIT each wrap one instruction (bti c, paciasp, autiasp) in a check of the matching __ARM_FEATURE_*_DEFAULT macro.
+// NOTE(review): the other .s files pull this file in with .include; confirm these #if lines are evaluated by the C preprocessor in every build.
+.macro BTI_ENABLE
+#if defined(__ARM_FEATURE_BTI_DEFAULT)
+    bti c
+#endif
+.endm
+
+.macro PAC_ENTRY
+#if defined(__ARM_FEATURE_PAC_DEFAULT)
+    paciasp
+#endif
+.endm
+
+.macro PAC_EXIT
+#if defined(__ARM_FEATURE_PAC_DEFAULT)
+    autiasp
+#endif
+.endm
+
+// --- Main ENTRY and EXIT_FUNC Macros ---
+// ENTRY name: aligns to 4 bytes (.p2align 2), defines the label name, then expands BTI_ENABLE followed by PAC_ENTRY.
+.macro ENTRY name
+    .p2align 2
+\name:
+    BTI_ENABLE
+    PAC_ENTRY
+.endm
+// EXIT_FUNC: expands PAC_EXIT; the call sites invoke it directly before ret.
+.macro EXIT_FUNC
+    PAC_EXIT
+.endm
+
+// --- GNU Property Note ---
+// Signals BTI and PAC support to the Android linker; emitted whenever __linux__ and __aarch64__ are both defined.
+#if defined(__linux__) && defined(__aarch64__)
+    .pushsection .note.gnu.property, "a"  // Switch to the allocatable note section
+    .p2align 3
+    .word 4           // Name size: the 4 bytes emitted by the .asciz below
+    .word 16          // Data size: the four .word entries that follow the owner name
+    .word 5           // NT_GNU_PROPERTY_TYPE_0
+    .asciz "GNU"      // Owner
+    .word 0xc0000000  // GNU_PROPERTY_AARCH64_FEATURE_1_AND
+    .word 4           // Data size of the property value below
+    .word 3           // Value: BTI (Bit 0) | PAC (Bit 1); NOTE(review): fixed value, not tied to the __ARM_FEATURE_*_DEFAULT checks above - confirm intended
+    .word 0           // Padding
+    .popsection                           // Switch back to previous section
+#endif
--- a/common/arm64/ihevc_padding.s
+++ b/common/arm64/ihevc_padding.s
@ -85,14 +85,14 @@
 //    x2 => ht
 //    x3 => pad_size

+.include "ihevc_neon_macros.s"
 .text
-.align 4

 .globl ihevc_pad_left_luma_av8

 .type ihevc_pad_left_luma_av8, %function

-ihevc_pad_left_luma_av8:
+ENTRY ihevc_pad_left_luma_av8

 loop_start_luma_left:
    // pad size is assumed to be pad_left = 80
@ -148,6 +148,7 @@ loop_start_luma_left:

    bne         loop_start_luma_left

+    EXIT_FUNC
    ret


@ -209,7 +210,7 @@ loop_start_luma_left:

 .type ihevc_pad_left_chroma_av8, %function

-ihevc_pad_left_chroma_av8:
+ENTRY ihevc_pad_left_chroma_av8


 loop_start_chroma_left:
@ -266,6 +267,7 @@ loop_start_chroma_left:

    bne         loop_start_chroma_left

+    EXIT_FUNC
    ret


@ -337,7 +339,7 @@ loop_start_chroma_left:

 .type ihevc_pad_right_luma_av8, %function

-ihevc_pad_right_luma_av8:
+ENTRY ihevc_pad_right_luma_av8


 loop_start_luma_right:
@ -395,6 +397,7 @@ loop_start_luma_right:

    bne         loop_start_luma_right

+    EXIT_FUNC
    ret


@ -455,7 +458,7 @@ loop_start_luma_right:

 .type ihevc_pad_right_chroma_av8, %function

-ihevc_pad_right_chroma_av8:
+ENTRY ihevc_pad_right_chroma_av8


 loop_start_chroma_right:
@ -512,6 +515,7 @@ loop_start_chroma_right:

    bne         loop_start_chroma_right

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_sao_band_offset_chroma.s
+++ b/common/arm64/ihevc_sao_band_offset_chroma.s
@ -61,14 +61,13 @@
 //x9    =>    wd 60
 //x10=>    ht 64

-.text
-.p2align 2
 .include "ihevc_neon_macros.s"
+.text

 .globl gu1_table_band_idx
 .globl ihevc_sao_band_offset_chroma_av8

-ihevc_sao_band_offset_chroma_av8:
+ENTRY ihevc_sao_band_offset_chroma_av8
    mov         x8,#0
    mov         x9,#0
    mov         x10,#0
@ -424,6 +423,7 @@ END_LOOP:
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    pop_v_regs
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_sao_band_offset_luma.s
+++ b/common/arm64/ihevc_sao_band_offset_luma.s
@ -65,12 +65,11 @@
 .include "ihevc_neon_macros.s"

 .text
-.p2align 2

 .globl gu1_table_band_idx
 .globl ihevc_sao_band_offset_luma_av8

-ihevc_sao_band_offset_luma_av8:
+ENTRY ihevc_sao_band_offset_luma_av8

    // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments

@ -244,6 +243,7 @@ HEIGHT_LOOP:
    ldp         d8,d15,[sp],#16             // Loading d15 using { ldr d15,[sp]; add sp,sp,#8 } is giving bus error.
                                            // d8 is used as dummy register and loaded along with d15 using ldp. d8 is not used in the function.
    ldp         d13,d14,[sp],#16
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_sao_edge_offset_class0.s
+++ b/common/arm64/ihevc_sao_edge_offset_class0.s
@ -59,15 +59,14 @@
 //x9    =>    wd
 //x10=>    ht

-.text
-.p2align 2

 .include "ihevc_neon_macros.s"
+.text

 .globl gi1_table_edge_idx
 .globl ihevc_sao_edge_offset_class0_av8

-ihevc_sao_edge_offset_class0_av8:
+ENTRY ihevc_sao_edge_offset_class0_av8


    // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
@ -338,6 +337,7 @@ END_LOOPS:
    // LDMFD sp!,{x4-x12,x15}              //Reload the registers from SP
    ldp         x19, x20,[sp], #16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_sao_edge_offset_class0_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class0_chroma.s
@ -60,14 +60,13 @@
 //x9    =>    wd
 //x10=>    ht

-.text
-.p2align 2
 .include "ihevc_neon_macros.s"
+.text

 .globl gi1_table_edge_idx
 .globl ihevc_sao_edge_offset_class0_chroma_av8

-ihevc_sao_edge_offset_class0_chroma_av8:
+ENTRY ihevc_sao_edge_offset_class0_chroma_av8

    ldr         x8,[sp,#0]
    ldr         x9,[sp,#8]
@ -477,6 +476,7 @@ END_LOOPS:
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_sao_edge_offset_class1.s
+++ b/common/arm64/ihevc_sao_edge_offset_class1.s
@ -58,15 +58,14 @@
 //x7    =>    wd
 //x8 =>    ht

-.text
-.p2align 2

 .include "ihevc_neon_macros.s"
+.text

 .globl gi1_table_edge_idx
 .globl ihevc_sao_edge_offset_class1_av8

-ihevc_sao_edge_offset_class1_av8:
+ENTRY ihevc_sao_edge_offset_class1_av8


    // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
@ -355,6 +354,7 @@ END_LOOPS:
    // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
    ldp         x19, x20,[sp], #16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_sao_edge_offset_class1_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class1_chroma.s
@ -60,14 +60,13 @@
 //x8    =>    wd
 //x9 =>    ht

-.text
-.p2align 2
 .include "ihevc_neon_macros.s"
+.text

 .globl gi1_table_edge_idx
 .globl ihevc_sao_edge_offset_class1_chroma_av8

-ihevc_sao_edge_offset_class1_chroma_av8:
+ENTRY ihevc_sao_edge_offset_class1_chroma_av8


    ldr         x8,[sp,#0]
@ -458,6 +457,7 @@ END_LOOPS:
    ldp         x19, x20,[sp],#16


+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_sao_edge_offset_class2.s
+++ b/common/arm64/ihevc_sao_edge_offset_class2.s
@ -58,15 +58,14 @@
 //x7    =>    wd
 //x8=>    ht

-.text
-.p2align 2

 .include "ihevc_neon_macros.s"
+.text

 .globl gi1_table_edge_idx
 .globl ihevc_sao_edge_offset_class2_av8

-ihevc_sao_edge_offset_class2_av8:
+ENTRY ihevc_sao_edge_offset_class2_av8


    // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments
@ -846,6 +845,7 @@ END_LOOPS:
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class2_chroma.s
@ -60,14 +60,13 @@
 //x7    =>    wd
 //x8=>    ht

-.text
-.p2align 2
 .include "ihevc_neon_macros.s"
+.text

 .globl gi1_table_edge_idx
 .globl ihevc_sao_edge_offset_class2_chroma_av8

-ihevc_sao_edge_offset_class2_chroma_av8:
+ENTRY ihevc_sao_edge_offset_class2_chroma_av8


    // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments
@ -1132,6 +1131,7 @@ END_LOOPS:
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_sao_edge_offset_class3.s
+++ b/common/arm64/ihevc_sao_edge_offset_class3.s
@ -58,15 +58,14 @@
 //x7    =>    wd
 //x8=>    ht

-.text
-.p2align 2

 .include "ihevc_neon_macros.s"
+.text

 .globl gi1_table_edge_idx
 .globl ihevc_sao_edge_offset_class3_av8

-ihevc_sao_edge_offset_class3_av8:
+ENTRY ihevc_sao_edge_offset_class3_av8


    // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments
@ -889,6 +888,7 @@ END_LOOPS:
    ldp         x23, x24,[sp], #16
    ldp         x21, x22,[sp], #16
    ldp         x19, x20,[sp], #16
+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
+++ b/common/arm64/ihevc_sao_edge_offset_class3_chroma.s
@ -60,13 +60,12 @@
 //x7    =>    wd
 //x8=>    ht

-.text
-.p2align 2
 .include "ihevc_neon_macros.s"
+.text
 .globl gi1_table_edge_idx
 .globl ihevc_sao_edge_offset_class3_chroma_av8

-ihevc_sao_edge_offset_class3_chroma_av8:
+ENTRY ihevc_sao_edge_offset_class3_chroma_av8


    // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments
@ -1167,6 +1166,7 @@ END_LOOPS:
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_weighted_pred_bi.s
+++ b/common/arm64/ihevc_weighted_pred_bi.s
@ -134,16 +134,14 @@
 //    x14 =>    ht
 //    x7    =>    wd

-.text
-.align 4
-
 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_weighted_pred_bi_av8

 .type ihevc_weighted_pred_bi_av8, %function

-ihevc_weighted_pred_bi_av8:
+ENTRY ihevc_weighted_pred_bi_av8

    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments

@ -307,6 +305,7 @@ end_loops:
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_weighted_pred_bi_default.s
+++ b/common/arm64/ihevc_weighted_pred_bi_default.s
@ -107,16 +107,15 @@
 //    x7 =>  lvl_shift2
 //    x8 =>  ht
 //    x9 =>  wd
-.text
-.align 4

 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_weighted_pred_bi_default_av8

 .type ihevc_weighted_pred_bi_default_av8, %function

-ihevc_weighted_pred_bi_default_av8:
+ENTRY ihevc_weighted_pred_bi_default_av8

    ldr         w8,[sp,#0]
    ldr         w9,[sp,#8]
@ -534,6 +533,7 @@ end_loops:
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/arm64/ihevc_weighted_pred_uni.s
+++ b/common/arm64/ihevc_weighted_pred_uni.s
@ -112,16 +112,14 @@
 //    x8 =>    ht
 //    x9    =>    wd

-.text
-.align 4
-
 .include "ihevc_neon_macros.s"
+.text

 .globl ihevc_weighted_pred_uni_av8

 .type ihevc_weighted_pred_uni_av8, %function

-ihevc_weighted_pred_uni_av8:
+ENTRY ihevc_weighted_pred_uni_av8

    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments

@ -240,6 +238,7 @@ end_loops:
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16

+    EXIT_FUNC
    ret


--- a/common/common.cmake
+++ b/common/common.cmake
@ -18,6 +18,7 @@ list(
  "${HEVC_ROOT}/common/ihevc_trans_tables.c"
  "${HEVC_ROOT}/common/ihevc_recon.c"
  "${HEVC_ROOT}/common/ihevc_itrans.c"
+  "${HEVC_ROOT}/common/ihevc_itrans_res.c"
  "${HEVC_ROOT}/common/ihevc_itrans_recon.c"
  "${HEVC_ROOT}/common/ihevc_iquant_recon.c"
  "${HEVC_ROOT}/common/ihevc_iquant_itrans_recon.c"
@ -29,6 +30,7 @@ list(
  "${HEVC_ROOT}/common/ihevc_chroma_iquant_itrans_recon.c"
  "${HEVC_ROOT}/common/ihevc_chroma_recon.c"
  "${HEVC_ROOT}/common/ihevc_chroma_itrans_recon_16x16.c"
+  "${HEVC_ROOT}/common/ihevc_chroma_itrans_recon_32x32.c"
  "${HEVC_ROOT}/common/ihevc_chroma_itrans_recon_8x8.c"
  "${HEVC_ROOT}/common/ihevc_buf_mgr.c"
  "${HEVC_ROOT}/common/ihevc_disp_mgr.c"
@ -62,7 +64,7 @@ list(
 include_directories(${HEVC_ROOT}/common)

 # arm/x86 sources
-if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")
+if("${SYSTEM_PROCESSOR}" STREQUAL "aarch64" OR "${SYSTEM_PROCESSOR}" STREQUAL "arm64")
  list(
    APPEND
    LIBHEVC_COMMON_ASMS
@ -135,7 +137,7 @@ if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")
    "${HEVC_ROOT}/common/arm64/ihevc_weighted_pred_uni.s")

  include_directories(${HEVC_ROOT}/common/arm64 ${HEVC_ROOT}/common/arm)
-elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch32")
+elseif("${SYSTEM_PROCESSOR}" STREQUAL "aarch32")
  list(
    APPEND
    LIBHEVC_COMMON_ASMS
--- a/common/ihevc_cabac_tables.c
+++ b/common/ihevc_cabac_tables.c
--- a/common/ihevc_cabac_tables.h
+++ b/common/ihevc_cabac_tables.h
@ -92,9 +92,13 @@ typedef enum
    IHEVC_CAB_COEFFY_PREFIX         = IHEVC_CAB_COEFFX_PREFIX           + 18,
    IHEVC_CAB_CODED_SUBLK_IDX       = IHEVC_CAB_COEFFY_PREFIX           + 18,
    IHEVC_CAB_COEFF_FLAG            = IHEVC_CAB_CODED_SUBLK_IDX         + 4,
-    IHEVC_CAB_COEFABS_GRTR1_FLAG    = IHEVC_CAB_COEFF_FLAG              + 42,
+    IHEVC_CAB_COEFABS_GRTR1_FLAG    = IHEVC_CAB_COEFF_FLAG              + 44,
    IHEVC_CAB_COEFABS_GRTR2_FLAG    = IHEVC_CAB_COEFABS_GRTR1_FLAG      + 24,
-    IHEVC_CAB_CTXT_END              = IHEVC_CAB_COEFABS_GRTR2_FLAG      + 6
+    IHEVC_CAB_CCP_LOG2_RES_ABS      = IHEVC_CAB_COEFABS_GRTR2_FLAG      + 6,
+    IHEVC_CAB_CCP_RES_SIGN_FLAG     = IHEVC_CAB_CCP_LOG2_RES_ABS        + 8,
+    IHEVC_CAB_EXPLICIT_RDPCM_FLAG   = IHEVC_CAB_CCP_RES_SIGN_FLAG       + 2,
+    IHEVC_CAB_EXPLICIT_RDPCM_DIR    = IHEVC_CAB_EXPLICIT_RDPCM_FLAG     + 2,
+    IHEVC_CAB_CTXT_END              = IHEVC_CAB_EXPLICIT_RDPCM_DIR      + 2,
 }IHEVC_CABAC_CTXT_OFFSETS;


--- a/common/ihevc_chroma_intra_pred.h
+++ b/common/ihevc_chroma_intra_pred.h
@ -126,7 +126,14 @@ typedef void ihevc_intra_pred_chroma_ref_substitution_ft(UWORD8 *pu1_top_left,
                                                         WORD32 nt,
                                                         WORD32 nbr_flags,
                                                         UWORD8 *pu1_dst,
-                                                         WORD32 dst_strd);
+                                                         WORD32 dst_strd,
+                                                         WORD32 chroma_format_idc);
+
+typedef void ihevc_intra_pred_chroma_ref_filtering_ft(UWORD8 *pu1_src,
+                                                      WORD32 nt,
+                                                      UWORD8 *pu1_dst,
+                                                      WORD32 mode,
+                                                      WORD32 intra_smoothing_flags);

 typedef void ihevc_hbd_intra_pred_chroma_planar_ft(
                UWORD16 *pu2_ref,
@ -240,6 +247,7 @@ ihevc_intra_pred_chroma_mode_11_to_17_ft ihevc_intra_pred_chroma_mode_11_to_17;
 ihevc_intra_pred_chroma_mode_19_to_25_ft ihevc_intra_pred_chroma_mode_19_to_25;
 ihevc_intra_pred_chroma_mode_27_to_33_ft ihevc_intra_pred_chroma_mode_27_to_33;
 ihevc_intra_pred_chroma_ref_substitution_ft ihevc_intra_pred_chroma_ref_substitution;
+ihevc_intra_pred_chroma_ref_filtering_ft ihevc_intra_pred_chroma_ref_filtering;

 ihevc_hbd_intra_pred_chroma_planar_ft ihevc_hbd_intra_pred_chroma_planar;
 ihevc_hbd_intra_pred_chroma_dc_ft ihevc_hbd_intra_pred_chroma_dc;
--- a/common/ihevc_chroma_intra_pred_filters.c
+++ b/common/ihevc_chroma_intra_pred_filters.c
@ -64,6 +64,7 @@

 #include "ihevc_typedefs.h"
 #include "ihevc_macros.h"
+#include "ihevc_defs.h"
 #include "ihevc_func_selector.h"
 #include "ihevc_platform_macros.h"
 #include "ihevc_intra_pred.h"
@ -129,7 +130,8 @@ void ihevc_intra_pred_chroma_ref_substitution(UWORD8 *pu1_top_left,
                                              WORD32 nt,
                                              WORD32 nbr_flags,
                                              UWORD8 *pu1_dst,
-                                              WORD32 dst_strd)
+                                              WORD32 dst_strd,
+                                              WORD32 chroma_format_idc)
 {
    UWORD8 pu1_ref_u, pu1_ref_v;
    WORD32 dc_val, i, j;
@ -206,7 +208,7 @@ void ihevc_intra_pred_chroma_ref_substitution(UWORD8 *pu1_top_left,
            // U-V interleaved Top-top right samples
        }

-        if(nt == 4)
+        if(nt == 4 || (nt == 8 && chroma_format_idc == CHROMA_FMT_IDC_YUV444))
        {
            /* 1 bit extraction for all the neighboring blocks */
            tp_left = (nbr_flags & 0x10000) >> 16;
@ -274,8 +276,9 @@ void ihevc_intra_pred_chroma_ref_substitution(UWORD8 *pu1_top_left,

            }
        }
-        else if(nt == 8)
+        else if(nt == 8 || (nt == 16 && chroma_format_idc == CHROMA_FMT_IDC_YUV444))
        {
+            WORD32 sub_sample = chroma_format_idc == CHROMA_FMT_IDC_YUV444 ? 2 : 1;
            WORD32 nbr_flags_temp = 0;
            nbr_flags_temp = ((nbr_flags & 0xC) >> 2) + ((nbr_flags & 0xC0) >> 4)
                            + ((nbr_flags & 0x300) >> 4)
@ -285,16 +288,16 @@ void ihevc_intra_pred_chroma_ref_substitution(UWORD8 *pu1_top_left,
            /* compute trailing zeors based on nbr_flag for substitution process of below left see section .*/
            /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
            {
-                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 4; /* for bottom left and left */
-                if(nbr_id_from_bl == 32)
-                    nbr_id_from_bl = 16;
-                if(nbr_id_from_bl == 16)
+                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * (4 * sub_sample); /* for bottom left and left */
+                if(nbr_id_from_bl == 32 * sub_sample)
+                    nbr_id_from_bl = 16 * sub_sample;
+                if(nbr_id_from_bl == 16 * sub_sample)
                {
                    /* for top left : 1 pel per nbr bit */
                    if(!((nbr_flags_temp >> 8) & 0x1))
                    {
                        nbr_id_from_bl++;
-                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 4; /* top and top right;  8 pels per nbr bit */
+                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 4 * sub_sample; /* top and top right;  8 pels per nbr bit */

                    }
                }
@ -313,14 +316,14 @@ void ihevc_intra_pred_chroma_ref_substitution(UWORD8 *pu1_top_left,
            }

            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
-            while(nbr_id_from_bl < ((T8C_4NT)+1))
+            while(nbr_id_from_bl < ((T8C_4NT * sub_sample)+1))
            {
                /* To Obtain the next unavailable idx flag after reverse neighbor substitution  */
                /* Divide by 8 to obtain the original index */
-                frwd_nbr_flag = (nbr_id_from_bl >> 2); /*+ (nbr_id_from_bl & 0x1);*/
+                frwd_nbr_flag = (nbr_id_from_bl >> (chroma_format_idc == CHROMA_FMT_IDC_YUV444 ? 3 : 2)); /*+ (nbr_id_from_bl & 0x1);*/

                /* The Top-left flag is at the last bit location of nbr_flags*/
-                if(nbr_id_from_bl == (T8C_4NT / 2))
+                if(nbr_id_from_bl == (T8C_4NT * sub_sample / 2))
                {
                    get_bits = GET_BIT(nbr_flags_temp, 8);

@ -339,22 +342,23 @@ void ihevc_intra_pred_chroma_ref_substitution(UWORD8 *pu1_top_left,
                        UWORD16 *pu2_dst;
                        /* 8 pel substitution (other than TL) */
                        pu2_dst = (UWORD16 *)&pu1_dst[(2 * nbr_id_from_bl) - 2];
-                        ihevc_memset_16bit((UWORD16 *)(pu1_dst + (2 * nbr_id_from_bl)), pu2_dst[0], 4);
+                        ihevc_memset_16bit((UWORD16 *)(pu1_dst + (2 * nbr_id_from_bl)), pu2_dst[0], 4 * sub_sample);
                    }

                }
-                nbr_id_from_bl += (nbr_id_from_bl == (T8C_4NT / 2)) ? 1 : 4;
+                nbr_id_from_bl += (nbr_id_from_bl == (T8C_4NT * sub_sample / 2)) ? 1 : 4 * sub_sample;
            }

        }
-        else if(nt == 16)
+        else if(nt == 16 || (nt == 32 && chroma_format_idc == CHROMA_FMT_IDC_YUV444))
        {
+            WORD32 sub_sample = chroma_format_idc == CHROMA_FMT_IDC_YUV444 ? 2 : 1;
            /* compute trailing ones based on mbr_flag for substitution process of below left see section .*/
            /* as each bit in nbr flags corresponds to 4 pels for bot_left, left, top and topright but 1 pel for topleft */
            {
-                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 4; /* for bottom left and left */
+                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 4 * sub_sample; /* for bottom left and left */

-                if(nbr_id_from_bl == 32)
+                if(nbr_id_from_bl == 32 * sub_sample)
                {
                    /* for top left : 1 pel per nbr bit */
                    if(!((nbr_flags >> 16) & 0x1))
@ -362,7 +366,7 @@ void ihevc_intra_pred_chroma_ref_substitution(UWORD8 *pu1_top_left,
                        /* top left not available */
                        nbr_id_from_bl++;
                        /* top and top right;  4 pels per nbr bit */
-                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 4;
+                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 4 * sub_sample;
                    }
                }
                /* Reverse Substitution Process*/
@ -380,14 +384,14 @@ void ihevc_intra_pred_chroma_ref_substitution(UWORD8 *pu1_top_left,
            }

            /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
-            while(nbr_id_from_bl < ((T16C_4NT)+1))
+            while(nbr_id_from_bl < ((T16C_4NT * sub_sample)+1))
            {
                /* To Obtain the next unavailable idx flag after reverse neighbor substitution  */
                /* Devide by 4 to obtain the original index */
-                frwd_nbr_flag = (nbr_id_from_bl >> 2); /*+ (nbr_id_from_bl & 0x1);*/
+                frwd_nbr_flag = (nbr_id_from_bl >> (chroma_format_idc == CHROMA_FMT_IDC_YUV444 ? 3 : 2)); /*+ (nbr_id_from_bl & 0x1);*/

                /* The Top-left flag is at the last bit location of nbr_flags*/
-                if(nbr_id_from_bl == (T16C_4NT / 2))
+                if(nbr_id_from_bl == (T16C_4NT * sub_sample / 2))
                {
                    get_bits = GET_BIT(nbr_flags, 16);
                    /* only pel substitution for TL */
@ -405,17 +409,110 @@ void ihevc_intra_pred_chroma_ref_substitution(UWORD8 *pu1_top_left,
                        UWORD16 *pu2_dst;
                        /* 4 pel substitution (other than TL) */
                        pu2_dst = (UWORD16 *)&pu1_dst[(2 * nbr_id_from_bl) - 2];
-                        ihevc_memset_16bit((UWORD16 *)(pu1_dst + (2 * nbr_id_from_bl)), pu2_dst[0], 4);
+                        ihevc_memset_16bit((UWORD16 *)(pu1_dst + (2 * nbr_id_from_bl)), pu2_dst[0], 4 * sub_sample);
                    }

                }
-                nbr_id_from_bl += (nbr_id_from_bl == (T16C_4NT / 2)) ? 1 : 4;
+                nbr_id_from_bl += (nbr_id_from_bl == (T16C_4NT * sub_sample / 2)) ? 1 : 4 * sub_sample;
            }
        }
    }
 }


+/**
+*******************************************************************************
+*
+* @brief
+*    Intra prediction reference sample filtering for chroma (4:4:4)
+*
+*
+* @par Description:
+*    Filters the neighboring chroma reference samples depending on TU size and
+*    mode. Refer to section 8.4.4.2.3 in the standard.
+*    The reference array is handled as (4 * nt + 1) two-byte entries, i.e. two
+*    interleaved byte components (presumably Cb/Cr - confirm with caller);
+*    each component is filtered independently of the other.
+*    When bit 3 of intra_smoothing_flag is set, or when the lookup in
+*    gau1_intra_pred_ref_filter for this mode and block size yields zero, the
+*    samples are copied unchanged (nothing is done if pu1_src == pu1_dst).
+*    Otherwise every entry except the first and the last is replaced by
+*    (prev + 2 * cur + next + 2) >> 2, staged through a local buffer and then
+*    copied to pu1_dst.
+*
+* @param[in] pu1_src
+*  UWORD8 pointer to the source
+*
+* @param[in] nt
+*  integer Transform Block size
+*
+* @param[out] pu1_dst
+*  UWORD8 pointer to the destination (may be the same as pu1_src)
+*
+* @param[in] mode
+*  integer intraprediction mode
+*
+* @param[in] intra_smoothing_flag
+*  integer bit 3 is the intra smoothing disabled flag; bit 0 is the strong
+*          intra smoothing enable flag, which is extracted but not used here
+*
+* @returns
+*  None
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+void ihevc_intra_pred_chroma_ref_filtering(UWORD8 *pu1_src,
+                                           WORD32 nt,
+                                           UWORD8 *pu1_dst,
+                                           WORD32 mode,
+                                           WORD32 intra_smoothing_flag)
+{
+    WORD32 filter_flag;
+    WORD32 i; /* Generic indexing variable */
+    WORD32 four_nt = 4 * nt;
+    UWORD8 au1_flt[((4 * MAX_CU_SIZE) + 1) * 2]; /* Filtered samples are staged here before being written to pu1_dst */
+    WORD32 intra_smoothing_disabled_flag = (intra_smoothing_flag >> 3) & 0x1;
+    WORD32 strong_intra_smoothing_enable_flag = intra_smoothing_flag & 0x1;
+    UNUSED(strong_intra_smoothing_enable_flag);
+
+    if(intra_smoothing_disabled_flag)
+    {
+        if(pu1_src == pu1_dst) return; /* In place: nothing to copy */
+        for(i = 0; i < (2 * (four_nt + 1)); i += 2)
+        {
+            pu1_dst[i]     = pu1_src[i];
+            pu1_dst[i + 1] = pu1_src[i + 1];
+        }
+        return;
+    }
+
+    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2)); /* Per-mode mask; the tested bit depends on nt */
+    if(0 == filter_flag)
+    {
+        if(pu1_src == pu1_dst) return; /* In place: nothing to copy */
+        for(i = 0; i < (2 * (four_nt + 1)); i += 2)
+        {
+            pu1_dst[i]     = pu1_src[i];
+            pu1_dst[i + 1] = pu1_src[i + 1];
+        }
+    }
+    else
+    {
+        /* Extremities Untouched*/
+        au1_flt[0] = pu1_src[0];
+        au1_flt[1] = pu1_src[1];
+        au1_flt[four_nt * 2] = pu1_src[four_nt * 2];
+        au1_flt[(four_nt * 2) + 1] = pu1_src[(four_nt * 2) + 1];
+
+        for(i = 2; i < four_nt * 2; i += 2) /* Neighbours of one component are 2 bytes apart */
+        {
+            au1_flt[i] = (pu1_src[i - 2] + 2 * pu1_src[i] + pu1_src[i + 2] + 2) >> 2;
+            au1_flt[i + 1] = (pu1_src[i - 1] + 2 * pu1_src[i + 1] + pu1_src[i + 3] + 2) >> 2;
+        }
+
+        for(i = 0; i < (2 * (four_nt + 1)); i += 2)
+        {
+            pu1_dst[i]     = au1_flt[i];
+            pu1_dst[i + 1] = au1_flt[i + 1];
+        }
+    }
+}
+
+
+
 /**
 *******************************************************************************
 *
--- a/common/ihevc_chroma_itrans_recon.h
+++ b/common/ihevc_chroma_itrans_recon.h
@ -94,6 +94,15 @@ typedef void ihevc_hbd_chroma_itrans_recon_16x16_ft(WORD16 *pi2_src,
                                                    WORD32 zero_cols,
                                                    WORD32 zero_rows,
                                                    UWORD8 bit_depth);
+typedef void ihevc_chroma_itrans_recon_32x32_ft(WORD16 *pi2_src,
+                                                WORD16 *pi2_tmp,
+                                                UWORD8 *pu1_pred,
+                                                UWORD8 *pu1_dst,
+                                                WORD32 src_strd,
+                                                WORD32 pred_strd,
+                                                WORD32 dst_strd,
+                                                WORD32 zero_cols,
+                                                WORD32 zero_rows); /* Function type (not a pointer type); used below to declare ihevc_chroma_itrans_recon_32x32 */

 ihevc_chroma_itrans_recon_4x4_ft ihevc_chroma_itrans_recon_4x4;
 ihevc_hbd_chroma_itrans_recon_4x4_ft ihevc_hbd_chroma_itrans_recon_4x4;
@ -101,6 +110,7 @@ ihevc_chroma_itrans_recon_8x8_ft ihevc_chroma_itrans_recon_8x8;
 ihevc_hbd_chroma_itrans_recon_8x8_ft ihevc_hbd_chroma_itrans_recon_8x8;
 ihevc_chroma_itrans_recon_16x16_ft ihevc_chroma_itrans_recon_16x16;
 ihevc_hbd_chroma_itrans_recon_16x16_ft ihevc_hbd_chroma_itrans_recon_16x16;
+ihevc_chroma_itrans_recon_32x32_ft ihevc_chroma_itrans_recon_32x32;

 /* A9 Q Function Declarations */
 ihevc_chroma_itrans_recon_4x4_ft ihevc_chroma_itrans_recon_4x4_a9q;
--- a/common/ihevc_chroma_itrans_recon_32x32.c
+++ b/common/ihevc_chroma_itrans_recon_32x32.c
--- a/common/ihevc_chroma_recon.c
+++ b/common/ihevc_chroma_recon.c
@ -306,3 +306,157 @@ void ihevc_chroma_recon_16x16(WORD16 *pi2_src,
    }
 }

+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs reconstruction for 32x32 input block
+ *
+ * @par Description:
+ *  Performs reconstruction of 32x32 input block by adding prediction
+ * data to input and clipping it to 8 bit.
+ *  The block is processed column by column: for each column the residual
+ * pointer advances by one element while the prediction and output pointers
+ * advance by two bytes, so only every other byte of pu1_pred / pu1_dst is
+ * touched (presumably one component of interleaved chroma - confirm with
+ * caller).
+ *
+ * @param[in] pi2_src
+ *  Input 32x32 coefficients
+ *
+ * @param[in] pu1_pred
+ *  Prediction 32x32 block
+ *
+ * @param[out] pu1_dst
+ *  Output 32x32 block
+ *
+ * @param[in] src_strd
+ *  Input stride
+ *
+ * @param[in] pred_strd
+ *  Prediction stride
+ *
+ * @param[in] dst_strd
+ *  Output Stride
+ *
+ * @param[in] zero_cols
+ *  Column bit mask, least significant bit first: when bit i is set, column i
+ * of the output is copied from the prediction without adding pi2_src
+ *
+ * @returns  Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+
+void ihevc_chroma_recon_32x32(WORD16 *pi2_src,
+                              UWORD8 *pu1_pred,
+                              UWORD8 *pu1_dst,
+                              WORD32 src_strd,
+                              WORD32 pred_strd,
+                              WORD32 dst_strd,
+                              WORD32 zero_cols)
+{
+    WORD32 i, j;
+    WORD32 trans_size;
+
+    trans_size = TRANS_SIZE_32;
+
+    /* Reconstruction */
+
+    for(i = 0; i < trans_size; i++) /* One column per iteration */
+    {
+        /* Checking for Zero Cols */
+        if((zero_cols & 1) == 1)
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] = pu1_pred[j * pred_strd];
+            }
+        }
+        else
+        {
+            for(j = 0; j < trans_size; j++)
+            {
+                pu1_dst[j * dst_strd] =
+                                CLIP_U8(pi2_src[j * src_strd] + pu1_pred[j * pred_strd]);
+            }
+        }
+        pi2_src++;
+        pu1_dst += 2;
+        pu1_pred += 2;
+        zero_cols = zero_cols >> 1; /* Bring the next column's flag into bit 0 */
+    }
+}
+
+/**
+ ******************************************************************************
+ *
+ * @brief Constructs chroma recon with Cross Component Prediction (CCP)
+ *
+ * @par   Description
+ * This routine uses reconstructed luma residual samples to predict chroma
+ * residual samples as per HEVC Specification Section 8.6.6. It scales the
+ * luma residual by a signaled alpha value, shifts the product right by 3 and
+ * adds it to the chroma residual prior to final reconstruction. The sum of
+ * prediction and modified residual is clipped with CLIP_U8.
+ * Residual buffers are read at consecutive elements, while pu1_pred and
+ * pu1_dst are accessed at every other byte (index j * 2).
+ *
+ * @param[in]   pi2_luma_res
+ * pointer to the luma residual
+ *
+ * @param[in] pi2_chroma_res
+ * pointer to the chroma residual
+ *
+ * @param[in] pu1_pred
+ * prediction block
+ *
+ * @param[out] pu1_dst
+ * destination block
+ *
+ * @param[in]   alpha
+ * scaling factor for the luma residual
+ *
+ * @param[in]   trans_size
+ * transform size (number of rows and of samples per row processed)
+ *
+ * @param[in]   luma_res_stride
+ * stride of the luma residual buffer
+ *
+ * @param[in]   chroma_res_stride
+ * stride of the chroma residual buffer
+ *
+ * @param[in] pred_stride
+ *  Prediction stride
+ *
+ * @param[in] dst_stride
+ *  Output Stride
+ *
+ * @return      None
+ *
+ ******************************************************************************
+ */
+void ihevc_chroma_recon_nxn_ccp(WORD16 *pi2_luma_res,
+                                WORD16 *pi2_chroma_res,
+                                UWORD8 *pu1_pred,
+                                UWORD8 *pu1_dst,
+                                WORD32 alpha,
+                                WORD32 trans_size,
+                                WORD32 luma_res_stride,
+                                WORD32 chroma_res_stride,
+                                WORD32 pred_stride,
+                                WORD32 dst_stride)
+{
+    WORD32 i, j;
+
+    for(i = 0; i < trans_size; i++)
+    {
+        for(j = 0; j < trans_size; j++)
+        {
+            WORD32 res = (alpha * pi2_luma_res[j]) >> 3; /* Scaled luma residual contribution */
+            pu1_dst[j * 2] = CLIP_U8(pu1_pred[j * 2] + (pi2_chroma_res[j] + res));
+        }
+        pi2_luma_res += luma_res_stride;
+        pi2_chroma_res += chroma_res_stride;
+        pu1_dst += dst_stride;
+        pu1_pred += pred_stride;
+    }
+}
--- a/common/ihevc_chroma_recon.h
+++ b/common/ihevc_chroma_recon.h
@ -88,6 +88,13 @@ typedef void ihevc_hbd_chroma_recon_16x16_ft(WORD16 *pi2_src,
                                             WORD32 dst_strd,
                                             WORD32 zero_cols,
                                             UWORD8 bit_depth);
+typedef void ihevc_chroma_recon_32x32_ft(WORD16 *pi2_src,    /* Input 32x32 coefficients */
+                                         UWORD8 *pu1_pred,   /* Prediction 32x32 block */
+                                         UWORD8 *pu1_dst,    /* Output 32x32 block */
+                                         WORD32 src_strd,    /* Input stride */
+                                         WORD32 pred_strd,   /* Prediction stride */
+                                         WORD32 dst_strd,    /* Output stride */
+                                         WORD32 zero_cols);  /* Per-column bit mask, LSB first */

 ihevc_chroma_recon_4x4_ft ihevc_chroma_recon_4x4;
 ihevc_hbd_chroma_recon_4x4_ft ihevc_hbd_chroma_recon_4x4;
@ -95,5 +102,17 @@ ihevc_chroma_recon_8x8_ft ihevc_chroma_recon_8x8;
 ihevc_hbd_chroma_recon_8x8_ft ihevc_hbd_chroma_recon_8x8;
 ihevc_chroma_recon_16x16_ft ihevc_chroma_recon_16x16;
 ihevc_hbd_chroma_recon_16x16_ft ihevc_hbd_chroma_recon_16x16;
+ihevc_chroma_recon_32x32_ft ihevc_chroma_recon_32x32;
+
+void ihevc_chroma_recon_nxn_ccp(WORD16 *pi2_luma_res,      /* Luma residual */
+                                WORD16 *pi2_chroma_res,    /* Chroma residual */
+                                UWORD8 *pu1_pred,          /* Prediction block */
+                                UWORD8 *pu1_dst,           /* Destination block (output) */
+                                WORD32 alpha,              /* Scaling factor for the luma residual */
+                                WORD32 trans_size,         /* Transform size */
+                                WORD32 luma_res_stride,    /* Stride of the luma residual buffer */
+                                WORD32 chroma_res_stride,  /* Stride of the chroma residual buffer */
+                                WORD32 pred_stride,        /* Prediction stride */
+                                WORD32 dst_stride);        /* Output stride */

 #endif /*_IHEVC_CHROMA_RECON_H_*/
--- a/common/ihevc_defs.h
+++ b/common/ihevc_defs.h
@ -40,7 +40,10 @@
 /*****************************************************************************/
 enum
 {
+    IHEVC_PROFILE_UNKNOWN = -1,
    IHEVC_PROFILE_MAIN = 0,
+    IHEVC_PROFILE_MAIN_STILL = 1,
+    IHEVC_PROFILE_MAIN_REXT = 2,
 };

 enum
--- a/common/ihevc_inter_pred.h
+++ b/common/ihevc_inter_pred.h
@ -46,6 +46,9 @@
 #define REF_WIDTH 1280
 #define REF_HEIGHT 720

+extern WORD8 gai1_ihevc_luma_filter[4][NTAPS_LUMA];
+extern WORD8 gai1_ihevc_chroma_filter[8][NTAPS_LUMA];
+
 /*****************************************************************************/
 /* Function Declarations                                                     */
 /*****************************************************************************/
--- a/common/ihevc_inter_pred_filters.c
+++ b/common/ihevc_inter_pred_filters.c
@ -61,6 +61,26 @@
 #include "ihevc_func_selector.h"

 #include "ihevc_inter_pred.h"
+
+WORD8 gai1_ihevc_luma_filter[4][NTAPS_LUMA] = /* Luma filter taps: 4 rows of 8 coefficients, each row sums to 64 */
+{
+    { 0, 0, 0, 64, 0, 0, 0, 0 },
+    { -1, 4, -10, 58, 17, -5, 1, 0 },
+    { -1, 4, -11, 40, 40, -11, 4, -1 },
+    { 0, 1, -5, 17, 58, -10, 4, -1 } };
+
+/* The filter uses only the first four elements in each array; the remaining
+ * four entries of every row are zero. The four used coefficients of each
+ * row sum to 64. */
+WORD8 gai1_ihevc_chroma_filter[8][NTAPS_LUMA] =
+{
+    { 0, 64, 0, 0, 0, 0, 0, 0 },
+    { -2, 58, 10, -2, 0, 0, 0, 0 },
+    { -4, 54, 16, -2, 0, 0, 0, 0 },
+    { -6, 46, 28, -4, 0, 0, 0, 0 },
+    { -4, 36, 36, -4, 0, 0, 0, 0 },
+    { -4, 28, 46, -6, 0, 0, 0, 0 },
+    { -2, 16, 54, -4, 0, 0, 0, 0 },
+    { -2, 10, 58, -2, 0, 0, 0, 0 } };
+
 /*****************************************************************************/
 /* Function Definitions                                                      */
 /*****************************************************************************/
--- a/common/ihevc_intra_pred.h
+++ b/common/ihevc_intra_pred.h
@ -159,7 +159,7 @@ typedef void ihevc_intra_pred_ref_filtering_ft(UWORD8 *pu1_src,
                                               WORD32 nt,
                                               UWORD8 *pu1_dst,
                                               WORD32 mode,
-                                               WORD32 strong_intra_smoothing_enable_flag);
+                                               WORD32 intra_smoothing_flags);

 typedef void ihevc_hbd_intra_pred_luma_planar_ft(
                UWORD16 *pu2_ref,
--- a/common/ihevc_intra_pred_filters.c
+++ b/common/ihevc_intra_pred_filters.c
@ -627,6 +627,11 @@ void ihevc_intra_pred_luma_ref_substitution(UWORD8 *pu1_top_left,
 * @param[in] mode
 *  integer intraprediction mode
 *
+* @param[in] intra_smoothing_flags
+*  integer bit 3, when set, disables intra smoothing unconditionally
+*          (applicable to range extension profiles only);
+*          bit 0 indicates strong intra smoothing enabled/disabled
+*
 * @returns
 *
 * @remarks
@ -640,7 +645,7 @@ void ihevc_intra_pred_ref_filtering(UWORD8 *pu1_src,
                                    WORD32 nt,
                                    UWORD8 *pu1_dst,
                                    WORD32 mode,
-                                    WORD32 strong_intra_smoothing_enable_flag)
+                                    WORD32 intra_smoothing_flags)
 {
    WORD32 filter_flag;
    WORD32 i; /* Generic indexing variable */
@ -651,9 +656,11 @@ void ihevc_intra_pred_ref_filtering(UWORD8 *pu1_src,
    WORD32 abs_cond_top_flag = 0;
    /*WORD32 dc_val = 1 << (BIT_DEPTH - 5);*/
    WORD32 dc_val = 1 << (8 - 5);
-    //WORD32 strong_intra_smoothing_enable_flag  = 1;
+    WORD32 intra_smoothing_disabled = (intra_smoothing_flags >> 3);
+    WORD32 strong_intra_smoothing_enable_flag = intra_smoothing_flags & 1;

-    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
+    filter_flag = intra_smoothing_disabled ?
+                    0 : (gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2)));
    if(0 == filter_flag)
    {
        if(pu1_src == pu1_dst)
@ -943,8 +950,8 @@ void ihevc_intra_pred_luma_dc(UWORD8 *pu1_ref,
 * @param[in] nt
 *  integer Transform Block size
 *
-* @param[in] mode
-*  integer intraprediction mode
+* @param[in] disable_boundary_filter
+*  disable boundary filtering
 *
 * @returns
 *
@ -960,17 +967,16 @@ void ihevc_intra_pred_luma_horz(UWORD8 *pu1_ref,
                                UWORD8 *pu1_dst,
                                WORD32 dst_strd,
                                WORD32 nt,
-                                WORD32 mode)
+                                WORD32 disable_boundary_filter)
 {

    WORD32 row, col;
    WORD32 two_nt;
    WORD16 s2_predpixel;
-    UNUSED(mode);
    UNUSED(src_strd);
    two_nt = 2 * nt;

-    if(nt == 32)
+    if(nt == 32 || disable_boundary_filter)
    {
        for(row = 0; row < nt; row++)
            for(col = 0; col < nt; col++)
@ -1023,8 +1029,8 @@ void ihevc_intra_pred_luma_horz(UWORD8 *pu1_ref,
 * @param[in] nt
 *  integer Transform Block size
 *
-* @param[in] mode
-*  integer intraprediction mode
+* @param[in] disable_boundary_filter
+*  disable boundary filtering
 *
 * @returns
 *
@ -1040,15 +1046,14 @@ void ihevc_intra_pred_luma_ver(UWORD8 *pu1_ref,
                               UWORD8 *pu1_dst,
                               WORD32 dst_strd,
                               WORD32 nt,
-                               WORD32 mode)
+                               WORD32 disable_boundary_filter)
 {
    WORD32 row, col;
    WORD16 s2_predpixel;
    WORD32 two_nt = 2 * nt;
-    UNUSED(mode);
    UNUSED(src_strd);

-    if(nt == 32)
+    if(nt == 32 || disable_boundary_filter)
    {
        /* Replication to next columns*/
        for(row = 0; row < nt; row++)
--- a/common/ihevc_itrans_res.c
+++ b/common/ihevc_itrans_res.c
--- a/common/ihevc_itrans_res.h
+++ b/common/ihevc_itrans_res.h
@ -0,0 +1,108 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ ******************************************************************************/
+/**
+ *******************************************************************************
+ * @file
+ *  ihevc_itrans_res.h
+ *
+ * @brief
+ *  Function type and function declarations for the inverse transform
+ *  routines that write a WORD16 output block (pi2_dst), together with the
+ *  4x4 rotate, NxN copy and NxN RDPCM (horizontal / vertical) routines
+ *
+ * @author
+ *  Ittiam
+ *
+ * @remarks
+ *  This header includes no other header; WORD16 and WORD32 must already be
+ *  defined where it is included.
+ *  Each typedef below names a function type (not a pointer-to-function
+ *  type); the declarations at the end of the file use those types to declare
+ *  the C functions, so several functions can share one signature.
+ *
+ *******************************************************************************
+ */
+#ifndef _IHEVC_ITRANS_RES_H_
+#define _IHEVC_ITRANS_RES_H_
+
+typedef void ihevc_itrans_res_4x4_ttype1_ft(WORD16 *pi2_src,
+                                            WORD16 *pi2_tmp,
+                                            WORD16 *pi2_dst,
+                                            WORD32 src_strd,
+                                            WORD32 dst_strd,
+                                            WORD32 zero_cols,
+                                            WORD32 zero_rows);
+
+typedef void ihevc_itrans_res_4x4_ft(WORD16 *pi2_src,
+                                     WORD16 *pi2_tmp,
+                                     WORD16 *pi2_dst,
+                                     WORD32 src_strd,
+                                     WORD32 dst_strd,
+                                     WORD32 zero_cols,
+                                     WORD32 zero_rows);
+
+typedef void ihevcd_itrans_res_dc_ft(WORD16 *pi2_dst,
+                                     WORD32 dst_strd,
+                                     WORD32 log2_trans_size,
+                                     WORD16 i2_coeff_value);
+
+typedef void ihevc_itrans_res_8x8_ft(WORD16 *pi2_src,
+                                     WORD16 *pi2_tmp,
+                                     WORD16 *pi2_dst,
+                                     WORD32 src_strd,
+                                     WORD32 dst_strd,
+                                     WORD32 zero_cols,
+                                     WORD32 zero_rows);
+
+typedef void ihevc_itrans_res_16x16_ft(WORD16 *pi2_src,
+                                       WORD16 *pi2_tmp,
+                                       WORD16 *pi2_dst,
+                                       WORD32 src_strd,
+                                       WORD32 dst_strd,
+                                       WORD32 zero_cols,
+                                       WORD32 zero_rows);
+
+typedef void ihevc_itrans_res_32x32_ft(WORD16 *pi2_src,
+                                       WORD16 *pi2_tmp,
+                                       WORD16 *pi2_dst,
+                                       WORD32 src_strd,
+                                       WORD32 dst_strd,
+                                       WORD32 zero_cols,
+                                       WORD32 zero_rows);
+
+typedef void ihevc_res_4x4_transform(WORD16 *pi2_src,
+                                     WORD16 *pi2_dst,
+                                     WORD32 src_strd,
+                                     WORD32 dst_strd,
+                                     WORD32 zero_cols);
+
+typedef void ihevc_res_nxn_transform(WORD16 *pi2_src,
+                                     WORD16 *pi2_dst,
+                                     WORD32 src_strd,
+                                     WORD32 dst_strd,
+                                     WORD32 trans_size,
+                                     WORD32 zero_cols);
+
+/* C function declarations */
+ihevc_itrans_res_4x4_ttype1_ft ihevc_itrans_res_4x4_ttype1;
+ihevc_itrans_res_4x4_ft ihevc_itrans_res_4x4;
+ihevcd_itrans_res_dc_ft ihevcd_itrans_res_dc;
+ihevc_itrans_res_8x8_ft ihevc_itrans_res_8x8;
+ihevc_itrans_res_16x16_ft ihevc_itrans_res_16x16;
+ihevc_itrans_res_32x32_ft ihevc_itrans_res_32x32;
+
+ihevc_res_4x4_transform ihevc_res_4x4_rotate;
+ihevc_res_nxn_transform ihevc_res_nxn_copy; /* The three NxN routines share one signature */
+ihevc_res_nxn_transform ihevc_res_nxn_rdpcm_horz;
+ihevc_res_nxn_transform ihevc_res_nxn_rdpcm_vert;
+
+#endif /*_IHEVC_ITRANS_RES_H_*/
--- a/common/ihevc_structs.h
+++ b/common/ihevc_structs.h
@ -39,6 +39,7 @@
 #define _IHEVC_STRUCTS_H_


+#ifndef DISABLE_SEI
 /**
 * Buffering Period SEI parameters Info
 */
@ -519,6 +520,7 @@ typedef struct
    time_code_t s_time_code;
 } sei_params_t;

+#endif

 /**
 * Sub-layer HRD parameters Info
@ -952,7 +954,9 @@ typedef struct
    // See IV_FLD_TYPE_T for all field types
    UWORD32 e4_fld_type;

+#ifndef DISABLE_SEI
    sei_params_t s_sei_params;
+#endif

    WORD32 i4_vui_present;

@ -1416,6 +1420,28 @@ typedef struct
     */
    UWORD32    b3_chroma_intra_mode_idx    : 3;

+#ifdef ENABLE_MAIN_REXT_PROFILE
+    /**
+     *  Cb CCP alpha magnitude
+     */
+    UWORD32    b3_cb_log2_res_scale_abs_plus1    : 3;
+
+    /**
+     *  Cb CCP alpha sign
+     */
+    UWORD32    b1_cb_log2_res_sign          : 1;
+
+    /**
+     *  Cr CCP alpha magnitude
+     */
+    UWORD32    b3_cr_log2_res_scale_abs_plus1    : 3;
+
+    /**
+     *  Cr CCP alpha sign
+     */
+    UWORD32    b1_cr_log2_res_sign          : 1;
+#endif
+
 }tu_t;

 /**
@ -2477,6 +2503,36 @@ typedef struct
     */
    vui_t s_vui_parameters;

+    /**
+     *  sps_extension_present_flag
+     */
+    WORD8 i1_sps_extension_present_flag;
+
+    /**
+     *  sps_range_extension_flag
+     */
+    WORD8 i1_sps_range_extension_flag;
+
+    /**
+     *  sps_multilayer_extension_flag
+     */
+    WORD8 i1_sps_multilayer_extension_flag;
+
+    /**
+     *  sps_3d_extension_flag
+     */
+    WORD8 i1_sps_3d_extension_flag;
+
+    /**
+     *  sps_scc_extension_flag
+     */
+    WORD8 i1_sps_scc_extension_flag;
+
+    /**
+     *  sps_extension_4bits
+     */
+    WORD8 i1_sps_extension_4bits;
+
    /**
     *  Log2(CTB Size) in luma units
     */
@ -2539,8 +2595,8 @@ typedef struct
    /* Inter 8 x 8 Y, 8 x 8 U, 8 x 8 V                                       */
    /* Intra 16x16 Y, 16x16 U, 16x16 V                                       */
    /* Inter 16x16 Y, 16x16 U, 16x16 V                                       */
-    /* Intra 32x32 Y                                                         */
-    /* Inter 32x32 Y                                                         */
+    /* Intra 32x32 Y, 32x32 U, 32x32 V                                       */
+    /* Inter 32x32 Y, 32x32 U, 32x32 V                                       */
    /*************************************************************************/
    WORD16 *pi2_scaling_mat;

@ -2582,9 +2638,9 @@ typedef struct
    WORD8   i1_use_high_precision_pred_wt;

    /**
-     * fast_rice_adaptation_enabled_flag
+     * persistent_rice_adaptation_enabled_flag
     */
-    WORD8   i1_fast_rice_adaptation_enabled_flag;
+    WORD8   i1_persistent_rice_adaptation_enabled_flag;

    /**
     * cabac_bypass_alignment_enabled_flag
@ -2615,8 +2671,8 @@ typedef struct
    /* Inter 8 x 8 Y, 8 x 8 U, 8 x 8 V                                       */
    /* Intra 16x16 Y, 16x16 U, 16x16 V                                       */
    /* Inter 16x16 Y, 16x16 U, 16x16 V                                       */
-    /* Intra 32x32 Y                                                         */
-    /* Inter 32x32 Y                                                         */
+    /* Intra 32x32 Y, 32x32 U, 32x32 V                                       */
+    /* Inter 32x32 Y, 32x32 U, 32x32 V                                       */
    /*************************************************************************/
    WORD16 *pi2_scaling_mat;

@ -2818,6 +2874,36 @@ typedef struct
     */
    WORD8 i1_slice_extension_present_flag;

+    /**
+     *  pps_extension_present_flag
+     */
+    WORD8 i1_pps_extension_present_flag;
+
+    /**
+     *  pps_range_extension_flag
+     */
+    WORD8 i1_pps_range_extension_flag;
+
+    /**
+     *  pps_multilayer_extension_flag
+     */
+    WORD8 i1_pps_multilayer_extension_flag;
+
+    /**
+     *  pps_3d_extension_flag
+     */
+    WORD8 i1_pps_3d_extension_flag;
+
+    /**
+     *  pps_scc_extension_flag
+     */
+    WORD8 i1_pps_scc_extension_flag;
+
+    /**
+     *  pps_extension_4bits
+     */
+    WORD8 i1_pps_extension_4bits;
+
    /**
     *  scaling_list_dc_coef_minus8
     */
@ -2838,7 +2924,7 @@ typedef struct
    /**
     * log2_max_transform_skip_block_size_minus2
     */
-    WORD32 i4_log2_max_transform_skip_block_size_minus2;
+    WORD8 i1_log2_max_transform_skip_block_size_minus2;

    /**
     * cross_component_prediction_enabled_flag
--- a/common/ihevc_trans_tables.c
+++ b/common/ihevc_trans_tables.c
@ -877,6 +877,12 @@ const WORD16 g_ai2_ihevc_trans_intr_4[4][8] =

 const UWORD8 IHEVCE_CHROMA_SHUFFLEMASK_HBD[8] = { 0x00, 0x01, 0x04, 0x05,
    0x08, 0x09, 0x0C, 0x0D };
+
+const UWORD32 gau4_ihevcd_4_bit_reverse[] = /* Entry i is i with its 4 bits reversed (e.g. 1 -> 8, 3 -> 12).
+                                             * NOTE(review): ihevc_trans_tables.h declares this as
+                                             * MEM_ALIGN16 ... [16]; this definition carries no alignment
+                                             * qualifier - confirm the intended alignment is applied. */
+{
+    0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15
+};
+
 #ifndef DISABLE_AVX2
 const WORD32 g_ai4_ihevc_trans_8_intr_avx2[7][8] =
 { /* 4*32 = 128 bit */
--- a/common/ihevc_trans_tables.h
+++ b/common/ihevc_trans_tables.h
@ -116,4 +116,6 @@ extern const WORD16 g_ai2_ihevc_trans_intr_4[4][8];

 extern const UWORD8 IHEVCE_CHROMA_SHUFFLEMASK_HBD[8];

+extern MEM_ALIGN16 const UWORD32 gau4_ihevcd_4_bit_reverse[16];
+
 #endif /*_IHEVC_TRANS_TABLES_H_*/
--- a/common/x86/ihevc_intra_pred_filters_sse42_intr.c
+++ b/common/x86/ihevc_intra_pred_filters_sse42_intr.c
@ -91,6 +91,11 @@
 * @param[in] mode
 *  integer intraprediction mode
 *
+* @param[in] intra_smoothing_flags
+*  integer bit 3, when set, disables intra smoothing unconditionally
+*          (applicable to range extension profiles only);
+*          bit 0 indicates strong intra smoothing enabled/disabled
+*
 * @returns
 *
 * @remarks
@ -104,7 +109,7 @@ void ihevc_intra_pred_ref_filtering_sse42(UWORD8 *pu1_src,
                                          WORD32 nt,
                                          UWORD8 *pu1_dst,
                                          WORD32 mode,
-                                          WORD32 strong_intra_smoothing_enable_flag)
+                                          WORD32 intra_smoothing_flags)
 {
    WORD32 filter_flag;
    WORD32 i; /* Generic indexing variable */
@ -117,11 +122,10 @@ void ihevc_intra_pred_ref_filtering_sse42(UWORD8 *pu1_src,
    __m128i src_temp1, src_temp2, src_temp3, src_temp7;
    __m128i src_temp4, src_temp5, src_temp6, src_temp8;

-    //WORD32 strong_intra_smoothing_enable_flag  = 1;
-
-
-
-    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
+    WORD32 intra_smoothing_disabled = (intra_smoothing_flags >> 3);
+    WORD32 strong_intra_smoothing_enable_flag = intra_smoothing_flags & 1;
+    filter_flag = intra_smoothing_disabled ?
+                    0 : (gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2)));
    if(0 == filter_flag)
    {
        if(pu1_src == pu1_dst)
--- a/common/x86/ihevc_intra_pred_filters_ssse3_intr.c
+++ b/common/x86/ihevc_intra_pred_filters_ssse3_intr.c
@ -377,6 +377,11 @@ void ihevc_intra_pred_luma_ref_substitution_ssse3(UWORD8 *pu1_top_left,
 * @param[in] mode
 *  integer intraprediction mode
 *
+* @param[in] intra_smoothing_flags
+*  integer bit 3, when set, disables intra smoothing unconditionally
+*          (applicable to range extension profiles only);
+*          bit 0 indicates strong intra smoothing enabled/disabled
+*
 * @returns
 *
 * @remarks
@ -389,7 +394,7 @@ void ihevc_intra_pred_ref_filtering_ssse3(UWORD8 *pu1_src,
                                          WORD32 nt,
                                          UWORD8 *pu1_dst,
                                          WORD32 mode,
-                                          WORD32 strong_intra_smoothing_enable_flag)
+                                          WORD32 intra_smoothing_flags)
 {
    WORD32 filter_flag;
    WORD32 i; /* Generic indexing variable */
@ -402,9 +407,10 @@ void ihevc_intra_pred_ref_filtering_ssse3(UWORD8 *pu1_src,
    __m128i src_temp1, src_temp2, src_temp3, src_temp7;
    __m128i src_temp4, src_temp5, src_temp6, src_temp8;

-    //WORD32 strong_intra_smoothing_enable_flag  = 1;
-
-    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
+    WORD32 intra_smoothing_disabled = (intra_smoothing_flags >> 3);
+    WORD32 strong_intra_smoothing_enable_flag = intra_smoothing_flags & 1;
+    filter_flag = intra_smoothing_disabled ?
+                    0 : (gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2)));
    if(0 == filter_flag)
    {
        if(pu1_src == pu1_dst)
--- a/decoder/arm/ihevcd_fmt_conv_420sp_to_rgba8888.s
+++ b/decoder/arm/ihevcd_fmt_conv_420sp_to_rgba8888.s
@ -1,453 +0,0 @@
-@/*****************************************************************************
-@*
-@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
-@*
-@* Licensed under the Apache License, Version 2.0 (the "License");
-@* you may not use this file except in compliance with the License.
-@* You may obtain a copy of the License at:
-@*
-@* http://www.apache.org/licenses/LICENSE-2.0
-@*
-@* Unless required by applicable law or agreed to in writing, software
-@* distributed under the License is distributed on an "AS IS" BASIS,
-@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@* See the License for the specific language governing permissions and
-@* limitations under the License.
-@*
-@*****************************************************************************/
-@/**
-@/*******************************************************************************
-@* @file
-@*  ihevcd_fmt_conv_420sp_to_rgba8888.s
-@*
-@* @brief
-@*  contains function definitions for format conversions
-@*
-@* @author
-@*  ittiam
-@*
-@* @par list of functions:
-@*
-@*
-@* @remarks
-@*  none
-@*
-@*******************************************************************************/
-    .equ DO1STROUNDING, 0
-
-    @ ARM
-    @
-    @ PRESERVE8
-
-.text
-.p2align 2
-
-
-
-
-@/*****************************************************************************
-@*                                                                            *
-@*  Function Name    : ihevcd_fmt_conv_420sp_to_rgba8888()                    *
-@*                                                                            *
-@*  Description      : This function conversts the image from YUV422 color    *
-@*                     space to RGB888 color space. The function can be       *
-@*                     invoked at the MB level.                               *
-@*                                                                            *
-@*  Arguments        : R0           pubY                                      *
-@*                     R1           pubUV                                     *
-@*                     R2           pusRGB                                    *
-@*                     R3           pusRGB                                    *
-@*                     [R13 #40]    usHeight                                  *
-@*                     [R13 #44]    usWidth                                   *
-@*                     [R13 #48]    usStrideY                                 *
-@*                     [R13 #52]    usStrideU                                 *
-@*                     [R13 #56]    usStrideV                                 *
-@*                     [R13 #60]    usStrideRGB                               *
-@*                                                                            *
-@*  Values Returned  : None                                                   *
-@*                                                                            *
-@*  Register Usage   : R0 - R14                                               *
-@*                                                                            *
-@*  Stack Usage      : 104 Bytes                                              *
-@*                                                                            *
-@*  Interruptibility : Interruptible                                          *
-@*                                                                            *
-@*  Known Limitations                                                         *
-@*       Assumptions: Image Width:     Assumed to be multiple of 16 and       *
-@*                     greater than or equal to 16                            *
-@*                     Image Height:    Assumed to be even.                   *
-@*                                                                            *
-@*  Revision History :                                                        *
-@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
-@*         07 06 2010   Varshita        Draft                                 *
-@*         07 06 2010   Naveen Kr T     Completed                             *
-@*         05 08 2013   Naveen K P      Modified for HEVC                     *
-@*         30 10 2018   Saurabh Sood    Store D registers to stack            *
-@*****************************************************************************/
-    .global ihevcd_fmt_conv_420sp_to_rgba8888_a9q
-.type ihevcd_fmt_conv_420sp_to_rgba8888_a9q, function
-ihevcd_fmt_conv_420sp_to_rgba8888_a9q:
-
-    @// push the registers on the stack
-    STMFD       SP!,{R4-R12,LR}
-    VPUSH       {d8-d15}
-
-    @//R0 - Y PTR
-    @//R1 - UV PTR
-    @//R2 - RGB PTR
-    @//R3 - RGB PTR
-    @//R4 - PIC WIDTH
-    @//R5 - PIC HT
-    @//R6 - STRIDE Y
-    @//R7 - STRIDE U
-    @//R8 - STRIDE V
-    @//R9 - STRIDE RGB
-
-    @//ONE ROW PROCESSING AT A TIME
-
-    @//THE FOUR CONSTANTS ARE:
-    @//C1=0x3311,C2=0xF379,C3=0xE5F8,C4=0x4092
-
-    @PLD        [R0]
-    @PLD        [R1]
-    @PLD        [R2]
-
-
-    @/* can be loaded from a defined const type */
-    MOVW        R10,#0x3311
-    VMOV.16     D0[0],R10                   @//C1
-
-    MOVW        R10,#0xF379
-    VMOV.16     D0[1],R10                   @//C2
-
-    MOVW        R10,#0xE5F8
-    VMOV.16     D0[2],R10                   @//C3
-
-    MOVW        R10,#0x4092
-    VMOV.16     D0[3],R10                   @//C4
-
-    @//LOAD CONSTANT 128 INTO A CORTEX REGISTER
-    MOV         R10,#128
-    VDUP.8      D1,R10
-
-    @//D0 HAS C1-C2-C3-C4
-    @// load other parameters from stack
-    LDR         R5,[sp,#104]
-    @LDR  R4,[sp,#44]
-    LDR         R6,[sp,#108]
-    LDR         R7,[sp,#112]
-    @LDR  R8,[sp,#52]
-    LDR         R9,[sp,#116]
-
-    @// calculate offsets, offset = stride - width
-    SUB         R10,R6,R3                   @// luma offset
-    SUB         R11,R7,R3
-    @, LSR #1   @// u offset
-    @SUB     R12,R8,R3, LSR #1  @// v offset
-    SUB         R14,R9,R3                   @// rgb offset in pixels
-
-    @// calculate height loop count
-    MOV         R5,R5, LSR #1               @// height_cnt = height / 16
-
-    @// create next row pointers for rgb and luma data
-    ADD         R7,R0,R6                    @// luma_next_row = luma + luma_stride
-    ADD         R8,R2,R9,LSL #2             @// rgb_next_row = rgb + rgb_stride
-
-LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:
-
-    @//LOAD VALUES OF U&V AND COMPUTE THE R,G,B WEIGHT VALUES.
-    VLD1.8      {D2,D3},[R1]!               @//LOAD 8 VALUES OF UV
-    @//VLD1.8 {D3},[R2]!            @//LOAD 8 VALUES OF V
-
-    @// calculate width loop count
-    MOV         R6,R3, LSR #4               @// width_cnt = width / 16
-
-    @//COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
-    @//LOAD VALUES OF Y 8-BIT VALUES
-    VLD2.8      {D30,D31},[R0]!             @//D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
-                                            @//D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
-    VLD2.8      {D28,D29},[R7]!             @//D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
-                                            @//D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
-
-    SUBS        R6,R6,#1
-    BEQ         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP
-
-LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
-    @VMOV.I8 Q1,#128
-    VUZP.8      D2,D3
-
-
-    @//NEED TO SUBTRACT (U-128) AND (V-128)
-    @//(D2-D1),(D3-D1)
-    VSUBL.U8    Q2,D2,D1                    @//(U-128)
-    VSUBL.U8    Q3,D3,D1                    @//(V-128)
-
-    @//LOAD VALUES OF U&V for next row
-    VLD1.8      {D2,D3},[R1]!               @//LOAD 8 VALUES OF U
-    @//VLD1.8 {D3},[R2]!            @//LOAD 8 VALUES OF V
-
-    @PLD        [R0]
-    PLD         [R1]
-
-    @//NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
-    VMULL.S16   Q4,D4,D0[3]                 @//(U-128)*C4 FOR B
-    VMULL.S16   Q5,D5,D0[3]                 @//(U-128)*C4 FOR B
-
-    VMULL.S16   Q10,D6,D0[0]                @//(V-128)*C1 FOR R
-    VMULL.S16   Q11,D7,D0[0]                @//(V-128)*C1 FOR R
-
-    VMULL.S16   Q6,D4,D0[1]                 @//(U-128)*C2 FOR G
-    VMLAL.S16   Q6,D6,D0[2]                 @//Q6 = (U-128)*C2 + (V-128)*C3
-    VMULL.S16   Q7,D5,D0[1]                 @//(U-128)*C2 FOR G
-    VMLAL.S16   Q7,D7,D0[2]                 @//Q7 = (U-128)*C2 + (V-128)*C3
-
-    @//NARROW RIGHT SHIFT BY 13 FOR R&B
-    VQSHRN.S32  D8,Q4,#13                   @//D8 = (U-128)*C4>>13 4 16-BIT VALUES
-    VQSHRN.S32  D9,Q5,#13                   @//D9 = (U-128)*C4>>13 4 16-BIT VALUES
-    @//Q4 - WEIGHT FOR B
-
-    @//NARROW RIGHT SHIFT BY 13 FOR R&B
-    VQSHRN.S32  D10,Q10,#13                 @//D10 = (V-128)*C1>>13 4 16-BIT VALUES
-    VQSHRN.S32  D11,Q11,#13                 @//D11 = (V-128)*C1>>13 4 16-BIT VALUES
-    @//Q5 - WEIGHT FOR R
-
-    @//NARROW RIGHT SHIFT BY 13 FOR G
-    VQSHRN.S32  D12,Q6,#13                  @//D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
-    VQSHRN.S32  D13,Q7,#13                  @//D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
-    @//Q6 - WEIGHT FOR G
-
-    VADDW.U8    Q7,Q4,D30                   @//Q7 - HAS Y + B
-    VADDW.U8    Q8,Q5,D30                   @//Q8 - HAS Y + R
-    VADDW.U8    Q9,Q6,D30                   @//Q9 - HAS Y + G
-
-    VADDW.U8    Q10,Q4,D31                  @//Q10 - HAS Y + B
-    VADDW.U8    Q11,Q5,D31                  @//Q11 - HAS Y + R
-    VADDW.U8    Q12,Q6,D31                  @//Q12 - HAS Y + G
-
-    VQMOVUN.S16 D14,Q7
-    VQMOVUN.S16 D15,Q9
-    VQMOVUN.S16 D16,Q8
-    VMOV.I8     D17,#0
-
-    VZIP.8      D14,D15
-    VZIP.8      D16,D17
-    VZIP.16     Q7,Q8
-
-
-    VQMOVUN.S16 D20,Q10
-    VQMOVUN.S16 D21,Q12
-    VQMOVUN.S16 D22,Q11
-    VMOV.I8     D23,#0
-
-    VZIP.8      D20,D21
-    VZIP.8      D22,D23
-    VZIP.16     Q10,Q11
-
-    VZIP.32     Q7,Q10
-    VZIP.32     Q8,Q11
-
-    VST1.32     D14,[R2]!
-    VST1.32     D15,[R2]!
-    VST1.32     D20,[R2]!
-    VST1.32     D21,[R2]!
-    VST1.32     D16,[R2]!
-    VST1.32     D17,[R2]!
-    VST1.32     D22,[R2]!
-    VST1.32     D23,[R2]!
-
-    @//D14-D20 - TOALLY HAVE 16 VALUES
-    @//WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS
-    VADDW.U8    Q7,Q4,D28                   @//Q7 - HAS Y + B
-    VADDW.U8    Q8,Q5,D28                   @//Q2 - HAS Y + R
-    VADDW.U8    Q9,Q6,D28                   @//Q3 - HAS Y + G
-
-    VADDW.U8    Q10,Q4,D29                  @//Q10 - HAS Y + B
-    VADDW.U8    Q11,Q5,D29                  @//Q11 - HAS Y + R
-    VADDW.U8    Q12,Q6,D29                  @//Q12 - HAS Y + G
-
-    @//COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
-    @//LOAD VALUES OF Y 8-BIT VALUES
-    VLD2.8      {D30,D31},[R0]!             @//D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
-                                            @//D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
-    VLD2.8      {D28,D29},[R7]!             @//D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
-                                            @//D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
-
-    PLD         [R0]
-    PLD         [R7]
-
-    VQMOVUN.S16 D14,Q7
-    VQMOVUN.S16 D15,Q9
-    VQMOVUN.S16 D16,Q8
-    VMOV.I8     D17,#0
-
-    VZIP.8      D14,D15
-    VZIP.8      D16,D17
-    VZIP.16     Q7,Q8
-
-
-    VQMOVUN.S16 D20,Q10
-    VQMOVUN.S16 D21,Q12
-    VQMOVUN.S16 D22,Q11
-    VMOV.I8     D23,#0
-
-    VZIP.8      D20,D21
-    VZIP.8      D22,D23
-    VZIP.16     Q10,Q11
-
-    VZIP.32     Q7,Q10
-    VZIP.32     Q8,Q11
-
-    VST1.32     D14,[R8]!
-    VST1.32     D15,[R8]!
-    VST1.32     D20,[R8]!
-    VST1.32     D21,[R8]!
-    VST1.32     D16,[R8]!
-    VST1.32     D17,[R8]!
-    VST1.32     D22,[R8]!
-    VST1.32     D23,[R8]!
-
-    SUBS        R6,R6,#1                    @// width_cnt -= 1
-    BNE         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP
-
-LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
-    @VMOV.I8 Q1,#128
-    VUZP.8      D2,D3
-
-
-    @//NEED TO SUBTRACT (U-128) AND (V-128)
-    @//(D2-D1),(D3-D1)
-    VSUBL.U8    Q2,D2,D1                    @//(U-128)
-    VSUBL.U8    Q3,D3,D1                    @//(V-128)
-
-
-    @//NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
-    VMULL.S16   Q4,D4,D0[3]                 @//(U-128)*C4 FOR B
-    VMULL.S16   Q5,D5,D0[3]                 @//(U-128)*C4 FOR B
-
-    VMULL.S16   Q10,D6,D0[0]                @//(V-128)*C1 FOR R
-    VMULL.S16   Q11,D7,D0[0]                @//(V-128)*C1 FOR R
-
-    VMULL.S16   Q6,D4,D0[1]                 @//(U-128)*C2 FOR G
-    VMLAL.S16   Q6,D6,D0[2]                 @//Q6 = (U-128)*C2 + (V-128)*C3
-    VMULL.S16   Q7,D5,D0[1]                 @//(U-128)*C2 FOR G
-    VMLAL.S16   Q7,D7,D0[2]                 @//Q7 = (U-128)*C2 + (V-128)*C3
-
-    @//NARROW RIGHT SHIFT BY 13 FOR R&B
-    VQSHRN.S32  D8,Q4,#13                   @//D8 = (U-128)*C4>>13 4 16-BIT VALUES
-    VQSHRN.S32  D9,Q5,#13                   @//D9 = (U-128)*C4>>13 4 16-BIT VALUES
-    @//Q4 - WEIGHT FOR B
-
-    @//NARROW RIGHT SHIFT BY 13 FOR R&B
-    VQSHRN.S32  D10,Q10,#13                 @//D10 = (V-128)*C1>>13 4 16-BIT VALUES
-    VQSHRN.S32  D11,Q11,#13                 @//D11 = (V-128)*C1>>13 4 16-BIT VALUES
-    @//Q5 - WEIGHT FOR R
-
-    @//NARROW RIGHT SHIFT BY 13 FOR G
-    VQSHRN.S32  D12,Q6,#13                  @//D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
-    VQSHRN.S32  D13,Q7,#13                  @//D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
-    @//Q6 - WEIGHT FOR G
-
-    VADDW.U8    Q7,Q4,D30                   @//Q7 - HAS Y + B
-    VADDW.U8    Q8,Q5,D30                   @//Q8 - HAS Y + R
-    VADDW.U8    Q9,Q6,D30                   @//Q9 - HAS Y + G
-
-    VADDW.U8    Q10,Q4,D31                  @//Q10 - HAS Y + B
-    VADDW.U8    Q11,Q5,D31                  @//Q11 - HAS Y + R
-    VADDW.U8    Q12,Q6,D31                  @//Q12 - HAS Y + G
-
-    VQMOVUN.S16 D14,Q7
-    VQMOVUN.S16 D15,Q9
-    VQMOVUN.S16 D16,Q8
-    VMOV.I8     D17,#0
-
-    VZIP.8      D14,D15
-    VZIP.8      D16,D17
-    VZIP.16     Q7,Q8
-
-
-    VQMOVUN.S16 D20,Q10
-    VQMOVUN.S16 D21,Q12
-    VQMOVUN.S16 D22,Q11
-    VMOV.I8     D23,#0
-
-    VZIP.8      D20,D21
-    VZIP.8      D22,D23
-    VZIP.16     Q10,Q11
-
-    VZIP.32     Q7,Q10
-    VZIP.32     Q8,Q11
-
-    VST1.32     D14,[R2]!
-    VST1.32     D15,[R2]!
-    VST1.32     D20,[R2]!
-    VST1.32     D21,[R2]!
-    VST1.32     D16,[R2]!
-    VST1.32     D17,[R2]!
-    VST1.32     D22,[R2]!
-    VST1.32     D23,[R2]!
-
-    @//D14-D20 - TOALLY HAVE 16 VALUES
-    @//WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS
-    VADDW.U8    Q7,Q4,D28                   @//Q7 - HAS Y + B
-    VADDW.U8    Q8,Q5,D28                   @//Q2 - HAS Y + R
-    VADDW.U8    Q9,Q6,D28                   @//Q3 - HAS Y + G
-
-    VADDW.U8    Q10,Q4,D29                  @//Q10 - HAS Y + B
-    VADDW.U8    Q11,Q5,D29                  @//Q11 - HAS Y + R
-    VADDW.U8    Q12,Q6,D29                  @//Q12 - HAS Y + G
-
-
-    VQMOVUN.S16 D14,Q7
-    VQMOVUN.S16 D15,Q9
-    VQMOVUN.S16 D16,Q8
-    VMOV.I8     D17,#0
-
-    VZIP.8      D14,D15
-    VZIP.8      D16,D17
-    VZIP.16     Q7,Q8
-
-
-    VQMOVUN.S16 D20,Q10
-    VQMOVUN.S16 D21,Q12
-    VQMOVUN.S16 D22,Q11
-    VMOV.I8     D23,#0
-
-    VZIP.8      D20,D21
-    VZIP.8      D22,D23
-    VZIP.16     Q10,Q11
-
-    VZIP.32     Q7,Q10
-    VZIP.32     Q8,Q11
-
-    VST1.32     D14,[R8]!
-    VST1.32     D15,[R8]!
-    VST1.32     D20,[R8]!
-    VST1.32     D21,[R8]!
-    VST1.32     D16,[R8]!
-    VST1.32     D17,[R8]!
-    VST1.32     D22,[R8]!
-    VST1.32     D23,[R8]!
-
-    @// Adjust the address pointers
-    ADD         R0,R7,R10                   @// luma = luma_next + offset
-    ADD         R2,R8,R14,LSL #2            @// rgb = rgb_next + offset
-
-    ADD         R7,R0,R3                    @// luma_next = luma + width
-    ADD         R8,R2,R3,LSL #2             @// rgb_next_row = rgb + width
-
-    ADD         R1,R1,R11                   @// adjust u pointer
-    @ADD        R2,R2,R12           @// adjust v pointer
-
-    ADD         R7,R7,R10                   @// luma_next = luma + width + offset (because of register crunch)
-    ADD         R8,R8,R14,LSL #2            @// rgb_next_row = rgb + width + offset
-
-    SUBS        R5,R5,#1                    @// height_cnt -= 1
-
-    BNE         LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP
-
-    @//POP THE REGISTERS
-    VPOP        {d8-d15}
-    LDMFD       SP!,{R4-R12,PC}
-
-
-    .section .note.GNU-stack,"",%progbits
--- a/decoder/arm/ihevcd_function_selector.c
+++ b/decoder/arm/ihevcd_function_selector.c
@ -58,10 +58,6 @@
 #include "ihevcd_function_selector.h"
 #include "ihevcd_structs.h"

-void ihevcd_init_function_ptr_neonintr(codec_t *ps_codec);
-void ihevcd_init_function_ptr_noneon(codec_t *ps_codec);
-void ihevcd_init_function_ptr_a9q(codec_t *ps_codec);
-void ihevcd_init_function_ptr_av8(codec_t *ps_codec);
 void ihevcd_init_function_ptr(void *pv_codec)
 {
    codec_t *ps_codec = (codec_t *)pv_codec;
@ -71,11 +67,11 @@ void ihevcd_init_function_ptr(void *pv_codec)
    {
 #ifndef DISABLE_NEONINTR
        case ARCH_ARM_NEONINTR:
-            ihevcd_init_function_ptr_neonintr(ps_codec);
+            ihevcd_init_function_ptr_neonintr(&ps_codec->s_func_selector);
            break;
 #endif
        case ARCH_ARM_NONEON:
-            ihevcd_init_function_ptr_noneon(ps_codec);
+            ihevcd_init_function_ptr_noneon(&ps_codec->s_func_selector);
            break;
        default:
        case ARCH_ARM_A5:
@ -84,9 +80,9 @@ void ihevcd_init_function_ptr(void *pv_codec)
        case ARCH_ARM_A15:
        case ARCH_ARM_A9Q:
 #ifndef DISABLE_NEON
-            ihevcd_init_function_ptr_a9q(ps_codec);
+            ihevcd_init_function_ptr_a9q(&ps_codec->s_func_selector);
 #else
-            ihevcd_init_function_ptr_noneon(ps_codec);
+            ihevcd_init_function_ptr_noneon(&ps_codec->s_func_selector);
 #endif
            break;
    }
@ -106,12 +102,17 @@ void ihevcd_init_function_ptr(void *pv_codec)
    switch(ps_codec->e_processor_arch)
    {
        case ARCH_ARM_NONEON:
-            ihevcd_init_function_ptr_noneon(ps_codec);
+            ihevcd_init_function_ptr_noneon(&ps_codec->s_func_selector);
            break;
        case ARCH_ARMV8_GENERIC:
        default:
-            ihevcd_init_function_ptr_av8(ps_codec);
+#ifdef DARWIN
+            ihevcd_init_function_ptr_noneon(&ps_codec->s_func_selector);
            break;
+#else
+            ihevcd_init_function_ptr_av8(&ps_codec->s_func_selector);
+            break;
+#endif
    }
 #endif
 }
--- a/Show more
+++ b/Show more