avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for me_cmp functions

This patch adds MSA (MIPS-SIMD-Arch) optimizations for the me_cmp functions in the new file me_cmp_msa.c.

Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Shivraj Patil 2015-06-29 20:57:14 +05:30 committed by Michael Niedermayer
parent 2f3f98af2b
commit 709bb45c66
7 changed files with 866 additions and 0 deletions

View file

@ -1295,6 +1295,29 @@
/* Fixed-type aliases of HSUB_UB4: forward all arguments and pass v8u16 /
   v8i16 as the first argument (presumably the result vector type - confirm
   against the HSUB_UB4 definition). */
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
/* Description : SAD (Sum of Absolute Differences) of two source/reference
                 vector pairs
   Arguments   : Inputs  - in0, in1   (source vectors, reinterpreted as v16u8)
                           ref0, ref1 (reference vectors, reinterpreted as
                                       v16u8)
                 Outputs - value of the statement expression (sad_m)
                 Return Type - v8u16 (8 unsigned halfwords)
   Details     : The byte-wise absolute differences of 'in0' with 'ref0' and
                 of 'in1' with 'ref1' are computed with __msa_asub_u_b and
                 kept in 'diff0_m' and 'diff1_m'. Each difference vector is
                 passed as both operands of __msa_hadd_u_h, so that the
                 even-odd byte pairs are added together to generate 8
                 halfword results. The two halfword vectors are then summed.
                 The result is a vector of 8 partial sums; no horizontal
                 reduction to a scalar is performed here.
   Note        : Implemented as a GNU statement expression. Each argument is
                 expanded exactly once and is parenthesised before the cast,
                 so arbitrary expressions may be passed safely.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                          \
( {                                                               \
    v16u8 diff0_m, diff1_m;                                       \
    v8u16 sad_m;                                                  \
                                                                  \
    diff0_m = __msa_asub_u_b((v16u8) (in0), (v16u8) (ref0));      \
    diff1_m = __msa_asub_u_b((v16u8) (in1), (v16u8) (ref1));      \
                                                                  \
    /* pairwise byte sums of each difference vector, accumulated */ \
    sad_m  = __msa_hadd_u_h(diff0_m, diff0_m);                    \
    sad_m += __msa_hadd_u_h(diff1_m, diff1_m);                    \
                                                                  \
    sad_m;                                                        \
} )
/* Description : Insert specified word elements from input vectors to 1
destination vector
Arguments : Inputs - in0, in1, in2, in3 (4 input vectors)
@ -2429,6 +2452,42 @@
}
/* Fixed-type aliases of TRANSPOSE8x8_UB: forward all arguments and pass
   v16u8 / v8u16 as the first argument (presumably the output vector type -
   confirm against the TRANSPOSE8x8_UB definition). */
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                           (16 input vectors, one per row of the block;
                            presumably only the first 4 bytes of each are
                            consumed - TODO confirm against ILVEV_W2_SD)
                 Outputs - out0, out1, out2, out3 (must be assignable
                           lvalues; each is cast to v16u8 on assignment)
                 Return Type - unsigned byte
   Details     : Stage 1 gathers the inputs in four groups with a stride of
                 four: (in0, in4, in8, in12), (in1, in5, in9, in13),
                 (in2, in6, in10, in14) and (in3, in7, in11, in15). Each
                 group goes through ILVEV_W2_SD followed by __msa_ilvev_d
                 and ends up in one vector: out1, out3, tmp2_m and tmp3_m
                 respectively.
                 Stage 2 combines those four vectors. ILVEV_B2_SD followed by
                 __msa_ilvev_h / __msa_ilvod_h produces out0 / out2, and
                 __msa_ilvod_b followed by __msa_ilvev_h / __msa_ilvod_h
                 produces out1 / out3.
   Notes       : ILVEV_W2_SD and ILVEV_B2_SD are defined elsewhere in this
                 file; going by their names they interleave even words / even
                 bytes and yield v2i64 results - confirm there.
                 out1 and out3 are written as scratch before the remaining
                 inputs are read, so the outputs must not alias any input.
                 The expansion is a bare brace block (a statement, not an
                 expression) and the arguments are not parenthesised, so pass
                 plain variables only.
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
in8, in9, in10, in11, in12, in13, in14, in15, \
out0, out1, out2, out3) \
{ \
v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
/* gather in0, in4, in8, in12 into out1 (used as scratch here) */ \
ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); \
out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
/* gather in1, in5, in9, in13 into out3 (used as scratch here) */ \
ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \
out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
/* gather in2, in6, in10, in14 into tmp2_m */ \
ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \
\
tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \
/* in3, in7, in11, in15 were interleaved above; combine into tmp3_m */ \
tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
/* odd bytes of the gathered vectors; scratch out1/out3 get final values */ \
tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1); \
tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
}
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
in8, in9, in10, in11, in12, in13, in14, in15