lavc/x86/videodsp: Drop MMX usage
Remove the MMX versions of these functions and modify the SSE implementations to avoid using MMX registers. Signed-off-by: Frank Plowman <post@frankplowman.com> Signed-off-by: Zhao Zhili <zhilizhao@tencent.com> Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
parent
4de67e8746
commit
1e3dc705df
2 changed files with 59 additions and 90 deletions
|
|
@ -123,54 +123,43 @@ hvar_fn
|
|||
; - if (%2 & 8) fills 8 bytes into xmm$next
|
||||
; - if (%2 & 4) fills 4 bytes into xmm$next
|
||||
; - if (%2 & 3) fills 1, 2 or 4 bytes in eax
|
||||
; on mmx, - fills mm0-7 for consecutive sets of 8 pixels
|
||||
; - if (%2 & 4) fills 4 bytes into mm$next
|
||||
; - if (%2 & 3) fills 1, 2 or 4 bytes in eax
|
||||
; writing data out is in the same way
|
||||
%macro READ_NUM_BYTES 2
|
||||
%assign %%off 0 ; offset in source buffer
|
||||
%assign %%mmx_idx 0 ; mmx register index
|
||||
%assign %%xmm_idx 0 ; xmm register index
|
||||
|
||||
%rep %2/mmsize
|
||||
%if mmsize == 16
|
||||
movu xmm %+ %%xmm_idx, [srcq+%%off]
|
||||
%assign %%xmm_idx %%xmm_idx+1
|
||||
%else ; mmx
|
||||
movu mm %+ %%mmx_idx, [srcq+%%off]
|
||||
%assign %%mmx_idx %%mmx_idx+1
|
||||
%endif
|
||||
%assign %%off %%off+mmsize
|
||||
%endrep ; %2/mmsize
|
||||
|
||||
%if mmsize == 16
|
||||
%if (%2-%%off) >= 8
|
||||
%if %2 > 16 && (%2-%%off) > 8
|
||||
movu xmm %+ %%xmm_idx, [srcq+%2-16]
|
||||
%assign %%xmm_idx %%xmm_idx+1
|
||||
%assign %%off %2
|
||||
%else
|
||||
movq mm %+ %%mmx_idx, [srcq+%%off]
|
||||
%assign %%mmx_idx %%mmx_idx+1
|
||||
movq xmm %+ %%xmm_idx, [srcq+%%off]
|
||||
%assign %%xmm_idx %%xmm_idx+1
|
||||
%assign %%off %%off+8
|
||||
%endif
|
||||
%endif ; (%2-%%off) >= 8
|
||||
%endif
|
||||
|
||||
%if (%2-%%off) >= 4
|
||||
%if %2 > 8 && (%2-%%off) > 4
|
||||
movq mm %+ %%mmx_idx, [srcq+%2-8]
|
||||
movq xmm %+ %%xmm_idx, [srcq+%2-8]
|
||||
%assign %%off %2
|
||||
%else
|
||||
movd mm %+ %%mmx_idx, [srcq+%%off]
|
||||
movd xmm %+ %%xmm_idx, [srcq+%%off]
|
||||
%assign %%off %%off+4
|
||||
%endif
|
||||
%assign %%mmx_idx %%mmx_idx+1
|
||||
%assign %%xmm_idx %%xmm_idx+1
|
||||
%endif ; (%2-%%off) >= 4
|
||||
|
||||
%if (%2-%%off) >= 1
|
||||
%if %2 >= 4
|
||||
movd mm %+ %%mmx_idx, [srcq+%2-4]
|
||||
movd xmm %+ %%xmm_idx, [srcq+%2-4]
|
||||
%elif (%2-%%off) == 1
|
||||
mov valb, [srcq+%2-1]
|
||||
%elif (%2-%%off) == 2
|
||||
|
|
@ -185,48 +174,40 @@ hvar_fn
|
|||
|
||||
%macro WRITE_NUM_BYTES 2
|
||||
%assign %%off 0 ; offset in destination buffer
|
||||
%assign %%mmx_idx 0 ; mmx register index
|
||||
%assign %%xmm_idx 0 ; xmm register index
|
||||
|
||||
%rep %2/mmsize
|
||||
%if mmsize == 16
|
||||
movu [dstq+%%off], xmm %+ %%xmm_idx
|
||||
%assign %%xmm_idx %%xmm_idx+1
|
||||
%else ; mmx
|
||||
movu [dstq+%%off], mm %+ %%mmx_idx
|
||||
%assign %%mmx_idx %%mmx_idx+1
|
||||
%endif
|
||||
%assign %%off %%off+mmsize
|
||||
%endrep ; %2/mmsize
|
||||
|
||||
%if mmsize == 16
|
||||
%if (%2-%%off) >= 8
|
||||
%if %2 > 16 && (%2-%%off) > 8
|
||||
movu [dstq+%2-16], xmm %+ %%xmm_idx
|
||||
%assign %%xmm_idx %%xmm_idx+1
|
||||
%assign %%off %2
|
||||
%else
|
||||
movq [dstq+%%off], mm %+ %%mmx_idx
|
||||
%assign %%mmx_idx %%mmx_idx+1
|
||||
movq [dstq+%%off], xmm %+ %%xmm_idx
|
||||
%assign %%xmm_idx %%xmm_idx+1
|
||||
%assign %%off %%off+8
|
||||
%endif
|
||||
%endif ; (%2-%%off) >= 8
|
||||
%endif
|
||||
|
||||
%if (%2-%%off) >= 4
|
||||
%if %2 > 8 && (%2-%%off) > 4
|
||||
movq [dstq+%2-8], mm %+ %%mmx_idx
|
||||
movq [dstq+%2-8], xmm %+ %%xmm_idx
|
||||
%assign %%off %2
|
||||
%else
|
||||
movd [dstq+%%off], mm %+ %%mmx_idx
|
||||
movd [dstq+%%off], xmm %+ %%xmm_idx
|
||||
%assign %%off %%off+4
|
||||
%endif
|
||||
%assign %%mmx_idx %%mmx_idx+1
|
||||
%assign %%xmm_idx %%xmm_idx+1
|
||||
%endif ; (%2-%%off) >= 4
|
||||
|
||||
%if (%2-%%off) >= 1
|
||||
%if %2 >= 4
|
||||
movd [dstq+%2-4], mm %+ %%mmx_idx
|
||||
movd [dstq+%2-4], xmm %+ %%xmm_idx
|
||||
%elif (%2-%%off) == 1
|
||||
mov [dstq+%2-1], valb
|
||||
%elif (%2-%%off) == 2
|
||||
|
|
@ -318,11 +299,8 @@ cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
|
|||
%endrep ; 1+%2-%1
|
||||
%endmacro ; VERTICAL_EXTEND
|
||||
|
||||
INIT_MMX mmx
|
||||
VERTICAL_EXTEND 1, 15
|
||||
|
||||
INIT_XMM sse
|
||||
VERTICAL_EXTEND 16, 22
|
||||
INIT_XMM sse2
|
||||
VERTICAL_EXTEND 1, 22
|
||||
|
||||
; left/right (horizontal) fast extend functions
|
||||
; these are essentially identical to the vertical extend ones above,
|
||||
|
|
@ -337,11 +315,7 @@ VERTICAL_EXTEND 16, 22
|
|||
imul vald, 0x01010101
|
||||
%if %1 >= 8
|
||||
movd m0, vald
|
||||
%if mmsize == 16
|
||||
pshufd m0, m0, q0000
|
||||
%else
|
||||
punpckldq m0, m0
|
||||
%endif ; mmsize == 16
|
||||
%endif ; %1 > 16
|
||||
%endif ; avx2
|
||||
%endmacro ; READ_V_PIXEL
|
||||
|
|
@ -356,7 +330,6 @@ VERTICAL_EXTEND 16, 22
|
|||
%assign %%off %%off+mmsize
|
||||
%endrep ; %1/mmsize
|
||||
|
||||
%if mmsize == 16
|
||||
%if %1-%%off >= 8
|
||||
%if %1 > 16 && %1-%%off > 8
|
||||
movu [%2+%1-16], m0
|
||||
|
|
@ -366,7 +339,6 @@ VERTICAL_EXTEND 16, 22
|
|||
%assign %%off %%off+8
|
||||
%endif
|
||||
%endif ; %1-%%off >= 8
|
||||
%endif ; mmsize == 16
|
||||
|
||||
%if %1-%%off >= 4
|
||||
%if %1 > 8 && %1-%%off > 4
|
||||
|
|
@ -415,11 +387,8 @@ cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
|
|||
%endrep ; 1+(%2-%1)/2
|
||||
%endmacro ; H_EXTEND
|
||||
|
||||
INIT_MMX mmx
|
||||
H_EXTEND 2, 14
|
||||
|
||||
INIT_XMM sse2
|
||||
H_EXTEND 16, 22
|
||||
H_EXTEND 2, 22
|
||||
|
||||
%if HAVE_AVX2_EXTERNAL
|
||||
INIT_XMM avx2
|
||||
|
|
|
|||
|
|
@ -37,37 +37,37 @@ typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
|
|||
x86_reg start_y, x86_reg end_y, x86_reg bh,
|
||||
x86_reg w);
|
||||
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix16_sse;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix17_sse;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix18_sse;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
|
||||
static emu_edge_vfix_func * const vfixtbl_sse[22] = {
|
||||
ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx,
|
||||
ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx,
|
||||
ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx,
|
||||
ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
|
||||
ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
|
||||
ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse,
|
||||
ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse,
|
||||
ff_emu_edge_vfix22_sse
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix1_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix2_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix3_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix4_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix5_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix6_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix7_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix8_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix9_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix10_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix11_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix12_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix13_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix14_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix15_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix16_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix17_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix18_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix19_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix20_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix21_sse2;
|
||||
extern emu_edge_vfix_func ff_emu_edge_vfix22_sse2;
|
||||
static emu_edge_vfix_func * const vfixtbl_sse2[22] = {
|
||||
ff_emu_edge_vfix1_sse2, ff_emu_edge_vfix2_sse2, ff_emu_edge_vfix3_sse2,
|
||||
ff_emu_edge_vfix4_sse2, ff_emu_edge_vfix5_sse2, ff_emu_edge_vfix6_sse2,
|
||||
ff_emu_edge_vfix7_sse2, ff_emu_edge_vfix8_sse2, ff_emu_edge_vfix9_sse2,
|
||||
ff_emu_edge_vfix10_sse2, ff_emu_edge_vfix11_sse2, ff_emu_edge_vfix12_sse2,
|
||||
ff_emu_edge_vfix13_sse2, ff_emu_edge_vfix14_sse2, ff_emu_edge_vfix15_sse2,
|
||||
ff_emu_edge_vfix16_sse2, ff_emu_edge_vfix17_sse2, ff_emu_edge_vfix18_sse2,
|
||||
ff_emu_edge_vfix19_sse2, ff_emu_edge_vfix20_sse2, ff_emu_edge_vfix21_sse2,
|
||||
ff_emu_edge_vfix22_sse2
|
||||
};
|
||||
extern emu_edge_vvar_func ff_emu_edge_vvar_sse;
|
||||
|
||||
|
|
@ -76,21 +76,21 @@ typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride,
|
|||
typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride,
|
||||
x86_reg start_x, x86_reg n_words, x86_reg bh);
|
||||
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix2_sse2;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix4_sse2;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix6_sse2;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix8_sse2;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix10_sse2;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix12_sse2;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix14_sse2;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
|
||||
static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
|
||||
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
|
||||
ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
|
||||
ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
|
||||
ff_emu_edge_hfix2_sse2, ff_emu_edge_hfix4_sse2, ff_emu_edge_hfix6_sse2,
|
||||
ff_emu_edge_hfix8_sse2, ff_emu_edge_hfix10_sse2, ff_emu_edge_hfix12_sse2,
|
||||
ff_emu_edge_hfix14_sse2, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
|
||||
ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
|
||||
};
|
||||
extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
|
||||
|
|
@ -104,7 +104,7 @@ extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
|
|||
extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
|
||||
extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
|
||||
static emu_edge_hfix_func * const hfixtbl_avx2[11] = {
|
||||
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
|
||||
ff_emu_edge_hfix2_sse2, ff_emu_edge_hfix4_sse2, ff_emu_edge_hfix6_sse2,
|
||||
ff_emu_edge_hfix8_avx2, ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2,
|
||||
ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2,
|
||||
ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
|
||||
|
|
@ -196,7 +196,7 @@ static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
|
|||
int h)
|
||||
{
|
||||
emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
|
||||
src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
|
||||
src_x, src_y, w, h, vfixtbl_sse2, &ff_emu_edge_vvar_sse,
|
||||
hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
|
||||
}
|
||||
|
||||
|
|
@ -209,7 +209,7 @@ static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
|
|||
int h)
|
||||
{
|
||||
emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
|
||||
src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
|
||||
src_x, src_y, w, h, vfixtbl_sse2, &ff_emu_edge_vvar_sse,
|
||||
hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
|
||||
}
|
||||
#endif /* HAVE_AVX2_EXTERNAL */
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue