libxaac/decoder/armv8/ixheaacd_cos_sin_mod_loop2.s
Rajat Kumar 91227e2f61 LDR changed to MOV instruction in armv8 and armv7 assemblies
Removes memory accesses to constant values and replaces
them with mov instructions. This would allow this library
to be compatible with execute-only memory layouts.

Bug: 124323156
Test: poc
Change-Id: I801d4cfff953b9d13b80a97be04a3223117df29f
2019-03-27 15:49:19 -07:00

213 lines
6.2 KiB
ArmAsm

//-----------------------------------------------------------------------
// push_v_regs / pop_v_regs
//
// Matched prologue/epilogue macros. push_v_regs spills registers to the
// stack with pre-indexed stores; pop_v_regs reloads them in exactly the
// reverse order with post-indexed loads. The two lists must be kept in
// sync: any change to one has to be mirrored in the other.
//
// Saved set:
//   q8-q15            (AAPCS64 requires the low 64 bits of v8-v15 to be
//                      preserved; the full 128 bits are saved here)
//   x8-x17            (caller-saved under AAPCS64; kept for parity with
//                      the original macro)
//   x19-x22, x29, x30 (callee-saved, frame pointer, link register)
//
// x18 is deliberately NOT saved or restored. AAPCS64 designates it the
// platform register; on platforms that reserve it, assembly must neither
// read it into memory nor write it back.
//
// Stack usage: 4*32 + 8*16 = 256 bytes. Every adjustment is a multiple of
// 16, so sp keeps its 16-byte alignment at each access.
//-----------------------------------------------------------------------
.macro push_v_regs
stp q8, q9, [sp, #-32]!
stp q10, q11, [sp, #-32]!
stp q12, q13, [sp, #-32]!
stp q14, q15, [sp, #-32]!
stp x8, x9, [sp, #-16]!
stp x10, x11, [sp, #-16]!
stp x12, x13, [sp, #-16]!
stp x14, x15, [sp, #-16]!
stp x16, x17, [sp, #-16]!
stp x19, x20, [sp, #-16]!       // x18 (platform register) is skipped
stp x21, x22, [sp, #-16]!
stp x29, x30, [sp, #-16]!
.endm
.macro pop_v_regs
ldp x29, x30, [sp], #16
ldp x21, x22, [sp], #16
ldp x19, x20, [sp], #16         // x18 (platform register) is skipped
ldp x16, x17, [sp], #16
ldp x14, x15, [sp], #16
ldp x12, x13, [sp], #16
ldp x10, x11, [sp], #16
ldp x8, x9, [sp], #16
ldp q14, q15, [sp], #32
ldp q12, q13, [sp], #32
ldp q10, q11, [sp], #32
ldp q8, q9, [sp], #32
.endm
.text
.p2align 2
.global ixheaacd_cos_sin_mod_loop2
//-----------------------------------------------------------------------
// ixheaacd_cos_sin_mod_loop2
//
// In-place rotation of two banks of 32-bit words by 16-bit coefficient
// pairs, working inwards from both ends of each bank.
//
// In:    x0 = base of the first bank of 32-bit words ("subband")
//        x1 = pointer to a table of 16-bit coefficient pairs; one pair
//             (4 bytes) is consumed before the loop and one per iteration
//        x2 = M (the banks are addressed up to word index 2*M - 1)
//        The second bank is taken to start at x0 + 256 bytes.
//        NOTE(review): presumably the C prototype is
//        void f(WORD32 *subband, WORD16 *table, WORD32 M) - confirm
//        against the caller.
// Out:   no value is placed in x0 for the caller; results are written
//        back into the two banks.
// Regs:  x0  - ascending pointer, first bank
//        x3  - descending pointer, first bank (starts at word 2*M - 1)
//        x10 - ascending pointer, second bank
//        x11 - descending pointer, second bank
//        x8  - constant -4, used as post-index stride for descending stores
//        x4  - loop counter, (M >> 1) - 1
//        v0/v1 - current coefficient pair, sign-extended to 32 bits and
//                duplicated into both lanes of the low 64 bits
// Saves: push_v_regs / pop_v_regs spill and restore the callee-saved
//        registers clobbered here (x19, x20, v8-v15) among others.
//-----------------------------------------------------------------------
ixheaacd_cos_sin_mod_loop2:
// STMFD sp!, {x4-x12, x14}
push_v_regs
//stp x19, x20,[sp,#-16]!
//VPUSH {D8-D15}
//generating load addresses
ADD x3, x0, x2, LSL #3 //psubband1 = &subband[2 * M - 1];
SUB x3, x3, #4
// x10 = start of second bank (first bank + 256 bytes),
// x11 = last word (index 2*M - 1) of the second bank
ADD x10, x0, #256
ADD x11, x10, x2, LSL #3
SUB x11, x11, #4
MOV x8, #-4 // stride for the descending post-indexed stores
// clear v0/v1 before the single-lane coefficient load below
MOV w19, #0
DUP V0.4s, w19
DUP V1.4s, w19
LDR w6, [x0]
sxtw x6, w6
ASR x4, x2, #1 //M_2 = ixheaacd_shx32(M, 1);
SUB x4, x4, #1 // loop counter = M_2 - 1
ASR x6, x6, #1 //*psubband = *psubband >> 1;
LD1 {v2.s}[0], [x3] // read *psubband1 before it is overwritten below
STR w6, [x0], #4 //psubband++;
sxtw x6, w6
// *psubband1-- = -(subband[1] >> 1)
LDR w7, [x0]
sxtw x7, w7
ASR x7, x7, #1
sub x20, x7, #0
neg x6, x20
STR w6, [x3], #-4
sxtw x6, w6
LD1 {v3.s}[0], [x3] // im = *psubband1;
// load one coefficient pair: first halfword -> v0.h[0], second -> v1.h[0];
// x1 advances by 4 bytes
LD2 {v0.h, v1.h}[0], [x1], #4
// sign-extend to 32 bits and replicate into both .2s lanes
sxtl v0.4s, v0.4h
sxtl v1.4s, v1.4h
dup v0.2s, v0.s[0]
dup v1.2s, v1.s[0]
LD1 {v2.s}[1], [x11] //re = *psubband12;
// LDR w6, [x10]
// sxtw x6,w6
// ASR x7, x6, #1
// MOV x9, #0
// QSUB x7, x9, x7
// second bank: last word = saturating (0 - (first word >> 1))
LD1 {v4.s}[0], [x10]
SSHR v4.2s, v4.2s, #1
MOV x9, #0
DUP v6.2s, w9
SQSUB v4.2s, v6.2s, v4.2s
ST1 {v4.s}[0], [x11]
// str X7, [X11]
SUB x11, x11, #4
// sxtw x7,w7
// second bank: word[0] = word[1] >> 1, then advance x10 by one word
LDR w6, [x10, #4]
sxtw x6, w6
ASR x6, x6, #1
STR w6, [x10], #4
sxtw x6, w6
LD1 {v3.s}[1], [x11]
// At this point v2 = {first-bank value, second-bank value} read from the
// original x3 / x11 slots, v3 = the same from the slots one word lower.
// Each product is widened to 64 bits and shifted right by 16.
sMULL v4.2d, v0.2s, v2.2s //qsub 2nd
sshr v4.2d, v4.2d, #16
sMULL v6.2d, v0.2s, v3.2s //add 2nd
sshr v6.2d, v6.2d, #16
sMULL v8.2d, v1.2s, v2.2s //add 1st
sshr v8.2d, v8.2d, #16
sMULL v10.2d, v1.2s, v3.2s //qsub 1st
sshr v10.2d, v10.2d, #16
add v12.2d, v8.2d , v6.2d
SQSUB v14.2d, v10.2d , v4.2d
SQSUB v16.2d, v4.2d , v10.2d
// NOTE(review): the saturating subtracts operate on 64-bit lanes, but only
// the low 32 bits of each lane (.s[0] / .s[2]) are stored below - confirm
// this matches the intended 32-bit saturation of the reference code.
//shrn v12.2s, v12.2d,#32
//shrn v14.2s, v14.2d,#32
//shrn v16.2s, v16.2d,#32
// .s[0] is the low word of 64-bit lane 0 (first bank),
// .s[2] is the low word of 64-bit lane 1 (second bank)
ST1 {v12.s}[0], [x3], x8
ST1 {v14.s}[0], [x0], #4
SQNEG v12.4s, v12.4s // second bank receives the negated sum
ST1 {v12.s}[2], [x10], #4
ST1 {v16.s}[2], [x11], x8
// Main loop. The body always executes at least once; it repeats while the
// decremented counter x4 is still > 0 (signed).
LOOP1:
// first part: uses the coefficient pair already in v0/v1
LD1 {v2.2s}, [x0]
LD1 {v3.2s}, [x10]
// keep the current descending-end words; they are consumed in the second
// part, after the first-part stores have overwritten those slots
LDR w5, [x3] //RE2
sxtw x5, w5
LDR w6, [x11] //RE3
sxtw x6, w6
//VTRN.32 D2, D3
// transpose so that v2 = {x0[0], x10[0]} and v3 = {x0[1], x10[1]}
TRN1 v4.2s, v2.2s, v3.2s
TRN2 v3.2s, v2.2s, v3.2s
MOV v2.8b, v4.8b
sMULL v4.2d, v0.2s, v2.2s //qsub 2nd
sshr v4.2d, v4.2d, #16
sMULL v6.2d, v0.2s, v3.2s //add 2nd
sshr v6.2d, v6.2d, #16
sMULL v8.2d, v1.2s, v2.2s //add 1st
sshr v8.2d, v8.2d, #16
sMULL v10.2d, v1.2s, v3.2s //qsub 1st
sshr v10.2d, v10.2d, #16
add v12.2d, v8.2d , v6.2d
SQSUB v14.2d, v4.2d , v10.2d
SQSUB v16.2d, v10.2d , v4.2d
//shrn v12.2s, v12.2d,#32
//shrn v14.2s, v14.2d,#32
//shrn v16.2s, v16.2d,#32
ST1 {v12.s}[0], [x0], #4
ST1 {v14.s}[0], [x3], x8
SQNEG v12.4s, v12.4s
ST1 {v12.s}[2], [x11], x8
ST1 {v16.s}[2], [x10], #4
// clear v0/v1, then fetch the next coefficient pair
MOV w19, #0
DUP V0.4s, w19
DUP V1.4s, w19
// second part
LD2 {v0.h, v1.h}[0], [x1], #4
sxtl v0.4s, v0.4h
sxtl v1.4s, v1.4h
dup v0.2s, v0.s[0]
dup v1.2s, v1.s[0]
// v3 = {saved x3 word, saved x11 word};
// v2 = words now addressed by the already-decremented x3 / x11
mov v3.s[0], w5
mov v3.s[1], w6
LD1 {v2.s}[0], [x3]
LD1 {v2.s}[1], [x11]
sMULL v4.2d, v0.2s, v2.2s //qsub 2nd
sshr v4.2d, v4.2d, #16
sMULL v6.2d, v0.2s, v3.2s //add 2nd
sshr v6.2d, v6.2d, #16
sMULL v8.2d, v1.2s, v2.2s //add 1st
sshr v8.2d, v8.2d, #16
sMULL v10.2d, v1.2s, v3.2s //qsub 1st
sshr v10.2d, v10.2d, #16
// note: the operand pairing differs from the first part
// (sum uses v4/v10, differences use v8/v6)
add v12.2d, v4.2d , v10.2d
SQSUB v14.2d, v8.2d , v6.2d
SQSUB v16.2d, v6.2d , v8.2d
//shrn v12.2s, v12.2d,#32
//shrn v14.2s, v14.2d,#32
//shrn v16.2s, v16.2d,#32
ST1 {v12.s}[0], [x3], x8
ST1 {v14.s}[0], [x0], #4
SQNEG v12.4s, v12.4s
subs x4, x4, #1 // sets the flags tested by BGT; the stores below leave them intact
ST1 {v12.s}[2], [x10], #4
ST1 {v16.s}[2], [x11], x8
BGT LOOP1
//VPOP {D8-D15}
// LDMFD sp!, {x4-x12, x15}
//ldp x19, x20,[sp],#16
pop_v_regs
ret