/*
 * ARM NEON optimised RDFT
 * Copyright (c) 2009 Mans Rullgard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        preserve8

/*
 * ff_rdft_calc_neon
 *
 * Syntax: GNU as, ARM (32-bit) with NEON; the function/endfunc/X()
 * helpers and preserve8 come from asm.S.
 *
 * In:
 *   r0 = context pointer.  Fields read here (names are the ones used by
 *        the per-line comments below; the structure layout itself is
 *        declared elsewhere and should be kept in sync with these offsets):
 *          +0   nbits
 *          +4   inverse
 *          +8   sign_convention
 *          +12  tcos (pointer)
 *          +16  tsin (pointer)
 *        r0 + 20 is passed as the first argument to the FFT helpers.
 *   r1 = data pointer, processed in place.  It is accessed with :64
 *        alignment qualifiers, as are tcos and tsin, so all three must be
 *        8-byte aligned.
 *
 * Flow:
 *   - bit 0 of inverse clear: ff_fft_permute_neon and ff_fft_calc_neon
 *     are called first, then the butterfly pass below runs and the
 *     function returns.
 *   - bit 0 of inverse set: the butterfly pass runs first, the first
 *     data pair is additionally scaled by 0.5, ff_fft_permute_neon is
 *     called and ff_fft_calc_neon is entered by tail call.
 *
 * Register roles during the butterfly pass:
 *   r4  = context            r5  = data base
 *   r6  = inverse << 31 (0 or 0x80000000)
 *   r0  = front load pointer (ascending, 8 bytes per step)
 *   r1  = back load pointer  (descending, 8 bytes per step via r8)
 *   r7  = front store pointer, lr = back store pointer
 *   r8  = -8 (post-index step for the descending pointers)
 *   r12 = loop counter
 *   r2  = tcos pointer, r3 = tsin pointer
 *   q8  = sign-bit masks: d16 = {1<<31, 0}, d17 = {0, 1<<31}
 *   q9  = d18 = {0.5, 0.5} (k1), d19 = k1 with its sign bit XORed with r6 (k2)
 *   d4/d5 = current tcos/tsin pair
 *
 * Written by this routine itself: r0-r3, r12, d0-d7, d16-d25, flags.
 * r4-r8 and lr are saved and restored (24 bytes of stack, which keeps
 * the stack 8-byte aligned at the calls).
 */
function ff_rdft_calc_neon, export=1
        push            {r4-r8,lr}

        ldr             r6,  [r0, #4]           @ inverse
        mov             r4,  r0
        mov             r5,  r1

        @ Move bit 0 of inverse into the sign-bit position; the flags decide
        @ whether the FFT runs before the butterfly pass (forward case).
        lsls            r6,  r6,  #31
        bne             1f
        add             r0,  r4,  #20           @ r1 still holds the caller data pointer
        bl              X(ff_fft_permute_neon)
        add             r0,  r4,  #20
        mov             r1,  r5
        bl              X(ff_fft_calc_neon)
1:
        ldr             r12, [r4, #0]           @ nbits
        mov             r2,  #1
        lsl             r12, r2,  r12           @ r12 = 1 << nbits
        add             r0,  r5,  #8            @ front pointer: second pair of the buffer
        add             r1,  r5,  r12, lsl #2
        lsr             r12, r12, #2
        ldr             r2,  [r4, #12]          @ tcos
        sub             r12, r12, #2            @ loop counter = (1 << nbits) / 4 - 2
        ldr             r3,  [r4, #16]          @ tsin
        mov             r7,  r0                 @ front store pointer
        sub             r1,  r1,  #8            @ back pointer: last pair of the buffer
        mov             lr,  r1                 @ back store pointer
        mov             r8,  #-8
        vld1.32         {d0},     [r0,:64]!     @ d1[0,1]
        vld1.32         {d1},     [r1,:64], r8  @ d2[0,1]
        vld1.32         {d4},     [r2,:64]!     @ tcos[i]
        vld1.32         {d5},     [r3,:64]!     @ tsin[i]
        vmov.f32        d18, #0.5               @ k1
        vdup.32         d19, r6
        pld             [r0, #32]
        veor            d19, d18, d19           @ k2
        vmov.i32        d16, #0
        vmov.i32        d17, #1<<31
        pld             [r1, #-32]
        vtrn.32         d16, d17
        pld             [r2, #32]
        vrev64.32       d16, d16                @ sign masks: d16=1,0 d17=0,1
        pld             [r3, #32]

        @ Main loop.  Each iteration finishes two front/back pairs: the pair
        @ already held in d0/d1 (using lane 1 of d4/d5) and the pair loaded
        @ into d24/d25 (using lane 0 of the freshly loaded d4/d5).  d0/d1 for
        @ the next iteration, or for the tail, are loaded at the bottom.
        @ The loop is bottom-tested, so it always runs at least once.
2:
        veor            q1,  q0,  q8            @ -d1[0],d1[1], d2[0],-d2[1]
        vld1.32         {d24},    [r0,:64]!     @  d1[0,1]
        vadd.f32        d0,  d0,  d3            @  d1[0]+d2[0], d1[1]-d2[1]
        vld1.32         {d25},    [r1,:64], r8  @  d2[0,1]
        vadd.f32        d1,  d2,  d1            @ -d1[0]+d2[0], d1[1]+d2[1]
        veor            q3,  q12, q8            @ -d1[0],d1[1], d2[0],-d2[1]
        pld             [r0, #32]
        vmul.f32        q10, q0,  q9            @  ev.re, ev.im, od.im, od.re
        pld             [r1, #-32]
        vadd.f32        d0,  d24, d7            @  d1[0]+d2[0], d1[1]-d2[1]
        vadd.f32        d1,  d6,  d25           @ -d1[0]+d2[0], d1[1]+d2[1]
        vmul.f32        q11, q0,  q9            @  ev.re, ev.im, od.im, od.re
        veor            d7,  d21, d16           @ -od.im, od.re
        vrev64.32       d3,  d21                @  od.re, od.im
        veor            d6,  d20, d17           @  ev.re,-ev.im
        veor            d2,  d3,  d16           @ -od.re, od.im
        vmla.f32        d20, d3,  d4[1]
        vmla.f32        d20, d7,  d5[1]
        vmla.f32        d6,  d2,  d4[1]
        vmla.f32        d6,  d21, d5[1]
        vld1.32         {d4},     [r2,:64]!     @  tcos[i]
        veor            d7,  d23, d16           @ -od.im, od.re
        vld1.32         {d5},     [r3,:64]!     @  tsin[i]
        veor            d24, d22, d17           @  ev.re,-ev.im
        vrev64.32       d3,  d23                @  od.re, od.im
        pld             [r2, #32]
        veor            d2,  d3,  d16           @ -od.re, od.im
        pld             [r3, #32]
        vmla.f32        d22, d3,  d4[0]
        vmla.f32        d22, d7,  d5[0]
        vmla.f32        d24, d2,  d4[0]
        vmla.f32        d24, d23, d5[0]
        vld1.32         {d0},     [r0,:64]!     @  d1[0,1]
        vld1.32         {d1},     [r1,:64], r8  @  d2[0,1]
        vst1.32         {d20},    [r7,:64]!
        vst1.32         {d6},     [lr,:64], r8
        vst1.32         {d22},    [r7,:64]!
        vst1.32         {d24},    [lr,:64], r8
        subs            r12, r12, #2
        bgt             2b

        @ Tail.  Finish the last front/back pair (already in d0/d1) with lane 1
        @ of the most recently loaded tcos/tsin pair.  Interleaved with that:
        @   - the word at r0 + 4 has its sign bit XORed with bit 31 of
        @     sign_convention (bfc keeps only that bit);
        @   - the first data pair {a, b} at r5 is replaced by {a + b, a - b}.
        veor            q1,  q0,  q8            @ -d1[0],d1[1], d2[0],-d2[1]
        vadd.f32        d0,  d0,  d3            @  d1[0]+d2[0], d1[1]-d2[1]
        vadd.f32        d1,  d2,  d1            @ -d1[0]+d2[0], d1[1]+d2[1]
        ldr             r2,  [r4, #8]           @ sign_convention
        vmul.f32        q10, q0,  q9            @  ev.re, ev.im, od.im, od.re
        add             r0,  r0,  #4
        bfc             r2,  #0,  #31           @ keep only the sign bit
        vld1.32         {d0[0]},  [r0,:32]
        veor            d7,  d21, d16           @ -od.im, od.re
        vrev64.32       d3,  d21                @  od.re, od.im
        veor            d6,  d20, d17           @  ev.re,-ev.im
        vld1.32         {d22},    [r5,:64]      @ first pair {a, b}
        vdup.32         d1,  r2
        vmov            d23, d22
        veor            d2,  d3,  d16           @ -od.re, od.im
        vtrn.32         d22, d23                @ d22 = a, a   d23 = b, b
        veor            d0,  d0,  d1            @ apply the sign_convention sign bit
        veor            d23, d23, d17           @ d23 = b, -b
        vmla.f32        d20, d3,  d4[1]
        vmla.f32        d20, d7,  d5[1]
        vmla.f32        d6,  d2,  d4[1]
        vmla.f32        d6,  d21, d5[1]
        vadd.f32        d22, d22, d23           @ d22 = a + b, a - b
        vst1.32         {d20},    [r7,:64]
        vst1.32         {d6},     [lr,:64]
        vst1.32         {d0[0]},  [r0,:32]
        vst1.32         {d22},    [r5,:64]

        @ Forward case (r6 == 0): the FFT already ran, so return here.
        cmp             r6,  #0
        popeq           {r4-r8,pc}

        @ Inverse case: scale the first pair by k1 = 0.5, then run the FFT on
        @ the processed data; the final stage is entered by tail call after
        @ the saved registers have been restored.
        vmul.f32        d22, d22, d18
        vst1.32         {d22},    [r5,:64]
        add             r0,  r4,  #20
        mov             r1,  r5
        bl              X(ff_fft_permute_neon)
        add             r0,  r4,  #20
        mov             r1,  r5
        pop             {r4-r8,lr}
        b               X(ff_fft_calc_neon)
endfunc