diff libavcodec/arm/jrevdct_arm.S @ 2:897f711a7157

rearrange to work with autoconf
author Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date Tue, 25 Sep 2012 15:55:33 +0200
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/libavcodec/arm/jrevdct_arm.S	Tue Sep 25 15:55:33 2012 +0200
     1.3 @@ -0,0 +1,388 @@
     1.4 +/*
     1.5 +   C-like prototype :
     1.6 +        void j_rev_dct_arm(DCTBLOCK data)
     1.7 +
     1.8 +   With DCTBLOCK being a pointer to an array of 64 'signed shorts'
     1.9 +
    1.10 +   Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)
    1.11 +
    1.12 +   Permission is hereby granted, free of charge, to any person obtaining a copy
    1.13 +   of this software and associated documentation files (the "Software"), to deal
    1.14 +   in the Software without restriction, including without limitation the rights
    1.15 +   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    1.16 +   copies of the Software, and to permit persons to whom the Software is
    1.17 +   furnished to do so, subject to the following conditions:
    1.18 +
    1.19 +   The above copyright notice and this permission notice shall be included in
    1.20 +   all copies or substantial portions of the Software.
    1.21 +
    1.22 +   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    1.23 +   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    1.24 +   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
    1.25 +   COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
    1.26 +   IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
    1.27 +   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    1.28 +
    1.29 +*/
    1.30 +
    1.31 +#include "asm.S"
    1.32 +
    1.33 +#define FIX_0_298631336 2446          /* FIX_x   = round( x * 2^13), 13-bit fixed point */
    1.34 +#define FIX_0_541196100 4433
    1.35 +#define FIX_0_765366865 6270
    1.36 +#define FIX_1_175875602 9633
    1.37 +#define FIX_1_501321110 12299
    1.38 +#define FIX_2_053119869 16819
    1.39 +#define FIX_3_072711026 25172
    1.40 +#define FIX_M_0_390180644 -3196       /* FIX_M_x = round(-x * 2^13) */
    1.41 +#define FIX_M_0_899976223 -7373
    1.42 +#define FIX_M_1_847759065 -15137
    1.43 +#define FIX_M_1_961570560 -16069
    1.44 +#define FIX_M_2_562915447 -20995
    1.45 +#define FIX_0xFFFF 0xFFFF             /* low-halfword mask, loaded in empty_row */
    1.46 +/* FIX_*_ID: byte offset of the matching .word inside const_array */
    1.47 +#define FIX_0_298631336_ID      0
    1.48 +#define FIX_0_541196100_ID      4
    1.49 +#define FIX_0_765366865_ID      8
    1.50 +#define FIX_1_175875602_ID     12
    1.51 +#define FIX_1_501321110_ID     16
    1.52 +#define FIX_2_053119869_ID     20
    1.53 +#define FIX_3_072711026_ID     24
    1.54 +#define FIX_M_0_390180644_ID   28
    1.55 +#define FIX_M_0_899976223_ID   32
    1.56 +#define FIX_M_1_847759065_ID   36
    1.57 +#define FIX_M_1_961570560_ID   40
    1.58 +#define FIX_M_2_562915447_ID   44
    1.59 +#define FIX_0xFFFF_ID          48
    1.60 +        .text
    1.61 +        .align
    1.62 +
    1.63 +function ff_j_rev_dct_arm, export=1
    1.64 +        stmdb   sp!, { r4 - r12, lr }   @ push r4-r12 and lr; the_end pops them (lr value into pc)
    1.65 +
    1.66 +        sub sp, sp, #4                  @ make room for one word on the stack
    1.67 +        str r0, [ sp ]                  @ keep the DCT pointer there; start_column_loop reloads it
    1.68 +
    1.69 +        mov lr, r0                      @ lr = pointer to the current row
    1.70 +        mov r12, #8                     @ r12 = rows still to process
    1.71 +        adr r11, const_array            @ r11 = base of the constants table (indexed with FIX_*_ID)
    1.72 +row_loop:
    1.73 +        ldrsh r0, [lr, # 0]             @ r0 = 'd0'
    1.74 +        ldrsh r2, [lr, # 2]             @ r2 = 'd2'
    1.75 +
    1.76 +        @ Fast paths: skip a row that is entirely zero, and shortcut a row whose
    1.77 +        @ only nonzero item is the first (the word loads rely on 4-byte aligned DCTELEMs)
    1.78 +        ldr r5, [lr, # 0]               @ r5 = halfwords 0-1 as one word
    1.79 +        ldr r6, [lr, # 4]               @ r6 = halfwords 2-3
    1.80 +        ldr r3, [lr, # 8]               @ r3 = halfwords 4-5
    1.81 +        ldr r4, [lr, #12]               @ r4 = halfwords 6-7
    1.82 +        orr r3, r3, r4
    1.83 +        orr r3, r3, r6                  @ r3 = OR of halfwords 2..7
    1.84 +        orrs r5, r3, r5                 @ Z set when all 8 halfwords are 0
    1.85 +        beq end_of_row_loop             @ nothing to be done as ALL of them are '0'
    1.86 +        orrs r3, r3, r2                 @ Z set when halfwords 1..7 are all 0
    1.87 +        beq empty_row                   @ only halfword 0 ('d0') is nonzero
    1.88 +
    1.89 +        ldrsh r1, [lr, # 8]             @ r1 = 'd1'
    1.90 +        ldrsh r4, [lr, # 4]             @ r4 = 'd4'
    1.91 +        ldrsh r6, [lr, # 6]             @ r6 = 'd6'
    1.92 +
    1.93 +        ldr r3, [r11, #FIX_0_541196100_ID]
    1.94 +        add r7, r2, r6
    1.95 +        ldr r5, [r11, #FIX_M_1_847759065_ID]
    1.96 +        mul r7, r3, r7                      @ r7 = z1
    1.97 +        ldr r3, [r11, #FIX_0_765366865_ID]
    1.98 +        mla r6, r5, r6, r7                  @ r6 = tmp2
    1.99 +        add r5, r0, r4                      @ r5 = tmp0
   1.100 +        mla r2, r3, r2, r7                  @ r2 = tmp3
   1.101 +        sub r3, r0, r4                      @ r3 = tmp1
   1.102 +
   1.103 +        add r0, r2, r5, lsl #13             @ r0 = tmp10
   1.104 +        rsb r2, r2, r5, lsl #13             @ r2 = tmp13
   1.105 +        add r4, r6, r3, lsl #13             @ r4 = tmp11
   1.106 +        rsb r3, r6, r3, lsl #13             @ r3 = tmp12
   1.107 +
   1.108 +        stmdb   sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11
   1.109 +
   1.110 +        ldrsh r3, [lr, #10]             @ r3 = 'd3'
   1.111 +        ldrsh r5, [lr, #12]             @ r5 = 'd5'
   1.112 +        ldrsh r7, [lr, #14]             @ r7 = 'd7'
   1.113 +
   1.114 +        add r0, r3, r5                        @ r0 = 'z2'
   1.115 +        add r2, r1, r7                  @ r2 = 'z1'
   1.116 +        add r4, r3, r7                  @ r4 = 'z3'
   1.117 +        add r6, r1, r5                  @ r6 = 'z4'
   1.118 +        ldr r9, [r11, #FIX_1_175875602_ID]
   1.119 +        add r8, r4, r6                  @ r8 = z3 + z4
   1.120 +        ldr r10, [r11, #FIX_M_0_899976223_ID]
   1.121 +        mul r8, r9, r8                  @ r8 = 'z5'
   1.122 +        ldr r9, [r11, #FIX_M_2_562915447_ID]
   1.123 +        mul r2, r10, r2                 @ r2 = 'z1'
   1.124 +        ldr r10, [r11, #FIX_M_1_961570560_ID]
   1.125 +        mul r0, r9, r0                  @ r0 = 'z2'
   1.126 +        ldr r9, [r11, #FIX_M_0_390180644_ID]
   1.127 +        mla r4, r10, r4, r8             @ r4 = 'z3'
   1.128 +        ldr r10, [r11, #FIX_0_298631336_ID]
   1.129 +        mla r6, r9, r6, r8              @ r6 = 'z4'
   1.130 +        ldr r9, [r11, #FIX_2_053119869_ID]
   1.131 +        mla r7, r10, r7, r2             @ r7 = tmp0 + z1
   1.132 +        ldr r10, [r11, #FIX_3_072711026_ID]
   1.133 +        mla r5, r9, r5, r0              @ r5 = tmp1 + z2
   1.134 +        ldr r9, [r11, #FIX_1_501321110_ID]
   1.135 +        mla r3, r10, r3, r0             @ r3 = tmp2 + z2
   1.136 +        add r7, r7, r4                  @ r7 = tmp0
   1.137 +        mla r1, r9, r1, r2              @ r1 = tmp3 + z1
   1.138 +        add r5,        r5, r6                  @ r5 = tmp1
   1.139 +        add r3, r3, r4                  @ r3 = tmp2
   1.140 +        add r1, r1, r6                  @ r1 = tmp3
   1.141 +
   1.142 +        ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
   1.143 +                                      @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
   1.144 +
   1.145 +        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
   1.146 +        add r8, r0, r1
   1.147 +        add r8, r8, #(1<<10)            @ rounding bias: half of 1<<11
   1.148 +        mov r8, r8, asr #11             @ arithmetic shift by CONST_BITS-PASS1_BITS = 11
   1.149 +        strh r8, [lr, # 0]
   1.150 +
   1.151 +        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
   1.152 +        sub r8, r0, r1
   1.153 +        add r8, r8, #(1<<10)
   1.154 +        mov r8, r8, asr #11
   1.155 +        strh r8, [lr, #14]
   1.156 +
   1.157 +        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
   1.158 +        add r8, r6, r3
   1.159 +        add r8, r8, #(1<<10)
   1.160 +        mov r8, r8, asr #11
   1.161 +        strh r8, [lr, # 2]
   1.162 +
   1.163 +        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
   1.164 +        sub r8, r6, r3
   1.165 +        add r8, r8, #(1<<10)
   1.166 +        mov r8, r8, asr #11
   1.167 +        strh r8, [lr, #12]
   1.168 +
   1.169 +        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
   1.170 +        add r8, r4, r5
   1.171 +        add r8, r8, #(1<<10)
   1.172 +        mov r8, r8, asr #11
   1.173 +        strh r8, [lr, # 4]
   1.174 +
   1.175 +        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
   1.176 +        sub r8, r4, r5
   1.177 +        add r8, r8, #(1<<10)
   1.178 +        mov r8, r8, asr #11
   1.179 +        strh r8, [lr, #10]
   1.180 +
   1.181 +        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
   1.182 +        add r8, r2, r7
   1.183 +        add r8, r8, #(1<<10)
   1.184 +        mov r8, r8, asr #11
   1.185 +        strh r8, [lr, # 6]
   1.186 +
   1.187 +        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
   1.188 +        sub r8, r2, r7
   1.189 +        add r8, r8, #(1<<10)
   1.190 +        mov r8, r8, asr #11
   1.191 +        strh r8, [lr, # 8]
   1.192 +
   1.193 +        @ End of row loop
   1.194 +        add lr, lr, #16                 @ next row: 8 halfwords = 16 bytes
   1.195 +        subs r12, r12, #1
   1.196 +        bne row_loop
   1.197 +        beq start_column_loop           @ always taken here: Z is set once the bne falls through
   1.198 +
   1.199 +empty_row:                              @ reached when only 'd0' of the row is nonzero
   1.200 +        ldr r1, [r11, #FIX_0xFFFF_ID]   @ r1 = 0xFFFF mask
   1.201 +        mov r0, r0, lsl #2              @ r0 = d0 << 2, the value the full row path yields for a d0-only row
   1.202 +        and r0, r0, r1                  @ keep only the low halfword
   1.203 +        add r0, r0, r0, lsl #16         @ copy that halfword into the upper half too
   1.204 +        str r0, [lr, # 0]               @ four word stores fill all 8 halfwords of the row
   1.205 +        str r0, [lr, # 4]
   1.206 +        str r0, [lr, # 8]
   1.207 +        str r0, [lr, #12]
   1.208 +
   1.209 +end_of_row_loop:
   1.210 +        @ Advance to the next row; after the 8th row fall through to the column pass
   1.211 +        add lr, lr, #16
   1.212 +        subs r12, r12, #1
   1.213 +        bne row_loop
   1.214 +
   1.215 +start_column_loop:
   1.216 +        @ Start of column loop (byte offset k*8 below addresses row k/2 of the current column)
   1.217 +        ldr lr, [ sp ]                      @ lr = DCT pointer stored at function entry
   1.218 +        mov r12, #8                         @ r12 = columns still to process
   1.219 +column_loop:
   1.220 +        ldrsh r0, [lr, #( 0*8)]             @ r0 = 'd0'
   1.221 +        ldrsh r2, [lr, #( 4*8)]             @ r2 = 'd2'
   1.222 +        ldrsh r4, [lr, #( 8*8)]             @ r4 = 'd4'
   1.223 +        ldrsh r6, [lr, #(12*8)]             @ r6 = 'd6'
   1.224 +
   1.225 +        ldr r3, [r11, #FIX_0_541196100_ID]
   1.226 +        add r1, r2, r6
   1.227 +        ldr r5, [r11, #FIX_M_1_847759065_ID]
   1.228 +        mul r1, r3, r1                      @ r1 = z1
   1.229 +        ldr r3, [r11, #FIX_0_765366865_ID]
   1.230 +        mla r6, r5, r6, r1                  @ r6 = tmp2
   1.231 +        add r5, r0, r4                      @ r5 = tmp0
   1.232 +        mla r2, r3, r2, r1                  @ r2 = tmp3
   1.233 +        sub r3, r0, r4                      @ r3 = tmp1
   1.234 +
   1.235 +        add r0, r2, r5, lsl #13             @ r0 = tmp10
   1.236 +        rsb r2, r2, r5, lsl #13             @ r2 = tmp13
   1.237 +        add r4, r6, r3, lsl #13             @ r4 = tmp11
   1.238 +        rsb r6, r6, r3, lsl #13             @ r6 = tmp12
   1.239 +
   1.240 +        ldrsh r1, [lr, #( 2*8)]             @ r1 = 'd1'
   1.241 +        ldrsh r3, [lr, #( 6*8)]             @ r3 = 'd3'
   1.242 +        ldrsh r5, [lr, #(10*8)]             @ r5 = 'd5'
   1.243 +        ldrsh r7, [lr, #(14*8)]             @ r7 = 'd7'
   1.244 +
   1.245 +        @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
   1.246 +        orr r9, r1, r3
   1.247 +        orr r10, r5, r7
   1.248 +        orrs r10, r9, r10                   @ Z set when d1, d3, d5 and d7 are all 0
   1.249 +        beq empty_odd_column
   1.250 +
   1.251 +        stmdb   sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp11, tmp12
   1.252 +
   1.253 +        add r0, r3, r5                  @ r0 = 'z2'
   1.254 +        add r2, r1, r7                  @ r2 = 'z1'
   1.255 +        add r4, r3, r7                  @ r4 = 'z3'
   1.256 +        add r6, r1, r5                  @ r6 = 'z4'
   1.257 +        ldr r9, [r11, #FIX_1_175875602_ID]
   1.258 +        add r8, r4, r6                  @ r8 = z3 + z4
   1.259 +        ldr r10, [r11, #FIX_M_0_899976223_ID]
   1.260 +        mul r8, r9, r8                  @ r8 = 'z5'
   1.261 +        ldr r9, [r11, #FIX_M_2_562915447_ID]
   1.262 +        mul r2, r10, r2                 @ r2 = 'z1'
   1.263 +        ldr r10, [r11, #FIX_M_1_961570560_ID]
   1.264 +        mul r0, r9, r0                  @ r0 = 'z2'
   1.265 +        ldr r9, [r11, #FIX_M_0_390180644_ID]
   1.266 +        mla r4, r10, r4, r8             @ r4 = 'z3'
   1.267 +        ldr r10, [r11, #FIX_0_298631336_ID]
   1.268 +        mla r6, r9, r6, r8              @ r6 = 'z4'
   1.269 +        ldr r9, [r11, #FIX_2_053119869_ID]
   1.270 +        mla r7, r10, r7, r2             @ r7 = tmp0 + z1
   1.271 +        ldr r10, [r11, #FIX_3_072711026_ID]
   1.272 +        mla r5, r9, r5, r0              @ r5 = tmp1 + z2
   1.273 +        ldr r9, [r11, #FIX_1_501321110_ID]
   1.274 +        mla r3, r10, r3, r0             @ r3 = tmp2 + z2
   1.275 +        add r7, r7, r4                  @ r7 = tmp0
   1.276 +        mla r1, r9, r1, r2              @ r1 = tmp3 + z1
   1.277 +        add r5,        r5, r6                  @ r5 = tmp1
   1.278 +        add r3, r3, r4                  @ r3 = tmp2
   1.279 +        add r1, r1, r6                  @ r1 = tmp3
   1.280 +
   1.281 +        ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
   1.282 +                                      @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
   1.283 +
   1.284 +        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
   1.285 +        add r8, r0, r1
   1.286 +        add r8, r8, #(1<<17)            @ rounding bias: half of 1<<18
   1.287 +        mov r8, r8, asr #18             @ arithmetic shift by CONST_BITS+PASS1_BITS+3 = 18
   1.288 +        strh r8, [lr, #( 0*8)]
   1.289 +
   1.290 +        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
   1.291 +        sub r8, r0, r1
   1.292 +        add r8, r8, #(1<<17)
   1.293 +        mov r8, r8, asr #18
   1.294 +        strh r8, [lr, #(14*8)]
   1.295 +
   1.296 +        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
   1.297 +        add r8, r4, r3
   1.298 +        add r8, r8, #(1<<17)
   1.299 +        mov r8, r8, asr #18
   1.300 +        strh r8, [lr, #( 2*8)]
   1.301 +
   1.302 +        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
   1.303 +        sub r8, r4, r3
   1.304 +        add r8, r8, #(1<<17)
   1.305 +        mov r8, r8, asr #18
   1.306 +        strh r8, [lr, #(12*8)]
   1.307 +
   1.308 +        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
   1.309 +        add r8, r6, r5
   1.310 +        add r8, r8, #(1<<17)
   1.311 +        mov r8, r8, asr #18
   1.312 +        strh r8, [lr, #( 4*8)]
   1.313 +
   1.314 +        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
   1.315 +        sub r8, r6, r5
   1.316 +        add r8, r8, #(1<<17)
   1.317 +        mov r8, r8, asr #18
   1.318 +        strh r8, [lr, #(10*8)]
   1.319 +
   1.320 +        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
   1.321 +        add r8, r2, r7
   1.322 +        add r8, r8, #(1<<17)
   1.323 +        mov r8, r8, asr #18
   1.324 +        strh r8, [lr, #( 6*8)]
   1.325 +
   1.326 +        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
   1.327 +        sub r8, r2, r7
   1.328 +        add r8, r8, #(1<<17)
   1.329 +        mov r8, r8, asr #18
   1.330 +        strh r8, [lr, #( 8*8)]
   1.331 +
   1.332 +        @ End of column loop
   1.333 +        add lr, lr, #2                  @ next column: one halfword to the right
   1.334 +        subs r12, r12, #1
   1.335 +        bne column_loop
   1.336 +        beq the_end                     @ always taken here: Z is set once the bne falls through
   1.337 +
   1.338 +empty_odd_column:                       @ d1 = d3 = d5 = d7 = 0: each +/- output pair is the same value
   1.339 +        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
   1.340 +        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
   1.341 +        add r0, r0, #(1<<17)            @ r0 = tmp10 plus rounding bias
   1.342 +        mov r0, r0, asr #18
   1.343 +        strh r0, [lr, #( 0*8)]
   1.344 +        strh r0, [lr, #(14*8)]
   1.345 +
   1.346 +        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
   1.347 +        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
   1.348 +        add r4, r4, #(1<<17)            @ r4 = tmp11 plus rounding bias
   1.349 +        mov r4, r4, asr #18
   1.350 +        strh r4, [lr, #( 2*8)]
   1.351 +        strh r4, [lr, #(12*8)]
   1.352 +
   1.353 +        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
   1.354 +        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
   1.355 +        add r6, r6, #(1<<17)            @ r6 = tmp12 plus rounding bias
   1.356 +        mov r6, r6, asr #18
   1.357 +        strh r6, [lr, #( 4*8)]
   1.358 +        strh r6, [lr, #(10*8)]
   1.359 +
   1.360 +        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
   1.361 +        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
   1.362 +        add r2, r2, #(1<<17)            @ r2 = tmp13 plus rounding bias
   1.363 +        mov r2, r2, asr #18
   1.364 +        strh r2, [lr, #( 6*8)]
   1.365 +        strh r2, [lr, #( 8*8)]
   1.366 +
   1.367 +        @ End of column loop
   1.368 +        add lr, lr, #2                  @ next column: one halfword to the right
   1.369 +        subs r12, r12, #1
   1.370 +        bne column_loop                 @ after the 8th column fall through to the_end
   1.371 +
   1.372 +the_end:
   1.373 +        @ Function exit: undo the stack slot, then pop the registers pushed at entry
   1.374 +        add sp, sp, #4                  @ release the word that held the DCT pointer
   1.375 +        ldmia   sp!, { r4 - r12, pc }   @ restore r4-r12 and return (saved lr is loaded into pc)
   1.376 +
   1.377 +        .align                          @ align BEFORE the label so it always names the first .word
   1.378 +const_array:                            @ fixed-point constants, read as [r11, #FIX_*_ID]
   1.379 +        .word FIX_0_298631336           @ byte offset  0
   1.380 +        .word FIX_0_541196100           @ byte offset  4
   1.381 +        .word FIX_0_765366865           @ byte offset  8
   1.382 +        .word FIX_1_175875602           @ byte offset 12
   1.383 +        .word FIX_1_501321110           @ byte offset 16
   1.384 +        .word FIX_2_053119869           @ byte offset 20
   1.385 +        .word FIX_3_072711026           @ byte offset 24
   1.386 +        .word FIX_M_0_390180644         @ byte offset 28
   1.387 +        .word FIX_M_0_899976223         @ byte offset 32
   1.388 +        .word FIX_M_1_847759065         @ byte offset 36
   1.389 +        .word FIX_M_1_961570560         @ byte offset 40
   1.390 +        .word FIX_M_2_562915447         @ byte offset 44
   1.391 +        .word FIX_0xFFFF                @ byte offset 48