diff libavcodec/arm/h264pred_neon.S @ 2:897f711a7157

rearrange to work with autoconf
author Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date Tue, 25 Sep 2012 15:55:33 +0200
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/libavcodec/arm/h264pred_neon.S	Tue Sep 25 15:55:33 2012 +0200
     1.3 @@ -0,0 +1,362 @@
     1.4 +/*
     1.5 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
     1.6 + *
     1.7 + * This file is part of FFmpeg.
     1.8 + *
     1.9 + * FFmpeg is free software; you can redistribute it and/or
    1.10 + * modify it under the terms of the GNU Lesser General Public
    1.11 + * License as published by the Free Software Foundation; either
    1.12 + * version 2.1 of the License, or (at your option) any later version.
    1.13 + *
    1.14 + * FFmpeg is distributed in the hope that it will be useful,
    1.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    1.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    1.17 + * Lesser General Public License for more details.
    1.18 + *
    1.19 + * You should have received a copy of the GNU Lesser General Public
    1.20 + * License along with FFmpeg; if not, write to the Free Software
    1.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
    1.22 + */
    1.23 +
    1.24 +#include "asm.S"
    1.25 +
    1.26 +        .macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0  /* gather a byte column: each load puts one byte from [rs] into one lane of rd, then adds rt to rs */
    1.27 +.if \n == 8 || \hi == 0  /* lanes 0-3: always when n is 8, otherwise only when hi is 0 */
    1.28 +        vld1.8          {\rd[0]}, [\rs], \rt
    1.29 +        vld1.8          {\rd[1]}, [\rs], \rt
    1.30 +        vld1.8          {\rd[2]}, [\rs], \rt
    1.31 +        vld1.8          {\rd[3]}, [\rs], \rt
    1.32 +.endif
    1.33 +.if \n == 8 || \hi == 1  /* lanes 4-7: always when n is 8, otherwise only when hi is 1 */
    1.34 +        vld1.8          {\rd[4]}, [\rs], \rt
    1.35 +        vld1.8          {\rd[5]}, [\rs], \rt
    1.36 +        vld1.8          {\rd[6]}, [\rs], \rt
    1.37 +        vld1.8          {\rd[7]}, [\rs], \rt
    1.38 +.endif
    1.39 +        .endm  /* on exit rs has advanced by rt once per byte loaded; lanes not selected above are left unchanged */
    1.40 +
    1.41 +        .macro add16x8  dq,  dl,  dh,  rl,  rh  /* sum the 16 bytes held in rl and rh; both uses in this file pass dl/dh as the two halves of dq */
    1.42 +        vaddl.u8        \dq, \rl, \rh  /* widen to 16 bits while adding: 8 pairwise sums */
    1.43 +        vadd.u16        \dl, \dl, \dh  /* 8 sums -> 4 */
    1.44 +        vpadd.u16       \dl, \dl, \dl  /* 4 -> 2 (duplicated) */
    1.45 +        vpadd.u16       \dl, \dl, \dl  /* every 16-bit lane of dl now holds the full total */
    1.46 +        .endm
    1.47 +
    1.48 +function ff_pred16x16_128_dc_neon, export=1  /* fill the 16x16 block at r0 (row step r1) with the constant 128 */
    1.49 +        vmov.i8         q0,  #128  /* every byte of q0 = 128 */
    1.50 +        b               .L_pred16x16_dc_end  /* shared store loop, defined inside ff_pred16x16_dc_neon */
    1.51 +endfunc
    1.52 +
    1.53 +function ff_pred16x16_top_dc_neon, export=1  /* fill the 16x16 block at r0 (row step r1) with the rounded mean of the 16 bytes in the row above it */
    1.54 +        sub             r2,  r0,  r1  /* r2 = start of the row above the block */
    1.55 +        vld1.8          {q0},     [r2,:128]  /* 16 top bytes; the :128 qualifier requires a 16-byte aligned address */
    1.56 +        add16x8         q0,  d0,  d1,  d0,  d1  /* d0 lanes = sum of the 16 bytes */
    1.57 +        vrshrn.u16      d0,  q0,  #4  /* (sum + 8) >> 4, narrowed to bytes; only byte 0 is used */
    1.58 +        vdup.8          q0,  d0[0]  /* broadcast the mean to all 16 bytes */
    1.59 +        b               .L_pred16x16_dc_end  /* shared store loop, defined inside ff_pred16x16_dc_neon */
    1.60 +endfunc
    1.61 +
    1.62 +function ff_pred16x16_left_dc_neon, export=1  /* fill the 16x16 block at r0 (row step r1) with the rounded mean of the 16 bytes in the column to its left */
    1.63 +        sub             r2,  r0,  #1  /* r2 = byte immediately left of row 0 */
    1.64 +        ldcol.8         d0,  r2,  r1  /* d0 = left bytes of rows 0-7 */
    1.65 +        ldcol.8         d1,  r2,  r1  /* d1 = left bytes of rows 8-15 (r2 kept advancing) */
    1.66 +        add16x8         q0,  d0,  d1,  d0,  d1  /* d0 lanes = sum of the 16 bytes */
    1.67 +        vrshrn.u16      d0,  q0,  #4  /* (sum + 8) >> 4, narrowed to bytes; only byte 0 is used */
    1.68 +        vdup.8          q0,  d0[0]  /* broadcast the mean to all 16 bytes */
    1.69 +        b               .L_pred16x16_dc_end  /* shared store loop, defined inside ff_pred16x16_dc_neon */
    1.70 +endfunc
    1.71 +
    1.72 +function ff_pred16x16_dc_neon, export=1  /* fill the 16x16 block at r0 (row step r1) with the rounded mean of the 16 bytes above and the 16 bytes to the left; also owns the store loop the other 16x16 DC entry points branch to */
    1.73 +        sub             r2,  r0,  r1  /* r2 = start of the row above the block */
    1.74 +        vld1.8          {q0},     [r2,:128]  /* q0 = 16 top bytes (16-byte aligned address required) */
    1.75 +        sub             r2,  r0,  #1  /* r2 = byte immediately left of row 0 */
    1.76 +        ldcol.8         d2,  r2,  r1  /* d2 = left bytes of rows 0-7 */
    1.77 +        ldcol.8         d3,  r2,  r1  /* d3 = left bytes of rows 8-15 */
    1.78 +        vaddl.u8        q0,  d0,  d1  /* 8 widened sums of the top bytes */
    1.79 +        vaddl.u8        q1,  d2,  d3  /* 8 widened sums of the left bytes */
    1.80 +        vadd.u16        q0,  q0,  q1  /* combine top and left */
    1.81 +        vadd.u16        d0,  d0,  d1  /* 8 sums -> 4 */
    1.82 +        vpadd.u16       d0,  d0,  d0
    1.83 +        vpadd.u16       d0,  d0,  d0  /* every lane of d0 = sum of all 32 bytes */
    1.84 +        vrshrn.u16      d0,  q0,  #5  /* (sum + 16) >> 5, narrowed to bytes; only byte 0 is used */
    1.85 +        vdup.8          q0,  d0[0]  /* broadcast the mean to all 16 bytes */
    1.86 +.L_pred16x16_dc_end:  /* shared tail: expects the 16-byte row pattern in q0, r0 = first row, r1 = row step */
    1.87 +        mov             r3,  #8  /* 8 iterations x 2 rows = 16 rows */
    1.88 +6:      vst1.8          {q0},     [r0,:128], r1
    1.89 +        vst1.8          {q0},     [r0,:128], r1
    1.90 +        subs            r3,  r3,  #1
    1.91 +        bne             6b
    1.92 +        bx              lr
    1.93 +endfunc
    1.94 +
    1.95 +function ff_pred16x16_hor_neon, export=1  /* fill each of the 16 rows of the block at r0 (row step r1) with the byte immediately to the left of that row */
    1.96 +        sub             r2,  r0,  #1  /* r2 = byte left of row 0 */
    1.97 +        mov             r3,  #16  /* one iteration per row */
    1.98 +1:      vld1.8          {d0[],d1[]},[r2],      r1  /* load one byte and replicate it to all 16 lanes of q0; r2 moves to the next row */
    1.99 +        vst1.8          {q0},       [r0,:128], r1  /* write the row (16-byte aligned address required) */
   1.100 +        subs            r3,  r3,  #1
   1.101 +        bne             1b
   1.102 +        bx              lr
   1.103 +endfunc
   1.104 +
   1.105 +function ff_pred16x16_vert_neon, export=1  /* copy the 16 bytes in the row above the block at r0 (row step r1) into all 16 rows of the block */
   1.106 +        sub             r0,  r0,  r1  /* step back to the row above */
   1.107 +        vld1.8          {q0},     [r0,:128], r1  /* q0 = top row; the post-increment puts r0 back on row 0 */
   1.108 +        mov             r3,  #8  /* 8 iterations x 2 rows = 16 rows */
   1.109 +1:      vst1.8          {q0},     [r0,:128], r1
   1.110 +        vst1.8          {q0},     [r0,:128], r1
   1.111 +        subs            r3,  r3,  #1
   1.112 +        bne             1b
   1.113 +        bx              lr
   1.114 +endfunc
   1.115 +
   1.116 +function ff_pred16x16_plane_neon, export=1  /* 16x16 plane prediction: fit pix(x,y) = sat_u8((a + b*x + c*y) >> 5) to the row above and the column left of the block at r0 (row step r1), then write all 16 rows */
   1.117 +        sub             r3,  r0,  r1  /* r3 = row above the block */
   1.118 +        add             r2,  r3,  #8  /* r2 = top[8] */
   1.119 +        sub             r3,  r3,  #1  /* r3 = top[-1], the corner above-left of the block */
   1.120 +        vld1.8          {d0},     [r3]  /* d0 = top[-1..6] */
   1.121 +        vld1.8          {d2},     [r2,:64], r1  /* d2 = top[8..15] */
   1.122 +        ldcol.8         d1,  r3,  r1  /* d1 = left column from the corner down: left[-1..6] */
   1.123 +        add             r3,  r3,  r1  /* skip left[7], which the gradients do not use */
   1.124 +        ldcol.8         d3,  r3,  r1  /* d3 = left[8..15] */
   1.125 +        vrev64.8        q0,  q0  /* reverse each half: d0 = top[6..-1], d1 = left[6..-1] */
   1.126 +        vaddl.u8        q8,  d2,  d3  /* 16-bit top[8+i] + left[8+i]; lane 7 = top[15] + left[15] */
   1.127 +        vsubl.u8        q2,  d2,  d0  /* top[8+i] - top[6-i], i = 0..7 */
   1.128 +        vsubl.u8        q3,  d3,  d1  /* left[8+i] - left[6-i], i = 0..7 */
   1.129 +        movrel          r3,  p16weight
   1.130 +        vld1.8          {q0},     [r3,:128]  /* q0 = 16-bit weights 1..8 */
   1.131 +        vmul.s16        q2,  q2,  q0  /* weight the horizontal differences */
   1.132 +        vmul.s16        q3,  q3,  q0  /* weight the vertical differences */
   1.133 +        vadd.i16        d4,  d4,  d5
   1.134 +        vadd.i16        d5,  d6,  d7
   1.135 +        vpadd.i16       d4,  d4,  d5
   1.136 +        vpadd.i16       d4,  d4,  d4  /* d4 = [H, V, H, V]; each sum can reach +-255*36 = +-9180 */
   1.137 +        vshll.s16       q3,  d4,  #2  /* 4*sum computed in 32-bit lanes: a 16-bit shift would wrap once a sum exceeds 8191 in magnitude (q3 is free scratch here) */
   1.138 +        vaddw.s16       q2,  q3,  d4  /* q2 = 5*sum, 32-bit */
   1.139 +        vrshrn.s32      d4,  q2,  #6  /* (5*sum + 32) >> 6 narrowed to 16 bits: lane 0 = b (horizontal slope), lane 1 = c (vertical slope) */
   1.140 +        mov             r3,  #0
   1.141 +        vtrn.16         d4,  d5  /* d4[0] = b, d5[0] = c; only lane 0 of the d registers matters from here on */
   1.142 +        vadd.i16        d2,  d4,  d5  /* b + c */
   1.143 +        vshl.i16        d3,  d2,  #3  /* 8*(b + c) */
   1.144 +        vrev64.16       d16, d17  /* d16[0] = top[15] + left[15] */
   1.145 +        vsub.i16        d3,  d3,  d2  /* 7*(b + c) */
   1.146 +        vadd.i16        d16, d16, d0  /* d0[0] is the weight 1, so lane 0 = top[15] + left[15] + 1 */
   1.147 +        vshl.i16        d2,  d16, #4  /* 16*(top[15] + left[15] + 1) */
   1.148 +        vsub.i16        d2,  d2,  d3  /* a = 16*(top[15] + left[15] + 1) - 7*(b + c) */
   1.149 +        vshl.i16        d3,  d4,  #4  /* 16*b */
   1.150 +        vext.16         q0,  q0,  q0,  #7  /* rotate weights to [8,1,2,...,7] */
   1.151 +        vsub.i16        d6,  d5,  d3  /* c - 16*b */
   1.152 +        vmov.16         d0[0], r3  /* lane 0 = 0, so q0 = [0,1,...,7] */
   1.153 +        vmul.i16        q0,  q0,  d4[0]  /* q0 = x*b for x = 0..7 */
   1.154 +        vdup.16         q1,  d2[0]  /* a in every lane */
   1.155 +        vdup.16         q2,  d4[0]  /* b in every lane */
   1.156 +        vdup.16         q3,  d6[0]  /* c - 16*b in every lane */
   1.157 +        vshl.i16        q2,  q2,  #3  /* q2 = 8*b: step from the left half-row to the right half-row */
   1.158 +        vadd.i16        q1,  q1,  q0  /* q1 = a + x*b: unscaled row 0, x = 0..7 */
   1.159 +        vadd.i16        q3,  q3,  q2  /* q3 = c - 8*b: step from the right half of one row to the left half of the next */
   1.160 +        mov             r3,  #16  /* one iteration per row */
   1.161 +1:
   1.162 +        vqshrun.s16     d0,  q1,  #5  /* x = 0..7: >> 5 with saturation to 0..255 */
   1.163 +        vadd.i16        q1,  q1,  q2
   1.164 +        vqshrun.s16     d1,  q1,  #5  /* x = 8..15 */
   1.165 +        vadd.i16        q1,  q1,  q3
   1.166 +        vst1.8          {q0},     [r0,:128], r1
   1.167 +        subs            r3,  r3,  #1
   1.168 +        bne             1b
   1.169 +        bx              lr
   1.170 +endfunc
   1.171 +
   1.172 +        .section        .rodata
   1.173 +        .align          4  /* NOTE(review): on ARM gas this is a power of two, i.e. 16 bytes, which the [r3,:128] loads of this table rely on - confirm for other toolchains */
   1.174 +p16weight:  /* 16-bit weights 1..8 used by both plane predictors (the 8x8 one only multiplies by the first four) */
   1.175 +        .short          1,2,3,4,5,6,7,8
   1.176 +
   1.177 +        .text
   1.178 +
   1.179 +function ff_pred8x8_hor_neon, export=1  /* fill each of the 8 rows of the block at r0 (row step r1) with the byte immediately to the left of that row */
   1.180 +        sub             r2,  r0,  #1  /* r2 = byte left of row 0 */
   1.181 +        mov             r3,  #8  /* one iteration per row */
   1.182 +1:      vld1.8          {d0[]},   [r2],     r1  /* load one byte and replicate it to all 8 lanes; r2 moves to the next row */
   1.183 +        vst1.8          {d0},     [r0,:64], r1  /* write the row (8-byte aligned address required) */
   1.184 +        subs            r3,  r3,  #1
   1.185 +        bne             1b
   1.186 +        bx              lr
   1.187 +endfunc
   1.188 +
   1.189 +function ff_pred8x8_vert_neon, export=1  /* copy the 8 bytes in the row above the block at r0 (row step r1) into all 8 rows of the block */
   1.190 +        sub             r0,  r0,  r1  /* step back to the row above */
   1.191 +        vld1.8          {d0},     [r0,:64], r1  /* d0 = top row; the post-increment puts r0 back on row 0 */
   1.192 +        mov             r3,  #4  /* 4 iterations x 2 rows = 8 rows */
   1.193 +1:      vst1.8          {d0},     [r0,:64], r1
   1.194 +        vst1.8          {d0},     [r0,:64], r1
   1.195 +        subs            r3,  r3,  #1
   1.196 +        bne             1b
   1.197 +        bx              lr
   1.198 +endfunc
   1.199 +
   1.200 +function ff_pred8x8_plane_neon, export=1  /* 8x8 plane prediction: fit pix(x,y) = sat_u8((a + b*x + c*y) >> 5) to the row above and the column left of the block at r0 (row step r1), then write all 8 rows */
   1.201 +        sub             r3,  r0,  r1  /* r3 = row above the block */
   1.202 +        add             r2,  r3,  #4  /* r2 = top[4] */
   1.203 +        sub             r3,  r3,  #1  /* r3 = top[-1], the corner above-left of the block */
   1.204 +        vld1.32         {d0[0]},  [r3]  /* low word of d0 = top[-1..2] */
   1.205 +        vld1.32         {d2[0]},  [r2,:32], r1  /* low word of d2 = top[4..7] */
   1.206 +        ldcol.8         d0,  r3,  r1,  4,  hi=1  /* high word of d0 = left column from the corner down: left[-1..2] */
   1.207 +        add             r3,  r3,  r1  /* skip left[3], which the gradients do not use */
   1.208 +        ldcol.8         d3,  r3,  r1,  4  /* low word of d3 = left[4..7] */
   1.209 +        vaddl.u8        q8,  d2,  d3  /* d16 = 16-bit top[4+i] + left[4+i]; lane 3 = top[7] + left[7] (d17 is not used) */
   1.210 +        vrev32.8        d0,  d0  /* reverse each word: d0 = top[2..-1], left[2..-1] */
   1.211 +        vtrn.32         d2,  d3  /* d2 = top[4..7], left[4..7] */
   1.212 +        vsubl.u8        q2,  d2,  d0  /* d4 = top[4+i] - top[2-i], d5 = left[4+i] - left[2-i], i = 0..3 */
   1.213 +        movrel          r3,  p16weight
   1.214 +        vld1.16         {q0},     [r3,:128]  /* q0 = 16-bit weights 1..8; d0 = 1,2,3,4 */
   1.215 +        vmul.s16        d4,  d4,  d0  /* weight the horizontal differences */
   1.216 +        vmul.s16        d5,  d5,  d0  /* weight the vertical differences */
   1.217 +        vpadd.i16       d4,  d4,  d5
   1.218 +        vpaddl.s16      d4,  d4  /* d4 = 32-bit [H, V] */
   1.219 +        vshl.i32        d5,  d4,  #4  /* 16*sum, in 32-bit lanes */
   1.220 +        vadd.s32        d4,  d4,  d5  /* 17*sum */
   1.221 +        vrshrn.s32      d4,  q2,  #5  /* (17*sum + 16) >> 5 narrowed to 16 bits: lane 0 = b (horizontal slope), lane 1 = c (vertical slope) */
   1.222 +        mov             r3,  #0
   1.223 +        vtrn.16         d4,  d5  /* d4[0] = b, d5[0] = c; only lane 0 of the d registers matters from here on */
   1.224 +        vadd.i16        d2,  d4,  d5  /* b + c */
   1.225 +        vshl.i16        d3,  d2,  #2  /* 4*(b + c) */
   1.226 +        vrev64.16       d16, d16  /* d16[0] = top[7] + left[7] */
   1.227 +        vsub.i16        d3,  d3,  d2  /* 3*(b + c) */
   1.228 +        vadd.i16        d16, d16, d0  /* d0[0] is the weight 1, so lane 0 = top[7] + left[7] + 1 */
   1.229 +        vshl.i16        d2,  d16, #4  /* 16*(top[7] + left[7] + 1) */
   1.230 +        vsub.i16        d2,  d2,  d3  /* a = 16*(top[7] + left[7] + 1) - 3*(b + c) */
   1.231 +        vshl.i16        d3,  d4,  #3  /* 8*b */
   1.232 +        vext.16         q0,  q0,  q0,  #7  /* rotate weights to [8,1,2,...,7] */
   1.233 +        vsub.i16        d6,  d5,  d3  /* c - 8*b */
   1.234 +        vmov.16         d0[0], r3  /* lane 0 = 0, so q0 = [0,1,...,7] */
   1.235 +        vmul.i16        q0,  q0,  d4[0]  /* q0 = x*b for x = 0..7 */
   1.236 +        vdup.16         q1,  d2[0]  /* a in every lane */
   1.237 +        vdup.16         q2,  d4[0]  /* b in every lane */
   1.238 +        vdup.16         q3,  d6[0]  /* c - 8*b in every lane */
   1.239 +        vshl.i16        q2,  q2,  #3  /* 8*b */
   1.240 +        vadd.i16        q1,  q1,  q0  /* q1 = a + x*b: unscaled row 0 */
   1.241 +        vadd.i16        q3,  q3,  q2  /* (c - 8*b) + 8*b = c: the per-row step */
   1.242 +        mov             r3,  #8  /* one iteration per row */
   1.243 +1:
   1.244 +        vqshrun.s16     d0,  q1,  #5  /* >> 5 with saturation to 0..255 */
   1.245 +        vadd.i16        q1,  q1,  q3  /* advance to the next row */
   1.246 +        vst1.8          {d0},     [r0,:64], r1
   1.247 +        subs            r3,  r3,  #1
   1.248 +        bne             1b
   1.249 +        bx              lr
   1.250 +endfunc
   1.251 +
   1.252 +function ff_pred8x8_128_dc_neon, export=1  /* fill the 8x8 block at r0 (row step r1) with the constant 128 */
   1.253 +        vmov.i8         q0,  #128  /* d0 (rows 0-3) and d1 (rows 4-7) both become all-128 */
   1.254 +        b               .L_pred8x8_dc_end  /* shared store loop, defined inside ff_pred8x8_dc_neon */
   1.255 +endfunc
   1.256 +
   1.257 +function ff_pred8x8_top_dc_neon, export=1  /* fill the 8x8 block at r0 (row step r1) per 4-pixel-wide half: each half gets the rounded mean of the 4 bytes directly above it */
   1.258 +        sub             r2,  r0,  r1  /* r2 = row above the block */
   1.259 +        vld1.8          {d0},     [r2,:64]  /* d0 = 8 top bytes */
   1.260 +        vpaddl.u8       d0,  d0  /* 4 widened pair sums */
   1.261 +        vpadd.u16       d0,  d0,  d0  /* lane 0 = sum of top[0..3], lane 1 = sum of top[4..7] */
   1.262 +        vrshrn.u16      d0,  q0,  #2  /* (sum + 2) >> 2 as bytes: byte 0 = left-half mean, byte 1 = right-half mean */
   1.263 +        vdup.8          d1,  d0[1]  /* d1 = right-half mean in every byte */
   1.264 +        vdup.8          d0,  d0[0]  /* d0 = left-half mean in every byte */
   1.265 +        vtrn.32         d0,  d1  /* d0 and d1 both become: 4 bytes of left mean, 4 bytes of right mean */
   1.266 +        b               .L_pred8x8_dc_end  /* stores d0 to rows 0-3 and d1 to rows 4-7 */
   1.267 +endfunc
   1.268 +
   1.269 +function ff_pred8x8_left_dc_neon, export=1  /* fill the 8x8 block at r0 (row step r1) per 4-row half: each half gets the rounded mean of the 4 bytes directly to its left */
   1.270 +        sub             r2,  r0,  #1  /* r2 = byte left of row 0 */
   1.271 +        ldcol.8         d0,  r2,  r1  /* d0 = left bytes of rows 0-7 */
   1.272 +        vpaddl.u8       d0,  d0  /* 4 widened pair sums */
   1.273 +        vpadd.u16       d0,  d0,  d0  /* lane 0 = sum of rows 0-3, lane 1 = sum of rows 4-7 */
   1.274 +        vrshrn.u16      d0,  q0,  #2  /* (sum + 2) >> 2 as bytes: byte 0 = upper mean, byte 1 = lower mean */
   1.275 +        vdup.8          d1,  d0[1]  /* d1 = lower mean, written to rows 4-7 */
   1.276 +        vdup.8          d0,  d0[0]  /* d0 = upper mean, written to rows 0-3 */
   1.277 +        b               .L_pred8x8_dc_end  /* shared store loop, defined inside ff_pred8x8_dc_neon */
   1.278 +endfunc
   1.279 +
   1.280 +function ff_pred8x8_dc_neon, export=1  /* fill the 8x8 block at r0 (row step r1) per 4x4 quadrant from the 8 bytes above and the 8 bytes to the left; also owns the store loop the other 8x8 DC entry points branch to */
   1.281 +        sub             r2,  r0,  r1  /* r2 = row above the block */
   1.282 +        vld1.8          {d0},     [r2,:64]  /* d0 = top[0..7] */
   1.283 +        sub             r2,  r0,  #1  /* r2 = byte left of row 0 */
   1.284 +        ldcol.8         d1,  r2,  r1  /* d1 = left[0..7] */
   1.285 +        vtrn.32         d0,  d1  /* d0 = top[0..3], left[0..3]; d1 = top[4..7], left[4..7] */
   1.286 +        vpaddl.u8       q0,  q0  /* widened pair sums */
   1.287 +        vpadd.u16       d0,  d0,  d1  /* d0 = [T0, L0, T1, L1]: sums of top[0..3], left[0..3], top[4..7], left[4..7] */
   1.288 +        vpadd.u16       d1,  d0,  d0  /* d1 = [T0+L0, T1+L1, T0+L0, T1+L1] */
   1.289 +        vrshrn.u16      d2,  q0,  #3  /* rounded /8 as bytes: d2[4] = (T0+L0)/8, d2[5] = (T1+L1)/8 */
   1.290 +        vrshrn.u16      d3,  q0,  #2  /* rounded /4 as bytes: d3[2] = T1/4, d3[3] = L1/4 */
   1.291 +        vdup.8          d0,  d2[4]  /* top-left quadrant: mean of top[0..3] and left[0..3] */
   1.292 +        vdup.8          d1,  d3[3]  /* bottom-left quadrant: mean of left[4..7] */
   1.293 +        vdup.8          d4,  d3[2]  /* top-right quadrant: mean of top[4..7] */
   1.294 +        vdup.8          d5,  d2[5]  /* bottom-right quadrant: mean of top[4..7] and left[4..7] */
   1.295 +        vtrn.32         q0,  q2  /* d0 = top-left, top-right; d1 = bottom-left, bottom-right (4 bytes each) */
   1.296 +.L_pred8x8_dc_end:  /* shared tail: expects the row pattern for rows 0-3 in d0 and for rows 4-7 in d1, r0 = first row, r1 = row step */
   1.297 +        mov             r3,  #4  /* 4 iterations, one upper row and one lower row each */
   1.298 +        add             r2,  r0,  r1,  lsl #2  /* r2 = row 4 */
   1.299 +6:      vst1.8          {d0},     [r0,:64], r1
   1.300 +        vst1.8          {d1},     [r2,:64], r1
   1.301 +        subs            r3,  r3,  #1
   1.302 +        bne             6b
   1.303 +        bx              lr
   1.304 +endfunc
   1.305 +
   1.306 +function ff_pred8x8_l0t_dc_neon, export=1  /* per-quadrant DC fill of the 8x8 block at r0 (row step r1) using the 8 bytes above and only the left bytes of rows 0-3 */
   1.307 +        sub             r2,  r0,  r1  /* r2 = row above the block */
   1.308 +        vld1.8          {d0},     [r2,:64]  /* d0 = top[0..7] */
   1.309 +        sub             r2,  r0,  #1  /* r2 = byte left of row 0 */
   1.310 +        ldcol.8         d1,  r2,  r1,  4  /* lanes 0-3 of d1 = left[0..3]; lanes 4-7 are not loaded */
   1.311 +        vtrn.32         d0,  d1  /* d0 = top[0..3], left[0..3]; d1 = top[4..7], unloaded word */
   1.312 +        vpaddl.u8       q0,  q0  /* widened pair sums */
   1.313 +        vpadd.u16       d0,  d0,  d1  /* d0 = [T0, L0, T1, unused] */
   1.314 +        vpadd.u16       d1,  d0,  d0  /* d1[0] = T0 + L0 */
   1.315 +        vrshrn.u16      d2,  q0,  #3  /* rounded /8 as bytes: d2[4] = (T0+L0)/8 */
   1.316 +        vrshrn.u16      d3,  q0,  #2  /* rounded /4 as bytes: d3[0] = T0/4, d3[2] = T1/4 */
   1.317 +        vdup.8          d0,  d2[4]  /* top-left quadrant: mean of top[0..3] and left[0..3] */
   1.318 +        vdup.8          d1,  d3[0]  /* bottom-left quadrant: mean of top[0..3] */
   1.319 +        vdup.8          q2,  d3[2]  /* both right quadrants: mean of top[4..7] */
   1.320 +        vtrn.32         q0,  q2  /* d0 = top-left, top-right; d1 = bottom-left, bottom-right (4 bytes each) */
   1.321 +        b               .L_pred8x8_dc_end  /* shared store loop, defined inside ff_pred8x8_dc_neon */
   1.322 +endfunc
   1.323 +
   1.324 +function ff_pred8x8_l00_dc_neon, export=1  /* fill rows 0-3 of the 8x8 block at r0 (row step r1) with the rounded mean of the left bytes of rows 0-3, and rows 4-7 with 128 */
   1.325 +        sub             r2,  r0,  #1  /* r2 = byte left of row 0 */
   1.326 +        ldcol.8         d0,  r2,  r1,  4  /* lanes 0-3 of d0 = left[0..3] */
   1.327 +        vpaddl.u8       d0,  d0  /* widened pair sums; lanes 0-1 cover the loaded bytes */
   1.328 +        vpadd.u16       d0,  d0,  d0  /* lane 0 = sum of left[0..3] */
   1.329 +        vrshrn.u16      d0,  q0,  #2  /* byte 0 = (sum + 2) >> 2 */
   1.330 +        vmov.i8         d1,  #128  /* rows 4-7: constant 128 */
   1.331 +        vdup.8          d0,  d0[0]  /* rows 0-3: the mean in every byte */
   1.332 +        b               .L_pred8x8_dc_end  /* shared store loop, defined inside ff_pred8x8_dc_neon */
   1.333 +endfunc
   1.334 +
   1.335 +function ff_pred8x8_0lt_dc_neon, export=1  /* per-quadrant DC fill of the 8x8 block at r0 (row step r1) using the 8 bytes above and only the left bytes of rows 4-7 */
   1.336 +        sub             r2,  r0,  r1  /* r2 = row above the block */
   1.337 +        vld1.8          {d0},     [r2,:64]  /* d0 = top[0..7] */
   1.338 +        add             r2,  r0,  r1,  lsl #2  /* r2 = row 4 */
   1.339 +        sub             r2,  r2,  #1  /* r2 = byte left of row 4 */
   1.340 +        ldcol.8         d1,  r2,  r1,  4,  hi=1  /* lanes 4-7 of d1 = left[4..7]; lanes 0-3 are not loaded */
   1.341 +        vtrn.32         d0,  d1  /* d0 = top[0..3], unloaded word; d1 = top[4..7], left[4..7] */
   1.342 +        vpaddl.u8       q0,  q0  /* widened pair sums */
   1.343 +        vpadd.u16       d0,  d0,  d1  /* d0 = [T0, unused, T1, L1] */
   1.344 +        vpadd.u16       d1,  d0,  d0  /* d1[1] = T1 + L1 */
   1.345 +        vrshrn.u16      d3,  q0,  #2  /* rounded /4 as bytes: d3[0] = T0/4, d3[2] = T1/4, d3[3] = L1/4 */
   1.346 +        vrshrn.u16      d2,  q0,  #3  /* rounded /8 as bytes: d2[5] = (T1+L1)/8 */
   1.347 +        vdup.8          d0,  d3[0]  /* top-left quadrant: mean of top[0..3] */
   1.348 +        vdup.8          d1,  d3[3]  /* bottom-left quadrant: mean of left[4..7] */
   1.349 +        vdup.8          d4,  d3[2]  /* top-right quadrant: mean of top[4..7] */
   1.350 +        vdup.8          d5,  d2[5]  /* bottom-right quadrant: mean of top[4..7] and left[4..7] */
   1.351 +        vtrn.32         q0,  q2  /* d0 = top-left, top-right; d1 = bottom-left, bottom-right (4 bytes each) */
   1.352 +        b               .L_pred8x8_dc_end  /* shared store loop, defined inside ff_pred8x8_dc_neon */
   1.353 +endfunc
   1.354 +
   1.355 +function ff_pred8x8_0l0_dc_neon, export=1  /* fill rows 0-3 of the 8x8 block at r0 (row step r1) with 128, and rows 4-7 with the rounded mean of the left bytes of rows 4-7 */
   1.356 +        add             r2,  r0,  r1,  lsl #2  /* r2 = row 4 */
   1.357 +        sub             r2,  r2,  #1  /* r2 = byte left of row 4 */
   1.358 +        ldcol.8         d1,  r2,  r1,  4  /* lanes 0-3 of d1 = left[4..7] */
   1.359 +        vpaddl.u8       d2,  d1  /* widened pair sums; lanes 0-1 cover the loaded bytes */
   1.360 +        vpadd.u16       d2,  d2,  d2  /* lane 0 = sum of left[4..7] */
   1.361 +        vrshrn.u16      d1,  q1,  #2  /* byte 0 = (sum + 2) >> 2 */
   1.362 +        vmov.i8         d0,  #128  /* rows 0-3: constant 128 */
   1.363 +        vdup.8          d1,  d1[0]  /* rows 4-7: the mean in every byte */
   1.364 +        b               .L_pred8x8_dc_end  /* shared store loop, defined inside ff_pred8x8_dc_neon */
   1.365 +endfunc