diff libavcodec/arm/h264dsp_neon.S @ 2:897f711a7157

rearrange to work with autoconf
author Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date Tue, 25 Sep 2012 15:55:33 +0200
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/libavcodec/arm/h264dsp_neon.S	Tue Sep 25 15:55:33 2012 +0200
     1.3 @@ -0,0 +1,1883 @@
     1.4 +/*
     1.5 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
     1.6 + *
     1.7 + * This file is part of FFmpeg.
     1.8 + *
     1.9 + * FFmpeg is free software; you can redistribute it and/or
    1.10 + * modify it under the terms of the GNU Lesser General Public
    1.11 + * License as published by the Free Software Foundation; either
    1.12 + * version 2.1 of the License, or (at your option) any later version.
    1.13 + *
    1.14 + * FFmpeg is distributed in the hope that it will be useful,
    1.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    1.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    1.17 + * Lesser General Public License for more details.
    1.18 + *
    1.19 + * You should have received a copy of the GNU Lesser General Public
    1.20 + * License along with FFmpeg; if not, write to the Free Software
    1.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
    1.22 + */
    1.23 +
    1.24 +#include "asm.S"
    1.25 +
    1.26 +        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7    /* in-place transpose of the 8x8 byte matrix whose rows are the eight operands; with Q operands each 64-bit half is an independent matrix */
    1.27 +        vtrn.32         \r0, \r4                /* pass 1: exchange 32-bit elements between rows n and n+4 */
    1.28 +        vtrn.32         \r1, \r5
    1.29 +        vtrn.32         \r2, \r6
    1.30 +        vtrn.32         \r3, \r7
    1.31 +        vtrn.16         \r0, \r2                /* pass 2: exchange 16-bit elements between rows n and n+2 */
    1.32 +        vtrn.16         \r1, \r3
    1.33 +        vtrn.16         \r4, \r6
    1.34 +        vtrn.16         \r5, \r7
    1.35 +        vtrn.8          \r0, \r1                /* pass 3: exchange bytes between adjacent rows */
    1.36 +        vtrn.8          \r2, \r3
    1.37 +        vtrn.8          \r4, \r5
    1.38 +        vtrn.8          \r6, \r7
    1.39 +        .endm
    1.40 +
    1.41 +        .macro transpose_4x4 r0 r1 r2 r3        /* in-place transpose of the 4x4 byte tiles formed by the same 32-bit lane of the four operands */
    1.42 +        vtrn.16         \r0, \r2                /* exchange 16-bit elements between rows n and n+2 */
    1.43 +        vtrn.16         \r1, \r3
    1.44 +        vtrn.8          \r0, \r1                /* exchange bytes between adjacent rows */
    1.45 +        vtrn.8          \r2, \r3
    1.46 +        .endm
    1.47 +
    1.48 +        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7    /* exchange the contents of four register pairs: operand n with operand n+4 */
    1.49 +        vswp            \r0, \r4
    1.50 +        vswp            \r1, \r5
    1.51 +        vswp            \r2, \r6
    1.52 +        vswp            \r3, \r7
    1.53 +        .endm
    1.54 +
    1.55 +        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7  /* in-place 4x4 transpose of 16-bit elements, applied to two independent groups: operands 0..3 and operands 4..7 */
    1.56 +        vtrn.32         \r0, \r2                /* pass 1: exchange 32-bit elements between rows n and n+2 of each group */
    1.57 +        vtrn.32         \r1, \r3
    1.58 +        vtrn.32         \r4, \r6
    1.59 +        vtrn.32         \r5, \r7
    1.60 +        vtrn.16         \r0, \r1                /* pass 2: exchange 16-bit elements between adjacent rows */
    1.61 +        vtrn.16         \r2, \r3
    1.62 +        vtrn.16         \r4, \r5
    1.63 +        vtrn.16         \r6, \r7
    1.64 +        .endm
    1.65 +
    1.66 +/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
    1.67 +        .macro  h264_chroma_mc8 type            /* emits ff_<type>_h264_chroma_mc8_neon: 8-pixel-wide bilinear chroma interpolation; when type is avg the result is additionally averaged with the existing dst rows */
    1.68 +function ff_\type\()_h264_chroma_mc8_neon, export=1
    1.69 +        push            {r4-r7, lr}             /* 20 bytes pushed, so the stack arguments now start at sp+20 */
    1.70 +        ldrd            r4,  [sp, #20]          /* r4 = x, r5 = y (5th and 6th arguments) */
    1.71 +.ifc \type,avg
    1.72 +        mov             lr,  r0                 /* avg only: lr walks dst for the reads, r0 is kept for the stores */
    1.73 +.endif
    1.74 +        pld             [r1]
    1.75 +        pld             [r1, r2]
    1.76 +
    1.77 +        muls            r7,  r4,  r5            /* r7 = x*y; the flags set here are the ones tested by the beq below */
    1.78 +        rsb             r6,  r7,  r5,  lsl #3   /* r6 = 8*y - x*y = (8-x)*y */
    1.79 +        rsb             ip,  r7,  r4,  lsl #3   /* ip = 8*x - x*y = x*(8-y) */
    1.80 +        sub             r4,  r7,  r4,  lsl #3
    1.81 +        sub             r4,  r4,  r5,  lsl #3
    1.82 +        add             r4,  r4,  #64           /* r4 = x*y - 8*x - 8*y + 64 = (8-x)*(8-y) */
    1.83 +
    1.84 +        beq             2f                      /* x*y == 0: at most one direction needs interpolating */
    1.85 +
    1.86 +        add             r5,  r1,  r2            /* r5 = address of the second source row */
    1.87 +
    1.88 +        vdup.8          d0,  r4                 /* d0 = (8-x)*(8-y) in every byte: weight of the sample itself */
    1.89 +        lsl             r4,  r2,  #1            /* r4 = 2*stride: r1 and r5 each step two rows per load */
    1.90 +        vdup.8          d1,  ip                 /* d1 = x*(8-y): weight of the right neighbour */
    1.91 +        vld1.64         {d4, d5}, [r1], r4
    1.92 +        vdup.8          d2,  r6                 /* d2 = (8-x)*y: weight of the sample below */
    1.93 +        vld1.64         {d6, d7}, [r5], r4
    1.94 +        vdup.8          d3,  r7                 /* d3 = x*y: weight of the sample below and to the right */
    1.95 +
    1.96 +        vext.8          d5,  d4,  d5,  #1       /* d5 = row bytes 1..8, i.e. the right neighbours of d4 */
    1.97 +        vext.8          d7,  d6,  d7,  #1
    1.98 +
    1.99 +1:      pld             [r5]                    /* 2-D loop: two output rows per iteration */
   1.100 +        vmull.u8        q8,  d4,  d0            /* q8 accumulates output row 0 */
   1.101 +        vmlal.u8        q8,  d5,  d1
   1.102 +        vld1.64         {d4, d5}, [r1], r4      /* fetch the row two below; it is the lower row for output row 1 */
   1.103 +        vmlal.u8        q8,  d6,  d2
   1.104 +        vext.8          d5,  d4,  d5,  #1
   1.105 +        vmlal.u8        q8,  d7,  d3
   1.106 +        vmull.u8        q9,  d6,  d0            /* q9 accumulates output row 1 */
   1.107 +        subs            r3,  r3,  #2            /* r3 = rows left; no later instruction in the loop alters the flags before the bgt */
   1.108 +        vmlal.u8        q9,  d7,  d1
   1.109 +        vmlal.u8        q9,  d4,  d2
   1.110 +        vmlal.u8        q9,  d5,  d3
   1.111 +        vrshrn.u16      d16, q8,  #6            /* rounding shift: (sum + 32) >> 6, narrowed to bytes */
   1.112 +        vld1.64         {d6, d7}, [r5], r4
   1.113 +        pld             [r1]
   1.114 +        vrshrn.u16      d17, q9,  #6
   1.115 +.ifc \type,avg
   1.116 +        vld1.64         {d20}, [lr,:64], r2
   1.117 +        vld1.64         {d21}, [lr,:64], r2
   1.118 +        vrhadd.u8       q8,  q8,  q10           /* rounding average with the two dst rows just read */
   1.119 +.endif
   1.120 +        vext.8          d7,  d6,  d7,  #1
   1.121 +        vst1.64         {d16}, [r0,:64], r2
   1.122 +        vst1.64         {d17}, [r0,:64], r2
   1.123 +        bgt             1b
   1.124 +
   1.125 +        pop             {r4-r7, pc}
   1.126 +
   1.127 +2:      tst             r6,  r6                 /* x*y == 0 here, so r6 = 8*y: Z is set when y == 0 */
   1.128 +        add             ip,  ip,  r6            /* ip = 8*x + 8*y, where at least one term is zero on this path */
   1.129 +        vdup.8          d0,  r4                 /* d0 = 64 - 8*x - 8*y: weight of the unshifted sample */
   1.130 +        vdup.8          d1,  ip                 /* d1 = weight of the single neighbour */
   1.131 +
   1.132 +        beq             4f                      /* y == 0: horizontal-only path (flags still come from the tst) */
   1.133 +
   1.134 +        add             r5,  r1,  r2
   1.135 +        lsl             r4,  r2,  #1
   1.136 +        vld1.64         {d4}, [r1], r4
   1.137 +        vld1.64         {d6}, [r5], r4
   1.138 +
   1.139 +3:      pld             [r5]                    /* vertical-only loop: blend each row with the row below it */
   1.140 +        vmull.u8        q8,  d4,  d0
   1.141 +        vmlal.u8        q8,  d6,  d1
   1.142 +        vld1.64         {d4}, [r1], r4
   1.143 +        vmull.u8        q9,  d6,  d0
   1.144 +        vmlal.u8        q9,  d4,  d1
   1.145 +        vld1.64         {d6}, [r5], r4
   1.146 +        vrshrn.u16      d16, q8,  #6
   1.147 +        vrshrn.u16      d17, q9,  #6
   1.148 +.ifc \type,avg
   1.149 +        vld1.64         {d20}, [lr,:64], r2
   1.150 +        vld1.64         {d21}, [lr,:64], r2
   1.151 +        vrhadd.u8       q8,  q8,  q10
   1.152 +.endif
   1.153 +        subs            r3,  r3,  #2
   1.154 +        pld             [r1]
   1.155 +        vst1.64         {d16}, [r0,:64], r2
   1.156 +        vst1.64         {d17}, [r0,:64], r2
   1.157 +        bgt             3b
   1.158 +
   1.159 +        pop             {r4-r7, pc}
   1.160 +
   1.161 +4:      vld1.64         {d4, d5}, [r1], r2      /* horizontal-only path: r1 alone walks the source, one row per load */
   1.162 +        vld1.64         {d6, d7}, [r1], r2
   1.163 +        vext.8          d5,  d4,  d5,  #1
   1.164 +        vext.8          d7,  d6,  d7,  #1
   1.165 +
   1.166 +5:      pld             [r1]                    /* horizontal-only loop: blend each sample with its right neighbour */
   1.167 +        subs            r3,  r3,  #2
   1.168 +        vmull.u8        q8,  d4,  d0
   1.169 +        vmlal.u8        q8,  d5,  d1
   1.170 +        vld1.64         {d4, d5}, [r1], r2
   1.171 +        vmull.u8        q9,  d6,  d0
   1.172 +        vmlal.u8        q9,  d7,  d1
   1.173 +        pld             [r1]
   1.174 +        vext.8          d5,  d4,  d5,  #1
   1.175 +        vrshrn.u16      d16, q8,  #6
   1.176 +        vrshrn.u16      d17, q9,  #6
   1.177 +.ifc \type,avg
   1.178 +        vld1.64         {d20}, [lr,:64], r2
   1.179 +        vld1.64         {d21}, [lr,:64], r2
   1.180 +        vrhadd.u8       q8,  q8,  q10
   1.181 +.endif
   1.182 +        vld1.64         {d6, d7}, [r1], r2
   1.183 +        vext.8          d7,  d6,  d7,  #1
   1.184 +        vst1.64         {d16}, [r0,:64], r2
   1.185 +        vst1.64         {d17}, [r0,:64], r2
   1.186 +        bgt             5b
   1.187 +
   1.188 +        pop             {r4-r7, pc}
   1.189 +endfunc
   1.190 +        .endm
   1.191 +
   1.192 +/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
   1.193 +        .macro  h264_chroma_mc4 type            /* emits ff_<type>_h264_chroma_mc4_neon: 4-pixel-wide bilinear chroma interpolation; two 4-byte groups are packed per D register so one multiply covers two taps or two rows */
   1.194 +function ff_\type\()_h264_chroma_mc4_neon, export=1
   1.195 +        push            {r4-r7, lr}             /* 20 bytes pushed, so the stack arguments now start at sp+20 */
   1.196 +        ldrd            r4,  [sp, #20]          /* r4 = x, r5 = y (5th and 6th arguments) */
   1.197 +.ifc \type,avg
   1.198 +        mov             lr,  r0                 /* avg only: lr walks dst for the reads, r0 is kept for the stores */
   1.199 +.endif
   1.200 +        pld             [r1]
   1.201 +        pld             [r1, r2]
   1.202 +
   1.203 +        muls            r7,  r4,  r5            /* r7 = x*y; the flags set here are the ones tested by the beq below */
   1.204 +        rsb             r6,  r7,  r5,  lsl #3   /* r6 = (8-x)*y */
   1.205 +        rsb             ip,  r7,  r4,  lsl #3   /* ip = x*(8-y) */
   1.206 +        sub             r4,  r7,  r4,  lsl #3
   1.207 +        sub             r4,  r4,  r5,  lsl #3
   1.208 +        add             r4,  r4,  #64           /* r4 = (8-x)*(8-y) */
   1.209 +
   1.210 +        beq             2f                      /* x*y == 0: at most one direction needs interpolating */
   1.211 +
   1.212 +        add             r5,  r1,  r2            /* r5 = address of the second source row */
   1.213 +
   1.214 +        vdup.8          d0,  r4
   1.215 +        lsl             r4,  r2,  #1            /* r4 = 2*stride: r1 and r5 each step two rows per load */
   1.216 +        vdup.8          d1,  ip
   1.217 +        vld1.64         {d4},     [r1], r4
   1.218 +        vdup.8          d2,  r6
   1.219 +        vld1.64         {d6},     [r5], r4
   1.220 +        vdup.8          d3,  r7
   1.221 +
   1.222 +        vext.8          d5,  d4,  d5,  #1       /* d5 = row bytes 1..7 followed by one stale byte from the old d5 */
   1.223 +        vext.8          d7,  d6,  d7,  #1
   1.224 +        vtrn.32         d4,  d5                 /* d4 = { bytes 0..3, bytes 1..4 }: samples and their right neighbours side by side */
   1.225 +        vtrn.32         d6,  d7
   1.226 +
   1.227 +        vtrn.32         d0,  d1                 /* d0 = { (8-x)*(8-y) x4, x*(8-y) x4 }, matching the d4 layout */
   1.228 +        vtrn.32         d2,  d3                 /* d2 = { (8-x)*y x4, x*y x4 } */
   1.229 +
   1.230 +1:      pld             [r5]                    /* 2-D loop: two output rows per iteration */
   1.231 +        vmull.u8        q8,  d4,  d0            /* d16 = left-tap products, d17 = right-tap products for output row 0 */
   1.232 +        vmlal.u8        q8,  d6,  d2
   1.233 +        vld1.64         {d4},     [r1], r4
   1.234 +        vext.8          d5,  d4,  d5,  #1
   1.235 +        vtrn.32         d4,  d5
   1.236 +        vmull.u8        q9,  d6,  d0
   1.237 +        vmlal.u8        q9,  d4,  d2
   1.238 +        vld1.64         {d6},     [r5], r4
   1.239 +        vadd.i16        d16, d16, d17           /* add the two halves: four sums for output row 0 */
   1.240 +        vadd.i16        d17, d18, d19           /* four sums for output row 1 */
   1.241 +        vrshrn.u16      d16, q8,  #6            /* d16 = { row 0, row 1 } as rounded bytes */
   1.242 +        subs            r3,  r3,  #2            /* r3 = rows left; the flags survive untouched to the bgt */
   1.243 +        pld             [r1]
   1.244 +.ifc \type,avg
   1.245 +        vld1.32         {d20[0]}, [lr,:32], r2
   1.246 +        vld1.32         {d20[1]}, [lr,:32], r2
   1.247 +        vrhadd.u8       d16, d16, d20           /* rounding average with the two dst rows just read */
   1.248 +.endif
   1.249 +        vext.8          d7,  d6,  d7,  #1
   1.250 +        vtrn.32         d6,  d7
   1.251 +        vst1.32         {d16[0]}, [r0,:32], r2
   1.252 +        vst1.32         {d16[1]}, [r0,:32], r2
   1.253 +        bgt             1b
   1.254 +
   1.255 +        pop             {r4-r7, pc}
   1.256 +
   1.257 +2:      tst             r6,  r6                 /* x*y == 0 here, so r6 = 8*y: Z is set when y == 0 */
   1.258 +        add             ip,  ip,  r6            /* ip = 8*x + 8*y, where at least one term is zero on this path */
   1.259 +        vdup.8          d0,  r4
   1.260 +        vdup.8          d1,  ip
   1.261 +        vtrn.32         d0,  d1                 /* d0 = { r4 weight x4, ip weight x4 } */
   1.262 +
   1.263 +        beq             4f                      /* y == 0: horizontal-only path (flags still come from the tst) */
   1.264 +
   1.265 +        vext.32         d1,  d0,  d1,  #1       /* d1 = { ip weight x4, r4 weight x4 }: d0 with its halves swapped */
   1.266 +        add             r5,  r1,  r2
   1.267 +        lsl             r4,  r2,  #1
   1.268 +        vld1.32         {d4[0]},  [r1], r4      /* d4 = { even row, odd row } */
   1.269 +        vld1.32         {d4[1]},  [r5], r4
   1.270 +
   1.271 +3:      pld             [r5]                    /* vertical-only loop */
   1.272 +        vmull.u8        q8,  d4,  d0            /* upper row * r4 weight, lower row * ip weight */
   1.273 +        vld1.32         {d4[0]},  [r1], r4      /* replace the upper row by the row two below it */
   1.274 +        vmull.u8        q9,  d4,  d1            /* new row * ip weight, previous lower row * r4 weight */
   1.275 +        vld1.32         {d4[1]},  [r5], r4
   1.276 +        vadd.i16        d16, d16, d17
   1.277 +        vadd.i16        d17, d18, d19
   1.278 +        vrshrn.u16      d16, q8,  #6
   1.279 +.ifc \type,avg
   1.280 +        vld1.32         {d20[0]}, [lr,:32], r2
   1.281 +        vld1.32         {d20[1]}, [lr,:32], r2
   1.282 +        vrhadd.u8       d16, d16, d20
   1.283 +.endif
   1.284 +        subs            r3,  r3,  #2
   1.285 +        pld             [r1]
   1.286 +        vst1.32         {d16[0]}, [r0,:32], r2
   1.287 +        vst1.32         {d16[1]}, [r0,:32], r2
   1.288 +        bgt             3b
   1.289 +
   1.290 +        pop             {r4-r7, pc}
   1.291 +
   1.292 +4:      vld1.64         {d4},     [r1], r2      /* horizontal-only path: r1 alone walks the source, one row per load */
   1.293 +        vld1.64         {d6},     [r1], r2
   1.294 +        vext.8          d5,  d4,  d5,  #1
   1.295 +        vext.8          d7,  d6,  d7,  #1
   1.296 +        vtrn.32         d4,  d5                 /* d4 = { bytes 0..3, bytes 1..4 } */
   1.297 +        vtrn.32         d6,  d7
   1.298 +
   1.299 +5:      vmull.u8        q8,  d4,  d0            /* horizontal-only loop: sample * r4 weight, right neighbour * ip weight */
   1.300 +        vmull.u8        q9,  d6,  d0
   1.301 +        subs            r3,  r3,  #2
   1.302 +        vld1.64         {d4},     [r1], r2
   1.303 +        vext.8          d5,  d4,  d5,  #1
   1.304 +        vtrn.32         d4,  d5
   1.305 +        vadd.i16        d16, d16, d17
   1.306 +        vadd.i16        d17, d18, d19
   1.307 +        pld             [r1]
   1.308 +        vrshrn.u16      d16, q8,  #6
   1.309 +.ifc \type,avg
   1.310 +        vld1.32         {d20[0]}, [lr,:32], r2
   1.311 +        vld1.32         {d20[1]}, [lr,:32], r2
   1.312 +        vrhadd.u8       d16, d16, d20
   1.313 +.endif
   1.314 +        vld1.64         {d6},     [r1], r2
   1.315 +        vext.8          d7,  d6,  d7,  #1
   1.316 +        vtrn.32         d6,  d7
   1.317 +        pld             [r1]
   1.318 +        vst1.32         {d16[0]}, [r0,:32], r2
   1.319 +        vst1.32         {d16[1]}, [r0,:32], r2
   1.320 +        bgt             5b
   1.321 +
   1.322 +        pop             {r4-r7, pc}
   1.323 +endfunc
   1.324 +        .endm
   1.325 +
   1.326 +        .macro  h264_chroma_mc2 type            /* emits ff_<type>_h264_chroma_mc2_neon: 2-pixel-wide variant producing two rows per loop iteration; the body distinguishes type put from any other value */
   1.327 +function ff_\type\()_h264_chroma_mc2_neon, export=1
   1.328 +        push            {r4-r6, lr}             /* 16 bytes pushed, so the stack arguments now start at sp+16 */
   1.329 +        ldr             r4,  [sp, #16]          /* r4 = 5th argument; presumably x as in the chroma_mc8/mc4 prototypes -- TODO confirm */
   1.330 +        ldr             lr,  [sp, #20]          /* lr = 6th argument; presumably y -- TODO confirm */
   1.331 +        pld             [r1]
   1.332 +        pld             [r1, r2]
   1.333 +        orrs            r5,  r4,  lr
   1.334 +        beq             2f                      /* both offsets are zero: plain copy (put) or average (otherwise) at label 2 */
   1.335 +
   1.336 +        mul             r5,  r4,  lr            /* r5 = x*y */
   1.337 +        rsb             r6,  r5,  lr,  lsl #3   /* r6 = (8-x)*y */
   1.338 +        rsb             r12, r5,  r4,  lsl #3   /* r12 = x*(8-y) */
   1.339 +        sub             r4,  r5,  r4,  lsl #3
   1.340 +        sub             r4,  r4,  lr,  lsl #3
   1.341 +        add             r4,  r4,  #64           /* r4 = (8-x)*(8-y) */
   1.342 +        vdup.8          d0,  r4
   1.343 +        vdup.8          d2,  r12
   1.344 +        vdup.8          d1,  r6
   1.345 +        vdup.8          d3,  r5
   1.346 +        vtrn.16         q0,  q1                 /* interleave byte pairs: d0 = r4,r4,r12,r12 repeated, d1 = r6,r6,r5,r5 repeated */
   1.347 +1:
   1.348 +        vld1.32         {d4[0]},  [r1], r2      /* d4 = { row 0, row 1 }, four bytes each */
   1.349 +        vld1.32         {d4[1]},  [r1], r2
   1.350 +        vrev64.32       d5,  d4                 /* d5 = { row 1, row 0 } */
   1.351 +        vld1.32         {d5[1]},  [r1]          /* d5 = { row 1, row 2 }; no writeback, so row 2 is re-read as row 0 of the next iteration */
   1.352 +        vext.8          q3,  q2,  q2,  #1       /* q3 = q2 rotated by one byte: the right neighbours */
   1.353 +        vtrn.16         q2,  q3                 /* every 32-bit lane now holds a byte pair followed by the pair one byte to the right, matching the weight layout */
   1.354 +        vmull.u8        q8,  d4,  d0            /* upper rows times the r4/r12 weights */
   1.355 +        vmlal.u8        q8,  d5,  d1            /* plus lower rows times the r6/r5 weights */
   1.356 +.ifc \type,avg
   1.357 +        vld1.16         {d18[0]}, [r0,:16], r2
   1.358 +        vld1.16         {d18[1]}, [r0,:16]
   1.359 +        sub             r0,  r0,  r2            /* rewind r0 so the stores below hit the same two rows */
   1.360 +.endif
   1.361 +        vtrn.32         d16, d17                /* gather the left-tap sums in d16 and the right-tap sums in d17 */
   1.362 +        vadd.i16        d16, d16, d17           /* 16-bit lanes 0,1 = output row 0; lanes 2,3 = output row 1 */
   1.363 +        vrshrn.u16      d16, q8,  #6            /* rounding shift: (sum + 32) >> 6, narrowed to bytes */
   1.364 +.ifc \type,avg
   1.365 +        vrhadd.u8       d16, d16, d18
   1.366 +.endif
   1.367 +        vst1.16         {d16[0]}, [r0,:16], r2
   1.368 +        vst1.16         {d16[1]}, [r0,:16], r2
   1.369 +        subs            r3,  r3,  #2
   1.370 +        bgt             1b
   1.371 +        pop             {r4-r6, pc}
   1.372 +2:
   1.373 +.ifc \type,put
   1.374 +        ldrh            r5,  [r1], r2           /* put with zero offsets: copy two rows of two bytes */
   1.375 +        strh            r5,  [r0], r2
   1.376 +        ldrh            r6,  [r1], r2
   1.377 +        strh            r6,  [r0], r2
   1.378 +.else
   1.379 +        vld1.16         {d16[0]}, [r1], r2      /* otherwise: rounding average of two source rows with the two dst rows */
   1.380 +        vld1.16         {d16[1]}, [r1], r2
   1.381 +        vld1.16         {d18[0]}, [r0,:16], r2
   1.382 +        vld1.16         {d18[1]}, [r0,:16]
   1.383 +        sub             r0,  r0,  r2            /* rewind r0 to the first of the two rows */
   1.384 +        vrhadd.u8       d16, d16, d18
   1.385 +        vst1.16         {d16[0]}, [r0,:16], r2
   1.386 +        vst1.16         {d16[1]}, [r0,:16], r2
   1.387 +.endif
   1.388 +        subs            r3,  r3,  #2
   1.389 +        bgt             2b
   1.390 +        pop             {r4-r6, pc}
   1.391 +endfunc
   1.392 +.endm
   1.393 +
   1.394 +        .text                                   /* the functions emitted by the macro invocations below are assembled into the text section */
   1.395 +        .align                                  /* each chroma MC macro is expanded twice below, once with type put and once with type avg */
   1.396 +
   1.397 +        h264_chroma_mc8 put
   1.398 +        h264_chroma_mc8 avg
   1.399 +        h264_chroma_mc4 put
   1.400 +        h264_chroma_mc4 avg
   1.401 +        h264_chroma_mc2 put
   1.402 +        h264_chroma_mc2 avg
   1.403 +
   1.404 +        /* H.264 loop filter */
   1.405 +
   1.406 +        .macro h264_loop_filter_start           /* shared entry code of the loop filters: early-outs plus loading of the four per-edge bytes into d24[0]; must run before anything is pushed, since it reads [sp] */
   1.407 +        ldr             ip,  [sp]               /* ip = first stack argument, a pointer (dereferenced two lines below) */
   1.408 +        tst             r2,  r2                 /* Z set when r2 (alpha) is zero */
   1.409 +        ldr             ip,  [ip]               /* ip = the four bytes it points to; the filter macros use them as tc0 values */
   1.410 +        tstne           r3,  r3                 /* only when alpha != 0: Z set when r3 (beta) is zero */
   1.411 +        vmov.32         d24[0], ip              /* hand the four bytes to the NEON filter macro */
   1.412 +        and             ip,  ip,  ip, lsl #16   /* no flag update here, so Z still reflects the alpha/beta tests */
   1.413 +        bxeq            lr                      /* alpha == 0 or beta == 0: return without filtering */
   1.414 +        ands            ip,  ip,  ip, lsl #8    /* top byte of ip = AND of all four bytes; N = its sign bit */
   1.415 +        bxlt            lr                      /* sign bit set in all four bytes: return without filtering */
   1.416 +        .endm
   1.417 +
   1.418 +        .macro align_push_regs                  /* save d8-d15 below a 16-byte-aligned sp; ip keeps the first adjustment and must stay intact until align_pop_regs */
   1.419 +        and             ip,  sp,  #15           /* ip = misalignment of sp */
   1.420 +        add             ip,  ip,  #32           /* plus room for four D registers */
   1.421 +        sub             sp,  sp,  ip            /* sp is now 16-byte aligned */
   1.422 +        vst1.64         {d12-d15}, [sp,:128]
   1.423 +        sub             sp,  sp,  #32
   1.424 +        vst1.64         {d8-d11},  [sp,:128]
   1.425 +        .endm
   1.426 +
   1.427 +        .macro align_pop_regs                   /* inverse of align_push_regs: reload d8-d15 and restore sp using the adjustment still held in ip */
   1.428 +        vld1.64         {d8-d11},  [sp,:128]!   /* writeback advances sp by 32 */
   1.429 +        vld1.64         {d12-d15}, [sp,:128], ip        /* post-increment by ip undoes the aligned adjustment, giving back the original sp */
   1.430 +        .endm
   1.431 +
   1.432 +        .macro h264_loop_filter_luma            /* in: q8/q9/q10 = p0/p1/p2, q0/q1/q2 = q0/q1/q2 samples, r2 = alpha, r3 = beta, d24[0] = four tc0 bytes; out: q4 = p1, q8 = p0, q0 = q0, q5 = q1; overwrites q4-q7, so callers save d8-d15 first */
   1.433 +        vdup.8          q11, r2         @ alpha
   1.434 +        vmovl.u8        q12, d24                /* first of four steps that spread each tc0 byte over four lanes */
   1.435 +        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
   1.436 +        vmovl.u16       q12, d24
   1.437 +        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
   1.438 +        vsli.16         q12, q12, #8
   1.439 +        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
   1.440 +        vsli.32         q12, q12, #16           /* q12 = each tc0 byte repeated in four consecutive lanes */
   1.441 +        vclt.u8         q6,  q6,  q11   @ < alpha
   1.442 +        vdup.8          q11, r3         @ beta
   1.443 +        vclt.s8         q7,  q12, #0            /* q7 = lanes whose tc0 is negative */
   1.444 +        vclt.u8         q14, q14, q11   @ < beta
   1.445 +        vclt.u8         q15, q15, q11   @ < beta
   1.446 +        vbic            q6,  q6,  q7            /* negative tc0 removes the lane from the filter mask */
   1.447 +        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
   1.448 +        vand            q6,  q6,  q14
   1.449 +        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
   1.450 +        vclt.u8         q4,  q4,  q11   @ < beta
   1.451 +        vand            q6,  q6,  q15           /* q6 = final filter mask */
   1.452 +        vclt.u8         q5,  q5,  q11   @ < beta
   1.453 +        vand            q4,  q4,  q6            /* q4 = lanes where p1 is modified as well */
   1.454 +        vand            q5,  q5,  q6            /* q5 = lanes where q1 is modified as well */
   1.455 +        vand            q12, q12, q6            /* tc0 forced to zero outside the mask */
   1.456 +        vrhadd.u8       q14, q8,  q0            /* q14 = (p0 + q0 + 1) >> 1 */
   1.457 +        vsub.i8         q6,  q12, q4            /* tc = tc0 + 1 where q4 is set (subtracting an all-ones byte adds one) */
   1.458 +        vqadd.u8        q7,  q9,  q12           /* upper clip bound p1 + tc0 */
   1.459 +        vhadd.u8        q10, q10, q14           /* candidate p1 = (p2 + q14) >> 1 */
   1.460 +        vsub.i8         q6,  q6,  q5            /* tc gains another 1 where q5 is set */
   1.461 +        vhadd.u8        q14, q2,  q14           /* candidate q1 = (q2 + q14) >> 1 */
   1.462 +        vmin.u8         q7,  q7,  q10
   1.463 +        vqsub.u8        q11, q9,  q12           /* lower clip bound p1 - tc0 */
   1.464 +        vqadd.u8        q2,  q1,  q12
   1.465 +        vmax.u8         q7,  q7,  q11           /* q7 = candidate p1 clipped to p1 +/- tc0 */
   1.466 +        vqsub.u8        q11, q1,  q12
   1.467 +        vmin.u8         q14, q2,  q14
   1.468 +        vmovl.u8        q2,  d0                 /* from here q2 (low 8 lanes) and q10 (high 8 lanes) build the 16-bit delta */
   1.469 +        vmax.u8         q14, q14, q11           /* q14 = candidate q1 clipped to q1 +/- tc0 */
   1.470 +        vmovl.u8        q10, d1
   1.471 +        vsubw.u8        q2,  q2,  d16           /* q0 - p0 */
   1.472 +        vsubw.u8        q10, q10, d17
   1.473 +        vshl.i16        q2,  q2,  #2            /* 4*(q0 - p0) */
   1.474 +        vshl.i16        q10, q10, #2
   1.475 +        vaddw.u8        q2,  q2,  d18           /* + p1 */
   1.476 +        vaddw.u8        q10, q10, d19
   1.477 +        vsubw.u8        q2,  q2,  d2            /* - q1 */
   1.478 +        vsubw.u8        q10, q10, d3
   1.479 +        vrshrn.i16      d4,  q2,  #3            /* delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3, narrowed to bytes in q2 */
   1.480 +        vrshrn.i16      d5,  q10, #3
   1.481 +        vbsl            q4,  q7,  q9            /* q4 = clipped p1 where selected, original p1 elsewhere */
   1.482 +        vbsl            q5,  q14, q1            /* q5 = clipped q1 where selected, original q1 elsewhere */
   1.483 +        vneg.s8         q7,  q6                 /* q7 = -tc */
   1.484 +        vmovl.u8        q14, d16
   1.485 +        vmin.s8         q2,  q2,  q6            /* clamp delta to at most tc */
   1.486 +        vmovl.u8        q6,  d17
   1.487 +        vmax.s8         q2,  q2,  q7            /* and to at least -tc */
   1.488 +        vmovl.u8        q11, d0
   1.489 +        vmovl.u8        q12, d1
   1.490 +        vaddw.s8        q14, q14, d4            /* p0 + delta in 16 bits */
   1.491 +        vaddw.s8        q6,  q6,  d5
   1.492 +        vsubw.s8        q11, q11, d4            /* q0 - delta in 16 bits */
   1.493 +        vsubw.s8        q12, q12, d5
   1.494 +        vqmovun.s16     d16, q14                /* saturate back to unsigned bytes: q8 = new p0 */
   1.495 +        vqmovun.s16     d17, q6
   1.496 +        vqmovun.s16     d0,  q11                /* q0 = new q0 */
   1.497 +        vqmovun.s16     d1,  q12
   1.498 +        .endm
   1.499 +
   1.500 +function ff_h264_v_loop_filter_luma_neon, export=1
   1.501 +        h264_loop_filter_start                  /* luma filter across a horizontal edge, 16 pixels wide: r0 = first row below the edge, r1 = row stride; may return early */
   1.502 +
   1.503 +        vld1.64         {d0, d1},  [r0,:128], r1        /* q0, q1, q2 = the three rows starting at r0 */
   1.504 +        vld1.64         {d2, d3},  [r0,:128], r1
   1.505 +        vld1.64         {d4, d5},  [r0,:128], r1
   1.506 +        sub             r0,  r0,  r1, lsl #2    /* step back 4 + 2 = 6 rows, to three rows above the edge */
   1.507 +        sub             r0,  r0,  r1, lsl #1
   1.508 +        vld1.64         {d20,d21}, [r0,:128], r1        /* q10 = p2 */
   1.509 +        vld1.64         {d18,d19}, [r0,:128], r1        /* q9 = p1 */
   1.510 +        vld1.64         {d16,d17}, [r0,:128], r1        /* q8 = p0; r0 is back at the first row below the edge */
   1.511 +
   1.512 +        align_push_regs                         /* the filter macro overwrites q4-q7 (d8-d15) */
   1.513 +
   1.514 +        h264_loop_filter_luma
   1.515 +
   1.516 +        sub             r0,  r0,  r1, lsl #1    /* r0 = the p1 row */
   1.517 +        vst1.64         {d8, d9},  [r0,:128], r1        /* write p1, p0, q0, q1 in that order */
   1.518 +        vst1.64         {d16,d17}, [r0,:128], r1
   1.519 +        vst1.64         {d0, d1},  [r0,:128], r1
   1.520 +        vst1.64         {d10,d11}, [r0,:128]
   1.521 +
   1.522 +        align_pop_regs
   1.523 +        bx              lr
   1.524 +endfunc
   1.525 +
   1.526 +function ff_h264_h_loop_filter_luma_neon, export=1
   1.527 +        h264_loop_filter_start                  /* luma filter across a vertical edge, 16 rows tall: r0 = first pixel right of the edge, r1 = row stride; may return early */
   1.528 +
   1.529 +        sub             r0,  r0,  #4            /* start four pixels left of the edge */
   1.530 +        vld1.64         {d6},  [r0], r1         /* rows 0..7 go to the low halves of q3, q10, q9, q8, q0, q1, q2, q13 */
   1.531 +        vld1.64         {d20}, [r0], r1
   1.532 +        vld1.64         {d18}, [r0], r1
   1.533 +        vld1.64         {d16}, [r0], r1
   1.534 +        vld1.64         {d0},  [r0], r1
   1.535 +        vld1.64         {d2},  [r0], r1
   1.536 +        vld1.64         {d4},  [r0], r1
   1.537 +        vld1.64         {d26}, [r0], r1
   1.538 +        vld1.64         {d7},  [r0], r1         /* rows 8..15 go to the high halves of the same registers */
   1.539 +        vld1.64         {d21}, [r0], r1
   1.540 +        vld1.64         {d19}, [r0], r1
   1.541 +        vld1.64         {d17}, [r0], r1
   1.542 +        vld1.64         {d1},  [r0], r1
   1.543 +        vld1.64         {d3},  [r0], r1
   1.544 +        vld1.64         {d5},  [r0], r1
   1.545 +        vld1.64         {d27}, [r0], r1
   1.546 +
   1.547 +        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13        /* each register now holds one pixel column for all 16 rows: q10/q9/q8 = p2/p1/p0, q0/q1/q2 = q0/q1/q2 */
   1.548 +
   1.549 +        align_push_regs                         /* the filter macro overwrites q4-q7 (d8-d15) */
   1.550 +
   1.551 +        h264_loop_filter_luma
   1.552 +
   1.553 +        transpose_4x4   q4, q8, q0, q5          /* turn the four output columns p1, p0, q0, q1 back into 4-byte row pieces */
   1.554 +
   1.555 +        sub             r0,  r0,  r1, lsl #4    /* rewind the 16 rows stepped over by the loads */
   1.556 +        add             r0,  r0,  #2            /* r0 = two pixels left of the edge, where p1 lives */
   1.557 +        vst1.32         {d8[0]},  [r0], r1      /* four bytes per row; the register order follows the 4x4 transpose layout */
   1.558 +        vst1.32         {d16[0]}, [r0], r1
   1.559 +        vst1.32         {d0[0]},  [r0], r1
   1.560 +        vst1.32         {d10[0]}, [r0], r1
   1.561 +        vst1.32         {d8[1]},  [r0], r1
   1.562 +        vst1.32         {d16[1]}, [r0], r1
   1.563 +        vst1.32         {d0[1]},  [r0], r1
   1.564 +        vst1.32         {d10[1]}, [r0], r1
   1.565 +        vst1.32         {d9[0]},  [r0], r1
   1.566 +        vst1.32         {d17[0]}, [r0], r1
   1.567 +        vst1.32         {d1[0]},  [r0], r1
   1.568 +        vst1.32         {d11[0]}, [r0], r1
   1.569 +        vst1.32         {d9[1]},  [r0], r1
   1.570 +        vst1.32         {d17[1]}, [r0], r1
   1.571 +        vst1.32         {d1[1]},  [r0], r1
   1.572 +        vst1.32         {d11[1]}, [r0], r1
   1.573 +
   1.574 +        align_pop_regs
   1.575 +        bx              lr
   1.576 +endfunc
   1.577 +
   1.578 +        .macro h264_loop_filter_chroma          /* in: d18 = p1, d16 = p0, d0 = q0, d2 = q1, r2 = alpha, r3 = beta, d24[0] = four tc0 bytes; out: d16 = p0, d0 = q0; uses only q0-q2 and q11-q15 */
   1.579 +        vdup.8          d22, r2         @ alpha
   1.580 +        vmovl.u8        q12, d24                /* widen the tc0 bytes to 16-bit lanes */
   1.581 +        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
   1.582 +        vmovl.u8        q2,  d0                 /* q2 starts the 16-bit delta computation with q0 */
   1.583 +        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
   1.584 +        vsubw.u8        q2,  q2,  d16           /* q0 - p0 */
   1.585 +        vsli.16         d24, d24, #8            /* d24 = each tc0 byte repeated in two consecutive lanes */
   1.586 +        vshl.i16        q2,  q2,  #2            /* 4*(q0 - p0) */
   1.587 +        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
   1.588 +        vaddw.u8        q2,  q2,  d18           /* + p1 */
   1.589 +        vclt.u8         d26, d26, d22   @ < alpha
   1.590 +        vsubw.u8        q2,  q2,  d2            /* - q1 */
   1.591 +        vdup.8          d22, r3         @ beta
   1.592 +        vclt.s8         d25, d24, #0            /* d25 = lanes whose tc0 is negative */
   1.593 +        vrshrn.i16      d4,  q2,  #3            /* delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3, narrowed to bytes */
   1.594 +        vclt.u8         d28, d28, d22   @ < beta
   1.595 +        vbic            d26, d26, d25           /* negative tc0 removes the lane from the filter mask */
   1.596 +        vclt.u8         d30, d30, d22   @ < beta
   1.597 +        vand            d26, d26, d28
   1.598 +        vneg.s8         d25, d24                /* d25 = -tc0 */
   1.599 +        vand            d26, d26, d30           /* d26 = final filter mask */
   1.600 +        vmin.s8         d4,  d4,  d24           /* clamp delta to at most tc0 */
   1.601 +        vmovl.u8        q14, d16
   1.602 +        vand            d4,  d4,  d26           /* zero delta outside the mask */
   1.603 +        vmax.s8         d4,  d4,  d25           /* clamp delta to at least -tc0 */
   1.604 +        vmovl.u8        q11, d0
   1.605 +        vaddw.s8        q14, q14, d4            /* p0 + delta */
   1.606 +        vsubw.s8        q11, q11, d4            /* q0 - delta */
   1.607 +        vqmovun.s16     d16, q14                /* saturate back to unsigned bytes */
   1.608 +        vqmovun.s16     d0,  q11
   1.609 +        .endm
   1.610 +
   1.611 +function ff_h264_v_loop_filter_chroma_neon, export=1
   1.612 +        h264_loop_filter_start                  /* chroma filter across a horizontal edge, 8 pixels wide: r0 = first row below the edge, r1 = row stride; may return early */
   1.613 +
   1.614 +        sub             r0,  r0,  r1, lsl #1    /* start two rows above the edge */
   1.615 +        vld1.64         {d18}, [r0,:64], r1     /* d18 = p1 */
   1.616 +        vld1.64         {d16}, [r0,:64], r1     /* d16 = p0 */
   1.617 +        vld1.64         {d0},  [r0,:64], r1     /* d0 = q0 */
   1.618 +        vld1.64         {d2},  [r0,:64]         /* d2 = q1; no writeback, r0 stays on the q1 row */
   1.619 +
   1.620 +        h264_loop_filter_chroma
   1.621 +
   1.622 +        sub             r0,  r0,  r1, lsl #1    /* back from the q1 row to the p0 row */
   1.623 +        vst1.64         {d16}, [r0,:64], r1     /* only p0 and q0 are written back */
   1.624 +        vst1.64         {d0},  [r0,:64], r1
   1.625 +
   1.626 +        bx              lr
   1.627 +endfunc
   1.628 +
   1.629 +function ff_h264_h_loop_filter_chroma_neon, export=1
   1.630 +        h264_loop_filter_start                  /* chroma filter across a vertical edge, 8 rows tall: r0 = first pixel right of the edge, r1 = row stride; may return early */
   1.631 +
   1.632 +        sub             r0,  r0,  #2            /* start two pixels left of the edge */
   1.633 +        vld1.32         {d18[0]}, [r0], r1      /* four bytes per row: rows 0..3 fill lane 0 of d18, d16, d0, d2 */
   1.634 +        vld1.32         {d16[0]}, [r0], r1
   1.635 +        vld1.32         {d0[0]},  [r0], r1
   1.636 +        vld1.32         {d2[0]},  [r0], r1
   1.637 +        vld1.32         {d18[1]}, [r0], r1      /* rows 4..7 fill lane 1 */
   1.638 +        vld1.32         {d16[1]}, [r0], r1
   1.639 +        vld1.32         {d0[1]},  [r0], r1
   1.640 +        vld1.32         {d2[1]},  [r0], r1
   1.641 +
   1.642 +        vtrn.16         d18, d0                 /* 4x4 byte transposes: afterwards d18/d16/d0/d2 hold the p1/p0/q0/q1 columns */
   1.643 +        vtrn.16         d16, d2
   1.644 +        vtrn.8          d18, d16
   1.645 +        vtrn.8          d0,  d2
   1.646 +
   1.647 +        h264_loop_filter_chroma
   1.648 +
   1.649 +        vtrn.16         d18, d0                 /* the same transpose again restores the row layout used by the loads */
   1.650 +        vtrn.16         d16, d2
   1.651 +        vtrn.8          d18, d16
   1.652 +        vtrn.8          d0,  d2
   1.653 +
   1.654 +        sub             r0,  r0,  r1, lsl #3    /* rewind the 8 rows stepped over by the loads */
   1.655 +        vst1.32         {d18[0]}, [r0], r1      /* store in the same register and lane order as the loads */
   1.656 +        vst1.32         {d16[0]}, [r0], r1
   1.657 +        vst1.32         {d0[0]},  [r0], r1
   1.658 +        vst1.32         {d2[0]},  [r0], r1
   1.659 +        vst1.32         {d18[1]}, [r0], r1
   1.660 +        vst1.32         {d16[1]}, [r0], r1
   1.661 +        vst1.32         {d0[1]},  [r0], r1
   1.662 +        vst1.32         {d2[1]},  [r0], r1
   1.663 +
   1.664 +        bx              lr
   1.665 +endfunc
   1.666 +
   1.667 +        /* H.264 qpel MC */
   1.668 +
   1.669 +        .macro  lowpass_const r                 /* load the two filter coefficients used by the lowpass macros into d6, using the given core register as scratch */
   1.670 +        movw            \r,  #5                 /* low halfword = 5 */
   1.671 +        movt            \r,  #20                /* high halfword = 20 */
   1.672 +        vmov.32         d6[0], \r               /* as 16-bit lanes: d6[0] = 5 and d6[1] = 20 */
   1.673 +        .endm
   1.674 +
   1.675 +        .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
   1.676 +.if \narrow
   1.677 +        t0 .req q0
   1.678 +        t1 .req q8
   1.679 +.else
   1.680 +        t0 .req \d0
   1.681 +        t1 .req \d1
   1.682 +.endif
   1.683 +        vext.8          d2,  \r0, \r1, #2       /* 6-tap filter on two rows of 8 outputs; the first row is the operand pair 1:2 (16 consecutive bytes), the second row is the pair 3:4; needs d6 from lowpass_const; overwrites q1, q2, q9, q10, d30, d31 and, when narrowing, q0 and q8 */
   1.684 +        vext.8          d3,  \r0, \r1, #3
   1.685 +        vaddl.u8        q1,  d2,  d3            /* q1 = s[2] + s[3], the centre taps */
   1.686 +        vext.8          d4,  \r0, \r1, #1
   1.687 +        vext.8          d5,  \r0, \r1, #4
   1.688 +        vaddl.u8        q2,  d4,  d5            /* q2 = s[1] + s[4] */
   1.689 +        vext.8          d30, \r0, \r1, #5
   1.690 +        vaddl.u8        t0,  \r0, d30           /* t0 = s[0] + s[5], the outer taps */
   1.691 +        vext.8          d18, \r2, \r3, #2       /* the second row is interleaved with the first from here on */
   1.692 +        vmla.i16        t0,  q1,  d6[1]         /* + 20 * (s[2] + s[3]) */
   1.693 +        vext.8          d19, \r2, \r3, #3
   1.694 +        vaddl.u8        q9,  d18, d19
   1.695 +        vext.8          d20, \r2, \r3, #1
   1.696 +        vmls.i16        t0,  q2,  d6[0]         /* - 5 * (s[1] + s[4]) */
   1.697 +        vext.8          d21, \r2, \r3, #4
   1.698 +        vaddl.u8        q10, d20, d21
   1.699 +        vext.8          d31, \r2, \r3, #5
   1.700 +        vaddl.u8        t1,  \r2, d31
   1.701 +        vmla.i16        t1,  q9,  d6[1]
   1.702 +        vmls.i16        t1,  q10, d6[0]
   1.703 +.if \narrow
   1.704 +        vqrshrun.s16    \d0, t0,  #5            /* narrowing mode: (sum + 16) >> 5 saturated to unsigned bytes; otherwise the 16-bit sums stay in the two destination operands */
   1.705 +        vqrshrun.s16    \d1, t1,  #5
   1.706 +.endif
   1.707 +        .unreq  t0
   1.708 +        .unreq  t1
   1.709 +        .endm
   1.710 +
   1.711 +        .macro  lowpass_8_1 r0, r1, d0, narrow=1
   1.712 +.if \narrow
   1.713 +        t0 .req q0
   1.714 +.else
   1.715 +        t0 .req \d0
   1.716 +.endif
   1.717 +        vext.8          d2,  \r0, \r1, #2       /* single-row form of the 6-tap filter: the first two operands are 16 consecutive source bytes, 8 outputs are produced; needs d6 from lowpass_const; overwrites q1, q2, d30 and, when narrowing, q0 */
   1.718 +        vext.8          d3,  \r0, \r1, #3
   1.719 +        vaddl.u8        q1,  d2,  d3            /* q1 = s[2] + s[3] */
   1.720 +        vext.8          d4,  \r0, \r1, #1
   1.721 +        vext.8          d5,  \r0, \r1, #4
   1.722 +        vaddl.u8        q2,  d4,  d5            /* q2 = s[1] + s[4] */
   1.723 +        vext.8          d30, \r0, \r1, #5
   1.724 +        vaddl.u8        t0,  \r0, d30           /* t0 = s[0] + s[5] */
   1.725 +        vmla.i16        t0,  q1,  d6[1]         /* + 20 * (s[2] + s[3]) */
   1.726 +        vmls.i16        t0,  q2,  d6[0]         /* - 5 * (s[1] + s[4]) */
   1.727 +.if \narrow
   1.728 +        vqrshrun.s16    \d0, t0,  #5            /* narrowing mode: (sum + 16) >> 5 saturated to unsigned bytes */
   1.729 +.endif
   1.730 +        .unreq  t0
   1.731 +        .endm
   1.732 +
   1.733 +        .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
   1.734 +        vext.16         q1,  \r0, \r1, #2       /* 6-tap filter over signed 16-bit inputs held in the first two (Q) operands, accumulated in 32 bits and written as 8 unsigned bytes to the last operand; overwrites q0-q3, q8-q10, q15 and the second operand */
   1.735 +        vext.16         q0,  \r0, \r1, #3
   1.736 +        vaddl.s16       q9,  d2,  d0            /* q9 = s[2] + s[3] for outputs 0..3 */
   1.737 +        vext.16         q2,  \r0, \r1, #1
   1.738 +        vaddl.s16       q1,  d3,  d1            /* q1 = s[2] + s[3] for outputs 4..7 */
   1.739 +        vext.16         q3,  \r0, \r1, #4
   1.740 +        vaddl.s16       q10, d4,  d6            /* q10 = s[1] + s[4] for outputs 0..3 */
   1.741 +        vext.16         \r1, \r0, \r1, #5       /* the second operand is replaced by the input shifted by five elements */
   1.742 +        vaddl.s16       q2,  d5,  d7            /* q2 = s[1] + s[4] for outputs 4..7 */
   1.743 +        vaddl.s16       q0,  \h0, \h1           /* outer taps, high half; presumably the l/h operands name the D halves of the two Q operands -- TODO confirm at the call sites */
   1.744 +        vaddl.s16       q8,  \l0, \l1           /* outer taps, low half */
   1.745 +
   1.746 +        vshl.i32        q3,  q9,  #4
   1.747 +        vshl.i32        q9,  q9,  #2
   1.748 +        vshl.i32        q15, q10, #2
   1.749 +        vadd.i32        q9,  q9,  q3            /* 16x + 4x = 20 * (s[2] + s[3]) */
   1.750 +        vadd.i32        q10, q10, q15           /* x + 4x = 5 * (s[1] + s[4]) */
   1.751 +
   1.752 +        vshl.i32        q3,  q1,  #4            /* the same scaling for outputs 4..7 */
   1.753 +        vshl.i32        q1,  q1,  #2
   1.754 +        vshl.i32        q15, q2,  #2
   1.755 +        vadd.i32        q1,  q1,  q3
   1.756 +        vadd.i32        q2,  q2,  q15
   1.757 +
   1.758 +        vadd.i32        q9,  q9,  q8            /* add the outer taps */
   1.759 +        vsub.i32        q9,  q9,  q10           /* subtract the 5x term */
   1.760 +
   1.761 +        vadd.i32        q1,  q1,  q0
   1.762 +        vsub.i32        q1,  q1,  q2
   1.763 +
   1.764 +        vrshrn.s32      d18, q9,  #10           /* (sum + 512) >> 10, narrowed to 16 bits */
   1.765 +        vrshrn.s32      d19, q1,  #10
   1.766 +
   1.767 +        vqmovun.s16     \d,  q9                 /* saturate to unsigned bytes */
   1.768 +        .endm
   1.769 +
   1.770 +function put_h264_qpel16_h_lowpass_neon_packed   @ 16x16 horizontal filter; in: r0 = output, r1 = src, r2 = src stride. Output is packed: left 8x16 half then right 8x16 half, 8 bytes per row. Clobbers r3, r4, ip
   1.771 +        mov             r4,  lr                  @ keep the return address in r4 across the call
   1.772 +        mov             ip,  #16                 @ row count for the 8-wide routine
   1.773 +        mov             r3,  #8                  @ output stride: 8 bytes per row
   1.774 +        bl              put_h264_qpel8_h_lowpass_neon  @ left 8 columns, 16 rows
   1.775 +        sub             r1,  r1,  r2, lsl #4     @ rewind the source by 16 rows
   1.776 +        add             r1,  r1,  #8             @ step 8 columns to the right
   1.777 +        mov             ip,  #16                 @ 16 rows again
   1.778 +        mov             lr,  r4                  @ restore the return address for the tail call
   1.779 +        b               put_h264_qpel8_h_lowpass_neon  @ right 8 columns; r0 is not rewound, so this half is written right after the first
   1.780 +endfunc
   1.781 +
   1.782 +        .macro h264_qpel_h_lowpass type          @ emits the 16- and 8-wide horizontal filters; type is put or avg (see the two expansions below)
   1.783 +function \type\()_h264_qpel16_h_lowpass_neon     @ 16x16: two 8x16 passes; same register interface as the qpel8 routine below
   1.784 +        push            {lr}
   1.785 +        mov             ip,  #16                 @ 16 rows
   1.786 +        bl              \type\()_h264_qpel8_h_lowpass_neon  @ left 8 columns
   1.787 +        sub             r0,  r0,  r3, lsl #4     @ rewind destination by 16 rows
   1.788 +        sub             r1,  r1,  r2, lsl #4     @ rewind source by 16 rows
   1.789 +        add             r0,  r0,  #8             @ move both pointers 8 columns right
   1.790 +        add             r1,  r1,  #8
   1.791 +        mov             ip,  #16                 @ 16 rows for the right half
   1.792 +        pop             {lr}                     @ no return here: presumably execution continues into the qpel8 routine that follows, which returns to the caller
   1.793 +endfunc
   1.794 +
   1.795 +function \type\()_h264_qpel8_h_lowpass_neon      @ in: r0 = dst (stride r3), r1 = src (stride r2), ip = row count; two rows per iteration
   1.796 +1:      vld1.64         {d0, d1},  [r1], r2      @ first row of the pair: 16 source bytes
   1.797 +        vld1.64         {d16,d17}, [r1], r2      @ second row of the pair
   1.798 +        subs            ip,  ip,  #2             @ two rows consumed
   1.799 +        lowpass_8       d0,  d1,  d16, d17, d0,  d16  @ filtered rows land in d0 and d16 (the last two macro arguments)
   1.800 +.ifc \type,avg
   1.801 +        vld1.8          {d2},     [r0,:64], r3   @ avg only: fetch the existing destination row
   1.802 +        vrhadd.u8       d0,  d0,  d2             @ rounded average with the filtered row
   1.803 +        vld1.8          {d3},     [r0,:64]       @ second destination row (no pointer update)
   1.804 +        vrhadd.u8       d16, d16, d3
   1.805 +        sub             r0,  r0,  r3             @ back to the first row of the pair for the stores
   1.806 +.endif
   1.807 +        vst1.64         {d0},     [r0,:64], r3   @ write both rows; :64 asserts 8-byte aligned destination
   1.808 +        vst1.64         {d16},    [r0,:64], r3
   1.809 +        bne             1b                       @ loop until the row count reaches zero
   1.810 +        bx              lr
   1.811 +endfunc
   1.812 +        .endm
   1.813 +
   1.814 +        h264_qpel_h_lowpass put
   1.815 +        h264_qpel_h_lowpass avg
   1.816 +
   1.817 +        .macro h264_qpel_h_lowpass_l2 type       @ horizontal filter averaged with a second source; type is put or avg (see expansions below)
   1.818 +function \type\()_h264_qpel16_h_lowpass_l2_neon  @ 16x16: two 8x16 passes; same register interface as the qpel8 routine below
   1.819 +        push            {lr}
   1.820 +        mov             ip,  #16                 @ 16 rows
   1.821 +        bl              \type\()_h264_qpel8_h_lowpass_l2_neon  @ left 8 columns
   1.822 +        sub             r0,  r0,  r2, lsl #4     @ rewind destination, source and second source by 16 rows (all use stride r2)
   1.823 +        sub             r1,  r1,  r2, lsl #4
   1.824 +        sub             r3,  r3,  r2, lsl #4
   1.825 +        add             r0,  r0,  #8             @ move all three pointers 8 columns right
   1.826 +        add             r1,  r1,  #8
   1.827 +        add             r3,  r3,  #8
   1.828 +        mov             ip,  #16                 @ 16 rows for the right half
   1.829 +        pop             {lr}                     @ no return here: presumably execution continues into the qpel8 routine that follows
   1.830 +endfunc
   1.831 +
   1.832 +function \type\()_h264_qpel8_h_lowpass_l2_neon   @ in: r0 = dst, r1 = src to filter, r3 = second source, r2 = stride of all three, ip = row count
   1.833 +1:      vld1.64         {d0, d1},  [r1], r2      @ two rows of 16 source bytes
   1.834 +        vld1.64         {d16,d17}, [r1], r2
   1.835 +        vld1.64         {d28},     [r3], r2      @ matching two 8-byte rows of the second source -> q14
   1.836 +        vld1.64         {d29},     [r3], r2
   1.837 +        subs            ip,  ip,  #2             @ two rows consumed
   1.838 +        lowpass_8       d0,  d1,  d16, d17, d0,  d1  @ filtered rows land in d0 and d1, i.e. q0
   1.839 +        vrhadd.u8       q0,  q0,  q14            @ rounded average of filter output and second source
   1.840 +.ifc \type,avg
   1.841 +        vld1.8          {d2},      [r0,:64], r2  @ avg only: also average with the existing destination rows
   1.842 +        vrhadd.u8       d0,  d0,  d2
   1.843 +        vld1.8          {d3},      [r0,:64]
   1.844 +        vrhadd.u8       d1,  d1,  d3
   1.845 +        sub             r0,  r0,  r2             @ back to the first row of the pair for the stores
   1.846 +.endif
   1.847 +        vst1.64         {d0},      [r0,:64], r2
   1.848 +        vst1.64         {d1},      [r0,:64], r2
   1.849 +        bne             1b                       @ loop until the row count reaches zero
   1.850 +        bx              lr
   1.851 +endfunc
   1.852 +        .endm
   1.853 +
   1.854 +        h264_qpel_h_lowpass_l2 put
   1.855 +        h264_qpel_h_lowpass_l2 avg
   1.856 +
   1.857 +function put_h264_qpel16_v_lowpass_neon_packed   @ 16x16 vertical filter; in: r0 = output, r1 = src, r3 = src stride. Output is four consecutive 8x8 blocks with 8-byte rows. Clobbers r2, r4
   1.858 +        mov             r4,  lr                  @ keep the return address in r4 across the calls
   1.859 +        mov             r2,  #8                  @ output stride: 8 bytes per row
   1.860 +        bl              put_h264_qpel8_v_lowpass_neon  @ top-left 8x8
   1.861 +        sub             r1,  r1,  r3, lsl #2     @ callee leaves r1 12 rows down; back up 4 rows to sit 8 rows below the previous start
   1.862 +        bl              put_h264_qpel8_v_lowpass_neon  @ bottom-left 8x8
   1.863 +        sub             r1,  r1,  r3, lsl #4     @ back 16 + 4 = 20 rows: the original top row
   1.864 +        sub             r1,  r1,  r3, lsl #2
   1.865 +        add             r1,  r1,  #8             @ 8 columns right
   1.866 +        bl              put_h264_qpel8_v_lowpass_neon  @ top-right 8x8
   1.867 +        sub             r1,  r1,  r3, lsl #2     @ 8 rows below the previous start
   1.868 +        mov             lr,  r4                  @ restore the return address for the tail call
   1.869 +        b               put_h264_qpel8_v_lowpass_neon  @ bottom-right 8x8; r0 is never rewound, so the blocks are stored back to back
   1.870 +endfunc
   1.871 +
   1.872 +        .macro h264_qpel_v_lowpass type          @ emits the 16- and 8-wide vertical filters; type is put or avg (see expansions below)
   1.873 +function \type\()_h264_qpel16_v_lowpass_neon     @ 16x16 as four 8x8 blocks; same register interface as the qpel8 routine below; clobbers r4
   1.874 +        mov             r4,  lr                  @ r4 holds the return address across the calls
   1.875 +        bl              \type\()_h264_qpel8_v_lowpass_neon  @ top-left 8x8
   1.876 +        sub             r1,  r1,  r3, lsl #2     @ callee leaves r1 12 rows down; back up 4 rows to sit 8 rows below the previous start
   1.877 +        bl              \type\()_h264_qpel8_v_lowpass_neon  @ bottom-left 8x8 (r0 has already advanced 8 rows)
   1.878 +        sub             r0,  r0,  r2, lsl #4     @ destination back to the top row ...
   1.879 +        add             r0,  r0,  #8             @ ... and 8 columns right
   1.880 +        sub             r1,  r1,  r3, lsl #4     @ source back 16 + 4 = 20 rows: the original top row
   1.881 +        sub             r1,  r1,  r3, lsl #2
   1.882 +        add             r1,  r1,  #8             @ 8 columns right
   1.883 +        bl              \type\()_h264_qpel8_v_lowpass_neon  @ top-right 8x8
   1.884 +        sub             r1,  r1,  r3, lsl #2     @ 8 rows below the previous start
   1.885 +        mov             lr,  r4                  @ no return here: presumably the bottom-right block runs by continuing into the qpel8 routine that follows
   1.886 +endfunc
   1.887 +
   1.888 +function \type\()_h264_qpel8_v_lowpass_neon      @ in: r0 = dst (stride r2), r1 = first of 13 source rows (stride r3). Uses d8-d15; the exported callers in this file save them with vpush
   1.889 +        vld1.64         {d8},  [r1], r3          @ rows 0..7 go to the low halves of q4-q7, q11-q14
   1.890 +        vld1.64         {d10}, [r1], r3
   1.891 +        vld1.64         {d12}, [r1], r3
   1.892 +        vld1.64         {d14}, [r1], r3
   1.893 +        vld1.64         {d22}, [r1], r3
   1.894 +        vld1.64         {d24}, [r1], r3
   1.895 +        vld1.64         {d26}, [r1], r3
   1.896 +        vld1.64         {d28}, [r1], r3
   1.897 +        vld1.64         {d9},  [r1], r3          @ rows 8..12 go to the high halves (d25, d27, d29 are not loaded)
   1.898 +        vld1.64         {d11}, [r1], r3
   1.899 +        vld1.64         {d13}, [r1], r3
   1.900 +        vld1.64         {d15}, [r1], r3
   1.901 +        vld1.64         {d23}, [r1]              @ 13th row, no pointer update: r1 ends 12 rows past its entry value
   1.902 +
   1.903 +        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14  @ after this each q register holds one source column, so the row filter below works vertically
   1.904 +        lowpass_8       d8,  d9,  d10, d11, d8,  d10  @ columns 0 and 1 -> d8, d10
   1.905 +        lowpass_8       d12, d13, d14, d15, d12, d14  @ columns 2 and 3 -> d12, d14
   1.906 +        lowpass_8       d22, d23, d24, d25, d22, d24  @ columns 4 and 5 -> d22, d24
   1.907 +        lowpass_8       d26, d27, d28, d29, d26, d28  @ columns 6 and 7 -> d26, d28
   1.908 +        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28  @ back to row order for the stores
   1.909 +
   1.910 +.ifc \type,avg
   1.911 +        vld1.8          {d9},  [r0,:64], r2      @ avg only: rounded average of each result row with the existing destination row
   1.912 +        vrhadd.u8       d8,  d8,  d9
   1.913 +        vld1.8          {d11}, [r0,:64], r2
   1.914 +        vrhadd.u8       d10, d10, d11
   1.915 +        vld1.8          {d13}, [r0,:64], r2
   1.916 +        vrhadd.u8       d12, d12, d13
   1.917 +        vld1.8          {d15}, [r0,:64], r2
   1.918 +        vrhadd.u8       d14, d14, d15
   1.919 +        vld1.8          {d23}, [r0,:64], r2
   1.920 +        vrhadd.u8       d22, d22, d23
   1.921 +        vld1.8          {d25}, [r0,:64], r2
   1.922 +        vrhadd.u8       d24, d24, d25
   1.923 +        vld1.8          {d27}, [r0,:64], r2
   1.924 +        vrhadd.u8       d26, d26, d27
   1.925 +        vld1.8          {d29}, [r0,:64], r2
   1.926 +        vrhadd.u8       d28, d28, d29
   1.927 +        sub             r0,  r0,  r2,  lsl #3    @ rewind the destination by the 8 rows just read
   1.928 +.endif
   1.929 +
   1.930 +        vst1.64         {d8},  [r0,:64], r2      @ store the 8 result rows; r0 ends 8 rows further down
   1.931 +        vst1.64         {d10}, [r0,:64], r2
   1.932 +        vst1.64         {d12}, [r0,:64], r2
   1.933 +        vst1.64         {d14}, [r0,:64], r2
   1.934 +        vst1.64         {d22}, [r0,:64], r2
   1.935 +        vst1.64         {d24}, [r0,:64], r2
   1.936 +        vst1.64         {d26}, [r0,:64], r2
   1.937 +        vst1.64         {d28}, [r0,:64], r2
   1.938 +
   1.939 +        bx              lr
   1.940 +endfunc
   1.941 +        .endm
   1.942 +
   1.943 +        h264_qpel_v_lowpass put
   1.944 +        h264_qpel_v_lowpass avg
   1.945 +
   1.946 +        .macro h264_qpel_v_lowpass_l2 type       @ vertical filter averaged with a second source; type is put or avg (see expansions below)
   1.947 +function \type\()_h264_qpel16_v_lowpass_l2_neon  @ 16x16 as four 8x8 blocks; same register interface as the qpel8 routine below; clobbers r4
   1.948 +        mov             r4,  lr                  @ r4 holds the return address across the calls
   1.949 +        bl              \type\()_h264_qpel8_v_lowpass_l2_neon  @ top-left 8x8
   1.950 +        sub             r1,  r1,  r3, lsl #2     @ callee leaves r1 12 rows down; back up 4 rows to sit 8 rows below the previous start
   1.951 +        bl              \type\()_h264_qpel8_v_lowpass_l2_neon  @ bottom-left 8x8 (r0 and ip have already advanced 8 rows)
   1.952 +        sub             r0,  r0,  r3, lsl #4     @ destination back 16 rows (stride r3)
   1.953 +        sub             ip,  ip,  r2, lsl #4     @ second source back 16 rows (stride r2)
   1.954 +        add             r0,  r0,  #8             @ both 8 columns right
   1.955 +        add             ip,  ip,  #8
   1.956 +        sub             r1,  r1,  r3, lsl #4     @ source back 16 + 4 = 20 rows: the original top row
   1.957 +        sub             r1,  r1,  r3, lsl #2
   1.958 +        add             r1,  r1,  #8             @ 8 columns right
   1.959 +        bl              \type\()_h264_qpel8_v_lowpass_l2_neon  @ top-right 8x8
   1.960 +        sub             r1,  r1,  r3, lsl #2     @ 8 rows below the previous start
   1.961 +        mov             lr,  r4                  @ no return here: presumably the bottom-right block runs by continuing into the qpel8 routine that follows
   1.962 +endfunc
   1.963 +
   1.964 +function \type\()_h264_qpel8_v_lowpass_l2_neon   @ in: r0 = dst, r1 = first of 13 source rows (both stride r3), ip = second source (stride r2). Uses d8-d15; the exported callers in this file save them with vpush
   1.965 +        vld1.64         {d8},  [r1], r3          @ rows 0..7 go to the low halves of q4-q7, q11-q14
   1.966 +        vld1.64         {d10}, [r1], r3
   1.967 +        vld1.64         {d12}, [r1], r3
   1.968 +        vld1.64         {d14}, [r1], r3
   1.969 +        vld1.64         {d22}, [r1], r3
   1.970 +        vld1.64         {d24}, [r1], r3
   1.971 +        vld1.64         {d26}, [r1], r3
   1.972 +        vld1.64         {d28}, [r1], r3
   1.973 +        vld1.64         {d9},  [r1], r3          @ rows 8..12 go to the high halves (d25, d27, d29 are not loaded)
   1.974 +        vld1.64         {d11}, [r1], r3
   1.975 +        vld1.64         {d13}, [r1], r3
   1.976 +        vld1.64         {d15}, [r1], r3
   1.977 +        vld1.64         {d23}, [r1]              @ 13th row, no pointer update: r1 ends 12 rows past its entry value
   1.978 +
   1.979 +        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14  @ after this each q register holds one source column, so the row filter below works vertically
   1.980 +        lowpass_8       d8,  d9,  d10, d11, d8,  d9   @ columns 0 and 1 -> d8, d9
   1.981 +        lowpass_8       d12, d13, d14, d15, d12, d13  @ columns 2 and 3 -> d12, d13
   1.982 +        lowpass_8       d22, d23, d24, d25, d22, d23  @ columns 4 and 5 -> d22, d23
   1.983 +        lowpass_8       d26, d27, d28, d29, d26, d27  @ columns 6 and 7 -> d26, d27
   1.984 +        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27  @ back to row order: result rows are now paired in q4, q6, q11, q13
   1.985 +
   1.986 +        vld1.64         {d0},  [ip], r2          @ load the 8 rows of the second source, interleaved with the averaging
   1.987 +        vld1.64         {d1},  [ip], r2
   1.988 +        vld1.64         {d2},  [ip], r2
   1.989 +        vld1.64         {d3},  [ip], r2
   1.990 +        vld1.64         {d4},  [ip], r2
   1.991 +        vrhadd.u8       q0,  q0,  q4             @ rows 0-1: rounded average of second source and filter output
   1.992 +        vld1.64         {d5},  [ip], r2
   1.993 +        vrhadd.u8       q1,  q1,  q6             @ rows 2-3
   1.994 +        vld1.64         {d10}, [ip], r2          @ q5 is free again here (its column data was consumed by the filter above)
   1.995 +        vrhadd.u8       q2,  q2,  q11            @ rows 4-5
   1.996 +        vld1.64         {d11}, [ip], r2
   1.997 +        vrhadd.u8       q5,  q5,  q13            @ rows 6-7
   1.998 +
   1.999 +.ifc \type,avg
  1.1000 +        vld1.8          {d16}, [r0,:64], r3      @ avg only: also average each row with the existing destination row
  1.1001 +        vrhadd.u8       d0,  d0,  d16
  1.1002 +        vld1.8          {d17}, [r0,:64], r3
  1.1003 +        vrhadd.u8       d1,  d1,  d17
  1.1004 +        vld1.8          {d16}, [r0,:64], r3
  1.1005 +        vrhadd.u8       d2,  d2,  d16
  1.1006 +        vld1.8          {d17}, [r0,:64], r3
  1.1007 +        vrhadd.u8       d3,  d3,  d17
  1.1008 +        vld1.8          {d16}, [r0,:64], r3
  1.1009 +        vrhadd.u8       d4,  d4,  d16
  1.1010 +        vld1.8          {d17}, [r0,:64], r3
  1.1011 +        vrhadd.u8       d5,  d5,  d17
  1.1012 +        vld1.8          {d16}, [r0,:64], r3
  1.1013 +        vrhadd.u8       d10, d10, d16
  1.1014 +        vld1.8          {d17}, [r0,:64], r3
  1.1015 +        vrhadd.u8       d11, d11, d17
  1.1016 +        sub             r0,  r0,  r3,  lsl #3    @ rewind the destination by the 8 rows just read
  1.1017 +.endif
  1.1018 +
  1.1019 +        vst1.64         {d0},  [r0,:64], r3      @ store the 8 result rows; r0 ends 8 rows further down
  1.1020 +        vst1.64         {d1},  [r0,:64], r3
  1.1021 +        vst1.64         {d2},  [r0,:64], r3
  1.1022 +        vst1.64         {d3},  [r0,:64], r3
  1.1023 +        vst1.64         {d4},  [r0,:64], r3
  1.1024 +        vst1.64         {d5},  [r0,:64], r3
  1.1025 +        vst1.64         {d10}, [r0,:64], r3
  1.1026 +        vst1.64         {d11}, [r0,:64], r3
  1.1027 +
  1.1028 +        bx              lr
  1.1029 +endfunc
  1.1030 +        .endm
  1.1031 +
  1.1032 +        h264_qpel_v_lowpass_l2 put
  1.1033 +        h264_qpel_v_lowpass_l2 avg
  1.1034 +
  1.1035 +function put_h264_qpel8_hv_lowpass_neon_top      @ 8x8 horizontal-then-vertical filter core. In: r1 = first of 13 source rows (stride r3), r4 = 16-byte aligned scratch of 12 * 16 bytes. Out: result rows in d12-d15 then d8-d11. Clobbers ip, q0-q15
  1.1036 +        lowpass_const   ip                       @ macro defined outside this chunk; presumably loads the filter weights into d6 using ip as scratch
  1.1037 +        mov             ip,  #12                 @ 12 rows are handled by the loop, two per iteration
  1.1038 +1:      vld1.64         {d0, d1},  [r1], r3      @ two rows of 16 source bytes
  1.1039 +        vld1.64         {d16,d17}, [r1], r3
  1.1040 +        subs            ip,  ip,  #2
  1.1041 +        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0  @ horizontal pass, kept as unrounded 16-bit sums in q11 and q12
  1.1042 +        vst1.64         {d22-d25}, [r4,:128]!    @ park both 16-byte rows in the scratch buffer
  1.1043 +        bne             1b
  1.1044 +
  1.1045 +        vld1.64         {d0, d1},  [r1]          @ 13th row; r1 ends 12 rows past its entry value
  1.1046 +        lowpass_8_1     d0,  d1,  q12, narrow=0  @ row 12 stays in q12
  1.1047 +
  1.1048 +        mov             ip,  #-16                @ step of minus one scratch row
  1.1049 +        add             r4,  r4,  ip             @ r4 -> last stored row (row 11)
  1.1050 +        vld1.64         {d30,d31}, [r4,:128], ip @ reload rows 11 down to 0, walking backwards through the scratch
  1.1051 +        vld1.64         {d20,d21}, [r4,:128], ip
  1.1052 +        vld1.64         {d18,d19}, [r4,:128], ip
  1.1053 +        vld1.64         {d16,d17}, [r4,:128], ip
  1.1054 +        vld1.64         {d14,d15}, [r4,:128], ip
  1.1055 +        vld1.64         {d12,d13}, [r4,:128], ip
  1.1056 +        vld1.64         {d10,d11}, [r4,:128], ip
  1.1057 +        vld1.64         {d8, d9},  [r4,:128], ip
  1.1058 +        vld1.64         {d6, d7},  [r4,:128], ip
  1.1059 +        vld1.64         {d4, d5},  [r4,:128], ip
  1.1060 +        vld1.64         {d2, d3},  [r4,:128], ip
  1.1061 +        vld1.64         {d0, d1},  [r4,:128]     @ row 0, no pointer update: r4 is back at the start of the scratch
  1.1062 +
  1.1063 +        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14  @ swap4 and transpose16_4x4 are defined outside this chunk; presumably together they turn the 16-bit rows into columns
  1.1064 +        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
  1.1065 +
  1.1066 +        swap4           d17, d19, d21, d31, d24, d26, d28, d22
  1.1067 +        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11
  1.1068 +
  1.1069 +        vst1.64         {d30,d31}, [r4,:128]!    @ park eight registers in the scratch as four pairs; they are reloaded pairwise below
  1.1070 +        vst1.64         {d6, d7},  [r4,:128]!
  1.1071 +        vst1.64         {d20,d21}, [r4,:128]!
  1.1072 +        vst1.64         {d4, d5},  [r4,:128]!
  1.1073 +        vst1.64         {d18,d19}, [r4,:128]!
  1.1074 +        vst1.64         {d2, d3},  [r4,:128]!
  1.1075 +        vst1.64         {d16,d17}, [r4,:128]!
  1.1076 +        vst1.64         {d0, d1},  [r4,:128]     @ last store without pointer update: r4 -> final parked register
  1.1077 +
  1.1078 +        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8   @ second filter pass on the register pairs still live -> d8..d11
  1.1079 +        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
  1.1080 +        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
  1.1081 +        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11
  1.1082 +
  1.1083 +        vld1.64         {d16,d17}, [r4,:128], ip @ fetch the parked pairs back, last stored first (ip is still -16)
  1.1084 +        vld1.64         {d30,d31}, [r4,:128], ip
  1.1085 +        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12  @ second filter pass on the parked pairs -> d12..d15
  1.1086 +        vld1.64         {d16,d17}, [r4,:128], ip
  1.1087 +        vld1.64         {d30,d31}, [r4,:128], ip
  1.1088 +        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
  1.1089 +        vld1.64         {d16,d17}, [r4,:128], ip
  1.1090 +        vld1.64         {d30,d31}, [r4,:128], ip
  1.1091 +        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
  1.1092 +        vld1.64         {d16,d17}, [r4,:128], ip
  1.1093 +        vld1.64         {d30,d31}, [r4,:128]     @ no pointer update: r4 ends at the start of the scratch again
  1.1094 +        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15
  1.1095 +
  1.1096 +        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11  @ back to row order; callers in this file use d12-d15, d8-d11 as rows 0..7
  1.1097 +
  1.1098 +        bx              lr
  1.1099 +endfunc
  1.1100 +
  1.1101 +        .macro h264_qpel8_hv_lowpass type        @ 8x8 two-dimensional filter with store; type is put or avg (see expansions below)
  1.1102 +function \type\()_h264_qpel8_hv_lowpass_neon     @ in: r0 = dst (stride r2), r1 = src (stride r3), r4 = scratch for the core routine; clobbers r10
  1.1103 +        mov             r10, lr                  @ r10 holds the return address across the call
  1.1104 +        bl              put_h264_qpel8_hv_lowpass_neon_top  @ result rows come back in d12-d15, d8-d11
  1.1105 +.ifc \type,avg
  1.1106 +        vld1.8          {d0},      [r0,:64], r2  @ avg only: rounded average of each result row with the existing destination row
  1.1107 +        vrhadd.u8       d12, d12, d0
  1.1108 +        vld1.8          {d1},      [r0,:64], r2
  1.1109 +        vrhadd.u8       d13, d13, d1
  1.1110 +        vld1.8          {d2},      [r0,:64], r2
  1.1111 +        vrhadd.u8       d14, d14, d2
  1.1112 +        vld1.8          {d3},      [r0,:64], r2
  1.1113 +        vrhadd.u8       d15, d15, d3
  1.1114 +        vld1.8          {d4},      [r0,:64], r2
  1.1115 +        vrhadd.u8       d8,  d8,  d4
  1.1116 +        vld1.8          {d5},      [r0,:64], r2
  1.1117 +        vrhadd.u8       d9,  d9,  d5
  1.1118 +        vld1.8          {d6},      [r0,:64], r2
  1.1119 +        vrhadd.u8       d10, d10, d6
  1.1120 +        vld1.8          {d7},      [r0,:64], r2
  1.1121 +        vrhadd.u8       d11, d11, d7
  1.1122 +        sub             r0,  r0,  r2,  lsl #3    @ rewind the destination by the 8 rows just read
  1.1123 +.endif
  1.1124 +        vst1.64         {d12},     [r0,:64], r2  @ store the 8 result rows; r0 ends 8 rows further down
  1.1125 +        vst1.64         {d13},     [r0,:64], r2
  1.1126 +        vst1.64         {d14},     [r0,:64], r2
  1.1127 +        vst1.64         {d15},     [r0,:64], r2
  1.1128 +        vst1.64         {d8},      [r0,:64], r2
  1.1129 +        vst1.64         {d9},      [r0,:64], r2
  1.1130 +        vst1.64         {d10},     [r0,:64], r2
  1.1131 +        vst1.64         {d11},     [r0,:64], r2
  1.1132 +
  1.1133 +        mov             lr,  r10                 @ restore the return address
  1.1134 +        bx              lr
  1.1135 +endfunc
  1.1136 +        .endm
  1.1137 +
  1.1138 +        h264_qpel8_hv_lowpass put
  1.1139 +        h264_qpel8_hv_lowpass avg
  1.1140 +
  1.1141 +        .macro h264_qpel8_hv_lowpass_l2 type     @ 8x8 two-dimensional filter averaged with a second source; type is put or avg (see expansions below)
  1.1142 +function \type\()_h264_qpel8_hv_lowpass_l2_neon  @ in: r0 = dst, r1 = src (both stride r3), r2 = 16-byte aligned second source with 8-byte rows, r4 = scratch for the core routine; clobbers r10
  1.1143 +        mov             r10, lr                  @ r10 holds the return address across the call
  1.1144 +        bl              put_h264_qpel8_hv_lowpass_neon_top  @ result rows come back in d12-d15, d8-d11
  1.1145 +
  1.1146 +        vld1.64         {d0, d1},  [r2,:128]!    @ second source: 64 contiguous bytes, two rows per load; r2 advances by 64 in total
  1.1147 +        vld1.64         {d2, d3},  [r2,:128]!
  1.1148 +        vrhadd.u8       q0,  q0,  q6             @ rows 0-1: rounded average with the filter output
  1.1149 +        vld1.64         {d4, d5},  [r2,:128]!
  1.1150 +        vrhadd.u8       q1,  q1,  q7             @ rows 2-3
  1.1151 +        vld1.64         {d6, d7},  [r2,:128]!
  1.1152 +        vrhadd.u8       q2,  q2,  q4             @ rows 4-5
  1.1153 +        vrhadd.u8       q3,  q3,  q5             @ rows 6-7
  1.1154 +.ifc \type,avg
  1.1155 +        vld1.8          {d16},     [r0,:64], r3  @ avg only: also average each row with the existing destination row
  1.1156 +        vrhadd.u8       d0,  d0,  d16
  1.1157 +        vld1.8          {d17},     [r0,:64], r3
  1.1158 +        vrhadd.u8       d1,  d1,  d17
  1.1159 +        vld1.8          {d18},     [r0,:64], r3
  1.1160 +        vrhadd.u8       d2,  d2,  d18
  1.1161 +        vld1.8          {d19},     [r0,:64], r3
  1.1162 +        vrhadd.u8       d3,  d3,  d19
  1.1163 +        vld1.8          {d20},     [r0,:64], r3
  1.1164 +        vrhadd.u8       d4,  d4,  d20
  1.1165 +        vld1.8          {d21},     [r0,:64], r3
  1.1166 +        vrhadd.u8       d5,  d5,  d21
  1.1167 +        vld1.8          {d22},     [r0,:64], r3
  1.1168 +        vrhadd.u8       d6,  d6,  d22
  1.1169 +        vld1.8          {d23},     [r0,:64], r3
  1.1170 +        vrhadd.u8       d7,  d7,  d23
  1.1171 +        sub             r0,  r0,  r3,  lsl #3    @ rewind the destination by the 8 rows just read
  1.1172 +.endif
  1.1173 +        vst1.64         {d0},      [r0,:64], r3  @ store the 8 result rows; r0 ends 8 rows further down
  1.1174 +        vst1.64         {d1},      [r0,:64], r3
  1.1175 +        vst1.64         {d2},      [r0,:64], r3
  1.1176 +        vst1.64         {d3},      [r0,:64], r3
  1.1177 +        vst1.64         {d4},      [r0,:64], r3
  1.1178 +        vst1.64         {d5},      [r0,:64], r3
  1.1179 +        vst1.64         {d6},      [r0,:64], r3
  1.1180 +        vst1.64         {d7},      [r0,:64], r3
  1.1181 +
  1.1182 +        mov             lr,  r10                 @ restore the return address
  1.1183 +        bx              lr
  1.1184 +endfunc
  1.1185 +        .endm
  1.1186 +
  1.1187 +        h264_qpel8_hv_lowpass_l2 put
  1.1188 +        h264_qpel8_hv_lowpass_l2 avg
  1.1189 +
  1.1190 +        .macro h264_qpel16_hv type               @ 16x16 two-dimensional filters built from four 8x8 calls; type is put or avg (see expansions below)
  1.1191 +function \type\()_h264_qpel16_hv_lowpass_neon    @ same register interface as the qpel8 hv routine (dst stride r2, src stride r3); clobbers r9, r10
  1.1192 +        mov             r9,  lr                  @ r9 holds the return address (the callee uses r10 for its own)
  1.1193 +        bl              \type\()_h264_qpel8_hv_lowpass_neon  @ top-left 8x8
  1.1194 +        sub             r1,  r1,  r3, lsl #2     @ callee leaves r1 12 rows down; back up 4 rows to sit 8 rows below the previous start
  1.1195 +        bl              \type\()_h264_qpel8_hv_lowpass_neon  @ bottom-left 8x8
  1.1196 +        sub             r1,  r1,  r3, lsl #4     @ source back 16 + 4 = 20 rows: the original top row
  1.1197 +        sub             r1,  r1,  r3, lsl #2
  1.1198 +        add             r1,  r1,  #8             @ 8 columns right
  1.1199 +        sub             r0,  r0,  r2, lsl #4     @ destination back 16 rows ...
  1.1200 +        add             r0,  r0,  #8             @ ... and 8 columns right
  1.1201 +        bl              \type\()_h264_qpel8_hv_lowpass_neon  @ top-right 8x8
  1.1202 +        sub             r1,  r1,  r3, lsl #2     @ 8 rows below the previous start
  1.1203 +        mov             lr,  r9                  @ restore the return address for the tail call
  1.1204 +        b               \type\()_h264_qpel8_hv_lowpass_neon  @ bottom-right 8x8
  1.1205 +endfunc
  1.1206 +
  1.1207 +function \type\()_h264_qpel16_hv_lowpass_l2_neon @ in: r0 = dst, r1 = src (both stride r3), r4 = scratch, with the packed second source in the 256 bytes just below r4; clobbers r2, r9, r10
  1.1208 +        mov             r9,  lr                  @ r9 holds the return address (the callee uses r10 for its own)
  1.1209 +        sub             r2,  r4,  #256           @ r2 -> packed second source; each 8x8 call consumes 64 bytes of it in sequence
  1.1210 +        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon  @ top-left 8x8
  1.1211 +        sub             r1,  r1,  r3, lsl #2     @ callee leaves r1 12 rows down; back up 4 rows to sit 8 rows below the previous start
  1.1212 +        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon  @ bottom-left 8x8
  1.1213 +        sub             r1,  r1,  r3, lsl #4     @ source back 16 + 4 = 20 rows: the original top row
  1.1214 +        sub             r1,  r1,  r3, lsl #2
  1.1215 +        add             r1,  r1,  #8             @ 8 columns right
  1.1216 +        sub             r0,  r0,  r3, lsl #4     @ destination back 16 rows ...
  1.1217 +        add             r0,  r0,  #8             @ ... and 8 columns right
  1.1218 +        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon  @ top-right 8x8
  1.1219 +        sub             r1,  r1,  r3, lsl #2     @ 8 rows below the previous start
  1.1220 +        mov             lr,  r9                  @ restore the return address for the tail call
  1.1221 +        b               \type\()_h264_qpel8_hv_lowpass_l2_neon  @ bottom-right 8x8
  1.1222 +endfunc
  1.1223 +        .endm
  1.1224 +
  1.1225 +        h264_qpel16_hv put
  1.1226 +        h264_qpel16_hv avg
  1.1227 +
  1.1228 +        .macro h264_qpel8 type                   @ exported 8x8 entry points; type is put or avg. Presumably called as (dst, src, stride) in r0, r1, r2 - the C prototype is not visible here
  1.1229 +function ff_\type\()_h264_qpel8_mc10_neon, export=1  @ horizontal filter averaged with the unshifted source
  1.1230 +        lowpass_const   r3                       @ macro defined outside this chunk; presumably loads the filter weights using r3 as scratch
  1.1231 +        mov             r3,  r1                  @ second source for the average = src
  1.1232 +        sub             r1,  r1,  #2             @ filter input starts 2 pixels to the left
  1.1233 +        mov             ip,  #8                  @ 8 rows
  1.1234 +        b               \type\()_h264_qpel8_h_lowpass_l2_neon
  1.1235 +endfunc
  1.1236 +
  1.1237 +function ff_\type\()_h264_qpel8_mc20_neon, export=1  @ horizontal filter only
  1.1238 +        lowpass_const   r3
  1.1239 +        sub             r1,  r1,  #2             @ filter input starts 2 pixels to the left
  1.1240 +        mov             r3,  r2                  @ destination stride = source stride
  1.1241 +        mov             ip,  #8                  @ 8 rows
  1.1242 +        b               \type\()_h264_qpel8_h_lowpass_neon
  1.1243 +endfunc
  1.1244 +
  1.1245 +function ff_\type\()_h264_qpel8_mc30_neon, export=1  @ horizontal filter averaged with the source one pixel to the right
  1.1246 +        lowpass_const   r3
  1.1247 +        add             r3,  r1,  #1             @ second source for the average = src + 1
  1.1248 +        sub             r1,  r1,  #2             @ filter input starts 2 pixels to the left
  1.1249 +        mov             ip,  #8                  @ 8 rows
  1.1250 +        b               \type\()_h264_qpel8_h_lowpass_l2_neon
  1.1251 +endfunc
  1.1252 +
  1.1253 +function ff_\type\()_h264_qpel8_mc01_neon, export=1  @ vertical filter averaged with the unshifted source
  1.1254 +        push            {lr}
  1.1255 +        mov             ip,  r1                  @ second source for the average = src
  1.1256 +\type\()_h264_qpel8_mc01:                        @ shared tail; mc03 enters here with lr pushed and ip set
  1.1257 +        lowpass_const   r3
  1.1258 +        mov             r3,  r2                  @ r3 = stride for the filter input and the destination
  1.1259 +        sub             r1,  r1,  r2, lsl #1     @ filter input starts 2 rows above
  1.1260 +        vpush           {d8-d15}                 @ the filter routine uses d8-d15
  1.1261 +        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
  1.1262 +        vpop            {d8-d15}
  1.1263 +        pop             {pc}
  1.1264 +endfunc
  1.1265 +
  1.1266 +function ff_\type\()_h264_qpel8_mc11_neon, export=1  @ average of a horizontal and a vertical filter result
  1.1267 +        push            {r0, r1, r11, lr}        @ r0 and r1 are saved so they can be reloaded for the second pass
  1.1268 +\type\()_h264_qpel8_mc11:                        @ shared tail; mc31, mc13 and mc33 enter here after the same push
  1.1269 +        lowpass_const   r3
  1.1270 +        mov             r11, sp                  @ remember the frame so sp can be realigned freely
  1.1271 +        bic             sp,  sp,  #15            @ 16-byte align the stack
  1.1272 +        sub             sp,  sp,  #64            @ 8x8 byte temporary for the horizontal result
  1.1273 +        mov             r0,  sp                  @ horizontal pass writes to the temporary
  1.1274 +        sub             r1,  r1,  #2             @ filter input starts 2 pixels to the left
  1.1275 +        mov             r3,  #8                  @ temporary has 8-byte rows
  1.1276 +        mov             ip,  #8                  @ 8 rows
  1.1277 +        vpush           {d8-d15}                 @ 64 bytes, placed just below the temporary
  1.1278 +        bl              put_h264_qpel8_h_lowpass_neon
  1.1279 +        ldrd            r0,  [r11]               @ reload r0 and r1 as saved by the push
  1.1280 +        mov             r3,  r2                  @ r3 = stride for the vertical input and the destination
  1.1281 +        add             ip,  sp,  #64            @ second source = the temporary (above the saved d8-d15)
  1.1282 +        sub             r1,  r1,  r2, lsl #1     @ vertical input starts 2 rows above
  1.1283 +        mov             r2,  #8                  @ stride of the temporary
  1.1284 +        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
  1.1285 +        vpop            {d8-d15}
  1.1286 +        add             sp,  r11, #8             @ drop the temporary and the saved r0, r1
  1.1287 +        pop             {r11, pc}
  1.1288 +endfunc
  1.1289 +
  1.1290 +function ff_\type\()_h264_qpel8_mc21_neon, export=1  @ average of a horizontal and a two-dimensional filter result
  1.1291 +        push            {r0, r1, r4, r10, r11, lr}
  1.1292 +\type\()_h264_qpel8_mc21:                        @ shared tail; mc23 enters here after the same push
  1.1293 +        lowpass_const   r3
  1.1294 +        mov             r11, sp                  @ remember the frame so sp can be realigned freely
  1.1295 +        bic             sp,  sp,  #15            @ 16-byte align the stack
  1.1296 +        sub             sp,  sp,  #(8*8+16*12)   @ 64 bytes for the horizontal result + 192 bytes of scratch for the hv routine
  1.1297 +        sub             r1,  r1,  #2             @ filter input starts 2 pixels to the left
  1.1298 +        mov             r3,  #8                  @ temporary has 8-byte rows
  1.1299 +        mov             r0,  sp                  @ horizontal pass writes to the start of the temporary
  1.1300 +        mov             ip,  #8                  @ 8 rows
  1.1301 +        vpush           {d8-d15}
  1.1302 +        bl              put_h264_qpel8_h_lowpass_neon
  1.1303 +        mov             r4,  r0                  @ r0 has advanced past the 64-byte result: r4 -> scratch area
  1.1304 +        ldrd            r0,  [r11]               @ reload r0 and r1 as saved by the push
  1.1305 +        sub             r1,  r1,  r2, lsl #1     @ hv input starts 2 rows above ...
  1.1306 +        sub             r1,  r1,  #2             @ ... and 2 pixels to the left
  1.1307 +        mov             r3,  r2                  @ r3 = stride for the input and the destination
  1.1308 +        sub             r2,  r4,  #64            @ second source = the horizontal result
  1.1309 +        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
  1.1310 +        vpop            {d8-d15}
  1.1311 +        add             sp,  r11,  #8            @ drop the temporaries and the saved r0, r1
  1.1312 +        pop             {r4, r10, r11, pc}
  1.1313 +endfunc
  1.1314 +
  1.1315 +function ff_\type\()_h264_qpel8_mc31_neon, export=1  @ as mc11, but the vertical pass reads one pixel to the right
  1.1316 +        add             r1,  r1,  #1
  1.1317 +        push            {r0, r1, r11, lr}        @ saved r1 = src + 1 (reloaded for the vertical pass)
  1.1318 +        sub             r1,  r1,  #1             @ horizontal pass still starts from src
  1.1319 +        b               \type\()_h264_qpel8_mc11
  1.1320 +endfunc
  1.1321 +
  1.1322 +function ff_\type\()_h264_qpel8_mc02_neon, export=1  @ vertical filter only
  1.1323 +        push            {lr}
  1.1324 +        lowpass_const   r3
  1.1325 +        sub             r1,  r1,  r2, lsl #1     @ filter input starts 2 rows above
  1.1326 +        mov             r3,  r2                  @ source stride = destination stride
  1.1327 +        vpush           {d8-d15}                 @ the filter routine uses d8-d15
  1.1328 +        bl              \type\()_h264_qpel8_v_lowpass_neon
  1.1329 +        vpop            {d8-d15}
  1.1330 +        pop             {pc}
  1.1331 +endfunc
  1.1332 +
  1.1333 +function ff_\type\()_h264_qpel8_mc12_neon, export=1  @ average of a vertical and a two-dimensional filter result
  1.1334 +        push            {r0, r1, r4, r10, r11, lr}
  1.1335 +\type\()_h264_qpel8_mc12:                        @ shared tail; mc32 enters here after the same push
  1.1336 +        lowpass_const   r3
  1.1337 +        mov             r11, sp                  @ remember the frame so sp can be realigned freely
  1.1338 +        bic             sp,  sp,  #15            @ 16-byte align the stack
  1.1339 +        sub             sp,  sp,  #(8*8+16*12)   @ 64 bytes for the vertical result + 192 bytes of scratch for the hv routine
  1.1340 +        sub             r1,  r1,  r2, lsl #1     @ filter input starts 2 rows above
  1.1341 +        mov             r3,  r2                  @ r3 = source stride (kept for the second pass as well)
  1.1342 +        mov             r2,  #8                  @ temporary has 8-byte rows
  1.1343 +        mov             r0,  sp                  @ vertical pass writes to the start of the temporary
  1.1344 +        vpush           {d8-d15}
  1.1345 +        bl              put_h264_qpel8_v_lowpass_neon
  1.1346 +        mov             r4,  r0                  @ r0 has advanced past the 64-byte result: r4 -> scratch area
  1.1347 +        ldrd            r0,  [r11]               @ reload r0 and r1 as saved by the push
  1.1348 +        sub             r1,  r1,  r3, lsl #1     @ hv input starts 2 rows above (r2 was overwritten, so the stride comes from r3) ...
  1.1349 +        sub             r1,  r1,  #2             @ ... and 2 pixels to the left
  1.1350 +        sub             r2,  r4,  #64            @ second source = the vertical result
  1.1351 +        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
  1.1352 +        vpop            {d8-d15}
  1.1353 +        add             sp,  r11,  #8            @ drop the temporaries and the saved r0, r1
  1.1354 +        pop             {r4, r10, r11, pc}
  1.1355 +endfunc
  1.1356 +
  1.1357 +function ff_\type\()_h264_qpel8_mc22_neon, export=1  @ two-dimensional filter only
  1.1358 +        push            {r4, r10, r11, lr}
  1.1359 +        mov             r11, sp                  @ remember the frame so sp can be realigned freely
  1.1360 +        bic             sp,  sp,  #15            @ 16-byte align the stack
  1.1361 +        sub             r1,  r1,  r2, lsl #1     @ filter input starts 2 rows above ...
  1.1362 +        sub             r1,  r1,  #2             @ ... and 2 pixels to the left
  1.1363 +        mov             r3,  r2                  @ source stride = destination stride
  1.1364 +        sub             sp,  sp,  #(16*12)       @ 192 bytes of scratch for the hv routine
  1.1365 +        mov             r4,  sp
  1.1366 +        vpush           {d8-d15}
  1.1367 +        bl              \type\()_h264_qpel8_hv_lowpass_neon
  1.1368 +        vpop            {d8-d15}
  1.1369 +        mov             sp,  r11                 @ drop the scratch
  1.1370 +        pop             {r4, r10, r11, pc}
  1.1371 +endfunc
  1.1372 +
  1.1373 +function ff_\type\()_h264_qpel8_mc32_neon, export=1  @ as mc12, but the vertical pass reads one pixel to the right
  1.1374 +        push            {r0, r1, r4, r10, r11, lr}  @ saved r1 = src (reloaded for the hv pass)
  1.1375 +        add             r1,  r1,  #1             @ vertical pass starts from src + 1
  1.1376 +        b               \type\()_h264_qpel8_mc12
  1.1377 +endfunc
  1.1378 +
  1.1379 +function ff_\type\()_h264_qpel8_mc03_neon, export=1  @ as mc01, but averaged with the source one row below
  1.1380 +        push            {lr}
  1.1381 +        add             ip,  r1,  r2             @ second source for the average = src + stride
  1.1382 +        b               \type\()_h264_qpel8_mc01
  1.1383 +endfunc
  1.1384 +
  1.1385 +function ff_\type\()_h264_qpel8_mc13_neon, export=1  @ as mc11, but the horizontal pass reads one row below
  1.1386 +        push            {r0, r1, r11, lr}        @ saved r1 = src (reloaded for the vertical pass)
  1.1387 +        add             r1,  r1,  r2             @ horizontal pass starts from src + stride
  1.1388 +        b               \type\()_h264_qpel8_mc11
  1.1389 +endfunc
  1.1390 +
  1.1391 +function ff_\type\()_h264_qpel8_mc23_neon, export=1  @ as mc21, but the horizontal pass reads one row below
  1.1392 +        push            {r0, r1, r4, r10, r11, lr}  @ saved r1 = src (reloaded for the hv pass)
  1.1393 +        add             r1,  r1,  r2             @ horizontal pass starts from src + stride
  1.1394 +        b               \type\()_h264_qpel8_mc21
  1.1395 +endfunc
  1.1396 +
  1.1397 +function ff_\type\()_h264_qpel8_mc33_neon, export=1  @ as mc11 with both offsets: vertical pass one pixel right, horizontal pass one row below
  1.1398 +        add             r1,  r1,  #1
  1.1399 +        push            {r0, r1, r11, lr}        @ saved r1 = src + 1 (reloaded for the vertical pass)
  1.1400 +        add             r1,  r1,  r2
  1.1401 +        sub             r1,  r1,  #1             @ r1 = src + stride for the horizontal pass
  1.1402 +        b               \type\()_h264_qpel8_mc11
  1.1403 +endfunc
  1.1404 +        .endm
  1.1405 +
  1.1406 +        h264_qpel8 put
  1.1407 +        h264_qpel8 avg
  1.1408 +
  1.1409 +        .macro h264_qpel16 type
  1.1410 +function ff_\type\()_h264_qpel16_mc10_neon, export=1  @ 16x16 horizontal filter averaged with the unshifted source; presumably called as (dst, src, stride) in r0, r1, r2
  1.1411 +        lowpass_const   r3                       @ macro defined outside this chunk; presumably loads the filter weights using r3 as scratch
  1.1412 +        mov             r3,  r1                  @ second source for the average = src
  1.1413 +        sub             r1,  r1,  #2             @ filter input starts 2 pixels to the left
  1.1414 +        b               \type\()_h264_qpel16_h_lowpass_l2_neon  @ tail call; the callee sets the row count itself
  1.1415 +endfunc
  1.1416 +
  1.1417 +function ff_\type\()_h264_qpel16_mc20_neon, export=1  @ 16x16 horizontal filter only; presumably called as (dst, src, stride) in r0, r1, r2
  1.1418 +        lowpass_const   r3                       @ macro defined outside this chunk; presumably loads the filter weights using r3 as scratch
  1.1419 +        sub             r1,  r1,  #2             @ filter input starts 2 pixels to the left
  1.1420 +        mov             r3,  r2                  @ destination stride = source stride
  1.1421 +        b               \type\()_h264_qpel16_h_lowpass_neon  @ tail call; the callee sets the row count itself
  1.1422 +endfunc
  1.1423 +
  1.1424 +function ff_\type\()_h264_qpel16_mc30_neon, export=1  @ 16x16 horizontal filter averaged with the source one pixel to the right
  1.1425 +        lowpass_const   r3                       @ macro defined outside this chunk; presumably loads the filter weights using r3 as scratch
  1.1426 +        add             r3,  r1,  #1             @ second source for the average = src + 1
  1.1427 +        sub             r1,  r1,  #2             @ filter input starts 2 pixels to the left
  1.1428 +        b               \type\()_h264_qpel16_h_lowpass_l2_neon  @ tail call; the callee sets the row count itself
  1.1429 +endfunc
  1.1430 +
  1.1431 +function ff_\type\()_h264_qpel16_mc01_neon, export=1  @ 16x16 vertical filter averaged with the unshifted source
  1.1432 +        push            {r4, lr}                 @ r4 is saved because the 16-wide vertical routine keeps its return address there
  1.1433 +        mov             ip,  r1                  @ second source for the average = src
  1.1434 +\type\()_h264_qpel16_mc01:                       @ label for a shared tail; any other entry must have pushed r4, lr and set ip
  1.1435 +        lowpass_const   r3                       @ macro defined outside this chunk; presumably loads the filter weights using r3 as scratch
  1.1436 +        mov             r3,  r2                  @ r3 = stride for the filter input and the destination
  1.1437 +        sub             r1,  r1,  r2, lsl #1     @ filter input starts 2 rows above
  1.1438 +        vpush           {d8-d15}                 @ the filter routine uses d8-d15
  1.1439 +        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
  1.1440 +        vpop            {d8-d15}
  1.1441 +        pop             {r4, pc}
  1.1442 +endfunc
  1.1443 +
  1.1444 +function ff_\type\()_h264_qpel16_mc11_neon, export=1  @ 16x16 average of a horizontal and a vertical filter result
  1.1445 +        push            {r0, r1, r4, r11, lr}    @ r0 and r1 are saved so they can be reloaded for the second pass
  1.1446 +\type\()_h264_qpel16_mc11:                       @ shared tail; mc31 enters here after the same push
  1.1447 +        lowpass_const   r3                       @ macro defined outside this chunk; presumably loads the filter weights using r3 as scratch
  1.1448 +        mov             r11, sp                  @ remember the frame so sp can be realigned freely
  1.1449 +        bic             sp,  sp,  #15            @ 16-byte align the stack
  1.1450 +        sub             sp,  sp,  #256           @ 16x16 byte temporary for the horizontal result
  1.1451 +        mov             r0,  sp                  @ horizontal pass writes to the temporary
  1.1452 +        sub             r1,  r1,  #2             @ filter input starts 2 pixels to the left
  1.1453 +        mov             r3,  #16                 @ temporary has 16-byte rows
  1.1454 +        vpush           {d8-d15}                 @ 64 bytes, placed just below the temporary
  1.1455 +        bl              put_h264_qpel16_h_lowpass_neon
  1.1456 +        ldrd            r0,  [r11]               @ reload r0 and r1 as saved by the push
  1.1457 +        mov             r3,  r2                  @ r3 = stride for the vertical input and the destination
  1.1458 +        add             ip,  sp,  #64            @ second source = the temporary (above the saved d8-d15)
  1.1459 +        sub             r1,  r1,  r2, lsl #1     @ vertical input starts 2 rows above
  1.1460 +        mov             r2,  #16                 @ stride of the temporary
  1.1461 +        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
  1.1462 +        vpop            {d8-d15}
  1.1463 +        add             sp,  r11, #8             @ drop the temporary and the saved r0, r1
  1.1464 +        pop             {r4, r11, pc}
  1.1465 +endfunc
  1.1466 +
  1.1467 +function ff_\type\()_h264_qpel16_mc21_neon, export=1  @ mc21 entry: packed horizontal pass into a stack buffer, then the l2 hv pass
  1.1468 +        push            {r0, r1, r4-r5, r9-r11, lr}  @ r0 and r1 are stacked so they can be reloaded after the first pass
  1.1469 +\type\()_h264_qpel16_mc21:  @ shared body: mc23 branches here after doing the same push itself
  1.1470 +        lowpass_const   r3  @ macro defined outside this chunk; presumably loads the filter constants with r3 as scratch (confirm)
  1.1471 +        mov             r11, sp  @ r11 = sp after the push; [r11] holds the stacked r0 and r1
  1.1472 +        bic             sp,  sp,  #15  @ round sp down to a 16-byte boundary
  1.1473 +        sub             sp,  sp,  #(16*16+16*12)  @ reserve 448 bytes of scratch
  1.1474 +        sub             r1,  r1,  #2  @ r1 -= 2 (presumably two bytes left for the horizontal taps)
  1.1475 +        mov             r0,  sp  @ r0 = start of the scratch area
  1.1476 +        vpush           {d8-d15}  @ preserve d8-d15 across the calls (callee-saved)
  1.1477 +        bl              put_h264_qpel16_h_lowpass_neon_packed  @ always the put variant here, for both put and avg instantiations
  1.1478 +        mov             r4,  r0  @ r4 = r0 as left by the call (presumably the unused 16*12 tail of the scratch area; confirm)
  1.1479 +        ldrd            r0,  [r11]  @ reload r0 and r1 with the values stacked by the push
  1.1480 +        sub             r1,  r1,  r2, lsl #1  @ r1 -= 2 * r2
  1.1481 +        sub             r1,  r1,  #2  @ r1 -= 2
  1.1482 +        mov             r3,  r2  @ r3 = r2
  1.1483 +        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
  1.1484 +        vpop            {d8-d15}
  1.1485 +        add             sp,  r11,  #8  @ restore sp and skip the two stacked words for r0 and r1
  1.1486 +        pop             {r4-r5, r9-r11, pc}  @ restore the remaining registers and return
  1.1487 +endfunc
  1.1488 +
  1.1489 +function ff_\type\()_h264_qpel16_mc31_neon, export=1  @ mc31 entry: reuses the mc11 body with the stacked r1 advanced by one byte
  1.1490 +        add             r1,  r1,  #1  @ bump r1 first so that the copy stacked by the push is r1 + 1
  1.1491 +        push            {r0, r1, r4, r11, lr}  @ same register list as the push at the top of mc11
  1.1492 +        sub             r1,  r1,  #1  @ undo the bump in the live register; only the stacked copy keeps the +1
  1.1493 +        b               \type\()_h264_qpel16_mc11  @ enter mc11 after its push
  1.1494 +endfunc
  1.1495 +
  1.1496 +function ff_\type\()_h264_qpel16_mc02_neon, export=1  @ mc02 entry: a single call to the vertical lowpass routine
  1.1497 +        push            {r4, lr}  @ save r4 and the return address
  1.1498 +        lowpass_const   r3  @ macro defined outside this chunk; presumably loads the filter constants with r3 as scratch (confirm)
  1.1499 +        sub             r1,  r1,  r2, lsl #1  @ r1 -= 2 * r2 (presumably two rows up for the vertical taps)
  1.1500 +        mov             r3,  r2  @ r3 = r2
  1.1501 +        vpush           {d8-d15}  @ preserve d8-d15 across the call (callee-saved)
  1.1502 +        bl              \type\()_h264_qpel16_v_lowpass_neon
  1.1503 +        vpop            {d8-d15}
  1.1504 +        pop             {r4, pc}  @ restore r4 and return
  1.1505 +endfunc
  1.1506 +
  1.1507 +function ff_\type\()_h264_qpel16_mc12_neon, export=1  @ mc12 entry: packed vertical pass into a stack buffer, then the l2 hv pass
  1.1508 +        push            {r0, r1, r4-r5, r9-r11, lr}  @ r0 and r1 are stacked so they can be reloaded after the first pass
  1.1509 +\type\()_h264_qpel16_mc12:  @ shared body: mc32 branches here after doing the same push itself
  1.1510 +        lowpass_const   r3  @ macro defined outside this chunk; presumably loads the filter constants with r3 as scratch (confirm)
  1.1511 +        mov             r11, sp  @ r11 = sp after the push; [r11] holds the stacked r0 and r1
  1.1512 +        bic             sp,  sp,  #15  @ round sp down to a 16-byte boundary
  1.1513 +        sub             sp,  sp,  #(16*16+16*12)  @ reserve 448 bytes of scratch
  1.1514 +        sub             r1,  r1,  r2, lsl #1  @ r1 -= 2 * r2 (presumably two rows up for the vertical taps)
  1.1515 +        mov             r0,  sp  @ r0 = start of the scratch area
  1.1516 +        mov             r3,  r2  @ r3 = r2; this copy is what the code below reads after the call
  1.1517 +        vpush           {d8-d15}  @ preserve d8-d15 across the calls (callee-saved)
  1.1518 +        bl              put_h264_qpel16_v_lowpass_neon_packed  @ always the put variant here, for both put and avg instantiations
  1.1519 +        mov             r4,  r0  @ r4 = r0 as left by the call (presumably the unused 16*12 tail of the scratch area; confirm)
  1.1520 +        ldrd            r0,  [r11]  @ reload r0 and r1 with the values stacked by the push
  1.1521 +        sub             r1,  r1,  r3, lsl #1  @ r1 -= 2 * r3; uses r3 rather than r2, which suggests the call changes r2 (confirm)
  1.1522 +        sub             r1,  r1,  #2  @ r1 -= 2
  1.1523 +        mov             r2,  r3  @ rebuild r2 from the copy kept in r3
  1.1524 +        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
  1.1525 +        vpop            {d8-d15}
  1.1526 +        add             sp,  r11,  #8  @ restore sp and skip the two stacked words for r0 and r1
  1.1527 +        pop             {r4-r5, r9-r11, pc}  @ restore the remaining registers and return
  1.1528 +endfunc
  1.1529 +
  1.1530 +function ff_\type\()_h264_qpel16_mc22_neon, export=1  @ mc22 entry: a single call to the hv lowpass routine with a stack scratch buffer
  1.1531 +        push            {r4, r9-r11, lr}  @ r0 and r1 are not stacked here, unlike in mc21 and mc12
  1.1532 +        lowpass_const   r3  @ macro defined outside this chunk; presumably loads the filter constants with r3 as scratch (confirm)
  1.1533 +        mov             r11, sp  @ r11 = sp after the push, used to restore sp at the end
  1.1534 +        bic             sp,  sp,  #15  @ round sp down to a 16-byte boundary
  1.1535 +        sub             r1,  r1,  r2, lsl #1  @ r1 -= 2 * r2
  1.1536 +        sub             r1,  r1,  #2  @ r1 -= 2
  1.1537 +        mov             r3,  r2  @ r3 = r2
  1.1538 +        sub             sp,  sp,  #(16*12)  @ reserve 192 bytes of scratch
  1.1539 +        mov             r4,  sp  @ r4 = scratch buffer (presumably the intermediate storage of the hv routine)
  1.1540 +        vpush           {d8-d15}  @ preserve d8-d15 across the call (callee-saved)
  1.1541 +        bl              \type\()_h264_qpel16_hv_lowpass_neon
  1.1542 +        vpop            {d8-d15}
  1.1543 +        mov             sp,  r11  @ drop the scratch area and the alignment padding
  1.1544 +        pop             {r4, r9-r11, pc}  @ restore registers and return
  1.1545 +endfunc
  1.1546 +
  1.1547 +function ff_\type\()_h264_qpel16_mc32_neon, export=1  @ mc32 entry: reuses the mc12 body with the live r1 advanced by one byte
  1.1548 +        push            {r0, r1, r4-r5, r9-r11, lr}  @ same register list as mc12; the stacked r1 is the unmodified value
  1.1549 +        add             r1,  r1,  #1  @ r1 += 1 after the push, so only the first pass of mc12 sees the +1
  1.1550 +        b               \type\()_h264_qpel16_mc12  @ enter mc12 after its push
  1.1551 +endfunc
  1.1552 +
  1.1553 +function ff_\type\()_h264_qpel16_mc03_neon, export=1  @ mc03 entry: reuses the mc01 body with ip pointing one r2 step further
  1.1554 +        push            {r4, lr}  @ same register list as the push at the top of mc01
  1.1555 +        add             ip,  r1,  r2  @ ip = r1 + r2 (mc01 itself uses ip = r1)
  1.1556 +        b               \type\()_h264_qpel16_mc01  @ enter mc01 after its push and ip setup
  1.1557 +endfunc
  1.1558 +
  1.1559 +function ff_\type\()_h264_qpel16_mc13_neon, export=1  @ mc13 entry: reuses the mc11 body with the live r1 advanced by r2
  1.1560 +        push            {r0, r1, r4, r11, lr}  @ same register list as mc11; the stacked r1 is the unmodified value
  1.1561 +        add             r1,  r1,  r2  @ r1 += r2 after the push, so only the first pass of mc11 sees the offset
  1.1562 +        b               \type\()_h264_qpel16_mc11  @ enter mc11 after its push
  1.1563 +endfunc
  1.1564 +
  1.1565 +function ff_\type\()_h264_qpel16_mc23_neon, export=1  @ mc23 entry: reuses the mc21 body with the live r1 advanced by r2
  1.1566 +        push            {r0, r1, r4-r5, r9-r11, lr}  @ same register list as mc21; the stacked r1 is the unmodified value
  1.1567 +        add             r1,  r1,  r2  @ r1 += r2 after the push, so only the first pass of mc21 sees the offset
  1.1568 +        b               \type\()_h264_qpel16_mc21  @ enter mc21 after its push
  1.1569 +endfunc
  1.1570 +
  1.1571 +function ff_\type\()_h264_qpel16_mc33_neon, export=1  @ mc33 entry: reuses the mc11 body; stacked r1 is +1, live r1 is +r2
  1.1572 +        add             r1,  r1,  #1  @ bump r1 first so that the copy stacked by the push is r1 + 1
  1.1573 +        push            {r0, r1, r4, r11, lr}  @ same register list as the push at the top of mc11
  1.1574 +        add             r1,  r1,  r2  @ live r1 += r2 for the first pass of mc11
  1.1575 +        sub             r1,  r1,  #1  @ undo the +1 in the live register; only the stacked copy keeps it
  1.1576 +        b               \type\()_h264_qpel16_mc11  @ enter mc11 after its push
  1.1577 +endfunc
  1.1578 +        .endm
  1.1579 +
  1.1580 +        h264_qpel16 put  @ expand the macro closed by the .endm above with type = put
  1.1581 +        h264_qpel16 avg  @ expand it again with type = avg
  1.1582 +
  1.1583 +@ Biweighted prediction
  1.1584 +
  1.1585 +        .macro  biweight_16 macs, macd  @ 16-byte-wide loop; macd is the multiply-accumulate mnemonic for the r0 rows, macs for the r1 rows
  1.1586 +        vdup.8          d0,  r4  @ d0 = low byte of r4 in every lane (multiplier for the r0 rows)
  1.1587 +        vdup.8          d1,  r5  @ d1 = low byte of r5 in every lane (multiplier for the r1 rows)
  1.1588 +        vmov            q2,  q8  @ accumulators for the first row start from q8
  1.1589 +        vmov            q3,  q8
  1.1590 +1:      subs            ip,  ip,  #2  @ two rows per iteration; these flags feed the bne at the bottom (nothing in between sets flags)
  1.1591 +        vld1.8          {d20-d21},[r0,:128], r2  @ load 16 bytes from r0 (128-bit aligned), then r0 += r2
  1.1592 +        \macd           q2,  d0,  d20  @ widening multiply-accumulate of the low 8 bytes into q2
  1.1593 +        pld             [r0]  @ prefetch the next r0 row
  1.1594 +        \macd           q3,  d0,  d21  @ same for the high 8 bytes into q3
  1.1595 +        vld1.8          {d22-d23},[r1,:128], r2  @ load 16 bytes from r1, then r1 += r2
  1.1596 +        \macs           q2,  d1,  d22
  1.1597 +        pld             [r1]  @ prefetch the next r1 row
  1.1598 +        \macs           q3,  d1,  d23
  1.1599 +        vmov            q12, q8  @ accumulators for the second row start from q8
  1.1600 +        vld1.8          {d28-d29},[r0,:128], r2  @ second row from r0
  1.1601 +        vmov            q13, q8
  1.1602 +        \macd           q12, d0,  d28
  1.1603 +        pld             [r0]
  1.1604 +        \macd           q13, d0,  d29
  1.1605 +        vld1.8          {d30-d31},[r1,:128], r2  @ second row from r1
  1.1606 +        \macs           q12, d1,  d30
  1.1607 +        pld             [r1]
  1.1608 +        \macs           q13, d1,  d31
  1.1609 +        vshl.s16        q2,  q2,  q9  @ shift each lane by the count in q9 (a negative count shifts right)
  1.1610 +        vshl.s16        q3,  q3,  q9
  1.1611 +        vqmovun.s16     d4,  q2  @ narrow signed 16-bit to unsigned 8-bit with saturation
  1.1612 +        vqmovun.s16     d5,  q3
  1.1613 +        vshl.s16        q12, q12, q9
  1.1614 +        vshl.s16        q13, q13, q9
  1.1615 +        vqmovun.s16     d24, q12
  1.1616 +        vqmovun.s16     d25, q13
  1.1617 +        vmov            q3,  q8  @ re-seed q3 for the next iteration (d5 holds the result, so it is stored first below only via d4-d5 being read by the store)
  1.1618 +        vst1.8          {d4- d5}, [r6,:128], r2  @ store the first result row through r6, then r6 += r2
  1.1619 +        vmov            q2,  q8  @ re-seed q2 for the next iteration, after the store has consumed d4-d5
  1.1620 +        vst1.8          {d24-d25},[r6,:128], r2  @ store the second result row
  1.1621 +        bne             1b  @ loop until ip reaches zero
  1.1622 +        pop             {r4-r6, pc}  @ return from the function that expanded this macro (pairs with its push of r4-r6, lr)
  1.1623 +        .endm
  1.1624 +
  1.1625 +        .macro  biweight_8 macs, macd  @ 8-byte-wide loop; macd is the multiply-accumulate mnemonic for the r0 rows, macs for the r1 rows
  1.1626 +        vdup.8          d0,  r4  @ d0 = low byte of r4 in every lane (multiplier for the r0 rows)
  1.1627 +        vdup.8          d1,  r5  @ d1 = low byte of r5 in every lane (multiplier for the r1 rows)
  1.1628 +        vmov            q1,  q8  @ accumulator for the first row starts from q8
  1.1629 +        vmov            q10, q8  @ accumulator for the second row starts from q8
  1.1630 +1:      subs            ip,  ip,  #2  @ two rows per iteration; these flags feed the bne at the bottom
  1.1631 +        vld1.8          {d4},[r0,:64], r2  @ load 8 bytes from r0 (64-bit aligned), then r0 += r2
  1.1632 +        \macd           q1,  d0,  d4  @ widening multiply-accumulate into q1
  1.1633 +        pld             [r0]  @ prefetch the next r0 row
  1.1634 +        vld1.8          {d5},[r1,:64], r2  @ load 8 bytes from r1, then r1 += r2
  1.1635 +        \macs           q1,  d1,  d5
  1.1636 +        pld             [r1]  @ prefetch the next r1 row
  1.1637 +        vld1.8          {d6},[r0,:64], r2  @ second row from r0
  1.1638 +        \macd           q10, d0,  d6
  1.1639 +        pld             [r0]
  1.1640 +        vld1.8          {d7},[r1,:64], r2  @ second row from r1
  1.1641 +        \macs           q10, d1,  d7
  1.1642 +        pld             [r1]
  1.1643 +        vshl.s16        q1,  q1,  q9  @ shift each lane by the count in q9 (a negative count shifts right)
  1.1644 +        vqmovun.s16     d2,  q1  @ narrow signed 16-bit to unsigned 8-bit with saturation
  1.1645 +        vshl.s16        q10, q10, q9
  1.1646 +        vqmovun.s16     d4,  q10
  1.1647 +        vmov            q10, q8  @ re-seed q10 for the next iteration; its result already sits in d4
  1.1648 +        vst1.8          {d2},[r6,:64], r2  @ store the first result row through r6, then r6 += r2
  1.1649 +        vmov            q1,  q8  @ re-seed q1 only after d2 has been stored
  1.1650 +        vst1.8          {d4},[r6,:64], r2  @ store the second result row
  1.1651 +        bne             1b  @ loop until ip reaches zero
  1.1652 +        pop             {r4-r6, pc}  @ return from the function that expanded this macro (pairs with its push of r4-r6, lr)
  1.1653 +        .endm
  1.1654 +
  1.1655 +        .macro  biweight_4 macs, macd  @ 4-byte-wide loop; two rows share one d register, up to four rows per iteration
  1.1656 +        vdup.8          d0,  r4  @ d0 = low byte of r4 in every lane (multiplier for the r0 rows)
  1.1657 +        vdup.8          d1,  r5  @ d1 = low byte of r5 in every lane (multiplier for the r1 rows)
  1.1658 +        vmov            q1,  q8  @ accumulator for rows 0-1 starts from q8
  1.1659 +        vmov            q10, q8  @ accumulator for rows 2-3 starts from q8
  1.1660 +1:      subs            ip,  ip,  #4  @ four rows per full iteration; flags feed both the blt below and the bne at the bottom
  1.1661 +        vld1.32         {d4[0]},[r0,:32], r2  @ row 0 from r0 into lane 0, then r0 += r2
  1.1662 +        vld1.32         {d4[1]},[r0,:32], r2  @ row 1 from r0 into lane 1
  1.1663 +        \macd           q1,  d0,  d4  @ widening multiply-accumulate of both rows into q1
  1.1664 +        pld             [r0]
  1.1665 +        vld1.32         {d5[0]},[r1,:32], r2  @ rows 0 and 1 from r1
  1.1666 +        vld1.32         {d5[1]},[r1,:32], r2
  1.1667 +        \macs           q1,  d1,  d5
  1.1668 +        pld             [r1]
  1.1669 +        blt             2f  @ ip went negative: only two rows were left, finish them in the tail at label 2
  1.1670 +        vld1.32         {d6[0]},[r0,:32], r2  @ rows 2 and 3 from r0
  1.1671 +        vld1.32         {d6[1]},[r0,:32], r2
  1.1672 +        \macd           q10, d0,  d6
  1.1673 +        pld             [r0]
  1.1674 +        vld1.32         {d7[0]},[r1,:32], r2  @ rows 2 and 3 from r1
  1.1675 +        vld1.32         {d7[1]},[r1,:32], r2
  1.1676 +        \macs           q10, d1,  d7
  1.1677 +        pld             [r1]
  1.1678 +        vshl.s16        q1,  q1,  q9  @ shift each lane by the count in q9 (a negative count shifts right)
  1.1679 +        vqmovun.s16     d2,  q1  @ narrow signed 16-bit to unsigned 8-bit with saturation
  1.1680 +        vshl.s16        q10, q10, q9
  1.1681 +        vqmovun.s16     d4,  q10
  1.1682 +        vmov            q10, q8  @ re-seed q10 for the next iteration; its result already sits in d4
  1.1683 +        vst1.32         {d2[0]},[r6,:32], r2  @ store row 0 through r6, then r6 += r2
  1.1684 +        vst1.32         {d2[1]},[r6,:32], r2  @ store row 1
  1.1685 +        vmov            q1,  q8  @ re-seed q1 only after d2 has been stored
  1.1686 +        vst1.32         {d4[0]},[r6,:32], r2  @ store row 2
  1.1687 +        vst1.32         {d4[1]},[r6,:32], r2  @ store row 3
  1.1688 +        bne             1b  @ loop until ip reaches zero
  1.1689 +        pop             {r4-r6, pc}  @ return from the function that expanded this macro
  1.1690 +2:      vshl.s16        q1,  q1,  q9  @ tail: finish the two rows already accumulated in q1
  1.1691 +        vqmovun.s16     d2,  q1
  1.1692 +        vst1.32         {d2[0]},[r6,:32], r2
  1.1693 +        vst1.32         {d2[1]},[r6,:32], r2
  1.1694 +        pop             {r4-r6, pc}  @ return from the function that expanded this macro
  1.1695 +        .endm
  1.1696 +
  1.1697 +        .macro  biweight_func w  @ emits the shared body for width w: set up q8/q9, then pick one of four sign cases
  1.1698 +function biweight_h264_pixels_\w\()_neon  @ not exported; reached from the biweight_entry stubs with the row count in ip
  1.1699 +        push            {r4-r6, lr}  @ four words pushed; the loop macros return with pop {r4-r6, pc}
  1.1700 +        add             r4,  sp,  #16  @ r4 = address just above the four pushed words
  1.1701 +        ldm             r4,  {r4-r6}  @ r4, r5, r6 = three consecutive words from there (presumably stack-passed arguments: two weights and an offset; confirm against the C prototype)
  1.1702 +        lsr             lr,  r4,  #31  @ lr = sign bit of r4
  1.1703 +        add             r6,  r6,  #1  @ r6 += 1
  1.1704 +        eors            lr,  lr,  r5,  lsr #30  @ lr ^= top two bits of r5; result 0..3 selects the case, and Z feeds the beq 10f below
  1.1705 +        orr             r6,  r6,  #1  @ r6 = (original r6 + 1) | 1
  1.1706 +        vdup.16         q9,  r3  @ every 16-bit lane of q9 = r3
  1.1707 +        lsl             r6,  r6,  r3  @ r6 <<= r3 (no flag update, so the eors flags stay live)
  1.1708 +        vmvn            q9,  q9  @ q9 = ~r3 = -(r3 + 1) per lane, so vshl by q9 is a right shift by r3 + 1
  1.1709 +        vdup.16         q8,  r6  @ q8 = initial accumulator value used by the loop macros
  1.1710 +        mov             r6,  r0  @ r6 = copy of r0 used for the stores, since the loads advance r0
  1.1711 +        beq             10f  @ lr == 0
  1.1712 +        subs            lr,  lr,  #1
  1.1713 +        beq             20f  @ lr == 1
  1.1714 +        subs            lr,  lr,  #1
  1.1715 +        beq             30f  @ lr == 2
  1.1716 +        b               40f  @ lr == 3
  1.1717 +10:     biweight_\w     vmlal.u8, vmlal.u8  @ both products accumulated; presumably both r4 and r5 non-negative (assumes the top two bits of r5 are 00 or 11)
  1.1718 +20:     rsb             r4,  r4,  #0  @ r4 = -r4; presumably the case r4 negative, r5 non-negative
  1.1719 +        biweight_\w     vmlal.u8, vmlsl.u8  @ r1 rows accumulated, r0 rows subtracted
  1.1720 +30:     rsb             r4,  r4,  #0  @ r4 = -r4; presumably the case where both are negative
  1.1721 +        rsb             r5,  r5,  #0  @ r5 = -r5
  1.1722 +        biweight_\w     vmlsl.u8, vmlsl.u8  @ both products subtracted
  1.1723 +40:     rsb             r5,  r5,  #0  @ r5 = -r5; presumably the case r4 non-negative, r5 negative
  1.1724 +        biweight_\w     vmlsl.u8, vmlal.u8  @ r1 rows subtracted, r0 rows accumulated
  1.1725 +endfunc
  1.1726 +        .endm
  1.1727 +
  1.1728 +        .macro  biweight_entry w, h, b=1  @ emits an exported stub for a w x h block; b=0 suppresses the branch
  1.1729 +function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
  1.1730 +        mov             ip,  #\h  @ ip = row count consumed by the loop macros
  1.1731 +.if \b  @ with b=0 no branch is assembled and execution runs on into whatever is assembled next
  1.1732 +        b               biweight_h264_pixels_\w\()_neon  @ jump to the shared body for this width
  1.1733 +.endif
  1.1734 +endfunc
  1.1735 +        .endm
  1.1736 +
  1.1737 +        biweight_entry  16, 8  @ 16x8 stub: sets ip = 8 and branches to the shared 16-wide body
  1.1738 +        biweight_entry  16, 16, b=0  @ 16x16 stub without a branch; presumably relies on the body emitted by the next line following directly
  1.1739 +        biweight_func   16  @ shared 16-wide body
  1.1740 +
  1.1741 +        biweight_entry  8,  16  @ 8x16 stub: sets ip = 16 and branches to the shared 8-wide body
  1.1742 +        biweight_entry  8,  4  @ 8x4 stub
  1.1743 +        biweight_entry  8,  8,  b=0  @ 8x8 stub without a branch; presumably relies on the body emitted by the next line following directly
  1.1744 +        biweight_func   8  @ shared 8-wide body
  1.1745 +
  1.1746 +        biweight_entry  4,  8  @ 4x8 stub: sets ip = 8 and branches to the shared 4-wide body
  1.1747 +        biweight_entry  4,  2  @ 4x2 stub; with ip = 2 the 4-wide loop leaves through its two-row tail
  1.1748 +        biweight_entry  4,  4,  b=0  @ 4x4 stub without a branch; presumably relies on the body emitted by the next line following directly
  1.1749 +        biweight_func   4  @ shared 4-wide body
  1.1750 +
  1.1751 +@ Weighted prediction
  1.1752 +
  1.1753 +        .macro  weight_16 add  @ 16-byte-wide loop; add is the mnemonic that combines q8 with each product
  1.1754 +        vdup.8          d0,  r3  @ d0 = low byte of r3 in every lane (the multiplier)
  1.1755 +1:      subs            ip,  ip,  #2  @ two rows per iteration; these flags feed the bne at the bottom
  1.1756 +        vld1.8          {d20-d21},[r0,:128], r1  @ load 16 bytes from r0 (128-bit aligned), then r0 += r1
  1.1757 +        vmull.u8        q2,  d0,  d20  @ widening multiply of the low 8 bytes
  1.1758 +        pld             [r0]  @ prefetch the next row
  1.1759 +        vmull.u8        q3,  d0,  d21  @ widening multiply of the high 8 bytes
  1.1760 +        vld1.8          {d28-d29},[r0,:128], r1  @ second row
  1.1761 +        vmull.u8        q12, d0,  d28
  1.1762 +        pld             [r0]
  1.1763 +        vmull.u8        q13, d0,  d29
  1.1764 +        \add            q2,  q8,  q2  @ q2 = q8 combined with the product; q8 is the first operand, which matters for the subtracting forms
  1.1765 +        vrshl.s16       q2,  q2,  q9  @ rounding shift by the per-lane count in q9 (a negative count shifts right)
  1.1766 +        \add            q3,  q8,  q3
  1.1767 +        vrshl.s16       q3,  q3,  q9
  1.1768 +        vqmovun.s16     d4,  q2  @ narrow signed 16-bit to unsigned 8-bit with saturation
  1.1769 +        vqmovun.s16     d5,  q3
  1.1770 +        \add            q12, q8,  q12
  1.1771 +        vrshl.s16       q12, q12, q9
  1.1772 +        \add            q13, q8,  q13
  1.1773 +        vrshl.s16       q13, q13, q9
  1.1774 +        vqmovun.s16     d24, q12
  1.1775 +        vqmovun.s16     d25, q13
  1.1776 +        vst1.8          {d4- d5}, [r4,:128], r1  @ store the first result row through r4, then r4 += r1
  1.1777 +        vst1.8          {d24-d25},[r4,:128], r1  @ store the second result row
  1.1778 +        bne             1b  @ loop until ip reaches zero
  1.1779 +        pop             {r4, pc}  @ return from the function that expanded this macro (pairs with its push of r4, lr)
  1.1780 +        .endm
  1.1781 +
  1.1782 +        .macro  weight_8 add  @ 8-byte-wide loop; add is the mnemonic that combines q8 with each product
  1.1783 +        vdup.8          d0,  r3  @ d0 = low byte of r3 in every lane (the multiplier)
  1.1784 +1:      subs            ip,  ip,  #2  @ two rows per iteration; these flags feed the bne at the bottom
  1.1785 +        vld1.8          {d4},[r0,:64], r1  @ load 8 bytes from r0 (64-bit aligned), then r0 += r1
  1.1786 +        vmull.u8        q1,  d0,  d4  @ widening multiply of the first row
  1.1787 +        pld             [r0]  @ prefetch the next row
  1.1788 +        vld1.8          {d6},[r0,:64], r1  @ second row
  1.1789 +        vmull.u8        q10, d0,  d6
  1.1790 +        \add            q1,  q8,  q1  @ q1 = q8 combined with the product; q8 is the first operand
  1.1791 +        pld             [r0]
  1.1792 +        vrshl.s16       q1,  q1,  q9  @ rounding shift by the per-lane count in q9 (a negative count shifts right)
  1.1793 +        vqmovun.s16     d2,  q1  @ narrow signed 16-bit to unsigned 8-bit with saturation
  1.1794 +        \add            q10, q8,  q10
  1.1795 +        vrshl.s16       q10, q10, q9
  1.1796 +        vqmovun.s16     d4,  q10
  1.1797 +        vst1.8          {d2},[r4,:64], r1  @ store the first result row through r4, then r4 += r1
  1.1798 +        vst1.8          {d4},[r4,:64], r1  @ store the second result row
  1.1799 +        bne             1b  @ loop until ip reaches zero
  1.1800 +        pop             {r4, pc}  @ return from the function that expanded this macro (pairs with its push of r4, lr)
  1.1801 +        .endm
  1.1802 +
  1.1803 +        .macro  weight_4 add  @ 4-byte-wide loop; two rows share one d register, up to four rows per iteration
  1.1804 +        vdup.8          d0,  r3  @ d0 = low byte of r3 in every lane (the multiplier)
  1.1805 +        vmov            q1,  q8  @ NOTE(review): q1 is fully overwritten by the vmull below, so this copy looks redundant
  1.1806 +        vmov            q10, q8  @ NOTE(review): q10 is likewise overwritten by vmull before any read
  1.1807 +1:      subs            ip,  ip,  #4  @ four rows per full iteration; flags feed both the blt below and the bne at the bottom
  1.1808 +        vld1.32         {d4[0]},[r0,:32], r1  @ row 0 into lane 0, then r0 += r1
  1.1809 +        vld1.32         {d4[1]},[r0,:32], r1  @ row 1 into lane 1
  1.1810 +        vmull.u8        q1,  d0,  d4  @ widening multiply of rows 0-1
  1.1811 +        pld             [r0]
  1.1812 +        blt             2f  @ ip went negative: only two rows were left, finish them in the tail at label 2
  1.1813 +        vld1.32         {d6[0]},[r0,:32], r1  @ rows 2 and 3
  1.1814 +        vld1.32         {d6[1]},[r0,:32], r1
  1.1815 +        vmull.u8        q10, d0,  d6  @ widening multiply of rows 2-3
  1.1816 +        pld             [r0]
  1.1817 +        \add            q1,  q8,  q1  @ q1 = q8 combined with the product; q8 is the first operand
  1.1818 +        vrshl.s16       q1,  q1,  q9  @ rounding shift by the per-lane count in q9 (a negative count shifts right)
  1.1819 +        vqmovun.s16     d2,  q1  @ narrow signed 16-bit to unsigned 8-bit with saturation
  1.1820 +        \add            q10, q8,  q10
  1.1821 +        vrshl.s16       q10, q10, q9
  1.1822 +        vqmovun.s16     d4,  q10
  1.1823 +        vmov            q10, q8  @ NOTE(review): overwritten by vmull on the next iteration; looks redundant
  1.1824 +        vst1.32         {d2[0]},[r4,:32], r1  @ store row 0 through r4, then r4 += r1
  1.1825 +        vst1.32         {d2[1]},[r4,:32], r1  @ store row 1
  1.1826 +        vmov            q1,  q8  @ NOTE(review): overwritten by vmull on the next iteration; looks redundant
  1.1827 +        vst1.32         {d4[0]},[r4,:32], r1  @ store row 2
  1.1828 +        vst1.32         {d4[1]},[r4,:32], r1  @ store row 3
  1.1829 +        bne             1b  @ loop until ip reaches zero
  1.1830 +        pop             {r4, pc}  @ return from the function that expanded this macro
  1.1831 +2:      \add            q1,  q8,  q1  @ tail: finish the two rows already multiplied into q1
  1.1832 +        vrshl.s16       q1,  q1,  q9
  1.1833 +        vqmovun.s16     d2,  q1
  1.1834 +        vst1.32         {d2[0]},[r4,:32], r1
  1.1835 +        vst1.32         {d2[1]},[r4,:32], r1
  1.1836 +        pop             {r4, pc}  @ return from the function that expanded this macro
  1.1837 +        .endm
  1.1838 +
  1.1839 +        .macro  weight_func w  @ emits the shared body for width w: set up q8/q9, then pick one of four loop variants
  1.1840 +function weight_h264_pixels_\w\()_neon  @ not exported; reached from the weight_entry stubs with the row count in ip
  1.1841 +        push            {r4, lr}  @ two words pushed; the loop macros return with pop {r4, pc}
  1.1842 +        ldr             r4,  [sp, #8]  @ r4 = word just above the two pushed registers (presumably a stack-passed offset argument; confirm)
  1.1843 +        cmp             r2,  #1  @ flags feed the ble 20f below; nothing in between updates them
  1.1844 +        lsl             r4,  r4,  r2  @ r4 <<= r2
  1.1845 +        vdup.16         q8,  r4  @ q8 = shifted value in every 16-bit lane, combined with each product in the loop
  1.1846 +        mov             r4,  r0  @ r4 = copy of r0 used for the stores, since the loads advance r0
  1.1847 +        ble             20f  @ r2 <= 1: use the non-halving add/sub variants
  1.1848 +        rsb             lr,  r2,  #1  @ lr = 1 - r2
  1.1849 +        vdup.16         q9,  lr  @ shift count for vrshl: right by r2 - 1, the halving add/sub supplies the remaining bit
  1.1850 +        cmp             r3,  #0
  1.1851 +        blt             10f  @ negative r3: use the subtracting variant
  1.1852 +        weight_\w       vhadd.s16  @ r2 > 1, r3 >= 0
  1.1853 +10:     rsb             r3,  r3,  #0  @ r3 = -r3 so the multiplier is positive
  1.1854 +        weight_\w       vhsub.s16  @ r2 > 1, r3 < 0: q8 minus product, halved
  1.1855 +20:     rsb             lr,  r2,  #0  @ lr = -r2
  1.1856 +        vdup.16         q9,  lr  @ shift count for vrshl: right by r2
  1.1857 +        cmp             r3,  #0
  1.1858 +        blt             10f  @ numeric labels can repeat; 10f binds to the next label 10 below
  1.1859 +        weight_\w       vadd.s16  @ r2 <= 1, r3 >= 0
  1.1860 +10:     rsb             r3,  r3,  #0  @ r3 = -r3 so the multiplier is positive
  1.1861 +        weight_\w       vsub.s16  @ r2 <= 1, r3 < 0: q8 minus product
  1.1862 +endfunc
  1.1863 +        .endm
  1.1864 +
  1.1865 +        .macro  weight_entry w, h, b=1  @ emits an exported stub for a w x h block; b=0 suppresses the branch
  1.1866 +function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
  1.1867 +        mov             ip,  #\h  @ ip = row count consumed by the loop macros
  1.1868 +.if \b  @ with b=0 no branch is assembled and execution runs on into whatever is assembled next
  1.1869 +        b               weight_h264_pixels_\w\()_neon  @ jump to the shared body for this width
  1.1870 +.endif
  1.1871 +endfunc
  1.1872 +        .endm
  1.1873 +
  1.1874 +        weight_entry    16, 8  @ 16x8 stub: sets ip = 8 and branches to the shared 16-wide body
  1.1875 +        weight_entry    16, 16, b=0  @ 16x16 stub without a branch; presumably relies on the body emitted by the next line following directly
  1.1876 +        weight_func     16  @ shared 16-wide body
  1.1877 +
  1.1878 +        weight_entry    8,  16  @ 8x16 stub: sets ip = 16 and branches to the shared 8-wide body
  1.1879 +        weight_entry    8,  4  @ 8x4 stub
  1.1880 +        weight_entry    8,  8,  b=0  @ 8x8 stub without a branch; presumably relies on the body emitted by the next line following directly
  1.1881 +        weight_func     8  @ shared 8-wide body
  1.1882 +
  1.1883 +        weight_entry    4,  8  @ 4x8 stub: sets ip = 8 and branches to the shared 4-wide body
  1.1884 +        weight_entry    4,  2  @ 4x2 stub; with ip = 2 the 4-wide loop leaves through its two-row tail
  1.1885 +        weight_entry    4,  4,  b=0  @ 4x4 stub without a branch; presumably relies on the body emitted by the next line following directly
  1.1886 +        weight_func     4  @ shared 4-wide body