Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSs__H264__App
diff libavcodec/arm/h264dsp_neon.S @ 2:897f711a7157
rearrange to work with autoconf
| author | Nina Engelhardt <nengel@mailbox.tu-berlin.de> |
|---|---|
| date | Tue, 25 Sep 2012 15:55:33 +0200 |
| parents | |
| children | |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/libavcodec/arm/h264dsp_neon.S Tue Sep 25 15:55:33 2012 +0200 1.3 @@ -0,0 +1,1883 @@ 1.4 +/* 1.5 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> 1.6 + * 1.7 + * This file is part of FFmpeg. 1.8 + * 1.9 + * FFmpeg is free software; you can redistribute it and/or 1.10 + * modify it under the terms of the GNU Lesser General Public 1.11 + * License as published by the Free Software Foundation; either 1.12 + * version 2.1 of the License, or (at your option) any later version. 1.13 + * 1.14 + * FFmpeg is distributed in the hope that it will be useful, 1.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 1.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 1.17 + * Lesser General Public License for more details. 1.18 + * 1.19 + * You should have received a copy of the GNU Lesser General Public 1.20 + * License along with FFmpeg; if not, write to the Free Software 1.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 1.22 + */ 1.23 + 1.24 +#include "asm.S" 1.25 + 1.26 + .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7 1.27 + vtrn.32 \r0, \r4 1.28 + vtrn.32 \r1, \r5 1.29 + vtrn.32 \r2, \r6 1.30 + vtrn.32 \r3, \r7 1.31 + vtrn.16 \r0, \r2 1.32 + vtrn.16 \r1, \r3 1.33 + vtrn.16 \r4, \r6 1.34 + vtrn.16 \r5, \r7 1.35 + vtrn.8 \r0, \r1 1.36 + vtrn.8 \r2, \r3 1.37 + vtrn.8 \r4, \r5 1.38 + vtrn.8 \r6, \r7 1.39 + .endm 1.40 + 1.41 + .macro transpose_4x4 r0 r1 r2 r3 1.42 + vtrn.16 \r0, \r2 1.43 + vtrn.16 \r1, \r3 1.44 + vtrn.8 \r0, \r1 1.45 + vtrn.8 \r2, \r3 1.46 + .endm 1.47 + 1.48 + .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7 1.49 + vswp \r0, \r4 1.50 + vswp \r1, \r5 1.51 + vswp \r2, \r6 1.52 + vswp \r3, \r7 1.53 + .endm 1.54 + 1.55 + .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7 1.56 + vtrn.32 \r0, \r2 1.57 + vtrn.32 \r1, \r3 1.58 + vtrn.32 \r4, \r6 1.59 + vtrn.32 \r5, \r7 1.60 + vtrn.16 \r0, \r1 1.61 + vtrn.16 \r2, \r3 1.62 + vtrn.16 \r4, \r5 1.63 + 
vtrn.16 \r6, \r7 1.64 + .endm 1.65 + 1.66 +/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ 1.67 + .macro h264_chroma_mc8 type 1.68 +function ff_\type\()_h264_chroma_mc8_neon, export=1 1.69 + push {r4-r7, lr} 1.70 + ldrd r4, [sp, #20] 1.71 +.ifc \type,avg 1.72 + mov lr, r0 1.73 +.endif 1.74 + pld [r1] 1.75 + pld [r1, r2] 1.76 + 1.77 + muls r7, r4, r5 1.78 + rsb r6, r7, r5, lsl #3 1.79 + rsb ip, r7, r4, lsl #3 1.80 + sub r4, r7, r4, lsl #3 1.81 + sub r4, r4, r5, lsl #3 1.82 + add r4, r4, #64 1.83 + 1.84 + beq 2f 1.85 + 1.86 + add r5, r1, r2 1.87 + 1.88 + vdup.8 d0, r4 1.89 + lsl r4, r2, #1 1.90 + vdup.8 d1, ip 1.91 + vld1.64 {d4, d5}, [r1], r4 1.92 + vdup.8 d2, r6 1.93 + vld1.64 {d6, d7}, [r5], r4 1.94 + vdup.8 d3, r7 1.95 + 1.96 + vext.8 d5, d4, d5, #1 1.97 + vext.8 d7, d6, d7, #1 1.98 + 1.99 +1: pld [r5] 1.100 + vmull.u8 q8, d4, d0 1.101 + vmlal.u8 q8, d5, d1 1.102 + vld1.64 {d4, d5}, [r1], r4 1.103 + vmlal.u8 q8, d6, d2 1.104 + vext.8 d5, d4, d5, #1 1.105 + vmlal.u8 q8, d7, d3 1.106 + vmull.u8 q9, d6, d0 1.107 + subs r3, r3, #2 1.108 + vmlal.u8 q9, d7, d1 1.109 + vmlal.u8 q9, d4, d2 1.110 + vmlal.u8 q9, d5, d3 1.111 + vrshrn.u16 d16, q8, #6 1.112 + vld1.64 {d6, d7}, [r5], r4 1.113 + pld [r1] 1.114 + vrshrn.u16 d17, q9, #6 1.115 +.ifc \type,avg 1.116 + vld1.64 {d20}, [lr,:64], r2 1.117 + vld1.64 {d21}, [lr,:64], r2 1.118 + vrhadd.u8 q8, q8, q10 1.119 +.endif 1.120 + vext.8 d7, d6, d7, #1 1.121 + vst1.64 {d16}, [r0,:64], r2 1.122 + vst1.64 {d17}, [r0,:64], r2 1.123 + bgt 1b 1.124 + 1.125 + pop {r4-r7, pc} 1.126 + 1.127 +2: tst r6, r6 1.128 + add ip, ip, r6 1.129 + vdup.8 d0, r4 1.130 + vdup.8 d1, ip 1.131 + 1.132 + beq 4f 1.133 + 1.134 + add r5, r1, r2 1.135 + lsl r4, r2, #1 1.136 + vld1.64 {d4}, [r1], r4 1.137 + vld1.64 {d6}, [r5], r4 1.138 + 1.139 +3: pld [r5] 1.140 + vmull.u8 q8, d4, d0 1.141 + vmlal.u8 q8, d6, d1 1.142 + vld1.64 {d4}, [r1], r4 1.143 + vmull.u8 q9, d6, d0 1.144 + vmlal.u8 q9, d4, d1 1.145 + vld1.64 {d6}, [r5], r4 
1.146 + vrshrn.u16 d16, q8, #6 1.147 + vrshrn.u16 d17, q9, #6 1.148 +.ifc \type,avg 1.149 + vld1.64 {d20}, [lr,:64], r2 1.150 + vld1.64 {d21}, [lr,:64], r2 1.151 + vrhadd.u8 q8, q8, q10 1.152 +.endif 1.153 + subs r3, r3, #2 1.154 + pld [r1] 1.155 + vst1.64 {d16}, [r0,:64], r2 1.156 + vst1.64 {d17}, [r0,:64], r2 1.157 + bgt 3b 1.158 + 1.159 + pop {r4-r7, pc} 1.160 + 1.161 +4: vld1.64 {d4, d5}, [r1], r2 1.162 + vld1.64 {d6, d7}, [r1], r2 1.163 + vext.8 d5, d4, d5, #1 1.164 + vext.8 d7, d6, d7, #1 1.165 + 1.166 +5: pld [r1] 1.167 + subs r3, r3, #2 1.168 + vmull.u8 q8, d4, d0 1.169 + vmlal.u8 q8, d5, d1 1.170 + vld1.64 {d4, d5}, [r1], r2 1.171 + vmull.u8 q9, d6, d0 1.172 + vmlal.u8 q9, d7, d1 1.173 + pld [r1] 1.174 + vext.8 d5, d4, d5, #1 1.175 + vrshrn.u16 d16, q8, #6 1.176 + vrshrn.u16 d17, q9, #6 1.177 +.ifc \type,avg 1.178 + vld1.64 {d20}, [lr,:64], r2 1.179 + vld1.64 {d21}, [lr,:64], r2 1.180 + vrhadd.u8 q8, q8, q10 1.181 +.endif 1.182 + vld1.64 {d6, d7}, [r1], r2 1.183 + vext.8 d7, d6, d7, #1 1.184 + vst1.64 {d16}, [r0,:64], r2 1.185 + vst1.64 {d17}, [r0,:64], r2 1.186 + bgt 5b 1.187 + 1.188 + pop {r4-r7, pc} 1.189 +endfunc 1.190 + .endm 1.191 + 1.192 +/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ 1.193 + .macro h264_chroma_mc4 type 1.194 +function ff_\type\()_h264_chroma_mc4_neon, export=1 1.195 + push {r4-r7, lr} 1.196 + ldrd r4, [sp, #20] 1.197 +.ifc \type,avg 1.198 + mov lr, r0 1.199 +.endif 1.200 + pld [r1] 1.201 + pld [r1, r2] 1.202 + 1.203 + muls r7, r4, r5 1.204 + rsb r6, r7, r5, lsl #3 1.205 + rsb ip, r7, r4, lsl #3 1.206 + sub r4, r7, r4, lsl #3 1.207 + sub r4, r4, r5, lsl #3 1.208 + add r4, r4, #64 1.209 + 1.210 + beq 2f 1.211 + 1.212 + add r5, r1, r2 1.213 + 1.214 + vdup.8 d0, r4 1.215 + lsl r4, r2, #1 1.216 + vdup.8 d1, ip 1.217 + vld1.64 {d4}, [r1], r4 1.218 + vdup.8 d2, r6 1.219 + vld1.64 {d6}, [r5], r4 1.220 + vdup.8 d3, r7 1.221 + 1.222 + vext.8 d5, d4, d5, #1 1.223 + vext.8 d7, d6, d7, #1 1.224 + vtrn.32 d4, d5 
1.225 + vtrn.32 d6, d7 1.226 + 1.227 + vtrn.32 d0, d1 1.228 + vtrn.32 d2, d3 1.229 + 1.230 +1: pld [r5] 1.231 + vmull.u8 q8, d4, d0 1.232 + vmlal.u8 q8, d6, d2 1.233 + vld1.64 {d4}, [r1], r4 1.234 + vext.8 d5, d4, d5, #1 1.235 + vtrn.32 d4, d5 1.236 + vmull.u8 q9, d6, d0 1.237 + vmlal.u8 q9, d4, d2 1.238 + vld1.64 {d6}, [r5], r4 1.239 + vadd.i16 d16, d16, d17 1.240 + vadd.i16 d17, d18, d19 1.241 + vrshrn.u16 d16, q8, #6 1.242 + subs r3, r3, #2 1.243 + pld [r1] 1.244 +.ifc \type,avg 1.245 + vld1.32 {d20[0]}, [lr,:32], r2 1.246 + vld1.32 {d20[1]}, [lr,:32], r2 1.247 + vrhadd.u8 d16, d16, d20 1.248 +.endif 1.249 + vext.8 d7, d6, d7, #1 1.250 + vtrn.32 d6, d7 1.251 + vst1.32 {d16[0]}, [r0,:32], r2 1.252 + vst1.32 {d16[1]}, [r0,:32], r2 1.253 + bgt 1b 1.254 + 1.255 + pop {r4-r7, pc} 1.256 + 1.257 +2: tst r6, r6 1.258 + add ip, ip, r6 1.259 + vdup.8 d0, r4 1.260 + vdup.8 d1, ip 1.261 + vtrn.32 d0, d1 1.262 + 1.263 + beq 4f 1.264 + 1.265 + vext.32 d1, d0, d1, #1 1.266 + add r5, r1, r2 1.267 + lsl r4, r2, #1 1.268 + vld1.32 {d4[0]}, [r1], r4 1.269 + vld1.32 {d4[1]}, [r5], r4 1.270 + 1.271 +3: pld [r5] 1.272 + vmull.u8 q8, d4, d0 1.273 + vld1.32 {d4[0]}, [r1], r4 1.274 + vmull.u8 q9, d4, d1 1.275 + vld1.32 {d4[1]}, [r5], r4 1.276 + vadd.i16 d16, d16, d17 1.277 + vadd.i16 d17, d18, d19 1.278 + vrshrn.u16 d16, q8, #6 1.279 +.ifc \type,avg 1.280 + vld1.32 {d20[0]}, [lr,:32], r2 1.281 + vld1.32 {d20[1]}, [lr,:32], r2 1.282 + vrhadd.u8 d16, d16, d20 1.283 +.endif 1.284 + subs r3, r3, #2 1.285 + pld [r1] 1.286 + vst1.32 {d16[0]}, [r0,:32], r2 1.287 + vst1.32 {d16[1]}, [r0,:32], r2 1.288 + bgt 3b 1.289 + 1.290 + pop {r4-r7, pc} 1.291 + 1.292 +4: vld1.64 {d4}, [r1], r2 1.293 + vld1.64 {d6}, [r1], r2 1.294 + vext.8 d5, d4, d5, #1 1.295 + vext.8 d7, d6, d7, #1 1.296 + vtrn.32 d4, d5 1.297 + vtrn.32 d6, d7 1.298 + 1.299 +5: vmull.u8 q8, d4, d0 1.300 + vmull.u8 q9, d6, d0 1.301 + subs r3, r3, #2 1.302 + vld1.64 {d4}, [r1], r2 1.303 + vext.8 d5, d4, d5, #1 1.304 + vtrn.32 d4, d5 1.305 + 
vadd.i16 d16, d16, d17 1.306 + vadd.i16 d17, d18, d19 1.307 + pld [r1] 1.308 + vrshrn.u16 d16, q8, #6 1.309 +.ifc \type,avg 1.310 + vld1.32 {d20[0]}, [lr,:32], r2 1.311 + vld1.32 {d20[1]}, [lr,:32], r2 1.312 + vrhadd.u8 d16, d16, d20 1.313 +.endif 1.314 + vld1.64 {d6}, [r1], r2 1.315 + vext.8 d7, d6, d7, #1 1.316 + vtrn.32 d6, d7 1.317 + pld [r1] 1.318 + vst1.32 {d16[0]}, [r0,:32], r2 1.319 + vst1.32 {d16[1]}, [r0,:32], r2 1.320 + bgt 5b 1.321 + 1.322 + pop {r4-r7, pc} 1.323 +endfunc 1.324 + .endm 1.325 + 1.326 + .macro h264_chroma_mc2 type 1.327 +function ff_\type\()_h264_chroma_mc2_neon, export=1 1.328 + push {r4-r6, lr} 1.329 + ldr r4, [sp, #16] 1.330 + ldr lr, [sp, #20] 1.331 + pld [r1] 1.332 + pld [r1, r2] 1.333 + orrs r5, r4, lr 1.334 + beq 2f 1.335 + 1.336 + mul r5, r4, lr 1.337 + rsb r6, r5, lr, lsl #3 1.338 + rsb r12, r5, r4, lsl #3 1.339 + sub r4, r5, r4, lsl #3 1.340 + sub r4, r4, lr, lsl #3 1.341 + add r4, r4, #64 1.342 + vdup.8 d0, r4 1.343 + vdup.8 d2, r12 1.344 + vdup.8 d1, r6 1.345 + vdup.8 d3, r5 1.346 + vtrn.16 q0, q1 1.347 +1: 1.348 + vld1.32 {d4[0]}, [r1], r2 1.349 + vld1.32 {d4[1]}, [r1], r2 1.350 + vrev64.32 d5, d4 1.351 + vld1.32 {d5[1]}, [r1] 1.352 + vext.8 q3, q2, q2, #1 1.353 + vtrn.16 q2, q3 1.354 + vmull.u8 q8, d4, d0 1.355 + vmlal.u8 q8, d5, d1 1.356 +.ifc \type,avg 1.357 + vld1.16 {d18[0]}, [r0,:16], r2 1.358 + vld1.16 {d18[1]}, [r0,:16] 1.359 + sub r0, r0, r2 1.360 +.endif 1.361 + vtrn.32 d16, d17 1.362 + vadd.i16 d16, d16, d17 1.363 + vrshrn.u16 d16, q8, #6 1.364 +.ifc \type,avg 1.365 + vrhadd.u8 d16, d16, d18 1.366 +.endif 1.367 + vst1.16 {d16[0]}, [r0,:16], r2 1.368 + vst1.16 {d16[1]}, [r0,:16], r2 1.369 + subs r3, r3, #2 1.370 + bgt 1b 1.371 + pop {r4-r6, pc} 1.372 +2: 1.373 +.ifc \type,put 1.374 + ldrh r5, [r1], r2 1.375 + strh r5, [r0], r2 1.376 + ldrh r6, [r1], r2 1.377 + strh r6, [r0], r2 1.378 +.else 1.379 + vld1.16 {d16[0]}, [r1], r2 1.380 + vld1.16 {d16[1]}, [r1], r2 1.381 + vld1.16 {d18[0]}, [r0,:16], r2 1.382 + vld1.16 
{d18[1]}, [r0,:16] 1.383 + sub r0, r0, r2 1.384 + vrhadd.u8 d16, d16, d18 1.385 + vst1.16 {d16[0]}, [r0,:16], r2 1.386 + vst1.16 {d16[1]}, [r0,:16], r2 1.387 +.endif 1.388 + subs r3, r3, #2 1.389 + bgt 2b 1.390 + pop {r4-r6, pc} 1.391 +endfunc 1.392 +.endm 1.393 + 1.394 + .text 1.395 + .align 1.396 + 1.397 + h264_chroma_mc8 put 1.398 + h264_chroma_mc8 avg 1.399 + h264_chroma_mc4 put 1.400 + h264_chroma_mc4 avg 1.401 + h264_chroma_mc2 put 1.402 + h264_chroma_mc2 avg 1.403 + 1.404 + /* H.264 loop filter */ 1.405 + 1.406 + .macro h264_loop_filter_start 1.407 + ldr ip, [sp] 1.408 + tst r2, r2 1.409 + ldr ip, [ip] 1.410 + tstne r3, r3 1.411 + vmov.32 d24[0], ip 1.412 + and ip, ip, ip, lsl #16 1.413 + bxeq lr 1.414 + ands ip, ip, ip, lsl #8 1.415 + bxlt lr 1.416 + .endm 1.417 + 1.418 + .macro align_push_regs 1.419 + and ip, sp, #15 1.420 + add ip, ip, #32 1.421 + sub sp, sp, ip 1.422 + vst1.64 {d12-d15}, [sp,:128] 1.423 + sub sp, sp, #32 1.424 + vst1.64 {d8-d11}, [sp,:128] 1.425 + .endm 1.426 + 1.427 + .macro align_pop_regs 1.428 + vld1.64 {d8-d11}, [sp,:128]! 
1.429 + vld1.64 {d12-d15}, [sp,:128], ip 1.430 + .endm 1.431 + 1.432 + .macro h264_loop_filter_luma 1.433 + vdup.8 q11, r2 @ alpha 1.434 + vmovl.u8 q12, d24 1.435 + vabd.u8 q6, q8, q0 @ abs(p0 - q0) 1.436 + vmovl.u16 q12, d24 1.437 + vabd.u8 q14, q9, q8 @ abs(p1 - p0) 1.438 + vsli.16 q12, q12, #8 1.439 + vabd.u8 q15, q1, q0 @ abs(q1 - q0) 1.440 + vsli.32 q12, q12, #16 1.441 + vclt.u8 q6, q6, q11 @ < alpha 1.442 + vdup.8 q11, r3 @ beta 1.443 + vclt.s8 q7, q12, #0 1.444 + vclt.u8 q14, q14, q11 @ < beta 1.445 + vclt.u8 q15, q15, q11 @ < beta 1.446 + vbic q6, q6, q7 1.447 + vabd.u8 q4, q10, q8 @ abs(p2 - p0) 1.448 + vand q6, q6, q14 1.449 + vabd.u8 q5, q2, q0 @ abs(q2 - q0) 1.450 + vclt.u8 q4, q4, q11 @ < beta 1.451 + vand q6, q6, q15 1.452 + vclt.u8 q5, q5, q11 @ < beta 1.453 + vand q4, q4, q6 1.454 + vand q5, q5, q6 1.455 + vand q12, q12, q6 1.456 + vrhadd.u8 q14, q8, q0 1.457 + vsub.i8 q6, q12, q4 1.458 + vqadd.u8 q7, q9, q12 1.459 + vhadd.u8 q10, q10, q14 1.460 + vsub.i8 q6, q6, q5 1.461 + vhadd.u8 q14, q2, q14 1.462 + vmin.u8 q7, q7, q10 1.463 + vqsub.u8 q11, q9, q12 1.464 + vqadd.u8 q2, q1, q12 1.465 + vmax.u8 q7, q7, q11 1.466 + vqsub.u8 q11, q1, q12 1.467 + vmin.u8 q14, q2, q14 1.468 + vmovl.u8 q2, d0 1.469 + vmax.u8 q14, q14, q11 1.470 + vmovl.u8 q10, d1 1.471 + vsubw.u8 q2, q2, d16 1.472 + vsubw.u8 q10, q10, d17 1.473 + vshl.i16 q2, q2, #2 1.474 + vshl.i16 q10, q10, #2 1.475 + vaddw.u8 q2, q2, d18 1.476 + vaddw.u8 q10, q10, d19 1.477 + vsubw.u8 q2, q2, d2 1.478 + vsubw.u8 q10, q10, d3 1.479 + vrshrn.i16 d4, q2, #3 1.480 + vrshrn.i16 d5, q10, #3 1.481 + vbsl q4, q7, q9 1.482 + vbsl q5, q14, q1 1.483 + vneg.s8 q7, q6 1.484 + vmovl.u8 q14, d16 1.485 + vmin.s8 q2, q2, q6 1.486 + vmovl.u8 q6, d17 1.487 + vmax.s8 q2, q2, q7 1.488 + vmovl.u8 q11, d0 1.489 + vmovl.u8 q12, d1 1.490 + vaddw.s8 q14, q14, d4 1.491 + vaddw.s8 q6, q6, d5 1.492 + vsubw.s8 q11, q11, d4 1.493 + vsubw.s8 q12, q12, d5 1.494 + vqmovun.s16 d16, q14 1.495 + vqmovun.s16 d17, q6 1.496 + vqmovun.s16 
d0, q11 1.497 + vqmovun.s16 d1, q12 1.498 + .endm 1.499 + 1.500 +function ff_h264_v_loop_filter_luma_neon, export=1 1.501 + h264_loop_filter_start 1.502 + 1.503 + vld1.64 {d0, d1}, [r0,:128], r1 1.504 + vld1.64 {d2, d3}, [r0,:128], r1 1.505 + vld1.64 {d4, d5}, [r0,:128], r1 1.506 + sub r0, r0, r1, lsl #2 1.507 + sub r0, r0, r1, lsl #1 1.508 + vld1.64 {d20,d21}, [r0,:128], r1 1.509 + vld1.64 {d18,d19}, [r0,:128], r1 1.510 + vld1.64 {d16,d17}, [r0,:128], r1 1.511 + 1.512 + align_push_regs 1.513 + 1.514 + h264_loop_filter_luma 1.515 + 1.516 + sub r0, r0, r1, lsl #1 1.517 + vst1.64 {d8, d9}, [r0,:128], r1 1.518 + vst1.64 {d16,d17}, [r0,:128], r1 1.519 + vst1.64 {d0, d1}, [r0,:128], r1 1.520 + vst1.64 {d10,d11}, [r0,:128] 1.521 + 1.522 + align_pop_regs 1.523 + bx lr 1.524 +endfunc 1.525 + 1.526 +function ff_h264_h_loop_filter_luma_neon, export=1 1.527 + h264_loop_filter_start 1.528 + 1.529 + sub r0, r0, #4 1.530 + vld1.64 {d6}, [r0], r1 1.531 + vld1.64 {d20}, [r0], r1 1.532 + vld1.64 {d18}, [r0], r1 1.533 + vld1.64 {d16}, [r0], r1 1.534 + vld1.64 {d0}, [r0], r1 1.535 + vld1.64 {d2}, [r0], r1 1.536 + vld1.64 {d4}, [r0], r1 1.537 + vld1.64 {d26}, [r0], r1 1.538 + vld1.64 {d7}, [r0], r1 1.539 + vld1.64 {d21}, [r0], r1 1.540 + vld1.64 {d19}, [r0], r1 1.541 + vld1.64 {d17}, [r0], r1 1.542 + vld1.64 {d1}, [r0], r1 1.543 + vld1.64 {d3}, [r0], r1 1.544 + vld1.64 {d5}, [r0], r1 1.545 + vld1.64 {d27}, [r0], r1 1.546 + 1.547 + transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 1.548 + 1.549 + align_push_regs 1.550 + 1.551 + h264_loop_filter_luma 1.552 + 1.553 + transpose_4x4 q4, q8, q0, q5 1.554 + 1.555 + sub r0, r0, r1, lsl #4 1.556 + add r0, r0, #2 1.557 + vst1.32 {d8[0]}, [r0], r1 1.558 + vst1.32 {d16[0]}, [r0], r1 1.559 + vst1.32 {d0[0]}, [r0], r1 1.560 + vst1.32 {d10[0]}, [r0], r1 1.561 + vst1.32 {d8[1]}, [r0], r1 1.562 + vst1.32 {d16[1]}, [r0], r1 1.563 + vst1.32 {d0[1]}, [r0], r1 1.564 + vst1.32 {d10[1]}, [r0], r1 1.565 + vst1.32 {d9[0]}, [r0], r1 1.566 + vst1.32 {d17[0]}, 
[r0], r1 1.567 + vst1.32 {d1[0]}, [r0], r1 1.568 + vst1.32 {d11[0]}, [r0], r1 1.569 + vst1.32 {d9[1]}, [r0], r1 1.570 + vst1.32 {d17[1]}, [r0], r1 1.571 + vst1.32 {d1[1]}, [r0], r1 1.572 + vst1.32 {d11[1]}, [r0], r1 1.573 + 1.574 + align_pop_regs 1.575 + bx lr 1.576 +endfunc 1.577 + 1.578 + .macro h264_loop_filter_chroma 1.579 + vdup.8 d22, r2 @ alpha 1.580 + vmovl.u8 q12, d24 1.581 + vabd.u8 d26, d16, d0 @ abs(p0 - q0) 1.582 + vmovl.u8 q2, d0 1.583 + vabd.u8 d28, d18, d16 @ abs(p1 - p0) 1.584 + vsubw.u8 q2, q2, d16 1.585 + vsli.16 d24, d24, #8 1.586 + vshl.i16 q2, q2, #2 1.587 + vabd.u8 d30, d2, d0 @ abs(q1 - q0) 1.588 + vaddw.u8 q2, q2, d18 1.589 + vclt.u8 d26, d26, d22 @ < alpha 1.590 + vsubw.u8 q2, q2, d2 1.591 + vdup.8 d22, r3 @ beta 1.592 + vclt.s8 d25, d24, #0 1.593 + vrshrn.i16 d4, q2, #3 1.594 + vclt.u8 d28, d28, d22 @ < beta 1.595 + vbic d26, d26, d25 1.596 + vclt.u8 d30, d30, d22 @ < beta 1.597 + vand d26, d26, d28 1.598 + vneg.s8 d25, d24 1.599 + vand d26, d26, d30 1.600 + vmin.s8 d4, d4, d24 1.601 + vmovl.u8 q14, d16 1.602 + vand d4, d4, d26 1.603 + vmax.s8 d4, d4, d25 1.604 + vmovl.u8 q11, d0 1.605 + vaddw.s8 q14, q14, d4 1.606 + vsubw.s8 q11, q11, d4 1.607 + vqmovun.s16 d16, q14 1.608 + vqmovun.s16 d0, q11 1.609 + .endm 1.610 + 1.611 +function ff_h264_v_loop_filter_chroma_neon, export=1 1.612 + h264_loop_filter_start 1.613 + 1.614 + sub r0, r0, r1, lsl #1 1.615 + vld1.64 {d18}, [r0,:64], r1 1.616 + vld1.64 {d16}, [r0,:64], r1 1.617 + vld1.64 {d0}, [r0,:64], r1 1.618 + vld1.64 {d2}, [r0,:64] 1.619 + 1.620 + h264_loop_filter_chroma 1.621 + 1.622 + sub r0, r0, r1, lsl #1 1.623 + vst1.64 {d16}, [r0,:64], r1 1.624 + vst1.64 {d0}, [r0,:64], r1 1.625 + 1.626 + bx lr 1.627 +endfunc 1.628 + 1.629 +function ff_h264_h_loop_filter_chroma_neon, export=1 1.630 + h264_loop_filter_start 1.631 + 1.632 + sub r0, r0, #2 1.633 + vld1.32 {d18[0]}, [r0], r1 1.634 + vld1.32 {d16[0]}, [r0], r1 1.635 + vld1.32 {d0[0]}, [r0], r1 1.636 + vld1.32 {d2[0]}, [r0], r1 1.637 + 
vld1.32 {d18[1]}, [r0], r1 1.638 + vld1.32 {d16[1]}, [r0], r1 1.639 + vld1.32 {d0[1]}, [r0], r1 1.640 + vld1.32 {d2[1]}, [r0], r1 1.641 + 1.642 + vtrn.16 d18, d0 1.643 + vtrn.16 d16, d2 1.644 + vtrn.8 d18, d16 1.645 + vtrn.8 d0, d2 1.646 + 1.647 + h264_loop_filter_chroma 1.648 + 1.649 + vtrn.16 d18, d0 1.650 + vtrn.16 d16, d2 1.651 + vtrn.8 d18, d16 1.652 + vtrn.8 d0, d2 1.653 + 1.654 + sub r0, r0, r1, lsl #3 1.655 + vst1.32 {d18[0]}, [r0], r1 1.656 + vst1.32 {d16[0]}, [r0], r1 1.657 + vst1.32 {d0[0]}, [r0], r1 1.658 + vst1.32 {d2[0]}, [r0], r1 1.659 + vst1.32 {d18[1]}, [r0], r1 1.660 + vst1.32 {d16[1]}, [r0], r1 1.661 + vst1.32 {d0[1]}, [r0], r1 1.662 + vst1.32 {d2[1]}, [r0], r1 1.663 + 1.664 + bx lr 1.665 +endfunc 1.666 + 1.667 + /* H.264 qpel MC */ 1.668 + 1.669 + .macro lowpass_const r 1.670 + movw \r, #5 1.671 + movt \r, #20 1.672 + vmov.32 d6[0], \r 1.673 + .endm 1.674 + 1.675 + .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 1.676 +.if \narrow 1.677 + t0 .req q0 1.678 + t1 .req q8 1.679 +.else 1.680 + t0 .req \d0 1.681 + t1 .req \d1 1.682 +.endif 1.683 + vext.8 d2, \r0, \r1, #2 1.684 + vext.8 d3, \r0, \r1, #3 1.685 + vaddl.u8 q1, d2, d3 1.686 + vext.8 d4, \r0, \r1, #1 1.687 + vext.8 d5, \r0, \r1, #4 1.688 + vaddl.u8 q2, d4, d5 1.689 + vext.8 d30, \r0, \r1, #5 1.690 + vaddl.u8 t0, \r0, d30 1.691 + vext.8 d18, \r2, \r3, #2 1.692 + vmla.i16 t0, q1, d6[1] 1.693 + vext.8 d19, \r2, \r3, #3 1.694 + vaddl.u8 q9, d18, d19 1.695 + vext.8 d20, \r2, \r3, #1 1.696 + vmls.i16 t0, q2, d6[0] 1.697 + vext.8 d21, \r2, \r3, #4 1.698 + vaddl.u8 q10, d20, d21 1.699 + vext.8 d31, \r2, \r3, #5 1.700 + vaddl.u8 t1, \r2, d31 1.701 + vmla.i16 t1, q9, d6[1] 1.702 + vmls.i16 t1, q10, d6[0] 1.703 +.if \narrow 1.704 + vqrshrun.s16 \d0, t0, #5 1.705 + vqrshrun.s16 \d1, t1, #5 1.706 +.endif 1.707 + .unreq t0 1.708 + .unreq t1 1.709 + .endm 1.710 + 1.711 + .macro lowpass_8_1 r0, r1, d0, narrow=1 1.712 +.if \narrow 1.713 + t0 .req q0 1.714 +.else 1.715 + t0 .req \d0 1.716 +.endif 1.717 + 
vext.8 d2, \r0, \r1, #2 1.718 + vext.8 d3, \r0, \r1, #3 1.719 + vaddl.u8 q1, d2, d3 1.720 + vext.8 d4, \r0, \r1, #1 1.721 + vext.8 d5, \r0, \r1, #4 1.722 + vaddl.u8 q2, d4, d5 1.723 + vext.8 d30, \r0, \r1, #5 1.724 + vaddl.u8 t0, \r0, d30 1.725 + vmla.i16 t0, q1, d6[1] 1.726 + vmls.i16 t0, q2, d6[0] 1.727 +.if \narrow 1.728 + vqrshrun.s16 \d0, t0, #5 1.729 +.endif 1.730 + .unreq t0 1.731 + .endm 1.732 + 1.733 + .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d 1.734 + vext.16 q1, \r0, \r1, #2 1.735 + vext.16 q0, \r0, \r1, #3 1.736 + vaddl.s16 q9, d2, d0 1.737 + vext.16 q2, \r0, \r1, #1 1.738 + vaddl.s16 q1, d3, d1 1.739 + vext.16 q3, \r0, \r1, #4 1.740 + vaddl.s16 q10, d4, d6 1.741 + vext.16 \r1, \r0, \r1, #5 1.742 + vaddl.s16 q2, d5, d7 1.743 + vaddl.s16 q0, \h0, \h1 1.744 + vaddl.s16 q8, \l0, \l1 1.745 + 1.746 + vshl.i32 q3, q9, #4 1.747 + vshl.i32 q9, q9, #2 1.748 + vshl.i32 q15, q10, #2 1.749 + vadd.i32 q9, q9, q3 1.750 + vadd.i32 q10, q10, q15 1.751 + 1.752 + vshl.i32 q3, q1, #4 1.753 + vshl.i32 q1, q1, #2 1.754 + vshl.i32 q15, q2, #2 1.755 + vadd.i32 q1, q1, q3 1.756 + vadd.i32 q2, q2, q15 1.757 + 1.758 + vadd.i32 q9, q9, q8 1.759 + vsub.i32 q9, q9, q10 1.760 + 1.761 + vadd.i32 q1, q1, q0 1.762 + vsub.i32 q1, q1, q2 1.763 + 1.764 + vrshrn.s32 d18, q9, #10 1.765 + vrshrn.s32 d19, q1, #10 1.766 + 1.767 + vqmovun.s16 \d, q9 1.768 + .endm 1.769 + 1.770 +function put_h264_qpel16_h_lowpass_neon_packed 1.771 + mov r4, lr 1.772 + mov ip, #16 1.773 + mov r3, #8 1.774 + bl put_h264_qpel8_h_lowpass_neon 1.775 + sub r1, r1, r2, lsl #4 1.776 + add r1, r1, #8 1.777 + mov ip, #16 1.778 + mov lr, r4 1.779 + b put_h264_qpel8_h_lowpass_neon 1.780 +endfunc 1.781 + 1.782 + .macro h264_qpel_h_lowpass type 1.783 +function \type\()_h264_qpel16_h_lowpass_neon 1.784 + push {lr} 1.785 + mov ip, #16 1.786 + bl \type\()_h264_qpel8_h_lowpass_neon 1.787 + sub r0, r0, r3, lsl #4 1.788 + sub r1, r1, r2, lsl #4 1.789 + add r0, r0, #8 1.790 + add r1, r1, #8 1.791 + mov ip, #16 1.792 + pop {lr} 
1.793 +endfunc 1.794 + 1.795 +function \type\()_h264_qpel8_h_lowpass_neon 1.796 +1: vld1.64 {d0, d1}, [r1], r2 1.797 + vld1.64 {d16,d17}, [r1], r2 1.798 + subs ip, ip, #2 1.799 + lowpass_8 d0, d1, d16, d17, d0, d16 1.800 +.ifc \type,avg 1.801 + vld1.8 {d2}, [r0,:64], r3 1.802 + vrhadd.u8 d0, d0, d2 1.803 + vld1.8 {d3}, [r0,:64] 1.804 + vrhadd.u8 d16, d16, d3 1.805 + sub r0, r0, r3 1.806 +.endif 1.807 + vst1.64 {d0}, [r0,:64], r3 1.808 + vst1.64 {d16}, [r0,:64], r3 1.809 + bne 1b 1.810 + bx lr 1.811 +endfunc 1.812 + .endm 1.813 + 1.814 + h264_qpel_h_lowpass put 1.815 + h264_qpel_h_lowpass avg 1.816 + 1.817 + .macro h264_qpel_h_lowpass_l2 type 1.818 +function \type\()_h264_qpel16_h_lowpass_l2_neon 1.819 + push {lr} 1.820 + mov ip, #16 1.821 + bl \type\()_h264_qpel8_h_lowpass_l2_neon 1.822 + sub r0, r0, r2, lsl #4 1.823 + sub r1, r1, r2, lsl #4 1.824 + sub r3, r3, r2, lsl #4 1.825 + add r0, r0, #8 1.826 + add r1, r1, #8 1.827 + add r3, r3, #8 1.828 + mov ip, #16 1.829 + pop {lr} 1.830 +endfunc 1.831 + 1.832 +function \type\()_h264_qpel8_h_lowpass_l2_neon 1.833 +1: vld1.64 {d0, d1}, [r1], r2 1.834 + vld1.64 {d16,d17}, [r1], r2 1.835 + vld1.64 {d28}, [r3], r2 1.836 + vld1.64 {d29}, [r3], r2 1.837 + subs ip, ip, #2 1.838 + lowpass_8 d0, d1, d16, d17, d0, d1 1.839 + vrhadd.u8 q0, q0, q14 1.840 +.ifc \type,avg 1.841 + vld1.8 {d2}, [r0,:64], r2 1.842 + vrhadd.u8 d0, d0, d2 1.843 + vld1.8 {d3}, [r0,:64] 1.844 + vrhadd.u8 d1, d1, d3 1.845 + sub r0, r0, r2 1.846 +.endif 1.847 + vst1.64 {d0}, [r0,:64], r2 1.848 + vst1.64 {d1}, [r0,:64], r2 1.849 + bne 1b 1.850 + bx lr 1.851 +endfunc 1.852 + .endm 1.853 + 1.854 + h264_qpel_h_lowpass_l2 put 1.855 + h264_qpel_h_lowpass_l2 avg 1.856 + 1.857 +function put_h264_qpel16_v_lowpass_neon_packed 1.858 + mov r4, lr 1.859 + mov r2, #8 1.860 + bl put_h264_qpel8_v_lowpass_neon 1.861 + sub r1, r1, r3, lsl #2 1.862 + bl put_h264_qpel8_v_lowpass_neon 1.863 + sub r1, r1, r3, lsl #4 1.864 + sub r1, r1, r3, lsl #2 1.865 + add r1, r1, #8 1.866 + bl 
put_h264_qpel8_v_lowpass_neon 1.867 + sub r1, r1, r3, lsl #2 1.868 + mov lr, r4 1.869 + b put_h264_qpel8_v_lowpass_neon 1.870 +endfunc 1.871 + 1.872 + .macro h264_qpel_v_lowpass type 1.873 +function \type\()_h264_qpel16_v_lowpass_neon 1.874 + mov r4, lr 1.875 + bl \type\()_h264_qpel8_v_lowpass_neon 1.876 + sub r1, r1, r3, lsl #2 1.877 + bl \type\()_h264_qpel8_v_lowpass_neon 1.878 + sub r0, r0, r2, lsl #4 1.879 + add r0, r0, #8 1.880 + sub r1, r1, r3, lsl #4 1.881 + sub r1, r1, r3, lsl #2 1.882 + add r1, r1, #8 1.883 + bl \type\()_h264_qpel8_v_lowpass_neon 1.884 + sub r1, r1, r3, lsl #2 1.885 + mov lr, r4 1.886 +endfunc 1.887 + 1.888 +function \type\()_h264_qpel8_v_lowpass_neon 1.889 + vld1.64 {d8}, [r1], r3 1.890 + vld1.64 {d10}, [r1], r3 1.891 + vld1.64 {d12}, [r1], r3 1.892 + vld1.64 {d14}, [r1], r3 1.893 + vld1.64 {d22}, [r1], r3 1.894 + vld1.64 {d24}, [r1], r3 1.895 + vld1.64 {d26}, [r1], r3 1.896 + vld1.64 {d28}, [r1], r3 1.897 + vld1.64 {d9}, [r1], r3 1.898 + vld1.64 {d11}, [r1], r3 1.899 + vld1.64 {d13}, [r1], r3 1.900 + vld1.64 {d15}, [r1], r3 1.901 + vld1.64 {d23}, [r1] 1.902 + 1.903 + transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 1.904 + lowpass_8 d8, d9, d10, d11, d8, d10 1.905 + lowpass_8 d12, d13, d14, d15, d12, d14 1.906 + lowpass_8 d22, d23, d24, d25, d22, d24 1.907 + lowpass_8 d26, d27, d28, d29, d26, d28 1.908 + transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28 1.909 + 1.910 +.ifc \type,avg 1.911 + vld1.8 {d9}, [r0,:64], r2 1.912 + vrhadd.u8 d8, d8, d9 1.913 + vld1.8 {d11}, [r0,:64], r2 1.914 + vrhadd.u8 d10, d10, d11 1.915 + vld1.8 {d13}, [r0,:64], r2 1.916 + vrhadd.u8 d12, d12, d13 1.917 + vld1.8 {d15}, [r0,:64], r2 1.918 + vrhadd.u8 d14, d14, d15 1.919 + vld1.8 {d23}, [r0,:64], r2 1.920 + vrhadd.u8 d22, d22, d23 1.921 + vld1.8 {d25}, [r0,:64], r2 1.922 + vrhadd.u8 d24, d24, d25 1.923 + vld1.8 {d27}, [r0,:64], r2 1.924 + vrhadd.u8 d26, d26, d27 1.925 + vld1.8 {d29}, [r0,:64], r2 1.926 + vrhadd.u8 d28, d28, d29 1.927 + sub r0, r0, r2, lsl #3 
1.928 +.endif 1.929 + 1.930 + vst1.64 {d8}, [r0,:64], r2 1.931 + vst1.64 {d10}, [r0,:64], r2 1.932 + vst1.64 {d12}, [r0,:64], r2 1.933 + vst1.64 {d14}, [r0,:64], r2 1.934 + vst1.64 {d22}, [r0,:64], r2 1.935 + vst1.64 {d24}, [r0,:64], r2 1.936 + vst1.64 {d26}, [r0,:64], r2 1.937 + vst1.64 {d28}, [r0,:64], r2 1.938 + 1.939 + bx lr 1.940 +endfunc 1.941 + .endm 1.942 + 1.943 + h264_qpel_v_lowpass put 1.944 + h264_qpel_v_lowpass avg 1.945 + 1.946 + .macro h264_qpel_v_lowpass_l2 type 1.947 +function \type\()_h264_qpel16_v_lowpass_l2_neon 1.948 + mov r4, lr 1.949 + bl \type\()_h264_qpel8_v_lowpass_l2_neon 1.950 + sub r1, r1, r3, lsl #2 1.951 + bl \type\()_h264_qpel8_v_lowpass_l2_neon 1.952 + sub r0, r0, r3, lsl #4 1.953 + sub ip, ip, r2, lsl #4 1.954 + add r0, r0, #8 1.955 + add ip, ip, #8 1.956 + sub r1, r1, r3, lsl #4 1.957 + sub r1, r1, r3, lsl #2 1.958 + add r1, r1, #8 1.959 + bl \type\()_h264_qpel8_v_lowpass_l2_neon 1.960 + sub r1, r1, r3, lsl #2 1.961 + mov lr, r4 1.962 +endfunc 1.963 + 1.964 +function \type\()_h264_qpel8_v_lowpass_l2_neon 1.965 + vld1.64 {d8}, [r1], r3 1.966 + vld1.64 {d10}, [r1], r3 1.967 + vld1.64 {d12}, [r1], r3 1.968 + vld1.64 {d14}, [r1], r3 1.969 + vld1.64 {d22}, [r1], r3 1.970 + vld1.64 {d24}, [r1], r3 1.971 + vld1.64 {d26}, [r1], r3 1.972 + vld1.64 {d28}, [r1], r3 1.973 + vld1.64 {d9}, [r1], r3 1.974 + vld1.64 {d11}, [r1], r3 1.975 + vld1.64 {d13}, [r1], r3 1.976 + vld1.64 {d15}, [r1], r3 1.977 + vld1.64 {d23}, [r1] 1.978 + 1.979 + transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 1.980 + lowpass_8 d8, d9, d10, d11, d8, d9 1.981 + lowpass_8 d12, d13, d14, d15, d12, d13 1.982 + lowpass_8 d22, d23, d24, d25, d22, d23 1.983 + lowpass_8 d26, d27, d28, d29, d26, d27 1.984 + transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27 1.985 + 1.986 + vld1.64 {d0}, [ip], r2 1.987 + vld1.64 {d1}, [ip], r2 1.988 + vld1.64 {d2}, [ip], r2 1.989 + vld1.64 {d3}, [ip], r2 1.990 + vld1.64 {d4}, [ip], r2 1.991 + vrhadd.u8 q0, q0, q4 1.992 + vld1.64 {d5}, [ip], r2 
1.993 + vrhadd.u8 q1, q1, q6 1.994 + vld1.64 {d10}, [ip], r2 1.995 + vrhadd.u8 q2, q2, q11 1.996 + vld1.64 {d11}, [ip], r2 1.997 + vrhadd.u8 q5, q5, q13 1.998 + 1.999 +.ifc \type,avg 1.1000 + vld1.8 {d16}, [r0,:64], r3 1.1001 + vrhadd.u8 d0, d0, d16 1.1002 + vld1.8 {d17}, [r0,:64], r3 1.1003 + vrhadd.u8 d1, d1, d17 1.1004 + vld1.8 {d16}, [r0,:64], r3 1.1005 + vrhadd.u8 d2, d2, d16 1.1006 + vld1.8 {d17}, [r0,:64], r3 1.1007 + vrhadd.u8 d3, d3, d17 1.1008 + vld1.8 {d16}, [r0,:64], r3 1.1009 + vrhadd.u8 d4, d4, d16 1.1010 + vld1.8 {d17}, [r0,:64], r3 1.1011 + vrhadd.u8 d5, d5, d17 1.1012 + vld1.8 {d16}, [r0,:64], r3 1.1013 + vrhadd.u8 d10, d10, d16 1.1014 + vld1.8 {d17}, [r0,:64], r3 1.1015 + vrhadd.u8 d11, d11, d17 1.1016 + sub r0, r0, r3, lsl #3 1.1017 +.endif 1.1018 + 1.1019 + vst1.64 {d0}, [r0,:64], r3 1.1020 + vst1.64 {d1}, [r0,:64], r3 1.1021 + vst1.64 {d2}, [r0,:64], r3 1.1022 + vst1.64 {d3}, [r0,:64], r3 1.1023 + vst1.64 {d4}, [r0,:64], r3 1.1024 + vst1.64 {d5}, [r0,:64], r3 1.1025 + vst1.64 {d10}, [r0,:64], r3 1.1026 + vst1.64 {d11}, [r0,:64], r3 1.1027 + 1.1028 + bx lr 1.1029 +endfunc 1.1030 + .endm 1.1031 + 1.1032 + h264_qpel_v_lowpass_l2 put 1.1033 + h264_qpel_v_lowpass_l2 avg 1.1034 + 1.1035 +function put_h264_qpel8_hv_lowpass_neon_top 1.1036 + lowpass_const ip 1.1037 + mov ip, #12 1.1038 +1: vld1.64 {d0, d1}, [r1], r3 1.1039 + vld1.64 {d16,d17}, [r1], r3 1.1040 + subs ip, ip, #2 1.1041 + lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0 1.1042 + vst1.64 {d22-d25}, [r4,:128]! 
1.1043 + bne 1b 1.1044 + 1.1045 + vld1.64 {d0, d1}, [r1] 1.1046 + lowpass_8_1 d0, d1, q12, narrow=0 1.1047 + 1.1048 + mov ip, #-16 1.1049 + add r4, r4, ip 1.1050 + vld1.64 {d30,d31}, [r4,:128], ip 1.1051 + vld1.64 {d20,d21}, [r4,:128], ip 1.1052 + vld1.64 {d18,d19}, [r4,:128], ip 1.1053 + vld1.64 {d16,d17}, [r4,:128], ip 1.1054 + vld1.64 {d14,d15}, [r4,:128], ip 1.1055 + vld1.64 {d12,d13}, [r4,:128], ip 1.1056 + vld1.64 {d10,d11}, [r4,:128], ip 1.1057 + vld1.64 {d8, d9}, [r4,:128], ip 1.1058 + vld1.64 {d6, d7}, [r4,:128], ip 1.1059 + vld1.64 {d4, d5}, [r4,:128], ip 1.1060 + vld1.64 {d2, d3}, [r4,:128], ip 1.1061 + vld1.64 {d0, d1}, [r4,:128] 1.1062 + 1.1063 + swap4 d1, d3, d5, d7, d8, d10, d12, d14 1.1064 + transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7 1.1065 + 1.1066 + swap4 d17, d19, d21, d31, d24, d26, d28, d22 1.1067 + transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11 1.1068 + 1.1069 + vst1.64 {d30,d31}, [r4,:128]! 1.1070 + vst1.64 {d6, d7}, [r4,:128]! 1.1071 + vst1.64 {d20,d21}, [r4,:128]! 1.1072 + vst1.64 {d4, d5}, [r4,:128]! 1.1073 + vst1.64 {d18,d19}, [r4,:128]! 1.1074 + vst1.64 {d2, d3}, [r4,:128]! 1.1075 + vst1.64 {d16,d17}, [r4,:128]! 
1.1076 + vst1.64 {d0, d1}, [r4,:128] 1.1077 + 1.1078 + lowpass_8.16 q4, q12, d8, d9, d24, d25, d8 1.1079 + lowpass_8.16 q5, q13, d10, d11, d26, d27, d9 1.1080 + lowpass_8.16 q6, q14, d12, d13, d28, d29, d10 1.1081 + lowpass_8.16 q7, q11, d14, d15, d22, d23, d11 1.1082 + 1.1083 + vld1.64 {d16,d17}, [r4,:128], ip 1.1084 + vld1.64 {d30,d31}, [r4,:128], ip 1.1085 + lowpass_8.16 q8, q15, d16, d17, d30, d31, d12 1.1086 + vld1.64 {d16,d17}, [r4,:128], ip 1.1087 + vld1.64 {d30,d31}, [r4,:128], ip 1.1088 + lowpass_8.16 q8, q15, d16, d17, d30, d31, d13 1.1089 + vld1.64 {d16,d17}, [r4,:128], ip 1.1090 + vld1.64 {d30,d31}, [r4,:128], ip 1.1091 + lowpass_8.16 q8, q15, d16, d17, d30, d31, d14 1.1092 + vld1.64 {d16,d17}, [r4,:128], ip 1.1093 + vld1.64 {d30,d31}, [r4,:128] 1.1094 + lowpass_8.16 q8, q15, d16, d17, d30, d31, d15 1.1095 + 1.1096 + transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11 1.1097 + 1.1098 + bx lr 1.1099 +endfunc 1.1100 + 1.1101 + .macro h264_qpel8_hv_lowpass type 1.1102 +function \type\()_h264_qpel8_hv_lowpass_neon 1.1103 + mov r10, lr 1.1104 + bl put_h264_qpel8_hv_lowpass_neon_top 1.1105 +.ifc \type,avg 1.1106 + vld1.8 {d0}, [r0,:64], r2 1.1107 + vrhadd.u8 d12, d12, d0 1.1108 + vld1.8 {d1}, [r0,:64], r2 1.1109 + vrhadd.u8 d13, d13, d1 1.1110 + vld1.8 {d2}, [r0,:64], r2 1.1111 + vrhadd.u8 d14, d14, d2 1.1112 + vld1.8 {d3}, [r0,:64], r2 1.1113 + vrhadd.u8 d15, d15, d3 1.1114 + vld1.8 {d4}, [r0,:64], r2 1.1115 + vrhadd.u8 d8, d8, d4 1.1116 + vld1.8 {d5}, [r0,:64], r2 1.1117 + vrhadd.u8 d9, d9, d5 1.1118 + vld1.8 {d6}, [r0,:64], r2 1.1119 + vrhadd.u8 d10, d10, d6 1.1120 + vld1.8 {d7}, [r0,:64], r2 1.1121 + vrhadd.u8 d11, d11, d7 1.1122 + sub r0, r0, r2, lsl #3 1.1123 +.endif 1.1124 + vst1.64 {d12}, [r0,:64], r2 1.1125 + vst1.64 {d13}, [r0,:64], r2 1.1126 + vst1.64 {d14}, [r0,:64], r2 1.1127 + vst1.64 {d15}, [r0,:64], r2 1.1128 + vst1.64 {d8}, [r0,:64], r2 1.1129 + vst1.64 {d9}, [r0,:64], r2 1.1130 + vst1.64 {d10}, [r0,:64], r2 1.1131 + vst1.64 {d11}, [r0,:64], 
r2 /* last operand of the store that was split across the chunk boundary */
1.1132 +
1.1133 + mov lr, r10 /* return address saved on entry */
1.1134 + bx lr
1.1135 +endfunc
1.1136 + .endm
1.1137 +
/* Instantiate put_h264_qpel8_hv_lowpass_neon and avg_h264_qpel8_hv_lowpass_neon. */
1.1138 + h264_qpel8_hv_lowpass put
1.1139 + h264_qpel8_hv_lowpass avg
1.1140 +
/*
 * h264_qpel8_hv_lowpass_l2 type   (instantiated with type = put and avg)
 *
 * Emits \type\()_h264_qpel8_hv_lowpass_l2_neon.  Same skeleton as the
 * non-l2 variant, with one extra averaging step:
 *   - calls put_h264_qpel8_hv_lowpass_neon_top (lr kept in r10);
 *   - loads 64 bytes from [r2] (16-byte aligned, r2 post-incremented by the
 *     transfer size) into q0-q3 and vrhadd.u8-averages them with
 *     q6,q7,q4,q5, i.e. d12-d15,d8-d11, the helper's result rows;
 *   - avg only: additionally averages each row with the 8 bytes currently at
 *     [r0], then rewinds r0 by 8*r3;
 *   - stores d0-d7 to [r0], adding r3 to r0 after each row.
 * Note that the row step for r0 is r3 here (r2 is the second-source
 * pointer), whereas the non-l2 variant steps r0 by r2.
 */
1.1141 + .macro h264_qpel8_hv_lowpass_l2 type
1.1142 +function \type\()_h264_qpel8_hv_lowpass_l2_neon
1.1143 + mov r10, lr /* bl below overwrites lr */
1.1144 + bl put_h264_qpel8_hv_lowpass_neon_top
1.1145 +
/* loads from [r2] are interleaved with the averages that consume them */
1.1146 + vld1.64 {d0, d1}, [r2,:128]!
1.1147 + vld1.64 {d2, d3}, [r2,:128]!
1.1148 + vrhadd.u8 q0, q0, q6
1.1149 + vld1.64 {d4, d5}, [r2,:128]!
1.1150 + vrhadd.u8 q1, q1, q7
1.1151 + vld1.64 {d6, d7}, [r2,:128]!
1.1152 + vrhadd.u8 q2, q2, q4
1.1153 + vrhadd.u8 q3, q3, q5
1.1154 +.ifc \type,avg
/* avg variant: second rounding average, against what is already at [r0] */
1.1155 + vld1.8 {d16}, [r0,:64], r3
1.1156 + vrhadd.u8 d0, d0, d16
1.1157 + vld1.8 {d17}, [r0,:64], r3
1.1158 + vrhadd.u8 d1, d1, d17
1.1159 + vld1.8 {d18}, [r0,:64], r3
1.1160 + vrhadd.u8 d2, d2, d18
1.1161 + vld1.8 {d19}, [r0,:64], r3
1.1162 + vrhadd.u8 d3, d3, d19
1.1163 + vld1.8 {d20}, [r0,:64], r3
1.1164 + vrhadd.u8 d4, d4, d20
1.1165 + vld1.8 {d21}, [r0,:64], r3
1.1166 + vrhadd.u8 d5, d5, d21
1.1167 + vld1.8 {d22}, [r0,:64], r3
1.1168 + vrhadd.u8 d6, d6, d22
1.1169 + vld1.8 {d23}, [r0,:64], r3
1.1170 + vrhadd.u8 d7, d7, d23
1.1171 + sub r0, r0, r3, lsl #3 /* undo the eight post-increments: r0 -= 8*r3 */
1.1172 +.endif
1.1173 + vst1.64 {d0}, [r0,:64], r3
1.1174 + vst1.64 {d1}, [r0,:64], r3
1.1175 + vst1.64 {d2}, [r0,:64], r3
1.1176 + vst1.64 {d3}, [r0,:64], r3
1.1177 + vst1.64 {d4}, [r0,:64], r3
1.1178 + vst1.64 {d5}, [r0,:64], r3
1.1179 + vst1.64 {d6}, [r0,:64], r3
1.1180 + vst1.64 {d7}, [r0,:64], r3
1.1181 +
1.1182 + mov lr, r10
1.1183 + bx lr
1.1184 +endfunc
1.1185 + .endm
1.1186 +
1.1187 + h264_qpel8_hv_lowpass_l2 put
1.1188 + h264_qpel8_hv_lowpass_l2 avg
1.1189 +
/*
 * h264_qpel16_hv type             (instantiated with type = put and avg)
 *
 * Emits the two 16-wide drivers, each of which runs the corresponding 8-wide
 * routine four times: three times with bl and a final time as a tail branch
 * after lr has been restored from r9.  r9 is used because the 8-wide
 * routines themselves overwrite r10.
 * Between calls r1 and r0 are re-positioned with the fixed adjustments shown
 * (r1 -= 4*r3 after each call; before the third call additionally
 * r1 -= 16*r3, r1 += 8, r0 -= 16*stride, r0 += 8).
 * NOTE(review): this presumably walks the four 8x8 quadrants in the order
 * top-left, bottom-left, top-right, bottom-right; the r1 bookkeeping depends
 * on how far the helper advances r1, which is not visible here.
 */
1.1190 + .macro h264_qpel16_hv type
1.1191 +function \type\()_h264_qpel16_hv_lowpass_neon
1.1192 + mov r9, lr
1.1193 + bl \type\()_h264_qpel8_hv_lowpass_neon
1.1194 + sub r1, r1, r3, lsl #2
1.1195 + bl \type\()_h264_qpel8_hv_lowpass_neon
1.1196 + sub r1, r1, r3, lsl #4
1.1197 + sub r1, r1, r3, lsl #2
1.1198 + add
r1, r1, #8 /* operands of the add split across the chunk boundary: r1 += 8 */
1.1199 + sub r0, r0, r2, lsl #4 /* r0 -= 16*r2 (r2 is the row step of the non-l2 8-wide routine) */
1.1200 + add r0, r0, #8
1.1201 + bl \type\()_h264_qpel8_hv_lowpass_neon
1.1202 + sub r1, r1, r3, lsl #2
1.1203 + mov lr, r9 /* restore caller's return address ... */
1.1204 + b \type\()_h264_qpel8_hv_lowpass_neon /* ... so the 4th call returns straight to it */
1.1205 +endfunc
1.1206 +
/* 16-wide driver for the _l2 variant.  Differences from the driver above:
 * r2 is first set to r4 - 256, and r0 is rewound by 16*r3 because the 8-wide
 * _l2 routine steps r0 by r3 and uses r2 as its second-source pointer. */
1.1207 +function \type\()_h264_qpel16_hv_lowpass_l2_neon
1.1208 + mov r9, lr
1.1209 + sub r2, r4, #256
1.1210 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1.1211 + sub r1, r1, r3, lsl #2
1.1212 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1.1213 + sub r1, r1, r3, lsl #4
1.1214 + sub r1, r1, r3, lsl #2
1.1215 + add r1, r1, #8
1.1216 + sub r0, r0, r3, lsl #4
1.1217 + add r0, r0, #8
1.1218 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1.1219 + sub r1, r1, r3, lsl #2
1.1220 + mov lr, r9
1.1221 + b \type\()_h264_qpel8_hv_lowpass_l2_neon
1.1222 +endfunc
1.1223 + .endm
1.1224 +
1.1225 + h264_qpel16_hv put
1.1226 + h264_qpel16_hv avg
1.1227 +
/*
 * h264_qpel8 type                 (instantiated with type = put and avg)
 *
 * Emits the sixteen-minus-one exported entry points
 * ff_\type\()_h264_qpel8_mcXY_neon (mc10 ... mc33).  Each one only arranges
 * registers (and, where needed, a stack scratch buffer) and then hands over
 * to one of the internal *_lowpass* routines.
 * NOTE(review): XY presumably encodes the quarter-sample offset (X
 * horizontal, Y vertical) and r0/r1/r2 presumably arrive as destination,
 * source and line stride; confirm against the C prototypes.
 * lowpass_const is a macro defined elsewhere in the file; the register given
 * to it is overwritten again right afterwards in every use below.
 */
1.1228 + .macro h264_qpel8 type
/* mc10: r3 = incoming r1, r1 -= 2, ip = 8, tail-branch to h_lowpass_l2 */
1.1229 +function ff_\type\()_h264_qpel8_mc10_neon, export=1
1.1230 + lowpass_const r3
1.1231 + mov r3, r1
1.1232 + sub r1, r1, #2
1.1233 + mov ip, #8
1.1234 + b \type\()_h264_qpel8_h_lowpass_l2_neon
1.1235 +endfunc
1.1236 +
/* mc20: r1 -= 2, r3 = r2, ip = 8, tail-branch to h_lowpass (no _l2) */
1.1237 +function ff_\type\()_h264_qpel8_mc20_neon, export=1
1.1238 + lowpass_const r3
1.1239 + sub r1, r1, #2
1.1240 + mov r3, r2
1.1241 + mov ip, #8
1.1242 + b \type\()_h264_qpel8_h_lowpass_neon
1.1243 +endfunc
1.1244 +
/* mc30: like mc10 except that r3 = incoming r1 + 1 */
1.1245 +function ff_\type\()_h264_qpel8_mc30_neon, export=1
1.1246 + lowpass_const r3
1.1247 + add r3, r1, #1
1.1248 + sub r1, r1, #2
1.1249 + mov ip, #8
1.1250 + b \type\()_h264_qpel8_h_lowpass_l2_neon
1.1251 +endfunc
1.1252 +
/* mc01: ip = incoming r1; the local label below is the shared body that
 * mc03 also branches to (after pushing lr and setting its own ip).
 * Shared body: r3 = r2, r1 -= 2*r2, then call v_lowpass_l2 with d8-d15
 * saved around the call (d8-d15 are callee-saved under the AAPCS). */
1.1253 +function ff_\type\()_h264_qpel8_mc01_neon, export=1
1.1254 + push {lr}
1.1255 + mov ip, r1
1.1256 +\type\()_h264_qpel8_mc01:
1.1257 + lowpass_const r3
1.1258 + mov r3, r2
1.1259 + sub r1, r1, r2, lsl #1
1.1260 + vpush {d8-d15}
1.1261 + bl \type\()_h264_qpel8_v_lowpass_l2_neon
1.1262 + vpop {d8-d15}
1.1263 + pop {pc}
1.1264 +endfunc
1.1265 +
1.1266 +function ff_\type\()_h264_qpel8_mc11_neon, export=1
/* (body of ff_\type\()_h264_qpel8_mc11_neon; its function line is the last
 * line of the previous chunk)
 * r0/r1 are pushed so they can be re-read after the first pass; the local
 * label is the shared body that mc31, mc13 and mc33 branch to after doing
 * an equivalent push themselves. */
1.1267 + push {r0, r1, r11, lr}
1.1268 +\type\()_h264_qpel8_mc11:
1.1269 + lowpass_const r3
1.1270 + mov r11, sp /* r11 = sp before alignment; [r11] = saved r0, r1 */
1.1271 + bic sp, sp, #15 /* align sp down to 16 bytes */
1.1272 + sub sp, sp, #64 /* 64-byte scratch buffer */
1.1273 + mov r0, sp /* first pass writes into the scratch buffer */
1.1274 + sub r1, r1, #2
1.1275 + mov r3, #8
1.1276 + mov ip, #8
1.1277 + vpush {d8-d15} /* 8 x 8 = 64 bytes; sp now 64 below the buffer */
1.1278 + bl put_h264_qpel8_h_lowpass_neon
1.1279 + ldrd r0, [r11] /* reload the r0/r1 pair pushed on entry */
1.1280 + mov r3, r2
1.1281 + add ip, sp, #64 /* ip = scratch buffer (skips the vpush area) */
1.1282 + sub r1, r1, r2, lsl #1
1.1283 + mov r2, #8
1.1284 + bl \type\()_h264_qpel8_v_lowpass_l2_neon
1.1285 + vpop {d8-d15}
1.1286 + add sp, r11, #8 /* drop scratch and the saved r0/r1 (2 x 4 bytes) */
1.1287 + pop {r11, pc}
1.1288 +endfunc
1.1289 +
/* mc21: scratch is 8*8 + 16*12 = 256 bytes.  The first pass
 * (put_h264_qpel8_h_lowpass_neon) is given r0 = bottom of the scratch area;
 * r4 is then set to whatever r0 holds on return and r2 to r4 - 64 before
 * calling hv_lowpass_l2, which reads its second source through r2 and its
 * helper uses r4 as a 16-byte-aligned work pointer.
 * NOTE(review): this presumably means the h pass leaves r0 just past its
 * 64 output bytes; confirm in put_h264_qpel8_h_lowpass_neon.
 * r4 and r10 are pushed because they are overwritten on this path.
 * The local label is also the target of mc23. */
1.1290 +function ff_\type\()_h264_qpel8_mc21_neon, export=1
1.1291 + push {r0, r1, r4, r10, r11, lr}
1.1292 +\type\()_h264_qpel8_mc21:
1.1293 + lowpass_const r3
1.1294 + mov r11, sp
1.1295 + bic sp, sp, #15
1.1296 + sub sp, sp, #(8*8+16*12)
1.1297 + sub r1, r1, #2
1.1298 + mov r3, #8
1.1299 + mov r0, sp
1.1300 + mov ip, #8
1.1301 + vpush {d8-d15}
1.1302 + bl put_h264_qpel8_h_lowpass_neon
1.1303 + mov r4, r0
1.1304 + ldrd r0, [r11] /* reload the r0/r1 pair pushed on entry */
1.1305 + sub r1, r1, r2, lsl #1
1.1306 + sub r1, r1, #2
1.1307 + mov r3, r2
1.1308 + sub r2, r4, #64
1.1309 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1.1310 + vpop {d8-d15}
1.1311 + add sp, r11, #8 /* skip the saved r0/r1 */
1.1312 + pop {r4, r10, r11, pc}
1.1313 +endfunc
1.1314 +
/* mc31: same as mc11 except that the r1 value saved on the stack (and
 * reloaded for the second pass) is incoming r1 + 1; the first pass still
 * starts from the unmodified r1. */
1.1315 +function ff_\type\()_h264_qpel8_mc31_neon, export=1
1.1316 + add r1, r1, #1
1.1317 + push {r0, r1, r11, lr}
1.1318 + sub r1, r1, #1
1.1319 + b \type\()_h264_qpel8_mc11
1.1320 +endfunc
1.1321 +
/* mc02: r1 -= 2*r2, r3 = r2, call v_lowpass with d8-d15 preserved */
1.1322 +function ff_\type\()_h264_qpel8_mc02_neon, export=1
1.1323 + push {lr}
1.1324 + lowpass_const r3
1.1325 + sub r1, r1, r2, lsl #1
1.1326 + mov r3, r2
1.1327 + vpush {d8-d15}
1.1328 + bl \type\()_h264_qpel8_v_lowpass_neon
1.1329 + vpop {d8-d15}
1.1330 + pop {pc}
1.1331 +endfunc
1.1332 +
/* mc12: same frame layout as mc21, but the first pass is the vertical
 * filter (see the continuation in the next chunk).  The local label is also
 * the target of mc32. */
1.1333 +function ff_\type\()_h264_qpel8_mc12_neon, export=1
1.1334 + push {r0, r1, r4, r10, r11, lr}
1.1335 +\type\()_h264_qpel8_mc12:
1.1336 + lowpass_const r3
1.1337 + mov r11, sp
1.1338 + bic sp, sp, #15
1.1339 + sub sp, sp,
#(8*8+16*12) /* immediate of the sub split across the chunk boundary: 256-byte scratch */
/* (continuation of the shared mc12 body)  First pass: v_lowpass into the
 * scratch buffer with r3 = incoming r2 and r2 = 8; second pass: hv_lowpass_l2
 * with r2 = r4 - 64, where r4 is the r0 value left by the first pass. */
1.1340 + sub r1, r1, r2, lsl #1
1.1341 + mov r3, r2
1.1342 + mov r2, #8
1.1343 + mov r0, sp
1.1344 + vpush {d8-d15}
1.1345 + bl put_h264_qpel8_v_lowpass_neon
1.1346 + mov r4, r0
1.1347 + ldrd r0, [r11] /* reload the r0/r1 pair pushed on entry */
1.1348 + sub r1, r1, r3, lsl #1
1.1349 + sub r1, r1, #2
1.1350 + sub r2, r4, #64
1.1351 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1.1352 + vpop {d8-d15}
1.1353 + add sp, r11, #8 /* skip the saved r0/r1 */
1.1354 + pop {r4, r10, r11, pc}
1.1355 +endfunc
1.1356 +
/* mc22: single hv pass.  r1 -= 2*r2 + 2, r3 = r2, r4 = 16-byte-aligned
 * scratch of 16*12 = 192 bytes.  r0/r1 are not pushed here, so sp is
 * restored with a plain mov from r11. */
1.1357 +function ff_\type\()_h264_qpel8_mc22_neon, export=1
1.1358 + push {r4, r10, r11, lr}
1.1359 + mov r11, sp
1.1360 + bic sp, sp, #15
1.1361 + sub r1, r1, r2, lsl #1
1.1362 + sub r1, r1, #2
1.1363 + mov r3, r2
1.1364 + sub sp, sp, #(16*12)
1.1365 + mov r4, sp
1.1366 + vpush {d8-d15}
1.1367 + bl \type\()_h264_qpel8_hv_lowpass_neon
1.1368 + vpop {d8-d15}
1.1369 + mov sp, r11
1.1370 + pop {r4, r10, r11, pc}
1.1371 +endfunc
1.1372 +
/* mc32: mc12 with r1 + 1 for the first pass; the r1 saved on the stack (used
 * for the second pass) is the unmodified one. */
1.1373 +function ff_\type\()_h264_qpel8_mc32_neon, export=1
1.1374 + push {r0, r1, r4, r10, r11, lr}
1.1375 + add r1, r1, #1
1.1376 + b \type\()_h264_qpel8_mc12
1.1377 +endfunc
1.1378 +
/* mc03: mc01 with ip = r1 + r2 instead of ip = r1 */
1.1379 +function ff_\type\()_h264_qpel8_mc03_neon, export=1
1.1380 + push {lr}
1.1381 + add ip, r1, r2
1.1382 + b \type\()_h264_qpel8_mc01
1.1383 +endfunc
1.1384 +
/* mc13: mc11 with the first pass starting at r1 + r2; saved r1 unmodified */
1.1385 +function ff_\type\()_h264_qpel8_mc13_neon, export=1
1.1386 + push {r0, r1, r11, lr}
1.1387 + add r1, r1, r2
1.1388 + b \type\()_h264_qpel8_mc11
1.1389 +endfunc
1.1390 +
/* mc23: mc21 with the first pass starting at r1 + r2; saved r1 unmodified */
1.1391 +function ff_\type\()_h264_qpel8_mc23_neon, export=1
1.1392 + push {r0, r1, r4, r10, r11, lr}
1.1393 + add r1, r1, r2
1.1394 + b \type\()_h264_qpel8_mc21
1.1395 +endfunc
1.1396 +
/* mc33: mc11 with saved r1 = incoming r1 + 1 (second pass) and the first
 * pass starting at incoming r1 + r2 (the +1 is taken back out after the
 * push). */
1.1397 +function ff_\type\()_h264_qpel8_mc33_neon, export=1
1.1398 + add r1, r1, #1
1.1399 + push {r0, r1, r11, lr}
1.1400 + add r1, r1, r2
1.1401 + sub r1, r1, #1
1.1402 + b \type\()_h264_qpel8_mc11
1.1403 +endfunc
1.1404 + .endm
1.1405 +
1.1406 + h264_qpel8 put
1.1407 + h264_qpel8 avg
1.1408 +
/*
 * h264_qpel16 type                (instantiated with type = put and avg)
 *
 * 16-wide counterparts of the h264_qpel8 entry points: same register
 * shuffling per mcXY, but targeting the *_qpel16_* internal routines.
 * Unlike the 8-wide versions, ip is not loaded with a row count before the
 * horizontal tail-branches.
 */
1.1409 + .macro h264_qpel16 type
1.1410 +function ff_\type\()_h264_qpel16_mc10_neon, export=1
/* (body of ff_\type\()_h264_qpel16_mc10_neon)  r3 = incoming r1, r1 -= 2 */
1.1411 + lowpass_const r3
1.1412 + mov r3, r1
1.1413 + sub r1, r1, #2
1.1414 + b \type\()_h264_qpel16_h_lowpass_l2_neon
1.1415 +endfunc
1.1416 +
/* mc20: r1 -= 2, r3 = r2, tail-branch to h_lowpass (no _l2) */
1.1417 +function ff_\type\()_h264_qpel16_mc20_neon, export=1
1.1418 + lowpass_const r3
1.1419 + sub r1, r1, #2
1.1420 + mov r3, r2
1.1421 + b \type\()_h264_qpel16_h_lowpass_neon
1.1422 +endfunc
1.1423 +
/* mc30: like mc10 except that r3 = incoming r1 + 1 */
1.1424 +function ff_\type\()_h264_qpel16_mc30_neon, export=1
1.1425 + lowpass_const r3
1.1426 + add r3, r1, #1
1.1427 + sub r1, r1, #2
1.1428 + b \type\()_h264_qpel16_h_lowpass_l2_neon
1.1429 +endfunc
1.1430 +
/* mc01: ip = incoming r1; shared body (also reached from mc03) sets r3 = r2,
 * r1 -= 2*r2 and calls v_lowpass_l2 with d8-d15 preserved.  r4 is pushed and
 * restored around the call. */
1.1431 +function ff_\type\()_h264_qpel16_mc01_neon, export=1
1.1432 + push {r4, lr}
1.1433 + mov ip, r1
1.1434 +\type\()_h264_qpel16_mc01:
1.1435 + lowpass_const r3
1.1436 + mov r3, r2
1.1437 + sub r1, r1, r2, lsl #1
1.1438 + vpush {d8-d15}
1.1439 + bl \type\()_h264_qpel16_v_lowpass_l2_neon
1.1440 + vpop {d8-d15}
1.1441 + pop {r4, pc}
1.1442 +endfunc
1.1443 +
/* mc11: two passes through a 256-byte, 16-byte-aligned stack buffer.
 * Shared body is also reached from mc31, mc13 and mc33. */
1.1444 +function ff_\type\()_h264_qpel16_mc11_neon, export=1
1.1445 + push {r0, r1, r4, r11, lr}
1.1446 +\type\()_h264_qpel16_mc11:
1.1447 + lowpass_const r3
1.1448 + mov r11, sp /* r11 = sp before alignment; [r11] = saved r0, r1 */
1.1449 + bic sp, sp, #15
1.1450 + sub sp, sp, #256
1.1451 + mov r0, sp /* first pass writes into the scratch buffer */
1.1452 + sub r1, r1, #2
1.1453 + mov r3, #16
1.1454 + vpush {d8-d15} /* 64 bytes below the buffer */
1.1455 + bl put_h264_qpel16_h_lowpass_neon
1.1456 + ldrd r0, [r11] /* reload the r0/r1 pair pushed on entry */
1.1457 + mov r3, r2
1.1458 + add ip, sp, #64 /* ip = scratch buffer (skips the vpush area) */
1.1459 + sub r1, r1, r2, lsl #1
1.1460 + mov r2, #16
1.1461 + bl \type\()_h264_qpel16_v_lowpass_l2_neon
1.1462 + vpop {d8-d15}
1.1463 + add sp, r11, #8 /* skip the saved r0/r1 */
1.1464 + pop {r4, r11, pc}
1.1465 +endfunc
1.1466 +
/* mc21: scratch is 16*16 + 16*12 = 448 bytes; first pass is the "packed"
 * horizontal filter, then r4 = r0 as left by that pass and the 16-wide
 * hv_lowpass_l2 driver is called (it derives r2 from r4 itself).
 * r9 and r10 are pushed because the hv drivers overwrite them.
 * Shared body is also reached from mc23. */
1.1467 +function ff_\type\()_h264_qpel16_mc21_neon, export=1
1.1468 + push {r0, r1, r4-r5, r9-r11, lr}
1.1469 +\type\()_h264_qpel16_mc21:
1.1470 + lowpass_const r3
1.1471 + mov r11, sp
1.1472 + bic sp, sp, #15
1.1473 + sub sp, sp, #(16*16+16*12)
1.1474 + sub r1, r1, #2
1.1475 + mov r0, sp
1.1476 + vpush {d8-d15}
1.1477 + bl put_h264_qpel16_h_lowpass_neon_packed
1.1478 + mov r4, r0
1.1479 + ldrd r0, [r11] /* reload the r0/r1 pair pushed on entry */
1.1480 + sub r1, r1, r2, lsl #1
/* (continuation of the shared qpel16 mc21 body: r1 has just been reloaded
 * and moved back by 2*r2; now also by 2 bytes) */
1.1481 + sub r1, r1, #2
1.1482 + mov r3, r2
1.1483 + bl \type\()_h264_qpel16_hv_lowpass_l2_neon
1.1484 + vpop {d8-d15}
1.1485 + add sp, r11, #8 /* skip the saved r0/r1 */
1.1486 + pop {r4-r5, r9-r11, pc}
1.1487 +endfunc
1.1488 +
/* mc31: mc11 with saved r1 = incoming r1 + 1 (used for the second pass);
 * the first pass starts from the unmodified r1. */
1.1489 +function ff_\type\()_h264_qpel16_mc31_neon, export=1
1.1490 + add r1, r1, #1
1.1491 + push {r0, r1, r4, r11, lr}
1.1492 + sub r1, r1, #1
1.1493 + b \type\()_h264_qpel16_mc11
1.1494 +endfunc
1.1495 +
/* mc02: r1 -= 2*r2, r3 = r2, call v_lowpass with d8-d15 preserved */
1.1496 +function ff_\type\()_h264_qpel16_mc02_neon, export=1
1.1497 + push {r4, lr}
1.1498 + lowpass_const r3
1.1499 + sub r1, r1, r2, lsl #1
1.1500 + mov r3, r2
1.1501 + vpush {d8-d15}
1.1502 + bl \type\()_h264_qpel16_v_lowpass_neon
1.1503 + vpop {d8-d15}
1.1504 + pop {r4, pc}
1.1505 +endfunc
1.1506 +
/* mc12: same frame as mc21 (448-byte scratch) but the first pass is the
 * "packed" vertical filter.  Afterwards r1 is reloaded and moved back by
 * 2*r3 + 2, and r2 = r3 before the hv_lowpass_l2 driver is called.
 * NOTE(review): that driver overwrites r2 (r2 = r4 - 256) as its second
 * instruction, so the "mov r2, r3" here does not reach it; harmless.
 * Shared body is also reached from mc32. */
1.1507 +function ff_\type\()_h264_qpel16_mc12_neon, export=1
1.1508 + push {r0, r1, r4-r5, r9-r11, lr}
1.1509 +\type\()_h264_qpel16_mc12:
1.1510 + lowpass_const r3
1.1511 + mov r11, sp
1.1512 + bic sp, sp, #15
1.1513 + sub sp, sp, #(16*16+16*12)
1.1514 + sub r1, r1, r2, lsl #1
1.1515 + mov r0, sp
1.1516 + mov r3, r2
1.1517 + vpush {d8-d15}
1.1518 + bl put_h264_qpel16_v_lowpass_neon_packed
1.1519 + mov r4, r0
1.1520 + ldrd r0, [r11] /* reload the r0/r1 pair pushed on entry */
1.1521 + sub r1, r1, r3, lsl #1
1.1522 + sub r1, r1, #2
1.1523 + mov r2, r3
1.1524 + bl \type\()_h264_qpel16_hv_lowpass_l2_neon
1.1525 + vpop {d8-d15}
1.1526 + add sp, r11, #8 /* skip the saved r0/r1 */
1.1527 + pop {r4-r5, r9-r11, pc}
1.1528 +endfunc
1.1529 +
/* mc22: single hv pass with a 16*12 = 192-byte aligned scratch in r4;
 * r1 -= 2*r2 + 2, r3 = r2.  No r0/r1 on the stack, so sp = r11 on exit. */
1.1530 +function ff_\type\()_h264_qpel16_mc22_neon, export=1
1.1531 + push {r4, r9-r11, lr}
1.1532 + lowpass_const r3
1.1533 + mov r11, sp
1.1534 + bic sp, sp, #15
1.1535 + sub r1, r1, r2, lsl #1
1.1536 + sub r1, r1, #2
1.1537 + mov r3, r2
1.1538 + sub sp, sp, #(16*12)
1.1539 + mov r4, sp
1.1540 + vpush {d8-d15}
1.1541 + bl \type\()_h264_qpel16_hv_lowpass_neon
1.1542 + vpop {d8-d15}
1.1543 + mov sp, r11
1.1544 + pop {r4, r9-r11, pc}
1.1545 +endfunc
1.1546 +
/* mc32: mc12 with r1 + 1 for the first pass; saved r1 is unmodified */
1.1547 +function ff_\type\()_h264_qpel16_mc32_neon, export=1
1.1548 + push {r0, r1, r4-r5, r9-r11, lr}
1.1549 + add r1, r1, #1
1.1550 + b
\type\()_h264_qpel16_mc12
1.1551 +endfunc
1.1552 +
/* mc03: mc01 with ip = r1 + r2 instead of ip = r1 */
1.1553 +function ff_\type\()_h264_qpel16_mc03_neon, export=1
1.1554 + push {r4, lr}
1.1555 + add ip, r1, r2
1.1556 + b \type\()_h264_qpel16_mc01
1.1557 +endfunc
1.1558 +
/* mc13: mc11 with the first pass starting at r1 + r2; saved r1 unmodified */
1.1559 +function ff_\type\()_h264_qpel16_mc13_neon, export=1
1.1560 + push {r0, r1, r4, r11, lr}
1.1561 + add r1, r1, r2
1.1562 + b \type\()_h264_qpel16_mc11
1.1563 +endfunc
1.1564 +
/* mc23: mc21 with the first pass starting at r1 + r2; saved r1 unmodified */
1.1565 +function ff_\type\()_h264_qpel16_mc23_neon, export=1
1.1566 + push {r0, r1, r4-r5, r9-r11, lr}
1.1567 + add r1, r1, r2
1.1568 + b \type\()_h264_qpel16_mc21
1.1569 +endfunc
1.1570 +
/* mc33: mc11 with saved r1 = incoming r1 + 1 and the first pass starting at
 * incoming r1 + r2 */
1.1571 +function ff_\type\()_h264_qpel16_mc33_neon, export=1
1.1572 + add r1, r1, #1
1.1573 + push {r0, r1, r4, r11, lr}
1.1574 + add r1, r1, r2
1.1575 + sub r1, r1, #1
1.1576 + b \type\()_h264_qpel16_mc11
1.1577 +endfunc
1.1578 + .endm
1.1579 +
1.1580 + h264_qpel16 put
1.1581 + h264_qpel16 avg
1.1582 +
1.1583 +@ Biweighted prediction
1.1584 +
/*
 * biweight_16 macs, macd
 *
 * Loop body + return for the 16-byte-wide biweight function; expanded inside
 * biweight_h264_pixels_16_neon, which sets up everything this macro reads:
 *   r0, r1  the two source pointers (r0 is read, results go to r6)
 *   r2      added to r0, r1 and r6 after every row
 *   r4, r5  byte multipliers, broadcast to d0 and d1
 *   r6      store pointer
 *   ip      row count, decremented by 2 per iteration (two rows each)
 *   q8      initial value of every 16-bit accumulator
 *   q9      per-lane shift counts for vshl.s16
 * \macd is applied to (d0, row from r0) and \macs to (d1, row from r1); the
 * caller passes vmlal.u8 or vmlsl.u8 for each, i.e. widening
 * multiply-accumulate or multiply-subtract.  Results are shifted, narrowed
 * with unsigned saturation (vqmovun.s16) and stored.
 * "bne 1b" still sees the flags of the subs at the top of the loop: nothing
 * in between writes the flags.  The final pop returns from the enclosing
 * function.
 */
1.1585 + .macro biweight_16 macs, macd
1.1586 + vdup.8 d0, r4
1.1587 + vdup.8 d1, r5
1.1588 + vmov q2, q8
1.1589 + vmov q3, q8
1.1590 +1: subs ip, ip, #2
/* first row: accumulators q2 (low 8 bytes) and q3 (high 8 bytes) */
1.1591 + vld1.8 {d20-d21},[r0,:128], r2
1.1592 + \macd q2, d0, d20
1.1593 + pld [r0]
1.1594 + \macd q3, d0, d21
1.1595 + vld1.8 {d22-d23},[r1,:128], r2
1.1596 + \macs q2, d1, d22
1.1597 + pld [r1]
1.1598 + \macs q3, d1, d23
/* second row: accumulators q12 / q13, re-seeded from q8 */
1.1599 + vmov q12, q8
1.1600 + vld1.8 {d28-d29},[r0,:128], r2
1.1601 + vmov q13, q8
1.1602 + \macd q12, d0, d28
1.1603 + pld [r0]
1.1604 + \macd q13, d0, d29
1.1605 + vld1.8 {d30-d31},[r1,:128], r2
1.1606 + \macs q12, d1, d30
1.1607 + pld [r1]
1.1608 + \macs q13, d1, d31
1.1609 + vshl.s16 q2, q2, q9
1.1610 + vshl.s16 q3, q3, q9
1.1611 + vqmovun.s16 d4, q2
1.1612 + vqmovun.s16 d5, q3
1.1613 + vshl.s16 q12, q12, q9
1.1614 + vshl.s16 q13, q13, q9
1.1615 + vqmovun.s16 d24, q12
1.1616 + vqmovun.s16 d25, q13
/* q3/q2 are re-seeded for the next iteration; q2 only after d4-d5 (= q2)
 * has been stored */
1.1617 + vmov q3, q8
1.1618 + vst1.8 {d4- d5}, [r6,:128], r2
1.1619 + vmov q2, q8
1.1620 + vst1.8 {d24-d25},[r6,:128], r2
1.1621 + bne 1b
1.1622 + pop {r4-r6, pc}
1.1623 +
.endm
1.1624 +
/*
 * biweight_8 macs, macd
 *
 * 8-byte-wide version of the biweight loop.  Register contract is the one
 * set up by biweight_h264_pixels_8_neon: r0/r1 sources, r2 row step,
 * r4/r5 multipliers (broadcast to d0/d1), r6 store pointer, ip row count
 * (minus 2 per iteration), q8 accumulator seed, q9 shift counts.
 * Two rows per iteration, accumulated in q1 and q10.
 */
1.1625 + .macro biweight_8 macs, macd
1.1626 + vdup.8 d0, r4
1.1627 + vdup.8 d1, r5
1.1628 + vmov q1, q8
1.1629 + vmov q10, q8
1.1630 +1: subs ip, ip, #2
1.1631 + vld1.8 {d4},[r0,:64], r2
1.1632 + \macd q1, d0, d4
1.1633 + pld [r0]
1.1634 + vld1.8 {d5},[r1,:64], r2
1.1635 + \macs q1, d1, d5
1.1636 + pld [r1]
1.1637 + vld1.8 {d6},[r0,:64], r2
1.1638 + \macd q10, d0, d6
1.1639 + pld [r0]
1.1640 + vld1.8 {d7},[r1,:64], r2
1.1641 + \macs q10, d1, d7
1.1642 + pld [r1]
1.1643 + vshl.s16 q1, q1, q9
1.1644 + vqmovun.s16 d2, q1 /* narrow with unsigned saturation */
1.1645 + vshl.s16 q10, q10, q9
1.1646 + vqmovun.s16 d4, q10
1.1647 + vmov q10, q8 /* re-seed for the next iteration */
1.1648 + vst1.8 {d2},[r6,:64], r2
1.1649 + vmov q1, q8 /* re-seed only after d2 (low half of q1) is stored */
1.1650 + vst1.8 {d4},[r6,:64], r2
1.1651 + bne 1b /* flags still from the subs at 1: */
1.1652 + pop {r4-r6, pc}
1.1653 + .endm
1.1654 +
/*
 * biweight_4 macs, macd
 *
 * 4-byte-wide version.  Two 4-byte rows are packed into one d register
 * (lanes [0] and [1]), so a full iteration handles four rows and ip is
 * decremented by 4.  If that subtraction goes negative (only two rows were
 * left), "blt 2f" leaves after the first row pair and label 2 finishes and
 * stores just those two rows.  Both exits return from the enclosing
 * function via pop {r4-r6, pc}.
 */
1.1655 + .macro biweight_4 macs, macd
1.1656 + vdup.8 d0, r4
1.1657 + vdup.8 d1, r5
1.1658 + vmov q1, q8
1.1659 + vmov q10, q8
1.1660 +1: subs ip, ip, #4
1.1661 + vld1.32 {d4[0]},[r0,:32], r2
1.1662 + vld1.32 {d4[1]},[r0,:32], r2
1.1663 + \macd q1, d0, d4
1.1664 + pld [r0]
1.1665 + vld1.32 {d5[0]},[r1,:32], r2
1.1666 + vld1.32 {d5[1]},[r1,:32], r2
1.1667 + \macs q1, d1, d5
1.1668 + pld [r1]
1.1669 + blt 2f /* flags from the subs above: fewer than 4 rows were left */
1.1670 + vld1.32 {d6[0]},[r0,:32], r2
1.1671 + vld1.32 {d6[1]},[r0,:32], r2
1.1672 + \macd q10, d0, d6
1.1673 + pld [r0]
1.1674 + vld1.32 {d7[0]},[r1,:32], r2
1.1675 + vld1.32 {d7[1]},[r1,:32], r2
1.1676 + \macs q10, d1, d7
1.1677 + pld [r1]
1.1678 + vshl.s16 q1, q1, q9
1.1679 + vqmovun.s16 d2, q1
1.1680 + vshl.s16 q10, q10, q9
1.1681 + vqmovun.s16 d4, q10
1.1682 + vmov q10, q8
1.1683 + vst1.32 {d2[0]},[r6,:32], r2
1.1684 + vst1.32 {d2[1]},[r6,:32], r2
1.1685 + vmov q1, q8
1.1686 + vst1.32 {d4[0]},[r6,:32], r2
1.1687 + vst1.32 {d4[1]},[r6,:32], r2
1.1688 + bne 1b /* flags still from the subs at 1: */
1.1689 + pop {r4-r6, pc}
/* two-row tail */
1.1690 +2: vshl.s16 q1, q1, q9
1.1691 + vqmovun.s16 d2, q1
1.1692 + vst1.32 {d2[0]},[r6,:32], r2
1.1693 + vst1.32 {d2[1]},[r6,:32], r2
1.1694 + pop {r4-r6, pc}
1.1695 + .endm
1.1696 +
/* biweight_func w: emits biweight_h264_pixels_<w>_neon, the common body that
 * the ff_biweight_h264_pixels_<w>x<h>_neon entry points jump or fall into. */
1.1697 + .macro biweight_func w
/*
 * (body of .macro biweight_func w; the .macro line ends the previous chunk)
 *
 * Register use established here and consumed by the biweight_<w> macros:
 *   r0, r1  source pointers        r2  row step       r3  shift amount
 *   ip      row count (set by the entry stub)
 *   r4-r6   the first three stack arguments (at sp+16 after the 16-byte push)
 * Setup:
 *   q9 = ~r3 in every 16-bit lane, i.e. -(r3+1): vshl.s16 by this count is an
 *        arithmetic right shift by r3+1
 *   q8 = ((r6 + 1) | 1) << r3 in every lane: the accumulator seed
 *   r6 = r0 afterwards, used as the store pointer
 * Dispatch: lr = (r4 >> 31) ^ (r5 >> 30), logical shifts.
 *   0 -> 10:   1 -> 20:   2 -> 30:   otherwise -> 40:
 * For r5 whose bits 31 and 30 agree this works out to: 0 = neither negative,
 * 1 = only r4 negative, 2 = both negative, 3 = only r5 negative, which
 * matches the negations (rsb) and the vmlal/vmlsl choice at each label.
 * NOTE(review): r4/r5 are presumably the two weights and r6 the offset;
 * confirm against the C prototype.
 * Each biweight_<w> expansion ends in pop {..., pc}, so control never falls
 * from one label into the next.
 */
1.1698 +function biweight_h264_pixels_\w\()_neon
1.1699 + push {r4-r6, lr}
1.1700 + add r4, sp, #16
1.1701 + ldm r4, {r4-r6}
1.1702 + lsr lr, r4, #31
1.1703 + add r6, r6, #1
1.1704 + eors lr, lr, r5, lsr #30 /* sets Z for the beq 10f below */
1.1705 + orr r6, r6, #1
1.1706 + vdup.16 q9, r3
1.1707 + lsl r6, r6, r3
1.1708 + vmvn q9, q9
1.1709 + vdup.16 q8, r6
1.1710 + mov r6, r0
1.1711 + beq 10f /* none of the instructions since eors writes the flags */
1.1712 + subs lr, lr, #1
1.1713 + beq 20f
1.1714 + subs lr, lr, #1
1.1715 + beq 30f
1.1716 + b 40f
/* macro arguments are (macs, macd): macs pairs with d1/r5 and the r1 source,
 * macd with d0/r4 and the r0 source */
1.1717 +10: biweight_\w vmlal.u8, vmlal.u8
1.1718 +20: rsb r4, r4, #0
1.1719 + biweight_\w vmlal.u8, vmlsl.u8
1.1720 +30: rsb r4, r4, #0
1.1721 + rsb r5, r5, #0
1.1722 + biweight_\w vmlsl.u8, vmlsl.u8
1.1723 +40: rsb r5, r5, #0
1.1724 + biweight_\w vmlsl.u8, vmlal.u8
1.1725 +endfunc
1.1726 + .endm
1.1727 +
/*
 * biweight_entry w, h, b=1
 * Emits the exported stub ff_biweight_h264_pixels_<w>x<h>_neon: ip = h, then
 * (when b is non-zero) a branch to biweight_h264_pixels_<w>_neon.  With b=0
 * no branch is emitted, so the stub must be placed directly in front of the
 * biweight_func expansion it is meant to run into, as done below.
 */
1.1728 + .macro biweight_entry w, h, b=1
1.1729 +function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
1.1730 + mov ip, #\h
1.1731 +.if \b
1.1732 + b biweight_h264_pixels_\w\()_neon
1.1733 +.endif
1.1734 +endfunc
1.1735 + .endm
1.1736 +
/* For each width the b=0 stub comes last, immediately before the body. */
1.1737 + biweight_entry 16, 8
1.1738 + biweight_entry 16, 16, b=0
1.1739 + biweight_func 16
1.1740 +
1.1741 + biweight_entry 8, 16
1.1742 + biweight_entry 8, 4
1.1743 + biweight_entry 8, 8, b=0
1.1744 + biweight_func 8
1.1745 +
1.1746 + biweight_entry 4, 8
1.1747 + biweight_entry 4, 2
1.1748 + biweight_entry 4, 4, b=0
1.1749 + biweight_func 4
1.1750 +
1.1751 +@ Weighted prediction
1.1752 +
/*
 * weight_16 add
 *
 * Loop body + return for the 16-byte-wide single-source weight function;
 * expanded inside weight_h264_pixels_16_neon, which provides:
 *   r0  source pointer      r1  row step       r3  byte multiplier (-> d0)
 *   r4  store pointer       ip  row count (minus 2 per iteration)
 *   q8  value combined with every product      q9  vrshl.s16 shift counts
 * Per row: vmull.u8 gives 16-bit products, "\add qN, q8, qN" combines q8
 * with the product (the caller passes vadd/vsub/vhadd/vhsub .s16, so the
 * product is the second operand of any subtraction), vrshl.s16 applies the
 * rounding shift and vqmovun.s16 narrows with unsigned saturation.
 */
1.1753 + .macro weight_16 add
1.1754 + vdup.8 d0, r3
1.1755 +1: subs ip, ip, #2
1.1756 + vld1.8 {d20-d21},[r0,:128], r1
1.1757 + vmull.u8 q2, d0, d20
1.1758 + pld [r0]
1.1759 + vmull.u8 q3, d0, d21
1.1760 + vld1.8 {d28-d29},[r0,:128], r1
1.1761 + vmull.u8 q12, d0, d28
1.1762 + pld [r0]
1.1763 + vmull.u8 q13, d0, d29
1.1764 + \add q2, q8, q2
1.1765 + vrshl.s16 q2, q2, q9
1.1766 + \add q3, q8, q3
1.1767 + vrshl.s16 q3, q3, q9
1.1768 + vqmovun.s16 d4, q2
1.1769 + vqmovun.s16 d5, q3
1.1770 + \add q12, q8, q12
1.1771 + vrshl.s16 q12, q12, q9
1.1772 + \add q13, q8, q13
1.1773 + vrshl.s16 q13,
q13, q9 /* remaining operands of the vrshl.s16 split across the chunk boundary */
1.1774 + vqmovun.s16 d24, q12
1.1775 + vqmovun.s16 d25, q13
1.1776 + vst1.8 {d4- d5}, [r4,:128], r1
1.1777 + vst1.8 {d24-d25},[r4,:128], r1
1.1778 + bne 1b /* flags still from the subs at 1: */
1.1779 + pop {r4, pc} /* returns from the enclosing weight function */
1.1780 + .endm
1.1781 +
/*
 * weight_8 add
 *
 * 8-byte-wide version of the weight loop: two rows per iteration (ip -= 2),
 * products in q1 and q10, results stored from d2 and d4 through r4.
 * Same register contract as weight_16: r0 source, r1 row step, r3
 * multiplier, r4 store pointer, q8 combined with each product by \add,
 * q9 shift counts for vrshl.s16.
 */
1.1782 + .macro weight_8 add
1.1783 + vdup.8 d0, r3
1.1784 +1: subs ip, ip, #2
1.1785 + vld1.8 {d4},[r0,:64], r1
1.1786 + vmull.u8 q1, d0, d4
1.1787 + pld [r0]
1.1788 + vld1.8 {d6},[r0,:64], r1
1.1789 + vmull.u8 q10, d0, d6
1.1790 + \add q1, q8, q1
1.1791 + pld [r0]
1.1792 + vrshl.s16 q1, q1, q9
1.1793 + vqmovun.s16 d2, q1
1.1794 + \add q10, q8, q10
1.1795 + vrshl.s16 q10, q10, q9
1.1796 + vqmovun.s16 d4, q10
1.1797 + vst1.8 {d2},[r4,:64], r1
1.1798 + vst1.8 {d4},[r4,:64], r1
1.1799 + bne 1b
1.1800 + pop {r4, pc}
1.1801 + .endm
1.1802 +
/*
 * weight_4 add
 *
 * 4-byte-wide version.  Two 4-byte rows share one d register (lanes [0] and
 * [1]); a full iteration handles four rows (ip -= 4).  When the subtraction
 * goes negative, "blt 2f" exits after the first row pair and label 2
 * finishes and stores only those two rows.
 * NOTE(review): the "vmov q1, q8" / "vmov q10, q8" copies (before the loop
 * and inside it) have no visible effect: q1 and q10 are next written in full
 * by vmull.u8, which does not accumulate, before anything reads them.
 */
1.1803 + .macro weight_4 add
1.1804 + vdup.8 d0, r3
1.1805 + vmov q1, q8
1.1806 + vmov q10, q8
1.1807 +1: subs ip, ip, #4
1.1808 + vld1.32 {d4[0]},[r0,:32], r1
1.1809 + vld1.32 {d4[1]},[r0,:32], r1
1.1810 + vmull.u8 q1, d0, d4
1.1811 + pld [r0]
1.1812 + blt 2f /* flags from the subs above: fewer than 4 rows were left */
1.1813 + vld1.32 {d6[0]},[r0,:32], r1
1.1814 + vld1.32 {d6[1]},[r0,:32], r1
1.1815 + vmull.u8 q10, d0, d6
1.1816 + pld [r0]
1.1817 + \add q1, q8, q1
1.1818 + vrshl.s16 q1, q1, q9
1.1819 + vqmovun.s16 d2, q1
1.1820 + \add q10, q8, q10
1.1821 + vrshl.s16 q10, q10, q9
1.1822 + vqmovun.s16 d4, q10
1.1823 + vmov q10, q8
1.1824 + vst1.32 {d2[0]},[r4,:32], r1
1.1825 + vst1.32 {d2[1]},[r4,:32], r1
1.1826 + vmov q1, q8
1.1827 + vst1.32 {d4[0]},[r4,:32], r1
1.1828 + vst1.32 {d4[1]},[r4,:32], r1
1.1829 + bne 1b /* flags still from the subs at 1: */
1.1830 + pop {r4, pc}
/* two-row tail */
1.1831 +2: \add q1, q8, q1
1.1832 + vrshl.s16 q1, q1, q9
1.1833 + vqmovun.s16 d2, q1
1.1834 + vst1.32 {d2[0]},[r4,:32], r1
1.1835 + vst1.32 {d2[1]},[r4,:32], r1
1.1836 + pop {r4, pc}
1.1837 + .endm
1.1838 +
/*
 * weight_func w
 *
 * Emits weight_h264_pixels_<w>_neon, the common body behind the
 * ff_weight_h264_pixels_<w>x<h>_neon stubs.
 *   r0  source (and, copied to r4, destination) pointer
 *   r1  row step        r2  shift amount        r3  signed multiplier
 *   ip  row count (set by the stub)
 *   [sp, #8] after the 8-byte push = first stack argument, shifted left by
 *   r2 and broadcast into q8.
 * The cmp r2, #1 result is consumed by "ble 20f" below; the lsl/vdup/mov in
 * between do not write the flags.
 * NOTE(review): the stack argument is presumably the additive offset;
 * confirm against the C prototype.
 */
1.1839 + .macro weight_func w
1.1840 +function weight_h264_pixels_\w\()_neon
1.1841 + push {r4, lr}
1.1842 + ldr r4, [sp, #8]
1.1843 + cmp r2, #1
1.1844 + lsl r4, r4, r2
1.1845 + vdup.16 q8, r4
1.1846 + mov r4, r0
1.1847 + ble 20f
1.1848 + rsb lr,
r2, #1 /* remaining operands of the rsb split across the chunk boundary: lr = 1 - r2 */
/* (continuation of weight_h264_pixels_<w>_neon)
 * r2 > 1 path: q9 = 1 - r2 per lane, so vrshl.s16 is a rounding right shift
 * by r2 - 1; the remaining factor of two comes from the halving add/sub
 * (vhadd/vhsub) passed to the loop macro.
 * r2 <= 1 path (label 20): q9 = -r2 per lane and plain vadd/vsub is used.
 * In both paths a negative r3 is negated and the subtracting form is chosen,
 * because the loop macro broadcasts r3 as bytes and multiplies unsigned.
 * Every weight_<w> expansion ends in pop {r4, pc}, so nothing falls through
 * into the following label.  Each "blt 10f" binds to the nearest following
 * "10:" label. */
1.1849 + vdup.16 q9, lr
1.1850 + cmp r3, #0
1.1851 + blt 10f
1.1852 + weight_\w vhadd.s16
1.1853 +10: rsb r3, r3, #0
1.1854 + weight_\w vhsub.s16
1.1855 +20: rsb lr, r2, #0
1.1856 + vdup.16 q9, lr
1.1857 + cmp r3, #0
1.1858 + blt 10f
1.1859 + weight_\w vadd.s16
1.1860 +10: rsb r3, r3, #0
1.1861 + weight_\w vsub.s16
1.1862 +endfunc
1.1863 + .endm
1.1864 +
/*
 * weight_entry w, h, b=1
 * Emits the exported stub ff_weight_h264_pixels_<w>x<h>_neon: ip = h, then
 * (when b is non-zero) a branch to weight_h264_pixels_<w>_neon.  With b=0 no
 * branch is emitted, so that stub has to sit directly in front of the
 * matching weight_func expansion, as in the instantiations below.
 */
1.1865 + .macro weight_entry w, h, b=1
1.1866 +function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
1.1867 + mov ip, #\h
1.1868 +.if \b
1.1869 + b weight_h264_pixels_\w\()_neon
1.1870 +.endif
1.1871 +endfunc
1.1872 + .endm
1.1873 +
/* For each width the b=0 stub comes last, immediately before the body. */
1.1874 + weight_entry 16, 8
1.1875 + weight_entry 16, 16, b=0
1.1876 + weight_func 16
1.1877 +
1.1878 + weight_entry 8, 16
1.1879 + weight_entry 8, 4
1.1880 + weight_entry 8, 8, b=0
1.1881 + weight_func 8
1.1882 +
1.1883 + weight_entry 4, 8
1.1884 + weight_entry 4, 2
1.1885 + weight_entry 4, 4, b=0
1.1886 + weight_func 4
