Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSs__H264__App
comparison libavcodec/arm/h264dsp_neon.S @ 2:897f711a7157
rearrange to work with autoconf
| author | Nina Engelhardt <nengel@mailbox.tu-berlin.de> |
|---|---|
| date | Tue, 25 Sep 2012 15:55:33 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:51821dfa9d72 |
|---|---|
| 1 /* | |
| 2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
| 3 * | |
| 4 * This file is part of FFmpeg. | |
| 5 * | |
| 6 * FFmpeg is free software; you can redistribute it and/or | |
| 7 * modify it under the terms of the GNU Lesser General Public | |
| 8 * License as published by the Free Software Foundation; either | |
| 9 * version 2.1 of the License, or (at your option) any later version. | |
| 10 * | |
| 11 * FFmpeg is distributed in the hope that it will be useful, | |
| 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 14 * Lesser General Public License for more details. | |
| 15 * | |
| 16 * You should have received a copy of the GNU Lesser General Public | |
| 17 * License along with FFmpeg; if not, write to the Free Software | |
| 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
| 19 */ | |
| 20 | |
| 21 #include "asm.S" | |
| 22 /* transpose_8x8: in-place transpose of an 8x8 byte matrix whose rows are the eight register arguments, done as three vtrn stages (32-, 16-, 8-bit). With q registers each 64-bit half is transposed independently. */ | |
| 23 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7 | |
| 24 vtrn.32 \r0, \r4 /* stage 1: exchange 32-bit elements between row i and row i+4 */ | |
| 25 vtrn.32 \r1, \r5 | |
| 26 vtrn.32 \r2, \r6 | |
| 27 vtrn.32 \r3, \r7 | |
| 28 vtrn.16 \r0, \r2 /* stage 2: exchange 16-bit elements between row i and row i+2 */ | |
| 29 vtrn.16 \r1, \r3 | |
| 30 vtrn.16 \r4, \r6 | |
| 31 vtrn.16 \r5, \r7 | |
| 32 vtrn.8 \r0, \r1 /* stage 3: exchange bytes between adjacent rows */ | |
| 33 vtrn.8 \r2, \r3 | |
| 34 vtrn.8 \r4, \r5 | |
| 35 vtrn.8 \r6, \r7 | |
| 36 .endm | |
| 37 /* transpose_4x4: in-place transpose of the 4x4 byte matrices formed by corresponding 32-bit lanes of the four register arguments (one matrix per lane). */ | |
| 38 .macro transpose_4x4 r0 r1 r2 r3 | |
| 39 vtrn.16 \r0, \r2 /* exchange 16-bit elements between row i and row i+2 */ | |
| 40 vtrn.16 \r1, \r3 | |
| 41 vtrn.8 \r0, \r1 /* exchange bytes between adjacent rows */ | |
| 42 vtrn.8 \r2, \r3 | |
| 43 .endm | |
| 44 /* swap4: exchange the contents of r0..r3 with r4..r7 pairwise (r0 with r4, r1 with r5, ...). */ | |
| 45 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7 | |
| 46 vswp \r0, \r4 | |
| 47 vswp \r1, \r5 | |
| 48 vswp \r2, \r6 | |
| 49 vswp \r3, \r7 | |
| 50 .endm | |
| 51 /* transpose16_4x4: in-place 4x4 transpose of 16-bit elements, applied to two independent register groups (r0..r3 and r4..r7). With q registers each 64-bit half forms its own 4x4 matrix. */ | |
| 52 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7 | |
| 53 vtrn.32 \r0, \r2 /* exchange 32-bit elements between row i and row i+2 */ | |
| 54 vtrn.32 \r1, \r3 | |
| 55 vtrn.32 \r4, \r6 | |
| 56 vtrn.32 \r5, \r7 | |
| 57 vtrn.16 \r0, \r1 /* exchange 16-bit elements between adjacent rows */ | |
| 58 vtrn.16 \r2, \r3 | |
| 59 vtrn.16 \r4, \r5 | |
| 60 vtrn.16 \r6, \r7 | |
| 61 .endm | |
| 62 /* Bilinear chroma interpolation, 8 pixels wide: out = (A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1] + 32) >> 6 with A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy. The macro argument is put (plain store) or avg (rounding average with the bytes already at dst). Two rows are produced per loop iteration. In: r0=dst, r1=src, r2=stride, r3=h, x and y on the stack. */ | |
| 63 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ | |
| 64 .macro h264_chroma_mc8 type | |
| 65 function ff_\type\()_h264_chroma_mc8_neon, export=1 | |
| 66 push {r4-r7, lr} | |
| 67 ldrd r4, r5, [sp, #20] /* r4 = x, r5 = y: stack arguments sit above the 20 bytes just pushed. Both transfer registers are named explicitly, as unified-syntax assemblers require. */ | |
| 68 .ifc \type,avg | |
| 69 mov lr, r0 /* avg only: separate read pointer into dst */ | |
| 70 .endif | |
| 71 pld [r1] | |
| 72 pld [r1, r2] | |
| 73 | |
| 74 muls r7, r4, r5 /* r7 = D = x*y; sets Z for the beq below (the rsb/sub/add in between do not touch the flags) */ | |
| 75 rsb r6, r7, r5, lsl #3 /* r6 = C = 8*y - x*y */ | |
| 76 rsb ip, r7, r4, lsl #3 /* ip = B = 8*x - x*y */ | |
| 77 sub r4, r7, r4, lsl #3 | |
| 78 sub r4, r4, r5, lsl #3 | |
| 79 add r4, r4, #64 /* r4 = A = x*y - 8*x - 8*y + 64 */ | |
| 80 | |
| 81 beq 2f /* x*y == 0: at most two taps are needed */ | |
| 82 | |
| 83 add r5, r1, r2 /* r5 = src + stride; r1 and r5 walk alternate rows */ | |
| 84 | |
| 85 vdup.8 d0, r4 | |
| 86 lsl r4, r2, #1 /* r4 = 2*stride, the step for both row pointers */ | |
| 87 vdup.8 d1, ip | |
| 88 vld1.64 {d4, d5}, [r1], r4 | |
| 89 vdup.8 d2, r6 | |
| 90 vld1.64 {d6, d7}, [r5], r4 | |
| 91 vdup.8 d3, r7 | |
| 92 | |
| 93 vext.8 d5, d4, d5, #1 /* d5 = row shifted left by one pixel (right-hand neighbours) */ | |
| 94 vext.8 d7, d6, d7, #1 | |
| 95 | |
| 96 1: pld [r5] /* four-tap loop: d4/d5 hold the upper row, d6/d7 the lower row */ | |
| 97 vmull.u8 q8, d4, d0 | |
| 98 vmlal.u8 q8, d5, d1 | |
| 99 vld1.64 {d4, d5}, [r1], r4 | |
| 100 vmlal.u8 q8, d6, d2 | |
| 101 vext.8 d5, d4, d5, #1 | |
| 102 vmlal.u8 q8, d7, d3 | |
| 103 vmull.u8 q9, d6, d0 | |
| 104 subs r3, r3, #2 /* two rows per iteration; flags are consumed by the bgt at the loop end */ | |
| 105 vmlal.u8 q9, d7, d1 | |
| 106 vmlal.u8 q9, d4, d2 | |
| 107 vmlal.u8 q9, d5, d3 | |
| 108 vrshrn.u16 d16, q8, #6 | |
| 109 vld1.64 {d6, d7}, [r5], r4 | |
| 110 pld [r1] | |
| 111 vrshrn.u16 d17, q9, #6 | |
| 112 .ifc \type,avg | |
| 113 vld1.64 {d20}, [lr,:64], r2 | |
| 114 vld1.64 {d21}, [lr,:64], r2 | |
| 115 vrhadd.u8 q8, q8, q10 | |
| 116 .endif | |
| 117 vext.8 d7, d6, d7, #1 | |
| 118 vst1.64 {d16}, [r0,:64], r2 | |
| 119 vst1.64 {d17}, [r0,:64], r2 | |
| 120 bgt 1b | |
| 121 | |
| 122 pop {r4-r7, pc} | |
| 123 | |
| 124 2: tst r6, r6 /* Z set when C == 0; consumed by the beq 4f below */ | |
| 125 add ip, ip, r6 /* B + C: with x*y == 0 this is the single second-tap weight */ | |
| 126 vdup.8 d0, r4 | |
| 127 vdup.8 d1, ip | |
| 128 | |
| 129 beq 4f /* C == 0: use the horizontal two-tap path */ | |
| 130 | |
| 131 add r5, r1, r2 | |
| 132 lsl r4, r2, #1 | |
| 133 vld1.64 {d4}, [r1], r4 | |
| 134 vld1.64 {d6}, [r5], r4 | |
| 135 | |
| 136 3: pld [r5] /* vertical two-tap loop: each output row blends two consecutive source rows */ | |
| 137 vmull.u8 q8, d4, d0 | |
| 138 vmlal.u8 q8, d6, d1 | |
| 139 vld1.64 {d4}, [r1], r4 | |
| 140 vmull.u8 q9, d6, d0 | |
| 141 vmlal.u8 q9, d4, d1 | |
| 142 vld1.64 {d6}, [r5], r4 | |
| 143 vrshrn.u16 d16, q8, #6 | |
| 144 vrshrn.u16 d17, q9, #6 | |
| 145 .ifc \type,avg | |
| 146 vld1.64 {d20}, [lr,:64], r2 | |
| 147 vld1.64 {d21}, [lr,:64], r2 | |
| 148 vrhadd.u8 q8, q8, q10 | |
| 149 .endif | |
| 150 subs r3, r3, #2 | |
| 151 pld [r1] | |
| 152 vst1.64 {d16}, [r0,:64], r2 | |
| 153 vst1.64 {d17}, [r0,:64], r2 | |
| 154 bgt 3b | |
| 155 | |
| 156 pop {r4-r7, pc} | |
| 157 | |
| 158 4: vld1.64 {d4, d5}, [r1], r2 | |
| 159 vld1.64 {d6, d7}, [r1], r2 | |
| 160 vext.8 d5, d4, d5, #1 | |
| 161 vext.8 d7, d6, d7, #1 | |
| 162 | |
| 163 5: pld [r1] /* horizontal two-tap loop: each output row blends a pixel with its right-hand neighbour */ | |
| 164 subs r3, r3, #2 | |
| 165 vmull.u8 q8, d4, d0 | |
| 166 vmlal.u8 q8, d5, d1 | |
| 167 vld1.64 {d4, d5}, [r1], r2 | |
| 168 vmull.u8 q9, d6, d0 | |
| 169 vmlal.u8 q9, d7, d1 | |
| 170 pld [r1] | |
| 171 vext.8 d5, d4, d5, #1 | |
| 172 vrshrn.u16 d16, q8, #6 | |
| 173 vrshrn.u16 d17, q9, #6 | |
| 174 .ifc \type,avg | |
| 175 vld1.64 {d20}, [lr,:64], r2 | |
| 176 vld1.64 {d21}, [lr,:64], r2 | |
| 177 vrhadd.u8 q8, q8, q10 | |
| 178 .endif | |
| 179 vld1.64 {d6, d7}, [r1], r2 | |
| 180 vext.8 d7, d6, d7, #1 | |
| 181 vst1.64 {d16}, [r0,:64], r2 | |
| 182 vst1.64 {d17}, [r0,:64], r2 | |
| 183 bgt 5b | |
| 184 | |
| 185 pop {r4-r7, pc} | |
| 186 endfunc | |
| 187 .endm | |
| 188 /* Bilinear chroma interpolation, 4 pixels wide, same weights and rounding as the 8-wide version (A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy, rounded >> 6). A pixel run and its right-shifted copy are packed into the two 32-bit halves of one d register so that one vmull covers two taps. The macro argument is put or avg. In: r0=dst, r1=src, r2=stride, r3=h, x and y on the stack. */ | |
| 189 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ | |
| 190 .macro h264_chroma_mc4 type | |
| 191 function ff_\type\()_h264_chroma_mc4_neon, export=1 | |
| 192 push {r4-r7, lr} | |
| 193 ldrd r4, r5, [sp, #20] /* r4 = x, r5 = y: stack arguments sit above the 20 bytes just pushed. Both transfer registers are named explicitly, as unified-syntax assemblers require. */ | |
| 194 .ifc \type,avg | |
| 195 mov lr, r0 /* avg only: separate read pointer into dst */ | |
| 196 .endif | |
| 197 pld [r1] | |
| 198 pld [r1, r2] | |
| 199 | |
| 200 muls r7, r4, r5 /* r7 = D = x*y; sets Z for the beq below (the rsb/sub/add in between do not touch the flags) */ | |
| 201 rsb r6, r7, r5, lsl #3 /* r6 = C = 8*y - x*y */ | |
| 202 rsb ip, r7, r4, lsl #3 /* ip = B = 8*x - x*y */ | |
| 203 sub r4, r7, r4, lsl #3 | |
| 204 sub r4, r4, r5, lsl #3 | |
| 205 add r4, r4, #64 /* r4 = A = x*y - 8*x - 8*y + 64 */ | |
| 206 | |
| 207 beq 2f /* x*y == 0: at most two taps are needed */ | |
| 208 | |
| 209 add r5, r1, r2 /* r5 = src + stride; r1 and r5 walk alternate rows */ | |
| 210 | |
| 211 vdup.8 d0, r4 | |
| 212 lsl r4, r2, #1 /* r4 = 2*stride */ | |
| 213 vdup.8 d1, ip | |
| 214 vld1.64 {d4}, [r1], r4 | |
| 215 vdup.8 d2, r6 | |
| 216 vld1.64 {d6}, [r5], r4 | |
| 217 vdup.8 d3, r7 | |
| 218 | |
| 219 vext.8 d5, d4, d5, #1 /* d5 = row shifted by one pixel; its last byte comes from the old d5 and only reaches the half of d5 that vtrn leaves unused */ | |
| 220 vext.8 d7, d6, d7, #1 | |
| 221 vtrn.32 d4, d5 /* d4 = [pixels 0..3, pixels 1..4] */ | |
| 222 vtrn.32 d6, d7 | |
| 223 | |
| 224 vtrn.32 d0, d1 /* d0 = [A x4, B x4] */ | |
| 225 vtrn.32 d2, d3 /* d2 = [C x4, D x4] */ | |
| 226 | |
| 227 1: pld [r5] /* four-tap loop: d4 holds the upper row, d6 the lower row */ | |
| 228 vmull.u8 q8, d4, d0 | |
| 229 vmlal.u8 q8, d6, d2 | |
| 230 vld1.64 {d4}, [r1], r4 | |
| 231 vext.8 d5, d4, d5, #1 | |
| 232 vtrn.32 d4, d5 | |
| 233 vmull.u8 q9, d6, d0 | |
| 234 vmlal.u8 q9, d4, d2 | |
| 235 vld1.64 {d6}, [r5], r4 | |
| 236 vadd.i16 d16, d16, d17 /* fold the two halves: taps on the pixel plus taps on its neighbour (first row) */ | |
| 237 vadd.i16 d17, d18, d19 /* same for the second row */ | |
| 238 vrshrn.u16 d16, q8, #6 /* d16 = 4 bytes of row 0 followed by 4 bytes of row 1 */ | |
| 239 subs r3, r3, #2 | |
| 240 pld [r1] | |
| 241 .ifc \type,avg | |
| 242 vld1.32 {d20[0]}, [lr,:32], r2 | |
| 243 vld1.32 {d20[1]}, [lr,:32], r2 | |
| 244 vrhadd.u8 d16, d16, d20 | |
| 245 .endif | |
| 246 vext.8 d7, d6, d7, #1 | |
| 247 vtrn.32 d6, d7 | |
| 248 vst1.32 {d16[0]}, [r0,:32], r2 | |
| 249 vst1.32 {d16[1]}, [r0,:32], r2 | |
| 250 bgt 1b | |
| 251 | |
| 252 pop {r4-r7, pc} | |
| 253 | |
| 254 2: tst r6, r6 /* Z set when C == 0; consumed by the beq 4f below */ | |
| 255 add ip, ip, r6 /* B + C: the single second-tap weight when x*y == 0 */ | |
| 256 vdup.8 d0, r4 | |
| 257 vdup.8 d1, ip | |
| 258 vtrn.32 d0, d1 /* d0 = d1 = [A x4, (B+C) x4] */ | |
| 259 | |
| 260 beq 4f /* C == 0: use the horizontal two-tap path */ | |
| 261 | |
| 262 vext.32 d1, d0, d1, #1 /* d1 = [(B+C) x4, A x4], the weights in swapped order */ | |
| 263 add r5, r1, r2 | |
| 264 lsl r4, r2, #1 | |
| 265 vld1.32 {d4[0]}, [r1], r4 | |
| 266 vld1.32 {d4[1]}, [r5], r4 | |
| 267 | |
| 268 3: pld [r5] /* vertical two-tap loop: d4 always holds two consecutive source rows */ | |
| 269 vmull.u8 q8, d4, d0 | |
| 270 vld1.32 {d4[0]}, [r1], r4 | |
| 271 vmull.u8 q9, d4, d1 /* lane 0 now holds the newer row, hence the swapped weights */ | |
| 272 vld1.32 {d4[1]}, [r5], r4 | |
| 273 vadd.i16 d16, d16, d17 | |
| 274 vadd.i16 d17, d18, d19 | |
| 275 vrshrn.u16 d16, q8, #6 | |
| 276 .ifc \type,avg | |
| 277 vld1.32 {d20[0]}, [lr,:32], r2 | |
| 278 vld1.32 {d20[1]}, [lr,:32], r2 | |
| 279 vrhadd.u8 d16, d16, d20 | |
| 280 .endif | |
| 281 subs r3, r3, #2 | |
| 282 pld [r1] | |
| 283 vst1.32 {d16[0]}, [r0,:32], r2 | |
| 284 vst1.32 {d16[1]}, [r0,:32], r2 | |
| 285 bgt 3b | |
| 286 | |
| 287 pop {r4-r7, pc} | |
| 288 | |
| 289 4: vld1.64 {d4}, [r1], r2 | |
| 290 vld1.64 {d6}, [r1], r2 | |
| 291 vext.8 d5, d4, d5, #1 | |
| 292 vext.8 d7, d6, d7, #1 | |
| 293 vtrn.32 d4, d5 | |
| 294 vtrn.32 d6, d7 | |
| 295 | |
| 296 5: vmull.u8 q8, d4, d0 /* horizontal two-tap loop: each output row blends a pixel with its right-hand neighbour */ | |
| 297 vmull.u8 q9, d6, d0 | |
| 298 subs r3, r3, #2 | |
| 299 vld1.64 {d4}, [r1], r2 | |
| 300 vext.8 d5, d4, d5, #1 | |
| 301 vtrn.32 d4, d5 | |
| 302 vadd.i16 d16, d16, d17 | |
| 303 vadd.i16 d17, d18, d19 | |
| 304 pld [r1] | |
| 305 vrshrn.u16 d16, q8, #6 | |
| 306 .ifc \type,avg | |
| 307 vld1.32 {d20[0]}, [lr,:32], r2 | |
| 308 vld1.32 {d20[1]}, [lr,:32], r2 | |
| 309 vrhadd.u8 d16, d16, d20 | |
| 310 .endif | |
| 311 vld1.64 {d6}, [r1], r2 | |
| 312 vext.8 d7, d6, d7, #1 | |
| 313 vtrn.32 d6, d7 | |
| 314 pld [r1] | |
| 315 vst1.32 {d16[0]}, [r0,:32], r2 | |
| 316 vst1.32 {d16[1]}, [r0,:32], r2 | |
| 317 bgt 5b | |
| 318 | |
| 319 pop {r4-r7, pc} | |
| 320 endfunc | |
| 321 .endm | |
| 322 /* Bilinear chroma interpolation, 2 pixels wide, two rows per iteration; the macro argument is put or avg. Register use mirrors the wider versions: r0=dst, r1=src, r2=stride, r3=h, with x and y read from the stack. When x and y are both zero the block is simply copied (put) or averaged with dst (avg). */ | |
| 323 .macro h264_chroma_mc2 type | |
| 324 function ff_\type\()_h264_chroma_mc2_neon, export=1 | |
| 325 push {r4-r6, lr} | |
| 326 ldr r4, [sp, #16] /* r4 = x: first stack argument, above the 16 bytes just pushed */ | |
| 327 ldr lr, [sp, #20] /* lr = y */ | |
| 328 pld [r1] | |
| 329 pld [r1, r2] | |
| 330 orrs r5, r4, lr | |
| 331 beq 2f /* x and y both zero: no interpolation needed */ | |
| 332 | |
| 333 mul r5, r4, lr /* r5 = D = x*y */ | |
| 334 rsb r6, r5, lr, lsl #3 /* r6 = C = 8*y - x*y */ | |
| 335 rsb r12, r5, r4, lsl #3 /* r12 = B = 8*x - x*y */ | |
| 336 sub r4, r5, r4, lsl #3 | |
| 337 sub r4, r4, lr, lsl #3 | |
| 338 add r4, r4, #64 /* r4 = A = x*y - 8*x - 8*y + 64 */ | |
| 339 vdup.8 d0, r4 | |
| 340 vdup.8 d2, r12 | |
| 341 vdup.8 d1, r6 | |
| 342 vdup.8 d3, r5 | |
| 343 vtrn.16 q0, q1 /* d0 = A,A,B,B repeated; d1 = C,C,D,D repeated */ | |
| 344 1: | |
| 345 vld1.32 {d4[0]}, [r1], r2 | |
| 346 vld1.32 {d4[1]}, [r1], r2 | |
| 347 vrev64.32 d5, d4 | |
| 348 vld1.32 {d5[1]}, [r1] /* third row read without advancing r1: it is the first row of the next iteration */ | |
| 349 vext.8 q3, q2, q2, #1 | |
| 350 vtrn.16 q2, q3 /* pair each 2-pixel group with its one-pixel-shifted copy, matching the weight layout */ | |
| 351 vmull.u8 q8, d4, d0 | |
| 352 vmlal.u8 q8, d5, d1 | |
| 353 .ifc \type,avg | |
| 354 vld1.16 {d18[0]}, [r0,:16], r2 | |
| 355 vld1.16 {d18[1]}, [r0,:16] | |
| 356 sub r0, r0, r2 /* rewind dst to the first of the two rows */ | |
| 357 .endif | |
| 358 vtrn.32 d16, d17 | |
| 359 vadd.i16 d16, d16, d17 /* add the left-tap and right-tap partial sums */ | |
| 360 vrshrn.u16 d16, q8, #6 | |
| 361 .ifc \type,avg | |
| 362 vrhadd.u8 d16, d16, d18 | |
| 363 .endif | |
| 364 vst1.16 {d16[0]}, [r0,:16], r2 | |
| 365 vst1.16 {d16[1]}, [r0,:16], r2 | |
| 366 subs r3, r3, #2 | |
| 367 bgt 1b | |
| 368 pop {r4-r6, pc} | |
| 369 2: | |
| 370 .ifc \type,put | |
| 371 ldrh r5, [r1], r2 /* put: copy two bytes per row, two rows per iteration */ | |
| 372 strh r5, [r0], r2 | |
| 373 ldrh r6, [r1], r2 | |
| 374 strh r6, [r0], r2 | |
| 375 .else | |
| 376 vld1.16 {d16[0]}, [r1], r2 | |
| 377 vld1.16 {d16[1]}, [r1], r2 | |
| 378 vld1.16 {d18[0]}, [r0,:16], r2 | |
| 379 vld1.16 {d18[1]}, [r0,:16] | |
| 380 sub r0, r0, r2 | |
| 381 vrhadd.u8 d16, d16, d18 /* avg: rounding average of src and dst */ | |
| 382 vst1.16 {d16[0]}, [r0,:16], r2 | |
| 383 vst1.16 {d16[1]}, [r0,:16], r2 | |
| 384 .endif | |
| 385 subs r3, r3, #2 | |
| 386 bgt 2b | |
| 387 pop {r4-r6, pc} | |
| 388 endfunc | |
| 389 .endm | |
| 390 /* Emit the put and avg variants of the 8-, 4- and 2-pixel-wide chroma MC functions into the text section. */ | |
| 391 .text | |
| 392 .align | |
| 393 | |
| 394 h264_chroma_mc8 put | |
| 395 h264_chroma_mc8 avg | |
| 396 h264_chroma_mc4 put | |
| 397 h264_chroma_mc4 avg | |
| 398 h264_chroma_mc2 put | |
| 399 h264_chroma_mc2 avg | |
| 400 | |
| 401 /* H.264 loop filter */ | |
| 402 /* Common prologue for the loop filters: loads four bytes through the pointer passed as first stack argument (presumably the tc0 table - confirm against the C prototype) into d24[0], and returns to the caller at once when r2 or r3 is zero or when all four bytes have bit 7 set. Clobbers ip and the flags. */ | |
| 403 .macro h264_loop_filter_start | |
| 404 ldr ip, [sp] /* first stack argument: a pointer */ | |
| 405 tst r2, r2 | |
| 406 ldr ip, [ip] /* the four bytes it points at */ | |
| 407 tstne r3, r3 /* only evaluated when r2 != 0, so Z ends up set if either r2 or r3 is zero */ | |
| 408 vmov.32 d24[0], ip | |
| 409 and ip, ip, ip, lsl #16 /* no S suffix: the flags from the tst pair survive for bxeq */ | |
| 410 bxeq lr | |
| 411 ands ip, ip, ip, lsl #8 /* bit 31 is now the AND of bit 7 of all four bytes */ | |
| 412 bxlt lr | |
| 413 .endm | |
| 414 /* Save d8-d15 on the stack at a 16-byte-aligned address. Leaves the alignment adjustment in ip, which align_pop_regs needs to restore sp, so ip must stay intact in between. */ | |
| 415 .macro align_push_regs | |
| 416 and ip, sp, #15 /* misalignment of sp in bytes */ | |
| 417 add ip, ip, #32 /* plus room for d12-d15 */ | |
| 418 sub sp, sp, ip /* sp is now 16-byte aligned */ | |
| 419 vst1.64 {d12-d15}, [sp,:128] | |
| 420 sub sp, sp, #32 | |
| 421 vst1.64 {d8-d11}, [sp,:128] | |
| 422 .endm | |
| 423 /* Restore d8-d15 saved by align_push_regs and put sp back to its value before the push; expects ip to still hold the adjustment computed there. */ | |
| 424 .macro align_pop_regs | |
| 425 vld1.64 {d8-d11}, [sp,:128]! /* writeback advances sp by 32 */ | |
| 426 vld1.64 {d12-d15}, [sp,:128], ip /* post-increment by ip undoes the aligned allocation */ | |
| 427 .endm | |
| 428 /* Luma edge filter core for 16 pixels at once. In: q10=p2, q9=p1, q8=p0, q0=q0, q1=q1, q2=q2, r2=alpha, r3=beta, d24[0]=four per-group strength bytes from h264_loop_filter_start. Out (as stored by the callers): q4=p1', q8=p0', q0=q0', q5=q1'. Overwrites q4-q7, so callers save d8-d15 first. */ | |
| 429 .macro h264_loop_filter_luma | |
| 430 vdup.8 q11, r2 @ alpha | |
| 431 vmovl.u8 q12, d24 /* this and the three q12 steps below replicate each of the four strength bytes over four adjacent byte lanes */ | |
| 432 vabd.u8 q6, q8, q0 @ abs(p0 - q0) | |
| 433 vmovl.u16 q12, d24 | |
| 434 vabd.u8 q14, q9, q8 @ abs(p1 - p0) | |
| 435 vsli.16 q12, q12, #8 | |
| 436 vabd.u8 q15, q1, q0 @ abs(q1 - q0) | |
| 437 vsli.32 q12, q12, #16 | |
| 438 vclt.u8 q6, q6, q11 @ < alpha | |
| 439 vdup.8 q11, r3 @ beta | |
| 440 vclt.s8 q7, q12, #0 /* lanes whose strength byte is negative */ | |
| 441 vclt.u8 q14, q14, q11 @ < beta | |
| 442 vclt.u8 q15, q15, q11 @ < beta | |
| 443 vbic q6, q6, q7 /* never filter lanes with a negative strength byte */ | |
| 444 vabd.u8 q4, q10, q8 @ abs(p2 - p0) | |
| 445 vand q6, q6, q14 | |
| 446 vabd.u8 q5, q2, q0 @ abs(q2 - q0) | |
| 447 vclt.u8 q4, q4, q11 @ < beta | |
| 448 vand q6, q6, q15 /* q6 = final per-lane filter mask */ | |
| 449 vclt.u8 q5, q5, q11 @ < beta | |
| 450 vand q4, q4, q6 /* q4 = lanes where p1 is also modified */ | |
| 451 vand q5, q5, q6 /* q5 = lanes where q1 is also modified */ | |
| 452 vand q12, q12, q6 /* strength forced to 0 in unfiltered lanes */ | |
| 453 vrhadd.u8 q14, q8, q0 | |
| 454 vsub.i8 q6, q12, q4 /* masks are 0 or -1, so subtracting one adds 1 to the clip limit */ | |
| 455 vqadd.u8 q7, q9, q12 | |
| 456 vhadd.u8 q10, q10, q14 | |
| 457 vsub.i8 q6, q6, q5 | |
| 458 vhadd.u8 q14, q2, q14 | |
| 459 vmin.u8 q7, q7, q10 | |
| 460 vqsub.u8 q11, q9, q12 | |
| 461 vqadd.u8 q2, q1, q12 | |
| 462 vmax.u8 q7, q7, q11 /* q7 = candidate p1', kept within p1 +/- strength */ | |
| 463 vqsub.u8 q11, q1, q12 | |
| 464 vmin.u8 q14, q2, q14 | |
| 465 vmovl.u8 q2, d0 | |
| 466 vmax.u8 q14, q14, q11 /* q14 = candidate q1', kept within q1 +/- strength */ | |
| 467 vmovl.u8 q10, d1 | |
| 468 vsubw.u8 q2, q2, d16 | |
| 469 vsubw.u8 q10, q10, d17 | |
| 470 vshl.i16 q2, q2, #2 | |
| 471 vshl.i16 q10, q10, #2 | |
| 472 vaddw.u8 q2, q2, d18 | |
| 473 vaddw.u8 q10, q10, d19 | |
| 474 vsubw.u8 q2, q2, d2 | |
| 475 vsubw.u8 q10, q10, d3 | |
| 476 vrshrn.i16 d4, q2, #3 /* delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3, low eight lanes */ | |
| 477 vrshrn.i16 d5, q10, #3 /* same for the high eight lanes */ | |
| 478 vbsl q4, q7, q9 /* q4 = p1' where selected, original p1 elsewhere */ | |
| 479 vbsl q5, q14, q1 /* q5 = q1' where selected, original q1 elsewhere */ | |
| 480 vneg.s8 q7, q6 | |
| 481 vmovl.u8 q14, d16 | |
| 482 vmin.s8 q2, q2, q6 /* clip delta to the range [-limit, +limit] */ | |
| 483 vmovl.u8 q6, d17 | |
| 484 vmax.s8 q2, q2, q7 | |
| 485 vmovl.u8 q11, d0 | |
| 486 vmovl.u8 q12, d1 | |
| 487 vaddw.s8 q14, q14, d4 /* p0 + delta */ | |
| 488 vaddw.s8 q6, q6, d5 | |
| 489 vsubw.s8 q11, q11, d4 /* q0 - delta */ | |
| 490 vsubw.s8 q12, q12, d5 | |
| 491 vqmovun.s16 d16, q14 /* saturate back to unsigned bytes */ | |
| 492 vqmovun.s16 d17, q6 | |
| 493 vqmovun.s16 d0, q11 | |
| 494 vqmovun.s16 d1, q12 | |
| 495 .endm | |
| 496 /* Luma filter across a horizontal edge, 16 pixels wide: r0 = first row below the edge (16-byte aligned), r1 = stride, r2 = alpha, r3 = beta, strength pointer on the stack. Reads three rows on each side and rewrites the two rows nearest the edge on each side. */ | |
| 497 function ff_h264_v_loop_filter_luma_neon, export=1 | |
| 498 h264_loop_filter_start | |
| 499 | |
| 500 vld1.64 {d0, d1}, [r0,:128], r1 /* q0, q1, q2 = the three rows from r0 downwards */ | |
| 501 vld1.64 {d2, d3}, [r0,:128], r1 | |
| 502 vld1.64 {d4, d5}, [r0,:128], r1 | |
| 503 sub r0, r0, r1, lsl #2 | |
| 504 sub r0, r0, r1, lsl #1 /* back six rows in total: three rows above the starting row */ | |
| 505 vld1.64 {d20,d21}, [r0,:128], r1 /* q10, q9, q8 = p2, p1, p0 */ | |
| 506 vld1.64 {d18,d19}, [r0,:128], r1 | |
| 507 vld1.64 {d16,d17}, [r0,:128], r1 | |
| 508 | |
| 509 align_push_regs | |
| 510 | |
| 511 h264_loop_filter_luma | |
| 512 | |
| 513 sub r0, r0, r1, lsl #1 /* r0 is back at the starting row; step up to the p1 row */ | |
| 514 vst1.64 {d8, d9}, [r0,:128], r1 | |
| 515 vst1.64 {d16,d17}, [r0,:128], r1 | |
| 516 vst1.64 {d0, d1}, [r0,:128], r1 | |
| 517 vst1.64 {d10,d11}, [r0,:128] | |
| 518 | |
| 519 align_pop_regs | |
| 520 bx lr | |
| 521 endfunc | |
| 522 /* Luma filter across a vertical edge, 16 rows high: r0 = first pixel right of the edge, r1 = stride, r2 = alpha, r3 = beta, strength pointer on the stack. Loads 8 bytes around the edge from each of 16 rows, transposes so the shared filter macro can be used, then transposes the four modified columns back and stores 4 bytes per row. */ | |
| 523 function ff_h264_h_loop_filter_luma_neon, export=1 | |
| 524 h264_loop_filter_start | |
| 525 | |
| 526 sub r0, r0, #4 /* start four pixels left of the edge */ | |
| 527 vld1.64 {d6}, [r0], r1 /* rows 0-7 go to the low halves of the q registers */ | |
| 528 vld1.64 {d20}, [r0], r1 | |
| 529 vld1.64 {d18}, [r0], r1 | |
| 530 vld1.64 {d16}, [r0], r1 | |
| 531 vld1.64 {d0}, [r0], r1 | |
| 532 vld1.64 {d2}, [r0], r1 | |
| 533 vld1.64 {d4}, [r0], r1 | |
| 534 vld1.64 {d26}, [r0], r1 | |
| 535 vld1.64 {d7}, [r0], r1 /* rows 8-15 go to the high halves */ | |
| 536 vld1.64 {d21}, [r0], r1 | |
| 537 vld1.64 {d19}, [r0], r1 | |
| 538 vld1.64 {d17}, [r0], r1 | |
| 539 vld1.64 {d1}, [r0], r1 | |
| 540 vld1.64 {d3}, [r0], r1 | |
| 541 vld1.64 {d5}, [r0], r1 | |
| 542 vld1.64 {d27}, [r0], r1 | |
| 543 | |
| 544 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 /* afterwards q10, q9, q8 = p2, p1, p0 and q0, q1, q2 = q0, q1, q2 for all 16 rows */ | |
| 545 | |
| 546 align_push_regs | |
| 547 | |
| 548 h264_loop_filter_luma | |
| 549 | |
| 550 transpose_4x4 q4, q8, q0, q5 /* back to row order: each 32-bit lane holds p1', p0', q0', q1' of one row */ | |
| 551 | |
| 552 sub r0, r0, r1, lsl #4 /* rewind the 16 rows walked by the loads */ | |
| 553 add r0, r0, #2 /* two pixels left of the edge: the p1 column */ | |
| 554 vst1.32 {d8[0]}, [r0], r1 | |
| 555 vst1.32 {d16[0]}, [r0], r1 | |
| 556 vst1.32 {d0[0]}, [r0], r1 | |
| 557 vst1.32 {d10[0]}, [r0], r1 | |
| 558 vst1.32 {d8[1]}, [r0], r1 | |
| 559 vst1.32 {d16[1]}, [r0], r1 | |
| 560 vst1.32 {d0[1]}, [r0], r1 | |
| 561 vst1.32 {d10[1]}, [r0], r1 | |
| 562 vst1.32 {d9[0]}, [r0], r1 | |
| 563 vst1.32 {d17[0]}, [r0], r1 | |
| 564 vst1.32 {d1[0]}, [r0], r1 | |
| 565 vst1.32 {d11[0]}, [r0], r1 | |
| 566 vst1.32 {d9[1]}, [r0], r1 | |
| 567 vst1.32 {d17[1]}, [r0], r1 | |
| 568 vst1.32 {d1[1]}, [r0], r1 | |
| 569 vst1.32 {d11[1]}, [r0], r1 | |
| 570 | |
| 571 align_pop_regs | |
| 572 bx lr | |
| 573 endfunc | |
| 574 /* Chroma edge filter core for 8 pixels at once. In: d18=p1, d16=p0, d0=q0, d2=q1, r2=alpha, r3=beta, d24[0]=four strength bytes from h264_loop_filter_start. Out: d16=p0', d0=q0'. Uses only d0-d5 and d16-d31, so no callee-saved NEON registers are touched. */ | |
| 575 .macro h264_loop_filter_chroma | |
| 576 vdup.8 d22, r2 @ alpha | |
| 577 vmovl.u8 q12, d24 /* widen the four strength bytes to 16-bit lanes */ | |
| 578 vabd.u8 d26, d16, d0 @ abs(p0 - q0) | |
| 579 vmovl.u8 q2, d0 | |
| 580 vabd.u8 d28, d18, d16 @ abs(p1 - p0) | |
| 581 vsubw.u8 q2, q2, d16 | |
| 582 vsli.16 d24, d24, #8 /* duplicate each strength byte: one value per pair of pixels */ | |
| 583 vshl.i16 q2, q2, #2 | |
| 584 vabd.u8 d30, d2, d0 @ abs(q1 - q0) | |
| 585 vaddw.u8 q2, q2, d18 | |
| 586 vclt.u8 d26, d26, d22 @ < alpha | |
| 587 vsubw.u8 q2, q2, d2 | |
| 588 vdup.8 d22, r3 @ beta | |
| 589 vclt.s8 d25, d24, #0 /* lanes whose strength byte is negative */ | |
| 590 vrshrn.i16 d4, q2, #3 /* delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3 */ | |
| 591 vclt.u8 d28, d28, d22 @ < beta | |
| 592 vbic d26, d26, d25 | |
| 593 vclt.u8 d30, d30, d22 @ < beta | |
| 594 vand d26, d26, d28 | |
| 595 vneg.s8 d25, d24 | |
| 596 vand d26, d26, d30 /* d26 = final per-lane filter mask */ | |
| 597 vmin.s8 d4, d4, d24 /* clip delta to at most +strength */ | |
| 598 vmovl.u8 q14, d16 | |
| 599 vand d4, d4, d26 /* zero delta in lanes that are not filtered */ | |
| 600 vmax.s8 d4, d4, d25 /* clip delta to at least -strength */ | |
| 601 vmovl.u8 q11, d0 | |
| 602 vaddw.s8 q14, q14, d4 /* p0 + delta */ | |
| 603 vsubw.s8 q11, q11, d4 /* q0 - delta */ | |
| 604 vqmovun.s16 d16, q14 | |
| 605 vqmovun.s16 d0, q11 | |
| 606 .endm | |
| 607 /* Chroma filter across a horizontal edge, 8 pixels wide: r0 = first row below the edge (8-byte aligned), r1 = stride, r2 = alpha, r3 = beta, strength pointer on the stack. Reads two rows on each side and rewrites the row adjacent to the edge on each side. */ | |
| 608 function ff_h264_v_loop_filter_chroma_neon, export=1 | |
| 609 h264_loop_filter_start | |
| 610 | |
| 611 sub r0, r0, r1, lsl #1 /* two rows up: the p1 row */ | |
| 612 vld1.64 {d18}, [r0,:64], r1 | |
| 613 vld1.64 {d16}, [r0,:64], r1 | |
| 614 vld1.64 {d0}, [r0,:64], r1 | |
| 615 vld1.64 {d2}, [r0,:64] | |
| 616 | |
| 617 h264_loop_filter_chroma | |
| 618 | |
| 619 sub r0, r0, r1, lsl #1 /* r0 was left on the q1 row; step back to the p0 row */ | |
| 620 vst1.64 {d16}, [r0,:64], r1 | |
| 621 vst1.64 {d0}, [r0,:64], r1 | |
| 622 | |
| 623 bx lr | |
| 624 endfunc | |
| 625 /* Chroma filter across a vertical edge, 8 rows high: r0 = first pixel right of the edge, r1 = stride, r2 = alpha, r3 = beta, strength pointer on the stack. Loads 4 bytes around the edge from each of 8 rows, transposes them into the column layout the filter macro expects, then transposes back and stores all 4 bytes per row. */ | |
| 626 function ff_h264_h_loop_filter_chroma_neon, export=1 | |
| 627 h264_loop_filter_start | |
| 628 | |
| 629 sub r0, r0, #2 /* start two pixels left of the edge */ | |
| 630 vld1.32 {d18[0]}, [r0], r1 | |
| 631 vld1.32 {d16[0]}, [r0], r1 | |
| 632 vld1.32 {d0[0]}, [r0], r1 | |
| 633 vld1.32 {d2[0]}, [r0], r1 | |
| 634 vld1.32 {d18[1]}, [r0], r1 | |
| 635 vld1.32 {d16[1]}, [r0], r1 | |
| 636 vld1.32 {d0[1]}, [r0], r1 | |
| 637 vld1.32 {d2[1]}, [r0], r1 | |
| 638 | |
| 639 vtrn.16 d18, d0 /* 4x4 byte transpose per 32-bit lane: d18, d16, d0, d2 become the p1, p0, q0, q1 columns */ | |
| 640 vtrn.16 d16, d2 | |
| 641 vtrn.8 d18, d16 | |
| 642 vtrn.8 d0, d2 | |
| 643 | |
| 644 h264_loop_filter_chroma | |
| 645 | |
| 646 vtrn.16 d18, d0 /* the same sequence again restores row order */ | |
| 647 vtrn.16 d16, d2 | |
| 648 vtrn.8 d18, d16 | |
| 649 vtrn.8 d0, d2 | |
| 650 | |
| 651 sub r0, r0, r1, lsl #3 /* rewind the 8 rows walked by the loads */ | |
| 652 vst1.32 {d18[0]}, [r0], r1 | |
| 653 vst1.32 {d16[0]}, [r0], r1 | |
| 654 vst1.32 {d0[0]}, [r0], r1 | |
| 655 vst1.32 {d2[0]}, [r0], r1 | |
| 656 vst1.32 {d18[1]}, [r0], r1 | |
| 657 vst1.32 {d16[1]}, [r0], r1 | |
| 658 vst1.32 {d0[1]}, [r0], r1 | |
| 659 vst1.32 {d2[1]}, [r0], r1 | |
| 660 | |
| 661 bx lr | |
| 662 endfunc | |
| 663 | |
| 664 /* H.264 qpel MC */ | |
| 665 /* lowpass_const: load the filter multipliers used by lowpass_8 and lowpass_8_1 into d6 as 16-bit lanes: d6[0] = 5, d6[1] = 20. Clobbers the core register passed as argument. */ | |
| 666 .macro lowpass_const r | |
| 667 movw \r, #5 /* low halfword = 5 */ | |
| 668 movt \r, #20 /* high halfword = 20 */ | |
| 669 vmov.32 d6[0], \r | |
| 670 .endm | |
| 671 /* lowpass_8: 6-tap filter s[0] + s[5] + 20*(s[2] + s[3]) - 5*(s[1] + s[4]) at 8 positions, for two rows at once. r0:r1 and r2:r3 are the low:high d registers of the two 16-byte input rows. narrow=1: results are rounded, shifted right by 5, saturated to bytes and written to d registers d0 and d1. narrow=0: the raw 16-bit sums are left in q registers d0 and d1. Needs d6[0]=5 and d6[1]=20 (lowpass_const). Clobbers q1, q2, q9, q10, d30, d31, plus q0 and q8 when narrowing. */ | |
| 672 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 | |
| 673 .if \narrow | |
| 674 t0 .req q0 | |
| 675 t1 .req q8 | |
| 676 .else | |
| 677 t0 .req \d0 | |
| 678 t1 .req \d1 | |
| 679 .endif | |
| 680 vext.8 d2, \r0, \r1, #2 | |
| 681 vext.8 d3, \r0, \r1, #3 | |
| 682 vaddl.u8 q1, d2, d3 /* s[2] + s[3], first row */ | |
| 683 vext.8 d4, \r0, \r1, #1 | |
| 684 vext.8 d5, \r0, \r1, #4 | |
| 685 vaddl.u8 q2, d4, d5 /* s[1] + s[4] */ | |
| 686 vext.8 d30, \r0, \r1, #5 | |
| 687 vaddl.u8 t0, \r0, d30 /* s[0] + s[5] */ | |
| 688 vext.8 d18, \r2, \r3, #2 | |
| 689 vmla.i16 t0, q1, d6[1] /* + 20 * (s[2] + s[3]) */ | |
| 690 vext.8 d19, \r2, \r3, #3 | |
| 691 vaddl.u8 q9, d18, d19 | |
| 692 vext.8 d20, \r2, \r3, #1 | |
| 693 vmls.i16 t0, q2, d6[0] /* - 5 * (s[1] + s[4]) */ | |
| 694 vext.8 d21, \r2, \r3, #4 | |
| 695 vaddl.u8 q10, d20, d21 | |
| 696 vext.8 d31, \r2, \r3, #5 | |
| 697 vaddl.u8 t1, \r2, d31 | |
| 698 vmla.i16 t1, q9, d6[1] | |
| 699 vmls.i16 t1, q10, d6[0] | |
| 700 .if \narrow | |
| 701 vqrshrun.s16 \d0, t0, #5 | |
| 702 vqrshrun.s16 \d1, t1, #5 | |
| 703 .endif | |
| 704 .unreq t0 | |
| 705 .unreq t1 | |
| 706 .endm | |
| 707 /* lowpass_8_1: single-row variant of lowpass_8. r0:r1 are the low:high d registers of one 16-byte input row; the result goes to d register d0 (narrow=1, rounded >> 5 and saturated to bytes) or stays as raw 16-bit sums in q register d0 (narrow=0). Needs d6[0]=5 and d6[1]=20. Clobbers q1, q2, d30, plus q0 when narrowing. */ | |
| 708 .macro lowpass_8_1 r0, r1, d0, narrow=1 | |
| 709 .if \narrow | |
| 710 t0 .req q0 | |
| 711 .else | |
| 712 t0 .req \d0 | |
| 713 .endif | |
| 714 vext.8 d2, \r0, \r1, #2 | |
| 715 vext.8 d3, \r0, \r1, #3 | |
| 716 vaddl.u8 q1, d2, d3 /* s[2] + s[3] */ | |
| 717 vext.8 d4, \r0, \r1, #1 | |
| 718 vext.8 d5, \r0, \r1, #4 | |
| 719 vaddl.u8 q2, d4, d5 /* s[1] + s[4] */ | |
| 720 vext.8 d30, \r0, \r1, #5 | |
| 721 vaddl.u8 t0, \r0, d30 /* s[0] + s[5] */ | |
| 722 vmla.i16 t0, q1, d6[1] /* + 20 * (s[2] + s[3]) */ | |
| 723 vmls.i16 t0, q2, d6[0] /* - 5 * (s[1] + s[4]) */ | |
| 724 .if \narrow | |
| 725 vqrshrun.s16 \d0, t0, #5 | |
| 726 .endif | |
| 727 .unreq t0 | |
| 728 .endm | |
| 729 /* lowpass_8.16: the same 6-tap filter applied to signed 16-bit samples with 32-bit accumulation. r0:r1 are two q registers holding 16 consecutive samples; l0/h0 and l1/h1 must be the d halves of r0 and r1 (r1 is overwritten with the samples shifted by 5 before they are read). The 8 results are rounded, shifted right by 10, saturated to bytes and written to d register d. Multiplies are done with shifts (20x = 16x + 4x, 5x = x + 4x). Clobbers q0-q3, q8-q10, q15 and r1. */ | |
| 730 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d | |
| 731 vext.16 q1, \r0, \r1, #2 | |
| 732 vext.16 q0, \r0, \r1, #3 | |
| 733 vaddl.s16 q9, d2, d0 /* s[2] + s[3], low four outputs */ | |
| 734 vext.16 q2, \r0, \r1, #1 | |
| 735 vaddl.s16 q1, d3, d1 /* s[2] + s[3], high four outputs */ | |
| 736 vext.16 q3, \r0, \r1, #4 | |
| 737 vaddl.s16 q10, d4, d6 /* s[1] + s[4], low */ | |
| 738 vext.16 \r1, \r0, \r1, #5 | |
| 739 vaddl.s16 q2, d5, d7 /* s[1] + s[4], high */ | |
| 740 vaddl.s16 q0, \h0, \h1 /* s[0] + s[5], high */ | |
| 741 vaddl.s16 q8, \l0, \l1 /* s[0] + s[5], low */ | |
| 742 | |
| 743 vshl.i32 q3, q9, #4 | |
| 744 vshl.i32 q9, q9, #2 | |
| 745 vshl.i32 q15, q10, #2 | |
| 746 vadd.i32 q9, q9, q3 /* 20 * (s[2] + s[3]) */ | |
| 747 vadd.i32 q10, q10, q15 /* 5 * (s[1] + s[4]) */ | |
| 748 | |
| 749 vshl.i32 q3, q1, #4 | |
| 750 vshl.i32 q1, q1, #2 | |
| 751 vshl.i32 q15, q2, #2 | |
| 752 vadd.i32 q1, q1, q3 | |
| 753 vadd.i32 q2, q2, q15 | |
| 754 | |
| 755 vadd.i32 q9, q9, q8 | |
| 756 vsub.i32 q9, q9, q10 | |
| 757 | |
| 758 vadd.i32 q1, q1, q0 | |
| 759 vsub.i32 q1, q1, q2 | |
| 760 | |
| 761 vrshrn.s32 d18, q9, #10 | |
| 762 vrshrn.s32 d19, q1, #10 | |
| 763 | |
| 764 vqmovun.s16 \d, q9 | |
| 765 .endm | |
| 766 /* Horizontal lowpass of a 16x16 block written with a destination stride of 8: the left 8 columns (16 rows) are stored first, the right 8 columns follow directly after them. r0 = dst, r1 = src, r2 = src stride. Uses r4 to hold the return address and overwrites r3 and ip. */ | |
| 767 function put_h264_qpel16_h_lowpass_neon_packed | |
| 768 mov r4, lr | |
| 769 mov ip, #16 /* row count for the 8-wide routine */ | |
| 770 mov r3, #8 /* packed destination stride */ | |
| 771 bl put_h264_qpel8_h_lowpass_neon | |
| 772 sub r1, r1, r2, lsl #4 /* rewind src by 16 rows */ | |
| 773 add r1, r1, #8 /* right half */ | |
| 774 mov ip, #16 | |
| 775 mov lr, r4 | |
| 776 b put_h264_qpel8_h_lowpass_neon /* tail call: returns straight to our caller */ | |
| 777 endfunc | |
| 778 /* h264_qpel_h_lowpass: emits the 16- and 8-pixel-wide horizontal lowpass routines for put or avg. r0 = dst (8-byte aligned rows), r1 = src, r2 = src stride, r3 = dst stride; the 8-wide routine takes its row count in ip. Expects d6 to hold the lowpass_const multipliers. */ | |
| 779 .macro h264_qpel_h_lowpass type | |
| 780 function \type\()_h264_qpel16_h_lowpass_neon | |
| 781 push {lr} | |
| 782 mov ip, #16 | |
| 783 bl \type\()_h264_qpel8_h_lowpass_neon /* left 8 columns, 16 rows */ | |
| 784 sub r0, r0, r3, lsl #4 /* rewind dst and src by 16 rows */ | |
| 785 sub r1, r1, r2, lsl #4 | |
| 786 add r0, r0, #8 /* move to the right 8 columns */ | |
| 787 add r1, r1, #8 | |
| 788 mov ip, #16 | |
| 789 pop {lr} /* no return instruction follows: execution appears intended to continue into the 8-wide routine defined directly below (relies on function/endfunc from asm.S emitting nothing in between - confirm) */ | |
| 790 endfunc | |
| 791 | |
| 792 function \type\()_h264_qpel8_h_lowpass_neon | |
| 793 1: vld1.64 {d0, d1}, [r1], r2 /* two source rows of 16 bytes per iteration */ | |
| 794 vld1.64 {d16,d17}, [r1], r2 | |
| 795 subs ip, ip, #2 /* flags are consumed by the bne at the loop end */ | |
| 796 lowpass_8 d0, d1, d16, d17, d0, d16 | |
| 797 .ifc \type,avg | |
| 798 vld1.8 {d2}, [r0,:64], r3 | |
| 799 vrhadd.u8 d0, d0, d2 | |
| 800 vld1.8 {d3}, [r0,:64] | |
| 801 vrhadd.u8 d16, d16, d3 | |
| 802 sub r0, r0, r3 /* back to the first of the two dst rows */ | |
| 803 .endif | |
| 804 vst1.64 {d0}, [r0,:64], r3 | |
| 805 vst1.64 {d16}, [r0,:64], r3 | |
| 806 bne 1b | |
| 807 bx lr | |
| 808 endfunc | |
| 809 .endm | |
| 810 | |
| 811 h264_qpel_h_lowpass put | |
| 812 h264_qpel_h_lowpass avg | |
| 813 /* h264_qpel_h_lowpass_l2: like h264_qpel_h_lowpass, but the filtered rows are additionally averaged (rounding) with a second source read from r3. r0 = dst, r1 = src, r3 = second source, and r2 is used as the stride for all three pointers; the 8-wide routine takes its row count in ip. Expects d6 to hold the lowpass_const multipliers. */ | |
| 814 .macro h264_qpel_h_lowpass_l2 type | |
| 815 function \type\()_h264_qpel16_h_lowpass_l2_neon | |
| 816 push {lr} | |
| 817 mov ip, #16 | |
| 818 bl \type\()_h264_qpel8_h_lowpass_l2_neon /* left 8 columns, 16 rows */ | |
| 819 sub r0, r0, r2, lsl #4 /* rewind all three pointers by 16 rows */ | |
| 820 sub r1, r1, r2, lsl #4 | |
| 821 sub r3, r3, r2, lsl #4 | |
| 822 add r0, r0, #8 /* move to the right 8 columns */ | |
| 823 add r1, r1, #8 | |
| 824 add r3, r3, #8 | |
| 825 mov ip, #16 | |
| 826 pop {lr} /* no return instruction follows: execution appears intended to continue into the 8-wide routine defined directly below (relies on function/endfunc from asm.S emitting nothing in between - confirm) */ | |
| 827 endfunc | |
| 828 | |
| 829 function \type\()_h264_qpel8_h_lowpass_l2_neon | |
| 830 1: vld1.64 {d0, d1}, [r1], r2 | |
| 831 vld1.64 {d16,d17}, [r1], r2 | |
| 832 vld1.64 {d28}, [r3], r2 /* matching two rows of the second source */ | |
| 833 vld1.64 {d29}, [r3], r2 | |
| 834 subs ip, ip, #2 | |
| 835 lowpass_8 d0, d1, d16, d17, d0, d1 | |
| 836 vrhadd.u8 q0, q0, q14 /* average the filtered rows with the second source */ | |
| 837 .ifc \type,avg | |
| 838 vld1.8 {d2}, [r0,:64], r2 | |
| 839 vrhadd.u8 d0, d0, d2 | |
| 840 vld1.8 {d3}, [r0,:64] | |
| 841 vrhadd.u8 d1, d1, d3 | |
| 842 sub r0, r0, r2 | |
| 843 .endif | |
| 844 vst1.64 {d0}, [r0,:64], r2 | |
| 845 vst1.64 {d1}, [r0,:64], r2 | |
| 846 bne 1b | |
| 847 bx lr | |
| 848 endfunc | |
| 849 .endm | |
| 850 | |
| 851 h264_qpel_h_lowpass_l2 put | |
| 852 h264_qpel_h_lowpass_l2 avg | |
| 853 /* Vertical lowpass of a 16x16 block as four 8x8 calls, written with a destination stride of 8 and without rewinding r0, so the four 8x8 results are stored back to back (left-top, left-bottom, right-top, right-bottom). r0 = dst, r1 = src, r3 = src stride. Uses r4 to hold the return address and overwrites r2. */ | |
| 854 function put_h264_qpel16_v_lowpass_neon_packed | |
| 855 mov r4, lr | |
| 856 mov r2, #8 /* packed destination stride */ | |
| 857 bl put_h264_qpel8_v_lowpass_neon | |
| 858 sub r1, r1, r3, lsl #2 /* the 8x8 routine advanced src by 12 rows; back up 4 to start the lower block */ | |
| 859 bl put_h264_qpel8_v_lowpass_neon | |
| 860 sub r1, r1, r3, lsl #4 | |
| 861 sub r1, r1, r3, lsl #2 /* 20 rows back in total: the original top row */ | |
| 862 add r1, r1, #8 /* right half */ | |
| 863 bl put_h264_qpel8_v_lowpass_neon | |
| 864 sub r1, r1, r3, lsl #2 | |
| 865 mov lr, r4 | |
| 866 b put_h264_qpel8_v_lowpass_neon /* tail call: returns straight to our caller */ | |
| 867 endfunc | |
| 868 /* h264_qpel_v_lowpass: emits the 16- and 8-pixel vertical lowpass routines for put or avg. r0 = dst (8-byte aligned rows), r1 = src, r2 = dst stride, r3 = src stride. The 8x8 routine loads 13 source rows, transposes them so that lowpass_8 can filter along what were columns, and transposes the result back. The 16-wide routine keeps the return address in r4. Overwrites d8-d15 without saving them. Expects d6 to hold the lowpass_const multipliers. */ | |
| 869 .macro h264_qpel_v_lowpass type | |
| 870 function \type\()_h264_qpel16_v_lowpass_neon | |
| 871 mov r4, lr | |
| 872 bl \type\()_h264_qpel8_v_lowpass_neon /* top-left 8x8 */ | |
| 873 sub r1, r1, r3, lsl #2 /* the 8x8 routine advanced src by 12 rows; back up 4 to start the lower block */ | |
| 874 bl \type\()_h264_qpel8_v_lowpass_neon /* bottom-left 8x8 */ | |
| 875 sub r0, r0, r2, lsl #4 /* dst back to the top row */ | |
| 876 add r0, r0, #8 | |
| 877 sub r1, r1, r3, lsl #4 | |
| 878 sub r1, r1, r3, lsl #2 /* src back 20 rows: the original top row */ | |
| 879 add r1, r1, #8 | |
| 880 bl \type\()_h264_qpel8_v_lowpass_neon /* top-right 8x8 */ | |
| 881 sub r1, r1, r3, lsl #2 | |
| 882 mov lr, r4 /* no return instruction follows: the bottom-right block appears to be handled by continuing into the 8x8 routine defined directly below (relies on function/endfunc from asm.S emitting nothing in between - confirm) */ | |
| 883 endfunc | |
| 884 | |
| 885 function \type\()_h264_qpel8_v_lowpass_neon | |
| 886 vld1.64 {d8}, [r1], r3 /* source rows 0-7 go to the low halves of q4-q7 and q11-q14 */ | |
| 887 vld1.64 {d10}, [r1], r3 | |
| 888 vld1.64 {d12}, [r1], r3 | |
| 889 vld1.64 {d14}, [r1], r3 | |
| 890 vld1.64 {d22}, [r1], r3 | |
| 891 vld1.64 {d24}, [r1], r3 | |
| 892 vld1.64 {d26}, [r1], r3 | |
| 893 vld1.64 {d28}, [r1], r3 | |
| 894 vld1.64 {d9}, [r1], r3 /* source rows 8-12 go to the high halves */ | |
| 895 vld1.64 {d11}, [r1], r3 | |
| 896 vld1.64 {d13}, [r1], r3 | |
| 897 vld1.64 {d15}, [r1], r3 | |
| 898 vld1.64 {d23}, [r1] /* 13th row, no post-increment: r1 ends 12 rows below where it started */ | |
| 899 | |
| 900 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 /* each q register now holds one source column, top to bottom */ | |
| 901 lowpass_8 d8, d9, d10, d11, d8, d10 | |
| 902 lowpass_8 d12, d13, d14, d15, d12, d14 | |
| 903 lowpass_8 d22, d23, d24, d25, d22, d24 | |
| 904 lowpass_8 d26, d27, d28, d29, d26, d28 | |
| 905 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28 /* back to row order */ | |
| 906 | |
| 907 .ifc \type,avg | |
| 908 vld1.8 {d9}, [r0,:64], r2 | |
| 909 vrhadd.u8 d8, d8, d9 | |
| 910 vld1.8 {d11}, [r0,:64], r2 | |
| 911 vrhadd.u8 d10, d10, d11 | |
| 912 vld1.8 {d13}, [r0,:64], r2 | |
| 913 vrhadd.u8 d12, d12, d13 | |
| 914 vld1.8 {d15}, [r0,:64], r2 | |
| 915 vrhadd.u8 d14, d14, d15 | |
| 916 vld1.8 {d23}, [r0,:64], r2 | |
| 917 vrhadd.u8 d22, d22, d23 | |
| 918 vld1.8 {d25}, [r0,:64], r2 | |
| 919 vrhadd.u8 d24, d24, d25 | |
| 920 vld1.8 {d27}, [r0,:64], r2 | |
| 921 vrhadd.u8 d26, d26, d27 | |
| 922 vld1.8 {d29}, [r0,:64], r2 | |
| 923 vrhadd.u8 d28, d28, d29 | |
| 924 sub r0, r0, r2, lsl #3 /* rewind dst by the 8 rows just read */ | |
| 925 .endif | |
| 926 | |
| 927 vst1.64 {d8}, [r0,:64], r2 | |
| 928 vst1.64 {d10}, [r0,:64], r2 | |
| 929 vst1.64 {d12}, [r0,:64], r2 | |
| 930 vst1.64 {d14}, [r0,:64], r2 | |
| 931 vst1.64 {d22}, [r0,:64], r2 | |
| 932 vst1.64 {d24}, [r0,:64], r2 | |
| 933 vst1.64 {d26}, [r0,:64], r2 | |
| 934 vst1.64 {d28}, [r0,:64], r2 | |
| 935 | |
| 936 bx lr | |
| 937 endfunc | |
| 938 .endm | |
| 939 | |
| 940 h264_qpel_v_lowpass put | |
| 941 h264_qpel_v_lowpass avg | |
| 942 /* h264_qpel_v_lowpass_l2: like h264_qpel_v_lowpass, but the filtered 8x8 block is additionally averaged (rounding) with a second source read from ip. r0 = dst and r1 = src both step by r3; ip = second source, stepping by r2. The 16-wide routine keeps the return address in r4. Overwrites d8-d15 without saving them. Expects d6 to hold the lowpass_const multipliers. */ | |
| 943 .macro h264_qpel_v_lowpass_l2 type | |
| 944 function \type\()_h264_qpel16_v_lowpass_l2_neon | |
| 945 mov r4, lr | |
| 946 bl \type\()_h264_qpel8_v_lowpass_l2_neon /* top-left 8x8 */ | |
| 947 sub r1, r1, r3, lsl #2 /* the 8x8 routine advanced src by 12 rows; back up 4 to start the lower block */ | |
| 948 bl \type\()_h264_qpel8_v_lowpass_l2_neon /* bottom-left 8x8 */ | |
| 949 sub r0, r0, r3, lsl #4 /* dst and second source back to the top row */ | |
| 950 sub ip, ip, r2, lsl #4 | |
| 951 add r0, r0, #8 | |
| 952 add ip, ip, #8 | |
| 953 sub r1, r1, r3, lsl #4 | |
| 954 sub r1, r1, r3, lsl #2 /* src back 20 rows: the original top row */ | |
| 955 add r1, r1, #8 | |
| 956 bl \type\()_h264_qpel8_v_lowpass_l2_neon /* top-right 8x8 */ | |
| 957 sub r1, r1, r3, lsl #2 | |
| 958 mov lr, r4 /* no return instruction follows: the bottom-right block appears to be handled by continuing into the 8x8 routine defined directly below (relies on function/endfunc from asm.S emitting nothing in between - confirm) */ | |
| 959 endfunc | |
| 960 | |
| 961 function \type\()_h264_qpel8_v_lowpass_l2_neon | |
| 962 vld1.64 {d8}, [r1], r3 /* source rows 0-7 go to the low halves of q4-q7 and q11-q14 */ | |
| 963 vld1.64 {d10}, [r1], r3 | |
| 964 vld1.64 {d12}, [r1], r3 | |
| 965 vld1.64 {d14}, [r1], r3 | |
| 966 vld1.64 {d22}, [r1], r3 | |
| 967 vld1.64 {d24}, [r1], r3 | |
| 968 vld1.64 {d26}, [r1], r3 | |
| 969 vld1.64 {d28}, [r1], r3 | |
| 970 vld1.64 {d9}, [r1], r3 /* source rows 8-12 go to the high halves */ | |
| 971 vld1.64 {d11}, [r1], r3 | |
| 972 vld1.64 {d13}, [r1], r3 | |
| 973 vld1.64 {d15}, [r1], r3 | |
| 974 vld1.64 {d23}, [r1] /* 13th row, no post-increment */ | |
| 975 | |
| 976 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 /* each q register now holds one source column, top to bottom */ | |
| 977 lowpass_8 d8, d9, d10, d11, d8, d9 | |
| 978 lowpass_8 d12, d13, d14, d15, d12, d13 | |
| 979 lowpass_8 d22, d23, d24, d25, d22, d23 | |
| 980 lowpass_8 d26, d27, d28, d29, d26, d27 | |
| 981 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27 /* back to row order: rows 0-7 in q4, q6, q11, q13 */ | |
| 982 | |
| 983 vld1.64 {d0}, [ip], r2 /* eight rows of the second source, averaged with the filtered rows as they arrive */ | |
| 984 vld1.64 {d1}, [ip], r2 | |
| 985 vld1.64 {d2}, [ip], r2 | |
| 986 vld1.64 {d3}, [ip], r2 | |
| 987 vld1.64 {d4}, [ip], r2 | |
| 988 vrhadd.u8 q0, q0, q4 | |
| 989 vld1.64 {d5}, [ip], r2 | |
| 990 vrhadd.u8 q1, q1, q6 | |
| 991 vld1.64 {d10}, [ip], r2 | |
| 992 vrhadd.u8 q2, q2, q11 | |
| 993 vld1.64 {d11}, [ip], r2 | |
| 994 vrhadd.u8 q5, q5, q13 | |
| 995 | |
| 996 .ifc \type,avg | |
| 997 vld1.8 {d16}, [r0,:64], r3 | |
| 998 vrhadd.u8 d0, d0, d16 | |
| 999 vld1.8 {d17}, [r0,:64], r3 | |
| 1000 vrhadd.u8 d1, d1, d17 | |
| 1001 vld1.8 {d16}, [r0,:64], r3 | |
| 1002 vrhadd.u8 d2, d2, d16 | |
| 1003 vld1.8 {d17}, [r0,:64], r3 | |
| 1004 vrhadd.u8 d3, d3, d17 | |
| 1005 vld1.8 {d16}, [r0,:64], r3 | |
| 1006 vrhadd.u8 d4, d4, d16 | |
| 1007 vld1.8 {d17}, [r0,:64], r3 | |
| 1008 vrhadd.u8 d5, d5, d17 | |
| 1009 vld1.8 {d16}, [r0,:64], r3 | |
| 1010 vrhadd.u8 d10, d10, d16 | |
| 1011 vld1.8 {d17}, [r0,:64], r3 | |
| 1012 vrhadd.u8 d11, d11, d17 | |
| 1013 sub r0, r0, r3, lsl #3 /* rewind dst by the 8 rows just read */ | |
| 1014 .endif | |
| 1015 | |
| 1016 vst1.64 {d0}, [r0,:64], r3 | |
| 1017 vst1.64 {d1}, [r0,:64], r3 | |
| 1018 vst1.64 {d2}, [r0,:64], r3 | |
| 1019 vst1.64 {d3}, [r0,:64], r3 | |
| 1020 vst1.64 {d4}, [r0,:64], r3 | |
| 1021 vst1.64 {d5}, [r0,:64], r3 | |
| 1022 vst1.64 {d10}, [r0,:64], r3 | |
| 1023 vst1.64 {d11}, [r0,:64], r3 | |
| 1024 | |
| 1025 bx lr | |
| 1026 endfunc | |
| 1027 .endm | |
| 1028 | |
| 1029 h264_qpel_v_lowpass_l2 put | |
| 1030 h264_qpel_v_lowpass_l2 avg | |
| 1031 /* Shared core of the 8x8 horizontal+vertical lowpass. r1 = src, r3 = src stride, r4 = 16-byte-aligned scratch buffer (at least 12 * 16 bytes are written). Pass 1 filters 13 rows horizontally, keeping 16-bit intermediates; pass 2 transposes them and runs the 16-bit filter (lowpass_8.16) along the other direction. The eight result rows are left in d12-d15 followed by d8-d11. Overwrites ip, d6 and q0-q15; r4 ends up inside the scratch buffer. */ | |
| 1032 function put_h264_qpel8_hv_lowpass_neon_top | |
| 1033 lowpass_const ip | |
| 1034 mov ip, #12 /* ip is free again: now the row counter */ | |
| 1035 1: vld1.64 {d0, d1}, [r1], r3 | |
| 1036 vld1.64 {d16,d17}, [r1], r3 | |
| 1037 subs ip, ip, #2 | |
| 1038 lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0 | |
| 1039 vst1.64 {d22-d25}, [r4,:128]! /* spill two rows of unnarrowed 16-bit sums */ | |
| 1040 bne 1b | |
| 1041 | |
| 1042 vld1.64 {d0, d1}, [r1] | |
| 1043 lowpass_8_1 d0, d1, q12, narrow=0 /* 13th row stays in q12 */ | |
| 1044 | |
| 1045 mov ip, #-16 | |
| 1046 add r4, r4, ip /* point at the last spilled row */ | |
| 1047 vld1.64 {d30,d31}, [r4,:128], ip /* reload the 12 spilled rows walking backwards: row 11 in q15, rows 10..0 in q10..q0 */ | |
| 1048 vld1.64 {d20,d21}, [r4,:128], ip | |
| 1049 vld1.64 {d18,d19}, [r4,:128], ip | |
| 1050 vld1.64 {d16,d17}, [r4,:128], ip | |
| 1051 vld1.64 {d14,d15}, [r4,:128], ip | |
| 1052 vld1.64 {d12,d13}, [r4,:128], ip | |
| 1053 vld1.64 {d10,d11}, [r4,:128], ip | |
| 1054 vld1.64 {d8, d9}, [r4,:128], ip | |
| 1055 vld1.64 {d6, d7}, [r4,:128], ip | |
| 1056 vld1.64 {d4, d5}, [r4,:128], ip | |
| 1057 vld1.64 {d2, d3}, [r4,:128], ip | |
| 1058 vld1.64 {d0, d1}, [r4,:128] /* no writeback: r4 is back at the start of the buffer */ | |
| 1059 | |
| 1060 swap4 d1, d3, d5, d7, d8, d10, d12, d14 /* swap4 + transpose16_4x4 together transpose the 16-bit data held in q0-q7 */ | |
| 1061 transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7 | |
| 1062 | |
| 1063 swap4 d17, d19, d21, d31, d24, d26, d28, d22 /* same for the remaining rows; d26-d29 and d22/d23 were not loaded above, so part of this data is undefined - presumably only lanes the filter never uses, verify */ | |
| 1064 transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11 | |
| 1065 | |
| 1066 vst1.64 {d30,d31}, [r4,:128]! /* spill the registers needed for the last four columns; they are reloaded in pairs below */ | |
| 1067 vst1.64 {d6, d7}, [r4,:128]! | |
| 1068 vst1.64 {d20,d21}, [r4,:128]! | |
| 1069 vst1.64 {d4, d5}, [r4,:128]! | |
| 1070 vst1.64 {d18,d19}, [r4,:128]! | |
| 1071 vst1.64 {d2, d3}, [r4,:128]! | |
| 1072 vst1.64 {d16,d17}, [r4,:128]! | |
| 1073 vst1.64 {d0, d1}, [r4,:128] | |
| 1074 | |
| 1075 lowpass_8.16 q4, q12, d8, d9, d24, d25, d8 | |
| 1076 lowpass_8.16 q5, q13, d10, d11, d26, d27, d9 | |
| 1077 lowpass_8.16 q6, q14, d12, d13, d28, d29, d10 | |
| 1078 lowpass_8.16 q7, q11, d14, d15, d22, d23, d11 | |
| 1079 | |
| 1080 vld1.64 {d16,d17}, [r4,:128], ip /* ip is still -16: read the spilled pairs back in reverse order */ | |
| 1081 vld1.64 {d30,d31}, [r4,:128], ip | |
| 1082 lowpass_8.16 q8, q15, d16, d17, d30, d31, d12 | |
| 1083 vld1.64 {d16,d17}, [r4,:128], ip | |
| 1084 vld1.64 {d30,d31}, [r4,:128], ip | |
| 1085 lowpass_8.16 q8, q15, d16, d17, d30, d31, d13 | |
| 1086 vld1.64 {d16,d17}, [r4,:128], ip | |
| 1087 vld1.64 {d30,d31}, [r4,:128], ip | |
| 1088 lowpass_8.16 q8, q15, d16, d17, d30, d31, d14 | |
| 1089 vld1.64 {d16,d17}, [r4,:128], ip | |
| 1090 vld1.64 {d30,d31}, [r4,:128] | |
| 1091 lowpass_8.16 q8, q15, d16, d17, d30, d31, d15 | |
| 1092 | |
| 1093 transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11 /* back to row order */ | |
| 1094 | |
| 1095 bx lr | |
| 1096 endfunc | |
| 1097 | |
| 1098 .macro h264_qpel8_hv_lowpass type /* type is "put" or "avg" (see the two expansions after .endm) */ | |
| 1099 function \type\()_h264_qpel8_hv_lowpass_neon /* Runs the shared _top routine, then writes its 8x8 byte result to r0 with stride r2. Clobbers r10 (callers in this file save it). */ | |
| 1100 mov r10, lr /* keep the return address: the bl below overwrites lr */ | |
| 1101 bl put_h264_qpel8_hv_lowpass_neon_top /* result rows come back in d12-d15, d8-d11 */ | |
| 1102 .ifc \type,avg /* avg only: rounding average (vrhadd) of each result row with the row already at r0 */ | |
| 1103 vld1.8 {d0}, [r0,:64], r2 | |
| 1104 vrhadd.u8 d12, d12, d0 | |
| 1105 vld1.8 {d1}, [r0,:64], r2 | |
| 1106 vrhadd.u8 d13, d13, d1 | |
| 1107 vld1.8 {d2}, [r0,:64], r2 | |
| 1108 vrhadd.u8 d14, d14, d2 | |
| 1109 vld1.8 {d3}, [r0,:64], r2 | |
| 1110 vrhadd.u8 d15, d15, d3 | |
| 1111 vld1.8 {d4}, [r0,:64], r2 | |
| 1112 vrhadd.u8 d8, d8, d4 | |
| 1113 vld1.8 {d5}, [r0,:64], r2 | |
| 1114 vrhadd.u8 d9, d9, d5 | |
| 1115 vld1.8 {d6}, [r0,:64], r2 | |
| 1116 vrhadd.u8 d10, d10, d6 | |
| 1117 vld1.8 {d7}, [r0,:64], r2 | |
| 1118 vrhadd.u8 d11, d11, d7 | |
| 1119 sub r0, r0, r2, lsl #3 /* rewind r0 by the 8 rows just read */ | |
| 1120 .endif | |
| 1121 vst1.64 {d12}, [r0,:64], r2 /* store 8 rows of 8 bytes; r0 ends 8 rows further down */ | |
| 1122 vst1.64 {d13}, [r0,:64], r2 | |
| 1123 vst1.64 {d14}, [r0,:64], r2 | |
| 1124 vst1.64 {d15}, [r0,:64], r2 | |
| 1125 vst1.64 {d8}, [r0,:64], r2 | |
| 1126 vst1.64 {d9}, [r0,:64], r2 | |
| 1127 vst1.64 {d10}, [r0,:64], r2 | |
| 1128 vst1.64 {d11}, [r0,:64], r2 | |
| 1129 | |
| 1130 mov lr, r10 /* restore the return address saved on entry */ | |
| 1131 bx lr | |
| 1132 endfunc | |
| 1133 .endm | |
| 1134 | |
| 1135 h264_qpel8_hv_lowpass put /* emits put_h264_qpel8_hv_lowpass_neon */ | |
| 1136 h264_qpel8_hv_lowpass avg /* emits avg_h264_qpel8_hv_lowpass_neon */ | |
| 1137 | |
| 1138 .macro h264_qpel8_hv_lowpass_l2 type /* type is "put" or "avg" (see the two expansions after .endm) */ | |
| 1139 function \type\()_h264_qpel8_hv_lowpass_l2_neon /* Like the non-l2 variant, but first averages the hv result with 64 bytes read from r2 (16-byte aligned, rows packed back to back). Output goes to r0 with stride r3. Clobbers r10; r2 is advanced by 64. */ | |
| 1140 mov r10, lr /* keep the return address: the bl below overwrites lr */ | |
| 1141 bl put_h264_qpel8_hv_lowpass_neon_top /* result rows come back in d12-d15, d8-d11 */ | |
| 1142 | |
| 1143 vld1.64 {d0, d1}, [r2,:128]! /* second operand, two 8-byte rows per q register */ | |
| 1144 vld1.64 {d2, d3}, [r2,:128]! | |
| 1145 vrhadd.u8 q0, q0, q6 /* q6 = d12,d13: result rows 0-1 */ | |
| 1146 vld1.64 {d4, d5}, [r2,:128]! | |
| 1147 vrhadd.u8 q1, q1, q7 /* q7 = d14,d15: result rows 2-3 */ | |
| 1148 vld1.64 {d6, d7}, [r2,:128]! | |
| 1149 vrhadd.u8 q2, q2, q4 /* q4 = d8,d9: result rows 4-5 */ | |
| 1150 vrhadd.u8 q3, q3, q5 /* q5 = d10,d11: result rows 6-7 */ | |
| 1151 .ifc \type,avg /* avg only: rounding average with the rows already at r0 */ | |
| 1152 vld1.8 {d16}, [r0,:64], r3 | |
| 1153 vrhadd.u8 d0, d0, d16 | |
| 1154 vld1.8 {d17}, [r0,:64], r3 | |
| 1155 vrhadd.u8 d1, d1, d17 | |
| 1156 vld1.8 {d18}, [r0,:64], r3 | |
| 1157 vrhadd.u8 d2, d2, d18 | |
| 1158 vld1.8 {d19}, [r0,:64], r3 | |
| 1159 vrhadd.u8 d3, d3, d19 | |
| 1160 vld1.8 {d20}, [r0,:64], r3 | |
| 1161 vrhadd.u8 d4, d4, d20 | |
| 1162 vld1.8 {d21}, [r0,:64], r3 | |
| 1163 vrhadd.u8 d5, d5, d21 | |
| 1164 vld1.8 {d22}, [r0,:64], r3 | |
| 1165 vrhadd.u8 d6, d6, d22 | |
| 1166 vld1.8 {d23}, [r0,:64], r3 | |
| 1167 vrhadd.u8 d7, d7, d23 | |
| 1168 sub r0, r0, r3, lsl #3 /* rewind r0 by the 8 rows just read */ | |
| 1169 .endif | |
| 1170 vst1.64 {d0}, [r0,:64], r3 /* store 8 rows of 8 bytes; r0 ends 8 rows further down */ | |
| 1171 vst1.64 {d1}, [r0,:64], r3 | |
| 1172 vst1.64 {d2}, [r0,:64], r3 | |
| 1173 vst1.64 {d3}, [r0,:64], r3 | |
| 1174 vst1.64 {d4}, [r0,:64], r3 | |
| 1175 vst1.64 {d5}, [r0,:64], r3 | |
| 1176 vst1.64 {d6}, [r0,:64], r3 | |
| 1177 vst1.64 {d7}, [r0,:64], r3 | |
| 1178 | |
| 1179 mov lr, r10 /* restore the return address saved on entry */ | |
| 1180 bx lr | |
| 1181 endfunc | |
| 1182 .endm | |
| 1183 | |
| 1184 h264_qpel8_hv_lowpass_l2 put /* emits put_h264_qpel8_hv_lowpass_l2_neon */ | |
| 1185 h264_qpel8_hv_lowpass_l2 avg /* emits avg_h264_qpel8_hv_lowpass_l2_neon */ | |
| 1186 | |
| 1187 .macro h264_qpel16_hv type /* type is "put" or "avg"; builds the 16x16 hv routines out of four 8x8 calls */ | |
| 1188 function \type\()_h264_qpel16_hv_lowpass_neon /* Order of the 8x8 blocks: top-left, bottom-left, top-right, bottom-right. Output stride is r2, source stride r3. Clobbers r9 and (via the callee) r10. */ | |
| 1189 mov r9, lr /* the 8x8 routine uses r10 for its own lr, so keep ours in r9 */ | |
| 1190 bl \type\()_h264_qpel8_hv_lowpass_neon | |
| 1191 sub r1, r1, r3, lsl #2 /* the callee left r1 12 rows down; step back 4 so the next block starts 8 rows below the first */ | |
| 1192 bl \type\()_h264_qpel8_hv_lowpass_neon | |
| 1193 sub r1, r1, r3, lsl #4 /* r1 is now 20 rows down: undo 16 ... */ | |
| 1194 sub r1, r1, r3, lsl #2 /* ... plus 4 rows, */ | |
| 1195 add r1, r1, #8 /* and move 8 bytes right for the right half */ | |
| 1196 sub r0, r0, r2, lsl #4 /* output pointer: back up 16 rows, */ | |
| 1197 add r0, r0, #8 /* 8 bytes right */ | |
| 1198 bl \type\()_h264_qpel8_hv_lowpass_neon | |
| 1199 sub r1, r1, r3, lsl #2 | |
| 1200 mov lr, r9 /* restore our return address, then tail-call the last block */ | |
| 1201 b \type\()_h264_qpel8_hv_lowpass_neon | |
| 1202 endfunc | |
| 1203 | |
| 1204 function \type\()_h264_qpel16_hv_lowpass_l2_neon /* Same block order as above, but using the l2 8x8 routine; output stride is r3 here. */ | |
| 1205 mov r9, lr /* the 8x8 routine uses r10 for its own lr, so keep ours in r9 */ | |
| 1206 sub r2, r4, #256 /* second operand: the 256 bytes immediately below r4; each l2 call consumes 64 of them */ | |
| 1207 bl \type\()_h264_qpel8_hv_lowpass_l2_neon | |
| 1208 sub r1, r1, r3, lsl #2 /* step r1 back 4 rows, as in the function above */ | |
| 1209 bl \type\()_h264_qpel8_hv_lowpass_l2_neon | |
| 1210 sub r1, r1, r3, lsl #4 /* rewind the source to the top ... */ | |
| 1211 sub r1, r1, r3, lsl #2 | |
| 1212 add r1, r1, #8 /* ... and move 8 bytes right */ | |
| 1213 sub r0, r0, r3, lsl #4 /* output pointer: back up 16 rows, */ | |
| 1214 add r0, r0, #8 /* 8 bytes right */ | |
| 1215 bl \type\()_h264_qpel8_hv_lowpass_l2_neon | |
| 1216 sub r1, r1, r3, lsl #2 | |
| 1217 mov lr, r9 /* restore our return address, then tail-call the last block */ | |
| 1218 b \type\()_h264_qpel8_hv_lowpass_l2_neon | |
| 1219 endfunc | |
| 1220 .endm | |
| 1221 | |
| 1222 h264_qpel16_hv put /* emits the put_ pair of 16x16 hv routines */ | |
| 1223 h264_qpel16_hv avg /* emits the avg_ pair of 16x16 hv routines */ | |
| 1224 | |
| 1225 .macro h264_qpel8 type /* Exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon for type = put/avg. Register use on entry is presumably r0 = dst, r1 = src, r2 = stride - TODO confirm against the C prototype. Several entry points only adjust registers and jump into a shared body label. */ | |
| 1226 function ff_\type\()_h264_qpel8_mc10_neon, export=1 | |
| 1227 lowpass_const r3 /* r3 is only lent to the macro; it is overwritten on the next line */ | |
| 1228 mov r3, r1 /* r3 = unmodified r1, presumably the second operand of the l2 average */ | |
| 1229 sub r1, r1, #2 /* start 2 bytes to the left */ | |
| 1230 mov ip, #8 /* presumably the row count expected by the callee */ | |
| 1231 b \type\()_h264_qpel8_h_lowpass_l2_neon /* tail call: the callee returns to our caller */ | |
| 1232 endfunc | |
| 1233 | |
| 1234 function ff_\type\()_h264_qpel8_mc20_neon, export=1 | |
| 1235 lowpass_const r3 | |
| 1236 sub r1, r1, #2 /* start 2 bytes to the left */ | |
| 1237 mov r3, r2 | |
| 1238 mov ip, #8 | |
| 1239 b \type\()_h264_qpel8_h_lowpass_neon /* tail call */ | |
| 1240 endfunc | |
| 1241 | |
| 1242 function ff_\type\()_h264_qpel8_mc30_neon, export=1 | |
| 1243 lowpass_const r3 | |
| 1244 add r3, r1, #1 /* same as mc10 except that r3 = r1 + 1 */ | |
| 1245 sub r1, r1, #2 | |
| 1246 mov ip, #8 | |
| 1247 b \type\()_h264_qpel8_h_lowpass_l2_neon /* tail call */ | |
| 1248 endfunc | |
| 1249 | |
| 1250 function ff_\type\()_h264_qpel8_mc01_neon, export=1 | |
| 1251 push {lr} | |
| 1252 mov ip, r1 /* ip = unmodified r1 (mc03 passes r1 + r2 instead) */ | |
| 1253 \type\()_h264_qpel8_mc01: /* shared body; mc03 jumps here with lr already pushed */ | |
| 1254 lowpass_const r3 | |
| 1255 mov r3, r2 | |
| 1256 sub r1, r1, r2, lsl #1 /* start 2 rows above */ | |
| 1257 vpush {d8-d15} /* d8-d15 are saved and restored around the NEON routine */ | |
| 1258 bl \type\()_h264_qpel8_v_lowpass_l2_neon | |
| 1259 vpop {d8-d15} | |
| 1260 pop {pc} | |
| 1261 endfunc | |
| 1262 | |
| 1263 function ff_\type\()_h264_qpel8_mc11_neon, export=1 | |
| 1264 push {r0, r1, r11, lr} /* r0/r1 are saved so they can be reloaded for the second pass */ | |
| 1265 \type\()_h264_qpel8_mc11: /* shared body; mc31, mc13 and mc33 jump here after the same push with r1 adjusted */ | |
| 1266 lowpass_const r3 | |
| 1267 mov r11, sp /* r11 remembers the unaligned sp (points at the saved r0/r1) */ | |
| 1268 bic sp, sp, #15 /* align sp down to 16 bytes */ | |
| 1269 sub sp, sp, #64 /* 8x8 byte temporary */ | |
| 1270 mov r0, sp /* first pass writes into the temporary */ | |
| 1271 sub r1, r1, #2 | |
| 1272 mov r3, #8 /* r3 = 8, matching the 8-byte rows of the temporary (presumably its stride) */ | |
| 1273 mov ip, #8 | |
| 1274 vpush {d8-d15} /* 64 bytes: sp is now 64 below the temporary */ | |
| 1275 bl put_h264_qpel8_h_lowpass_neon | |
| 1276 ldrd r0, [r11] /* reload r0 and r1 as saved by the entry point's push */ | |
| 1277 mov r3, r2 | |
| 1278 add ip, sp, #64 /* ip = temporary (skip the 64 bytes of saved d8-d15) */ | |
| 1279 sub r1, r1, r2, lsl #1 /* start 2 rows above */ | |
| 1280 mov r2, #8 | |
| 1281 bl \type\()_h264_qpel8_v_lowpass_l2_neon | |
| 1282 vpop {d8-d15} | |
| 1283 add sp, r11, #8 /* drop the temporary and the saved r0/r1 slots */ | |
| 1284 pop {r11, pc} | |
| 1285 endfunc | |
| 1286 | |
| 1287 function ff_\type\()_h264_qpel8_mc21_neon, export=1 | |
| 1288 push {r0, r1, r4, r10, r11, lr} /* r4 and r10 are used by the hv routines */ | |
| 1289 \type\()_h264_qpel8_mc21: /* shared body; mc23 jumps here after the same push with r1 advanced one row */ | |
| 1290 lowpass_const r3 | |
| 1291 mov r11, sp | |
| 1292 bic sp, sp, #15 /* align sp down to 16 bytes */ | |
| 1293 sub sp, sp, #(8*8+16*12) /* 64-byte 8x8 temporary followed by 192 bytes of hv scratch */ | |
| 1294 sub r1, r1, #2 | |
| 1295 mov r3, #8 | |
| 1296 mov r0, sp /* first pass writes at the start of the temporary */ | |
| 1297 mov ip, #8 | |
| 1298 vpush {d8-d15} | |
| 1299 bl put_h264_qpel8_h_lowpass_neon | |
| 1300 mov r4, r0 /* hv scratch = r0 as left by the first pass; presumably temporary + 64, given the r4 - 64 below */ | |
| 1301 ldrd r0, [r11] /* reload the saved r0 and r1 */ | |
| 1302 sub r1, r1, r2, lsl #1 /* start 2 rows above */ | |
| 1303 sub r1, r1, #2 /* and 2 bytes to the left */ | |
| 1304 mov r3, r2 | |
| 1305 sub r2, r4, #64 /* r2 = the 64 bytes below the scratch, i.e. the first-pass result */ | |
| 1306 bl \type\()_h264_qpel8_hv_lowpass_l2_neon | |
| 1307 vpop {d8-d15} | |
| 1308 add sp, r11, #8 /* drop the buffers and the saved r0/r1 slots */ | |
| 1309 pop {r4, r10, r11, pc} | |
| 1310 endfunc | |
| 1311 | |
| 1312 function ff_\type\()_h264_qpel8_mc31_neon, export=1 | |
| 1313 add r1, r1, #1 /* the saved copy of r1 (used by the second pass) is r1 + 1 ... */ | |
| 1314 push {r0, r1, r11, lr} | |
| 1315 sub r1, r1, #1 /* ... while the first pass sees the original r1 */ | |
| 1316 b \type\()_h264_qpel8_mc11 | |
| 1317 endfunc | |
| 1318 | |
| 1319 function ff_\type\()_h264_qpel8_mc02_neon, export=1 | |
| 1320 push {lr} | |
| 1321 lowpass_const r3 | |
| 1322 sub r1, r1, r2, lsl #1 /* start 2 rows above */ | |
| 1323 mov r3, r2 | |
| 1324 vpush {d8-d15} | |
| 1325 bl \type\()_h264_qpel8_v_lowpass_neon | |
| 1326 vpop {d8-d15} | |
| 1327 pop {pc} | |
| 1328 endfunc | |
| 1329 | |
| 1330 function ff_\type\()_h264_qpel8_mc12_neon, export=1 | |
| 1331 push {r0, r1, r4, r10, r11, lr} | |
| 1332 \type\()_h264_qpel8_mc12: /* shared body; mc32 jumps here after the same push with r1 advanced by 1 */ | |
| 1333 lowpass_const r3 | |
| 1334 mov r11, sp | |
| 1335 bic sp, sp, #15 /* align sp down to 16 bytes */ | |
| 1336 sub sp, sp, #(8*8+16*12) /* 64-byte 8x8 temporary followed by 192 bytes of hv scratch */ | |
| 1337 sub r1, r1, r2, lsl #1 /* start 2 rows above */ | |
| 1338 mov r3, r2 | |
| 1339 mov r2, #8 | |
| 1340 mov r0, sp /* first pass writes at the start of the temporary */ | |
| 1341 vpush {d8-d15} | |
| 1342 bl put_h264_qpel8_v_lowpass_neon | |
| 1343 mov r4, r0 /* hv scratch = r0 as left by the first pass */ | |
| 1344 ldrd r0, [r11] /* reload the saved r0 and r1 */ | |
| 1345 sub r1, r1, r3, lsl #1 /* r3 is expected to still hold the value copied from r2 above */ | |
| 1346 sub r1, r1, #2 | |
| 1347 sub r2, r4, #64 /* r2 = the 64 bytes below the scratch, i.e. the first-pass result */ | |
| 1348 bl \type\()_h264_qpel8_hv_lowpass_l2_neon | |
| 1349 vpop {d8-d15} | |
| 1350 add sp, r11, #8 /* drop the buffers and the saved r0/r1 slots */ | |
| 1351 pop {r4, r10, r11, pc} | |
| 1352 endfunc | |
| 1353 | |
| 1354 function ff_\type\()_h264_qpel8_mc22_neon, export=1 | |
| 1355 push {r4, r10, r11, lr} | |
| 1356 mov r11, sp | |
| 1357 bic sp, sp, #15 /* align sp down to 16 bytes */ | |
| 1358 sub r1, r1, r2, lsl #1 /* start 2 rows above */ | |
| 1359 sub r1, r1, #2 /* and 2 bytes to the left */ | |
| 1360 mov r3, r2 | |
| 1361 sub sp, sp, #(16*12) /* 192 bytes of hv scratch */ | |
| 1362 mov r4, sp | |
| 1363 vpush {d8-d15} | |
| 1364 bl \type\()_h264_qpel8_hv_lowpass_neon | |
| 1365 vpop {d8-d15} | |
| 1366 mov sp, r11 /* no r0/r1 slots were pushed here, so no +8 */ | |
| 1367 pop {r4, r10, r11, pc} | |
| 1368 endfunc | |
| 1369 | |
| 1370 function ff_\type\()_h264_qpel8_mc32_neon, export=1 | |
| 1371 push {r0, r1, r4, r10, r11, lr} | |
| 1372 add r1, r1, #1 /* first pass of mc12 runs one byte to the right; the saved r1 is unchanged */ | |
| 1373 b \type\()_h264_qpel8_mc12 | |
| 1374 endfunc | |
| 1375 | |
| 1376 function ff_\type\()_h264_qpel8_mc03_neon, export=1 | |
| 1377 push {lr} | |
| 1378 add ip, r1, r2 /* as mc01, but ip = r1 + r2 (one row down) */ | |
| 1379 b \type\()_h264_qpel8_mc01 | |
| 1380 endfunc | |
| 1381 | |
| 1382 function ff_\type\()_h264_qpel8_mc13_neon, export=1 | |
| 1383 push {r0, r1, r11, lr} | |
| 1384 add r1, r1, r2 /* first pass of mc11 runs one row down; the saved r1 is unchanged */ | |
| 1385 b \type\()_h264_qpel8_mc11 | |
| 1386 endfunc | |
| 1387 | |
| 1388 function ff_\type\()_h264_qpel8_mc23_neon, export=1 | |
| 1389 push {r0, r1, r4, r10, r11, lr} | |
| 1390 add r1, r1, r2 /* first pass of mc21 runs one row down; the saved r1 is unchanged */ | |
| 1391 b \type\()_h264_qpel8_mc21 | |
| 1392 endfunc | |
| 1393 | |
| 1394 function ff_\type\()_h264_qpel8_mc33_neon, export=1 | |
| 1395 add r1, r1, #1 /* saved r1 (second pass) = r1 + 1 */ | |
| 1396 push {r0, r1, r11, lr} | |
| 1397 add r1, r1, r2 /* live r1 (first pass) = original r1 + r2 ... */ | |
| 1398 sub r1, r1, #1 /* ... after undoing the +1 */ | |
| 1399 b \type\()_h264_qpel8_mc11 | |
| 1400 endfunc | |
| 1401 .endm | |
| 1402 | |
| 1403 h264_qpel8 put /* emits the ff_put_h264_qpel8_mc* entry points */ | |
| 1404 h264_qpel8 avg /* emits the ff_avg_h264_qpel8_mc* entry points */ | |
| 1405 | |
| 1406 .macro h264_qpel16 type /* Exported 16x16 entry points ff_<type>_h264_qpel16_mcXY_neon for type = put/avg; same structure as the qpel8 macro. Register use on entry is presumably r0 = dst, r1 = src, r2 = stride - TODO confirm against the C prototype. */ | |
| 1407 function ff_\type\()_h264_qpel16_mc10_neon, export=1 | |
| 1408 lowpass_const r3 /* r3 is only lent to the macro; it is overwritten on the next line */ | |
| 1409 mov r3, r1 /* r3 = unmodified r1, presumably the second operand of the l2 average */ | |
| 1410 sub r1, r1, #2 /* start 2 bytes to the left */ | |
| 1411 b \type\()_h264_qpel16_h_lowpass_l2_neon /* tail call */ | |
| 1412 endfunc | |
| 1413 | |
| 1414 function ff_\type\()_h264_qpel16_mc20_neon, export=1 | |
| 1415 lowpass_const r3 | |
| 1416 sub r1, r1, #2 /* start 2 bytes to the left */ | |
| 1417 mov r3, r2 | |
| 1418 b \type\()_h264_qpel16_h_lowpass_neon /* tail call */ | |
| 1419 endfunc | |
| 1420 | |
| 1421 function ff_\type\()_h264_qpel16_mc30_neon, export=1 | |
| 1422 lowpass_const r3 | |
| 1423 add r3, r1, #1 /* same as mc10 except that r3 = r1 + 1 */ | |
| 1424 sub r1, r1, #2 | |
| 1425 b \type\()_h264_qpel16_h_lowpass_l2_neon /* tail call */ | |
| 1426 endfunc | |
| 1427 | |
| 1428 function ff_\type\()_h264_qpel16_mc01_neon, export=1 | |
| 1429 push {r4, lr} | |
| 1430 mov ip, r1 /* ip = unmodified r1 (mc03 passes r1 + r2 instead) */ | |
| 1431 \type\()_h264_qpel16_mc01: /* shared body; mc03 jumps here after the same push */ | |
| 1432 lowpass_const r3 | |
| 1433 mov r3, r2 | |
| 1434 sub r1, r1, r2, lsl #1 /* start 2 rows above */ | |
| 1435 vpush {d8-d15} /* d8-d15 are saved and restored around the NEON routine */ | |
| 1436 bl \type\()_h264_qpel16_v_lowpass_l2_neon | |
| 1437 vpop {d8-d15} | |
| 1438 pop {r4, pc} | |
| 1439 endfunc | |
| 1440 | |
| 1441 function ff_\type\()_h264_qpel16_mc11_neon, export=1 | |
| 1442 push {r0, r1, r4, r11, lr} /* r0/r1 are saved so they can be reloaded for the second pass */ | |
| 1443 \type\()_h264_qpel16_mc11: /* shared body; mc31, mc13 and mc33 jump here after the same push with r1 adjusted */ | |
| 1444 lowpass_const r3 | |
| 1445 mov r11, sp /* r11 remembers the unaligned sp (points at the saved r0/r1) */ | |
| 1446 bic sp, sp, #15 /* align sp down to 16 bytes */ | |
| 1447 sub sp, sp, #256 /* 16x16 byte temporary */ | |
| 1448 mov r0, sp /* first pass writes into the temporary */ | |
| 1449 sub r1, r1, #2 | |
| 1450 mov r3, #16 /* r3 = 16, matching the 16-byte rows of the temporary (presumably its stride) */ | |
| 1451 vpush {d8-d15} /* 64 bytes: sp is now 64 below the temporary */ | |
| 1452 bl put_h264_qpel16_h_lowpass_neon | |
| 1453 ldrd r0, [r11] /* reload r0 and r1 as saved by the entry point's push */ | |
| 1454 mov r3, r2 | |
| 1455 add ip, sp, #64 /* ip = temporary (skip the 64 bytes of saved d8-d15) */ | |
| 1456 sub r1, r1, r2, lsl #1 /* start 2 rows above */ | |
| 1457 mov r2, #16 | |
| 1458 bl \type\()_h264_qpel16_v_lowpass_l2_neon | |
| 1459 vpop {d8-d15} | |
| 1460 add sp, r11, #8 /* drop the temporary and the saved r0/r1 slots */ | |
| 1461 pop {r4, r11, pc} | |
| 1462 endfunc | |
| 1463 | |
| 1464 function ff_\type\()_h264_qpel16_mc21_neon, export=1 | |
| 1465 push {r0, r1, r4-r5, r9-r11, lr} /* r4, r9 and r10 are used by the hv routines */ | |
| 1466 \type\()_h264_qpel16_mc21: /* shared body; mc23 jumps here after the same push with r1 advanced one row */ | |
| 1467 lowpass_const r3 | |
| 1468 mov r11, sp | |
| 1469 bic sp, sp, #15 /* align sp down to 16 bytes */ | |
| 1470 sub sp, sp, #(16*16+16*12) /* 256-byte temporary followed by 192 bytes of hv scratch */ | |
| 1471 sub r1, r1, #2 | |
| 1472 mov r0, sp /* first pass writes at the start of the temporary */ | |
| 1473 vpush {d8-d15} | |
| 1474 bl put_h264_qpel16_h_lowpass_neon_packed | |
| 1475 mov r4, r0 /* hv scratch = r0 as left by the first pass; presumably temporary + 256, since the callee reads its second operand from r4 - 256 */ | |
| 1476 ldrd r0, [r11] /* reload the saved r0 and r1 */ | |
| 1477 sub r1, r1, r2, lsl #1 /* start 2 rows above */ | |
| 1478 sub r1, r1, #2 /* and 2 bytes to the left */ | |
| 1479 mov r3, r2 | |
| 1480 bl \type\()_h264_qpel16_hv_lowpass_l2_neon | |
| 1481 vpop {d8-d15} | |
| 1482 add sp, r11, #8 /* drop the buffers and the saved r0/r1 slots */ | |
| 1483 pop {r4-r5, r9-r11, pc} | |
| 1484 endfunc | |
| 1485 | |
| 1486 function ff_\type\()_h264_qpel16_mc31_neon, export=1 | |
| 1487 add r1, r1, #1 /* the saved copy of r1 (used by the second pass) is r1 + 1 ... */ | |
| 1488 push {r0, r1, r4, r11, lr} | |
| 1489 sub r1, r1, #1 /* ... while the first pass sees the original r1 */ | |
| 1490 b \type\()_h264_qpel16_mc11 | |
| 1491 endfunc | |
| 1492 | |
| 1493 function ff_\type\()_h264_qpel16_mc02_neon, export=1 | |
| 1494 push {r4, lr} | |
| 1495 lowpass_const r3 | |
| 1496 sub r1, r1, r2, lsl #1 /* start 2 rows above */ | |
| 1497 mov r3, r2 | |
| 1498 vpush {d8-d15} | |
| 1499 bl \type\()_h264_qpel16_v_lowpass_neon | |
| 1500 vpop {d8-d15} | |
| 1501 pop {r4, pc} | |
| 1502 endfunc | |
| 1503 | |
| 1504 function ff_\type\()_h264_qpel16_mc12_neon, export=1 | |
| 1505 push {r0, r1, r4-r5, r9-r11, lr} | |
| 1506 \type\()_h264_qpel16_mc12: /* shared body; mc32 jumps here after the same push with r1 advanced by 1 */ | |
| 1507 lowpass_const r3 | |
| 1508 mov r11, sp | |
| 1509 bic sp, sp, #15 /* align sp down to 16 bytes */ | |
| 1510 sub sp, sp, #(16*16+16*12) /* 256-byte temporary followed by 192 bytes of hv scratch */ | |
| 1511 sub r1, r1, r2, lsl #1 /* start 2 rows above */ | |
| 1512 mov r0, sp /* first pass writes at the start of the temporary */ | |
| 1513 mov r3, r2 | |
| 1514 vpush {d8-d15} | |
| 1515 bl put_h264_qpel16_v_lowpass_neon_packed | |
| 1516 mov r4, r0 /* hv scratch = r0 as left by the first pass */ | |
| 1517 ldrd r0, [r11] /* reload the saved r0 and r1 */ | |
| 1518 sub r1, r1, r3, lsl #1 /* r3 is expected to still hold the value copied from r2 above */ | |
| 1519 sub r1, r1, #2 | |
| 1520 mov r2, r3 /* NOTE(review): the callee's first data instruction recomputes r2 from r4, so this copy looks redundant */ | |
| 1521 bl \type\()_h264_qpel16_hv_lowpass_l2_neon | |
| 1522 vpop {d8-d15} | |
| 1523 add sp, r11, #8 /* drop the buffers and the saved r0/r1 slots */ | |
| 1524 pop {r4-r5, r9-r11, pc} | |
| 1525 endfunc | |
| 1526 | |
| 1527 function ff_\type\()_h264_qpel16_mc22_neon, export=1 | |
| 1528 push {r4, r9-r11, lr} | |
| 1529 lowpass_const r3 | |
| 1530 mov r11, sp | |
| 1531 bic sp, sp, #15 /* align sp down to 16 bytes */ | |
| 1532 sub r1, r1, r2, lsl #1 /* start 2 rows above */ | |
| 1533 sub r1, r1, #2 /* and 2 bytes to the left */ | |
| 1534 mov r3, r2 | |
| 1535 sub sp, sp, #(16*12) /* 192 bytes of hv scratch */ | |
| 1536 mov r4, sp | |
| 1537 vpush {d8-d15} | |
| 1538 bl \type\()_h264_qpel16_hv_lowpass_neon | |
| 1539 vpop {d8-d15} | |
| 1540 mov sp, r11 /* no r0/r1 slots were pushed here, so no +8 */ | |
| 1541 pop {r4, r9-r11, pc} | |
| 1542 endfunc | |
| 1543 | |
| 1544 function ff_\type\()_h264_qpel16_mc32_neon, export=1 | |
| 1545 push {r0, r1, r4-r5, r9-r11, lr} | |
| 1546 add r1, r1, #1 /* first pass of mc12 runs one byte to the right; the saved r1 is unchanged */ | |
| 1547 b \type\()_h264_qpel16_mc12 | |
| 1548 endfunc | |
| 1549 | |
| 1550 function ff_\type\()_h264_qpel16_mc03_neon, export=1 | |
| 1551 push {r4, lr} | |
| 1552 add ip, r1, r2 /* as mc01, but ip = r1 + r2 (one row down) */ | |
| 1553 b \type\()_h264_qpel16_mc01 | |
| 1554 endfunc | |
| 1555 | |
| 1556 function ff_\type\()_h264_qpel16_mc13_neon, export=1 | |
| 1557 push {r0, r1, r4, r11, lr} | |
| 1558 add r1, r1, r2 /* first pass of mc11 runs one row down; the saved r1 is unchanged */ | |
| 1559 b \type\()_h264_qpel16_mc11 | |
| 1560 endfunc | |
| 1561 | |
| 1562 function ff_\type\()_h264_qpel16_mc23_neon, export=1 | |
| 1563 push {r0, r1, r4-r5, r9-r11, lr} | |
| 1564 add r1, r1, r2 /* first pass of mc21 runs one row down; the saved r1 is unchanged */ | |
| 1565 b \type\()_h264_qpel16_mc21 | |
| 1566 endfunc | |
| 1567 | |
| 1568 function ff_\type\()_h264_qpel16_mc33_neon, export=1 | |
| 1569 add r1, r1, #1 /* saved r1 (second pass) = r1 + 1 */ | |
| 1570 push {r0, r1, r4, r11, lr} | |
| 1571 add r1, r1, r2 /* live r1 (first pass) = original r1 + r2 ... */ | |
| 1572 sub r1, r1, #1 /* ... after undoing the +1 */ | |
| 1573 b \type\()_h264_qpel16_mc11 | |
| 1574 endfunc | |
| 1575 .endm | |
| 1576 | |
| 1577 h264_qpel16 put /* emits the ff_put_h264_qpel16_mc* entry points */ | |
| 1578 h264_qpel16 avg /* emits the ff_avg_h264_qpel16_mc* entry points */ | |
| 1579 | |
| 1580 @ Biweighted prediction | |
| 1581 | |
| 1582 .macro biweight_16 macs, macd /* Loop body for 16-pixel-wide biweighting, expanded inside biweight_func. macd/macs are the multiply-accumulate mnemonics (vmlal.u8 or vmlsl.u8) applied to the data read from r0 and r1 respectively. Expects q8 = accumulator start value, q9 = per-lane shift counts, ip = row count (even), r6 = output pointer, r2 = stride. Ends by popping r4-r6 and pc, i.e. it returns from the enclosing function. */ | |
| 1583 vdup.8 d0, r4 /* d0 = low byte of r4 in every lane (multiplier for the r0 rows) */ | |
| 1584 vdup.8 d1, r5 /* d1 = low byte of r5 in every lane (multiplier for the r1 rows) */ | |
| 1585 vmov q2, q8 /* start both accumulators of the first row from q8 */ | |
| 1586 vmov q3, q8 | |
| 1587 1: subs ip, ip, #2 /* two rows per iteration */ | |
| 1588 vld1.8 {d20-d21},[r0,:128], r2 | |
| 1589 \macd q2, d0, d20 | |
| 1590 pld [r0] | |
| 1591 \macd q3, d0, d21 | |
| 1592 vld1.8 {d22-d23},[r1,:128], r2 | |
| 1593 \macs q2, d1, d22 | |
| 1594 pld [r1] | |
| 1595 \macs q3, d1, d23 | |
| 1596 vmov q12, q8 /* accumulators for the second row */ | |
| 1597 vld1.8 {d28-d29},[r0,:128], r2 | |
| 1598 vmov q13, q8 | |
| 1599 \macd q12, d0, d28 | |
| 1600 pld [r0] | |
| 1601 \macd q13, d0, d29 | |
| 1602 vld1.8 {d30-d31},[r1,:128], r2 | |
| 1603 \macs q12, d1, d30 | |
| 1604 pld [r1] | |
| 1605 \macs q13, d1, d31 | |
| 1606 vshl.s16 q2, q2, q9 /* q9 holds negative counts (set up in biweight_func), so this shifts right */ | |
| 1607 vshl.s16 q3, q3, q9 | |
| 1608 vqmovun.s16 d4, q2 /* saturate to unsigned 8 bit */ | |
| 1609 vqmovun.s16 d5, q3 | |
| 1610 vshl.s16 q12, q12, q9 | |
| 1611 vshl.s16 q13, q13, q9 | |
| 1612 vqmovun.s16 d24, q12 | |
| 1613 vqmovun.s16 d25, q13 | |
| 1614 vmov q3, q8 /* re-arm the upper accumulator for the next iteration (d4/d5 are stored first below) */ | |
| 1615 vst1.8 {d4- d5}, [r6,:128], r2 | |
| 1616 vmov q2, q8 /* safe only after the store above: q2 is d4/d5 */ | |
| 1617 vst1.8 {d24-d25},[r6,:128], r2 | |
| 1618 bne 1b | |
| 1619 pop {r4-r6, pc} /* matches the push in biweight_func */ | |
| 1620 .endm | |
| 1621 | |
| 1622 .macro biweight_8 macs, macd /* 8-pixel-wide version of biweight_16: same register contract (q8 start value, q9 shift counts, ip rows, r6 output, r2 stride, r4/r5 multipliers), two rows per iteration, returns from the enclosing function. */ | |
| 1623 vdup.8 d0, r4 /* multiplier for the r0 rows */ | |
| 1624 vdup.8 d1, r5 /* multiplier for the r1 rows */ | |
| 1625 vmov q1, q8 /* accumulator for the first row */ | |
| 1626 vmov q10, q8 /* accumulator for the second row */ | |
| 1627 1: subs ip, ip, #2 | |
| 1628 vld1.8 {d4},[r0,:64], r2 | |
| 1629 \macd q1, d0, d4 | |
| 1630 pld [r0] | |
| 1631 vld1.8 {d5},[r1,:64], r2 | |
| 1632 \macs q1, d1, d5 | |
| 1633 pld [r1] | |
| 1634 vld1.8 {d6},[r0,:64], r2 | |
| 1635 \macd q10, d0, d6 | |
| 1636 pld [r0] | |
| 1637 vld1.8 {d7},[r1,:64], r2 | |
| 1638 \macs q10, d1, d7 | |
| 1639 pld [r1] | |
| 1640 vshl.s16 q1, q1, q9 /* negative counts in q9: shifts right */ | |
| 1641 vqmovun.s16 d2, q1 /* saturate to unsigned 8 bit */ | |
| 1642 vshl.s16 q10, q10, q9 | |
| 1643 vqmovun.s16 d4, q10 | |
| 1644 vmov q10, q8 /* re-arm the accumulators for the next iteration */ | |
| 1645 vst1.8 {d2},[r6,:64], r2 | |
| 1646 vmov q1, q8 /* after the store above: q1 contains d2 */ | |
| 1647 vst1.8 {d4},[r6,:64], r2 | |
| 1648 bne 1b | |
| 1649 pop {r4-r6, pc} /* matches the push in biweight_func */ | |
| 1650 .endm | |
| 1651 | |
| 1652 .macro biweight_4 macs, macd /* 4-pixel-wide version: two 4-byte rows are packed into one d register, so each iteration handles four rows. Same register contract as biweight_16/8. A row count of 2 is handled by the early exit at label 2. */ | |
| 1653 vdup.8 d0, r4 /* multiplier for the r0 rows */ | |
| 1654 vdup.8 d1, r5 /* multiplier for the r1 rows */ | |
| 1655 vmov q1, q8 /* accumulator for rows 0-1 */ | |
| 1656 vmov q10, q8 /* accumulator for rows 2-3 */ | |
| 1657 1: subs ip, ip, #4 /* goes negative when only two rows remain */ | |
| 1658 vld1.32 {d4[0]},[r0,:32], r2 | |
| 1659 vld1.32 {d4[1]},[r0,:32], r2 | |
| 1660 \macd q1, d0, d4 | |
| 1661 pld [r0] | |
| 1662 vld1.32 {d5[0]},[r1,:32], r2 | |
| 1663 vld1.32 {d5[1]},[r1,:32], r2 | |
| 1664 \macs q1, d1, d5 | |
| 1665 pld [r1] | |
| 1666 blt 2f /* flags still come from the subs above: only two rows were left */ | |
| 1667 vld1.32 {d6[0]},[r0,:32], r2 | |
| 1668 vld1.32 {d6[1]},[r0,:32], r2 | |
| 1669 \macd q10, d0, d6 | |
| 1670 pld [r0] | |
| 1671 vld1.32 {d7[0]},[r1,:32], r2 | |
| 1672 vld1.32 {d7[1]},[r1,:32], r2 | |
| 1673 \macs q10, d1, d7 | |
| 1674 pld [r1] | |
| 1675 vshl.s16 q1, q1, q9 /* negative counts in q9: shifts right */ | |
| 1676 vqmovun.s16 d2, q1 /* saturate to unsigned 8 bit */ | |
| 1677 vshl.s16 q10, q10, q9 | |
| 1678 vqmovun.s16 d4, q10 | |
| 1679 vmov q10, q8 /* re-arm the accumulators for the next iteration */ | |
| 1680 vst1.32 {d2[0]},[r6,:32], r2 | |
| 1681 vst1.32 {d2[1]},[r6,:32], r2 | |
| 1682 vmov q1, q8 /* after the stores above: q1 contains d2 */ | |
| 1683 vst1.32 {d4[0]},[r6,:32], r2 | |
| 1684 vst1.32 {d4[1]},[r6,:32], r2 | |
| 1685 bne 1b /* flags still come from the subs at label 1 */ | |
| 1686 pop {r4-r6, pc} | |
| 1687 2: vshl.s16 q1, q1, q9 /* two-row tail: finish and store rows 0-1 only */ | |
| 1688 vqmovun.s16 d2, q1 | |
| 1689 vst1.32 {d2[0]},[r6,:32], r2 | |
| 1690 vst1.32 {d2[1]},[r6,:32], r2 | |
| 1691 pop {r4-r6, pc} | |
| 1692 .endm | |
| 1693 | |
| 1694 .macro biweight_func w /* w = block width (16, 8 or 4); selects the biweight_<w> loop macro */ | |
| 1695 function biweight_h264_pixels_\w\()_neon /* Common body behind the ff_biweight_h264_pixels_<w>x<h>_neon entry points, which set ip = h first. Uses r0, r1 (the two pixel pointers), r2 (stride), r3 (shift amount) and three stack arguments; the result is written back through the r0 pointer. Stack arguments are presumably the two weights and the offset - TODO confirm against the C prototype. */ | |
| 1696 push {r4-r6, lr} | |
| 1697 add r4, sp, #16 /* skip the four registers just pushed */ | |
| 1698 ldm r4, {r4-r6} /* r4, r5, r6 = first three stack arguments */ | |
| 1699 lsr lr, r4, #31 /* lr = sign bit of r4 */ | |
| 1700 add r6, r6, #1 | |
| 1701 eors lr, lr, r5, lsr #30 /* lr = sign(r4) xor top two bits of r5: 0 = both non-negative, 1 = only r4 negative, 2 = both negative, 3 = only r5 negative (assuming r5 is small enough that its top two bits are equal) */ | |
| 1702 orr r6, r6, #1 | |
| 1703 vdup.16 q9, r3 | |
| 1704 lsl r6, r6, r3 /* r6 = ((r6 + 1) OR 1) << r3: accumulator start value */ | |
| 1705 vmvn q9, q9 /* q9 = NOT r3 = -(r3 + 1) per lane: vshl by q9 shifts right by r3 + 1 */ | |
| 1706 vdup.16 q8, r6 | |
| 1707 mov r6, r0 /* r6 = output pointer = the r0 input pointer */ | |
| 1708 beq 10f /* Z flag still from the eors: lr == 0 */ | |
| 1709 subs lr, lr, #1 | |
| 1710 beq 20f | |
| 1711 subs lr, lr, #1 | |
| 1712 beq 30f | |
| 1713 b 40f | |
| 1714 10: biweight_\w vmlal.u8, vmlal.u8 /* both non-negative: add both products. Each expansion ends in pop {..., pc}, so there is no fall-through to the next label */ | |
| 1715 20: rsb r4, r4, #0 /* r4 negative: use its magnitude and subtract that product (macd = vmlsl) */ | |
| 1716 biweight_\w vmlal.u8, vmlsl.u8 | |
| 1717 30: rsb r4, r4, #0 /* both negative: magnitudes, subtract both products */ | |
| 1718 rsb r5, r5, #0 | |
| 1719 biweight_\w vmlsl.u8, vmlsl.u8 | |
| 1720 40: rsb r5, r5, #0 /* r5 negative: use its magnitude and subtract that product (macs = vmlsl) */ | |
| 1721 biweight_\w vmlsl.u8, vmlal.u8 | |
| 1722 endfunc | |
| 1723 .endm | |
| 1724 | |
| 1725 .macro biweight_entry w, h, b=1 /* Emits the exported entry point for a w x h block: loads the row count into ip and, unless b=0, branches to the shared body for width w. */ | |
| 1726 function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1 | |
| 1727 mov ip, #\h /* ip = number of rows, consumed by the biweight_<w> loops */ | |
| 1728 .if \b | |
| 1729 b biweight_h264_pixels_\w\()_neon | |
| 1730 .endif /* with b=0 no branch is emitted: the entry presumably falls through into the biweight_func body expanded right after it (assumes function/endfunc emit nothing in between - TODO confirm in asm.S) */ | |
| 1731 endfunc | |
| 1732 .endm | |
| 1733 | |
| 1734 biweight_entry 16, 8 | |
| 1735 biweight_entry 16, 16, b=0 /* b=0 entry, immediately followed by the shared body for width 16 */ | |
| 1736 biweight_func 16 | |
| 1737 | |
| 1738 biweight_entry 8, 16 | |
| 1739 biweight_entry 8, 4 | |
| 1740 biweight_entry 8, 8, b=0 /* b=0 entry, immediately followed by the shared body for width 8 */ | |
| 1741 biweight_func 8 | |
| 1742 | |
| 1743 biweight_entry 4, 8 | |
| 1744 biweight_entry 4, 2 /* the two-row case is handled by the early exit in biweight_4 */ | |
| 1745 biweight_entry 4, 4, b=0 /* b=0 entry, immediately followed by the shared body for width 4 */ | |
| 1746 biweight_func 4 | |
| 1747 | |
| 1748 @ Weighted prediction | |
| 1749 | |
| 1750 .macro weight_16 add /* Loop body for 16-pixel-wide weighting, expanded inside weight_func. "add" is the instruction that combines q8 with the product (vadd/vsub/vhadd/vhsub .s16). Expects r0 = source pointer, r4 = output pointer, r1 = stride, r3 = non-negative multiplier, q8 = value combined with each product, q9 = per-lane shift counts, ip = row count (even). Ends by popping r4 and pc, i.e. it returns from the enclosing function. */ | |
| 1751 vdup.8 d0, r3 /* d0 = low byte of r3 in every lane */ | |
| 1752 1: subs ip, ip, #2 /* two rows per iteration */ | |
| 1753 vld1.8 {d20-d21},[r0,:128], r1 | |
| 1754 vmull.u8 q2, d0, d20 | |
| 1755 pld [r0] | |
| 1756 vmull.u8 q3, d0, d21 | |
| 1757 vld1.8 {d28-d29},[r0,:128], r1 | |
| 1758 vmull.u8 q12, d0, d28 | |
| 1759 pld [r0] | |
| 1760 vmull.u8 q13, d0, d29 | |
| 1761 \add q2, q8, q2 /* q8 combined with the product; for the sub variants this is q8 - product */ | |
| 1762 vrshl.s16 q2, q2, q9 /* q9 holds non-positive counts (set up in weight_func): rounding right shift */ | |
| 1763 \add q3, q8, q3 | |
| 1764 vrshl.s16 q3, q3, q9 | |
| 1765 vqmovun.s16 d4, q2 /* saturate to unsigned 8 bit */ | |
| 1766 vqmovun.s16 d5, q3 | |
| 1767 \add q12, q8, q12 | |
| 1768 vrshl.s16 q12, q12, q9 | |
| 1769 \add q13, q8, q13 | |
| 1770 vrshl.s16 q13, q13, q9 | |
| 1771 vqmovun.s16 d24, q12 | |
| 1772 vqmovun.s16 d25, q13 | |
| 1773 vst1.8 {d4- d5}, [r4,:128], r1 | |
| 1774 vst1.8 {d24-d25},[r4,:128], r1 | |
| 1775 bne 1b | |
| 1776 pop {r4, pc} /* matches the push in weight_func */ | |
| 1777 .endm | |
| 1778 | |
| 1779 .macro weight_8 add /* 8-pixel-wide version of weight_16: same register contract (r0 source, r4 output, r1 stride, r3 multiplier, q8, q9, ip rows), two rows per iteration, returns from the enclosing function. */ | |
| 1780 vdup.8 d0, r3 /* d0 = low byte of r3 in every lane */ | |
| 1781 1: subs ip, ip, #2 | |
| 1782 vld1.8 {d4},[r0,:64], r1 | |
| 1783 vmull.u8 q1, d0, d4 | |
| 1784 pld [r0] | |
| 1785 vld1.8 {d6},[r0,:64], r1 | |
| 1786 vmull.u8 q10, d0, d6 | |
| 1787 \add q1, q8, q1 /* q8 combined with the product; for the sub variants this is q8 - product */ | |
| 1788 pld [r0] | |
| 1789 vrshl.s16 q1, q1, q9 /* non-positive counts in q9: rounding right shift */ | |
| 1790 vqmovun.s16 d2, q1 /* saturate to unsigned 8 bit */ | |
| 1791 \add q10, q8, q10 | |
| 1792 vrshl.s16 q10, q10, q9 | |
| 1793 vqmovun.s16 d4, q10 | |
| 1794 vst1.8 {d2},[r4,:64], r1 | |
| 1795 vst1.8 {d4},[r4,:64], r1 | |
| 1796 bne 1b | |
| 1797 pop {r4, pc} /* matches the push in weight_func */ | |
| 1798 .endm | |
| 1799 | |
| 1800 .macro weight_4 add /* 4-pixel-wide version: two 4-byte rows are packed into one d register, so each iteration handles four rows. Same register contract as weight_16/8. A row count of 2 is handled by the early exit at label 2. */ | |
| 1801 vdup.8 d0, r3 /* d0 = low byte of r3 in every lane */ | |
| 1802 vmov q1, q8 /* NOTE(review): q1 and q10 are fully overwritten by the vmull instructions below before being read, so these two copies (and the two inside the loop) appear to have no effect */ | |
| 1803 vmov q10, q8 | |
| 1804 1: subs ip, ip, #4 /* goes negative when only two rows remain */ | |
| 1805 vld1.32 {d4[0]},[r0,:32], r1 | |
| 1806 vld1.32 {d4[1]},[r0,:32], r1 | |
| 1807 vmull.u8 q1, d0, d4 | |
| 1808 pld [r0] | |
| 1809 blt 2f /* flags still come from the subs above: only two rows were left */ | |
| 1810 vld1.32 {d6[0]},[r0,:32], r1 | |
| 1811 vld1.32 {d6[1]},[r0,:32], r1 | |
| 1812 vmull.u8 q10, d0, d6 | |
| 1813 pld [r0] | |
| 1814 \add q1, q8, q1 /* q8 combined with the product; for the sub variants this is q8 - product */ | |
| 1815 vrshl.s16 q1, q1, q9 /* non-positive counts in q9: rounding right shift */ | |
| 1816 vqmovun.s16 d2, q1 /* saturate to unsigned 8 bit */ | |
| 1817 \add q10, q8, q10 | |
| 1818 vrshl.s16 q10, q10, q9 | |
| 1819 vqmovun.s16 d4, q10 | |
| 1820 vmov q10, q8 | |
| 1821 vst1.32 {d2[0]},[r4,:32], r1 | |
| 1822 vst1.32 {d2[1]},[r4,:32], r1 | |
| 1823 vmov q1, q8 | |
| 1824 vst1.32 {d4[0]},[r4,:32], r1 | |
| 1825 vst1.32 {d4[1]},[r4,:32], r1 | |
| 1826 bne 1b /* flags still come from the subs at label 1 */ | |
| 1827 pop {r4, pc} | |
| 1828 2: \add q1, q8, q1 /* two-row tail: finish and store rows 0-1 only */ | |
| 1829 vrshl.s16 q1, q1, q9 | |
| 1830 vqmovun.s16 d2, q1 | |
| 1831 vst1.32 {d2[0]},[r4,:32], r1 | |
| 1832 vst1.32 {d2[1]},[r4,:32], r1 | |
| 1833 pop {r4, pc} | |
| 1834 .endm | |
| 1835 | |
| 1836 .macro weight_func w /* w = block width (16, 8 or 4); selects the weight_<w> loop macro */ | |
| 1837 function weight_h264_pixels_\w\()_neon /* Common body behind the ff_weight_h264_pixels_<w>x<h>_neon entry points, which set ip = h first. Uses r0 (pixel pointer, updated in place), r1 (stride), r2 (shift amount), r3 (signed multiplier) and one stack argument. The stack argument is presumably the offset - TODO confirm against the C prototype. */ | |
| 1838 push {r4, lr} | |
| 1839 ldr r4, [sp, #8] /* first stack argument (skip the two registers just pushed) */ | |
| 1840 cmp r2, #1 /* flags are consumed by the ble below; the instructions in between do not set them */ | |
| 1841 lsl r4, r4, r2 | |
| 1842 vdup.16 q8, r4 /* q8 = stack argument << r2 in every 16-bit lane */ | |
| 1843 mov r4, r0 /* r4 = output pointer = the r0 input pointer */ | |
| 1844 ble 20f /* r2 <= 1: plain add/sub path */ | |
| 1845 rsb lr, r2, #1 /* r2 > 1: shift count 1 - r2; the halving add/sub used on this path already divides by two */ | |
| 1846 vdup.16 q9, lr | |
| 1847 cmp r3, #0 | |
| 1848 blt 10f /* negative multiplier: handled at the next label 10 */ | |
| 1849 weight_\w vhadd.s16 /* each expansion ends in pop {r4, pc}, so there is no fall-through to the next label */ | |
| 1850 10: rsb r3, r3, #0 /* use the magnitude and subtract the product instead */ | |
| 1851 weight_\w vhsub.s16 | |
| 1852 20: rsb lr, r2, #0 /* shift count -r2 */ | |
| 1853 vdup.16 q9, lr | |
| 1854 cmp r3, #0 | |
| 1855 blt 10f /* numeric labels may repeat: 10f is the nearest following 10 */ | |
| 1856 weight_\w vadd.s16 | |
| 1857 10: rsb r3, r3, #0 /* use the magnitude and subtract the product instead */ | |
| 1858 weight_\w vsub.s16 | |
| 1859 endfunc | |
| 1860 .endm | |
| 1861 | |
| 1862 .macro weight_entry w, h, b=1 /* Emits the exported entry point for a w x h block: loads the row count into ip and, unless b=0, branches to the shared body for width w. */ | |
| 1863 function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1 | |
| 1864 mov ip, #\h /* ip = number of rows, consumed by the weight_<w> loops */ | |
| 1865 .if \b | |
| 1866 b weight_h264_pixels_\w\()_neon | |
| 1867 .endif /* with b=0 no branch is emitted: the entry presumably falls through into the weight_func body expanded right after it (assumes function/endfunc emit nothing in between - TODO confirm in asm.S) */ | |
| 1868 endfunc | |
| 1869 .endm | |
| 1870 | |
| 1871 weight_entry 16, 8 | |
| 1872 weight_entry 16, 16, b=0 /* b=0 entry, immediately followed by the shared body for width 16 */ | |
| 1873 weight_func 16 | |
| 1874 | |
| 1875 weight_entry 8, 16 | |
| 1876 weight_entry 8, 4 | |
| 1877 weight_entry 8, 8, b=0 /* b=0 entry, immediately followed by the shared body for width 8 */ | |
| 1878 weight_func 8 | |
| 1879 | |
| 1880 weight_entry 4, 8 | |
| 1881 weight_entry 4, 2 /* the two-row case is handled by the early exit in weight_4 */ | |
| 1882 weight_entry 4, 4, b=0 /* b=0 entry, immediately followed by the shared body for width 4 */ | |
| 1883 weight_func 4 | |
