Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSs__H264__App
comparison libavcodec/arm/dsputil_neon.S @ 9:ea1ba68cf0ed
update to match api changes + add sscc produced source
| author | Nina Engelhardt <nengel@mailbox.tu-berlin.de> |
|---|---|
| date | Wed, 05 Jun 2013 14:43:26 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:fee666aeae78 |
|---|---|
| 1 /* | |
| 2 * ARM NEON optimised DSP functions | |
| 3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
| 4 * | |
| 5 * This file is part of FFmpeg. | |
| 6 * | |
| 7 * FFmpeg is free software; you can redistribute it and/or | |
| 8 * modify it under the terms of the GNU Lesser General Public | |
| 9 * License as published by the Free Software Foundation; either | |
| 10 * version 2.1 of the License, or (at your option) any later version. | |
| 11 * | |
| 12 * FFmpeg is distributed in the hope that it will be useful, | |
| 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 15 * Lesser General Public License for more details. | |
| 16 * | |
| 17 * You should have received a copy of the GNU Lesser General Public | |
| 18 * License along with FFmpeg; if not, write to the Free Software | |
| 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
| 20 */ | |
| 21 | |
| 22 #include "config.h" | |
| 23 #include "asm.S" | |
| 24 | |
| 25 preserve8 | |
| 26 .text | |
| 27 | |
| 28 .macro pixels16 avg=0 /* Body of a 16-byte-wide row copy: reads rows from [r1], writes them to [r0], both stepped by r2; r3 = rows left, four rows per pass. With avg != 0 each row is first rounding-averaged with the bytes already at the destination. Ends with bx lr. */ | |
| 29 .if \avg | |
| 30 mov ip, r0 /* ip = separate read cursor over the destination so r0 stays the write cursor */ | |
| 31 .endif | |
| 32 1: vld1.64 {d0, d1}, [r1], r2 /* q0..q3 = four consecutive source rows */ | |
| 33 vld1.64 {d2, d3}, [r1], r2 | |
| 34 vld1.64 {d4, d5}, [r1], r2 | |
| 35 pld [r1, r2, lsl #2] /* preload hints for rows the next passes will read */ | |
| 36 vld1.64 {d6, d7}, [r1], r2 | |
| 37 pld [r1] | |
| 38 pld [r1, r2] | |
| 39 pld [r1, r2, lsl #1] | |
| 40 .if \avg | |
| 41 vld1.64 {d16,d17}, [ip,:128], r2 /* existing destination row; :128 asserts 16-byte alignment */ | |
| 42 vrhadd.u8 q0, q0, q8 /* per-byte rounding average of source and destination */ | |
| 43 vld1.64 {d18,d19}, [ip,:128], r2 | |
| 44 vrhadd.u8 q1, q1, q9 | |
| 45 vld1.64 {d20,d21}, [ip,:128], r2 | |
| 46 vrhadd.u8 q2, q2, q10 | |
| 47 vld1.64 {d22,d23}, [ip,:128], r2 | |
| 48 vrhadd.u8 q3, q3, q11 | |
| 49 .endif | |
| 50 subs r3, r3, #4 /* four rows done; the flags survive the NEON stores and drive the bne below */ | |
| 51 vst1.64 {d0, d1}, [r0,:128], r2 | |
| 52 vst1.64 {d2, d3}, [r0,:128], r2 | |
| 53 vst1.64 {d4, d5}, [r0,:128], r2 | |
| 54 vst1.64 {d6, d7}, [r0,:128], r2 | |
| 55 bne 1b /* loops until r3 reaches exactly zero, so r3 must be a multiple of 4 */ | |
| 56 bx lr | |
| 57 .endm | |
| 58 | |
| 59 .macro pixels16_x2 vhadd=vrhadd.u8 /* Body of a 16-wide horizontal half-sample filter: each output byte is the average of a source byte and its right-hand neighbour. vhadd selects the averaging instruction (rounding by default). r1 = source, r0 = destination, r2 = row step, r3 = rows left (two per pass). */ | |
| 60 1: vld1.64 {d0-d2}, [r1], r2 /* 24 bytes are loaded so that byte 16 is available as the last right-hand neighbour */ | |
| 61 vld1.64 {d4-d6}, [r1], r2 | |
| 62 pld [r1] | |
| 63 pld [r1, r2] | |
| 64 subs r3, r3, #2 /* flags are consumed by the bne at the end of the pass */ | |
| 65 vext.8 q1, q0, q1, #1 /* q1 = the same row shifted left by one byte */ | |
| 66 \vhadd q0, q0, q1 | |
| 67 vext.8 q3, q2, q3, #1 | |
| 68 \vhadd q2, q2, q3 | |
| 69 vst1.64 {d0, d1}, [r0,:128], r2 | |
| 70 vst1.64 {d4, d5}, [r0,:128], r2 | |
| 71 bne 1b | |
| 72 bx lr | |
| 73 .endm | |
| 74 | |
| 75 .macro pixels16_y2 vhadd=vrhadd.u8 /* Body of a 16-wide vertical half-sample filter: each output row is the average of a source row and the row after it. vhadd selects the averaging instruction. r1 = source, r0 = destination, r2 = row step, r3 = rows left (two per pass). */ | |
| 76 vld1.64 {d0, d1}, [r1], r2 /* prime q0 and q1 with the first two source rows */ | |
| 77 vld1.64 {d2, d3}, [r1], r2 | |
| 78 1: subs r3, r3, #2 | |
| 79 \vhadd q2, q0, q1 | |
| 80 vld1.64 {d0, d1}, [r1], r2 /* q0 and q1 take turns holding the newest row */ | |
| 81 \vhadd q3, q0, q1 | |
| 82 vld1.64 {d2, d3}, [r1], r2 /* NOTE: on the final pass this row is loaded but never used */ | |
| 83 pld [r1] | |
| 84 pld [r1, r2] | |
| 85 vst1.64 {d4, d5}, [r0,:128], r2 | |
| 86 vst1.64 {d6, d7}, [r0,:128], r2 | |
| 87 bne 1b | |
| 88 bx lr | |
| 89 .endm | |
| 90 | |
| 91 .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 /* Body of a 16-wide 2x2 filter: each output byte is the sum of a source byte, its right neighbour and the two bytes below them, narrowed with a shift right by 2. vshrn selects the narrowing shift; with no_rnd != 0 a bias of 1 is added before the shift. r1 = source, r0 = destination, r2 = row step, r3 = rows left (two per pass). */ | |
| 92 vld1.64 {d0-d2}, [r1], r2 /* first two rows, 24 bytes each so the right neighbour of byte 15 is present */ | |
| 93 vld1.64 {d4-d6}, [r1], r2 | |
| 94 .if \no_rnd | |
| 95 vmov.i16 q13, #1 /* q13 = per-lane bias used only in the no_rnd variant */ | |
| 96 .endif | |
| 97 pld [r1] | |
| 98 pld [r1, r2] | |
| 99 vext.8 q1, q0, q1, #1 /* rows shifted left by one byte */ | |
| 100 vext.8 q3, q2, q3, #1 | |
| 101 vaddl.u8 q8, d0, d2 /* q8/q10 = widened horizontal pair sums of one row (low/high 8 pixels) */ | |
| 102 vaddl.u8 q10, d1, d3 | |
| 103 vaddl.u8 q9, d4, d6 /* q9/q11 = the same for the following row */ | |
| 104 vaddl.u8 q11, d5, d7 | |
| 105 1: subs r3, r3, #2 /* flags are consumed by the bgt at the end of the pass */ | |
| 106 vld1.64 {d0-d2}, [r1], r2 | |
| 107 vadd.u16 q12, q8, q9 /* vertical sum of two rows of pair sums, low half */ | |
| 108 pld [r1] | |
| 109 .if \no_rnd | |
| 110 vadd.u16 q12, q12, q13 | |
| 111 .endif | |
| 112 vext.8 q15, q0, q1, #1 | |
| 113 vadd.u16 q1 , q10, q11 /* vertical sum, high half */ | |
| 114 \vshrn d28, q12, #2 | |
| 115 .if \no_rnd | |
| 116 vadd.u16 q1, q1, q13 | |
| 117 .endif | |
| 118 \vshrn d29, q1, #2 | |
| 119 vaddl.u8 q8, d0, d30 /* refresh q8/q10 with the pair sums of the row just loaded */ | |
| 120 vld1.64 {d2-d4}, [r1], r2 | |
| 121 vaddl.u8 q10, d1, d31 | |
| 122 vst1.64 {d28,d29}, [r0,:128], r2 | |
| 123 vadd.u16 q12, q8, q9 | |
| 124 pld [r1, r2] | |
| 125 .if \no_rnd | |
| 126 vadd.u16 q12, q12, q13 | |
| 127 .endif | |
| 128 vext.8 q2, q1, q2, #1 | |
| 129 vadd.u16 q0, q10, q11 | |
| 130 \vshrn d30, q12, #2 | |
| 131 .if \no_rnd | |
| 132 vadd.u16 q0, q0, q13 | |
| 133 .endif | |
| 134 \vshrn d31, q0, #2 | |
| 135 vaddl.u8 q9, d2, d4 /* refresh q9/q11 for the next pass */ | |
| 136 vaddl.u8 q11, d3, d5 | |
| 137 vst1.64 {d30,d31}, [r0,:128], r2 | |
| 138 bgt 1b | |
| 139 bx lr | |
| 140 .endm | |
| 141 | |
| 142 .macro pixels8 avg=0 /* Body of an 8-byte-wide row copy: reads rows from [r1], writes them to [r0], both stepped by r2; r3 = rows left, four rows per pass. With avg != 0 each row is first rounding-averaged with the bytes already at the destination. Ends with bx lr. */ | |
| 143 1: vld1.64 {d0}, [r1], r2 /* d0..d3 = four consecutive source rows */ | |
| 144 vld1.64 {d1}, [r1], r2 | |
| 145 vld1.64 {d2}, [r1], r2 | |
| 146 pld [r1, r2, lsl #2] | |
| 147 vld1.64 {d3}, [r1], r2 | |
| 148 pld [r1] | |
| 149 pld [r1, r2] | |
| 150 pld [r1, r2, lsl #1] | |
| 151 .if \avg | |
| 152 vld1.64 {d4}, [r0,:64], r2 /* the avg variant reads the destination through r0 itself ... */ | |
| 153 vrhadd.u8 d0, d0, d4 | |
| 154 vld1.64 {d5}, [r0,:64], r2 | |
| 155 vrhadd.u8 d1, d1, d5 | |
| 156 vld1.64 {d6}, [r0,:64], r2 | |
| 157 vrhadd.u8 d2, d2, d6 | |
| 158 vld1.64 {d7}, [r0,:64], r2 | |
| 159 vrhadd.u8 d3, d3, d7 | |
| 160 sub r0, r0, r2, lsl #2 /* ... and then rewinds r0 by four rows before the stores */ | |
| 161 .endif | |
| 162 subs r3, r3, #4 /* flags are consumed by the bne after the stores */ | |
| 163 vst1.64 {d0}, [r0,:64], r2 | |
| 164 vst1.64 {d1}, [r0,:64], r2 | |
| 165 vst1.64 {d2}, [r0,:64], r2 | |
| 166 vst1.64 {d3}, [r0,:64], r2 | |
| 167 bne 1b | |
| 168 bx lr | |
| 169 .endm | |
| 170 | |
| 171 .macro pixels8_x2 vhadd=vrhadd.u8 /* Body of an 8-wide horizontal half-sample filter: each output byte is the average of a source byte and its right-hand neighbour. vhadd selects the averaging instruction. r1 = source, r0 = destination, r2 = row step, r3 = rows left (two per pass). */ | |
| 172 1: vld1.64 {d0, d1}, [r1], r2 /* 16 bytes are loaded so that byte 8 is available as a neighbour */ | |
| 173 vext.8 d1, d0, d1, #1 /* d1 = first row shifted left by one byte */ | |
| 174 vld1.64 {d2, d3}, [r1], r2 | |
| 175 vext.8 d3, d2, d3, #1 /* d3 = second row shifted left by one byte */ | |
| 176 pld [r1] | |
| 177 pld [r1, r2] | |
| 178 subs r3, r3, #2 | |
| 179 vswp d1, d2 /* regroup so q0 = both rows and q1 = both shifted rows */ | |
| 180 \vhadd q0, q0, q1 /* a single q-wide average handles the two rows at once */ | |
| 181 vst1.64 {d0}, [r0,:64], r2 | |
| 182 vst1.64 {d1}, [r0,:64], r2 | |
| 183 bne 1b | |
| 184 bx lr | |
| 185 .endm | |
| 186 | |
| 187 .macro pixels8_y2 vhadd=vrhadd.u8 /* Body of an 8-wide vertical half-sample filter: each output row is the average of a source row and the row after it. vhadd selects the averaging instruction. r1 = source, r0 = destination, r2 = row step, r3 = rows left (two per pass). */ | |
| 188 vld1.64 {d0}, [r1], r2 /* prime d0 and d1 with the first two source rows */ | |
| 189 vld1.64 {d1}, [r1], r2 | |
| 190 1: subs r3, r3, #2 | |
| 191 \vhadd d4, d0, d1 | |
| 192 vld1.64 {d0}, [r1], r2 /* d0 and d1 take turns holding the newest row */ | |
| 193 \vhadd d5, d0, d1 | |
| 194 vld1.64 {d1}, [r1], r2 /* NOTE: on the final pass this row is loaded but never used */ | |
| 195 pld [r1] | |
| 196 pld [r1, r2] | |
| 197 vst1.64 {d4}, [r0,:64], r2 | |
| 198 vst1.64 {d5}, [r0,:64], r2 | |
| 199 bne 1b | |
| 200 bx lr | |
| 201 .endm | |
| 202 | |
| 203 .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 /* Body of an 8-wide 2x2 filter: each output byte is the sum of a source byte, its right neighbour and the two bytes below them, narrowed with a shift right by 2. vshrn selects the narrowing shift; with no_rnd != 0 a bias of 1 is added before the shift. r1 = source, r0 = destination, r2 = row step, r3 = rows left (two per pass). */ | |
| 204 vld1.64 {d0, d1}, [r1], r2 /* first two rows, 16 bytes each so the right neighbour of byte 7 is present */ | |
| 205 vld1.64 {d2, d3}, [r1], r2 | |
| 206 .if \no_rnd | |
| 207 vmov.i16 q11, #1 /* q11 = per-lane bias used only in the no_rnd variant */ | |
| 208 .endif | |
| 209 pld [r1] | |
| 210 pld [r1, r2] | |
| 211 vext.8 d4, d0, d1, #1 /* rows shifted left by one byte */ | |
| 212 vext.8 d6, d2, d3, #1 | |
| 213 vaddl.u8 q8, d0, d4 /* q8 = widened horizontal pair sums of one row */ | |
| 214 vaddl.u8 q9, d2, d6 /* q9 = the same for the following row */ | |
| 215 1: subs r3, r3, #2 /* flags are consumed by the bgt at the end of the pass */ | |
| 216 vld1.64 {d0, d1}, [r1], r2 | |
| 217 pld [r1] | |
| 218 vadd.u16 q10, q8, q9 /* vertical sum of two rows of pair sums */ | |
| 219 vext.8 d4, d0, d1, #1 | |
| 220 .if \no_rnd | |
| 221 vadd.u16 q10, q10, q11 | |
| 222 .endif | |
| 223 vaddl.u8 q8, d0, d4 /* refresh q8 with the row just loaded */ | |
| 224 \vshrn d5, q10, #2 | |
| 225 vld1.64 {d2, d3}, [r1], r2 | |
| 226 vadd.u16 q10, q8, q9 | |
| 227 pld [r1, r2] | |
| 228 .if \no_rnd | |
| 229 vadd.u16 q10, q10, q11 | |
| 230 .endif | |
| 231 vst1.64 {d5}, [r0,:64], r2 | |
| 232 \vshrn d7, q10, #2 | |
| 233 vext.8 d6, d2, d3, #1 | |
| 234 vaddl.u8 q9, d2, d6 /* refresh q9 for the next pass */ | |
| 235 vst1.64 {d7}, [r0,:64], r2 | |
| 236 bgt 1b | |
| 237 bx lr | |
| 238 .endm | |
| 239 | |
| 240 .macro pixfunc pfx name suf rnd_op args:vararg /* Emits one exported function named ff_<pfx><name><suf>_neon whose body is the macro <name>, invoked with rnd_op followed by any remaining arguments. */ | |
| 241 function ff_\pfx\name\suf\()_neon, export=1 | |
| 242 \name \rnd_op \args | |
| 243 endfunc | |
| 244 .endm | |
| 245 | |
| 246 .macro pixfunc2 pfx name args:vararg /* Emits two functions from the same body macro: one with that macro's defaults, and one that forwards args (suffix, operation, extra flags) to pixfunc. */ | |
| 247 pixfunc \pfx \name | |
| 248 pixfunc \pfx \name \args | |
| 249 .endm | |
| 250 | |
| 251 function ff_put_h264_qpel16_mc00_neon, export=1 /* Entry point that only sets r3 = 16. It has no return instruction, so execution continues into whatever is assembled next, which is the put_ pixels16 function instantiated below (assuming endfunc emits no instructions - confirm in asm.S). */ | |
| 252 mov r3, #16 | |
| 253 endfunc | |
| 254 | |
| 255 pixfunc put_ pixels16 /* must stay directly after the stub above, which falls through into it */ | |
| 256 pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8 /* default variant plus a _no_rnd variant built with vhadd.u8 */ | |
| 257 pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8 | |
| 258 pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1 /* the _no_rnd variant uses vshrn.u16 and sets no_rnd=1 */ | |
| 259 | |
| 260 function ff_avg_h264_qpel16_mc00_neon, export=1 /* Entry point that only sets r3 = 16 and, having no return instruction, continues into the avg_ pixels16 function instantiated below (assuming endfunc emits no instructions - confirm in asm.S). */ | |
| 261 mov r3, #16 | |
| 262 endfunc | |
| 263 | |
| 264 pixfunc avg_ pixels16,, 1 /* empty suffix; the 1 is passed as the avg argument of pixels16 */ | |
| 265 | |
| 266 function ff_put_h264_qpel8_mc00_neon, export=1 /* Entry point that only sets r3 = 8 and, having no return instruction, continues into the put_ pixels8 function instantiated below (assuming endfunc emits no instructions - confirm in asm.S). */ | |
| 267 mov r3, #8 | |
| 268 endfunc | |
| 269 | |
| 270 pixfunc put_ pixels8 /* must stay directly after the stub above, which falls through into it */ | |
| 271 pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8 /* default variant plus a _no_rnd variant built with vhadd.u8 */ | |
| 272 pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8 | |
| 273 pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1 /* the _no_rnd variant uses vshrn.u16 and sets no_rnd=1 */ | |
| 274 | |
| 275 function ff_avg_h264_qpel8_mc00_neon, export=1 /* Entry point that only sets r3 = 8 and, having no return instruction, continues into the avg_ pixels8 function instantiated below (assuming endfunc emits no instructions - confirm in asm.S). */ | |
| 276 mov r3, #8 | |
| 277 endfunc | |
| 278 | |
| 279 pixfunc avg_ pixels8,, 1 /* empty suffix; the 1 is passed as the avg argument of pixels8 */ | |
| 280 | |
| 281 function ff_put_pixels_clamped_neon, export=1 /* Reads 64 signed 16-bit values from [r0] (16-byte aligned), saturates each to an unsigned byte and writes them as eight 8-byte rows to [r1], stepping r1 by r2 per row. */ | |
| 282 vld1.64 {d16-d19}, [r0,:128]! /* each load brings in 16 values, i.e. two output rows */ | |
| 283 vqmovun.s16 d0, q8 /* signed 16-bit to unsigned 8-bit with saturation */ | |
| 284 vld1.64 {d20-d23}, [r0,:128]! | |
| 285 vqmovun.s16 d1, q9 | |
| 286 vld1.64 {d24-d27}, [r0,:128]! | |
| 287 vqmovun.s16 d2, q10 | |
| 288 vld1.64 {d28-d31}, [r0,:128]! | |
| 289 vqmovun.s16 d3, q11 | |
| 290 vst1.64 {d0}, [r1,:64], r2 /* stores are interleaved with the remaining narrowing operations */ | |
| 291 vqmovun.s16 d4, q12 | |
| 292 vst1.64 {d1}, [r1,:64], r2 | |
| 293 vqmovun.s16 d5, q13 | |
| 294 vst1.64 {d2}, [r1,:64], r2 | |
| 295 vqmovun.s16 d6, q14 | |
| 296 vst1.64 {d3}, [r1,:64], r2 | |
| 297 vqmovun.s16 d7, q15 | |
| 298 vst1.64 {d4}, [r1,:64], r2 | |
| 299 vst1.64 {d5}, [r1,:64], r2 | |
| 300 vst1.64 {d6}, [r1,:64], r2 | |
| 301 vst1.64 {d7}, [r1,:64], r2 | |
| 302 bx lr | |
| 303 endfunc | |
| 304 | |
| 305 function ff_put_signed_pixels_clamped_neon, export=1 /* Reads 64 signed 16-bit values from [r0] (16-byte aligned), saturates each to a signed byte, adds 128 and writes them as eight 8-byte rows to [r1], stepping r1 by r2 per row. */ | |
| 306 vmov.u8 d31, #128 /* d31 = per-byte offset that moves the signed range to unsigned */ | |
| 307 vld1.64 {d16-d17}, [r0,:128]! /* each load brings in the 8 values of one output row */ | |
| 308 vqmovn.s16 d0, q8 /* signed 16-bit to signed 8-bit with saturation */ | |
| 309 vld1.64 {d18-d19}, [r0,:128]! | |
| 310 vqmovn.s16 d1, q9 | |
| 311 vld1.64 {d16-d17}, [r0,:128]! /* q8 and q9 are reused once their previous contents have been narrowed */ | |
| 312 vqmovn.s16 d2, q8 | |
| 313 vld1.64 {d18-d19}, [r0,:128]! | |
| 314 vadd.u8 d0, d0, d31 /* modular byte add of 128 */ | |
| 315 vld1.64 {d20-d21}, [r0,:128]! | |
| 316 vadd.u8 d1, d1, d31 | |
| 317 vld1.64 {d22-d23}, [r0,:128]! | |
| 318 vadd.u8 d2, d2, d31 | |
| 319 vst1.64 {d0}, [r1,:64], r2 | |
| 320 vqmovn.s16 d3, q9 | |
| 321 vst1.64 {d1}, [r1,:64], r2 | |
| 322 vqmovn.s16 d4, q10 | |
| 323 vst1.64 {d2}, [r1,:64], r2 | |
| 324 vqmovn.s16 d5, q11 | |
| 325 vld1.64 {d24-d25}, [r0,:128]! | |
| 326 vadd.u8 d3, d3, d31 | |
| 327 vld1.64 {d26-d27}, [r0,:128]! | |
| 328 vadd.u8 d4, d4, d31 | |
| 329 vadd.u8 d5, d5, d31 | |
| 330 vst1.64 {d3}, [r1,:64], r2 | |
| 331 vqmovn.s16 d6, q12 | |
| 332 vst1.64 {d4}, [r1,:64], r2 | |
| 333 vqmovn.s16 d7, q13 | |
| 334 vst1.64 {d5}, [r1,:64], r2 | |
| 335 vadd.u8 d6, d6, d31 | |
| 336 vadd.u8 d7, d7, d31 | |
| 337 vst1.64 {d6}, [r1,:64], r2 | |
| 338 vst1.64 {d7}, [r1,:64], r2 | |
| 339 bx lr | |
| 340 endfunc | |
| 341 | |
| 342 function ff_add_pixels_clamped_neon, export=1 /* Adds 64 signed 16-bit values read from [r0] to an 8x8 block of unsigned bytes at [r1] (row step r2), saturates each sum to an unsigned byte and writes the result back over the same rows. */ | |
| 343 mov r3, r1 /* r3 = write cursor over the pixel rows; r1 keeps running ahead as the read cursor */ | |
| 344 vld1.64 {d16}, [r1,:64], r2 /* one row of 8 existing pixels */ | |
| 345 vld1.64 {d0-d1}, [r0,:128]! /* the 8 matching 16-bit values */ | |
| 346 vaddw.u8 q0, q0, d16 /* widen the pixels and add them to the 16-bit values */ | |
| 347 vld1.64 {d17}, [r1,:64], r2 | |
| 348 vld1.64 {d2-d3}, [r0,:128]! | |
| 349 vqmovun.s16 d0, q0 /* signed 16-bit to unsigned 8-bit with saturation */ | |
| 350 vld1.64 {d18}, [r1,:64], r2 | |
| 351 vaddw.u8 q1, q1, d17 | |
| 352 vld1.64 {d4-d5}, [r0,:128]! | |
| 353 vaddw.u8 q2, q2, d18 | |
| 354 vst1.64 {d0}, [r3,:64], r2 | |
| 355 vqmovun.s16 d2, q1 | |
| 356 vld1.64 {d19}, [r1,:64], r2 | |
| 357 vld1.64 {d6-d7}, [r0,:128]! | |
| 358 vaddw.u8 q3, q3, d19 | |
| 359 vqmovun.s16 d4, q2 | |
| 360 vst1.64 {d2}, [r3,:64], r2 | |
| 361 vld1.64 {d16}, [r1,:64], r2 /* rows 4 to 7 reuse the same registers as rows 0 to 3 */ | |
| 362 vqmovun.s16 d6, q3 | |
| 363 vld1.64 {d0-d1}, [r0,:128]! | |
| 364 vaddw.u8 q0, q0, d16 | |
| 365 vst1.64 {d4}, [r3,:64], r2 | |
| 366 vld1.64 {d17}, [r1,:64], r2 | |
| 367 vld1.64 {d2-d3}, [r0,:128]! | |
| 368 vaddw.u8 q1, q1, d17 | |
| 369 vst1.64 {d6}, [r3,:64], r2 | |
| 370 vqmovun.s16 d0, q0 | |
| 371 vld1.64 {d18}, [r1,:64], r2 | |
| 372 vld1.64 {d4-d5}, [r0,:128]! | |
| 373 vaddw.u8 q2, q2, d18 | |
| 374 vst1.64 {d0}, [r3,:64], r2 | |
| 375 vqmovun.s16 d2, q1 | |
| 376 vld1.64 {d19}, [r1,:64], r2 | |
| 377 vqmovun.s16 d4, q2 | |
| 378 vld1.64 {d6-d7}, [r0,:128]! | |
| 379 vaddw.u8 q3, q3, d19 | |
| 380 vst1.64 {d2}, [r3,:64], r2 | |
| 381 vqmovun.s16 d6, q3 | |
| 382 vst1.64 {d4}, [r3,:64], r2 | |
| 383 vst1.64 {d6}, [r3,:64], r2 | |
| 384 bx lr | |
| 385 endfunc | |
| 386 | |
| 387 function ff_float_to_int16_neon, export=1 /* Converts r2 floats at [r1] to 16-bit integers at [r0]; both pointers carry 16-byte alignment hints. Each float is converted to signed 32-bit fixed point with 16 fraction bits and the upper 16 bits are kept. The control flow handles r2 in units of 8. */ | |
| 388 subs r2, r2, #8 /* the first 8 inputs are converted up front; flags feed the beq 3f below */ | |
| 389 vld1.64 {d0-d1}, [r1,:128]! | |
| 390 vcvt.s32.f32 q8, q0, #16 /* float to fixed point, 16 fraction bits */ | |
| 391 vld1.64 {d2-d3}, [r1,:128]! | |
| 392 vcvt.s32.f32 q9, q1, #16 | |
| 393 beq 3f /* exactly 8 elements: only the final store remains */ | |
| 394 bics ip, r2, #15 /* ip = remaining count rounded down to a multiple of 16 */ | |
| 395 beq 2f | |
| 396 1: subs ip, ip, #16 /* main loop: 16 outputs stored per pass, 8 more inputs pre-converted for the next one */ | |
| 397 vshrn.s32 d4, q8, #16 /* keep the upper 16 bits of each 32-bit lane */ | |
| 398 vld1.64 {d0-d1}, [r1,:128]! | |
| 399 vcvt.s32.f32 q0, q0, #16 | |
| 400 vshrn.s32 d5, q9, #16 | |
| 401 vld1.64 {d2-d3}, [r1,:128]! | |
| 402 vcvt.s32.f32 q1, q1, #16 | |
| 403 vshrn.s32 d6, q0, #16 | |
| 404 vst1.64 {d4-d5}, [r0,:128]! | |
| 405 vshrn.s32 d7, q1, #16 | |
| 406 vld1.64 {d16-d17},[r1,:128]! | |
| 407 vcvt.s32.f32 q8, q8, #16 | |
| 408 vld1.64 {d18-d19},[r1,:128]! | |
| 409 vcvt.s32.f32 q9, q9, #16 | |
| 410 vst1.64 {d6-d7}, [r0,:128]! | |
| 411 bne 1b | |
| 412 ands r2, r2, #15 /* anything left beyond the 8 values already held in q8/q9? */ | |
| 413 beq 3f | |
| 414 2: vld1.64 {d0-d1}, [r1,:128]! /* tail: the pending 8 values plus 8 more */ | |
| 415 vshrn.s32 d4, q8, #16 | |
| 416 vcvt.s32.f32 q0, q0, #16 | |
| 417 vld1.64 {d2-d3}, [r1,:128]! | |
| 418 vshrn.s32 d5, q9, #16 | |
| 419 vcvt.s32.f32 q1, q1, #16 | |
| 420 vshrn.s32 d6, q0, #16 | |
| 421 vst1.64 {d4-d5}, [r0,:128]! | |
| 422 vshrn.s32 d7, q1, #16 | |
| 423 vst1.64 {d6-d7}, [r0,:128]! | |
| 424 bx lr | |
| 425 3: vshrn.s32 d4, q8, #16 /* flush the last 8 converted values held in q8/q9 */ | |
| 426 vshrn.s32 d5, q9, #16 | |
| 427 vst1.64 {d4-d5}, [r0,:128]! | |
| 428 bx lr | |
| 429 endfunc | |
| 430 | |
| 431 function ff_float_to_int16_interleave_neon, export=1 /* Converts several float arrays to 16-bit integers and interleaves them into one output at [r0]. r1 = array of source pointers, r2 = elements per source, r3 = number of sources (channels). Conversion is the same as in ff_float_to_int16_neon: fixed point with 16 fraction bits, upper 16 bits kept. */ | |
| 432 cmp r3, #2 | |
| 433 ldrlt r1, [r1] /* fewer than 2 channels: load the first source pointer ... */ | |
| 434 blt ff_float_to_int16_neon /* ... and tail-call the plain converter */ | |
| 435 bne 4f /* more than 2 channels: general path further below */ | |
| 436 | |
| 437 ldr r3, [r1] /* exactly 2 channels: r3 = first source, r1 = second source */ | |
| 438 ldr r1, [r1, #4] | |
| 439 | |
| 440 subs r2, r2, #8 /* first 8 elements per channel are converted up front; flags feed the beq 3f below */ | |
| 441 vld1.64 {d0-d1}, [r3,:128]! | |
| 442 vcvt.s32.f32 q8, q0, #16 | |
| 443 vld1.64 {d2-d3}, [r3,:128]! | |
| 444 vcvt.s32.f32 q9, q1, #16 | |
| 445 vld1.64 {d20-d21},[r1,:128]! | |
| 446 vcvt.s32.f32 q10, q10, #16 | |
| 447 vld1.64 {d22-d23},[r1,:128]! | |
| 448 vcvt.s32.f32 q11, q11, #16 | |
| 449 beq 3f | |
| 450 bics ip, r2, #15 /* ip = remaining count rounded down to a multiple of 16 */ | |
| 451 beq 2f | |
| 452 1: subs ip, ip, #16 /* main loop: 16 elements per channel per pass */ | |
| 453 vld1.64 {d0-d1}, [r3,:128]! | |
| 454 vcvt.s32.f32 q0, q0, #16 | |
| 455 vsri.32 q10, q8, #16 /* merge: low halfword of each lane from the first source, high halfword from the second */ | |
| 456 vld1.64 {d2-d3}, [r3,:128]! | |
| 457 vcvt.s32.f32 q1, q1, #16 | |
| 458 vld1.64 {d24-d25},[r1,:128]! | |
| 459 vcvt.s32.f32 q12, q12, #16 | |
| 460 vld1.64 {d26-d27},[r1,:128]! | |
| 461 vsri.32 q11, q9, #16 | |
| 462 vst1.64 {d20-d21},[r0,:128]! | |
| 463 vcvt.s32.f32 q13, q13, #16 | |
| 464 vst1.64 {d22-d23},[r0,:128]! | |
| 465 vsri.32 q12, q0, #16 | |
| 466 vld1.64 {d16-d17},[r3,:128]! | |
| 467 vsri.32 q13, q1, #16 | |
| 468 vst1.64 {d24-d25},[r0,:128]! | |
| 469 vcvt.s32.f32 q8, q8, #16 | |
| 470 vld1.64 {d18-d19},[r3,:128]! | |
| 471 vcvt.s32.f32 q9, q9, #16 | |
| 472 vld1.64 {d20-d21},[r1,:128]! | |
| 473 vcvt.s32.f32 q10, q10, #16 | |
| 474 vld1.64 {d22-d23},[r1,:128]! | |
| 475 vcvt.s32.f32 q11, q11, #16 | |
| 476 vst1.64 {d26-d27},[r0,:128]! | |
| 477 bne 1b | |
| 478 ands r2, r2, #15 /* anything left beyond the 8 elements already held in q8-q11? */ | |
| 479 beq 3f | |
| 480 2: vsri.32 q10, q8, #16 /* tail: the pending 8 elements plus 8 more */ | |
| 481 vld1.64 {d0-d1}, [r3,:128]! | |
| 482 vcvt.s32.f32 q0, q0, #16 | |
| 483 vld1.64 {d2-d3}, [r3,:128]! | |
| 484 vcvt.s32.f32 q1, q1, #16 | |
| 485 vld1.64 {d24-d25},[r1,:128]! | |
| 486 vcvt.s32.f32 q12, q12, #16 | |
| 487 vsri.32 q11, q9, #16 | |
| 488 vld1.64 {d26-d27},[r1,:128]! | |
| 489 vcvt.s32.f32 q13, q13, #16 | |
| 490 vst1.64 {d20-d21},[r0,:128]! | |
| 491 vsri.32 q12, q0, #16 | |
| 492 vst1.64 {d22-d23},[r0,:128]! | |
| 493 vsri.32 q13, q1, #16 | |
| 494 vst1.64 {d24-d27},[r0,:128]! | |
| 495 bx lr | |
| 496 3: vsri.32 q10, q8, #16 /* flush the last 8 pending elements */ | |
| 497 vsri.32 q11, q9, #16 | |
| 498 vst1.64 {d20-d23},[r0,:128]! | |
| 499 bx lr | |
| 500 | |
| 501 4: push {r4-r8,lr} /* general path: sources are consumed in groups of 4, then 2, then 1 */ | |
| 502 cmp r3, #4 | |
| 503 lsl ip, r3, #1 /* ip = 2 * channels = byte distance between consecutive output frames */ | |
| 504 blt 4f | |
| 505 | |
| 506 @ 4 channels | |
| 507 5: ldmia r1!, {r4-r7} /* next four source pointers */ | |
| 508 mov lr, r2 /* lr = per-channel element counter for this group */ | |
| 509 mov r8, r0 /* r8 = output cursor; r0 keeps the group base */ | |
| 510 vld1.64 {d16-d17},[r4,:128]! | |
| 511 vcvt.s32.f32 q8, q8, #16 | |
| 512 vld1.64 {d18-d19},[r5,:128]! | |
| 513 vcvt.s32.f32 q9, q9, #16 | |
| 514 vld1.64 {d20-d21},[r6,:128]! | |
| 515 vcvt.s32.f32 q10, q10, #16 | |
| 516 vld1.64 {d22-d23},[r7,:128]! | |
| 517 vcvt.s32.f32 q11, q11, #16 | |
| 518 6: subs lr, lr, #8 /* 8 elements per channel per pass; flags feed the beq 7f below */ | |
| 519 vld1.64 {d0-d1}, [r4,:128]! | |
| 520 vcvt.s32.f32 q0, q0, #16 | |
| 521 vsri.32 q9, q8, #16 /* pair up the first two sources of the group */ | |
| 522 vld1.64 {d2-d3}, [r5,:128]! | |
| 523 vcvt.s32.f32 q1, q1, #16 | |
| 524 vsri.32 q11, q10, #16 /* pair up the last two sources of the group */ | |
| 525 vld1.64 {d4-d5}, [r6,:128]! | |
| 526 vcvt.s32.f32 q2, q2, #16 | |
| 527 vzip.32 d18, d22 /* interleave the two pairs so each d register holds one 4-sample frame */ | |
| 528 vld1.64 {d6-d7}, [r7,:128]! | |
| 529 vcvt.s32.f32 q3, q3, #16 | |
| 530 vzip.32 d19, d23 | |
| 531 vst1.64 {d18}, [r8], ip /* one frame (4 x 16 bit) per store, advancing by the frame size */ | |
| 532 vsri.32 q1, q0, #16 | |
| 533 vst1.64 {d22}, [r8], ip | |
| 534 vsri.32 q3, q2, #16 | |
| 535 vst1.64 {d19}, [r8], ip | |
| 536 vzip.32 d2, d6 | |
| 537 vst1.64 {d23}, [r8], ip | |
| 538 vzip.32 d3, d7 | |
| 539 beq 7f /* counter exhausted: store the last four frames without loading more */ | |
| 540 vld1.64 {d16-d17},[r4,:128]! | |
| 541 vcvt.s32.f32 q8, q8, #16 | |
| 542 vst1.64 {d2}, [r8], ip | |
| 543 vld1.64 {d18-d19},[r5,:128]! | |
| 544 vcvt.s32.f32 q9, q9, #16 | |
| 545 vst1.64 {d6}, [r8], ip | |
| 546 vld1.64 {d20-d21},[r6,:128]! | |
| 547 vcvt.s32.f32 q10, q10, #16 | |
| 548 vst1.64 {d3}, [r8], ip | |
| 549 vld1.64 {d22-d23},[r7,:128]! | |
| 550 vcvt.s32.f32 q11, q11, #16 | |
| 551 vst1.64 {d7}, [r8], ip | |
| 552 b 6b | |
| 553 7: vst1.64 {d2}, [r8], ip | |
| 554 vst1.64 {d6}, [r8], ip | |
| 555 vst1.64 {d3}, [r8], ip | |
| 556 vst1.64 {d7}, [r8], ip | |
| 557 subs r3, r3, #4 /* four channels done */ | |
| 558 popeq {r4-r8,pc} /* return if no channels remain */ | |
| 559 cmp r3, #4 | |
| 560 add r0, r0, #8 /* move the base past the 4 samples just interleaved; add leaves the cmp flags intact */ | |
| 561 bge 5b | |
| 562 | |
| 563 @ 2 channels | |
| 564 4: cmp r3, #2 | |
| 565 blt 4f | |
| 566 ldmia r1!, {r4-r5} /* next two source pointers */ | |
| 567 mov lr, r2 | |
| 568 mov r8, r0 | |
| 569 tst lr, #8 /* is there an odd group of 8 elements? flags feed the beq 6f below */ | |
| 570 vld1.64 {d16-d17},[r4,:128]! | |
| 571 vcvt.s32.f32 q8, q8, #16 | |
| 572 vld1.64 {d18-d19},[r5,:128]! | |
| 573 vcvt.s32.f32 q9, q9, #16 | |
| 574 vld1.64 {d20-d21},[r4,:128]! | |
| 575 vcvt.s32.f32 q10, q10, #16 | |
| 576 vld1.64 {d22-d23},[r5,:128]! | |
| 577 vcvt.s32.f32 q11, q11, #16 | |
| 578 beq 6f /* bit 3 clear: go straight to the 16-per-pass loop */ | |
| 579 subs lr, lr, #8 | |
| 580 beq 7f /* count was exactly 8: store and finish this pair */ | |
| 581 vsri.32 d18, d16, #16 /* odd group of 8: merge, store, and reload for the main loop */ | |
| 582 vsri.32 d19, d17, #16 | |
| 583 vld1.64 {d16-d17},[r4,:128]! | |
| 584 vcvt.s32.f32 q8, q8, #16 | |
| 585 vst1.32 {d18[0]}, [r8], ip /* one 32-bit lane = one sample from each of the two sources */ | |
| 586 vsri.32 d22, d20, #16 | |
| 587 vst1.32 {d18[1]}, [r8], ip | |
| 588 vsri.32 d23, d21, #16 | |
| 589 vst1.32 {d19[0]}, [r8], ip | |
| 590 vst1.32 {d19[1]}, [r8], ip | |
| 591 vld1.64 {d18-d19},[r5,:128]! | |
| 592 vcvt.s32.f32 q9, q9, #16 | |
| 593 vst1.32 {d22[0]}, [r8], ip | |
| 594 vst1.32 {d22[1]}, [r8], ip | |
| 595 vld1.64 {d20-d21},[r4,:128]! | |
| 596 vcvt.s32.f32 q10, q10, #16 | |
| 597 vst1.32 {d23[0]}, [r8], ip | |
| 598 vst1.32 {d23[1]}, [r8], ip | |
| 599 vld1.64 {d22-d23},[r5,:128]! | |
| 600 vcvt.s32.f32 q11, q11, #16 | |
| 601 6: subs lr, lr, #16 /* main loop: 16 elements per channel per pass; flags feed the beq/bgt below */ | |
| 602 vld1.64 {d0-d1}, [r4,:128]! | |
| 603 vcvt.s32.f32 q0, q0, #16 | |
| 604 vsri.32 d18, d16, #16 | |
| 605 vld1.64 {d2-d3}, [r5,:128]! | |
| 606 vcvt.s32.f32 q1, q1, #16 | |
| 607 vsri.32 d19, d17, #16 | |
| 608 vld1.64 {d4-d5}, [r4,:128]! | |
| 609 vcvt.s32.f32 q2, q2, #16 | |
| 610 vld1.64 {d6-d7}, [r5,:128]! | |
| 611 vcvt.s32.f32 q3, q3, #16 | |
| 612 vst1.32 {d18[0]}, [r8], ip | |
| 613 vsri.32 d22, d20, #16 | |
| 614 vst1.32 {d18[1]}, [r8], ip | |
| 615 vsri.32 d23, d21, #16 | |
| 616 vst1.32 {d19[0]}, [r8], ip | |
| 617 vsri.32 d2, d0, #16 | |
| 618 vst1.32 {d19[1]}, [r8], ip | |
| 619 vsri.32 d3, d1, #16 | |
| 620 vst1.32 {d22[0]}, [r8], ip | |
| 621 vsri.32 d6, d4, #16 | |
| 622 vst1.32 {d22[1]}, [r8], ip | |
| 623 vsri.32 d7, d5, #16 | |
| 624 vst1.32 {d23[0]}, [r8], ip | |
| 625 vst1.32 {d23[1]}, [r8], ip | |
| 626 beq 6f /* counter exhausted: store the second half without loading more */ | |
| 627 vld1.64 {d16-d17},[r4,:128]! | |
| 628 vcvt.s32.f32 q8, q8, #16 | |
| 629 vst1.32 {d2[0]}, [r8], ip | |
| 630 vst1.32 {d2[1]}, [r8], ip | |
| 631 vld1.64 {d18-d19},[r5,:128]! | |
| 632 vcvt.s32.f32 q9, q9, #16 | |
| 633 vst1.32 {d3[0]}, [r8], ip | |
| 634 vst1.32 {d3[1]}, [r8], ip | |
| 635 vld1.64 {d20-d21},[r4,:128]! | |
| 636 vcvt.s32.f32 q10, q10, #16 | |
| 637 vst1.32 {d6[0]}, [r8], ip | |
| 638 vst1.32 {d6[1]}, [r8], ip | |
| 639 vld1.64 {d22-d23},[r5,:128]! | |
| 640 vcvt.s32.f32 q11, q11, #16 | |
| 641 vst1.32 {d7[0]}, [r8], ip | |
| 642 vst1.32 {d7[1]}, [r8], ip | |
| 643 bgt 6b | |
| 644 6: vst1.32 {d2[0]}, [r8], ip | |
| 645 vst1.32 {d2[1]}, [r8], ip | |
| 646 vst1.32 {d3[0]}, [r8], ip | |
| 647 vst1.32 {d3[1]}, [r8], ip | |
| 648 vst1.32 {d6[0]}, [r8], ip | |
| 649 vst1.32 {d6[1]}, [r8], ip | |
| 650 vst1.32 {d7[0]}, [r8], ip | |
| 651 vst1.32 {d7[1]}, [r8], ip | |
| 652 b 8f | |
| 653 7: vsri.32 d18, d16, #16 /* count was exactly 8: merge and store the 8 pre-converted frames */ | |
| 654 vsri.32 d19, d17, #16 | |
| 655 vst1.32 {d18[0]}, [r8], ip | |
| 656 vsri.32 d22, d20, #16 | |
| 657 vst1.32 {d18[1]}, [r8], ip | |
| 658 vsri.32 d23, d21, #16 | |
| 659 vst1.32 {d19[0]}, [r8], ip | |
| 660 vst1.32 {d19[1]}, [r8], ip | |
| 661 vst1.32 {d22[0]}, [r8], ip | |
| 662 vst1.32 {d22[1]}, [r8], ip | |
| 663 vst1.32 {d23[0]}, [r8], ip | |
| 664 vst1.32 {d23[1]}, [r8], ip | |
| 665 8: subs r3, r3, #2 /* two channels done */ | |
| 666 add r0, r0, #4 /* move the base past the 2 samples just interleaved; add leaves the subs flags intact */ | |
| 667 popeq {r4-r8,pc} | |
| 668 | |
| 669 @ 1 channel | |
| 670 4: ldr r4, [r1],#4 /* last remaining source pointer */ | |
| 671 tst r2, #8 /* odd group of 8 elements? flags feed the bne 8f below */ | |
| 672 mov lr, r2 | |
| 673 mov r5, r0 | |
| 674 vld1.64 {d0-d1}, [r4,:128]! | |
| 675 vcvt.s32.f32 q0, q0, #16 | |
| 676 vld1.64 {d2-d3}, [r4,:128]! | |
| 677 vcvt.s32.f32 q1, q1, #16 | |
| 678 bne 8f | |
| 679 6: subs lr, lr, #16 /* main loop: 16 elements per pass; flags feed the beq 7f and bgt 6b below */ | |
| 680 vld1.64 {d4-d5}, [r4,:128]! | |
| 681 vcvt.s32.f32 q2, q2, #16 | |
| 682 vld1.64 {d6-d7}, [r4,:128]! | |
| 683 vcvt.s32.f32 q3, q3, #16 | |
| 684 vst1.16 {d0[1]}, [r5,:16], ip /* odd 16-bit lanes are the upper halves of the 32-bit results */ | |
| 685 vst1.16 {d0[3]}, [r5,:16], ip | |
| 686 vst1.16 {d1[1]}, [r5,:16], ip | |
| 687 vst1.16 {d1[3]}, [r5,:16], ip | |
| 688 vst1.16 {d2[1]}, [r5,:16], ip | |
| 689 vst1.16 {d2[3]}, [r5,:16], ip | |
| 690 vst1.16 {d3[1]}, [r5,:16], ip | |
| 691 vst1.16 {d3[3]}, [r5,:16], ip | |
| 692 beq 7f /* counter exhausted: skip the reload */ | |
| 693 vld1.64 {d0-d1}, [r4,:128]! | |
| 694 vcvt.s32.f32 q0, q0, #16 | |
| 695 vld1.64 {d2-d3}, [r4,:128]! | |
| 696 vcvt.s32.f32 q1, q1, #16 | |
| 697 7: vst1.16 {d4[1]}, [r5,:16], ip | |
| 698 vst1.16 {d4[3]}, [r5,:16], ip | |
| 699 vst1.16 {d5[1]}, [r5,:16], ip | |
| 700 vst1.16 {d5[3]}, [r5,:16], ip | |
| 701 vst1.16 {d6[1]}, [r5,:16], ip | |
| 702 vst1.16 {d6[3]}, [r5,:16], ip | |
| 703 vst1.16 {d7[1]}, [r5,:16], ip | |
| 704 vst1.16 {d7[3]}, [r5,:16], ip | |
| 705 bgt 6b | |
| 706 pop {r4-r8,pc} | |
| 707 8: subs lr, lr, #8 /* odd group of 8: store it, then join the main loop if more remains */ | |
| 708 vst1.16 {d0[1]}, [r5,:16], ip | |
| 709 vst1.16 {d0[3]}, [r5,:16], ip | |
| 710 vst1.16 {d1[1]}, [r5,:16], ip | |
| 711 vst1.16 {d1[3]}, [r5,:16], ip | |
| 712 vst1.16 {d2[1]}, [r5,:16], ip | |
| 713 vst1.16 {d2[3]}, [r5,:16], ip | |
| 714 vst1.16 {d3[1]}, [r5,:16], ip | |
| 715 vst1.16 {d3[3]}, [r5,:16], ip | |
| 716 popeq {r4-r8,pc} | |
| 717 vld1.64 {d0-d1}, [r4,:128]! | |
| 718 vcvt.s32.f32 q0, q0, #16 | |
| 719 vld1.64 {d2-d3}, [r4,:128]! | |
| 720 vcvt.s32.f32 q1, q1, #16 | |
| 721 b 6b | |
| 722 endfunc | |
| 723 | |
| 724 function ff_vector_fmul_neon, export=1 /* In-place element-wise product of two float arrays: each element at [r0] is multiplied by the matching element at [r1] and written back. r2 = element count; the control flow handles it in units of 8. Both pointers carry 16-byte alignment hints. */ | |
| 725 mov r3, r0 /* r3 = write cursor; r0 runs ahead as the read cursor */ | |
| 726 subs r2, r2, #8 /* the first 8 products are computed up front; flags feed the beq 3f below */ | |
| 727 vld1.64 {d0-d3}, [r0,:128]! | |
| 728 vld1.64 {d4-d7}, [r1,:128]! | |
| 729 vmul.f32 q8, q0, q2 | |
| 730 vmul.f32 q9, q1, q3 | |
| 731 beq 3f /* exactly 8 elements: only the final store remains */ | |
| 732 bics ip, r2, #15 /* ip = remaining count rounded down to a multiple of 16 */ | |
| 733 beq 2f | |
| 734 1: subs ip, ip, #16 /* main loop: 16 products stored per pass, 8 more computed ahead */ | |
| 735 vld1.64 {d0-d1}, [r0,:128]! | |
| 736 vld1.64 {d4-d5}, [r1,:128]! | |
| 737 vmul.f32 q10, q0, q2 | |
| 738 vld1.64 {d2-d3}, [r0,:128]! | |
| 739 vld1.64 {d6-d7}, [r1,:128]! | |
| 740 vmul.f32 q11, q1, q3 | |
| 741 vst1.64 {d16-d19},[r3,:128]! | |
| 742 vld1.64 {d0-d1}, [r0,:128]! | |
| 743 vld1.64 {d4-d5}, [r1,:128]! | |
| 744 vmul.f32 q8, q0, q2 | |
| 745 vld1.64 {d2-d3}, [r0,:128]! | |
| 746 vld1.64 {d6-d7}, [r1,:128]! | |
| 747 vmul.f32 q9, q1, q3 | |
| 748 vst1.64 {d20-d23},[r3,:128]! | |
| 749 bne 1b | |
| 750 ands r2, r2, #15 /* anything left beyond the 8 products already held in q8/q9? */ | |
| 751 beq 3f | |
| 752 2: vld1.64 {d0-d1}, [r0,:128]! /* tail: store the pending 8 products and compute 8 more */ | |
| 753 vld1.64 {d4-d5}, [r1,:128]! | |
| 754 vst1.64 {d16-d17},[r3,:128]! | |
| 755 vmul.f32 q8, q0, q2 | |
| 756 vld1.64 {d2-d3}, [r0,:128]! | |
| 757 vld1.64 {d6-d7}, [r1,:128]! | |
| 758 vst1.64 {d18-d19},[r3,:128]! | |
| 759 vmul.f32 q9, q1, q3 | |
| 760 3: vst1.64 {d16-d19},[r3,:128]! /* flush the last 8 products */ | |
| 761 bx lr | |
| 762 endfunc | |
| 763 | |
| 764 function ff_vector_fmul_window_neon, export=1 /* Windowed multiply that fills an output buffer from both ends at once: r0 = output, r1 and r2 = the two inputs, r3 = window, plus a float scalar (broadcast into q8 and used as the starting value of every accumulator) and a count on the stack. NOTE(review): VFP/NOVFP are line prefixes defined outside the visible source; they presumably select hard-float versus soft-float argument passing - confirm in asm.S. */ | |
| 765 VFP vdup.32 q8, d0[0] | |
| 766 NOVFP vld1.32 {d16[],d17[]}, [sp,:32] | |
| 767 push {r4,r5,lr} | |
| 768 VFP ldr lr, [sp, #12] /* count from the stack, just above the three registers pushed here */ | |
| 769 NOVFP ldr lr, [sp, #16] /* one word further up, because the scalar occupies the first stack slot */ | |
| 770 sub r2, r2, #8 | |
| 771 sub r5, lr, #2 | |
| 772 add r2, r2, r5, lsl #2 /* r2 now points 16 bytes before the end of a count-float array: it is read backwards */ | |
| 773 add r4, r3, r5, lsl #3 /* r4 = r3 + 8*count - 16: backward cursor over the upper end of the window */ | |
| 774 add ip, r0, r5, lsl #3 /* ip = r0 + 8*count - 16: backward cursor over the upper end of the output */ | |
| 775 mov r5, #-16 /* step for the backward cursors */ | |
| 776 vld1.64 {d0,d1}, [r1,:128]! | |
| 777 vld1.64 {d2,d3}, [r2,:128], r5 | |
| 778 vld1.64 {d4,d5}, [r3,:128]! | |
| 779 vld1.64 {d6,d7}, [r4,:128], r5 | |
| 780 1: subs lr, lr, #4 /* four elements from each end per pass; flags feed the beq 2f below */ | |
| 781 vmov q11, q8 /* accumulators start from the broadcast scalar */ | |
| 782 vmla.f32 d22, d0, d4 | |
| 783 vmov q10, q8 | |
| 784 vmla.f32 d23, d1, d5 | |
| 785 vrev64.32 q3, q3 /* reverse within each 64-bit half; the d-register pairing below completes the reversal of the backward-read data */ | |
| 786 vmla.f32 d20, d0, d7 | |
| 787 vrev64.32 q1, q1 | |
| 788 vmla.f32 d21, d1, d6 | |
| 789 beq 2f /* last pass: finish without loading further data */ | |
| 790 vmla.f32 d22, d3, d7 | |
| 791 vld1.64 {d0,d1}, [r1,:128]! | |
| 792 vmla.f32 d23, d2, d6 | |
| 793 vld1.64 {d18,d19},[r2,:128], r5 /* next backward block is staged in q9 because q1 is still in use */ | |
| 794 vmls.f32 d20, d3, d4 | |
| 795 vld1.64 {d24,d25},[r3,:128]! /* next window block is staged in q12 because q2 is still in use */ | |
| 796 vmls.f32 d21, d2, d5 | |
| 797 vld1.64 {d6,d7}, [r4,:128], r5 | |
| 798 vmov q1, q9 | |
| 799 vrev64.32 q11, q11 /* reverse the upper-end results before storing them backwards */ | |
| 800 vmov q2, q12 | |
| 801 vswp d22, d23 | |
| 802 vst1.64 {d20,d21},[r0,:128]! | |
| 803 vst1.64 {d22,d23},[ip,:128], r5 | |
| 804 b 1b | |
| 805 2: vmla.f32 d22, d3, d7 | |
| 806 vmla.f32 d23, d2, d6 | |
| 807 vmls.f32 d20, d3, d4 | |
| 808 vmls.f32 d21, d2, d5 | |
| 809 vrev64.32 q11, q11 | |
| 810 vswp d22, d23 | |
| 811 vst1.64 {d20,d21},[r0,:128]! | |
| 812 vst1.64 {d22,d23},[ip,:128], r5 | |
| 813 pop {r4,r5,pc} | |
| 814 endfunc | |
| 815 | |
| 816 #if CONFIG_VORBIS_DECODER | |
| 817 function ff_vorbis_inverse_coupling_neon, export=1 /* In-place transform of two float arrays at [r0] and [r1], r2 elements each, four at a time. For each pair the [r1] value has its sign flipped when the [r0] value is negative; if the original [r1] value compared <= 0 as an integer, the flipped value is added to the [r0] value, otherwise it is subtracted from it to form the new [r1] value. */ | |
| 818 vmov.i32 q10, #1<<31 /* q10 = sign-bit mask for every 32-bit lane */ | |
| 819 subs r2, r2, #4 /* flags feed the beq 3f below (mov does not alter them) */ | |
| 820 mov r3, r0 /* r3 and r12 = write cursors; r0 and r1 run ahead as read cursors */ | |
| 821 mov r12, r1 | |
| 822 beq 3f /* exactly 4 elements: single block handled at label 3 */ | |
| 823 | |
| 824 vld1.32 {d24-d25},[r1,:128]! | |
| 825 vld1.32 {d22-d23},[r0,:128]! | |
| 826 vcle.s32 q8, q12, #0 /* q8 = all-ones lanes where the [r1] value is <= 0 as an integer */ | |
| 827 vand q9, q11, q10 /* q9 = sign bit of the [r0] value */ | |
| 828 veor q12, q12, q9 /* flip the sign of the [r1] value when the [r0] value is negative */ | |
| 829 vand q2, q12, q8 /* q2 = flipped value where the mask is set, else 0 */ | |
| 830 vbic q3, q12, q8 /* q3 = flipped value where the mask is clear, else 0 */ | |
| 831 vadd.f32 q12, q11, q2 /* new [r0] value */ | |
| 832 vsub.f32 q11, q11, q3 /* new [r1] value */ | |
| 833 1: vld1.32 {d2-d3}, [r1,:128]! /* loop: compute the next block while storing the previous one */ | |
| 834 vld1.32 {d0-d1}, [r0,:128]! | |
| 835 vcle.s32 q8, q1, #0 | |
| 836 vand q9, q0, q10 | |
| 837 veor q1, q1, q9 | |
| 838 vst1.32 {d24-d25},[r3, :128]! | |
| 839 vst1.32 {d22-d23},[r12,:128]! | |
| 840 vand q2, q1, q8 | |
| 841 vbic q3, q1, q8 | |
| 842 vadd.f32 q1, q0, q2 | |
| 843 vsub.f32 q0, q0, q3 | |
| 844 subs r2, r2, #8 /* two blocks of 4 accounted for per full pass */ | |
| 845 ble 2f | |
| 846 vld1.32 {d24-d25},[r1,:128]! | |
| 847 vld1.32 {d22-d23},[r0,:128]! | |
| 848 vcle.s32 q8, q12, #0 | |
| 849 vand q9, q11, q10 | |
| 850 veor q12, q12, q9 | |
| 851 vst1.32 {d2-d3}, [r3, :128]! | |
| 852 vst1.32 {d0-d1}, [r12,:128]! | |
| 853 vand q2, q12, q8 | |
| 854 vbic q3, q12, q8 | |
| 855 vadd.f32 q12, q11, q2 | |
| 856 vsub.f32 q11, q11, q3 | |
| 857 b 1b | |
| 858 | |
| 859 2: vst1.32 {d2-d3}, [r3, :128]! | |
| 860 vst1.32 {d0-d1}, [r12,:128]! | |
| 861 bxlt lr /* counter went negative: done; if it hit zero exactly, one more block follows */ | |
| 862 | |
| 863 3: vld1.32 {d2-d3}, [r1,:128] /* final block: read and written through r0/r1 directly */ | |
| 864 vld1.32 {d0-d1}, [r0,:128] | |
| 865 vcle.s32 q8, q1, #0 | |
| 866 vand q9, q0, q10 | |
| 867 veor q1, q1, q9 | |
| 868 vand q2, q1, q8 | |
| 869 vbic q3, q1, q8 | |
| 870 vadd.f32 q1, q0, q2 | |
| 871 vsub.f32 q0, q0, q3 | |
| 872 vst1.32 {d2-d3}, [r0,:128]! | |
| 873 vst1.32 {d0-d1}, [r1,:128]! | |
| 874 bx lr | |
| 875 endfunc | |
| 876 #endif | |
| 877 | |
| 878 function ff_vector_fmul_scalar_neon, export=1 /* Multiplies a float array at [r1] by one scalar and writes the products to [r0]. The scalar arrives in d0[0] (VFP lines) or r2 (NOVFP lines) and the element count in r2 or r3 respectively. NOTE(review): VFP/NOVFP are defined outside the visible source - presumably hard-float versus soft-float argument passing. */ | |
| 879 VFP len .req r2 | |
| 880 NOVFP len .req r3 | |
| 881 VFP vdup.32 q8, d0[0] /* q8 = scalar broadcast to all four lanes */ | |
| 882 NOVFP vdup.32 q8, r2 | |
| 883 bics r12, len, #15 /* r12 = count rounded down to a multiple of 16 */ | |
| 884 beq 3f /* fewer than 16 elements: only the 4-at-a-time loop runs */ | |
| 885 vld1.32 {q0},[r1,:128]! | |
| 886 vld1.32 {q1},[r1,:128]! | |
| 887 1: vmul.f32 q0, q0, q8 /* main loop: 16 elements per pass */ | |
| 888 vld1.32 {q2},[r1,:128]! | |
| 889 vmul.f32 q1, q1, q8 | |
| 890 vld1.32 {q3},[r1,:128]! | |
| 891 vmul.f32 q2, q2, q8 | |
| 892 vst1.32 {q0},[r0,:128]! | |
| 893 vmul.f32 q3, q3, q8 | |
| 894 vst1.32 {q1},[r0,:128]! | |
| 895 subs r12, r12, #16 | |
| 896 beq 2f /* last full pass: store q2/q3 without loading more */ | |
| 897 vld1.32 {q0},[r1,:128]! | |
| 898 vst1.32 {q2},[r0,:128]! | |
| 899 vld1.32 {q1},[r1,:128]! | |
| 900 vst1.32 {q3},[r0,:128]! | |
| 901 b 1b | |
| 902 2: vst1.32 {q2},[r0,:128]! | |
| 903 vst1.32 {q3},[r0,:128]! | |
| 904 ands len, len, #15 /* leftover elements after the 16-wide passes */ | |
| 905 bxeq lr | |
| 906 3: vld1.32 {q0},[r1,:128]! /* tail loop: 4 elements per pass */ | |
| 907 vmul.f32 q0, q0, q8 | |
| 908 vst1.32 {q0},[r0,:128]! | |
| 909 subs len, len, #4 | |
| 910 bgt 3b | |
| 911 bx lr | |
| 912 .unreq len | |
| 913 endfunc | |
| 914 | |
| 915 function ff_vector_fmul_sv_scalar_2_neon, export=1 /* For each group of 2 floats: output at [r0] = input at [r1] times a scalar times a 2-float vector whose address is fetched from the pointer array at [r2]. Four elements (two pointers) per pass; r3 = element count. NOTE(review): VFP/NOVFP are defined outside the visible source - presumably hard-float versus soft-float argument passing. */ | |
| 916 VFP vdup.32 d16, d0[0] /* d16 = scalar in both lanes */ | |
| 917 NOVFP vdup.32 d16, r3 | |
| 918 NOVFP ldr r3, [sp] /* on NOVFP lines r3 held the scalar, so the count is fetched from the stack afterwards */ | |
| 919 vld1.32 {d0},[r1,:64]! | |
| 920 vld1.32 {d1},[r1,:64]! | |
| 921 1: subs r3, r3, #4 /* flags feed the beq 2f below; ldr and NEON ops leave them intact */ | |
| 922 vmul.f32 d4, d0, d16 | |
| 923 vmul.f32 d5, d1, d16 | |
| 924 ldr r12, [r2], #4 /* next vector pointer */ | |
| 925 vld1.32 {d2},[r12,:64] | |
| 926 ldr r12, [r2], #4 | |
| 927 vld1.32 {d3},[r12,:64] | |
| 928 vmul.f32 d4, d4, d2 | |
| 929 vmul.f32 d5, d5, d3 | |
| 930 beq 2f /* last pass: store without loading further input */ | |
| 931 vld1.32 {d0},[r1,:64]! | |
| 932 vld1.32 {d1},[r1,:64]! | |
| 933 vst1.32 {d4},[r0,:64]! | |
| 934 vst1.32 {d5},[r0,:64]! | |
| 935 b 1b | |
| 936 2: vst1.32 {d4},[r0,:64]! | |
| 937 vst1.32 {d5},[r0,:64]! | |
| 938 bx lr | |
| 939 endfunc | |
| 940 | |
| 941 function ff_vector_fmul_sv_scalar_4_neon, export=1 /* For each group of 4 floats: output at [r0] = input at [r1] times a scalar times a 4-float vector whose address is fetched from the pointer array at [r2]. r3 = element count: 8 per pass in the main loop, 4 per pass in the tail. NOTE(review): VFP/NOVFP are defined outside the visible source - presumably hard-float versus soft-float argument passing. */ | |
| 942 VFP vdup.32 q10, d0[0] /* q10 = scalar broadcast to all four lanes */ | |
| 943 NOVFP vdup.32 q10, r3 | |
| 944 NOVFP ldr r3, [sp] /* on NOVFP lines r3 held the scalar, so the count is fetched from the stack before the push below moves sp */ | |
| 945 push {lr} /* lr is saved so it can serve as the main-loop counter */ | |
| 946 bics lr, r3, #7 /* lr = count rounded down to a multiple of 8 */ | |
| 947 beq 3f | |
| 948 vld1.32 {q0},[r1,:128]! | |
| 949 vld1.32 {q2},[r1,:128]! | |
| 950 1: ldr r12, [r2], #4 /* next vector pointer */ | |
| 951 vld1.32 {q1},[r12,:128] | |
| 952 ldr r12, [r2], #4 | |
| 953 vld1.32 {q3},[r12,:128] | |
| 954 vmul.f32 q8, q0, q10 | |
| 955 vmul.f32 q8, q8, q1 | |
| 956 vmul.f32 q9, q2, q10 | |
| 957 vmul.f32 q9, q9, q3 | |
| 958 subs lr, lr, #8 | |
| 959 beq 2f /* last full pass: store without loading further input */ | |
| 960 vld1.32 {q0},[r1,:128]! | |
| 961 vld1.32 {q2},[r1,:128]! | |
| 962 vst1.32 {q8},[r0,:128]! | |
| 963 vst1.32 {q9},[r0,:128]! | |
| 964 b 1b | |
| 965 2: vst1.32 {q8},[r0,:128]! | |
| 966 vst1.32 {q9},[r0,:128]! | |
| 967 ands r3, r3, #7 /* leftover elements after the 8-wide passes */ | |
| 968 popeq {pc} | |
| 969 3: vld1.32 {q0},[r1,:128]! /* tail loop: 4 elements per pass */ | |
| 970 ldr r12, [r2], #4 | |
| 971 vld1.32 {q1},[r12,:128] | |
| 972 vmul.f32 q0, q0, q10 | |
| 973 vmul.f32 q0, q0, q1 | |
| 974 vst1.32 {q0},[r0,:128]! | |
| 975 subs r3, r3, #4 | |
| 976 bgt 3b | |
| 977 pop {pc} | |
| 978 endfunc | |
| 979 | |
| 980 function ff_sv_fmul_scalar_2_neon, export=1 /* Gathers 2-float vectors through the pointer array at [r1], multiplies them by one scalar and writes the products contiguously to [r0]. Four elements (two pointers) per pass. The scalar arrives in d0[0] or r2 and the count in r2 or r3 (VFP or NOVFP lines). NOTE(review): VFP/NOVFP are defined outside the visible source. */ | |
| 981 VFP len .req r2 | |
| 982 NOVFP len .req r3 | |
| 983 VFP vdup.32 q8, d0[0] /* q8 = scalar broadcast to all four lanes */ | |
| 984 NOVFP vdup.32 q8, r2 | |
| 985 ldr r12, [r1], #4 /* next vector pointer */ | |
| 986 vld1.32 {d0},[r12,:64] | |
| 987 ldr r12, [r1], #4 | |
| 988 vld1.32 {d1},[r12,:64] | |
| 989 1: vmul.f32 q1, q0, q8 | |
| 990 subs len, len, #4 | |
| 991 beq 2f /* last pass: store without gathering further input */ | |
| 992 ldr r12, [r1], #4 | |
| 993 vld1.32 {d0},[r12,:64] | |
| 994 ldr r12, [r1], #4 | |
| 995 vld1.32 {d1},[r12,:64] | |
| 996 vst1.32 {q1},[r0,:128]! | |
| 997 b 1b | |
| 998 2: vst1.32 {q1},[r0,:128]! | |
| 999 bx lr | |
| 1000 .unreq len | |
| 1001 endfunc | |
| 1002 | |
| 1003 function ff_sv_fmul_scalar_4_neon, export=1 /* Gathers 4-float vectors through the pointer array at [r1], multiplies each by one scalar and writes the products contiguously to [r0]. One pointer (4 elements) per pass. The scalar arrives in d0[0] or r2 and the count in r2 or r3 (VFP or NOVFP lines). NOTE(review): VFP/NOVFP are defined outside the visible source. */ | |
| 1004 VFP len .req r2 | |
| 1005 NOVFP len .req r3 | |
| 1006 VFP vdup.32 q8, d0[0] /* q8 = scalar broadcast to all four lanes */ | |
| 1007 NOVFP vdup.32 q8, r2 | |
| 1008 1: ldr r12, [r1], #4 /* next vector pointer */ | |
| 1009 vld1.32 {q0},[r12,:128] | |
| 1010 vmul.f32 q0, q0, q8 | |
| 1011 vst1.32 {q0},[r0,:128]! | |
| 1012 subs len, len, #4 | |
| 1013 bgt 1b | |
| 1014 bx lr | |
| 1015 .unreq len | |
| 1016 endfunc | |
| 1017 | |
| 1018 function ff_butterflies_float_neon, export=1 /* In-place butterfly over two float arrays: with a read from [r0] and b read from [r1], a + b is written back to [r0] and a - b to [r1]. r2 = element count, four per pass. */ | |
| 1019 1: vld1.32 {q0},[r0,:128] /* loads do not advance the pointers; the stores below do */ | |
| 1020 vld1.32 {q1},[r1,:128] | |
| 1021 vsub.f32 q2, q0, q1 /* the difference goes to a fresh register so q1 can still be used for the sum */ | |
| 1022 vadd.f32 q1, q0, q1 | |
| 1023 vst1.32 {q2},[r1,:128]! | |
| 1024 vst1.32 {q1},[r0,:128]! | |
| 1025 subs r2, r2, #4 | |
| 1026 bgt 1b | |
| 1027 bx lr | |
| 1028 endfunc | |
| 1029 | |
| 1030 function ff_scalarproduct_float_neon, export=1 /* Dot product of the float arrays at [r0] and [r1]; r2 = element count, four per pass. The result is left in d0[0] and, on NOVFP lines, also copied to r0. NOTE(review): NOVFP is defined outside the visible source - presumably the soft-float return path. */ | |
| 1031 vmov.f32 q2, #0.0 /* q2 = four running partial sums */ | |
| 1032 1: vld1.32 {q0},[r0,:128]! | |
| 1033 vld1.32 {q1},[r1,:128]! | |
| 1034 vmla.f32 q2, q0, q1 | |
| 1035 subs r2, r2, #4 | |
| 1036 bgt 1b | |
| 1037 vadd.f32 d0, d4, d5 /* fold four partial sums into two ... */ | |
| 1038 vpadd.f32 d0, d0, d0 /* ... and two into one (present in both lanes of d0) */ | |
| 1039 NOVFP vmov.32 r0, d0[0] | |
| 1040 bx lr | |
| 1041 endfunc | |
| 1042 | |
| 1043 function ff_int32_to_float_fmul_scalar_neon, export=1 /* Converts signed 32-bit integers at [r1] to float, multiplies them by one scalar and writes the products to [r0]. Eight elements per pass. The scalar arrives in d0[0] or r2 and the count in r2 or r3 (VFP or NOVFP lines). NOTE(review): VFP/NOVFP are defined outside the visible source. */ | |
| 1044 VFP vdup.32 q0, d0[0] /* q0 = scalar broadcast to all four lanes */ | |
| 1045 VFP len .req r2 | |
| 1046 NOVFP vdup.32 q0, r2 | |
| 1047 NOVFP len .req r3 | |
| 1048 | |
| 1049 vld1.32 {q1},[r1,:128]! /* first 8 inputs are converted before the loop */ | |
| 1050 vcvt.f32.s32 q3, q1 | |
| 1051 vld1.32 {q2},[r1,:128]! | |
| 1052 vcvt.f32.s32 q8, q2 | |
| 1053 1: subs len, len, #8 /* flags feed the beq 2f below; pld and vmul leave them intact */ | |
| 1054 pld [r1, #16] | |
| 1055 vmul.f32 q9, q3, q0 | |
| 1056 vmul.f32 q10, q8, q0 | |
| 1057 beq 2f /* last pass: store without loading further input */ | |
| 1058 vld1.32 {q1},[r1,:128]! | |
| 1059 vcvt.f32.s32 q3, q1 | |
| 1060 vld1.32 {q2},[r1,:128]! | |
| 1061 vcvt.f32.s32 q8, q2 | |
| 1062 vst1.32 {q9}, [r0,:128]! | |
| 1063 vst1.32 {q10},[r0,:128]! | |
| 1064 b 1b | |
| 1065 2: vst1.32 {q9}, [r0,:128]! | |
| 1066 vst1.32 {q10},[r0,:128]! | |
| 1067 bx lr | |
| 1068 .unreq len | |
| 1069 endfunc | |
| 1070 | |
| 1071 function ff_vector_fmul_reverse_neon, export=1 /* Multiplies the float array at [r1], read forwards, by the float array at [r2], read backwards from its end, and writes the products to [r0]. r3 = element count, eight per pass. */ | |
| 1072 add r2, r2, r3, lsl #2 /* r2 = end of the second array ... */ | |
| 1073 sub r2, r2, #32 /* ... minus 32 bytes: start of its last block of 8 floats */ | |
| 1074 mov r12, #-32 /* backward step of one block */ | |
| 1075 vld1.32 {q0-q1}, [r1,:128]! | |
| 1076 vld1.32 {q2-q3}, [r2,:128], r12 | |
| 1077 1: pld [r1, #32] | |
| 1078 vrev64.32 q3, q3 /* swap the floats inside each 64-bit half ... */ | |
| 1079 vmul.f32 d16, d0, d7 /* ... and pair d registers in opposite order, which together reverse the block */ | |
| 1080 vmul.f32 d17, d1, d6 | |
| 1081 pld [r2, #-32] | |
| 1082 vrev64.32 q2, q2 | |
| 1083 vmul.f32 d18, d2, d5 | |
| 1084 vmul.f32 d19, d3, d4 | |
| 1085 subs r3, r3, #8 | |
| 1086 beq 2f /* last pass: store without loading further input */ | |
| 1087 vld1.32 {q0-q1}, [r1,:128]! | |
| 1088 vld1.32 {q2-q3}, [r2,:128], r12 | |
| 1089 vst1.32 {q8-q9}, [r0,:128]! | |
| 1090 b 1b | |
| 1091 2: vst1.32 {q8-q9}, [r0,:128]! | |
| 1092 bx lr | |
| 1093 endfunc | |
| 1094 | |
| 1095 function ff_vector_fmul_add_neon, export=1 /* Element-wise multiply-add over float arrays: output at [r0] = value at [r1] times value at [r2], plus value at [r3]. The element count is read from the first stack word; eight elements per pass. */ | |
| 1096 ldr r12, [sp] /* r12 = element count */ | |
| 1097 vld1.32 {q0-q1}, [r1,:128]! | |
| 1098 vld1.32 {q8-q9}, [r2,:128]! | |
| 1099 vld1.32 {q2-q3}, [r3,:128]! | |
| 1100 vmul.f32 q10, q0, q8 /* the first 8 products are computed before the loop */ | |
| 1101 vmul.f32 q11, q1, q9 | |
| 1102 1: vadd.f32 q12, q2, q10 | |
| 1103 vadd.f32 q13, q3, q11 | |
| 1104 pld [r1, #16] | |
| 1105 pld [r2, #16] | |
| 1106 pld [r3, #16] | |
| 1107 subs r12, r12, #8 | |
| 1108 beq 2f /* last pass: store without loading further input */ | |
| 1109 vld1.32 {q0}, [r1,:128]! | |
| 1110 vld1.32 {q8}, [r2,:128]! | |
| 1111 vmul.f32 q10, q0, q8 | |
| 1112 vld1.32 {q1}, [r1,:128]! | |
| 1113 vld1.32 {q9}, [r2,:128]! | |
| 1114 vmul.f32 q11, q1, q9 | |
| 1115 vld1.32 {q2-q3}, [r3,:128]! | |
| 1116 vst1.32 {q12-q13},[r0,:128]! | |
| 1117 b 1b | |
| 1118 2: vst1.32 {q12-q13},[r0,:128]! | |
| 1119 bx lr | |
| 1120 endfunc | |
| 1121 | |
| 1122 function ff_vector_clipf_neon, export=1 /* Clamps each float at [r1] between two bounds and writes the result to [r0]: vmin against the bound held in q1, then vmax against the bound held in q0. Eight elements per pass; the count ends up in r2. The bounds arrive in d0[0] and d0[1] (VFP lines) or r2 and r3 (NOVFP lines). NOTE(review): VFP/NOVFP are defined outside the visible source. */ | |
| 1123 VFP vdup.32 q1, d0[1] /* q1 must be filled first: the next line overwrites d0, which is part of q0 */ | |
| 1124 VFP vdup.32 q0, d0[0] | |
| 1125 NOVFP vdup.32 q0, r2 | |
| 1126 NOVFP vdup.32 q1, r3 | |
| 1127 NOVFP ldr r2, [sp] /* on NOVFP lines r2 held a bound, so the count is fetched from the stack afterwards */ | |
| 1128 vld1.f32 {q2},[r1,:128]! | |
| 1129 vmin.f32 q10, q2, q1 /* apply the upper bound */ | |
| 1130 vld1.f32 {q3},[r1,:128]! | |
| 1131 vmin.f32 q11, q3, q1 | |
| 1132 1: vmax.f32 q8, q10, q0 /* apply the lower bound */ | |
| 1133 vmax.f32 q9, q11, q0 | |
| 1134 subs r2, r2, #8 | |
| 1135 beq 2f /* last pass: store without loading further input */ | |
| 1136 vld1.f32 {q2},[r1,:128]! | |
| 1137 vmin.f32 q10, q2, q1 | |
| 1138 vld1.f32 {q3},[r1,:128]! | |
| 1139 vmin.f32 q11, q3, q1 | |
| 1140 vst1.f32 {q8},[r0,:128]! | |
| 1141 vst1.f32 {q9},[r0,:128]! | |
| 1142 b 1b | |
| 1143 2: vst1.f32 {q8},[r0,:128]! | |
| 1144 vst1.f32 {q9},[r0,:128]! | |
| 1145 bx lr | |
| 1146 endfunc | |
