Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSs__H264__App
diff libavcodec/arm/h264pred_neon.S @ 2:897f711a7157
rearrange to work with autoconf
| author | Nina Engelhardt <nengel@mailbox.tu-berlin.de> |
|---|---|
| date | Tue, 25 Sep 2012 15:55:33 +0200 |
| parents | |
| children | |
line diff
/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * Intra prediction kernels for 16x16 and 8x8 blocks, ARM NEON, GNU as
 * syntax, preprocessed by cpp.
 *
 * Register roles shared by every function in this file, as established
 * by the code below:
 *   r0  pointer to the first byte of the block that gets written
 *   r1  distance in bytes between two consecutive rows
 *   r2, r3  scratch (neighbour pointer, loop counter)
 *   q0-q3, q8  scratch NEON registers
 * The row above the block is read at r0 - r1 and the column to the left
 * at r0 - 1.  Every store, and most loads of the row above, carry an
 * alignment hint (:128, :64 or :32).
 *
 * NOTE(review): presumably called from C as (uint8_t *src, int stride);
 * confirm against the prototypes on the C side.  The function, endfunc
 * and movrel macros are not defined here; presumably they come from asm.S.
 */

#include "asm.S"

/*
 * ldcol.8 rd, rs, rt, n=8, hi=0
 * Gather bytes from a column into lanes of D register rd: one byte is
 * read from [rs] per lane and rs is advanced by rt after each read.
 *   n == 8         loads lanes 0-7 (8 reads)
 *   n != 8, hi=0   loads lanes 0-3 only
 *   n != 8, hi=1   loads lanes 4-7 only
 * Lanes that are not loaded keep their previous contents.
 */
        .macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
.if \n == 8 || \hi == 0
        vld1.8          {\rd[0]}, [\rs], \rt
        vld1.8          {\rd[1]}, [\rs], \rt
        vld1.8          {\rd[2]}, [\rs], \rt
        vld1.8          {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
        vld1.8          {\rd[4]}, [\rs], \rt
        vld1.8          {\rd[5]}, [\rs], \rt
        vld1.8          {\rd[6]}, [\rs], \rt
        vld1.8          {\rd[7]}, [\rs], \rt
.endif
        .endm

/*
 * add16x8 dq, dl, dh, rl, rh
 * Sum the 16 unsigned bytes held in rl and rh.  dl and dh must be the
 * low and high halves of dq for the reduction to cover all inputs; the
 * total ends up in every 16-bit lane of dl.
 */
        .macro add16x8  dq,  dl,  dh,  rl,  rh
        vaddl.u8        \dq, \rl, \rh           @ 8 widened pair sums
        vadd.u16        \dl, \dl, \dh           @ 8 -> 4
        vpadd.u16       \dl, \dl, \dl           @ 4 -> 2
        vpadd.u16       \dl, \dl, \dl           @ 2 -> 1
        .endm

/* Fill the 16x16 block with the constant 128. */
function ff_pred16x16_128_dc_neon, export=1
        vmov.i8         q0,  #128
        b               .L_pred16x16_dc_end
endfunc

/* Fill the 16x16 block with the rounded mean of the 16 bytes above it. */
function ff_pred16x16_top_dc_neon, export=1
        sub             r2,  r0,  r1            @ r2 = row above the block
        vld1.8          {q0},     [r2,:128]
        add16x8         q0,  d0,  d1,  d0,  d1
        vrshrn.u16      d0,  q0,  #4            @ (sum + 8) >> 4
        vdup.8          q0,  d0[0]
        b               .L_pred16x16_dc_end
endfunc

/* Fill the 16x16 block with the rounded mean of the 16 bytes left of it. */
function ff_pred16x16_left_dc_neon, export=1
        sub             r2,  r0,  #1            @ r2 = column left of the block
        ldcol.8         d0,  r2,  r1            @ rows 0-7
        ldcol.8         d1,  r2,  r1            @ rows 8-15
        add16x8         q0,  d0,  d1,  d0,  d1
        vrshrn.u16      d0,  q0,  #4            @ (sum + 8) >> 4
        vdup.8          q0,  d0[0]
        b               .L_pred16x16_dc_end
endfunc

/*
 * Fill the 16x16 block with the rounded mean of the 16 bytes above and
 * the 16 bytes to the left.  The tail at .L_pred16x16_dc_end is shared
 * by the other 16x16 DC variants, which branch to it with the fill value
 * replicated in q0.
 */
function ff_pred16x16_dc_neon, export=1
        sub             r2,  r0,  r1            @ r2 = row above the block
        vld1.8          {q0},     [r2,:128]
        sub             r2,  r0,  #1            @ r2 = column left of the block
        ldcol.8         d2,  r2,  r1            @ rows 0-7
        ldcol.8         d3,  r2,  r1            @ rows 8-15
        vaddl.u8        q0,  d0,  d1            @ top:  8 widened pair sums
        vaddl.u8        q1,  d2,  d3            @ left: 8 widened pair sums
        vadd.u16        q0,  q0,  q1
        vadd.u16        d0,  d0,  d1
        vpadd.u16       d0,  d0,  d0
        vpadd.u16       d0,  d0,  d0            @ d0[0] = sum of all 32 bytes
        vrshrn.u16      d0,  q0,  #5            @ (sum + 16) >> 5
        vdup.8          q0,  d0[0]
.L_pred16x16_dc_end:
        mov             r3,  #8                 @ 8 iterations x 2 rows
6:      vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             6b
        bx              lr
endfunc

/* Each of the 16 rows is filled with the byte immediately left of it. */
function ff_pred16x16_hor_neon, export=1
        sub             r2,  r0,  #1            @ r2 = column left of the block
        mov             r3,  #16                @ one row per iteration
1:      vld1.8          {d0[],d1[]},[r2], r1    @ replicate one byte to q0
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

/* Every one of the 16 rows becomes a copy of the row above the block. */
function ff_pred16x16_vert_neon, export=1
        sub             r0,  r0,  r1            @ step back to the row above
        vld1.8          {q0},     [r0,:128], r1 @ load it, r0 returns to row 0
        mov             r3,  #8                 @ 8 iterations x 2 rows
1:      vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

/*
 * 16x16 plane prediction.
 *
 * Differences between mirrored neighbour bytes (top row and left column)
 * are weighted by p16weight (1..8) and summed into a horizontal and a
 * vertical term, H and V.  Both are scaled as (5 * x + 32) >> 6 to give
 * the two per-pixel gradients.  Each output row is then built from 16-bit
 * accumulators that are narrowed with an unsigned saturating shift right
 * by 5.
 * NOTE(review): this appears to be the H.264 Intra_16x16 plane mode;
 * confirm against the C reference implementation.
 *
 * Why 5 * x is computed in 32 bits: each weighted sum can reach
 * 36 * 255 = 9180 in magnitude (weights 1..8 add up to 36, byte
 * differences reach 255), so 4 * 9180 = 36720 does not fit in a signed
 * 16-bit lane.  The x4 product is therefore formed with a widening shift
 * before the add.
 */
function ff_pred16x16_plane_neon, export=1
        sub             r3,  r0,  r1            @ r3 = row above the block
        add             r2,  r3,  #8            @ r2 = right half of that row
        sub             r3,  r3,  #1            @ r3 = one byte further left
        vld1.8          {d0},     [r3]          @ 8 bytes starting above-left
        vld1.8          {d2},     [r2,:64], r1  @ 8 bytes of the right half
        ldcol.8         d1,  r3,  r1            @ 8 bytes walking down from above-left
        add             r3,  r3,  r1            @ skip one row
        ldcol.8         d3,  r3,  r1            @ next 8 left-column bytes
        vrev64.8        q0,  q0                 @ mirror d0 and d1
        vaddl.u8        q8,  d2,  d3            @ widened sums, used for the base term
        vsubl.u8        q2,  d2,  d0            @ horizontal differences
        vsubl.u8        q3,  d3,  d1            @ vertical differences
        movrel          r3,  p16weight
        vld1.16         {q0},     [r3,:128]     @ 16-bit loads, matching the .short table
        vmul.s16        q2,  q2,  q0
        vmul.s16        q3,  q3,  q0
        vadd.i16        d4,  d4,  d5
        vadd.i16        d5,  d6,  d7
        vpadd.i16       d4,  d4,  d5
        vpadd.i16       d4,  d4,  d4            @ d4 = [H, V, H, V]
        vshll.s16       q3,  d4,  #2            @ 4 * [H, V, ..] in 32 bits (q3 is free here)
        vaddw.s16       q2,  q3,  d4            @ 5 * [H, V, ..] in 32 bits
        vrshrn.s32      d4,  q2,  #6            @ (5 * x + 32) >> 6
        mov             r3,  #0
        vtrn.16         d4,  d5                 @ d4[0] = H gradient, d5[0] = V gradient
        vadd.i16        d2,  d4,  d5
        vshl.i16        d3,  d2,  #3
        vrev64.16       d16, d17
        vsub.i16        d3,  d3,  d2            @ d3[0] = 7 * (both gradients)
        vadd.i16        d16, d16, d0
        vshl.i16        d2,  d16, #4
        vsub.i16        d2,  d2,  d3            @ d2[0] = start value of row 0
        vshl.i16        d3,  d4,  #4
        vext.16         q0,  q0,  q0,  #7       @ rotate the weights up by one lane
        vsub.i16        d6,  d5,  d3
        vmov.16         d0[0], r3               @ lane 0 = 0, so q0 = 0,1,..,7
        vmul.i16        q0,  q0,  d4[0]         @ per-column horizontal ramp
        vdup.16         q1,  d2[0]
        vdup.16         q2,  d4[0]
        vdup.16         q3,  d6[0]
        vshl.i16        q2,  q2,  #3            @ q2 = step from left to right half
        vadd.i16        q1,  q1,  q0            @ q1 = accumulators for row 0, left half
        vadd.i16        q3,  q3,  q2            @ q3 = step from right half to next row
        mov             r3,  #16                @ one row per iteration
1:
        vqshrun.s16     d0,  q1,  #5            @ left 8 pixels
        vadd.i16        q1,  q1,  q2
        vqshrun.s16     d1,  q1,  #5            @ right 8 pixels
        vadd.i16        q1,  q1,  q3
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

        .section .rodata
        .align 4
/* Weights 1..8 applied to the neighbour differences by the plane functions. */
p16weight:
        .short 1,2,3,4,5,6,7,8

        .text

/* Each of the 8 rows is filled with the byte immediately left of it. */
function ff_pred8x8_hor_neon, export=1
        sub             r2,  r0,  #1            @ r2 = column left of the block
        mov             r3,  #8                 @ one row per iteration
1:      vld1.8          {d0[]},   [r2], r1      @ replicate one byte to d0
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

/* Every one of the 8 rows becomes a copy of the row above the block. */
function ff_pred8x8_vert_neon, export=1
        sub             r0,  r0,  r1            @ step back to the row above
        vld1.8          {d0},     [r0,:64], r1  @ load it, r0 returns to row 0
        mov             r3,  #4                 @ 4 iterations x 2 rows
1:      vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

/*
 * 8x8 plane prediction.  Same structure as the 16x16 variant, using four
 * neighbour differences per direction (weights 1..4 from p16weight) and
 * the gradient scaling (17 * x + 16) >> 5.  The 17 * x product is formed
 * in 32-bit lanes.
 * NOTE(review): this appears to be the H.264 chroma plane mode; confirm
 * against the C reference implementation.
 */
function ff_pred8x8_plane_neon, export=1
        sub             r3,  r0,  r1            @ r3 = row above the block
        add             r2,  r3,  #4            @ r2 = right half of that row
        sub             r3,  r3,  #1            @ r3 = one byte further left
        vld1.32         {d0[0]},  [r3]          @ 4 bytes starting above-left
        vld1.32         {d2[0]},  [r2,:32], r1  @ 4 bytes of the right half
        ldcol.8         d0,  r3,  r1,  4,  hi=1 @ 4 bytes walking down, lanes 4-7
        add             r3,  r3,  r1            @ skip one row
        ldcol.8         d3,  r3,  r1,  4        @ next 4 left-column bytes, lanes 0-3
        vaddl.u8        q8,  d2,  d3            @ widened sums, used for the base term
        vrev32.8        d0,  d0                 @ mirror each group of 4 bytes
        vtrn.32         d2,  d3                 @ pack top and left bytes into d2
        vsubl.u8        q2,  d2,  d0            @ d4 = horizontal, d5 = vertical diffs
        movrel          r3,  p16weight
        vld1.16         {q0},     [r3,:128]
        vmul.s16        d4,  d4,  d0            @ weights 1..4
        vmul.s16        d5,  d5,  d0
        vpadd.i16       d4,  d4,  d5
        vpaddl.s16      d4,  d4                 @ d4 = [H, V] as 32-bit
        vshl.i32        d5,  d4,  #4            @ 16 * [H, V]
        vadd.s32        d4,  d4,  d5            @ 17 * [H, V]
        vrshrn.s32      d4,  q2,  #5            @ (17 * x + 16) >> 5
        mov             r3,  #0
        vtrn.16         d4,  d5                 @ d4[0] = H gradient, d5[0] = V gradient
        vadd.i16        d2,  d4,  d5
        vshl.i16        d3,  d2,  #2
        vrev64.16       d16, d16
        vsub.i16        d3,  d3,  d2            @ d3[0] = 3 * (both gradients)
        vadd.i16        d16, d16, d0
        vshl.i16        d2,  d16, #4
        vsub.i16        d2,  d2,  d3            @ d2[0] = start value of row 0
        vshl.i16        d3,  d4,  #3
        vext.16         q0,  q0,  q0,  #7       @ rotate the weights up by one lane
        vsub.i16        d6,  d5,  d3
        vmov.16         d0[0], r3               @ lane 0 = 0, so q0 = 0,1,..,7
        vmul.i16        q0,  q0,  d4[0]         @ per-column horizontal ramp
        vdup.16         q1,  d2[0]
        vdup.16         q2,  d4[0]
        vdup.16         q3,  d6[0]
        vshl.i16        q2,  q2,  #3
        vadd.i16        q1,  q1,  q0            @ q1 = accumulators for row 0
        vadd.i16        q3,  q3,  q2            @ q3 = step to the next row
        mov             r3,  #8                 @ one row per iteration
1:
        vqshrun.s16     d0,  q1,  #5
        vadd.i16        q1,  q1,  q3
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

/* Fill the 8x8 block with the constant 128. */
function ff_pred8x8_128_dc_neon, export=1
        vmov.i8         q0,  #128               @ d0 and d1 both 128
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8 DC from the row above only: the left 4 columns get the rounded
 * mean of the 4 bytes above them, the right 4 columns likewise.
 */
function ff_pred8x8_top_dc_neon, export=1
        sub             r2,  r0,  r1            @ r2 = row above the block
        vld1.8          {d0},     [r2,:64]
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0            @ [sum left 4, sum right 4, ..]
        vrshrn.u16      d0,  q0,  #2            @ (sum + 2) >> 2
        vdup.8          d1,  d0[1]
        vdup.8          d0,  d0[0]
        vtrn.32         d0,  d1                 @ both rows = [left x4, right x4]
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8 DC from the left column only: rows 0-3 get the rounded mean of
 * the 4 bytes left of them, rows 4-7 likewise.
 */
function ff_pred8x8_left_dc_neon, export=1
        sub             r2,  r0,  #1            @ r2 = column left of the block
        ldcol.8         d0,  r2,  r1
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0            @ [sum rows 0-3, sum rows 4-7, ..]
        vrshrn.u16      d0,  q0,  #2            @ (sum + 2) >> 2
        vdup.8          d1,  d0[1]              @ rows 4-7
        vdup.8          d0,  d0[0]              @ rows 0-3
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8 DC using both the row above and the column to the left; each 4x4
 * quadrant gets its own value.  The tail at .L_pred8x8_dc_end is shared
 * by the other 8x8 DC variants: it stores d0 to rows 0-3 and d1 to
 * rows 4-7.
 */
function ff_pred8x8_dc_neon, export=1
        sub             r2,  r0,  r1            @ r2 = row above the block
        vld1.8          {d0},     [r2,:64]
        sub             r2,  r0,  #1            @ r2 = column left of the block
        ldcol.8         d1,  r2,  r1
        vtrn.32         d0,  d1                 @ group the 4-byte halves
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1            @ four 4-byte sums
        vpadd.u16       d1,  d0,  d0            @ sums of pairs of those
        vrshrn.u16      d2,  q0,  #3            @ 8-byte means
        vrshrn.u16      d3,  q0,  #2            @ 4-byte means
        vdup.8          d0,  d2[4]
        vdup.8          d1,  d3[3]
        vdup.8          d4,  d3[2]
        vdup.8          d5,  d2[5]
        vtrn.32         q0,  q2                 @ d0 = rows 0-3, d1 = rows 4-7
.L_pred8x8_dc_end:
        mov             r3,  #4                 @ 4 iterations x 2 rows
        add             r2,  r0,  r1,  lsl #2   @ r2 = row 4
6:      vst1.8          {d0},     [r0,:64], r1  @ rows 0-3
        vst1.8          {d1},     [r2,:64], r1  @ rows 4-7
        subs            r3,  r3,  #1
        bne             6b
        bx              lr
endfunc

/*
 * 8x8 DC variant that reads the row above and only the upper 4 bytes of
 * the left column.
 */
function ff_pred8x8_l0t_dc_neon, export=1
        sub             r2,  r0,  r1            @ r2 = row above the block
        vld1.8          {d0},     [r2,:64]
        sub             r2,  r0,  #1            @ r2 = column left of the block
        ldcol.8         d1,  r2,  r1,  4        @ rows 0-3 only
        vtrn.32         d0,  d1
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1
        vpadd.u16       d1,  d0,  d0
        vrshrn.u16      d2,  q0,  #3
        vrshrn.u16      d3,  q0,  #2
        vdup.8          d0,  d2[4]
        vdup.8          d1,  d3[0]
        vdup.8          q2,  d3[2]
        vtrn.32         q0,  q2                 @ d0 = rows 0-3, d1 = rows 4-7
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8 DC variant that reads only the upper 4 bytes of the left column:
 * rows 0-3 get their rounded mean, rows 4-7 get 128.
 */
function ff_pred8x8_l00_dc_neon, export=1
        sub             r2,  r0,  #1            @ r2 = column left of the block
        ldcol.8         d0,  r2,  r1,  4        @ rows 0-3 only
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0
        vrshrn.u16      d0,  q0,  #2            @ (sum + 2) >> 2 in lane 0
        vmov.i8         d1,  #128               @ rows 4-7
        vdup.8          d0,  d0[0]              @ rows 0-3
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8 DC variant that reads the row above and only the lower 4 bytes of
 * the left column.
 */
function ff_pred8x8_0lt_dc_neon, export=1
        sub             r2,  r0,  r1            @ r2 = row above the block
        vld1.8          {d0},     [r2,:64]
        add             r2,  r0,  r1,  lsl #2   @ r2 = row 4 ...
        sub             r2,  r2,  #1            @ ... one byte to the left
        ldcol.8         d1,  r2,  r1,  4,  hi=1 @ rows 4-7 into lanes 4-7
        vtrn.32         d0,  d1
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1
        vpadd.u16       d1,  d0,  d0
        vrshrn.u16      d3,  q0,  #2
        vrshrn.u16      d2,  q0,  #3
        vdup.8          d0,  d3[0]
        vdup.8          d1,  d3[3]
        vdup.8          d4,  d3[2]
        vdup.8          d5,  d2[5]
        vtrn.32         q0,  q2                 @ d0 = rows 0-3, d1 = rows 4-7
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8 DC variant that reads only the lower 4 bytes of the left column:
 * rows 4-7 get their rounded mean, rows 0-3 get 128.
 */
function ff_pred8x8_0l0_dc_neon, export=1
        add             r2,  r0,  r1,  lsl #2   @ r2 = row 4 ...
        sub             r2,  r2,  #1            @ ... one byte to the left
        ldcol.8         d1,  r2,  r1,  4        @ rows 4-7 into lanes 0-3
        vpaddl.u8       d2,  d1
        vpadd.u16       d2,  d2,  d2
        vrshrn.u16      d1,  q1,  #2            @ (sum + 2) >> 2 in lane 0
        vmov.i8         d0,  #128               @ rows 0-3
        vdup.8          d1,  d1[0]              @ rows 4-7
        b               .L_pred8x8_dc_end
endfunc
