diff libavcodec/dsputil.c @ 2:897f711a7157

rearrange to work with autoconf
author Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date Tue, 25 Sep 2012 15:55:33 +0200
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/libavcodec/dsputil.c	Tue Sep 25 15:55:33 2012 +0200
     1.3 @@ -0,0 +1,1057 @@
     1.4 +/*
     1.5 + * DSP utils
     1.6 + * Copyright (c) 2000, 2001 Fabrice Bellard
     1.7 + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
     1.8 + *
     1.9 + * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
    1.10 + *
    1.11 + * This file is part of FFmpeg.
    1.12 + *
    1.13 + * FFmpeg is free software; you can redistribute it and/or
    1.14 + * modify it under the terms of the GNU Lesser General Public
    1.15 + * License as published by the Free Software Foundation; either
    1.16 + * version 2.1 of the License, or (at your option) any later version.
    1.17 + *
    1.18 + * FFmpeg is distributed in the hope that it will be useful,
    1.19 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    1.20 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    1.21 + * Lesser General Public License for more details.
    1.22 + *
    1.23 + * You should have received a copy of the GNU Lesser General Public
    1.24 + * License along with FFmpeg; if not, write to the Free Software
    1.25 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
    1.26 + */
    1.27 +
    1.28 +/**
    1.29 + * @file
    1.30 + * DSP utils
    1.31 + */
    1.32 +
    1.33 +#include "libavutil/log.h"
    1.34 +#include "dsputil.h"
    1.35 +#include "simple_idct.h"
    1.36 +#include "mathops.h"
    1.37 +#include "config.h"
    1.38 +
    1.39 +uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, }; /* Zero-initialized here; the lowpass filters in this file address it as ff_cropTbl + MAX_NEG_CROP. Presumably a clamp-to-0..255 table filled during init -- TODO confirm. */
    1.40 +uint32_t ff_squareTbl[512] = {0, }; /* Zero-initialized here; presumably a table of squares filled during init -- TODO confirm. */
    1.41 +
    1.42 +const uint8_t ff_zigzag_direct[64] = { /* Zigzag scan order of an 8x8 block: entry i is the raster index (row*8 + col) of the i-th scanned position. */
    1.43 +    0,   1,  8, 16,  9,  2,  3, 10,
    1.44 +    17, 24, 32, 25, 18, 11,  4,  5,
    1.45 +    12, 19, 26, 33, 40, 48, 41, 34,
    1.46 +    27, 20, 13,  6,  7, 14, 21, 28,
    1.47 +    35, 42, 49, 56, 57, 50, 43, 36,
    1.48 +    29, 22, 15, 23, 30, 37, 44, 51,
    1.49 +    58, 59, 52, 45, 38, 31, 39, 46,
    1.50 +    53, 60, 61, 54, 47, 55, 62, 63
    1.51 +};
    1.52 +
    1.53 +
    1.54 +#define PIXOP2(OPNAME, OP) /* Expands to the OPNAME##_pixels* block copy/interpolation helpers; OP(dst, val) is the store operation applied to each destination word. */ \
    1.55 +static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){ /* 2 pixels per row: one 16-bit word; both pointers advance by line_size for h rows. */\
    1.56 +    int i;\
    1.57 +    for(i=0; i<h; i++){\
    1.58 +        OP(*((uint16_t*)(block  )), AV_RN16(pixels  )); /* AV_RN16/AV_RN32 are defined outside this file; presumably unaligned native-endian loads -- confirm. */\
    1.59 +        pixels+=line_size;\
    1.60 +        block +=line_size;\
    1.61 +    }\
    1.62 +}\
    1.63 +static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){ /* 4 pixels per row: one 32-bit word. */\
    1.64 +    int i;\
    1.65 +    for(i=0; i<h; i++){\
    1.66 +        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
    1.67 +        pixels+=line_size;\
    1.68 +        block +=line_size;\
    1.69 +    }\
    1.70 +}\
    1.71 +static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){ /* 8 pixels per row: two 32-bit words at byte offsets 0 and 4. */\
    1.72 +    int i;\
    1.73 +    for(i=0; i<h; i++){\
    1.74 +        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
    1.75 +        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
    1.76 +        pixels+=line_size;\
    1.77 +        block +=line_size;\
    1.78 +    }\
    1.79 +}\
    1.80 +static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){ /* Simply forwards to _pixels8_c with the same arguments. */\
    1.81 +    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
    1.82 +}\
    1.83 +\
    1.84 +static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
    1.85 +                                                int src_stride1, int src_stride2, int h){ /* Per row: OP(dst, no_rnd_avg32(src1, src2)) on two 32-bit words (8 bytes); each buffer is stepped by its own stride. */\
    1.86 +    int i;\
    1.87 +    for(i=0; i<h; i++){\
    1.88 +        uint32_t a,b;\
    1.89 +        a= AV_RN32(&src1[i*src_stride1  ]);\
    1.90 +        b= AV_RN32(&src2[i*src_stride2  ]);\
    1.91 +        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
    1.92 +        a= AV_RN32(&src1[i*src_stride1+4]);\
    1.93 +        b= AV_RN32(&src2[i*src_stride2+4]);\
    1.94 +        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    1.95 +    }\
    1.96 +}\
    1.97 +\
    1.98 +static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
    1.99 +                                                int src_stride1, int src_stride2, int h){ /* Same as _no_rnd_pixels8_l2 but combines the two sources with rnd_avg32. */\
   1.100 +    int i;\
   1.101 +    for(i=0; i<h; i++){\
   1.102 +        uint32_t a,b;\
   1.103 +        a= AV_RN32(&src1[i*src_stride1  ]);\
   1.104 +        b= AV_RN32(&src2[i*src_stride2  ]);\
   1.105 +        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
   1.106 +        a= AV_RN32(&src1[i*src_stride1+4]);\
   1.107 +        b= AV_RN32(&src2[i*src_stride2+4]);\
   1.108 +        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
   1.109 +    }\
   1.110 +}\
   1.111 +\
   1.112 +static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
   1.113 +                                                int src_stride1, int src_stride2, int h){ /* 4-byte-wide variant: one 32-bit word per row, combined with rnd_avg32. */\
   1.114 +    int i;\
   1.115 +    for(i=0; i<h; i++){\
   1.116 +        uint32_t a,b;\
   1.117 +        a= AV_RN32(&src1[i*src_stride1  ]);\
   1.118 +        b= AV_RN32(&src2[i*src_stride2  ]);\
   1.119 +        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
   1.120 +    }\
   1.121 +}\
   1.122 +\
   1.123 +static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
   1.124 +                                                int src_stride1, int src_stride2, int h){ /* 2-byte-wide variant: 16-bit loads held in uint32_t, combined with rnd_avg32, stored through a 16-bit dst word. */\
   1.125 +    int i;\
   1.126 +    for(i=0; i<h; i++){\
   1.127 +        uint32_t a,b;\
   1.128 +        a= AV_RN16(&src1[i*src_stride1  ]);\
   1.129 +        b= AV_RN16(&src2[i*src_stride2  ]);\
   1.130 +        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
   1.131 +    }\
   1.132 +}\
   1.133 +\
   1.134 +static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
   1.135 +                                                int src_stride1, int src_stride2, int h){ /* 16 bytes wide: two _pixels8_l2 calls at byte offsets 0 and 8. */\
   1.136 +    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
   1.137 +    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
   1.138 +}\
   1.139 +\
   1.140 +static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
   1.141 +                                                int src_stride1, int src_stride2, int h){ /* 16 bytes wide: two _no_rnd_pixels8_l2 calls at byte offsets 0 and 8. */\
   1.142 +    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
   1.143 +    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
   1.144 +}\
   1.145 +\
   1.146 +static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){ /* Horizontal case: combines each pixel with its right neighbour (pixels+1) via _no_rnd_pixels8_l2. */\
   1.147 +    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
   1.148 +}\
   1.149 +\
   1.150 +static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){ /* Horizontal case: combines each pixel with its right neighbour (pixels+1) via _pixels8_l2. */\
   1.151 +    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
   1.152 +}\
   1.153 +\
   1.154 +static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){ /* Vertical case: combines each pixel with the one a row below (pixels+line_size) via _no_rnd_pixels8_l2. */\
   1.155 +    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
   1.156 +}\
   1.157 +\
   1.158 +static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){ /* Vertical case: combines each pixel with the one a row below (pixels+line_size) via _pixels8_l2. */\
   1.159 +    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
   1.160 +}\
   1.161 +\
   1.162 +static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
   1.163 +                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){ /* Per row: per-byte (a+b+c+d+2)>>2 of four sources, computed on packed 32-bit words, then OP into dst (8 bytes as two words). */\
   1.164 +    int i;\
   1.165 +    for(i=0; i<h; i++){\
   1.166 +        uint32_t a, b, c, d, l0, l1, h0, h1;\
   1.167 +        a= AV_RN32(&src1[i*src_stride1]);\
   1.168 +        b= AV_RN32(&src2[i*src_stride2]);\
   1.169 +        c= AV_RN32(&src3[i*src_stride3]);\
   1.170 +        d= AV_RN32(&src4[i*src_stride4]);\
   1.171 +        l0=  (a&0x03030303UL) /* low 2 bits of every byte, summed separately so they cannot carry into the next byte */\
   1.172 +           + (b&0x03030303UL)\
   1.173 +           + 0x02020202UL; /* rounding bias of 2 per byte */\
   1.174 +        h0= ((a&0xFCFCFCFCUL)>>2) /* high 6 bits of every byte, already divided by 4 */\
   1.175 +          + ((b&0xFCFCFCFCUL)>>2);\
   1.176 +        l1=  (c&0x03030303UL)\
   1.177 +           + (d&0x03030303UL);\
   1.178 +        h1= ((c&0xFCFCFCFCUL)>>2)\
   1.179 +          + ((d&0xFCFCFCFCUL)>>2);\
   1.180 +        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL)); /* low-bit sums are divided by 4; the mask drops bits shifted in from the neighbouring byte */\
   1.181 +        a= AV_RN32(&src1[i*src_stride1+4]); /* second 32-bit word of the row: same computation at byte offset 4 */\
   1.182 +        b= AV_RN32(&src2[i*src_stride2+4]);\
   1.183 +        c= AV_RN32(&src3[i*src_stride3+4]);\
   1.184 +        d= AV_RN32(&src4[i*src_stride4+4]);\
   1.185 +        l0=  (a&0x03030303UL)\
   1.186 +           + (b&0x03030303UL)\
   1.187 +           + 0x02020202UL;\
   1.188 +        h0= ((a&0xFCFCFCFCUL)>>2)\
   1.189 +          + ((b&0xFCFCFCFCUL)>>2);\
   1.190 +        l1=  (c&0x03030303UL)\
   1.191 +           + (d&0x03030303UL);\
   1.192 +        h1= ((c&0xFCFCFCFCUL)>>2)\
   1.193 +          + ((d&0xFCFCFCFCUL)>>2);\
   1.194 +        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
   1.195 +    }\
   1.196 +}\
   1.197 +\
   1.198 +static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){ /* 4-wide horizontal case: pixels combined with pixels+1 via _pixels4_l2. */\
   1.199 +    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
   1.200 +}\
   1.201 +\
   1.202 +static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){ /* 4-wide vertical case: pixels combined with pixels+line_size via _pixels4_l2. */\
   1.203 +    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
   1.204 +}\
   1.205 +\
   1.206 +static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){ /* 2-wide horizontal case: pixels combined with pixels+1 via _pixels2_l2. */\
   1.207 +    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
   1.208 +}\
   1.209 +\
   1.210 +static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){ /* 2-wide vertical case: pixels combined with pixels+line_size via _pixels2_l2. */\
   1.211 +    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
   1.212 +}\
   1.213 +\
   1.214 +static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
   1.215 +                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){ /* Same packed four-source average as _pixels8_l4 but with a bias of 1: per-byte (a+b+c+d+1)>>2. */\
   1.216 +    int i;\
   1.217 +    for(i=0; i<h; i++){\
   1.218 +        uint32_t a, b, c, d, l0, l1, h0, h1;\
   1.219 +        a= AV_RN32(&src1[i*src_stride1]);\
   1.220 +        b= AV_RN32(&src2[i*src_stride2]);\
   1.221 +        c= AV_RN32(&src3[i*src_stride3]);\
   1.222 +        d= AV_RN32(&src4[i*src_stride4]);\
   1.223 +        l0=  (a&0x03030303UL)\
   1.224 +           + (b&0x03030303UL)\
   1.225 +           + 0x01010101UL; /* bias of 1 per byte instead of 2 */\
   1.226 +        h0= ((a&0xFCFCFCFCUL)>>2)\
   1.227 +          + ((b&0xFCFCFCFCUL)>>2);\
   1.228 +        l1=  (c&0x03030303UL)\
   1.229 +           + (d&0x03030303UL);\
   1.230 +        h1= ((c&0xFCFCFCFCUL)>>2)\
   1.231 +          + ((d&0xFCFCFCFCUL)>>2);\
   1.232 +        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
   1.233 +        a= AV_RN32(&src1[i*src_stride1+4]); /* second 32-bit word of the row: same computation at byte offset 4 */\
   1.234 +        b= AV_RN32(&src2[i*src_stride2+4]);\
   1.235 +        c= AV_RN32(&src3[i*src_stride3+4]);\
   1.236 +        d= AV_RN32(&src4[i*src_stride4+4]);\
   1.237 +        l0=  (a&0x03030303UL)\
   1.238 +           + (b&0x03030303UL)\
   1.239 +           + 0x01010101UL;\
   1.240 +        h0= ((a&0xFCFCFCFCUL)>>2)\
   1.241 +          + ((b&0xFCFCFCFCUL)>>2);\
   1.242 +        l1=  (c&0x03030303UL)\
   1.243 +           + (d&0x03030303UL);\
   1.244 +        h1= ((c&0xFCFCFCFCUL)>>2)\
   1.245 +          + ((d&0xFCFCFCFCUL)>>2);\
   1.246 +        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
   1.247 +    }\
   1.248 +}\
   1.249 +static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
   1.250 +                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){ /* 16 bytes wide: two _pixels8_l4 calls at byte offsets 0 and 8. */\
   1.251 +    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
   1.252 +    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
   1.253 +}\
   1.254 +static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
   1.255 +                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){ /* 16 bytes wide: two _no_rnd_pixels8_l4 calls at byte offsets 0 and 8. */\
   1.256 +    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
   1.257 +    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
   1.258 +}\
   1.259 +\
   1.260 +static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
   1.261 +{ /* 2-wide diagonal case, scalar: each output is (sum of the 2x2 source neighbourhood + 2)>>2. Reads 3 pixels per row and h+1 rows. */\
   1.262 +        int i, a0, b0, a1, b1;\
   1.263 +        a0= pixels[0];\
   1.264 +        b0= pixels[1] + 2; /* the +2 rounding bias is folded into the a0/b0 row sums only */\
   1.265 +        a0 += b0;\
   1.266 +        b0 += pixels[2];\
   1.267 +\
   1.268 +        pixels+=line_size;\
   1.269 +        for(i=0; i<h; i+=2){ /* two output rows per iteration; a0/b0 and a1/b1 alternate as upper and lower row sums */\
   1.270 +            a1= pixels[0];\
   1.271 +            b1= pixels[1];\
   1.272 +            a1 += b1;\
   1.273 +            b1 += pixels[2];\
   1.274 +\
   1.275 +            block[0]= (a1+a0)>>2; /* FIXME non put: the result is stored directly and OP is not applied, so every OPNAME variant overwrites block */\
   1.276 +            block[1]= (b1+b0)>>2;\
   1.277 +\
   1.278 +            pixels+=line_size;\
   1.279 +            block +=line_size;\
   1.280 +\
   1.281 +            a0= pixels[0];\
   1.282 +            b0= pixels[1] + 2;\
   1.283 +            a0 += b0;\
   1.284 +            b0 += pixels[2];\
   1.285 +\
   1.286 +            block[0]= (a1+a0)>>2;\
   1.287 +            block[1]= (b1+b0)>>2;\
   1.288 +            pixels+=line_size;\
   1.289 +            block +=line_size;\
   1.290 +        }\
   1.291 +}\
   1.292 +\
   1.293 +static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
   1.294 +{ /* 4-wide diagonal case: per-byte (2x2 neighbourhood sum + 2)>>2 using the packed low-2-bit / high-6-bit split, stored through OP. */\
   1.295 +        int i;\
   1.296 +        const uint32_t a= AV_RN32(pixels  );\
   1.297 +        const uint32_t b= AV_RN32(pixels+1);\
   1.298 +        uint32_t l0=  (a&0x03030303UL)\
   1.299 +                    + (b&0x03030303UL)\
   1.300 +                    + 0x02020202UL; /* rounding bias lives in the l0 row sum only */\
   1.301 +        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
   1.302 +                   + ((b&0xFCFCFCFCUL)>>2);\
   1.303 +        uint32_t l1,h1;\
   1.304 +\
   1.305 +        pixels+=line_size; /* row 0 is already summed into l0/h0 */\
   1.306 +        for(i=0; i<h; i+=2){ /* two output rows per iteration; l0/h0 and l1/h1 alternate as previous and current row sums, so each source row is loaded once */\
   1.307 +            uint32_t a= AV_RN32(pixels  );\
   1.308 +            uint32_t b= AV_RN32(pixels+1);\
   1.309 +            l1=  (a&0x03030303UL)\
   1.310 +               + (b&0x03030303UL);\
   1.311 +            h1= ((a&0xFCFCFCFCUL)>>2)\
   1.312 +              + ((b&0xFCFCFCFCUL)>>2);\
   1.313 +            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
   1.314 +            pixels+=line_size;\
   1.315 +            block +=line_size;\
   1.316 +            a= AV_RN32(pixels  );\
   1.317 +            b= AV_RN32(pixels+1);\
   1.318 +            l0=  (a&0x03030303UL)\
   1.319 +               + (b&0x03030303UL)\
   1.320 +               + 0x02020202UL;\
   1.321 +            h0= ((a&0xFCFCFCFCUL)>>2)\
   1.322 +              + ((b&0xFCFCFCFCUL)>>2);\
   1.323 +            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
   1.324 +            pixels+=line_size;\
   1.325 +            block +=line_size;\
   1.326 +        }\
   1.327 +}\
   1.328 +\
   1.329 +static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
   1.330 +{ /* 8-wide diagonal case: the 4-wide packed computation run over two adjacent 4-byte columns. */\
   1.331 +    int j;\
   1.332 +    for(j=0; j<2; j++){ /* one pass per 4-byte-wide column */\
   1.333 +        int i;\
   1.334 +        const uint32_t a= AV_RN32(pixels  );\
   1.335 +        const uint32_t b= AV_RN32(pixels+1);\
   1.336 +        uint32_t l0=  (a&0x03030303UL)\
   1.337 +                    + (b&0x03030303UL)\
   1.338 +                    + 0x02020202UL;\
   1.339 +        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
   1.340 +                   + ((b&0xFCFCFCFCUL)>>2);\
   1.341 +        uint32_t l1,h1;\
   1.342 +\
   1.343 +        pixels+=line_size;\
   1.344 +        for(i=0; i<h; i+=2){\
   1.345 +            uint32_t a= AV_RN32(pixels  );\
   1.346 +            uint32_t b= AV_RN32(pixels+1);\
   1.347 +            l1=  (a&0x03030303UL)\
   1.348 +               + (b&0x03030303UL);\
   1.349 +            h1= ((a&0xFCFCFCFCUL)>>2)\
   1.350 +              + ((b&0xFCFCFCFCUL)>>2);\
   1.351 +            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
   1.352 +            pixels+=line_size;\
   1.353 +            block +=line_size;\
   1.354 +            a= AV_RN32(pixels  );\
   1.355 +            b= AV_RN32(pixels+1);\
   1.356 +            l0=  (a&0x03030303UL)\
   1.357 +               + (b&0x03030303UL)\
   1.358 +               + 0x02020202UL;\
   1.359 +            h0= ((a&0xFCFCFCFCUL)>>2)\
   1.360 +              + ((b&0xFCFCFCFCUL)>>2);\
   1.361 +            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
   1.362 +            pixels+=line_size;\
   1.363 +            block +=line_size;\
   1.364 +        }\
   1.365 +        pixels+=4-line_size*(h+1); /* rewind the h+1 source rows consumed and step 4 bytes right. NOTE(review): matches the loop only when h is even */\
   1.366 +        block +=4-line_size*h; /* rewind the h destination rows written and step 4 bytes right */\
   1.367 +    }\
   1.368 +}\
   1.369 +\
   1.370 +static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
   1.371 +{ /* Same as _pixels8_xy2_c but with a bias of 1 per byte: (2x2 neighbourhood sum + 1)>>2. */\
   1.372 +    int j;\
   1.373 +    for(j=0; j<2; j++){ /* one pass per 4-byte-wide column */\
   1.374 +        int i;\
   1.375 +        const uint32_t a= AV_RN32(pixels  );\
   1.376 +        const uint32_t b= AV_RN32(pixels+1);\
   1.377 +        uint32_t l0=  (a&0x03030303UL)\
   1.378 +                    + (b&0x03030303UL)\
   1.379 +                    + 0x01010101UL;\
   1.380 +        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
   1.381 +                   + ((b&0xFCFCFCFCUL)>>2);\
   1.382 +        uint32_t l1,h1;\
   1.383 +\
   1.384 +        pixels+=line_size;\
   1.385 +        for(i=0; i<h; i+=2){\
   1.386 +            uint32_t a= AV_RN32(pixels  );\
   1.387 +            uint32_t b= AV_RN32(pixels+1);\
   1.388 +            l1=  (a&0x03030303UL)\
   1.389 +               + (b&0x03030303UL);\
   1.390 +            h1= ((a&0xFCFCFCFCUL)>>2)\
   1.391 +              + ((b&0xFCFCFCFCUL)>>2);\
   1.392 +            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
   1.393 +            pixels+=line_size;\
   1.394 +            block +=line_size;\
   1.395 +            a= AV_RN32(pixels  );\
   1.396 +            b= AV_RN32(pixels+1);\
   1.397 +            l0=  (a&0x03030303UL)\
   1.398 +               + (b&0x03030303UL)\
   1.399 +               + 0x01010101UL;\
   1.400 +            h0= ((a&0xFCFCFCFCUL)>>2)\
   1.401 +              + ((b&0xFCFCFCFCUL)>>2);\
   1.402 +            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
   1.403 +            pixels+=line_size;\
   1.404 +            block +=line_size;\
   1.405 +        }\
   1.406 +        pixels+=4-line_size*(h+1); /* rewind the h+1 source rows consumed and step 4 bytes right. NOTE(review): matches the loop only when h is even */\
   1.407 +        block +=4-line_size*h; /* rewind the h destination rows written and step 4 bytes right */\
   1.408 +    }\
   1.409 +}\
   1.410 +\
   1.411 +CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8) /* CALL_2X_PIXELS is defined outside this file; presumably defines _pixels16_c as two _pixels8_c calls 8 bytes apart -- confirm. */\
   1.412 +
   1.413 +#define op_avg(a, b) a = rnd_avg32(a, b) /* avg store: replaces the destination word with rnd_avg32 of its current value and the new value */
   1.414 +
   1.415 +#define op_put(a, b) a = b /* put store: plain overwrite of the destination word */
   1.416 +
   1.417 +PIXOP2(avg, op_avg) /* instantiates the avg_*pixels* helper family */
   1.418 +PIXOP2(put, op_put) /* instantiates the put_*pixels* helper family */
   1.419 +#undef op_avg /* both names are released here and redefined with different bodies for the H.264 chroma functions */
   1.420 +#undef op_put
   1.421 +
   1.422 +
   1.423 +#define H264_CHROMA_MC(OPNAME, OP) /* Expands to OPNAME##h264_chroma_mc2_c / mc4_c / mc8_c: bilinear interpolation of a 2-, 4- or 8-pixel-wide block at fractional offset (x, y) in eighths; OP(dst, sum) stores the 64-weighted sum. */\
   1.424 +static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ /* 2 pixels per row, h rows; dst and src share the same stride. */\
   1.425 +    const int A=(8-x)*(8-y); /* weight of the pixel itself; A+B+C+D == 64 */\
   1.426 +    const int B=(  x)*(8-y); /* weight of the right neighbour */\
   1.427 +    const int C=(8-x)*(  y); /* weight of the pixel one row below */\
   1.428 +    const int D=(  x)*(  y); /* weight of the below-right neighbour */\
   1.429 +    int i;\
   1.430 +    \
   1.431 +    assert(x<8 && y<8 && x>=0 && y>=0); /* NOTE(review): no <assert.h> appears in this file's visible include list; presumably provided by a project header -- confirm */\
   1.432 +\
   1.433 +    if(D){ /* x and y both non-zero: full four-tap case */\
   1.434 +        for(i=0; i<h; i++){\
   1.435 +            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
   1.436 +            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
   1.437 +            dst+= stride;\
   1.438 +            src+= stride;\
   1.439 +        }\
   1.440 +    }else{ /* x==0 or y==0: at most one of B and C is non-zero, so two taps are enough */\
   1.441 +        const int E= B+C; /* the single remaining neighbour weight (0 when x==0 and y==0) */\
   1.442 +        const int step= C ? stride : 1; /* C!=0: neighbour is the row below; otherwise the pixel to the right (still read, with weight E, even when E==0) */\
   1.443 +        for(i=0; i<h; i++){\
   1.444 +            OP(dst[0], (A*src[0] + E*src[step+0]));\
   1.445 +            OP(dst[1], (A*src[1] + E*src[step+1]));\
   1.446 +            dst+= stride;\
   1.447 +            src+= stride;\
   1.448 +        }\
   1.449 +    }\
   1.450 +}\
   1.451 +\
   1.452 +static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ /* 4-pixel-wide version of the same interpolation. */\
   1.453 +    const int A=(8-x)*(8-y); /* same four bilinear weights as in mc2; they sum to 64 */\
   1.454 +    const int B=(  x)*(8-y);\
   1.455 +    const int C=(8-x)*(  y);\
   1.456 +    const int D=(  x)*(  y);\
   1.457 +    int i;\
   1.458 +    \
   1.459 +    assert(x<8 && y<8 && x>=0 && y>=0);\
   1.460 +\
   1.461 +    if(D){ /* four-tap case */\
   1.462 +        for(i=0; i<h; i++){\
   1.463 +            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
   1.464 +            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
   1.465 +            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
   1.466 +            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
   1.467 +            dst+= stride;\
   1.468 +            src+= stride;\
   1.469 +        }\
   1.470 +    }else{ /* two-tap case: purely horizontal, purely vertical, or no interpolation */\
   1.471 +        const int E= B+C;\
   1.472 +        const int step= C ? stride : 1;\
   1.473 +        for(i=0; i<h; i++){\
   1.474 +            OP(dst[0], (A*src[0] + E*src[step+0]));\
   1.475 +            OP(dst[1], (A*src[1] + E*src[step+1]));\
   1.476 +            OP(dst[2], (A*src[2] + E*src[step+2]));\
   1.477 +            OP(dst[3], (A*src[3] + E*src[step+3]));\
   1.478 +            dst+= stride;\
   1.479 +            src+= stride;\
   1.480 +        }\
   1.481 +    }\
   1.482 +}\
   1.483 +\
   1.484 +static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ /* 8-pixel-wide version of the same interpolation. */\
   1.485 +    const int A=(8-x)*(8-y); /* same four bilinear weights as in mc2; they sum to 64 */\
   1.486 +    const int B=(  x)*(8-y);\
   1.487 +    const int C=(8-x)*(  y);\
   1.488 +    const int D=(  x)*(  y);\
   1.489 +    int i;\
   1.490 +    \
   1.491 +    assert(x<8 && y<8 && x>=0 && y>=0);\
   1.492 +\
   1.493 +    if(D){ /* four-tap case */\
   1.494 +        for(i=0; i<h; i++){\
   1.495 +            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
   1.496 +            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
   1.497 +            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
   1.498 +            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
   1.499 +            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
   1.500 +            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
   1.501 +            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
   1.502 +            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
   1.503 +            dst+= stride;\
   1.504 +            src+= stride;\
   1.505 +        }\
   1.506 +    }else{ /* two-tap case: purely horizontal, purely vertical, or no interpolation */\
   1.507 +        const int E= B+C;\
   1.508 +        const int step= C ? stride : 1;\
   1.509 +        for(i=0; i<h; i++){\
   1.510 +            OP(dst[0], (A*src[0] + E*src[step+0]));\
   1.511 +            OP(dst[1], (A*src[1] + E*src[step+1]));\
   1.512 +            OP(dst[2], (A*src[2] + E*src[step+2]));\
   1.513 +            OP(dst[3], (A*src[3] + E*src[step+3]));\
   1.514 +            OP(dst[4], (A*src[4] + E*src[step+4]));\
   1.515 +            OP(dst[5], (A*src[5] + E*src[step+5]));\
   1.516 +            OP(dst[6], (A*src[6] + E*src[step+6]));\
   1.517 +            OP(dst[7], (A*src[7] + E*src[step+7]));\
   1.518 +            dst+= stride;\
   1.519 +            src+= stride;\
   1.520 +        }\
   1.521 +    }\
   1.522 +}
   1.523 +
   1.524 +#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1) /* avg store: scale the weighted sum b down by 64 with rounding, then take the rounded-up mean with the existing dst pixel */
   1.525 +#define op_put(a, b) a = (((b) + 32)>>6) /* put store: scale the weighted sum b down by 64 with rounding */
   1.526 +
   1.527 +H264_CHROMA_MC(put_       , op_put) /* defines put_h264_chroma_mc2_c, put_h264_chroma_mc4_c and put_h264_chroma_mc8_c */
   1.528 +H264_CHROMA_MC(avg_       , op_avg) /* defines avg_h264_chroma_mc2_c, avg_h264_chroma_mc4_c and avg_h264_chroma_mc8_c */
   1.529 +#undef op_avg /* released so later macro families can define their own store operations */
   1.530 +#undef op_put
   1.531 +
   1.532 +
   1.533 +#define H264_LOWPASS(OPNAME, OP, OP2) \
   1.534 +static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
   1.535 +    const int h=2;\
   1.536 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
   1.537 +    int i;\
   1.538 +    for(i=0; i<h; i++)\
   1.539 +    {\
   1.540 +        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
   1.541 +        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
   1.542 +        dst+=dstStride;\
   1.543 +        src+=srcStride;\
   1.544 +    }\
   1.545 +}\
   1.546 +\
   1.547 +static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
   1.548 +    const int w=2;\
   1.549 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
   1.550 +    int i;\
   1.551 +    for(i=0; i<w; i++)\
   1.552 +    {\
   1.553 +        const int srcB= src[-2*srcStride];\
   1.554 +        const int srcA= src[-1*srcStride];\
   1.555 +        const int src0= src[0 *srcStride];\
   1.556 +        const int src1= src[1 *srcStride];\
   1.557 +        const int src2= src[2 *srcStride];\
   1.558 +        const int src3= src[3 *srcStride];\
   1.559 +        const int src4= src[4 *srcStride];\
   1.560 +        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
   1.561 +        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
   1.562 +        dst++;\
   1.563 +        src++;\
   1.564 +    }\
   1.565 +}\
   1.566 +\
   1.567 +static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
   1.568 +    const int h=2;\
   1.569 +    const int w=2;\
   1.570 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
   1.571 +    int i;\
   1.572 +    src -= 2*srcStride;\
   1.573 +    for(i=0; i<h+5; i++)\
   1.574 +    {\
   1.575 +        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
   1.576 +        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
   1.577 +        tmp+=tmpStride;\
   1.578 +        src+=srcStride;\
   1.579 +    }\
   1.580 +    tmp -= tmpStride*(h+5-2);\
   1.581 +    for(i=0; i<w; i++)\
   1.582 +    {\
   1.583 +        const int tmpB= tmp[-2*tmpStride];\
   1.584 +        const int tmpA= tmp[-1*tmpStride];\
   1.585 +        const int tmp0= tmp[0 *tmpStride];\
   1.586 +        const int tmp1= tmp[1 *tmpStride];\
   1.587 +        const int tmp2= tmp[2 *tmpStride];\
   1.588 +        const int tmp3= tmp[3 *tmpStride];\
   1.589 +        const int tmp4= tmp[4 *tmpStride];\
   1.590 +        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
   1.591 +        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
   1.592 +        dst++;\
   1.593 +        tmp++;\
   1.594 +    }\
   1.595 +}\
   1.596 +static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
   1.597 +    const int h=4;\
   1.598 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
   1.599 +    int i;\
   1.600 +    for(i=0; i<h; i++)\
   1.601 +    {\
   1.602 +        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
   1.603 +        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
   1.604 +        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
   1.605 +        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
   1.606 +        dst+=dstStride;\
   1.607 +        src+=srcStride;\
   1.608 +    }\
   1.609 +}\
   1.610 +\
   1.611 +static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
   1.612 +    const int w=4;\
   1.613 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
   1.614 +    int i;\
   1.615 +    for(i=0; i<w; i++)\
   1.616 +    {\
   1.617 +        const int srcB= src[-2*srcStride];\
   1.618 +        const int srcA= src[-1*srcStride];\
   1.619 +        const int src0= src[0 *srcStride];\
   1.620 +        const int src1= src[1 *srcStride];\
   1.621 +        const int src2= src[2 *srcStride];\
   1.622 +        const int src3= src[3 *srcStride];\
   1.623 +        const int src4= src[4 *srcStride];\
   1.624 +        const int src5= src[5 *srcStride];\
   1.625 +        const int src6= src[6 *srcStride];\
   1.626 +        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
   1.627 +        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
   1.628 +        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
   1.629 +        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
   1.630 +        dst++;\
   1.631 +        src++;\
   1.632 +    }\
   1.633 +}\
   1.634 +\
   1.635 +static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
   1.636 +    const int h=4;\
   1.637 +    const int w=4;\
   1.638 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
   1.639 +    int i;\
   1.640 +    src -= 2*srcStride;\
   1.641 +    for(i=0; i<h+5; i++)\
   1.642 +    {\
   1.643 +        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
   1.644 +        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
   1.645 +        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
   1.646 +        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
   1.647 +        tmp+=tmpStride;\
   1.648 +        src+=srcStride;\
   1.649 +    }\
   1.650 +    tmp -= tmpStride*(h+5-2);\
   1.651 +    for(i=0; i<w; i++)\
   1.652 +    {\
   1.653 +        const int tmpB= tmp[-2*tmpStride];\
   1.654 +        const int tmpA= tmp[-1*tmpStride];\
   1.655 +        const int tmp0= tmp[0 *tmpStride];\
   1.656 +        const int tmp1= tmp[1 *tmpStride];\
   1.657 +        const int tmp2= tmp[2 *tmpStride];\
   1.658 +        const int tmp3= tmp[3 *tmpStride];\
   1.659 +        const int tmp4= tmp[4 *tmpStride];\
   1.660 +        const int tmp5= tmp[5 *tmpStride];\
   1.661 +        const int tmp6= tmp[6 *tmpStride];\
   1.662 +        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
   1.663 +        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
   1.664 +        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
   1.665 +        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
   1.666 +        dst++;\
   1.667 +        tmp++;\
   1.668 +    }\
   1.669 +}\
   1.670 +\
   1.671 +static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
   1.672 +    const int h=8;\
   1.673 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
   1.674 +    int i;\
   1.675 +    for(i=0; i<h; i++)\
   1.676 +    {\
   1.677 +        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
   1.678 +        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
   1.679 +        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
   1.680 +        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
   1.681 +        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
   1.682 +        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
   1.683 +        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
   1.684 +        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
   1.685 +        dst+=dstStride;\
   1.686 +        src+=srcStride;\
   1.687 +    }\
   1.688 +}\
   1.689 +\
   1.690 +static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
   1.691 +    const int w=8;\
   1.692 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
   1.693 +    int i;\
   1.694 +    for(i=0; i<w; i++)\
   1.695 +    {\
   1.696 +        const int srcB= src[-2*srcStride];\
   1.697 +        const int srcA= src[-1*srcStride];\
   1.698 +        const int src0= src[0 *srcStride];\
   1.699 +        const int src1= src[1 *srcStride];\
   1.700 +        const int src2= src[2 *srcStride];\
   1.701 +        const int src3= src[3 *srcStride];\
   1.702 +        const int src4= src[4 *srcStride];\
   1.703 +        const int src5= src[5 *srcStride];\
   1.704 +        const int src6= src[6 *srcStride];\
   1.705 +        const int src7= src[7 *srcStride];\
   1.706 +        const int src8= src[8 *srcStride];\
   1.707 +        const int src9= src[9 *srcStride];\
   1.708 +        const int src10=src[10*srcStride];\
   1.709 +        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
   1.710 +        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
   1.711 +        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
   1.712 +        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
   1.713 +        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
   1.714 +        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
   1.715 +        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
   1.716 +        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
   1.717 +        dst++;\
   1.718 +        src++;\
   1.719 +    }\
   1.720 +}\
   1.721 +\
   1.722 +static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
   1.723 +    const int h=8;\
   1.724 +    const int w=8;\
   1.725 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
   1.726 +    int i;\
   1.727 +    src -= 2*srcStride;\
   1.728 +    for(i=0; i<h+5; i++)\
   1.729 +    {\
   1.730 +        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
   1.731 +        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
   1.732 +        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
   1.733 +        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
   1.734 +        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
   1.735 +        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
   1.736 +        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
   1.737 +        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
   1.738 +        tmp+=tmpStride;\
   1.739 +        src+=srcStride;\
   1.740 +    }\
   1.741 +    tmp -= tmpStride*(h+5-2);\
   1.742 +    for(i=0; i<w; i++)\
   1.743 +    {\
   1.744 +        const int tmpB= tmp[-2*tmpStride];\
   1.745 +        const int tmpA= tmp[-1*tmpStride];\
   1.746 +        const int tmp0= tmp[0 *tmpStride];\
   1.747 +        const int tmp1= tmp[1 *tmpStride];\
   1.748 +        const int tmp2= tmp[2 *tmpStride];\
   1.749 +        const int tmp3= tmp[3 *tmpStride];\
   1.750 +        const int tmp4= tmp[4 *tmpStride];\
   1.751 +        const int tmp5= tmp[5 *tmpStride];\
   1.752 +        const int tmp6= tmp[6 *tmpStride];\
   1.753 +        const int tmp7= tmp[7 *tmpStride];\
   1.754 +        const int tmp8= tmp[8 *tmpStride];\
   1.755 +        const int tmp9= tmp[9 *tmpStride];\
   1.756 +        const int tmp10=tmp[10*tmpStride];\
   1.757 +        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
   1.758 +        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
   1.759 +        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
   1.760 +        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
   1.761 +        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
   1.762 +        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
   1.763 +        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
   1.764 +        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
   1.765 +        dst++;\
   1.766 +        tmp++;\
   1.767 +    }\
   1.768 +}\
   1.769 +\
   1.770 +static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
   1.771 +    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
   1.772 +    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
   1.773 +    src += 8*srcStride;\
   1.774 +    dst += 8*dstStride;\
   1.775 +    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
   1.776 +    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
   1.777 +}\
   1.778 +\
   1.779 +static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
   1.780 +    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
   1.781 +    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
   1.782 +    src += 8*srcStride;\
   1.783 +    dst += 8*dstStride;\
   1.784 +    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
   1.785 +    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
   1.786 +}\
   1.787 +\
   1.788 +static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
   1.789 +    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
   1.790 +    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
   1.791 +    src += 8*srcStride;\
   1.792 +    dst += 8*dstStride;\
   1.793 +    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
   1.794 +    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
   1.795 +}\
   1.796 +
   1.797 +#define H264_MC(OPNAME, SIZE) /* Emits the sixteen OPNAME h264_qpel SIZE _mcXY_c entry points. Judging by the helpers each body calls, X picks the horizontal and Y the vertical variant. The pixels _c, pixels _l2 and copy_block helpers are defined outside this excerpt; _l2 presumably blends its two source blocks - TODO confirm. */ \
   1.798 +static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){ /* mc00: no lowpass filter; forwards to the plain pixels helper */\
   1.799 +    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
   1.800 +}\
   1.801 +\
   1.802 +static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){ /* mc10: l2 of src and the horizontal lowpass of src */\
   1.803 +    uint8_t half[SIZE*SIZE]; /* lowpass output, row stride SIZE */\
   1.804 +    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride); /* always the put_ helper: OPNAME is applied only by the final l2 call */\
   1.805 +    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
   1.806 +}\
   1.807 +\
   1.808 +static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){ /* mc20: horizontal lowpass applied straight to dst through OPNAME */\
   1.809 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
   1.810 +}\
   1.811 +\
   1.812 +static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){ /* mc30: as mc10, but the lowpass is paired with src+1 (one pixel to the right) */\
   1.813 +    uint8_t half[SIZE*SIZE];\
   1.814 +    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
   1.815 +    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
   1.816 +}\
   1.817 +\
   1.818 +static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){ /* mc01: l2 of the copied source block and its vertical lowpass */\
   1.819 +    uint8_t full[SIZE*(SIZE+5)]; /* SIZE+5 rows: the vertical filter reads 2 rows above and 3 rows below the block */\
   1.820 +    uint8_t * const full_mid= full + SIZE*2; /* skips the 2 leading rows, so it points at the row copied from src */\
   1.821 +    uint8_t half[SIZE*SIZE];\
   1.822 +    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5); /* copy starts two rows above src; packed with row stride SIZE (argument meaning assumed from usage - confirm) */\
   1.823 +    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
   1.824 +    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
   1.825 +}\
   1.826 +\
   1.827 +static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){ /* mc02: vertical lowpass applied straight to dst through OPNAME */\
   1.828 +    uint8_t full[SIZE*(SIZE+5)];\
   1.829 +    uint8_t * const full_mid= full + SIZE*2;\
   1.830 +    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
   1.831 +    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
   1.832 +}\
   1.833 +\
   1.834 +static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){ /* mc03: as mc01, but the lowpass is paired with the row below (full_mid+SIZE) */\
   1.835 +    uint8_t full[SIZE*(SIZE+5)];\
   1.836 +    uint8_t * const full_mid= full + SIZE*2;\
   1.837 +    uint8_t half[SIZE*SIZE];\
   1.838 +    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
   1.839 +    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
   1.840 +    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
   1.841 +}\
   1.842 +\
   1.843 +static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){ /* mc11: l2 of the horizontal lowpass and the vertical lowpass, both taken at src */\
   1.844 +    uint8_t full[SIZE*(SIZE+5)];\
   1.845 +    uint8_t * const full_mid= full + SIZE*2;\
   1.846 +    uint8_t halfH[SIZE*SIZE];\
   1.847 +    uint8_t halfV[SIZE*SIZE];\
   1.848 +    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
   1.849 +    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
   1.850 +    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
   1.851 +    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
   1.852 +}\
   1.853 +\
   1.854 +static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){ /* mc31: as mc11, but the vertical lowpass runs one column to the right */\
   1.855 +    uint8_t full[SIZE*(SIZE+5)];\
   1.856 +    uint8_t * const full_mid= full + SIZE*2;\
   1.857 +    uint8_t halfH[SIZE*SIZE];\
   1.858 +    uint8_t halfV[SIZE*SIZE];\
   1.859 +    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
   1.860 +    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5); /* the +1 selects the neighbouring column */\
   1.861 +    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
   1.862 +    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
   1.863 +}\
   1.864 +\
   1.865 +static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){ /* mc13: as mc11, but the horizontal lowpass runs one row down */\
   1.866 +    uint8_t full[SIZE*(SIZE+5)];\
   1.867 +    uint8_t * const full_mid= full + SIZE*2;\
   1.868 +    uint8_t halfH[SIZE*SIZE];\
   1.869 +    uint8_t halfV[SIZE*SIZE];\
   1.870 +    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride); /* src + stride selects the next row */\
   1.871 +    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
   1.872 +    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
   1.873 +    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
   1.874 +}\
   1.875 +\
   1.876 +static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){ /* mc33: as mc11, with the horizontal lowpass one row down and the vertical lowpass one column to the right */\
   1.877 +    uint8_t full[SIZE*(SIZE+5)];\
   1.878 +    uint8_t * const full_mid= full + SIZE*2;\
   1.879 +    uint8_t halfH[SIZE*SIZE];\
   1.880 +    uint8_t halfV[SIZE*SIZE];\
   1.881 +    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
   1.882 +    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
   1.883 +    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
   1.884 +    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
   1.885 +}\
   1.886 +\
   1.887 +static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){ /* mc22: combined horizontal+vertical lowpass applied straight to dst through OPNAME */\
   1.888 +    int16_t tmp[SIZE*(SIZE+5)]; /* 16-bit intermediate of the first pass: SIZE+5 rows of SIZE values (tmpStride is SIZE) */\
   1.889 +    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
   1.890 +}\
   1.891 +\
   1.892 +static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){ /* mc21: l2 of the horizontal lowpass and the hv lowpass */\
   1.893 +    int16_t tmp[SIZE*(SIZE+5)];\
   1.894 +    uint8_t halfH[SIZE*SIZE];\
   1.895 +    uint8_t halfHV[SIZE*SIZE];\
   1.896 +    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
   1.897 +    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
   1.898 +    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
   1.899 +}\
   1.900 +\
   1.901 +static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){ /* mc23: as mc21, with the horizontal lowpass taken one row down */\
   1.902 +    int16_t tmp[SIZE*(SIZE+5)];\
   1.903 +    uint8_t halfH[SIZE*SIZE];\
   1.904 +    uint8_t halfHV[SIZE*SIZE];\
   1.905 +    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
   1.906 +    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
   1.907 +    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
   1.908 +}\
   1.909 +\
   1.910 +static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){ /* mc12: l2 of the vertical lowpass and the hv lowpass */\
   1.911 +    uint8_t full[SIZE*(SIZE+5)];\
   1.912 +    uint8_t * const full_mid= full + SIZE*2;\
   1.913 +    int16_t tmp[SIZE*(SIZE+5)];\
   1.914 +    uint8_t halfV[SIZE*SIZE];\
   1.915 +    uint8_t halfHV[SIZE*SIZE];\
   1.916 +    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
   1.917 +    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
   1.918 +    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
   1.919 +    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
   1.920 +}\
   1.921 +\
   1.922 +static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){ /* mc32: as mc12, with the vertical lowpass taken one column to the right */\
   1.923 +    uint8_t full[SIZE*(SIZE+5)];\
   1.924 +    uint8_t * const full_mid= full + SIZE*2;\
   1.925 +    int16_t tmp[SIZE*(SIZE+5)];\
   1.926 +    uint8_t halfV[SIZE*SIZE];\
   1.927 +    uint8_t halfHV[SIZE*SIZE];\
   1.928 +    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
   1.929 +    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
   1.930 +    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
   1.931 +    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
   1.932 +}\
   1.933 +
   1.934 +#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1) /* same scaled and table-mapped value as op_put, then averaged with the value already in a, with +1 for rounding */
   1.935 +#define op_put(a, b)  a = cm[((b) + 16)>>5] /* b is a one-pass filter sum: add 16 and shift right by 5 (the taps 20+20-5-5+1+1 sum to 32), then map through cm, the ff_cropTbl + MAX_NEG_CROP pointer local to each lowpass function */
   1.936 +#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1) /* two-pass counterpart of op_avg */
   1.937 +#define op2_put(a, b)  a = cm[((b) + 512)>>10] /* b is a two-pass (hv) filter sum, so the scale is 32*32 = 1024: add 512 and shift right by 10 */
   1.938 +
   1.939 +H264_LOWPASS(put_       , op_put, op2_put) /* put_ lowpass helpers: the result is assigned to dst */
   1.940 +H264_LOWPASS(avg_       , op_avg, op2_avg) /* avg_ lowpass helpers: the result is averaged into dst */
   1.941 +H264_MC(put_, 2) /* put_ entry points are generated for sizes 2, 4, 8 and 16 */
   1.942 +H264_MC(put_, 4)
   1.943 +H264_MC(put_, 8)
   1.944 +H264_MC(put_, 16)
   1.945 +H264_MC(avg_, 4) /* avg_ entry points only for sizes 4, 8 and 16; no avg_ size-2 set is generated */
   1.946 +H264_MC(avg_, 8)
   1.947 +H264_MC(avg_, 16)
   1.948 +
   1.949 +#undef op_avg /* the op helpers are only needed by the instantiations above */
   1.950 +#undef op_put
   1.951 +#undef op2_avg
   1.952 +#undef op2_put
   1.953 +
   1.954 +static void clear_block_c(DCTELEM *block) /* Zero the 64 DCTELEM coefficients starting at block. */
   1.955 +{
   1.956 +    memset(block, 0, sizeof(DCTELEM)*64);
   1.957 +}
   1.958 +
   1.959 +/**
   1.960 + * Zero 6*64 consecutive DCTELEM coefficients starting at blocks, i.e. memset(blocks, 0, sizeof(DCTELEM)*6*64)
   1.961 + */
   1.962 +static void clear_blocks_c(DCTELEM *blocks)
   1.963 +{
   1.964 +    memset(blocks, 0, sizeof(DCTELEM)*6*64);
   1.965 +}
   1.966 +
   1.967 +static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; } /* No-op that ignores all arguments; dsputil_init installs it as the default c->prefetch. */
   1.968 +
   1.969 +/* init static data: fills the ff_cropTbl clamping table and the ff_squareTbl table of squares */
   1.970 +av_cold void dsputil_static_init(void)
   1.971 +{
   1.972 +    int i;
   1.973 +
   1.974 +    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i; /* middle 256 entries: identity for 0..255 */
   1.975 +    for(i=0;i<MAX_NEG_CROP;i++) {
   1.976 +        ff_cropTbl[i] = 0; /* first MAX_NEG_CROP entries (negative values once the table is offset by MAX_NEG_CROP) map to 0 */
   1.977 +        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255; /* last MAX_NEG_CROP entries (values above 255) map to 255 */
   1.978 +    }
   1.979 +
   1.980 +    for(i=0;i<512;i++) {
   1.981 +        ff_squareTbl[i] = (i - 256) * (i - 256); /* entry i holds the square of i-256, covering -256..255 */
   1.982 +    }
   1.983 +}
   1.984 +
   1.985 +int ff_check_alignment(void){ /* Returns 0 when a stack variable declared with DECLARE_ALIGNED(16, ...) really sits on a 16-byte boundary, -1 otherwise. */
   1.986 +    static int did_fail=0; /* set on the first failure so the message below is logged at most once */
   1.987 +    DECLARE_ALIGNED(16, int, aligned); /* probe variable requested with 16-byte alignment */
   1.988 +
   1.989 +    if((intptr_t)&aligned & 15){ /* any of the low four address bits set => not 16-byte aligned */
   1.990 +        if(!did_fail){
   1.991 +#if HAVE_MMX || HAVE_ALTIVEC /* the message is only compiled in when one of these is non-zero */
  1.992 +            av_log(AV_LOG_ERROR, /* NOTE(review): upstream FFmpeg av_log() takes a context pointer before the level; this tree passes the level first - confirm against the av_log prototype used here */
  1.993 +                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
  1.994 +                "and may be very slow or crash. This is not a bug in libavcodec,\n"
  1.995 +                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
  1.996 +                "Do not report crashes to FFmpeg developers.\n");
  1.997 +#endif
  1.998 +            did_fail=1;
  1.999 +        }
  1.1000 +        return -1;
  1.1001 +    }
  1.1002 +    return 0;
  1.1003 +}
  1.1004 +
  1.1005 +av_cold void dsputil_init(DSPContext* c) /* Fills the function pointers of *c with the C implementations, then calls the architecture-specific initialisers selected by HAVE_MMX / ARCH_ARM / HAVE_ALTIVEC. */
  1.1006 +{
  1.1007 +    (void) avg_pixels2_c; // kill a warning, avg_pixels2_c is a macro created function.
  1.1008 +    ff_check_alignment(); /* return value is ignored; the call can only produce its one-time log message */
  1.1009 +    dsputil_static_init(); /* the lookup tables are refilled on every call */
  1.1010 + 
  1.1011 +    c->idct_put= ff_simple_idct_put;
  1.1012 +    c->idct_add= ff_simple_idct_add;
  1.1013 +    c->idct    = ff_simple_idct;
  1.1014 +
  1.1015 +    c->clear_block = clear_block_c;
  1.1016 +    c->clear_blocks = clear_blocks_c;
  1.1017 +
  1.1018 +#define dspfunc(PFX, IDX, NUM) /* fills the 16 entries of row IDX of the PFX pixels table; function _mcXY_c lands at index X + 4*Y */ \
  1.1019 +    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
  1.1020 +    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
  1.1021 +    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
  1.1022 +    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
  1.1023 +    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
  1.1024 +    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
  1.1025 +    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
  1.1026 +    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
  1.1027 +    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
  1.1028 +    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
  1.1029 +    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
  1.1030 +    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
  1.1031 +    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
  1.1032 +    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
  1.1033 +    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
  1.1034 +    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
  1.1035 +
  1.1036 +
  1.1037 +    dspfunc(put_h264_qpel, 0, 16); /* put rows 0..3 hold block sizes 16, 8, 4 and 2 */
  1.1038 +    dspfunc(put_h264_qpel, 1, 8);
  1.1039 +    dspfunc(put_h264_qpel, 2, 4);
  1.1040 +    dspfunc(put_h264_qpel, 3, 2);
  1.1041 +    dspfunc(avg_h264_qpel, 0, 16); /* avg rows 0..2 hold block sizes 16, 8 and 4; no size-2 row is set here */
  1.1042 +    dspfunc(avg_h264_qpel, 1, 8);
  1.1043 +    dspfunc(avg_h264_qpel, 2, 4);
  1.1044 +
  1.1045 +#undef dspfunc
  1.1046 +    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c; /* chroma tables: index 0, 1, 2 = the mc8, mc4, mc2 functions */
  1.1047 +    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
  1.1048 +    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
  1.1049 +    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
  1.1050 +    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
  1.1051 +    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
  1.1052 +
  1.1053 +
  1.1054 +    c->prefetch= just_return; /* no-op default */
  1.1055 +
  1.1056 +    if (HAVE_MMX)        dsputil_init_mmx   (c); /* HAVE_x / ARCH_x are used as ordinary expressions, so each must be defined (presumably to 0 or 1 by config.h - confirm); the arch initialisers run last and presumably replace some of the pointers set above */
  1.1057 +    if (ARCH_ARM)        dsputil_init_arm   (c);
  1.1058 +    if (HAVE_ALTIVEC)    dsputil_init_ppc   (c); //fixme PPC prefetch
  1.1059 +}
  1.1060 +