diff libavcodec/cell/dsputil_spu.c @ 2:897f711a7157

rearrange to work with autoconf
author Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date Tue, 25 Sep 2012 15:55:33 +0200
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/libavcodec/cell/dsputil_spu.c	Tue Sep 25 15:55:33 2012 +0200
     1.3 @@ -0,0 +1,1147 @@
     1.4 +/*
     1.5 + * Copyright (c) 2009 TUDelft 
     1.6 + * 
     1.7 + * Cell Parallel SPU - 2DWave Macroblock Decoding. 
     1.8 + */
     1.9 +
    1.10 +/**
    1.11 + * @file libavcodec/cell/dsputil_spu.c
    1.12 + * Cell Parallel SPU - 2DWave Macroblock Decoding
    1.13 + * @author C C Chi <c.c.chi@student.tudelft.nl>
    1.14 + * 
    1.15 + * SIMD SPU kernels 
    1.16 + * H.264/AVC motion compensation
    1.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu>
    1.18 + * @author Albert Paradis <apar7632@hotmail.com>
    1.19 + */ 
    1.20 +
    1.21 +
    1.22 +#include "dsputil_spu.h"
    1.23 +#include "h264_idct_spu.h"
    1.24 +#include "h264_deblock_spu.h"
    1.25 +#include "types_spu.h"
    1.26 +#include "libavutil/intreadwrite.h"
    1.27 +
    1.28 +#include <stdio.h>
    1.29 +#include <spu_intrinsics.h>
    1.30 +#include <spu_mfcio.h>
    1.31 +#include <assert.h>
    1.32 +
    1.33 +// Luma interpolation: h264_luma_template_spu.c is included twice below, once per store operator (put, then avg).
    1.34 +#define PUT_OP_U8_SPU(d, s, dst) (void) dst; d = s /* put: d receives s; dst is evaluated and discarded */
    1.35 +#define AVG_OP_U8_SPU(d, s, dst) d = spu_avg(dst, s) /* avg: d receives spu_avg(dst, s) */
    1.36 +/* NOTE(review): the PREFIX_* macros presumably give the template the names of the functions it defines -- confirm in the template. */
    1.37 +#define OP_U8_SPU                          PUT_OP_U8_SPU
    1.38 +#define PREFIX_h264_qpel16_h_lowpass_spu   put_h264_qpel16_h_lowpass_spu
    1.39 +#define PREFIX_h264_qpel16_v_lowpass_spu   put_h264_qpel16_v_lowpass_spu
    1.40 +#define PREFIX_h264_qpel16_hv_lowpass_spu  put_h264_qpel16_hv_lowpass_spu
    1.41 +#define PREFIX_h264_qpel8_h_lowpass_spu    put_h264_qpel8_h_lowpass_spu
    1.42 +#define PREFIX_h264_qpel8_v_lowpass_spu    put_h264_qpel8_v_lowpass_spu
    1.43 +#define PREFIX_h264_qpel8_hv_lowpass_spu   put_h264_qpel8_hv_lowpass_spu
    1.44 +#define PREFIX_h264_qpel4_h_lowpass_spu    put_h264_qpel4_h_lowpass_spu
    1.45 +#define PREFIX_h264_qpel4_v_lowpass_spu    put_h264_qpel4_v_lowpass_spu
    1.46 +#define PREFIX_h264_qpel4_hv_lowpass_spu   put_h264_qpel4_hv_lowpass_spu
    1.47 +#include "h264_luma_template_spu.c"
    1.48 +#undef OP_U8_SPU                          
    1.49 +#undef PREFIX_h264_qpel16_h_lowpass_spu
    1.50 +#undef PREFIX_h264_qpel16_v_lowpass_spu
    1.51 +#undef PREFIX_h264_qpel16_hv_lowpass_spu
    1.52 +#undef PREFIX_h264_qpel8_h_lowpass_spu
    1.53 +#undef PREFIX_h264_qpel8_v_lowpass_spu
    1.54 +#undef PREFIX_h264_qpel8_hv_lowpass_spu
    1.55 +#undef PREFIX_h264_qpel4_h_lowpass_spu
    1.56 +#undef PREFIX_h264_qpel4_v_lowpass_spu
    1.57 +#undef PREFIX_h264_qpel4_hv_lowpass_spu
    1.58 +/* Second inclusion of the same template: OP_U8_SPU is now the avg operator and every PREFIX_* name starts with avg_. */
    1.59 +#define OP_U8_SPU                          AVG_OP_U8_SPU
    1.60 +#define PREFIX_h264_qpel16_h_lowpass_spu   avg_h264_qpel16_h_lowpass_spu
    1.61 +#define PREFIX_h264_qpel16_v_lowpass_spu   avg_h264_qpel16_v_lowpass_spu
    1.62 +#define PREFIX_h264_qpel16_hv_lowpass_spu  avg_h264_qpel16_hv_lowpass_spu
    1.63 +#define PREFIX_h264_qpel8_h_lowpass_spu    avg_h264_qpel8_h_lowpass_spu
    1.64 +#define PREFIX_h264_qpel8_v_lowpass_spu    avg_h264_qpel8_v_lowpass_spu
    1.65 +#define PREFIX_h264_qpel8_hv_lowpass_spu   avg_h264_qpel8_hv_lowpass_spu
    1.66 +#define PREFIX_h264_qpel4_h_lowpass_spu    avg_h264_qpel4_h_lowpass_spu
    1.67 +#define PREFIX_h264_qpel4_v_lowpass_spu    avg_h264_qpel4_v_lowpass_spu
    1.68 +#define PREFIX_h264_qpel4_hv_lowpass_spu   avg_h264_qpel4_hv_lowpass_spu
    1.69 +#include "h264_luma_template_spu.c"
    1.70 +#undef OP_U8_SPU                          
    1.71 +#undef PREFIX_h264_qpel16_h_lowpass_spu
    1.72 +#undef PREFIX_h264_qpel16_v_lowpass_spu
    1.73 +#undef PREFIX_h264_qpel16_hv_lowpass_spu
    1.74 +#undef PREFIX_h264_qpel8_h_lowpass_spu
    1.75 +#undef PREFIX_h264_qpel8_v_lowpass_spu
    1.76 +#undef PREFIX_h264_qpel8_hv_lowpass_spu
    1.77 +#undef PREFIX_h264_qpel4_h_lowpass_spu
    1.78 +#undef PREFIX_h264_qpel4_v_lowpass_spu
    1.79 +#undef PREFIX_h264_qpel4_hv_lowpass_spu
    1.80 +/* H264_MC(OPNAME, SIZE, CODETYPE) defines 16 static functions OPNAME##h264_qpel##SIZE##_mcXY_##CODETYPE (mc00 calls OPNAME pixels directly). Intermediate buffers are 16x16 with row stride 16. NOTE(review): XY presumably encodes the horizontal/vertical quarter-sample position -- confirm against the callers. */
    1.81 +#define H264_MC(OPNAME, SIZE, CODETYPE) \
    1.82 +static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
    1.83 +    OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, dst_stride, STRIDE_Y, h);\
    1.84 +}\
    1.85 +/* mc10: put horizontal lowpass into half, then OPNAME pixels_l2 of src and half */\
    1.86 +static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){ \
    1.87 +    DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
    1.88 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\
    1.89 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\
    1.90 +}\
    1.91 +/* mc20: OPNAME horizontal lowpass written straight into dst */\
    1.92 +static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
    1.93 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\
    1.94 +}\
    1.95 +/* mc30: as mc10, but half is paired with src+1 */\
    1.96 +static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
    1.97 +    DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
    1.98 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\
    1.99 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, dst_stride, STRIDE_Y, h);\
   1.100 +}\
   1.101 +/* mc01: put vertical lowpass into half, then OPNAME pixels_l2 of src and half */\
   1.102 +static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
   1.103 +    DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
   1.104 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\
   1.105 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\
   1.106 +}\
   1.107 +/* mc02: OPNAME vertical lowpass written straight into dst */\
   1.108 +static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
   1.109 +    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\
   1.110 +}\
   1.111 +/* mc03: as mc01, but half is paired with src+STRIDE_Y */\
   1.112 +static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
   1.113 +    DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
   1.114 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\
   1.115 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+STRIDE_Y, half, dst_stride, STRIDE_Y, h);\
   1.116 +}\
   1.117 +/* mc11: horizontal and vertical lowpass of src, combined by OPNAME pixels_l2 */\
   1.118 +static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
   1.119 +    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
   1.120 +    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
   1.121 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\
   1.122 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\
   1.123 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
   1.124 +}\
   1.125 +/* mc31: as mc11, but the vertical lowpass is taken at src+1 */\
   1.126 +static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
   1.127 +    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
   1.128 +    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
   1.129 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\
   1.130 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\
   1.131 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
   1.132 +}\
   1.133 +/* mc13: as mc11, but the horizontal lowpass is taken at src+STRIDE_Y */\
   1.134 +static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
   1.135 +    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
   1.136 +    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
   1.137 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\
   1.138 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\
   1.139 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
   1.140 +}\
   1.141 +/* mc33: horizontal lowpass at src+STRIDE_Y and vertical lowpass at src+1, combined by OPNAME pixels_l2 */\
   1.142 +static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
   1.143 +    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
   1.144 +    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
   1.145 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\
   1.146 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\
   1.147 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
   1.148 +}\
   1.149 +/* mc22: OPNAME hv lowpass written into dst; the int16_t tmp buffer is handed to it as its second argument */\
   1.150 +static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
   1.151 +    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
   1.152 +    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, dst_stride, 16, h);\
   1.153 +}\
   1.154 +/* mc21: horizontal lowpass and hv lowpass of src, combined by OPNAME pixels_l2 */\
   1.155 +static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
   1.156 +    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
   1.157 +    DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
   1.158 +    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
   1.159 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\
   1.160 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
   1.161 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\
   1.162 +}\
   1.163 +/* mc23: as mc21, but the horizontal lowpass is taken at src+STRIDE_Y */\
   1.164 +static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
   1.165 +    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
   1.166 +    DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
   1.167 +    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
   1.168 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\
   1.169 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
   1.170 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\
   1.171 +}\
   1.172 +/* mc12: vertical lowpass and hv lowpass of src, combined by OPNAME pixels_l2 */\
   1.173 +static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
   1.174 +    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
   1.175 +    DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
   1.176 +    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
   1.177 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\
   1.178 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
   1.179 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\
   1.180 +}\
   1.181 +/* mc32: as mc12, but the vertical lowpass is taken at src+1 */\
   1.182 +static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
   1.183 +    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
   1.184 +    DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
   1.185 +    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
   1.186 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\
   1.187 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
   1.188 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\
   1.189 +}\
   1.190 +
   1.191 +
   1.192 +/**************************/
   1.193 +/* put pixels functions   */
   1.194 +/*************************/
   1.195 +
   1.196 +/* put: each dst row = spu_avg(src1 row, src2 row), h rows of 16 pixels; src1 may be unaligned, src2 advances 16 bytes per row. */
   1.197 +static void put_pixels16_l2_spu(uint8_t *dst, const uint8_t *src1,
   1.198 +                                const uint8_t *src2, int dst_stride,
   1.199 +                                int src_stride1, int h)
   1.200 +{
   1.201 +  /* offset of src1 inside its quadword, taken once and reused for every row */
   1.202 +  const int offs = (unsigned int) src1 & 15;
   1.203 +  int rows_left;
   1.204 +
   1.205 +  for (rows_left = h; rows_left > 0; rows_left--) {
   1.206 +      /* unaligned read of src1: merge the quadwords read at src1 and src1+16 */
   1.207 +      const vuint8_t head  = *(vuint8_t *)(src1);
   1.208 +      const vuint8_t tail  = *(vuint8_t *)(src1 + 16);
   1.209 +      const vuint8_t first = spu_or(spu_slqwbyte(head, offs),
   1.210 +                                    spu_rlmaskqwbyte(tail, offs - 16));
   1.211 +
   1.212 +      /* src2 is read with a single quadword load */
   1.213 +      const vuint8_t second = *(vuint8_t *)(src2);
   1.214 +
   1.215 +      /* whole-quadword store (original note: 16x16 dest luma blocks are always aligned) */
   1.216 +      *(vuint8_t *)dst = spu_avg(first, second);
   1.217 +
   1.218 +      /* step every pointer to its next row */
   1.219 +      src1 += src_stride1;
   1.220 +      src2 += 16;
   1.221 +      dst  += dst_stride;
   1.222 +  }
   1.223 +}
   1.224 +
   1.225 +/* avg: each dst row = spu_avg(spu_avg(src1 row, src2 row), old dst row), h rows of 16 pixels; src1 may be unaligned. */
   1.226 +static void avg_pixels16_l2_spu(uint8_t *dst, const uint8_t *src1,
   1.227 +                                const uint8_t *src2, int dst_stride,
   1.228 +                                int src_stride1, int h)
   1.229 +{
   1.230 +  /* offset of src1 inside its quadword, taken once and reused for every row */
   1.231 +  const int offs = (unsigned int) src1 & 15;
   1.232 +  int rows_left;
   1.233 +
   1.234 +  for (rows_left = h; rows_left > 0; rows_left--) {
   1.235 +      /* unaligned read of src1: merge the quadwords read at src1 and src1+16 */
   1.236 +      const vuint8_t head  = *(vuint8_t *)(src1);
   1.237 +      const vuint8_t tail  = *(vuint8_t *)(src1 + 16);
   1.238 +      const vuint8_t first = spu_or(spu_slqwbyte(head, offs),
   1.239 +                                    spu_rlmaskqwbyte(tail, offs - 16));
   1.240 +
   1.241 +      /* src2 is read with a single quadword load */
   1.242 +      const vuint8_t second = *(vuint8_t *)(src2);
   1.243 +
   1.244 +      /* blend the new average with what dst already holds, then store the whole quadword */
   1.245 +      *(vuint8_t *)dst = spu_avg(spu_avg(first, second), *(vuint8_t *)dst);
   1.246 +
   1.247 +      /* step every pointer to its next row */
   1.248 +      src1 += src_stride1;
   1.249 +      src2 += 16;
   1.250 +      dst  += dst_stride;
   1.251 +  }
   1.252 +}
   1.253 +
   1.254 +// Copies rows of 16 pixels from src (any alignment) to dst, four rows per pass while i < h. Original note: assumes ((line_size % 16) == 0).
   1.255 +void put_pixels16_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
   1.256 +{
   1.257 +    /* offset of src inside its quadword, taken once before the loop */
   1.258 +    const int offs = (unsigned int) src & 15;
   1.259 +    /* byte offsets of rows 1..3 and of the next four-row group, for source and destination */
   1.260 +    const int src_row1 = src_stride;
   1.261 +    const int src_row2 = src_stride << 1;
   1.262 +    const int src_row3 = src_row1 + src_row2;
   1.263 +    const int src_step = src_stride << 2;
   1.264 +    const int dst_row1 = dst_stride;
   1.265 +    const int dst_row2 = dst_stride << 1;
   1.266 +    const int dst_row3 = dst_row2 + dst_stride;
   1.267 +    const int dst_step = dst_stride << 2;
   1.268 +    int done = 0;
   1.269 +
   1.270 +    while (done < h) {
   1.271 +      /* read both quadwords of all four source rows before anything is stored */
   1.272 +      const vector unsigned char r0_head = *(vuint8_t *)(src);
   1.273 +      const vector unsigned char r0_tail = *(vuint8_t *)(src + 16);
   1.274 +      const vector unsigned char r1_head = *(vuint8_t *)(src + src_row1);
   1.275 +      const vector unsigned char r1_tail = *(vuint8_t *)(src + 16 + src_row1);
   1.276 +      const vector unsigned char r2_head = *(vuint8_t *)(src + src_row2);
   1.277 +      const vector unsigned char r2_tail = *(vuint8_t *)(src + 16 + src_row2);
   1.278 +      const vector unsigned char r3_head = *(vuint8_t *)(src + src_row3);
   1.279 +      const vector unsigned char r3_tail = *(vuint8_t *)(src + 16 + src_row3);
   1.280 +
   1.281 +      /* realign each row by offs and write it as one quadword */
   1.282 +      *(vuint8_t *) dst             = spu_or(spu_slqwbyte(r0_head, offs), spu_rlmaskqwbyte(r0_tail, offs - 16));
   1.283 +      *(vuint8_t *)(dst + dst_row1) = spu_or(spu_slqwbyte(r1_head, offs), spu_rlmaskqwbyte(r1_tail, offs - 16));
   1.284 +      *(vuint8_t *)(dst + dst_row2) = spu_or(spu_slqwbyte(r2_head, offs), spu_rlmaskqwbyte(r2_tail, offs - 16));
   1.285 +      *(vuint8_t *)(dst + dst_row3) = spu_or(spu_slqwbyte(r3_head, offs), spu_rlmaskqwbyte(r3_tail, offs - 16));
   1.286 +
   1.287 +      src  += src_step;
   1.288 +      dst  += dst_step;
   1.289 +      done += 4;
   1.290 +    }
   1.291 +}
   1.292 +
   1.293 +// Averages rows of 16 pixels from src (any alignment) into dst, four rows per pass while i < h. Original note: assumes ((line_size % 16) == 0).
   1.294 +void avg_pixels16_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
   1.295 +{
   1.296 +    /* offset of src inside its quadword, taken once before the loop */
   1.297 +    const int offs = (unsigned int) src & 15;
   1.298 +
   1.299 +    /* byte offsets of rows 1..3 and of the next four-row group, for source and destination */
   1.300 +    const int src_row1 = src_stride;
   1.301 +    const int src_row2 = src_stride << 1;
   1.302 +    const int src_row3 = src_row1 + src_row2;
   1.303 +    const int src_step = src_stride << 2;
   1.304 +    const int dst_row1 = dst_stride;
   1.305 +    const int dst_row2 = dst_stride << 1;
   1.306 +    const int dst_row3 = dst_row2 + dst_stride;
   1.307 +    const int dst_step = dst_stride << 2;
   1.308 +    int done = 0;
   1.309 +
   1.310 +    while (done < h) {
   1.311 +      /* read both quadwords of all four source rows before anything is stored */
   1.312 +      const vector unsigned char r0_head = *(vuint8_t *)(src);
   1.313 +      const vector unsigned char r0_tail = *(vuint8_t *)(src + 16);
   1.314 +      const vector unsigned char r1_head = *(vuint8_t *)(src + src_row1);
   1.315 +      const vector unsigned char r1_tail = *(vuint8_t *)(src + 16 + src_row1);
   1.316 +      const vector unsigned char r2_head = *(vuint8_t *)(src + src_row2);
   1.317 +      const vector unsigned char r2_tail = *(vuint8_t *)(src + 16 + src_row2);
   1.318 +      const vector unsigned char r3_head = *(vuint8_t *)(src + src_row3);
   1.319 +      const vector unsigned char r3_tail = *(vuint8_t *)(src + 16 + src_row3);
   1.320 +
   1.321 +      /* realign each row by offs, average it with the quadword already in dst, and write the result back */
   1.322 +      *(vuint8_t *) dst             = spu_avg(spu_or(spu_slqwbyte(r0_head, offs), spu_rlmaskqwbyte(r0_tail, offs - 16)), *(vuint8_t *) dst);
   1.323 +      *(vuint8_t *)(dst + dst_row1) = spu_avg(spu_or(spu_slqwbyte(r1_head, offs), spu_rlmaskqwbyte(r1_tail, offs - 16)), *(vuint8_t *)(dst + dst_row1));
   1.324 +      *(vuint8_t *)(dst + dst_row2) = spu_avg(spu_or(spu_slqwbyte(r2_head, offs), spu_rlmaskqwbyte(r2_tail, offs - 16)), *(vuint8_t *)(dst + dst_row2));
   1.325 +      *(vuint8_t *)(dst + dst_row3) = spu_avg(spu_or(spu_slqwbyte(r3_head, offs), spu_rlmaskqwbyte(r3_tail, offs - 16)), *(vuint8_t *)(dst + dst_row3));
   1.326 +
   1.327 +      src  += src_step;
   1.328 +      dst  += dst_step;
   1.329 +      done += 4;
   1.330 +    }
   1.331 +}
   1.332 +
   1.333 +/* put: writes spu_avg(src1 row, src2 row) into the 8-pixel half of each dst quadword selected by dst's offset; h rows. */
   1.334 +void put_pixels8_l2_spu(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
   1.335 +                        int dst_stride, int src_stride1, int h)
   1.336 +{
   1.337 +  /* shuffle patterns over (old dst quadword, averaged row): 0x10-0x17 take the first 8 averaged bytes, 0x00-0x0F keep the old dst byte */
   1.338 +  const vuint8_t sel_low8  = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
   1.339 +  const vuint8_t sel_high8 = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
   1.340 +
   1.341 +  /* offsets of src1 and dst inside their quadwords, taken once before the loop */
   1.342 +  const int offs     = (unsigned int) src1 & 15;
   1.343 +  const int dst_offs = (unsigned int) dst & 15;
   1.344 +
   1.345 +  vuint8_t sel;
   1.346 +  int rows_left;
   1.347 +
   1.348 +  // original note: 8x dest luma blocks are aligned or misaligned by 8
   1.349 +  if (dst_offs != 0) {
   1.350 +    sel = sel_high8;
   1.351 +  } else {
   1.352 +    sel = sel_low8;
   1.353 +  }
   1.354 +
   1.355 +  for (rows_left = h; rows_left > 0; rows_left--) {
   1.356 +      /* unaligned read of src1: merge the quadwords read at src1 and src1+16 */
   1.357 +      const vuint8_t head  = *(vuint8_t *)(src1);
   1.358 +      const vuint8_t tail  = *(vuint8_t *)(src1 + 16);
   1.359 +      const vuint8_t first = spu_or(spu_slqwbyte(head, offs),
   1.360 +                                    spu_rlmaskqwbyte(tail, offs - 16));
   1.361 +
   1.362 +      /* average with the src2 row, read with a single quadword load */
   1.363 +      const vuint8_t second = *(vuint8_t *)(src2);
   1.364 +      const vuint8_t mean   = spu_avg(first, second);
   1.365 +
   1.366 +      /* merge the 8 new pixels into the quadword currently in dst and write it back */
   1.367 +      const vuint8_t old = *(vuint8_t *)dst;
   1.368 +      *(vuint8_t *)dst = spu_shuffle(old, mean, sel);
   1.369 +
   1.370 +      /* step every pointer to its next row */
   1.371 +      src1 += src_stride1;
   1.372 +      src2 += 16;
   1.373 +      dst  += dst_stride;
   1.374 +  }
   1.375 +}
   1.376 +
   1.377 +/* avg: blends spu_avg(src1 row, src2 row) with the 8-pixel half of each dst quadword selected by dst's offset; h rows. */
   1.378 +void avg_pixels8_l2_spu(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
   1.379 +                        int dst_stride, int src_stride1, int h)
   1.380 +{
   1.381 +  /* shuffle patterns over (old dst quadword, averaged row): 0x10-0x17 take the first 8 averaged bytes, 0x00-0x0F keep the old dst byte */
   1.382 +  const vuint8_t sel_low8  = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
   1.383 +  const vuint8_t sel_high8 = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
   1.384 +
   1.385 +  /* offsets of src1 and dst inside their quadwords, taken once before the loop */
   1.386 +  const int offs     = (unsigned int) src1 & 15;
   1.387 +  const int dst_offs = (unsigned int) dst & 15;
   1.388 +
   1.389 +  vuint8_t sel;
   1.390 +  int rows_left;
   1.391 +
   1.392 +  // original note: 8x dest luma blocks are aligned or misaligned by 8
   1.393 +  if (dst_offs != 0) {
   1.394 +    sel = sel_high8;
   1.395 +  } else {
   1.396 +    sel = sel_low8;
   1.397 +  }
   1.398 +
   1.399 +  for (rows_left = h; rows_left > 0; rows_left--) {
   1.400 +      /* unaligned read of src1: merge the quadwords read at src1 and src1+16 */
   1.401 +      const vuint8_t head  = *(vuint8_t *)(src1);
   1.402 +      const vuint8_t tail  = *(vuint8_t *)(src1 + 16);
   1.403 +      const vuint8_t first = spu_or(spu_slqwbyte(head, offs),
   1.404 +                                    spu_rlmaskqwbyte(tail, offs - 16));
   1.405 +
   1.406 +      /* average with the src2 row, read with a single quadword load */
   1.407 +      const vuint8_t second = *(vuint8_t *)(src2);
   1.408 +      const vuint8_t mean   = spu_avg(first, second);
   1.409 +
   1.410 +      /* place the 8 new pixels next to the old dst bytes, then average that mix with the old quadword */
   1.411 +      const vuint8_t old = *(vuint8_t *)dst;
   1.412 +      const vuint8_t mix = spu_shuffle(old, mean, sel);
   1.413 +      /* positions outside the block hold the old byte in both operands of this average */
   1.414 +      *(vuint8_t *)dst = spu_avg(old, mix);
   1.415 +
   1.416 +      /* step every pointer to its next row */
   1.417 +      src1 += src_stride1;
   1.418 +      src2 += 16;
   1.419 +      dst  += dst_stride;
   1.420 +  }
   1.421 +}
   1.422 +
   1.423 +// Writes rows of 8 pixels from src (any alignment) into dst, four rows per pass while i < h. Original note: assumes ((line_size % 16) == 0).
   1.424 +void put_pixels8_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
   1.425 +{
   1.426 +    /* shuffle patterns over (old dst quadword, realigned source row): 0x10-0x17 take the first 8 source bytes, 0x00-0x0F keep the old dst byte */
   1.427 +    const vuint8_t sel_low8  = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
   1.428 +    const vuint8_t sel_high8 = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
   1.429 +
   1.430 +    /* offsets of src and dst inside their quadwords, taken once before the loop */
   1.431 +    const int offs     = (unsigned int) src & 15;
   1.432 +    const int dst_offs = (unsigned int) dst & 15;
   1.433 +
   1.434 +    /* byte offsets of rows 1..3 and of the next four-row group, for source and destination */
   1.435 +    const int src_row1 = src_stride;
   1.436 +    const int src_row2 = src_stride << 1;
   1.437 +    const int src_row3 = src_row1 + src_row2;
   1.438 +    const int src_step = src_stride << 2;
   1.439 +    const int dst_row1 = dst_stride;
   1.440 +    const int dst_row2 = dst_stride << 1;
   1.441 +    const int dst_row3 = dst_row2 + dst_stride;
   1.442 +    const int dst_step = dst_stride << 2;
   1.443 +    vuint8_t sel;
   1.444 +    int done = 0;
   1.445 +
   1.446 +    // original note: 8x dest luma blocks are aligned or misaligned by 8
   1.447 +    if (dst_offs != 0) {
   1.448 +      sel = sel_high8;
   1.449 +    } else {
   1.450 +      sel = sel_low8;
   1.451 +    }
   1.452 +
   1.453 +    while (done < h) {
   1.454 +      /* read both quadwords of all four source rows before anything is stored */
   1.455 +      const vector unsigned char r0_head = *(vuint8_t *)(src);
   1.456 +      const vector unsigned char r0_tail = *(vuint8_t *)(src + 16);
   1.457 +      const vector unsigned char r1_head = *(vuint8_t *)(src + src_row1);
   1.458 +      const vector unsigned char r1_tail = *(vuint8_t *)(src + 16 + src_row1);
   1.459 +      const vector unsigned char r2_head = *(vuint8_t *)(src + src_row2);
   1.460 +      const vector unsigned char r2_tail = *(vuint8_t *)(src + 16 + src_row2);
   1.461 +      const vector unsigned char r3_head = *(vuint8_t *)(src + src_row3);
   1.462 +      const vector unsigned char r3_tail = *(vuint8_t *)(src + 16 + src_row3);
   1.463 +
   1.464 +      /* per row: fetch the quadword currently in dst and merge the 8 realigned source pixels into it */
   1.465 +      const vuint8_t old0 = *(vuint8_t *) dst;
   1.466 +      const vuint8_t new0 = spu_shuffle(old0, spu_or(spu_slqwbyte(r0_head, offs), spu_rlmaskqwbyte(r0_tail, offs - 16)), sel);
   1.467 +      const vuint8_t old1 = *(vuint8_t *)(dst + dst_row1);
   1.468 +      const vuint8_t new1 = spu_shuffle(old1, spu_or(spu_slqwbyte(r1_head, offs), spu_rlmaskqwbyte(r1_tail, offs - 16)), sel);
   1.469 +      const vuint8_t old2 = *(vuint8_t *)(dst + dst_row2);
   1.470 +      const vuint8_t new2 = spu_shuffle(old2, spu_or(spu_slqwbyte(r2_head, offs), spu_rlmaskqwbyte(r2_tail, offs - 16)), sel);
   1.471 +      const vuint8_t old3 = *(vuint8_t *)(dst + dst_row3);
   1.472 +      const vuint8_t new3 = spu_shuffle(old3, spu_or(spu_slqwbyte(r3_head, offs), spu_rlmaskqwbyte(r3_tail, offs - 16)), sel);
   1.473 +
   1.474 +      *(vuint8_t *) dst             = new0;
   1.475 +      *(vuint8_t *)(dst + dst_row1) = new1;
   1.476 +      *(vuint8_t *)(dst + dst_row2) = new2;
   1.477 +      *(vuint8_t *)(dst + dst_row3) = new3;
   1.478 +
   1.479 +      src  += src_step;
   1.480 +      dst  += dst_step;
   1.481 +      done += 4;
   1.482 +    }
   1.483 +}
   1.484 +
   1.485 +// Averages rows of 8 pixels from src (any alignment) into dst, four rows per pass while i < h. Original note: assumes ((line_size % 16) == 0).
   1.486 +void avg_pixels8_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
   1.487 +{
   1.488 +    /* shuffle patterns over (old dst quadword, realigned source row): 0x10-0x17 take the first 8 source bytes, 0x00-0x0F keep the old dst byte */
   1.489 +    const vuint8_t sel_low8  = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
   1.490 +    const vuint8_t sel_high8 = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
   1.491 +
   1.492 +    /* offsets of src and dst inside their quadwords, taken once before the loop */
   1.493 +    const int offs     = (unsigned int) src & 15;
   1.494 +    const int dst_offs = (unsigned int) dst & 15;
   1.495 +
   1.496 +    /* byte offsets of rows 1..3 and of the next four-row group, for source and destination */
   1.497 +    const int src_row1 = src_stride;
   1.498 +    const int src_row2 = src_stride << 1;
   1.499 +    const int src_row3 = src_row1 + src_row2;
   1.500 +    const int src_step = src_stride << 2;
   1.501 +    const int dst_row1 = dst_stride;
   1.502 +    const int dst_row2 = dst_stride << 1;
   1.503 +    const int dst_row3 = dst_row2 + dst_stride;
   1.504 +    const int dst_step = dst_stride << 2;
   1.505 +    vuint8_t sel;
   1.506 +    int done = 0;
   1.507 +
   1.508 +    // original note: 8x dest luma blocks are aligned or misaligned by 8
   1.509 +    if (dst_offs != 0) {
   1.510 +      sel = sel_high8;
   1.511 +    } else {
   1.512 +      sel = sel_low8;
   1.513 +    }
   1.514 +
   1.515 +    while (done < h) {
   1.516 +      /* read both quadwords of all four source rows before anything is stored */
   1.517 +      const vector unsigned char r0_head = *(vuint8_t *)(src);
   1.518 +      const vector unsigned char r0_tail = *(vuint8_t *)(src + 16);
   1.519 +      const vector unsigned char r1_head = *(vuint8_t *)(src + src_row1);
   1.520 +      const vector unsigned char r1_tail = *(vuint8_t *)(src + 16 + src_row1);
   1.521 +      const vector unsigned char r2_head = *(vuint8_t *)(src + src_row2);
   1.522 +      const vector unsigned char r2_tail = *(vuint8_t *)(src + 16 + src_row2);
   1.523 +      const vector unsigned char r3_head = *(vuint8_t *)(src + src_row3);
   1.524 +      const vector unsigned char r3_tail = *(vuint8_t *)(src + 16 + src_row3);
   1.525 +
   1.526 +      /* per row: old dst quadword, old quadword with the 8 source pixels merged in, and the average of the two */
   1.527 +      const vuint8_t old0 = *(vuint8_t *) dst;
   1.528 +      const vuint8_t mix0 = spu_shuffle(old0, spu_or(spu_slqwbyte(r0_head, offs), spu_rlmaskqwbyte(r0_tail, offs - 16)), sel);
   1.529 +      const vuint8_t new0 = spu_avg(old0, mix0);
   1.530 +
   1.531 +      const vuint8_t old1 = *(vuint8_t *)(dst + dst_row1);
   1.532 +      const vuint8_t mix1 = spu_shuffle(old1, spu_or(spu_slqwbyte(r1_head, offs), spu_rlmaskqwbyte(r1_tail, offs - 16)), sel);
   1.533 +      const vuint8_t new1 = spu_avg(old1, mix1);
   1.534 +
   1.535 +      const vuint8_t old2 = *(vuint8_t *)(dst + dst_row2);
   1.536 +      const vuint8_t mix2 = spu_shuffle(old2, spu_or(spu_slqwbyte(r2_head, offs), spu_rlmaskqwbyte(r2_tail, offs - 16)), sel);
   1.537 +      const vuint8_t new2 = spu_avg(old2, mix2);
   1.538 +
   1.539 +      const vuint8_t old3 = *(vuint8_t *)(dst + dst_row3);
   1.540 +      const vuint8_t mix3 = spu_shuffle(old3, spu_or(spu_slqwbyte(r3_head, offs), spu_rlmaskqwbyte(r3_tail, offs - 16)), sel);
   1.541 +      const vuint8_t new3 = spu_avg(old3, mix3);
   1.542 +
   1.543 +      *(vuint8_t *) dst             = new0;
   1.544 +      *(vuint8_t *)(dst + dst_row1) = new1;
   1.545 +      *(vuint8_t *)(dst + dst_row2) = new2;
   1.546 +      *(vuint8_t *)(dst + dst_row3) = new3;
   1.547 +
   1.548 +      src  += src_step;
   1.549 +      dst  += dst_step;
   1.550 +      done += 4;
   1.551 +    }
   1.552 +}
   1.553 +
   1.554 +/* put: writes spu_avg(src1 row, src2 row) into the 4-pixel slot of each dst quadword selected by dst's offset; h rows. */
   1.555 +void put_pixels4_l2_spu(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
   1.556 +                        int dst_stride, int src_stride1, int h)
   1.557 +{
   1.558 +  /* shuffle patterns over (old dst quadword, averaged row): 0x10-0x13 take the first 4 averaged bytes, 0x00-0x0F keep the old dst byte */
   1.559 +  const vuint8_t sel_at0  = {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
   1.560 +  const vuint8_t sel_at4  = {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
   1.561 +  const vuint8_t sel_at8  = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
   1.562 +  const vuint8_t sel_at12 = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
   1.563 +
   1.564 +  /* offsets of src1 and dst inside their quadwords, taken once before the loop */
   1.565 +  const int offs     = (unsigned int) src1 & 15;
   1.566 +  const int dst_offs = (unsigned int) dst & 15;
   1.567 +  /* the pattern stays all-zero when dst_offs is none of 0, 4, 8, 12 */
   1.568 +  vuint8_t sel = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
   1.569 +  int rows_left;
   1.570 +
   1.571 +  // original note: 4x dest luma blocks are misaligned by 0, 4, 8, or 12
   1.572 +  if (dst_offs == 0) {
   1.573 +    sel = sel_at0;
   1.574 +  } else if (dst_offs == 4) {
   1.575 +    sel = sel_at4;
   1.576 +  } else if (dst_offs == 8) {
   1.577 +    sel = sel_at8;
   1.578 +  } else if (dst_offs == 12) {
   1.579 +    sel = sel_at12;
   1.580 +  }
   1.581 +
   1.582 +  for (rows_left = h; rows_left > 0; rows_left--) {
   1.583 +      /* unaligned read of src1: merge the quadwords read at src1 and src1+16 */
   1.584 +      const vuint8_t head  = *(vuint8_t *)(src1);
   1.585 +      const vuint8_t tail  = *(vuint8_t *)(src1 + 16);
   1.586 +      const vuint8_t first = spu_or(spu_slqwbyte(head, offs),
   1.587 +                                    spu_rlmaskqwbyte(tail, offs - 16));
   1.588 +
   1.589 +      /* average with the src2 row, read with a single quadword load */
   1.590 +      const vuint8_t second = *(vuint8_t *)(src2);
   1.591 +      const vuint8_t mean   = spu_avg(first, second);
   1.592 +
   1.593 +      /* merge the 4 new pixels into the quadword currently in dst and write it back */
   1.594 +      const vuint8_t old = *(vuint8_t *)dst;
   1.595 +      *(vuint8_t *)dst = spu_shuffle(old, mean, sel);
   1.596 +
   1.597 +      /* step every pointer to its next row */
   1.598 +      src1 += src_stride1;
   1.599 +      src2 += 16;
   1.600 +      dst  += dst_stride;
   1.601 +  }
   1.602 +}
   1.603 +
   1.604 +/* avg: blends spu_avg(src1 row, src2 row) with the 4-pixel slot of each dst quadword selected by dst's offset; h rows. */
   1.605 +void avg_pixels4_l2_spu(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
   1.606 +                        int dst_stride, int src_stride1, int h)
   1.607 +{
   1.608 +  /* shuffle patterns over (old dst quadword, averaged row): 0x10-0x13 take the first 4 averaged bytes, 0x00-0x0F keep the old dst byte */
   1.609 +  const vuint8_t sel_at0  = {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
   1.610 +  const vuint8_t sel_at4  = {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
   1.611 +  const vuint8_t sel_at8  = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
   1.612 +  const vuint8_t sel_at12 = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
   1.613 +
   1.614 +  /* offsets of src1 and dst inside their quadwords, taken once before the loop */
   1.615 +  const int offs     = (unsigned int) src1 & 15;
   1.616 +  const int dst_offs = (unsigned int) dst & 15;
   1.617 +  /* the pattern stays all-zero when dst_offs is none of 0, 4, 8, 12 */
   1.618 +  vuint8_t sel = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
   1.619 +  int rows_left;
   1.620 +
   1.621 +  // original note: 4x dest luma blocks are misaligned by 0, 4, 8, or 12
   1.622 +  if (dst_offs == 0) {
   1.623 +    sel = sel_at0;
   1.624 +  } else if (dst_offs == 4) {
   1.625 +    sel = sel_at4;
   1.626 +  } else if (dst_offs == 8) {
   1.627 +    sel = sel_at8;
   1.628 +  } else if (dst_offs == 12) {
   1.629 +    sel = sel_at12;
   1.630 +  }
   1.631 +
   1.632 +  for (rows_left = h; rows_left > 0; rows_left--) {
   1.633 +      /* unaligned read of src1: merge the quadwords read at src1 and src1+16 */
   1.634 +      const vuint8_t head  = *(vuint8_t *)(src1);
   1.635 +      const vuint8_t tail  = *(vuint8_t *)(src1 + 16);
   1.636 +      const vuint8_t first = spu_or(spu_slqwbyte(head, offs),
   1.637 +                                    spu_rlmaskqwbyte(tail, offs - 16));
   1.638 +
   1.639 +      /* average with the src2 row, read with a single quadword load */
   1.640 +      const vuint8_t second = *(vuint8_t *)(src2);
   1.641 +      const vuint8_t mean   = spu_avg(first, second);
   1.642 +
   1.643 +      /* place the 4 new pixels next to the old dst bytes, then average that mix with the old quadword */
   1.644 +      const vuint8_t old = *(vuint8_t *)dst;
   1.645 +      const vuint8_t mix = spu_shuffle(old, mean, sel);
   1.646 +      /* positions outside the block hold the old byte in both operands of this average */
   1.647 +      *(vuint8_t *)dst = spu_avg(old, mix);
   1.648 +
   1.649 +      /* step every pointer to its next row */
   1.650 +      src1 += src_stride1;
   1.651 +      src2 += 16;
   1.652 +      dst  += dst_stride;
   1.653 +  }
   1.654 +}
   1.655 +
   1.656 +// next one assumes that ((line_size % 16) == 0)
   1.657 +void put_pixels4_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
   1.658 +{
   1.659 +    register vector unsigned char pixelsv1A, pixelsv2A;
   1.660 +    register vector unsigned char pixelsv1B, pixelsv2B;
   1.661 +    register vector unsigned char pixelsv1C, pixelsv2C;
   1.662 +    register vector unsigned char pixelsv1D, pixelsv2D;
   1.663 +
   1.664 +    const int perm = (unsigned int) src & 15;
   1.665 +    const int shift_dst = (unsigned int) dst & 15;
   1.666 +
   1.667 +    // 4x dest luma blocks are desaligned by 0, 4, 8, or 12
   1.668 +    vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
   1.669 +    const vuint8_t dstmask0=  {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
   1.670 +    const vuint8_t dstmask4=  {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
   1.671 +    const vuint8_t dstmask8=  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
   1.672 +    const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
   1.673 +
   1.674 +    switch(shift_dst){
   1.675 +      case 0:  dstmask = dstmask0;
   1.676 +               break;
   1.677 +      case 4:  dstmask = dstmask4;
   1.678 +               break;
   1.679 +      case 8:  dstmask = dstmask8;
   1.680 +               break;
   1.681 +      case 12: dstmask = dstmask12;
   1.682 +               break;
   1.683 +    }
   1.684 +
   1.685 +    int i;
   1.686 +	register int line_size   = src_stride;
   1.687 +    register int line_size_2 = line_size << 1;
   1.688 +    register int line_size_3 = line_size + line_size_2;
   1.689 +    register int line_size_4 = line_size << 2;
   1.690 +
   1.691 +	register int dst_stride_2 = dst_stride << 1;
   1.692 +    register int dst_stride_3 = dst_stride_2 + dst_stride;
   1.693 +    register int dst_stride_4 = dst_stride << 2;
   1.694 +
   1.695 +    for(i=0; i<h; i+=4) {
   1.696 +	  pixelsv1A = *(vuint8_t *)(src);
   1.697 +      pixelsv2A = *(vuint8_t *)(src+16);
   1.698 +      pixelsv1B = *(vuint8_t *)(src + line_size);
   1.699 +      pixelsv2B = *(vuint8_t *)(src+16 + line_size);
   1.700 +      pixelsv1C = *(vuint8_t *)(src + line_size_2);
   1.701 +      pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
   1.702 +      pixelsv1D = *(vuint8_t *)(src + line_size_3);
   1.703 +      pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
   1.704 +
   1.705 +      const vuint8_t block1 = *(vuint8_t *)dst;
   1.706 +      const vuint8_t put1 = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);
   1.707 +      const vuint8_t block2 = *(vuint8_t *)(dst+dst_stride);
   1.708 +      const vuint8_t put2 = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);
   1.709 +      const vuint8_t block3 = *(vuint8_t *)(dst+dst_stride_2);
   1.710 +      const vuint8_t put3 = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);
   1.711 +      const vuint8_t block4 = *(vuint8_t *)(dst+dst_stride_3);
   1.712 +      const vuint8_t put4 = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);
   1.713 +
   1.714 +      *(vuint8_t *) dst = put1;
   1.715 +      *(vuint8_t *)(dst + dst_stride) = put2;
   1.716 +      *(vuint8_t *)(dst + dst_stride_2) = put3;
   1.717 +      *(vuint8_t *)(dst + dst_stride_3) = put4;
   1.718 +
   1.719 +      src += line_size_4;
   1.720 +      dst += dst_stride_4;
   1.721 +    }
   1.722 +}
   1.723 +
   1.724 +// next one assumes that ((line_size % 16) == 0)
   1.725 +void avg_pixels4_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
   1.726 +{
   1.727 +    register vector unsigned char pixelsv1A, pixelsv2A;
   1.728 +    register vector unsigned char pixelsv1B, pixelsv2B;
   1.729 +    register vector unsigned char pixelsv1C, pixelsv2C;
   1.730 +    register vector unsigned char pixelsv1D, pixelsv2D;
   1.731 +
   1.732 +    const int perm = (unsigned int) src & 15;
   1.733 +    const int shift_dst = (unsigned int) dst & 15;
   1.734 +
   1.735 +    // 4x dest luma blocks are desaligned by 0, 4, 8, or 12
   1.736 +    vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
   1.737 +    const vuint8_t dstmask0=  {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
   1.738 +    const vuint8_t dstmask4=  {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
   1.739 +    const vuint8_t dstmask8=  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
   1.740 +    const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
   1.741 +
   1.742 +    switch(shift_dst){
   1.743 +      case 0:  dstmask = dstmask0;
   1.744 +               break;
   1.745 +      case 4:  dstmask = dstmask4;
   1.746 +               break;
   1.747 +      case 8:  dstmask = dstmask8;
   1.748 +               break;
   1.749 +      case 12: dstmask = dstmask12;
   1.750 +               break;
   1.751 +    }
   1.752 +
   1.753 +    int i;
   1.754 +	register int line_size   = src_stride;
   1.755 +    register int line_size_2 = line_size << 1;
   1.756 +    register int line_size_3 = line_size + line_size_2;
   1.757 +    register int line_size_4 = line_size << 2;
   1.758 +
   1.759 +	register int dst_stride_2 = dst_stride << 1;
   1.760 +    register int dst_stride_3 = dst_stride_2 + dst_stride;
   1.761 +    register int dst_stride_4 = dst_stride << 2;
   1.762 +
   1.763 +    for(i=0; i<h; i+=4) {
   1.764 +	  pixelsv1A = *(vuint8_t *)(src);
   1.765 +      pixelsv2A = *(vuint8_t *)(src+16);
   1.766 +      pixelsv1B = *(vuint8_t *)(src + line_size);
   1.767 +      pixelsv2B = *(vuint8_t *)(src+16 + line_size);
   1.768 +      pixelsv1C = *(vuint8_t *)(src + line_size_2);
   1.769 +      pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
   1.770 +      pixelsv1D = *(vuint8_t *)(src + line_size_3);
   1.771 +      pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
   1.772 +
   1.773 +      const vuint8_t block1 = *(vuint8_t *) dst;
   1.774 +      const vuint8_t put1a = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);
   1.775 +      const vuint8_t put1 = spu_avg(block1,put1a);
   1.776 +
   1.777 +      const vuint8_t block2 = *(vuint8_t *)(dst + dst_stride);
   1.778 +      const vuint8_t put2a = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);
   1.779 +      const vuint8_t put2 = spu_avg(block2,put2a);
   1.780 +
   1.781 +      const vuint8_t block3 = *(vuint8_t *)(dst + dst_stride_2);
   1.782 +      const vuint8_t put3a = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);
   1.783 +      const vuint8_t put3 = spu_avg(block3,put3a);
   1.784 +
   1.785 +      const vuint8_t block4 = *(vuint8_t *)(dst + dst_stride_3);
   1.786 +      const vuint8_t put4a = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);
   1.787 +      const vuint8_t put4 = spu_avg(block4,put4a);
   1.788 +
   1.789 +      *(vuint8_t *) dst = put1;
   1.790 +      *(vuint8_t *)(dst + dst_stride) = put2;
   1.791 +      *(vuint8_t *)(dst + dst_stride_2) = put3;
   1.792 +      *(vuint8_t *)(dst + dst_stride_3) = put4;
   1.793 +
   1.794 +      src+= line_size_4;
   1.795 +      dst+= dst_stride_4;
   1.796 +    }
   1.797 +}
   1.798 +
   1.799 +/* Here we create all the interpolation modes H.264 motion compensation stage for luma */
   1.800 +  H264_MC(put_, 16, spu)
   1.801 +  H264_MC(put_, 8, spu)
   1.802 +  H264_MC(put_, 4, spu)
   1.803 +
   1.804 +  H264_MC(avg_, 16, spu)
   1.805 +  H264_MC(avg_, 8, spu)
   1.806 +  H264_MC(avg_, 4, spu)
   1.807 +
   1.808 +
   1.809 +//Chroma interpolation:
   1.810 +
   1.811 +#define OP_U8_SPU                          PUT_OP_U8_SPU
   1.812 +#define PREFIX_h264_chroma_mc8_spu         put_h264_chroma_mc8_spu
   1.813 +#define PREFIX_h264_chroma_mc4_spu         put_h264_chroma_mc4_spu
   1.814 +#define PREFIX_h264_chroma_mc2_spu         put_h264_chroma_mc2_spu
   1.815 +#include "h264_chroma_template_spu.c"
   1.816 +#undef OP_U8_SPU
   1.817 +#undef PREFIX_h264_chroma_mc8_spu
   1.818 +#undef PREFIX_h264_chroma_mc4_spu
   1.819 +#undef PREFIX_h264_chroma_mc2_spu
   1.820 +
   1.821 +#define OP_U8_SPU                          AVG_OP_U8_SPU
   1.822 +#define PREFIX_h264_chroma_mc8_spu         avg_h264_chroma_mc8_spu
   1.823 +#define PREFIX_h264_chroma_mc4_spu         avg_h264_chroma_mc4_spu
   1.824 +#define PREFIX_h264_chroma_mc2_spu         avg_h264_chroma_mc2_spu
   1.825 +#include "h264_chroma_template_spu.c"
   1.826 +#undef OP_U8_SPU
   1.827 +#undef PREFIX_h264_chroma_mc8_spu
   1.828 +#undef PREFIX_h264_chroma_mc4_spu
   1.829 +#undef PREFIX_h264_chroma_mc2_spu
   1.830 +
   1.831 +// Weight and Biweight functions
   1.832 +
   1.833 +#define op_scale1(x)  dst[x] = av_clip_uint8( (dst[x]*weight + offset) >> log2_denom )
   1.834 +#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
   1.835 +#define H264_WEIGHT(W,H) \
   1.836 +static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
   1.837 +    int y; \
   1.838 +    offset <<= log2_denom; \
   1.839 +    if(log2_denom) offset += 1<<(log2_denom-1); \
   1.840 +    for(y=0; y<H; y++, dst += stride){ \
   1.841 +        op_scale1(0); \
   1.842 +        op_scale1(1); \
   1.843 +        if(W==2) continue; \
   1.844 +        op_scale1(2); \
   1.845 +        op_scale1(3); \
   1.846 +        if(W==4) continue; \
   1.847 +        op_scale1(4); \
   1.848 +        op_scale1(5); \
   1.849 +        op_scale1(6); \
   1.850 +        op_scale1(7); \
   1.851 +        if(W==8) continue; \
   1.852 +        op_scale1(8); \
   1.853 +        op_scale1(9); \
   1.854 +        op_scale1(10); \
   1.855 +        op_scale1(11); \
   1.856 +        op_scale1(12); \
   1.857 +        op_scale1(13); \
   1.858 +        op_scale1(14); \
   1.859 +        op_scale1(15); \
   1.860 +    } \
   1.861 +} \
   1.862 +static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int log2_denom, int weightd, int weights, int offset){ \
   1.863 +    int y; \
   1.864 +    offset = ((offset + 1) | 1) << log2_denom; \
   1.865 +    for(y=0; y<H; y++, dst += dst_stride, src += src_stride){ \
   1.866 +        op_scale2(0); \
   1.867 +        op_scale2(1); \
   1.868 +        if(W==2) continue; \
   1.869 +        op_scale2(2); \
   1.870 +        op_scale2(3); \
   1.871 +        if(W==4) continue; \
   1.872 +        op_scale2(4); \
   1.873 +        op_scale2(5); \
   1.874 +        op_scale2(6); \
   1.875 +        op_scale2(7); \
   1.876 +        if(W==8) continue; \
   1.877 +        op_scale2(8); \
   1.878 +        op_scale2(9); \
   1.879 +        op_scale2(10); \
   1.880 +        op_scale2(11); \
   1.881 +        op_scale2(12); \
   1.882 +        op_scale2(13); \
   1.883 +        op_scale2(14); \
   1.884 +        op_scale2(15); \
   1.885 +    } \
   1.886 +}
   1.887 +
   1.888 +H264_WEIGHT(16,16)
   1.889 +H264_WEIGHT(16,8)
   1.890 +H264_WEIGHT(8,16)
   1.891 +H264_WEIGHT(8,8)
   1.892 +H264_WEIGHT(8,4)
   1.893 +H264_WEIGHT(4,8)
   1.894 +H264_WEIGHT(4,4)
   1.895 +H264_WEIGHT(4,2)
   1.896 +H264_WEIGHT(2,4)
   1.897 +H264_WEIGHT(2,2)
   1.898 +
   1.899 +#undef op_scale1
   1.900 +#undef op_scale2
   1.901 +#undef H264_WEIGHT
   1.902 +
   1.903 +/////////////////////////////////////////////////////////////////////////////////////////
   1.904 +
   1.905 +static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
   1.906 +{
   1.907 +    int i, d;
   1.908 +    for( i = 0; i < 4; i++ ) {
   1.909 +        if( tc0[i] < 0 ) {
   1.910 +            pix += 4*ystride;
   1.911 +            continue;
   1.912 +        }
   1.913 +        for( d = 0; d < 4; d++ ) {
   1.914 +            const int p0 = pix[-1*xstride];
   1.915 +            const int p1 = pix[-2*xstride];
   1.916 +            const int p2 = pix[-3*xstride];
   1.917 +            const int q0 = pix[0];
   1.918 +            const int q1 = pix[1*xstride];
   1.919 +            const int q2 = pix[2*xstride];
   1.920 +
   1.921 +            if( FFABS( p0 - q0 ) < alpha &&
   1.922 +                FFABS( p1 - p0 ) < beta &&
   1.923 +                FFABS( q1 - q0 ) < beta ) {
   1.924 +
   1.925 +                int tc = tc0[i];
   1.926 +                int i_delta;
   1.927 +
   1.928 +                if( FFABS( p2 - p0 ) < beta ) {
   1.929 +                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
   1.930 +                    tc++;
   1.931 +                }
   1.932 +                if( FFABS( q2 - q0 ) < beta ) {
   1.933 +                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
   1.934 +                    tc++;
   1.935 +                }
   1.936 +
   1.937 +                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
   1.938 +                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
   1.939 +                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
   1.940 +            }
   1.941 +            pix += ystride;
   1.942 +        }
   1.943 +    }
   1.944 +}
   1.945 +static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
   1.946 +{
   1.947 +    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
   1.948 +}
   1.949 +static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
   1.950 +{
   1.951 +    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
   1.952 +}
   1.953 +
   1.954 +static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
   1.955 +{
   1.956 +    int d;
   1.957 +    for( d = 0; d < 16; d++ ) {
   1.958 +        const int p2 = pix[-3*xstride];
   1.959 +        const int p1 = pix[-2*xstride];
   1.960 +        const int p0 = pix[-1*xstride];
   1.961 +
   1.962 +        const int q0 = pix[ 0*xstride];
   1.963 +        const int q1 = pix[ 1*xstride];
   1.964 +        const int q2 = pix[ 2*xstride];
   1.965 +
   1.966 +        if( FFABS( p0 - q0 ) < alpha &&
   1.967 +            FFABS( p1 - p0 ) < beta &&
   1.968 +            FFABS( q1 - q0 ) < beta ) {
   1.969 +
   1.970 +            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
   1.971 +                if( FFABS( p2 - p0 ) < beta)
   1.972 +                {
   1.973 +                    const int p3 = pix[-4*xstride];
   1.974 +                    /* p0', p1', p2' */
   1.975 +                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
   1.976 +                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
   1.977 +                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
   1.978 +                } else {
   1.979 +                    /* p0' */
   1.980 +                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
   1.981 +                }
   1.982 +                if( FFABS( q2 - q0 ) < beta)
   1.983 +                {
   1.984 +                    const int q3 = pix[3*xstride];
   1.985 +                    /* q0', q1', q2' */
   1.986 +                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
   1.987 +                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
   1.988 +                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
   1.989 +                } else {
   1.990 +                    /* q0' */
   1.991 +                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
   1.992 +                }
   1.993 +            }else{
   1.994 +                /* p0', q0' */
   1.995 +                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
   1.996 +                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
   1.997 +            }
   1.998 +        }
   1.999 +        pix += ystride;
  1.1000 +    }
  1.1001 +}
  1.1002 +static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
  1.1003 +{
  1.1004 +    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
  1.1005 +}
  1.1006 +static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
  1.1007 +{
  1.1008 +    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
  1.1009 +}
  1.1010 +
  1.1011 +static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
  1.1012 +{
  1.1013 +    int i, d;
  1.1014 +    for( i = 0; i < 4; i++ ) {
  1.1015 +        const int tc = tc0[i];
  1.1016 +        if( tc <= 0 ) {
  1.1017 +            pix += 2*ystride;
  1.1018 +            continue;
  1.1019 +        }
  1.1020 +        for( d = 0; d < 2; d++ ) {
  1.1021 +            const int p0 = pix[-1*xstride];
  1.1022 +            const int p1 = pix[-2*xstride];
  1.1023 +            const int q0 = pix[0];
  1.1024 +            const int q1 = pix[1*xstride];
  1.1025 +
  1.1026 +            if( FFABS( p0 - q0 ) < alpha &&
  1.1027 +                FFABS( p1 - p0 ) < beta &&
  1.1028 +                FFABS( q1 - q0 ) < beta ) {
  1.1029 +
  1.1030 +                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
  1.1031 +
  1.1032 +                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
  1.1033 +                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
  1.1034 +            }
  1.1035 +            pix += ystride;
  1.1036 +        }
  1.1037 +    }
  1.1038 +}
  1.1039 +static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
  1.1040 +{
  1.1041 +    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
  1.1042 +}
  1.1043 +static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
  1.1044 +{
  1.1045 +    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
  1.1046 +}
  1.1047 +
  1.1048 +static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
  1.1049 +{
  1.1050 +    int d;
  1.1051 +    for( d = 0; d < 8; d++ ) {
  1.1052 +        const int p0 = pix[-1*xstride];
  1.1053 +        const int p1 = pix[-2*xstride];
  1.1054 +        const int q0 = pix[0];
  1.1055 +        const int q1 = pix[1*xstride];
  1.1056 +
  1.1057 +        if( FFABS( p0 - q0 ) < alpha &&
  1.1058 +            FFABS( p1 - p0 ) < beta &&
  1.1059 +            FFABS( q1 - q0 ) < beta ) {
  1.1060 +
  1.1061 +            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
  1.1062 +            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
  1.1063 +        }
  1.1064 +        pix += ystride;
  1.1065 +    }
  1.1066 +}
  1.1067 +static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
  1.1068 +{
  1.1069 +    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
  1.1070 +}
  1.1071 +static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
  1.1072 +{
  1.1073 +    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
  1.1074 +}
  1.1075 +
  1.1076 +
  1.1077 +void dsputil_h264_init_cell(DSPContext_spu* c) {
  1.1078 +
  1.1079 +	c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
  1.1080 +    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
  1.1081 +    c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
  1.1082 +    c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
  1.1083 +    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
  1.1084 +    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
  1.1085 +    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
  1.1086 +    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
  1.1087 +
  1.1088 +    c->h264_idct_add[0] = h264_idct8_add_spu;
  1.1089 +    c->h264_idct_add[1] = h264_idct4_add_spu;
  1.1090 +
  1.1091 +
  1.1092 +    c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_spu;
  1.1093 +    c->put_h264_chroma_pixels_tab[1] = put_h264_chroma_mc4_spu;
  1.1094 +    c->put_h264_chroma_pixels_tab[2] = put_h264_chroma_mc2_spu;
  1.1095 +    c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_spu;
  1.1096 +    c->avg_h264_chroma_pixels_tab[1] = avg_h264_chroma_mc4_spu;
  1.1097 +    c->avg_h264_chroma_pixels_tab[2] = avg_h264_chroma_mc2_spu;
  1.1098 +
  1.1099 +    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
  1.1100 +    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
  1.1101 +    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
  1.1102 +    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
  1.1103 +    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
  1.1104 +    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
  1.1105 +    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
  1.1106 +    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
  1.1107 +    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
  1.1108 +    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
  1.1109 +    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
  1.1110 +    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
  1.1111 +    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
  1.1112 +    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
  1.1113 +    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
  1.1114 +    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
  1.1115 +    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
  1.1116 +    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
  1.1117 +    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
  1.1118 +    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
  1.1119 +
  1.1120 +
  1.1121 +#define dspfunc(PFX, IDX, NUM) \
  1.1122 +    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_spu; \
  1.1123 +    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_spu; \
  1.1124 +    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_spu; \
  1.1125 +    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_spu; \
  1.1126 +    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_spu; \
  1.1127 +    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_spu; \
  1.1128 +    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_spu; \
  1.1129 +    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_spu; \
  1.1130 +    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_spu; \
  1.1131 +    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_spu; \
  1.1132 +    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_spu; \
  1.1133 +    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_spu; \
  1.1134 +    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_spu; \
  1.1135 +    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_spu; \
  1.1136 +    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_spu; \
  1.1137 +    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_spu
  1.1138 +
  1.1139 +    dspfunc(put_h264_qpel, 0, 16);
  1.1140 +    dspfunc(put_h264_qpel, 1, 8);
  1.1141 +    dspfunc(put_h264_qpel, 2, 4);
  1.1142 +
  1.1143 +    dspfunc(avg_h264_qpel, 0, 16);
  1.1144 +    dspfunc(avg_h264_qpel, 1, 8);
  1.1145 +    dspfunc(avg_h264_qpel, 2, 4);
  1.1146 +
  1.1147 +#undef dspfunc
  1.1148 +
  1.1149 +
  1.1150 +}