Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSs__H264__App
diff libavcodec/cell/dsputil_spu.c @ 2:897f711a7157
rearrange to work with autoconf
| author | Nina Engelhardt <nengel@mailbox.tu-berlin.de> |
|---|---|
| date | Tue, 25 Sep 2012 15:55:33 +0200 |
| parents | |
| children | |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/libavcodec/cell/dsputil_spu.c Tue Sep 25 15:55:33 2012 +0200 1.3 @@ -0,0 +1,1147 @@ 1.4 +/* 1.5 + * Copyright (c) 2009 TUDelft 1.6 + * 1.7 + * Cell Parallel SPU - 2DWave Macroblock Decoding. 1.8 + */ 1.9 + 1.10 +/** 1.11 + * @file libavcodec/cell/dsputil_spu.c 1.12 + * Cell Parallel SPU - 2DWave Macroblock Decoding 1.13 + * @author C C Chi <c.c.chi@student.tudelft.nl> 1.14 + * 1.15 + * SIMD SPU kernels 1.16 + * H.264/AVC motion compensation 1.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu> 1.18 + * @author Albert Paradis <apar7632@hotmail.com> 1.19 + */ 1.20 + 1.21 + 1.22 +#include "dsputil_spu.h" 1.23 +#include "h264_idct_spu.h" 1.24 +#include "h264_deblock_spu.h" 1.25 +#include "types_spu.h" 1.26 +#include "libavutil/intreadwrite.h" 1.27 + 1.28 +#include <stdio.h> 1.29 +#include <spu_intrinsics.h> 1.30 +#include <spu_mfcio.h> 1.31 +#include <assert.h> 1.32 + 1.33 +//Luma interpolation 1.34 +#define PUT_OP_U8_SPU(d, s, dst) (void) dst; d = s 1.35 +#define AVG_OP_U8_SPU(d, s, dst) d = spu_avg(dst, s) 1.36 + 1.37 +#define OP_U8_SPU PUT_OP_U8_SPU 1.38 +#define PREFIX_h264_qpel16_h_lowpass_spu put_h264_qpel16_h_lowpass_spu 1.39 +#define PREFIX_h264_qpel16_v_lowpass_spu put_h264_qpel16_v_lowpass_spu 1.40 +#define PREFIX_h264_qpel16_hv_lowpass_spu put_h264_qpel16_hv_lowpass_spu 1.41 +#define PREFIX_h264_qpel8_h_lowpass_spu put_h264_qpel8_h_lowpass_spu 1.42 +#define PREFIX_h264_qpel8_v_lowpass_spu put_h264_qpel8_v_lowpass_spu 1.43 +#define PREFIX_h264_qpel8_hv_lowpass_spu put_h264_qpel8_hv_lowpass_spu 1.44 +#define PREFIX_h264_qpel4_h_lowpass_spu put_h264_qpel4_h_lowpass_spu 1.45 +#define PREFIX_h264_qpel4_v_lowpass_spu put_h264_qpel4_v_lowpass_spu 1.46 +#define PREFIX_h264_qpel4_hv_lowpass_spu put_h264_qpel4_hv_lowpass_spu 1.47 +#include "h264_luma_template_spu.c" 1.48 +#undef OP_U8_SPU 1.49 +#undef PREFIX_h264_qpel16_h_lowpass_spu 1.50 +#undef PREFIX_h264_qpel16_v_lowpass_spu 1.51 +#undef 
PREFIX_h264_qpel16_hv_lowpass_spu 1.52 +#undef PREFIX_h264_qpel8_h_lowpass_spu 1.53 +#undef PREFIX_h264_qpel8_v_lowpass_spu 1.54 +#undef PREFIX_h264_qpel8_hv_lowpass_spu 1.55 +#undef PREFIX_h264_qpel4_h_lowpass_spu 1.56 +#undef PREFIX_h264_qpel4_v_lowpass_spu 1.57 +#undef PREFIX_h264_qpel4_hv_lowpass_spu 1.58 + 1.59 +#define OP_U8_SPU AVG_OP_U8_SPU 1.60 +#define PREFIX_h264_qpel16_h_lowpass_spu avg_h264_qpel16_h_lowpass_spu 1.61 +#define PREFIX_h264_qpel16_v_lowpass_spu avg_h264_qpel16_v_lowpass_spu 1.62 +#define PREFIX_h264_qpel16_hv_lowpass_spu avg_h264_qpel16_hv_lowpass_spu 1.63 +#define PREFIX_h264_qpel8_h_lowpass_spu avg_h264_qpel8_h_lowpass_spu 1.64 +#define PREFIX_h264_qpel8_v_lowpass_spu avg_h264_qpel8_v_lowpass_spu 1.65 +#define PREFIX_h264_qpel8_hv_lowpass_spu avg_h264_qpel8_hv_lowpass_spu 1.66 +#define PREFIX_h264_qpel4_h_lowpass_spu avg_h264_qpel4_h_lowpass_spu 1.67 +#define PREFIX_h264_qpel4_v_lowpass_spu avg_h264_qpel4_v_lowpass_spu 1.68 +#define PREFIX_h264_qpel4_hv_lowpass_spu avg_h264_qpel4_hv_lowpass_spu 1.69 +#include "h264_luma_template_spu.c" 1.70 +#undef OP_U8_SPU 1.71 +#undef PREFIX_h264_qpel16_h_lowpass_spu 1.72 +#undef PREFIX_h264_qpel16_v_lowpass_spu 1.73 +#undef PREFIX_h264_qpel16_hv_lowpass_spu 1.74 +#undef PREFIX_h264_qpel8_h_lowpass_spu 1.75 +#undef PREFIX_h264_qpel8_v_lowpass_spu 1.76 +#undef PREFIX_h264_qpel8_hv_lowpass_spu 1.77 +#undef PREFIX_h264_qpel4_h_lowpass_spu 1.78 +#undef PREFIX_h264_qpel4_v_lowpass_spu 1.79 +#undef PREFIX_h264_qpel4_hv_lowpass_spu 1.80 + 1.81 +#define H264_MC(OPNAME, SIZE, CODETYPE) \ 1.82 +static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 1.83 + OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, dst_stride, STRIDE_Y, h);\ 1.84 +}\ 1.85 +\ 1.86 +static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){ \ 1.87 + DECLARE_ALIGNED_16(uint8_t, half[16*16]);\ 1.88 + put_h264_qpel ## SIZE ## 
_h_lowpass_ ## CODETYPE(half, src, 16, h);\ 1.89 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\ 1.90 +}\ 1.91 +\ 1.92 +static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 1.93 + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\ 1.94 +}\ 1.95 +\ 1.96 +static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 1.97 + DECLARE_ALIGNED_16(uint8_t, half[16*16]);\ 1.98 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\ 1.99 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, dst_stride, STRIDE_Y, h);\ 1.100 +}\ 1.101 +\ 1.102 +static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 1.103 + DECLARE_ALIGNED_16(uint8_t, half[16*16]);\ 1.104 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\ 1.105 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\ 1.106 +}\ 1.107 +\ 1.108 +static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 1.109 + OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\ 1.110 +}\ 1.111 +\ 1.112 +static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 1.113 + DECLARE_ALIGNED_16(uint8_t, half[16*16]);\ 1.114 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\ 1.115 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+STRIDE_Y, half, dst_stride, STRIDE_Y, h);\ 1.116 +}\ 1.117 +\ 1.118 +static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 1.119 + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ 1.120 + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ 1.121 + put_h264_qpel ## SIZE ## _h_lowpass_ ## 
CODETYPE(halfH, src, 16, h);\ 1.122 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\ 1.123 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\ 1.124 +}\ 1.125 +\ 1.126 +static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 1.127 + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ 1.128 + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ 1.129 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\ 1.130 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\ 1.131 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\ 1.132 +}\ 1.133 +\ 1.134 +static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 1.135 + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ 1.136 + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ 1.137 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\ 1.138 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\ 1.139 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\ 1.140 +}\ 1.141 +\ 1.142 +static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 1.143 + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ 1.144 + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ 1.145 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\ 1.146 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\ 1.147 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\ 1.148 +}\ 1.149 +\ 1.150 +static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 1.151 + DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ 1.152 + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, dst_stride, 16, 
h);\ 1.153 +}\ 1.154 +\ 1.155 +static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 1.156 + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ 1.157 + DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\ 1.158 + DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ 1.159 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\ 1.160 + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\ 1.161 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\ 1.162 +}\ 1.163 +\ 1.164 +static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 1.165 + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ 1.166 + DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\ 1.167 + DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ 1.168 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\ 1.169 + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\ 1.170 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\ 1.171 +}\ 1.172 +\ 1.173 +static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 1.174 + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ 1.175 + DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\ 1.176 + DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ 1.177 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\ 1.178 + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\ 1.179 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\ 1.180 +}\ 1.181 +\ 1.182 +static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 1.183 + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ 1.184 + DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\ 1.185 + DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ 
1.186 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\ 1.187 + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\ 1.188 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\ 1.189 +}\ 1.190 + 1.191 + 1.192 +/**************************/ 1.193 +/* put pixels functions */ 1.194 +/*************************/ 1.195 +/* For each of h rows, store into the quadword at dst the rounded byte-wise average (spu_avg) of 16 bytes read from src1 at any byte alignment and the quadword at src2; after each row src1 advances by src_stride1, src2 by 16 and dst by dst_stride. */ 1.196 +static void put_pixels16_l2_spu( uint8_t * dst, const uint8_t * src1, 1.197 + const uint8_t * src2, int dst_stride, 1.198 + int src_stride1, int h) 1.199 +{ 1.200 + int i; 1.201 + 1.202 + const int perm_src1 = (unsigned int) src1 & 15; 1.203 + 1.204 + for (i=0; i<h; i++){ 1.205 + // unaligned load of src1: shift the two quadwords that span src1 by its byte offset and OR them together 1.206 + const vuint8_t srctmpa1 = *(vuint8_t *)(src1); 1.207 + const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16); 1.208 + const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16)); 1.209 + 1.210 + // aligned load of src2 (callers in this file pass a 16-byte-aligned scratch buffer with a row stride of 16) 1.211 + const vuint8_t srcb = *(vuint8_t *)(src2); 1.212 + 1.213 + // rounded average of the two sources 1.214 + const vuint8_t avgc = spu_avg(srca,srcb); 1.215 + 1.216 + // 16x16 dest luma blocks are always aligned, so the whole quadword is overwritten 1.217 + *(vuint8_t *)dst=avgc; 1.218 + 1.219 + src1 +=src_stride1; 1.220 + src2 +=16; 1.221 + dst +=dst_stride; 1.222 + } 1.223 +} 1.224 +/* Same inputs and row stepping as put_pixels16_l2_spu, but the src1/src2 average is averaged once more with the quadword already at dst before it is stored. */ 1.225 +static void avg_pixels16_l2_spu( uint8_t * dst, const uint8_t * src1, 1.226 + const uint8_t * src2, int dst_stride, 1.227 + int src_stride1, int h) 1.228 +{ 1.229 + int i; 1.230 + 1.231 + const int perm_src1 = (unsigned int) src1 & 15; 1.232 + 1.233 + for (i=0; i<h; i++){ 1.234 + //unaligned load of src1 1.235 + const vuint8_t srctmpa1 = *(vuint8_t *)(src1); 1.236 + const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16); 1.237 + const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16)); 1.238 + 1.239 + //aligned load of src2 1.240 + const vuint8_t srcb = *(vuint8_t *)(src2); 1.241 + 1.242 + //average and rounding 1.243 + 
const vuint8_t avgc = spu_avg(spu_avg(srca,srcb), *(vuint8_t *)dst); 1.244 + 1.245 + // 16x16 dest luma blocks are always aligned 1.246 + *(vuint8_t *)dst=avgc; 1.247 + 1.248 + src1 +=src_stride1; 1.249 + src2 +=16; 1.250 + dst +=dst_stride; 1.251 + } 1.252 +} 1.253 + 1.254 +// next one assumes that ((line_size % 16) == 0) 1.255 +void put_pixels16_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h) 1.256 +{ 1.257 + register vector unsigned char pixelsv1, pixelsv2; 1.258 + register vector unsigned char pixelsv1B, pixelsv2B; 1.259 + register vector unsigned char pixelsv1C, pixelsv2C; 1.260 + register vector unsigned char pixelsv1D, pixelsv2D; 1.261 + 1.262 + const int perm = (unsigned int) src & 15; 1.263 + int i; 1.264 + register int line_size = src_stride; 1.265 + register int line_size_2 = line_size << 1; 1.266 + register int line_size_3 = line_size + line_size_2; 1.267 + register int line_size_4 = line_size << 2; 1.268 + 1.269 + register int dst_stride_2 = dst_stride << 1; 1.270 + register int dst_stride_3 = dst_stride_2 + dst_stride; 1.271 + register int dst_stride_4 = dst_stride << 2; 1.272 + 1.273 + for(i=0; i<h; i+=4) { 1.274 + pixelsv1 = *(vuint8_t *)(src); 1.275 + pixelsv2 = *(vuint8_t *)(src+16); 1.276 + pixelsv1B = *(vuint8_t *)(src + line_size); 1.277 + pixelsv2B = *(vuint8_t *)(src+16 + line_size); 1.278 + pixelsv1C = *(vuint8_t *)(src + line_size_2); 1.279 + pixelsv2C = *(vuint8_t *)(src+16 + line_size_2); 1.280 + pixelsv1D = *(vuint8_t *)(src + line_size_3); 1.281 + pixelsv2D = *(vuint8_t *)(src+16 + line_size_3); 1.282 + 1.283 + *(vuint8_t *) dst = spu_or(spu_slqwbyte(pixelsv1, perm), spu_rlmaskqwbyte(pixelsv2, perm-16)); 1.284 + *(vuint8_t *)(dst + dst_stride) = spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)); 1.285 + *(vuint8_t *)(dst + dst_stride_2) = spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)); 1.286 + *(vuint8_t *)(dst + dst_stride_3) = 
spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)); 1.287 + 1.288 + src+= line_size_4; 1.289 + dst+= dst_stride_4; 1.290 + } 1.291 +} 1.292 + 1.293 +// next one assumes that ((line_size % 16) == 0) 1.294 +void avg_pixels16_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h) 1.295 +{ 1.296 + register vector unsigned char pixelsv1, pixelsv2; 1.297 + register vector unsigned char pixelsv1B, pixelsv2B; 1.298 + register vector unsigned char pixelsv1C, pixelsv2C; 1.299 + register vector unsigned char pixelsv1D, pixelsv2D; 1.300 + 1.301 + const int perm = (unsigned int) src & 15; 1.302 + int i; 1.303 + register int line_size = src_stride; 1.304 + register int line_size_2 = line_size << 1; 1.305 + register int line_size_3 = line_size + line_size_2; 1.306 + register int line_size_4 = line_size << 2; 1.307 + 1.308 + register int dst_stride_2 = dst_stride << 1; 1.309 + register int dst_stride_3 = dst_stride_2 + dst_stride; 1.310 + register int dst_stride_4 = dst_stride << 2; 1.311 + 1.312 + 1.313 + for(i=0; i<h; i+=4) { 1.314 + pixelsv1 = *(vuint8_t *)(src); 1.315 + pixelsv2 = *(vuint8_t *)(src+16); 1.316 + pixelsv1B = *(vuint8_t *)(src + line_size); 1.317 + pixelsv2B = *(vuint8_t *)(src+16 + line_size); 1.318 + pixelsv1C = *(vuint8_t *)(src + line_size_2); 1.319 + pixelsv2C = *(vuint8_t *)(src+16 + line_size_2); 1.320 + pixelsv1D = *(vuint8_t *)(src + line_size_3); 1.321 + pixelsv2D = *(vuint8_t *)(src+16 + line_size_3); 1.322 + 1.323 + *(vuint8_t *)dst = spu_avg(spu_or(spu_slqwbyte(pixelsv1, perm), spu_rlmaskqwbyte(pixelsv2, perm-16)), *(vuint8_t *)dst); 1.324 + *(vuint8_t *)(dst + dst_stride) = spu_avg(spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), *(vuint8_t *)(dst+dst_stride)); 1.325 + *(vuint8_t *)(dst + dst_stride_2) = spu_avg(spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), *(vuint8_t *)(dst+dst_stride_2)); 1.326 + *(vuint8_t *)(dst + dst_stride_3) = 
spu_avg(spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), *(vuint8_t *)(dst+dst_stride_3)); 1.327 + 1.328 + src+= line_size_4; 1.329 + dst+= dst_stride_4; 1.330 + } 1.331 +} 1.332 +/* For each of h rows, average (spu_avg, rounded) 16 bytes read from src1 at any byte alignment with the quadword at src2, then merge the first 8 averaged bytes into the destination quadword: into bytes 0-7 when dst is 16-byte aligned, otherwise into bytes 8-15. The other 8 destination bytes are written back unchanged. Per row src1 advances by src_stride1, src2 by 16 and dst by dst_stride. */ 1.333 +void put_pixels8_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2, 1.334 + int dst_stride, int src_stride1, int h) 1.335 +{ 1.336 + int i; 1.337 + 1.338 + const int perm_src1 = (unsigned int) src1 & 15; 1.339 + const int shift_dst = (unsigned int) dst & 15; 1.340 + 1.341 + // 8x dest luma blocks are aligned or misaligned by 8; in the shuffle patterns 0x00-0x0F pick bytes of the old dst quadword and 0x10-0x17 pick the first 8 bytes of the new average 1.342 + vuint8_t dstmask; 1.343 + const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 1.344 + const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; 1.345 + 1.346 + if(shift_dst==0){ 1.347 + dstmask = dst8mask1; 1.348 + } 1.349 + else{ 1.350 + dstmask = dst8mask2; 1.351 + } 1.352 + 1.353 + for (i=0; i<h; i++){ 1.354 + // unaligned load of src1: shift the two quadwords that span src1 by its byte offset and OR them together 1.355 + const vuint8_t srctmpa1 = *(vuint8_t *)(src1); 1.356 + const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16); 1.357 + const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16)); 1.358 + 1.359 + //aligned load of src2 1.360 + const vuint8_t srcb = *(vuint8_t *)(src2); 1.361 + 1.362 + //average and rounding 1.363 + const vuint8_t avgc = spu_avg(srca,srcb); 1.364 + 1.365 + const vuint8_t dst1 = *(vuint8_t *)dst; 1.366 + // keep the 8 destination bytes outside the block, replace the 8 bytes inside it 1.367 + const vuint8_t davgc = spu_shuffle(dst1, avgc, dstmask); 1.368 + 1.369 + *(vuint8_t *)dst=davgc; 1.370 + 1.371 + src1 +=src_stride1; 1.372 + src2 +=16; 1.373 + dst +=dst_stride; 1.374 + } 1.375 +} 1.376 +/* Like put_pixels8_l2_spu, but the merged quadword is averaged with the original destination quadword before the store, so the 8 block bytes become avg(dst, avg(src1, src2)) and the other 8 bytes keep their value. */ 1.377 +void avg_pixels8_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2, 1.378 + int dst_stride, int src_stride1, int h) 1.379 +{ 1.380 + int i; 1.381 + 1.382 + const int perm_src1 = (unsigned int) src1 & 15; 1.383 + const int shift_dst = (unsigned int) dst & 15; 1.384 + 1.385 + // 
8x dest luma blocks are aligned or desaligned by 8 1.386 + vuint8_t dstmask; 1.387 + const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 1.388 + const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; 1.389 + 1.390 + if(shift_dst==0){ 1.391 + dstmask = dst8mask1; 1.392 + } 1.393 + else{ 1.394 + dstmask = dst8mask2; 1.395 + } 1.396 + 1.397 + for (i=0; i<h; i++){ 1.398 + //unaligned load of src1 1.399 + const vuint8_t srctmpa1 = *(vuint8_t *)(src1); 1.400 + const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16); 1.401 + const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16)); 1.402 + 1.403 + //aligned load of src2 1.404 + const vuint8_t srcb = *(vuint8_t *)(src2); 1.405 + 1.406 + //average and rounding 1.407 + const vuint8_t avgc = spu_avg(srca,srcb); 1.408 + 1.409 + const vuint8_t dst1 = *(vuint8_t *)dst; 1.410 + 1.411 + const vuint8_t davgc1 = spu_shuffle(dst1, avgc, dstmask); 1.412 + 1.413 + const vuint8_t davgc = spu_avg(dst1,davgc1); 1.414 + 1.415 + *(vuint8_t *)dst=davgc; 1.416 + 1.417 + src1 +=src_stride1; 1.418 + src2 +=16; 1.419 + dst +=dst_stride; 1.420 + } 1.421 +} 1.422 + 1.423 +// next one assumes that ((line_size % 16) == 0) 1.424 +void put_pixels8_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h) 1.425 +{ 1.426 + register vector unsigned char pixelsv1A, pixelsv2A; 1.427 + register vector unsigned char pixelsv1B, pixelsv2B; 1.428 + register vector unsigned char pixelsv1C, pixelsv2C; 1.429 + register vector unsigned char pixelsv1D, pixelsv2D; 1.430 + 1.431 + const int perm = (unsigned int) src & 15; 1.432 + const int shift_dst = (unsigned int) dst & 15; 1.433 + 1.434 + // 8x dest luma blocks are aligned or desaligned by 8 1.435 + vuint8_t dstmask; 1.436 + const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 1.437 + const 
vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; 1.438 + 1.439 + if(shift_dst==0){ 1.440 + dstmask = dst8mask1; 1.441 + } 1.442 + else{ 1.443 + dstmask = dst8mask2; 1.444 + } 1.445 + 1.446 + int i; 1.447 + register int line_size = src_stride; 1.448 + register int line_size_2 = line_size << 1; 1.449 + register int line_size_3 = line_size + line_size_2; 1.450 + register int line_size_4 = line_size << 2; 1.451 + 1.452 + register int dst_stride_2 = dst_stride << 1; 1.453 + register int dst_stride_3 = dst_stride_2 + dst_stride; 1.454 + register int dst_stride_4 = dst_stride << 2; 1.455 + 1.456 + for(i=0; i<h; i+=4) { 1.457 + pixelsv1A = *(vuint8_t *)(src); 1.458 + pixelsv2A = *(vuint8_t *)(src+16); 1.459 + pixelsv1B = *(vuint8_t *)(src + line_size); 1.460 + pixelsv2B = *(vuint8_t *)(src+16 + line_size); 1.461 + pixelsv1C = *(vuint8_t *)(src + line_size_2); 1.462 + pixelsv2C = *(vuint8_t *)(src+16 + line_size_2); 1.463 + pixelsv1D = *(vuint8_t *)(src + line_size_3); 1.464 + pixelsv2D = *(vuint8_t *)(src+16 + line_size_3); 1.465 + 1.466 + const vuint8_t block1 = *(vuint8_t *)dst; 1.467 + const vuint8_t put1 = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask); 1.468 + const vuint8_t block2 = *(vuint8_t *)(dst+dst_stride); 1.469 + const vuint8_t put2 = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask); 1.470 + const vuint8_t block3 = *(vuint8_t *)(dst+2*dst_stride); 1.471 + const vuint8_t put3 = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask); 1.472 + const vuint8_t block4 = *(vuint8_t *)(dst+3*dst_stride); 1.473 + const vuint8_t put4 = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask); 1.474 + 1.475 + *(vuint8_t *) dst = put1; 1.476 + *(vuint8_t *)(dst + dst_stride) = put2; 1.477 + *(vuint8_t 
*)(dst + dst_stride_2) = put3; 1.478 + *(vuint8_t *)(dst + dst_stride_3) = put4; 1.479 + 1.480 + src += line_size_4; 1.481 + dst += dst_stride_4; 1.482 + } 1.483 +} 1.484 + 1.485 +// next one assumes that ((line_size % 16) == 0) 1.486 +void avg_pixels8_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h) 1.487 +{ 1.488 + register vector unsigned char pixelsv1A, pixelsv2A; 1.489 + register vector unsigned char pixelsv1B, pixelsv2B; 1.490 + register vector unsigned char pixelsv1C, pixelsv2C; 1.491 + register vector unsigned char pixelsv1D, pixelsv2D; 1.492 + 1.493 + const int perm = (unsigned int) src & 15; 1.494 + const int shift_dst = (unsigned int) dst & 15; 1.495 + 1.496 + // 8x dest luma blocks are aligned or desaligned by 8 1.497 + vuint8_t dstmask; 1.498 + const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 1.499 + const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; 1.500 + 1.501 + if(shift_dst==0){ 1.502 + dstmask = dst8mask1; 1.503 + } 1.504 + else{ 1.505 + dstmask = dst8mask2; 1.506 + } 1.507 + 1.508 + int i; 1.509 + register int line_size = src_stride; 1.510 + register int line_size_2 = line_size << 1; 1.511 + register int line_size_3 = line_size + line_size_2; 1.512 + register int line_size_4 = line_size << 2; 1.513 + 1.514 + register int dst_stride_2 = dst_stride << 1; 1.515 + register int dst_stride_3 = dst_stride_2 + dst_stride; 1.516 + register int dst_stride_4 = dst_stride << 2; 1.517 + 1.518 + for(i=0; i<h; i+=4) { 1.519 + pixelsv1A = *(vuint8_t *)(src); 1.520 + pixelsv2A = *(vuint8_t *)(src+16); 1.521 + pixelsv1B = *(vuint8_t *)(src + line_size); 1.522 + pixelsv2B = *(vuint8_t *)(src+16 + line_size); 1.523 + pixelsv1C = *(vuint8_t *)(src + line_size_2); 1.524 + pixelsv2C = *(vuint8_t *)(src+16 + line_size_2); 1.525 + pixelsv1D = *(vuint8_t *)(src + line_size_3); 1.526 + pixelsv2D = *(vuint8_t *)(src+16 + 
line_size_3); 1.527 + 1.528 + const vuint8_t block1 = *(vuint8_t *) dst; 1.529 + const vuint8_t put1a = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask); 1.530 + const vuint8_t put1 = spu_avg(block1,put1a); 1.531 + 1.532 + const vuint8_t block2 = *(vuint8_t *)(dst + dst_stride); 1.533 + const vuint8_t put2a = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask); 1.534 + const vuint8_t put2 = spu_avg(block2,put2a); 1.535 + 1.536 + const vuint8_t block3 = *(vuint8_t *)(dst + dst_stride_2); 1.537 + const vuint8_t put3a = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask); 1.538 + const vuint8_t put3 = spu_avg(block3,put3a); 1.539 + 1.540 + const vuint8_t block4 = *(vuint8_t *)(dst + dst_stride_3); 1.541 + const vuint8_t put4a = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask); 1.542 + const vuint8_t put4 = spu_avg(block4,put4a); 1.543 + 1.544 + *(vuint8_t *) dst = put1; 1.545 + *(vuint8_t *)(dst + dst_stride) = put2; 1.546 + *(vuint8_t *)(dst + dst_stride_2) = put3; 1.547 + *(vuint8_t *)(dst + dst_stride_3) = put4; 1.548 + 1.549 + src+= line_size_4; 1.550 + dst+= dst_stride_4; 1.551 + } 1.552 +} 1.553 + 1.554 +void put_pixels4_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2, 1.555 + int dst_stride, int src_stride1, int h) 1.556 +{ 1.557 + int i; 1.558 + 1.559 + const int perm_src1 = (unsigned int) src1 & 15; 1.560 + const int shift_dst = (unsigned int) dst & 15; 1.561 + 1.562 + // 4x dest luma blocks are desaligned by 0, 4, 8, or 12 1.563 + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 1.564 + const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 1.565 + const vuint8_t dstmask4= 
{0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 1.566 + const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; 1.567 + const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; 1.568 + 1.569 + switch(shift_dst){ 1.570 + case 0: dstmask = dstmask0; 1.571 + break; 1.572 + case 4: dstmask = dstmask4; 1.573 + break; 1.574 + case 8: dstmask = dstmask8; 1.575 + break; 1.576 + case 12: dstmask = dstmask12; 1.577 + break; 1.578 + } 1.579 + 1.580 + for (i=0; i<h; i++){ 1.581 + //unaligned load of src1 1.582 + const vuint8_t srctmpa1 = *(vuint8_t *)(src1); 1.583 + const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16); 1.584 + const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16)); 1.585 + 1.586 + //aligned load of src2 1.587 + const vuint8_t srcb = *(vuint8_t *)(src2); 1.588 + 1.589 + //average and rounding 1.590 + const vuint8_t avgc = spu_avg(srca,srcb); 1.591 + 1.592 + const vuint8_t dst1 = *(vuint8_t *)dst; 1.593 + 1.594 + const vuint8_t davgc = spu_shuffle(dst1, avgc, dstmask); 1.595 + 1.596 + *(vuint8_t *)dst=davgc; 1.597 + 1.598 + src1 +=src_stride1; 1.599 + src2 +=16; 1.600 + dst +=dst_stride; 1.601 + } 1.602 +} 1.603 + 1.604 +void avg_pixels4_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2, 1.605 + int dst_stride, int src_stride1, int h) 1.606 +{ 1.607 + int i; 1.608 + 1.609 + const int perm_src1 = (unsigned int) src1 & 15; 1.610 + const int shift_dst = (unsigned int) dst & 15; 1.611 + 1.612 + // 4x dest luma blocks are desaligned by 0, 4, 8, or 12 1.613 + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 1.614 + const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 1.615 + const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 1.616 + const 
vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; 1.617 + const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; 1.618 + 1.619 + switch(shift_dst){ 1.620 + case 0: dstmask = dstmask0; 1.621 + break; 1.622 + case 4: dstmask = dstmask4; 1.623 + break; 1.624 + case 8: dstmask = dstmask8; 1.625 + break; 1.626 + case 12: dstmask = dstmask12; 1.627 + break; 1.628 + } 1.629 + 1.630 + for (i=0; i<h; i++){ 1.631 + //unaligned load of src1 1.632 + const vuint8_t srctmpa1 = *(vuint8_t *)(src1); 1.633 + const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16); 1.634 + const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16)); 1.635 + 1.636 + //aligned load of src2 1.637 + const vuint8_t srcb = *(vuint8_t *)(src2); 1.638 + 1.639 + //average and rounding 1.640 + const vuint8_t avgc = spu_avg(srca,srcb); 1.641 + 1.642 + const vuint8_t dst1 = *(vuint8_t *)dst; 1.643 + 1.644 + const vuint8_t davgc1 = spu_shuffle(dst1, avgc, dstmask); 1.645 + 1.646 + const vuint8_t davgc = spu_avg(dst1,davgc1); 1.647 + 1.648 + *(vuint8_t *)dst=davgc; 1.649 + 1.650 + src1 +=src_stride1; 1.651 + src2 +=16; 1.652 + dst +=dst_stride; 1.653 + } 1.654 +} 1.655 + 1.656 +// next one assumes that ((line_size % 16) == 0) 1.657 +void put_pixels4_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h) 1.658 +{ 1.659 + register vector unsigned char pixelsv1A, pixelsv2A; 1.660 + register vector unsigned char pixelsv1B, pixelsv2B; 1.661 + register vector unsigned char pixelsv1C, pixelsv2C; 1.662 + register vector unsigned char pixelsv1D, pixelsv2D; 1.663 + 1.664 + const int perm = (unsigned int) src & 15; 1.665 + const int shift_dst = (unsigned int) dst & 15; 1.666 + 1.667 + // 4x dest luma blocks are desaligned by 0, 4, 8, or 12 1.668 + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 1.669 + const vuint8_t dstmask0= 
{0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 1.670 + const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 1.671 + const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; 1.672 + const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; 1.673 + 1.674 + switch(shift_dst){ 1.675 + case 0: dstmask = dstmask0; 1.676 + break; 1.677 + case 4: dstmask = dstmask4; 1.678 + break; 1.679 + case 8: dstmask = dstmask8; 1.680 + break; 1.681 + case 12: dstmask = dstmask12; 1.682 + break; 1.683 + } 1.684 + 1.685 + int i; 1.686 + register int line_size = src_stride; 1.687 + register int line_size_2 = line_size << 1; 1.688 + register int line_size_3 = line_size + line_size_2; 1.689 + register int line_size_4 = line_size << 2; 1.690 + 1.691 + register int dst_stride_2 = dst_stride << 1; 1.692 + register int dst_stride_3 = dst_stride_2 + dst_stride; 1.693 + register int dst_stride_4 = dst_stride << 2; 1.694 + 1.695 + for(i=0; i<h; i+=4) { 1.696 + pixelsv1A = *(vuint8_t *)(src); 1.697 + pixelsv2A = *(vuint8_t *)(src+16); 1.698 + pixelsv1B = *(vuint8_t *)(src + line_size); 1.699 + pixelsv2B = *(vuint8_t *)(src+16 + line_size); 1.700 + pixelsv1C = *(vuint8_t *)(src + line_size_2); 1.701 + pixelsv2C = *(vuint8_t *)(src+16 + line_size_2); 1.702 + pixelsv1D = *(vuint8_t *)(src + line_size_3); 1.703 + pixelsv2D = *(vuint8_t *)(src+16 + line_size_3); 1.704 + 1.705 + const vuint8_t block1 = *(vuint8_t *)dst; 1.706 + const vuint8_t put1 = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask); 1.707 + const vuint8_t block2 = *(vuint8_t *)(dst+dst_stride); 1.708 + const vuint8_t put2 = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask); 1.709 + const vuint8_t block3 = *(vuint8_t *)(dst+dst_stride_2); 
1.710 + const vuint8_t put3 = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask); 1.711 + const vuint8_t block4 = *(vuint8_t *)(dst+dst_stride_3); 1.712 + const vuint8_t put4 = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask); 1.713 + 1.714 + *(vuint8_t *) dst = put1; 1.715 + *(vuint8_t *)(dst + dst_stride) = put2; 1.716 + *(vuint8_t *)(dst + dst_stride_2) = put3; 1.717 + *(vuint8_t *)(dst + dst_stride_3) = put4; 1.718 + 1.719 + src += line_size_4; 1.720 + dst += dst_stride_4; 1.721 + } 1.722 +} 1.723 + 1.724 +// next one assumes that ((line_size % 16) == 0) 1.725 +void avg_pixels4_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h) 1.726 +{ 1.727 + register vector unsigned char pixelsv1A, pixelsv2A; 1.728 + register vector unsigned char pixelsv1B, pixelsv2B; 1.729 + register vector unsigned char pixelsv1C, pixelsv2C; 1.730 + register vector unsigned char pixelsv1D, pixelsv2D; 1.731 + 1.732 + const int perm = (unsigned int) src & 15; 1.733 + const int shift_dst = (unsigned int) dst & 15; 1.734 + 1.735 + // 4x dest luma blocks are desaligned by 0, 4, 8, or 12 1.736 + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 1.737 + const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 1.738 + const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 1.739 + const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; 1.740 + const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; 1.741 + 1.742 + switch(shift_dst){ 1.743 + case 0: dstmask = dstmask0; 1.744 + break; 1.745 + case 4: dstmask = dstmask4; 1.746 + break; 1.747 + case 8: dstmask = dstmask8; 1.748 + break; 1.749 + case 12: dstmask = dstmask12; 1.750 + break; 
1.751 + } 1.752 + 1.753 + int i; 1.754 + register int line_size = src_stride; 1.755 + register int line_size_2 = line_size << 1; 1.756 + register int line_size_3 = line_size + line_size_2; 1.757 + register int line_size_4 = line_size << 2; 1.758 + 1.759 + register int dst_stride_2 = dst_stride << 1; 1.760 + register int dst_stride_3 = dst_stride_2 + dst_stride; 1.761 + register int dst_stride_4 = dst_stride << 2; 1.762 + 1.763 + for(i=0; i<h; i+=4) { 1.764 + pixelsv1A = *(vuint8_t *)(src); 1.765 + pixelsv2A = *(vuint8_t *)(src+16); 1.766 + pixelsv1B = *(vuint8_t *)(src + line_size); 1.767 + pixelsv2B = *(vuint8_t *)(src+16 + line_size); 1.768 + pixelsv1C = *(vuint8_t *)(src + line_size_2); 1.769 + pixelsv2C = *(vuint8_t *)(src+16 + line_size_2); 1.770 + pixelsv1D = *(vuint8_t *)(src + line_size_3); 1.771 + pixelsv2D = *(vuint8_t *)(src+16 + line_size_3); 1.772 + 1.773 + const vuint8_t block1 = *(vuint8_t *) dst; 1.774 + const vuint8_t put1a = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask); 1.775 + const vuint8_t put1 = spu_avg(block1,put1a); 1.776 + 1.777 + const vuint8_t block2 = *(vuint8_t *)(dst + dst_stride); 1.778 + const vuint8_t put2a = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask); 1.779 + const vuint8_t put2 = spu_avg(block2,put2a); 1.780 + 1.781 + const vuint8_t block3 = *(vuint8_t *)(dst + dst_stride_2); 1.782 + const vuint8_t put3a = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask); 1.783 + const vuint8_t put3 = spu_avg(block3,put3a); 1.784 + 1.785 + const vuint8_t block4 = *(vuint8_t *)(dst + dst_stride_3); 1.786 + const vuint8_t put4a = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask); 1.787 + const vuint8_t put4 = spu_avg(block4,put4a); 1.788 + 1.789 + *(vuint8_t *) dst = put1; 1.790 + *(vuint8_t *)(dst + dst_stride) = 
put2; 1.791 + *(vuint8_t *)(dst + dst_stride_2) = put3; 1.792 + *(vuint8_t *)(dst + dst_stride_3) = put4; 1.793 + 1.794 + src+= line_size_4; 1.795 + dst+= dst_stride_4; 1.796 + } 1.797 +} 1.798 + 1.799 +/* Here we create all the interpolation modes H.264 motion compensation stage for luma */ 1.800 + H264_MC(put_, 16, spu) 1.801 + H264_MC(put_, 8, spu) 1.802 + H264_MC(put_, 4, spu) 1.803 + 1.804 + H264_MC(avg_, 16, spu) 1.805 + H264_MC(avg_, 8, spu) 1.806 + H264_MC(avg_, 4, spu) 1.807 + 1.808 + 1.809 +//Chroma interpolation: 1.810 + 1.811 +#define OP_U8_SPU PUT_OP_U8_SPU 1.812 +#define PREFIX_h264_chroma_mc8_spu put_h264_chroma_mc8_spu 1.813 +#define PREFIX_h264_chroma_mc4_spu put_h264_chroma_mc4_spu 1.814 +#define PREFIX_h264_chroma_mc2_spu put_h264_chroma_mc2_spu 1.815 +#include "h264_chroma_template_spu.c" 1.816 +#undef OP_U8_SPU 1.817 +#undef PREFIX_h264_chroma_mc8_spu 1.818 +#undef PREFIX_h264_chroma_mc4_spu 1.819 +#undef PREFIX_h264_chroma_mc2_spu 1.820 + 1.821 +#define OP_U8_SPU AVG_OP_U8_SPU 1.822 +#define PREFIX_h264_chroma_mc8_spu avg_h264_chroma_mc8_spu 1.823 +#define PREFIX_h264_chroma_mc4_spu avg_h264_chroma_mc4_spu 1.824 +#define PREFIX_h264_chroma_mc2_spu avg_h264_chroma_mc2_spu 1.825 +#include "h264_chroma_template_spu.c" 1.826 +#undef OP_U8_SPU 1.827 +#undef PREFIX_h264_chroma_mc8_spu 1.828 +#undef PREFIX_h264_chroma_mc4_spu 1.829 +#undef PREFIX_h264_chroma_mc2_spu 1.830 + 1.831 +// Weight and Biweight functions 1.832 + 1.833 +#define op_scale1(x) dst[x] = av_clip_uint8( (dst[x]*weight + offset) >> log2_denom ) 1.834 +#define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) 1.835 +#define H264_WEIGHT(W,H) \ 1.836 +static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \ 1.837 + int y; \ 1.838 + offset <<= log2_denom; \ 1.839 + if(log2_denom) offset += 1<<(log2_denom-1); \ 1.840 + for(y=0; y<H; y++, dst += stride){ \ 1.841 + op_scale1(0); 
\ 1.842 + op_scale1(1); \ 1.843 + if(W==2) continue; \ 1.844 + op_scale1(2); \ 1.845 + op_scale1(3); \ 1.846 + if(W==4) continue; \ 1.847 + op_scale1(4); \ 1.848 + op_scale1(5); \ 1.849 + op_scale1(6); \ 1.850 + op_scale1(7); \ 1.851 + if(W==8) continue; \ 1.852 + op_scale1(8); \ 1.853 + op_scale1(9); \ 1.854 + op_scale1(10); \ 1.855 + op_scale1(11); \ 1.856 + op_scale1(12); \ 1.857 + op_scale1(13); \ 1.858 + op_scale1(14); \ 1.859 + op_scale1(15); \ 1.860 + } \ 1.861 +} \ 1.862 +static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int log2_denom, int weightd, int weights, int offset){ \ 1.863 + int y; \ 1.864 + offset = ((offset + 1) | 1) << log2_denom; \ 1.865 + for(y=0; y<H; y++, dst += dst_stride, src += src_stride){ \ 1.866 + op_scale2(0); \ 1.867 + op_scale2(1); \ 1.868 + if(W==2) continue; \ 1.869 + op_scale2(2); \ 1.870 + op_scale2(3); \ 1.871 + if(W==4) continue; \ 1.872 + op_scale2(4); \ 1.873 + op_scale2(5); \ 1.874 + op_scale2(6); \ 1.875 + op_scale2(7); \ 1.876 + if(W==8) continue; \ 1.877 + op_scale2(8); \ 1.878 + op_scale2(9); \ 1.879 + op_scale2(10); \ 1.880 + op_scale2(11); \ 1.881 + op_scale2(12); \ 1.882 + op_scale2(13); \ 1.883 + op_scale2(14); \ 1.884 + op_scale2(15); \ 1.885 + } \ 1.886 +} 1.887 + 1.888 +H264_WEIGHT(16,16) 1.889 +H264_WEIGHT(16,8) 1.890 +H264_WEIGHT(8,16) 1.891 +H264_WEIGHT(8,8) 1.892 +H264_WEIGHT(8,4) 1.893 +H264_WEIGHT(4,8) 1.894 +H264_WEIGHT(4,4) 1.895 +H264_WEIGHT(4,2) 1.896 +H264_WEIGHT(2,4) 1.897 +H264_WEIGHT(2,2) 1.898 + 1.899 +#undef op_scale1 1.900 +#undef op_scale2 1.901 +#undef H264_WEIGHT 1.902 + 1.903 +///////////////////////////////////////////////////////////////////////////////////////// 1.904 + 1.905 +static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0) 1.906 +{ 1.907 + int i, d; 1.908 + for( i = 0; i < 4; i++ ) { 1.909 + if( tc0[i] < 0 ) { 1.910 + pix += 4*ystride; 1.911 + continue; 
1.912 + } 1.913 + for( d = 0; d < 4; d++ ) { 1.914 + const int p0 = pix[-1*xstride]; 1.915 + const int p1 = pix[-2*xstride]; 1.916 + const int p2 = pix[-3*xstride]; 1.917 + const int q0 = pix[0]; 1.918 + const int q1 = pix[1*xstride]; 1.919 + const int q2 = pix[2*xstride]; 1.920 + 1.921 + if( FFABS( p0 - q0 ) < alpha && 1.922 + FFABS( p1 - p0 ) < beta && 1.923 + FFABS( q1 - q0 ) < beta ) { 1.924 + 1.925 + int tc = tc0[i]; 1.926 + int i_delta; 1.927 + 1.928 + if( FFABS( p2 - p0 ) < beta ) { 1.929 + pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] ); 1.930 + tc++; 1.931 + } 1.932 + if( FFABS( q2 - q0 ) < beta ) { 1.933 + pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] ); 1.934 + tc++; 1.935 + } 1.936 + 1.937 + i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); 1.938 + pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */ 1.939 + pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */ 1.940 + } 1.941 + pix += ystride; 1.942 + } 1.943 + } 1.944 +} 1.945 +static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 1.946 +{ 1.947 + h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0); 1.948 +} 1.949 +static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 1.950 +{ 1.951 + h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0); 1.952 +} 1.953 + 1.954 +static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) 1.955 +{ 1.956 + int d; 1.957 + for( d = 0; d < 16; d++ ) { 1.958 + const int p2 = pix[-3*xstride]; 1.959 + const int p1 = pix[-2*xstride]; 1.960 + const int p0 = pix[-1*xstride]; 1.961 + 1.962 + const int q0 = pix[ 0*xstride]; 1.963 + const int q1 = pix[ 1*xstride]; 1.964 + const int q2 = pix[ 2*xstride]; 1.965 + 1.966 + if( FFABS( p0 - q0 ) < alpha && 1.967 + FFABS( p1 - p0 ) < beta && 1.968 + FFABS( q1 - q0 ) < beta ) { 
1.969 + 1.970 + if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){ 1.971 + if( FFABS( p2 - p0 ) < beta) 1.972 + { 1.973 + const int p3 = pix[-4*xstride]; 1.974 + /* p0', p1', p2' */ 1.975 + pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; 1.976 + pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2; 1.977 + pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; 1.978 + } else { 1.979 + /* p0' */ 1.980 + pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; 1.981 + } 1.982 + if( FFABS( q2 - q0 ) < beta) 1.983 + { 1.984 + const int q3 = pix[3*xstride]; 1.985 + /* q0', q1', q2' */ 1.986 + pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; 1.987 + pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; 1.988 + pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; 1.989 + } else { 1.990 + /* q0' */ 1.991 + pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; 1.992 + } 1.993 + }else{ 1.994 + /* p0', q0' */ 1.995 + pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; 1.996 + pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; 1.997 + } 1.998 + } 1.999 + pix += ystride; 1.1000 + } 1.1001 +} 1.1002 +static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) 1.1003 +{ 1.1004 + h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta); 1.1005 +} 1.1006 +static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) 1.1007 +{ 1.1008 + h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta); 1.1009 +} 1.1010 + 1.1011 +static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0) 1.1012 +{ 1.1013 + int i, d; 1.1014 + for( i = 0; i < 4; i++ ) { 1.1015 + const int tc = tc0[i]; 1.1016 + if( tc <= 0 ) { 1.1017 + pix += 2*ystride; 1.1018 + continue; 1.1019 + } 1.1020 + for( d = 0; d < 2; d++ ) { 1.1021 + const int p0 = pix[-1*xstride]; 1.1022 + const int p1 = pix[-2*xstride]; 1.1023 + const int q0 = pix[0]; 1.1024 + const int q1 = pix[1*xstride]; 1.1025 + 1.1026 + if( FFABS( 
p0 - q0 ) < alpha && 1.1027 + FFABS( p1 - p0 ) < beta && 1.1028 + FFABS( q1 - q0 ) < beta ) { 1.1029 + 1.1030 + int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); 1.1031 + 1.1032 + pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */ 1.1033 + pix[0] = av_clip_uint8( q0 - delta ); /* q0' */ 1.1034 + } 1.1035 + pix += ystride; 1.1036 + } 1.1037 + } 1.1038 +} 1.1039 +static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 1.1040 +{ 1.1041 + h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0); 1.1042 +} 1.1043 +static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 1.1044 +{ 1.1045 + h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0); 1.1046 +} 1.1047 + 1.1048 +static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) 1.1049 +{ 1.1050 + int d; 1.1051 + for( d = 0; d < 8; d++ ) { 1.1052 + const int p0 = pix[-1*xstride]; 1.1053 + const int p1 = pix[-2*xstride]; 1.1054 + const int q0 = pix[0]; 1.1055 + const int q1 = pix[1*xstride]; 1.1056 + 1.1057 + if( FFABS( p0 - q0 ) < alpha && 1.1058 + FFABS( p1 - p0 ) < beta && 1.1059 + FFABS( q1 - q0 ) < beta ) { 1.1060 + 1.1061 + pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */ 1.1062 + pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */ 1.1063 + } 1.1064 + pix += ystride; 1.1065 + } 1.1066 +} 1.1067 +static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) 1.1068 +{ 1.1069 + h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta); 1.1070 +} 1.1071 +static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) 1.1072 +{ 1.1073 + h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta); 1.1074 +} 1.1075 + 1.1076 + 1.1077 +void dsputil_h264_init_cell(DSPContext_spu* c) { 1.1078 + 1.1079 + c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c; 1.1080 + 
c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c; 1.1081 + c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c; 1.1082 + c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c; 1.1083 + c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c; 1.1084 + c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c; 1.1085 + c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c; 1.1086 + c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c; 1.1087 + 1.1088 + c->h264_idct_add[0] = h264_idct8_add_spu; 1.1089 + c->h264_idct_add[1] = h264_idct4_add_spu; 1.1090 + 1.1091 + 1.1092 + c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_spu; 1.1093 + c->put_h264_chroma_pixels_tab[1] = put_h264_chroma_mc4_spu; 1.1094 + c->put_h264_chroma_pixels_tab[2] = put_h264_chroma_mc2_spu; 1.1095 + c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_spu; 1.1096 + c->avg_h264_chroma_pixels_tab[1] = avg_h264_chroma_mc4_spu; 1.1097 + c->avg_h264_chroma_pixels_tab[2] = avg_h264_chroma_mc2_spu; 1.1098 + 1.1099 + c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; 1.1100 + c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; 1.1101 + c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c; 1.1102 + c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c; 1.1103 + c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c; 1.1104 + c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c; 1.1105 + c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c; 1.1106 + c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c; 1.1107 + c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c; 1.1108 + c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c; 1.1109 + c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c; 1.1110 + c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c; 1.1111 + c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c; 1.1112 + c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c; 1.1113 + 
c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c; 1.1114 + c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c; 1.1115 + c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c; 1.1116 + c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c; 1.1117 + c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c; 1.1118 + c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c; 1.1119 + 1.1120 + 1.1121 +#define dspfunc(PFX, IDX, NUM) \ 1.1122 + c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_spu; \ 1.1123 + c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_spu; \ 1.1124 + c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_spu; \ 1.1125 + c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_spu; \ 1.1126 + c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_spu; \ 1.1127 + c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_spu; \ 1.1128 + c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_spu; \ 1.1129 + c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_spu; \ 1.1130 + c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_spu; \ 1.1131 + c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_spu; \ 1.1132 + c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_spu; \ 1.1133 + c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_spu; \ 1.1134 + c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_spu; \ 1.1135 + c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_spu; \ 1.1136 + c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_spu; \ 1.1137 + c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_spu 1.1138 + 1.1139 + dspfunc(put_h264_qpel, 0, 16); 1.1140 + dspfunc(put_h264_qpel, 1, 8); 1.1141 + dspfunc(put_h264_qpel, 2, 4); 1.1142 + 1.1143 + dspfunc(avg_h264_qpel, 0, 16); 1.1144 + dspfunc(avg_h264_qpel, 1, 8); 1.1145 + dspfunc(avg_h264_qpel, 2, 4); 1.1146 + 1.1147 +#undef dspfunc 1.1148 + 1.1149 + 1.1150 +}
