Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSs__H264__App
view libavcodec/cell/dsputil_spu.c @ 9:ea1ba68cf0ed
update to match api changes + add sscc produced source
| author | Nina Engelhardt <nengel@mailbox.tu-berlin.de> |
|---|---|
| date | Wed, 05 Jun 2013 14:43:26 +0200 |
| parents | |
| children |
line source
1 /*
2 * Copyright (c) 2009 TUDelft
3 *
4 * Cell Parallel SPU - 2DWave Macroblock Decoding.
5 */
7 /**
8 * @file libavcodec/cell/dsputil_spu.c
9 * Cell Parallel SPU - 2DWave Macroblock Decoding
10 * @author C C Chi <c.c.chi@student.tudelft.nl>
11 *
12 * SIMD SPU kernels
13 * H.264/AVC motion compensation
14 * @author Mauricio Alvarez <alvarez@ac.upc.edu>
15 * @author Albert Paradis <apar7632@hotmail.com>
16 */
19 #include "dsputil_spu.h"
20 #include "h264_idct_spu.h"
21 #include "h264_deblock_spu.h"
22 #include "types_spu.h"
23 #include "libavutil/intreadwrite.h"
25 #include <stdio.h>
26 #include <spu_intrinsics.h>
27 #include <spu_mfcio.h>
28 #include <assert.h>
30 //Luma interpolation
/*
 * Store operators selected through OP_U8_SPU before each template include.
 *   PUT_OP_U8_SPU(d, s, dst): evaluates dst only to discard it, then assigns
 *                             s to d. Note that it expands to two statements.
 *   AVG_OP_U8_SPU(d, s, dst): assigns spu_avg(dst, s) to d.
 */
31 #define PUT_OP_U8_SPU(d, s, dst) (void) dst; d = s
32 #define AVG_OP_U8_SPU(d, s, dst) d = spu_avg(dst, s)
/*
 * First inclusion of the luma template: OP_U8_SPU is the plain store operator
 * and every PREFIX_* name maps to its put_ counterpart. Presumably the
 * template defines the functions named by these PREFIX_* macros (the template
 * itself is not part of this file). All helper macros are undefined again
 * afterwards so the template can be included a second time.
 */
34 #define OP_U8_SPU PUT_OP_U8_SPU
35 #define PREFIX_h264_qpel16_h_lowpass_spu put_h264_qpel16_h_lowpass_spu
36 #define PREFIX_h264_qpel16_v_lowpass_spu put_h264_qpel16_v_lowpass_spu
37 #define PREFIX_h264_qpel16_hv_lowpass_spu put_h264_qpel16_hv_lowpass_spu
38 #define PREFIX_h264_qpel8_h_lowpass_spu put_h264_qpel8_h_lowpass_spu
39 #define PREFIX_h264_qpel8_v_lowpass_spu put_h264_qpel8_v_lowpass_spu
40 #define PREFIX_h264_qpel8_hv_lowpass_spu put_h264_qpel8_hv_lowpass_spu
41 #define PREFIX_h264_qpel4_h_lowpass_spu put_h264_qpel4_h_lowpass_spu
42 #define PREFIX_h264_qpel4_v_lowpass_spu put_h264_qpel4_v_lowpass_spu
43 #define PREFIX_h264_qpel4_hv_lowpass_spu put_h264_qpel4_hv_lowpass_spu
44 #include "h264_luma_template_spu.c"
45 #undef OP_U8_SPU
46 #undef PREFIX_h264_qpel16_h_lowpass_spu
47 #undef PREFIX_h264_qpel16_v_lowpass_spu
48 #undef PREFIX_h264_qpel16_hv_lowpass_spu
49 #undef PREFIX_h264_qpel8_h_lowpass_spu
50 #undef PREFIX_h264_qpel8_v_lowpass_spu
51 #undef PREFIX_h264_qpel8_hv_lowpass_spu
52 #undef PREFIX_h264_qpel4_h_lowpass_spu
53 #undef PREFIX_h264_qpel4_v_lowpass_spu
54 #undef PREFIX_h264_qpel4_hv_lowpass_spu
/*
 * Second inclusion of the luma template: OP_U8_SPU is the averaging store
 * operator and every PREFIX_* name maps to its avg_ counterpart. Presumably
 * the template defines the functions named by these PREFIX_* macros (the
 * template itself is not part of this file). The helper macros are undefined
 * again once the include is done.
 */
56 #define OP_U8_SPU AVG_OP_U8_SPU
57 #define PREFIX_h264_qpel16_h_lowpass_spu avg_h264_qpel16_h_lowpass_spu
58 #define PREFIX_h264_qpel16_v_lowpass_spu avg_h264_qpel16_v_lowpass_spu
59 #define PREFIX_h264_qpel16_hv_lowpass_spu avg_h264_qpel16_hv_lowpass_spu
60 #define PREFIX_h264_qpel8_h_lowpass_spu avg_h264_qpel8_h_lowpass_spu
61 #define PREFIX_h264_qpel8_v_lowpass_spu avg_h264_qpel8_v_lowpass_spu
62 #define PREFIX_h264_qpel8_hv_lowpass_spu avg_h264_qpel8_hv_lowpass_spu
63 #define PREFIX_h264_qpel4_h_lowpass_spu avg_h264_qpel4_h_lowpass_spu
64 #define PREFIX_h264_qpel4_v_lowpass_spu avg_h264_qpel4_v_lowpass_spu
65 #define PREFIX_h264_qpel4_hv_lowpass_spu avg_h264_qpel4_hv_lowpass_spu
66 #include "h264_luma_template_spu.c"
67 #undef OP_U8_SPU
68 #undef PREFIX_h264_qpel16_h_lowpass_spu
69 #undef PREFIX_h264_qpel16_v_lowpass_spu
70 #undef PREFIX_h264_qpel16_hv_lowpass_spu
71 #undef PREFIX_h264_qpel8_h_lowpass_spu
72 #undef PREFIX_h264_qpel8_v_lowpass_spu
73 #undef PREFIX_h264_qpel8_hv_lowpass_spu
74 #undef PREFIX_h264_qpel4_h_lowpass_spu
75 #undef PREFIX_h264_qpel4_v_lowpass_spu
76 #undef PREFIX_h264_qpel4_hv_lowpass_spu
/*
 * H264_MC(OPNAME, SIZE, CODETYPE) expands to sixteen static functions named
 *   OPNAME h264_qpel SIZE _mcXY_ CODETYPE      (X, Y in 0..3)
 * each taking (dst, src, dst_stride, h). The source row pitch is not a
 * parameter: the STRIDE_Y constant (defined outside this file) is used, and
 * h is forwarded unchanged to every helper.
 *
 *   mc00              forwards to OPNAME pixels SIZE.
 *   mc20, mc02, mc22  write the h / v / hv lowpass result straight to dst
 *                     using the OPNAME variant of the filter.
 *   all others        build one or two intermediate planes with the put_
 *                     lowpass filters and combine two inputs through
 *                     OPNAME pixels SIZE _l2. Depending on the variant the
 *                     inputs are src itself, src+1, src+STRIDE_Y, or the
 *                     intermediate planes.
 *
 * Judging from which filter each variant calls, X appears to select the
 * horizontal and Y the vertical sub-sample position -- confirm against the
 * caller's function table.
 *
 * Intermediate planes are declared with DECLARE_ALIGNED_16 and are always
 * filled with a row pitch of 16, whatever SIZE is. When a plane is passed as
 * the first source of the _l2 helper, 16 is passed as its stride as well.
 * Every line of the macro ends in a continuation backslash, so only block
 * comments could ever be placed inside it.
 */
78 #define H264_MC(OPNAME, SIZE, CODETYPE) \
79 static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
80 OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, dst_stride, STRIDE_Y, h);\
81 }\
82 \
83 static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){ \
84 DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
85 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\
86 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\
87 }\
88 \
89 static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
90 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\
91 }\
92 \
93 static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
94 DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
95 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\
96 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, dst_stride, STRIDE_Y, h);\
97 }\
98 \
99 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
100 DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
101 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\
102 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\
103 }\
104 \
105 static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
106 OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\
107 }\
108 \
109 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
110 DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
111 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\
112 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+STRIDE_Y, half, dst_stride, STRIDE_Y, h);\
113 }\
114 \
115 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
116 DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
117 DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
118 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\
119 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\
120 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
121 }\
122 \
123 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
124 DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
125 DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
126 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\
127 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\
128 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
129 }\
130 \
131 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
132 DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
133 DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
134 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\
135 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\
136 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
137 }\
138 \
139 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
140 DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
141 DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
142 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\
143 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\
144 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
145 }\
146 \
147 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
148 DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
149 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, dst_stride, 16, h);\
150 }\
151 \
152 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
153 DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
154 DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
155 DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
156 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\
157 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
158 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\
159 }\
160 \
161 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
162 DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
163 DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
164 DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
165 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\
166 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
167 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\
168 }\
169 \
170 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
171 DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
172 DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
173 DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
174 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\
175 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
176 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\
177 }\
178 \
179 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
180 DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
181 DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
182 DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
183 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\
184 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
185 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\
186 }\
189 /**************************/
190 /* put pixels functions */
191 /*************************/
193 static void put_pixels16_l2_spu( uint8_t * dst, const uint8_t * src1,
194 const uint8_t * src2, int dst_stride,
195 int src_stride1, int h)
196 {
197 int i;
199 const int perm_src1 = (unsigned int) src1 & 15;
201 for (i=0; i<h; i++){
202 //unaligned load of src1
203 const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
204 const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
205 const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
207 //aligned load of src2
208 const vuint8_t srcb = *(vuint8_t *)(src2);
210 //average and rounding
211 const vuint8_t avgc = spu_avg(srca,srcb);
213 // 16x16 dest luma blocks are always aligned
214 *(vuint8_t *)dst=avgc;
216 src1 +=src_stride1;
217 src2 +=16;
218 dst +=dst_stride;
219 }
220 }
222 static void avg_pixels16_l2_spu( uint8_t * dst, const uint8_t * src1,
223 const uint8_t * src2, int dst_stride,
224 int src_stride1, int h)
225 {
226 int i;
228 const int perm_src1 = (unsigned int) src1 & 15;
230 for (i=0; i<h; i++){
231 //unaligned load of src1
232 const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
233 const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
234 const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
236 //aligned load of src2
237 const vuint8_t srcb = *(vuint8_t *)(src2);
239 //average and rounding
240 const vuint8_t avgc = spu_avg(spu_avg(srca,srcb), *(vuint8_t *)dst);
242 // 16x16 dest luma blocks are always aligned
243 *(vuint8_t *)dst=avgc;
245 src1 +=src_stride1;
246 src2 +=16;
247 dst +=dst_stride;
248 }
249 }
251 // next one assumes that ((line_size % 16) == 0)
252 void put_pixels16_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
253 {
254 register vector unsigned char pixelsv1, pixelsv2;
255 register vector unsigned char pixelsv1B, pixelsv2B;
256 register vector unsigned char pixelsv1C, pixelsv2C;
257 register vector unsigned char pixelsv1D, pixelsv2D;
259 const int perm = (unsigned int) src & 15;
260 int i;
261 register int line_size = src_stride;
262 register int line_size_2 = line_size << 1;
263 register int line_size_3 = line_size + line_size_2;
264 register int line_size_4 = line_size << 2;
266 register int dst_stride_2 = dst_stride << 1;
267 register int dst_stride_3 = dst_stride_2 + dst_stride;
268 register int dst_stride_4 = dst_stride << 2;
270 for(i=0; i<h; i+=4) {
271 pixelsv1 = *(vuint8_t *)(src);
272 pixelsv2 = *(vuint8_t *)(src+16);
273 pixelsv1B = *(vuint8_t *)(src + line_size);
274 pixelsv2B = *(vuint8_t *)(src+16 + line_size);
275 pixelsv1C = *(vuint8_t *)(src + line_size_2);
276 pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
277 pixelsv1D = *(vuint8_t *)(src + line_size_3);
278 pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
280 *(vuint8_t *) dst = spu_or(spu_slqwbyte(pixelsv1, perm), spu_rlmaskqwbyte(pixelsv2, perm-16));
281 *(vuint8_t *)(dst + dst_stride) = spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16));
282 *(vuint8_t *)(dst + dst_stride_2) = spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16));
283 *(vuint8_t *)(dst + dst_stride_3) = spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16));
285 src+= line_size_4;
286 dst+= dst_stride_4;
287 }
288 }
290 // next one assumes that ((line_size % 16) == 0)
291 void avg_pixels16_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
292 {
293 register vector unsigned char pixelsv1, pixelsv2;
294 register vector unsigned char pixelsv1B, pixelsv2B;
295 register vector unsigned char pixelsv1C, pixelsv2C;
296 register vector unsigned char pixelsv1D, pixelsv2D;
298 const int perm = (unsigned int) src & 15;
299 int i;
300 register int line_size = src_stride;
301 register int line_size_2 = line_size << 1;
302 register int line_size_3 = line_size + line_size_2;
303 register int line_size_4 = line_size << 2;
305 register int dst_stride_2 = dst_stride << 1;
306 register int dst_stride_3 = dst_stride_2 + dst_stride;
307 register int dst_stride_4 = dst_stride << 2;
310 for(i=0; i<h; i+=4) {
311 pixelsv1 = *(vuint8_t *)(src);
312 pixelsv2 = *(vuint8_t *)(src+16);
313 pixelsv1B = *(vuint8_t *)(src + line_size);
314 pixelsv2B = *(vuint8_t *)(src+16 + line_size);
315 pixelsv1C = *(vuint8_t *)(src + line_size_2);
316 pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
317 pixelsv1D = *(vuint8_t *)(src + line_size_3);
318 pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
320 *(vuint8_t *)dst = spu_avg(spu_or(spu_slqwbyte(pixelsv1, perm), spu_rlmaskqwbyte(pixelsv2, perm-16)), *(vuint8_t *)dst);
321 *(vuint8_t *)(dst + dst_stride) = spu_avg(spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), *(vuint8_t *)(dst+dst_stride));
322 *(vuint8_t *)(dst + dst_stride_2) = spu_avg(spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), *(vuint8_t *)(dst+dst_stride_2));
323 *(vuint8_t *)(dst + dst_stride_3) = spu_avg(spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), *(vuint8_t *)(dst+dst_stride_3));
325 src+= line_size_4;
326 dst+= dst_stride_4;
327 }
328 }
330 void put_pixels8_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
331 int dst_stride, int src_stride1, int h)
332 {
333 int i;
335 const int perm_src1 = (unsigned int) src1 & 15;
336 const int shift_dst = (unsigned int) dst & 15;
338 // 8x dest luma blocks are aligned or desaligned by 8
339 vuint8_t dstmask;
340 const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
341 const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
343 if(shift_dst==0){
344 dstmask = dst8mask1;
345 }
346 else{
347 dstmask = dst8mask2;
348 }
350 for (i=0; i<h; i++){
351 //unaligned load of src1
352 const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
353 const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
354 const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
356 //aligned load of src2
357 const vuint8_t srcb = *(vuint8_t *)(src2);
359 //average and rounding
360 const vuint8_t avgc = spu_avg(srca,srcb);
362 const vuint8_t dst1 = *(vuint8_t *)dst;
364 const vuint8_t davgc = spu_shuffle(dst1, avgc, dstmask);
366 *(vuint8_t *)dst=davgc;
368 src1 +=src_stride1;
369 src2 +=16;
370 dst +=dst_stride;
371 }
372 }
374 void avg_pixels8_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
375 int dst_stride, int src_stride1, int h)
376 {
377 int i;
379 const int perm_src1 = (unsigned int) src1 & 15;
380 const int shift_dst = (unsigned int) dst & 15;
382 // 8x dest luma blocks are aligned or desaligned by 8
383 vuint8_t dstmask;
384 const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
385 const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
387 if(shift_dst==0){
388 dstmask = dst8mask1;
389 }
390 else{
391 dstmask = dst8mask2;
392 }
394 for (i=0; i<h; i++){
395 //unaligned load of src1
396 const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
397 const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
398 const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
400 //aligned load of src2
401 const vuint8_t srcb = *(vuint8_t *)(src2);
403 //average and rounding
404 const vuint8_t avgc = spu_avg(srca,srcb);
406 const vuint8_t dst1 = *(vuint8_t *)dst;
408 const vuint8_t davgc1 = spu_shuffle(dst1, avgc, dstmask);
410 const vuint8_t davgc = spu_avg(dst1,davgc1);
412 *(vuint8_t *)dst=davgc;
414 src1 +=src_stride1;
415 src2 +=16;
416 dst +=dst_stride;
417 }
418 }
420 // next one assumes that ((line_size % 16) == 0)
421 void put_pixels8_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
422 {
423 register vector unsigned char pixelsv1A, pixelsv2A;
424 register vector unsigned char pixelsv1B, pixelsv2B;
425 register vector unsigned char pixelsv1C, pixelsv2C;
426 register vector unsigned char pixelsv1D, pixelsv2D;
428 const int perm = (unsigned int) src & 15;
429 const int shift_dst = (unsigned int) dst & 15;
431 // 8x dest luma blocks are aligned or desaligned by 8
432 vuint8_t dstmask;
433 const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
434 const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
436 if(shift_dst==0){
437 dstmask = dst8mask1;
438 }
439 else{
440 dstmask = dst8mask2;
441 }
443 int i;
444 register int line_size = src_stride;
445 register int line_size_2 = line_size << 1;
446 register int line_size_3 = line_size + line_size_2;
447 register int line_size_4 = line_size << 2;
449 register int dst_stride_2 = dst_stride << 1;
450 register int dst_stride_3 = dst_stride_2 + dst_stride;
451 register int dst_stride_4 = dst_stride << 2;
453 for(i=0; i<h; i+=4) {
454 pixelsv1A = *(vuint8_t *)(src);
455 pixelsv2A = *(vuint8_t *)(src+16);
456 pixelsv1B = *(vuint8_t *)(src + line_size);
457 pixelsv2B = *(vuint8_t *)(src+16 + line_size);
458 pixelsv1C = *(vuint8_t *)(src + line_size_2);
459 pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
460 pixelsv1D = *(vuint8_t *)(src + line_size_3);
461 pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
463 const vuint8_t block1 = *(vuint8_t *)dst;
464 const vuint8_t put1 = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);
465 const vuint8_t block2 = *(vuint8_t *)(dst+dst_stride);
466 const vuint8_t put2 = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);
467 const vuint8_t block3 = *(vuint8_t *)(dst+2*dst_stride);
468 const vuint8_t put3 = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);
469 const vuint8_t block4 = *(vuint8_t *)(dst+3*dst_stride);
470 const vuint8_t put4 = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);
472 *(vuint8_t *) dst = put1;
473 *(vuint8_t *)(dst + dst_stride) = put2;
474 *(vuint8_t *)(dst + dst_stride_2) = put3;
475 *(vuint8_t *)(dst + dst_stride_3) = put4;
477 src += line_size_4;
478 dst += dst_stride_4;
479 }
480 }
482 // next one assumes that ((line_size % 16) == 0)
483 void avg_pixels8_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
484 {
485 register vector unsigned char pixelsv1A, pixelsv2A;
486 register vector unsigned char pixelsv1B, pixelsv2B;
487 register vector unsigned char pixelsv1C, pixelsv2C;
488 register vector unsigned char pixelsv1D, pixelsv2D;
490 const int perm = (unsigned int) src & 15;
491 const int shift_dst = (unsigned int) dst & 15;
493 // 8x dest luma blocks are aligned or desaligned by 8
494 vuint8_t dstmask;
495 const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
496 const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
498 if(shift_dst==0){
499 dstmask = dst8mask1;
500 }
501 else{
502 dstmask = dst8mask2;
503 }
505 int i;
506 register int line_size = src_stride;
507 register int line_size_2 = line_size << 1;
508 register int line_size_3 = line_size + line_size_2;
509 register int line_size_4 = line_size << 2;
511 register int dst_stride_2 = dst_stride << 1;
512 register int dst_stride_3 = dst_stride_2 + dst_stride;
513 register int dst_stride_4 = dst_stride << 2;
515 for(i=0; i<h; i+=4) {
516 pixelsv1A = *(vuint8_t *)(src);
517 pixelsv2A = *(vuint8_t *)(src+16);
518 pixelsv1B = *(vuint8_t *)(src + line_size);
519 pixelsv2B = *(vuint8_t *)(src+16 + line_size);
520 pixelsv1C = *(vuint8_t *)(src + line_size_2);
521 pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
522 pixelsv1D = *(vuint8_t *)(src + line_size_3);
523 pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
525 const vuint8_t block1 = *(vuint8_t *) dst;
526 const vuint8_t put1a = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);
527 const vuint8_t put1 = spu_avg(block1,put1a);
529 const vuint8_t block2 = *(vuint8_t *)(dst + dst_stride);
530 const vuint8_t put2a = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);
531 const vuint8_t put2 = spu_avg(block2,put2a);
533 const vuint8_t block3 = *(vuint8_t *)(dst + dst_stride_2);
534 const vuint8_t put3a = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);
535 const vuint8_t put3 = spu_avg(block3,put3a);
537 const vuint8_t block4 = *(vuint8_t *)(dst + dst_stride_3);
538 const vuint8_t put4a = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);
539 const vuint8_t put4 = spu_avg(block4,put4a);
541 *(vuint8_t *) dst = put1;
542 *(vuint8_t *)(dst + dst_stride) = put2;
543 *(vuint8_t *)(dst + dst_stride_2) = put3;
544 *(vuint8_t *)(dst + dst_stride_3) = put4;
546 src+= line_size_4;
547 dst+= dst_stride_4;
548 }
549 }
551 void put_pixels4_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
552 int dst_stride, int src_stride1, int h)
553 {
554 int i;
556 const int perm_src1 = (unsigned int) src1 & 15;
557 const int shift_dst = (unsigned int) dst & 15;
559 // 4x dest luma blocks are desaligned by 0, 4, 8, or 12
560 vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
561 const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
562 const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
563 const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
564 const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
566 switch(shift_dst){
567 case 0: dstmask = dstmask0;
568 break;
569 case 4: dstmask = dstmask4;
570 break;
571 case 8: dstmask = dstmask8;
572 break;
573 case 12: dstmask = dstmask12;
574 break;
575 }
577 for (i=0; i<h; i++){
578 //unaligned load of src1
579 const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
580 const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
581 const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
583 //aligned load of src2
584 const vuint8_t srcb = *(vuint8_t *)(src2);
586 //average and rounding
587 const vuint8_t avgc = spu_avg(srca,srcb);
589 const vuint8_t dst1 = *(vuint8_t *)dst;
591 const vuint8_t davgc = spu_shuffle(dst1, avgc, dstmask);
593 *(vuint8_t *)dst=davgc;
595 src1 +=src_stride1;
596 src2 +=16;
597 dst +=dst_stride;
598 }
599 }
601 void avg_pixels4_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
602 int dst_stride, int src_stride1, int h)
603 {
604 int i;
606 const int perm_src1 = (unsigned int) src1 & 15;
607 const int shift_dst = (unsigned int) dst & 15;
609 // 4x dest luma blocks are desaligned by 0, 4, 8, or 12
610 vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
611 const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
612 const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
613 const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
614 const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
616 switch(shift_dst){
617 case 0: dstmask = dstmask0;
618 break;
619 case 4: dstmask = dstmask4;
620 break;
621 case 8: dstmask = dstmask8;
622 break;
623 case 12: dstmask = dstmask12;
624 break;
625 }
627 for (i=0; i<h; i++){
628 //unaligned load of src1
629 const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
630 const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
631 const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
633 //aligned load of src2
634 const vuint8_t srcb = *(vuint8_t *)(src2);
636 //average and rounding
637 const vuint8_t avgc = spu_avg(srca,srcb);
639 const vuint8_t dst1 = *(vuint8_t *)dst;
641 const vuint8_t davgc1 = spu_shuffle(dst1, avgc, dstmask);
643 const vuint8_t davgc = spu_avg(dst1,davgc1);
645 *(vuint8_t *)dst=davgc;
647 src1 +=src_stride1;
648 src2 +=16;
649 dst +=dst_stride;
650 }
651 }
653 // next one assumes that ((line_size % 16) == 0)
654 void put_pixels4_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
655 {
656 register vector unsigned char pixelsv1A, pixelsv2A;
657 register vector unsigned char pixelsv1B, pixelsv2B;
658 register vector unsigned char pixelsv1C, pixelsv2C;
659 register vector unsigned char pixelsv1D, pixelsv2D;
661 const int perm = (unsigned int) src & 15;
662 const int shift_dst = (unsigned int) dst & 15;
664 // 4x dest luma blocks are desaligned by 0, 4, 8, or 12
665 vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
666 const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
667 const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
668 const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
669 const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
671 switch(shift_dst){
672 case 0: dstmask = dstmask0;
673 break;
674 case 4: dstmask = dstmask4;
675 break;
676 case 8: dstmask = dstmask8;
677 break;
678 case 12: dstmask = dstmask12;
679 break;
680 }
682 int i;
683 register int line_size = src_stride;
684 register int line_size_2 = line_size << 1;
685 register int line_size_3 = line_size + line_size_2;
686 register int line_size_4 = line_size << 2;
688 register int dst_stride_2 = dst_stride << 1;
689 register int dst_stride_3 = dst_stride_2 + dst_stride;
690 register int dst_stride_4 = dst_stride << 2;
692 for(i=0; i<h; i+=4) {
693 pixelsv1A = *(vuint8_t *)(src);
694 pixelsv2A = *(vuint8_t *)(src+16);
695 pixelsv1B = *(vuint8_t *)(src + line_size);
696 pixelsv2B = *(vuint8_t *)(src+16 + line_size);
697 pixelsv1C = *(vuint8_t *)(src + line_size_2);
698 pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
699 pixelsv1D = *(vuint8_t *)(src + line_size_3);
700 pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
702 const vuint8_t block1 = *(vuint8_t *)dst;
703 const vuint8_t put1 = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);
704 const vuint8_t block2 = *(vuint8_t *)(dst+dst_stride);
705 const vuint8_t put2 = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);
706 const vuint8_t block3 = *(vuint8_t *)(dst+dst_stride_2);
707 const vuint8_t put3 = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);
708 const vuint8_t block4 = *(vuint8_t *)(dst+dst_stride_3);
709 const vuint8_t put4 = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);
711 *(vuint8_t *) dst = put1;
712 *(vuint8_t *)(dst + dst_stride) = put2;
713 *(vuint8_t *)(dst + dst_stride_2) = put3;
714 *(vuint8_t *)(dst + dst_stride_3) = put4;
716 src += line_size_4;
717 dst += dst_stride_4;
718 }
719 }
721 // next one assumes that ((line_size % 16) == 0)
722 void avg_pixels4_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
723 {
724 register vector unsigned char pixelsv1A, pixelsv2A;
725 register vector unsigned char pixelsv1B, pixelsv2B;
726 register vector unsigned char pixelsv1C, pixelsv2C;
727 register vector unsigned char pixelsv1D, pixelsv2D;
729 const int perm = (unsigned int) src & 15;
730 const int shift_dst = (unsigned int) dst & 15;
732 // 4x dest luma blocks are desaligned by 0, 4, 8, or 12
733 vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
734 const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
735 const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
736 const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
737 const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
739 switch(shift_dst){
740 case 0: dstmask = dstmask0;
741 break;
742 case 4: dstmask = dstmask4;
743 break;
744 case 8: dstmask = dstmask8;
745 break;
746 case 12: dstmask = dstmask12;
747 break;
748 }
750 int i;
751 register int line_size = src_stride;
752 register int line_size_2 = line_size << 1;
753 register int line_size_3 = line_size + line_size_2;
754 register int line_size_4 = line_size << 2;
756 register int dst_stride_2 = dst_stride << 1;
757 register int dst_stride_3 = dst_stride_2 + dst_stride;
758 register int dst_stride_4 = dst_stride << 2;
760 for(i=0; i<h; i+=4) {
761 pixelsv1A = *(vuint8_t *)(src);
762 pixelsv2A = *(vuint8_t *)(src+16);
763 pixelsv1B = *(vuint8_t *)(src + line_size);
764 pixelsv2B = *(vuint8_t *)(src+16 + line_size);
765 pixelsv1C = *(vuint8_t *)(src + line_size_2);
766 pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
767 pixelsv1D = *(vuint8_t *)(src + line_size_3);
768 pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
770 const vuint8_t block1 = *(vuint8_t *) dst;
771 const vuint8_t put1a = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);
772 const vuint8_t put1 = spu_avg(block1,put1a);
774 const vuint8_t block2 = *(vuint8_t *)(dst + dst_stride);
775 const vuint8_t put2a = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);
776 const vuint8_t put2 = spu_avg(block2,put2a);
778 const vuint8_t block3 = *(vuint8_t *)(dst + dst_stride_2);
779 const vuint8_t put3a = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);
780 const vuint8_t put3 = spu_avg(block3,put3a);
782 const vuint8_t block4 = *(vuint8_t *)(dst + dst_stride_3);
783 const vuint8_t put4a = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);
784 const vuint8_t put4 = spu_avg(block4,put4a);
786 *(vuint8_t *) dst = put1;
787 *(vuint8_t *)(dst + dst_stride) = put2;
788 *(vuint8_t *)(dst + dst_stride_2) = put3;
789 *(vuint8_t *)(dst + dst_stride_3) = put4;
791 src+= line_size_4;
792 dst+= dst_stride_4;
793 }
794 }
796 /* Here we create all the interpolation modes H.264 motion compensation stage for luma */
797 H264_MC(put_, 16, spu)
798 H264_MC(put_, 8, spu)
799 H264_MC(put_, 4, spu)
801 H264_MC(avg_, 16, spu)
802 H264_MC(avg_, 8, spu)
803 H264_MC(avg_, 4, spu)
806 //Chroma interpolation:
808 #define OP_U8_SPU PUT_OP_U8_SPU
809 #define PREFIX_h264_chroma_mc8_spu put_h264_chroma_mc8_spu
810 #define PREFIX_h264_chroma_mc4_spu put_h264_chroma_mc4_spu
811 #define PREFIX_h264_chroma_mc2_spu put_h264_chroma_mc2_spu
812 #include "h264_chroma_template_spu.c"
813 #undef OP_U8_SPU
814 #undef PREFIX_h264_chroma_mc8_spu
815 #undef PREFIX_h264_chroma_mc4_spu
816 #undef PREFIX_h264_chroma_mc2_spu
818 #define OP_U8_SPU AVG_OP_U8_SPU
819 #define PREFIX_h264_chroma_mc8_spu avg_h264_chroma_mc8_spu
820 #define PREFIX_h264_chroma_mc4_spu avg_h264_chroma_mc4_spu
821 #define PREFIX_h264_chroma_mc2_spu avg_h264_chroma_mc2_spu
822 #include "h264_chroma_template_spu.c"
823 #undef OP_U8_SPU
824 #undef PREFIX_h264_chroma_mc8_spu
825 #undef PREFIX_h264_chroma_mc4_spu
826 #undef PREFIX_h264_chroma_mc2_spu
/*
 * Weight and biweight functions.
 *
 * H264_WEIGHT(W,H) defines two static functions operating on a W x H block of
 * 8-bit samples (W is one of 2, 4, 8, 16; rows are fully unrolled):
 *
 *   weight_h264_pixels<W>x<H>_c(dst, stride, log2_denom, weight, offset)
 *     dst[x] = clip_uint8((dst[x]*weight + off) >> log2_denom)
 *     where off = offset * 2^log2_denom, plus the rounding term
 *     2^(log2_denom-1) when log2_denom is non-zero.
 *
 *   biweight_h264_pixels<W>x<H>_c(dst, src, dst_stride, src_stride,
 *                                 log2_denom, weightd, weights, offset)
 *     dst[x] = clip_uint8((src[x]*weights + dst[x]*weightd + off)
 *                         >> (log2_denom+1))
 *     where off = ((offset + 1) | 1) * 2^log2_denom.
 *
 * The offset is scaled by multiplying with (1 << log2_denom) instead of
 * shifting it: offset may be negative, and left-shifting a negative int is
 * undefined behaviour in C.  For every value where the shift was defined the
 * result is unchanged.
 */
#define op_scale1(x) dst[(x)] = av_clip_uint8( (dst[(x)]*weight + offset) >> log2_denom )
#define op_scale2(x) dst[(x)] = av_clip_uint8( (src[(x)]*weights + dst[(x)]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset *= 1 << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, dst += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) * (1 << log2_denom); \
    for(y=0; y<H; y++, dst += dst_stride, src += src_stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
900 /////////////////////////////////////////////////////////////////////////////////////////
/**
 * tc0-clipped luma loop filter for one edge of 16 lines.
 *
 * The 16 lines are handled as four groups of four; group i is clipped with
 * tc0[i] and skipped entirely when tc0[i] is negative.  A line is modified
 * only if |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta.
 *
 * @param pix     points at q0 of the first line; p0, p1, p2 are at
 *                pix[-1*xstride] .. pix[-3*xstride], q1 and q2 at
 *                pix[1*xstride] and pix[2*xstride]
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive lines along the edge
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on the remaining sample differences
 * @param tc0     four clipping bounds, one per group of four lines
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            /* group disabled: step over its four lines */
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* each side whose p1/q1 is also filtered widens the p0/q0 clip range by one */
                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                /* "* 4" rather than "<< 2": q0 - p0 may be negative and
                 * left-shifting a negative int is undefined behaviour */
                i_delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* tc0-clipped luma filter with samples across the edge `stride` bytes apart
 * and consecutive lines one byte apart. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = stride; /* xstride */
    const int along_edge  = 1;      /* ystride */

    h264_loop_filter_luma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
/* tc0-clipped luma filter with samples across the edge one byte apart and
 * consecutive lines `stride` bytes apart. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = 1;      /* xstride */
    const int along_edge  = stride; /* ystride */

    h264_loop_filter_luma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
/**
 * Luma loop filter without tc0 clipping, applied to one edge of 16 lines.
 *
 * A line is touched only if |p0-q0| < alpha, |p1-p0| < beta and
 * |q1-q0| < beta.  When additionally |p0-q0| < (alpha >> 2) + 2, each side
 * whose |p2-p0| (resp. |q2-q0|) is below beta has three samples rewritten;
 * otherwise only p0 and/or q0 are rewritten.
 *
 * @param pix     points at q0 of the first line; p-side samples are at
 *                negative multiples of xstride
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive lines along the edge
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on the remaining sample differences
 */
static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int line;

    for (line = 0; line < 16; line++, pix += ystride) {
        /* Snapshot all samples first: the formulas below use the unfiltered values. */
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];
        const int edge_diff = FFABS(p0 - q0);

        if (edge_diff >= alpha || FFABS(p1 - p0) >= beta || FFABS(q1 - q0) >= beta)
            continue;

        if (edge_diff >= (alpha >> 2) + 2) {
            /* only p0' and q0' */
            pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;
            pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;
            continue;
        }

        if (FFABS(p2 - p0) < beta) {
            const int p3 = pix[-4*xstride];
            /* p0', p1', p2' */
            pix[-1*xstride] = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
            pix[-2*xstride] = (p2 + p1 + p0 + q0 + 2) >> 2;
            pix[-3*xstride] = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
        } else {
            /* p0' */
            pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;
        }

        if (FFABS(q2 - q0) < beta) {
            const int q3 = pix[3*xstride];
            /* q0', q1', q2' */
            pix[0*xstride] = (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3;
            pix[1*xstride] = (p0 + q0 + q1 + q2 + 2) >> 2;
            pix[2*xstride] = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3;
        } else {
            /* q0' */
            pix[0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;
        }
    }
}
/* Unclipped (intra) luma filter with samples across the edge `stride` bytes
 * apart and consecutive lines one byte apart. */
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across_edge = stride; /* xstride */
    const int along_edge  = 1;      /* ystride */

    h264_loop_filter_luma_intra_c(pix, across_edge, along_edge, alpha, beta);
}
/* Unclipped (intra) luma filter with samples across the edge one byte apart
 * and consecutive lines `stride` bytes apart. */
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across_edge = 1;      /* xstride */
    const int along_edge  = stride; /* ystride */

    h264_loop_filter_luma_intra_c(pix, across_edge, along_edge, alpha, beta);
}
/**
 * tc0-clipped chroma loop filter for one edge of 8 lines.
 *
 * The 8 lines are handled as four groups of two; group i is clipped with
 * tc0[i] and skipped entirely when tc0[i] <= 0.  Only p0 and q0 are modified,
 * and only if |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta.
 *
 * @param pix     points at q0 of the first line; p0 and p1 are at
 *                pix[-1*xstride] and pix[-2*xstride], q1 at pix[1*xstride]
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive lines along the edge
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on |p1 - p0| and |q1 - q0|
 * @param tc0     four clipping bounds, one per group of two lines
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            /* group disabled: step over its two lines */
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                /* "* 4" rather than "<< 2": q0 - p0 may be negative and
                 * left-shifting a negative int is undefined behaviour */
                int delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* tc0-clipped chroma filter with samples across the edge `stride` bytes apart
 * and consecutive lines one byte apart. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = stride; /* xstride */
    const int along_edge  = 1;      /* ystride */

    h264_loop_filter_chroma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
/* tc0-clipped chroma filter with samples across the edge one byte apart and
 * consecutive lines `stride` bytes apart. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = 1;      /* xstride */
    const int along_edge  = stride; /* ystride */

    h264_loop_filter_chroma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
/**
 * Chroma loop filter without tc0 clipping, applied to one edge of 8 lines.
 *
 * For every line where |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta,
 * p0 and q0 are replaced by (2*p1 + p0 + q1 + 2) >> 2 and
 * (2*q1 + q0 + p1 + 2) >> 2 respectively.
 *
 * @param pix     points at q0 of the first line
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive lines along the edge
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int line;

    for (line = 0; line < 8; line++, pix += ystride) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if (FFABS(p0 - q0) >= alpha)
            continue;
        if (FFABS(p1 - p0) >= beta)
            continue;
        if (FFABS(q1 - q0) >= beta)
            continue;

        pix[-xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
        pix[0]        = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
    }
}
/* Unclipped (intra) chroma filter with samples across the edge `stride` bytes
 * apart and consecutive lines one byte apart. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across_edge = stride; /* xstride */
    const int along_edge  = 1;      /* ystride */

    h264_loop_filter_chroma_intra_c(pix, across_edge, along_edge, alpha, beta);
}
/* Unclipped (intra) chroma filter with samples across the edge one byte apart
 * and consecutive lines `stride` bytes apart. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across_edge = 1;      /* xstride */
    const int along_edge  = stride; /* ystride */

    h264_loop_filter_chroma_intra_c(pix, across_edge, along_edge, alpha, beta);
}
1074 void dsputil_h264_init_cell(DSPContext_spu* c) {
1076 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
1077 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
1078 c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
1079 c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
1080 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
1081 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
1082 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
1083 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
1085 c->h264_idct_add[0] = h264_idct8_add_spu;
1086 c->h264_idct_add[1] = h264_idct4_add_spu;
1089 c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_spu;
1090 c->put_h264_chroma_pixels_tab[1] = put_h264_chroma_mc4_spu;
1091 c->put_h264_chroma_pixels_tab[2] = put_h264_chroma_mc2_spu;
1092 c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_spu;
1093 c->avg_h264_chroma_pixels_tab[1] = avg_h264_chroma_mc4_spu;
1094 c->avg_h264_chroma_pixels_tab[2] = avg_h264_chroma_mc2_spu;
1096 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
1097 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
1098 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
1099 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
1100 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
1101 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
1102 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
1103 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
1104 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
1105 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
1106 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
1107 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
1108 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
1109 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
1110 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
1111 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
1112 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
1113 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
1114 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
1115 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
1118 #define dspfunc(PFX, IDX, NUM) \
1119 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_spu; \
1120 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_spu; \
1121 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_spu; \
1122 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_spu; \
1123 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_spu; \
1124 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_spu; \
1125 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_spu; \
1126 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_spu; \
1127 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_spu; \
1128 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_spu; \
1129 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_spu; \
1130 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_spu; \
1131 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_spu; \
1132 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_spu; \
1133 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_spu; \
1134 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_spu
1136 dspfunc(put_h264_qpel, 0, 16);
1137 dspfunc(put_h264_qpel, 1, 8);
1138 dspfunc(put_h264_qpel, 2, 4);
1140 dspfunc(avg_h264_qpel, 0, 16);
1141 dspfunc(avg_h264_qpel, 1, 8);
1142 dspfunc(avg_h264_qpel, 2, 4);
1144 #undef dspfunc
1147 }
