Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSs__H264__App
view libavcodec/x86/dsputil_mmx.c @ 3:0b056460c67d
changed code to use VSs
| author | Nina Engelhardt <nengel@mailbox.tu-berlin.de> |
|---|---|
| date | Mon, 29 Oct 2012 16:44:27 +0100 |
| parents | |
| children |
line source
1 /*
2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 *
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23 */
25 #include "libavutil/x86_cpu.h"
26 #include "libavutil/internal.h"
27 #include "libavcodec/dsputil.h"
28 #include "libavcodec/h264_dsp.h"
29 #include "dsputil_mmx.h"
32 //#undef NDEBUG
33 //#include <assert.h>
/* CPU capability bitmask; the init functions in this file assign it from
 * mm_support() and then test it against the FF_MM_* flags. */
35 int mm_flags; /* multimedia extension flags */
37 /* pixel operations */
/*
 * Packed constants for the SIMD code.
 * As the initializers show: ff_pw_N repeats the value N in every 16-bit
 * word, ff_pb_XX repeats the byte 0xXX in every byte, ff_pdw_80000000
 * repeats 0x80000000 in every 32-bit dword, and ff_pd_N holds two doubles.
 * ff_bone / ff_wtwo are the operands loaded by MOVQ_BONE / MOVQ_WTWO.
 * The first DECLARE_ALIGNED argument is presumably the alignment in bytes
 * (8 for 64-bit values, 16 for 128-bit values) -- confirm against the macro.
 */
38 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
39 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
41 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
42 {0x8000000080000000ULL, 0x8000000080000000ULL};
44 DECLARE_ALIGNED(8, const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
45 DECLARE_ALIGNED(8, const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL;
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
48 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
50 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
53 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
55 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
56 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
57 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
59 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
60 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL;
61 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
62 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
63 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
64 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
65 DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
66 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
68 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
69 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* ASMALIGN(n) expands to an assembler ".align 1 << n" directive string;
 * JUMPALIGN() emits that directive with n = 3 as a standalone asm statement. */
71 #define ASMALIGN(ZEROBITS) ".align 1 << " #ZEROBITS "\n\t"
72 #define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::)
/* Clear the named register by XOR-ing it with itself. */
73 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
/* Fill the named register with 0xFE bytes: pcmpeqd of a register with itself
 * yields all ones, and adding 0xFF + 0xFF per byte wraps to 0xFE. */
75 #define MOVQ_BFE(regd) \
76 __asm__ volatile ( \
77 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
78 "paddb %%" #regd ", %%" #regd " \n\t" ::)
/* MOVQ_BONE / MOVQ_WTWO load 0x01 in every byte / 0x0002 in every word.
 * Without PIC they read the ff_bone / ff_wtwo constants from memory;
 * with PIC the same values are synthesized in-register (see below). */
80 #ifndef PIC
81 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
82 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
83 #else
84 // for shared library it's better to use this way for accessing constants
85 // pcmpeqd -> -1
/* all ones -> shift each word right by 15 (0x0001) -> pack words to bytes (0x01) */
86 #define MOVQ_BONE(regd) \
87 __asm__ volatile ( \
88 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
89 "psrlw $15, %%" #regd " \n\t" \
90 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
/* all ones -> shift each word right by 15 (0x0001) -> shift left by 1 (0x0002) */
92 #define MOVQ_WTWO(regd) \
93 __asm__ volatile ( \
94 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
95 "psrlw $15, %%" #regd " \n\t" \
96 "psllw $1, %%" #regd " \n\t"::)
98 #endif
/*
 * Byte-wise average helpers for plain MMX. Each macro expands to a
 * fragment of an asm template string, not to a complete asm statement.
 *
 * The NO_RND forms compute (a & b) + (((a ^ b) & 0xfe) >> 1), i.e. the
 * average rounded down; the other forms compute
 * (a | b) - (((a ^ b) & 0xfe) >> 1), i.e. the average rounded up.
 * Masking with 0xfe before the 64-bit shift keeps bits from leaking
 * between neighbouring bytes.
 */
100 // using regr as temporary and for the output result
101 // first argument is unmodified and second is trashed
102 // regfe is supposed to contain 0xfefefefefefefefe
103 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
104 "movq " #rega ", " #regr " \n\t"\
105 "pand " #regb ", " #regr " \n\t"\
106 "pxor " #rega ", " #regb " \n\t"\
107 "pand " #regfe "," #regb " \n\t"\
108 "psrlq $1, " #regb " \n\t"\
109 "paddb " #regb ", " #regr " \n\t"
/* Rounding-up variant; same register contract as PAVGB_MMX_NO_RND. */
111 #define PAVGB_MMX(rega, regb, regr, regfe) \
112 "movq " #rega ", " #regr " \n\t"\
113 "por " #regb ", " #regr " \n\t"\
114 "pxor " #rega ", " #regb " \n\t"\
115 "pand " #regfe "," #regb " \n\t"\
116 "psrlq $1, " #regb " \n\t"\
117 "psubb " #regb ", " #regr " \n\t"
/* Paired variants: average (rega, regb) into regr and (regc, regd) into regp
 * in one interleaved sequence; regb and regd are trashed. */
119 // mm6 is supposed to contain 0xfefefefefefefefe
120 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
121 "movq " #rega ", " #regr " \n\t"\
122 "movq " #regc ", " #regp " \n\t"\
123 "pand " #regb ", " #regr " \n\t"\
124 "pand " #regd ", " #regp " \n\t"\
125 "pxor " #rega ", " #regb " \n\t"\
126 "pxor " #regc ", " #regd " \n\t"\
127 "pand %%mm6, " #regb " \n\t"\
128 "pand %%mm6, " #regd " \n\t"\
129 "psrlq $1, " #regb " \n\t"\
130 "psrlq $1, " #regd " \n\t"\
131 "paddb " #regb ", " #regr " \n\t"\
132 "paddb " #regd ", " #regp " \n\t"
/* Paired rounding-up variant; also expects 0xfe bytes in mm6. */
134 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
135 "movq " #rega ", " #regr " \n\t"\
136 "movq " #regc ", " #regp " \n\t"\
137 "por " #regb ", " #regr " \n\t"\
138 "por " #regd ", " #regp " \n\t"\
139 "pxor " #rega ", " #regb " \n\t"\
140 "pxor " #regc ", " #regd " \n\t"\
141 "pand %%mm6, " #regb " \n\t"\
142 "pand %%mm6, " #regd " \n\t"\
143 "psrlq $1, " #regd " \n\t"\
144 "psrlq $1, " #regb " \n\t"\
145 "psubb " #regb ", " #regr " \n\t"\
146 "psubb " #regd ", " #regp " \n\t"
148 /***********************************/
149 /* MMX2 specific */
/* Instantiate dsputil_mmx_avg_template.c with DEF() appending the _mmx2
 * suffix and PAVGB / OP_AVG set to the "pavgb" mnemonic; the three helper
 * macros are undefined again right after the include. */
151 #define DEF(x) x ## _mmx2
153 /* Introduced only in MMX2 set */
154 #define PAVGB "pavgb"
155 #define OP_AVG PAVGB
157 #include "dsputil_mmx_avg_template.c"
159 #undef DEF
160 #undef PAVGB
161 #undef OP_AVG
/* Name aliases: the no_rnd, _mmx2 and _3dnow spellings of the plain copy
 * routines all resolve to the _mmx implementations.
 * NOTE(review): only put_pixels8_mmx is defined in the visible source;
 * put_pixels16_mmx and put_pixels4_mmx presumably come from elsewhere -- confirm. */
163 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
164 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
165 #define put_pixels16_mmx2 put_pixels16_mmx
166 #define put_pixels8_mmx2 put_pixels8_mmx
167 #define put_pixels4_mmx2 put_pixels4_mmx
168 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
169 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
170 #define put_pixels16_3dnow put_pixels16_mmx
171 #define put_pixels8_3dnow put_pixels8_mmx
172 #define put_pixels4_3dnow put_pixels4_mmx
173 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
174 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
176 /***********************************/
177 /* standard MMX */
/**
 * Store a block of coefficients as unsigned 8-bit pixels.
 *
 * Two asm statements each read 64 bytes of coefficients, pack them from
 * signed words to unsigned bytes with saturation (packuswb) and write four
 * 8-byte rows, so eight rows of eight pixels are written in total.
 *
 * @param block     source coefficients; 128 bytes are read (DCTELEM is
 *                  presumably a 16-bit type, given the p += 32 step -- confirm)
 * @param pixels    destination of the first row
 * @param line_size distance in bytes between consecutive destination rows
 */
179 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
180 {
181 const DCTELEM *p;
182 uint8_t *pix;
184 /* read the pixels */
185 p = block;
186 pix = pixels;
187 /* unrolled loop */
/* First four rows: the source is passed as a memory operand ("m") and the
 * following quadwords are addressed as displacements from it. */
188 __asm__ volatile(
189 "movq %3, %%mm0 \n\t"
190 "movq 8%3, %%mm1 \n\t"
191 "movq 16%3, %%mm2 \n\t"
192 "movq 24%3, %%mm3 \n\t"
193 "movq 32%3, %%mm4 \n\t"
194 "movq 40%3, %%mm5 \n\t"
195 "movq 48%3, %%mm6 \n\t"
196 "movq 56%3, %%mm7 \n\t"
197 "packuswb %%mm1, %%mm0 \n\t"
198 "packuswb %%mm3, %%mm2 \n\t"
199 "packuswb %%mm5, %%mm4 \n\t"
200 "packuswb %%mm7, %%mm6 \n\t"
201 "movq %%mm0, (%0) \n\t"
202 "movq %%mm2, (%0, %1) \n\t"
203 "movq %%mm4, (%0, %1, 2) \n\t"
204 "movq %%mm6, (%0, %2) \n\t"
205 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
206 :"memory");
/* Advance to the fifth destination row and the second half of the block. */
207 pix += line_size*4;
208 p += 32;
210 // An exact copy of the asm statement above placed here would make the
211 // compiler generate some very strange code,
212 // so this copy takes the source pointer in a register ("r") instead.
213 __asm__ volatile(
214 "movq (%3), %%mm0 \n\t"
215 "movq 8(%3), %%mm1 \n\t"
216 "movq 16(%3), %%mm2 \n\t"
217 "movq 24(%3), %%mm3 \n\t"
218 "movq 32(%3), %%mm4 \n\t"
219 "movq 40(%3), %%mm5 \n\t"
220 "movq 48(%3), %%mm6 \n\t"
221 "movq 56(%3), %%mm7 \n\t"
222 "packuswb %%mm1, %%mm0 \n\t"
223 "packuswb %%mm3, %%mm2 \n\t"
224 "packuswb %%mm5, %%mm4 \n\t"
225 "packuswb %%mm7, %%mm6 \n\t"
226 "movq %%mm0, (%0) \n\t"
227 "movq %%mm2, (%0, %1) \n\t"
228 "movq %%mm4, (%0, %1, 2) \n\t"
229 "movq %%mm6, (%0, %2) \n\t"
230 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
231 :"memory");
232 }
/* Eight 0x80 bytes; put_signed_pixels_clamped_mmx loads this into mm0 as
 * the bias added to every packed byte. */
234 DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] =
235 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
/*
 * Asm template fragment that converts four rows of coefficients starting
 * at byte offset `off` from operand %2: each row's two quadwords are packed
 * to signed bytes with saturation (packsswb), mm0 is added to every byte,
 * and the rows are stored at (%0), (%0,%3), (%0,%3,2) and (%0,%1).
 * Expects the bias in mm0; operand %1 is the byte offset of the fourth row
 * and %3 the row stride. Uses mm1-mm4 as scratch.
 */
237 #define put_signed_pixels_clamped_mmx_half(off) \
238 "movq "#off"(%2), %%mm1 \n\t"\
239 "movq 16+"#off"(%2), %%mm2 \n\t"\
240 "movq 32+"#off"(%2), %%mm3 \n\t"\
241 "movq 48+"#off"(%2), %%mm4 \n\t"\
242 "packsswb 8+"#off"(%2), %%mm1 \n\t"\
243 "packsswb 24+"#off"(%2), %%mm2 \n\t"\
244 "packsswb 40+"#off"(%2), %%mm3 \n\t"\
245 "packsswb 56+"#off"(%2), %%mm4 \n\t"\
246 "paddb %%mm0, %%mm1 \n\t"\
247 "paddb %%mm0, %%mm2 \n\t"\
248 "paddb %%mm0, %%mm3 \n\t"\
249 "paddb %%mm0, %%mm4 \n\t"\
250 "movq %%mm1, (%0) \n\t"\
251 "movq %%mm2, (%0, %3) \n\t"\
252 "movq %%mm3, (%0, %3, 2) \n\t"\
253 "movq %%mm4, (%0, %1) \n\t"
/**
 * Store a block of signed coefficients as pixels biased by 128.
 *
 * Each coefficient is saturated to a signed byte and 0x80 is added, which
 * maps the range -128..127 onto 0..255. Eight rows of eight pixels are
 * written: rows 0-3 from block bytes 0..63, rows 4-7 from bytes 64..127.
 *
 * @param block     source coefficients (128 bytes are read)
 * @param pixels    destination of the first row
 * @param line_size distance in bytes between consecutive destination rows
 */
255 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
256 {
257 x86_reg line_skip = line_size;
258 x86_reg line_skip3;
260 __asm__ volatile (
261 "movq "MANGLE(ff_vector128)", %%mm0 \n\t"
/* %1 = 3 * line_skip, used to address the fourth row of each half */
262 "lea (%3, %3, 2), %1 \n\t"
263 put_signed_pixels_clamped_mmx_half(0)
/* advance the destination pointer by four rows */
264 "lea (%0, %3, 4), %0 \n\t"
265 put_signed_pixels_clamped_mmx_half(64)
266 :"+&r" (pixels), "=&r" (line_skip3)
267 :"r" (block), "r"(line_skip)
268 :"memory");
269 }
/**
 * Add a block of coefficients to existing pixels and clamp to 0..255.
 *
 * Each of the four loop iterations handles two 8-pixel rows: the pixel
 * bytes are widened to words, added to 16 coefficients with signed
 * saturation (paddsw), packed back to unsigned bytes with saturation
 * (packuswb) and written to the same two rows.
 *
 * @param block     coefficients to add; 4 * 16 elements are read
 * @param pixels    first row of the 8x8 area updated in place
 * @param line_size distance in bytes between consecutive rows
 */
271 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
272 {
273 const DCTELEM *p;
274 uint8_t *pix;
275 int i;
277 /* read the pixels */
278 p = block;
279 pix = pixels;
/* mm7 = 0; the asm statement in the loop relies on it staying zero, since
 * it is the upper half used when unpacking bytes to words. */
280 MOVQ_ZERO(mm7);
281 i = 4;
282 do {
283 __asm__ volatile(
284 "movq (%2), %%mm0 \n\t"
285 "movq 8(%2), %%mm1 \n\t"
286 "movq 16(%2), %%mm2 \n\t"
287 "movq 24(%2), %%mm3 \n\t"
288 "movq %0, %%mm4 \n\t"
289 "movq %1, %%mm6 \n\t"
290 "movq %%mm4, %%mm5 \n\t"
291 "punpcklbw %%mm7, %%mm4 \n\t"
292 "punpckhbw %%mm7, %%mm5 \n\t"
293 "paddsw %%mm4, %%mm0 \n\t"
294 "paddsw %%mm5, %%mm1 \n\t"
295 "movq %%mm6, %%mm5 \n\t"
296 "punpcklbw %%mm7, %%mm6 \n\t"
297 "punpckhbw %%mm7, %%mm5 \n\t"
298 "paddsw %%mm6, %%mm2 \n\t"
299 "paddsw %%mm5, %%mm3 \n\t"
300 "packuswb %%mm1, %%mm0 \n\t"
301 "packuswb %%mm3, %%mm2 \n\t"
302 "movq %%mm0, %0 \n\t"
303 "movq %%mm2, %1 \n\t"
304 :"+m"(*pix), "+m"(*(pix+line_size))
305 :"r"(p)
306 :"memory");
/* next pair of rows, next 16 coefficients */
307 pix += line_size*2;
308 p += 16;
309 } while (--i);
310 }
/**
 * Copy h rows of 8 bytes from pixels to block using MMX moves.
 *
 * Source and destination share the same row stride. Four rows are copied
 * per loop iteration and the loop ends only when h reaches exactly zero
 * after subtracting 4, so h is expected to be a positive multiple of 4.
 *
 * @param block     destination of the first row
 * @param pixels    source of the first row
 * @param line_size row stride in bytes for both buffers
 * @param h         number of rows to copy
 */
312 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
313 {
314 __asm__ volatile(
/* REG_a = 2 * line_size, the step after each pair of rows */
315 "lea (%3, %3), %%"REG_a" \n\t"
316 ASMALIGN(3)
317 "1: \n\t"
318 "movq (%1), %%mm0 \n\t"
319 "movq (%1, %3), %%mm1 \n\t"
320 "movq %%mm0, (%2) \n\t"
321 "movq %%mm1, (%2, %3) \n\t"
322 "add %%"REG_a", %1 \n\t"
323 "add %%"REG_a", %2 \n\t"
324 "movq (%1), %%mm0 \n\t"
325 "movq (%1, %3), %%mm1 \n\t"
326 "movq %%mm0, (%2) \n\t"
327 "movq %%mm1, (%2, %3) \n\t"
328 "add %%"REG_a", %1 \n\t"
329 "add %%"REG_a", %2 \n\t"
330 "subl $4, %0 \n\t"
331 "jnz 1b \n\t"
332 : "+g"(h), "+r" (pixels), "+r" (block)
333 : "r"((x86_reg)line_size)
334 : "%"REG_a, "memory"
335 );
336 }
/**
 * Copy h rows of 16 bytes from pixels to block using SSE2.
 *
 * Loads are unaligned (movdqu) but stores use movdqa, which faults on
 * addresses that are not 16-byte aligned, so every destination row must be
 * 16-byte aligned. Four rows are copied per iteration; h is expected to be
 * a positive multiple of 4 because the loop ends only when it hits zero.
 *
 * @param block     destination of the first row (16-byte aligned rows)
 * @param pixels    source of the first row (any alignment)
 * @param line_size row stride in bytes for both buffers
 * @param h         number of rows to copy
 */
338 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
339 {
340 __asm__ volatile(
341 "1: \n\t"
342 "movdqu (%1), %%xmm0 \n\t"
343 "movdqu (%1,%3), %%xmm1 \n\t"
344 "movdqu (%1,%3,2), %%xmm2 \n\t"
345 "movdqu (%1,%4), %%xmm3 \n\t"
346 "movdqa %%xmm0, (%2) \n\t"
347 "movdqa %%xmm1, (%2,%3) \n\t"
348 "movdqa %%xmm2, (%2,%3,2) \n\t"
349 "movdqa %%xmm3, (%2,%4) \n\t"
/* the flags set by subl are consumed by jnz below; lea does not modify flags */
350 "subl $4, %0 \n\t"
351 "lea (%1,%3,4), %1 \n\t"
352 "lea (%2,%3,4), %2 \n\t"
353 "jnz 1b \n\t"
354 : "+g"(h), "+r" (pixels), "+r" (block)
355 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
356 : "memory"
357 );
358 }
/**
 * Average h rows of 16 bytes from pixels into block using SSE2.
 *
 * Each destination byte becomes the rounded-up average (pavgb) of the
 * source byte and the previous destination byte. Source loads are
 * unaligned; the destination is accessed with pavgb memory operands and
 * movdqa, both of which require 16-byte aligned addresses. Four rows are
 * handled per iteration; h is expected to be a positive multiple of 4.
 *
 * @param block     destination rows, read and written (16-byte aligned rows)
 * @param pixels    source of the first row (any alignment)
 * @param line_size row stride in bytes for both buffers
 * @param h         number of rows to process
 */
360 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
361 {
362 __asm__ volatile(
363 "1: \n\t"
364 "movdqu (%1), %%xmm0 \n\t"
365 "movdqu (%1,%3), %%xmm1 \n\t"
366 "movdqu (%1,%3,2), %%xmm2 \n\t"
367 "movdqu (%1,%4), %%xmm3 \n\t"
368 "pavgb (%2), %%xmm0 \n\t"
369 "pavgb (%2,%3), %%xmm1 \n\t"
370 "pavgb (%2,%3,2), %%xmm2 \n\t"
371 "pavgb (%2,%4), %%xmm3 \n\t"
372 "movdqa %%xmm0, (%2) \n\t"
373 "movdqa %%xmm1, (%2,%3) \n\t"
374 "movdqa %%xmm2, (%2,%3,2) \n\t"
375 "movdqa %%xmm3, (%2,%4) \n\t"
/* the flags set by subl are consumed by jnz below; lea does not modify flags */
376 "subl $4, %0 \n\t"
377 "lea (%1,%3,4), %1 \n\t"
378 "lea (%2,%3,4), %2 \n\t"
379 "jnz 1b \n\t"
380 : "+g"(h), "+r" (pixels), "+r" (block)
381 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
382 : "memory"
383 );
384 }
/**
 * Zero 128 bytes starting at block with eight 16-byte SSE stores.
 *
 * movaps requires 16-byte aligned addresses, so block must be 16-byte
 * aligned. xmm0 is overwritten.
 *
 * @param block start of the 128-byte area to clear
 */
386 static void clear_block_sse(DCTELEM *block)
387 {
388 __asm__ volatile(
389 "xorps %%xmm0, %%xmm0 \n"
390 "movaps %%xmm0, (%0) \n"
391 "movaps %%xmm0, 16(%0) \n"
392 "movaps %%xmm0, 32(%0) \n"
393 "movaps %%xmm0, 48(%0) \n"
394 "movaps %%xmm0, 64(%0) \n"
395 "movaps %%xmm0, 80(%0) \n"
396 "movaps %%xmm0, 96(%0) \n"
397 "movaps %%xmm0, 112(%0) \n"
398 :: "r"(block)
399 : "memory"
400 );
401 }
403 static void clear_blocks_sse(DCTELEM *blocks)
404 {\
405 __asm__ volatile(
406 "xorps %%xmm0, %%xmm0 \n"
407 "mov %1, %%"REG_a" \n"
408 "1: \n"
409 "movaps %%xmm0, (%0, %%"REG_a") \n"
410 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
411 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
412 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
413 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
414 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
415 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
416 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
417 "add $128, %%"REG_a" \n"
418 " js 1b \n"
419 : : "r" (((uint8_t *)blocks)+128*6),
420 "i" (-128*6)
421 : "%"REG_a
422 );
423 }
/**
 * Transpose a 4x4 block of bytes.
 *
 * Four bytes are read from each of four source rows; after the byte and
 * word interleaves, destination row i holds byte i of source rows 0..3.
 * Uses mm0-mm3 as scratch.
 *
 * @param dst        destination of the first output row
 * @param src        source of the first input row
 * @param dst_stride distance in bytes between destination rows
 * @param src_stride distance in bytes between source rows
 */
425 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
426 __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
427 "movd %4, %%mm0 \n\t"
428 "movd %5, %%mm1 \n\t"
429 "movd %6, %%mm2 \n\t"
430 "movd %7, %%mm3 \n\t"
431 "punpcklbw %%mm1, %%mm0 \n\t"
432 "punpcklbw %%mm3, %%mm2 \n\t"
433 "movq %%mm0, %%mm1 \n\t"
434 "punpcklwd %%mm2, %%mm0 \n\t"
435 "punpckhwd %%mm2, %%mm1 \n\t"
436 "movd %%mm0, %0 \n\t"
437 "punpckhdq %%mm0, %%mm0 \n\t"
438 "movd %%mm0, %1 \n\t"
439 "movd %%mm1, %2 \n\t"
440 "punpckhdq %%mm1, %%mm1 \n\t"
441 "movd %%mm1, %3 \n\t"
443 : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
444 "=m" (*(uint32_t*)(dst + 1*dst_stride)),
445 "=m" (*(uint32_t*)(dst + 2*dst_stride)),
446 "=m" (*(uint32_t*)(dst + 3*dst_stride))
447 : "m" (*(uint32_t*)(src + 0*src_stride)),
448 "m" (*(uint32_t*)(src + 1*src_stride)),
449 "m" (*(uint32_t*)(src + 2*src_stride)),
450 "m" (*(uint32_t*)(src + 3*src_stride))
451 );
452 }
/*
 * QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)
 *
 * Generates 32 static wrapper functions named
 *   OPNAME qpel8_mcXY_MMX  and  OPNAME qpel16_mcXY_MMX   (X, Y in 0..3),
 * each taking (uint8_t *dst, uint8_t *src, int stride).
 * The XY suffix presumably encodes the quarter-pel x/y offset -- confirm.
 *
 * Every wrapper is built from three helper families selected by token
 * pasting: <op>pixelsN_MMX, <op>pixelsN_l2_MMX and
 * <op>mpeg4_qpelN_{h,v}_lowpass_MMX, where <op> is either OPNAME (final
 * write to dst) or "put" RND (intermediate results in a stack buffer).
 * Those helpers are not defined in the visible part of this file.
 *
 * Intermediate buffers are uint64_t arrays on the stack. In the variants
 * that use both, halfHV sits at the start of the buffer and halfH follows
 * it at byte offset 64 (8x8) or 256 (16x16); the horizontal pass is run
 * for 9 (resp. 17) rows.
 *
 * ROUNDER and OP are accepted but not referenced in this macro body.
 */
454 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
455 \
456 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
457 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
458 }\
459 \
460 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
461 uint64_t temp[8];\
462 uint8_t * const half= (uint8_t*)temp;\
463 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
464 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
465 }\
466 \
467 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
468 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
469 }\
470 \
471 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
472 uint64_t temp[8];\
473 uint8_t * const half= (uint8_t*)temp;\
474 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
475 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
476 }\
477 \
478 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
479 uint64_t temp[8];\
480 uint8_t * const half= (uint8_t*)temp;\
481 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
482 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
483 }\
484 \
485 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
486 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
487 }\
488 \
489 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
490 uint64_t temp[8];\
491 uint8_t * const half= (uint8_t*)temp;\
492 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
493 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
494 }\
495 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
496 uint64_t half[8 + 9];\
497 uint8_t * const halfH= ((uint8_t*)half) + 64;\
498 uint8_t * const halfHV= ((uint8_t*)half);\
499 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
500 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
501 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
502 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
503 }\
504 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
505 uint64_t half[8 + 9];\
506 uint8_t * const halfH= ((uint8_t*)half) + 64;\
507 uint8_t * const halfHV= ((uint8_t*)half);\
508 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
509 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
510 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
511 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
512 }\
513 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
514 uint64_t half[8 + 9];\
515 uint8_t * const halfH= ((uint8_t*)half) + 64;\
516 uint8_t * const halfHV= ((uint8_t*)half);\
517 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
518 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
519 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
520 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
521 }\
522 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
523 uint64_t half[8 + 9];\
524 uint8_t * const halfH= ((uint8_t*)half) + 64;\
525 uint8_t * const halfHV= ((uint8_t*)half);\
526 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
527 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
528 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
529 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
530 }\
531 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
532 uint64_t half[8 + 9];\
533 uint8_t * const halfH= ((uint8_t*)half) + 64;\
534 uint8_t * const halfHV= ((uint8_t*)half);\
535 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
536 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
537 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
538 }\
539 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
540 uint64_t half[8 + 9];\
541 uint8_t * const halfH= ((uint8_t*)half) + 64;\
542 uint8_t * const halfHV= ((uint8_t*)half);\
543 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
544 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
545 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
546 }\
547 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
548 uint64_t half[8 + 9];\
549 uint8_t * const halfH= ((uint8_t*)half);\
550 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
551 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
552 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
553 }\
554 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
555 uint64_t half[8 + 9];\
556 uint8_t * const halfH= ((uint8_t*)half);\
557 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
558 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
559 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
560 }\
561 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
562 uint64_t half[9];\
563 uint8_t * const halfH= ((uint8_t*)half);\
564 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
565 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
566 }\
567 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
568 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
569 }\
570 \
571 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
572 uint64_t temp[32];\
573 uint8_t * const half= (uint8_t*)temp;\
574 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
575 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
576 }\
577 \
578 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
579 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
580 }\
581 \
582 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
583 uint64_t temp[32];\
584 uint8_t * const half= (uint8_t*)temp;\
585 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
586 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
587 }\
588 \
589 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
590 uint64_t temp[32];\
591 uint8_t * const half= (uint8_t*)temp;\
592 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
593 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
594 }\
595 \
596 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
597 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
598 }\
599 \
600 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
601 uint64_t temp[32];\
602 uint8_t * const half= (uint8_t*)temp;\
603 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
604 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
605 }\
606 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
607 uint64_t half[16*2 + 17*2];\
608 uint8_t * const halfH= ((uint8_t*)half) + 256;\
609 uint8_t * const halfHV= ((uint8_t*)half);\
610 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
611 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
612 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
613 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
614 }\
615 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
616 uint64_t half[16*2 + 17*2];\
617 uint8_t * const halfH= ((uint8_t*)half) + 256;\
618 uint8_t * const halfHV= ((uint8_t*)half);\
619 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
620 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
621 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
622 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
623 }\
624 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
625 uint64_t half[16*2 + 17*2];\
626 uint8_t * const halfH= ((uint8_t*)half) + 256;\
627 uint8_t * const halfHV= ((uint8_t*)half);\
628 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
629 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
630 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
631 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
632 }\
633 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
634 uint64_t half[16*2 + 17*2];\
635 uint8_t * const halfH= ((uint8_t*)half) + 256;\
636 uint8_t * const halfHV= ((uint8_t*)half);\
637 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
638 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
639 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
640 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
641 }\
642 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
643 uint64_t half[16*2 + 17*2];\
644 uint8_t * const halfH= ((uint8_t*)half) + 256;\
645 uint8_t * const halfHV= ((uint8_t*)half);\
646 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
647 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
648 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
649 }\
650 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
651 uint64_t half[16*2 + 17*2];\
652 uint8_t * const halfH= ((uint8_t*)half) + 256;\
653 uint8_t * const halfHV= ((uint8_t*)half);\
654 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
655 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
656 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
657 }\
658 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
659 uint64_t half[17*2];\
660 uint8_t * const halfH= ((uint8_t*)half);\
661 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
662 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
663 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
664 }\
665 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
666 uint64_t half[17*2];\
667 uint8_t * const halfH= ((uint8_t*)half);\
668 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
669 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
670 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
671 }\
672 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
673 uint64_t half[17*2];\
674 uint8_t * const halfH= ((uint8_t*)half);\
675 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
676 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
677 }
/*
 * Asm template fragments for the final store of a result `a` to `b`.
 * `size` is pasted onto "mov" to select the move width (e.g. q or d).
 * PUT_OP stores a to b directly (temp is unused). The AVG variants load b
 * into temp, average it into a with pavgusb (3DNow!) or pavgb (MMX2), and
 * store a back to b.
 */
679 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
680 #define AVG_3DNOW_OP(a,b,temp, size) \
681 "mov" #size " " #b ", " #temp " \n\t"\
682 "pavgusb " #temp ", " #a " \n\t"\
683 "mov" #size " " #a ", " #b " \n\t"
684 #define AVG_MMX2_OP(a,b,temp, size) \
685 "mov" #size " " #b ", " #temp " \n\t"\
686 "pavgb " #temp ", " #a " \n\t"\
687 "mov" #size " " #a ", " #b " \n\t"
/*
 * PREFETCH(name, op) defines
 *   static void name(void *mem, int stride, int h)
 * which issues the instruction `op` on the first byte of each of h rows,
 * starting at mem and advancing by stride bytes per row. The do/while body
 * runs before h is tested, so h must be at least 1.
 * Instantiated once below as prefetch_mmx2 using prefetcht0.
 */
689 #define PREFETCH(name, op) \
690 static void name(void *mem, int stride, int h){\
691 const uint8_t *p= mem;\
692 do{\
693 __asm__ volatile(#op" %0" :: "m"(*p));\
694 p+= stride;\
695 }while(--h);\
696 }
697 PREFETCH(prefetch_mmx2, prefetcht0)
698 #undef PREFETCH
700 #include "h264dsp_mmx.c"
/* Prototypes for luma deblocking routines implemented outside this file.
 * They are not referenced by the code visible in this file. */
702 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
703 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
704 void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
705 void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
706 void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
708 void dsputil_init_mmx(DSPContext* c)
709 {
710 mm_flags = mm_support();
712 if (mm_flags & FF_MM_MMX) {
713 c->clear_block = clear_block_sse;
714 c->clear_blocks = clear_blocks_sse;
715 c->prefetch = prefetch_mmx2;
718 #define H264_QPEL_FUNCS(x, y, CPU)\
719 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
720 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
721 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
722 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
724 if((mm_flags & FF_MM_SSE2)){
725 c->put_pixels_tab[0][0] = put_pixels16_sse2;
726 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
728 }
729 if(mm_flags & FF_MM_SSE2){
730 H264_QPEL_FUNCS(0, 1, sse2);
731 H264_QPEL_FUNCS(0, 2, sse2);
732 H264_QPEL_FUNCS(0, 3, sse2);
733 H264_QPEL_FUNCS(1, 1, sse2);
734 H264_QPEL_FUNCS(1, 2, sse2);
735 H264_QPEL_FUNCS(1, 3, sse2);
736 H264_QPEL_FUNCS(2, 1, sse2);
737 H264_QPEL_FUNCS(2, 2, sse2);
738 H264_QPEL_FUNCS(2, 3, sse2);
739 H264_QPEL_FUNCS(3, 1, sse2);
740 H264_QPEL_FUNCS(3, 2, sse2);
741 H264_QPEL_FUNCS(3, 3, sse2);
742 }
743 #if HAVE_SSSE3
744 if(mm_flags & FF_MM_SSSE3){
745 H264_QPEL_FUNCS(1, 0, ssse3);
746 H264_QPEL_FUNCS(1, 1, ssse3);
747 H264_QPEL_FUNCS(1, 2, ssse3);
748 H264_QPEL_FUNCS(1, 3, ssse3);
749 H264_QPEL_FUNCS(2, 0, ssse3);
750 H264_QPEL_FUNCS(2, 1, ssse3);
751 H264_QPEL_FUNCS(2, 2, ssse3);
752 H264_QPEL_FUNCS(2, 3, ssse3);
753 H264_QPEL_FUNCS(3, 0, ssse3);
754 H264_QPEL_FUNCS(3, 1, ssse3);
755 H264_QPEL_FUNCS(3, 2, ssse3);
756 H264_QPEL_FUNCS(3, 3, ssse3);
758 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd;
759 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd;
760 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
761 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
762 }
763 #endif
766 }
767 }
769 void ff_h264dsp_init_x86(H264DSPContext *c)
770 {
771 mm_flags = mm_support();
773 if (mm_flags & FF_MM_MMX) {
774 c->h264_idct_dc_add=
775 c->h264_idct_add= ff_h264_idct_add_mmx;
776 c->h264_idct8_dc_add=
777 c->h264_idct8_add= ff_h264_idct8_add_mmx;
779 if (mm_flags & FF_MM_MMX2) {
780 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
781 c->h264_idct_add8 = ff_h264_idct_add8_mmx2;
782 c->h264_idct_add16 = ff_h264_idct_add16_mmx2;
783 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
785 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
786 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2;
788 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
789 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
790 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
791 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
792 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
793 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
794 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
796 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
797 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
798 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
799 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
800 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
801 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
802 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
803 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
805 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
806 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
807 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
808 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
809 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
810 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
811 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
812 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
813 }
814 if(mm_flags & FF_MM_SSE2){
815 c->h264_idct8_add = ff_h264_idct8_add_sse2;
816 c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
817 }
819 }
820 }
