nengel@2: /* nengel@2: * MMX optimized DSP utils nengel@2: * Copyright (c) 2000, 2001 Fabrice Bellard nengel@2: * Copyright (c) 2002-2004 Michael Niedermayer nengel@2: * nengel@2: * This file is part of FFmpeg. nengel@2: * nengel@2: * FFmpeg is free software; you can redistribute it and/or nengel@2: * modify it under the terms of the GNU Lesser General Public nengel@2: * License as published by the Free Software Foundation; either nengel@2: * version 2.1 of the License, or (at your option) any later version. nengel@2: * nengel@2: * FFmpeg is distributed in the hope that it will be useful, nengel@2: * but WITHOUT ANY WARRANTY; without even the implied warranty of nengel@2: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU nengel@2: * Lesser General Public License for more details. nengel@2: * nengel@2: * You should have received a copy of the GNU Lesser General Public nengel@2: * License along with FFmpeg; if not, write to the Free Software nengel@2: * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA nengel@2: * nengel@2: * MMX optimization by Nick Kurshev nengel@2: */ nengel@2: nengel@2: #include "libavutil/x86_cpu.h" nengel@2: #include "libavutil/internal.h" nengel@2: #include "libavcodec/dsputil.h" nengel@2: #include "libavcodec/h264_dsp.h" nengel@2: #include "dsputil_mmx.h" nengel@2: nengel@2: nengel@2: //#undef NDEBUG nengel@2: //#include nengel@2: nengel@2: int mm_flags; /* multimedia extension flags */ nengel@2: nengel@2: /* pixel operations */ nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL; nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; nengel@2: nengel@2: DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = nengel@2: {0x8000000080000000ULL, 0x8000000080000000ULL}; nengel@2: nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL; nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL; nengel@2: DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; nengel@2: DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL}; nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL; nengel@2: DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL}; nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL; nengel@2: DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL}; nengel@2: DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL}; nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL; nengel@2: DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL}; nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; nengel@2: nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL; nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL; nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL; nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL; nengel@2: DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; nengel@2: nengel@2: DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; nengel@2: DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; nengel@2: nengel@2: #define ASMALIGN(ZEROBITS) ".align 1 << " #ZEROBITS "\n\t" nengel@2: #define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::) nengel@2: #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::) nengel@2: nengel@2: #define MOVQ_BFE(regd) \ nengel@2: __asm__ volatile ( \ nengel@2: "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ nengel@2: "paddb %%" #regd ", %%" #regd " \n\t" ::) nengel@2: nengel@2: #ifndef PIC nengel@2: #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone)) nengel@2: #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo)) nengel@2: #else nengel@2: // for shared library it's better to use this way for accessing constants nengel@2: // pcmpeqd -> -1 nengel@2: #define MOVQ_BONE(regd) \ nengel@2: __asm__ volatile ( \ nengel@2: "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ nengel@2: "psrlw $15, %%" #regd " \n\t" \ nengel@2: "packuswb %%" #regd ", %%" #regd " \n\t" ::) nengel@2: nengel@2: #define MOVQ_WTWO(regd) \ nengel@2: __asm__ volatile ( \ nengel@2: "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ nengel@2: "psrlw $15, %%" #regd " \n\t" \ nengel@2: "psllw $1, %%" #regd " \n\t"::) nengel@2: nengel@2: #endif nengel@2: nengel@2: // using regr as temporary and for the output result nengel@2: // first argument is unmodifed and second is trashed nengel@2: // regfe is supposed to contain 0xfefefefefefefefe nengel@2: #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ nengel@2: "movq " #rega ", " #regr " \n\t"\ nengel@2: "pand " #regb ", " #regr " \n\t"\ nengel@2: "pxor " #rega ", " #regb " \n\t"\ nengel@2: "pand " #regfe "," #regb " \n\t"\ nengel@2: "psrlq $1, " #regb " \n\t"\ nengel@2: "paddb " #regb ", " #regr " \n\t" nengel@2: nengel@2: #define PAVGB_MMX(rega, regb, regr, regfe) \ nengel@2: "movq " #rega ", " #regr " \n\t"\ nengel@2: "por " #regb ", " #regr " \n\t"\ nengel@2: "pxor " #rega ", " #regb " \n\t"\ nengel@2: "pand " #regfe "," #regb " \n\t"\ nengel@2: "psrlq $1, " #regb " \n\t"\ nengel@2: "psubb " #regb ", " #regr " \n\t" nengel@2: nengel@2: // mm6 is supposed to contain 0xfefefefefefefefe nengel@2: #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ nengel@2: "movq " #rega ", " #regr " \n\t"\ nengel@2: "movq " #regc ", " #regp " \n\t"\ nengel@2: "pand " #regb ", " #regr " \n\t"\ nengel@2: "pand " #regd ", " #regp " \n\t"\ nengel@2: "pxor " #rega ", " #regb " \n\t"\ nengel@2: "pxor " #regc ", " #regd " \n\t"\ nengel@2: "pand %%mm6, " #regb " \n\t"\ nengel@2: "pand %%mm6, " #regd " \n\t"\ nengel@2: "psrlq $1, " #regb " \n\t"\ nengel@2: "psrlq $1, " #regd " \n\t"\ nengel@2: "paddb " #regb ", " #regr " \n\t"\ nengel@2: "paddb " #regd ", " #regp " \n\t" nengel@2: nengel@2: #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ nengel@2: "movq " #rega ", " #regr " \n\t"\ nengel@2: "movq " #regc ", " #regp " \n\t"\ nengel@2: "por " #regb ", " #regr " \n\t"\ nengel@2: "por " #regd ", " #regp " \n\t"\ nengel@2: "pxor " #rega ", " #regb " \n\t"\ nengel@2: "pxor " #regc ", " #regd " \n\t"\ nengel@2: "pand %%mm6, " #regb " \n\t"\ nengel@2: "pand %%mm6, " #regd " \n\t"\ nengel@2: "psrlq $1, " #regd " \n\t"\ nengel@2: "psrlq $1, " #regb " \n\t"\ nengel@2: "psubb " #regb ", " #regr " \n\t"\ nengel@2: "psubb " #regd ", " #regp " \n\t" nengel@2: nengel@2: /***********************************/ nengel@2: /* MMX2 specific */ nengel@2: nengel@2: #define DEF(x) x ## _mmx2 nengel@2: nengel@2: /* Introduced only in MMX2 set */ nengel@2: #define PAVGB "pavgb" nengel@2: #define OP_AVG PAVGB nengel@2: nengel@2: #include "dsputil_mmx_avg_template.c" nengel@2: nengel@2: #undef DEF nengel@2: #undef PAVGB nengel@2: #undef OP_AVG nengel@2: nengel@2: #define put_no_rnd_pixels16_mmx put_pixels16_mmx nengel@2: #define put_no_rnd_pixels8_mmx put_pixels8_mmx nengel@2: #define put_pixels16_mmx2 put_pixels16_mmx nengel@2: #define put_pixels8_mmx2 put_pixels8_mmx nengel@2: #define put_pixels4_mmx2 put_pixels4_mmx nengel@2: #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx nengel@2: #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx nengel@2: #define put_pixels16_3dnow put_pixels16_mmx nengel@2: #define put_pixels8_3dnow put_pixels8_mmx nengel@2: #define put_pixels4_3dnow put_pixels4_mmx nengel@2: #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx nengel@2: #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx nengel@2: nengel@2: /***********************************/ nengel@2: /* standard MMX */ nengel@2: nengel@2: void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) nengel@2: { nengel@2: const DCTELEM *p; nengel@2: uint8_t *pix; nengel@2: nengel@2: /* read the pixels */ nengel@2: p = block; nengel@2: pix = pixels; nengel@2: /* unrolled loop */ nengel@2: __asm__ volatile( nengel@2: "movq %3, %%mm0 \n\t" nengel@2: "movq 8%3, %%mm1 \n\t" nengel@2: "movq 16%3, %%mm2 \n\t" nengel@2: "movq 24%3, %%mm3 \n\t" nengel@2: "movq 32%3, %%mm4 \n\t" nengel@2: "movq 40%3, %%mm5 \n\t" nengel@2: "movq 48%3, %%mm6 \n\t" nengel@2: "movq 56%3, %%mm7 \n\t" nengel@2: "packuswb %%mm1, %%mm0 \n\t" nengel@2: "packuswb %%mm3, %%mm2 \n\t" nengel@2: "packuswb %%mm5, %%mm4 \n\t" nengel@2: "packuswb %%mm7, %%mm6 \n\t" nengel@2: "movq %%mm0, (%0) \n\t" nengel@2: "movq %%mm2, (%0, %1) \n\t" nengel@2: "movq %%mm4, (%0, %1, 2) \n\t" nengel@2: "movq %%mm6, (%0, %2) \n\t" nengel@2: ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p) nengel@2: :"memory"); nengel@2: pix += line_size*4; nengel@2: p += 32; nengel@2: nengel@2: // if here would be an exact copy of the code above nengel@2: // compiler would generate some very strange code nengel@2: // thus using "r" nengel@2: __asm__ volatile( nengel@2: "movq (%3), %%mm0 \n\t" nengel@2: "movq 8(%3), %%mm1 \n\t" nengel@2: "movq 16(%3), %%mm2 \n\t" nengel@2: "movq 24(%3), %%mm3 \n\t" nengel@2: "movq 32(%3), %%mm4 \n\t" nengel@2: "movq 40(%3), %%mm5 \n\t" nengel@2: "movq 48(%3), %%mm6 \n\t" nengel@2: "movq 56(%3), %%mm7 \n\t" nengel@2: "packuswb %%mm1, %%mm0 \n\t" nengel@2: "packuswb %%mm3, %%mm2 \n\t" nengel@2: "packuswb %%mm5, %%mm4 \n\t" nengel@2: "packuswb %%mm7, %%mm6 \n\t" nengel@2: "movq %%mm0, (%0) \n\t" nengel@2: "movq %%mm2, (%0, %1) \n\t" nengel@2: "movq %%mm4, (%0, %1, 2) \n\t" nengel@2: "movq %%mm6, (%0, %2) \n\t" nengel@2: ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p) nengel@2: :"memory"); nengel@2: } nengel@2: nengel@2: DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] = nengel@2: { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; nengel@2: nengel@2: #define put_signed_pixels_clamped_mmx_half(off) \ nengel@2: "movq "#off"(%2), %%mm1 \n\t"\ nengel@2: "movq 16+"#off"(%2), %%mm2 \n\t"\ nengel@2: "movq 32+"#off"(%2), %%mm3 \n\t"\ nengel@2: "movq 48+"#off"(%2), %%mm4 \n\t"\ nengel@2: "packsswb 8+"#off"(%2), %%mm1 \n\t"\ nengel@2: "packsswb 24+"#off"(%2), %%mm2 \n\t"\ nengel@2: "packsswb 40+"#off"(%2), %%mm3 \n\t"\ nengel@2: "packsswb 56+"#off"(%2), %%mm4 \n\t"\ nengel@2: "paddb %%mm0, %%mm1 \n\t"\ nengel@2: "paddb %%mm0, %%mm2 \n\t"\ nengel@2: "paddb %%mm0, %%mm3 \n\t"\ nengel@2: "paddb %%mm0, %%mm4 \n\t"\ nengel@2: "movq %%mm1, (%0) \n\t"\ nengel@2: "movq %%mm2, (%0, %3) \n\t"\ nengel@2: "movq %%mm3, (%0, %3, 2) \n\t"\ nengel@2: "movq %%mm4, (%0, %1) \n\t" nengel@2: nengel@2: void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) nengel@2: { nengel@2: x86_reg line_skip = line_size; nengel@2: x86_reg line_skip3; nengel@2: nengel@2: __asm__ volatile ( nengel@2: "movq "MANGLE(ff_vector128)", %%mm0 \n\t" nengel@2: "lea (%3, %3, 2), %1 \n\t" nengel@2: put_signed_pixels_clamped_mmx_half(0) nengel@2: "lea (%0, %3, 4), %0 \n\t" nengel@2: put_signed_pixels_clamped_mmx_half(64) nengel@2: :"+&r" (pixels), "=&r" (line_skip3) nengel@2: :"r" (block), "r"(line_skip) nengel@2: :"memory"); nengel@2: } nengel@2: nengel@2: void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) nengel@2: { nengel@2: const DCTELEM *p; nengel@2: uint8_t *pix; nengel@2: int i; nengel@2: nengel@2: /* read the pixels */ nengel@2: p = block; nengel@2: pix = pixels; nengel@2: MOVQ_ZERO(mm7); nengel@2: i = 4; nengel@2: do { nengel@2: __asm__ volatile( nengel@2: "movq (%2), %%mm0 \n\t" nengel@2: "movq 8(%2), %%mm1 \n\t" nengel@2: "movq 16(%2), %%mm2 \n\t" nengel@2: "movq 24(%2), %%mm3 \n\t" nengel@2: "movq %0, %%mm4 \n\t" nengel@2: "movq %1, %%mm6 \n\t" nengel@2: "movq %%mm4, %%mm5 \n\t" nengel@2: "punpcklbw %%mm7, %%mm4 \n\t" nengel@2: "punpckhbw %%mm7, %%mm5 \n\t" nengel@2: "paddsw %%mm4, %%mm0 \n\t" nengel@2: "paddsw %%mm5, %%mm1 \n\t" nengel@2: "movq %%mm6, %%mm5 \n\t" nengel@2: "punpcklbw %%mm7, %%mm6 \n\t" nengel@2: "punpckhbw %%mm7, %%mm5 \n\t" nengel@2: "paddsw %%mm6, %%mm2 \n\t" nengel@2: "paddsw %%mm5, %%mm3 \n\t" nengel@2: "packuswb %%mm1, %%mm0 \n\t" nengel@2: "packuswb %%mm3, %%mm2 \n\t" nengel@2: "movq %%mm0, %0 \n\t" nengel@2: "movq %%mm2, %1 \n\t" nengel@2: :"+m"(*pix), "+m"(*(pix+line_size)) nengel@2: :"r"(p) nengel@2: :"memory"); nengel@2: pix += line_size*2; nengel@2: p += 16; nengel@2: } while (--i); nengel@2: } nengel@2: nengel@2: static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) nengel@2: { nengel@2: __asm__ volatile( nengel@2: "lea (%3, %3), %%"REG_a" \n\t" nengel@2: ASMALIGN(3) nengel@2: "1: \n\t" nengel@2: "movq (%1), %%mm0 \n\t" nengel@2: "movq (%1, %3), %%mm1 \n\t" nengel@2: "movq %%mm0, (%2) \n\t" nengel@2: "movq %%mm1, (%2, %3) \n\t" nengel@2: "add %%"REG_a", %1 \n\t" nengel@2: "add %%"REG_a", %2 \n\t" nengel@2: "movq (%1), %%mm0 \n\t" nengel@2: "movq (%1, %3), %%mm1 \n\t" nengel@2: "movq %%mm0, (%2) \n\t" nengel@2: "movq %%mm1, (%2, %3) \n\t" nengel@2: "add %%"REG_a", %1 \n\t" nengel@2: "add %%"REG_a", %2 \n\t" nengel@2: "subl $4, %0 \n\t" nengel@2: "jnz 1b \n\t" nengel@2: : "+g"(h), "+r" (pixels), "+r" (block) nengel@2: : "r"((x86_reg)line_size) nengel@2: : "%"REG_a, "memory" nengel@2: ); nengel@2: } nengel@2: nengel@2: static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) nengel@2: { nengel@2: __asm__ volatile( nengel@2: "1: \n\t" nengel@2: "movdqu (%1), %%xmm0 \n\t" nengel@2: "movdqu (%1,%3), %%xmm1 \n\t" nengel@2: "movdqu (%1,%3,2), %%xmm2 \n\t" nengel@2: "movdqu (%1,%4), %%xmm3 \n\t" nengel@2: "movdqa %%xmm0, (%2) \n\t" nengel@2: "movdqa %%xmm1, (%2,%3) \n\t" nengel@2: "movdqa %%xmm2, (%2,%3,2) \n\t" nengel@2: "movdqa %%xmm3, (%2,%4) \n\t" nengel@2: "subl $4, %0 \n\t" nengel@2: "lea (%1,%3,4), %1 \n\t" nengel@2: "lea (%2,%3,4), %2 \n\t" nengel@2: "jnz 1b \n\t" nengel@2: : "+g"(h), "+r" (pixels), "+r" (block) nengel@2: : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) nengel@2: : "memory" nengel@2: ); nengel@2: } nengel@2: nengel@2: static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) nengel@2: { nengel@2: __asm__ volatile( nengel@2: "1: \n\t" nengel@2: "movdqu (%1), %%xmm0 \n\t" nengel@2: "movdqu (%1,%3), %%xmm1 \n\t" nengel@2: "movdqu (%1,%3,2), %%xmm2 \n\t" nengel@2: "movdqu (%1,%4), %%xmm3 \n\t" nengel@2: "pavgb (%2), %%xmm0 \n\t" nengel@2: "pavgb (%2,%3), %%xmm1 \n\t" nengel@2: "pavgb (%2,%3,2), %%xmm2 \n\t" nengel@2: "pavgb (%2,%4), %%xmm3 \n\t" nengel@2: "movdqa %%xmm0, (%2) \n\t" nengel@2: "movdqa %%xmm1, (%2,%3) \n\t" nengel@2: "movdqa %%xmm2, (%2,%3,2) \n\t" nengel@2: "movdqa %%xmm3, (%2,%4) \n\t" nengel@2: "subl $4, %0 \n\t" nengel@2: "lea (%1,%3,4), %1 \n\t" nengel@2: "lea (%2,%3,4), %2 \n\t" nengel@2: "jnz 1b \n\t" nengel@2: : "+g"(h), "+r" (pixels), "+r" (block) nengel@2: : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) nengel@2: : "memory" nengel@2: ); nengel@2: } nengel@2: nengel@2: static void clear_block_sse(DCTELEM *block) nengel@2: { nengel@2: __asm__ volatile( nengel@2: "xorps %%xmm0, %%xmm0 \n" nengel@2: "movaps %%xmm0, (%0) \n" nengel@2: "movaps %%xmm0, 16(%0) \n" nengel@2: "movaps %%xmm0, 32(%0) \n" nengel@2: "movaps %%xmm0, 48(%0) \n" nengel@2: "movaps %%xmm0, 64(%0) \n" nengel@2: "movaps %%xmm0, 80(%0) \n" nengel@2: "movaps %%xmm0, 96(%0) \n" nengel@2: "movaps %%xmm0, 112(%0) \n" nengel@2: :: "r"(block) nengel@2: : "memory" nengel@2: ); nengel@2: } nengel@2: nengel@2: static void clear_blocks_sse(DCTELEM *blocks) nengel@2: {\ nengel@2: __asm__ volatile( nengel@2: "xorps %%xmm0, %%xmm0 \n" nengel@2: "mov %1, %%"REG_a" \n" nengel@2: "1: \n" nengel@2: "movaps %%xmm0, (%0, %%"REG_a") \n" nengel@2: "movaps %%xmm0, 16(%0, %%"REG_a") \n" nengel@2: "movaps %%xmm0, 32(%0, %%"REG_a") \n" nengel@2: "movaps %%xmm0, 48(%0, %%"REG_a") \n" nengel@2: "movaps %%xmm0, 64(%0, %%"REG_a") \n" nengel@2: "movaps %%xmm0, 80(%0, %%"REG_a") \n" nengel@2: "movaps %%xmm0, 96(%0, %%"REG_a") \n" nengel@2: "movaps %%xmm0, 112(%0, %%"REG_a") \n" nengel@2: "add $128, %%"REG_a" \n" nengel@2: " js 1b \n" nengel@2: : : "r" (((uint8_t *)blocks)+128*6), nengel@2: "i" (-128*6) nengel@2: : "%"REG_a nengel@2: ); nengel@2: } nengel@2: nengel@2: static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ nengel@2: __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ... nengel@2: "movd %4, %%mm0 \n\t" nengel@2: "movd %5, %%mm1 \n\t" nengel@2: "movd %6, %%mm2 \n\t" nengel@2: "movd %7, %%mm3 \n\t" nengel@2: "punpcklbw %%mm1, %%mm0 \n\t" nengel@2: "punpcklbw %%mm3, %%mm2 \n\t" nengel@2: "movq %%mm0, %%mm1 \n\t" nengel@2: "punpcklwd %%mm2, %%mm0 \n\t" nengel@2: "punpckhwd %%mm2, %%mm1 \n\t" nengel@2: "movd %%mm0, %0 \n\t" nengel@2: "punpckhdq %%mm0, %%mm0 \n\t" nengel@2: "movd %%mm0, %1 \n\t" nengel@2: "movd %%mm1, %2 \n\t" nengel@2: "punpckhdq %%mm1, %%mm1 \n\t" nengel@2: "movd %%mm1, %3 \n\t" nengel@2: nengel@2: : "=m" (*(uint32_t*)(dst + 0*dst_stride)), nengel@2: "=m" (*(uint32_t*)(dst + 1*dst_stride)), nengel@2: "=m" (*(uint32_t*)(dst + 2*dst_stride)), nengel@2: "=m" (*(uint32_t*)(dst + 3*dst_stride)) nengel@2: : "m" (*(uint32_t*)(src + 0*src_stride)), nengel@2: "m" (*(uint32_t*)(src + 1*src_stride)), nengel@2: "m" (*(uint32_t*)(src + 2*src_stride)), nengel@2: "m" (*(uint32_t*)(src + 3*src_stride)) nengel@2: ); nengel@2: } nengel@2: nengel@2: #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ nengel@2: \ nengel@2: static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ nengel@2: OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\ nengel@2: }\ nengel@2: \ nengel@2: static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t temp[8];\ nengel@2: uint8_t * const half= (uint8_t*)temp;\ nengel@2: put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ nengel@2: OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ nengel@2: }\ nengel@2: \ nengel@2: static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ nengel@2: }\ nengel@2: \ nengel@2: static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t temp[8];\ nengel@2: uint8_t * const half= (uint8_t*)temp;\ nengel@2: put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ nengel@2: OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\ nengel@2: }\ nengel@2: \ nengel@2: static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t temp[8];\ nengel@2: uint8_t * const half= (uint8_t*)temp;\ nengel@2: put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ nengel@2: OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ nengel@2: }\ nengel@2: \ nengel@2: static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ nengel@2: }\ nengel@2: \ nengel@2: static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t temp[8];\ nengel@2: uint8_t * const half= (uint8_t*)temp;\ nengel@2: put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ nengel@2: OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[8 + 9];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half) + 64;\ nengel@2: uint8_t * const halfHV= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ nengel@2: put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ nengel@2: put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ nengel@2: OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[8 + 9];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half) + 64;\ nengel@2: uint8_t * const halfHV= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ nengel@2: put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ nengel@2: put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ nengel@2: OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[8 + 9];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half) + 64;\ nengel@2: uint8_t * const halfHV= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ nengel@2: put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ nengel@2: put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ nengel@2: OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[8 + 9];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half) + 64;\ nengel@2: uint8_t * const halfHV= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ nengel@2: put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ nengel@2: put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ nengel@2: OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[8 + 9];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half) + 64;\ nengel@2: uint8_t * const halfHV= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ nengel@2: put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ nengel@2: OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[8 + 9];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half) + 64;\ nengel@2: uint8_t * const halfHV= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ nengel@2: put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ nengel@2: OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[8 + 9];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ nengel@2: put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ nengel@2: OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[8 + 9];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ nengel@2: put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ nengel@2: OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[9];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ nengel@2: OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ nengel@2: OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\ nengel@2: }\ nengel@2: \ nengel@2: static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t temp[32];\ nengel@2: uint8_t * const half= (uint8_t*)temp;\ nengel@2: put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ nengel@2: OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ nengel@2: }\ nengel@2: \ nengel@2: static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\ nengel@2: }\ nengel@2: \ nengel@2: static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t temp[32];\ nengel@2: uint8_t * const half= (uint8_t*)temp;\ nengel@2: put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ nengel@2: OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\ nengel@2: }\ nengel@2: \ nengel@2: static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t temp[32];\ nengel@2: uint8_t * const half= (uint8_t*)temp;\ nengel@2: put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ nengel@2: OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ nengel@2: }\ nengel@2: \ nengel@2: static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ nengel@2: }\ nengel@2: \ nengel@2: static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t temp[32];\ nengel@2: uint8_t * const half= (uint8_t*)temp;\ nengel@2: put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ nengel@2: OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[16*2 + 17*2];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half) + 256;\ nengel@2: uint8_t * const halfHV= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ nengel@2: put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ nengel@2: put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ nengel@2: OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[16*2 + 17*2];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half) + 256;\ nengel@2: uint8_t * const halfHV= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ nengel@2: put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ nengel@2: put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ nengel@2: OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[16*2 + 17*2];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half) + 256;\ nengel@2: uint8_t * const halfHV= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ nengel@2: put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ nengel@2: put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ nengel@2: OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[16*2 + 17*2];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half) + 256;\ nengel@2: uint8_t * const halfHV= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ nengel@2: put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ nengel@2: put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ nengel@2: OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[16*2 + 17*2];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half) + 256;\ nengel@2: uint8_t * const halfHV= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ nengel@2: put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ nengel@2: OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[16*2 + 17*2];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half) + 256;\ nengel@2: uint8_t * const halfHV= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ nengel@2: put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ nengel@2: OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[17*2];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ nengel@2: put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ nengel@2: OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[17*2];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ nengel@2: put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ nengel@2: OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ nengel@2: }\ nengel@2: static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ nengel@2: uint64_t half[17*2];\ nengel@2: uint8_t * const halfH= ((uint8_t*)half);\ nengel@2: put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ nengel@2: OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ nengel@2: } nengel@2: nengel@2: #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" nengel@2: #define AVG_3DNOW_OP(a,b,temp, size) \ nengel@2: "mov" #size " " #b ", " #temp " \n\t"\ nengel@2: "pavgusb " #temp ", " #a " \n\t"\ nengel@2: "mov" #size " " #a ", " #b " \n\t" nengel@2: #define AVG_MMX2_OP(a,b,temp, size) \ nengel@2: "mov" #size " " #b ", " #temp " \n\t"\ nengel@2: "pavgb " #temp ", " #a " \n\t"\ nengel@2: "mov" #size " " #a ", " #b " \n\t" nengel@2: nengel@2: #define PREFETCH(name, op) \ nengel@2: static void name(void *mem, int stride, int h){\ nengel@2: const uint8_t *p= mem;\ nengel@2: do{\ nengel@2: __asm__ volatile(#op" %0" :: "m"(*p));\ nengel@2: p+= stride;\ nengel@2: }while(--h);\ nengel@2: } nengel@2: PREFETCH(prefetch_mmx2, prefetcht0) nengel@2: #undef PREFETCH nengel@2: nengel@2: #include "h264dsp_mmx.c" nengel@2: nengel@2: void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); nengel@2: void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); nengel@2: void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); nengel@2: void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); nengel@2: void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); nengel@2: nengel@2: void dsputil_init_mmx(DSPContext* c) nengel@2: { nengel@2: mm_flags = mm_support(); nengel@2: nengel@2: if (mm_flags & FF_MM_MMX) { nengel@2: c->clear_block = clear_block_sse; nengel@2: c->clear_blocks = clear_blocks_sse; nengel@2: c->prefetch = prefetch_mmx2; nengel@2: nengel@2: nengel@2: #define H264_QPEL_FUNCS(x, y, CPU)\ nengel@2: c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\ nengel@2: c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\ nengel@2: c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\ nengel@2: c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; nengel@2: nengel@2: if((mm_flags & FF_MM_SSE2)){ nengel@2: c->put_pixels_tab[0][0] = put_pixels16_sse2; nengel@2: c->avg_pixels_tab[0][0] = avg_pixels16_sse2; nengel@2: nengel@2: } nengel@2: if(mm_flags & FF_MM_SSE2){ nengel@2: H264_QPEL_FUNCS(0, 1, sse2); nengel@2: H264_QPEL_FUNCS(0, 2, sse2); nengel@2: H264_QPEL_FUNCS(0, 3, sse2); nengel@2: H264_QPEL_FUNCS(1, 1, sse2); nengel@2: H264_QPEL_FUNCS(1, 2, sse2); nengel@2: H264_QPEL_FUNCS(1, 3, sse2); nengel@2: H264_QPEL_FUNCS(2, 1, sse2); nengel@2: H264_QPEL_FUNCS(2, 2, sse2); nengel@2: H264_QPEL_FUNCS(2, 3, sse2); nengel@2: H264_QPEL_FUNCS(3, 1, sse2); nengel@2: H264_QPEL_FUNCS(3, 2, sse2); nengel@2: H264_QPEL_FUNCS(3, 3, sse2); nengel@2: } nengel@2: #if HAVE_SSSE3 nengel@2: if(mm_flags & FF_MM_SSSE3){ nengel@2: H264_QPEL_FUNCS(1, 0, ssse3); nengel@2: H264_QPEL_FUNCS(1, 1, ssse3); nengel@2: H264_QPEL_FUNCS(1, 2, ssse3); nengel@2: H264_QPEL_FUNCS(1, 3, ssse3); nengel@2: H264_QPEL_FUNCS(2, 0, ssse3); nengel@2: H264_QPEL_FUNCS(2, 1, ssse3); nengel@2: H264_QPEL_FUNCS(2, 2, ssse3); nengel@2: H264_QPEL_FUNCS(2, 3, ssse3); nengel@2: H264_QPEL_FUNCS(3, 0, ssse3); nengel@2: H264_QPEL_FUNCS(3, 1, ssse3); nengel@2: H264_QPEL_FUNCS(3, 2, ssse3); nengel@2: H264_QPEL_FUNCS(3, 3, ssse3); nengel@2: nengel@2: c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd; nengel@2: c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd; nengel@2: c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3; nengel@2: c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3; nengel@2: } nengel@2: #endif nengel@2: nengel@2: nengel@2: } nengel@2: } nengel@2: nengel@2: void ff_h264dsp_init_x86(H264DSPContext *c) nengel@2: { nengel@2: mm_flags = mm_support(); nengel@2: nengel@2: if (mm_flags & FF_MM_MMX) { nengel@2: c->h264_idct_dc_add= nengel@2: c->h264_idct_add= ff_h264_idct_add_mmx; nengel@2: c->h264_idct8_dc_add= nengel@2: c->h264_idct8_add= ff_h264_idct8_add_mmx; nengel@2: nengel@2: if (mm_flags & FF_MM_MMX2) { nengel@2: c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; nengel@2: c->h264_idct_add8 = ff_h264_idct_add8_mmx2; nengel@2: c->h264_idct_add16 = ff_h264_idct_add16_mmx2; nengel@2: c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; nengel@2: nengel@2: c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; nengel@2: c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; nengel@2: nengel@2: c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; nengel@2: c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; nengel@2: c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; nengel@2: c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; nengel@2: c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; nengel@2: c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; nengel@2: c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; nengel@2: nengel@2: c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; nengel@2: c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; nengel@2: c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; nengel@2: c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; nengel@2: c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; nengel@2: c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; nengel@2: c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; nengel@2: c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; nengel@2: nengel@2: c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; nengel@2: c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; nengel@2: c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; nengel@2: c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; nengel@2: c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; nengel@2: c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; nengel@2: c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; nengel@2: c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; nengel@2: } nengel@2: if(mm_flags & FF_MM_SSE2){ nengel@2: c->h264_idct8_add = ff_h264_idct8_add_sse2; nengel@2: c->h264_idct8_add4= ff_h264_idct8_add4_sse2; nengel@2: } nengel@2: nengel@2: } nengel@2: } nengel@2: