PR/Applications/VSs/VSs__H264__App: libavcodec/x86/dsputil

comparison libavcodec/x86/dsputil_mmx.c @ 3:0b056460c67d

changed code to use VSs

author	Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date	Mon, 29 Oct 2012 16:44:27 +0100
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:9ac291ad5518
+/*
+* MMX optimized DSP utils
+* Copyright (c) 2000, 2001 Fabrice Bellard
+* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+*
+* This file is part of FFmpeg.
+*
+* FFmpeg is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* FFmpeg is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with FFmpeg; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*
+* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+*/
+#include "libavutil/x86_cpu.h"
+#include "libavutil/internal.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/h264_dsp.h"
+#include "dsputil_mmx.h"
+//#undef NDEBUG
+//#include <assert.h>
+int mm_flags; /* multimedia extension flags; assigned from mm_support() by the init functions in this file */
+/* pixel operations
+ *
+ * Packed constants for the SIMD code. Naming, as visible from the values:
+ * ff_pw_<N> repeats the 16-bit word N (N written in decimal), ff_pb_<XX>
+ * repeats the byte 0xXX (XX written in hex). ff_bone and ff_pb_1 hold the same
+ * value (0x01 in every byte); ff_wtwo holds 0x0002 in every 16-bit word.
+ */
+DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
+DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
+{0x8000000080000000ULL, 0x8000000080000000ULL};
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8  ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
+DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; /* two packed doubles, both 1.0 */
+DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; /* two packed doubles, both 2.0 */
+#define ASMALIGN(ZEROBITS) ".align 1 << " #ZEROBITS "\n\t" /* asm text: the directive ".align 1 << ZEROBITS" */
+#define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::) /* ASMALIGN(3) as a standalone asm statement */
+#define MOVQ_ZERO(regd)  __asm__ volatile ("pxor %%" #regd ", %%" #regd ::) /* regd = 0 (register xor-ed with itself) */
+#define MOVQ_BFE(regd) /* regd = 0xFE in every byte: set all bits, then add each byte to itself */ \
+__asm__ volatile ( \
+"pcmpeqd %%" #regd ", %%" #regd " \n\t"\
+"paddb %%" #regd ", %%" #regd " \n\t" ::)
+#ifndef PIC
+#define MOVQ_BONE(regd)  __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone)) /* load ff_bone (0x01 bytes) from memory */
+#define MOVQ_WTWO(regd)  __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo)) /* load ff_wtwo (0x0002 words) from memory */
+#else
+// for a shared library (PIC) it is better to build the constants in the
+// register than to load them from memory; pcmpeqd reg,reg yields all ones (-1)
+#define MOVQ_BONE(regd) /* all ones -> each word >> 15 = 0x0001 -> packed to 0x01 bytes */ \
+__asm__ volatile ( \
+"pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+"psrlw $15, %%" #regd " \n\t" \
+"packuswb %%" #regd ", %%" #regd " \n\t" ::)
+#define MOVQ_WTWO(regd) /* all ones -> each word >> 15 = 0x0001 -> << 1 = 0x0002 */ \
+__asm__ volatile ( \
+"pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+"psrlw $15, %%" #regd " \n\t" \
+"psllw $1, %%" #regd " \n\t"::)
+#endif
+// regr is used as temporary and receives the result;
+// the first argument is unmodified and the second is trashed;
+// regfe is supposed to contain 0xfefefefefefefefe
+#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) /* regr = (a & b) + (((a ^ b) & regfe) >> 1) */ \
+"movq " #rega ", " #regr "  \n\t"\
+"pand " #regb ", " #regr "  \n\t"\
+"pxor " #rega ", " #regb "  \n\t"\
+"pand " #regfe "," #regb "  \n\t"\
+"psrlq $1, " #regb "        \n\t"\
+"paddb " #regb ", " #regr " \n\t"
+#define PAVGB_MMX(rega, regb, regr, regfe) /* regr = (a | b) - (((a ^ b) & regfe) >> 1) */ \
+"movq " #rega ", " #regr "  \n\t"\
+"por  " #regb ", " #regr "  \n\t"\
+"pxor " #rega ", " #regb "  \n\t"\
+"pand " #regfe "," #regb "  \n\t"\
+"psrlq $1, " #regb "        \n\t"\
+"psubb " #regb ", " #regr " \n\t"
+// two-pair variants: (rega, regb) -> regr and (regc, regd) -> regp, interleaved;
+// regb and regd are trashed; mm6 is supposed to contain 0xfefefefefefefefe
+#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) /* same formula as PAVGB_MMX_NO_RND, mask taken from mm6 */ \
+"movq " #rega ", " #regr "  \n\t"\
+"movq " #regc ", " #regp "  \n\t"\
+"pand " #regb ", " #regr "  \n\t"\
+"pand " #regd ", " #regp "  \n\t"\
+"pxor " #rega ", " #regb "  \n\t"\
+"pxor " #regc ", " #regd "  \n\t"\
+"pand %%mm6, " #regb "      \n\t"\
+"pand %%mm6, " #regd "      \n\t"\
+"psrlq $1, " #regb "        \n\t"\
+"psrlq $1, " #regd "        \n\t"\
+"paddb " #regb ", " #regr " \n\t"\
+"paddb " #regd ", " #regp " \n\t"
+#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) /* same formula as PAVGB_MMX, mask taken from mm6 */ \
+"movq " #rega ", " #regr "  \n\t"\
+"movq " #regc ", " #regp "  \n\t"\
+"por  " #regb ", " #regr "  \n\t"\
+"por  " #regd ", " #regp "  \n\t"\
+"pxor " #rega ", " #regb "  \n\t"\
+"pxor " #regc ", " #regd "  \n\t"\
+"pand %%mm6, " #regb "      \n\t"\
+"pand %%mm6, " #regd "      \n\t"\
+"psrlq $1, " #regd "        \n\t"\
+"psrlq $1, " #regb "        \n\t"\
+"psubb " #regb ", " #regr " \n\t"\
+"psubb " #regd ", " #regp " \n\t"
+/***********************************/
+/* MMX2 specific: instantiate the averaging template once with the settings
+ * below; every name the template wraps in DEF() gets the suffix _mmx2 */
+#define DEF(x) x ## _mmx2
+/* Introduced only in MMX2 set */
+#define PAVGB "pavgb"
+#define OP_AVG PAVGB
+#include "dsputil_mmx_avg_template.c"
+#undef DEF
+#undef PAVGB
+#undef OP_AVG
+#define put_no_rnd_pixels16_mmx put_pixels16_mmx /* aliases: the name on the left expands to the one on the right */
+#define put_no_rnd_pixels8_mmx put_pixels8_mmx
+#define put_pixels16_mmx2 put_pixels16_mmx
+#define put_pixels8_mmx2 put_pixels8_mmx
+#define put_pixels4_mmx2 put_pixels4_mmx
+#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
+#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
+#define put_pixels16_3dnow put_pixels16_mmx
+#define put_pixels8_3dnow put_pixels8_mmx
+#define put_pixels4_3dnow put_pixels4_mmx
+#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
+#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
+/***********************************/
+/* standard MMX */
+/***********************************/
+/* standard MMX */
+/* Store an 8x8 block of coefficients as pixels, saturating to unsigned bytes.
+ *
+ * Reads 128 consecutive bytes starting at block, packs each pair of quadwords
+ * with packuswb (signed 16-bit -> unsigned 8-bit with saturation) and writes
+ * 8 rows of 8 bytes to pixels, pixels + line_size, ... The work is done by two
+ * asm statements that handle four rows each.
+ *
+ * block:     source coefficients; the pointer is advanced by 32 elements per
+ *            64 bytes consumed, i.e. the code assumes a 16-bit DCTELEM
+ * pixels:    address of the first destination row
+ * line_size: distance in bytes between destination rows
+ */ void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+{
+const DCTELEM *p;
+uint8_t *pix;
+/* working copies of the source and destination pointers */
+p = block;
+pix = pixels;
+/* unrolled loop, rows 0-3: source addressed through the "m" operand %3 */
+__asm__ volatile(
+"movq   %3, %%mm0               \n\t"
+"movq   8%3, %%mm1              \n\t"
+"movq   16%3, %%mm2             \n\t"
+"movq   24%3, %%mm3             \n\t"
+"movq   32%3, %%mm4             \n\t"
+"movq   40%3, %%mm5             \n\t"
+"movq   48%3, %%mm6             \n\t"
+"movq   56%3, %%mm7             \n\t"
+"packuswb %%mm1, %%mm0          \n\t"
+"packuswb %%mm3, %%mm2          \n\t"
+"packuswb %%mm5, %%mm4          \n\t"
+"packuswb %%mm7, %%mm6          \n\t"
+"movq   %%mm0, (%0)             \n\t"
+"movq   %%mm2, (%0, %1)         \n\t"
+"movq   %%mm4, (%0, %1, 2)      \n\t"
+"movq   %%mm6, (%0, %2)         \n\t" /* %2 = 3 * line_size */
+::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
+:"memory");
+pix += line_size*4; /* next four destination rows */
+p += 32; /* next 32 coefficients */
+// an exact copy of the asm statement above at this point
+// would make the compiler generate some very strange code,
+// thus the source pointer is passed with an "r" constraint here
+__asm__ volatile(
+"movq       (%3), %%mm0             \n\t"
+"movq       8(%3), %%mm1            \n\t"
+"movq       16(%3), %%mm2           \n\t"
+"movq       24(%3), %%mm3           \n\t"
+"movq       32(%3), %%mm4           \n\t"
+"movq       40(%3), %%mm5           \n\t"
+"movq       48(%3), %%mm6           \n\t"
+"movq       56(%3), %%mm7           \n\t"
+"packuswb %%mm1, %%mm0              \n\t"
+"packuswb %%mm3, %%mm2              \n\t"
+"packuswb %%mm5, %%mm4              \n\t"
+"packuswb %%mm7, %%mm6              \n\t"
+"movq       %%mm0, (%0)             \n\t"
+"movq       %%mm2, (%0, %1)         \n\t"
+"movq       %%mm4, (%0, %1, 2)      \n\t"
+"movq       %%mm6, (%0, %2)         \n\t"
+::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
+:"memory");
+}
+/* Eight bytes of 0x80; added to packed signed bytes by the code below. */ DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] =
+{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+#define put_signed_pixels_clamped_mmx_half(off) /* asm text for four rows: read 64 bytes at off(%2), pack words to signed bytes with saturation, add mm0, store at %0 + {0, %3, 2*%3, %1} */ \
+"movq    "#off"(%2), %%mm1          \n\t"\
+"movq 16+"#off"(%2), %%mm2          \n\t"\
+"movq 32+"#off"(%2), %%mm3          \n\t"\
+"movq 48+"#off"(%2), %%mm4          \n\t"\
+"packsswb  8+"#off"(%2), %%mm1      \n\t"\
+"packsswb 24+"#off"(%2), %%mm2      \n\t"\
+"packsswb 40+"#off"(%2), %%mm3      \n\t"\
+"packsswb 56+"#off"(%2), %%mm4      \n\t"\
+"paddb %%mm0, %%mm1                 \n\t"\
+"paddb %%mm0, %%mm2                 \n\t"\
+"paddb %%mm0, %%mm3                 \n\t"\
+"paddb %%mm0, %%mm4                 \n\t"\
+"movq %%mm1, (%0)                   \n\t"\
+"movq %%mm2, (%0, %3)               \n\t"\
+"movq %%mm3, (%0, %3, 2)            \n\t"\
+"movq %%mm4, (%0, %1)               \n\t"
+/* Store an 8x8 block of signed coefficients as pixels biased by 0x80.
+ *
+ * Each 16-bit coefficient is saturated to a signed byte (packsswb) and 0x80
+ * is added to it (paddb with ff_vector128), then 8 rows of 8 bytes are written
+ * starting at pixels with a row distance of line_size bytes.
+ *
+ * Operands: %0 = pixels (advanced by 4 rows between the halves),
+ * %1 = line_skip3 (computed inside the asm as 3 * line_size),
+ * %2 = block, %3 = line_size.
+ */ void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+{
+x86_reg line_skip = line_size;
+x86_reg line_skip3; /* written by the lea below, never initialized in C */
+__asm__ volatile (
+"movq "MANGLE(ff_vector128)", %%mm0 \n\t"
+"lea (%3, %3, 2), %1                \n\t" /* %1 = 3 * line_size */
+put_signed_pixels_clamped_mmx_half(0)
+"lea (%0, %3, 4), %0                \n\t" /* pixels += 4 * line_size */
+put_signed_pixels_clamped_mmx_half(64)
+:"+&r" (pixels), "=&r" (line_skip3)
+:"r" (block), "r"(line_skip)
+:"memory");
+}
+/* Add an 8x8 block of coefficients to existing pixels, saturating to 0..255.
+ *
+ * Four iterations, two rows each: the 8 pixels of a row are widened to 16-bit
+ * words (unpack against the zero register mm7), added to the coefficients with
+ * signed saturation (paddsw), packed back to bytes with unsigned saturation
+ * (packuswb) and written in place.
+ *
+ * block:     source coefficients; 16 elements (32 bytes read) per iteration
+ * pixels:    first row to update
+ * line_size: distance in bytes between rows
+ */ void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+{
+const DCTELEM *p;
+uint8_t *pix;
+int i;
+/* working copies of the source and destination pointers */
+p = block;
+pix = pixels;
+MOVQ_ZERO(mm7); /* mm7 = 0; the asm in the loop relies on it staying zero */
+i = 4;
+do {
+__asm__ volatile(
+"movq   (%2), %%mm0     \n\t"
+"movq   8(%2), %%mm1    \n\t"
+"movq   16(%2), %%mm2   \n\t"
+"movq   24(%2), %%mm3   \n\t"
+"movq   %0, %%mm4       \n\t" /* 8 pixels of the first row */
+"movq   %1, %%mm6       \n\t" /* 8 pixels of the second row */
+"movq   %%mm4, %%mm5    \n\t"
+"punpcklbw %%mm7, %%mm4 \n\t"
+"punpckhbw %%mm7, %%mm5 \n\t"
+"paddsw %%mm4, %%mm0    \n\t"
+"paddsw %%mm5, %%mm1    \n\t"
+"movq   %%mm6, %%mm5    \n\t"
+"punpcklbw %%mm7, %%mm6 \n\t"
+"punpckhbw %%mm7, %%mm5 \n\t"
+"paddsw %%mm6, %%mm2    \n\t"
+"paddsw %%mm5, %%mm3    \n\t"
+"packuswb %%mm1, %%mm0  \n\t"
+"packuswb %%mm3, %%mm2  \n\t"
+"movq   %%mm0, %0       \n\t"
+"movq   %%mm2, %1       \n\t"
+:"+m"(*pix), "+m"(*(pix+line_size))
+:"r"(p)
+:"memory");
+pix += line_size*2;
+p += 16;
+} while (--i);
+}
+/* Copy an 8-byte-wide block of h rows from pixels to block.
+ *
+ * Source and destination use the same row distance line_size. Each loop
+ * iteration copies four rows (two pairs) and subtracts 4 from h; the loop
+ * only ends when h reaches exactly zero, so h must be a positive multiple
+ * of 4. REG_a holds 2 * line_size.
+ */ static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+__asm__ volatile(
+"lea (%3, %3), %%"REG_a"       \n\t"
+ASMALIGN(3)
+"1:                            \n\t"
+"movq (%1), %%mm0              \n\t"
+"movq (%1, %3), %%mm1          \n\t"
+"movq %%mm0, (%2)              \n\t"
+"movq %%mm1, (%2, %3)          \n\t"
+"add %%"REG_a", %1             \n\t"
+"add %%"REG_a", %2             \n\t"
+"movq (%1), %%mm0              \n\t"
+"movq (%1, %3), %%mm1          \n\t"
+"movq %%mm0, (%2)              \n\t"
+"movq %%mm1, (%2, %3)          \n\t"
+"add %%"REG_a", %1             \n\t"
+"add %%"REG_a", %2             \n\t"
+"subl $4, %0                   \n\t"
+"jnz 1b                        \n\t"
+: "+g"(h), "+r" (pixels),  "+r" (block)
+: "r"((x86_reg)line_size)
+: "%"REG_a, "memory"
+);
+}
+/* Copy a 16-byte-wide block of h rows from pixels to block, four rows per
+ * loop iteration.
+ *
+ * Source rows are read with movdqu (no alignment requirement); destination
+ * rows are written with movdqa, so every destination row address must be
+ * 16-byte aligned. h is decremented by 4 and the loop ends only at exactly
+ * zero, so h must be a positive multiple of 4. %4 = 3 * line_size.
+ * NOTE(review): xmm0-xmm3 are written but not listed as clobbers - confirm
+ * this is acceptable for the compilers this file targets.
+ */ static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+__asm__ volatile(
+"1:                            \n\t"
+"movdqu (%1), %%xmm0           \n\t"
+"movdqu (%1,%3), %%xmm1        \n\t"
+"movdqu (%1,%3,2), %%xmm2      \n\t"
+"movdqu (%1,%4), %%xmm3        \n\t"
+"movdqa %%xmm0, (%2)           \n\t"
+"movdqa %%xmm1, (%2,%3)        \n\t"
+"movdqa %%xmm2, (%2,%3,2)      \n\t"
+"movdqa %%xmm3, (%2,%4)        \n\t"
+"subl $4, %0                   \n\t" /* sets the flags tested by jnz ... */
+"lea (%1,%3,4), %1             \n\t" /* ... lea leaves the flags untouched */
+"lea (%2,%3,4), %2             \n\t"
+"jnz 1b                        \n\t"
+: "+g"(h), "+r" (pixels),  "+r" (block)
+: "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
+: "memory"
+);
+}
+/* Average a 16-byte-wide block of h rows from pixels into block:
+ * block = pavgb(pixels, block), four rows per loop iteration.
+ *
+ * Source rows are read with movdqu; destination rows are used as pavgb memory
+ * operands and written with movdqa, so every destination row address must be
+ * 16-byte aligned. h is decremented by 4 and the loop ends only at exactly
+ * zero, so h must be a positive multiple of 4. %4 = 3 * line_size.
+ * NOTE(review): xmm0-xmm3 are written but not listed as clobbers - confirm
+ * this is acceptable for the compilers this file targets.
+ */ static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+__asm__ volatile(
+"1:                            \n\t"
+"movdqu (%1), %%xmm0           \n\t"
+"movdqu (%1,%3), %%xmm1        \n\t"
+"movdqu (%1,%3,2), %%xmm2      \n\t"
+"movdqu (%1,%4), %%xmm3        \n\t"
+"pavgb  (%2), %%xmm0           \n\t"
+"pavgb  (%2,%3), %%xmm1        \n\t"
+"pavgb  (%2,%3,2), %%xmm2      \n\t"
+"pavgb  (%2,%4), %%xmm3        \n\t"
+"movdqa %%xmm0, (%2)           \n\t"
+"movdqa %%xmm1, (%2,%3)        \n\t"
+"movdqa %%xmm2, (%2,%3,2)      \n\t"
+"movdqa %%xmm3, (%2,%4)        \n\t"
+"subl $4, %0                   \n\t" /* sets the flags tested by jnz ... */
+"lea (%1,%3,4), %1             \n\t" /* ... lea leaves the flags untouched */
+"lea (%2,%3,4), %2             \n\t"
+"jnz 1b                        \n\t"
+: "+g"(h), "+r" (pixels),  "+r" (block)
+: "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
+: "memory"
+);
+}
+/* Zero 128 bytes starting at block with eight 16-byte movaps stores.
+ * movaps requires its memory operand to be 16-byte aligned, so block must be.
+ */ static void clear_block_sse(DCTELEM *block)
+{
+__asm__ volatile(
+"xorps  %%xmm0, %%xmm0  \n" /* xmm0 = 0 */
+"movaps %%xmm0,    (%0) \n"
+"movaps %%xmm0,  16(%0) \n"
+"movaps %%xmm0,  32(%0) \n"
+"movaps %%xmm0,  48(%0) \n"
+"movaps %%xmm0,  64(%0) \n"
+"movaps %%xmm0,  80(%0) \n"
+"movaps %%xmm0,  96(%0) \n"
+"movaps %%xmm0, 112(%0) \n"
+:: "r"(block)
+: "memory"
+);
+}
+/**
+ * Zero 6 * 128 = 768 consecutive bytes starting at blocks with SSE stores.
+ *
+ * The asm receives the END of the area (blocks + 768 bytes) as base and runs
+ * a negative index in REG_a from -768 up to 0 in steps of 128, so the loop
+ * needs no separate compare: "js" keeps looping while the index is negative.
+ * movaps requires 16-byte aligned addresses, hence blocks must be 16-byte
+ * aligned.
+ *
+ * The "memory" clobber tells the compiler that the statement writes memory
+ * that is not named by any operand (the pointer is only passed in a register);
+ * without it, loads and stores of the cleared area may be moved across or
+ * cached around the asm.
+ *
+ * @param blocks start of the area to clear
+ */
+static void clear_blocks_sse(DCTELEM *blocks)
+{
+__asm__ volatile(
+"xorps  %%xmm0, %%xmm0  \n"
+"mov     %1, %%"REG_a"  \n"
+"1:                     \n"
+"movaps %%xmm0,    (%0, %%"REG_a") \n"
+"movaps %%xmm0,  16(%0, %%"REG_a") \n"
+"movaps %%xmm0,  32(%0, %%"REG_a") \n"
+"movaps %%xmm0,  48(%0, %%"REG_a") \n"
+"movaps %%xmm0,  64(%0, %%"REG_a") \n"
+"movaps %%xmm0,  80(%0, %%"REG_a") \n"
+"movaps %%xmm0,  96(%0, %%"REG_a") \n"
+"movaps %%xmm0, 112(%0, %%"REG_a") \n"
+"add $128, %%"REG_a"    \n"
+" js 1b                 \n"
+: : "r" (((uint8_t *)blocks)+128*6),
+"i" (-128*6)
+: "%"REG_a, "memory"
+);
+}
+/* Transpose a 4x4 block of bytes: byte j of destination row i is byte i of
+ * source row j.
+ *
+ * Four 32-bit rows are read from src (rows src_stride bytes apart), the bytes
+ * are interleaved pairwise (punpcklbw), then as 16-bit pairs (punpck[lh]wd),
+ * and the four resulting 32-bit rows are written to dst (rows dst_stride
+ * bytes apart).
+ */ static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
+__asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
+"movd  %4, %%mm0                \n\t"
+"movd  %5, %%mm1                \n\t"
+"movd  %6, %%mm2                \n\t"
+"movd  %7, %%mm3                \n\t"
+"punpcklbw %%mm1, %%mm0         \n\t" /* rows 0 and 1 interleaved bytewise */
+"punpcklbw %%mm3, %%mm2         \n\t" /* rows 2 and 3 interleaved bytewise */
+"movq %%mm0, %%mm1              \n\t"
+"punpcklwd %%mm2, %%mm0         \n\t" /* mm0 = output rows 0 and 1 */
+"punpckhwd %%mm2, %%mm1         \n\t" /* mm1 = output rows 2 and 3 */
+"movd  %%mm0, %0                \n\t"
+"punpckhdq %%mm0, %%mm0         \n\t" /* bring the upper dword down */
+"movd  %%mm0, %1                \n\t"
+"movd  %%mm1, %2                \n\t"
+"punpckhdq %%mm1, %%mm1         \n\t"
+"movd  %%mm1, %3                \n\t"
+: "=m" (*(uint32_t*)(dst + 0*dst_stride)),
+"=m" (*(uint32_t*)(dst + 1*dst_stride)),
+"=m" (*(uint32_t*)(dst + 2*dst_stride)),
+"=m" (*(uint32_t*)(dst + 3*dst_stride))
+:  "m" (*(uint32_t*)(src + 0*src_stride)),
+"m" (*(uint32_t*)(src + 1*src_stride)),
+"m" (*(uint32_t*)(src + 2*src_stride)),
+"m" (*(uint32_t*)(src + 3*src_stride))
+);
+}
+#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) /* Expands to 32 static functions,
+ * OPNAME##qpel8_mcXY_##MMX and OPNAME##qpel16_mcXY_##MMX for X, Y in 0..3.
+ * mc00 forwards to OPNAME##pixelsN_##MMX; the others are built from the
+ * helpers put##RND##mpeg4_qpelN_h_lowpass_, ..._v_lowpass_ and
+ * ...pixelsN_l2_ (all suffixed with MMX), using stack buffers for the
+ * intermediate planes. Those helpers are not defined in this macro and must
+ * exist for the chosen suffix. ROUNDER and OP are accepted but not referenced
+ * in the body.
+ */\
+\
+static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
+OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t temp[8];\
+uint8_t * const half= (uint8_t*)temp;\
+put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
+OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t temp[8];\
+uint8_t * const half= (uint8_t*)temp;\
+put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
+OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t temp[8];\
+uint8_t * const half= (uint8_t*)temp;\
+put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
+OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t temp[8];\
+uint8_t * const half= (uint8_t*)temp;\
+put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
+OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
+}\
+static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[8 + 9];\
+uint8_t * const halfH= ((uint8_t*)half) + 64;\
+uint8_t * const halfHV= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
+put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[8 + 9];\
+uint8_t * const halfH= ((uint8_t*)half) + 64;\
+uint8_t * const halfHV= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
+put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[8 + 9];\
+uint8_t * const halfH= ((uint8_t*)half) + 64;\
+uint8_t * const halfHV= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
+put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[8 + 9];\
+uint8_t * const halfH= ((uint8_t*)half) + 64;\
+uint8_t * const halfHV= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
+put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[8 + 9];\
+uint8_t * const halfH= ((uint8_t*)half) + 64;\
+uint8_t * const halfHV= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[8 + 9];\
+uint8_t * const halfH= ((uint8_t*)half) + 64;\
+uint8_t * const halfHV= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[8 + 9];\
+uint8_t * const halfH= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
+OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
+}\
+static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[8 + 9];\
+uint8_t * const halfH= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
+OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
+}\
+static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[9];\
+uint8_t * const halfH= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
+}\
+static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
+OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t temp[32];\
+uint8_t * const half= (uint8_t*)temp;\
+put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
+OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t temp[32];\
+uint8_t * const half= (uint8_t*)temp;\
+put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
+OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t temp[32];\
+uint8_t * const half= (uint8_t*)temp;\
+put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
+OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t temp[32];\
+uint8_t * const half= (uint8_t*)temp;\
+put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
+OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
+}\
+static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[16*2 + 17*2];\
+uint8_t * const halfH= ((uint8_t*)half) + 256;\
+uint8_t * const halfHV= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
+put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[16*2 + 17*2];\
+uint8_t * const halfH= ((uint8_t*)half) + 256;\
+uint8_t * const halfHV= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
+put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[16*2 + 17*2];\
+uint8_t * const halfH= ((uint8_t*)half) + 256;\
+uint8_t * const halfHV= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
+put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[16*2 + 17*2];\
+uint8_t * const halfH= ((uint8_t*)half) + 256;\
+uint8_t * const halfHV= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
+put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[16*2 + 17*2];\
+uint8_t * const halfH= ((uint8_t*)half) + 256;\
+uint8_t * const halfHV= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[16*2 + 17*2];\
+uint8_t * const halfH= ((uint8_t*)half) + 256;\
+uint8_t * const halfHV= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[17*2];\
+uint8_t * const halfH= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
+OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
+}\
+static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[17*2];\
+uint8_t * const halfH= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
+OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
+}\
+static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+uint64_t half[17*2];\
+uint8_t * const halfH= ((uint8_t*)half);\
+put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
+}
+#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "        \n\t" /* asm text: store a to b; temp is unused */
+#define AVG_3DNOW_OP(a,b,temp, size) /* asm text: b = pavgusb(a, b); overwrites temp and a */ \
+"mov" #size " " #b ", " #temp "   \n\t"\
+"pavgusb " #temp ", " #a "        \n\t"\
+"mov" #size " " #a ", " #b "      \n\t"
+#define AVG_MMX2_OP(a,b,temp, size) /* asm text: b = pavgb(a, b); overwrites temp and a */ \
+"mov" #size " " #b ", " #temp "   \n\t"\
+"pavgb " #temp ", " #a "          \n\t"\
+"mov" #size " " #a ", " #b "      \n\t"
+#define PREFETCH(name, op) /* defines name(mem, stride, h): issue instruction op on mem, mem + stride, ... once per row; the do/while body runs before h is tested, so h must be >= 1 */ \
+static void name(void *mem, int stride, int h){\
+const uint8_t *p= mem;\
+do{\
+__asm__ volatile(#op" %0" :: "m"(*p));\
+p+= stride;\
+}while(--h);\
+}
+PREFETCH(prefetch_mmx2,  prefetcht0) /* prefetch_mmx2(): row-wise prefetcht0 */
+#undef PREFETCH
+#include "h264dsp_mmx.c"
+void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
+void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
+void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
+void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
+void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
+/**
+ * Install the x86 SIMD implementations of this file into a DSPContext.
+ *
+ * The CPU capabilities are read with mm_support() and stored in the global
+ * mm_flags. A function pointer in c is only overridden when the CPU reports
+ * the instruction set that the routine actually executes; nothing is changed
+ * when FF_MM_MMX is not reported.
+ *
+ * @param c context whose function pointers are overridden
+ */
+void dsputil_init_mmx(DSPContext* c)
+{
+mm_flags = mm_support();
+if (mm_flags & FF_MM_MMX) {
+/* clear_block_sse/clear_blocks_sse execute xorps/movaps, which are SSE
+ * instructions and fault on a CPU that only implements MMX */
+if (mm_flags & FF_MM_SSE) {
+c->clear_block  = clear_block_sse;
+c->clear_blocks = clear_blocks_sse;
+}
+/* prefetch_mmx2 executes prefetcht0, which plain MMX does not provide */
+if (mm_flags & FF_MM_MMX2)
+c->prefetch = prefetch_mmx2;
+/* Fill the put/avg H.264 qpel tables at index x + y*4: table [0] receives
+ * the qpel16 function, table [1] the qpel8 function of the given CPU suffix */
+#define H264_QPEL_FUNCS(x, y, CPU)\
+c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
+c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
+c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
+c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
+if (mm_flags & FF_MM_SSE2) {
+c->put_pixels_tab[0][0] = put_pixels16_sse2;
+c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
+H264_QPEL_FUNCS(0, 1, sse2);
+H264_QPEL_FUNCS(0, 2, sse2);
+H264_QPEL_FUNCS(0, 3, sse2);
+H264_QPEL_FUNCS(1, 1, sse2);
+H264_QPEL_FUNCS(1, 2, sse2);
+H264_QPEL_FUNCS(1, 3, sse2);
+H264_QPEL_FUNCS(2, 1, sse2);
+H264_QPEL_FUNCS(2, 2, sse2);
+H264_QPEL_FUNCS(2, 3, sse2);
+H264_QPEL_FUNCS(3, 1, sse2);
+H264_QPEL_FUNCS(3, 2, sse2);
+H264_QPEL_FUNCS(3, 3, sse2);
+}
+#if HAVE_SSSE3
+/* tested after SSE2 so that the SSSE3 entries replace the SSE2 ones */
+if (mm_flags & FF_MM_SSSE3) {
+H264_QPEL_FUNCS(1, 0, ssse3);
+H264_QPEL_FUNCS(1, 1, ssse3);
+H264_QPEL_FUNCS(1, 2, ssse3);
+H264_QPEL_FUNCS(1, 3, ssse3);
+H264_QPEL_FUNCS(2, 0, ssse3);
+H264_QPEL_FUNCS(2, 1, ssse3);
+H264_QPEL_FUNCS(2, 2, ssse3);
+H264_QPEL_FUNCS(2, 3, ssse3);
+H264_QPEL_FUNCS(3, 0, ssse3);
+H264_QPEL_FUNCS(3, 1, ssse3);
+H264_QPEL_FUNCS(3, 2, ssse3);
+H264_QPEL_FUNCS(3, 3, ssse3);
+c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd;
+c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd;
+c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
+c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
+}
+#endif
+}
+}
+/**
+ * Select the x86-optimized H.264 DSP routines for the running CPU.
+ *
+ * Stores the result of mm_support() in the global mm_flags and then fills c
+ * in three tiers: MMX baseline, MMX2 overrides and additions, SSE2 overrides.
+ * c is left untouched when FF_MM_MMX is not reported.
+ *
+ * @param c H.264 DSP context to populate
+ */
+void ff_h264dsp_init_x86(H264DSPContext *c)
+{
+mm_flags = mm_support();
+if (!(mm_flags & FF_MM_MMX))
+return;
+/* MMX baseline: the DC-only hooks start out on the full IDCT routines */
+c->h264_idct_add     = ff_h264_idct_add_mmx;
+c->h264_idct_dc_add  = ff_h264_idct_add_mmx;
+c->h264_idct8_add    = ff_h264_idct8_add_mmx;
+c->h264_idct8_dc_add = ff_h264_idct8_add_mmx;
+if (mm_flags & FF_MM_MMX2) {
+/* inverse transforms */
+c->h264_idct_dc_add     = ff_h264_idct_dc_add_mmx2;
+c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_mmx2;
+c->h264_idct_add16      = ff_h264_idct_add16_mmx2;
+c->h264_idct8_add4      = ff_h264_idct8_add4_mmx2;
+c->h264_idct_add8       = ff_h264_idct_add8_mmx2;
+c->h264_idct_add16intra = ff_h264_idct_add16intra_mmx2;
+/* deblocking filters */
+c->h264_v_loop_filter_luma         = h264_v_loop_filter_luma_mmx2;
+c->h264_h_loop_filter_luma         = h264_h_loop_filter_luma_mmx2;
+c->h264_v_loop_filter_chroma       = h264_v_loop_filter_chroma_mmx2;
+c->h264_h_loop_filter_chroma       = h264_h_loop_filter_chroma_mmx2;
+c->h264_v_loop_filter_chroma_intra = h264_v_loop_filter_chroma_intra_mmx2;
+c->h264_h_loop_filter_chroma_intra = h264_h_loop_filter_chroma_intra_mmx2;
+c->h264_loop_filter_strength       = h264_loop_filter_strength_mmx2;
+/* explicit weighted prediction, one entry per block size */
+c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_mmx2;
+c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_mmx2;
+c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_mmx2;
+c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_mmx2;
+c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_mmx2;
+c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_mmx2;
+c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_mmx2;
+c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_mmx2;
+/* bidirectional weighted prediction, same block-size order */
+c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_mmx2;
+c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_mmx2;
+c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_mmx2;
+c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_mmx2;
+c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_mmx2;
+c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_mmx2;
+c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_mmx2;
+c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_mmx2;
+}
+/* SSE2 comes last so that it replaces the MMX/MMX2 8x8 IDCT entries */
+if (mm_flags & FF_MM_SSE2) {
+c->h264_idct8_add  = ff_h264_idct8_add_sse2;
+c->h264_idct8_add4 = ff_h264_idct8_add4_sse2;
+}
+}

Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSs__H264__App

comparison libavcodec/x86/dsputil_mmx.c @ 3:0b056460c67d

Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSsH264App