annotate libavcodec/x86/dsputil_mmx.c @ 3:0b056460c67d

changed code to use VSs
author Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date Mon, 29 Oct 2012 16:44:27 +0100
parents
children
rev   line source
nengel@2 1 /*
nengel@2 2 * MMX optimized DSP utils
nengel@2 3 * Copyright (c) 2000, 2001 Fabrice Bellard
nengel@2 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
nengel@2 5 *
nengel@2 6 * This file is part of FFmpeg.
nengel@2 7 *
nengel@2 8 * FFmpeg is free software; you can redistribute it and/or
nengel@2 9 * modify it under the terms of the GNU Lesser General Public
nengel@2 10 * License as published by the Free Software Foundation; either
nengel@2 11 * version 2.1 of the License, or (at your option) any later version.
nengel@2 12 *
nengel@2 13 * FFmpeg is distributed in the hope that it will be useful,
nengel@2 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
nengel@2 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
nengel@2 16 * Lesser General Public License for more details.
nengel@2 17 *
nengel@2 18 * You should have received a copy of the GNU Lesser General Public
nengel@2 19 * License along with FFmpeg; if not, write to the Free Software
nengel@2 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
nengel@2 21 *
nengel@2 22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
nengel@2 23 */
nengel@2 24
nengel@2 25 #include "libavutil/x86_cpu.h"
nengel@2 26 #include "libavutil/internal.h"
nengel@2 27 #include "libavcodec/dsputil.h"
nengel@2 28 #include "libavcodec/h264_dsp.h"
nengel@2 29 #include "dsputil_mmx.h"
nengel@2 30
nengel@2 31
nengel@2 32 //#undef NDEBUG
nengel@2 33 //#include <assert.h>
nengel@2 34
/*
 * Global state and packed SIMD constants.
 *
 * Naming of the constants, as established by their initialisers:
 *   ff_pw_N  - 16-bit words, every word equal to decimal N
 *   ff_pb_XX - bytes, every byte equal to hexadecimal XX
 *   ff_pd_N  - two doubles, both equal to N
 * 8-byte-aligned constants hold one 64-bit pattern; the 16-byte-aligned ones
 * are initialised with two 64-bit patterns.
 */
nengel@2 35 int mm_flags; /* multimedia extension flags */
nengel@2 36
nengel@2 37 /* pixel operations */
/* ff_bone: every byte is 1 (same bit pattern as ff_pb_1); ff_wtwo: every word is 2. */
nengel@2 38 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
nengel@2 39 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
nengel@2 40
/* Four 32-bit values, each with only the top bit set. */
nengel@2 41 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
nengel@2 42 {0x8000000080000000ULL, 0x8000000080000000ULL};
nengel@2 43
nengel@2 44 DECLARE_ALIGNED(8, const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
nengel@2 45 DECLARE_ALIGNED(8, const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL;
nengel@2 46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
nengel@2 47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
nengel@2 48 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
nengel@2 49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
nengel@2 50 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
nengel@2 51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
nengel@2 52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
nengel@2 53 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
nengel@2 54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
nengel@2 55 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
nengel@2 56 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
nengel@2 57 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
nengel@2 58
nengel@2 59 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
nengel@2 60 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL;
nengel@2 61 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
nengel@2 62 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
nengel@2 63 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
nengel@2 64 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
nengel@2 65 DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
nengel@2 66 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
nengel@2 67
nengel@2 68 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
nengel@2 69 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
nengel@2 70
/*
 * Small inline-asm helpers.
 *   ASMALIGN(n)    - asm text ".align 1 << n", to be pasted into an asm string.
 *   JUMPALIGN()    - stand-alone asm statement emitting ASMALIGN(3).
 *   MOVQ_ZERO(reg) - clears reg by XOR-ing it with itself.
 *   MOVQ_BFE(reg)  - sets every byte of reg to 0xFE: pcmpeqd of a register
 *                    with itself gives all ones, and the byte-wise add of
 *                    0xFF + 0xFF wraps to 0xFE.
 * None of the asm statements declares the register it overwrites; callers
 * are expected to use the named register only inside further asm.
 */
nengel@2 71 #define ASMALIGN(ZEROBITS) ".align 1 << " #ZEROBITS "\n\t"
nengel@2 72 #define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::)
nengel@2 73 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
nengel@2 74
nengel@2 75 #define MOVQ_BFE(regd) \
nengel@2 76 __asm__ volatile ( \
nengel@2 77 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
nengel@2 78 "paddb %%" #regd ", %%" #regd " \n\t" ::)
nengel@2 79
/*
 * MOVQ_BONE(reg): load a register with every byte equal to 1.
 * MOVQ_WTWO(reg): load a register with every 16-bit word equal to 2.
 *
 * Without PIC the values are read from the ff_bone / ff_wtwo memory
 * constants. With PIC they are synthesised in the register instead:
 *   pcmpeqd reg,reg -> all ones
 *   psrlw $15       -> every word becomes 0x0001
 *   packuswb        -> every byte becomes 0x01          (MOVQ_BONE)
 *   psllw $1        -> every word becomes 0x0002        (MOVQ_WTWO)
 */
nengel@2 80 #ifndef PIC
nengel@2 81 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
nengel@2 82 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
nengel@2 83 #else
nengel@2 84 // for shared library it's better to use this way for accessing constants
nengel@2 85 // pcmpeqd -> -1
nengel@2 86 #define MOVQ_BONE(regd) \
nengel@2 87 __asm__ volatile ( \
nengel@2 88 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
nengel@2 89 "psrlw $15, %%" #regd " \n\t" \
nengel@2 90 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
nengel@2 91
nengel@2 92 #define MOVQ_WTWO(regd) \
nengel@2 93 __asm__ volatile ( \
nengel@2 94 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
nengel@2 95 "psrlw $15, %%" #regd " \n\t" \
nengel@2 96 "psllw $1, %%" #regd " \n\t"::)
nengel@2 97
nengel@2 98 #endif
nengel@2 99
/*
 * Byte-wise average of two MMX registers, built from plain MMX instructions.
 * Both macros expand to asm text (not a statement) and are meant to be pasted
 * into a larger asm string.
 *
 *   PAVGB_MMX_NO_RND: regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1)
 *                     i.e. the average rounded down.
 *   PAVGB_MMX:        regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1)
 *                     i.e. the average rounded up.
 *
 * The 0xFE mask clears the low bit of every byte so that the 64-bit shift
 * cannot move a bit from one byte into its neighbour.
 */
nengel@2 100 // using regr as temporary and for the output result
nengel@2 101 // first argument is unmodified and second is trashed
nengel@2 102 // regfe is supposed to contain 0xfefefefefefefefe
nengel@2 103 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
nengel@2 104 "movq " #rega ", " #regr " \n\t"\
nengel@2 105 "pand " #regb ", " #regr " \n\t"\
nengel@2 106 "pxor " #rega ", " #regb " \n\t"\
nengel@2 107 "pand " #regfe "," #regb " \n\t"\
nengel@2 108 "psrlq $1, " #regb " \n\t"\
nengel@2 109 "paddb " #regb ", " #regr " \n\t"
nengel@2 110
nengel@2 111 #define PAVGB_MMX(rega, regb, regr, regfe) \
nengel@2 112 "movq " #rega ", " #regr " \n\t"\
nengel@2 113 "por " #regb ", " #regr " \n\t"\
nengel@2 114 "pxor " #rega ", " #regb " \n\t"\
nengel@2 115 "pand " #regfe "," #regb " \n\t"\
nengel@2 116 "psrlq $1, " #regb " \n\t"\
nengel@2 117 "psubb " #regb ", " #regr " \n\t"
nengel@2 118
/*
 * Paired variants of the byte-average macros: two independent averages are
 * computed with their instructions interleaved.
 *   (rega, regb) -> regr      and      (regc, regd) -> regp
 * rega and regc are left unmodified; regb and regd are overwritten.
 * The 0xFE byte mask is not a parameter here; it is always read from mm6.
 *   PAVGBP_MMX_NO_RND: (a & b) + (((a ^ b) & mask) >> 1), average rounded down.
 *   PAVGBP_MMX:        (a | b) - (((a ^ b) & mask) >> 1), average rounded up.
 */
nengel@2 119 // mm6 is supposed to contain 0xfefefefefefefefe
nengel@2 120 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
nengel@2 121 "movq " #rega ", " #regr " \n\t"\
nengel@2 122 "movq " #regc ", " #regp " \n\t"\
nengel@2 123 "pand " #regb ", " #regr " \n\t"\
nengel@2 124 "pand " #regd ", " #regp " \n\t"\
nengel@2 125 "pxor " #rega ", " #regb " \n\t"\
nengel@2 126 "pxor " #regc ", " #regd " \n\t"\
nengel@2 127 "pand %%mm6, " #regb " \n\t"\
nengel@2 128 "pand %%mm6, " #regd " \n\t"\
nengel@2 129 "psrlq $1, " #regb " \n\t"\
nengel@2 130 "psrlq $1, " #regd " \n\t"\
nengel@2 131 "paddb " #regb ", " #regr " \n\t"\
nengel@2 132 "paddb " #regd ", " #regp " \n\t"
nengel@2 133
nengel@2 134 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
nengel@2 135 "movq " #rega ", " #regr " \n\t"\
nengel@2 136 "movq " #regc ", " #regp " \n\t"\
nengel@2 137 "por " #regb ", " #regr " \n\t"\
nengel@2 138 "por " #regd ", " #regp " \n\t"\
nengel@2 139 "pxor " #rega ", " #regb " \n\t"\
nengel@2 140 "pxor " #regc ", " #regd " \n\t"\
nengel@2 141 "pand %%mm6, " #regb " \n\t"\
nengel@2 142 "pand %%mm6, " #regd " \n\t"\
nengel@2 143 "psrlq $1, " #regd " \n\t"\
nengel@2 144 "psrlq $1, " #regb " \n\t"\
nengel@2 145 "psubb " #regb ", " #regr " \n\t"\
nengel@2 146 "psubb " #regd ", " #regp " \n\t"
nengel@2 147
nengel@2 148 /***********************************/
nengel@2 149 /* MMX2 specific */
nengel@2 150
/*
 * Instantiate the averaging template for MMX2: DEF(x) appends the "_mmx2"
 * suffix to a name, and PAVGB / OP_AVG both expand to the "pavgb" mnemonic.
 * The three macros exist only for the duration of the #include and are
 * undefined again right after it.
 * NOTE(review): the template body is not visible here; which functions it
 * defines must be checked in dsputil_mmx_avg_template.c.
 */
nengel@2 151 #define DEF(x) x ## _mmx2
nengel@2 152
nengel@2 153 /* Introduced only in MMX2 set */
nengel@2 154 #define PAVGB "pavgb"
nengel@2 155 #define OP_AVG PAVGB
nengel@2 156
nengel@2 157 #include "dsputil_mmx_avg_template.c"
nengel@2 158
nengel@2 159 #undef DEF
nengel@2 160 #undef PAVGB
nengel@2 161 #undef OP_AVG
nengel@2 162
/*
 * Name aliases: the "no_rnd", "_mmx2" and "_3dnow" spellings of the plain
 * put_pixels{4,8,16} functions all resolve to the corresponding "_mmx"
 * function, so token-pasted names such as put_pixels8_3dnow work without
 * separate implementations.
 * NOTE(review): put_pixels16_mmx and put_pixels4_mmx are not defined in the
 * part of the file visible here.
 */
nengel@2 163 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
nengel@2 164 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
nengel@2 165 #define put_pixels16_mmx2 put_pixels16_mmx
nengel@2 166 #define put_pixels8_mmx2 put_pixels8_mmx
nengel@2 167 #define put_pixels4_mmx2 put_pixels4_mmx
nengel@2 168 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
nengel@2 169 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
nengel@2 170 #define put_pixels16_3dnow put_pixels16_mmx
nengel@2 171 #define put_pixels8_3dnow put_pixels8_mmx
nengel@2 172 #define put_pixels4_3dnow put_pixels4_mmx
nengel@2 173 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
nengel@2 174 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
nengel@2 175
nengel@2 176 /***********************************/
nengel@2 177 /* standard MMX */
nengel@2 178
/*
 * Store a block of coefficients as pixels, clamping each value to 0..255.
 *
 * Two asm passes are executed. Each pass reads 64 bytes from the coefficient
 * pointer (eight 8-byte loads), packs pairs of registers from signed words
 * to unsigned bytes with saturation (packuswb), and writes four rows of
 * 8 bytes at pix, pix + line_size, pix + 2*line_size and pix + 3*line_size.
 * Between the passes the destination advances by four rows and the source by
 * 32 elements, so eight rows are written in total.
 *
 * block:     source coefficients (64 elements are read; assumes DCTELEM is a
 *            16-bit type - TODO confirm against its definition)
 * pixels:    destination, top-left byte of the first row
 * line_size: distance in bytes between destination rows
 *
 * The source pointer is passed in a register and addressed as N(%3). Pasting
 * the offset in front of an "m" operand ("8%3") is only correct when the
 * compiler prints that operand as a bare "(%reg)"; with a displacement the
 * digits run together ("8" + "16(%reg)" -> "816(%reg)") and the wrong address
 * is read.
 */
nengel@2 179 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
nengel@2 180 {
nengel@2 181 const DCTELEM *p;
nengel@2 182 uint8_t *pix;
nengel@2 183
nengel@2 184 /* read the pixels */
nengel@2 185 p = block;
nengel@2 186 pix = pixels;
nengel@2 187 /* unrolled loop */
nengel@2 188 __asm__ volatile(
nengel@2 189 "movq (%3), %%mm0 \n\t"
nengel@2 190 "movq 8(%3), %%mm1 \n\t"
nengel@2 191 "movq 16(%3), %%mm2 \n\t"
nengel@2 192 "movq 24(%3), %%mm3 \n\t"
nengel@2 193 "movq 32(%3), %%mm4 \n\t"
nengel@2 194 "movq 40(%3), %%mm5 \n\t"
nengel@2 195 "movq 48(%3), %%mm6 \n\t"
nengel@2 196 "movq 56(%3), %%mm7 \n\t"
nengel@2 197 "packuswb %%mm1, %%mm0 \n\t"
nengel@2 198 "packuswb %%mm3, %%mm2 \n\t"
nengel@2 199 "packuswb %%mm5, %%mm4 \n\t"
nengel@2 200 "packuswb %%mm7, %%mm6 \n\t"
nengel@2 201 "movq %%mm0, (%0) \n\t"
nengel@2 202 "movq %%mm2, (%0, %1) \n\t"
nengel@2 203 "movq %%mm4, (%0, %1, 2) \n\t"
nengel@2 204 "movq %%mm6, (%0, %2) \n\t"
nengel@2 205 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
nengel@2 206 :"memory");
nengel@2 207 pix += line_size*4;
nengel@2 208 p += 32;
nengel@2 209
nengel@2 210 // second half: same sequence on the next four rows; the source
nengel@2 211 // pointer is again passed in a register so that the N(%3)
nengel@2 212 // displacements are always well-formed
nengel@2 213 __asm__ volatile(
nengel@2 214 "movq (%3), %%mm0 \n\t"
nengel@2 215 "movq 8(%3), %%mm1 \n\t"
nengel@2 216 "movq 16(%3), %%mm2 \n\t"
nengel@2 217 "movq 24(%3), %%mm3 \n\t"
nengel@2 218 "movq 32(%3), %%mm4 \n\t"
nengel@2 219 "movq 40(%3), %%mm5 \n\t"
nengel@2 220 "movq 48(%3), %%mm6 \n\t"
nengel@2 221 "movq 56(%3), %%mm7 \n\t"
nengel@2 222 "packuswb %%mm1, %%mm0 \n\t"
nengel@2 223 "packuswb %%mm3, %%mm2 \n\t"
nengel@2 224 "packuswb %%mm5, %%mm4 \n\t"
nengel@2 225 "packuswb %%mm7, %%mm6 \n\t"
nengel@2 226 "movq %%mm0, (%0) \n\t"
nengel@2 227 "movq %%mm2, (%0, %1) \n\t"
nengel@2 228 "movq %%mm4, (%0, %1, 2) \n\t"
nengel@2 229 "movq %%mm6, (%0, %2) \n\t"
nengel@2 230 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
nengel@2 231 :"memory");
nengel@2 232 }
nengel@2 233
/* Eight bytes of 0x80; loaded into mm0 by put_signed_pixels_clamped_mmx. */
nengel@2 234 DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] =
nengel@2 235 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
nengel@2 236
/*
 * Asm text for four output rows of put_signed_pixels_clamped_mmx.
 * Operand roles: %0 = destination pointer, %1 = 3 * stride,
 * %2 = source pointer, %3 = stride; mm0 must hold the bias bytes.
 * For each row, 16 source bytes starting at byte offset `off` (+16 per row)
 * are packed from signed words to signed bytes with saturation (packsswb),
 * the bias in mm0 is added byte-wise, and 8 bytes are stored.
 */
nengel@2 237 #define put_signed_pixels_clamped_mmx_half(off) \
nengel@2 238 "movq "#off"(%2), %%mm1 \n\t"\
nengel@2 239 "movq 16+"#off"(%2), %%mm2 \n\t"\
nengel@2 240 "movq 32+"#off"(%2), %%mm3 \n\t"\
nengel@2 241 "movq 48+"#off"(%2), %%mm4 \n\t"\
nengel@2 242 "packsswb 8+"#off"(%2), %%mm1 \n\t"\
nengel@2 243 "packsswb 24+"#off"(%2), %%mm2 \n\t"\
nengel@2 244 "packsswb 40+"#off"(%2), %%mm3 \n\t"\
nengel@2 245 "packsswb 56+"#off"(%2), %%mm4 \n\t"\
nengel@2 246 "paddb %%mm0, %%mm1 \n\t"\
nengel@2 247 "paddb %%mm0, %%mm2 \n\t"\
nengel@2 248 "paddb %%mm0, %%mm3 \n\t"\
nengel@2 249 "paddb %%mm0, %%mm4 \n\t"\
nengel@2 250 "movq %%mm1, (%0) \n\t"\
nengel@2 251 "movq %%mm2, (%0, %3) \n\t"\
nengel@2 252 "movq %%mm3, (%0, %3, 2) \n\t"\
nengel@2 253 "movq %%mm4, (%0, %1) \n\t"
nengel@2 254
/*
 * Store signed coefficients as pixels with a +128 bias.
 *
 * Each coefficient is saturated to a signed byte and 0x80 is then added
 * byte-wise (wrapping), which maps -128..127 onto 0..255. Eight rows of
 * 8 bytes are written: four by the first half-macro, then the destination
 * is advanced by 4 * line_size and four more are written from source byte
 * offset 64.
 *
 * block:     source coefficients (128 bytes are read)
 * pixels:    destination; the local copy of this pointer is advanced by the asm
 * line_size: distance in bytes between destination rows
 */
nengel@2 255 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
nengel@2 256 {
nengel@2 257 x86_reg line_skip = line_size;
nengel@2 258 x86_reg line_skip3;
nengel@2 259
nengel@2 260 __asm__ volatile (
nengel@2 261 "movq "MANGLE(ff_vector128)", %%mm0 \n\t"
/* line_skip3 = 3 * line_skip, computed inside the asm */
nengel@2 262 "lea (%3, %3, 2), %1 \n\t"
nengel@2 263 put_signed_pixels_clamped_mmx_half(0)
nengel@2 264 "lea (%0, %3, 4), %0 \n\t"
nengel@2 265 put_signed_pixels_clamped_mmx_half(64)
nengel@2 266 :"+&r" (pixels), "=&r" (line_skip3)
nengel@2 267 :"r" (block), "r"(line_skip)
nengel@2 268 :"memory");
nengel@2 269 }
nengel@2 270
/*
 * Add a block of coefficients to existing pixels, clamping to 0..255.
 *
 * The loop runs four times and handles two destination rows per iteration:
 * 32 bytes of coefficients are loaded, the 8 pixels of each row are widened
 * from bytes to words (interleaving with the zero register mm7), added with
 * signed saturation (paddsw), packed back to unsigned bytes with saturation
 * (packuswb) and stored in place.
 *
 * block:     coefficients to add (16 elements are consumed per iteration)
 * pixels:    destination pixels, read and rewritten
 * line_size: distance in bytes between destination rows
 */
nengel@2 271 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
nengel@2 272 {
nengel@2 273 const DCTELEM *p;
nengel@2 274 uint8_t *pix;
nengel@2 275 int i;
nengel@2 276
nengel@2 277 /* read the pixels */
nengel@2 278 p = block;
nengel@2 279 pix = pixels;
/* mm7 = 0; it is used as the high half when widening bytes to words */
nengel@2 280 MOVQ_ZERO(mm7);
nengel@2 281 i = 4;
nengel@2 282 do {
nengel@2 283 __asm__ volatile(
nengel@2 284 "movq (%2), %%mm0 \n\t"
nengel@2 285 "movq 8(%2), %%mm1 \n\t"
nengel@2 286 "movq 16(%2), %%mm2 \n\t"
nengel@2 287 "movq 24(%2), %%mm3 \n\t"
nengel@2 288 "movq %0, %%mm4 \n\t"
nengel@2 289 "movq %1, %%mm6 \n\t"
nengel@2 290 "movq %%mm4, %%mm5 \n\t"
nengel@2 291 "punpcklbw %%mm7, %%mm4 \n\t"
nengel@2 292 "punpckhbw %%mm7, %%mm5 \n\t"
nengel@2 293 "paddsw %%mm4, %%mm0 \n\t"
nengel@2 294 "paddsw %%mm5, %%mm1 \n\t"
nengel@2 295 "movq %%mm6, %%mm5 \n\t"
nengel@2 296 "punpcklbw %%mm7, %%mm6 \n\t"
nengel@2 297 "punpckhbw %%mm7, %%mm5 \n\t"
nengel@2 298 "paddsw %%mm6, %%mm2 \n\t"
nengel@2 299 "paddsw %%mm5, %%mm3 \n\t"
nengel@2 300 "packuswb %%mm1, %%mm0 \n\t"
nengel@2 301 "packuswb %%mm3, %%mm2 \n\t"
nengel@2 302 "movq %%mm0, %0 \n\t"
nengel@2 303 "movq %%mm2, %1 \n\t"
nengel@2 304 :"+m"(*pix), "+m"(*(pix+line_size))
nengel@2 305 :"r"(p)
nengel@2 306 :"memory");
nengel@2 307 pix += line_size*2;
nengel@2 308 p += 16;
nengel@2 309 } while (--i);
nengel@2 310 }
nengel@2 311
/*
 * Copy an 8-byte-wide block of h rows from pixels to block.
 *
 * Both buffers use the same row stride, line_size. Four rows are copied per
 * loop iteration and the loop only exits when h reaches exactly zero after
 * subtracting 4, so h must be a positive multiple of 4.
 * The accumulator register (REG_a) holds 2 * line_size and is clobbered.
 */
nengel@2 312 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
nengel@2 313 {
nengel@2 314 __asm__ volatile(
nengel@2 315 "lea (%3, %3), %%"REG_a" \n\t"
nengel@2 316 ASMALIGN(3)
nengel@2 317 "1: \n\t"
nengel@2 318 "movq (%1), %%mm0 \n\t"
nengel@2 319 "movq (%1, %3), %%mm1 \n\t"
nengel@2 320 "movq %%mm0, (%2) \n\t"
nengel@2 321 "movq %%mm1, (%2, %3) \n\t"
nengel@2 322 "add %%"REG_a", %1 \n\t"
nengel@2 323 "add %%"REG_a", %2 \n\t"
nengel@2 324 "movq (%1), %%mm0 \n\t"
nengel@2 325 "movq (%1, %3), %%mm1 \n\t"
nengel@2 326 "movq %%mm0, (%2) \n\t"
nengel@2 327 "movq %%mm1, (%2, %3) \n\t"
nengel@2 328 "add %%"REG_a", %1 \n\t"
nengel@2 329 "add %%"REG_a", %2 \n\t"
nengel@2 330 "subl $4, %0 \n\t"
nengel@2 331 "jnz 1b \n\t"
nengel@2 332 : "+g"(h), "+r" (pixels), "+r" (block)
nengel@2 333 : "r"((x86_reg)line_size)
nengel@2 334 : "%"REG_a, "memory"
nengel@2 335 );
nengel@2 336 }
nengel@2 337
/*
 * Copy a 16-byte-wide block of h rows from pixels to block using SSE2.
 *
 * Four rows are copied per iteration; h must be a positive multiple of 4
 * because the loop only exits when h reaches exactly zero.
 * Source rows are read with movdqu (no alignment requirement). Destination
 * rows are written with movdqa, which requires 16-byte-aligned addresses, so
 * block and line_size must keep every destination row 16-byte aligned.
 */
nengel@2 338 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
nengel@2 339 {
nengel@2 340 __asm__ volatile(
nengel@2 341 "1: \n\t"
nengel@2 342 "movdqu (%1), %%xmm0 \n\t"
nengel@2 343 "movdqu (%1,%3), %%xmm1 \n\t"
nengel@2 344 "movdqu (%1,%3,2), %%xmm2 \n\t"
nengel@2 345 "movdqu (%1,%4), %%xmm3 \n\t"
nengel@2 346 "movdqa %%xmm0, (%2) \n\t"
nengel@2 347 "movdqa %%xmm1, (%2,%3) \n\t"
nengel@2 348 "movdqa %%xmm2, (%2,%3,2) \n\t"
nengel@2 349 "movdqa %%xmm3, (%2,%4) \n\t"
/* the flags set by subl are consumed by jnz below; lea does not modify flags */
nengel@2 350 "subl $4, %0 \n\t"
nengel@2 351 "lea (%1,%3,4), %1 \n\t"
nengel@2 352 "lea (%2,%3,4), %2 \n\t"
nengel@2 353 "jnz 1b \n\t"
nengel@2 354 : "+g"(h), "+r" (pixels), "+r" (block)
nengel@2 355 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
nengel@2 356 : "memory"
nengel@2 357 );
nengel@2 358 }
nengel@2 359
/*
 * Average a 16-byte-wide block of h rows from pixels into block using SSE2.
 *
 * Each destination byte is replaced by pavgb(source, destination), the
 * rounded-up unsigned byte average. Four rows are processed per iteration;
 * h must be a positive multiple of 4.
 * Source rows are read with movdqu. Destination rows are accessed as pavgb
 * memory operands and movdqa stores, both of which require 16-byte-aligned
 * addresses.
 */
nengel@2 360 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
nengel@2 361 {
nengel@2 362 __asm__ volatile(
nengel@2 363 "1: \n\t"
nengel@2 364 "movdqu (%1), %%xmm0 \n\t"
nengel@2 365 "movdqu (%1,%3), %%xmm1 \n\t"
nengel@2 366 "movdqu (%1,%3,2), %%xmm2 \n\t"
nengel@2 367 "movdqu (%1,%4), %%xmm3 \n\t"
nengel@2 368 "pavgb (%2), %%xmm0 \n\t"
nengel@2 369 "pavgb (%2,%3), %%xmm1 \n\t"
nengel@2 370 "pavgb (%2,%3,2), %%xmm2 \n\t"
nengel@2 371 "pavgb (%2,%4), %%xmm3 \n\t"
nengel@2 372 "movdqa %%xmm0, (%2) \n\t"
nengel@2 373 "movdqa %%xmm1, (%2,%3) \n\t"
nengel@2 374 "movdqa %%xmm2, (%2,%3,2) \n\t"
nengel@2 375 "movdqa %%xmm3, (%2,%4) \n\t"
/* the flags set by subl are consumed by jnz below; lea does not modify flags */
nengel@2 376 "subl $4, %0 \n\t"
nengel@2 377 "lea (%1,%3,4), %1 \n\t"
nengel@2 378 "lea (%2,%3,4), %2 \n\t"
nengel@2 379 "jnz 1b \n\t"
nengel@2 380 : "+g"(h), "+r" (pixels), "+r" (block)
nengel@2 381 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
nengel@2 382 : "memory"
nengel@2 383 );
nengel@2 384 }
nengel@2 385
/*
 * Zero 128 bytes starting at block with eight 16-byte SSE stores.
 * movaps requires block to be 16-byte aligned. xmm0 is overwritten.
 */
nengel@2 386 static void clear_block_sse(DCTELEM *block)
nengel@2 387 {
nengel@2 388 __asm__ volatile(
nengel@2 389 "xorps %%xmm0, %%xmm0 \n"
nengel@2 390 "movaps %%xmm0, (%0) \n"
nengel@2 391 "movaps %%xmm0, 16(%0) \n"
nengel@2 392 "movaps %%xmm0, 32(%0) \n"
nengel@2 393 "movaps %%xmm0, 48(%0) \n"
nengel@2 394 "movaps %%xmm0, 64(%0) \n"
nengel@2 395 "movaps %%xmm0, 80(%0) \n"
nengel@2 396 "movaps %%xmm0, 96(%0) \n"
nengel@2 397 "movaps %%xmm0, 112(%0) \n"
nengel@2 398 :: "r"(block)
nengel@2 399 : "memory"
nengel@2 400 );
nengel@2 401 }
nengel@2 402
/*
 * Zero 6 * 128 = 768 bytes starting at blocks using 16-byte SSE stores.
 *
 * The asm receives a pointer to the END of the region and counts a negative
 * byte offset in the accumulator register (REG_a) from -768 up to 0 in steps
 * of 128; "js" keeps looping while the offset is still negative.
 * movaps requires blocks to be 16-byte aligned. xmm0 is overwritten.
 *
 * The asm writes memory through a pointer that is only an input operand, so
 * "memory" is listed as a clobber (as clear_block_sse does); without it the
 * compiler may keep or reorder accesses to the cleared region across the asm.
 */
nengel@2 403 static void clear_blocks_sse(DCTELEM *blocks)
nengel@2 404 {
nengel@2 405 __asm__ volatile(
nengel@2 406 "xorps %%xmm0, %%xmm0 \n"
nengel@2 407 "mov %1, %%"REG_a" \n"
nengel@2 408 "1: \n"
nengel@2 409 "movaps %%xmm0, (%0, %%"REG_a") \n"
nengel@2 410 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
nengel@2 411 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
nengel@2 412 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
nengel@2 413 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
nengel@2 414 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
nengel@2 415 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
nengel@2 416 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
nengel@2 417 "add $128, %%"REG_a" \n"
nengel@2 418 " js 1b \n"
nengel@2 419 : : "r" (((uint8_t *)blocks)+128*6),
nengel@2 420 "i" (-128*6)
nengel@2 421 : "%"REG_a, "memory"
nengel@2 422 );
nengel@2 423 }
nengel@2 424
/*
 * Transpose a 4x4 block of bytes.
 *
 * Four 32-bit rows are read from src (rows src_stride bytes apart), their
 * bytes are interleaved (punpcklbw) and then their 16-bit pairs
 * (punpcklwd / punpckhwd), and the four resulting 32-bit rows are written to
 * dst (rows dst_stride bytes apart). mm0-mm3 are overwritten.
 */
nengel@2 425 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
nengel@2 426 __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
nengel@2 427 "movd %4, %%mm0 \n\t"
nengel@2 428 "movd %5, %%mm1 \n\t"
nengel@2 429 "movd %6, %%mm2 \n\t"
nengel@2 430 "movd %7, %%mm3 \n\t"
nengel@2 431 "punpcklbw %%mm1, %%mm0 \n\t"
nengel@2 432 "punpcklbw %%mm3, %%mm2 \n\t"
nengel@2 433 "movq %%mm0, %%mm1 \n\t"
nengel@2 434 "punpcklwd %%mm2, %%mm0 \n\t"
nengel@2 435 "punpckhwd %%mm2, %%mm1 \n\t"
nengel@2 436 "movd %%mm0, %0 \n\t"
nengel@2 437 "punpckhdq %%mm0, %%mm0 \n\t"
nengel@2 438 "movd %%mm0, %1 \n\t"
nengel@2 439 "movd %%mm1, %2 \n\t"
nengel@2 440 "punpckhdq %%mm1, %%mm1 \n\t"
nengel@2 441 "movd %%mm1, %3 \n\t"
nengel@2 442
nengel@2 443 : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
nengel@2 444 "=m" (*(uint32_t*)(dst + 1*dst_stride)),
nengel@2 445 "=m" (*(uint32_t*)(dst + 2*dst_stride)),
nengel@2 446 "=m" (*(uint32_t*)(dst + 3*dst_stride))
nengel@2 447 : "m" (*(uint32_t*)(src + 0*src_stride)),
nengel@2 448 "m" (*(uint32_t*)(src + 1*src_stride)),
nengel@2 449 "m" (*(uint32_t*)(src + 2*src_stride)),
nengel@2 450 "m" (*(uint32_t*)(src + 3*src_stride))
nengel@2 451 );
nengel@2 452 }
nengel@2 453
/*
 * QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)
 *
 * Generates 32 static functions named
 *     OPNAME ## qpel8_mcXY_ ## MMX   and   OPNAME ## qpel16_mcXY_ ## MMX
 * for X, Y in 0..3, operating on 8x8 and 16x16 blocks respectively.
 *
 * As the bodies show, X selects the horizontal and Y the vertical case:
 *   mc00        plain OPNAME pixels copy/op
 *   mc20 / mc02 horizontal / vertical low-pass filter only
 *   mc10 / mc30 horizontal filter result combined (pixels*_l2) with the
 *               source at src / src+1
 *   mc01 / mc03 vertical filter result combined with the source at
 *               src / src+stride
 *   others      a horizontal pass over 9 (or 17) rows into a stack buffer,
 *               optionally combined with the source, followed by a vertical
 *               pass and, for odd Y, a final combination with halfH or
 *               halfH shifted by one row
 *
 * Macro parameters:
 *   OPNAME  name prefix of the final operation and of the generated functions
 *   RND     token pasted between "put" and the helper name for the
 *           intermediate passes
 *   MMX     instruction-set suffix of the helpers and generated functions
 *   ROUNDER, OP  not referenced anywhere in this macro body
 *
 * Temporary buffers are uint64_t arrays on the stack, addressed as bytes:
 * 8x8 uses rows of 8 bytes, 16x16 rows of 16 bytes; halfH holds 9 (17) rows
 * and halfHV holds 8 (16) rows.
 *
 * The helpers called here (pixels*_, pixels*_l2_, mpeg4_qpel*_h_lowpass_,
 * mpeg4_qpel*_v_lowpass_) are defined outside this macro.
 */
nengel@2 454 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
nengel@2 455 \
nengel@2 456 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
nengel@2 457 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
nengel@2 458 }\
nengel@2 459 \
nengel@2 460 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 461 uint64_t temp[8];\
nengel@2 462 uint8_t * const half= (uint8_t*)temp;\
nengel@2 463 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
nengel@2 464 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
nengel@2 465 }\
nengel@2 466 \
nengel@2 467 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 468 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
nengel@2 469 }\
nengel@2 470 \
nengel@2 471 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 472 uint64_t temp[8];\
nengel@2 473 uint8_t * const half= (uint8_t*)temp;\
nengel@2 474 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
nengel@2 475 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
nengel@2 476 }\
nengel@2 477 \
nengel@2 478 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 479 uint64_t temp[8];\
nengel@2 480 uint8_t * const half= (uint8_t*)temp;\
nengel@2 481 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
nengel@2 482 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
nengel@2 483 }\
nengel@2 484 \
nengel@2 485 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 486 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
nengel@2 487 }\
nengel@2 488 \
nengel@2 489 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 490 uint64_t temp[8];\
nengel@2 491 uint8_t * const half= (uint8_t*)temp;\
nengel@2 492 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
nengel@2 493 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
nengel@2 494 }\
nengel@2 495 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 496 uint64_t half[8 + 9];\
nengel@2 497 uint8_t * const halfH= ((uint8_t*)half) + 64;\
nengel@2 498 uint8_t * const halfHV= ((uint8_t*)half);\
nengel@2 499 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
nengel@2 500 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
nengel@2 501 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
nengel@2 502 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
nengel@2 503 }\
nengel@2 504 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 505 uint64_t half[8 + 9];\
nengel@2 506 uint8_t * const halfH= ((uint8_t*)half) + 64;\
nengel@2 507 uint8_t * const halfHV= ((uint8_t*)half);\
nengel@2 508 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
nengel@2 509 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
nengel@2 510 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
nengel@2 511 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
nengel@2 512 }\
nengel@2 513 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 514 uint64_t half[8 + 9];\
nengel@2 515 uint8_t * const halfH= ((uint8_t*)half) + 64;\
nengel@2 516 uint8_t * const halfHV= ((uint8_t*)half);\
nengel@2 517 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
nengel@2 518 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
nengel@2 519 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
nengel@2 520 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
nengel@2 521 }\
nengel@2 522 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 523 uint64_t half[8 + 9];\
nengel@2 524 uint8_t * const halfH= ((uint8_t*)half) + 64;\
nengel@2 525 uint8_t * const halfHV= ((uint8_t*)half);\
nengel@2 526 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
nengel@2 527 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
nengel@2 528 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
nengel@2 529 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
nengel@2 530 }\
nengel@2 531 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 532 uint64_t half[8 + 9];\
nengel@2 533 uint8_t * const halfH= ((uint8_t*)half) + 64;\
nengel@2 534 uint8_t * const halfHV= ((uint8_t*)half);\
nengel@2 535 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
nengel@2 536 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
nengel@2 537 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
nengel@2 538 }\
nengel@2 539 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 540 uint64_t half[8 + 9];\
nengel@2 541 uint8_t * const halfH= ((uint8_t*)half) + 64;\
nengel@2 542 uint8_t * const halfHV= ((uint8_t*)half);\
nengel@2 543 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
nengel@2 544 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
nengel@2 545 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
nengel@2 546 }\
nengel@2 547 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 548 uint64_t half[8 + 9];\
nengel@2 549 uint8_t * const halfH= ((uint8_t*)half);\
nengel@2 550 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
nengel@2 551 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
nengel@2 552 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
nengel@2 553 }\
nengel@2 554 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 555 uint64_t half[8 + 9];\
nengel@2 556 uint8_t * const halfH= ((uint8_t*)half);\
nengel@2 557 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
nengel@2 558 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
nengel@2 559 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
nengel@2 560 }\
nengel@2 561 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 562 uint64_t half[9];\
nengel@2 563 uint8_t * const halfH= ((uint8_t*)half);\
nengel@2 564 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
nengel@2 565 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
nengel@2 566 }\
nengel@2 567 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
nengel@2 568 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
nengel@2 569 }\
nengel@2 570 \
nengel@2 571 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 572 uint64_t temp[32];\
nengel@2 573 uint8_t * const half= (uint8_t*)temp;\
nengel@2 574 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
nengel@2 575 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
nengel@2 576 }\
nengel@2 577 \
nengel@2 578 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 579 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
nengel@2 580 }\
nengel@2 581 \
nengel@2 582 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 583 uint64_t temp[32];\
nengel@2 584 uint8_t * const half= (uint8_t*)temp;\
nengel@2 585 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
nengel@2 586 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
nengel@2 587 }\
nengel@2 588 \
nengel@2 589 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 590 uint64_t temp[32];\
nengel@2 591 uint8_t * const half= (uint8_t*)temp;\
nengel@2 592 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
nengel@2 593 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
nengel@2 594 }\
nengel@2 595 \
nengel@2 596 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 597 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
nengel@2 598 }\
nengel@2 599 \
nengel@2 600 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 601 uint64_t temp[32];\
nengel@2 602 uint8_t * const half= (uint8_t*)temp;\
nengel@2 603 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
nengel@2 604 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
nengel@2 605 }\
nengel@2 606 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 607 uint64_t half[16*2 + 17*2];\
nengel@2 608 uint8_t * const halfH= ((uint8_t*)half) + 256;\
nengel@2 609 uint8_t * const halfHV= ((uint8_t*)half);\
nengel@2 610 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
nengel@2 611 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
nengel@2 612 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
nengel@2 613 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
nengel@2 614 }\
nengel@2 615 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 616 uint64_t half[16*2 + 17*2];\
nengel@2 617 uint8_t * const halfH= ((uint8_t*)half) + 256;\
nengel@2 618 uint8_t * const halfHV= ((uint8_t*)half);\
nengel@2 619 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
nengel@2 620 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
nengel@2 621 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
nengel@2 622 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
nengel@2 623 }\
nengel@2 624 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 625 uint64_t half[16*2 + 17*2];\
nengel@2 626 uint8_t * const halfH= ((uint8_t*)half) + 256;\
nengel@2 627 uint8_t * const halfHV= ((uint8_t*)half);\
nengel@2 628 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
nengel@2 629 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
nengel@2 630 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
nengel@2 631 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
nengel@2 632 }\
nengel@2 633 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 634 uint64_t half[16*2 + 17*2];\
nengel@2 635 uint8_t * const halfH= ((uint8_t*)half) + 256;\
nengel@2 636 uint8_t * const halfHV= ((uint8_t*)half);\
nengel@2 637 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
nengel@2 638 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
nengel@2 639 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
nengel@2 640 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
nengel@2 641 }\
nengel@2 642 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 643 uint64_t half[16*2 + 17*2];\
nengel@2 644 uint8_t * const halfH= ((uint8_t*)half) + 256;\
nengel@2 645 uint8_t * const halfHV= ((uint8_t*)half);\
nengel@2 646 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
nengel@2 647 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
nengel@2 648 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
nengel@2 649 }\
nengel@2 650 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 651 uint64_t half[16*2 + 17*2];\
nengel@2 652 uint8_t * const halfH= ((uint8_t*)half) + 256;\
nengel@2 653 uint8_t * const halfHV= ((uint8_t*)half);\
nengel@2 654 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
nengel@2 655 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
nengel@2 656 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
nengel@2 657 }\
nengel@2 658 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 659 uint64_t half[17*2];\
nengel@2 660 uint8_t * const halfH= ((uint8_t*)half);\
nengel@2 661 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
nengel@2 662 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
nengel@2 663 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
nengel@2 664 }\
nengel@2 665 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 666 uint64_t half[17*2];\
nengel@2 667 uint8_t * const halfH= ((uint8_t*)half);\
nengel@2 668 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
nengel@2 669 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
nengel@2 670 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
nengel@2 671 }\
nengel@2 672 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
nengel@2 673 uint64_t half[17*2];\
nengel@2 674 uint8_t * const halfH= ((uint8_t*)half);\
nengel@2 675 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
nengel@2 676 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
nengel@2 677 }
nengel@2 678
/*
 * Store/average templates.  Each macro expands to a string literal of
 * assembler text (AT&T operand order: source first, destination last)
 * that is meant to be pasted into an inline-asm statement.
 *
 *   src     - operand holding the freshly computed value
 *   dst     - operand that receives the result
 *   scratch - temporary operand (ignored by PUT_OP)
 *   sfx     - suffix glued onto "mov", e.g. q yields "movq"
 */

/* Plain store: mov<sfx> src, dst. */
#define PUT_OP(src, dst, scratch, sfx) \
    "mov" #sfx " " #src ", " #dst " \n\t"

/* Load dst into scratch, pavgusb it into src, then store src back to dst. */
#define AVG_3DNOW_OP(src, dst, scratch, sfx) \
    "mov" #sfx " " #dst ", " #scratch " \n\t" \
    "pavgusb " #scratch ", " #src " \n\t" \
    "mov" #sfx " " #src ", " #dst " \n\t"

/* Same sequence as AVG_3DNOW_OP, but averaging with pavgb. */
#define AVG_MMX2_OP(src, dst, scratch, sfx) \
    "mov" #sfx " " #dst ", " #scratch " \n\t" \
    "pavgb " #scratch ", " #src " \n\t" \
    "mov" #sfx " " #src ", " #dst " \n\t"
nengel@2 688
/**
 * Issue a prefetcht0 hint for the first byte of each of h rows.
 *
 * @param mem    address of the first row
 * @param stride distance in bytes from one row to the next
 * @param h      number of rows; nothing is prefetched when h <= 0
 */
static void prefetch_mmx2(void *mem, int stride, int h)
{
    const uint8_t *row = mem;

    /* Pre-checked loop so that h <= 0 is a no-op rather than a countdown
     * that only ends once h has wrapped around to zero. */
    for (; h > 0; h--) {
        /* The "m" operand hands the row address to the instruction. */
        __asm__ volatile("prefetcht0 %0" :: "m"(*row));
        row += stride;
    }
}
nengel@2 699
nengel@2 700 #include "h264dsp_mmx.c"
nengel@2 701
/*
 * H.264 luma deblocking routines that are defined outside this file
 * (presumably the x264-derived assembly -- confirm against the build).
 * NOTE(review): nothing in the visible part of this file references
 * these declarations.
 */

/* Variants taking a tc0 table in addition to alpha/beta. */
void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride,
                                 int alpha, int beta, int8_t *tc0);
void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride,
                                 int alpha, int beta, int8_t *tc0);

/* Intra variants: alpha/beta only, no tc0 table. */
void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride,
                                         int alpha, int beta);
void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride,
                                       int alpha, int beta);
void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride,
                                       int alpha, int beta);
nengel@2 707
nengel@2 708 void dsputil_init_mmx(DSPContext* c)
nengel@2 709 {
nengel@2 710 mm_flags = mm_support();
nengel@2 711
nengel@2 712 if (mm_flags & FF_MM_MMX) {
nengel@2 713 c->clear_block = clear_block_sse;
nengel@2 714 c->clear_blocks = clear_blocks_sse;
nengel@2 715 c->prefetch = prefetch_mmx2;
nengel@2 716
nengel@2 717
nengel@2 718 #define H264_QPEL_FUNCS(x, y, CPU)\
nengel@2 719 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
nengel@2 720 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
nengel@2 721 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
nengel@2 722 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
nengel@2 723
nengel@2 724 if((mm_flags & FF_MM_SSE2)){
nengel@2 725 c->put_pixels_tab[0][0] = put_pixels16_sse2;
nengel@2 726 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
nengel@2 727
nengel@2 728 }
nengel@2 729 if(mm_flags & FF_MM_SSE2){
nengel@2 730 H264_QPEL_FUNCS(0, 1, sse2);
nengel@2 731 H264_QPEL_FUNCS(0, 2, sse2);
nengel@2 732 H264_QPEL_FUNCS(0, 3, sse2);
nengel@2 733 H264_QPEL_FUNCS(1, 1, sse2);
nengel@2 734 H264_QPEL_FUNCS(1, 2, sse2);
nengel@2 735 H264_QPEL_FUNCS(1, 3, sse2);
nengel@2 736 H264_QPEL_FUNCS(2, 1, sse2);
nengel@2 737 H264_QPEL_FUNCS(2, 2, sse2);
nengel@2 738 H264_QPEL_FUNCS(2, 3, sse2);
nengel@2 739 H264_QPEL_FUNCS(3, 1, sse2);
nengel@2 740 H264_QPEL_FUNCS(3, 2, sse2);
nengel@2 741 H264_QPEL_FUNCS(3, 3, sse2);
nengel@2 742 }
nengel@2 743 #if HAVE_SSSE3
nengel@2 744 if(mm_flags & FF_MM_SSSE3){
nengel@2 745 H264_QPEL_FUNCS(1, 0, ssse3);
nengel@2 746 H264_QPEL_FUNCS(1, 1, ssse3);
nengel@2 747 H264_QPEL_FUNCS(1, 2, ssse3);
nengel@2 748 H264_QPEL_FUNCS(1, 3, ssse3);
nengel@2 749 H264_QPEL_FUNCS(2, 0, ssse3);
nengel@2 750 H264_QPEL_FUNCS(2, 1, ssse3);
nengel@2 751 H264_QPEL_FUNCS(2, 2, ssse3);
nengel@2 752 H264_QPEL_FUNCS(2, 3, ssse3);
nengel@2 753 H264_QPEL_FUNCS(3, 0, ssse3);
nengel@2 754 H264_QPEL_FUNCS(3, 1, ssse3);
nengel@2 755 H264_QPEL_FUNCS(3, 2, ssse3);
nengel@2 756 H264_QPEL_FUNCS(3, 3, ssse3);
nengel@2 757
nengel@2 758 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd;
nengel@2 759 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd;
nengel@2 760 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
nengel@2 761 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
nengel@2 762 }
nengel@2 763 #endif
nengel@2 764
nengel@2 765
nengel@2 766 }
nengel@2 767 }
nengel@2 768
nengel@2 769 void ff_h264dsp_init_x86(H264DSPContext *c)
nengel@2 770 {
nengel@2 771 mm_flags = mm_support();
nengel@2 772
nengel@2 773 if (mm_flags & FF_MM_MMX) {
nengel@2 774 c->h264_idct_dc_add=
nengel@2 775 c->h264_idct_add= ff_h264_idct_add_mmx;
nengel@2 776 c->h264_idct8_dc_add=
nengel@2 777 c->h264_idct8_add= ff_h264_idct8_add_mmx;
nengel@2 778
nengel@2 779 if (mm_flags & FF_MM_MMX2) {
nengel@2 780 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
nengel@2 781 c->h264_idct_add8 = ff_h264_idct_add8_mmx2;
nengel@2 782 c->h264_idct_add16 = ff_h264_idct_add16_mmx2;
nengel@2 783 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
nengel@2 784
nengel@2 785 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
nengel@2 786 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2;
nengel@2 787
nengel@2 788 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
nengel@2 789 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
nengel@2 790 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
nengel@2 791 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
nengel@2 792 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
nengel@2 793 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
nengel@2 794 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
nengel@2 795
nengel@2 796 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
nengel@2 797 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
nengel@2 798 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
nengel@2 799 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
nengel@2 800 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
nengel@2 801 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
nengel@2 802 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
nengel@2 803 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
nengel@2 804
nengel@2 805 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
nengel@2 806 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
nengel@2 807 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
nengel@2 808 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
nengel@2 809 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
nengel@2 810 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
nengel@2 811 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
nengel@2 812 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
nengel@2 813 }
nengel@2 814 if(mm_flags & FF_MM_SSE2){
nengel@2 815 c->h264_idct8_add = ff_h264_idct8_add_sse2;
nengel@2 816 c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
nengel@2 817 }
nengel@2 818
nengel@2 819 }
nengel@2 820 }
nengel@2 821