annotate libavcodec/ppc/dsputil_altivec.c @ 9:ea1ba68cf0ed

update to match api changes + add sscc produced source
author Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date Wed, 05 Jun 2013 14:43:26 +0200
parents
children
rev   line source
nengel@2 1 /*
nengel@2 2 * Copyright (c) 2002 Brian Foley
nengel@2 3 * Copyright (c) 2002 Dieter Shirley
nengel@2 4 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
nengel@2 5 *
nengel@2 6 * This file is part of FFmpeg.
nengel@2 7 *
nengel@2 8 * FFmpeg is free software; you can redistribute it and/or
nengel@2 9 * modify it under the terms of the GNU Lesser General Public
nengel@2 10 * License as published by the Free Software Foundation; either
nengel@2 11 * version 2.1 of the License, or (at your option) any later version.
nengel@2 12 *
nengel@2 13 * FFmpeg is distributed in the hope that it will be useful,
nengel@2 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
nengel@2 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
nengel@2 16 * Lesser General Public License for more details.
nengel@2 17 *
nengel@2 18 * You should have received a copy of the GNU Lesser General Public
nengel@2 19 * License along with FFmpeg; if not, write to the Free Software
nengel@2 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
nengel@2 21 */
nengel@2 22
nengel@2 23 #include "config.h"
nengel@2 24 #if HAVE_ALTIVEC_H
nengel@2 25 #include <altivec.h>
nengel@2 26 #endif
nengel@2 27 #include "libavcodec/dsputil.h"
nengel@2 28 #include "dsputil_ppc.h"
nengel@2 29 #include "util_altivec.h"
nengel@2 30 #include "types_altivec.h"
nengel@2 31 #include "dsputil_altivec.h"
nengel@2 32
nengel@2 33
nengel@2 34 static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
nengel@2 35 {
nengel@2 36 int i;
nengel@2 37 vector unsigned char perm, bytes, *pixv;
nengel@2 38 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
nengel@2 39 vector signed short shorts;
nengel@2 40
nengel@2 41 for (i = 0; i < 8; i++) {
nengel@2 42 // Read potentially unaligned pixels.
nengel@2 43 // We're reading 16 pixels, and actually only want 8,
nengel@2 44 // but we simply ignore the extras.
nengel@2 45 perm = vec_lvsl(0, pixels);
nengel@2 46 pixv = (vector unsigned char *) pixels;
nengel@2 47 bytes = vec_perm(pixv[0], pixv[1], perm);
nengel@2 48
nengel@2 49 // convert the bytes into shorts
nengel@2 50 shorts = (vector signed short)vec_mergeh(zero, bytes);
nengel@2 51
nengel@2 52 // save the data to the block, we assume the block is 16-byte aligned
nengel@2 53 vec_st(shorts, i*16, (vector signed short*)block);
nengel@2 54
nengel@2 55 pixels += line_size;
nengel@2 56 }
nengel@2 57 }
nengel@2 58
nengel@2 59 static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
nengel@2 60 const uint8_t *s2, int stride)
nengel@2 61 {
nengel@2 62 int i;
nengel@2 63 vector unsigned char perm, bytes, *pixv;
nengel@2 64 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
nengel@2 65 vector signed short shorts1, shorts2;
nengel@2 66
nengel@2 67 for (i = 0; i < 4; i++) {
nengel@2 68 // Read potentially unaligned pixels
nengel@2 69 // We're reading 16 pixels, and actually only want 8,
nengel@2 70 // but we simply ignore the extras.
nengel@2 71 perm = vec_lvsl(0, s1);
nengel@2 72 pixv = (vector unsigned char *) s1;
nengel@2 73 bytes = vec_perm(pixv[0], pixv[1], perm);
nengel@2 74
nengel@2 75 // convert the bytes into shorts
nengel@2 76 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
nengel@2 77
nengel@2 78 // Do the same for the second block of pixels
nengel@2 79 perm = vec_lvsl(0, s2);
nengel@2 80 pixv = (vector unsigned char *) s2;
nengel@2 81 bytes = vec_perm(pixv[0], pixv[1], perm);
nengel@2 82
nengel@2 83 // convert the bytes into shorts
nengel@2 84 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
nengel@2 85
nengel@2 86 // Do the subtraction
nengel@2 87 shorts1 = vec_sub(shorts1, shorts2);
nengel@2 88
nengel@2 89 // save the data to the block, we assume the block is 16-byte aligned
nengel@2 90 vec_st(shorts1, 0, (vector signed short*)block);
nengel@2 91
nengel@2 92 s1 += stride;
nengel@2 93 s2 += stride;
nengel@2 94 block += 8;
nengel@2 95
nengel@2 96
nengel@2 97 // The code below is a copy of the code above... This is a manual
nengel@2 98 // unroll.
nengel@2 99
nengel@2 100 // Read potentially unaligned pixels
nengel@2 101 // We're reading 16 pixels, and actually only want 8,
nengel@2 102 // but we simply ignore the extras.
nengel@2 103 perm = vec_lvsl(0, s1);
nengel@2 104 pixv = (vector unsigned char *) s1;
nengel@2 105 bytes = vec_perm(pixv[0], pixv[1], perm);
nengel@2 106
nengel@2 107 // convert the bytes into shorts
nengel@2 108 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
nengel@2 109
nengel@2 110 // Do the same for the second block of pixels
nengel@2 111 perm = vec_lvsl(0, s2);
nengel@2 112 pixv = (vector unsigned char *) s2;
nengel@2 113 bytes = vec_perm(pixv[0], pixv[1], perm);
nengel@2 114
nengel@2 115 // convert the bytes into shorts
nengel@2 116 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
nengel@2 117
nengel@2 118 // Do the subtraction
nengel@2 119 shorts1 = vec_sub(shorts1, shorts2);
nengel@2 120
nengel@2 121 // save the data to the block, we assume the block is 16-byte aligned
nengel@2 122 vec_st(shorts1, 0, (vector signed short*)block);
nengel@2 123
nengel@2 124 s1 += stride;
nengel@2 125 s2 += stride;
nengel@2 126 block += 8;
nengel@2 127 }
nengel@2 128 }
nengel@2 129
nengel@2 130
nengel@2 131 static void clear_block_altivec(DCTELEM *block) {
nengel@2 132 LOAD_ZERO;
nengel@2 133 vec_st(zero_s16v, 0, block);
nengel@2 134 vec_st(zero_s16v, 16, block);
nengel@2 135 vec_st(zero_s16v, 32, block);
nengel@2 136 vec_st(zero_s16v, 48, block);
nengel@2 137 vec_st(zero_s16v, 64, block);
nengel@2 138 vec_st(zero_s16v, 80, block);
nengel@2 139 vec_st(zero_s16v, 96, block);
nengel@2 140 vec_st(zero_s16v, 112, block);
nengel@2 141 }
nengel@2 142
nengel@2 143
nengel@2 144
nengel@2 145 /* next one assumes that ((line_size % 16) == 0) */
nengel@2 146 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
nengel@2 147 {
nengel@2 148 POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
nengel@2 149 register vector unsigned char pixelsv1, pixelsv2;
nengel@2 150 register vector unsigned char pixelsv1B, pixelsv2B;
nengel@2 151 register vector unsigned char pixelsv1C, pixelsv2C;
nengel@2 152 register vector unsigned char pixelsv1D, pixelsv2D;
nengel@2 153
nengel@2 154 register vector unsigned char perm = vec_lvsl(0, pixels);
nengel@2 155 int i;
nengel@2 156 register int line_size_2 = line_size << 1;
nengel@2 157 register int line_size_3 = line_size + line_size_2;
nengel@2 158 register int line_size_4 = line_size << 2;
nengel@2 159
nengel@2 160 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
nengel@2 161 // hand-unrolling the loop by 4 gains about 15%
nengel@2 162 // mininum execution time goes from 74 to 60 cycles
nengel@2 163 // it's faster than -funroll-loops, but using
nengel@2 164 // -funroll-loops w/ this is bad - 74 cycles again.
nengel@2 165 // all this is on a 7450, tuning for the 7450
nengel@2 166 #if 0
nengel@2 167 for (i = 0; i < h; i++) {
nengel@2 168 pixelsv1 = vec_ld(0, pixels);
nengel@2 169 pixelsv2 = vec_ld(16, pixels);
nengel@2 170 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
nengel@2 171 0, block);
nengel@2 172 pixels+=line_size;
nengel@2 173 block +=line_size;
nengel@2 174 }
nengel@2 175 #else
nengel@2 176 for (i = 0; i < h; i += 4) {
nengel@2 177 pixelsv1 = vec_ld( 0, pixels);
nengel@2 178 pixelsv2 = vec_ld(15, pixels);
nengel@2 179 pixelsv1B = vec_ld(line_size, pixels);
nengel@2 180 pixelsv2B = vec_ld(15 + line_size, pixels);
nengel@2 181 pixelsv1C = vec_ld(line_size_2, pixels);
nengel@2 182 pixelsv2C = vec_ld(15 + line_size_2, pixels);
nengel@2 183 pixelsv1D = vec_ld(line_size_3, pixels);
nengel@2 184 pixelsv2D = vec_ld(15 + line_size_3, pixels);
nengel@2 185 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
nengel@2 186 0, (unsigned char*)block);
nengel@2 187 vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
nengel@2 188 line_size, (unsigned char*)block);
nengel@2 189 vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
nengel@2 190 line_size_2, (unsigned char*)block);
nengel@2 191 vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
nengel@2 192 line_size_3, (unsigned char*)block);
nengel@2 193 pixels+=line_size_4;
nengel@2 194 block +=line_size_4;
nengel@2 195 }
nengel@2 196 #endif
nengel@2 197 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
nengel@2 198 }
nengel@2 199
nengel@2 200 /* next one assumes that ((line_size % 16) == 0) */
nengel@2 201 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
nengel@2 202 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
nengel@2 203 {
nengel@2 204 POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
nengel@2 205 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
nengel@2 206 register vector unsigned char perm = vec_lvsl(0, pixels);
nengel@2 207 int i;
nengel@2 208
nengel@2 209 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
nengel@2 210
nengel@2 211 for (i = 0; i < h; i++) {
nengel@2 212 pixelsv1 = vec_ld( 0, pixels);
nengel@2 213 pixelsv2 = vec_ld(16,pixels);
nengel@2 214 blockv = vec_ld(0, block);
nengel@2 215 pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
nengel@2 216 blockv = vec_avg(blockv,pixelsv);
nengel@2 217 vec_st(blockv, 0, (unsigned char*)block);
nengel@2 218 pixels+=line_size;
nengel@2 219 block +=line_size;
nengel@2 220 }
nengel@2 221
nengel@2 222 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
nengel@2 223 }
nengel@2 224
nengel@2 225 /* next one assumes that ((line_size % 8) == 0) */
nengel@2 226 static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
nengel@2 227 {
nengel@2 228 POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
nengel@2 229 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
nengel@2 230 int i;
nengel@2 231
nengel@2 232 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
nengel@2 233
nengel@2 234 for (i = 0; i < h; i++) {
nengel@2 235 /* block is 8 bytes-aligned, so we're either in the
nengel@2 236 left block (16 bytes-aligned) or in the right block (not) */
nengel@2 237 int rightside = ((unsigned long)block & 0x0000000F);
nengel@2 238
nengel@2 239 blockv = vec_ld(0, block);
nengel@2 240 pixelsv1 = vec_ld( 0, pixels);
nengel@2 241 pixelsv2 = vec_ld(16, pixels);
nengel@2 242 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
nengel@2 243
nengel@2 244 if (rightside) {
nengel@2 245 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
nengel@2 246 } else {
nengel@2 247 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
nengel@2 248 }
nengel@2 249
nengel@2 250 blockv = vec_avg(blockv, pixelsv);
nengel@2 251
nengel@2 252 vec_st(blockv, 0, block);
nengel@2 253
nengel@2 254 pixels += line_size;
nengel@2 255 block += line_size;
nengel@2 256 }
nengel@2 257
nengel@2 258 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
nengel@2 259 }
nengel@2 260
nengel@2 261 /* next one assumes that ((line_size % 8) == 0) */
nengel@2 262 static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
nengel@2 263 {
nengel@2 264 POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
nengel@2 265 register int i;
nengel@2 266 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
nengel@2 267 register vector unsigned char blockv, temp1, temp2;
nengel@2 268 register vector unsigned short pixelssum1, pixelssum2, temp3;
nengel@2 269 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
nengel@2 270 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
nengel@2 271
nengel@2 272 temp1 = vec_ld(0, pixels);
nengel@2 273 temp2 = vec_ld(16, pixels);
nengel@2 274 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
nengel@2 275 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
nengel@2 276 pixelsv2 = temp2;
nengel@2 277 } else {
nengel@2 278 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
nengel@2 279 }
nengel@2 280 pixelsv1 = vec_mergeh(vczero, pixelsv1);
nengel@2 281 pixelsv2 = vec_mergeh(vczero, pixelsv2);
nengel@2 282 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
nengel@2 283 (vector unsigned short)pixelsv2);
nengel@2 284 pixelssum1 = vec_add(pixelssum1, vctwo);
nengel@2 285
nengel@2 286 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
nengel@2 287 for (i = 0; i < h ; i++) {
nengel@2 288 int rightside = ((unsigned long)block & 0x0000000F);
nengel@2 289 blockv = vec_ld(0, block);
nengel@2 290
nengel@2 291 temp1 = vec_ld(line_size, pixels);
nengel@2 292 temp2 = vec_ld(line_size + 16, pixels);
nengel@2 293 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
nengel@2 294 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
nengel@2 295 pixelsv2 = temp2;
nengel@2 296 } else {
nengel@2 297 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
nengel@2 298 }
nengel@2 299
nengel@2 300 pixelsv1 = vec_mergeh(vczero, pixelsv1);
nengel@2 301 pixelsv2 = vec_mergeh(vczero, pixelsv2);
nengel@2 302 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
nengel@2 303 (vector unsigned short)pixelsv2);
nengel@2 304 temp3 = vec_add(pixelssum1, pixelssum2);
nengel@2 305 temp3 = vec_sra(temp3, vctwo);
nengel@2 306 pixelssum1 = vec_add(pixelssum2, vctwo);
nengel@2 307 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
nengel@2 308
nengel@2 309 if (rightside) {
nengel@2 310 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
nengel@2 311 } else {
nengel@2 312 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
nengel@2 313 }
nengel@2 314
nengel@2 315 vec_st(blockv, 0, block);
nengel@2 316
nengel@2 317 block += line_size;
nengel@2 318 pixels += line_size;
nengel@2 319 }
nengel@2 320
nengel@2 321 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
nengel@2 322 }
nengel@2 323
nengel@2 324 /* next one assumes that ((line_size % 8) == 0) */
nengel@2 325 static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
nengel@2 326 {
nengel@2 327 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
nengel@2 328 register int i;
nengel@2 329 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
nengel@2 330 register vector unsigned char blockv, temp1, temp2;
nengel@2 331 register vector unsigned short pixelssum1, pixelssum2, temp3;
nengel@2 332 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
nengel@2 333 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
nengel@2 334 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
nengel@2 335
nengel@2 336 temp1 = vec_ld(0, pixels);
nengel@2 337 temp2 = vec_ld(16, pixels);
nengel@2 338 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
nengel@2 339 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
nengel@2 340 pixelsv2 = temp2;
nengel@2 341 } else {
nengel@2 342 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
nengel@2 343 }
nengel@2 344 pixelsv1 = vec_mergeh(vczero, pixelsv1);
nengel@2 345 pixelsv2 = vec_mergeh(vczero, pixelsv2);
nengel@2 346 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
nengel@2 347 (vector unsigned short)pixelsv2);
nengel@2 348 pixelssum1 = vec_add(pixelssum1, vcone);
nengel@2 349
nengel@2 350 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
nengel@2 351 for (i = 0; i < h ; i++) {
nengel@2 352 int rightside = ((unsigned long)block & 0x0000000F);
nengel@2 353 blockv = vec_ld(0, block);
nengel@2 354
nengel@2 355 temp1 = vec_ld(line_size, pixels);
nengel@2 356 temp2 = vec_ld(line_size + 16, pixels);
nengel@2 357 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
nengel@2 358 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
nengel@2 359 pixelsv2 = temp2;
nengel@2 360 } else {
nengel@2 361 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
nengel@2 362 }
nengel@2 363
nengel@2 364 pixelsv1 = vec_mergeh(vczero, pixelsv1);
nengel@2 365 pixelsv2 = vec_mergeh(vczero, pixelsv2);
nengel@2 366 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
nengel@2 367 (vector unsigned short)pixelsv2);
nengel@2 368 temp3 = vec_add(pixelssum1, pixelssum2);
nengel@2 369 temp3 = vec_sra(temp3, vctwo);
nengel@2 370 pixelssum1 = vec_add(pixelssum2, vcone);
nengel@2 371 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
nengel@2 372
nengel@2 373 if (rightside) {
nengel@2 374 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
nengel@2 375 } else {
nengel@2 376 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
nengel@2 377 }
nengel@2 378
nengel@2 379 vec_st(blockv, 0, block);
nengel@2 380
nengel@2 381 block += line_size;
nengel@2 382 pixels += line_size;
nengel@2 383 }
nengel@2 384
nengel@2 385 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
nengel@2 386 }
nengel@2 387
nengel@2 388 /* next one assumes that ((line_size % 16) == 0) */
nengel@2 389 static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
nengel@2 390 {
nengel@2 391 POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
nengel@2 392 register int i;
nengel@2 393 register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
nengel@2 394 register vector unsigned char blockv, temp1, temp2;
nengel@2 395 register vector unsigned short temp3, temp4,
nengel@2 396 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
nengel@2 397 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
nengel@2 398 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
nengel@2 399
nengel@2 400 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
nengel@2 401
nengel@2 402 temp1 = vec_ld(0, pixels);
nengel@2 403 temp2 = vec_ld(16, pixels);
nengel@2 404 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
nengel@2 405 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
nengel@2 406 pixelsv2 = temp2;
nengel@2 407 } else {
nengel@2 408 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
nengel@2 409 }
nengel@2 410 pixelsv3 = vec_mergel(vczero, pixelsv1);
nengel@2 411 pixelsv4 = vec_mergel(vczero, pixelsv2);
nengel@2 412 pixelsv1 = vec_mergeh(vczero, pixelsv1);
nengel@2 413 pixelsv2 = vec_mergeh(vczero, pixelsv2);
nengel@2 414 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
nengel@2 415 (vector unsigned short)pixelsv4);
nengel@2 416 pixelssum3 = vec_add(pixelssum3, vctwo);
nengel@2 417 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
nengel@2 418 (vector unsigned short)pixelsv2);
nengel@2 419 pixelssum1 = vec_add(pixelssum1, vctwo);
nengel@2 420
nengel@2 421 for (i = 0; i < h ; i++) {
nengel@2 422 blockv = vec_ld(0, block);
nengel@2 423
nengel@2 424 temp1 = vec_ld(line_size, pixels);
nengel@2 425 temp2 = vec_ld(line_size + 16, pixels);
nengel@2 426 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
nengel@2 427 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
nengel@2 428 pixelsv2 = temp2;
nengel@2 429 } else {
nengel@2 430 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
nengel@2 431 }
nengel@2 432
nengel@2 433 pixelsv3 = vec_mergel(vczero, pixelsv1);
nengel@2 434 pixelsv4 = vec_mergel(vczero, pixelsv2);
nengel@2 435 pixelsv1 = vec_mergeh(vczero, pixelsv1);
nengel@2 436 pixelsv2 = vec_mergeh(vczero, pixelsv2);
nengel@2 437
nengel@2 438 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
nengel@2 439 (vector unsigned short)pixelsv4);
nengel@2 440 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
nengel@2 441 (vector unsigned short)pixelsv2);
nengel@2 442 temp4 = vec_add(pixelssum3, pixelssum4);
nengel@2 443 temp4 = vec_sra(temp4, vctwo);
nengel@2 444 temp3 = vec_add(pixelssum1, pixelssum2);
nengel@2 445 temp3 = vec_sra(temp3, vctwo);
nengel@2 446
nengel@2 447 pixelssum3 = vec_add(pixelssum4, vctwo);
nengel@2 448 pixelssum1 = vec_add(pixelssum2, vctwo);
nengel@2 449
nengel@2 450 blockv = vec_packsu(temp3, temp4);
nengel@2 451
nengel@2 452 vec_st(blockv, 0, block);
nengel@2 453
nengel@2 454 block += line_size;
nengel@2 455 pixels += line_size;
nengel@2 456 }
nengel@2 457
nengel@2 458 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
nengel@2 459 }
nengel@2 460
nengel@2 461 /* next one assumes that ((line_size % 16) == 0) */
nengel@2 462 static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
nengel@2 463 {
nengel@2 464 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
nengel@2 465 register int i;
nengel@2 466 register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
nengel@2 467 register vector unsigned char blockv, temp1, temp2;
nengel@2 468 register vector unsigned short temp3, temp4,
nengel@2 469 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
nengel@2 470 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
nengel@2 471 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
nengel@2 472 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
nengel@2 473
nengel@2 474 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
nengel@2 475
nengel@2 476 temp1 = vec_ld(0, pixels);
nengel@2 477 temp2 = vec_ld(16, pixels);
nengel@2 478 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
nengel@2 479 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
nengel@2 480 pixelsv2 = temp2;
nengel@2 481 } else {
nengel@2 482 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
nengel@2 483 }
nengel@2 484 pixelsv3 = vec_mergel(vczero, pixelsv1);
nengel@2 485 pixelsv4 = vec_mergel(vczero, pixelsv2);
nengel@2 486 pixelsv1 = vec_mergeh(vczero, pixelsv1);
nengel@2 487 pixelsv2 = vec_mergeh(vczero, pixelsv2);
nengel@2 488 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
nengel@2 489 (vector unsigned short)pixelsv4);
nengel@2 490 pixelssum3 = vec_add(pixelssum3, vcone);
nengel@2 491 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
nengel@2 492 (vector unsigned short)pixelsv2);
nengel@2 493 pixelssum1 = vec_add(pixelssum1, vcone);
nengel@2 494
nengel@2 495 for (i = 0; i < h ; i++) {
nengel@2 496 blockv = vec_ld(0, block);
nengel@2 497
nengel@2 498 temp1 = vec_ld(line_size, pixels);
nengel@2 499 temp2 = vec_ld(line_size + 16, pixels);
nengel@2 500 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
nengel@2 501 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
nengel@2 502 pixelsv2 = temp2;
nengel@2 503 } else {
nengel@2 504 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
nengel@2 505 }
nengel@2 506
nengel@2 507 pixelsv3 = vec_mergel(vczero, pixelsv1);
nengel@2 508 pixelsv4 = vec_mergel(vczero, pixelsv2);
nengel@2 509 pixelsv1 = vec_mergeh(vczero, pixelsv1);
nengel@2 510 pixelsv2 = vec_mergeh(vczero, pixelsv2);
nengel@2 511
nengel@2 512 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
nengel@2 513 (vector unsigned short)pixelsv4);
nengel@2 514 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
nengel@2 515 (vector unsigned short)pixelsv2);
nengel@2 516 temp4 = vec_add(pixelssum3, pixelssum4);
nengel@2 517 temp4 = vec_sra(temp4, vctwo);
nengel@2 518 temp3 = vec_add(pixelssum1, pixelssum2);
nengel@2 519 temp3 = vec_sra(temp3, vctwo);
nengel@2 520
nengel@2 521 pixelssum3 = vec_add(pixelssum4, vcone);
nengel@2 522 pixelssum1 = vec_add(pixelssum2, vcone);
nengel@2 523
nengel@2 524 blockv = vec_packsu(temp3, temp4);
nengel@2 525
nengel@2 526 vec_st(blockv, 0, block);
nengel@2 527
nengel@2 528 block += line_size;
nengel@2 529 pixels += line_size;
nengel@2 530 }
nengel@2 531
nengel@2 532 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
nengel@2 533 }
nengel@2 534
nengel@2 535 /* next one assumes that ((line_size % 8) == 0) */
nengel@2 536 static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
nengel@2 537 {
nengel@2 538 POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
nengel@2 539 register int i;
nengel@2 540 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
nengel@2 541 register vector unsigned char blockv, temp1, temp2, blocktemp;
nengel@2 542 register vector unsigned short pixelssum1, pixelssum2, temp3;
nengel@2 543
nengel@2 544 register const vector unsigned char vczero = (const vector unsigned char)
nengel@2 545 vec_splat_u8(0);
nengel@2 546 register const vector unsigned short vctwo = (const vector unsigned short)
nengel@2 547 vec_splat_u16(2);
nengel@2 548
nengel@2 549 temp1 = vec_ld(0, pixels);
nengel@2 550 temp2 = vec_ld(16, pixels);
nengel@2 551 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
nengel@2 552 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
nengel@2 553 pixelsv2 = temp2;
nengel@2 554 } else {
nengel@2 555 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
nengel@2 556 }
nengel@2 557 pixelsv1 = vec_mergeh(vczero, pixelsv1);
nengel@2 558 pixelsv2 = vec_mergeh(vczero, pixelsv2);
nengel@2 559 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
nengel@2 560 (vector unsigned short)pixelsv2);
nengel@2 561 pixelssum1 = vec_add(pixelssum1, vctwo);
nengel@2 562
nengel@2 563 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
nengel@2 564 for (i = 0; i < h ; i++) {
nengel@2 565 int rightside = ((unsigned long)block & 0x0000000F);
nengel@2 566 blockv = vec_ld(0, block);
nengel@2 567
nengel@2 568 temp1 = vec_ld(line_size, pixels);
nengel@2 569 temp2 = vec_ld(line_size + 16, pixels);
nengel@2 570 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
nengel@2 571 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
nengel@2 572 pixelsv2 = temp2;
nengel@2 573 } else {
nengel@2 574 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
nengel@2 575 }
nengel@2 576
nengel@2 577 pixelsv1 = vec_mergeh(vczero, pixelsv1);
nengel@2 578 pixelsv2 = vec_mergeh(vczero, pixelsv2);
nengel@2 579 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
nengel@2 580 (vector unsigned short)pixelsv2);
nengel@2 581 temp3 = vec_add(pixelssum1, pixelssum2);
nengel@2 582 temp3 = vec_sra(temp3, vctwo);
nengel@2 583 pixelssum1 = vec_add(pixelssum2, vctwo);
nengel@2 584 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
nengel@2 585
nengel@2 586 if (rightside) {
nengel@2 587 blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
nengel@2 588 } else {
nengel@2 589 blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
nengel@2 590 }
nengel@2 591
nengel@2 592 blockv = vec_avg(blocktemp, blockv);
nengel@2 593 vec_st(blockv, 0, block);
nengel@2 594
nengel@2 595 block += line_size;
nengel@2 596 pixels += line_size;
nengel@2 597 }
nengel@2 598
nengel@2 599 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
nengel@2 600 }
nengel@2 601
nengel@2 602 void dsputil_init_altivec(DSPContext* c)
nengel@2 603 {
nengel@2 604 c->diff_pixels = diff_pixels_altivec;
nengel@2 605 c->get_pixels = get_pixels_altivec;
nengel@2 606 c->clear_block = clear_block_altivec;
nengel@2 607
nengel@2 608 c->put_pixels_tab[0][0] = put_pixels16_altivec;
nengel@2 609 /* the two functions do the same thing, so use the same code */
nengel@2 610 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
nengel@2 611 c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
nengel@2 612 c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
nengel@2 613 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
nengel@2 614 c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
nengel@2 615 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
nengel@2 616 c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
nengel@2 617 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
nengel@2 618
nengel@2 619 }