nengel@2: /* nengel@2: * Copyright (c) 2002 Brian Foley nengel@2: * Copyright (c) 2002 Dieter Shirley nengel@2: * Copyright (c) 2003-2004 Romain Dolbeau nengel@2: * nengel@2: * This file is part of FFmpeg. nengel@2: * nengel@2: * FFmpeg is free software; you can redistribute it and/or nengel@2: * modify it under the terms of the GNU Lesser General Public nengel@2: * License as published by the Free Software Foundation; either nengel@2: * version 2.1 of the License, or (at your option) any later version. nengel@2: * nengel@2: * FFmpeg is distributed in the hope that it will be useful, nengel@2: * but WITHOUT ANY WARRANTY; without even the implied warranty of nengel@2: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU nengel@2: * Lesser General Public License for more details. nengel@2: * nengel@2: * You should have received a copy of the GNU Lesser General Public nengel@2: * License along with FFmpeg; if not, write to the Free Software nengel@2: * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA nengel@2: */ nengel@2: nengel@2: #include "config.h" nengel@2: #if HAVE_ALTIVEC_H nengel@2: #include nengel@2: #endif nengel@2: #include "libavcodec/dsputil.h" nengel@2: #include "dsputil_ppc.h" nengel@2: #include "util_altivec.h" nengel@2: #include "types_altivec.h" nengel@2: #include "dsputil_altivec.h" nengel@2: nengel@2: nengel@2: static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size) nengel@2: { nengel@2: int i; nengel@2: vector unsigned char perm, bytes, *pixv; nengel@2: const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); nengel@2: vector signed short shorts; nengel@2: nengel@2: for (i = 0; i < 8; i++) { nengel@2: // Read potentially unaligned pixels. nengel@2: // We're reading 16 pixels, and actually only want 8, nengel@2: // but we simply ignore the extras. nengel@2: perm = vec_lvsl(0, pixels); nengel@2: pixv = (vector unsigned char *) pixels; nengel@2: bytes = vec_perm(pixv[0], pixv[1], perm); nengel@2: nengel@2: // convert the bytes into shorts nengel@2: shorts = (vector signed short)vec_mergeh(zero, bytes); nengel@2: nengel@2: // save the data to the block, we assume the block is 16-byte aligned nengel@2: vec_st(shorts, i*16, (vector signed short*)block); nengel@2: nengel@2: pixels += line_size; nengel@2: } nengel@2: } nengel@2: nengel@2: static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, nengel@2: const uint8_t *s2, int stride) nengel@2: { nengel@2: int i; nengel@2: vector unsigned char perm, bytes, *pixv; nengel@2: const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); nengel@2: vector signed short shorts1, shorts2; nengel@2: nengel@2: for (i = 0; i < 4; i++) { nengel@2: // Read potentially unaligned pixels nengel@2: // We're reading 16 pixels, and actually only want 8, nengel@2: // but we simply ignore the extras. nengel@2: perm = vec_lvsl(0, s1); nengel@2: pixv = (vector unsigned char *) s1; nengel@2: bytes = vec_perm(pixv[0], pixv[1], perm); nengel@2: nengel@2: // convert the bytes into shorts nengel@2: shorts1 = (vector signed short)vec_mergeh(zero, bytes); nengel@2: nengel@2: // Do the same for the second block of pixels nengel@2: perm = vec_lvsl(0, s2); nengel@2: pixv = (vector unsigned char *) s2; nengel@2: bytes = vec_perm(pixv[0], pixv[1], perm); nengel@2: nengel@2: // convert the bytes into shorts nengel@2: shorts2 = (vector signed short)vec_mergeh(zero, bytes); nengel@2: nengel@2: // Do the subtraction nengel@2: shorts1 = vec_sub(shorts1, shorts2); nengel@2: nengel@2: // save the data to the block, we assume the block is 16-byte aligned nengel@2: vec_st(shorts1, 0, (vector signed short*)block); nengel@2: nengel@2: s1 += stride; nengel@2: s2 += stride; nengel@2: block += 8; nengel@2: nengel@2: nengel@2: // The code below is a copy of the code above... This is a manual nengel@2: // unroll. nengel@2: nengel@2: // Read potentially unaligned pixels nengel@2: // We're reading 16 pixels, and actually only want 8, nengel@2: // but we simply ignore the extras. nengel@2: perm = vec_lvsl(0, s1); nengel@2: pixv = (vector unsigned char *) s1; nengel@2: bytes = vec_perm(pixv[0], pixv[1], perm); nengel@2: nengel@2: // convert the bytes into shorts nengel@2: shorts1 = (vector signed short)vec_mergeh(zero, bytes); nengel@2: nengel@2: // Do the same for the second block of pixels nengel@2: perm = vec_lvsl(0, s2); nengel@2: pixv = (vector unsigned char *) s2; nengel@2: bytes = vec_perm(pixv[0], pixv[1], perm); nengel@2: nengel@2: // convert the bytes into shorts nengel@2: shorts2 = (vector signed short)vec_mergeh(zero, bytes); nengel@2: nengel@2: // Do the subtraction nengel@2: shorts1 = vec_sub(shorts1, shorts2); nengel@2: nengel@2: // save the data to the block, we assume the block is 16-byte aligned nengel@2: vec_st(shorts1, 0, (vector signed short*)block); nengel@2: nengel@2: s1 += stride; nengel@2: s2 += stride; nengel@2: block += 8; nengel@2: } nengel@2: } nengel@2: nengel@2: nengel@2: static void clear_block_altivec(DCTELEM *block) { nengel@2: LOAD_ZERO; nengel@2: vec_st(zero_s16v, 0, block); nengel@2: vec_st(zero_s16v, 16, block); nengel@2: vec_st(zero_s16v, 32, block); nengel@2: vec_st(zero_s16v, 48, block); nengel@2: vec_st(zero_s16v, 64, block); nengel@2: vec_st(zero_s16v, 80, block); nengel@2: vec_st(zero_s16v, 96, block); nengel@2: vec_st(zero_s16v, 112, block); nengel@2: } nengel@2: nengel@2: nengel@2: nengel@2: /* next one assumes that ((line_size % 16) == 0) */ nengel@2: void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) nengel@2: { nengel@2: POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1); nengel@2: register vector unsigned char pixelsv1, pixelsv2; nengel@2: register vector unsigned char pixelsv1B, pixelsv2B; nengel@2: register vector unsigned char pixelsv1C, pixelsv2C; nengel@2: register vector unsigned char pixelsv1D, pixelsv2D; nengel@2: nengel@2: register vector unsigned char perm = vec_lvsl(0, pixels); nengel@2: int i; nengel@2: register int line_size_2 = line_size << 1; nengel@2: register int line_size_3 = line_size + line_size_2; nengel@2: register int line_size_4 = line_size << 2; nengel@2: nengel@2: POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); nengel@2: // hand-unrolling the loop by 4 gains about 15% nengel@2: // mininum execution time goes from 74 to 60 cycles nengel@2: // it's faster than -funroll-loops, but using nengel@2: // -funroll-loops w/ this is bad - 74 cycles again. nengel@2: // all this is on a 7450, tuning for the 7450 nengel@2: #if 0 nengel@2: for (i = 0; i < h; i++) { nengel@2: pixelsv1 = vec_ld(0, pixels); nengel@2: pixelsv2 = vec_ld(16, pixels); nengel@2: vec_st(vec_perm(pixelsv1, pixelsv2, perm), nengel@2: 0, block); nengel@2: pixels+=line_size; nengel@2: block +=line_size; nengel@2: } nengel@2: #else nengel@2: for (i = 0; i < h; i += 4) { nengel@2: pixelsv1 = vec_ld( 0, pixels); nengel@2: pixelsv2 = vec_ld(15, pixels); nengel@2: pixelsv1B = vec_ld(line_size, pixels); nengel@2: pixelsv2B = vec_ld(15 + line_size, pixels); nengel@2: pixelsv1C = vec_ld(line_size_2, pixels); nengel@2: pixelsv2C = vec_ld(15 + line_size_2, pixels); nengel@2: pixelsv1D = vec_ld(line_size_3, pixels); nengel@2: pixelsv2D = vec_ld(15 + line_size_3, pixels); nengel@2: vec_st(vec_perm(pixelsv1, pixelsv2, perm), nengel@2: 0, (unsigned char*)block); nengel@2: vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), nengel@2: line_size, (unsigned char*)block); nengel@2: vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), nengel@2: line_size_2, (unsigned char*)block); nengel@2: vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), nengel@2: line_size_3, (unsigned char*)block); nengel@2: pixels+=line_size_4; nengel@2: block +=line_size_4; nengel@2: } nengel@2: #endif nengel@2: POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1); nengel@2: } nengel@2: nengel@2: /* next one assumes that ((line_size % 16) == 0) */ nengel@2: #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) nengel@2: void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) nengel@2: { nengel@2: POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1); nengel@2: register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; nengel@2: register vector unsigned char perm = vec_lvsl(0, pixels); nengel@2: int i; nengel@2: nengel@2: POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); nengel@2: nengel@2: for (i = 0; i < h; i++) { nengel@2: pixelsv1 = vec_ld( 0, pixels); nengel@2: pixelsv2 = vec_ld(16,pixels); nengel@2: blockv = vec_ld(0, block); nengel@2: pixelsv = vec_perm(pixelsv1, pixelsv2, perm); nengel@2: blockv = vec_avg(blockv,pixelsv); nengel@2: vec_st(blockv, 0, (unsigned char*)block); nengel@2: pixels+=line_size; nengel@2: block +=line_size; nengel@2: } nengel@2: nengel@2: POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1); nengel@2: } nengel@2: nengel@2: /* next one assumes that ((line_size % 8) == 0) */ nengel@2: static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) nengel@2: { nengel@2: POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1); nengel@2: register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; nengel@2: int i; nengel@2: nengel@2: POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); nengel@2: nengel@2: for (i = 0; i < h; i++) { nengel@2: /* block is 8 bytes-aligned, so we're either in the nengel@2: left block (16 bytes-aligned) or in the right block (not) */ nengel@2: int rightside = ((unsigned long)block & 0x0000000F); nengel@2: nengel@2: blockv = vec_ld(0, block); nengel@2: pixelsv1 = vec_ld( 0, pixels); nengel@2: pixelsv2 = vec_ld(16, pixels); nengel@2: pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); nengel@2: nengel@2: if (rightside) { nengel@2: pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); nengel@2: } else { nengel@2: pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); nengel@2: } nengel@2: nengel@2: blockv = vec_avg(blockv, pixelsv); nengel@2: nengel@2: vec_st(blockv, 0, block); nengel@2: nengel@2: pixels += line_size; nengel@2: block += line_size; nengel@2: } nengel@2: nengel@2: POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); nengel@2: } nengel@2: nengel@2: /* next one assumes that ((line_size % 8) == 0) */ nengel@2: static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) nengel@2: { nengel@2: POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1); nengel@2: register int i; nengel@2: register vector unsigned char pixelsv1, pixelsv2, pixelsavg; nengel@2: register vector unsigned char blockv, temp1, temp2; nengel@2: register vector unsigned short pixelssum1, pixelssum2, temp3; nengel@2: register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); nengel@2: register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); nengel@2: nengel@2: temp1 = vec_ld(0, pixels); nengel@2: temp2 = vec_ld(16, pixels); nengel@2: pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); nengel@2: if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { nengel@2: pixelsv2 = temp2; nengel@2: } else { nengel@2: pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); nengel@2: } nengel@2: pixelsv1 = vec_mergeh(vczero, pixelsv1); nengel@2: pixelsv2 = vec_mergeh(vczero, pixelsv2); nengel@2: pixelssum1 = vec_add((vector unsigned short)pixelsv1, nengel@2: (vector unsigned short)pixelsv2); nengel@2: pixelssum1 = vec_add(pixelssum1, vctwo); nengel@2: nengel@2: POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); nengel@2: for (i = 0; i < h ; i++) { nengel@2: int rightside = ((unsigned long)block & 0x0000000F); nengel@2: blockv = vec_ld(0, block); nengel@2: nengel@2: temp1 = vec_ld(line_size, pixels); nengel@2: temp2 = vec_ld(line_size + 16, pixels); nengel@2: pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); nengel@2: if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { nengel@2: pixelsv2 = temp2; nengel@2: } else { nengel@2: pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); nengel@2: } nengel@2: nengel@2: pixelsv1 = vec_mergeh(vczero, pixelsv1); nengel@2: pixelsv2 = vec_mergeh(vczero, pixelsv2); nengel@2: pixelssum2 = vec_add((vector unsigned short)pixelsv1, nengel@2: (vector unsigned short)pixelsv2); nengel@2: temp3 = vec_add(pixelssum1, pixelssum2); nengel@2: temp3 = vec_sra(temp3, vctwo); nengel@2: pixelssum1 = vec_add(pixelssum2, vctwo); nengel@2: pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); nengel@2: nengel@2: if (rightside) { nengel@2: blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); nengel@2: } else { nengel@2: blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); nengel@2: } nengel@2: nengel@2: vec_st(blockv, 0, block); nengel@2: nengel@2: block += line_size; nengel@2: pixels += line_size; nengel@2: } nengel@2: nengel@2: POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); nengel@2: } nengel@2: nengel@2: /* next one assumes that ((line_size % 8) == 0) */ nengel@2: static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) nengel@2: { nengel@2: POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); nengel@2: register int i; nengel@2: register vector unsigned char pixelsv1, pixelsv2, pixelsavg; nengel@2: register vector unsigned char blockv, temp1, temp2; nengel@2: register vector unsigned short pixelssum1, pixelssum2, temp3; nengel@2: register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); nengel@2: register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); nengel@2: register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); nengel@2: nengel@2: temp1 = vec_ld(0, pixels); nengel@2: temp2 = vec_ld(16, pixels); nengel@2: pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); nengel@2: if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { nengel@2: pixelsv2 = temp2; nengel@2: } else { nengel@2: pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); nengel@2: } nengel@2: pixelsv1 = vec_mergeh(vczero, pixelsv1); nengel@2: pixelsv2 = vec_mergeh(vczero, pixelsv2); nengel@2: pixelssum1 = vec_add((vector unsigned short)pixelsv1, nengel@2: (vector unsigned short)pixelsv2); nengel@2: pixelssum1 = vec_add(pixelssum1, vcone); nengel@2: nengel@2: POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); nengel@2: for (i = 0; i < h ; i++) { nengel@2: int rightside = ((unsigned long)block & 0x0000000F); nengel@2: blockv = vec_ld(0, block); nengel@2: nengel@2: temp1 = vec_ld(line_size, pixels); nengel@2: temp2 = vec_ld(line_size + 16, pixels); nengel@2: pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); nengel@2: if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { nengel@2: pixelsv2 = temp2; nengel@2: } else { nengel@2: pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); nengel@2: } nengel@2: nengel@2: pixelsv1 = vec_mergeh(vczero, pixelsv1); nengel@2: pixelsv2 = vec_mergeh(vczero, pixelsv2); nengel@2: pixelssum2 = vec_add((vector unsigned short)pixelsv1, nengel@2: (vector unsigned short)pixelsv2); nengel@2: temp3 = vec_add(pixelssum1, pixelssum2); nengel@2: temp3 = vec_sra(temp3, vctwo); nengel@2: pixelssum1 = vec_add(pixelssum2, vcone); nengel@2: pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); nengel@2: nengel@2: if (rightside) { nengel@2: blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); nengel@2: } else { nengel@2: blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); nengel@2: } nengel@2: nengel@2: vec_st(blockv, 0, block); nengel@2: nengel@2: block += line_size; nengel@2: pixels += line_size; nengel@2: } nengel@2: nengel@2: POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); nengel@2: } nengel@2: nengel@2: /* next one assumes that ((line_size % 16) == 0) */ nengel@2: static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) nengel@2: { nengel@2: POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); nengel@2: register int i; nengel@2: register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; nengel@2: register vector unsigned char blockv, temp1, temp2; nengel@2: register vector unsigned short temp3, temp4, nengel@2: pixelssum1, pixelssum2, pixelssum3, pixelssum4; nengel@2: register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); nengel@2: register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); nengel@2: nengel@2: POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); nengel@2: nengel@2: temp1 = vec_ld(0, pixels); nengel@2: temp2 = vec_ld(16, pixels); nengel@2: pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); nengel@2: if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { nengel@2: pixelsv2 = temp2; nengel@2: } else { nengel@2: pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); nengel@2: } nengel@2: pixelsv3 = vec_mergel(vczero, pixelsv1); nengel@2: pixelsv4 = vec_mergel(vczero, pixelsv2); nengel@2: pixelsv1 = vec_mergeh(vczero, pixelsv1); nengel@2: pixelsv2 = vec_mergeh(vczero, pixelsv2); nengel@2: pixelssum3 = vec_add((vector unsigned short)pixelsv3, nengel@2: (vector unsigned short)pixelsv4); nengel@2: pixelssum3 = vec_add(pixelssum3, vctwo); nengel@2: pixelssum1 = vec_add((vector unsigned short)pixelsv1, nengel@2: (vector unsigned short)pixelsv2); nengel@2: pixelssum1 = vec_add(pixelssum1, vctwo); nengel@2: nengel@2: for (i = 0; i < h ; i++) { nengel@2: blockv = vec_ld(0, block); nengel@2: nengel@2: temp1 = vec_ld(line_size, pixels); nengel@2: temp2 = vec_ld(line_size + 16, pixels); nengel@2: pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); nengel@2: if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { nengel@2: pixelsv2 = temp2; nengel@2: } else { nengel@2: pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); nengel@2: } nengel@2: nengel@2: pixelsv3 = vec_mergel(vczero, pixelsv1); nengel@2: pixelsv4 = vec_mergel(vczero, pixelsv2); nengel@2: pixelsv1 = vec_mergeh(vczero, pixelsv1); nengel@2: pixelsv2 = vec_mergeh(vczero, pixelsv2); nengel@2: nengel@2: pixelssum4 = vec_add((vector unsigned short)pixelsv3, nengel@2: (vector unsigned short)pixelsv4); nengel@2: pixelssum2 = vec_add((vector unsigned short)pixelsv1, nengel@2: (vector unsigned short)pixelsv2); nengel@2: temp4 = vec_add(pixelssum3, pixelssum4); nengel@2: temp4 = vec_sra(temp4, vctwo); nengel@2: temp3 = vec_add(pixelssum1, pixelssum2); nengel@2: temp3 = vec_sra(temp3, vctwo); nengel@2: nengel@2: pixelssum3 = vec_add(pixelssum4, vctwo); nengel@2: pixelssum1 = vec_add(pixelssum2, vctwo); nengel@2: nengel@2: blockv = vec_packsu(temp3, temp4); nengel@2: nengel@2: vec_st(blockv, 0, block); nengel@2: nengel@2: block += line_size; nengel@2: pixels += line_size; nengel@2: } nengel@2: nengel@2: POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); nengel@2: } nengel@2: nengel@2: /* next one assumes that ((line_size % 16) == 0) */ nengel@2: static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) nengel@2: { nengel@2: POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); nengel@2: register int i; nengel@2: register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; nengel@2: register vector unsigned char blockv, temp1, temp2; nengel@2: register vector unsigned short temp3, temp4, nengel@2: pixelssum1, pixelssum2, pixelssum3, pixelssum4; nengel@2: register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); nengel@2: register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); nengel@2: register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); nengel@2: nengel@2: POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); nengel@2: nengel@2: temp1 = vec_ld(0, pixels); nengel@2: temp2 = vec_ld(16, pixels); nengel@2: pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); nengel@2: if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { nengel@2: pixelsv2 = temp2; nengel@2: } else { nengel@2: pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); nengel@2: } nengel@2: pixelsv3 = vec_mergel(vczero, pixelsv1); nengel@2: pixelsv4 = vec_mergel(vczero, pixelsv2); nengel@2: pixelsv1 = vec_mergeh(vczero, pixelsv1); nengel@2: pixelsv2 = vec_mergeh(vczero, pixelsv2); nengel@2: pixelssum3 = vec_add((vector unsigned short)pixelsv3, nengel@2: (vector unsigned short)pixelsv4); nengel@2: pixelssum3 = vec_add(pixelssum3, vcone); nengel@2: pixelssum1 = vec_add((vector unsigned short)pixelsv1, nengel@2: (vector unsigned short)pixelsv2); nengel@2: pixelssum1 = vec_add(pixelssum1, vcone); nengel@2: nengel@2: for (i = 0; i < h ; i++) { nengel@2: blockv = vec_ld(0, block); nengel@2: nengel@2: temp1 = vec_ld(line_size, pixels); nengel@2: temp2 = vec_ld(line_size + 16, pixels); nengel@2: pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); nengel@2: if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { nengel@2: pixelsv2 = temp2; nengel@2: } else { nengel@2: pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); nengel@2: } nengel@2: nengel@2: pixelsv3 = vec_mergel(vczero, pixelsv1); nengel@2: pixelsv4 = vec_mergel(vczero, pixelsv2); nengel@2: pixelsv1 = vec_mergeh(vczero, pixelsv1); nengel@2: pixelsv2 = vec_mergeh(vczero, pixelsv2); nengel@2: nengel@2: pixelssum4 = vec_add((vector unsigned short)pixelsv3, nengel@2: (vector unsigned short)pixelsv4); nengel@2: pixelssum2 = vec_add((vector unsigned short)pixelsv1, nengel@2: (vector unsigned short)pixelsv2); nengel@2: temp4 = vec_add(pixelssum3, pixelssum4); nengel@2: temp4 = vec_sra(temp4, vctwo); nengel@2: temp3 = vec_add(pixelssum1, pixelssum2); nengel@2: temp3 = vec_sra(temp3, vctwo); nengel@2: nengel@2: pixelssum3 = vec_add(pixelssum4, vcone); nengel@2: pixelssum1 = vec_add(pixelssum2, vcone); nengel@2: nengel@2: blockv = vec_packsu(temp3, temp4); nengel@2: nengel@2: vec_st(blockv, 0, block); nengel@2: nengel@2: block += line_size; nengel@2: pixels += line_size; nengel@2: } nengel@2: nengel@2: POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); nengel@2: } nengel@2: nengel@2: /* next one assumes that ((line_size % 8) == 0) */ nengel@2: static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) nengel@2: { nengel@2: POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1); nengel@2: register int i; nengel@2: register vector unsigned char pixelsv1, pixelsv2, pixelsavg; nengel@2: register vector unsigned char blockv, temp1, temp2, blocktemp; nengel@2: register vector unsigned short pixelssum1, pixelssum2, temp3; nengel@2: nengel@2: register const vector unsigned char vczero = (const vector unsigned char) nengel@2: vec_splat_u8(0); nengel@2: register const vector unsigned short vctwo = (const vector unsigned short) nengel@2: vec_splat_u16(2); nengel@2: nengel@2: temp1 = vec_ld(0, pixels); nengel@2: temp2 = vec_ld(16, pixels); nengel@2: pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); nengel@2: if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { nengel@2: pixelsv2 = temp2; nengel@2: } else { nengel@2: pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); nengel@2: } nengel@2: pixelsv1 = vec_mergeh(vczero, pixelsv1); nengel@2: pixelsv2 = vec_mergeh(vczero, pixelsv2); nengel@2: pixelssum1 = vec_add((vector unsigned short)pixelsv1, nengel@2: (vector unsigned short)pixelsv2); nengel@2: pixelssum1 = vec_add(pixelssum1, vctwo); nengel@2: nengel@2: POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1); nengel@2: for (i = 0; i < h ; i++) { nengel@2: int rightside = ((unsigned long)block & 0x0000000F); nengel@2: blockv = vec_ld(0, block); nengel@2: nengel@2: temp1 = vec_ld(line_size, pixels); nengel@2: temp2 = vec_ld(line_size + 16, pixels); nengel@2: pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); nengel@2: if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { nengel@2: pixelsv2 = temp2; nengel@2: } else { nengel@2: pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); nengel@2: } nengel@2: nengel@2: pixelsv1 = vec_mergeh(vczero, pixelsv1); nengel@2: pixelsv2 = vec_mergeh(vczero, pixelsv2); nengel@2: pixelssum2 = vec_add((vector unsigned short)pixelsv1, nengel@2: (vector unsigned short)pixelsv2); nengel@2: temp3 = vec_add(pixelssum1, pixelssum2); nengel@2: temp3 = vec_sra(temp3, vctwo); nengel@2: pixelssum1 = vec_add(pixelssum2, vctwo); nengel@2: pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); nengel@2: nengel@2: if (rightside) { nengel@2: blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); nengel@2: } else { nengel@2: blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); nengel@2: } nengel@2: nengel@2: blockv = vec_avg(blocktemp, blockv); nengel@2: vec_st(blockv, 0, block); nengel@2: nengel@2: block += line_size; nengel@2: pixels += line_size; nengel@2: } nengel@2: nengel@2: POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1); nengel@2: } nengel@2: nengel@2: void dsputil_init_altivec(DSPContext* c) nengel@2: { nengel@2: c->diff_pixels = diff_pixels_altivec; nengel@2: c->get_pixels = get_pixels_altivec; nengel@2: c->clear_block = clear_block_altivec; nengel@2: nengel@2: c->put_pixels_tab[0][0] = put_pixels16_altivec; nengel@2: /* the two functions do the same thing, so use the same code */ nengel@2: c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec; nengel@2: c->avg_pixels_tab[0][0] = avg_pixels16_altivec; nengel@2: c->avg_pixels_tab[1][0] = avg_pixels8_altivec; nengel@2: c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec; nengel@2: c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec; nengel@2: c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec; nengel@2: c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec; nengel@2: c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec; nengel@2: nengel@2: }