| rev |
line source |
|
nengel@2
|
1 /*
|
|
nengel@2
|
2 * Copyright (c) 2002 Brian Foley
|
|
nengel@2
|
3 * Copyright (c) 2002 Dieter Shirley
|
|
nengel@2
|
4 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
|
|
nengel@2
|
5 *
|
|
nengel@2
|
6 * This file is part of FFmpeg.
|
|
nengel@2
|
7 *
|
|
nengel@2
|
8 * FFmpeg is free software; you can redistribute it and/or
|
|
nengel@2
|
9 * modify it under the terms of the GNU Lesser General Public
|
|
nengel@2
|
10 * License as published by the Free Software Foundation; either
|
|
nengel@2
|
11 * version 2.1 of the License, or (at your option) any later version.
|
|
nengel@2
|
12 *
|
|
nengel@2
|
13 * FFmpeg is distributed in the hope that it will be useful,
|
|
nengel@2
|
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
nengel@2
|
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
nengel@2
|
16 * Lesser General Public License for more details.
|
|
nengel@2
|
17 *
|
|
nengel@2
|
18 * You should have received a copy of the GNU Lesser General Public
|
|
nengel@2
|
19 * License along with FFmpeg; if not, write to the Free Software
|
|
nengel@2
|
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
nengel@2
|
21 */
|
|
nengel@2
|
22
|
|
nengel@2
|
23 #include "config.h"
|
|
nengel@2
|
24 #if HAVE_ALTIVEC_H
|
|
nengel@2
|
25 #include <altivec.h>
|
|
nengel@2
|
26 #endif
|
|
nengel@2
|
27 #include "libavcodec/dsputil.h"
|
|
nengel@2
|
28 #include "dsputil_ppc.h"
|
|
nengel@2
|
29 #include "util_altivec.h"
|
|
nengel@2
|
30 #include "types_altivec.h"
|
|
nengel@2
|
31 #include "dsputil_altivec.h"
|
|
nengel@2
|
32
|
|
nengel@2
|
33
|
|
nengel@2
|
34 static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
|
|
nengel@2
|
35 {
|
|
nengel@2
|
36 int i;
|
|
nengel@2
|
37 vector unsigned char perm, bytes, *pixv;
|
|
nengel@2
|
38 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
|
|
nengel@2
|
39 vector signed short shorts;
|
|
nengel@2
|
40
|
|
nengel@2
|
41 for (i = 0; i < 8; i++) {
|
|
nengel@2
|
42 // Read potentially unaligned pixels.
|
|
nengel@2
|
43 // We're reading 16 pixels, and actually only want 8,
|
|
nengel@2
|
44 // but we simply ignore the extras.
|
|
nengel@2
|
45 perm = vec_lvsl(0, pixels);
|
|
nengel@2
|
46 pixv = (vector unsigned char *) pixels;
|
|
nengel@2
|
47 bytes = vec_perm(pixv[0], pixv[1], perm);
|
|
nengel@2
|
48
|
|
nengel@2
|
49 // convert the bytes into shorts
|
|
nengel@2
|
50 shorts = (vector signed short)vec_mergeh(zero, bytes);
|
|
nengel@2
|
51
|
|
nengel@2
|
52 // save the data to the block, we assume the block is 16-byte aligned
|
|
nengel@2
|
53 vec_st(shorts, i*16, (vector signed short*)block);
|
|
nengel@2
|
54
|
|
nengel@2
|
55 pixels += line_size;
|
|
nengel@2
|
56 }
|
|
nengel@2
|
57 }
|
|
nengel@2
|
58
|
|
nengel@2
|
59 static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
|
|
nengel@2
|
60 const uint8_t *s2, int stride)
|
|
nengel@2
|
61 {
|
|
nengel@2
|
62 int i;
|
|
nengel@2
|
63 vector unsigned char perm, bytes, *pixv;
|
|
nengel@2
|
64 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
|
|
nengel@2
|
65 vector signed short shorts1, shorts2;
|
|
nengel@2
|
66
|
|
nengel@2
|
67 for (i = 0; i < 4; i++) {
|
|
nengel@2
|
68 // Read potentially unaligned pixels
|
|
nengel@2
|
69 // We're reading 16 pixels, and actually only want 8,
|
|
nengel@2
|
70 // but we simply ignore the extras.
|
|
nengel@2
|
71 perm = vec_lvsl(0, s1);
|
|
nengel@2
|
72 pixv = (vector unsigned char *) s1;
|
|
nengel@2
|
73 bytes = vec_perm(pixv[0], pixv[1], perm);
|
|
nengel@2
|
74
|
|
nengel@2
|
75 // convert the bytes into shorts
|
|
nengel@2
|
76 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
|
|
nengel@2
|
77
|
|
nengel@2
|
78 // Do the same for the second block of pixels
|
|
nengel@2
|
79 perm = vec_lvsl(0, s2);
|
|
nengel@2
|
80 pixv = (vector unsigned char *) s2;
|
|
nengel@2
|
81 bytes = vec_perm(pixv[0], pixv[1], perm);
|
|
nengel@2
|
82
|
|
nengel@2
|
83 // convert the bytes into shorts
|
|
nengel@2
|
84 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
|
|
nengel@2
|
85
|
|
nengel@2
|
86 // Do the subtraction
|
|
nengel@2
|
87 shorts1 = vec_sub(shorts1, shorts2);
|
|
nengel@2
|
88
|
|
nengel@2
|
89 // save the data to the block, we assume the block is 16-byte aligned
|
|
nengel@2
|
90 vec_st(shorts1, 0, (vector signed short*)block);
|
|
nengel@2
|
91
|
|
nengel@2
|
92 s1 += stride;
|
|
nengel@2
|
93 s2 += stride;
|
|
nengel@2
|
94 block += 8;
|
|
nengel@2
|
95
|
|
nengel@2
|
96
|
|
nengel@2
|
97 // The code below is a copy of the code above... This is a manual
|
|
nengel@2
|
98 // unroll.
|
|
nengel@2
|
99
|
|
nengel@2
|
100 // Read potentially unaligned pixels
|
|
nengel@2
|
101 // We're reading 16 pixels, and actually only want 8,
|
|
nengel@2
|
102 // but we simply ignore the extras.
|
|
nengel@2
|
103 perm = vec_lvsl(0, s1);
|
|
nengel@2
|
104 pixv = (vector unsigned char *) s1;
|
|
nengel@2
|
105 bytes = vec_perm(pixv[0], pixv[1], perm);
|
|
nengel@2
|
106
|
|
nengel@2
|
107 // convert the bytes into shorts
|
|
nengel@2
|
108 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
|
|
nengel@2
|
109
|
|
nengel@2
|
110 // Do the same for the second block of pixels
|
|
nengel@2
|
111 perm = vec_lvsl(0, s2);
|
|
nengel@2
|
112 pixv = (vector unsigned char *) s2;
|
|
nengel@2
|
113 bytes = vec_perm(pixv[0], pixv[1], perm);
|
|
nengel@2
|
114
|
|
nengel@2
|
115 // convert the bytes into shorts
|
|
nengel@2
|
116 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
|
|
nengel@2
|
117
|
|
nengel@2
|
118 // Do the subtraction
|
|
nengel@2
|
119 shorts1 = vec_sub(shorts1, shorts2);
|
|
nengel@2
|
120
|
|
nengel@2
|
121 // save the data to the block, we assume the block is 16-byte aligned
|
|
nengel@2
|
122 vec_st(shorts1, 0, (vector signed short*)block);
|
|
nengel@2
|
123
|
|
nengel@2
|
124 s1 += stride;
|
|
nengel@2
|
125 s2 += stride;
|
|
nengel@2
|
126 block += 8;
|
|
nengel@2
|
127 }
|
|
nengel@2
|
128 }
|
|
nengel@2
|
129
|
|
nengel@2
|
130
|
|
nengel@2
|
131 static void clear_block_altivec(DCTELEM *block) {
|
|
nengel@2
|
132 LOAD_ZERO;
|
|
nengel@2
|
133 vec_st(zero_s16v, 0, block);
|
|
nengel@2
|
134 vec_st(zero_s16v, 16, block);
|
|
nengel@2
|
135 vec_st(zero_s16v, 32, block);
|
|
nengel@2
|
136 vec_st(zero_s16v, 48, block);
|
|
nengel@2
|
137 vec_st(zero_s16v, 64, block);
|
|
nengel@2
|
138 vec_st(zero_s16v, 80, block);
|
|
nengel@2
|
139 vec_st(zero_s16v, 96, block);
|
|
nengel@2
|
140 vec_st(zero_s16v, 112, block);
|
|
nengel@2
|
141 }
|
|
nengel@2
|
142
|
|
nengel@2
|
143
|
|
nengel@2
|
144
|
|
nengel@2
|
145 /* next one assumes that ((line_size % 16) == 0) */
|
|
nengel@2
|
146 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
nengel@2
|
147 {
|
|
nengel@2
|
148 POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
|
|
nengel@2
|
149 register vector unsigned char pixelsv1, pixelsv2;
|
|
nengel@2
|
150 register vector unsigned char pixelsv1B, pixelsv2B;
|
|
nengel@2
|
151 register vector unsigned char pixelsv1C, pixelsv2C;
|
|
nengel@2
|
152 register vector unsigned char pixelsv1D, pixelsv2D;
|
|
nengel@2
|
153
|
|
nengel@2
|
154 register vector unsigned char perm = vec_lvsl(0, pixels);
|
|
nengel@2
|
155 int i;
|
|
nengel@2
|
156 register int line_size_2 = line_size << 1;
|
|
nengel@2
|
157 register int line_size_3 = line_size + line_size_2;
|
|
nengel@2
|
158 register int line_size_4 = line_size << 2;
|
|
nengel@2
|
159
|
|
nengel@2
|
160 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
|
|
nengel@2
|
161 // hand-unrolling the loop by 4 gains about 15%
|
|
nengel@2
|
162 // mininum execution time goes from 74 to 60 cycles
|
|
nengel@2
|
163 // it's faster than -funroll-loops, but using
|
|
nengel@2
|
164 // -funroll-loops w/ this is bad - 74 cycles again.
|
|
nengel@2
|
165 // all this is on a 7450, tuning for the 7450
|
|
nengel@2
|
166 #if 0
|
|
nengel@2
|
167 for (i = 0; i < h; i++) {
|
|
nengel@2
|
168 pixelsv1 = vec_ld(0, pixels);
|
|
nengel@2
|
169 pixelsv2 = vec_ld(16, pixels);
|
|
nengel@2
|
170 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
|
|
nengel@2
|
171 0, block);
|
|
nengel@2
|
172 pixels+=line_size;
|
|
nengel@2
|
173 block +=line_size;
|
|
nengel@2
|
174 }
|
|
nengel@2
|
175 #else
|
|
nengel@2
|
176 for (i = 0; i < h; i += 4) {
|
|
nengel@2
|
177 pixelsv1 = vec_ld( 0, pixels);
|
|
nengel@2
|
178 pixelsv2 = vec_ld(15, pixels);
|
|
nengel@2
|
179 pixelsv1B = vec_ld(line_size, pixels);
|
|
nengel@2
|
180 pixelsv2B = vec_ld(15 + line_size, pixels);
|
|
nengel@2
|
181 pixelsv1C = vec_ld(line_size_2, pixels);
|
|
nengel@2
|
182 pixelsv2C = vec_ld(15 + line_size_2, pixels);
|
|
nengel@2
|
183 pixelsv1D = vec_ld(line_size_3, pixels);
|
|
nengel@2
|
184 pixelsv2D = vec_ld(15 + line_size_3, pixels);
|
|
nengel@2
|
185 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
|
|
nengel@2
|
186 0, (unsigned char*)block);
|
|
nengel@2
|
187 vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
|
|
nengel@2
|
188 line_size, (unsigned char*)block);
|
|
nengel@2
|
189 vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
|
|
nengel@2
|
190 line_size_2, (unsigned char*)block);
|
|
nengel@2
|
191 vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
|
|
nengel@2
|
192 line_size_3, (unsigned char*)block);
|
|
nengel@2
|
193 pixels+=line_size_4;
|
|
nengel@2
|
194 block +=line_size_4;
|
|
nengel@2
|
195 }
|
|
nengel@2
|
196 #endif
|
|
nengel@2
|
197 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
|
|
nengel@2
|
198 }
|
|
nengel@2
|
199
|
|
nengel@2
|
200 /* next one assumes that ((line_size % 16) == 0) */
|
|
nengel@2
|
201 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
|
|
nengel@2
|
202 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
nengel@2
|
203 {
|
|
nengel@2
|
204 POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
|
|
nengel@2
|
205 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
|
|
nengel@2
|
206 register vector unsigned char perm = vec_lvsl(0, pixels);
|
|
nengel@2
|
207 int i;
|
|
nengel@2
|
208
|
|
nengel@2
|
209 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
|
|
nengel@2
|
210
|
|
nengel@2
|
211 for (i = 0; i < h; i++) {
|
|
nengel@2
|
212 pixelsv1 = vec_ld( 0, pixels);
|
|
nengel@2
|
213 pixelsv2 = vec_ld(16,pixels);
|
|
nengel@2
|
214 blockv = vec_ld(0, block);
|
|
nengel@2
|
215 pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
|
|
nengel@2
|
216 blockv = vec_avg(blockv,pixelsv);
|
|
nengel@2
|
217 vec_st(blockv, 0, (unsigned char*)block);
|
|
nengel@2
|
218 pixels+=line_size;
|
|
nengel@2
|
219 block +=line_size;
|
|
nengel@2
|
220 }
|
|
nengel@2
|
221
|
|
nengel@2
|
222 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
|
|
nengel@2
|
223 }
|
|
nengel@2
|
224
|
|
nengel@2
|
225 /* next one assumes that ((line_size % 8) == 0) */
|
|
nengel@2
|
226 static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
|
|
nengel@2
|
227 {
|
|
nengel@2
|
228 POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
|
|
nengel@2
|
229 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
|
|
nengel@2
|
230 int i;
|
|
nengel@2
|
231
|
|
nengel@2
|
232 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
|
|
nengel@2
|
233
|
|
nengel@2
|
234 for (i = 0; i < h; i++) {
|
|
nengel@2
|
235 /* block is 8 bytes-aligned, so we're either in the
|
|
nengel@2
|
236 left block (16 bytes-aligned) or in the right block (not) */
|
|
nengel@2
|
237 int rightside = ((unsigned long)block & 0x0000000F);
|
|
nengel@2
|
238
|
|
nengel@2
|
239 blockv = vec_ld(0, block);
|
|
nengel@2
|
240 pixelsv1 = vec_ld( 0, pixels);
|
|
nengel@2
|
241 pixelsv2 = vec_ld(16, pixels);
|
|
nengel@2
|
242 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
|
|
nengel@2
|
243
|
|
nengel@2
|
244 if (rightside) {
|
|
nengel@2
|
245 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
|
|
nengel@2
|
246 } else {
|
|
nengel@2
|
247 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
|
|
nengel@2
|
248 }
|
|
nengel@2
|
249
|
|
nengel@2
|
250 blockv = vec_avg(blockv, pixelsv);
|
|
nengel@2
|
251
|
|
nengel@2
|
252 vec_st(blockv, 0, block);
|
|
nengel@2
|
253
|
|
nengel@2
|
254 pixels += line_size;
|
|
nengel@2
|
255 block += line_size;
|
|
nengel@2
|
256 }
|
|
nengel@2
|
257
|
|
nengel@2
|
258 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
|
|
nengel@2
|
259 }
|
|
nengel@2
|
260
|
|
nengel@2
|
261 /* next one assumes that ((line_size % 8) == 0) */
|
|
nengel@2
|
262 static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
nengel@2
|
263 {
|
|
nengel@2
|
264 POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
|
|
nengel@2
|
265 register int i;
|
|
nengel@2
|
266 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
|
|
nengel@2
|
267 register vector unsigned char blockv, temp1, temp2;
|
|
nengel@2
|
268 register vector unsigned short pixelssum1, pixelssum2, temp3;
|
|
nengel@2
|
269 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
|
|
nengel@2
|
270 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
|
|
nengel@2
|
271
|
|
nengel@2
|
272 temp1 = vec_ld(0, pixels);
|
|
nengel@2
|
273 temp2 = vec_ld(16, pixels);
|
|
nengel@2
|
274 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
|
|
nengel@2
|
275 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
|
|
nengel@2
|
276 pixelsv2 = temp2;
|
|
nengel@2
|
277 } else {
|
|
nengel@2
|
278 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
|
|
nengel@2
|
279 }
|
|
nengel@2
|
280 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
nengel@2
|
281 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
nengel@2
|
282 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
|
|
nengel@2
|
283 (vector unsigned short)pixelsv2);
|
|
nengel@2
|
284 pixelssum1 = vec_add(pixelssum1, vctwo);
|
|
nengel@2
|
285
|
|
nengel@2
|
286 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
|
|
nengel@2
|
287 for (i = 0; i < h ; i++) {
|
|
nengel@2
|
288 int rightside = ((unsigned long)block & 0x0000000F);
|
|
nengel@2
|
289 blockv = vec_ld(0, block);
|
|
nengel@2
|
290
|
|
nengel@2
|
291 temp1 = vec_ld(line_size, pixels);
|
|
nengel@2
|
292 temp2 = vec_ld(line_size + 16, pixels);
|
|
nengel@2
|
293 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
|
|
nengel@2
|
294 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
|
|
nengel@2
|
295 pixelsv2 = temp2;
|
|
nengel@2
|
296 } else {
|
|
nengel@2
|
297 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
|
|
nengel@2
|
298 }
|
|
nengel@2
|
299
|
|
nengel@2
|
300 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
nengel@2
|
301 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
nengel@2
|
302 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
|
|
nengel@2
|
303 (vector unsigned short)pixelsv2);
|
|
nengel@2
|
304 temp3 = vec_add(pixelssum1, pixelssum2);
|
|
nengel@2
|
305 temp3 = vec_sra(temp3, vctwo);
|
|
nengel@2
|
306 pixelssum1 = vec_add(pixelssum2, vctwo);
|
|
nengel@2
|
307 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
|
|
nengel@2
|
308
|
|
nengel@2
|
309 if (rightside) {
|
|
nengel@2
|
310 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
|
|
nengel@2
|
311 } else {
|
|
nengel@2
|
312 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
|
|
nengel@2
|
313 }
|
|
nengel@2
|
314
|
|
nengel@2
|
315 vec_st(blockv, 0, block);
|
|
nengel@2
|
316
|
|
nengel@2
|
317 block += line_size;
|
|
nengel@2
|
318 pixels += line_size;
|
|
nengel@2
|
319 }
|
|
nengel@2
|
320
|
|
nengel@2
|
321 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
|
|
nengel@2
|
322 }
|
|
nengel@2
|
323
|
|
nengel@2
|
324 /* next one assumes that ((line_size % 8) == 0) */
|
|
nengel@2
|
325 static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
nengel@2
|
326 {
|
|
nengel@2
|
327 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
|
|
nengel@2
|
328 register int i;
|
|
nengel@2
|
329 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
|
|
nengel@2
|
330 register vector unsigned char blockv, temp1, temp2;
|
|
nengel@2
|
331 register vector unsigned short pixelssum1, pixelssum2, temp3;
|
|
nengel@2
|
332 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
|
|
nengel@2
|
333 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
|
|
nengel@2
|
334 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
|
|
nengel@2
|
335
|
|
nengel@2
|
336 temp1 = vec_ld(0, pixels);
|
|
nengel@2
|
337 temp2 = vec_ld(16, pixels);
|
|
nengel@2
|
338 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
|
|
nengel@2
|
339 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
|
|
nengel@2
|
340 pixelsv2 = temp2;
|
|
nengel@2
|
341 } else {
|
|
nengel@2
|
342 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
|
|
nengel@2
|
343 }
|
|
nengel@2
|
344 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
nengel@2
|
345 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
nengel@2
|
346 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
|
|
nengel@2
|
347 (vector unsigned short)pixelsv2);
|
|
nengel@2
|
348 pixelssum1 = vec_add(pixelssum1, vcone);
|
|
nengel@2
|
349
|
|
nengel@2
|
350 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
|
|
nengel@2
|
351 for (i = 0; i < h ; i++) {
|
|
nengel@2
|
352 int rightside = ((unsigned long)block & 0x0000000F);
|
|
nengel@2
|
353 blockv = vec_ld(0, block);
|
|
nengel@2
|
354
|
|
nengel@2
|
355 temp1 = vec_ld(line_size, pixels);
|
|
nengel@2
|
356 temp2 = vec_ld(line_size + 16, pixels);
|
|
nengel@2
|
357 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
|
|
nengel@2
|
358 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
|
|
nengel@2
|
359 pixelsv2 = temp2;
|
|
nengel@2
|
360 } else {
|
|
nengel@2
|
361 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
|
|
nengel@2
|
362 }
|
|
nengel@2
|
363
|
|
nengel@2
|
364 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
nengel@2
|
365 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
nengel@2
|
366 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
|
|
nengel@2
|
367 (vector unsigned short)pixelsv2);
|
|
nengel@2
|
368 temp3 = vec_add(pixelssum1, pixelssum2);
|
|
nengel@2
|
369 temp3 = vec_sra(temp3, vctwo);
|
|
nengel@2
|
370 pixelssum1 = vec_add(pixelssum2, vcone);
|
|
nengel@2
|
371 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
|
|
nengel@2
|
372
|
|
nengel@2
|
373 if (rightside) {
|
|
nengel@2
|
374 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
|
|
nengel@2
|
375 } else {
|
|
nengel@2
|
376 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
|
|
nengel@2
|
377 }
|
|
nengel@2
|
378
|
|
nengel@2
|
379 vec_st(blockv, 0, block);
|
|
nengel@2
|
380
|
|
nengel@2
|
381 block += line_size;
|
|
nengel@2
|
382 pixels += line_size;
|
|
nengel@2
|
383 }
|
|
nengel@2
|
384
|
|
nengel@2
|
385 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
|
|
nengel@2
|
386 }
|
|
nengel@2
|
387
|
|
nengel@2
|
388 /* next one assumes that ((line_size % 16) == 0) */
|
|
nengel@2
|
389 static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
|
|
nengel@2
|
390 {
|
|
nengel@2
|
391 POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
|
|
nengel@2
|
392 register int i;
|
|
nengel@2
|
393 register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
|
|
nengel@2
|
394 register vector unsigned char blockv, temp1, temp2;
|
|
nengel@2
|
395 register vector unsigned short temp3, temp4,
|
|
nengel@2
|
396 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
|
|
nengel@2
|
397 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
|
|
nengel@2
|
398 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
|
|
nengel@2
|
399
|
|
nengel@2
|
400 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
|
|
nengel@2
|
401
|
|
nengel@2
|
402 temp1 = vec_ld(0, pixels);
|
|
nengel@2
|
403 temp2 = vec_ld(16, pixels);
|
|
nengel@2
|
404 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
|
|
nengel@2
|
405 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
|
|
nengel@2
|
406 pixelsv2 = temp2;
|
|
nengel@2
|
407 } else {
|
|
nengel@2
|
408 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
|
|
nengel@2
|
409 }
|
|
nengel@2
|
410 pixelsv3 = vec_mergel(vczero, pixelsv1);
|
|
nengel@2
|
411 pixelsv4 = vec_mergel(vczero, pixelsv2);
|
|
nengel@2
|
412 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
nengel@2
|
413 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
nengel@2
|
414 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
|
|
nengel@2
|
415 (vector unsigned short)pixelsv4);
|
|
nengel@2
|
416 pixelssum3 = vec_add(pixelssum3, vctwo);
|
|
nengel@2
|
417 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
|
|
nengel@2
|
418 (vector unsigned short)pixelsv2);
|
|
nengel@2
|
419 pixelssum1 = vec_add(pixelssum1, vctwo);
|
|
nengel@2
|
420
|
|
nengel@2
|
421 for (i = 0; i < h ; i++) {
|
|
nengel@2
|
422 blockv = vec_ld(0, block);
|
|
nengel@2
|
423
|
|
nengel@2
|
424 temp1 = vec_ld(line_size, pixels);
|
|
nengel@2
|
425 temp2 = vec_ld(line_size + 16, pixels);
|
|
nengel@2
|
426 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
|
|
nengel@2
|
427 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
|
|
nengel@2
|
428 pixelsv2 = temp2;
|
|
nengel@2
|
429 } else {
|
|
nengel@2
|
430 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
|
|
nengel@2
|
431 }
|
|
nengel@2
|
432
|
|
nengel@2
|
433 pixelsv3 = vec_mergel(vczero, pixelsv1);
|
|
nengel@2
|
434 pixelsv4 = vec_mergel(vczero, pixelsv2);
|
|
nengel@2
|
435 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
nengel@2
|
436 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
nengel@2
|
437
|
|
nengel@2
|
438 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
|
|
nengel@2
|
439 (vector unsigned short)pixelsv4);
|
|
nengel@2
|
440 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
|
|
nengel@2
|
441 (vector unsigned short)pixelsv2);
|
|
nengel@2
|
442 temp4 = vec_add(pixelssum3, pixelssum4);
|
|
nengel@2
|
443 temp4 = vec_sra(temp4, vctwo);
|
|
nengel@2
|
444 temp3 = vec_add(pixelssum1, pixelssum2);
|
|
nengel@2
|
445 temp3 = vec_sra(temp3, vctwo);
|
|
nengel@2
|
446
|
|
nengel@2
|
447 pixelssum3 = vec_add(pixelssum4, vctwo);
|
|
nengel@2
|
448 pixelssum1 = vec_add(pixelssum2, vctwo);
|
|
nengel@2
|
449
|
|
nengel@2
|
450 blockv = vec_packsu(temp3, temp4);
|
|
nengel@2
|
451
|
|
nengel@2
|
452 vec_st(blockv, 0, block);
|
|
nengel@2
|
453
|
|
nengel@2
|
454 block += line_size;
|
|
nengel@2
|
455 pixels += line_size;
|
|
nengel@2
|
456 }
|
|
nengel@2
|
457
|
|
nengel@2
|
458 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
|
|
nengel@2
|
459 }
|
|
nengel@2
|
460
|
|
nengel@2
|
461 /* next one assumes that ((line_size % 16) == 0) */
|
|
nengel@2
|
462 static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
|
|
nengel@2
|
463 {
|
|
nengel@2
|
464 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
|
|
nengel@2
|
465 register int i;
|
|
nengel@2
|
466 register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
|
|
nengel@2
|
467 register vector unsigned char blockv, temp1, temp2;
|
|
nengel@2
|
468 register vector unsigned short temp3, temp4,
|
|
nengel@2
|
469 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
|
|
nengel@2
|
470 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
|
|
nengel@2
|
471 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
|
|
nengel@2
|
472 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
|
|
nengel@2
|
473
|
|
nengel@2
|
474 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
|
|
nengel@2
|
475
|
|
nengel@2
|
476 temp1 = vec_ld(0, pixels);
|
|
nengel@2
|
477 temp2 = vec_ld(16, pixels);
|
|
nengel@2
|
478 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
|
|
nengel@2
|
479 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
|
|
nengel@2
|
480 pixelsv2 = temp2;
|
|
nengel@2
|
481 } else {
|
|
nengel@2
|
482 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
|
|
nengel@2
|
483 }
|
|
nengel@2
|
484 pixelsv3 = vec_mergel(vczero, pixelsv1);
|
|
nengel@2
|
485 pixelsv4 = vec_mergel(vczero, pixelsv2);
|
|
nengel@2
|
486 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
nengel@2
|
487 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
nengel@2
|
488 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
|
|
nengel@2
|
489 (vector unsigned short)pixelsv4);
|
|
nengel@2
|
490 pixelssum3 = vec_add(pixelssum3, vcone);
|
|
nengel@2
|
491 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
|
|
nengel@2
|
492 (vector unsigned short)pixelsv2);
|
|
nengel@2
|
493 pixelssum1 = vec_add(pixelssum1, vcone);
|
|
nengel@2
|
494
|
|
nengel@2
|
495 for (i = 0; i < h ; i++) {
|
|
nengel@2
|
496 blockv = vec_ld(0, block);
|
|
nengel@2
|
497
|
|
nengel@2
|
498 temp1 = vec_ld(line_size, pixels);
|
|
nengel@2
|
499 temp2 = vec_ld(line_size + 16, pixels);
|
|
nengel@2
|
500 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
|
|
nengel@2
|
501 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
|
|
nengel@2
|
502 pixelsv2 = temp2;
|
|
nengel@2
|
503 } else {
|
|
nengel@2
|
504 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
|
|
nengel@2
|
505 }
|
|
nengel@2
|
506
|
|
nengel@2
|
507 pixelsv3 = vec_mergel(vczero, pixelsv1);
|
|
nengel@2
|
508 pixelsv4 = vec_mergel(vczero, pixelsv2);
|
|
nengel@2
|
509 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
nengel@2
|
510 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
nengel@2
|
511
|
|
nengel@2
|
512 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
|
|
nengel@2
|
513 (vector unsigned short)pixelsv4);
|
|
nengel@2
|
514 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
|
|
nengel@2
|
515 (vector unsigned short)pixelsv2);
|
|
nengel@2
|
516 temp4 = vec_add(pixelssum3, pixelssum4);
|
|
nengel@2
|
517 temp4 = vec_sra(temp4, vctwo);
|
|
nengel@2
|
518 temp3 = vec_add(pixelssum1, pixelssum2);
|
|
nengel@2
|
519 temp3 = vec_sra(temp3, vctwo);
|
|
nengel@2
|
520
|
|
nengel@2
|
521 pixelssum3 = vec_add(pixelssum4, vcone);
|
|
nengel@2
|
522 pixelssum1 = vec_add(pixelssum2, vcone);
|
|
nengel@2
|
523
|
|
nengel@2
|
524 blockv = vec_packsu(temp3, temp4);
|
|
nengel@2
|
525
|
|
nengel@2
|
526 vec_st(blockv, 0, block);
|
|
nengel@2
|
527
|
|
nengel@2
|
528 block += line_size;
|
|
nengel@2
|
529 pixels += line_size;
|
|
nengel@2
|
530 }
|
|
nengel@2
|
531
|
|
nengel@2
|
532 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
|
|
nengel@2
|
533 }
|
|
nengel@2
|
534
|
|
nengel@2
|
535 /* next one assumes that ((line_size % 8) == 0) */
|
|
nengel@2
|
536 static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
nengel@2
|
537 {
|
|
nengel@2
|
538 POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
|
|
nengel@2
|
539 register int i;
|
|
nengel@2
|
540 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
|
|
nengel@2
|
541 register vector unsigned char blockv, temp1, temp2, blocktemp;
|
|
nengel@2
|
542 register vector unsigned short pixelssum1, pixelssum2, temp3;
|
|
nengel@2
|
543
|
|
nengel@2
|
544 register const vector unsigned char vczero = (const vector unsigned char)
|
|
nengel@2
|
545 vec_splat_u8(0);
|
|
nengel@2
|
546 register const vector unsigned short vctwo = (const vector unsigned short)
|
|
nengel@2
|
547 vec_splat_u16(2);
|
|
nengel@2
|
548
|
|
nengel@2
|
549 temp1 = vec_ld(0, pixels);
|
|
nengel@2
|
550 temp2 = vec_ld(16, pixels);
|
|
nengel@2
|
551 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
|
|
nengel@2
|
552 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
|
|
nengel@2
|
553 pixelsv2 = temp2;
|
|
nengel@2
|
554 } else {
|
|
nengel@2
|
555 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
|
|
nengel@2
|
556 }
|
|
nengel@2
|
557 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
nengel@2
|
558 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
nengel@2
|
559 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
|
|
nengel@2
|
560 (vector unsigned short)pixelsv2);
|
|
nengel@2
|
561 pixelssum1 = vec_add(pixelssum1, vctwo);
|
|
nengel@2
|
562
|
|
nengel@2
|
563 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
|
|
nengel@2
|
564 for (i = 0; i < h ; i++) {
|
|
nengel@2
|
565 int rightside = ((unsigned long)block & 0x0000000F);
|
|
nengel@2
|
566 blockv = vec_ld(0, block);
|
|
nengel@2
|
567
|
|
nengel@2
|
568 temp1 = vec_ld(line_size, pixels);
|
|
nengel@2
|
569 temp2 = vec_ld(line_size + 16, pixels);
|
|
nengel@2
|
570 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
|
|
nengel@2
|
571 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
|
|
nengel@2
|
572 pixelsv2 = temp2;
|
|
nengel@2
|
573 } else {
|
|
nengel@2
|
574 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
|
|
nengel@2
|
575 }
|
|
nengel@2
|
576
|
|
nengel@2
|
577 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
nengel@2
|
578 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
nengel@2
|
579 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
|
|
nengel@2
|
580 (vector unsigned short)pixelsv2);
|
|
nengel@2
|
581 temp3 = vec_add(pixelssum1, pixelssum2);
|
|
nengel@2
|
582 temp3 = vec_sra(temp3, vctwo);
|
|
nengel@2
|
583 pixelssum1 = vec_add(pixelssum2, vctwo);
|
|
nengel@2
|
584 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
|
|
nengel@2
|
585
|
|
nengel@2
|
586 if (rightside) {
|
|
nengel@2
|
587 blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
|
|
nengel@2
|
588 } else {
|
|
nengel@2
|
589 blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
|
|
nengel@2
|
590 }
|
|
nengel@2
|
591
|
|
nengel@2
|
592 blockv = vec_avg(blocktemp, blockv);
|
|
nengel@2
|
593 vec_st(blockv, 0, block);
|
|
nengel@2
|
594
|
|
nengel@2
|
595 block += line_size;
|
|
nengel@2
|
596 pixels += line_size;
|
|
nengel@2
|
597 }
|
|
nengel@2
|
598
|
|
nengel@2
|
599 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
|
|
nengel@2
|
600 }
|
|
nengel@2
|
601
|
|
nengel@2
|
602 void dsputil_init_altivec(DSPContext* c)
|
|
nengel@2
|
603 {
|
|
nengel@2
|
604 c->diff_pixels = diff_pixels_altivec;
|
|
nengel@2
|
605 c->get_pixels = get_pixels_altivec;
|
|
nengel@2
|
606 c->clear_block = clear_block_altivec;
|
|
nengel@2
|
607
|
|
nengel@2
|
608 c->put_pixels_tab[0][0] = put_pixels16_altivec;
|
|
nengel@2
|
609 /* the two functions do the same thing, so use the same code */
|
|
nengel@2
|
610 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
|
|
nengel@2
|
611 c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
|
|
nengel@2
|
612 c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
|
|
nengel@2
|
613 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
|
|
nengel@2
|
614 c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
|
|
nengel@2
|
615 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
|
|
nengel@2
|
616 c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
|
|
nengel@2
|
617 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
|
|
nengel@2
|
618
|
|
nengel@2
|
619 }
|