PR/Applications/VSs/VSs__H264__App: ea1ba68cf0ed libavcodec/cell/dsputil

view libavcodec/cell/dsputil_spu.c @ 9:ea1ba68cf0ed

update to match api changes + add sscc produced source

author	Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date	Wed, 05 Jun 2013 14:43:26 +0200
parents
children

line source

1 /*

3 *

4 * Cell Parallel SPU - 2DWave Macroblock Decoding.

5 */

7 /**

8 * @file libavcodec/cell/dsputil_spu.c

9 * Cell Parallel SPU - 2DWave Macroblock Decoding

10 * @author C C Chi <c.c.chi@student.tudelft.nl>

11 *

12 * SIMD SPU kernels

13 * H.264/AVC motion compensation

14 * @author Mauricio Alvarez <alvarez@ac.upc.edu>

15 * @author Albert Paradis <apar7632@hotmail.com>

16 */

19 #include "dsputil_spu.h"

20 #include "h264_idct_spu.h"

21 #include "h264_deblock_spu.h"

22 #include "types_spu.h"

23 #include "libavutil/intreadwrite.h"

25 #include <stdio.h>

26 #include <spu_intrinsics.h>

27 #include <spu_mfcio.h>

28 #include <assert.h>

30 // Luma interpolation: the low-pass template below is included once per store operator.

/* Store operators substituted for OP_U8_SPU(d, s, dst) while the template is included. */
/* PUT: assigns s to d; dst is only cast to void. */
/* NOTE(review): this expands to two statements, so it is unsafe as the sole body of an
 * unbraced if/else -- confirm how h264_luma_template_spu.c uses it. */
31 #define PUT_OP_U8_SPU(d, s, dst) (void) dst; d = s

/* AVG: assigns spu_avg(dst, s) to d. */
32 #define AVG_OP_U8_SPU(d, s, dst) d = spu_avg(dst, s)

/* First pass: select the PUT operator and map every PREFIX_* name to its put_* name. */
/* Presumably the template names the functions it defines through these PREFIX_* macros
 * -- TODO confirm in h264_luma_template_spu.c. */
34 #define OP_U8_SPU PUT_OP_U8_SPU

35 #define PREFIX_h264_qpel16_h_lowpass_spu put_h264_qpel16_h_lowpass_spu

36 #define PREFIX_h264_qpel16_v_lowpass_spu put_h264_qpel16_v_lowpass_spu

37 #define PREFIX_h264_qpel16_hv_lowpass_spu put_h264_qpel16_hv_lowpass_spu

38 #define PREFIX_h264_qpel8_h_lowpass_spu put_h264_qpel8_h_lowpass_spu

39 #define PREFIX_h264_qpel8_v_lowpass_spu put_h264_qpel8_v_lowpass_spu

40 #define PREFIX_h264_qpel8_hv_lowpass_spu put_h264_qpel8_hv_lowpass_spu

41 #define PREFIX_h264_qpel4_h_lowpass_spu put_h264_qpel4_h_lowpass_spu

42 #define PREFIX_h264_qpel4_v_lowpass_spu put_h264_qpel4_v_lowpass_spu

43 #define PREFIX_h264_qpel4_hv_lowpass_spu put_h264_qpel4_hv_lowpass_spu

/* Include the template with the put_ configuration active. */
44 #include "h264_luma_template_spu.c"

/* Drop the put_ configuration so the names can be redefined for the second pass. */
45 #undef OP_U8_SPU

46 #undef PREFIX_h264_qpel16_h_lowpass_spu

47 #undef PREFIX_h264_qpel16_v_lowpass_spu

48 #undef PREFIX_h264_qpel16_hv_lowpass_spu

49 #undef PREFIX_h264_qpel8_h_lowpass_spu

50 #undef PREFIX_h264_qpel8_v_lowpass_spu

51 #undef PREFIX_h264_qpel8_hv_lowpass_spu

52 #undef PREFIX_h264_qpel4_h_lowpass_spu

53 #undef PREFIX_h264_qpel4_v_lowpass_spu

54 #undef PREFIX_h264_qpel4_hv_lowpass_spu

/* Second pass: select the AVG operator and map every PREFIX_* name to its avg_* name. */
56 #define OP_U8_SPU AVG_OP_U8_SPU

57 #define PREFIX_h264_qpel16_h_lowpass_spu avg_h264_qpel16_h_lowpass_spu

58 #define PREFIX_h264_qpel16_v_lowpass_spu avg_h264_qpel16_v_lowpass_spu

59 #define PREFIX_h264_qpel16_hv_lowpass_spu avg_h264_qpel16_hv_lowpass_spu

60 #define PREFIX_h264_qpel8_h_lowpass_spu avg_h264_qpel8_h_lowpass_spu

61 #define PREFIX_h264_qpel8_v_lowpass_spu avg_h264_qpel8_v_lowpass_spu

62 #define PREFIX_h264_qpel8_hv_lowpass_spu avg_h264_qpel8_hv_lowpass_spu

63 #define PREFIX_h264_qpel4_h_lowpass_spu avg_h264_qpel4_h_lowpass_spu

64 #define PREFIX_h264_qpel4_v_lowpass_spu avg_h264_qpel4_v_lowpass_spu

65 #define PREFIX_h264_qpel4_hv_lowpass_spu avg_h264_qpel4_hv_lowpass_spu

/* Include the same template again, now with the avg_ configuration active. */
66 #include "h264_luma_template_spu.c"

/* Clean up so none of the template configuration macros leak into the rest of the file. */
67 #undef OP_U8_SPU

68 #undef PREFIX_h264_qpel16_h_lowpass_spu

69 #undef PREFIX_h264_qpel16_v_lowpass_spu

70 #undef PREFIX_h264_qpel16_hv_lowpass_spu

71 #undef PREFIX_h264_qpel8_h_lowpass_spu

72 #undef PREFIX_h264_qpel8_v_lowpass_spu

73 #undef PREFIX_h264_qpel8_hv_lowpass_spu

74 #undef PREFIX_h264_qpel4_h_lowpass_spu

75 #undef PREFIX_h264_qpel4_v_lowpass_spu

76 #undef PREFIX_h264_qpel4_hv_lowpass_spu

/*
 * H264_MC(OPNAME, SIZE, CODETYPE)
 *
 * Expands to sixteen static functions named
 *   OPNAME h264_qpel SIZE _mcXY_ CODETYPE (dst, src, dst_stride, h)
 * with XY in {00,10,20,30,01,02,03,11,31,13,33,22,21,23,12,32}.
 * (X and Y are presumably the horizontal and vertical quarter-sample offsets
 * of the H.264 luma interpolation -- TODO confirm against the dispatch table
 * that references these functions.)
 *
 * What the generated bodies do:
 *  - mc00: forwards to OPNAME pixels SIZE, passing STRIDE_Y as source stride.
 *  - mc20 / mc02 / mc22: run the OPNAME h / v / hv low-pass straight into dst
 *    (mc22 hands an int16_t scratch buffer `tmp` to the hv filter).
 *  - mc10, mc30, mc01, mc03: run the put_ h or v low-pass into the scratch
 *    buffer `half`, then OPNAME pixels SIZE _l2 combines `half` with src,
 *    src+1 or src+STRIDE_Y as written in each body.
 *  - mc11, mc31, mc13, mc33: run the put_ h and v low-pass into halfH and
 *    halfV (sources offset by STRIDE_Y and/or 1 as written), then _l2
 *    combines the two scratch buffers.
 *  - mc21, mc23, mc12, mc32: run one put_ h or v low-pass plus the put_ hv
 *    low-pass into halfHV, then _l2 combines the two scratch buffers.
 *
 * Intermediate filtering always uses the put_ variants; only the final store
 * uses OPNAME. Every scratch pixel buffer is 16*16 bytes and is addressed with
 * a row stride of 16 regardless of SIZE. STRIDE_Y and DECLARE_ALIGNED_16 are
 * not defined in this part of the file.
 */
78 #define H264_MC(OPNAME, SIZE, CODETYPE) \

79 static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\

80 OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, dst_stride, STRIDE_Y, h);\

81 }\

82 \

83 static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){ \

84 DECLARE_ALIGNED_16(uint8_t, half[16*16]);\

85 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\

86 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\

87 }\

88 \

89 static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\

90 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\

91 }\

92 \

93 static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\

94 DECLARE_ALIGNED_16(uint8_t, half[16*16]);\

95 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\

96 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, dst_stride, STRIDE_Y, h);\

97 }\

98 \

99 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\

100 DECLARE_ALIGNED_16(uint8_t, half[16*16]);\

101 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\

102 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\

103 }\

104 \

105 static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\

106 OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\

107 }\

108 \

109 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\

110 DECLARE_ALIGNED_16(uint8_t, half[16*16]);\

111 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\

112 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+STRIDE_Y, half, dst_stride, STRIDE_Y, h);\

113 }\

114 \

115 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\

116 DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\

117 DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\

118 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\

119 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\

120 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\

121 }\

122 \

123 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\

124 DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\

125 DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\

126 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\

127 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\

128 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\

129 }\

130 \

131 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\

132 DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\

133 DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\

134 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\

135 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\

136 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\

137 }\

138 \

139 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\

140 DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\

141 DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\

142 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\

143 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\

144 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\

145 }\

146 \

147 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\

148 DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\

149 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, dst_stride, 16, h);\

150 }\

151 \

152 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\

153 DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\

154 DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\

155 DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\

156 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\

157 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\

158 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\

159 }\

160 \

161 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\

162 DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\

163 DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\

164 DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\

165 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\

166 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\

167 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\

168 }\

169 \

170 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\

171 DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\

172 DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\

173 DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\

174 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\

175 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\

176 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\

177 }\

178 \

179 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\

180 DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\

181 DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\

182 DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\

183 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\

184 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\

185 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\

186 }\

187

188

189 /**************************/

190 /* put pixels functions */

191 /*************************/

192

193 static void put_pixels16_l2_spu( uint8_t * dst, const uint8_t * src1,

194 const uint8_t * src2, int dst_stride,

195 int src_stride1, int h)

196 {

197 int i;

198

199 const int perm_src1 = (unsigned int) src1 & 15;

200

201 for (i=0; i<h; i++){

202 //unaligned load of src1

203 const vuint8_t srctmpa1 = *(vuint8_t *)(src1);

204 const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);

205 const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));

206

207 //aligned load of src2

208 const vuint8_t srcb = *(vuint8_t *)(src2);

209

210 //average and rounding

211 const vuint8_t avgc = spu_avg(srca,srcb);

212

213 // 16x16 dest luma blocks are always aligned

214 *(vuint8_t *)dst=avgc;

215

216 src1 +=src_stride1;

217 src2 +=16;

218 dst +=dst_stride;

219 }

220 }

221

222 static void avg_pixels16_l2_spu( uint8_t * dst, const uint8_t * src1,

223 const uint8_t * src2, int dst_stride,

224 int src_stride1, int h)

225 {

226 int i;

227

228 const int perm_src1 = (unsigned int) src1 & 15;

229

230 for (i=0; i<h; i++){

231 //unaligned load of src1

232 const vuint8_t srctmpa1 = *(vuint8_t *)(src1);

233 const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);

234 const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));

235

236 //aligned load of src2

237 const vuint8_t srcb = *(vuint8_t *)(src2);

238

239 //average and rounding

240 const vuint8_t avgc = spu_avg(spu_avg(srca,srcb), *(vuint8_t *)dst);

241

242 // 16x16 dest luma blocks are always aligned

243 *(vuint8_t *)dst=avgc;

244

245 src1 +=src_stride1;

246 src2 +=16;

247 dst +=dst_stride;

248 }

249 }

250

251 // next one assumes that ((line_size % 16) == 0)

252 void put_pixels16_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)

253 {

254 register vector unsigned char pixelsv1, pixelsv2;

255 register vector unsigned char pixelsv1B, pixelsv2B;

256 register vector unsigned char pixelsv1C, pixelsv2C;

257 register vector unsigned char pixelsv1D, pixelsv2D;

258

259 const int perm = (unsigned int) src & 15;

260 int i;

261 register int line_size = src_stride;

262 register int line_size_2 = line_size << 1;

263 register int line_size_3 = line_size + line_size_2;

264 register int line_size_4 = line_size << 2;

265

266 register int dst_stride_2 = dst_stride << 1;

267 register int dst_stride_3 = dst_stride_2 + dst_stride;

268 register int dst_stride_4 = dst_stride << 2;

269

270 for(i=0; i<h; i+=4) {

271 pixelsv1 = *(vuint8_t *)(src);

272 pixelsv2 = *(vuint8_t *)(src+16);

273 pixelsv1B = *(vuint8_t *)(src + line_size);

274 pixelsv2B = *(vuint8_t *)(src+16 + line_size);

275 pixelsv1C = *(vuint8_t *)(src + line_size_2);

276 pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);

277 pixelsv1D = *(vuint8_t *)(src + line_size_3);

278 pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);

279

280 *(vuint8_t *) dst = spu_or(spu_slqwbyte(pixelsv1, perm), spu_rlmaskqwbyte(pixelsv2, perm-16));

281 *(vuint8_t *)(dst + dst_stride) = spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16));

282 *(vuint8_t *)(dst + dst_stride_2) = spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16));

283 *(vuint8_t *)(dst + dst_stride_3) = spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16));

284

285 src+= line_size_4;

286 dst+= dst_stride_4;

287 }

288 }

289

290 // next one assumes that ((line_size % 16) == 0)

291 void avg_pixels16_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)

292 {

293 register vector unsigned char pixelsv1, pixelsv2;

294 register vector unsigned char pixelsv1B, pixelsv2B;

295 register vector unsigned char pixelsv1C, pixelsv2C;

296 register vector unsigned char pixelsv1D, pixelsv2D;

297

298 const int perm = (unsigned int) src & 15;

299 int i;

300 register int line_size = src_stride;

301 register int line_size_2 = line_size << 1;

302 register int line_size_3 = line_size + line_size_2;

303 register int line_size_4 = line_size << 2;

304

305 register int dst_stride_2 = dst_stride << 1;

306 register int dst_stride_3 = dst_stride_2 + dst_stride;

307 register int dst_stride_4 = dst_stride << 2;

308

309

310 for(i=0; i<h; i+=4) {

311 pixelsv1 = *(vuint8_t *)(src);

312 pixelsv2 = *(vuint8_t *)(src+16);

313 pixelsv1B = *(vuint8_t *)(src + line_size);

314 pixelsv2B = *(vuint8_t *)(src+16 + line_size);

315 pixelsv1C = *(vuint8_t *)(src + line_size_2);

316 pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);

317 pixelsv1D = *(vuint8_t *)(src + line_size_3);

318 pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);

319

320 *(vuint8_t *)dst = spu_avg(spu_or(spu_slqwbyte(pixelsv1, perm), spu_rlmaskqwbyte(pixelsv2, perm-16)), *(vuint8_t *)dst);

321 *(vuint8_t *)(dst + dst_stride) = spu_avg(spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), *(vuint8_t *)(dst+dst_stride));

322 *(vuint8_t *)(dst + dst_stride_2) = spu_avg(spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), *(vuint8_t *)(dst+dst_stride_2));

323 *(vuint8_t *)(dst + dst_stride_3) = spu_avg(spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), *(vuint8_t *)(dst+dst_stride_3));

324

325 src+= line_size_4;

326 dst+= dst_stride_4;

327 }

328 }

329

330 void put_pixels8_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,

331 int dst_stride, int src_stride1, int h)

332 {

333 int i;

334

335 const int perm_src1 = (unsigned int) src1 & 15;

336 const int shift_dst = (unsigned int) dst & 15;

337

338 // 8x dest luma blocks are aligned or desaligned by 8

339 vuint8_t dstmask;

340 const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};

341 const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};

342

343 if(shift_dst==0){

344 dstmask = dst8mask1;

345 }

346 else{

347 dstmask = dst8mask2;

348 }

349

350 for (i=0; i<h; i++){

351 //unaligned load of src1

352 const vuint8_t srctmpa1 = *(vuint8_t *)(src1);

353 const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);

354 const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));

355

356 //aligned load of src2

357 const vuint8_t srcb = *(vuint8_t *)(src2);

358

359 //average and rounding

360 const vuint8_t avgc = spu_avg(srca,srcb);

361

362 const vuint8_t dst1 = *(vuint8_t *)dst;

363

364 const vuint8_t davgc = spu_shuffle(dst1, avgc, dstmask);

365

366 *(vuint8_t *)dst=davgc;

367

368 src1 +=src_stride1;

369 src2 +=16;

370 dst +=dst_stride;

371 }

372 }

373

374 void avg_pixels8_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,

375 int dst_stride, int src_stride1, int h)

376 {

377 int i;

378

379 const int perm_src1 = (unsigned int) src1 & 15;

380 const int shift_dst = (unsigned int) dst & 15;

381

382 // 8x dest luma blocks are aligned or desaligned by 8

383 vuint8_t dstmask;

384 const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};

385 const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};

386

387 if(shift_dst==0){

388 dstmask = dst8mask1;

389 }

390 else{

391 dstmask = dst8mask2;

392 }

393

394 for (i=0; i<h; i++){

395 //unaligned load of src1

396 const vuint8_t srctmpa1 = *(vuint8_t *)(src1);

397 const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);

398 const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));

399

400 //aligned load of src2

401 const vuint8_t srcb = *(vuint8_t *)(src2);

402

403 //average and rounding

404 const vuint8_t avgc = spu_avg(srca,srcb);

405

406 const vuint8_t dst1 = *(vuint8_t *)dst;

407

408 const vuint8_t davgc1 = spu_shuffle(dst1, avgc, dstmask);

409

410 const vuint8_t davgc = spu_avg(dst1,davgc1);

411

412 *(vuint8_t *)dst=davgc;

413

414 src1 +=src_stride1;

415 src2 +=16;

416 dst +=dst_stride;

417 }

418 }

419

420 // next one assumes that ((line_size % 16) == 0)

421 void put_pixels8_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)

422 {

423 register vector unsigned char pixelsv1A, pixelsv2A;

424 register vector unsigned char pixelsv1B, pixelsv2B;

425 register vector unsigned char pixelsv1C, pixelsv2C;

426 register vector unsigned char pixelsv1D, pixelsv2D;

427

428 const int perm = (unsigned int) src & 15;

429 const int shift_dst = (unsigned int) dst & 15;

430

431 // 8x dest luma blocks are aligned or desaligned by 8

432 vuint8_t dstmask;

433 const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};

434 const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};

435

436 if(shift_dst==0){

437 dstmask = dst8mask1;

438 }

439 else{

440 dstmask = dst8mask2;

441 }

442

443 int i;

444 register int line_size = src_stride;

445 register int line_size_2 = line_size << 1;

446 register int line_size_3 = line_size + line_size_2;

447 register int line_size_4 = line_size << 2;

448

449 register int dst_stride_2 = dst_stride << 1;

450 register int dst_stride_3 = dst_stride_2 + dst_stride;

451 register int dst_stride_4 = dst_stride << 2;

452

453 for(i=0; i<h; i+=4) {

454 pixelsv1A = *(vuint8_t *)(src);

455 pixelsv2A = *(vuint8_t *)(src+16);

456 pixelsv1B = *(vuint8_t *)(src + line_size);

457 pixelsv2B = *(vuint8_t *)(src+16 + line_size);

458 pixelsv1C = *(vuint8_t *)(src + line_size_2);

459 pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);

460 pixelsv1D = *(vuint8_t *)(src + line_size_3);

461 pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);

462

463 const vuint8_t block1 = *(vuint8_t *)dst;

464 const vuint8_t put1 = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);

465 const vuint8_t block2 = *(vuint8_t *)(dst+dst_stride);

466 const vuint8_t put2 = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);

467 const vuint8_t block3 = *(vuint8_t *)(dst+2*dst_stride);

468 const vuint8_t put3 = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);

469 const vuint8_t block4 = *(vuint8_t *)(dst+3*dst_stride);

470 const vuint8_t put4 = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);

471

472 *(vuint8_t *) dst = put1;

473 *(vuint8_t *)(dst + dst_stride) = put2;

474 *(vuint8_t *)(dst + dst_stride_2) = put3;

475 *(vuint8_t *)(dst + dst_stride_3) = put4;

476

477 src += line_size_4;

478 dst += dst_stride_4;

479 }

480 }

481

482 // next one assumes that ((line_size % 16) == 0)

483 void avg_pixels8_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)

484 {

485 register vector unsigned char pixelsv1A, pixelsv2A;

486 register vector unsigned char pixelsv1B, pixelsv2B;

487 register vector unsigned char pixelsv1C, pixelsv2C;

488 register vector unsigned char pixelsv1D, pixelsv2D;

489

490 const int perm = (unsigned int) src & 15;

491 const int shift_dst = (unsigned int) dst & 15;

492

493 // 8x dest luma blocks are aligned or desaligned by 8

494 vuint8_t dstmask;

495 const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};

496 const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};

497

498 if(shift_dst==0){

499 dstmask = dst8mask1;

500 }

501 else{

502 dstmask = dst8mask2;

503 }

504

505 int i;

506 register int line_size = src_stride;

507 register int line_size_2 = line_size << 1;

508 register int line_size_3 = line_size + line_size_2;

509 register int line_size_4 = line_size << 2;

510

511 register int dst_stride_2 = dst_stride << 1;

512 register int dst_stride_3 = dst_stride_2 + dst_stride;

513 register int dst_stride_4 = dst_stride << 2;

514

515 for(i=0; i<h; i+=4) {

516 pixelsv1A = *(vuint8_t *)(src);

517 pixelsv2A = *(vuint8_t *)(src+16);

518 pixelsv1B = *(vuint8_t *)(src + line_size);

519 pixelsv2B = *(vuint8_t *)(src+16 + line_size);

520 pixelsv1C = *(vuint8_t *)(src + line_size_2);

521 pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);

522 pixelsv1D = *(vuint8_t *)(src + line_size_3);

523 pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);

524

525 const vuint8_t block1 = *(vuint8_t *) dst;

526 const vuint8_t put1a = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);

527 const vuint8_t put1 = spu_avg(block1,put1a);

528

529 const vuint8_t block2 = *(vuint8_t *)(dst + dst_stride);

530 const vuint8_t put2a = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);

531 const vuint8_t put2 = spu_avg(block2,put2a);

532

533 const vuint8_t block3 = *(vuint8_t *)(dst + dst_stride_2);

534 const vuint8_t put3a = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);

535 const vuint8_t put3 = spu_avg(block3,put3a);

536

537 const vuint8_t block4 = *(vuint8_t *)(dst + dst_stride_3);

538 const vuint8_t put4a = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);

539 const vuint8_t put4 = spu_avg(block4,put4a);

540

541 *(vuint8_t *) dst = put1;

542 *(vuint8_t *)(dst + dst_stride) = put2;

543 *(vuint8_t *)(dst + dst_stride_2) = put3;

544 *(vuint8_t *)(dst + dst_stride_3) = put4;

545

546 src+= line_size_4;

547 dst+= dst_stride_4;

548 }

549 }

550

551 void put_pixels4_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,

552 int dst_stride, int src_stride1, int h)

553 {

554 int i;

555

556 const int perm_src1 = (unsigned int) src1 & 15;

557 const int shift_dst = (unsigned int) dst & 15;

558

559 // 4x dest luma blocks are desaligned by 0, 4, 8, or 12

560 vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

561 const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};

562 const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};

563 const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};

564 const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};

565

566 switch(shift_dst){

567 case 0: dstmask = dstmask0;

568 break;

569 case 4: dstmask = dstmask4;

570 break;

571 case 8: dstmask = dstmask8;

572 break;

573 case 12: dstmask = dstmask12;

574 break;

575 }

576

577 for (i=0; i<h; i++){

578 //unaligned load of src1

579 const vuint8_t srctmpa1 = *(vuint8_t *)(src1);

580 const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);

581 const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));

582

583 //aligned load of src2

584 const vuint8_t srcb = *(vuint8_t *)(src2);

585

586 //average and rounding

587 const vuint8_t avgc = spu_avg(srca,srcb);

588

589 const vuint8_t dst1 = *(vuint8_t *)dst;

590

591 const vuint8_t davgc = spu_shuffle(dst1, avgc, dstmask);

592

593 *(vuint8_t *)dst=davgc;

594

595 src1 +=src_stride1;

596 src2 +=16;

597 dst +=dst_stride;

598 }

599 }

600

601 void avg_pixels4_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,

602 int dst_stride, int src_stride1, int h)

603 {

604 int i;

605

606 const int perm_src1 = (unsigned int) src1 & 15;

607 const int shift_dst = (unsigned int) dst & 15;

608

609 // 4x dest luma blocks are desaligned by 0, 4, 8, or 12

610 vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

611 const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};

612 const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};

613 const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};

614 const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};

615

616 switch(shift_dst){

617 case 0: dstmask = dstmask0;

618 break;

619 case 4: dstmask = dstmask4;

620 break;

621 case 8: dstmask = dstmask8;

622 break;

623 case 12: dstmask = dstmask12;

624 break;

625 }

626

627 for (i=0; i<h; i++){

628 //unaligned load of src1

629 const vuint8_t srctmpa1 = *(vuint8_t *)(src1);

630 const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);

631 const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));

632

633 //aligned load of src2

634 const vuint8_t srcb = *(vuint8_t *)(src2);

635

636 //average and rounding

637 const vuint8_t avgc = spu_avg(srca,srcb);

638

639 const vuint8_t dst1 = *(vuint8_t *)dst;

640

641 const vuint8_t davgc1 = spu_shuffle(dst1, avgc, dstmask);

642

643 const vuint8_t davgc = spu_avg(dst1,davgc1);

644

645 *(vuint8_t *)dst=davgc;

646

647 src1 +=src_stride1;

648 src2 +=16;

649 dst +=dst_stride;

650 }

651 }

652

653 // next one assumes that ((line_size % 16) == 0)

654 void put_pixels4_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)

655 {

656 register vector unsigned char pixelsv1A, pixelsv2A;

657 register vector unsigned char pixelsv1B, pixelsv2B;

658 register vector unsigned char pixelsv1C, pixelsv2C;

659 register vector unsigned char pixelsv1D, pixelsv2D;

660

661 const int perm = (unsigned int) src & 15;

662 const int shift_dst = (unsigned int) dst & 15;

663

664 // 4x dest luma blocks are desaligned by 0, 4, 8, or 12

665 vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

666 const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};

667 const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};

668 const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};

669 const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};

670

671 switch(shift_dst){

672 case 0: dstmask = dstmask0;

673 break;

674 case 4: dstmask = dstmask4;

675 break;

676 case 8: dstmask = dstmask8;

677 break;

678 case 12: dstmask = dstmask12;

679 break;

680 }

681

682 int i;

683 register int line_size = src_stride;

684 register int line_size_2 = line_size << 1;

685 register int line_size_3 = line_size + line_size_2;

686 register int line_size_4 = line_size << 2;

687

688 register int dst_stride_2 = dst_stride << 1;

689 register int dst_stride_3 = dst_stride_2 + dst_stride;

690 register int dst_stride_4 = dst_stride << 2;

691

692 for(i=0; i<h; i+=4) {

693 pixelsv1A = *(vuint8_t *)(src);

694 pixelsv2A = *(vuint8_t *)(src+16);

695 pixelsv1B = *(vuint8_t *)(src + line_size);

696 pixelsv2B = *(vuint8_t *)(src+16 + line_size);

697 pixelsv1C = *(vuint8_t *)(src + line_size_2);

698 pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);

699 pixelsv1D = *(vuint8_t *)(src + line_size_3);

700 pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);

701

702 const vuint8_t block1 = *(vuint8_t *)dst;

703 const vuint8_t put1 = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);

704 const vuint8_t block2 = *(vuint8_t *)(dst+dst_stride);

705 const vuint8_t put2 = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);

706 const vuint8_t block3 = *(vuint8_t *)(dst+dst_stride_2);

707 const vuint8_t put3 = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);

708 const vuint8_t block4 = *(vuint8_t *)(dst+dst_stride_3);

709 const vuint8_t put4 = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);

710

711 *(vuint8_t *) dst = put1;

712 *(vuint8_t *)(dst + dst_stride) = put2;

713 *(vuint8_t *)(dst + dst_stride_2) = put3;

714 *(vuint8_t *)(dst + dst_stride_3) = put4;

715

716 src += line_size_4;

717 dst += dst_stride_4;

718 }

719 }

720

721 // next one assumes that ((line_size % 16) == 0)

/**
 * Average a 4-byte-wide block from src into dst, four rows per loop pass.
 *
 * For each row the 16-byte vector at dst is loaded, the four lanes selected
 * by dstmask are replaced with the first four realigned source bytes, and
 * the result is averaged with the loaded dst vector via spu_avg before being
 * stored back. Lanes outside the 4-byte window carry the same dst byte on
 * both sides of the average.
 *
 * @param dst        destination; its offset within 16 bytes (dst & 15)
 *                   selects the shuffle mask
 * @param src        source; its offset within 16 bytes (src & 15) drives
 *                   the realigning shifts
 * @param dst_stride byte distance between consecutive destination rows
 * @param src_stride byte distance between consecutive source rows
 * @param h          row count; rows are handled in groups of four, so a
 *                   value that is not a multiple of 4 still touches a full
 *                   last group of four rows
 *
 * NOTE(review): the vector loads and stores through src and dst appear to
 * rely on the access using the enclosing aligned 16-byte quadword -- confirm
 * for the target compiler/ABI.
 */
722 void avg_pixels4_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)

723 {

// Two consecutive 16-byte source vectors per row; A..D are the four rows of one pass.
724 register vector unsigned char pixelsv1A, pixelsv2A;

725 register vector unsigned char pixelsv1B, pixelsv2B;

726 register vector unsigned char pixelsv1C, pixelsv2C;

727 register vector unsigned char pixelsv1D, pixelsv2D;

728

// Byte offsets of src and dst within a 16-byte span.
729 const int perm = (unsigned int) src & 15;

730 const int shift_dst = (unsigned int) dst & 15;

731

732 // 4x dest luma blocks are misaligned by 0, 4, 8, or 12

// Starts all-zero and is overwritten by the switch below for the four supported offsets.
733 vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

// In each mask the indices 0x10..0x13 pick the first four bytes of the second
// shuffle operand (the realigned source); indices 0x00..0x0F keep the byte of
// the first operand (the dst vector) at that same position.
734 const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};

735 const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};

736 const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};

737 const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};

738

// NOTE(review): there is no default case. For any other dst offset dstmask
// stays all-zero, which looks like it would select byte 0 of the dst vector
// for every lane -- confirm callers only pass 4-byte-aligned dst.
739 switch(shift_dst){

740 case 0: dstmask = dstmask0;

741 break;

742 case 4: dstmask = dstmask4;

743 break;

744 case 8: dstmask = dstmask8;

745 break;

746 case 12: dstmask = dstmask12;

747 break;

748 }

749

750 int i;

// Precomputed multiples (x1..x4) of the source stride for the four rows of a pass.
751 register int line_size = src_stride;

752 register int line_size_2 = line_size << 1;

753 register int line_size_3 = line_size + line_size_2;

754 register int line_size_4 = line_size << 2;

755

// Same for the destination stride.
756 register int dst_stride_2 = dst_stride << 1;

757 register int dst_stride_3 = dst_stride_2 + dst_stride;

758 register int dst_stride_4 = dst_stride << 2;

759

// Each pass handles four rows: load, merge/average, then store.
760 for(i=0; i<h; i+=4) {

// Load the vector at src and the one 16 bytes further on, for each of the four rows.
761 pixelsv1A = *(vuint8_t *)(src);

762 pixelsv2A = *(vuint8_t *)(src+16);

763 pixelsv1B = *(vuint8_t *)(src + line_size);

764 pixelsv2B = *(vuint8_t *)(src+16 + line_size);

765 pixelsv1C = *(vuint8_t *)(src + line_size_2);

766 pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);

767 pixelsv1D = *(vuint8_t *)(src + line_size_3);

768 pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);

769

// Row A: the OR of the two shifted source vectors is intended to place the
// byte at src in lane 0; dstmask then drops its first four bytes into the
// dst vector, and spu_avg blends that with the dst vector as loaded.
770 const vuint8_t block1 = *(vuint8_t *) dst;

771 const vuint8_t put1a = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);

772 const vuint8_t put1 = spu_avg(block1,put1a);

773

// Row B: same steps one destination/source stride further.
774 const vuint8_t block2 = *(vuint8_t *)(dst + dst_stride);

775 const vuint8_t put2a = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);

776 const vuint8_t put2 = spu_avg(block2,put2a);

777

// Row C.
778 const vuint8_t block3 = *(vuint8_t *)(dst + dst_stride_2);

779 const vuint8_t put3a = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);

780 const vuint8_t put3 = spu_avg(block3,put3a);

781

// Row D.
782 const vuint8_t block4 = *(vuint8_t *)(dst + dst_stride_3);

783 const vuint8_t put4a = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);

784 const vuint8_t put4 = spu_avg(block4,put4a);

785

// Write the four merged vectors back to their destination rows.
786 *(vuint8_t *) dst = put1;

787 *(vuint8_t *)(dst + dst_stride) = put2;

788 *(vuint8_t *)(dst + dst_stride_2) = put3;

789 *(vuint8_t *)(dst + dst_stride_3) = put4;

790

// Advance both pointers by four rows.
791 src+= line_size_4;

792 dst+= dst_stride_4;

793 }

794 }

795

796 /* Here we create all the interpolation modes of the H.264 motion compensation stage for luma */

// H264_MC is instantiated for the put_ and avg_ variants at sizes 16, 8 and 4,
// always with the spu suffix. The macro itself is defined outside this part of
// the file; NOTE(review): presumably each invocation expands to the set of
// quarter-pel luma functions for that variant and size -- confirm against the
// macro definition.
797 H264_MC(put_, 16, spu)

798 H264_MC(put_, 8, spu)

799 H264_MC(put_, 4, spu)

800

801 H264_MC(avg_, 16, spu)

802 H264_MC(avg_, 8, spu)

803 H264_MC(avg_, 4, spu)

804

805

806 //Chroma interpolation:

807

// The chroma template is included twice. Before each include, OP_U8_SPU and
// the three PREFIX_h264_chroma_mc{8,4,2}_spu names are defined, and they are
// undefined again afterwards so the next include can rebind them.
// NOTE(review): the template file is not visible here; presumably it defines
// one function per PREFIX_ name and uses OP_U8_SPU for the final store.

// First pass: "put" variant. OP_U8_SPU maps to PUT_OP_U8_SPU, which assigns the
// computed value and ignores the previous destination contents.
808 #define OP_U8_SPU PUT_OP_U8_SPU

809 #define PREFIX_h264_chroma_mc8_spu put_h264_chroma_mc8_spu

810 #define PREFIX_h264_chroma_mc4_spu put_h264_chroma_mc4_spu

811 #define PREFIX_h264_chroma_mc2_spu put_h264_chroma_mc2_spu

812 #include "h264_chroma_template_spu.c"

813 #undef OP_U8_SPU

814 #undef PREFIX_h264_chroma_mc8_spu

815 #undef PREFIX_h264_chroma_mc4_spu

816 #undef PREFIX_h264_chroma_mc2_spu

817

// Second pass: "avg" variant. OP_U8_SPU maps to AVG_OP_U8_SPU, which combines
// the computed value with the previous destination contents via spu_avg.
818 #define OP_U8_SPU AVG_OP_U8_SPU

819 #define PREFIX_h264_chroma_mc8_spu avg_h264_chroma_mc8_spu

820 #define PREFIX_h264_chroma_mc4_spu avg_h264_chroma_mc4_spu

821 #define PREFIX_h264_chroma_mc2_spu avg_h264_chroma_mc2_spu

822 #include "h264_chroma_template_spu.c"

823 #undef OP_U8_SPU

824 #undef PREFIX_h264_chroma_mc8_spu

825 #undef PREFIX_h264_chroma_mc4_spu

826 #undef PREFIX_h264_chroma_mc2_spu

827

828 // Weight and Biweight functions

829

// Per-sample helpers for the weighting code that follows. Neither macro takes
// its operands as arguments: dst, src, weight, weights, weightd, offset and
// log2_denom must all be in scope where the macro is expanded; only the sample
// index x is a parameter.
// NOTE(review): av_clip_uint8 is defined elsewhere; presumably it clamps the
// value to the 0..255 range -- confirm.

// op_scale1(x): single-source form. Scales dst[x] in place by weight, adds
// offset, shifts right by log2_denom and stores the clipped result to dst[x].
830 #define op_scale1(x) dst[x] = av_clip_uint8( (dst[x]*weight + offset) >> log2_denom )

// op_scale2(x): two-source form. Sums src[x]*weights and dst[x]*weightd, adds
// offset, shifts right by log2_denom+1 and stores the clipped result to dst[x].
831 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))

832 #define H264_WEIGHT(W,H) \

833 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \

834 int y; \

835 offset <<= log2_denom; \

836 if(log2_denom) offset += 1<<(log2_denom-1); \

837 for(y=0; y<H; y++, dst += stride){ \

838 op_scale1(0); \

839 op_scale1(1); \

840 if(W==2) continue; \

841 op_scale1(2); \

842 op_scale1(3); \

843 if(W==4) continue; \

844 op_scale1(4); \

845 op_scale1(5); \

846 op_scale1(6); \

847 op_scale1(7); \

848 if(W==8) continue; \

849 op_scale1(8); \

850 op_scale1(9); \

851 op_scale1(10); \

852 op_scale1(11); \

853 op_scale1(12); \

854 op_scale1(13); \

855 op_scale1(14); \

856 op_scale1(15); \

857 } \

858 } \

859 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int log2_denom, int weightd, int weights, int offset){ \

860 int y; \

861 offset = ((offset + 1) | 1) << log2_denom; \

862 for(y=0; y<H; y++, dst += dst_stride, src += src_stride){ \

863 op_scale2(0); \

864 op_scale2(1); \

865 if(W==2) continue; \

866 op_scale2(2); \

867 op_scale2(3); \

868 if(W==4) continue; \

869 op_scale2(4); \

870 op_scale2(5); \

871 op_scale2(6); \