27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
29 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
31 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
35 {-6, 123, 12, -1, 0, 0, 0, 0},
36 {2, -11, 108, 36, -8, 1, 0, 0},
37 {-9, 93, 50, -6, 0, 0, 0, 0},
38 {3, -16, 77, 77, -16, 3, 0, 0},
39 {-6, 50, 93, -9, 0, 0, 0, 0},
40 {1, -8, 36, 108, -11, 2, 0, 0},
41 {-1, 12, 123, -6, 0, 0, 0, 0},
54 #define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, \ 55 filt_h0, filt_h1, filt_h2) \ 57 v16i8 vec0_m, vec1_m, vec2_m; \ 60 VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, \ 61 vec0_m, vec1_m, vec2_m); \ 62 hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, \ 63 filt_h0, filt_h1, filt_h2); \ 65 hz_out_m = __msa_srari_h(hz_out_m, 7); \ 66 hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ 71 #define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ 72 mask0, mask1, mask2, \ 73 filt0, filt1, filt2, \ 76 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ 78 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ 79 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ 80 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ 81 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ 82 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ 83 DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \ 86 #define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ 87 mask0, mask1, mask2, \ 88 filt0, filt1, filt2, \ 89 out0, out1, out2, out3) \ 91 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ 93 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ 94 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ 95 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ 96 out0, out1, out2, out3); \ 97 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ 98 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ 99 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m); \ 100 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m); \ 101 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ 102 out0, out1, out2, out3); \ 103 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \ 104 out0, out1, out2, out3); \ 107 #define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ 111 tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \ 112 tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \ 117 #define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ 119 v16i8 vec0_m, vec1_m; \ 122 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m); \ 123 hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \ 125 hz_out_m = __msa_srari_h(hz_out_m, 7); \ 126 hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ 131 #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ 132 mask0, mask1, filt0, filt1, \ 135 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ 137 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ 138 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ 139 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ 140 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ 143 #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ 144 mask0, mask1, filt0, filt1, \ 145 out0, out1, out2, out3) \ 147 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ 149 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ 150 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ 151 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ 152 out0, out1, out2, out3); \ 153 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ 154 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ 155 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ 156 out0, out1, out2, out3); \ 163 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2;
164 v16u8 mask0, mask1, mask2,
out;
165 v8i16
filt, out0, out1;
171 filt =
LD_SH(filter);
177 LD_SB4(src, src_stride, src0, src1, src2, src3);
180 filt0, filt1, filt2, out0, out1);
184 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
191 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2;
192 v16u8 mask0, mask1, mask2,
out;
193 v8i16
filt, out0, out1, out2, out3;
199 filt =
LD_SH(filter);
205 LD_SB4(src, src_stride, src0, src1, src2, src3);
207 src += (4 * src_stride);
209 filt0, filt1, filt2, out0, out1);
210 LD_SB4(src, src_stride, src0, src1, src2, src3);
213 filt0, filt1, filt2, out2, out3);
217 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
219 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
224 int height,
int mx,
int my)
230 }
else if (8 == height) {
237 int height,
int mx,
int my)
241 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2;
242 v16u8 mask0, mask1, mask2, tmp0, tmp1;
243 v8i16
filt, out0, out1, out2, out3;
250 filt =
LD_SH(filter);
256 LD_SB4(src, src_stride, src0, src1, src2, src3);
258 src += (4 * src_stride);
260 filt0, filt1, filt2, out0, out1, out2, out3);
265 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
266 dst += (4 * dst_stride);
268 for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
269 LD_SB4(src, src_stride, src0, src1, src2, src3);
271 src += (4 * src_stride);
273 filt0, filt1, filt2, out0, out1, out2, out3);
278 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
279 dst += (4 * dst_stride);
285 int height,
int mx,
int my)
289 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
290 v16u8 mask0, mask1, mask2,
out;
291 v8i16
filt, out0, out1, out2, out3, out4, out5, out6, out7;
297 filt =
LD_SH(filter);
303 for (loop_cnt = (height >> 2); loop_cnt--;) {
304 LD_SB4(src, src_stride, src0, src2, src4, src6);
305 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
307 src += (4 * src_stride);
310 filt0, filt1, filt2, out0, out1, out2, out3);
312 filt0, filt1, filt2, out4, out5, out6, out7);
334 int height,
int mx,
int my)
338 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
339 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
340 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
342 v8i16
filt, out10, out32;
344 src -= (2 * src_stride);
346 filt =
LD_SH(filter);
349 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
350 src += (5 * src_stride);
352 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
354 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
357 for (loop_cnt = (height >> 2); loop_cnt--;) {
358 LD_SB4(src, src_stride, src5, src6, src7, src8);
359 src += (4 * src_stride);
361 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
362 src65_r, src76_r, src87_r);
363 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
365 out10 =
DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
366 out32 =
DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
370 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
371 dst += (4 * dst_stride);
381 int height,
int mx,
int my)
385 v16i8
src0,
src1, src2, src3, src4, src7, src8, src9, src10;
386 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
387 v16i8 src109_r, filt0, filt1, filt2;
389 v8i16
filt, out0_r, out1_r, out2_r, out3_r;
391 src -= (2 * src_stride);
393 filt =
LD_SH(filter);
396 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
397 src += (5 * src_stride);
400 ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3,
401 src10_r, src32_r, src21_r, src43_r);
403 for (loop_cnt = (height >> 2); loop_cnt--;) {
404 LD_SB4(src, src_stride, src7, src8, src9, src10);
406 src += (4 * src_stride);
408 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
409 src87_r, src98_r, src109_r);
410 out0_r =
DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
411 out1_r =
DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
412 out2_r =
DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
413 out3_r =
DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
415 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
418 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
419 dst += (4 * dst_stride);
431 int height,
int mx,
int my)
435 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
436 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
437 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
438 v16i8 src65_l, src87_l, filt0, filt1, filt2;
439 v16u8 tmp0, tmp1, tmp2, tmp3;
440 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l,
filt;
442 src -= (2 * src_stride);
444 filt =
LD_SH(filter);
447 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
448 src += (5 * src_stride);
451 ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r,
452 src32_r, src43_r, src21_r);
453 ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l,
454 src32_l, src43_l, src21_l);
456 for (loop_cnt = (height >> 2); loop_cnt--;) {
457 LD_SB4(src, src_stride, src5, src6, src7, src8);
458 src += (4 * src_stride);
461 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
462 src65_r, src76_r, src87_r);
463 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
464 src65_l, src76_l, src87_l);
465 out0_r =
DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1,
467 out1_r =
DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1,
469 out2_r =
DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1,
471 out3_r =
DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1,
473 out0_l =
DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1,
475 out1_l =
DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1,
477 out2_l =
DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1,
479 out3_l =
DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1,
483 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
484 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
485 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
486 out3_r, tmp0, tmp1, tmp2, tmp3);
488 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
489 dst += (4 * dst_stride);
505 int height,
int mx,
int my)
510 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
511 v16i8 filt_hz0, filt_hz1, filt_hz2;
512 v16u8 mask0, mask1, mask2,
out;
514 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
515 v8i16 hz_out7,
filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
518 src -= (2 + 2 * src_stride);
521 filt =
LD_SH(filter_horiz);
522 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
524 filt =
LD_SH(filter_vert);
525 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
530 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
531 src += (5 * src_stride);
538 hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
541 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
543 for (loop_cnt = (height >> 2); loop_cnt--;) {
544 LD_SB2(src, src_stride, src5, src6);
545 src += (2 * src_stride);
550 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
552 LD_SB2(src, src_stride, src7, src8);
553 src += (2 * src_stride);
558 hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
560 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
561 tmp0 =
DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
563 out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
564 tmp1 =
DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
569 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
570 dst += (4 * dst_stride);
580 int height,
int mx,
int my)
585 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
586 v16i8 filt_hz0, filt_hz1, filt_hz2;
587 v16u8 mask0, mask1, mask2, vec0, vec1;
588 v8i16
filt, filt_vt0, filt_vt1, filt_vt2;
589 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
590 v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
591 v8i16 tmp0, tmp1, tmp2, tmp3;
594 src -= (2 + 2 * src_stride);
597 filt =
LD_SH(filter_horiz);
598 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
603 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
604 src += (5 * src_stride);
618 filt =
LD_SH(filter_vert);
619 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
621 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
622 ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
624 for (loop_cnt = (height >> 2); loop_cnt--;) {
625 LD_SB4(src, src_stride, src5, src6, src7, src8);
626 src += (4 * src_stride);
631 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
632 tmp0 =
DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
636 out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
637 tmp1 =
DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
641 out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
642 tmp2 =
DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
646 out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
647 tmp3 =
DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
653 ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
654 dst += (4 * dst_stride);
667 int height,
int mx,
int my)
671 for (multiple8_cnt = 2; multiple8_cnt--;) {
684 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
685 v8i16
filt, out0, out1;
692 filt =
LD_SH(filter);
697 LD_SB4(src, src_stride, src0, src1, src2, src3);
700 filt0, filt1, out0, out1);
704 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
711 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
713 v8i16
filt, out0, out1, out2, out3;
719 filt =
LD_SH(filter);
724 LD_SB4(src, src_stride, src0, src1, src2, src3);
725 src += (4 * src_stride);
729 filt0, filt1, out0, out1);
730 LD_SB4(src, src_stride, src0, src1, src2, src3);
733 filt0, filt1, out2, out3);
737 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
739 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
746 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
747 v16i8 filt0, filt1, mask0, mask1;
749 v8i16
filt, out0, out1, out2, out3;
755 filt =
LD_SH(filter);
760 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
761 src += (8 * src_stride);
764 filt0, filt1, out0, out1);
766 filt0, filt1, out2, out3);
770 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
771 dst += (4 * dst_stride);
773 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
774 dst += (4 * dst_stride);
776 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
777 src += (8 * src_stride);
780 filt0, filt1, out0, out1);
782 filt0, filt1, out2, out3);
786 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
787 dst += (4 * dst_stride);
789 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
794 int height,
int mx,
int my)
800 }
else if (8 == height) {
802 }
else if (16 == height) {
809 int height,
int mx,
int my)
813 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
815 v8i16
filt, out0, out1, out2, out3;
821 filt =
LD_SH(filter);
826 for (loop_cnt = (height >> 2); loop_cnt--;) {
827 LD_SB4(src, src_stride, src0, src1, src2, src3);
828 src += (4 * src_stride);
832 filt1, out0, out1, out2, out3);
837 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
838 dst += (4 * dst_stride);
844 int height,
int mx,
int my)
848 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
849 v16i8 filt0, filt1, mask0, mask1;
850 v8i16
filt, out0, out1, out2, out3, out4, out5, out6, out7;
857 filt =
LD_SH(filter);
862 for (loop_cnt = (height >> 2); loop_cnt--;) {
863 LD_SB4(src, src_stride, src0, src2, src4, src6);
864 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
865 src += (4 * src_stride);
869 filt1, out0, out1, out2, out3);
871 filt1, out4, out5, out6, out7);
893 int height,
int mx,
int my)
897 v16i8
src0,
src1, src2, src3, src4, src5;
898 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
899 v16i8 src2110, src4332, filt0, filt1;
900 v8i16
filt, out10, out32;
905 filt =
LD_SH(filter);
908 LD_SB3(src, src_stride, src0, src1, src2);
909 src += (3 * src_stride);
911 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
913 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
914 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
916 for (loop_cnt = (height >> 2); loop_cnt--;) {
917 LD_SB3(src, src_stride, src3, src4, src5);
918 src += (3 * src_stride);
919 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
920 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
921 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
926 ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
927 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
928 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
933 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
934 dst += (4 * dst_stride);
940 int height,
int mx,
int my)
944 v16i8
src0,
src1, src2, src7, src8, src9, src10;
945 v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
947 v8i16
filt, out0_r, out1_r, out2_r, out3_r;
951 filt =
LD_SH(filter);
954 LD_SB3(src, src_stride, src0, src1, src2);
955 src += (3 * src_stride);
958 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
960 for (loop_cnt = (height >> 2); loop_cnt--;) {
961 LD_SB4(src, src_stride, src7, src8, src9, src10);
962 src += (4 * src_stride);
965 ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
966 src72_r, src87_r, src98_r, src109_r);
972 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
975 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
976 dst += (4 * dst_stride);
986 int height,
int mx,
int my)
990 v16i8
src0,
src1, src2, src3, src4, src5, src6;
991 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
992 v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
993 v16u8 tmp0, tmp1, tmp2, tmp3;
994 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
998 filt =
LD_SH(filter);
1001 LD_SB3(src, src_stride, src0, src1, src2);
1002 src += (3 * src_stride);
1005 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
1006 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
1008 for (loop_cnt = (height >> 2); loop_cnt--;) {
1009 LD_SB4(src, src_stride, src3, src4, src5, src6);
1010 src += (4 * src_stride);
1013 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
1014 src32_r, src43_r, src54_r, src65_r);
1015 ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
1016 src32_l, src43_l, src54_l, src65_l);
1027 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1028 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1029 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1030 out3_r, tmp0, tmp1, tmp2, tmp3);
1032 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1033 dst += (4 * dst_stride);
1045 int height,
int mx,
int my)
1050 v16i8
src0,
src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1051 v16u8 mask0, mask1,
out;
1052 v8i16
filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
1053 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
1056 src -= (1 + 1 * src_stride);
1059 filt =
LD_SH(filter_horiz);
1064 LD_SB3(src, src_stride, src0, src1, src2);
1065 src += (3 * src_stride);
1068 hz_out0 =
HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
1069 hz_out1 =
HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
1070 vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1072 filt =
LD_SH(filter_vert);
1075 for (loop_cnt = (height >> 2); loop_cnt--;) {
1076 LD_SB4(src, src_stride, src3, src4, src5, src6);
1077 src += (4 * src_stride);
1080 hz_out3 =
HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
1081 hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
1082 vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1086 hz_out5 =
HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
1087 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1088 vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1094 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1095 dst += (4 * dst_stride);
1104 int height,
int mx,
int my)
1109 v16i8
src0,
src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1110 v16u8 mask0, mask1, out0, out1;
1111 v8i16
filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
1112 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1113 v8i16 vec0, vec1, vec2, vec3, vec4;
1116 src -= (1 + 1 * src_stride);
1119 filt =
LD_SH(filter_horiz);
1124 LD_SB3(src, src_stride, src0, src1, src2);
1125 src += (3 * src_stride);
1128 hz_out0 =
HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
1129 hz_out1 =
HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
1130 hz_out2 =
HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
1131 ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
1133 filt =
LD_SH(filter_vert);
1136 for (loop_cnt = (height >> 2); loop_cnt--;) {
1137 LD_SB4(src, src_stride, src3, src4, src5, src6);
1138 src += (4 * src_stride);
1141 hz_out3 =
HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1142 vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1145 hz_out0 =
HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1146 vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
1149 hz_out1 =
HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1150 vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1153 hz_out2 =
HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1154 ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1);
1161 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1162 dst += (4 * dst_stride);
1171 int height,
int mx,
int my)
1175 for (multiple8_cnt = 2; multiple8_cnt--;) {
1186 int height,
int mx,
int my)
1191 v16i8
src0,
src1, src2, src3, src4, src5, src6;
1192 v16i8 filt_hz0, filt_hz1, filt_hz2;
1193 v16u8 res0, res1, mask0, mask1, mask2;
1194 v8i16
filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
1195 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
1198 src -= (2 + 1 * src_stride);
1201 filt =
LD_SH(filter_horiz);
1202 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
1207 LD_SB3(src, src_stride, src0, src1, src2);
1208 src += (3 * src_stride);
1212 filt_hz1, filt_hz2);
1214 filt_hz1, filt_hz2);
1215 vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1217 filt =
LD_SH(filter_vert);
1220 for (loop_cnt = (height >> 2); loop_cnt--;) {
1221 LD_SB4(src, src_stride, src3, src4, src5, src6);
1222 src += (4 * src_stride);
1226 filt_hz1, filt_hz2);
1227 hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
1228 vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1232 filt_hz1, filt_hz2);
1233 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1234 vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1241 ST_W2(res0, 0, 1, dst, dst_stride);
1242 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1243 dst += (4 * dst_stride);
1252 int height,
int mx,
int my)
1257 v16i8
src0,
src1, src2, src3, src4, src5, src6;
1258 v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
1259 v8i16
filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
1260 v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
1264 src -= (2 + src_stride);
1267 filt =
LD_SH(filter_horiz);
1268 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
1273 LD_SB3(src, src_stride, src0, src1, src2);
1274 src += (3 * src_stride);
1278 filt_hz1, filt_hz2);
1280 filt_hz1, filt_hz2);
1282 filt_hz1, filt_hz2);
1283 ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
1285 filt =
LD_SH(filter_vert);
1288 for (loop_cnt = (height >> 2); loop_cnt--;) {
1289 LD_SB4(src, src_stride, src3, src4, src5, src6);
1290 src += (4 * src_stride);
1295 filt_hz1, filt_hz2);
1296 vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1300 filt_hz1, filt_hz2);
1301 vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
1305 filt_hz1, filt_hz2);
1306 vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1310 filt_hz1, filt_hz2);
1311 ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2);
1318 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1319 dst += (4 * dst_stride);
1325 int height,
int mx,
int my)
1329 for (multiple8_cnt = 2; multiple8_cnt--;) {
1340 int height,
int mx,
int my)
1345 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1346 v16i8 filt_hz0, filt_hz1, mask0, mask1;
1348 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1349 v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
1350 v8i16
filt, filt_vt0, filt_vt1, filt_vt2;
1354 src -= (1 + 2 * src_stride);
1357 filt =
LD_SH(filter_horiz);
1362 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1363 src += (5 * src_stride);
1366 hz_out0 =
HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
1367 hz_out2 =
HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
1368 hz_out3 =
HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
1369 hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1370 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1372 filt =
LD_SH(filter_vert);
1373 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
1375 for (loop_cnt = (height >> 2); loop_cnt--;) {
1376 LD_SB4(src, src_stride, src5, src6, src7, src8);
1378 src += (4 * src_stride);
1380 hz_out5 =
HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
1381 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1382 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1383 tmp0 =
DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1385 hz_out7 =
HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
1386 hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
1387 out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1388 tmp1 =
DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
1393 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1394 dst += (4 * dst_stride);
1404 int height,
int mx,
int my)
1409 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1410 v16i8 filt_hz0, filt_hz1, mask0, mask1;
1411 v8i16
filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
1412 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1413 v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
1417 src -= (1 + 2 * src_stride);
1420 filt =
LD_SH(filter_horiz);
1425 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1426 src += (5 * src_stride);
1429 hz_out0 =
HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
1430 hz_out1 =
HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
1431 hz_out2 =
HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
1432 hz_out3 =
HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1433 hz_out4 =
HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1434 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1435 ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
1437 filt =
LD_SH(filter_vert);
1438 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
1440 for (loop_cnt = (height >> 2); loop_cnt--;) {
1441 LD_SB4(src, src_stride, src5, src6, src7, src8);
1442 src += (4 * src_stride);
1446 hz_out5 =
HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1447 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1448 tmp0 =
DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1450 hz_out6 =
HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1451 out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
1452 tmp1 =
DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
1454 hz_out7 =
HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
1455 out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1456 tmp2 =
DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
1458 hz_out8 =
HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
1459 out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
1460 tmp3 =
DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
1466 ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
1467 dst += (4 * dst_stride);
1479 int height,
int mx,
int my)
1483 for (multiple8_cnt = 2; multiple8_cnt--;) {
1497 v16u8 filt0, vec0, vec1, res0, res1;
1498 v8u16 vec2, vec3,
filt;
1503 filt =
LD_UH(filter);
1504 filt0 = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1506 LD_SB4(src, src_stride, src0, src1, src2, src3);
1507 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1508 DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
1511 ST_W2(res0, 0, 1, dst, dst_stride);
1512 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1519 v16u8 vec0, vec1, vec2, vec3, filt0;
1520 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7,
mask;
1521 v16i8 res0, res1, res2, res3;
1522 v8u16 vec4, vec5, vec6, vec7,
filt;
1527 filt =
LD_UH(filter);
1528 filt0 = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1530 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1531 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1532 VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
1533 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1534 vec4, vec5, vec6, vec7);
1536 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
1537 res0, res1, res2, res3);
1538 ST_W2(res0, 0, 1, dst, dst_stride);
1539 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1540 ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
1541 ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
1546 int height,
int mx,
int my)
1552 }
else if (8 == height) {
1563 v8u16 vec0, vec1, vec2, vec3,
filt;
1568 filt =
LD_UH(filter);
1569 filt0 = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1571 LD_SB4(src, src_stride, src0, src1, src2, src3);
1572 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1573 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1574 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1575 vec0, vec1, vec2, vec3);
1578 ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
1587 v8u16 vec0, vec1, vec2, vec3,
filt;
1592 filt =
LD_UH(filter);
1593 filt0 = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1595 LD_SB4(src, src_stride, src0, src1, src2, src3);
1596 src += (4 * src_stride);
1598 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1599 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1600 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1601 vec0, vec1, vec2, vec3);
1604 LD_SB4(src, src_stride, src0, src1, src2, src3);
1605 src += (4 * src_stride);
1608 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1610 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1611 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1612 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1613 vec0, vec1, vec2, vec3);
1616 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1617 dst += (8 * dst_stride);
1620 LD_SB4(src, src_stride, src0, src1, src2, src3);
1621 src += (4 * src_stride);
1623 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1624 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1625 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1626 vec0, vec1, vec2, vec3);
1628 LD_SB4(src, src_stride, src0, src1, src2, src3);
1629 src += (4 * src_stride);
1632 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1634 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1635 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1636 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1637 vec0, vec1, vec2, vec3);
1640 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1646 int height,
int mx,
int my)
1660 int height,
int mx,
int my)
1664 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7,
mask;
1665 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1666 v8u16 out0, out1, out2, out3, out4, out5, out6, out7,
filt;
1670 loop_cnt = (height >> 2) - 1;
1673 filt =
LD_UH(filter);
1674 filt0 = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1676 LD_SB4(src, src_stride, src0, src2, src4, src6);
1677 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1678 src += (4 * src_stride);
1680 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1681 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1682 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1683 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1684 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1685 out0, out1, out2, out3);
1686 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1687 out4, out5, out6, out7);
1699 for (; loop_cnt--;) {
1700 LD_SB4(src, src_stride, src0, src2, src4, src6);
1701 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1702 src += (4 * src_stride);
1704 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1705 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1706 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1707 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1708 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1709 out0, out1, out2, out3);
1710 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1711 out4, out5, out6, out7);
1729 v16i8
src0,
src1, src2, src3, src4;
1730 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
1735 filt =
LD_SH(filter);
1736 filt0 = (v16u8) __msa_splati_h(filt, 0);
1738 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1739 src += (5 * src_stride);
1741 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1742 src10_r, src21_r, src32_r, src43_r);
1743 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1744 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
1747 src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
1748 ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
1755 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1756 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
1757 v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
1758 v8u16 tmp0, tmp1, tmp2, tmp3;
1762 filt =
LD_SH(filter);
1763 filt0 = (v16u8) __msa_splati_h(filt, 0);
1765 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1766 src += (8 * src_stride);
1771 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1773 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1775 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1776 src87_r, src76_r, src2110, src4332, src6554, src8776);
1777 DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
1778 tmp0, tmp1, tmp2, tmp3);
1781 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
1782 ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1787 int height,
int mx,
int my)
1793 }
else if (8 == height) {
1802 v16u8
src0,
src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
1804 v8u16 tmp0, tmp1, tmp2, tmp3;
1808 filt =
LD_SH(filter);
1809 filt0 = (v16u8) __msa_splati_h(filt, 0);
1811 LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
1812 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
1813 ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
1814 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1815 tmp0, tmp1, tmp2, tmp3);
1819 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1827 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1828 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1830 v8u16 tmp0, tmp1, tmp2, tmp3;
1834 filt =
LD_SH(filter);
1835 filt0 = (v16u8) __msa_splati_h(filt, 0);
1840 for (loop_cnt = (height >> 3); loop_cnt--;) {
1841 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
1842 src += (8 * src_stride);
1844 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1845 vec0, vec1, vec2, vec3);
1846 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
1847 vec4, vec5, vec6, vec7);
1848 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1849 tmp0, tmp1, tmp2, tmp3);
1853 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1855 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1856 tmp0, tmp1, tmp2, tmp3);
1860 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1861 dst += (8 * dst_stride);
1869 int height,
int mx,
int my)
1883 int height,
int mx,
int my)
1887 v16u8
src0,
src1, src2, src3, src4;
1888 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1889 v8u16 tmp0, tmp1, tmp2, tmp3;
1893 filt =
LD_SH(filter);
1894 filt0 = (v16u8) __msa_splati_h(filt, 0);
1899 for (loop_cnt = (height >> 2); loop_cnt--;) {
1900 LD_UB4(src, src_stride, src1, src2, src3, src4);
1901 src += (4 * src_stride);
1903 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
1904 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
1905 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
1911 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
1912 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
1913 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
1919 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
1925 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
1937 const int8_t *filter_horiz,
1938 const int8_t *filter_vert)
1941 v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
1942 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4,
filt, tmp0, tmp1;
1947 filt =
LD_UH(filter_horiz);
1948 filt_hz = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1950 filt =
LD_UH(filter_vert);
1951 filt_vt = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1953 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1957 hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1958 hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
1960 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1961 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1965 ST_W2(res0, 0, 1, dst, dst_stride);
1966 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1971 const int8_t *filter_horiz,
1972 const int8_t *filter_vert)
1974 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8,
mask;
1975 v16i8 res0, res1, res2, res3;
1976 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
1977 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1978 v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7,
filt;
1983 filt =
LD_UH(filter_horiz);
1984 filt_hz = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1986 filt =
LD_UH(filter_vert);
1987 filt_vt = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1989 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1990 src += (8 * src_stride);
1998 SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
2000 hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
2002 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2003 ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2004 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
2005 vec4, vec5, vec6, vec7);
2008 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
2009 res0, res1, res2, res3);
2010 ST_W2(res0, 0, 1, dst, dst_stride);
2011 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
2012 ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
2013 ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
2018 int height,
int mx,
int my)
2025 filter_horiz, filter_vert);
2026 }
else if (8 == height) {
2028 filter_horiz, filter_vert);
2034 const int8_t *filter_horiz,
2035 const int8_t *filter_vert)
2038 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2039 v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
2045 filt =
LD_SH(filter_horiz);
2046 filt_hz = (v16u8) __msa_splati_h(filt, 0);
2048 filt =
LD_SH(filter_vert);
2049 filt_vt = (v16u8) __msa_splati_h(filt, 0);
2051 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2055 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2056 tmp0 = __msa_dotp_u_h(vec0, filt_vt);
2059 vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2060 tmp1 = __msa_dotp_u_h(vec1, filt_vt);
2063 vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2064 tmp2 = __msa_dotp_u_h(vec2, filt_vt);
2067 vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2068 tmp3 = __msa_dotp_u_h(vec3, filt_vt);
2073 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2078 const int8_t *filter_horiz,
2079 const int8_t *filter_vert,
2084 v16u8 filt_hz, filt_vt, vec0;
2085 v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
2091 filt =
LD_SH(filter_horiz);
2092 filt_hz = (v16u8) __msa_splati_h(filt, 0);
2094 filt =
LD_SH(filter_vert);
2095 filt_vt = (v16u8) __msa_splati_h(filt, 0);
2102 for (loop_cnt = (height >> 3); loop_cnt--;) {
2103 LD_SB4(src, src_stride, src1, src2, src3, src4);
2104 src += (4 * src_stride);
2107 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2108 tmp1 = __msa_dotp_u_h(vec0, filt_vt);
2111 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2112 tmp2 = __msa_dotp_u_h(vec0, filt_vt);
2118 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2119 tmp3 = __msa_dotp_u_h(vec0, filt_vt);
2122 LD_SB4(src, src_stride, src1, src2, src3, src4);
2123 src += (4 * src_stride);
2124 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2125 tmp4 = __msa_dotp_u_h(vec0, filt_vt);
2130 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2133 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2134 tmp5 = __msa_dotp_u_h(vec0, filt_vt);
2137 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2138 tmp6 = __msa_dotp_u_h(vec0, filt_vt);
2141 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2142 tmp7 = __msa_dotp_u_h(vec0, filt_vt);
2145 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2146 tmp8 = __msa_dotp_u_h(vec0, filt_vt);
2151 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2152 dst += (8 * dst_stride);
2158 int height,
int mx,
int my)
2165 filter_horiz, filter_vert);
2168 filter_horiz, filter_vert, height);
2174 int height,
int mx,
int my)
2179 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7,
mask;
2180 v16u8 filt_hz, filt_vt, vec0, vec1;
2181 v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
2187 filt =
LD_SH(filter_horiz);
2188 filt_hz = (v16u8) __msa_splati_h(filt, 0);
2190 filt =
LD_SH(filter_vert);
2191 filt_vt = (v16u8) __msa_splati_h(filt, 0);
2193 LD_SB2(src, 8, src0, src1);
2200 for (loop_cnt = (height >> 2); loop_cnt--;) {
2201 LD_SB4(src, src_stride, src0, src2, src4, src6);
2202 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2203 src += (4 * src_stride);
2207 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2208 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2216 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2217 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2225 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2226 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2234 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2235 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2245 int height,
int mx,
int my)
2248 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
2249 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
2251 if (0 == height % 8) {
2252 for (cnt = height >> 3; cnt--;) {
2254 src0, src1, src2, src3, src4, src5, src6, src7);
2255 src += (8 * src_stride);
2257 out0 = __msa_copy_u_d((v2i64) src0, 0);
2258 out1 = __msa_copy_u_d((v2i64) src1, 0);
2259 out2 = __msa_copy_u_d((v2i64) src2, 0);
2260 out3 = __msa_copy_u_d((v2i64) src3, 0);
2261 out4 = __msa_copy_u_d((v2i64) src4, 0);
2262 out5 = __msa_copy_u_d((v2i64) src5, 0);
2263 out6 = __msa_copy_u_d((v2i64) src6, 0);
2264 out7 = __msa_copy_u_d((v2i64) src7, 0);
2266 SD4(out0, out1, out2, out3, dst, dst_stride);
2267 dst += (4 * dst_stride);
2268 SD4(out4, out5, out6, out7, dst, dst_stride);
2269 dst += (4 * dst_stride);
2271 }
else if (0 == height % 4) {
2272 for (cnt = (height / 4); cnt--;) {
2273 LD_UB4(src, src_stride, src0, src1, src2, src3);
2274 src += (4 * src_stride);
2275 out0 = __msa_copy_u_d((v2i64) src0, 0);
2276 out1 = __msa_copy_u_d((v2i64) src1, 0);
2277 out2 = __msa_copy_u_d((v2i64) src2, 0);
2278 out3 = __msa_copy_u_d((v2i64) src3, 0);
2280 SD4(out0, out1, out2, out3, dst, dst_stride);
2281 dst += (4 * dst_stride);
2292 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
2294 for (cnt = (width >> 4); cnt--;) {
2298 for (loop_cnt = (height >> 3); loop_cnt--;) {
2299 LD_UB8(src_tmp, src_stride,
2300 src0, src1, src2, src3, src4, src5, src6, src7);
2301 src_tmp += (8 * src_stride);
2303 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
2304 dst_tmp, dst_stride);
2305 dst_tmp += (8 * dst_stride);
2315 int height,
int mx,
int my)
2320 if (0 == height % 8) {
2322 }
else if (0 == height % 4) {
2323 for (cnt = (height >> 2); cnt--;) {
2324 LD_UB4(src, src_stride, src0, src1, src2, src3);
2325 src += (4 * src_stride);
2327 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
2328 dst += (4 * dst_stride);
void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define XORI_B5_128_SB(...)
#define XORI_B8_128_SB(...)
#define SPLATI_H3_SH(...)
void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
#define XORI_B2_128_SB(...)
#define PCKEV_XORI128_UB(in0, in1)
#define XORI_B3_128_SB(...)
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)
void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define SPLATI_H2_SH(...)
static void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)
#define XORI_B4_128_UB(...)
static void common_hv_2ht_2vt_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void filter(int16_t *output, ptrdiff_t out_stride, int16_t *low, ptrdiff_t low_stride, int16_t *high, ptrdiff_t high_stride, int len, int clip)
#define PCKEV_ST_SB(in0, in1, pdst)
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, filt1, filt2, out0, out1, out2, out3)
void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
VP8 compatible video decoder.
static void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define XORI_B2_128_UB(...)
void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static const int8_t bilinear_filters_msa[7][2]
void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t width)
void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static const int8_t subpel_filters_msa[7][8]
void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static const uint16_t mask[17]
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, out0, out1)
static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
#define SPLATI_H2_SB(...)
void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define XORI_B4_128_SB(...)
void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, filt_h2)
void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)
static const uint8_t mc_filt_mask_arr[16 *3]
void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)
static void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, out0, out1, out2, out3)
void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define SD4(in0, in1, in2, in3, pdst, stride)
static const int8_t filt[NUMTAPS]
void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, filt1, filt2, out0, out1)
#define SPLATI_H3_SB(...)
static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define ST_W2(in, idx0, idx1, pdst, stride)
void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void common_hv_2ht_2vt_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)