/*
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *                Xiwei Gu   <guxiwei-hf@loongson.cn>
 *                Lu Wang    <wanglu@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#ifndef AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H |
|
#define AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H |

/*
 * Helper functions and macros built on top of the LoongArch SIMD intrinsics:
 * LSX (128-bit vectors, <lsxintrin.h>) and LASX (256-bit vectors,
 * <lasxintrin.h>).
 */
#ifndef LOONGSON_INTRINSICS_H |
|
#define LOONGSON_INTRINSICS_H |

/*
 * MAJOR version: Macro usage changes.
 * MINOR version: Add new functions, or bug fixes.
 * MICRO version: Comment changes or implementation changes.
 */
#define LSOM_VERSION_MAJOR 1
#define LSOM_VERSION_MINOR 1
#define LSOM_VERSION_MICRO 0
|
/*
 * The DUPn_ARGm utility macros issue the same instruction _INS over n
 * independent argument tuples of m arguments each, writing n outputs.
 * Note: the original double-spaced layout broke the backslash line
 * continuations; the definitions below restore them.
 */
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
  {                                                \
    _OUT0 = _INS(_IN0);                            \
    _OUT1 = _INS(_IN1);                            \
  }

#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
  {                                                            \
    _OUT0 = _INS(_IN0, _IN1);                                  \
    _OUT1 = _INS(_IN2, _IN3);                                  \
  }

#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
  {                                                                        \
    _OUT0 = _INS(_IN0, _IN1, _IN2);                                        \
    _OUT1 = _INS(_IN3, _IN4, _IN5);                                        \
  }

#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
  {                                                                          \
    DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1);                               \
    DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3);                               \
  }

#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
                  _OUT1, _OUT2, _OUT3)                                         \
  {                                                                            \
    DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1);                     \
    DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3);                     \
  }

#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)             \
  {                                                                           \
    DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1);        \
    DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3);      \
  }
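
/*
 * Usage sketch (illustrative, with hypothetical variables; assumes LSX is
 * enabled so that __lsx_vld and __m128i are available): load four 128-bit
 * vectors in one invocation.
 *   __m128i v0, v1, v2, v3;
 *   DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, v0, v1, v2, v3);
 * expands to v0 = __lsx_vld(src, 0); ... v3 = __lsx_vld(src, 48);
 */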
|
#ifdef __loongarch_sx |
|
#include <lsxintrin.h> |
|
/*
 * Dot product with accumulate: signed byte elements of in_h and in_l are
 * multiplied pairwise, adjacent products are summed into halfwords, and
 * the result is added to the signed halfword elements of in_c:
 *   out[i] = in_c[i] + in_h[2i] * in_l[2i] + in_h[2i+1] * in_l[2i+1]
 */
static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
                                        __m128i in_l) {
  __m128i out;

  out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
  return out;
}
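
/*
 * Worked example (hypothetical values):
 *   in_c = {1,2,3,4, 1,2,3,4}
 *   in_h = {1,2,3,4,5,6,7,8, 1,2,3,4,5,6,7,8}
 *   in_l = {8,7,6,5,4,3,2,1, 8,7,6,5,4,3,2,1}
 *   out  = {23,40,41,26, 23,40,41,26}   e.g. 23 = 1 + 1*8 + 2*7
 */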
|
/*
 * Same as __lsx_vdp2add_h_b, but in_h and in_l hold unsigned byte elements.
 */
static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
                                         __m128i in_l) {
  __m128i out;

  out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
  return out;
}
|
/*
 * Dot product with accumulate of unsigned byte elements from in_h
 * multiplied by signed byte elements from in_l.
 */
static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
                                           __m128i in_l) {
  __m128i out;

  out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
  return out;
}
|
/*
 * Dot product with accumulate of signed halfwords: adjacent products are
 * summed into words and added to the signed word elements of in_c.
 */
static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
                                        __m128i in_l) {
  __m128i out;

  out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
  return out;
}
|
/*
 * Dot product of signed byte elements:
 *   out[i] = in_h[2i] * in_l[2i] + in_h[2i+1] * in_l[2i+1]
 */
static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
  __m128i out;

  out = __lsx_vmulwev_h_b(in_h, in_l);
  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
  return out;
}
|
/*
 * Dot product of unsigned byte elements, producing halfwords.
 */
static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
  __m128i out;

  out = __lsx_vmulwev_h_bu(in_h, in_l);
  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
  return out;
}
|
/*
 * Dot product of unsigned byte elements from in_h by signed byte elements
 * from in_l, producing halfwords.
 */
static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
  __m128i out;

  out = __lsx_vmulwev_h_bu_b(in_h, in_l);
  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
  return out;
}
|
/*
 * Dot product of signed halfword elements, producing signed words.
 */
static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
  __m128i out;

  out = __lsx_vmulwev_w_h(in_h, in_l);
  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
  return out;
}
|
/*
 * Clamp each signed halfword element of _in to the per-element range
 * [min, max].
 */
static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
  __m128i out;

  out = __lsx_vmax_h(min, _in);
  out = __lsx_vmin_h(max, out);
  return out;
}
|
/*
 * Clamp each signed halfword element of _in to [0, 255]: negatives become
 * 0, larger values saturate (vsat_hu with n = 7 keeps 2^(7+1) - 1 = 255
 * as the maximum).
 */
static inline __m128i __lsx_vclip255_h(__m128i _in) {
  __m128i out;

  out = __lsx_vmaxi_h(_in, 0);
  out = __lsx_vsat_hu(out, 7);
  return out;
}
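
/*
 * Worked example (hypothetical values):
 *   _in = {-8, 300, 100, -1, 0, 255, 256, 1000}
 *   out = { 0, 255, 100,  0, 0, 255, 255,  255}
 */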
|
/*
 * Clamp each signed word element of _in to [0, 255].
 */
static inline __m128i __lsx_vclip255_w(__m128i _in) {
  __m128i out;

  out = __lsx_vmaxi_w(_in, 0);
  out = __lsx_vsat_wu(out, 7);
  return out;
}
|
/*
 * Swap the contents of two vector variables in place using the XOR-swap
 * identity, avoiding a temporary register.
 */
#define LSX_SWAP(_in0, _in1)           \
  {                                    \
    _in0 = __lsx_vxor_v(_in0, _in1);   \
    _in1 = __lsx_vxor_v(_in0, _in1);   \
    _in0 = __lsx_vxor_v(_in0, _in1);   \
  }
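
/*
 * Usage sketch (hypothetical values): after
 *   __m128i a = __lsx_vreplgr2vr_w(1);
 *   __m128i b = __lsx_vreplgr2vr_w(2);
 *   LSX_SWAP(a, b);
 * a holds all 2s and b all 1s. The usual XOR-swap caveat applies: invoking
 * it with both arguments naming the same variable zeroes that variable.
 */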
|
/*
 * Transpose a 4x4 block of word (32-bit) elements; each _inN holds one row
 * of four elements and each _outN receives one transposed row.
 */
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m128i _t0, _t1, _t2, _t3;                                                \
                                                                               \
    _t0 = __lsx_vilvl_w(_in1, _in0);                                           \
    _t1 = __lsx_vilvh_w(_in1, _in0);                                           \
    _t2 = __lsx_vilvl_w(_in3, _in2);                                           \
    _t3 = __lsx_vilvh_w(_in3, _in2);                                           \
    _out0 = __lsx_vilvl_d(_t2, _t0);                                           \
    _out1 = __lsx_vilvh_d(_t2, _t0);                                           \
    _out2 = __lsx_vilvl_d(_t3, _t1);                                           \
    _out3 = __lsx_vilvh_d(_t3, _t1);                                           \
  }
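
/*
 * Worked example (hypothetical values): with rows
 *   _in0 = {1,2,3,4}, _in1 = {5,6,7,8}, _in2 = {9,10,11,12}, _in3 = {13,14,15,16}
 * the outputs are the columns
 *   _out0 = {1,5,9,13}, _out1 = {2,6,10,14}, _out2 = {3,7,11,15}, _out3 = {4,8,12,16}
 */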
|
/*
 * Transpose an 8x8 block of byte elements: each input row sits in the low
 * 8 bytes of _in0.._in7 and each output row lands in the low 8 bytes of
 * _out0.._out7. The shuf8 mask extracts the upper 8 bytes of an
 * interleaved result (zero-filling the rest) to form each odd output row.
 */
#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6,  \
                           _out7)                                            \
  {                                                                          \
    __m128i zero = { 0 };                                                    \
    __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };              \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                          \
                                                                             \
    _t0 = __lsx_vilvl_b(_in2, _in0);                                         \
    _t1 = __lsx_vilvl_b(_in3, _in1);                                         \
    _t2 = __lsx_vilvl_b(_in6, _in4);                                         \
    _t3 = __lsx_vilvl_b(_in7, _in5);                                         \
    _t4 = __lsx_vilvl_b(_t1, _t0);                                           \
    _t5 = __lsx_vilvh_b(_t1, _t0);                                           \
    _t6 = __lsx_vilvl_b(_t3, _t2);                                           \
    _t7 = __lsx_vilvh_b(_t3, _t2);                                           \
    _out0 = __lsx_vilvl_w(_t6, _t4);                                         \
    _out2 = __lsx_vilvh_w(_t6, _t4);                                         \
    _out4 = __lsx_vilvl_w(_t7, _t5);                                         \
    _out6 = __lsx_vilvh_w(_t7, _t5);                                         \
    _out1 = __lsx_vshuf_b(zero, _out0, shuf8);                               \
    _out3 = __lsx_vshuf_b(zero, _out2, shuf8);                               \
    _out5 = __lsx_vshuf_b(zero, _out4, shuf8);                               \
    _out7 = __lsx_vshuf_b(zero, _out6, shuf8);                               \
  }
|
/*
 * Transpose an 8x8 block of halfword elements held in _in0.._in7,
 * writing the transposed rows to _out0.._out7.
 */
#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6,  \
                           _out7)                                            \
  {                                                                          \
    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                \
                                                                             \
    _s0 = __lsx_vilvl_h(_in6, _in4);                                         \
    _s1 = __lsx_vilvl_h(_in7, _in5);                                         \
    _t0 = __lsx_vilvl_h(_s1, _s0);                                           \
    _t1 = __lsx_vilvh_h(_s1, _s0);                                           \
    _s0 = __lsx_vilvh_h(_in6, _in4);                                         \
    _s1 = __lsx_vilvh_h(_in7, _in5);                                         \
    _t2 = __lsx_vilvl_h(_s1, _s0);                                           \
    _t3 = __lsx_vilvh_h(_s1, _s0);                                           \
    _s0 = __lsx_vilvl_h(_in2, _in0);                                         \
    _s1 = __lsx_vilvl_h(_in3, _in1);                                         \
    _t4 = __lsx_vilvl_h(_s1, _s0);                                           \
    _t5 = __lsx_vilvh_h(_s1, _s0);                                           \
    _s0 = __lsx_vilvh_h(_in2, _in0);                                         \
    _s1 = __lsx_vilvh_h(_in3, _in1);                                         \
    _t6 = __lsx_vilvl_h(_s1, _s0);                                           \
    _t7 = __lsx_vilvh_h(_s1, _s0);                                           \
                                                                             \
    _out0 = __lsx_vpickev_d(_t0, _t4);                                       \
    _out2 = __lsx_vpickev_d(_t1, _t5);                                       \
    _out4 = __lsx_vpickev_d(_t2, _t6);                                       \
    _out6 = __lsx_vpickev_d(_t3, _t7);                                       \
    _out1 = __lsx_vpickod_d(_t0, _t4);                                       \
    _out3 = __lsx_vpickod_d(_t1, _t5);                                       \
    _out5 = __lsx_vpickod_d(_t2, _t6);                                       \
    _out7 = __lsx_vpickod_d(_t3, _t7);                                       \
  }
|
/*
 * Transpose an 8x4 byte block: the low 4 bytes of _in0.._in7 form the
 * eight rows; each of the four outputs receives one transposed row of
 * 8 bytes.
 */
#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                           _out0, _out1, _out2, _out3)                       \
  {                                                                          \
    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
                                                                             \
    _tmp0_m = __lsx_vpackev_w(_in4, _in0);                                   \
    _tmp1_m = __lsx_vpackev_w(_in5, _in1);                                   \
    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                               \
    _tmp0_m = __lsx_vpackev_w(_in6, _in2);                                   \
    _tmp1_m = __lsx_vpackev_w(_in7, _in3);                                   \
                                                                             \
    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                               \
    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                               \
    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                               \
                                                                             \
    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                                 \
    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                                 \
    _out1 = __lsx_vilvh_d(_out2, _out0);                                     \
    _out3 = __lsx_vilvh_d(_out0, _out2);                                     \
  }
|
/*
 * Transpose a 16x8 block of byte elements: the low 8 bytes of _in0.._in15
 * form the 16 rows; the eight 16-byte outputs are the transposed rows.
 */
#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                            _out6, _out7)                                    \
  {                                                                          \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                          \
    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
              _tmp0, _tmp1, _tmp2, _tmp3);                                   \
    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,  \
              _in13, _tmp4, _tmp5, _tmp6, _tmp7);                            \
    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);          \
    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);          \
    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);          \
    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);          \
    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);              \
    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);              \
    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);              \
    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);              \
    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);      \
    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);      \
    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);      \
    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);      \
  }
|
/*
 * Butterfly of four input vectors (element-wise, per type suffix):
 *   _out0 = _in0 + _in3;  _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;  _out3 = _in0 - _in3;
 */
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    _out0 = __lsx_vadd_b(_in0, _in3);                                         \
    _out1 = __lsx_vadd_b(_in1, _in2);                                         \
    _out2 = __lsx_vsub_b(_in1, _in2);                                         \
    _out3 = __lsx_vsub_b(_in0, _in3);                                         \
  }
#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    _out0 = __lsx_vadd_h(_in0, _in3);                                         \
    _out1 = __lsx_vadd_h(_in1, _in2);                                         \
    _out2 = __lsx_vsub_h(_in1, _in2);                                         \
    _out3 = __lsx_vsub_h(_in0, _in3);                                         \
  }
#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    _out0 = __lsx_vadd_w(_in0, _in3);                                         \
    _out1 = __lsx_vadd_w(_in1, _in2);                                         \
    _out2 = __lsx_vsub_w(_in1, _in2);                                         \
    _out3 = __lsx_vsub_w(_in0, _in3);                                         \
  }
#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    _out0 = __lsx_vadd_d(_in0, _in3);                                         \
    _out1 = __lsx_vadd_d(_in1, _in2);                                         \
    _out2 = __lsx_vsub_d(_in1, _in2);                                         \
    _out3 = __lsx_vsub_d(_in0, _in3);                                         \
  }
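
/*
 * Worked example for LSX_BUTTERFLY_4_H (hypothetical constant vectors):
 * with every halfword lane of _in0.._in3 holding 1, 2, 3, 4 respectively,
 * every lane of _out0.._out3 holds 5, 5, -1, -3.
 */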
|
/*
 * Butterfly of eight input vectors (byte elements):
 *   _outk = _ink + _in(7-k) for k = 0..3,
 *   _outk = _in(7-k) - _ink for k = 4..7.
 */
#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6,  \
                          _out7)                                            \
  {                                                                         \
    _out0 = __lsx_vadd_b(_in0, _in7);                                       \
    _out1 = __lsx_vadd_b(_in1, _in6);                                       \
    _out2 = __lsx_vadd_b(_in2, _in5);                                       \
    _out3 = __lsx_vadd_b(_in3, _in4);                                       \
    _out4 = __lsx_vsub_b(_in3, _in4);                                       \
    _out5 = __lsx_vsub_b(_in2, _in5);                                       \
    _out6 = __lsx_vsub_b(_in1, _in6);                                       \
    _out7 = __lsx_vsub_b(_in0, _in7);                                       \
  }
|
/*
 * Same butterfly on halfword elements.
 */
#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6,  \
                          _out7)                                            \
  {                                                                         \
    _out0 = __lsx_vadd_h(_in0, _in7);                                       \
    _out1 = __lsx_vadd_h(_in1, _in6);                                       \
    _out2 = __lsx_vadd_h(_in2, _in5);                                       \
    _out3 = __lsx_vadd_h(_in3, _in4);                                       \
    _out4 = __lsx_vsub_h(_in3, _in4);                                       \
    _out5 = __lsx_vsub_h(_in2, _in5);                                       \
    _out6 = __lsx_vsub_h(_in1, _in6);                                       \
    _out7 = __lsx_vsub_h(_in0, _in7);                                       \
  }
|
/*
 * Same butterfly on word elements.
 */
#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6,  \
                          _out7)                                            \
  {                                                                         \
    _out0 = __lsx_vadd_w(_in0, _in7);                                       \
    _out1 = __lsx_vadd_w(_in1, _in6);                                       \
    _out2 = __lsx_vadd_w(_in2, _in5);                                       \
    _out3 = __lsx_vadd_w(_in3, _in4);                                       \
    _out4 = __lsx_vsub_w(_in3, _in4);                                       \
    _out5 = __lsx_vsub_w(_in2, _in5);                                       \
    _out6 = __lsx_vsub_w(_in1, _in6);                                       \
    _out7 = __lsx_vsub_w(_in0, _in7);                                       \
  }
|
/*
 * Same butterfly on doubleword elements.
 */
#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6,  \
                          _out7)                                            \
  {                                                                         \
    _out0 = __lsx_vadd_d(_in0, _in7);                                       \
    _out1 = __lsx_vadd_d(_in1, _in6);                                       \
    _out2 = __lsx_vadd_d(_in2, _in5);                                       \
    _out3 = __lsx_vadd_d(_in3, _in4);                                       \
    _out4 = __lsx_vsub_d(_in3, _in4);                                       \
    _out5 = __lsx_vsub_d(_in2, _in5);                                       \
    _out6 = __lsx_vsub_d(_in1, _in6);                                       \
    _out7 = __lsx_vsub_d(_in0, _in7);                                       \
  }
|
#endif /* __loongarch_sx */
|
#ifdef __loongarch_asx |
|
#include <lasxintrin.h> |
|
/*
 * Dot product of unsigned byte elements (256-bit):
 *   out[i] = in_h[2i] * in_l[2i] + in_h[2i+1] * in_l[2i+1]
 */
static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
  __m256i out;

  out = __lasx_xvmulwev_h_bu(in_h, in_l);
  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
  return out;
}
|
/*
 * Dot product of signed byte elements, producing halfwords.
 */
static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
  __m256i out;

  out = __lasx_xvmulwev_h_b(in_h, in_l);
  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
  return out;
}
|
/*
 * Dot product of signed halfword elements, producing words.
 */
static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
  __m256i out;

  out = __lasx_xvmulwev_w_h(in_h, in_l);
  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
  return out;
}
|
/*
 * Dot product of signed word elements, producing doublewords.
 */
static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
  __m256i out;

  out = __lasx_xvmulwev_d_w(in_h, in_l);
  out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
  return out;
}
|
/*
 * Dot product of unsigned halfwords (in_h) by signed halfwords (in_l),
 * producing words.
 */
static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
  __m256i out;

  out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
  return out;
}
|
/*
 * Dot product with accumulate of signed byte elements:
 *   out[i] = in_c[i] + in_h[2i] * in_l[2i] + in_h[2i+1] * in_l[2i+1]
 */
static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  __m256i out;

  out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
  return out;
}
|
/*
 * Same as __lasx_xvdp2add_h_b, but with unsigned byte elements.
 */
static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
                                           __m256i in_l) {
  __m256i out;

  out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
  return out;
}
|
/*
 * Dot product with accumulate of unsigned bytes (in_h) by signed bytes
 * (in_l).
 */
static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
                                             __m256i in_l) {
  __m256i out;

  out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
  out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
  return out;
}
|
/*
 * Dot product with accumulate of signed halfwords into signed words.
 */
static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  __m256i out;

  out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
  return out;
}
|
/*
 * Dot product with accumulate of unsigned halfwords into words.
 */
static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
                                           __m256i in_l) {
  __m256i out;

  out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
  out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
  return out;
}
|
/*
 * Dot product with accumulate of unsigned halfwords (in_h) by signed
 * halfwords (in_l) into words.
 */
static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
                                             __m256i in_l) {
  __m256i out;

  out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
  return out;
}
|
/*
 * Subtract a dot product of unsigned byte elements from in_c:
 *   out[i] = in_c[i] - (in_h[2i] * in_l[2i] + in_h[2i+1] * in_l[2i+1])
 */
static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
                                           __m256i in_l) {
  __m256i out;

  out = __lasx_xvmulwev_h_bu(in_h, in_l);
  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
  out = __lasx_xvsub_h(in_c, out);
  return out;
}
|
/*
 * Subtract a dot product of signed halfword elements from the word
 * elements of in_c.
 */
static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  __m256i out;

  out = __lasx_xvmulwev_w_h(in_h, in_l);
  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
  out = __lasx_xvsub_w(in_c, out);
  return out;
}
|
/*
 * Four-element dot product of signed halfwords: each doubleword
 *   out[i] = sum of in_h[4i+k] * in_l[4i+k] for k = 0..3.
 */
static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
  __m256i out;

  out = __lasx_xvmulwev_w_h(in_h, in_l);
  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
  out = __lasx_xvhaddw_d_w(out, out);
  return out;
}
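
/*
 * Worked example (hypothetical values): with in_h holding 1 in every
 * halfword lane and in_l = {1,2,...,16}, out = {10, 26, 42, 58}
 * (e.g. 10 = 1 + 2 + 3 + 4).
 */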
|
/*
 * Widening add of the high-half byte elements, per 128-bit lane:
 *   out[i] = (int16_t)in_h[8+i] + (int16_t)in_l[8+i]
 */
static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
  __m256i out;

  out = __lasx_xvilvh_b(in_h, in_l);
  out = __lasx_xvhaddw_h_b(out, out);
  return out;
}
|
/*
 * Widening add of the high-half halfword elements, per 128-bit lane,
 * producing words.
 */
static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
  __m256i out;

  out = __lasx_xvilvh_h(in_h, in_l);
  out = __lasx_xvhaddw_w_h(out, out);
  return out;
}
|
/*
 * Widening add of the low-half signed byte elements, per 128-bit lane:
 *   out[i] = (int16_t)in_h[i] + (int16_t)in_l[i]
 */
static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
  __m256i out;

  out = __lasx_xvilvl_b(in_h, in_l);
  out = __lasx_xvhaddw_h_b(out, out);
  return out;
}
|
/*
 * Widening add of the low-half signed halfword elements, per 128-bit lane,
 * producing words.
 */
static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
  __m256i out;

  out = __lasx_xvilvl_h(in_h, in_l);
  out = __lasx_xvhaddw_w_h(out, out);
  return out;
}
|
/*
 * Widening add of the low-half unsigned byte elements, per 128-bit lane.
 */
static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
  __m256i out;

  out = __lasx_xvilvl_b(in_h, in_l);
  out = __lasx_xvhaddw_hu_bu(out, out);
  return out;
}
|
/*
 * Add the halfword elements of in_h to the zero-extended low-half byte
 * elements of in_l (per 128-bit lane).
 */
static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
  __m256i out;

  out = __lasx_xvsllwil_hu_bu(in_l, 0);
  out = __lasx_xvadd_h(in_h, out);
  return out;
}
|
/*
 * Add the word elements of in_h to the sign-extended low-half halfword
 * elements of in_l (per 128-bit lane).
 */
static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
  __m256i out;

  out = __lasx_xvsllwil_w_h(in_l, 0);
  out = __lasx_xvadd_w(in_h, out);
  return out;
}
|
/*
 * Multiply the sign-extended low-half halfword elements of in_h and in_l
 * (per 128-bit lane) and accumulate the word products into in_c.
 */
static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  __m256i tmp0, tmp1, out;

  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
  tmp0 = __lasx_xvmul_w(tmp0, tmp1);
  out = __lasx_xvadd_w(tmp0, in_c);
  return out;
}
|
/*
 * Multiply the high-half halfword elements of in_h and in_l (per 128-bit
 * lane, widened to words) and accumulate into in_c.
 */
static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  __m256i tmp0, tmp1, out;

  tmp0 = __lasx_xvilvh_h(in_h, in_h);
  tmp1 = __lasx_xvilvh_h(in_l, in_l);
  tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
  out = __lasx_xvadd_w(tmp0, in_c);
  return out;
}
|
/*
 * Multiply the sign-extended low-half halfword elements of in_h and in_l
 * (per 128-bit lane), producing word products.
 */
static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
  __m256i tmp0, tmp1, out;

  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
  out = __lasx_xvmul_w(tmp0, tmp1);
  return out;
}
|
/*
 * Multiply the high-half halfword elements of in_h and in_l (per 128-bit
 * lane), producing word products.
 */
static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
  __m256i tmp0, tmp1, out;

  tmp0 = __lasx_xvilvh_h(in_h, in_h);
  tmp1 = __lasx_xvilvh_h(in_l, in_l);
  out = __lasx_xvmulwev_w_h(tmp0, tmp1);
  return out;
}
|
/*
 * Saturating unsigned add: unsigned halfword elements of in_h plus the
 * zero-extended low-half byte elements of in_l (per 128-bit lane).
 */
static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
  __m256i tmp1, out;
  __m256i zero = { 0 };

  tmp1 = __lasx_xvilvl_b(zero, in_l);
  out = __lasx_xvsadd_hu(in_h, tmp1);
  return out;
}
|
/*
 * Clamp each signed halfword element of in to the per-element range
 * [min, max].
 */
static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
  __m256i out;

  out = __lasx_xvmax_h(min, in);
  out = __lasx_xvmin_h(max, out);
  return out;
}
|
/*
 * Clamp each signed halfword element of in to [0, 255].
 */
static inline __m256i __lasx_xvclip255_h(__m256i in) {
  __m256i out;

  out = __lasx_xvmaxi_h(in, 0);
  out = __lasx_xvsat_hu(out, 7);
  return out;
}
|
/*
 * Clamp each signed word element of in to [0, 255].
 */
static inline __m256i __lasx_xvclip255_w(__m256i in) {
  __m256i out;

  out = __lasx_xvmaxi_w(in, 0);
  out = __lasx_xvsat_wu(out, 7);
  return out;
}
|
/*
 * Replicate the halfword element at index idx of the low 128-bit half of
 * in across all halfword lanes of the result.
 */
static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
  __m256i out;

  out = __lasx_xvpermi_q(in, in, 0x02);
  out = __lasx_xvreplve_h(out, idx);
  return out;
}
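
/*
 * Usage sketch (hypothetical values): with in = {0,1,...,15} as halfwords,
 * __lasx_xvsplati_l_h(in, 3) yields 3 in all 16 lanes, while
 * __lasx_xvsplati_h_h(in, 3), defined below, selects from the high
 * 128-bit half and yields 11.
 */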
|
/*
 * Replicate the halfword element at index idx of the high 128-bit half of
 * in across all halfword lanes of the result.
 */
static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
  __m256i out;

  out = __lasx_xvpermi_q(in, in, 0x13);
  out = __lasx_xvreplve_h(out, idx);
  return out;
}
|
/*
 * Transpose a 4x4 block of doubleword (64-bit) elements; each _inN holds
 * one row of four elements.
 */
#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2,   \
                            _out3)                                         \
  {                                                                        \
    __m256i _tmp0, _tmp1, _tmp2, _tmp3;                                    \
    _tmp0 = __lasx_xvilvl_d(_in1, _in0);                                   \
    _tmp1 = __lasx_xvilvh_d(_in1, _in0);                                   \
    _tmp2 = __lasx_xvilvl_d(_in3, _in2);                                   \
    _tmp3 = __lasx_xvilvh_d(_in3, _in2);                                   \
    _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20);                          \
    _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31);                          \
    _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20);                          \
    _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31);                          \
  }
|
/*
 * Transpose an 8x8 block of word (32-bit) elements; each _inN holds one
 * row of eight elements.
 */
#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    __m256i _s0_m, _s1_m;                                                    \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
                                                                             \
    _s0_m = __lasx_xvilvl_w(_in2, _in0);                                     \
    _s1_m = __lasx_xvilvl_w(_in3, _in1);                                     \
    _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    _s0_m = __lasx_xvilvh_w(_in2, _in0);                                     \
    _s1_m = __lasx_xvilvh_w(_in3, _in1);                                     \
    _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    _s0_m = __lasx_xvilvl_w(_in6, _in4);                                     \
    _s1_m = __lasx_xvilvl_w(_in7, _in5);                                     \
    _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    _s0_m = __lasx_xvilvh_w(_in6, _in4);                                     \
    _s1_m = __lasx_xvilvh_w(_in7, _in5);                                     \
    _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20);                        \
    _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20);                        \
    _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20);                        \
    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20);                        \
    _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31);                        \
    _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31);                        \
    _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31);                        \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31);                        \
  }
|
/*
 * Transpose a 16x8 block of byte elements: the low 8 bytes of each
 * 128-bit lane of _in0.._in15 form the rows; the transposed rows land in
 * _out0.._out7.
 */
#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,  \
                             _in15, _out0, _out1, _out2, _out3, _out4,       \
                             _out5, _out6, _out7)                            \
  {                                                                          \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
                                                                             \
    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                   \
    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                   \
    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                   \
    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                   \
    _tmp4_m = __lasx_xvilvl_b(_in10, _in8);                                  \
    _tmp5_m = __lasx_xvilvl_b(_in11, _in9);                                  \
    _tmp6_m = __lasx_xvilvl_b(_in14, _in12);                                 \
    _tmp7_m = __lasx_xvilvl_b(_in15, _in13);                                 \
    _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                               \
    _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                               \
    _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                               \
    _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                               \
    _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m);                               \
    _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m);                               \
    _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m);                               \
    _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m);                               \
    _tmp0_m = __lasx_xvilvl_w(_out2, _out0);                                 \
    _tmp2_m = __lasx_xvilvh_w(_out2, _out0);                                 \
    _tmp4_m = __lasx_xvilvl_w(_out3, _out1);                                 \
    _tmp6_m = __lasx_xvilvh_w(_out3, _out1);                                 \
    _tmp1_m = __lasx_xvilvl_w(_out6, _out4);                                 \
    _tmp3_m = __lasx_xvilvh_w(_out6, _out4);                                 \
    _tmp5_m = __lasx_xvilvl_w(_out7, _out5);                                 \
    _tmp7_m = __lasx_xvilvh_w(_out7, _out5);                                 \
    _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m);                               \
    _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m);                               \
    _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m);                               \
    _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m);                               \
    _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m);                               \
    _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m);                               \
    _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m);                               \
    _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m);                               \
  }
|
/*
 * Transpose a 16x8 block of halfword elements from _in0.._in15 into
 * _out0.._out7; the low and high halves of the inputs are processed in two
 * symmetric passes.
 */
#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,  \
                             _in15, _out0, _out1, _out2, _out3, _out4,       \
                             _out5, _out6, _out7)                            \
  {                                                                          \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                          \
                                                                             \
    _tmp0_m = __lasx_xvilvl_h(_in2, _in0);                                   \
    _tmp1_m = __lasx_xvilvl_h(_in3, _in1);                                   \
    _tmp2_m = __lasx_xvilvl_h(_in6, _in4);                                   \
    _tmp3_m = __lasx_xvilvl_h(_in7, _in5);                                   \
    _tmp4_m = __lasx_xvilvl_h(_in10, _in8);                                  \
    _tmp5_m = __lasx_xvilvl_h(_in11, _in9);                                  \
    _tmp6_m = __lasx_xvilvl_h(_in14, _in12);                                 \
    _tmp7_m = __lasx_xvilvl_h(_in15, _in13);                                 \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                 \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                 \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                 \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                 \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                 \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                 \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                 \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                 \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                     \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                     \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                     \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                     \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                     \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                     \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                     \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                     \
    _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                        \
    _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                        \
    _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                        \
    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                        \
                                                                             \
    _tmp0_m = __lasx_xvilvh_h(_in2, _in0);                                   \
    _tmp1_m = __lasx_xvilvh_h(_in3, _in1);                                   \
    _tmp2_m = __lasx_xvilvh_h(_in6, _in4);                                   \
    _tmp3_m = __lasx_xvilvh_h(_in7, _in5);                                   \
    _tmp4_m = __lasx_xvilvh_h(_in10, _in8);                                  \
    _tmp5_m = __lasx_xvilvh_h(_in11, _in9);                                  \
    _tmp6_m = __lasx_xvilvh_h(_in14, _in12);                                 \
    _tmp7_m = __lasx_xvilvh_h(_in15, _in13);                                 \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                 \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                 \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                 \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                 \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                 \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                 \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                 \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                 \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                     \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                     \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                     \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                     \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                     \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                     \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                     \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                     \
    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                        \
    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                        \
    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                        \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                        \
  }
|
/*
 * Transpose a 4x4 block of halfword elements; each row sits in the low
 * four halfwords of its input/output vector.
 */
#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2,   \
                            _out3)                                         \
  {                                                                        \
    __m256i _s0_m, _s1_m;                                                  \
                                                                           \
    _s0_m = __lasx_xvilvl_h(_in1, _in0);                                   \
    _s1_m = __lasx_xvilvl_h(_in3, _in2);                                   \
    _out0 = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _out2 = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    _out1 = __lasx_xvilvh_d(_out0, _out0);                                 \
    _out3 = __lasx_xvilvh_d(_out2, _out2);                                 \
  }
|
/*
 * Transpose an 8x8 byte block held in the low 8 bytes of each input; the
 * odd output rows are obtained by shifting the even ones right by 8 bytes
 * (xvbsrl_v).
 */
#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                   \
    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                   \
    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                   \
    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                   \
    _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                             \
    _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                             \
    _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                             \
    _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                             \
    _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m);                               \
    _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m);                               \
    _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m);                               \
    _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m);                               \
    _out1 = __lasx_xvbsrl_v(_out0, 8);                                       \
    _out3 = __lasx_xvbsrl_v(_out2, 8);                                       \
    _out5 = __lasx_xvbsrl_v(_out4, 8);                                       \
    _out7 = __lasx_xvbsrl_v(_out6, 8);                                       \
  }
|
/*
 * Transpose an 8x8 block of halfword elements from _in0.._in7 into
 * _out0.._out7.
 */
#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    __m256i _s0_m, _s1_m;                                                    \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
                                                                             \
    _s0_m = __lasx_xvilvl_h(_in6, _in4);                                     \
    _s1_m = __lasx_xvilvl_h(_in7, _in5);                                     \
    _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
    _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
    _s0_m = __lasx_xvilvh_h(_in6, _in4);                                     \
    _s1_m = __lasx_xvilvh_h(_in7, _in5);                                     \
    _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
    _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
                                                                             \
    _s0_m = __lasx_xvilvl_h(_in2, _in0);                                     \
    _s1_m = __lasx_xvilvl_h(_in3, _in1);                                     \
    _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
    _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
    _s0_m = __lasx_xvilvh_h(_in2, _in0);                                     \
    _s1_m = __lasx_xvilvh_h(_in3, _in1);                                     \
    _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
    _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
                                                                             \
    _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m);                             \
    _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m);                             \
    _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m);                             \
    _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m);                             \
    _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m);                             \
    _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m);                             \
    _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m);                             \
    _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m);                             \
  }
|
/*
 * Butterfly of four input vectors (element-wise, per type suffix):
 *   _out0 = _in0 + _in3;  _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;  _out3 = _in0 - _in3;
 */
#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    _out0 = __lasx_xvadd_b(_in0, _in3);                                        \
    _out1 = __lasx_xvadd_b(_in1, _in2);                                        \
    _out2 = __lasx_xvsub_b(_in1, _in2);                                        \
    _out3 = __lasx_xvsub_b(_in0, _in3);                                        \
  }
#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    _out0 = __lasx_xvadd_h(_in0, _in3);                                        \
    _out1 = __lasx_xvadd_h(_in1, _in2);                                        \
    _out2 = __lasx_xvsub_h(_in1, _in2);                                        \
    _out3 = __lasx_xvsub_h(_in0, _in3);                                        \
  }
#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    _out0 = __lasx_xvadd_w(_in0, _in3);                                        \
    _out1 = __lasx_xvadd_w(_in1, _in2);                                        \
    _out2 = __lasx_xvsub_w(_in1, _in2);                                        \
    _out3 = __lasx_xvsub_w(_in0, _in3);                                        \
  }
#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    _out0 = __lasx_xvadd_d(_in0, _in3);                                        \
    _out1 = __lasx_xvadd_d(_in1, _in2);                                        \
    _out2 = __lasx_xvsub_d(_in1, _in2);                                        \
    _out3 = __lasx_xvsub_d(_in0, _in3);                                        \
  }
|
/*
 * Butterfly of eight input vectors (byte elements):
 *   _outk = _ink + _in(7-k) for k = 0..3,
 *   _outk = _in(7-k) - _ink for k = 4..7.
 */
#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    _out0 = __lasx_xvadd_b(_in0, _in7);                                     \
    _out1 = __lasx_xvadd_b(_in1, _in6);                                     \
    _out2 = __lasx_xvadd_b(_in2, _in5);                                     \
    _out3 = __lasx_xvadd_b(_in3, _in4);                                     \
    _out4 = __lasx_xvsub_b(_in3, _in4);                                     \
    _out5 = __lasx_xvsub_b(_in2, _in5);                                     \
    _out6 = __lasx_xvsub_b(_in1, _in6);                                     \
    _out7 = __lasx_xvsub_b(_in0, _in7);                                     \
  }
|
/*
 * Same butterfly on halfword elements.
 */
#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    _out0 = __lasx_xvadd_h(_in0, _in7);                                     \
    _out1 = __lasx_xvadd_h(_in1, _in6);                                     \
    _out2 = __lasx_xvadd_h(_in2, _in5);                                     \
    _out3 = __lasx_xvadd_h(_in3, _in4);                                     \
    _out4 = __lasx_xvsub_h(_in3, _in4);                                     \
    _out5 = __lasx_xvsub_h(_in2, _in5);                                     \
    _out6 = __lasx_xvsub_h(_in1, _in6);                                     \
    _out7 = __lasx_xvsub_h(_in0, _in7);                                     \
  }
|
/*
 * Same butterfly on word elements.
 */
#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    _out0 = __lasx_xvadd_w(_in0, _in7);                                     \
    _out1 = __lasx_xvadd_w(_in1, _in6);                                     \
    _out2 = __lasx_xvadd_w(_in2, _in5);                                     \
    _out3 = __lasx_xvadd_w(_in3, _in4);                                     \
    _out4 = __lasx_xvsub_w(_in3, _in4);                                     \
    _out5 = __lasx_xvsub_w(_in2, _in5);                                     \
    _out6 = __lasx_xvsub_w(_in1, _in6);                                     \
    _out7 = __lasx_xvsub_w(_in0, _in7);                                     \
  }
|
/*
 * Same butterfly on doubleword elements.
 */
#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    _out0 = __lasx_xvadd_d(_in0, _in7);                                     \
    _out1 = __lasx_xvadd_d(_in1, _in6);                                     \
    _out2 = __lasx_xvadd_d(_in2, _in5);                                     \
    _out3 = __lasx_xvadd_d(_in3, _in4);                                     \
    _out4 = __lasx_xvsub_d(_in3, _in4);                                     \
    _out5 = __lasx_xvsub_d(_in2, _in5);                                     \
    _out6 = __lasx_xvsub_d(_in1, _in6);                                     \
    _out7 = __lasx_xvsub_d(_in0, _in7);                                     \
  }
|
#endif /* __loongarch_asx */
|
/*
 * Print element_num elements of a vector for debugging: in0 is
 * reinterpreted as RTYPE (a GCC vector type that supports [] indexing).
 * A leading "\nVP:" is printed when enter is nonzero. printf requires
 * <stdio.h>, which this header does not include.
 */
#define VECT_PRINT(RTYPE, element_num, in0, enter)                 \
  {                                                                \
    RTYPE _tmp0 = (RTYPE)in0;                                      \
    int _i = 0;                                                    \
    if (enter) printf("\nVP:");                                    \
    for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \
  }
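
/*
 * Usage sketch (debug only; assumes <stdio.h> is included and a halfword
 * vector typedef, e.g.:
 *   typedef short v8i16 __attribute__((vector_size(16)));
 * Then
 *   VECT_PRINT(v8i16, 8, some_vec, 1);
 * prints the eight halfword elements of some_vec on a new line.)
 */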
|

#endif /* LOONGSON_INTRINSICS_H */
#endif /* AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H */
|
|