// Header prologue: include guard, default tuning constants, the EIGEN_MSA_DEBUG trace macro
// (one variant streams file:line:function to std::cout, the other is empty), and
// EIGEN_MSA_SHF_I8, which packs four 2-bit lane selectors into one byte (a in bits 0-1,
// b in bits 2-3, c in bits 4-5, d in bits 6-7).
// The typedefs name the three four-lane 32-bit packet types used below.
13 #ifndef EIGEN_PACKET_MATH_MSA_H 14 #define EIGEN_PACKET_MATH_MSA_H 23 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 24 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 27 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD 28 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 31 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 36 #define EIGEN_MSA_DEBUG \ 37 static bool firstTime = true; \ 40 std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \ 45 #define EIGEN_MSA_DEBUG 48 #define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a)) 50 typedef v4f32 Packet4f;
51 typedef v4i32 Packet4i;
52 typedef v4u32 Packet4ui;
// Helper macros that declare a constant packet p4f_/p4i_/p4ui_<NAME> with X in all four lanes,
// followed by the stream inserter for Packet4f, which writes the lanes as "[ v0, v1, v2, v3 ]".
// NOTE(review): the inserter's return statement is not visible in this excerpt.
54 #define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X } 55 #define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X } 56 #define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X } 58 inline std::ostream& operator<<(std::ostream& os,
const Packet4f& value) {
59 os <<
"[ " << value[0] <<
", " << value[1] <<
", " << value[2] <<
", " << value[3] <<
" ]";
// Stream inserter for Packet4i: writes the four lanes as "[ v0, v1, v2, v3 ]".
// NOTE(review): the return statement is not visible in this excerpt.
63 inline std::ostream& operator<<(std::ostream& os,
const Packet4i& value) {
64 os <<
"[ " << value[0] <<
", " << value[1] <<
", " << value[2] <<
", " << value[3] <<
" ]";
// Stream inserter for Packet4ui: writes the four lanes as "[ v0, v1, v2, v3 ]".
// NOTE(review): the return statement is not visible in this excerpt.
68 inline std::ostream& operator<<(std::ostream& os,
const Packet4ui& value) {
69 os <<
"[ " << value[0] <<
", " << value[1] <<
", " << value[2] <<
", " << value[3] <<
" ]";
// packet_traits for float. Both the full and the "half" packet are Packet4f, so no
// narrower float packet is declared here. Sin/Cos/Tanh/Erf availability is tied to
// EIGEN_FAST_MATH.
// NOTE(review): the remaining enum entries are not visible in this excerpt.
74 struct packet_traits<float> : default_packet_traits {
75 typedef Packet4f type;
76 typedef Packet4f half;
84 HasSin = EIGEN_FAST_MATH,
85 HasCos = EIGEN_FAST_MATH,
86 HasTanh = EIGEN_FAST_MATH,
87 HasErf = EIGEN_FAST_MATH,
// packet_traits for int32_t: the full and the "half" packet are both Packet4i.
// NOTE(review): the capability enum is not visible in this excerpt.
100 struct packet_traits<int32_t> : default_packet_traits {
101 typedef Packet4i type;
102 typedef Packet4i half;
// unpacket_traits for Packet4f: four lanes, Aligned16 alignment, vectorizable,
// no masked load/store support; the half packet is Packet4f itself.
// NOTE(review): the scalar `type` typedef is not visible in this excerpt.
115 struct unpacket_traits<Packet4f> {
117 enum { size = 4, alignment =
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false };
118 typedef Packet4f half;
// unpacket_traits for Packet4i: scalar type int32_t, four lanes, Aligned16 alignment,
// vectorizable, no masked load/store support; the half packet is Packet4i itself.
122 struct unpacket_traits<Packet4i> {
123 typedef int32_t type;
124 enum { size = 4, alignment =
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false };
125 typedef Packet4i half;
// pset1 for Packet4f: builds a packet whose four lanes all hold `from`.
// NOTE(review): the return of `v` is not visible in this excerpt.
129 EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(
const float& from) {
132 Packet4f v = { from, from, from, from };
// pset1 for Packet4i: returns __builtin_msa_fill_w(from), i.e. the MSA fill.w builtin
// applied to the scalar.
137 EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(
const int32_t& from) {
140 return __builtin_msa_fill_w(from);
// pload1 for Packet4f: builds a packet with the same scalar `f` in all four lanes.
// NOTE(review): the definition of `f` (presumably *from) and the return are not visible
// in this excerpt; confirm against the full file.
144 EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(
const float* from) {
148 Packet4f v = { f, f, f, f };
// pload1 for Packet4i: dereferences `from` once and passes the value to
// __builtin_msa_fill_w.
153 EIGEN_STRONG_INLINE Packet4i pload1<Packet4i>(
const int32_t* from) {
156 return __builtin_msa_fill_w(*from);
// padd for Packet4f: forwards both operands to __builtin_msa_fadd_w.
160 EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
163 return __builtin_msa_fadd_w(a, b);
// padd for Packet4i: forwards both operands to __builtin_msa_addv_w.
167 EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
170 return __builtin_msa_addv_w(a, b);
// plset for Packet4f: returns { a, a + 1, a + 2, a + 3 } by adding a fixed offset vector
// to the broadcast of `a`.
174 EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(
const float& a) {
// Despite its name, `countdown` holds the ascending offsets 0..3.
177 static const Packet4f countdown = { 0.0f, 1.0f, 2.0f, 3.0f };
178 return padd(pset1<Packet4f>(a), countdown);
// plset for Packet4i: returns { a, a + 1, a + 2, a + 3 } by adding a fixed offset vector
// to the broadcast of `a`.
182 EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(
const int32_t& a) {
// Despite its name, `countdown` holds the ascending offsets 0..3.
185 static const Packet4i countdown = { 0, 1, 2, 3 };
186 return padd(pset1<Packet4i>(a), countdown);
// psub for Packet4f: forwards (a, b) to __builtin_msa_fsub_w.
190 EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
193 return __builtin_msa_fsub_w(a, b);
// psub for Packet4i: forwards (a, b) to __builtin_msa_subv_w.
197 EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
200 return __builtin_msa_subv_w(a, b);
// pnegate for Packet4f: reinterprets the packet as unsigned words and applies
// __builtin_msa_bnegi_w with bit index 31 (the top bit of each 32-bit lane, i.e. the
// float sign bit), so no floating-point arithmetic is involved.
204 EIGEN_STRONG_INLINE Packet4f pnegate(
const Packet4f& a) {
207 return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31);
// pnegate for Packet4i: applies nori_b with immediate 0 to the bytes of `a`, then adds 1
// to each word with addvi_w.
// NOTE(review): this reads as the two's-complement identity -a == ~a + 1, assuming
// nori_b(x, 0) yields ~x; confirm against the MSA builtin documentation.
211 EIGEN_STRONG_INLINE Packet4i pnegate(
const Packet4i& a) {
214 return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1);
218 EIGEN_STRONG_INLINE Packet4f pconj(
const Packet4f& a) {
225 EIGEN_STRONG_INLINE Packet4i pconj(
const Packet4i& a) {
// pmul for Packet4f: forwards both operands to __builtin_msa_fmul_w.
232 EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
235 return __builtin_msa_fmul_w(a, b);
// pmul for Packet4i: forwards both operands to __builtin_msa_mulv_w.
239 EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
242 return __builtin_msa_mulv_w(a, b);
// pdiv for Packet4f: forwards (a, b) to __builtin_msa_fdiv_w.
246 EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
249 return __builtin_msa_fdiv_w(a, b);
// pdiv for Packet4i: forwards (a, b) to the signed-division builtin __builtin_msa_div_s_w.
253 EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
256 return __builtin_msa_div_s_w(a, b);
// pmadd for Packet4f: calls __builtin_msa_fmadd_w with the addend first.
// Note the argument order: the builtin receives (c, a, b), not (a, b, c).
260 EIGEN_STRONG_INLINE Packet4f pmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
263 return __builtin_msa_fmadd_w(c, a, b);
// pmadd for Packet4i: issues the maddv.w instruction through inline asm.
// `value` is a read-write ("+f") operand; `a` and `b` are read-only inputs.
// NOTE(review): the initialisation of `value` (presumably from `c`) and the return
// statement are not visible in this excerpt; confirm against the full file.
267 EIGEN_STRONG_INLINE Packet4i pmadd(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
272 __asm__(
"maddv.w %w[value], %w[a], %w[b]\n" 274 : [value]
"+f"(value)
276 : [a]
"f"(a), [b]
"f"(b));
// pand for Packet4f: reinterprets both packets as 16 bytes, applies
// __builtin_msa_and_v, and casts the result back to Packet4f.
281 EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
284 return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b);
// pand for Packet4i: reinterprets both packets as 16 bytes, applies
// __builtin_msa_and_v, and casts the result back to Packet4i.
288 EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
291 return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b);
// por for Packet4f: byte-vector __builtin_msa_or_v on the reinterpreted operands,
// cast back to Packet4f.
295 EIGEN_STRONG_INLINE Packet4f por<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
298 return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b);
// por for Packet4i: byte-vector __builtin_msa_or_v on the reinterpreted operands,
// cast back to Packet4i.
302 EIGEN_STRONG_INLINE Packet4i por<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
305 return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b);
// pxor for Packet4f: byte-vector __builtin_msa_xor_v on the reinterpreted operands,
// cast back to Packet4f.
309 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
312 return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
// pxor for Packet4i: byte-vector __builtin_msa_xor_v on the reinterpreted operands,
// cast back to Packet4i.
316 EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
319 return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
// pandnot for Packet4f: computes a & ~b. Every byte of b is XORed with 255 (which
// inverts it) and the result is passed to pand together with a.
323 EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
326 return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255));
// pandnot for Packet4i: computes a & ~b. Every byte of b is XORed with 255 (which
// inverts it) and the result is passed to pand together with a.
330 EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
333 return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255));
// pmin for Packet4f. Two implementations are visible:
//  1. a direct __builtin_msa_fmin_w(a, b);
//  2. a compare-and-select path that treats NaN lanes of `a` explicitly.
// NOTE(review): both end in `return`, so they presumably sit in alternative preprocessor
// branches whose directives are not visible in this excerpt; confirm against the full file.
337 EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
342 return __builtin_msa_fmin_w(a, b);
// Mask of lanes where `a` compares unordered with itself (named aNaN by the author).
345 Packet4i aNaN = __builtin_msa_fcun_w(a, a);
// Widen the mask with the lanes where the a < b comparison (fclt_w) holds.
346 Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN);
// Bit-select between b and a under that mask.
347 return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
// pmin for Packet4i: forwards (a, b) to the signed-minimum builtin __builtin_msa_min_s_w.
352 EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
355 return __builtin_msa_min_s_w(a, b);
// pmax for Packet4f. Two implementations are visible:
//  1. a direct __builtin_msa_fmax_w(a, b);
//  2. a compare-and-select path that treats NaN lanes of `a` explicitly.
// NOTE(review): both end in `return`, so they presumably sit in alternative preprocessor
// branches whose directives are not visible in this excerpt; confirm against the full file.
359 EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
364 return __builtin_msa_fmax_w(a, b);
// Mask of lanes where `a` compares unordered with itself (named aNaN by the author).
367 Packet4i aNaN = __builtin_msa_fcun_w(a, a);
// Widen the mask with the lanes where the b < a comparison (fclt_w) holds.
368 Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN);
// Bit-select between b and a under that mask.
369 return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
// pmax for Packet4i: forwards (a, b) to the signed-maximum builtin __builtin_msa_max_s_w.
374 EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
377 return __builtin_msa_max_s_w(a, b);
// pload for Packet4f: records an aligned load via EIGEN_DEBUG_ALIGNED_LOAD, then reads one
// vector with __builtin_msa_ld_w at byte offset 0.
// The const_cast only satisfies the builtin's pointer parameter.
381 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(
const float* from) {
384 EIGEN_DEBUG_ALIGNED_LOAD
return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
// pload for Packet4i: records an aligned load via EIGEN_DEBUG_ALIGNED_LOAD, then reads one
// vector with __builtin_msa_ld_w at byte offset 0.
// The const_cast only satisfies the builtin's pointer parameter.
388 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(
const int32_t* from) {
391 EIGEN_DEBUG_ALIGNED_LOAD
return __builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
// ploadu for Packet4f: same __builtin_msa_ld_w call as pload, but tallied with
// EIGEN_DEBUG_UNALIGNED_LOAD.
395 EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(
const float* from) {
398 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
// ploadu for Packet4i: same __builtin_msa_ld_w call as pload, but tallied with
// EIGEN_DEBUG_UNALIGNED_LOAD.
402 EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(
const int32_t* from) {
405 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet4i)__builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
// ploaddup for Packet4f: reads from[0] and from[1], broadcasts each into its own packet,
// then combines the two with the doubleword interleave ilvr_d.
// NOTE(review): the intended result is presumably { f0, f0, f1, f1 }; confirm the operand
// order of ilvr_d against the MSA builtin documentation.
409 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(
const float* from) {
412 float f0 = from[0], f1 = from[1];
413 Packet4f v0 = { f0, f0, f0, f0 };
414 Packet4f v1 = { f1, f1, f1, f1 };
415 return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
// ploaddup for Packet4i: reads from[0] and from[1], broadcasts each into its own packet,
// then combines the two with the doubleword interleave ilvr_d.
// NOTE(review): the intended result is presumably { i0, i0, i1, i1 }; confirm the operand
// order of ilvr_d against the MSA builtin documentation.
419 EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(
const int32_t* from) {
422 int32_t i0 = from[0], i1 = from[1];
423 Packet4i v0 = { i0, i0, i0, i0 };
424 Packet4i v1 = { i1, i1, i1, i1 };
425 return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
// pstore for float: records an aligned store via EIGEN_DEBUG_ALIGNED_STORE, then writes the
// packet (reinterpreted as Packet4i) to `to` with __builtin_msa_st_w at byte offset 0.
429 EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet4f& from) {
432 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
// pstore for int32_t: records an aligned store via EIGEN_DEBUG_ALIGNED_STORE, then writes
// the packet to `to` with __builtin_msa_st_w at byte offset 0.
436 EIGEN_STRONG_INLINE
void pstore<int32_t>(int32_t* to,
const Packet4i& from) {
439 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0);
// pstoreu for float: same __builtin_msa_st_w call as pstore, but tallied with
// EIGEN_DEBUG_UNALIGNED_STORE.
443 EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet4f& from) {
446 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
// pstoreu for int32_t: same __builtin_msa_st_w call as pstore, but tallied with
// EIGEN_DEBUG_UNALIGNED_STORE.
450 EIGEN_STRONG_INLINE
void pstoreu<int32_t>(int32_t* to,
const Packet4i& from) {
453 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0);
// pgather for Packet4f: assembles a packet from elements spaced `stride` apart.
// The visible lines seed all lanes with `f`, then overwrite lanes 2 and 3 with
// from[2 * stride] and from[3 * stride].
// NOTE(review): the definition of `f` (presumably *from), the lane-1 assignment and the
// return are not visible in this excerpt; confirm against the full file.
457 EIGEN_DEVICE_FUNC
inline Packet4f pgather<float, Packet4f>(
const float* from,
Index stride) {
461 Packet4f v = { f, f, f, f };
463 v[2] = from[2 * stride];
464 v[3] = from[3 * stride];
// pgather for Packet4i: assembles a packet from elements spaced `stride` apart.
// The visible lines seed all lanes with `i`, then overwrite lanes 2 and 3 with
// from[2 * stride] and from[3 * stride].
// NOTE(review): the definition of `i` (presumably *from), the lane-1 assignment and the
// return are not visible in this excerpt; confirm against the full file.
469 EIGEN_DEVICE_FUNC
inline Packet4i pgather<int32_t, Packet4i>(
const int32_t* from,
Index stride) {
473 Packet4i v = { i, i, i, i };
475 v[2] = from[2 * stride];
476 v[3] = from[3 * stride];
481 EIGEN_DEVICE_FUNC
inline void pscatter<float, Packet4f>(
float* to,
const Packet4f& from,
495 EIGEN_DEVICE_FUNC
inline void pscatter<int32_t, Packet4i>(int32_t* to,
const Packet4i& from,
// prefetch for float: passes `addr` to the compiler builtin __builtin_prefetch with its
// default arguments.
509 EIGEN_STRONG_INLINE
void prefetch<float>(
const float* addr) {
512 __builtin_prefetch(addr);
// prefetch for int32_t: passes `addr` to the compiler builtin __builtin_prefetch with its
// default arguments.
516 EIGEN_STRONG_INLINE
void prefetch<int32_t>(
const int32_t* addr) {
519 __builtin_prefetch(addr);
523 EIGEN_STRONG_INLINE
float pfirst<Packet4f>(
const Packet4f& a) {
530 EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(
const Packet4i& a) {
// preverse for Packet4f: word shuffle with selector EIGEN_MSA_SHF_I8(3, 2, 1, 0), which
// evaluates to 0x1B, i.e. result lanes 0..3 select source lanes 3, 2, 1, 0.
537 EIGEN_STRONG_INLINE Packet4f preverse(
const Packet4f& a) {
540 return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
// preverse for Packet4i: word shuffle with selector EIGEN_MSA_SHF_I8(3, 2, 1, 0), which
// evaluates to 0x1B, i.e. result lanes 0..3 select source lanes 3, 2, 1, 0.
544 EIGEN_STRONG_INLINE Packet4i preverse(
const Packet4i& a) {
547 return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
// pabs for Packet4f: applies __builtin_msa_bclri_w with bit index 31 to each 32-bit lane
// (the float sign bit), so no floating-point comparison is involved.
551 EIGEN_STRONG_INLINE Packet4f pabs(
const Packet4f& a) {
554 return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31);
// pabs for Packet4i: passes a zero vector and `a` to __builtin_msa_add_a_w.
// NOTE(review): add_a presumably sums the absolute values of its operands, giving
// |0| + |a| == |a|; confirm against the MSA builtin documentation.
558 EIGEN_STRONG_INLINE Packet4i pabs(
const Packet4i& a) {
561 Packet4i zero = __builtin_msa_ldi_w(0);
562 return __builtin_msa_add_a_w(zero, a);
// predux for Packet4f: sums the lanes with two shuffle-and-add steps.
// NOTE(review): the final scalar extraction and return are not visible in this excerpt.
566 EIGEN_STRONG_INLINE
float predux<Packet4f>(
const Packet4f& a) {
// Step 1: add `a` to a copy shuffled with selector (2, 3, 0, 1).
569 Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
// Step 2: add the partial sums to a copy shuffled with selector (1, 0, 3, 2).
570 s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// predux for Packet4i: sums the lanes with two shuffle-and-add steps, first with selector
// (2, 3, 0, 1), then with selector (1, 0, 3, 2).
// NOTE(review): the final scalar extraction and return are not visible in this excerpt.
576 EIGEN_STRONG_INLINE int32_t predux<Packet4i>(
const Packet4i& a) {
579 Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
580 s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// predux_mul for Packet4f: multiplies the lanes together with two shuffle-and-multiply
// steps, first with selector (2, 3, 0, 1), then with selector (1, 0, 3, 2).
// NOTE(review): the final scalar extraction and return are not visible in this excerpt.
587 EIGEN_STRONG_INLINE
float predux_mul<Packet4f>(
const Packet4f& a) {
590 Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
591 p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// predux_mul for Packet4i: multiplies the lanes together with two shuffle-and-multiply
// steps, first with selector (2, 3, 0, 1), then with selector (1, 0, 3, 2).
// NOTE(review): the final scalar extraction and return are not visible in this excerpt.
596 EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(
const Packet4i& a) {
599 Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
600 p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// predux_min for Packet4f: reduces the lanes with fmin_w in two shuffle steps and, on the
// side, builds a mask that lets the result be replaced with 0x7FC00000 words.
// NOTE(review): the mask and qNaN lines are presumably guarded by preprocessor conditions
// that are not visible in this excerpt, and the final return is also missing; confirm
// against the full file.
606 EIGEN_STRONG_INLINE
float predux_min<Packet4f>(
const Packet4f& a) {
// Copy of `a` shuffled with selector (2, 3, 0, 1).
610 Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
// Unordered-compare mask between `a` and its shuffled copy.
614 v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
// Fold the mask at 64-bit granularity by comparing each doubleword against 0.
616 unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
619 Packet4f v = __builtin_msa_fmin_w(a, swapped);
620 v = __builtin_msa_fmin_w(
621 v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// 0x7FC00000 in every word (the variable name indicates a quiet-NaN pattern); bsel_v then
// chooses between it and `v` under the mask.
624 v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
625 v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
// predux_min for Packet4i: two shuffle-and-pmin steps, first with selector (2, 3, 0, 1),
// then with selector (1, 0, 3, 2).
// NOTE(review): the final scalar extraction and return are not visible in this excerpt.
631 EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(
const Packet4i& a) {
634 Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
635 m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// predux_max for Packet4f: reduces the lanes with fmax_w in two shuffle steps and, on the
// side, builds a mask that lets the result be replaced with 0x7FC00000 words.
// NOTE(review): the mask and qNaN lines are presumably guarded by preprocessor conditions
// that are not visible in this excerpt, and the final return is also missing; confirm
// against the full file.
641 EIGEN_STRONG_INLINE
float predux_max<Packet4f>(
const Packet4f& a) {
// Copy of `a` shuffled with selector (2, 3, 0, 1).
645 Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
// Unordered-compare mask between `a` and its shuffled copy.
649 v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
// Fold the mask at 64-bit granularity by comparing each doubleword against 0.
651 unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
654 Packet4f v = __builtin_msa_fmax_w(a, swapped);
655 v = __builtin_msa_fmax_w(
656 v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// 0x7FC00000 in every word (the variable name indicates a quiet-NaN pattern); bsel_v then
// chooses between it and `v` under the mask.
659 v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
660 v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
// predux_max for Packet4i: two shuffle-and-pmax steps, first with selector (2, 3, 0, 1),
// then with selector (1, 0, 3, 2).
// NOTE(review): the final scalar extraction and return are not visible in this excerpt.
666 EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(
const Packet4i& a) {
669 Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
670 m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// Stream inserter for a 4-packet block of Packet4f: prints one packet per line, wrapped in
// "[ ... ]", using the Packet4f inserter for each row.
// Each std::endl also flushes the stream.
// NOTE(review): the return statement is not visible in this excerpt.
674 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4f, 4>& value) {
675 os <<
"[ " << value.packet[0] <<
"," << std::endl
676 <<
" " << value.packet[1] <<
"," << std::endl
677 <<
" " << value.packet[2] <<
"," << std::endl
678 <<
" " << value.packet[3] <<
" ]";
// ptranspose for a 4x4 block of Packet4f, performed in place on kernel.packet[0..3].
// Stage 1 interleaves words of row pairs (0,1) and (2,3) with ilvr_w / ilvl_w.
// Stage 2 interleaves doublewords of those temporaries with ilvr_d / ilvod_d and writes the
// results back.
682 EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
685 v4i32 tmp1, tmp2, tmp3, tmp4;
687 tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
688 tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
689 tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
690 tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
692 kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
693 kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
694 kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
695 kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
// Stream inserter for a 4-packet block of Packet4i: prints one packet per line, wrapped in
// "[ ... ]", using the Packet4i inserter for each row.
// Each std::endl also flushes the stream.
// NOTE(review): the return statement is not visible in this excerpt.
698 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4i, 4>& value) {
699 os <<
"[ " << value.packet[0] <<
"," << std::endl
700 <<
" " << value.packet[1] <<
"," << std::endl
701 <<
" " << value.packet[2] <<
"," << std::endl
702 <<
" " << value.packet[3] <<
" ]";
// ptranspose for a 4x4 block of Packet4i, performed in place on kernel.packet[0..3].
// Stage 1 interleaves words of row pairs (0,1) and (2,3) with ilvr_w / ilvl_w.
// Stage 2 interleaves doublewords of those temporaries with ilvr_d / ilvod_d and writes the
// results back.
706 EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
709 v4i32 tmp1, tmp2, tmp3, tmp4;
711 tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]);
712 tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]);
713 tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]);
714 tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]);
716 kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
717 kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
718 kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
719 kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
// psqrt for Packet4f: forwards `a` to __builtin_msa_fsqrt_w.
723 EIGEN_STRONG_INLINE Packet4f psqrt(
const Packet4f& a) {
726 return __builtin_msa_fsqrt_w(a);
// prsqrt for Packet4f. Two implementations are visible:
//  1. the builtin __builtin_msa_frsqrt_w(a);
//  2. an explicit ones / psqrt(a), with `ones` built by converting the integer vector of 1s
//     to float (ffint_s_w).
// NOTE(review): both end in `return`, so they presumably sit in alternative preprocessor
// branches whose directives are not visible in this excerpt; confirm against the full file.
730 EIGEN_STRONG_INLINE Packet4f prsqrt(
const Packet4f& a) {
734 return __builtin_msa_frsqrt_w(a);
736 Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1));
737 return pdiv(ones, psqrt(a));
// pfloor for Packet4f, implemented with inline asm. The visible instruction sequence:
//   cfcmsa  copies MSA control register $1 into old_mode;
//   ori     forces the low two bits to 11b, giving new_mode;
//   ctcmsa  installs new_mode;
//   frint.w rounds v in place;
//   ctcmsa  restores old_mode.
// NOTE(review): the low two bits are presumably the rounding-mode field, with 11b meaning
// round toward negative infinity; confirm against the MSA control-register documentation.
// The asm statement opener, the `v` operand and the return are not visible in this excerpt.
742 EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(
const Packet4f& a) {
744 int32_t old_mode, new_mode;
746 "cfcmsa %[old_mode], $1\n" 747 "ori %[new_mode], %[old_mode], 3\n" 748 "ctcmsa $1, %[new_mode]\n" 749 "frint.w %w[v], %w[v]\n" 750 "ctcmsa $1, %[old_mode]\n" 752 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// pceil for Packet4f, implemented with inline asm. Same save/modify/round/restore sequence
// as pfloor, except that `ori 3` is followed by `xori 1`, leaving the low two bits of the
// control word at 10b while frint.w runs.
// NOTE(review): 10b is presumably the round-toward-positive-infinity mode; confirm against
// the MSA control-register documentation. The asm statement opener, the `v` operand and
// the return are not visible in this excerpt.
761 EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(
const Packet4f& a) {
763 int32_t old_mode, new_mode;
765 "cfcmsa %[old_mode], $1\n" 766 "ori %[new_mode], %[old_mode], 3\n" 767 "xori %[new_mode], %[new_mode], 1\n" 768 "ctcmsa $1, %[new_mode]\n" 769 "frint.w %w[v], %w[v]\n" 770 "ctcmsa $1, %[old_mode]\n" 772 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// pround for Packet4f, implemented with inline asm. Same save/modify/round/restore sequence
// as pfloor, except that `ori 3` is followed by `xori 3`, leaving the low two bits of the
// control word at 00b while frint.w runs.
// NOTE(review): 00b is presumably round-to-nearest (ties to even), which differs from
// round-half-away-from-zero on exact .5 inputs; confirm against the MSA control-register
// documentation and the intended pround contract. The asm statement opener, the `v`
// operand and the return are not visible in this excerpt.
781 EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(
const Packet4f& a) {
783 int32_t old_mode, new_mode;
785 "cfcmsa %[old_mode], $1\n" 786 "ori %[new_mode], %[old_mode], 3\n" 787 "xori %[new_mode], %[new_mode], 3\n" 788 "ctcmsa $1, %[new_mode]\n" 789 "frint.w %w[v], %w[v]\n" 790 "ctcmsa $1, %[old_mode]\n" 792 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// pblend for Packet4f: per-lane choice between thenPacket and elsePacket driven by
// ifPacket.select[0..3].
801 EIGEN_STRONG_INLINE Packet4f pblend(
const Selector<4>& ifPacket,
const Packet4f& thenPacket,
802 const Packet4f& elsePacket) {
// Copy the four selector entries into a vector so they can be compared in one instruction.
803 Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
804 ifPacket.select[3] };
// The mask is built from "selector == 0", i.e. it marks the lanes where the selector is off.
805 Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
// NOTE(review): bsel_v presumably takes thenPacket where the mask is clear and elsePacket
// where it is set; confirm the operand order against the MSA builtin documentation.
806 return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
// pblend for Packet4i: per-lane choice between thenPacket and elsePacket driven by
// ifPacket.select[0..3].
810 EIGEN_STRONG_INLINE Packet4i pblend(
const Selector<4>& ifPacket,
const Packet4i& thenPacket,
811 const Packet4i& elsePacket) {
// Copy the four selector entries into a vector so they can be compared in one instruction.
812 Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
813 ifPacket.select[3] };
// The mask is built from "selector == 0", i.e. it marks the lanes where the selector is off.
814 Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
// NOTE(review): bsel_v presumably takes thenPacket where the mask is clear and elsePacket
// where it is set; confirm the operand order against the MSA builtin documentation.
815 return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
// Two-lane 64-bit packet types: double, signed 64-bit integer, unsigned 64-bit integer.
820 typedef v2f64 Packet2d;
821 typedef v2i64 Packet2l;
822 typedef v2u64 Packet2ul;
// Helper macros that declare a constant packet p2d_/p2l_/p2ul_<NAME> with X in both lanes,
// followed by the stream inserter for Packet2d, which writes the lanes as "[ v0, v1 ]".
// NOTE(review): the inserter's return statement is not visible in this excerpt.
824 #define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X } 825 #define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X } 826 #define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X } 828 inline std::ostream& operator<<(std::ostream& os,
const Packet2d& value) {
829 os <<
"[ " << value[0] <<
", " << value[1] <<
" ]";
// Stream inserter for Packet2l: writes the two lanes as "[ v0, v1 ]".
// NOTE(review): the return statement is not visible in this excerpt.
833 inline std::ostream& operator<<(std::ostream& os,
const Packet2l& value) {
834 os <<
"[ " << value[0] <<
", " << value[1] <<
" ]";
// Stream inserter for Packet2ul: writes the two lanes as "[ v0, v1 ]".
// NOTE(review): the return statement is not visible in this excerpt.
838 inline std::ostream& operator<<(std::ostream& os,
const Packet2ul& value) {
839 os <<
"[ " << value[0] <<
", " << value[1] <<
" ]";
// packet_traits for double: the full and the "half" packet are both Packet2d.
// NOTE(review): the capability enum is not visible in this excerpt.
844 struct packet_traits<double> : default_packet_traits {
845 typedef Packet2d type;
846 typedef Packet2d half;
// unpacket_traits for Packet2d: two lanes, Aligned16 alignment, vectorizable,
// no masked load/store support; the half packet is Packet2d itself.
// NOTE(review): the scalar `type` typedef is not visible in this excerpt.
865 struct unpacket_traits<Packet2d> {
867 enum { size = 2, alignment =
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false };
868 typedef Packet2d half;
// pset1 for Packet2d: builds a packet whose two lanes both hold `from`.
// NOTE(review): the return of `value` is not visible in this excerpt.
872 EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(
const double& from) {
875 Packet2d value = { from, from };
// padd for Packet2d: forwards both operands to __builtin_msa_fadd_d.
880 EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
883 return __builtin_msa_fadd_d(a, b);
// plset for Packet2d: returns { a, a + 1 } by adding a fixed offset vector to the
// broadcast of `a`.
887 EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(
const double& a) {
// Despite its name, `countdown` holds the ascending offsets 0 and 1.
890 static const Packet2d countdown = { 0.0, 1.0 };
891 return padd(pset1<Packet2d>(a), countdown);
// psub for Packet2d: forwards (a, b) to __builtin_msa_fsub_d.
895 EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
898 return __builtin_msa_fsub_d(a, b);
// pnegate for Packet2d: applies __builtin_msa_bnegi_d with bit index 63 to each 64-bit lane
// (the double sign bit), so no floating-point arithmetic is involved.
902 EIGEN_STRONG_INLINE Packet2d pnegate(
const Packet2d& a) {
905 return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63);
909 EIGEN_STRONG_INLINE Packet2d pconj(
const Packet2d& a) {
// pmul for Packet2d: forwards both operands to __builtin_msa_fmul_d.
916 EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
919 return __builtin_msa_fmul_d(a, b);
// pdiv for Packet2d: forwards (a, b) to __builtin_msa_fdiv_d.
923 EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
926 return __builtin_msa_fdiv_d(a, b);
// pmadd for Packet2d: calls __builtin_msa_fmadd_d with the addend first.
// Note the argument order: the builtin receives (c, a, b), not (a, b, c).
930 EIGEN_STRONG_INLINE Packet2d pmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
933 return __builtin_msa_fmadd_d(c, a, b);
// pand for Packet2d: byte-vector __builtin_msa_and_v on the reinterpreted operands,
// cast back to Packet2d.
939 EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
942 return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b);
// por for Packet2d: byte-vector __builtin_msa_or_v on the reinterpreted operands,
// cast back to Packet2d.
946 EIGEN_STRONG_INLINE Packet2d por<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
949 return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b);
// pxor for Packet2d: byte-vector __builtin_msa_xor_v on the reinterpreted operands,
// cast back to Packet2d.
953 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
956 return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
// pandnot for Packet2d: computes a & ~b. Every byte of b is XORed with 255 (which
// inverts it) and the result is passed to pand together with a.
960 EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
963 return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255));
967 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(
const double* from) {
970 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
// pmin for Packet2d. Two implementations are visible:
//  1. a direct __builtin_msa_fmin_d(a, b);
//  2. a compare-and-select path that treats NaN lanes of `a` explicitly.
// NOTE(review): both end in `return`, so they presumably sit in alternative preprocessor
// branches whose directives are not visible in this excerpt; confirm against the full file.
974 EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
979 return __builtin_msa_fmin_d(a, b);
// Mask of lanes where `a` compares unordered with itself (named aNaN by the author).
982 v2i64 aNaN = __builtin_msa_fcun_d(a, a);
// Widen the mask with the lanes where the a < b comparison (fclt_d) holds.
983 v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN);
// Bit-select between b and a under that mask.
984 return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
// pmax for Packet2d. Two implementations are visible:
//  1. a direct __builtin_msa_fmax_d(a, b);
//  2. a compare-and-select path that treats NaN lanes of `a` explicitly.
// NOTE(review): both end in `return`, so they presumably sit in alternative preprocessor
// branches whose directives are not visible in this excerpt; confirm against the full file.
989 EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
994 return __builtin_msa_fmax_d(a, b);
// Mask of lanes where `a` compares unordered with itself (named aNaN by the author).
997 v2i64 aNaN = __builtin_msa_fcun_d(a, a);
// Widen the mask with the lanes where the b < a comparison (fclt_d) holds.
998 v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN);
// Bit-select between b and a under that mask.
999 return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
// ploadu for Packet2d: tallies an unaligned load via EIGEN_DEBUG_UNALIGNED_LOAD, then reads
// one vector with __builtin_msa_ld_d at byte offset 0.
// The const_cast only satisfies the builtin's pointer parameter.
1004 EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(
const double* from) {
1007 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
// ploaddup for Packet2d: builds a packet whose two lanes both hold *from.
// NOTE(review): the return of `value` is not visible in this excerpt.
1011 EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(
const double* from) {
1014 Packet2d value = { *from, *from };
// pstore for double: records an aligned store via EIGEN_DEBUG_ALIGNED_STORE, then writes the
// packet (reinterpreted as v2i64) to `to` with __builtin_msa_st_d at byte offset 0.
1019 EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const Packet2d& from) {
1022 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
// pstoreu for double: same __builtin_msa_st_d call as pstore, but tallied with
// EIGEN_DEBUG_UNALIGNED_STORE.
1026 EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const Packet2d& from) {
1029 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
1033 EIGEN_DEVICE_FUNC
inline Packet2d pgather<double, Packet2d>(
const double* from,
Index stride) {
1044 EIGEN_DEVICE_FUNC
inline void pscatter<double, Packet2d>(
double* to,
const Packet2d& from,
// prefetch for double: passes `addr` to the compiler builtin __builtin_prefetch with its
// default arguments.
1054 EIGEN_STRONG_INLINE
void prefetch<double>(
const double* addr) {
1057 __builtin_prefetch(addr);
1061 EIGEN_STRONG_INLINE
double pfirst<Packet2d>(
const Packet2d& a) {
// preverse for Packet2d: done as a 32-bit word shuffle on the reinterpreted packet with
// selector (2, 3, 0, 1). Result words 0..3 select source words 2, 3, 0, 1, which exchanges
// the two 64-bit halves.
1068 EIGEN_STRONG_INLINE Packet2d preverse(
const Packet2d& a) {
1071 return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
// pabs for Packet2d: applies __builtin_msa_bclri_d with bit index 63 to each 64-bit lane
// (the double sign bit), so no floating-point comparison is involved.
1075 EIGEN_STRONG_INLINE Packet2d pabs(
const Packet2d& a) {
1078 return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63);
// predux for Packet2d: adds `a` to its lane-reversed copy, so both lanes of `s` hold
// a[0] + a[1].
// NOTE(review): the final scalar extraction and return are not visible in this excerpt.
1082 EIGEN_STRONG_INLINE
double predux<Packet2d>(
const Packet2d& a) {
1085 Packet2d s = padd(a, preverse(a));
// predux_mul for Packet2d: multiplies `a` by its lane-reversed copy, so both lanes of `p`
// hold a[0] * a[1].
// NOTE(review): the final scalar extraction and return are not visible in this excerpt.
1092 EIGEN_STRONG_INLINE
double predux_mul<Packet2d>(
const Packet2d& a) {
1095 Packet2d p = pmul(a, preverse(a));
// predux_min for Packet2d. Two implementations are visible:
//  1. a vector path: fmin_d of `a` against a copy with its 64-bit halves swapped;
//  2. a scalar path that returns a0 when a0 is NaN or a0 < a1, otherwise a1.
// NOTE(review): the two paths presumably sit in alternative preprocessor branches, and the
// return of the vector path is not visible in this excerpt; confirm against the full file.
1101 EIGEN_STRONG_INLINE
double predux_min<Packet2d>(
const Packet2d& a) {
1105 Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1106 Packet2d v = __builtin_msa_fmin_d(a, swapped);
1109 double a0 = a[0], a1 = a[1];
1110 return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1;
// predux_max for Packet2d. Two implementations are visible:
//  1. a vector path: fmax_d of `a` against a copy with its 64-bit halves swapped;
//  2. a scalar path that returns a0 when a0 is NaN or a0 > a1, otherwise a1.
// NOTE(review): the two paths presumably sit in alternative preprocessor branches, and the
// return of the vector path is not visible in this excerpt; confirm against the full file.
1116 EIGEN_STRONG_INLINE
double predux_max<Packet2d>(
const Packet2d& a) {
1120 Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1121 Packet2d v = __builtin_msa_fmax_d(a, swapped);
1124 double a0 = a[0], a1 = a[1];
1125 return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1;
// psqrt for Packet2d: forwards `a` to __builtin_msa_fsqrt_d.
1130 EIGEN_STRONG_INLINE Packet2d psqrt(
const Packet2d& a) {
1133 return __builtin_msa_fsqrt_d(a);
// prsqrt for Packet2d. Two implementations are visible:
//  1. the builtin __builtin_msa_frsqrt_d(a);
//  2. an explicit ones / psqrt(a), with `ones` built by converting the integer vector of 1s
//     to double (ffint_s_d).
// NOTE(review): both end in `return`, so they presumably sit in alternative preprocessor
// branches whose directives are not visible in this excerpt; confirm against the full file.
1137 EIGEN_STRONG_INLINE Packet2d prsqrt(
const Packet2d& a) {
1141 return __builtin_msa_frsqrt_d(a);
1143 Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1));
1144 return pdiv(ones, psqrt(a));
// Stream inserter for a 2-packet block of Packet2d: prints one packet per line, wrapped in
// "[ ... ]", using the Packet2d inserter for each row.
// The std::endl also flushes the stream.
// NOTE(review): the return statement is not visible in this excerpt.
1148 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2d, 2>& value) {
1149 os <<
"[ " << value.packet[0] <<
"," << std::endl <<
" " << value.packet[1] <<
" ]";
// ptranspose for a 2x2 block of Packet2d, performed in place.
// trn1 is built with the even-doubleword interleave (ilvev_d) and trn2 with the
// odd-doubleword interleave (ilvod_d) of the two rows. Both are computed before either row
// is overwritten.
1153 EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
1156 Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
1157 Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
1158 kernel.packet[0] = trn1;
1159 kernel.packet[1] = trn2;
// pfloor for Packet2d, implemented with inline asm. The visible instruction sequence:
//   cfcmsa  copies MSA control register $1 into old_mode;
//   ori     forces the low two bits to 11b, giving new_mode;
//   ctcmsa  installs new_mode;
//   frint.d rounds v in place;
//   ctcmsa  restores old_mode.
// NOTE(review): the low two bits are presumably the rounding-mode field, with 11b meaning
// round toward negative infinity; confirm against the MSA control-register documentation.
// The asm statement opener, the `v` operand and the return are not visible in this excerpt.
1163 EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(
const Packet2d& a) {
1165 int32_t old_mode, new_mode;
1167 "cfcmsa %[old_mode], $1\n" 1168 "ori %[new_mode], %[old_mode], 3\n" 1169 "ctcmsa $1, %[new_mode]\n" 1170 "frint.d %w[v], %w[v]\n" 1171 "ctcmsa $1, %[old_mode]\n" 1173 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// pceil for Packet2d, implemented with inline asm. Same save/modify/round/restore sequence
// as pfloor, except that `ori 3` is followed by `xori 1`, leaving the low two bits of the
// control word at 10b while frint.d runs.
// NOTE(review): 10b is presumably the round-toward-positive-infinity mode; confirm against
// the MSA control-register documentation. The asm statement opener, the `v` operand and
// the return are not visible in this excerpt.
1182 EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(
const Packet2d& a) {
1184 int32_t old_mode, new_mode;
1186 "cfcmsa %[old_mode], $1\n" 1187 "ori %[new_mode], %[old_mode], 3\n" 1188 "xori %[new_mode], %[new_mode], 1\n" 1189 "ctcmsa $1, %[new_mode]\n" 1190 "frint.d %w[v], %w[v]\n" 1191 "ctcmsa $1, %[old_mode]\n" 1193 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// pround for Packet2d, implemented with inline asm. Same save/modify/round/restore sequence
// as pfloor, except that `ori 3` is followed by `xori 3`, leaving the low two bits of the
// control word at 00b while frint.d runs.
// NOTE(review): 00b is presumably round-to-nearest (ties to even), which differs from
// round-half-away-from-zero on exact .5 inputs; confirm against the MSA control-register
// documentation and the intended pround contract. The asm statement opener, the `v`
// operand and the return are not visible in this excerpt.
1202 EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(
const Packet2d& a) {
1204 int32_t old_mode, new_mode;
1206 "cfcmsa %[old_mode], $1\n" 1207 "ori %[new_mode], %[old_mode], 3\n" 1208 "xori %[new_mode], %[new_mode], 3\n" 1209 "ctcmsa $1, %[new_mode]\n" 1210 "frint.d %w[v], %w[v]\n" 1211 "ctcmsa $1, %[old_mode]\n" 1213 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// pblend for Packet2d: per-lane choice between thenPacket and elsePacket driven by
// ifPacket.select[0..1].
1222 EIGEN_STRONG_INLINE Packet2d pblend(
const Selector<2>& ifPacket,
const Packet2d& thenPacket,
1223 const Packet2d& elsePacket) {
// Copy the two selector entries into a vector so they can be compared in one instruction.
1224 Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
// The mask is built from "selector == 0", i.e. it marks the lanes where the selector is off.
1225 Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0);
// NOTE(review): bsel_v presumably takes thenPacket where the mask is clear and elsePacket
// where it is set; confirm the operand order against the MSA builtin documentation.
1226 return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
1233 #endif // EIGEN_PACKET_MATH_MSA_H Definition: Constants.h:235
// Cross-reference notes carried over from the generated documentation listing:
//   Namespace containing all symbols from the Eigen library.
//     Definition: Core:141
//   EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
//     The Index type as used for the API.
//     Definition: Meta.h:74
//   Definition: Eigen_Colamd.h:50