10 #ifndef EIGEN_PACKET_MATH_SVE_H 11 #define EIGEN_PACKET_MATH_SVE_H 17 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 18 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 21 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD 22 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 25 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 27 template <
typename Scalar,
int SVEVectorLength>
28 struct sve_packet_size_selector {
29 enum { size = SVEVectorLength / (
sizeof(Scalar) * CHAR_BIT) };
// Fixed-length SVE vector of int32 lanes. The arm_sve_vector_bits attribute
// pins the otherwise sizeless svint32_t to EIGEN_ARM64_SVE_VL bits so it can
// be used as an ordinary (sizeof-able, storable) C++ type.
33 typedef svint32_t PacketXi __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));

// Packet traits for int32: PacketXi serves as both the full and the half
// packet (no narrower packet type is defined for this port).
// NOTE(review): the enum entries between the visible lines were dropped by
// the extraction of this file; only `size` is shown here.
36 struct packet_traits<numext::int32_t> : default_packet_traits {
37 typedef PacketXi type;
38 typedef PacketXi half;
// Lane count derived from the configured SVE vector length.
42 size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
// unpacket_traits for the int32 SVE packet: scalar type, half packet (same
// as the full packet) and lane count.
// NOTE(review): enum entries between the visible lines were dropped by the
// extraction of this file.
63 struct unpacket_traits<PacketXi> {
64 typedef numext::int32_t type;
65 typedef PacketXi half;
67 size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
// Masked loads/stores are not wired up in this SVE port.
70 masked_load_available =
false,
71 masked_store_available =
// prefetch: pull int32 data at `addr` toward L1 (SV_PLDL1KEEP = load hint,
// L1 cache, retain).
false 76 EIGEN_STRONG_INLINE
void prefetch<numext::int32_t>(
const numext::int32_t* addr)
78 svprfw(svptrue_b32(), addr, SV_PLDL1KEEP);
82 EIGEN_STRONG_INLINE PacketXi pset1<PacketXi>(
const numext::int32_t& from)
84 return svdup_n_s32(from);
88 EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(
const numext::int32_t& a)
90 numext::int32_t c[packet_traits<numext::int32_t>::size];
91 for (
int i = 0; i < packet_traits<numext::int32_t>::size; i++) c[i] = i;
92 return svadd_s32_z(svptrue_b32(), pset1<PacketXi>(a), svld1_s32(svptrue_b32(), c));
96 EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(
const PacketXi& a,
const PacketXi& b)
98 return svadd_s32_z(svptrue_b32(), a, b);
102 EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(
const PacketXi& a,
const PacketXi& b)
104 return svsub_s32_z(svptrue_b32(), a, b);
108 EIGEN_STRONG_INLINE PacketXi pnegate(
const PacketXi& a)
110 return svneg_s32_z(svptrue_b32(), a);
// pconj: conjugation is the identity for real integers.
// NOTE(review): the function body (presumably `return a;`) is not visible in
// this extract; only the signature is shown.
114 EIGEN_STRONG_INLINE PacketXi pconj(
const PacketXi& a)
120 EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(
const PacketXi& a,
const PacketXi& b)
122 return svmul_s32_z(svptrue_b32(), a, b);
126 EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(
const PacketXi& a,
const PacketXi& b)
128 return svdiv_s32_z(svptrue_b32(), a, b);
132 EIGEN_STRONG_INLINE PacketXi pmadd(
const PacketXi& a,
const PacketXi& b,
const PacketXi& c)
134 return svmla_s32_z(svptrue_b32(), c, a, b);
138 EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(
const PacketXi& a,
const PacketXi& b)
140 return svmin_s32_z(svptrue_b32(), a, b);
144 EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(
const PacketXi& a,
const PacketXi& b)
146 return svmax_s32_z(svptrue_b32(), a, b);
150 EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(
const PacketXi& a,
const PacketXi& b)
152 return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
156 EIGEN_STRONG_INLINE PacketXi pcmp_lt<PacketXi>(
const PacketXi& a,
const PacketXi& b)
158 return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
162 EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(
const PacketXi& a,
const PacketXi& b)
164 return svdup_n_s32_z(svcmpeq_s32(svptrue_b32(), a, b), 0xffffffffu);
168 EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(
const PacketXi& )
170 return svdup_n_s32_z(svptrue_b32(), 0xffffffffu);
174 EIGEN_STRONG_INLINE PacketXi pzero<PacketXi>(
const PacketXi& )
176 return svdup_n_s32_z(svptrue_b32(), 0);
180 EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(
const PacketXi& a,
const PacketXi& b)
182 return svand_s32_z(svptrue_b32(), a, b);
186 EIGEN_STRONG_INLINE PacketXi por<PacketXi>(
const PacketXi& a,
const PacketXi& b)
188 return svorr_s32_z(svptrue_b32(), a, b);
192 EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(
const PacketXi& a,
const PacketXi& b)
194 return sveor_s32_z(svptrue_b32(), a, b);
198 EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(
const PacketXi& a,
const PacketXi& b)
200 return svbic_s32_z(svptrue_b32(), a, b);
// Arithmetic shift right by the compile-time amount N.
// NOTE(review): the `template <int N>` header for this function is not
// visible in this extract.
// NOTE(review): svasrd is "arithmetic shift right for divide" — it applies
// a rounding correction so negative values round toward zero (matching a
// signed division by 1<<N) instead of flooring like `a >> N`. Confirm this
// matches Eigen's parithmetic_shift_right contract for negative inputs.
204 EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a)
206 return svasrd_n_s32_z(svptrue_b32(), a, N);
210 EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a)
212 return svreinterpret_s32_u32(svlsr_u32_z(svptrue_b32(), svreinterpret_u32_s32(a), svdup_n_u32_z(svptrue_b32(), N)));
216 EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a)
218 return svlsl_s32_z(svptrue_b32(), a, svdup_n_u32_z(svptrue_b32(), N));
222 EIGEN_STRONG_INLINE PacketXi pload<PacketXi>(
const numext::int32_t* from)
224 EIGEN_DEBUG_ALIGNED_LOAD
return svld1_s32(svptrue_b32(), from);
228 EIGEN_STRONG_INLINE PacketXi ploadu<PacketXi>(
const numext::int32_t* from)
230 EIGEN_DEBUG_UNALIGNED_LOAD
return svld1_s32(svptrue_b32(), from);
234 EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(
const numext::int32_t* from)
236 svuint32_t indices = svindex_u32(0, 1);
237 indices = svzip1_u32(indices, indices);
238 return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
242 EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(
const numext::int32_t* from)
244 svuint32_t indices = svindex_u32(0, 1);
245 indices = svzip1_u32(indices, indices);
246 indices = svzip1_u32(indices, indices);
247 return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
251 EIGEN_STRONG_INLINE
void pstore<numext::int32_t>(numext::int32_t* to,
const PacketXi& from)
253 EIGEN_DEBUG_ALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
257 EIGEN_STRONG_INLINE
void pstoreu<numext::int32_t>(numext::int32_t* to,
const PacketXi& from)
259 EIGEN_DEBUG_UNALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
263 EIGEN_DEVICE_FUNC
inline PacketXi pgather<numext::int32_t, PacketXi>(
const numext::int32_t* from,
Index stride)
266 svint32_t indices = svindex_s32(0, stride);
267 return svld1_gather_s32index_s32(svptrue_b32(), from, indices);
271 EIGEN_DEVICE_FUNC
inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to,
const PacketXi& from,
Index stride)
274 svint32_t indices = svindex_s32(0, stride);
275 svst1_scatter_s32index_s32(svptrue_b32(), to, indices, from);
279 EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketXi>(
const PacketXi& a)
282 return svlasta_s32(svpfalse_b(), a);
// preverse: reverse the lane order of the packet.
// NOTE(review): the body (presumably `return svrev_s32(a);`) is not visible
// in this extract; only the signature is shown.
286 EIGEN_STRONG_INLINE PacketXi preverse(
const PacketXi& a)
292 EIGEN_STRONG_INLINE PacketXi pabs(
const PacketXi& a)
294 return svabs_s32_z(svptrue_b32(), a);
298 EIGEN_STRONG_INLINE numext::int32_t predux<PacketXi>(
const PacketXi& a)
300 return static_cast<numext::int32_t
>(svaddv_s32(svptrue_b32(), a));
// Multiplicative horizontal reduction: product of all int32 lanes. SVE has
// no product-reduction instruction, so the vector is folded in halves
// (reverse-multiply, then table-gather shifts) until lane 0 holds the total.
304 EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketXi>(
const PacketXi& a)
// The folding ladder below assumes the vector length is a multiple of 128
// bits; anything else is rejected at compile time.
306 EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
307 EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
// First fold: multiply the vector by its lane-reverse so every lane holds
// the product of a symmetric pair.
310 svint32_t prod = svmul_s32_z(svptrue_b32(), a, svrev_s32(a));
// Each step gathers lanes offset by k (svtbl with indices {k, k+1, ...})
// and multiplies them in, halving the number of live lanes.
// NOTE(review): the declaration of half_prod and the closing braces of the
// if-blocks below are not visible in this extract.
314 if (EIGEN_ARM64_SVE_VL >= 2048) {
315 half_prod = svtbl_s32(prod, svindex_u32(32, 1));
316 prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
318 if (EIGEN_ARM64_SVE_VL >= 1024) {
319 half_prod = svtbl_s32(prod, svindex_u32(16, 1));
320 prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
322 if (EIGEN_ARM64_SVE_VL >= 512) {
323 half_prod = svtbl_s32(prod, svindex_u32(8, 1));
324 prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
326 if (EIGEN_ARM64_SVE_VL >= 256) {
327 half_prod = svtbl_s32(prod, svindex_u32(4, 1));
328 prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
// Final 128-bit fold, then extract lane 0, which now holds the product.
331 half_prod = svtbl_s32(prod, svindex_u32(2, 1));
332 prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
335 return pfirst<PacketXi>(prod);
339 EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketXi>(
const PacketXi& a)
341 return svminv_s32(svptrue_b32(), a);
345 EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketXi>(
const PacketXi& a)
347 return svmaxv_s32(svptrue_b32(), a);
// In-register transpose of N packets, routed through memory: each packet is
// scattered as one column of an N-wide row-major buffer, then the rows are
// reloaded contiguously as the transposed packets.
// NOTE(review): the `template <int N>` header and the declaration of the
// loop variable `i` are not visible in this extract.
351 EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<PacketXi, N>& kernel) {
352 int buffer[packet_traits<numext::int32_t>::size * N] = {0};
// With index stride N, lane j of packet i lands at buffer[j*N + i].
355 PacketXi stride_index = svindex_s32(0, N);
357 for (i = 0; i < N; i++) {
358 svst1_scatter_s32index_s32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
// Row i of the buffer becomes transposed packet i.
360 for (i = 0; i < N; i++) {
361 kernel.packet[i] = svld1_s32(svptrue_b32(), buffer + i * packet_traits<numext::int32_t>::size);
// Fixed-length SVE vector of float lanes (see PacketXi above for the
// arm_sve_vector_bits mechanics).
367 typedef svfloat32_t PacketXf __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));

// Packet traits for float. The transcendental kernels (sin/cos/tanh/erf)
// are only advertised under EIGEN_FAST_MATH.
// NOTE(review): enum entries between the visible lines were dropped by the
// extraction of this file.
370 struct packet_traits<float> : default_packet_traits {
371 typedef PacketXf type;
372 typedef PacketXf half;
377 size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
398 HasSin = EIGEN_FAST_MATH,
399 HasCos = EIGEN_FAST_MATH,
403 HasTanh = EIGEN_FAST_MATH,
404 HasErf = EIGEN_FAST_MATH
// unpacket_traits for the float SVE packet; integer_packet gives the math
// functions a matching int packet for bit-level manipulation of floats.
// NOTE(review): enum entries between the visible lines were dropped by the
// extraction of this file.
409 struct unpacket_traits<PacketXf> {
411 typedef PacketXf half;
412 typedef PacketXi integer_packet;
415 size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
// Masked loads/stores are not wired up in this SVE port.
418 masked_load_available =
false,
419 masked_store_available =
// pset1: broadcast a single float to every lane.
false 424 EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(
const float& from)
426 return svdup_n_f32(from);
430 EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(numext::uint32_t from)
432 return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), from));
436 EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(
const float& a)
438 float c[packet_traits<float>::size];
439 for (
int i = 0; i < packet_traits<float>::size; i++) c[i] = i;
440 return svadd_f32_z(svptrue_b32(), pset1<PacketXf>(a), svld1_f32(svptrue_b32(), c));
444 EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(
const PacketXf& a,
const PacketXf& b)
446 return svadd_f32_z(svptrue_b32(), a, b);
450 EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(
const PacketXf& a,
const PacketXf& b)
452 return svsub_f32_z(svptrue_b32(), a, b);
456 EIGEN_STRONG_INLINE PacketXf pnegate(
const PacketXf& a)
458 return svneg_f32_z(svptrue_b32(), a);
// pconj: conjugation is the identity for real floats.
// NOTE(review): the body (presumably `return a;`) is not visible in this
// extract; only the signature is shown.
462 EIGEN_STRONG_INLINE PacketXf pconj(
const PacketXf& a)
468 EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(
const PacketXf& a,
const PacketXf& b)
470 return svmul_f32_z(svptrue_b32(), a, b);
474 EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(
const PacketXf& a,
const PacketXf& b)
476 return svdiv_f32_z(svptrue_b32(), a, b);
480 EIGEN_STRONG_INLINE PacketXf pmadd(
const PacketXf& a,
const PacketXf& b,
const PacketXf& c)
482 return svmla_f32_z(svptrue_b32(), c, a, b);
486 EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(
const PacketXf& a,
const PacketXf& b)
488 return svmin_f32_z(svptrue_b32(), a, b);
492 EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(
const PacketXf& a,
const PacketXf& b)
494 return pmin<PacketXf>(a, b);
498 EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(
const PacketXf& a,
const PacketXf& b)
500 return svminnm_f32_z(svptrue_b32(), a, b);
504 EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(
const PacketXf& a,
const PacketXf& b)
506 return svmax_f32_z(svptrue_b32(), a, b);
510 EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(
const PacketXf& a,
const PacketXf& b)
512 return pmax<PacketXf>(a, b);
516 EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(
const PacketXf& a,
const PacketXf& b)
518 return svmaxnm_f32_z(svptrue_b32(), a, b);
524 EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(
const PacketXf& a,
const PacketXf& b)
526 return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
530 EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(
const PacketXf& a,
const PacketXf& b)
532 return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
536 EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(
const PacketXf& a,
const PacketXf& b)
538 return svreinterpret_f32_u32(svdup_n_u32_z(svcmpeq_f32(svptrue_b32(), a, b), 0xffffffffu));
545 EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(
const PacketXf& a,
const PacketXf& b)
547 return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_z(svptrue_b32(), svcmpge_f32(svptrue_b32(), a, b)), 0xffffffffu));
551 EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(
const PacketXf& a)
553 return svrintm_f32_z(svptrue_b32(), a);
557 EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(
const PacketXf& )
559 return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), 0xffffffffu));
564 EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(
const PacketXf& a,
const PacketXf& b)
566 return svreinterpret_f32_u32(svand_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
570 EIGEN_STRONG_INLINE PacketXf por<PacketXf>(
const PacketXf& a,
const PacketXf& b)
572 return svreinterpret_f32_u32(svorr_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
576 EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(
const PacketXf& a,
const PacketXf& b)
578 return svreinterpret_f32_u32(sveor_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
582 EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(
const PacketXf& a,
const PacketXf& b)
584 return svreinterpret_f32_u32(svbic_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
588 EIGEN_STRONG_INLINE PacketXf pload<PacketXf>(
const float* from)
590 EIGEN_DEBUG_ALIGNED_LOAD
return svld1_f32(svptrue_b32(), from);
594 EIGEN_STRONG_INLINE PacketXf ploadu<PacketXf>(
const float* from)
596 EIGEN_DEBUG_UNALIGNED_LOAD
return svld1_f32(svptrue_b32(), from);
600 EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(
const float* from)
602 svuint32_t indices = svindex_u32(0, 1);
603 indices = svzip1_u32(indices, indices);
604 return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
608 EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(
const float* from)
610 svuint32_t indices = svindex_u32(0, 1);
611 indices = svzip1_u32(indices, indices);
612 indices = svzip1_u32(indices, indices);
613 return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
617 EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const PacketXf& from)
619 EIGEN_DEBUG_ALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
623 EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const PacketXf& from)
625 EIGEN_DEBUG_UNALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
629 EIGEN_DEVICE_FUNC
inline PacketXf pgather<float, PacketXf>(
const float* from,
Index stride)
632 svint32_t indices = svindex_s32(0, stride);
633 return svld1_gather_s32index_f32(svptrue_b32(), from, indices);
637 EIGEN_DEVICE_FUNC
inline void pscatter<float, PacketXf>(
float* to,
const PacketXf& from,
Index stride)
640 svint32_t indices = svindex_s32(0, stride);
641 svst1_scatter_s32index_f32(svptrue_b32(), to, indices, from);
645 EIGEN_STRONG_INLINE
float pfirst<PacketXf>(
const PacketXf& a)
648 return svlasta_f32(svpfalse_b(), a);
// preverse: reverse the lane order of the packet.
// NOTE(review): the body (presumably `return svrev_f32(a);`) is not visible
// in this extract; only the signature is shown.
652 EIGEN_STRONG_INLINE PacketXf preverse(
const PacketXf& a)
658 EIGEN_STRONG_INLINE PacketXf pabs(
const PacketXf& a)
660 return svabs_f32_z(svptrue_b32(), a);
666 EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(
const PacketXf& a, PacketXf& exponent)
668 return pfrexp_generic(a, exponent);
672 EIGEN_STRONG_INLINE
float predux<PacketXf>(
const PacketXf& a)
674 return svaddv_f32(svptrue_b32(), a);
// Multiplicative horizontal reduction: product of all float lanes, folded
// in halves (reverse-multiply, then table-gather shifts) until lane 0
// holds the total product — SVE has no product-reduction instruction.
681 EIGEN_STRONG_INLINE
float predux_mul<PacketXf>(
const PacketXf& a)
// The folding ladder below assumes the vector length is a multiple of 128
// bits; anything else is rejected at compile time.
683 EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
684 EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
// First fold: multiply the vector by its lane-reverse so every lane holds
// the product of a symmetric pair.
686 svfloat32_t prod = svmul_f32_z(svptrue_b32(), a, svrev_f32(a));
687 svfloat32_t half_prod;
// Each step gathers lanes offset by k (svtbl with indices {k, k+1, ...})
// and multiplies them in, halving the number of live lanes.
// NOTE(review): the closing braces of the if-blocks below are not visible
// in this extract.
690 if (EIGEN_ARM64_SVE_VL >= 2048) {
691 half_prod = svtbl_f32(prod, svindex_u32(32, 1));
692 prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
694 if (EIGEN_ARM64_SVE_VL >= 1024) {
695 half_prod = svtbl_f32(prod, svindex_u32(16, 1));
696 prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
698 if (EIGEN_ARM64_SVE_VL >= 512) {
699 half_prod = svtbl_f32(prod, svindex_u32(8, 1));
700 prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
702 if (EIGEN_ARM64_SVE_VL >= 256) {
703 half_prod = svtbl_f32(prod, svindex_u32(4, 1));
704 prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
// Final 128-bit fold, then extract lane 0, which now holds the product.
707 half_prod = svtbl_f32(prod, svindex_u32(2, 1));
708 prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
711 return pfirst<PacketXf>(prod);
715 EIGEN_STRONG_INLINE
float predux_min<PacketXf>(
const PacketXf& a)
717 return svminv_f32(svptrue_b32(), a);
721 EIGEN_STRONG_INLINE
float predux_max<PacketXf>(
const PacketXf& a)
723 return svmaxv_f32(svptrue_b32(), a);
// In-register transpose of N packets, routed through memory: each packet is
// scattered as one column of an N-wide row-major buffer, then the rows are
// reloaded contiguously as the transposed packets.
// NOTE(review): the `template <int N>` header, the opening brace, and the
// declaration of the loop variable `i` are not visible in this extract.
727 EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<PacketXf, N>& kernel)
729 float buffer[packet_traits<float>::size * N] = {0};
// With index stride N, lane j of packet i lands at buffer[j*N + i].
732 PacketXi stride_index = svindex_s32(0, N);
734 for (i = 0; i < N; i++) {
735 svst1_scatter_s32index_f32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
// Row i of the buffer becomes transposed packet i.
738 for (i = 0; i < N; i++) {
739 kernel.packet[i] = svld1_f32(svptrue_b32(), buffer + i * packet_traits<float>::size);
744 EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(
const PacketXf& a,
const PacketXf& exponent)
746 return pldexp_generic(a, exponent);
752 #endif // EIGEN_PACKET_MATH_SVE_H Namespace containing all symbols from the Eigen library.
Definition: Core:141
Definition: Constants.h:237
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:74
Definition: Eigen_Colamd.h:50