12#ifndef EIGEN_PACKET_MATH_NEON_H
13#define EIGEN_PACKET_MATH_NEON_H
19#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
20#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
23#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
24#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
27#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
29#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
31#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
35#if EIGEN_COMP_MSVC_STRICT
// Packet type aliases that wrap each underlying NEON type in eigen_packet_wrapper.
// The second template argument is an integer tag that is different for every alias (0..17).
// Packet4c / Packet4uc wrap a plain 32-bit scalar rather than a NEON vector type.
// NOTE(review): the unique tag presumably keeps the packet types distinct when the
// compiler treats the underlying NEON types as identical — confirm against eigen_packet_wrapper.
41typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
42typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
43typedef eigen_packet_wrapper<int32_t ,2> Packet4c;
44typedef eigen_packet_wrapper<int8x8_t ,3> Packet8c;
45typedef eigen_packet_wrapper<int8x16_t ,4> Packet16c;
46typedef eigen_packet_wrapper<uint32_t ,5> Packet4uc;
47typedef eigen_packet_wrapper<uint8x8_t ,6> Packet8uc;
48typedef eigen_packet_wrapper<uint8x16_t ,7> Packet16uc;
49typedef eigen_packet_wrapper<int16x4_t ,8> Packet4s;
50typedef eigen_packet_wrapper<int16x8_t ,9> Packet8s;
51typedef eigen_packet_wrapper<uint16x4_t ,10> Packet4us;
52typedef eigen_packet_wrapper<uint16x8_t ,11> Packet8us;
53typedef eigen_packet_wrapper<int32x2_t ,12> Packet2i;
54typedef eigen_packet_wrapper<int32x4_t ,13> Packet4i;
55typedef eigen_packet_wrapper<uint32x2_t ,14> Packet2ui;
56typedef eigen_packet_wrapper<uint32x4_t ,15> Packet4ui;
57typedef eigen_packet_wrapper<int64x2_t ,16> Packet2l;
58typedef eigen_packet_wrapper<uint64x2_t ,17> Packet2ul;
60EIGEN_ALWAYS_INLINE Packet4f make_packet4f(
float a,
float b,
float c,
float d) {
61 float from[4] = {a, b, c, d};
62 return vld1q_f32(from);
65EIGEN_ALWAYS_INLINE Packet2f make_packet2f(
float a,
float b) {
66 float from[2] = {a, b};
67 return vld1_f32(from);
// Packet type aliases that map directly onto the NEON vector types.
// Packet4c / Packet4uc are the exception: they stay wrapped in eigen_packet_wrapper
// around a 32-bit scalar (tags 2 and 5), since they are not NEON vector types.
72typedef float32x2_t Packet2f;
73typedef float32x4_t Packet4f;
74typedef eigen_packet_wrapper<int32_t ,2> Packet4c;
75typedef int8x8_t Packet8c;
76typedef int8x16_t Packet16c;
77typedef eigen_packet_wrapper<uint32_t ,5> Packet4uc;
78typedef uint8x8_t Packet8uc;
79typedef uint8x16_t Packet16uc;
80typedef int16x4_t Packet4s;
81typedef int16x8_t Packet8s;
82typedef uint16x4_t Packet4us;
83typedef uint16x8_t Packet8us;
84typedef int32x2_t Packet2i;
85typedef int32x4_t Packet4i;
86typedef uint32x2_t Packet2ui;
87typedef uint32x4_t Packet4ui;
88typedef int64x2_t Packet2l;
89typedef uint64x2_t Packet2ul;
91EIGEN_ALWAYS_INLINE Packet4f make_packet4f(
float a,
float b,
float c,
float d) {
92 const Packet2f low = {a, b};
93 const Packet2f high = {c, d};
94 return vcombine_f32(low, high);
97EIGEN_ALWAYS_INLINE Packet2f make_packet2f(
float a,
float b) {
98 const Packet2f result = {a, b};
104EIGEN_STRONG_INLINE Packet4f shuffle1(
const Packet4f& m,
int mask){
105 const float* a =
reinterpret_cast<const float*
>(&m);
106 Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3)));
114template<
bool interleave>
115EIGEN_STRONG_INLINE Packet4f shuffle2(
const Packet4f &m,
const Packet4f &n,
int mask)
117 const float* a =
reinterpret_cast<const float*
>(&m);
118 const float* b =
reinterpret_cast<const float*
>(&n);
119 Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
124EIGEN_STRONG_INLINE Packet4f shuffle2<true>(
const Packet4f &m,
const Packet4f &n,
int mask)
126 const float* a =
reinterpret_cast<const float*
>(&m);
127 const float* b =
reinterpret_cast<const float*
>(&n);
128 Packet4f res = make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
132EIGEN_STRONG_INLINE
static int eigen_neon_shuffle_mask(
int p,
int q,
int r,
int s) {
return ((s)<<6|(r)<<4|(q)<<2|(p));}
134EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(
const Packet4f& a,
int p,
int q,
int r,
int s)
136 return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
138EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(
const Packet4f& a,
const Packet4f& b,
int p,
int q,
int r,
int s)
140 return shuffle2<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
142EIGEN_STRONG_INLINE Packet4f vec4f_movelh(
const Packet4f& a,
const Packet4f& b)
144 return shuffle2<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1));
146EIGEN_STRONG_INLINE Packet4f vec4f_movehl(
const Packet4f& a,
const Packet4f& b)
148 return shuffle2<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3));
150EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(
const Packet4f& a,
const Packet4f& b)
152 return shuffle2<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1));
154EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(
const Packet4f& a,
const Packet4f& b)
156 return shuffle2<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
// Broadcasts element `p` of `a` to all four lanes. Only the low half of `a`
// (vget_low_f32) is consulted, so `p` must be 0 or 1.
#define vec4f_duplane(a, p) \
  Packet4f(vdupq_lane_f32(vget_low_f32(a), p))

// Declares a constant `p4f_<NAME>` with every lane set to the float value X.
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

// Declares a constant `p4f_<NAME>` whose lanes all carry the 32-bit pattern X,
// reinterpreted as float. X is splatted into a uint32x4_t first, because
// vreinterpretq_f32_u32 takes a vector and not a scalar.
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(vdupq_n_u32(X))

// Declares a constant `p4i_<NAME>` with every lane set to the integer value X.
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
// EIGEN_ARM_PREFETCH(ADDR): prefetch hint for the address ADDR.
// Several alternative definitions follow; the surrounding preprocessor
// conditions choose which one is active.
170#if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC
// AArch64 with a GNU-compatible compiler: inline asm issuing "prfm pldl1keep".
174 #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
175#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
// Compiler builtin when it is available.
176 #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
// Variant based on the __pld intrinsic.
// NOTE(review): unlike the variants above, this expansion has no trailing semicolon — confirm callers cope with both.
178 #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
// Variant using inline asm with the "pld" instruction.
180 #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
// Fallback: expands to nothing, i.e. no prefetch is issued.
183 #define EIGEN_ARM_PREFETCH(ADDR)
187struct packet_traits<float> : default_packet_traits
189 typedef Packet4f type;
190 typedef Packet2f half;
218 HasSin = EIGEN_FAST_MATH,
219 HasCos = EIGEN_FAST_MATH,
224 HasTanh = EIGEN_FAST_MATH,
225 HasErf = EIGEN_FAST_MATH,
353 typedef Packet4i type;
354 typedef Packet2i
half;
470#if EIGEN_GNUC_AT_MOST(4, 4) && !EIGEN_COMP_LLVM
474EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32(
const float* x) { return ::vld1_dup_f32 ((
const float32_t*)x); }
475EIGEN_STRONG_INLINE
void vst1q_f32(
float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
476EIGEN_STRONG_INLINE
void vst1_f32 (
float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
489 masked_load_available =
false,
490 masked_store_available =
false
496 typedef Packet2f
half;
497 typedef Packet4i integer_packet;
503 masked_load_available =
false,
504 masked_store_available =
false
516 masked_load_available =
false,
517 masked_store_available =
false
529 masked_load_available =
false,
530 masked_store_available =
false
536 typedef Packet8c
half;
542 masked_load_available =
false,
543 masked_store_available =
false
548 typedef uint8_t type;
555 masked_load_available =
false,
556 masked_store_available =
false
561 typedef uint8_t type;
568 masked_load_available =
false,
569 masked_store_available =
false
574 typedef uint8_t type;
575 typedef Packet8uc
half;
581 masked_load_available =
false,
582 masked_store_available =
false};
586 typedef int16_t type;
593 masked_load_available =
false,
594 masked_store_available =
false
599 typedef int16_t type;
600 typedef Packet4s
half;
606 masked_load_available =
false,
607 masked_store_available =
false
612 typedef uint16_t type;
619 masked_load_available =
false,
620 masked_store_available =
false
625 typedef uint16_t type;
626 typedef Packet4us
half;
632 masked_load_available =
false,
633 masked_store_available =
false
638 typedef int32_t type;
645 masked_load_available =
false,
646 masked_store_available =
false
651 typedef int32_t type;
652 typedef Packet2i
half;
658 masked_load_available =
false,
659 masked_store_available =
false
664 typedef uint32_t type;
671 masked_load_available =
false,
672 masked_store_available =
false
677 typedef uint32_t type;
684 masked_load_available =
false,
685 masked_store_available =
false
690 typedef int64_t type;
697 masked_load_available =
false,
698 masked_store_available =
false
703 typedef uint64_t type;
710 masked_load_available =
false,
711 masked_store_available =
false
717template<> EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(
const int8_t& from)
718{
return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0); }
719template<> EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(
const int8_t& from) {
return vdup_n_s8(from); }
720template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(
const int8_t& from) {
return vdupq_n_s8(from); }
721template<> EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(
const uint8_t& from)
722{
return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0); }
723template<> EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(
const uint8_t& from) {
return vdup_n_u8(from); }
724template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(
const uint8_t& from) {
return vdupq_n_u8(from); }
725template<> EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(
const int16_t& from) {
return vdup_n_s16(from); }
726template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(
const int16_t& from) {
return vdupq_n_s16(from); }
727template<> EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(
const uint16_t& from) {
return vdup_n_u16(from); }
728template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(
const uint16_t& from) {
return vdupq_n_u16(from); }
729template<> EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(
const int32_t& from) {
return vdup_n_s32(from); }
730template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(
const int32_t& from) {
return vdupq_n_s32(from); }
731template<> EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(
const uint32_t& from) {
return vdup_n_u32(from); }
732template<> EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(
const uint32_t& from) {
return vdupq_n_u32(from); }
733template<> EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(
const int64_t& from) {
return vdupq_n_s64(from); }
734template<> EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(
const uint64_t& from) {
return vdupq_n_u64(from); }
736template<> EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(
unsigned int from)
737{
return vreinterpret_f32_u32(vdup_n_u32(from)); }
738template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(
unsigned int from)
739{
return vreinterpretq_f32_u32(vdupq_n_u32(from)); }
741template<> EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(
const float& a)
743 const float c[] = {0.0f,1.0f};
744 return vadd_f32(pset1<Packet2f>(a), vld1_f32(c));
746template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(
const float& a)
748 const float c[] = {0.0f,1.0f,2.0f,3.0f};
749 return vaddq_f32(pset1<Packet4f>(a), vld1q_f32(c));
751template<> EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(
const int8_t& a)
752{
return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0); }
753template<> EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(
const int8_t& a)
755 const int8_t c[] = {0,1,2,3,4,5,6,7};
756 return vadd_s8(pset1<Packet8c>(a), vld1_s8(c));
758template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(
const int8_t& a)
760 const int8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
761 return vaddq_s8(pset1<Packet16c>(a), vld1q_s8(c));
763template<> EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(
const uint8_t& a)
764{
return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0); }
765template<> EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(
const uint8_t& a)
767 const uint8_t c[] = {0,1,2,3,4,5,6,7};
768 return vadd_u8(pset1<Packet8uc>(a), vld1_u8(c));
770template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(
const uint8_t& a)
772 const uint8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
773 return vaddq_u8(pset1<Packet16uc>(a), vld1q_u8(c));
775template<> EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(
const int16_t& a)
777 const int16_t c[] = {0,1,2,3};
778 return vadd_s16(pset1<Packet4s>(a), vld1_s16(c));
780template<> EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(
const uint16_t& a)
782 const uint16_t c[] = {0,1,2,3};
783 return vadd_u16(pset1<Packet4us>(a), vld1_u16(c));
785template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(
const int16_t& a)
787 const int16_t c[] = {0,1,2,3,4,5,6,7};
788 return vaddq_s16(pset1<Packet8s>(a), vld1q_s16(c));
790template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(
const uint16_t& a)
792 const uint16_t c[] = {0,1,2,3,4,5,6,7};
793 return vaddq_u16(pset1<Packet8us>(a), vld1q_u16(c));
795template<> EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(
const int32_t& a)
797 const int32_t c[] = {0,1};
798 return vadd_s32(pset1<Packet2i>(a), vld1_s32(c));
800template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(
const int32_t& a)
802 const int32_t c[] = {0,1,2,3};
803 return vaddq_s32(pset1<Packet4i>(a), vld1q_s32(c));
805template<> EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(
const uint32_t& a)
807 const uint32_t c[] = {0,1};
808 return vadd_u32(pset1<Packet2ui>(a), vld1_u32(c));
810template<> EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(
const uint32_t& a)
812 const uint32_t c[] = {0,1,2,3};
813 return vaddq_u32(pset1<Packet4ui>(a), vld1q_u32(c));
815template<> EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(
const int64_t& a)
817 const int64_t c[] = {0,1};
818 return vaddq_s64(pset1<Packet2l>(a), vld1q_s64(c));
820template<> EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(
const uint64_t& a)
822 const uint64_t c[] = {0,1};
823 return vaddq_u64(pset1<Packet2ul>(a), vld1q_u64(c));
826template<> EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
return vadd_f32(a,b); }
827template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return vaddq_f32(a,b); }
828template<> EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(
const Packet4c& a,
const Packet4c& b)
830 return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(
831 vreinterpret_s8_s32(vdup_n_s32(a)),
832 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
834template<> EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
return vadd_s8(a,b); }
835template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
return vaddq_s8(a,b); }
836template<> EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b)
838 return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(
839 vreinterpret_u8_u32(vdup_n_u32(a)),
840 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
842template<> EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
return vadd_u8(a,b); }
843template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
return vaddq_u8(a,b); }
844template<> EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
return vadd_s16(a,b); }
845template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
return vaddq_s16(a,b); }
846template<> EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
return vadd_u16(a,b); }
847template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
return vaddq_u16(a,b); }
848template<> EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
return vadd_s32(a,b); }
849template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vaddq_s32(a,b); }
850template<> EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
return vadd_u32(a,b); }
851template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
return vaddq_u32(a,b); }
852template<> EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
return vaddq_s64(a,b); }
853template<> EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
return vaddq_u64(a,b); }
855template<> EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
return vsub_f32(a,b); }
856template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return vsubq_f32(a,b); }
857template<> EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(
const Packet4c& a,
const Packet4c& b)
859 return vget_lane_s32(vreinterpret_s32_s8(vsub_s8(
860 vreinterpret_s8_s32(vdup_n_s32(a)),
861 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
863template<> EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
return vsub_s8(a,b); }
864template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
return vsubq_s8(a,b); }
865template<> EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b)
867 return vget_lane_u32(vreinterpret_u32_u8(vsub_u8(
868 vreinterpret_u8_u32(vdup_n_u32(a)),
869 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
871template<> EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
return vsub_u8(a,b); }
872template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
return vsubq_u8(a,b); }
873template<> EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
return vsub_s16(a,b); }
874template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
return vsubq_s16(a,b); }
875template<> EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
return vsub_u16(a,b); }
876template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
return vsubq_u16(a,b); }
877template<> EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
return vsub_s32(a,b); }
878template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vsubq_s32(a,b); }
879template<> EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
return vsub_u32(a,b); }
880template<> EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
return vsubq_u32(a,b); }
881template<> EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
return vsubq_s64(a,b); }
882template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
return vsubq_u64(a,b); }
884template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(
const Packet2f& a,
const Packet2f& b);
885template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(
const Packet2f& a,
const Packet2f & b) {
886 Packet2f mask = make_packet2f(numext::bit_cast<float>(0x80000000u), 0.0f);
887 return padd(a, pxor(mask, b));
889template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b);
890template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
891 Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
892 return padd(a, pxor(mask, b));
895template<> EIGEN_STRONG_INLINE Packet2f pnegate(
const Packet2f& a) {
return vneg_f32(a); }
896template<> EIGEN_STRONG_INLINE Packet4f pnegate(
const Packet4f& a) {
return vnegq_f32(a); }
897template<> EIGEN_STRONG_INLINE Packet4c pnegate(
const Packet4c& a)
898{
return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
899template<> EIGEN_STRONG_INLINE Packet8c pnegate(
const Packet8c& a) {
return vneg_s8(a); }
900template<> EIGEN_STRONG_INLINE Packet16c pnegate(
const Packet16c& a) {
return vnegq_s8(a); }
901template<> EIGEN_STRONG_INLINE Packet4s pnegate(
const Packet4s& a) {
return vneg_s16(a); }
902template<> EIGEN_STRONG_INLINE Packet8s pnegate(
const Packet8s& a) {
return vnegq_s16(a); }
903template<> EIGEN_STRONG_INLINE Packet2i pnegate(
const Packet2i& a) {
return vneg_s32(a); }
904template<> EIGEN_STRONG_INLINE Packet4i pnegate(
const Packet4i& a) {
return vnegq_s32(a); }
905template<> EIGEN_STRONG_INLINE Packet2l pnegate(
const Packet2l& a) {
910 vdup_n_s64(-vgetq_lane_s64(a, 0)),
911 vdup_n_s64(-vgetq_lane_s64(a, 1)));
915template<> EIGEN_STRONG_INLINE Packet2f pconj(
const Packet2f& a) {
return a; }
916template<> EIGEN_STRONG_INLINE Packet4f pconj(
const Packet4f& a) {
return a; }
917template<> EIGEN_STRONG_INLINE Packet4c pconj(
const Packet4c& a) {
return a; }
918template<> EIGEN_STRONG_INLINE Packet8c pconj(
const Packet8c& a) {
return a; }
919template<> EIGEN_STRONG_INLINE Packet16c pconj(
const Packet16c& a) {
return a; }
920template<> EIGEN_STRONG_INLINE Packet4uc pconj(
const Packet4uc& a) {
return a; }
921template<> EIGEN_STRONG_INLINE Packet8uc pconj(
const Packet8uc& a) {
return a; }
922template<> EIGEN_STRONG_INLINE Packet16uc pconj(
const Packet16uc& a) {
return a; }
923template<> EIGEN_STRONG_INLINE Packet4s pconj(
const Packet4s& a) {
return a; }
924template<> EIGEN_STRONG_INLINE Packet8s pconj(
const Packet8s& a) {
return a; }
925template<> EIGEN_STRONG_INLINE Packet4us pconj(
const Packet4us& a) {
return a; }
926template<> EIGEN_STRONG_INLINE Packet8us pconj(
const Packet8us& a) {
return a; }
927template<> EIGEN_STRONG_INLINE Packet2i pconj(
const Packet2i& a) {
return a; }
928template<> EIGEN_STRONG_INLINE Packet4i pconj(
const Packet4i& a) {
return a; }
929template<> EIGEN_STRONG_INLINE Packet2ui pconj(
const Packet2ui& a) {
return a; }
930template<> EIGEN_STRONG_INLINE Packet4ui pconj(
const Packet4ui& a) {
return a; }
931template<> EIGEN_STRONG_INLINE Packet2l pconj(
const Packet2l& a) {
return a; }
932template<> EIGEN_STRONG_INLINE Packet2ul pconj(
const Packet2ul& a) {
return a; }
934template<> EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
return vmul_f32(a,b); }
935template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return vmulq_f32(a,b); }
936template<> EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(
const Packet4c& a,
const Packet4c& b)
938 return vget_lane_s32(vreinterpret_s32_s8(vmul_s8(
939 vreinterpret_s8_s32(vdup_n_s32(a)),
940 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
942template<> EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
return vmul_s8(a,b); }
943template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
return vmulq_s8(a,b); }
944template<> EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b)
946 return vget_lane_u32(vreinterpret_u32_u8(vmul_u8(
947 vreinterpret_u8_u32(vdup_n_u32(a)),
948 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
950template<> EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
return vmul_u8(a,b); }
951template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
return vmulq_u8(a,b); }
952template<> EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
return vmul_s16(a,b); }
953template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
return vmulq_s16(a,b); }
954template<> EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
return vmul_u16(a,b); }
955template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
return vmulq_u16(a,b); }
956template<> EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
return vmul_s32(a,b); }
957template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vmulq_s32(a,b); }
958template<> EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
return vmul_u32(a,b); }
959template<> EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
return vmulq_u32(a,b); }
960template<> EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
962 vdup_n_s64(vgetq_lane_s64(a, 0)*vgetq_lane_s64(b, 0)),
963 vdup_n_s64(vgetq_lane_s64(a, 1)*vgetq_lane_s64(b, 1)));
965template<> EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
967 vdup_n_u64(vgetq_lane_u64(a, 0)*vgetq_lane_u64(b, 0)),
968 vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1)));
971template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(
const Packet4c& ,
const Packet4c& )
973 eigen_assert(
false &&
"packet integer division are not supported by NEON");
974 return pset1<Packet4c>(0);
976template<> EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(
const Packet8c& ,
const Packet8c& )
978 eigen_assert(
false &&
"packet integer division are not supported by NEON");
979 return pset1<Packet8c>(0);
981template<> EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(
const Packet16c& ,
const Packet16c& )
983 eigen_assert(
false &&
"packet integer division are not supported by NEON");
984 return pset1<Packet16c>(0);
986template<> EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(
const Packet4uc& ,
const Packet4uc& )
988 eigen_assert(
false &&
"packet integer division are not supported by NEON");
989 return pset1<Packet4uc>(0);
991template<> EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(
const Packet8uc& ,
const Packet8uc& )
993 eigen_assert(
false &&
"packet integer division are not supported by NEON");
994 return pset1<Packet8uc>(0);
996template<> EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(
const Packet16uc& ,
const Packet16uc& )
998 eigen_assert(
false &&
"packet integer division are not supported by NEON");
999 return pset1<Packet16uc>(0);
1001template<> EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(
const Packet4s& ,
const Packet4s& )
1003 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1004 return pset1<Packet4s>(0);
1006template<> EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(
const Packet8s& ,
const Packet8s& )
1008 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1009 return pset1<Packet8s>(0);
1011template<> EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(
const Packet4us& ,
const Packet4us& )
1013 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1014 return pset1<Packet4us>(0);
1016template<> EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(
const Packet8us& ,
const Packet8us& )
1018 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1019 return pset1<Packet8us>(0);
1021template<> EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(
const Packet2i& ,
const Packet2i& )
1023 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1024 return pset1<Packet2i>(0);
1026template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(
const Packet4i& ,
const Packet4i& )
1028 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1029 return pset1<Packet4i>(0);
1031template<> EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(
const Packet2ui& ,
const Packet2ui& )
1033 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1034 return pset1<Packet2ui>(0);
1036template<> EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(
const Packet4ui& ,
const Packet4ui& )
1038 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1039 return pset1<Packet4ui>(0);
1041template<> EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(
const Packet2l& ,
const Packet2l& )
1043 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1044 return pset1<Packet2l>(0LL);
1046template<> EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(
const Packet2ul& ,
const Packet2ul& )
1048 eigen_assert(
false &&
"packet integer division are not supported by NEON");
1049 return pset1<Packet2ul>(0ULL);
1052#ifdef EIGEN_VECTORIZE_FMA
1054EIGEN_STRONG_INLINE Packet4f pmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
1055 return vfmaq_f32(c, a, b);
1058EIGEN_STRONG_INLINE Packet2f pmadd(
const Packet2f& a,
const Packet2f& b,
const Packet2f& c) {
1059 return vfma_f32(c, a, b);
1062template<> EIGEN_STRONG_INLINE Packet4f pmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c)
1064 return vmlaq_f32(c,a,b);
1066template<> EIGEN_STRONG_INLINE Packet2f pmadd(
const Packet2f& a,
const Packet2f& b,
const Packet2f& c)
1068 return vmla_f32(c,a,b);
1073template<> EIGEN_STRONG_INLINE Packet4c pmadd(
const Packet4c& a,
const Packet4c& b,
const Packet4c& c)
1075 return vget_lane_s32(vreinterpret_s32_s8(vmla_s8(
1076 vreinterpret_s8_s32(vdup_n_s32(c)),
1077 vreinterpret_s8_s32(vdup_n_s32(a)),
1078 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1080template<> EIGEN_STRONG_INLINE Packet8c pmadd(
const Packet8c& a,
const Packet8c& b,
const Packet8c& c)
1081{
return vmla_s8(c,a,b); }
1082template<> EIGEN_STRONG_INLINE Packet16c pmadd(
const Packet16c& a,
const Packet16c& b,
const Packet16c& c)
1083{
return vmlaq_s8(c,a,b); }
1084template<> EIGEN_STRONG_INLINE Packet4uc pmadd(
const Packet4uc& a,
const Packet4uc& b,
const Packet4uc& c)
1086 return vget_lane_u32(vreinterpret_u32_u8(vmla_u8(
1087 vreinterpret_u8_u32(vdup_n_u32(c)),
1088 vreinterpret_u8_u32(vdup_n_u32(a)),
1089 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1091template<> EIGEN_STRONG_INLINE Packet8uc pmadd(
const Packet8uc& a,
const Packet8uc& b,
const Packet8uc& c)
1092{
return vmla_u8(c,a,b); }
1093template<> EIGEN_STRONG_INLINE Packet16uc pmadd(
const Packet16uc& a,
const Packet16uc& b,
const Packet16uc& c)
1094{
return vmlaq_u8(c,a,b); }
1095template<> EIGEN_STRONG_INLINE Packet4s pmadd(
const Packet4s& a,
const Packet4s& b,
const Packet4s& c)
1096{
return vmla_s16(c,a,b); }
1097template<> EIGEN_STRONG_INLINE Packet8s pmadd(
const Packet8s& a,
const Packet8s& b,
const Packet8s& c)
1098{
return vmlaq_s16(c,a,b); }
1099template<> EIGEN_STRONG_INLINE Packet4us pmadd(
const Packet4us& a,
const Packet4us& b,
const Packet4us& c)
1100{
return vmla_u16(c,a,b); }
1101template<> EIGEN_STRONG_INLINE Packet8us pmadd(
const Packet8us& a,
const Packet8us& b,
const Packet8us& c)
1102{
return vmlaq_u16(c,a,b); }
1103template<> EIGEN_STRONG_INLINE Packet2i pmadd(
const Packet2i& a,
const Packet2i& b,
const Packet2i& c)
1104{
return vmla_s32(c,a,b); }
1105template<> EIGEN_STRONG_INLINE Packet4i pmadd(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c)
1106{
return vmlaq_s32(c,a,b); }
1107template<> EIGEN_STRONG_INLINE Packet2ui pmadd(
const Packet2ui& a,
const Packet2ui& b,
const Packet2ui& c)
1108{
return vmla_u32(c,a,b); }
1109template<> EIGEN_STRONG_INLINE Packet4ui pmadd(
const Packet4ui& a,
const Packet4ui& b,
const Packet4ui& c)
1110{
return vmlaq_u32(c,a,b); }
1112template<> EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(
const Packet2f& a,
const Packet2f& b)
1113{
return vabd_f32(a,b); }
1114template<> EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(
const Packet4f& a,
const Packet4f& b)
1115{
return vabdq_f32(a,b); }
1116template<> EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(
const Packet4c& a,
const Packet4c& b)
1118 return vget_lane_s32(vreinterpret_s32_s8(vabd_s8(
1119 vreinterpret_s8_s32(vdup_n_s32(a)),
1120 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1122template<> EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(
const Packet8c& a,
const Packet8c& b)
1123{
return vabd_s8(a,b); }
1124template<> EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(
const Packet16c& a,
const Packet16c& b)
1125{
return vabdq_s8(a,b); }
1126template<> EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b)
1128 return vget_lane_u32(vreinterpret_u32_u8(vabd_u8(
1129 vreinterpret_u8_u32(vdup_n_u32(a)),
1130 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1132template<> EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b)
1133{
return vabd_u8(a,b); }
1134template<> EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b)
1135{
return vabdq_u8(a,b); }
1136template<> EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(
const Packet4s& a,
const Packet4s& b)
1137{
return vabd_s16(a,b); }
1138template<> EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(
const Packet8s& a,
const Packet8s& b)
1139{
return vabdq_s16(a,b); }
1140template<> EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(
const Packet4us& a,
const Packet4us& b)
1141{
return vabd_u16(a,b); }
1142template<> EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(
const Packet8us& a,
const Packet8us& b)
1143{
return vabdq_u16(a,b); }
1144template<> EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(
const Packet2i& a,
const Packet2i& b)
1145{
return vabd_s32(a,b); }
1146template<> EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(
const Packet4i& a,
const Packet4i& b)
1147{
return vabdq_s32(a,b); }
1148template<> EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b)
1149{
return vabd_u32(a,b); }
1150template<> EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b)
1151{
return vabdq_u32(a,b); }
1153template<> EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
return vmin_f32(a,b); }
1154template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return vminq_f32(a,b); }
1156#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
1158template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return vminnmq_f32(a, b); }
1159template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(
const Packet2f& a,
const Packet2f& b) {
return vminnm_f32(a, b); }
1162template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return pmin<Packet4f>(a, b); }
1164template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(
const Packet2f& a,
const Packet2f& b) {
return pmin<Packet2f>(a, b); }
// pmin for Packet4c (four int8 values packed in one int32): each operand is
// broadcast with vdup_n_s32 and viewed as int8 lanes, vmin_s8 is applied, and
// lane 0 of the result (viewed as int32 again) is returned.
1166template<> EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(
const Packet4c& a,
const Packet4c& b)
1168 return vget_lane_s32(vreinterpret_s32_s8(vmin_s8(
1169 vreinterpret_s8_s32(vdup_n_s32(a)),
1170 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1172template<> EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
return vmin_s8(a,b); }
1173template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
return vminq_s8(a,b); }
// pmin for Packet4uc (four uint8 values packed in one uint32): each operand is
// broadcast with vdup_n_u32 and viewed as uint8 lanes, vmin_u8 is applied, and
// lane 0 of the result (viewed as uint32 again) is returned.
1174template<> EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b)
1176 return vget_lane_u32(vreinterpret_u32_u8(vmin_u8(
1177 vreinterpret_u8_u32(vdup_n_u32(a)),
1178 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1180template<> EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
return vmin_u8(a,b); }
1181template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
return vminq_u8(a,b); }
1182template<> EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
return vmin_s16(a,b); }
1183template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
return vminq_s16(a,b); }
1184template<> EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
return vmin_u16(a,b); }
1185template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
return vminq_u16(a,b); }
1186template<> EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
return vmin_s32(a,b); }
1187template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vminq_s32(a,b); }
1188template<> EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
return vmin_u32(a,b); }
1189template<> EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
return vminq_u32(a,b); }
// pmin for Packet2l (2 x int64): no vector min intrinsic is used here; each lane
// is extracted with vgetq_lane_s64, reduced with std::min, re-wrapped with
// vdup_n_s64, and the two halves are joined by vcombine_s64.
// (std::min) is parenthesized so a function-like `min` macro cannot expand it.
1190template<> EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1191 return vcombine_s64(
1192 vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
1193 vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
// pmin for Packet2ul (2 x uint64): each lane is extracted with vgetq_lane_u64,
// reduced with std::min, re-wrapped with vdup_n_u64, and the two halves are
// joined by vcombine_u64.
// (std::min) is parenthesized so a function-like `min` macro cannot expand it.
1195template<> EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1196 return vcombine_u64(
1197 vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
1198 vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
1201template<> EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
return vmax_f32(a,b); }
1202template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return vmaxq_f32(a,b); }
1204#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
1206template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return vmaxnmq_f32(a, b); }
1207template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(
const Packet2f& a,
const Packet2f& b) {
return vmaxnm_f32(a, b); }
1210template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return pmax<Packet4f>(a, b); }
1212template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(
const Packet2f& a,
const Packet2f& b) {
return pmax<Packet2f>(a, b); }
// pmax for Packet4c (four int8 values packed in one int32): each operand is
// broadcast with vdup_n_s32 and viewed as int8 lanes, vmax_s8 is applied, and
// lane 0 of the result (viewed as int32 again) is returned.
1214template<> EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(
const Packet4c& a,
const Packet4c& b)
1216 return vget_lane_s32(vreinterpret_s32_s8(vmax_s8(
1217 vreinterpret_s8_s32(vdup_n_s32(a)),
1218 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1220template<> EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
return vmax_s8(a,b); }
1221template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
return vmaxq_s8(a,b); }
// pmax for Packet4uc (four uint8 values packed in one uint32): each operand is
// broadcast with vdup_n_u32 and viewed as uint8 lanes, vmax_u8 is applied, and
// lane 0 of the result (viewed as uint32 again) is returned.
1222template<> EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b)
1224 return vget_lane_u32(vreinterpret_u32_u8(vmax_u8(
1225 vreinterpret_u8_u32(vdup_n_u32(a)),
1226 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1228template<> EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b) {
return vmax_u8(a,b); }
1229template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
return vmaxq_u8(a,b); }
1230template<> EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
return vmax_s16(a,b); }
1231template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
return vmaxq_s16(a,b); }
1232template<> EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(
const Packet4us& a,
const Packet4us& b) {
return vmax_u16(a,b); }
1233template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
return vmaxq_u16(a,b); }
1234template<> EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
return vmax_s32(a,b); }
1235template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vmaxq_s32(a,b); }
1236template<> EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b) {
return vmax_u32(a,b); }
1237template<> EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
return vmaxq_u32(a,b); }
// pmax for Packet2l (2 x int64): each lane is extracted with vgetq_lane_s64,
// reduced with std::max, re-wrapped with vdup_n_s64, and the two halves are
// joined by vcombine_s64.
// (std::max) is parenthesized so a function-like `max` macro cannot expand it.
1238template<> EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1239 return vcombine_s64(
1240 vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
1241 vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
// pmax for Packet2ul (2 x uint64): each lane is extracted with vgetq_lane_u64,
// reduced with std::max, re-wrapped with vdup_n_u64, and the two halves are
// joined by vcombine_u64.
// (std::max) is parenthesized so a function-like `max` macro cannot expand it.
1243template<> EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1244 return vcombine_u64(
1245 vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
1246 vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
1249template<> EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(
const Packet2f& a,
const Packet2f& b)
1250{
return vreinterpret_f32_u32(vcle_f32(a,b)); }
1251template<> EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(
const Packet4f& a,
const Packet4f& b)
1252{
return vreinterpretq_f32_u32(vcleq_f32(a,b)); }
// pcmp_le for Packet4c (four int8 values packed in one int32): each operand is
// broadcast with vdup_n_s32 and viewed as int8 lanes, compared with vcle_s8, and
// lane 0 of the uint8 mask (viewed as int32) is returned.
1253template<> EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(
const Packet4c& a,
const Packet4c& b)
1255 return vget_lane_s32(vreinterpret_s32_u8(vcle_s8(
1256 vreinterpret_s8_s32(vdup_n_s32(a)),
1257 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1259template<> EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(
const Packet8c& a,
const Packet8c& b)
1260{
return vreinterpret_s8_u8(vcle_s8(a,b)); }
1261template<> EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(
const Packet16c& a,
const Packet16c& b)
1262{
return vreinterpretq_s8_u8(vcleq_s8(a,b)); }
// pcmp_le for Packet4uc (four uint8 values packed in one uint32): each operand
// is broadcast with vdup_n_u32 and viewed as uint8 lanes, compared with vcle_u8,
// and lane 0 of the mask (viewed as uint32) is returned.
1263template<> EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b)
1265 return vget_lane_u32(vreinterpret_u32_u8(vcle_u8(
1266 vreinterpret_u8_u32(vdup_n_u32(a)),
1267 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1269template<> EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b)
1270{
return vcle_u8(a,b); }
1271template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b)
1272{
return vcleq_u8(a,b); }
1273template<> EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(
const Packet4s& a,
const Packet4s& b)
1274{
return vreinterpret_s16_u16(vcle_s16(a,b)); }
1275template<> EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(
const Packet8s& a,
const Packet8s& b)
1276{
return vreinterpretq_s16_u16(vcleq_s16(a,b)); }
1277template<> EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(
const Packet4us& a,
const Packet4us& b)
1278{
return vcle_u16(a,b); }
1279template<> EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(
const Packet8us& a,
const Packet8us& b)
1280{
return vcleq_u16(a,b); }
1281template<> EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(
const Packet2i& a,
const Packet2i& b)
1282{
return vreinterpret_s32_u32(vcle_s32(a,b)); }
1283template<> EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(
const Packet4i& a,
const Packet4i& b)
1284{
return vreinterpretq_s32_u32(vcleq_s32(a,b)); }
1285template<> EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b)
1286{
return vcle_u32(a,b); }
1287template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b)
1288{
return vcleq_u32(a,b); }
// pcmp_le for Packet2l (2 x int64). Two implementations appear below: a vector
// compare (vcleq_s64, mask viewed as int64) and a per-lane fallback that writes
// int64_t(-1) for true and 0 for false before recombining with vcombine_s64.
// NOTE(review): the two returns sit back to back here; presumably a preprocessor
// conditional not visible in this excerpt selects one of them — confirm.
1289template<> EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(
const Packet2l& a,
const Packet2l& b)
1292 return vreinterpretq_s64_u64(vcleq_s64(a,b));
1294 return vcombine_s64(
1295 vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1296 vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
// pcmp_le for Packet2ul (2 x uint64). Two implementations appear below: a vector
// compare (vcleq_u64) and a per-lane fallback that writes uint64_t(-1) for true
// and 0 for false before recombining with vcombine_u64.
// NOTE(review): the two returns sit back to back here; presumably a preprocessor
// conditional not visible in this excerpt selects one of them — confirm.
1299template<> EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b)
1302 return vcleq_u64(a,b);
1304 return vcombine_u64(
1305 vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1306 vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1310template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(
const Packet2f& a,
const Packet2f& b)
1311{
return vreinterpret_f32_u32(vclt_f32(a,b)); }
1312template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(
const Packet4f& a,
const Packet4f& b)
1313{
return vreinterpretq_f32_u32(vcltq_f32(a,b)); }
// pcmp_lt for Packet4c (four int8 values packed in one int32): each operand is
// broadcast with vdup_n_s32 and viewed as int8 lanes, compared with vclt_s8, and
// lane 0 of the uint8 mask (viewed as int32) is returned.
1314template<> EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(
const Packet4c& a,
const Packet4c& b)
1316 return vget_lane_s32(vreinterpret_s32_u8(vclt_s8(
1317 vreinterpret_s8_s32(vdup_n_s32(a)),
1318 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1320template<> EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(
const Packet8c& a,
const Packet8c& b)
1321{
return vreinterpret_s8_u8(vclt_s8(a,b)); }
1322template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(
const Packet16c& a,
const Packet16c& b)
1323{
return vreinterpretq_s8_u8(vcltq_s8(a,b)); }
// pcmp_lt for Packet4uc (four uint8 values packed in one uint32): each operand
// is broadcast with vdup_n_u32 and viewed as uint8 lanes, compared with vclt_u8,
// and lane 0 of the mask (viewed as uint32) is returned.
1324template<> EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b)
1326 return vget_lane_u32(vreinterpret_u32_u8(vclt_u8(
1327 vreinterpret_u8_u32(vdup_n_u32(a)),
1328 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1330template<> EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b)
1331{
return vclt_u8(a,b); }
1332template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b)
1333{
return vcltq_u8(a,b); }
1334template<> EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(
const Packet4s& a,
const Packet4s& b)
1335{
return vreinterpret_s16_u16(vclt_s16(a,b)); }
1336template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(
const Packet8s& a,
const Packet8s& b)
1337{
return vreinterpretq_s16_u16(vcltq_s16(a,b)); }
1338template<> EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(
const Packet4us& a,
const Packet4us& b)
1339{
return vclt_u16(a,b); }
1340template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(
const Packet8us& a,
const Packet8us& b)
1341{
return vcltq_u16(a,b); }
1342template<> EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(
const Packet2i& a,
const Packet2i& b)
1343{
return vreinterpret_s32_u32(vclt_s32(a,b)); }
1344template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(
const Packet4i& a,
const Packet4i& b)
1345{
return vreinterpretq_s32_u32(vcltq_s32(a,b)); }
1346template<> EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b)
1347{
return vclt_u32(a,b); }
1348template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b)
1349{
return vcltq_u32(a,b); }
// pcmp_lt for Packet2l (2 x int64). Two implementations appear below: a vector
// compare (vcltq_s64, mask viewed as int64) and a per-lane fallback that writes
// int64_t(-1) for true and 0 for false before recombining with vcombine_s64.
// NOTE(review): the two returns sit back to back here; presumably a preprocessor
// conditional not visible in this excerpt selects one of them — confirm.
1350template<> EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(
const Packet2l& a,
const Packet2l& b)
1353 return vreinterpretq_s64_u64(vcltq_s64(a,b));
1355 return vcombine_s64(
1356 vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1357 vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
// pcmp_lt for Packet2ul (2 x uint64). Two implementations appear below: a vector
// compare (vcltq_u64) and a per-lane fallback that writes uint64_t(-1) for true
// and 0 for false before recombining with vcombine_u64.
// NOTE(review): the two returns sit back to back here; presumably a preprocessor
// conditional not visible in this excerpt selects one of them — confirm.
1360template<> EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b)
1363 return vcltq_u64(a,b);
1365 return vcombine_u64(
1366 vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1367 vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1371template<> EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(
const Packet2f& a,
const Packet2f& b)
1372{
return vreinterpret_f32_u32(vceq_f32(a,b)); }
1373template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(
const Packet4f& a,
const Packet4f& b)
1374{
return vreinterpretq_f32_u32(vceqq_f32(a,b)); }
// pcmp_eq for Packet4c (four int8 values packed in one int32): each operand is
// broadcast with vdup_n_s32 and viewed as int8 lanes, compared with vceq_s8, and
// lane 0 of the uint8 mask (viewed as int32) is returned.
1375template<> EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(
const Packet4c& a,
const Packet4c& b)
1377 return vget_lane_s32(vreinterpret_s32_u8(vceq_s8(
1378 vreinterpret_s8_s32(vdup_n_s32(a)),
1379 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1381template<> EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(
const Packet8c& a,
const Packet8c& b)
1382{
return vreinterpret_s8_u8(vceq_s8(a,b)); }
1383template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(
const Packet16c& a,
const Packet16c& b)
1384{
return vreinterpretq_s8_u8(vceqq_s8(a,b)); }
// pcmp_eq for Packet4uc (four uint8 values packed in one uint32): each operand
// is broadcast with vdup_n_u32 and viewed as uint8 lanes, compared with vceq_u8,
// and lane 0 of the mask (viewed as uint32) is returned.
1385template<> EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b)
1387 return vget_lane_u32(vreinterpret_u32_u8(vceq_u8(
1388 vreinterpret_u8_u32(vdup_n_u32(a)),
1389 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1391template<> EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b)
1392{
return vceq_u8(a,b); }
1393template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b)
1394{
return vceqq_u8(a,b); }
1395template<> EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(
const Packet4s& a,
const Packet4s& b)
1396{
return vreinterpret_s16_u16(vceq_s16(a,b)); }
1397template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(
const Packet8s& a,
const Packet8s& b)
1398{
return vreinterpretq_s16_u16(vceqq_s16(a,b)); }
1399template<> EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(
const Packet4us& a,
const Packet4us& b)
1400{
return vceq_u16(a,b); }
1401template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(
const Packet8us& a,
const Packet8us& b)
1402{
return vceqq_u16(a,b); }
1403template<> EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(
const Packet2i& a,
const Packet2i& b)
1404{
return vreinterpret_s32_u32(vceq_s32(a,b)); }
1405template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(
const Packet4i& a,
const Packet4i& b)
1406{
return vreinterpretq_s32_u32(vceqq_s32(a,b)); }
1407template<> EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b)
1408{
return vceq_u32(a,b); }
1409template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b)
1410{
return vceqq_u32(a,b); }
// pcmp_eq for Packet2l (2 x int64). Two implementations appear below: a vector
// compare (vceqq_s64, mask viewed as int64) and a per-lane fallback that writes
// int64_t(-1) for true and 0 for false before recombining with vcombine_s64.
// NOTE(review): the two returns sit back to back here; presumably a preprocessor
// conditional not visible in this excerpt selects one of them — confirm.
1411template<> EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(
const Packet2l& a,
const Packet2l& b)
1414 return vreinterpretq_s64_u64(vceqq_s64(a,b));
1416 return vcombine_s64(
1417 vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1418 vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
// pcmp_eq for Packet2ul (2 x uint64). Two implementations appear below: a vector
// compare (vceqq_u64) and a per-lane fallback that writes uint64_t(-1) for true
// and 0 for false before recombining with vcombine_u64.
// NOTE(review): the two returns sit back to back here; presumably a preprocessor
// conditional not visible in this excerpt selects one of them — confirm.
1421template<> EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b)
1424 return vceqq_u64(a,b);
1426 return vcombine_u64(
1427 vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1428 vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1432template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(
const Packet2f& a,
const Packet2f& b)
1433{
return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a,b))); }
1434template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(
const Packet4f& a,
const Packet4f& b)
1435{
return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); }
1438template<> EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(
const Packet2f& a,
const Packet2f& b)
1439{
return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
1440template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(
const Packet4f& a,
const Packet4f& b)
1441{
return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
// pand specialization for Packet4c (four int8 values packed in one int32).
// NOTE(review): only the signature is visible in this excerpt; the body is not shown — confirm against the full file.
1442template<> EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(
const Packet4c& a,
const Packet4c& b)
1444template<> EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(
const Packet8c& a,
const Packet8c& b)
1445{
return vand_s8(a,b); }
1446template<> EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(
const Packet16c& a,
const Packet16c& b)
1447{
return vandq_s8(a,b); }
// pand specialization for Packet4uc (four uint8 values packed in one uint32).
// NOTE(review): only the signature is visible in this excerpt; the body is not shown — confirm against the full file.
1448template<> EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b)
1450template<> EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b)
1451{
return vand_u8(a,b); }
1452template<> EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b)
1453{
return vandq_u8(a,b); }
1454template<> EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
return vand_s16(a,b); }
1455template<> EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
return vandq_s16(a,b); }
1456template<> EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(
const Packet4us& a,
const Packet4us& b)
1457{
return vand_u16(a,b); }
1458template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(
const Packet8us& a,
const Packet8us& b)
1459{
return vandq_u16(a,b); }
1460template<> EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
return vand_s32(a,b); }
1461template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vandq_s32(a,b); }
1462template<> EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b)
1463{
return vand_u32(a,b); }
1464template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b)
1465{
return vandq_u32(a,b); }
1466template<> EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
return vandq_s64(a,b); }
1467template<> EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b)
1468{
return vandq_u64(a,b); }
1470template<> EIGEN_STRONG_INLINE Packet2f por<Packet2f>(
const Packet2f& a,
const Packet2f& b)
1471{
return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
1472template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(
const Packet4f& a,
const Packet4f& b)
1473{
return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
// por specialization for Packet4c (four int8 values packed in one int32).
// NOTE(review): only the signature is visible in this excerpt; the body is not shown — confirm against the full file.
1474template<> EIGEN_STRONG_INLINE Packet4c por<Packet4c>(
const Packet4c& a,
const Packet4c& b)
1476template<> EIGEN_STRONG_INLINE Packet8c por<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
return vorr_s8(a,b); }
1477template<> EIGEN_STRONG_INLINE Packet16c por<Packet16c>(
const Packet16c& a,
const Packet16c& b)
1478{
return vorrq_s8(a,b); }
// por specialization for Packet4uc (four uint8 values packed in one uint32).
// NOTE(review): only the signature is visible in this excerpt; the body is not shown — confirm against the full file.
1479template<> EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b)
1481template<> EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b)
1482{
return vorr_u8(a,b); }
1483template<> EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b)
1484{
return vorrq_u8(a,b); }
1485template<> EIGEN_STRONG_INLINE Packet4s por<Packet4s>(
const Packet4s& a,
const Packet4s& b)
1486{
return vorr_s16(a,b); }
1487template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(
const Packet8s& a,
const Packet8s& b)
1488{
return vorrq_s16(a,b); }
1489template<> EIGEN_STRONG_INLINE Packet4us por<Packet4us>(
const Packet4us& a,
const Packet4us& b)
1490{
return vorr_u16(a,b); }
1491template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(
const Packet8us& a,
const Packet8us& b)
1492{
return vorrq_u16(a,b); }
1493template<> EIGEN_STRONG_INLINE Packet2i por<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
return vorr_s32(a,b); }
1494template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vorrq_s32(a,b); }
1495template<> EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b)
1496{
return vorr_u32(a,b); }
1497template<> EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b)
1498{
return vorrq_u32(a,b); }
1499template<> EIGEN_STRONG_INLINE Packet2l por<Packet2l>(
const Packet2l& a,
const Packet2l& b)
1500{
return vorrq_s64(a,b); }
1501template<> EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b)
1502{
return vorrq_u64(a,b); }
1504template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(
const Packet2f& a,
const Packet2f& b)
1505{
return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
1506template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b)
1507{
return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
// pxor specialization for Packet4c (four int8 values packed in one int32).
// NOTE(review): only the signature is visible in this excerpt; the body is not shown — confirm against the full file.
1508template<> EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(
const Packet4c& a,
const Packet4c& b)
1510template<> EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(
const Packet8c& a,
const Packet8c& b)
1511{
return veor_s8(a,b); }
1512template<> EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(
const Packet16c& a,
const Packet16c& b)
1513{
return veorq_s8(a,b); }
// pxor specialization for Packet4uc (four uint8 values packed in one uint32).
// NOTE(review): only the signature is visible in this excerpt; the body is not shown — confirm against the full file.
1514template<> EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b)
1516template<> EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b)
1517{
return veor_u8(a,b); }
1518template<> EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b)
1519{
return veorq_u8(a,b); }
1520template<> EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(
const Packet4s& a,
const Packet4s& b) {
return veor_s16(a,b); }
1521template<> EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
return veorq_s16(a,b); }
1522template<> EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(
const Packet4us& a,
const Packet4us& b)
1523{
return veor_u16(a,b); }
1524template<> EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(
const Packet8us& a,
const Packet8us& b)
1525{
return veorq_u16(a,b); }
1526template<> EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(
const Packet2i& a,
const Packet2i& b) {
return veor_s32(a,b); }
1527template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return veorq_s32(a,b); }
1528template<> EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b)
1529{
return veor_u32(a,b); }
1530template<> EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b)
1531{
return veorq_u32(a,b); }
1532template<> EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(
const Packet2l& a,
const Packet2l& b)
1533{
return veorq_s64(a,b); }
1534template<> EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b)
1535{
return veorq_u64(a,b); }
1537template<> EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(
const Packet2f& a,
const Packet2f& b)
1538{
return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
1539template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(
const Packet4f& a,
const Packet4f& b)
1540{
return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
// pandnot specialization for Packet4c (four int8 values packed in one int32).
// NOTE(review): only the signature is visible in this excerpt; the body is not shown — confirm against the full file.
1541template<> EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(
const Packet4c& a,
const Packet4c& b)
1543template<> EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(
const Packet8c& a,
const Packet8c& b) {
return vbic_s8(a,b); }
1544template<> EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
return vbicq_s8(a,b); }
// pandnot specialization for Packet4uc (four uint8 values packed in one uint32).
// NOTE(review): only the signature is visible in this excerpt; the body is not shown — confirm against the full file.
1545template<> EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(
const Packet4uc& a,
const Packet4uc& b)
1547template<> EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(
const Packet8uc& a,
const Packet8uc& b)
1548{
return vbic_u8(a,b); }
1549template<> EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b)
1550{
return vbicq_u8(a,b); }
1551template<> EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(
const Packet4s& a,
const Packet4s& b)
1552{
return vbic_s16(a,b); }
1553template<> EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(
const Packet8s& a,
const Packet8s& b)
1554{
return vbicq_s16(a,b); }
1555template<> EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(
const Packet4us& a,
const Packet4us& b)
1556{
return vbic_u16(a,b); }
1557template<> EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(
const Packet8us& a,
const Packet8us& b)
1558{
return vbicq_u16(a,b); }
1559template<> EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(
const Packet2i& a,
const Packet2i& b)
1560{
return vbic_s32(a,b); }
1561template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(
const Packet4i& a,
const Packet4i& b)
1562{
return vbicq_s32(a,b); }
1563template<> EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(
const Packet2ui& a,
const Packet2ui& b)
1564{
return vbic_u32(a,b); }
1565template<> EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b)
1566{
return vbicq_u32(a,b); }
1567template<> EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(
const Packet2l& a,
const Packet2l& b)
1568{
return vbicq_s64(a,b); }
1569template<> EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b)
1570{
return vbicq_u64(a,b); }
1573template<
int N> EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a)
1574{
return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
1575template<
int N> EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) {
return vshr_n_s8(a,N); }
1576template<
int N> EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) {
return vshrq_n_s8(a,N); }
1577template<
int N> EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a)
1578{
return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
1579template<
int N> EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) {
return vshr_n_u8(a,N); }
1580template<
int N> EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) {
return vshrq_n_u8(a,N); }
1581template<
int N> EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) {
return vshr_n_s16(a,N); }
1582template<
int N> EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) {
return vshrq_n_s16(a,N); }
1583template<
int N> EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) {
return vshr_n_u16(a,N); }
1584template<
int N> EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) {
return vshrq_n_u16(a,N); }
1585template<
int N> EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) {
return vshr_n_s32(a,N); }
1586template<
int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) {
return vshrq_n_s32(a,N); }
1587template<
int N> EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) {
return vshr_n_u32(a,N); }
1588template<
int N> EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) {
return vshrq_n_u32(a,N); }
1589template<
int N> EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) {
return vshrq_n_s64(a,N); }
1590template<
int N> EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) {
return vshrq_n_u64(a,N); }
1592template<
int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a)
1593{
return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0); }
1594template<
int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a)
1595{
return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a),N)); }
1596template<
int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a)
1597{
return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a),N)); }
1598template<
int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a)
1599{
return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0); }
1600template<
int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) {
return vshr_n_u8(a,N); }
1601template<
int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) {
return vshrq_n_u8(a,N); }
1602template<
int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a)
1603{
return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a),N)); }
1604template<
int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a)
1605{
return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a),N)); }
1606template<
int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) {
return vshr_n_u16(a,N); }
1607template<
int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) {
return vshrq_n_u16(a,N); }
1608template<
int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a)
1609{
return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a),N)); }
1610template<
int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a)
1611{
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a),N)); }
1612template<
int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) {
return vshr_n_u32(a,N); }
1613template<
int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) {
return vshrq_n_u32(a,N); }
1614template<
int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a)
1615{
return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a),N)); }
1616template<
int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) {
return vshrq_n_u64(a,N); }
1618template<
int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a)
1619{
return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
1620template<
int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) {
return vshl_n_s8(a,N); }
1621template<
int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) {
return vshlq_n_s8(a,N); }
1622template<
int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a)
1623{
return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
1624template<
int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) {
return vshl_n_u8(a,N); }
1625template<
int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) {
return vshlq_n_u8(a,N); }
1626template<
int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) {
return vshl_n_s16(a,N); }
1627template<
int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) {
return vshlq_n_s16(a,N); }
1628template<
int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) {
return vshl_n_u16(a,N); }
1629template<
int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) {
return vshlq_n_u16(a,N); }
1630template<
int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) {
return vshl_n_s32(a,N); }
1631template<
int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) {
return vshlq_n_s32(a,N); }
1632template<
int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) {
return vshl_n_u32(a,N); }
1633template<
int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) {
return vshlq_n_u32(a,N); }
1634template<
int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) {
return vshlq_n_s64(a,N); }
1635template<
int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) {
return vshlq_n_u64(a,N); }
1637template<> EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(
const float* from)
1638{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1_f32(from); }
1639template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(
const float* from)
1640{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_f32(from); }
1641template<> EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(
const int8_t* from)
1644 memcpy(&res, from,
sizeof(Packet4c));
1647template<> EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(
const int8_t* from)
1648{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1_s8(from); }
1649template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(
const int8_t* from)
1650{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_s8(from); }
1651template<> EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(
const uint8_t* from)
1654 memcpy(&res, from,
sizeof(Packet4uc));
1657template<> EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(
const uint8_t* from)
1658{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1_u8(from); }
1659template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(
const uint8_t* from)
1660{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_u8(from); }
1661template<> EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(
const int16_t* from)
1662{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1_s16(from); }
1663template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(
const int16_t* from)
1664{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_s16(from); }
1665template<> EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(
const uint16_t* from)
1666{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1_u16(from); }
1667template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(
const uint16_t* from)
1668{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_u16(from); }
1669template<> EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(
const int32_t* from)
1670{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1_s32(from); }
1671template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(
const int32_t* from)
1672{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_s32(from); }
1673template<> EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(
const uint32_t* from)
1674{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1_u32(from); }
1675template<> EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(
const uint32_t* from)
1676{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_u32(from); }
1677template<> EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(
const int64_t* from)
1678{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_s64(from); }
1679template<> EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(
const uint64_t* from)
1680{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_u64(from); }
1682template<> EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(
const float* from)
1683{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1_f32(from); }
1684template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(
const float* from)
1685{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_f32(from); }
1686template<> EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(
const int8_t* from)
1689 memcpy(&res, from,
sizeof(Packet4c));
1692template<> EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(
const int8_t* from)
1693{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1_s8(from); }
1694template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(
const int8_t* from)
1695{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_s8(from); }
1696template<> EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(
const uint8_t* from)
1699 memcpy(&res, from,
sizeof(Packet4uc));
1702template<> EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(
const uint8_t* from)
1703{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1_u8(from); }
1704template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(
const uint8_t* from)
1705{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_u8(from); }
1706template<> EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(
const int16_t* from)
1707{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1_s16(from); }
1708template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(
const int16_t* from)
1709{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_s16(from); }
1710template<> EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(
const uint16_t* from)
1711{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1_u16(from); }
1712template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(
const uint16_t* from)
1713{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_u16(from); }
1714template<> EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(
const int32_t* from)
1715{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1_s32(from); }
1716template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(
const int32_t* from)
1717{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_s32(from); }
1718template<> EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(
const uint32_t* from)
1719{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1_u32(from); }
1720template<> EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(
const uint32_t* from)
1721{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_u32(from); }
1722template<> EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(
const int64_t* from)
1723{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_s64(from); }
1724template<> EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(
const uint64_t* from)
1725{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_u64(from); }
1727template<> EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(
const float* from)
1728{
return vld1_dup_f32(from); }
1729template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(
const float* from)
1730{
return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from+1)); }
1731template<> EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(
const int8_t* from)
1733 const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload<Packet4c>(from)));
1734 return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a,a).val[0]), 0);
1736template<> EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(
const int8_t* from)
1738 const int8x8_t a = vld1_s8(from);
1739 return vzip_s8(a,a).val[0];
1741template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(
const int8_t* from)
1743 const int8x8_t a = vld1_s8(from);
1744 const int8x8x2_t b = vzip_s8(a,a);
1745 return vcombine_s8(b.val[0], b.val[1]);
1747template<> EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(
const uint8_t* from)
1749 const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload<Packet4uc>(from)));
1750 return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a,a).val[0]), 0);
1752template<> EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(
const uint8_t* from)
1754 const uint8x8_t a = vld1_u8(from);
1755 return vzip_u8(a,a).val[0];
1757template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(
const uint8_t* from)
1759 const uint8x8_t a = vld1_u8(from);
1760 const uint8x8x2_t b = vzip_u8(a,a);
1761 return vcombine_u8(b.val[0], b.val[1]);
1763template<> EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(
const int16_t* from)
1765 return vreinterpret_s16_u32(vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)),
1766 vreinterpret_u32_s16(vld1_dup_s16(from+1))).val[0]);
1768template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(
const int16_t* from)
1770 const int16x4_t a = vld1_s16(from);
1771 const int16x4x2_t b = vzip_s16(a,a);
1772 return vcombine_s16(b.val[0], b.val[1]);
1774template<> EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(
const uint16_t* from)
1776 return vreinterpret_u16_u32(vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)),
1777 vreinterpret_u32_u16(vld1_dup_u16(from+1))).val[0]);
1779template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(
const uint16_t* from)
1781 const uint16x4_t a = vld1_u16(from);
1782 const uint16x4x2_t b = vzip_u16(a,a);
1783 return vcombine_u16(b.val[0], b.val[1]);
1785template<> EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(
const int32_t* from)
1786{
return vld1_dup_s32(from); }
1787template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(
const int32_t* from)
1788{
return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from+1)); }
1789template<> EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(
const uint32_t* from)
1790{
return vld1_dup_u32(from); }
1791template<> EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(
const uint32_t* from)
1792{
return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from+1)); }
1793template<> EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(
const int64_t* from)
1794{
return vld1q_dup_s64(from); }
1795template<> EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(
const uint64_t* from)
1796{
return vld1q_dup_u64(from); }
1798template<> EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(
const float* from) {
return vld1q_dup_f32(from); }
1799template<> EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(
const int8_t* from)
1800{
return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0); }
1801template<> EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(
const int8_t* from)
1803 return vreinterpret_s8_u32(vzip_u32(
1804 vreinterpret_u32_s8(vld1_dup_s8(from)),
1805 vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
1807template<> EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(
const int8_t* from)
1809 const int8x8_t a = vreinterpret_s8_u32(vzip_u32(
1810 vreinterpret_u32_s8(vld1_dup_s8(from)),
1811 vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
1812 const int8x8_t b = vreinterpret_s8_u32(vzip_u32(
1813 vreinterpret_u32_s8(vld1_dup_s8(from+2)),
1814 vreinterpret_u32_s8(vld1_dup_s8(from+3))).val[0]);
1815 return vcombine_s8(a,b);
1817template<> EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(
const uint8_t* from)
1818{
return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0); }
1819template<> EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(
const uint8_t* from)
1821 return vreinterpret_u8_u32(vzip_u32(
1822 vreinterpret_u32_u8(vld1_dup_u8(from)),
1823 vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
1825template<> EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(
const uint8_t* from)
1827 const uint8x8_t a = vreinterpret_u8_u32(vzip_u32(
1828 vreinterpret_u32_u8(vld1_dup_u8(from)),
1829 vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
1830 const uint8x8_t b = vreinterpret_u8_u32(vzip_u32(
1831 vreinterpret_u32_u8(vld1_dup_u8(from+2)),
1832 vreinterpret_u32_u8(vld1_dup_u8(from+3))).val[0]);
1833 return vcombine_u8(a,b);
1835template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(
const int16_t* from)
1836{
return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from+1)); }
1837template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(
const uint16_t* from)
1838{
return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from+1)); }
1839template<> EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(
const int32_t* from) {
return vld1q_dup_s32(from); }
1840template<> EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(
const uint32_t* from) {
return vld1q_dup_u32(from); }
1842template<> EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet2f& from)
1843{ EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to,from); }
1844template<> EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet4f& from)
1845{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to,from); }
1846template<> EIGEN_STRONG_INLINE
void pstore<int8_t>(int8_t* to,
const Packet4c& from)
1847{ memcpy(to, &from,
sizeof(from)); }
1848template<> EIGEN_STRONG_INLINE
void pstore<int8_t>(int8_t* to,
const Packet8c& from)
1849{ EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to,from); }
1850template<> EIGEN_STRONG_INLINE
void pstore<int8_t>(int8_t* to,
const Packet16c& from)
1851{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to,from); }
1852template<> EIGEN_STRONG_INLINE
void pstore<uint8_t>(uint8_t* to,
const Packet4uc& from)
1853{ memcpy(to, &from,
sizeof(from)); }
1854template<> EIGEN_STRONG_INLINE
void pstore<uint8_t>(uint8_t* to,
const Packet8uc& from)
1855{ EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to,from); }
1856template<> EIGEN_STRONG_INLINE
void pstore<uint8_t>(uint8_t* to,
const Packet16uc& from)
1857{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to,from); }
1858template<> EIGEN_STRONG_INLINE
void pstore<int16_t>(int16_t* to,
const Packet4s& from)
1859{ EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to,from); }
1860template<> EIGEN_STRONG_INLINE
void pstore<int16_t>(int16_t* to,
const Packet8s& from)
1861{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to,from); }
1862template<> EIGEN_STRONG_INLINE
void pstore<uint16_t>(uint16_t* to,
const Packet4us& from)
1863{ EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to,from); }
1864template<> EIGEN_STRONG_INLINE
void pstore<uint16_t>(uint16_t* to,
const Packet8us& from)
1865{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to,from); }
1866template<> EIGEN_STRONG_INLINE
void pstore<int32_t>(int32_t* to,
const Packet2i& from)
1867{ EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to,from); }
1868template<> EIGEN_STRONG_INLINE
void pstore<int32_t>(int32_t* to,
const Packet4i& from)
1869{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to,from); }
1870template<> EIGEN_STRONG_INLINE
void pstore<uint32_t>(uint32_t* to,
const Packet2ui& from)
1871{ EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to,from); }
1872template<> EIGEN_STRONG_INLINE
void pstore<uint32_t>(uint32_t* to,
const Packet4ui& from)
1873{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to,from); }
1874template<> EIGEN_STRONG_INLINE
void pstore<int64_t>(int64_t* to,
const Packet2l& from)
1875{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to,from); }
1876template<> EIGEN_STRONG_INLINE
void pstore<uint64_t>(uint64_t* to,
const Packet2ul& from)
1877{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to,from); }
1879template<> EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet2f& from)
1880{ EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to,from); }
1881template<> EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet4f& from)
1882{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to,from); }
1883template<> EIGEN_STRONG_INLINE
void pstoreu<int8_t>(int8_t* to,
const Packet4c& from)
1884{ memcpy(to, &from,
sizeof(from)); }
1885template<> EIGEN_STRONG_INLINE
void pstoreu<int8_t>(int8_t* to,
const Packet8c& from)
1886{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to,from); }
1887template<> EIGEN_STRONG_INLINE
void pstoreu<int8_t>(int8_t* to,
const Packet16c& from)
1888{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to,from); }
1889template<> EIGEN_STRONG_INLINE
void pstoreu<uint8_t>(uint8_t* to,
const Packet4uc& from)
1890{ memcpy(to, &from,
sizeof(from)); }
1891template<> EIGEN_STRONG_INLINE
void pstoreu<uint8_t>(uint8_t* to,
const Packet8uc& from)
1892{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to,from); }
1893template<> EIGEN_STRONG_INLINE
void pstoreu<uint8_t>(uint8_t* to,
const Packet16uc& from)
1894{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to,from); }
1895template<> EIGEN_STRONG_INLINE
void pstoreu<int16_t>(int16_t* to,
const Packet4s& from)
1896{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to,from); }
1897template<> EIGEN_STRONG_INLINE
void pstoreu<int16_t>(int16_t* to,
const Packet8s& from)
1898{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to,from); }
1899template<> EIGEN_STRONG_INLINE
void pstoreu<uint16_t>(uint16_t* to,
const Packet4us& from)
1900{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to,from); }
1901template<> EIGEN_STRONG_INLINE
void pstoreu<uint16_t>(uint16_t* to,
const Packet8us& from)
1902{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to,from); }
1903template<> EIGEN_STRONG_INLINE
void pstoreu<int32_t>(int32_t* to,
const Packet2i& from)
1904{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to,from); }
1905template<> EIGEN_STRONG_INLINE
void pstoreu<int32_t>(int32_t* to,
const Packet4i& from)
1906{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to,from); }
1907template<> EIGEN_STRONG_INLINE
void pstoreu<uint32_t>(uint32_t* to,
const Packet2ui& from)
1908{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to,from); }
1909template<> EIGEN_STRONG_INLINE
void pstoreu<uint32_t>(uint32_t* to,
const Packet4ui& from)
1910{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to,from); }
1911template<> EIGEN_STRONG_INLINE
void pstoreu<int64_t>(int64_t* to,
const Packet2l& from)
1912{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to,from); }
1913template<> EIGEN_STRONG_INLINE
void pstoreu<uint64_t>(uint64_t* to,
const Packet2ul& from)
1914{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to,from); }
1916template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(
const float* from,
Index stride)
1918 Packet2f res = vld1_dup_f32(from);
1919 res = vld1_lane_f32(from + 1*stride, res, 1);
1922template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(
const float* from,
Index stride)
1924 Packet4f res = vld1q_dup_f32(from);
1925 res = vld1q_lane_f32(from + 1*stride, res, 1);
1926 res = vld1q_lane_f32(from + 2*stride, res, 2);
1927 res = vld1q_lane_f32(from + 3*stride, res, 3);
1930template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(
const int8_t* from,
Index stride)
1933 for (
int i = 0; i != 4; i++)
1934 reinterpret_cast<int8_t*
>(&res)[i] = *(from + i * stride);
1937template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(
const int8_t* from,
Index stride)
1939 Packet8c res = vld1_dup_s8(from);
1940 res = vld1_lane_s8(from + 1*stride, res, 1);
1941 res = vld1_lane_s8(from + 2*stride, res, 2);
1942 res = vld1_lane_s8(from + 3*stride, res, 3);
1943 res = vld1_lane_s8(from + 4*stride, res, 4);
1944 res = vld1_lane_s8(from + 5*stride, res, 5);
1945 res = vld1_lane_s8(from + 6*stride, res, 6);
1946 res = vld1_lane_s8(from + 7*stride, res, 7);
1949template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(
const int8_t* from,
Index stride)
1951 Packet16c res = vld1q_dup_s8(from);
1952 res = vld1q_lane_s8(from + 1*stride, res, 1);
1953 res = vld1q_lane_s8(from + 2*stride, res, 2);
1954 res = vld1q_lane_s8(from + 3*stride, res, 3);
1955 res = vld1q_lane_s8(from + 4*stride, res, 4);
1956 res = vld1q_lane_s8(from + 5*stride, res, 5);
1957 res = vld1q_lane_s8(from + 6*stride, res, 6);
1958 res = vld1q_lane_s8(from + 7*stride, res, 7);
1959 res = vld1q_lane_s8(from + 8*stride, res, 8);
1960 res = vld1q_lane_s8(from + 9*stride, res, 9);
1961 res = vld1q_lane_s8(from + 10*stride, res, 10);
1962 res = vld1q_lane_s8(from + 11*stride, res, 11);
1963 res = vld1q_lane_s8(from + 12*stride, res, 12);
1964 res = vld1q_lane_s8(from + 13*stride, res, 13);
1965 res = vld1q_lane_s8(from + 14*stride, res, 14);
1966 res = vld1q_lane_s8(from + 15*stride, res, 15);
1969template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(
const uint8_t* from,
Index stride)
1972 for (
int i = 0; i != 4; i++)
1973 reinterpret_cast<uint8_t*
>(&res)[i] = *(from + i * stride);
1976template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(
const uint8_t* from,
Index stride)
1978 Packet8uc res = vld1_dup_u8(from);
1979 res = vld1_lane_u8(from + 1*stride, res, 1);
1980 res = vld1_lane_u8(from + 2*stride, res, 2);
1981 res = vld1_lane_u8(from + 3*stride, res, 3);
1982 res = vld1_lane_u8(from + 4*stride, res, 4);
1983 res = vld1_lane_u8(from + 5*stride, res, 5);
1984 res = vld1_lane_u8(from + 6*stride, res, 6);
1985 res = vld1_lane_u8(from + 7*stride, res, 7);
1988template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(
const uint8_t* from,
Index stride)
1990 Packet16uc res = vld1q_dup_u8(from);
1991 res = vld1q_lane_u8(from + 1*stride, res, 1);
1992 res = vld1q_lane_u8(from + 2*stride, res, 2);
1993 res = vld1q_lane_u8(from + 3*stride, res, 3);
1994 res = vld1q_lane_u8(from + 4*stride, res, 4);
1995 res = vld1q_lane_u8(from + 5*stride, res, 5);
1996 res = vld1q_lane_u8(from + 6*stride, res, 6);
1997 res = vld1q_lane_u8(from + 7*stride, res, 7);
1998 res = vld1q_lane_u8(from + 8*stride, res, 8);
1999 res = vld1q_lane_u8(from + 9*stride, res, 9);
2000 res = vld1q_lane_u8(from + 10*stride, res, 10);
2001 res = vld1q_lane_u8(from + 11*stride, res, 11);
2002 res = vld1q_lane_u8(from + 12*stride, res, 12);
2003 res = vld1q_lane_u8(from + 13*stride, res, 13);
2004 res = vld1q_lane_u8(from + 14*stride, res, 14);
2005 res = vld1q_lane_u8(from + 15*stride, res, 15);
2008template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(
const int16_t* from,
Index stride)
2010 Packet4s res = vld1_dup_s16(from);
2011 res = vld1_lane_s16(from + 1*stride, res, 1);
2012 res = vld1_lane_s16(from + 2*stride, res, 2);
2013 res = vld1_lane_s16(from + 3*stride, res, 3);
2016template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(
const int16_t* from,
Index stride)
2018 Packet8s res = vld1q_dup_s16(from);
2019 res = vld1q_lane_s16(from + 1*stride, res, 1);
2020 res = vld1q_lane_s16(from + 2*stride, res, 2);
2021 res = vld1q_lane_s16(from + 3*stride, res, 3);
2022 res = vld1q_lane_s16(from + 4*stride, res, 4);
2023 res = vld1q_lane_s16(from + 5*stride, res, 5);
2024 res = vld1q_lane_s16(from + 6*stride, res, 6);
2025 res = vld1q_lane_s16(from + 7*stride, res, 7);
2028template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(
const uint16_t* from,
Index stride)
2030 Packet4us res = vld1_dup_u16(from);
2031 res = vld1_lane_u16(from + 1*stride, res, 1);
2032 res = vld1_lane_u16(from + 2*stride, res, 2);
2033 res = vld1_lane_u16(from + 3*stride, res, 3);
2036template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(
const uint16_t* from,
Index stride)
2038 Packet8us res = vld1q_dup_u16(from);
2039 res = vld1q_lane_u16(from + 1*stride, res, 1);
2040 res = vld1q_lane_u16(from + 2*stride, res, 2);
2041 res = vld1q_lane_u16(from + 3*stride, res, 3);
2042 res = vld1q_lane_u16(from + 4*stride, res, 4);
2043 res = vld1q_lane_u16(from + 5*stride, res, 5);
2044 res = vld1q_lane_u16(from + 6*stride, res, 6);
2045 res = vld1q_lane_u16(from + 7*stride, res, 7);
2048template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(
const int32_t* from,
Index stride)
2050 Packet2i res = vld1_dup_s32(from);
2051 res = vld1_lane_s32(from + 1*stride, res, 1);
2054template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(
const int32_t* from,
Index stride)
2056 Packet4i res = vld1q_dup_s32(from);
2057 res = vld1q_lane_s32(from + 1*stride, res, 1);
2058 res = vld1q_lane_s32(from + 2*stride, res, 2);
2059 res = vld1q_lane_s32(from + 3*stride, res, 3);
2062template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(
const uint32_t* from,
Index stride)
2064 Packet2ui res = vld1_dup_u32(from);
2065 res = vld1_lane_u32(from + 1*stride, res, 1);
2068template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(
const uint32_t* from,
Index stride)
2070 Packet4ui res = vld1q_dup_u32(from);
2071 res = vld1q_lane_u32(from + 1*stride, res, 1);
2072 res = vld1q_lane_u32(from + 2*stride, res, 2);
2073 res = vld1q_lane_u32(from + 3*stride, res, 3);
2076template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(
const int64_t* from,
Index stride)
2078 Packet2l res = vld1q_dup_s64(from);
2079 res = vld1q_lane_s64(from + 1*stride, res, 1);
2082template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(
const uint64_t* from,
Index stride)
2084 Packet2ul res = vld1q_dup_u64(from);
2085 res = vld1q_lane_u64(from + 1*stride, res, 1);
2089template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<float, Packet2f>(
float* to,
const Packet2f& from,
Index stride)
2091 vst1_lane_f32(to + stride*0, from, 0);
2092 vst1_lane_f32(to + stride*1, from, 1);
2094template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<float, Packet4f>(
float* to,
const Packet4f& from,
Index stride)
2096 vst1q_lane_f32(to + stride*0, from, 0);
2097 vst1q_lane_f32(to + stride*1, from, 1);
2098 vst1q_lane_f32(to + stride*2, from, 2);
2099 vst1q_lane_f32(to + stride*3, from, 3);
2101template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int8_t, Packet4c>(int8_t* to,
const Packet4c& from,
Index stride)
2103 for (
int i = 0; i != 4; i++)
2104 *(to + i * stride) =
reinterpret_cast<const int8_t*
>(&from)[i];
2106template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int8_t, Packet8c>(int8_t* to,
const Packet8c& from,
Index stride)
2108 vst1_lane_s8(to + stride*0, from, 0);
2109 vst1_lane_s8(to + stride*1, from, 1);
2110 vst1_lane_s8(to + stride*2, from, 2);
2111 vst1_lane_s8(to + stride*3, from, 3);
2112 vst1_lane_s8(to + stride*4, from, 4);
2113 vst1_lane_s8(to + stride*5, from, 5);
2114 vst1_lane_s8(to + stride*6, from, 6);
2115 vst1_lane_s8(to + stride*7, from, 7);
2117template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int8_t, Packet16c>(int8_t* to,
const Packet16c& from,
Index stride)
2119 vst1q_lane_s8(to + stride*0, from, 0);
2120 vst1q_lane_s8(to + stride*1, from, 1);
2121 vst1q_lane_s8(to + stride*2, from, 2);
2122 vst1q_lane_s8(to + stride*3, from, 3);
2123 vst1q_lane_s8(to + stride*4, from, 4);
2124 vst1q_lane_s8(to + stride*5, from, 5);
2125 vst1q_lane_s8(to + stride*6, from, 6);
2126 vst1q_lane_s8(to + stride*7, from, 7);
2127 vst1q_lane_s8(to + stride*8, from, 8);
2128 vst1q_lane_s8(to + stride*9, from, 9);
2129 vst1q_lane_s8(to + stride*10, from, 10);
2130 vst1q_lane_s8(to + stride*11, from, 11);
2131 vst1q_lane_s8(to + stride*12, from, 12);
2132 vst1q_lane_s8(to + stride*13, from, 13);
2133 vst1q_lane_s8(to + stride*14, from, 14);
2134 vst1q_lane_s8(to + stride*15, from, 15);
2136template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint8_t, Packet4uc>(uint8_t* to,
const Packet4uc& from,
Index stride)
2138 for (
int i = 0; i != 4; i++)
2139 *(to + i * stride) =
reinterpret_cast<const uint8_t*
>(&from)[i];
2141template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint8_t, Packet8uc>(uint8_t* to,
const Packet8uc& from,
Index stride)
2143 vst1_lane_u8(to + stride*0, from, 0);
2144 vst1_lane_u8(to + stride*1, from, 1);
2145 vst1_lane_u8(to + stride*2, from, 2);
2146 vst1_lane_u8(to + stride*3, from, 3);
2147 vst1_lane_u8(to + stride*4, from, 4);
2148 vst1_lane_u8(to + stride*5, from, 5);
2149 vst1_lane_u8(to + stride*6, from, 6);
2150 vst1_lane_u8(to + stride*7, from, 7);
2152template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint8_t, Packet16uc>(uint8_t* to,
const Packet16uc& from,
Index stride)
2154 vst1q_lane_u8(to + stride*0, from, 0);
2155 vst1q_lane_u8(to + stride*1, from, 1);
2156 vst1q_lane_u8(to + stride*2, from, 2);
2157 vst1q_lane_u8(to + stride*3, from, 3);
2158 vst1q_lane_u8(to + stride*4, from, 4);
2159 vst1q_lane_u8(to + stride*5, from, 5);
2160 vst1q_lane_u8(to + stride*6, from, 6);
2161 vst1q_lane_u8(to + stride*7, from, 7);
2162 vst1q_lane_u8(to + stride*8, from, 8);
2163 vst1q_lane_u8(to + stride*9, from, 9);
2164 vst1q_lane_u8(to + stride*10, from, 10);
2165 vst1q_lane_u8(to + stride*11, from, 11);
2166 vst1q_lane_u8(to + stride*12, from, 12);
2167 vst1q_lane_u8(to + stride*13, from, 13);
2168 vst1q_lane_u8(to + stride*14, from, 14);
2169 vst1q_lane_u8(to + stride*15, from, 15);
2171template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int16_t, Packet4s>(int16_t* to,
const Packet4s& from,
Index stride)
2173 vst1_lane_s16(to + stride*0, from, 0);
2174 vst1_lane_s16(to + stride*1, from, 1);
2175 vst1_lane_s16(to + stride*2, from, 2);
2176 vst1_lane_s16(to + stride*3, from, 3);
2178template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int16_t, Packet8s>(int16_t* to,
const Packet8s& from,
Index stride)
2180 vst1q_lane_s16(to + stride*0, from, 0);
2181 vst1q_lane_s16(to + stride*1, from, 1);
2182 vst1q_lane_s16(to + stride*2, from, 2);
2183 vst1q_lane_s16(to + stride*3, from, 3);
2184 vst1q_lane_s16(to + stride*4, from, 4);
2185 vst1q_lane_s16(to + stride*5, from, 5);
2186 vst1q_lane_s16(to + stride*6, from, 6);
2187 vst1q_lane_s16(to + stride*7, from, 7);
2189template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint16_t, Packet4us>(uint16_t* to,
const Packet4us& from,
Index stride)
2191 vst1_lane_u16(to + stride*0, from, 0);
2192 vst1_lane_u16(to + stride*1, from, 1);
2193 vst1_lane_u16(to + stride*2, from, 2);
2194 vst1_lane_u16(to + stride*3, from, 3);
2196template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint16_t, Packet8us>(uint16_t* to,
const Packet8us& from,
Index stride)
2198 vst1q_lane_u16(to + stride*0, from, 0);
2199 vst1q_lane_u16(to + stride*1, from, 1);
2200 vst1q_lane_u16(to + stride*2, from, 2);
2201 vst1q_lane_u16(to + stride*3, from, 3);
2202 vst1q_lane_u16(to + stride*4, from, 4);
2203 vst1q_lane_u16(to + stride*5, from, 5);
2204 vst1q_lane_u16(to + stride*6, from, 6);
2205 vst1q_lane_u16(to + stride*7, from, 7);
2207template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int32_t, Packet2i>(int32_t* to,
const Packet2i& from,
Index stride)
2209 vst1_lane_s32(to + stride*0, from, 0);
2210 vst1_lane_s32(to + stride*1, from, 1);
2212template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int32_t, Packet4i>(int32_t* to,
const Packet4i& from,
Index stride)
2214 vst1q_lane_s32(to + stride*0, from, 0);
2215 vst1q_lane_s32(to + stride*1, from, 1);
2216 vst1q_lane_s32(to + stride*2, from, 2);
2217 vst1q_lane_s32(to + stride*3, from, 3);
2219template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint32_t, Packet2ui>(uint32_t* to,
const Packet2ui& from,
Index stride)
2221 vst1_lane_u32(to + stride*0, from, 0);
2222 vst1_lane_u32(to + stride*1, from, 1);
2224template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint32_t, Packet4ui>(uint32_t* to,
const Packet4ui& from,
Index stride)
2226 vst1q_lane_u32(to + stride*0, from, 0);
2227 vst1q_lane_u32(to + stride*1, from, 1);
2228 vst1q_lane_u32(to + stride*2, from, 2);
2229 vst1q_lane_u32(to + stride*3, from, 3);
2231template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int64_t, Packet2l>(int64_t* to,
const Packet2l& from,
Index stride)
2233 vst1q_lane_s64(to + stride*0, from, 0);
2234 vst1q_lane_s64(to + stride*1, from, 1);
2236template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint64_t, Packet2ul>(uint64_t* to,
const Packet2ul& from,
Index stride)
2238 vst1q_lane_u64(to + stride*0, from, 0);
2239 vst1q_lane_u64(to + stride*1, from, 1);
2242template<> EIGEN_STRONG_INLINE
void prefetch<float>(
const float* addr) { EIGEN_ARM_PREFETCH(addr); }
2243template<> EIGEN_STRONG_INLINE
void prefetch<int8_t>(
const int8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2244template<> EIGEN_STRONG_INLINE
void prefetch<uint8_t>(
const uint8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2245template<> EIGEN_STRONG_INLINE
void prefetch<int16_t>(
const int16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2246template<> EIGEN_STRONG_INLINE
void prefetch<uint16_t>(
const uint16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2247template<> EIGEN_STRONG_INLINE
void prefetch<int32_t>(
const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2248template<> EIGEN_STRONG_INLINE
void prefetch<uint32_t>(
const uint32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2249template<> EIGEN_STRONG_INLINE
void prefetch<int64_t>(
const int64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2250template<> EIGEN_STRONG_INLINE
void prefetch<uint64_t>(
const uint64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2252template<> EIGEN_STRONG_INLINE
float pfirst<Packet2f>(
const Packet2f& a) {
return vget_lane_f32(a,0); }
2253template<> EIGEN_STRONG_INLINE
float pfirst<Packet4f>(
const Packet4f& a) {
return vgetq_lane_f32(a,0); }
2254template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(
const Packet4c& a) {
return static_cast<int8_t
>(a & 0xff); }
2255template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(
const Packet8c& a) {
return vget_lane_s8(a,0); }
2256template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(
const Packet16c& a) {
return vgetq_lane_s8(a,0); }
2257template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(
const Packet4uc& a) {
return static_cast<uint8_t
>(a & 0xff); }
2258template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(
const Packet8uc& a) {
return vget_lane_u8(a,0); }
2259template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(
const Packet16uc& a) {
return vgetq_lane_u8(a,0); }
2260template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(
const Packet4s& a) {
return vget_lane_s16(a,0); }
2261template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(
const Packet8s& a) {
return vgetq_lane_s16(a,0); }
2262template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(
const Packet4us& a) {
return vget_lane_u16(a,0); }
2263template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(
const Packet8us& a) {
return vgetq_lane_u16(a,0); }
2264template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(
const Packet2i& a) {
return vget_lane_s32(a,0); }
2265template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(
const Packet4i& a) {
return vgetq_lane_s32(a,0); }
2266template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(
const Packet2ui& a) {
return vget_lane_u32(a,0); }
2267template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(
const Packet4ui& a) {
return vgetq_lane_u32(a,0); }
2268template<> EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(
const Packet2l& a) {
return vgetq_lane_s64(a,0); }
2269template<> EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(
const Packet2ul& a) {
return vgetq_lane_u64(a,0); }
2271template<> EIGEN_STRONG_INLINE Packet2f preverse(
const Packet2f& a) {
return vrev64_f32(a); }
2272template<> EIGEN_STRONG_INLINE Packet4f preverse(
const Packet4f& a)
2274 const float32x4_t a_r64 = vrev64q_f32(a);
2275 return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64));
2277template<> EIGEN_STRONG_INLINE Packet4c preverse(
const Packet4c& a)
2278{
return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
2279template<> EIGEN_STRONG_INLINE Packet8c preverse(
const Packet8c& a) {
return vrev64_s8(a); }
2280template<> EIGEN_STRONG_INLINE Packet16c preverse(
const Packet16c& a)
2282 const int8x16_t a_r64 = vrev64q_s8(a);
2283 return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64));
2285template<> EIGEN_STRONG_INLINE Packet4uc preverse(
const Packet4uc& a)
2286{
return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0); }
2287template<> EIGEN_STRONG_INLINE Packet8uc preverse(
const Packet8uc& a) {
return vrev64_u8(a); }
2288template<> EIGEN_STRONG_INLINE Packet16uc preverse(
const Packet16uc& a)
2290 const uint8x16_t a_r64 = vrev64q_u8(a);
2291 return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64));
2293template<> EIGEN_STRONG_INLINE Packet4s preverse(
const Packet4s& a) {
return vrev64_s16(a); }
2294template<> EIGEN_STRONG_INLINE Packet8s preverse(
const Packet8s& a)
2296 const int16x8_t a_r64 = vrev64q_s16(a);
2297 return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64));
2299template<> EIGEN_STRONG_INLINE Packet4us preverse(
const Packet4us& a) {
return vrev64_u16(a); }
2300template<> EIGEN_STRONG_INLINE Packet8us preverse(
const Packet8us& a)
2302 const uint16x8_t a_r64 = vrev64q_u16(a);
2303 return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64));
2305template<> EIGEN_STRONG_INLINE Packet2i preverse(
const Packet2i& a) {
return vrev64_s32(a); }
2306template<> EIGEN_STRONG_INLINE Packet4i preverse(
const Packet4i& a)
2308 const int32x4_t a_r64 = vrev64q_s32(a);
2309 return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64));
2311template<> EIGEN_STRONG_INLINE Packet2ui preverse(
const Packet2ui& a) {
return vrev64_u32(a); }
2312template<> EIGEN_STRONG_INLINE Packet4ui preverse(
const Packet4ui& a)
2314 const uint32x4_t a_r64 = vrev64q_u32(a);
2315 return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64));
2317template<> EIGEN_STRONG_INLINE Packet2l preverse(
const Packet2l& a)
2318{
return vcombine_s64(vget_high_s64(a), vget_low_s64(a)); }
2319template<> EIGEN_STRONG_INLINE Packet2ul preverse(
const Packet2ul& a)
2320{
return vcombine_u64(vget_high_u64(a), vget_low_u64(a)); }
2322template<> EIGEN_STRONG_INLINE Packet2f pabs(
const Packet2f& a) {
return vabs_f32(a); }
2323template<> EIGEN_STRONG_INLINE Packet4f pabs(
const Packet4f& a) {
return vabsq_f32(a); }
2324template<> EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(
const Packet4c& a)
2325{
return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
2326template<> EIGEN_STRONG_INLINE Packet8c pabs(
const Packet8c& a) {
return vabs_s8(a); }
2327template<> EIGEN_STRONG_INLINE Packet16c pabs(
const Packet16c& a) {
return vabsq_s8(a); }
2328template<> EIGEN_STRONG_INLINE Packet4uc pabs(
const Packet4uc& a) {
return a; }
2329template<> EIGEN_STRONG_INLINE Packet8uc pabs(
const Packet8uc& a) {
return a; }
2330template<> EIGEN_STRONG_INLINE Packet16uc pabs(
const Packet16uc& a) {
return a; }
2331template<> EIGEN_STRONG_INLINE Packet4s pabs(
const Packet4s& a) {
return vabs_s16(a); }
2332template<> EIGEN_STRONG_INLINE Packet8s pabs(
const Packet8s& a) {
return vabsq_s16(a); }
2333template<> EIGEN_STRONG_INLINE Packet4us pabs(
const Packet4us& a) {
return a; }
2334template<> EIGEN_STRONG_INLINE Packet8us pabs(
const Packet8us& a) {
return a; }
2335template<> EIGEN_STRONG_INLINE Packet2i pabs(
const Packet2i& a) {
return vabs_s32(a); }
2336template<> EIGEN_STRONG_INLINE Packet4i pabs(
const Packet4i& a) {
return vabsq_s32(a); }
2337template<> EIGEN_STRONG_INLINE Packet2ui pabs(
const Packet2ui& a) {
return a; }
2338template<> EIGEN_STRONG_INLINE Packet4ui pabs(
const Packet4ui& a) {
return a; }
2339template<> EIGEN_STRONG_INLINE Packet2l pabs(
const Packet2l& a) {
2341 return vabsq_s64(a);
2343 return vcombine_s64(
2344 vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))),
2345 vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
2348template<> EIGEN_STRONG_INLINE Packet2ul pabs(
const Packet2ul& a) {
return a; }
2350template<> EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(
const Packet2f& a, Packet2f& exponent)
2351{
return pfrexp_generic(a,exponent); }
2352template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(
const Packet4f& a, Packet4f& exponent)
2353{
return pfrexp_generic(a,exponent); }
2355template<> EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(
const Packet2f& a,
const Packet2f& exponent)
2356{
return pldexp_generic(a,exponent); }
2357template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(
const Packet4f& a,
const Packet4f& exponent)
2358{
return pldexp_generic(a,exponent); }
2360template<> EIGEN_STRONG_INLINE
float predux<Packet2f>(
const Packet2f& a) {
return vget_lane_f32(vpadd_f32(a,a), 0); }
2361template<> EIGEN_STRONG_INLINE
float predux<Packet4f>(
const Packet4f& a)
2363 const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));
2364 return vget_lane_f32(vpadd_f32(sum, sum), 0);
2366template<> EIGEN_STRONG_INLINE int8_t predux<Packet4c>(
const Packet4c& a)
2368 const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
2369 int8x8_t sum = vpadd_s8(a_dup, a_dup);
2370 sum = vpadd_s8(sum, sum);
2371 return vget_lane_s8(sum, 0);
2373template<> EIGEN_STRONG_INLINE int8_t predux<Packet8c>(
const Packet8c& a)
2375 int8x8_t sum = vpadd_s8(a,a);
2376 sum = vpadd_s8(sum, sum);
2377 sum = vpadd_s8(sum, sum);
2378 return vget_lane_s8(sum, 0);
2380template<> EIGEN_STRONG_INLINE int8_t predux<Packet16c>(
const Packet16c& a)
2382 int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a));
2383 sum = vpadd_s8(sum, sum);
2384 sum = vpadd_s8(sum, sum);
2385 sum = vpadd_s8(sum, sum);
2386 return vget_lane_s8(sum, 0);
2388template<> EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(
const Packet4uc& a)
2390 const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
2391 uint8x8_t sum = vpadd_u8(a_dup, a_dup);
2392 sum = vpadd_u8(sum, sum);
2393 return vget_lane_u8(sum, 0);
2395template<> EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(
const Packet8uc& a)
2397 uint8x8_t sum = vpadd_u8(a,a);
2398 sum = vpadd_u8(sum, sum);
2399 sum = vpadd_u8(sum, sum);
2400 return vget_lane_u8(sum, 0);
2402template<> EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(
const Packet16uc& a)
2404 uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a));
2405 sum = vpadd_u8(sum, sum);
2406 sum = vpadd_u8(sum, sum);
2407 sum = vpadd_u8(sum, sum);
2408 return vget_lane_u8(sum, 0);
2410template<> EIGEN_STRONG_INLINE int16_t predux<Packet4s>(
const Packet4s& a)
2412 const int16x4_t sum = vpadd_s16(a,a);
2413 return vget_lane_s16(vpadd_s16(sum, sum), 0);
2415template<> EIGEN_STRONG_INLINE int16_t predux<Packet8s>(
const Packet8s& a)
2417 int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a));
2418 sum = vpadd_s16(sum, sum);
2419 sum = vpadd_s16(sum, sum);
2420 return vget_lane_s16(sum, 0);
2422template<> EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(
const Packet4us& a)
2424 const uint16x4_t sum = vpadd_u16(a,a);
2425 return vget_lane_u16(vpadd_u16(sum, sum), 0);
2427template<> EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(
const Packet8us& a)
2429 uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a));
2430 sum = vpadd_u16(sum, sum);
2431 sum = vpadd_u16(sum, sum);
2432 return vget_lane_u16(sum, 0);
2434template<> EIGEN_STRONG_INLINE int32_t predux<Packet2i>(
const Packet2i& a) {
return vget_lane_s32(vpadd_s32(a,a), 0); }
2435template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(
const Packet4i& a)
2437 const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a));
2438 return vget_lane_s32(vpadd_s32(sum, sum), 0);
2440template<> EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(
const Packet2ui& a) {
return vget_lane_u32(vpadd_u32(a,a), 0); }
2441template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(
const Packet4ui& a)
2443 const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
2444 return vget_lane_u32(vpadd_u32(sum, sum), 0);
2446template<> EIGEN_STRONG_INLINE int64_t predux<Packet2l>(
const Packet2l& a)
2447{
return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); }
2448template<> EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(
const Packet2ul& a)
2449{
return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); }
2451template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(
const Packet8c& a)
2453 return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a,
2454 vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
2456template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(
const Packet16c& a)
2457{
return vadd_s8(vget_high_s8(a), vget_low_s8(a)); }
2458template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(
const Packet8uc& a)
2460 return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a,
2461 vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
2463template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(
const Packet16uc& a)
2464{
return vadd_u8(vget_high_u8(a), vget_low_u8(a)); }
2465template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(
const Packet8s& a)
2466{
return vadd_s16(vget_high_s16(a), vget_low_s16(a)); }
2467template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(
const Packet8us& a)
2468{
return vadd_u16(vget_high_u16(a), vget_low_u16(a)); }
2472template<> EIGEN_STRONG_INLINE
float predux_mul<Packet2f>(
const Packet2f& a)
2473{
return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); }
2474template<> EIGEN_STRONG_INLINE
float predux_mul<Packet4f>(
const Packet4f& a)
2475{
return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
2476template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(
const Packet4c& a)
2478 int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
2479 prod = vmul_s8(prod, vrev16_s8(prod));
2480 return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2);
2482template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(
const Packet8c& a)
2484 int8x8_t prod = vmul_s8(a, vrev16_s8(a));
2485 prod = vmul_s8(prod, vrev32_s8(prod));
2486 return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
2488template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(
const Packet16c& a)
2489{
return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
2490template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(
const Packet4uc& a)
2492 uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
2493 prod = vmul_u8(prod, vrev16_u8(prod));
2494 return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2);
2496template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(
const Packet8uc& a)
2498 uint8x8_t prod = vmul_u8(a, vrev16_u8(a));
2499 prod = vmul_u8(prod, vrev32_u8(prod));
2500 return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
2502template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(
const Packet16uc& a)
2503{
return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
2504template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(
const Packet4s& a)
2506 const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
2507 return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
2509template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(
const Packet8s& a)
2514 prod = vmul_s16(vget_low_s16(a), vget_high_s16(a));
2516 prod = vmul_s16(prod, vrev32_s16(prod));
2518 return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
2520template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(
const Packet4us& a)
2522 const uint16x4_t prod = vmul_u16(a, vrev32_u16(a));
2523 return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
2525template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(
const Packet8us& a)
2530 prod = vmul_u16(vget_low_u16(a), vget_high_u16(a));
2532 prod = vmul_u16(prod, vrev32_u16(prod));
2534 return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
2536template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(
const Packet2i& a)
2537{
return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); }
2538template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(
const Packet4i& a)
2539{
return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
2540template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(
const Packet2ui& a)
2541{
return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); }
2542template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(
const Packet4ui& a)
2543{
return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
2544template<> EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(
const Packet2l& a)
2545{
return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); }
2546template<> EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(
const Packet2ul& a)
2547{
return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1); }
2550template<> EIGEN_STRONG_INLINE
float predux_min<Packet2f>(
const Packet2f& a)
2551{
return vget_lane_f32(vpmin_f32(a,a), 0); }
2552template<> EIGEN_STRONG_INLINE
float predux_min<Packet4f>(
const Packet4f& a)
2554 const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a));
2555 return vget_lane_f32(vpmin_f32(min, min), 0);
2557template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(
const Packet4c& a)
2559 const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
2560 int8x8_t min = vpmin_s8(a_dup, a_dup);
2561 min = vpmin_s8(min, min);
2562 return vget_lane_s8(min, 0);
2564template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(
const Packet8c& a)
2566 int8x8_t min = vpmin_s8(a,a);
2567 min = vpmin_s8(min, min);
2568 min = vpmin_s8(min, min);
2569 return vget_lane_s8(min, 0);
2571template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(
const Packet16c& a)
2573 int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a));
2574 min = vpmin_s8(min, min);
2575 min = vpmin_s8(min, min);
2576 min = vpmin_s8(min, min);
2577 return vget_lane_s8(min, 0);
2579template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(
const Packet4uc& a)
2581 const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
2582 uint8x8_t min = vpmin_u8(a_dup, a_dup);
2583 min = vpmin_u8(min, min);
2584 return vget_lane_u8(min, 0);
2586template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(
const Packet8uc& a)
2588 uint8x8_t min = vpmin_u8(a,a);
2589 min = vpmin_u8(min, min);
2590 min = vpmin_u8(min, min);
2591 return vget_lane_u8(min, 0);
2593template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(
const Packet16uc& a)
2595 uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a));
2596 min = vpmin_u8(min, min);
2597 min = vpmin_u8(min, min);
2598 min = vpmin_u8(min, min);
2599 return vget_lane_u8(min, 0);
2601template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(
const Packet4s& a)
2603 const int16x4_t min = vpmin_s16(a,a);
2604 return vget_lane_s16(vpmin_s16(min, min), 0);
2606template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(
const Packet8s& a)
2608 int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a));
2609 min = vpmin_s16(min, min);
2610 min = vpmin_s16(min, min);
2611 return vget_lane_s16(min, 0);
2613template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(
const Packet4us& a)
2615 const uint16x4_t min = vpmin_u16(a,a);
2616 return vget_lane_u16(vpmin_u16(min, min), 0);
2618template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(
const Packet8us& a)
2620 uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a));
2621 min = vpmin_u16(min, min);
2622 min = vpmin_u16(min, min);
2623 return vget_lane_u16(min, 0);
2625template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(
const Packet2i& a)
2626{
return vget_lane_s32(vpmin_s32(a,a), 0); }
2627template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(
const Packet4i& a)
2629 const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a));
2630 return vget_lane_s32(vpmin_s32(min, min), 0);
2632template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(
const Packet2ui& a)
2633{
return vget_lane_u32(vpmin_u32(a,a), 0); }
2634template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(
const Packet4ui& a)
2636 const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a));
2637 return vget_lane_u32(vpmin_u32(min, min), 0);
2639template<> EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(
const Packet2l& a)
2640{
return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
2641template<> EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(
const Packet2ul& a)
2642{
return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
2645template<> EIGEN_STRONG_INLINE
float predux_max<Packet2f>(
const Packet2f& a)
2646{
return vget_lane_f32(vpmax_f32(a,a), 0); }
2647template<> EIGEN_STRONG_INLINE
float predux_max<Packet4f>(
const Packet4f& a)
2649 const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a));
2650 return vget_lane_f32(vpmax_f32(max, max), 0);
2652template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(
const Packet4c& a)
2654 const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
2655 int8x8_t max = vpmax_s8(a_dup, a_dup);
2656 max = vpmax_s8(max, max);
2657 return vget_lane_s8(max, 0);
2659template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(
const Packet8c& a)
2661 int8x8_t max = vpmax_s8(a,a);
2662 max = vpmax_s8(max, max);
2663 max = vpmax_s8(max, max);
2664 return vget_lane_s8(max, 0);
2666template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(
const Packet16c& a)
2668 int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a));
2669 max = vpmax_s8(max, max);
2670 max = vpmax_s8(max, max);
2671 max = vpmax_s8(max, max);
2672 return vget_lane_s8(max, 0);
2674template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(
const Packet4uc& a)
2676 const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
2677 uint8x8_t max = vpmax_u8(a_dup, a_dup);
2678 max = vpmax_u8(max, max);
2679 return vget_lane_u8(max, 0);
2681template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(
const Packet8uc& a)
2683 uint8x8_t max = vpmax_u8(a,a);
2684 max = vpmax_u8(max, max);
2685 max = vpmax_u8(max, max);
2686 return vget_lane_u8(max, 0);
2688template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(
const Packet16uc& a)
2690 uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a));
2691 max = vpmax_u8(max, max);
2692 max = vpmax_u8(max, max);
2693 max = vpmax_u8(max, max);
2694 return vget_lane_u8(max, 0);
2696template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(
const Packet4s& a)
2698 const int16x4_t max = vpmax_s16(a,a);
2699 return vget_lane_s16(vpmax_s16(max, max), 0);
2701template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(
const Packet8s& a)
2703 int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a));
2704 max = vpmax_s16(max, max);
2705 max = vpmax_s16(max, max);
2706 return vget_lane_s16(max, 0);
2708template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(
const Packet4us& a)
2710 const uint16x4_t max = vpmax_u16(a,a);
2711 return vget_lane_u16(vpmax_u16(max, max), 0);
2713template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(
const Packet8us& a)
2715 uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a));
2716 max = vpmax_u16(max, max);
2717 max = vpmax_u16(max, max);
2718 return vget_lane_u16(max, 0);
2720template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(
const Packet2i& a)
2721{
return vget_lane_s32(vpmax_s32(a,a), 0); }
2722template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(
const Packet4i& a)
2724 const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a));
2725 return vget_lane_s32(vpmax_s32(max, max), 0);
2727template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(
const Packet2ui& a)
2728{
return vget_lane_u32(vpmax_u32(a,a), 0); }
2729template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(
const Packet4ui& a)
2731 const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a));
2732 return vget_lane_u32(vpmax_u32(max, max), 0);
2734template<> EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(
const Packet2l& a)
2735{
return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
2736template<> EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(
const Packet2ul& a)
2737{
return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
2739template<> EIGEN_STRONG_INLINE
bool predux_any(
const Packet4f& x)
2741 uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
2742 vget_high_u32(vreinterpretq_u32_f32(x)));
2743 return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
2749template<
typename Packet>
2750void zip_in_place(Packet& p1, Packet& p2);
2753EIGEN_ALWAYS_INLINE
void zip_in_place<Packet2f>(Packet2f& p1, Packet2f& p2) {
2754 const float32x2x2_t tmp = vzip_f32(p1, p2);
2760EIGEN_ALWAYS_INLINE
void zip_in_place<Packet4f>(Packet4f& p1, Packet4f& p2) {
2761 const float32x4x2_t tmp = vzipq_f32(p1, p2);
2767EIGEN_ALWAYS_INLINE
void zip_in_place<Packet8c>(Packet8c& p1, Packet8c& p2) {
2768 const int8x8x2_t tmp = vzip_s8(p1, p2);
2774EIGEN_ALWAYS_INLINE
void zip_in_place<Packet16c>(Packet16c& p1, Packet16c& p2) {
2775 const int8x16x2_t tmp = vzipq_s8(p1, p2);
2781EIGEN_ALWAYS_INLINE
void zip_in_place<Packet8uc>(Packet8uc& p1, Packet8uc& p2) {
2782 const uint8x8x2_t tmp = vzip_u8(p1, p2);
2788EIGEN_ALWAYS_INLINE
void zip_in_place<Packet16uc>(Packet16uc& p1, Packet16uc& p2) {
2789 const uint8x16x2_t tmp = vzipq_u8(p1, p2);
2795EIGEN_ALWAYS_INLINE
void zip_in_place<Packet2i>(Packet2i& p1, Packet2i& p2) {
2796 const int32x2x2_t tmp = vzip_s32(p1, p2);
2802EIGEN_ALWAYS_INLINE
void zip_in_place<Packet4i>(Packet4i& p1, Packet4i& p2) {
2803 const int32x4x2_t tmp = vzipq_s32(p1, p2);
2809EIGEN_ALWAYS_INLINE
void zip_in_place<Packet2ui>(Packet2ui& p1, Packet2ui& p2) {
2810 const uint32x2x2_t tmp = vzip_u32(p1, p2);
2816EIGEN_ALWAYS_INLINE
void zip_in_place<Packet4ui>(Packet4ui& p1, Packet4ui& p2) {
2817 const uint32x4x2_t tmp = vzipq_u32(p1, p2);
2823EIGEN_ALWAYS_INLINE
void zip_in_place<Packet4s>(Packet4s& p1, Packet4s& p2) {
2824 const int16x4x2_t tmp = vzip_s16(p1, p2);
2830EIGEN_ALWAYS_INLINE
void zip_in_place<Packet8s>(Packet8s& p1, Packet8s& p2) {
2831 const int16x8x2_t tmp = vzipq_s16(p1, p2);
2837EIGEN_ALWAYS_INLINE
void zip_in_place<Packet4us>(Packet4us& p1, Packet4us& p2) {
2838 const uint16x4x2_t tmp = vzip_u16(p1, p2);
2844EIGEN_ALWAYS_INLINE
void zip_in_place<Packet8us>(Packet8us& p1, Packet8us& p2) {
2845 const uint16x8x2_t tmp = vzipq_u16(p1, p2);
2850template<
typename Packet>
2851EIGEN_ALWAYS_INLINE
void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
2852 zip_in_place(kernel.packet[0], kernel.packet[1]);
2855template<
typename Packet>
2856EIGEN_ALWAYS_INLINE
void ptranspose_impl(PacketBlock<Packet, 4>& kernel) {
2857 zip_in_place(kernel.packet[0], kernel.packet[2]);
2858 zip_in_place(kernel.packet[1], kernel.packet[3]);
2859 zip_in_place(kernel.packet[0], kernel.packet[1]);
2860 zip_in_place(kernel.packet[2], kernel.packet[3]);
2863template<
typename Packet>
2864EIGEN_ALWAYS_INLINE
void ptranspose_impl(PacketBlock<Packet, 8>& kernel) {
2865 zip_in_place(kernel.packet[0], kernel.packet[4]);
2866 zip_in_place(kernel.packet[1], kernel.packet[5]);
2867 zip_in_place(kernel.packet[2], kernel.packet[6]);
2868 zip_in_place(kernel.packet[3], kernel.packet[7]);
2870 zip_in_place(kernel.packet[0], kernel.packet[2]);
2871 zip_in_place(kernel.packet[1], kernel.packet[3]);
2872 zip_in_place(kernel.packet[4], kernel.packet[6]);
2873 zip_in_place(kernel.packet[5], kernel.packet[7]);
2875 zip_in_place(kernel.packet[0], kernel.packet[1]);
2876 zip_in_place(kernel.packet[2], kernel.packet[3]);
2877 zip_in_place(kernel.packet[4], kernel.packet[5]);
2878 zip_in_place(kernel.packet[6], kernel.packet[7]);
2881template<
typename Packet>
2882EIGEN_ALWAYS_INLINE
void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
2884 for (
int i=0; i<4; ++i) {
2885 const int m = (1 << i);
2887 for (
int j=0; j<m; ++j) {
2888 const int n = (1 << (3-i));
2890 for (
int k=0; k<n; ++k) {
2891 const int idx = 2*j*n+k;
2892 zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
2900EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2f, 2>& kernel) {
2901 detail::ptranspose_impl(kernel);
2903EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
2904 detail::ptranspose_impl(kernel);
2907EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4c, 4>& kernel)
2909 const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
2910 const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));
2912 const int8x8x2_t zip8 = vzip_s8(a,b);
2913 const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1]));
2915 kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0);
2916 kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1);
2917 kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0);
2918 kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1);
2920EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8c, 8>& kernel) {
2921 detail::ptranspose_impl(kernel);
2923EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8c, 4>& kernel) {
2924 detail::ptranspose_impl(kernel);
2926EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
2927 detail::ptranspose_impl(kernel);
2929EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
2930 detail::ptranspose_impl(kernel);
2932EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
2933 detail::ptranspose_impl(kernel);
2936EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4uc, 4>& kernel)
2938 const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
2939 const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));
2941 const uint8x8x2_t zip8 = vzip_u8(a,b);
2942 const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1]));
2944 kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0);
2945 kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1);
2946 kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0);
2947 kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1);
2949EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8uc, 8>& kernel) {
2950 detail::ptranspose_impl(kernel);
2952EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8uc, 4>& kernel) {
2953 detail::ptranspose_impl(kernel);
2955EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
2956 detail::ptranspose_impl(kernel);
2958EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
2959 detail::ptranspose_impl(kernel);
2961EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
2962 detail::ptranspose_impl(kernel);
2965EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4s, 4>& kernel) {
2966 detail::ptranspose_impl(kernel);
2968EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
2969 detail::ptranspose_impl(kernel);
2971EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
2972 detail::ptranspose_impl(kernel);
2975EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4us, 4>& kernel) {
2976 detail::ptranspose_impl(kernel);
2978EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
2979 detail::ptranspose_impl(kernel);
2981EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
2982 detail::ptranspose_impl(kernel);
2985EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2i, 2>& kernel) {
2986 detail::ptranspose_impl(kernel);
2988EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
2989 detail::ptranspose_impl(kernel);
2991EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2ui, 2>& kernel) {
2992 detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
2994EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
2995 detail::ptranspose_impl(kernel);
2998EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void
2999ptranspose(PacketBlock<Packet2l, 2>& kernel)
3002 const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
3003 kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
3004 kernel.packet[0] = tmp1;
3006 const int64x1_t tmp[2][2] = {
3007 { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) },
3008 { vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1]) }
3011 kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]);
3012 kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
3015EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void
3016ptranspose(PacketBlock<Packet2ul, 2>& kernel)
3019 const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
3020 kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
3021 kernel.packet[0] = tmp1;
3023 const uint64x1_t tmp[2][2] = {
3024 { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) },
3025 { vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1]) }
3028 kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]);
3029 kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]);
3033template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect(
const Packet2f& mask,
const Packet2f& a,
const Packet2f& b)
3034{
return vbsl_f32(vreinterpret_u32_f32(mask), a, b); }
3035template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(
const Packet4f& mask,
const Packet4f& a,
const Packet4f& b)
3036{
return vbslq_f32(vreinterpretq_u32_f32(mask), a, b); }
3037template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(
const Packet8c& mask,
const Packet8c& a,
const Packet8c& b)
3038{
return vbsl_s8(vreinterpret_u8_s8(mask), a, b); }
3039template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(
const Packet16c& mask,
const Packet16c& a,
const Packet16c& b)
3040{
return vbslq_s8(vreinterpretq_u8_s8(mask), a, b); }
3041template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(
const Packet8uc& mask,
const Packet8uc& a,
const Packet8uc& b)
3042{
return vbsl_u8(mask, a, b); }
3043template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(
const Packet16uc& mask,
const Packet16uc& a,
const Packet16uc& b)
3044{
return vbslq_u8(mask, a, b); }
3045template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(
const Packet4s& mask,
const Packet4s& a,
const Packet4s& b)
3046{
return vbsl_s16(vreinterpret_u16_s16(mask), a, b); }
3047template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(
const Packet8s& mask,
const Packet8s& a,
const Packet8s& b)
3048{
return vbslq_s16(vreinterpretq_u16_s16(mask), a, b); }
3049template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(
const Packet4us& mask,
const Packet4us& a,
const Packet4us& b)
3050{
return vbsl_u16(mask, a, b); }
3051template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(
const Packet8us& mask,
const Packet8us& a,
const Packet8us& b)
3052{
return vbslq_u16(mask, a, b); }
3053template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(
const Packet2i& mask,
const Packet2i& a,
const Packet2i& b)
3054{
return vbsl_s32(vreinterpret_u32_s32(mask), a, b); }
3055template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(
const Packet4i& mask,
const Packet4i& a,
const Packet4i& b)
3056{
return vbslq_s32(vreinterpretq_u32_s32(mask), a, b); }
3057template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(
const Packet2ui& mask,
const Packet2ui& a,
const Packet2ui& b)
3058{
return vbsl_u32(mask, a, b); }
3059template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(
const Packet4ui& mask,
const Packet4ui& a,
const Packet4ui& b)
3060{
return vbslq_u32(mask, a, b); }
3061template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(
const Packet2l& mask,
const Packet2l& a,
const Packet2l& b)
3062{
return vbslq_s64(vreinterpretq_u64_s64(mask), a, b); }
3063template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(
const Packet2ul& mask,
const Packet2ul& a,
const Packet2ul& b)
3064{
return vbslq_u64(mask, a, b); }
3068template<> EIGEN_STRONG_INLINE Packet2f print<Packet2f>(
const Packet2f& a)
3069{
return vrndn_f32(a); }
3071template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(
const Packet4f& a)
3072{
return vrndnq_f32(a); }
3074template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(
const Packet2f& a)
3075{
return vrndm_f32(a); }
3077template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(
const Packet4f& a)
3078{
return vrndmq_f32(a); }
3080template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(
const Packet2f& a)
3081{
return vrndp_f32(a); }
3083template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(
const Packet4f& a)
3084{
return vrndpq_f32(a); }
3088template<> EIGEN_STRONG_INLINE Packet4f print(
const Packet4f& a) {
3090 const Packet4f limit = pset1<Packet4f>(
static_cast<float>(1<<23));
3091 const Packet4f abs_a = pabs(a);
3092 Packet4f r = padd(abs_a, limit);
3094 EIGEN_OPTIMIZATION_BARRIER(r);
3097 r = pselect(pcmp_lt(abs_a, limit),
3098 pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
3102template<> EIGEN_STRONG_INLINE Packet2f print(
const Packet2f& a) {
3104 const Packet2f limit = pset1<Packet2f>(
static_cast<float>(1<<23));
3105 const Packet2f abs_a = pabs(a);
3106 Packet2f r = padd(abs_a, limit);
3108 EIGEN_OPTIMIZATION_BARRIER(r);
3111 r = pselect(pcmp_lt(abs_a, limit),
3112 pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
3116template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(
const Packet4f& a)
3118 const Packet4f cst_1 = pset1<Packet4f>(1.0f);
3119 Packet4f tmp = print<Packet4f>(a);
3121 Packet4f mask = pcmp_lt(a, tmp);
3122 mask = pand(mask, cst_1);
3123 return psub(tmp, mask);
3126template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(
const Packet2f& a)
3128 const Packet2f cst_1 = pset1<Packet2f>(1.0f);
3129 Packet2f tmp = print<Packet2f>(a);
3131 Packet2f mask = pcmp_lt(a, tmp);
3132 mask = pand(mask, cst_1);
3133 return psub(tmp, mask);
3136template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(
const Packet4f& a)
3138 const Packet4f cst_1 = pset1<Packet4f>(1.0f);
3139 Packet4f tmp = print<Packet4f>(a);
3141 Packet4f mask = pcmp_lt(tmp, a);
3142 mask = pand(mask, cst_1);
3143 return padd(tmp, mask);
3146template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(
const Packet2f& a)
3148 const Packet2f cst_1 = pset1<Packet2f>(1.0);
3149 Packet2f tmp = print<Packet2f>(a);
3151 Packet2f mask = pcmp_lt(tmp, a);
3152 mask = pand(mask, cst_1);
3153 return padd(tmp, mask);
3164template<> EIGEN_STRONG_INLINE Packet4uc psqrt(
const Packet4uc& a) {
3165 uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a));
3166 uint8x8_t res = vdup_n_u8(0);
3167 uint8x8_t add = vdup_n_u8(0x8);
3168 for (
int i = 0; i < 4; i++)
3170 const uint8x8_t temp = vorr_u8(res, add);
3171 res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res);
3172 add = vshr_n_u8(add, 1);
3174 return vget_lane_u32(vreinterpret_u32_u8(res), 0);
3177template<> EIGEN_STRONG_INLINE Packet8uc psqrt(
const Packet8uc& a) {
3178 uint8x8_t res = vdup_n_u8(0);
3179 uint8x8_t add = vdup_n_u8(0x8);
3180 for (
int i = 0; i < 4; i++)
3182 const uint8x8_t temp = vorr_u8(res, add);
3183 res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res);
3184 add = vshr_n_u8(add, 1);
3189template<> EIGEN_STRONG_INLINE Packet16uc psqrt(
const Packet16uc& a) {
3190 uint8x16_t res = vdupq_n_u8(0);
3191 uint8x16_t add = vdupq_n_u8(0x8);
3192 for (
int i = 0; i < 4; i++)
3194 const uint8x16_t temp = vorrq_u8(res, add);
3195 res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res);
3196 add = vshrq_n_u8(add, 1);
3201template<> EIGEN_STRONG_INLINE Packet4us psqrt(
const Packet4us& a) {
3202 uint16x4_t res = vdup_n_u16(0);
3203 uint16x4_t add = vdup_n_u16(0x80);
3204 for (
int i = 0; i < 8; i++)
3206 const uint16x4_t temp = vorr_u16(res, add);
3207 res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res);
3208 add = vshr_n_u16(add, 1);
3213template<> EIGEN_STRONG_INLINE Packet8us psqrt(
const Packet8us& a) {
3214 uint16x8_t res = vdupq_n_u16(0);
3215 uint16x8_t add = vdupq_n_u16(0x80);
3216 for (
int i = 0; i < 8; i++)
3218 const uint16x8_t temp = vorrq_u16(res, add);
3219 res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res);
3220 add = vshrq_n_u16(add, 1);
3225template<> EIGEN_STRONG_INLINE Packet2ui psqrt(
const Packet2ui& a) {
3226 uint32x2_t res = vdup_n_u32(0);
3227 uint32x2_t add = vdup_n_u32(0x8000);
3228 for (
int i = 0; i < 16; i++)
3230 const uint32x2_t temp = vorr_u32(res, add);
3231 res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res);
3232 add = vshr_n_u32(add, 1);
3237template<> EIGEN_STRONG_INLINE Packet4ui psqrt(
const Packet4ui& a) {
3238 uint32x4_t res = vdupq_n_u32(0);
3239 uint32x4_t add = vdupq_n_u32(0x8000);
3240 for (
int i = 0; i < 16; i++)
3242 const uint32x4_t temp = vorrq_u32(res, add);
3243 res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res);
3244 add = vshrq_n_u32(add, 1);
3249EIGEN_STRONG_INLINE Packet4f prsqrt_float_unsafe(
const Packet4f& a) {
3252 float32x4_t result = vrsqrteq_f32(a);
3253 result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
3254 result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
3258EIGEN_STRONG_INLINE Packet2f prsqrt_float_unsafe(
const Packet2f& a) {
3261 float32x2_t result = vrsqrte_f32(a);
3262 result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
3263 result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
3267template<
typename Packet> Packet prsqrt_float_common(
const Packet& a) {
3268 const Packet cst_zero = pzero(a);
3269 const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
3270 Packet return_zero = pcmp_eq(a, cst_inf);
3271 Packet return_inf = pcmp_eq(a, cst_zero);
3272 Packet result = prsqrt_float_unsafe(a);
3273 result = pselect(return_inf, por(cst_inf, a), result);
3274 result = pandnot(result, return_zero);
3278template<> EIGEN_STRONG_INLINE Packet4f prsqrt(
const Packet4f& a) {
3279 return prsqrt_float_common(a);
3282template<> EIGEN_STRONG_INLINE Packet2f prsqrt(
const Packet2f& a) {
3283 return prsqrt_float_common(a);
3286EIGEN_STRONG_INLINE Packet4f preciprocal(
const Packet4f& a)
3289 float32x4_t result = vrecpeq_f32(a);
3290 result = vmulq_f32(vrecpsq_f32(a, result), result);
3291 result = vmulq_f32(vrecpsq_f32(a, result), result);
3295EIGEN_STRONG_INLINE Packet2f preciprocal(
const Packet2f& a)
3298 float32x2_t result = vrecpe_f32(a);
3299 result = vmul_f32(vrecps_f32(a, result), result);
3300 result = vmul_f32(vrecps_f32(a, result), result);
3306template<> EIGEN_STRONG_INLINE Packet4f psqrt(
const Packet4f& a) {
return vsqrtq_f32(a); }
3308template<> EIGEN_STRONG_INLINE Packet2f psqrt(
const Packet2f& a) {
return vsqrt_f32(a); }
3310template<> EIGEN_STRONG_INLINE Packet4f pdiv(
const Packet4f& a,
const Packet4f& b) {
return vdivq_f32(a, b); }
3312template<> EIGEN_STRONG_INLINE Packet2f pdiv(
const Packet2f& a,
const Packet2f& b) {
return vdiv_f32(a, b); }
3314template<
typename Packet>
3315EIGEN_STRONG_INLINE Packet psqrt_float_common(
const Packet& a) {
3316 const Packet cst_zero = pzero(a);
3317 const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
3319 Packet result = pmul(a, prsqrt_float_unsafe(a));
3320 Packet a_is_zero = pcmp_eq(a, cst_zero);
3321 Packet a_is_inf = pcmp_eq(a, cst_inf);
3322 Packet return_a = por(a_is_zero, a_is_inf);
3324 result = pselect(return_a, a, result);
3328template<> EIGEN_STRONG_INLINE Packet4f psqrt(
const Packet4f& a) {
3329 return psqrt_float_common(a);
3332template<> EIGEN_STRONG_INLINE Packet2f psqrt(
const Packet2f& a) {
3333 return psqrt_float_common(a);
3336template<
typename Packet>
3337EIGEN_STRONG_INLINE Packet pdiv_float_common(
const Packet& a,
const Packet& b) {
3342 const Packet cst_one = pset1<Packet>(1.0f);
3343 const Packet cst_quarter = pset1<Packet>(0.25f);
3344 const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
3346 Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
3347 Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
3348 Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
3352template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
3353 return pdiv_float_common(a, b);
3356template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(
const Packet2f& a,
const Packet2f& b) {
3357 return pdiv_float_common(a, b);
3365typedef eigen_packet_wrapper<uint16x4_t, 19> Packet4bf;
3376 AlignedOnScalar = 1,
3400 HasSin = EIGEN_FAST_MATH,
3401 HasCos = EIGEN_FAST_MATH,
3405 HasTanh = EIGEN_FAST_MATH,
3406 HasErf = EIGEN_FAST_MATH,
3420 vectorizable =
true,
3421 masked_load_available =
false,
3422 masked_store_available =
false
3435EIGEN_STRONG_INLINE Packet4bf F32ToBf16(
const Packet4f& p)
3439 Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p));
3442 Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
3445 Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff));
3448 input = vaddq_u32(input, rounding_bias);
3451 input = vshrq_n_u32(input, 16);
3454 const Packet4ui bf16_nan = vdupq_n_u32(0x7fc0);
3455 const Packet4ui mask = vceqq_f32(p, p);
3456 input = vbslq_u32(mask, input, bf16_nan);
3459 return vmovn_u32(input);
3462EIGEN_STRONG_INLINE Packet4f Bf16ToF32(
const Packet4bf& p)
3464 return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16)));
3467EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(
const Packet4f& p) {
3468 return vmovn_u32(vreinterpretq_u32_f32(p));
3471template<> EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(
const bfloat16& from) {
3472 return Packet4bf(pset1<Packet4us>(from.value));
3475template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(
const Packet4bf& from) {
3476 return bfloat16_impl::raw_uint16_to_bfloat16(
static_cast<uint16_t
>(pfirst<Packet4us>(Packet4us(from))));
3479template<> EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(
const bfloat16* from)
3481 return Packet4bf(pload<Packet4us>(
reinterpret_cast<const uint16_t*
>(from)));
3484template<> EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(
const bfloat16* from)
3486 return Packet4bf(ploadu<Packet4us>(
reinterpret_cast<const uint16_t*
>(from)));
3489template<> EIGEN_STRONG_INLINE
void pstore<bfloat16>(bfloat16* to,
const Packet4bf& from)
3491 EIGEN_DEBUG_ALIGNED_STORE vst1_u16(
reinterpret_cast<uint16_t*
>(to), from);
3494template<> EIGEN_STRONG_INLINE
void pstoreu<bfloat16>(bfloat16* to,
const Packet4bf& from)
3496 EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(
reinterpret_cast<uint16_t*
>(to), from);
3499template<> EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(
const bfloat16* from)
3501 return Packet4bf(ploaddup<Packet4us>(
reinterpret_cast<const uint16_t*
>(from)));
3504template <> EIGEN_STRONG_INLINE Packet4bf pabs(
const Packet4bf& a) {
3505 return F32ToBf16(pabs<Packet4f>(Bf16ToF32(a)));
3508template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(
const Packet4bf &a,
3511 return F32ToBf16(pmin<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3513template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(
const Packet4bf &a,
3516 return F32ToBf16(pmin<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3519template <> EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(
const Packet4bf &a,
3522 return F32ToBf16(pmin<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3525template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(
const Packet4bf &a,
3528 return F32ToBf16(pmax<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3530template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(
const Packet4bf &a,
3533 return F32ToBf16(pmax<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3536template <> EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(
const Packet4bf &a,
3539 return F32ToBf16(pmax<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3542template<> EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(
const bfloat16& a)
3544 return F32ToBf16(plset<Packet4f>(
static_cast<float>(a)));
3547template<> EIGEN_STRONG_INLINE Packet4bf por(
const Packet4bf& a,
const Packet4bf& b) {
3548 return Packet4bf(por<Packet4us>(Packet4us(a), Packet4us(b)));
3551template<> EIGEN_STRONG_INLINE Packet4bf pxor(
const Packet4bf& a,
const Packet4bf& b) {
3552 return Packet4bf(pxor<Packet4us>(Packet4us(a), Packet4us(b)));
3555template<> EIGEN_STRONG_INLINE Packet4bf pand(
const Packet4bf& a,
const Packet4bf& b) {
3556 return Packet4bf(pand<Packet4us>(Packet4us(a), Packet4us(b)));
3559template<> EIGEN_STRONG_INLINE Packet4bf pandnot(
const Packet4bf& a,
const Packet4bf& b) {
3560 return Packet4bf(pandnot<Packet4us>(Packet4us(a), Packet4us(b)));
3563template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(
const Packet4bf& mask,
const Packet4bf& a,
3566 return Packet4bf(pselect<Packet4us>(Packet4us(mask), Packet4us(a), Packet4us(b)));
3569template<> EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(
const Packet4bf& a)
3571 return F32ToBf16(print<Packet4f>(Bf16ToF32(a)));
3574template<> EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(
const Packet4bf& a)
3576 return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
3579template<> EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(
const Packet4bf& a)
3581 return F32ToBf16(pceil<Packet4f>(Bf16ToF32(a)));
3584template<> EIGEN_STRONG_INLINE Packet4bf pconj(
const Packet4bf& a) {
return a; }
3586template<> EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
3587 return F32ToBf16(padd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3590template<> EIGEN_STRONG_INLINE Packet4bf psub<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
3591 return F32ToBf16(psub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3594template<> EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
3595 return F32ToBf16(pmul<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3598template<> EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b) {
3599 return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3603EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(
const bfloat16* from,
Index stride)
3605 return Packet4bf(pgather<uint16_t, Packet4us>(
reinterpret_cast<const uint16_t*
>(from), stride));
3609EIGEN_STRONG_INLINE
void pscatter<bfloat16, Packet4bf>(bfloat16* to,
const Packet4bf& from,
Index stride)
3611 pscatter<uint16_t, Packet4us>(
reinterpret_cast<uint16_t*
>(to), Packet4us(from), stride);
3614template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(
const Packet4bf& a)
3616 return static_cast<bfloat16
>(predux<Packet4f>(Bf16ToF32(a)));
3619template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet4bf>(
const Packet4bf& a)
3621 return static_cast<bfloat16
>(predux_max<Packet4f>(Bf16ToF32(a)));
3624template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet4bf>(
const Packet4bf& a)
3626 return static_cast<bfloat16
>(predux_min<Packet4f>(Bf16ToF32(a)));
3629template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(
const Packet4bf& a)
3631 return static_cast<bfloat16
>(predux_mul<Packet4f>(Bf16ToF32(a)));
3634template<> EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(
const Packet4bf& a)
3636 return Packet4bf(preverse<Packet4us>(Packet4us(a)));
3639EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4bf, 4>& kernel)
3641 detail::ptranspose_impl(kernel);
3644template<> EIGEN_STRONG_INLINE Packet4bf pabsdiff<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b)
3646 return F32ToBf16(pabsdiff<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3649template<> EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b)
3651 return F32MaskToBf16Mask(pcmp_eq<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3654template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b)
3656 return F32MaskToBf16Mask(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3659template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b)
3661 return F32MaskToBf16Mask(pcmp_lt_or_nan<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3664template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(
const Packet4bf& a,
const Packet4bf& b)
3666 return F32MaskToBf16Mask(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3669template<> EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(
const Packet4bf& a)
3671 return Packet4bf(pxor<Packet4us>(Packet4us(a), pset1<Packet4us>(
static_cast<uint16_t
>(0x8000))));
3678#ifdef __apple_build_version__
3682#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
3684#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
3687#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
3695template <
typename T> uint64x2_t vreinterpretq_u64_f64(T a) {
return (uint64x2_t) a; }
3696template <
typename T> float64x2_t vreinterpretq_f64_u64(T a) {
return (float64x2_t) a; }
3699#if EIGEN_COMP_MSVC_STRICT
3700typedef eigen_packet_wrapper<float64x2_t, 18> Packet2d;
3701typedef eigen_packet_wrapper<float64x1_t, 19> Packet1d;
3703EIGEN_ALWAYS_INLINE Packet2d make_packet2d(
double a,
double b) {
3704 double from[2] = {a, b};
3705 return vld1q_f64(from);
3709typedef float64x2_t Packet2d;
3710typedef float64x1_t Packet1d;
3712EIGEN_ALWAYS_INLINE Packet2d make_packet2d(
double a,
double b) {
3713 double from[2] = {a, b};
3714 return vld1q_f64(from);
3721EIGEN_STRONG_INLINE Packet2d shuffle(
const Packet2d& m,
const Packet2d& n,
int mask)
3723 const double* a =
reinterpret_cast<const double*
>(&m);
3724 const double* b =
reinterpret_cast<const double*
>(&n);
3725 Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
3729EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(
const Packet2d& a,
const Packet2d& b,
int mask)
3731 return shuffle(a, b, mask);
3733EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(
const Packet2d& a,
const Packet2d& b)
3735 return shuffle(a, b, 0);
3737EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(
const Packet2d& a,
const Packet2d& b)
3739 return shuffle(a, b, 3);
3741#define vec2d_duplane(a, p) \
3742 Packet2d(vdupq_laneq_f64(a, p))
3744template<>
struct packet_traits<double> : default_packet_traits
3746 typedef Packet2d type;
3747 typedef Packet2d half;
3751 AlignedOnScalar = 1,
3787template<>
struct unpacket_traits<Packet2d>
3789 typedef double type;
3790 typedef Packet2d half;
3791 typedef Packet2l integer_packet;
3796 vectorizable =
true,
3797 masked_load_available =
false,
3798 masked_store_available =
false
3802template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(
const double& from) {
return vdupq_n_f64(from); }
3804template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(
const double& a)
3806 const double c[] = {0.0,1.0};
3807 return vaddq_f64(pset1<Packet2d>(a), vld1q_f64(c));
3810template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vaddq_f64(a,b); }
3812template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vsubq_f64(a,b); }
3814template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d& ,
const Packet2d& );
3815template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(
const Packet2d& a,
const Packet2d& b){
3816 const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
3817 return padd(a, pxor(mask, b));
3820template<> EIGEN_STRONG_INLINE Packet2d pnegate(
const Packet2d& a) {
return vnegq_f64(a); }
3822template<> EIGEN_STRONG_INLINE Packet2d pconj(
const Packet2d& a) {
return a; }
3824template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vmulq_f64(a,b); }
3826template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vdivq_f64(a,b); }
3828#ifdef EIGEN_VECTORIZE_FMA
3830template<> EIGEN_STRONG_INLINE Packet2d pmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c)
3831{
return vfmaq_f64(c,a,b); }
3833template<> EIGEN_STRONG_INLINE Packet2d pmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c)
3834{
return vmlaq_f64(c,a,b); }
3837template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vminq_f64(a,b); }
3839#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
3841template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vminnmq_f64(a, b); }
3842template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vmaxnmq_f64(a, b); }
3846template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return pmin<Packet2d>(a, b); }
3848template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vmaxq_f64(a,b); }
3851template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return pmax<Packet2d>(a, b); }
3854template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(
const Packet2d& a,
const Packet2d& b)
3855{
return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
3857template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(
const Packet2d& a,
const Packet2d& b)
3858{
return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
3860template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d& a,
const Packet2d& b)
3861{
return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
3863template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(
const Packet2d& a,
const Packet2d& b)
3864{
return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
3866template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(
const Packet2d& a,
const Packet2d& b)
3867{
return vreinterpretq_f64_u64(vcleq_f64(a,b)); }
3869template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(
const Packet2d& a,
const Packet2d& b)
3870{
return vreinterpretq_f64_u64(vcltq_f64(a,b)); }
3872template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(
const Packet2d& a,
const Packet2d& b)
3873{
return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a,b)))); }
3875template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(
const Packet2d& a,
const Packet2d& b)
3876{
return vreinterpretq_f64_u64(vceqq_f64(a,b)); }
3878template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(
const double* from)
3879{ EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_f64(from); }
3881template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(
const double* from)
3882{ EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_f64(from); }
3884template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(
const double* from) {
return vld1q_dup_f64(from); }
3885template<> EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const Packet2d& from)
3886{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to,from); }
3888template<> EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const Packet2d& from)
3889{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to,from); }
3891template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(
const double* from,
Index stride)
3893 Packet2d res = pset1<Packet2d>(0.0);
3894 res = vld1q_lane_f64(from + 0*stride, res, 0);
3895 res = vld1q_lane_f64(from + 1*stride, res, 1);
3899template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<double, Packet2d>(
double* to,
const Packet2d& from,
Index stride)
3901 vst1q_lane_f64(to + stride*0, from, 0);
3902 vst1q_lane_f64(to + stride*1, from, 1);
3905template<> EIGEN_STRONG_INLINE
void prefetch<double>(
const double* addr) { EIGEN_ARM_PREFETCH(addr); }
3908template<> EIGEN_STRONG_INLINE
double pfirst<Packet2d>(
const Packet2d& a) {
return vgetq_lane_f64(a,0); }
3910template<> EIGEN_STRONG_INLINE Packet2d preverse(
const Packet2d& a)
3911{
return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
3913template<> EIGEN_STRONG_INLINE Packet2d pabs(
const Packet2d& a) {
return vabsq_f64(a); }
3915#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
3917template<> EIGEN_STRONG_INLINE
double predux<Packet2d>(
const Packet2d& a)
3918{
return (vget_low_f64(a) + vget_high_f64(a))[0]; }
3920template<> EIGEN_STRONG_INLINE
double predux<Packet2d>(
const Packet2d& a)
3921{
return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
3926#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
3927template<> EIGEN_STRONG_INLINE
double predux_mul<Packet2d>(
const Packet2d& a)
3928{
return (vget_low_f64(a) * vget_high_f64(a))[0]; }
3930template<> EIGEN_STRONG_INLINE
double predux_mul<Packet2d>(
const Packet2d& a)
3931{
return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0); }
3935template<> EIGEN_STRONG_INLINE
double predux_min<Packet2d>(
const Packet2d& a)
3936{
return vgetq_lane_f64(vpminq_f64(a,a), 0); }
3939template<> EIGEN_STRONG_INLINE
double predux_max<Packet2d>(
const Packet2d& a)
3940{
return vgetq_lane_f64(vpmaxq_f64(a,a), 0); }
3943EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void
3944ptranspose(PacketBlock<Packet2d, 2>& kernel)
3946 const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
3947 const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
3949 kernel.packet[0] = tmp1;
3950 kernel.packet[1] = tmp2;
3953template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(
const Packet2d& mask,
const Packet2d& a,
const Packet2d& b)
3954{
return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
3956template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(
const Packet2d& a)
3957{
return vrndnq_f64(a); }
3959template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(
const Packet2d& a)
3960{
return vrndmq_f64(a); }
3962template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(
const Packet2d& a)
3963{
return vrndpq_f64(a); }
3965template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(
const Packet2d& a,
const Packet2d& exponent)
3966{
return pldexp_generic(a, exponent); }
3968template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(
const Packet2d& a, Packet2d& exponent)
3969{
return pfrexp_generic(a,exponent); }
3971template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from)
3972{
return vreinterpretq_f64_u64(vdupq_n_u64(from)); }
3974template<> EIGEN_STRONG_INLINE Packet2d prsqrt(
const Packet2d& a) {
3976 Packet2d x = vrsqrteq_f64(a);
3978 x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
3979 x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
3980 x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
3981 const Packet2d infinity = pset1<Packet2d>(NumTraits<double>::infinity());
3982 return pselect(pcmp_eq(a, pzero(a)), infinity, x);
3985template<> EIGEN_STRONG_INLINE Packet2d psqrt(
const Packet2d& _x){
return vsqrtq_f64(_x); }
3990#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
3991typedef float16x4_t Packet4hf;
3992typedef float16x8_t Packet8hf;
3995struct packet_traits<
Eigen::half> : default_packet_traits {
3996 typedef Packet8hf type;
3997 typedef Packet4hf half;
4000 AlignedOnScalar = 1,
4032 HasErf = EIGEN_FAST_MATH,
4039struct unpacket_traits<Packet4hf> {
4041 typedef Packet4hf half;
4045 vectorizable =
true,
4046 masked_load_available =
false,
4047 masked_store_available =
false
4052struct unpacket_traits<Packet8hf> {
4054 typedef Packet4hf half;
4058 vectorizable =
true,
4059 masked_load_available =
false,
4060 masked_store_available =
false
4065EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(
const Packet8hf& a) {
4066 return vadd_f16(vget_low_f16(a), vget_high_f16(a));
4070EIGEN_STRONG_INLINE Packet8hf pset1<Packet8hf>(
const Eigen::half& from) {
4071 return vdupq_n_f16(from.x);
4075EIGEN_STRONG_INLINE Packet4hf pset1<Packet4hf>(
const Eigen::half& from) {
4076 return vdup_n_f16(from.x);
4080EIGEN_STRONG_INLINE Packet8hf plset<Packet8hf>(
const Eigen::half& a) {
4081 const float16_t f[] = {0, 1, 2, 3, 4, 5, 6, 7};
4082 Packet8hf countdown = vld1q_f16(f);
4083 return vaddq_f16(pset1<Packet8hf>(a), countdown);
4087EIGEN_STRONG_INLINE Packet4hf plset<Packet4hf>(
const Eigen::half& a) {
4088 const float16_t f[] = {0, 1, 2, 3};
4089 Packet4hf countdown = vld1_f16(f);
4090 return vadd_f16(pset1<Packet4hf>(a), countdown);
4094EIGEN_STRONG_INLINE Packet8hf padd<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
4095 return vaddq_f16(a, b);
4099EIGEN_STRONG_INLINE Packet4hf padd<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
4100 return vadd_f16(a, b);
4104EIGEN_STRONG_INLINE Packet8hf psub<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
4105 return vsubq_f16(a, b);
4109EIGEN_STRONG_INLINE Packet4hf psub<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
4110 return vsub_f16(a, b);
4114EIGEN_STRONG_INLINE Packet8hf pnegate(
const Packet8hf& a) {
4115 return vnegq_f16(a);
4119EIGEN_STRONG_INLINE Packet4hf pnegate(
const Packet4hf& a) {
4124EIGEN_STRONG_INLINE Packet8hf pconj(
const Packet8hf& a) {
4129EIGEN_STRONG_INLINE Packet4hf pconj(
const Packet4hf& a) {
4134EIGEN_STRONG_INLINE Packet8hf pmul<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
4135 return vmulq_f16(a, b);
4139EIGEN_STRONG_INLINE Packet4hf pmul<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
4140 return vmul_f16(a, b);
4144EIGEN_STRONG_INLINE Packet8hf pdiv<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
4145 return vdivq_f16(a, b);
4149EIGEN_STRONG_INLINE Packet4hf pdiv<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
4150 return vdiv_f16(a, b);
4154EIGEN_STRONG_INLINE Packet8hf pmadd(
const Packet8hf& a,
const Packet8hf& b,
const Packet8hf& c) {
4155 return vfmaq_f16(c, a, b);
4159EIGEN_STRONG_INLINE Packet4hf pmadd(
const Packet4hf& a,
const Packet4hf& b,
const Packet4hf& c) {
4160 return vfma_f16(c, a, b);
4164EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
4165 return vminq_f16(a, b);
4169EIGEN_STRONG_INLINE Packet4hf pmin<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
4170 return vmin_f16(a, b);
#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
// Number-propagating minimum, built on the vminnm intrinsics; only compiled when the target
// advertises __ARM_FEATURE_NUMERIC_MAXMIN.
template <>
EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
  return vminnm_f16(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
  return vminnmq_f16(a, b);
}
#endif
4179template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
return pmin<Packet4hf>(a, b); }
4181template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
return pmin<Packet8hf>(a, b); }
4184EIGEN_STRONG_INLINE Packet8hf pmax<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
4185 return vmaxq_f16(a, b);
4189EIGEN_STRONG_INLINE Packet4hf pmax<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
4190 return vmax_f16(a, b);
#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
// Number-propagating maximum, built on the vmaxnm intrinsics; only compiled when the target
// advertises __ARM_FEATURE_NUMERIC_MAXMIN.
template <>
EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
  return vmaxnm_f16(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
  return vmaxnmq_f16(a, b);
}
#endif
4199template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
return pmax<Packet4hf>(a, b); }
4201template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
return pmax<Packet8hf>(a, b); }
4203#define EIGEN_MAKE_ARM_FP16_CMP_8(name) \
4205 EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \
4206 return vreinterpretq_f16_u16(vc##name##q_f16(a, b)); \
4209#define EIGEN_MAKE_ARM_FP16_CMP_4(name) \
4211 EIGEN_STRONG_INLINE Packet4hf pcmp_##name(const Packet4hf& a, const Packet4hf& b) { \
4212 return vreinterpret_f16_u16(vc##name##_f16(a, b)); \
4215EIGEN_MAKE_ARM_FP16_CMP_8(eq)
4216EIGEN_MAKE_ARM_FP16_CMP_8(lt)
4217EIGEN_MAKE_ARM_FP16_CMP_8(le)
4219EIGEN_MAKE_ARM_FP16_CMP_4(eq)
4220EIGEN_MAKE_ARM_FP16_CMP_4(lt)
4221EIGEN_MAKE_ARM_FP16_CMP_4(le)
4223#undef EIGEN_MAKE_ARM_FP16_CMP_8
4224#undef EIGEN_MAKE_ARM_FP16_CMP_4
4227EIGEN_STRONG_INLINE Packet8hf pcmp_lt_or_nan<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
4228 return vreinterpretq_f16_u16(vmvnq_u16(vcgeq_f16(a, b)));
4232EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
4233 return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b)));
4237EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(
const Packet8hf& a)
4238{
return vrndnq_f16(a); }
4241EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(
const Packet4hf& a)
4242{
return vrndn_f16(a); }
4245EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(
const Packet8hf& a)
4246{
return vrndmq_f16(a); }
4249EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(
const Packet4hf& a)
4250{
return vrndm_f16(a); }
4253EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(
const Packet8hf& a)
4254{
return vrndpq_f16(a); }
4257EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(
const Packet4hf& a)
4258{
return vrndp_f16(a); }
4261EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(
const Packet8hf& a) {
4262 return vsqrtq_f16(a);
4266EIGEN_STRONG_INLINE Packet4hf psqrt<Packet4hf>(
const Packet4hf& a) {
4267 return vsqrt_f16(a);
4271EIGEN_STRONG_INLINE Packet8hf pand<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
4272 return vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
4276EIGEN_STRONG_INLINE Packet4hf pand<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
4277 return vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
4281EIGEN_STRONG_INLINE Packet8hf por<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
4282 return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
4286EIGEN_STRONG_INLINE Packet4hf por<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
4287 return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
4291EIGEN_STRONG_INLINE Packet8hf pxor<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
4292 return vreinterpretq_f16_u16(veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
4296EIGEN_STRONG_INLINE Packet4hf pxor<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
4297 return vreinterpret_f16_u16(veor_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
4301EIGEN_STRONG_INLINE Packet8hf pandnot<Packet8hf>(
const Packet8hf& a,
const Packet8hf& b) {
4302 return vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
4306EIGEN_STRONG_INLINE Packet4hf pandnot<Packet4hf>(
const Packet4hf& a,
const Packet4hf& b) {
4307 return vreinterpret_f16_u16(vbic_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
4311EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(
const Eigen::half* from) {
4312 EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_f16(
reinterpret_cast<const float16_t*
>(from));
4316EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(
const Eigen::half* from) {
4317 EIGEN_DEBUG_ALIGNED_LOAD
return vld1_f16(
reinterpret_cast<const float16_t*
>(from));
4321EIGEN_STRONG_INLINE Packet8hf ploadu<Packet8hf>(
const Eigen::half* from) {
4322 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_f16(
reinterpret_cast<const float16_t*
>(from));
4326EIGEN_STRONG_INLINE Packet4hf ploadu<Packet4hf>(
const Eigen::half* from) {
4327 EIGEN_DEBUG_UNALIGNED_LOAD
return vld1_f16(
reinterpret_cast<const float16_t*
>(from));
4331EIGEN_STRONG_INLINE Packet8hf ploaddup<Packet8hf>(
const Eigen::half* from) {
4333 packet[0] = from[0].x;
4334 packet[1] = from[0].x;
4335 packet[2] = from[1].x;
4336 packet[3] = from[1].x;
4337 packet[4] = from[2].x;
4338 packet[5] = from[2].x;
4339 packet[6] = from[3].x;
4340 packet[7] = from[3].x;
// NOTE(review): most of this Packet4hf ploaddup specialization is missing from this excerpt
// (the declarations of `packet` and `tmp`, the lane writes, the return and the closing brace).
// Only the aliasing of `packet` through a float16_t pointer is visible -- restore the rest
// from the upstream file.
4345EIGEN_STRONG_INLINE Packet4hf ploaddup<Packet4hf>(
const Eigen::half* from) {
4348 tmp = (float16_t*)&packet;
4357EIGEN_STRONG_INLINE Packet8hf ploadquad<Packet8hf>(
const Eigen::half* from) {
4359 lo = vld1_dup_f16(
reinterpret_cast<const float16_t*
>(from));
4360 hi = vld1_dup_f16(
reinterpret_cast<const float16_t*
>(from+1));
4361 return vcombine_f16(lo, hi);
4364EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(
const Packet8hf& a,
Eigen::half b) {
return vsetq_lane_f16(b.x, a, 0); }
4366EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(
const Packet4hf& a,
Eigen::half b) {
return vset_lane_f16(b.x, a, 0); }
4369EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(
const Packet8hf& mask,
const Packet8hf& a,
const Packet8hf& b) {
4370 return vbslq_f16(vreinterpretq_u16_f16(mask), a, b);
4374EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(
const Packet4hf& mask,
const Packet4hf& a,
const Packet4hf& b) {
4375 return vbsl_f16(vreinterpret_u16_f16(mask), a, b);
4378EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(
const Packet8hf& a,
Eigen::half b) {
return vsetq_lane_f16(b.x, a, 7); }
4380EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(
const Packet4hf& a,
Eigen::half b) {
return vset_lane_f16(b.x, a, 3); }
4383EIGEN_STRONG_INLINE
void pstore<Eigen::half>(
Eigen::half* to,
const Packet8hf& from) {
4384 EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(
reinterpret_cast<float16_t*
>(to), from);
4388EIGEN_STRONG_INLINE
void pstore<Eigen::half>(
Eigen::half* to,
const Packet4hf& from) {
4389 EIGEN_DEBUG_ALIGNED_STORE vst1_f16(
reinterpret_cast<float16_t*
>(to), from);
4393EIGEN_STRONG_INLINE
void pstoreu<Eigen::half>(
Eigen::half* to,
const Packet8hf& from) {
4394 EIGEN_DEBUG_UNALIGNED_STORE vst1q_f16(
reinterpret_cast<float16_t*
>(to), from);
4398EIGEN_STRONG_INLINE
void pstoreu<Eigen::half>(
Eigen::half* to,
const Packet4hf& from) {
4399 EIGEN_DEBUG_UNALIGNED_STORE vst1_f16(
reinterpret_cast<float16_t*
>(to), from);
4403EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather<Eigen::half, Packet8hf>(
const Eigen::half* from,
Index stride) {
4404 Packet8hf res = pset1<Packet8hf>(
Eigen::half(0.f));
4405 res = vsetq_lane_f16(from[0 * stride].x, res, 0);
4406 res = vsetq_lane_f16(from[1 * stride].x, res, 1);
4407 res = vsetq_lane_f16(from[2 * stride].x, res, 2);
4408 res = vsetq_lane_f16(from[3 * stride].x, res, 3);
4409 res = vsetq_lane_f16(from[4 * stride].x, res, 4);
4410 res = vsetq_lane_f16(from[5 * stride].x, res, 5);
4411 res = vsetq_lane_f16(from[6 * stride].x, res, 6);
4412 res = vsetq_lane_f16(from[7 * stride].x, res, 7);
4417EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather<Eigen::half, Packet4hf>(
const Eigen::half* from,
Index stride) {
4418 Packet4hf res = pset1<Packet4hf>(
Eigen::half(0.f));
4419 res = vset_lane_f16(from[0 * stride].x, res, 0);
4420 res = vset_lane_f16(from[1 * stride].x, res, 1);
4421 res = vset_lane_f16(from[2 * stride].x, res, 2);
4422 res = vset_lane_f16(from[3 * stride].x, res, 3);
4427EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<Eigen::half, Packet8hf>(
Eigen::half* to,
const Packet8hf& from,
Index stride) {
4428 to[stride * 0].x = vgetq_lane_f16(from, 0);
4429 to[stride * 1].x = vgetq_lane_f16(from, 1);
4430 to[stride * 2].x = vgetq_lane_f16(from, 2);
4431 to[stride * 3].x = vgetq_lane_f16(from, 3);
4432 to[stride * 4].x = vgetq_lane_f16(from, 4);
4433 to[stride * 5].x = vgetq_lane_f16(from, 5);
4434 to[stride * 6].x = vgetq_lane_f16(from, 6);
4435 to[stride * 7].x = vgetq_lane_f16(from, 7);
4439EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<Eigen::half, Packet4hf>(
Eigen::half* to,
const Packet4hf& from,
Index stride) {
4440 to[stride * 0].x = vget_lane_f16(from, 0);
4441 to[stride * 1].x = vget_lane_f16(from, 1);
4442 to[stride * 2].x = vget_lane_f16(from, 2);
4443 to[stride * 3].x = vget_lane_f16(from, 3);
4447EIGEN_STRONG_INLINE
void prefetch<Eigen::half>(
const Eigen::half* addr) {
4448 EIGEN_ARM_PREFETCH(addr);
// NOTE(review): only the signature of pfirst<Packet8hf> survives in this excerpt; the body
// and closing brace are missing -- restore them from the upstream file.
4452EIGEN_STRONG_INLINE
Eigen::half pfirst<Packet8hf>(
const Packet8hf& a) {
// NOTE(review): only the signature of pfirst<Packet4hf> survives in this excerpt; the body
// and closing brace are missing -- restore them from the upstream file.
4461EIGEN_STRONG_INLINE
Eigen::half pfirst<Packet4hf>(
const Packet4hf& a) {
4469template<> EIGEN_STRONG_INLINE Packet8hf preverse(
const Packet8hf& a) {
4470 float16x4_t a_lo, a_hi;
4473 a_r64 = vrev64q_f16(a);
4474 a_lo = vget_low_f16(a_r64);
4475 a_hi = vget_high_f16(a_r64);
4476 return vcombine_f16(a_hi, a_lo);
4480EIGEN_STRONG_INLINE Packet4hf preverse<Packet4hf>(
const Packet4hf& a) {
4481 return vrev64_f16(a);
4485EIGEN_STRONG_INLINE Packet8hf pabs<Packet8hf>(
const Packet8hf& a) {
4486 return vabsq_f16(a);
// NOTE(review): only the signature of pabs<Packet4hf> survives in this excerpt; the body and
// closing brace are missing. By analogy with the Packet8hf specialization it presumably
// returns vabs_f16(a) -- confirm against the upstream file.
4490EIGEN_STRONG_INLINE Packet4hf pabs<Packet4hf>(
const Packet4hf& a) {
4495EIGEN_STRONG_INLINE
Eigen::half predux<Packet8hf>(
const Packet8hf& a) {
4496 float16x4_t a_lo, a_hi, sum;
4498 a_lo = vget_low_f16(a);
4499 a_hi = vget_high_f16(a);
4500 sum = vpadd_f16(a_lo, a_hi);
4501 sum = vpadd_f16(sum, sum);
4502 sum = vpadd_f16(sum, sum);
4505 h.x = vget_lane_f16(sum, 0);
4510EIGEN_STRONG_INLINE
Eigen::half predux<Packet4hf>(
const Packet4hf& a) {
4513 sum = vpadd_f16(a, a);
4514 sum = vpadd_f16(sum, sum);
4516 h.x = vget_lane_f16(sum, 0);
4521EIGEN_STRONG_INLINE
Eigen::half predux_mul<Packet8hf>(
const Packet8hf& a) {
4522 float16x4_t a_lo, a_hi, prod;
4524 a_lo = vget_low_f16(a);
4525 a_hi = vget_high_f16(a);
4526 prod = vmul_f16(a_lo, a_hi);
4527 prod = vmul_f16(prod, vrev64_f16(prod));
4530 h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
4535EIGEN_STRONG_INLINE
Eigen::half predux_mul<Packet4hf>(
const Packet4hf& a) {
4537 prod = vmul_f16(a, vrev64_f16(a));
4539 h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
4544EIGEN_STRONG_INLINE
Eigen::half predux_min<Packet8hf>(
const Packet8hf& a) {
4545 float16x4_t a_lo, a_hi, min;
4547 a_lo = vget_low_f16(a);
4548 a_hi = vget_high_f16(a);
4549 min = vpmin_f16(a_lo, a_hi);
4550 min = vpmin_f16(min, min);
4551 min = vpmin_f16(min, min);
4554 h.x = vget_lane_f16(min, 0);
4559EIGEN_STRONG_INLINE
Eigen::half predux_min<Packet4hf>(
const Packet4hf& a) {
4561 tmp = vpmin_f16(a, a);
4562 tmp = vpmin_f16(tmp, tmp);
4564 h.x = vget_lane_f16(tmp, 0);
4569EIGEN_STRONG_INLINE
Eigen::half predux_max<Packet8hf>(
const Packet8hf& a) {
4570 float16x4_t a_lo, a_hi, max;
4572 a_lo = vget_low_f16(a);
4573 a_hi = vget_high_f16(a);
4574 max = vpmax_f16(a_lo, a_hi);
4575 max = vpmax_f16(max, max);
4576 max = vpmax_f16(max, max);
4579 h.x = vget_lane_f16(max, 0);
4584EIGEN_STRONG_INLINE
Eigen::half predux_max<Packet4hf>(
const Packet4hf& a) {
4586 tmp = vpmax_f16(a, a);
4587 tmp = vpmax_f16(tmp, tmp);
4589 h.x = vget_lane_f16(tmp, 0);
4593EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8hf, 4>& kernel)
4595 const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
4596 const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
4598 const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0]));
4599 const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1]));
4601 kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]);
4602 kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]);
4603 kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]);
4604 kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]);
4607EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {
4608 EIGEN_ALIGN16 float16x4x4_t tmp_x4;
4609 float16_t* tmp = (float16_t*)&kernel;
4610 tmp_x4 = vld4_f16(tmp);
4612 kernel.packet[0] = tmp_x4.val[0];
4613 kernel.packet[1] = tmp_x4.val[1];
4614 kernel.packet[2] = tmp_x4.val[2];
4615 kernel.packet[3] = tmp_x4.val[3];
4618EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8hf, 8>& kernel) {
4619 float16x8x2_t T_1[4];
4621 T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]);
4622 T_1[1] = vuzpq_f16(kernel.packet[2], kernel.packet[3]);
4623 T_1[2] = vuzpq_f16(kernel.packet[4], kernel.packet[5]);
4624 T_1[3] = vuzpq_f16(kernel.packet[6], kernel.packet[7]);
4626 float16x8x2_t T_2[4];
4627 T_2[0] = vuzpq_f16(T_1[0].val[0], T_1[1].val[0]);
4628 T_2[1] = vuzpq_f16(T_1[0].val[1], T_1[1].val[1]);
4629 T_2[2] = vuzpq_f16(T_1[2].val[0], T_1[3].val[0]);
4630 T_2[3] = vuzpq_f16(T_1[2].val[1], T_1[3].val[1]);
4632 float16x8x2_t T_3[4];
4633 T_3[0] = vuzpq_f16(T_2[0].val[0], T_2[2].val[0]);
4634 T_3[1] = vuzpq_f16(T_2[0].val[1], T_2[2].val[1]);
4635 T_3[2] = vuzpq_f16(T_2[1].val[0], T_2[3].val[0]);
4636 T_3[3] = vuzpq_f16(T_2[1].val[1], T_2[3].val[1]);
4638 kernel.packet[0] = T_3[0].val[0];
4639 kernel.packet[1] = T_3[2].val[0];
4640 kernel.packet[2] = T_3[1].val[0];
4641 kernel.packet[3] = T_3[3].val[0];
4642 kernel.packet[4] = T_3[0].val[1];
4643 kernel.packet[5] = T_3[2].val[1];
4644 kernel.packet[6] = T_3[1].val[1];
4645 kernel.packet[7] = T_3[3].val[1];
Base class for all dense matrices, vectors, and expressions.
Definition MatrixBase.h:50
@ Unaligned
Data pointer has no specific alignment.
Definition Constants.h:233
@ Aligned16
Data pointer is aligned on a 16 bytes boundary.
Definition Constants.h:235
Namespace containing all symbols from the Eigen library.
Definition LDLT.h:16
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:74
detail namespace with internal helper functions
Definition json.hpp:249
Definition GenericPacketMath.h:43
Definition GenericPacketMath.h:107
Definition GenericPacketMath.h:133