10#ifndef EIGEN_PACKET_MATH_ALTIVEC_H
11#define EIGEN_PACKET_MATH_ALTIVEC_H
17#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
18#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
21#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
22#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
26#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
27#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
// Packet type aliases for the AltiVec vector types used throughout this file.
// The digit(s) in each name match the number of initializers used for that
// type elsewhere in this file (e.g. the *_COUNTDOWN constants): 4, 8 or 16.
// Vector of float.
30typedef __vector
float Packet4f;
// Vector of signed int.
31typedef __vector
int Packet4i;
// Vector of unsigned int.
32typedef __vector
unsigned int Packet4ui;
// Vector of __bool int (NOTE(review): presumably the comparison-mask type; confirm against users).
33typedef __vector __bool
int Packet4bi;
// Vector of signed short.
34typedef __vector
short int Packet8s;
// Vector of unsigned short.
35typedef __vector
unsigned short int Packet8us;
// Vector of signed char.
36typedef __vector
signed char Packet16c;
// Vector of unsigned char; also the type of the vec_perm control masks below.
37typedef __vector
unsigned char Packet16uc;
// bfloat16 packet: an unsigned-short vector wrapped in eigen_packet_wrapper
// (tag 0) so that it is a distinct type from Packet8us.
38typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf;
// Helper macros that declare a packet constant named p<suffix>_NAME with every
// lane set to X. The *_FAST_* variants use a brace initializer (or
// vec_splat_s32); the plain variants go through pset1<>().

// Declares Packet4f p4f_NAME = {X, X, X, X}.
42#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
43 Packet4f p4f_##NAME = {X, X, X, X}
// Declares Packet4i p4i_NAME via vec_splat_s32(X).
// NOTE(review): vec_splat_s32 presumably only accepts a small literal
// immediate -- confirm the allowed range before adding new constants.
45#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
46 Packet4i p4i_##NAME = vec_splat_s32(X)
// Declares Packet4ui p4ui_NAME = {X, X, X, X}.
48#define _EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \
49 Packet4ui p4ui_##NAME = {X, X, X, X}
// Declares Packet8us p8us_NAME with all eight lanes equal to X.
51#define _EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \
52 Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
// Declares Packet16uc p16uc_NAME with all sixteen lanes equal to X.
54#define _EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \
55 Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
// Declares Packet4f p4f_NAME = pset1<Packet4f>(X).
57#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
58 Packet4f p4f_##NAME = pset1<Packet4f>(X)
// Declares Packet4i p4i_NAME = pset1<Packet4i>(X).
60#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
61 Packet4i p4i_##NAME = pset1<Packet4i>(X)
// Declares Packet2d p2d_NAME = pset1<Packet2d>(X).
63#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
64 Packet2d p2d_##NAME = pset1<Packet2d>(X)
// Declares Packet2l p2l_NAME = pset1<Packet2l>(X).
66#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
67 Packet2l p2l_##NAME = pset1<Packet2l>(X)
// Declares a const Packet4f whose bits are those of the integer X broadcast
// to every lane (pset1<Packet4i> followed by a reinterpret_cast).
69#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
70 const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
// Packs three fields into one word: size in bits 24 and up, count in bits 16
// and up, stride in the low bits. Arguments are not range-checked or masked.
73#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
// Shorthand for the scalar type associated with a packet type.
74#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
// File-scope packet constants built with the *_FAST_* macros: each lane of the
// resulting vector holds the value given as second argument.
77static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
78static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
79static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);
80static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16);
81static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);
// Bit patterns used by pround(): 0x80000000 is ANDed with the input and
// 0x3EFFFFFF is ORed onto the result.
82static _EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
83static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
84static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1);
85static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1);
// All-ones lanes shifted left by all-ones lanes, reinterpreted as float.
// NOTE(review): presumably yields 0x80000000 (-0.0f) per lane, assuming vec_sl
// only uses the low bits of the shift count -- confirm. Used as the addend in
// pmul<Packet4f>/pdiv<Packet4f> and as the XOR mask in pnegate<Packet4f>.
86static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);
87#ifndef EIGEN_VECTORIZE_VSX
88static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
// Ramp constants {0, 1, 2, ...}; plset<>() adds these to a broadcast value.
91static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
92static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
93static Packet8s p8s_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
94static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
96static Packet16c p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
97 8, 9, 10, 11, 12, 13, 14, 15};
98static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
99 8, 9, 10, 11, 12, 13, 14, 15};
// Byte-index tables, used in this file as the control operand of vec_perm.
// REVERSE{32,16,8}: list the 4-, 2- and 1-byte groups of a 16-byte vector in
// reverse order.
101static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
102static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 };
103static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 };
// DUPLICATE*_HI: repeat each of the leading groups twice (first two 4-byte
// groups, first four 2-byte groups, first eight bytes); used by ploaddup<>().
105static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
106static Packet16uc p16uc_DUPLICATE16_HI = { 0,1,0,1, 2,3,2,3, 4,5,4,5, 6,7,6,7 };
107static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 };
// DUPLICATE16_EVEN / _ODD: repeat every even-numbered (resp. odd-numbered)
// 2-byte group twice. Unlike their neighbours these two are declared const.
108static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 };
109static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 };
// QUADRUPLICATE16_HI: repeat each of the first two 2-byte groups four times;
// used by ploadquad<>().
111static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 };
116static Packet16uc p16uc_FORWARD = vec_lvsl(0, (
float*)0);
117#ifdef EIGEN_VECTORIZE_VSX
118static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
120static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);
121static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);
122static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);
124static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
125static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
126static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);
127static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);
128static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8);
131static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);
132static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);
133static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16;
134static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16;
136static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);
139static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);
141static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8);
144#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
145 #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
147 #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
167 HasSin = EIGEN_FAST_MATH,
168 HasCos = EIGEN_FAST_MATH,
171#ifdef EIGEN_VECTORIZE_VSX
178 HasTanh = EIGEN_FAST_MATH,
179 HasErf = EIGEN_FAST_MATH,
184 HasTanh = EIGEN_FAST_MATH,
185 HasErf = EIGEN_FAST_MATH,
211 HasSin = EIGEN_FAST_MATH,
212 HasCos = EIGEN_FAST_MATH,
215#ifdef EIGEN_VECTORIZE_VSX
334 enum {size=4, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
340 enum {size=4, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
344 typedef short int type;
346 enum {size=8, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
350 typedef unsigned short int type;
352 enum {size=8, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
357 typedef signed char type;
359 enum {size=16, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
363 typedef unsigned char type;
365 enum {size=16, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
372 enum {size=8, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
374inline std::ostream &
operator <<(std::ostream & s,
const Packet16c & v)
381 for (
int i=0; i< 16; i++)
382 s <<
vt.n[i] <<
", ";
386inline std::ostream & operator <<(std::ostream & s,
const Packet16uc & v)
393 for (
int i=0; i< 16; i++)
394 s << vt.n[i] <<
", ";
398inline std::ostream & operator <<(std::ostream & s,
const Packet4f & v)
405 s << vt.n[0] <<
", " << vt.n[1] <<
", " << vt.n[2] <<
", " << vt.n[3];
409inline std::ostream & operator <<(std::ostream & s,
const Packet4i & v)
416 s << vt.n[0] <<
", " << vt.n[1] <<
", " << vt.n[2] <<
", " << vt.n[3];
420inline std::ostream & operator <<(std::ostream & s,
const Packet4ui & v)
427 s << vt.n[0] <<
", " << vt.n[1] <<
", " << vt.n[2] <<
", " << vt.n[3];
// Shared implementation behind the pload<> specializations: loads one packet
// starting at `from`.
// NOTE(review): callers are presumably required to pass a 16-byte aligned
// pointer (this is the "aligned" load) -- confirm.
431template <
typename Packet>
432EIGEN_STRONG_INLINE Packet pload_common(
const __UNPACK_TYPE__(Packet)* from)
436 EIGEN_UNUSED_VARIABLE(from);
437 EIGEN_DEBUG_ALIGNED_LOAD
438#ifdef EIGEN_VECTORIZE_VSX
// VSX build: vec_xl at byte offset 0; the const_cast drops the const
// qualifier before passing the pointer to the intrinsic.
439 return vec_xl(0,
const_cast<__UNPACK_TYPE__(Packet)*
>(from));
// Presumably the non-VSX alternative: vec_ld at byte offset 0.
441 return vec_ld(0, from);
446template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(
const float* from)
448 return pload_common<Packet4f>(from);
451template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(
const int* from)
453 return pload_common<Packet4i>(from);
456template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(
const short int* from)
458 return pload_common<Packet8s>(from);
461template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(
const unsigned short int* from)
463 return pload_common<Packet8us>(from);
466template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(
const signed char* from)
468 return pload_common<Packet16c>(from);
471template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(
const unsigned char* from)
473 return pload_common<Packet16uc>(from);
476template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(
const bfloat16* from)
478 return pload_common<Packet8us>(
reinterpret_cast<const unsigned short int*
>(from));
// Shared implementation behind the pstore<> specializations: writes the packet
// `from` to memory starting at `to`.
// NOTE(review): `to` is presumably required to be 16-byte aligned -- confirm.
481template <
typename Packet>
482EIGEN_STRONG_INLINE
void pstore_common(__UNPACK_TYPE__(Packet)* to,
const Packet& from){
485 EIGEN_UNUSED_VARIABLE(to);
486 EIGEN_DEBUG_ALIGNED_STORE
487#ifdef EIGEN_VECTORIZE_VSX
// VSX build: vec_xst with byte offset 0.
488 vec_xst(from, 0, to);
494template<> EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet4f& from)
496 pstore_common<Packet4f>(to, from);
499template<> EIGEN_STRONG_INLINE
void pstore<int>(
int* to,
const Packet4i& from)
501 pstore_common<Packet4i>(to, from);
504template<> EIGEN_STRONG_INLINE
void pstore<short int>(
short int* to,
const Packet8s& from)
506 pstore_common<Packet8s>(to, from);
509template<> EIGEN_STRONG_INLINE
void pstore<unsigned short int>(
unsigned short int* to,
const Packet8us& from)
511 pstore_common<Packet8us>(to, from);
514template<> EIGEN_STRONG_INLINE
void pstore<bfloat16>(bfloat16* to,
const Packet8bf& from)
516 pstore_common<Packet8us>(
reinterpret_cast<unsigned short int*
>(to), from);
519template<> EIGEN_STRONG_INLINE
void pstore<signed char>(
signed char* to,
const Packet16c& from)
521 pstore_common<Packet16c>(to, from);
524template<> EIGEN_STRONG_INLINE
void pstore<unsigned char>(
unsigned char* to,
const Packet16uc& from)
526 pstore_common<Packet16uc>(to, from);
529template<
typename Packet>
530EIGEN_STRONG_INLINE Packet pset1_size4(
const __UNPACK_TYPE__(Packet)& from)
532 Packet v = {from, from, from, from};
536template<
typename Packet>
537EIGEN_STRONG_INLINE Packet pset1_size8(
const __UNPACK_TYPE__(Packet)& from)
539 Packet v = {from, from, from, from, from, from, from, from};
543template<
typename Packet>
544EIGEN_STRONG_INLINE Packet pset1_size16(
const __UNPACK_TYPE__(Packet)& from)
546 Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
550template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(
const float& from) {
551 return pset1_size4<Packet4f>(from);
554template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(
const int& from) {
555 return pset1_size4<Packet4i>(from);
558template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(
const short int& from) {
559 return pset1_size8<Packet8s>(from);
562template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(
const unsigned short int& from) {
563 return pset1_size8<Packet8us>(from);
566template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(
const signed char& from) {
567 return pset1_size16<Packet16c>(from);
570template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(
const unsigned char& from) {
571 return pset1_size16<Packet16uc>(from);
574template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(
unsigned int from) {
575 return reinterpret_cast<Packet4f
>(pset1<Packet4i>(from));
578template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(
const bfloat16& from) {
579 return pset1_size8<Packet8us>(
reinterpret_cast<const unsigned short int&
>(from));
582template<
typename Packet> EIGEN_STRONG_INLINE
void
583pbroadcast4_common(
const __UNPACK_TYPE__(Packet) *a,
584 Packet& a0, Packet& a1, Packet& a2, Packet& a3)
586 a3 = pload<Packet>(a);
587 a0 = vec_splat(a3, 0);
588 a1 = vec_splat(a3, 1);
589 a2 = vec_splat(a3, 2);
590 a3 = vec_splat(a3, 3);
593template<> EIGEN_STRONG_INLINE
void
594pbroadcast4<Packet4f>(
const float *a,
595 Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
597 pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
599template<> EIGEN_STRONG_INLINE
void
600pbroadcast4<Packet4i>(
const int *a,
601 Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
603 pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
// Strided gather of four scalars: reads from[0], from[stride], from[2*stride]
// and from[3*stride] into a 16-byte aligned temporary, then loads that
// temporary as a packet with pload<Packet>().
// `stride` is counted in elements, not bytes (it multiplies the index of a
// typed pointer).
606template<
typename Packet> EIGEN_DEVICE_FUNC
inline Packet pgather_common(
const __UNPACK_TYPE__(Packet)* from,
Index stride)
608 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
609 a[0] = from[0*stride];
610 a[1] = from[1*stride];
611 a[2] = from[2*stride];
612 a[3] = from[3*stride];
613 return pload<Packet>(a);
616template<> EIGEN_DEVICE_FUNC
inline Packet4f pgather<float, Packet4f>(
const float* from,
Index stride)
618 return pgather_common<Packet4f>(from, stride);
621template<> EIGEN_DEVICE_FUNC
inline Packet4i pgather<int, Packet4i>(
const int* from,
Index stride)
623 return pgather_common<Packet4i>(from, stride);
626template<
typename Packet> EIGEN_DEVICE_FUNC
inline Packet pgather_size8(
const __UNPACK_TYPE__(Packet)* from,
Index stride)
628 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
629 a[0] = from[0*stride];
630 a[1] = from[1*stride];
631 a[2] = from[2*stride];
632 a[3] = from[3*stride];
633 a[4] = from[4*stride];
634 a[5] = from[5*stride];
635 a[6] = from[6*stride];
636 a[7] = from[7*stride];
637 return pload<Packet>(a);
640template<> EIGEN_DEVICE_FUNC
inline Packet8s pgather<short int, Packet8s>(
const short int* from,
Index stride)
642 return pgather_size8<Packet8s>(from, stride);
645template<> EIGEN_DEVICE_FUNC
inline Packet8us pgather<unsigned short int, Packet8us>(
const unsigned short int* from,
Index stride)
647 return pgather_size8<Packet8us>(from, stride);
650template<> EIGEN_DEVICE_FUNC
inline Packet8bf pgather<bfloat16, Packet8bf>(
const bfloat16* from,
Index stride)
652 return pgather_size8<Packet8bf>(from, stride);
655template<
typename Packet> EIGEN_DEVICE_FUNC
inline Packet pgather_size16(
const __UNPACK_TYPE__(Packet)* from,
Index stride)
657 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
658 a[0] = from[0*stride];
659 a[1] = from[1*stride];
660 a[2] = from[2*stride];
661 a[3] = from[3*stride];
662 a[4] = from[4*stride];
663 a[5] = from[5*stride];
664 a[6] = from[6*stride];
665 a[7] = from[7*stride];
666 a[8] = from[8*stride];
667 a[9] = from[9*stride];
668 a[10] = from[10*stride];
669 a[11] = from[11*stride];
670 a[12] = from[12*stride];
671 a[13] = from[13*stride];
672 a[14] = from[14*stride];
673 a[15] = from[15*stride];
674 return pload<Packet>(a);
678template<> EIGEN_DEVICE_FUNC
inline Packet16c pgather<signed char, Packet16c>(
const signed char* from,
Index stride)
680 return pgather_size16<Packet16c>(from, stride);
683template<> EIGEN_DEVICE_FUNC
inline Packet16uc pgather<unsigned char, Packet16uc>(
const unsigned char* from,
Index stride)
685 return pgather_size16<Packet16uc>(from, stride);
688template<
typename Packet> EIGEN_DEVICE_FUNC
inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to,
const Packet& from,
Index stride)
690 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
691 pstore<__UNPACK_TYPE__(Packet)>(a, from);
698template<> EIGEN_DEVICE_FUNC
inline void pscatter<float, Packet4f>(
float* to,
const Packet4f& from,
Index stride)
700 pscatter_size4<Packet4f>(to, from, stride);
703template<> EIGEN_DEVICE_FUNC
inline void pscatter<int, Packet4i>(
int* to,
const Packet4i& from,
Index stride)
705 pscatter_size4<Packet4i>(to, from, stride);
708template<
typename Packet> EIGEN_DEVICE_FUNC
inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to,
const Packet& from,
Index stride)
710 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
711 pstore<__UNPACK_TYPE__(Packet)>(a, from);
723template<> EIGEN_DEVICE_FUNC
inline void pscatter<short int, Packet8s>(
short int* to,
const Packet8s& from,
Index stride)
725 pscatter_size8<Packet8s>(to, from, stride);
728template<> EIGEN_DEVICE_FUNC
inline void pscatter<unsigned short int, Packet8us>(
unsigned short int* to,
const Packet8us& from,
Index stride)
730 pscatter_size8<Packet8us>(to, from, stride);
733template<> EIGEN_DEVICE_FUNC
inline void pscatter<bfloat16, Packet8bf>(bfloat16* to,
const Packet8bf& from,
Index stride)
735 pscatter_size8<Packet8bf>(to, from, stride);
738template<
typename Packet> EIGEN_DEVICE_FUNC
inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to,
const Packet& from,
Index stride)
740 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
741 pstore<__UNPACK_TYPE__(Packet)>(a, from);
752 to[10*stride] = a[10];
753 to[11*stride] = a[11];
754 to[12*stride] = a[12];
755 to[13*stride] = a[13];
756 to[14*stride] = a[14];
757 to[15*stride] = a[15];
760template<> EIGEN_DEVICE_FUNC
inline void pscatter<signed char, Packet16c>(
signed char* to,
const Packet16c& from,
Index stride)
762 pscatter_size16<Packet16c>(to, from, stride);
765template<> EIGEN_DEVICE_FUNC
inline void pscatter<unsigned char, Packet16uc>(
unsigned char* to,
const Packet16uc& from,
Index stride)
767 pscatter_size16<Packet16uc>(to, from, stride);
770template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(
const float& a) {
return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
771template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(
const int& a) {
return pset1<Packet4i>(a) + p4i_COUNTDOWN; }
772template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(
const short int& a) {
return pset1<Packet8s>(a) + p8s_COUNTDOWN; }
773template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(
const unsigned short int& a) {
return pset1<Packet8us>(a) + p8us_COUNTDOWN; }
774template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(
const signed char& a) {
return pset1<Packet16c>(a) + p16c_COUNTDOWN; }
775template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(
const unsigned char& a) {
return pset1<Packet16uc>(a) + p16uc_COUNTDOWN; }
777template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f> (
const Packet4f& a,
const Packet4f& b) {
return a + b; }
778template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i> (
const Packet4i& a,
const Packet4i& b) {
return a + b; }
779template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui> (
const Packet4ui& a,
const Packet4ui& b) {
return a + b; }
780template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s> (
const Packet8s& a,
const Packet8s& b) {
return a + b; }
781template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us> (
const Packet8us& a,
const Packet8us& b) {
return a + b; }
782template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c> (
const Packet16c& a,
const Packet16c& b) {
return a + b; }
783template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
return a + b; }
785template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f> (
const Packet4f& a,
const Packet4f& b) {
return a - b; }
786template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i> (
const Packet4i& a,
const Packet4i& b) {
return a - b; }
787template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s> (
const Packet8s& a,
const Packet8s& b) {
return a - b; }
788template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us> (
const Packet8us& a,
const Packet8us& b) {
return a - b; }
789template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c> (
const Packet16c& a,
const Packet16c& b) {
return a - b; }
790template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
return a - b; }
792template<> EIGEN_STRONG_INLINE Packet4f pnegate(
const Packet4f& a)
794#ifdef __POWER8_VECTOR__
797 return vec_xor(a, p4f_MZERO);
800template<> EIGEN_STRONG_INLINE Packet4i pnegate(
const Packet4i& a)
802#ifdef __POWER8_VECTOR__
809template<> EIGEN_STRONG_INLINE Packet4f pconj(
const Packet4f& a) {
return a; }
810template<> EIGEN_STRONG_INLINE Packet4i pconj(
const Packet4i& a) {
return a; }
812template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f> (
const Packet4f& a,
const Packet4f& b) {
return vec_madd(a,b, p4f_MZERO); }
813template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i> (
const Packet4i& a,
const Packet4i& b) {
return a * b; }
814template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s> (
const Packet8s& a,
const Packet8s& b) {
return vec_mul(a,b); }
815template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us> (
const Packet8us& a,
const Packet8us& b) {
return vec_mul(a,b); }
816template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c> (
const Packet16c& a,
const Packet16c& b) {
return vec_mul(a,b); }
817template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
return vec_mul(a,b); }
// pdiv<Packet4f>: lane-wise quotient a / b.
// With EIGEN_VECTORIZE_VSX undefined the quotient is formed as a * y_1, where
// y_1 is obtained from a starting value y_0 by the two fused operations below.
// NOTE(review): this looks like one Newton-Raphson refinement of a reciprocal
// estimate of b (t = 1 - y_0*b; y_1 = y_0 + y_0*t), assuming vec_nmsub(x,y,z)
// computes z - x*y -- confirm.
820template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(
const Packet4f& a,
const Packet4f& b)
822#ifndef EIGEN_VECTORIZE_VSX
823 Packet4f t, y_0, y_1;
829 t = vec_nmsub(y_0, b, p4f_ONE);
830 y_1 = vec_madd(y_0, t, y_0);
// Final multiply a * y_1, written as a multiply-add with p4f_MZERO as addend.
832 return vec_madd(a, y_1, p4f_MZERO);
// Presumably the VSX alternative: a direct vector divide.
834 return vec_div(a, b);
// pdiv<Packet4i>: not implemented. Both arguments are ignored; the function
// trips eigen_assert(false) and, if execution continues past the assertion,
// returns a packet of zeros.
838template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(
const Packet4i& ,
const Packet4i& )
839{ eigen_assert(
false &&
"packet integer division are not supported by AltiVec");
840 return pset1<Packet4i>(0);
844template<> EIGEN_STRONG_INLINE Packet4f pmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
return vec_madd(a,b,c); }
845template<> EIGEN_STRONG_INLINE Packet4i pmadd(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
return a*b + c; }
846template<> EIGEN_STRONG_INLINE Packet8s pmadd(
const Packet8s& a,
const Packet8s& b,
const Packet8s& c) {
return vec_madd(a,b,c); }
847template<> EIGEN_STRONG_INLINE Packet8us pmadd(
const Packet8us& a,
const Packet8us& b,
const Packet8us& c) {
return vec_madd(a,b,c); }
849template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(
const Packet4f& a,
const Packet4f& b)
851 #ifdef EIGEN_VECTORIZE_VSX
854 __asm__ (
"xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" :
"=&wa" (ret) :
"wa" (a),
"wa" (b));
857 return vec_min(a, b);
860template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vec_min(a, b); }
861template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
return vec_min(a, b); }
862template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
return vec_min(a, b); }
863template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
return vec_min(a, b); }
864template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
return vec_min(a, b); }
867template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(
const Packet4f& a,
const Packet4f& b)
869 #ifdef EIGEN_VECTORIZE_VSX
872 __asm__ (
"xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" :
"=&wa" (ret) :
"wa" (a),
"wa" (b));
875 return vec_max(a, b);
878template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vec_max(a, b); }
879template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
return vec_max(a, b); }
880template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
return vec_max(a, b); }
881template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
return vec_max(a, b); }
882template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
return vec_max(a, b); }
884template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(
const Packet4f& a,
const Packet4f& b) {
return reinterpret_cast<Packet4f
>(vec_cmple(a,b)); }
886#if defined(__POWER8_VECTOR__) || EIGEN_COMP_LLVM
887template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(
const Packet4f& a,
const Packet4f& b) {
return reinterpret_cast<Packet4f
>(vec_cmplt(a,b)); }
889template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(
const Packet4f& a,
const Packet4f& b) {
return reinterpret_cast<Packet4f
>(vec_cmpeq(a,b)); }
890template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(
const Packet4f& a,
const Packet4f& b) {
891 Packet4f c =
reinterpret_cast<Packet4f
>(vec_cmpge(a,b));
896template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(
const Packet4i& a,
const Packet4i& b) {
return reinterpret_cast<Packet4i
>(vec_cmple(a,b)); }
898template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(
const Packet4i& a,
const Packet4i& b) {
return reinterpret_cast<Packet4i
>(vec_cmplt(a,b)); }
899template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(
const Packet4i& a,
const Packet4i& b) {
return reinterpret_cast<Packet4i
>(vec_cmpeq(a,b)); }
901template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(
const Packet8s& a,
const Packet8s& b) {
return reinterpret_cast<Packet8s
>(vec_cmple(a,b)); }
903template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(
const Packet8s& a,
const Packet8s& b) {
return reinterpret_cast<Packet8s
>(vec_cmplt(a,b)); }
904template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(
const Packet8s& a,
const Packet8s& b) {
return reinterpret_cast<Packet8s
>(vec_cmpeq(a,b)); }
906template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(
const Packet8us& a,
const Packet8us& b) {
return reinterpret_cast<Packet8us
>(vec_cmple(a,b)); }
908template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(
const Packet8us& a,
const Packet8us& b) {
return reinterpret_cast<Packet8us
>(vec_cmplt(a,b)); }
909template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(
const Packet8us& a,
const Packet8us& b) {
return reinterpret_cast<Packet8us
>(vec_cmpeq(a,b)); }
911template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(
const Packet16c& a,
const Packet16c& b) {
return reinterpret_cast<Packet16c
>(vec_cmple(a,b)); }
913template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(
const Packet16c& a,
const Packet16c& b) {
return reinterpret_cast<Packet16c
>(vec_cmplt(a,b)); }
914template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(
const Packet16c& a,
const Packet16c& b) {
return reinterpret_cast<Packet16c
>(vec_cmpeq(a,b)); }
916template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(
const Packet16uc& a,
const Packet16uc& b) {
return reinterpret_cast<Packet16uc
>(vec_cmple(a,b)); }
918template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(
const Packet16uc& a,
const Packet16uc& b) {
return reinterpret_cast<Packet16uc
>(vec_cmplt(a,b)); }
919template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(
const Packet16uc& a,
const Packet16uc& b) {
return reinterpret_cast<Packet16uc
>(vec_cmpeq(a,b)); }
921template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return vec_and(a, b); }
922template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vec_and(a, b); }
923template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
return vec_and(a, b); }
924template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
return vec_and(a, b); }
925template<> EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
926 return pand<Packet8us>(a, b);
930template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return vec_or(a, b); }
931template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vec_or(a, b); }
932template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
return vec_or(a, b); }
933template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
return vec_or(a, b); }
934template<> EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
935 return por<Packet8us>(a, b);
938template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return vec_xor(a, b); }
939template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vec_xor(a, b); }
940template<> EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
941 return pxor<Packet8us>(a, b);
944template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return vec_andc(a, b); }
945template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vec_andc(a, b); }
947template<> EIGEN_STRONG_INLINE Packet4f pselect(
const Packet4f& mask,
const Packet4f& a,
const Packet4f& b) {
948 return vec_sel(b, a,
reinterpret_cast<Packet4ui
>(mask));
951template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(
const Packet4f& a)
953 Packet4f t = vec_add(
reinterpret_cast<Packet4f
>(vec_or(vec_and(
reinterpret_cast<Packet4ui
>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
956#ifdef EIGEN_VECTORIZE_VSX
957 __asm__(
"xvrspiz %x0, %x1\n\t"
961 __asm__(
"vrfiz %0, %1\n\t"
968template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(
const Packet4f& a) {
return vec_ceil(a); }
969template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(
const Packet4f& a) {
return vec_floor(a); }
971template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(
const Packet4f& a)
975 __asm__(
"xvrspic %x0, %x1\n\t"
// Shared implementation behind the ploadu<> specializations: loads one packet
// from `from` without requiring the caller to align the pointer.
983template<
typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(
const __UNPACK_TYPE__(Packet)* from)
985 EIGEN_DEBUG_UNALIGNED_LOAD
986#ifdef EIGEN_VECTORIZE_VSX
// VSX build: vec_xl at byte offset 0 (const qualifier cast away for the
// intrinsic).
987 return vec_xl(0,
const_cast<__UNPACK_TYPE__(Packet)*
>(from));
// Presumably the non-VSX alternative: two vec_ld loads at byte offsets 0 and
// 15 from `from`, combined by vec_perm under the vec_lvsl mask.
// NOTE(review): this relies on vec_ld ignoring the low address bits, so MSQ
// and LSQ are the aligned quadwords covering the requested bytes -- confirm.
989 Packet16uc mask = vec_lvsl(0, from);
990 Packet16uc MSQ = vec_ld(0, (
unsigned char *)from);
991 Packet16uc LSQ = vec_ld(15, (
unsigned char *)from);
993 return (Packet) vec_perm(MSQ, LSQ, mask);
997template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(
const float* from)
999 return ploadu_common<Packet4f>(from);
1001template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(
const int* from)
1003 return ploadu_common<Packet4i>(from);
1005template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(
const short int* from)
1007 return ploadu_common<Packet8s>(from);
1009template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(
const unsigned short int* from)
1011 return ploadu_common<Packet8us>(from);
1013template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(
const bfloat16* from)
1015 return ploadu_common<Packet8us>(
reinterpret_cast<const unsigned short int*
>(from));
1017template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(
const signed char* from)
1019 return ploadu_common<Packet16c>(from);
1021template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(
const unsigned char* from)
1023 return ploadu_common<Packet16uc>(from);
1026template<
typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(
const __UNPACK_TYPE__(Packet)* from)
1029 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet>(from);
1030 else p = ploadu<Packet>(from);
1031 return vec_perm(p, p, p16uc_DUPLICATE32_HI);
1033template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(
const float* from)
1035 return ploaddup_common<Packet4f>(from);
1037template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(
const int* from)
1039 return ploaddup_common<Packet4i>(from);
1042template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(
const short int* from)
1045 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from);
1046 else p = ploadu<Packet8s>(from);
1047 return vec_perm(p, p, p16uc_DUPLICATE16_HI);
1050template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(
const unsigned short int* from)
1053 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from);
1054 else p = ploadu<Packet8us>(from);
1055 return vec_perm(p, p, p16uc_DUPLICATE16_HI);
1058template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(
const short int* from)
1061 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from);
1062 else p = ploadu<Packet8s>(from);
1063 return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1066template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(
const unsigned short int* from)
1069 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from);
1070 else p = ploadu<Packet8us>(from);
1071 return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1074template<> EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(
const bfloat16* from)
1076 return ploadquad<Packet8us>(
reinterpret_cast<const unsigned short int*
>(from));
1079template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(
const signed char* from)
1082 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16c>(from);
1083 else p = ploadu<Packet16c>(from);
1084 return vec_perm(p, p, p16uc_DUPLICATE8_HI);
1087template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(
const unsigned char* from)
1090 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16uc>(from);
1091 else p = ploadu<Packet16uc>(from);
1092 return vec_perm(p, p, p16uc_DUPLICATE8_HI);
1095template<
typename Packet> EIGEN_STRONG_INLINE
void pstoreu_common(__UNPACK_TYPE__(Packet)* to,
const Packet& from)
1097 EIGEN_DEBUG_UNALIGNED_STORE
1098#ifdef EIGEN_VECTORIZE_VSX
1099 vec_xst(from, 0, to);
1103 Packet16uc MSQ, LSQ, edges;
1104 Packet16uc edgeAlign, align;
1106 MSQ = vec_ld(0, (
unsigned char *)to);
1107 LSQ = vec_ld(15, (
unsigned char *)to);
1108 edgeAlign = vec_lvsl(0, to);
1109 edges=vec_perm(LSQ,MSQ,edgeAlign);
1110 align = vec_lvsr( 0, to );
1111 MSQ = vec_perm(edges,(Packet16uc)from,align);
1112 LSQ = vec_perm((Packet16uc)from,edges,align);
1113 vec_st( LSQ, 15, (
unsigned char *)to );
1114 vec_st( MSQ, 0, (
unsigned char *)to );
1117template<> EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet4f& from)
1119 pstoreu_common<Packet4f>(to, from);
1121template<> EIGEN_STRONG_INLINE
void pstoreu<int>(
int* to,
const Packet4i& from)
1123 pstoreu_common<Packet4i>(to, from);
1125template<> EIGEN_STRONG_INLINE
void pstoreu<short int>(
short int* to,
const Packet8s& from)
1127 pstoreu_common<Packet8s>(to, from);
1129template<> EIGEN_STRONG_INLINE
void pstoreu<unsigned short int>(
unsigned short int* to,
const Packet8us& from)
1131 pstoreu_common<Packet8us>(to, from);
1133template<> EIGEN_STRONG_INLINE
void pstoreu<bfloat16>(bfloat16* to,
const Packet8bf& from)
1135 pstoreu_common<Packet8us>(
reinterpret_cast<unsigned short int*
>(to), from);
1137template<> EIGEN_STRONG_INLINE
void pstoreu<signed char>(
signed char* to,
const Packet16c& from)
1139 pstoreu_common<Packet16c>(to, from);
1141template<> EIGEN_STRONG_INLINE
void pstoreu<unsigned char>(
unsigned char* to,
const Packet16uc& from)
1143 pstoreu_common<Packet16uc>(to, from);
1146template<> EIGEN_STRONG_INLINE
void prefetch<float>(
const float* addr) { EIGEN_PPC_PREFETCH(addr); }
1147template<> EIGEN_STRONG_INLINE
void prefetch<int>(
const int* addr) { EIGEN_PPC_PREFETCH(addr); }
1149template<> EIGEN_STRONG_INLINE
float pfirst<Packet4f>(
const Packet4f& a) { EIGEN_ALIGN16
float x; vec_ste(a, 0, &x);
return x; }
1150template<> EIGEN_STRONG_INLINE
int pfirst<Packet4i>(
const Packet4i& a) { EIGEN_ALIGN16
int x; vec_ste(a, 0, &x);
return x; }
1152template<
typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(
const Packet& a) {
1153 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
1158template<> EIGEN_STRONG_INLINE
short int pfirst<Packet8s>(
const Packet8s& a) {
1159 return pfirst_common<Packet8s>(a);
1162template<> EIGEN_STRONG_INLINE
unsigned short int pfirst<Packet8us>(
const Packet8us& a) {
1163 return pfirst_common<Packet8us>(a);
1166template<> EIGEN_STRONG_INLINE
signed char pfirst<Packet16c>(
const Packet16c& a)
1168 return pfirst_common<Packet16c>(a);
1171template<> EIGEN_STRONG_INLINE
unsigned char pfirst<Packet16uc>(
const Packet16uc& a)
1173 return pfirst_common<Packet16uc>(a);
1176template<> EIGEN_STRONG_INLINE Packet4f preverse(
const Packet4f& a)
1178 return reinterpret_cast<Packet4f
>(vec_perm(
reinterpret_cast<Packet16uc
>(a),
reinterpret_cast<Packet16uc
>(a), p16uc_REVERSE32));
1180template<> EIGEN_STRONG_INLINE Packet4i preverse(
const Packet4i& a)
1182 return reinterpret_cast<Packet4i
>(vec_perm(
reinterpret_cast<Packet16uc
>(a),
reinterpret_cast<Packet16uc
>(a), p16uc_REVERSE32));
1184template<> EIGEN_STRONG_INLINE Packet8s preverse(
const Packet8s& a)
1186 return reinterpret_cast<Packet8s
>(vec_perm(
reinterpret_cast<Packet16uc
>(a),
reinterpret_cast<Packet16uc
>(a), p16uc_REVERSE16));
1188template<> EIGEN_STRONG_INLINE Packet8us preverse(
const Packet8us& a)
1190 return reinterpret_cast<Packet8us
>(vec_perm(
reinterpret_cast<Packet16uc
>(a),
reinterpret_cast<Packet16uc
>(a), p16uc_REVERSE16));
1192template<> EIGEN_STRONG_INLINE Packet16c preverse(
const Packet16c& a)
1194 return vec_perm(a, a, p16uc_REVERSE8);
1196template<> EIGEN_STRONG_INLINE Packet16uc preverse(
const Packet16uc& a)
1198 return vec_perm(a, a, p16uc_REVERSE8);
1200template<> EIGEN_STRONG_INLINE Packet8bf preverse(
const Packet8bf& a)
1202 return preverse<Packet8us>(a);
1205template<> EIGEN_STRONG_INLINE Packet4f pabs(
const Packet4f& a) {
return vec_abs(a); }
1206template<> EIGEN_STRONG_INLINE Packet4i pabs(
const Packet4i& a) {
return vec_abs(a); }
1207template<> EIGEN_STRONG_INLINE Packet8s pabs(
const Packet8s& a) {
return vec_abs(a); }
1208template<> EIGEN_STRONG_INLINE Packet8us pabs(
const Packet8us& a) {
return a; }
1209template<> EIGEN_STRONG_INLINE Packet16c pabs(
const Packet16c& a) {
return vec_abs(a); }
1210template<> EIGEN_STRONG_INLINE Packet16uc pabs(
const Packet16uc& a) {
return a; }
1211template<> EIGEN_STRONG_INLINE Packet8bf pabs(
const Packet8bf& a) {
1212 _EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF);
1213 return pand<Packet8us>(p8us_abs_mask, a);
1216template<
int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(
const Packet4i& a)
1217{
return vec_sra(a,
reinterpret_cast<Packet4ui
>(pset1<Packet4i>(N))); }
1218template<
int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(
const Packet4i& a)
1219{
return vec_sr(a,
reinterpret_cast<Packet4ui
>(pset1<Packet4i>(N))); }
1220template<
int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(
const Packet4i& a)
1221{
return vec_sl(a,
reinterpret_cast<Packet4ui
>(pset1<Packet4i>(N))); }
1222template<
int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_left(
const Packet4f& a)
1224 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1225 Packet4ui r = vec_sl(
reinterpret_cast<Packet4ui
>(a), p4ui_mask);
1226 return reinterpret_cast<Packet4f
>(r);
1229template<
int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_right(
const Packet4f& a)
1231 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1232 Packet4ui r = vec_sr(
reinterpret_cast<Packet4ui
>(a), p4ui_mask);
1233 return reinterpret_cast<Packet4f
>(r);
1236template<
int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(
const Packet4ui& a)
1238 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1239 return vec_sr(a, p4ui_mask);
1242template<
int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(
const Packet4ui& a)
1244 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1245 return vec_sl(a, p4ui_mask);
1248template<
int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(
const Packet8us& a)
1250 const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1251 return vec_sl(a, p8us_mask);
1253template<
int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(
const Packet8us& a)
1255 const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1256 return vec_sr(a, p8us_mask);
1259EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(
const Packet8bf& bf){
1260 return plogical_shift_left<16>(
reinterpret_cast<Packet4f
>(bf.m_val));
1263EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(
const Packet8bf& bf){
1264 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1265 return pand<Packet4f>(
1266 reinterpret_cast<Packet4f
>(bf.m_val),
1267 reinterpret_cast<Packet4f
>(p4ui_high_mask)
1273EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
1274 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1275 Packet4f bf_odd, bf_even;
1276 bf_odd = pand(
reinterpret_cast<Packet4f
>(p4ui_high_mask), odd);
1277 bf_even = plogical_shift_right<16>(even);
1278 return reinterpret_cast<Packet8us
>(por<Packet4f>(bf_even, bf_odd));
1281EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){
1282 Packet4ui input =
reinterpret_cast<Packet4ui
>(p4f);
1283 Packet4ui lsb = plogical_shift_right<16>(input);
1284 lsb = pand<Packet4ui>(lsb,
reinterpret_cast<Packet4ui
>(p4i_ONE));
1286 _EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu);
1287 Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
1288 input = padd<Packet4ui>(input, rounding_bias);
1291 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
1292 Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask,
reinterpret_cast<Packet4ui
>(p4f));
1294 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
1295 Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask,
reinterpret_cast<Packet4ui
>(p4f));
1297 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(max_exp, 0x7F800000);
1298 Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp);
1299 Packet4bi is_zero_exp = vec_cmpeq(exp,
reinterpret_cast<Packet4ui
>(p4i_ZERO));
1301 Packet4bi is_mant_zero = vec_cmpeq(mantissa,
reinterpret_cast<Packet4ui
>(p4i_ZERO));
1302 Packet4ui nan_selector = pandnot<Packet4ui>(
1303 reinterpret_cast<Packet4ui
>(is_max_exp),
1304 reinterpret_cast<Packet4ui
>(is_mant_zero)
1307 Packet4ui subnormal_selector = pandnot<Packet4ui>(
1308 reinterpret_cast<Packet4ui
>(is_zero_exp),
1309 reinterpret_cast<Packet4ui
>(is_mant_zero)
1312 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
1313 input = vec_sel(input, p4ui_nan, nan_selector);
1314 input = vec_sel(input,
reinterpret_cast<Packet4ui
>(p4f), subnormal_selector);
1317 input = plogical_shift_right<16>(input);
1318 return reinterpret_cast<Packet8us
>(input);
1321EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd){
1322 Packet4f bf_odd, bf_even;
1323 bf_odd =
reinterpret_cast<Packet4f
>(F32ToBf16(odd).m_val);
1324 bf_odd = plogical_shift_left<16>(bf_odd);
1325 bf_even =
reinterpret_cast<Packet4f
>(F32ToBf16(even).m_val);
1326 return reinterpret_cast<Packet8us
>(por<Packet4f>(bf_even, bf_odd));
// Function-body macro: widens A with Bf16ToF32Even/Bf16ToF32Odd, applies OP to
// each float packet, and returns the two results packed by F32ToBf16.
// Declares a_even, a_odd, op_even and op_odd in the expanding scope.
#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
  Packet4f a_even = Bf16ToF32Even(A); \
  Packet4f a_odd = Bf16ToF32Odd(A); \
  Packet4f op_even = OP(a_even); \
  Packet4f op_odd = OP(a_odd); \
  return F32ToBf16(op_even, op_odd);
// Function-body macro: widens A and B with Bf16ToF32Even/Bf16ToF32Odd, applies
// OP to the matching float packets, and returns the two results packed by
// F32ToBf16. Declares a_even, a_odd, b_even, b_odd, op_even and op_odd in the
// expanding scope.
#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
  Packet4f a_even = Bf16ToF32Even(A); \
  Packet4f a_odd = Bf16ToF32Odd(A); \
  Packet4f b_even = Bf16ToF32Even(B); \
  Packet4f b_odd = Bf16ToF32Odd(B); \
  Packet4f op_even = OP(a_even, b_even); \
  Packet4f op_odd = OP(a_odd, b_odd); \
  return F32ToBf16(op_even, op_odd);
// Same as BF16_TO_F32_BINARY_OP_WRAPPER, except that the two results are
// packed with F32ToBf16Bool instead of F32ToBf16.
#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
  Packet4f a_even = Bf16ToF32Even(A); \
  Packet4f a_odd = Bf16ToF32Odd(A); \
  Packet4f b_even = Bf16ToF32Even(B); \
  Packet4f b_odd = Bf16ToF32Odd(B); \
  Packet4f op_even = OP(a_even, b_even); \
  Packet4f op_odd = OP(a_odd, b_odd); \
  return F32ToBf16Bool(op_even, op_odd);
1353template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
1354 BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
1357template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
1358 BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
1361template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
1362 BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
1365template<> EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(
const Packet8bf& a) {
1366 BF16_TO_F32_UNARY_OP_WRAPPER(pnegate<Packet4f>, a);
1369template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
1370 BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
1373template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(
const Packet4f& a,
const Packet4f& exponent) {
1374 return pldexp_generic(a,exponent);
1376template<> EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf> (
const Packet8bf& a,
const Packet8bf& exponent){
1377 BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
1380template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(
const Packet4f& a, Packet4f& exponent) {
1381 return pfrexp_generic(a,exponent);
1383template<> EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf> (
const Packet8bf& a, Packet8bf& e){
1384 Packet4f a_even = Bf16ToF32Even(a);
1385 Packet4f a_odd = Bf16ToF32Odd(a);
1388 Packet4f op_even = pfrexp<Packet4f>(a_even, e_even);
1389 Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd);
1390 e = F32ToBf16(e_even, e_odd);
1391 return F32ToBf16(op_even, op_odd);
1394template<> EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf> (
const Packet8bf& a){
1395 BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
1397template<> EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf> (
const Packet8bf& a){
1398 BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
1400template<> EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf> (
const Packet8bf& a){
1401 BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
1403template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf> (
const Packet8bf& a){
1404 BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
1406template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf> (
const Packet8bf& a){
1407 BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
1409template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (
const Packet8bf& a){
1410 BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
1413template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (
const Packet8bf& a){
1414 BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
1417template<> EIGEN_STRONG_INLINE Packet8bf pmadd(
const Packet8bf& a,
const Packet8bf& b,
const Packet8bf& c) {
1418 Packet4f a_even = Bf16ToF32Even(a);
1419 Packet4f a_odd = Bf16ToF32Odd(a);
1420 Packet4f b_even = Bf16ToF32Even(b);
1421 Packet4f b_odd = Bf16ToF32Odd(b);
1422 Packet4f c_even = Bf16ToF32Even(c);
1423 Packet4f c_odd = Bf16ToF32Odd(c);
1424 Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even);
1425 Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd);
1426 return F32ToBf16(pmadd_even, pmadd_odd);
1429template<> EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
1430 BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
1433template<> EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
1434 BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
1437template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(
const Packet8bf& a,
const Packet8bf& b) {
1438 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
1440template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(
const Packet8bf& a,
const Packet8bf& b) {
1441 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
1443template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(
const Packet8bf& a,
const Packet8bf& b) {
1444 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
1446template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(
const Packet8bf& a,
const Packet8bf& b) {
1447 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
1450template<> EIGEN_STRONG_INLINE bfloat16 pfirst(
const Packet8bf& a) {
1451 return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
1454template<> EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(
const bfloat16* from)
1456 return ploaddup<Packet8us>(
reinterpret_cast<const unsigned short int*
>(from));
1459template<> EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(
const bfloat16& a) {
1460 bfloat16 countdown[8] = { bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
1461 bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7) };
1462 return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
1465template<> EIGEN_STRONG_INLINE
float predux<Packet4f>(
const Packet4f& a)
1468 b = vec_sld(a, a, 8);
1470 b = vec_sld(sum, sum, 4);
1475template<> EIGEN_STRONG_INLINE
int predux<Packet4i>(
const Packet4i& a)
1478 sum = vec_sums(a, p4i_ZERO);
1480 sum = vec_sld(sum, p4i_ZERO, 12);
1482 sum = vec_sld(p4i_ZERO, sum, 4);
1487template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(
const Packet8bf& a)
1489 float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
1490 float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
1491 float f32_result = redux_even + redux_odd;
1492 return bfloat16(f32_result);
1494template<
typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(
const Packet& a)
1498 __UNPACK_TYPE__(Packet) n[8];
1502 EIGEN_ALIGN16
int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
1503 EIGEN_ALIGN16
int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
1504 Packet4i first_half = pload<Packet4i>(first_loader);
1505 Packet4i second_half = pload<Packet4i>(second_loader);
1507 return static_cast<__UNPACK_TYPE__(Packet)
>(predux(first_half) + predux(second_half));
1510template<> EIGEN_STRONG_INLINE
short int predux<Packet8s>(
const Packet8s& a)
1512 return predux_size8<Packet8s>(a);
1515template<> EIGEN_STRONG_INLINE
unsigned short int predux<Packet8us>(
const Packet8us& a)
1517 return predux_size8<Packet8us>(a);
1520template<
typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(
const Packet& a)
1524 __UNPACK_TYPE__(Packet) n[16];
1528 EIGEN_ALIGN16
int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
1529 EIGEN_ALIGN16
int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
1530 EIGEN_ALIGN16
int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] };
1531 EIGEN_ALIGN16
int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] };
1533 Packet4i first_quarter = pload<Packet4i>(first_loader);
1534 Packet4i second_quarter = pload<Packet4i>(second_loader);
1535 Packet4i third_quarter = pload<Packet4i>(third_loader);
1536 Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
1538 return static_cast<__UNPACK_TYPE__(Packet)
>(predux(first_quarter) + predux(second_quarter)
1539 + predux(third_quarter) + predux(fourth_quarter));
1542template<> EIGEN_STRONG_INLINE
signed char predux<Packet16c>(
const Packet16c& a)
1544 return predux_size16<Packet16c>(a);
1547template<> EIGEN_STRONG_INLINE
unsigned char predux<Packet16uc>(
const Packet16uc& a)
1549 return predux_size16<Packet16uc>(a);
1554template<> EIGEN_STRONG_INLINE
float predux_mul<Packet4f>(
const Packet4f& a)
1557 prod = pmul(a, vec_sld(a, a, 8));
1558 return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
1561template<> EIGEN_STRONG_INLINE
int predux_mul<Packet4i>(
const Packet4i& a)
1563 EIGEN_ALIGN16
int aux[4];
1565 return aux[0] * aux[1] * aux[2] * aux[3];
1568template<> EIGEN_STRONG_INLINE
short int predux_mul<Packet8s>(
const Packet8s& a)
1570 Packet8s pair, quad, octo;
1572 pair = vec_mul(a, vec_sld(a, a, 8));
1573 quad = vec_mul(pair, vec_sld(pair, pair, 4));
1574 octo = vec_mul(quad, vec_sld(quad, quad, 2));
1576 return pfirst(octo);
1579template<> EIGEN_STRONG_INLINE
unsigned short int predux_mul<Packet8us>(
const Packet8us& a)
1581 Packet8us pair, quad, octo;
1583 pair = vec_mul(a, vec_sld(a, a, 8));
1584 quad = vec_mul(pair, vec_sld(pair, pair, 4));
1585 octo = vec_mul(quad, vec_sld(quad, quad, 2));
1587 return pfirst(octo);
1590template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(
const Packet8bf& a)
1592 float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
1593 float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
1594 float f32_result = redux_even * redux_odd;
1595 return bfloat16(f32_result);
1599template<> EIGEN_STRONG_INLINE
signed char predux_mul<Packet16c>(
const Packet16c& a)
1601 Packet16c pair, quad, octo, result;
1603 pair = vec_mul(a, vec_sld(a, a, 8));
1604 quad = vec_mul(pair, vec_sld(pair, pair, 4));
1605 octo = vec_mul(quad, vec_sld(quad, quad, 2));
1606 result = vec_mul(octo, vec_sld(octo, octo, 1));
1608 return pfirst(result);
1611template<> EIGEN_STRONG_INLINE
unsigned char predux_mul<Packet16uc>(
const Packet16uc& a)
1613 Packet16uc pair, quad, octo, result;
1615 pair = vec_mul(a, vec_sld(a, a, 8));
1616 quad = vec_mul(pair, vec_sld(pair, pair, 4));
1617 octo = vec_mul(quad, vec_sld(quad, quad, 2));
1618 result = vec_mul(octo, vec_sld(octo, octo, 1));
1620 return pfirst(result);
1624template<
typename Packet> EIGEN_STRONG_INLINE
1625__UNPACK_TYPE__(Packet) predux_min4(
const Packet& a)
1628 b = vec_min(a, vec_sld(a, a, 8));
1629 res = vec_min(b, vec_sld(b, b, 4));
1634template<> EIGEN_STRONG_INLINE
float predux_min<Packet4f>(
const Packet4f& a)
1636 return predux_min4<Packet4f>(a);
1639template<> EIGEN_STRONG_INLINE
int predux_min<Packet4i>(
const Packet4i& a)
1641 return predux_min4<Packet4i>(a);
1644template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(
const Packet8bf& a)
1646 float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
1647 float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
1648 float f32_result = (std::min)(redux_even, redux_odd);
1649 return bfloat16(f32_result);
1652template<> EIGEN_STRONG_INLINE
short int predux_min<Packet8s>(
const Packet8s& a)
1654 Packet8s pair, quad, octo;
1657 pair = vec_min(a, vec_sld(a, a, 8));
1660 quad = vec_min(pair, vec_sld(pair, pair, 4));
1663 octo = vec_min(quad, vec_sld(quad, quad, 2));
1664 return pfirst(octo);
1667template<> EIGEN_STRONG_INLINE
unsigned short int predux_min<Packet8us>(
const Packet8us& a)
1669 Packet8us pair, quad, octo;
1672 pair = vec_min(a, vec_sld(a, a, 8));
1675 quad = vec_min(pair, vec_sld(pair, pair, 4));
1678 octo = vec_min(quad, vec_sld(quad, quad, 2));
1679 return pfirst(octo);
1682template<> EIGEN_STRONG_INLINE
signed char predux_min<Packet16c>(
const Packet16c& a)
1684 Packet16c pair, quad, octo, result;
1686 pair = vec_min(a, vec_sld(a, a, 8));
1687 quad = vec_min(pair, vec_sld(pair, pair, 4));
1688 octo = vec_min(quad, vec_sld(quad, quad, 2));
1689 result = vec_min(octo, vec_sld(octo, octo, 1));
1691 return pfirst(result);
1694template<> EIGEN_STRONG_INLINE
unsigned char predux_min<Packet16uc>(
const Packet16uc& a)
1696 Packet16uc pair, quad, octo, result;
1698 pair = vec_min(a, vec_sld(a, a, 8));
1699 quad = vec_min(pair, vec_sld(pair, pair, 4));
1700 octo = vec_min(quad, vec_sld(quad, quad, 2));
1701 result = vec_min(octo, vec_sld(octo, octo, 1));
1703 return pfirst(result);
1706template<
typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(
const Packet& a)
1709 b = vec_max(a, vec_sld(a, a, 8));
1710 res = vec_max(b, vec_sld(b, b, 4));
1714template<> EIGEN_STRONG_INLINE
float predux_max<Packet4f>(
const Packet4f& a)
1716 return predux_max4<Packet4f>(a);
1719template<> EIGEN_STRONG_INLINE
int predux_max<Packet4i>(
const Packet4i& a)
1721 return predux_max4<Packet4i>(a);
1724template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(
const Packet8bf& a)
1726 float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
1727 float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
1728 float f32_result = (std::max)(redux_even, redux_odd);
1729 return bfloat16(f32_result);
1732template<> EIGEN_STRONG_INLINE
short int predux_max<Packet8s>(
const Packet8s& a)
1734 Packet8s pair, quad, octo;
1737 pair = vec_max(a, vec_sld(a, a, 8));
1740 quad = vec_max(pair, vec_sld(pair, pair, 4));
1743 octo = vec_max(quad, vec_sld(quad, quad, 2));
1744 return pfirst(octo);
1747template<> EIGEN_STRONG_INLINE
unsigned short int predux_max<Packet8us>(
const Packet8us& a)
1749 Packet8us pair, quad, octo;
1752 pair = vec_max(a, vec_sld(a, a, 8));
1755 quad = vec_max(pair, vec_sld(pair, pair, 4));
1758 octo = vec_max(quad, vec_sld(quad, quad, 2));
1759 return pfirst(octo);
1762template<> EIGEN_STRONG_INLINE
signed char predux_max<Packet16c>(
const Packet16c& a)
1764 Packet16c pair, quad, octo, result;
1766 pair = vec_max(a, vec_sld(a, a, 8));
1767 quad = vec_max(pair, vec_sld(pair, pair, 4));
1768 octo = vec_max(quad, vec_sld(quad, quad, 2));
1769 result = vec_max(octo, vec_sld(octo, octo, 1));
1771 return pfirst(result);
1774template<> EIGEN_STRONG_INLINE
unsigned char predux_max<Packet16uc>(
const Packet16uc& a)
1776 Packet16uc pair, quad, octo, result;
1778 pair = vec_max(a, vec_sld(a, a, 8));
1779 quad = vec_max(pair, vec_sld(pair, pair, 4));
1780 octo = vec_max(quad, vec_sld(quad, quad, 2));
1781 result = vec_max(octo, vec_sld(octo, octo, 1));
1783 return pfirst(result);
1786template<> EIGEN_STRONG_INLINE
bool predux_any(
const Packet4f& x)
1788 return vec_any_ne(x, pzero(x));
1791template <
typename T> EIGEN_DEVICE_FUNC
inline void
1792ptranpose_common(PacketBlock<T,4>& kernel){
1794 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1795 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1796 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1797 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1798 kernel.packet[0] = vec_mergeh(t0, t2);
1799 kernel.packet[1] = vec_mergel(t0, t2);
1800 kernel.packet[2] = vec_mergeh(t1, t3);
1801 kernel.packet[3] = vec_mergel(t1, t3);
1804EIGEN_DEVICE_FUNC
inline void
1805ptranspose(PacketBlock<Packet4f,4>& kernel) {
1806 ptranpose_common<Packet4f>(kernel);
1809EIGEN_DEVICE_FUNC
inline void
1810ptranspose(PacketBlock<Packet4i,4>& kernel) {
1811 ptranpose_common<Packet4i>(kernel);
1814EIGEN_DEVICE_FUNC
inline void
1815ptranspose(PacketBlock<Packet8s,4>& kernel) {
1816 Packet8s t0, t1, t2, t3;
1817 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1818 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1819 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1820 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1821 kernel.packet[0] = vec_mergeh(t0, t2);
1822 kernel.packet[1] = vec_mergel(t0, t2);
1823 kernel.packet[2] = vec_mergeh(t1, t3);
1824 kernel.packet[3] = vec_mergel(t1, t3);
1827EIGEN_DEVICE_FUNC
inline void
1828ptranspose(PacketBlock<Packet8us,4>& kernel) {
1829 Packet8us t0, t1, t2, t3;
1830 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1831 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1832 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1833 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1834 kernel.packet[0] = vec_mergeh(t0, t2);
1835 kernel.packet[1] = vec_mergel(t0, t2);
1836 kernel.packet[2] = vec_mergeh(t1, t3);
1837 kernel.packet[3] = vec_mergel(t1, t3);
1841EIGEN_DEVICE_FUNC
inline void
1842ptranspose(PacketBlock<Packet8bf,4>& kernel) {
1843 Packet8us t0, t1, t2, t3;
1845 t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
1846 t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val);
1847 t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val);
1848 t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val);
1849 kernel.packet[0] = vec_mergeh(t0, t2);
1850 kernel.packet[1] = vec_mergel(t0, t2);
1851 kernel.packet[2] = vec_mergeh(t1, t3);
1852 kernel.packet[3] = vec_mergel(t1, t3);
1855EIGEN_DEVICE_FUNC
inline void
1856ptranspose(PacketBlock<Packet16c,4>& kernel) {
1857 Packet16c t0, t1, t2, t3;
1858 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1859 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1860 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1861 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1862 kernel.packet[0] = vec_mergeh(t0, t2);
1863 kernel.packet[1] = vec_mergel(t0, t2);
1864 kernel.packet[2] = vec_mergeh(t1, t3);
1865 kernel.packet[3] = vec_mergel(t1, t3);
1869EIGEN_DEVICE_FUNC
inline void
1870ptranspose(PacketBlock<Packet16uc,4>& kernel) {
1871 Packet16uc t0, t1, t2, t3;
1872 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1873 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1874 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1875 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1876 kernel.packet[0] = vec_mergeh(t0, t2);
1877 kernel.packet[1] = vec_mergel(t0, t2);
1878 kernel.packet[2] = vec_mergeh(t1, t3);
1879 kernel.packet[3] = vec_mergel(t1, t3);
1882EIGEN_DEVICE_FUNC
inline void
1883ptranspose(PacketBlock<Packet8s,8>& kernel) {
1884 Packet8s v[8], sum[8];
1886 v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
1887 v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
1888 v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
1889 v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
1890 v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
1891 v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
1892 v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
1893 v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
1894 sum[0] = vec_mergeh(v[0], v[4]);
1895 sum[1] = vec_mergel(v[0], v[4]);
1896 sum[2] = vec_mergeh(v[1], v[5]);
1897 sum[3] = vec_mergel(v[1], v[5]);
1898 sum[4] = vec_mergeh(v[2], v[6]);
1899 sum[5] = vec_mergel(v[2], v[6]);
1900 sum[6] = vec_mergeh(v[3], v[7]);
1901 sum[7] = vec_mergel(v[3], v[7]);
1903 kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
1904 kernel.packet[1] = vec_mergel(sum[0], sum[4]);
1905 kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
1906 kernel.packet[3] = vec_mergel(sum[1], sum[5]);
1907 kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
1908 kernel.packet[5] = vec_mergel(sum[2], sum[6]);
1909 kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
1910 kernel.packet[7] = vec_mergel(sum[3], sum[7]);
1913EIGEN_DEVICE_FUNC
inline void
1914ptranspose(PacketBlock<Packet8us,8>& kernel) {
1915 Packet8us v[8], sum[8];
1917 v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
1918 v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
1919 v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
1920 v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
1921 v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
1922 v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
1923 v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
1924 v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
1925 sum[0] = vec_mergeh(v[0], v[4]);
1926 sum[1] = vec_mergel(v[0], v[4]);
1927 sum[2] = vec_mergeh(v[1], v[5]);
1928 sum[3] = vec_mergel(v[1], v[5]);
1929 sum[4] = vec_mergeh(v[2], v[6]);
1930 sum[5] = vec_mergel(v[2], v[6]);
1931 sum[6] = vec_mergeh(v[3], v[7]);
1932 sum[7] = vec_mergel(v[3], v[7]);
1934 kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
1935 kernel.packet[1] = vec_mergel(sum[0], sum[4]);
1936 kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
1937 kernel.packet[3] = vec_mergel(sum[1], sum[5]);
1938 kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
1939 kernel.packet[5] = vec_mergel(sum[2], sum[6]);
1940 kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
1941 kernel.packet[7] = vec_mergel(sum[3], sum[7]);
1944EIGEN_DEVICE_FUNC
inline void
1945ptranspose(PacketBlock<Packet8bf,8>& kernel) {
1946 Packet8bf v[8], sum[8];
1948 v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
1949 v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val);
1950 v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val);
1951 v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val);
1952 v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val);
1953 v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val);
1954 v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val);
1955 v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val);
1956 sum[0] = vec_mergeh(v[0].m_val, v[4].m_val);
1957 sum[1] = vec_mergel(v[0].m_val, v[4].m_val);
1958 sum[2] = vec_mergeh(v[1].m_val, v[5].m_val);
1959 sum[3] = vec_mergel(v[1].m_val, v[5].m_val);
1960 sum[4] = vec_mergeh(v[2].m_val, v[6].m_val);
1961 sum[5] = vec_mergel(v[2].m_val, v[6].m_val);
1962 sum[6] = vec_mergeh(v[3].m_val, v[7].m_val);
1963 sum[7] = vec_mergel(v[3].m_val, v[7].m_val);
1965 kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val);
1966 kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val);
1967 kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val);
1968 kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val);
1969 kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val);
1970 kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val);
1971 kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val);
1972 kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
1975EIGEN_DEVICE_FUNC
inline void
1976ptranspose(PacketBlock<Packet16c,16>& kernel) {
1977 Packet16c step1[16], step2[16], step3[16];
1979 step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
1980 step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
1981 step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
1982 step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
1983 step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
1984 step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
1985 step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
1986 step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
1987 step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
1988 step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
1989 step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
1990 step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
1991 step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
1992 step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
1993 step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
1994 step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
1996 step2[0] = vec_mergeh(step1[0], step1[8]);
1997 step2[1] = vec_mergel(step1[0], step1[8]);
1998 step2[2] = vec_mergeh(step1[1], step1[9]);
1999 step2[3] = vec_mergel(step1[1], step1[9]);
2000 step2[4] = vec_mergeh(step1[2], step1[10]);
2001 step2[5] = vec_mergel(step1[2], step1[10]);
2002 step2[6] = vec_mergeh(step1[3], step1[11]);
2003 step2[7] = vec_mergel(step1[3], step1[11]);
2004 step2[8] = vec_mergeh(step1[4], step1[12]);
2005 step2[9] = vec_mergel(step1[4], step1[12]);
2006 step2[10] = vec_mergeh(step1[5], step1[13]);
2007 step2[11] = vec_mergel(step1[5], step1[13]);
2008 step2[12] = vec_mergeh(step1[6], step1[14]);
2009 step2[13] = vec_mergel(step1[6], step1[14]);
2010 step2[14] = vec_mergeh(step1[7], step1[15]);
2011 step2[15] = vec_mergel(step1[7], step1[15]);
2013 step3[0] = vec_mergeh(step2[0], step2[8]);
2014 step3[1] = vec_mergel(step2[0], step2[8]);
2015 step3[2] = vec_mergeh(step2[1], step2[9]);
2016 step3[3] = vec_mergel(step2[1], step2[9]);
2017 step3[4] = vec_mergeh(step2[2], step2[10]);
2018 step3[5] = vec_mergel(step2[2], step2[10]);
2019 step3[6] = vec_mergeh(step2[3], step2[11]);
2020 step3[7] = vec_mergel(step2[3], step2[11]);
2021 step3[8] = vec_mergeh(step2[4], step2[12]);
2022 step3[9] = vec_mergel(step2[4], step2[12]);
2023 step3[10] = vec_mergeh(step2[5], step2[13]);
2024 step3[11] = vec_mergel(step2[5], step2[13]);
2025 step3[12] = vec_mergeh(step2[6], step2[14]);
2026 step3[13] = vec_mergel(step2[6], step2[14]);
2027 step3[14] = vec_mergeh(step2[7], step2[15]);
2028 step3[15] = vec_mergel(step2[7], step2[15]);
2030 kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
2031 kernel.packet[1] = vec_mergel(step3[0], step3[8]);
2032 kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
2033 kernel.packet[3] = vec_mergel(step3[1], step3[9]);
2034 kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
2035 kernel.packet[5] = vec_mergel(step3[2], step3[10]);
2036 kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
2037 kernel.packet[7] = vec_mergel(step3[3], step3[11]);
2038 kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
2039 kernel.packet[9] = vec_mergel(step3[4], step3[12]);
2040 kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
2041 kernel.packet[11] = vec_mergel(step3[5], step3[13]);
2042 kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
2043 kernel.packet[13] = vec_mergel(step3[6], step3[14]);
2044 kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
2045 kernel.packet[15] = vec_mergel(step3[7], step3[15]);
2048EIGEN_DEVICE_FUNC
inline void
2049ptranspose(PacketBlock<Packet16uc,16>& kernel) {
2050 Packet16uc step1[16], step2[16], step3[16];
2052 step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
2053 step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
2054 step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
2055 step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
2056 step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
2057 step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
2058 step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
2059 step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
2060 step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
2061 step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
2062 step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
2063 step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
2064 step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
2065 step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
2066 step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
2067 step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
2069 step2[0] = vec_mergeh(step1[0], step1[8]);
2070 step2[1] = vec_mergel(step1[0], step1[8]);
2071 step2[2] = vec_mergeh(step1[1], step1[9]);
2072 step2[3] = vec_mergel(step1[1], step1[9]);
2073 step2[4] = vec_mergeh(step1[2], step1[10]);
2074 step2[5] = vec_mergel(step1[2], step1[10]);
2075 step2[6] = vec_mergeh(step1[3], step1[11]);
2076 step2[7] = vec_mergel(step1[3], step1[11]);
2077 step2[8] = vec_mergeh(step1[4], step1[12]);
2078 step2[9] = vec_mergel(step1[4], step1[12]);
2079 step2[10] = vec_mergeh(step1[5], step1[13]);
2080 step2[11] = vec_mergel(step1[5], step1[13]);
2081 step2[12] = vec_mergeh(step1[6], step1[14]);
2082 step2[13] = vec_mergel(step1[6], step1[14]);
2083 step2[14] = vec_mergeh(step1[7], step1[15]);
2084 step2[15] = vec_mergel(step1[7], step1[15]);
2086 step3[0] = vec_mergeh(step2[0], step2[8]);
2087 step3[1] = vec_mergel(step2[0], step2[8]);
2088 step3[2] = vec_mergeh(step2[1], step2[9]);
2089 step3[3] = vec_mergel(step2[1], step2[9]);
2090 step3[4] = vec_mergeh(step2[2], step2[10]);
2091 step3[5] = vec_mergel(step2[2], step2[10]);
2092 step3[6] = vec_mergeh(step2[3], step2[11]);
2093 step3[7] = vec_mergel(step2[3], step2[11]);
2094 step3[8] = vec_mergeh(step2[4], step2[12]);
2095 step3[9] = vec_mergel(step2[4], step2[12]);
2096 step3[10] = vec_mergeh(step2[5], step2[13]);
2097 step3[11] = vec_mergel(step2[5], step2[13]);
2098 step3[12] = vec_mergeh(step2[6], step2[14]);
2099 step3[13] = vec_mergel(step2[6], step2[14]);
2100 step3[14] = vec_mergeh(step2[7], step2[15]);
2101 step3[15] = vec_mergel(step2[7], step2[15]);
2103 kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
2104 kernel.packet[1] = vec_mergel(step3[0], step3[8]);
2105 kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
2106 kernel.packet[3] = vec_mergel(step3[1], step3[9]);
2107 kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
2108 kernel.packet[5] = vec_mergel(step3[2], step3[10]);
2109 kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
2110 kernel.packet[7] = vec_mergel(step3[3], step3[11]);
2111 kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
2112 kernel.packet[9] = vec_mergel(step3[4], step3[12]);
2113 kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
2114 kernel.packet[11] = vec_mergel(step3[5], step3[13]);
2115 kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
2116 kernel.packet[13] = vec_mergel(step3[6], step3[14]);
2117 kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
2118 kernel.packet[15] = vec_mergel(step3[7], step3[15]);
// Shared pblend implementation for packets with four 32-bit lanes; the Packet4i and
// Packet4f pblend specializations in this file forward to it.
// A lane mask is built by comparing `select` against p4i_ONE; vec_sel then takes
// thenPacket in the lanes where the mask is set and elsePacket elsewhere.
// NOTE(review): the declaration of `select` is not visible in this excerpt; presumably it
// is filled from ifPacket.select[0..3] -- confirm against the full file.
2121template<
typename Packet> EIGEN_STRONG_INLINE
2122Packet pblend4(
const Selector<4>& ifPacket,
const Packet& thenPacket,
const Packet& elsePacket) {
2124 Packet4ui mask =
reinterpret_cast<Packet4ui
>(vec_cmpeq(
reinterpret_cast<Packet4ui
>(select),
reinterpret_cast<Packet4ui
>(p4i_ONE)));
2125 return vec_sel(elsePacket, thenPacket, mask);
2128template<> EIGEN_STRONG_INLINE Packet4i pblend(
const Selector<4>& ifPacket,
const Packet4i& thenPacket,
const Packet4i& elsePacket) {
2129 return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
2132template<> EIGEN_STRONG_INLINE Packet4f pblend(
const Selector<4>& ifPacket,
const Packet4f& thenPacket,
const Packet4f& elsePacket) {
2133 return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
// Lane-wise blend of two Packet8s values (eight 16-bit lanes).
// The mask is set in every lane where `select` equals p8us_ONE; vec_sel takes thenPacket
// in those lanes and elsePacket in the others, and the outcome is stored in `result`.
// NOTE(review): the declaration of `select` and the return statement are not visible in
// this excerpt -- confirm against the full file.
2136template<> EIGEN_STRONG_INLINE Packet8s pblend(
const Selector<8>& ifPacket,
const Packet8s& thenPacket,
const Packet8s& elsePacket) {
2139 Packet8us mask =
reinterpret_cast<Packet8us
>(vec_cmpeq(select, p8us_ONE));
2140 Packet8s result = vec_sel(elsePacket, thenPacket, mask);
// Lane-wise blend of two Packet8us values (eight 16-bit lanes).
// The mask is set in every lane where `select` equals p8us_ONE; vec_sel takes thenPacket
// in those lanes and elsePacket in the others.
// NOTE(review): the declaration of `select` is not visible in this excerpt; presumably it
// is filled from ifPacket.select[0..7] -- confirm against the full file.
2144template<> EIGEN_STRONG_INLINE Packet8us pblend(
const Selector<8>& ifPacket,
const Packet8us& thenPacket,
const Packet8us& elsePacket) {
2147 Packet8us mask =
reinterpret_cast<Packet8us
>(vec_cmpeq(
reinterpret_cast<Packet8us
>(select), p8us_ONE));
2148 return vec_sel(elsePacket, thenPacket, mask);
2151template<> EIGEN_STRONG_INLINE Packet8bf pblend(
const Selector<8>& ifPacket,
const Packet8bf& thenPacket,
const Packet8bf& elsePacket) {
2152 return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
// Lane-wise blend of two Packet16c values (sixteen 8-bit lanes).
// The mask is set in every lane where `select` equals p16uc_ONE; vec_sel takes thenPacket
// in those lanes and elsePacket in the others.
// NOTE(review): the declaration of `select` is not visible in this excerpt; presumably it
// is filled from ifPacket.select[0..15] -- confirm against the full file.
2155template<> EIGEN_STRONG_INLINE Packet16c pblend(
const Selector<16>& ifPacket,
const Packet16c& thenPacket,
const Packet16c& elsePacket) {
2161 Packet16uc mask =
reinterpret_cast<Packet16uc
>(vec_cmpeq(
reinterpret_cast<Packet16uc
>(select), p16uc_ONE));
2162 return vec_sel(elsePacket, thenPacket, mask);
// Lane-wise blend of two Packet16uc values (sixteen 8-bit lanes).
// The mask is set in every lane where `select` equals p16uc_ONE; vec_sel takes thenPacket
// in those lanes and elsePacket in the others.
// NOTE(review): the declaration of `select` is not visible in this excerpt; presumably it
// is filled from ifPacket.select[0..15] -- confirm against the full file.
2165template<> EIGEN_STRONG_INLINE Packet16uc pblend(
const Selector<16>& ifPacket,
const Packet16uc& thenPacket,
const Packet16uc& elsePacket) {
2171 Packet16uc mask =
reinterpret_cast<Packet16uc
>(vec_cmpeq(
reinterpret_cast<Packet16uc
>(select), p16uc_ONE));
2172 return vec_sel(elsePacket, thenPacket, mask);
2219template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(
const Packet4i& a) {
2220 return vec_ctf(a,0);
2223template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(
const Packet4ui& a) {
2224 return vec_ctf(a,0);
2227template<> EIGEN_STRONG_INLINE Packet8us pcast<Packet8bf, Packet8us>(
const Packet8bf& a) {
2228 Packet4f float_even = Bf16ToF32Even(a);
2229 Packet4f float_odd = Bf16ToF32Odd(a);
2230 Packet4ui int_even = pcast<Packet4f, Packet4ui>(float_even);
2231 Packet4ui int_odd = pcast<Packet4f, Packet4ui>(float_odd);
2232 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
2233 Packet4ui low_even = pand<Packet4ui>(int_even, p4ui_low_mask);
2234 Packet4ui low_odd = pand<Packet4ui>(int_odd, p4ui_low_mask);
2237 Packet4bi overflow_selector;
2238 if(vec_any_gt(int_even, p4ui_low_mask)){
2239 overflow_selector = vec_cmpgt(int_even, p4ui_low_mask);
2240 low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector);
2242 if(vec_any_gt(int_odd, p4ui_low_mask)){
2243 overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask);
2244 low_odd = vec_sel(low_even, p4ui_low_mask, overflow_selector);
2247 low_odd = plogical_shift_left<16>(low_odd);
2249 Packet4ui int_final = por<Packet4ui>(low_even, low_odd);
2250 return reinterpret_cast<Packet8us
>(int_final);
2253template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8us, Packet8bf>(
const Packet8us& a) {
2255 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
2256 Packet4ui int_cast =
reinterpret_cast<Packet4ui
>(a);
2257 Packet4ui int_even = pand<Packet4ui>(int_cast, p4ui_low_mask);
2258 Packet4ui int_odd = plogical_shift_right<16>(int_cast);
2259 Packet4f float_even = pcast<Packet4ui, Packet4f>(int_even);
2260 Packet4f float_odd = pcast<Packet4ui, Packet4f>(int_odd);
2261 return F32ToBf16(float_even, float_odd);
2265template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(
const Packet4f& a) {
2266 return reinterpret_cast<Packet4i
>(a);
2269template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(
const Packet4i& a) {
2270 return reinterpret_cast<Packet4f
>(a);
// Everything from here on is compiled only when EIGEN_VECTORIZE_VSX is defined.
2276#ifdef EIGEN_VECTORIZE_VSX
// 128-bit vector types with two 64-bit lanes: double, unsigned integer and signed integer.
2277typedef __vector
double Packet2d;
2278typedef __vector
unsigned long long Packet2ul;
2279typedef __vector
long long Packet2l;
// NOTE(review): Packet2bl is typedef'd twice below (as Packet2ul, then as a __bool vector).
// These are presumably alternatives selected by a preprocessor conditional that is not
// visible in this excerpt -- confirm against the full file.
2281typedef Packet2ul Packet2bl;
2283typedef __vector __bool
long Packet2bl;
// Two-lane constants used by the Packet2d / Packet2l routines below.
2286static Packet2l p2l_ONE = { 1, 1 };
2287static Packet2l p2l_ZERO =
reinterpret_cast<Packet2l
>(p4i_ZERO);
// Only the IEEE-754 double sign bit set in each lane.
2288static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull };
// Bit pattern of the largest double strictly below 0.5 (0.5 is 0x3FE0000000000000).
2289static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull };
2290static Packet2d p2d_ONE = { 1.0, 1.0 };
2291static Packet2d p2d_ZERO =
reinterpret_cast<Packet2d
>(p4f_ZERO);
// Negative zero in both lanes (sign bit only).
2292static Packet2d p2d_MZERO = { numext::bit_cast<double>(0x8000000000000000ull),
2293 numext::bit_cast<double>(0x8000000000000000ull) };
// Packet holding one lane of 0.0 and one lane of 1.0, built by an 8-byte vec_sld of
// p2d_ZERO and p2d_ONE; it is added to a splat in plset<Packet2d>.
// NOTE(review): p2d_COUNTDOWN is defined twice with the operands swapped; these are
// presumably endianness alternatives whose #if/#else lines are not visible here -- confirm.
2296static Packet2d p2d_COUNTDOWN =
reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4f
>(p2d_ZERO),
reinterpret_cast<Packet4f
>(p2d_ONE), 8));
2298static Packet2d p2d_COUNTDOWN =
reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4f
>(p2d_ONE),
reinterpret_cast<Packet4f
>(p2d_ZERO), 8));
2301template<
int index> Packet2d vec_splat_dbl(Packet2d& a)
2303 return vec_splat(a, index);
// packet_traits specialization for double: both the full packet type and its "half"
// are Packet2d, and AlignedOnScalar is set to 1.
// NOTE(review): most of the trait enum is not visible in this excerpt; the purpose of
// the `#if !EIGEN_COMP_CLANG` guard cannot be determined from the visible lines.
2306template<>
struct packet_traits<double> : default_packet_traits
2308 typedef Packet2d type;
2309 typedef Packet2d half;
2312 AlignedOnScalar = 1,
2328#if !EIGEN_COMP_CLANG
2342template<>
struct unpacket_traits<Packet2d> {
typedef double type;
enum {size=2, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
typedef Packet2d half; };
// Stream output for Packet2l: writes the two lanes separated by ", ".
// NOTE(review): `vt` is declared on lines not visible in this excerpt (presumably a union
// overlaying the vector with a two-element array `n`) -- confirm against the full file.
2344inline std::ostream & operator <<(std::ostream & s,
const Packet2l & v)
2351 s << vt.n[0] <<
", " << vt.n[1];
// Stream output for Packet2d: writes the two lanes separated by ", ".
// NOTE(review): `vt` is declared on lines not visible in this excerpt (presumably a union
// overlaying the vector with a two-element array `n`) -- confirm against the full file.
2355inline std::ostream & operator <<(std::ostream & s,
const Packet2d & v)
2362 s << vt.n[0] <<
", " << vt.n[1];
2367template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(
const double* from)
2369 EIGEN_DEBUG_ALIGNED_LOAD
2370 return vec_xl(0,
const_cast<double *
>(from));
2373template<> EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const Packet2d& from)
2375 EIGEN_DEBUG_ALIGNED_STORE
2376 vec_xst(from, 0, to);
// Builds a Packet2d with both lanes set to `from`.
// NOTE(review): the return statement is not visible in this excerpt; presumably `v` is returned.
2379template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(
const double& from) {
2380 Packet2d v = {from, from};
2384template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(
unsigned long from) {
2385 Packet2l v = {
static_cast<long long>(from),
static_cast<long long>(from)};
2386 return reinterpret_cast<Packet2d
>(v);
2389template<> EIGEN_STRONG_INLINE
void
2390pbroadcast4<Packet2d>(
const double *a,
2391 Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
2394 a0 = pset1<Packet2d>(a[0]);
2395 a1 = pset1<Packet2d>(a[1]);
2396 a2 = pset1<Packet2d>(a[2]);
2397 a3 = pset1<Packet2d>(a[3]);
2400template<> EIGEN_DEVICE_FUNC
inline Packet2d pgather<double, Packet2d>(
const double* from,
Index stride)
2402 EIGEN_ALIGN16
double af[2];
2403 af[0] = from[0*stride];
2404 af[1] = from[1*stride];
2405 return pload<Packet2d>(af);
2407template<> EIGEN_DEVICE_FUNC
inline void pscatter<double, Packet2d>(
double* to,
const Packet2d& from,
Index stride)
2409 EIGEN_ALIGN16
double af[2];
2410 pstore<double>(af, from);
2411 to[0*stride] = af[0];
2412 to[1*stride] = af[1];
2415template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(
const double& a) {
return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
2417template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return a + b; }
2419template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return a - b; }
// Lane-wise negation of a Packet2d.
// The visible return flips the sign bit of each lane by XOR-ing with p2d_MZERO (-0.0).
// NOTE(review): the branch taken when __POWER8_VECTOR__ is defined, and the matching
// #else/#endif lines, are not visible in this excerpt -- confirm against the full file.
2421template<> EIGEN_STRONG_INLINE Packet2d pnegate(
const Packet2d& a)
2423#ifdef __POWER8_VECTOR__
2426 return vec_xor(a, p2d_MZERO);
2430template<> EIGEN_STRONG_INLINE Packet2d pconj(
const Packet2d& a) {
return a; }
2432template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vec_madd(a,b,p2d_MZERO); }
2433template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vec_div(a,b); }
2436template<> EIGEN_STRONG_INLINE Packet2d pmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
return vec_madd(a, b, c); }
// Lane-wise minimum of two Packet2d values, written as inline VSX assembly.
// xvcmpgedp writes the mask (a >= b) into the output register; xxsel then picks b where
// the mask is set and a elsewhere, i.e. each lane is (a >= b) ? b : a.
// The output uses an early-clobber constraint ("=&wa") because it is written before the
// inputs are last read.
// NOTE(review): the declaration of `ret` and the return statement are not visible in
// this excerpt -- confirm against the full file.
2438template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(
const Packet2d& a,
const Packet2d& b)
2442 __asm__ (
"xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" :
"=&wa" (ret) :
"wa" (a),
"wa" (b));
// Lane-wise maximum of two Packet2d values, written as inline VSX assembly.
// xvcmpgtdp writes the mask (b > a) into the output register; xxsel then picks b where
// the mask is set and a elsewhere, i.e. each lane is (b > a) ? b : a.
// The output uses an early-clobber constraint ("=&wa") because it is written before the
// inputs are last read.
// NOTE(review): the declaration of `ret` and the return statement are not visible in
// this excerpt -- confirm against the full file.
2446template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(
const Packet2d& a,
const Packet2d& b)
2450 __asm__ (
"xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" :
"=&wa" (ret) :
"wa" (a),
"wa" (b));
2454template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(
const Packet2d& a,
const Packet2d& b) {
return reinterpret_cast<Packet2d
>(vec_cmple(a,b)); }
2455template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(
const Packet2d& a,
const Packet2d& b) {
return reinterpret_cast<Packet2d
>(vec_cmplt(a,b)); }
2456template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(
const Packet2d& a,
const Packet2d& b) {
return reinterpret_cast<Packet2d
>(vec_cmpeq(a,b)); }
2457template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(
const Packet2d& a,
const Packet2d& b) {
2458 Packet2d c =
reinterpret_cast<Packet2d
>(vec_cmpge(a,b));
2459 return vec_nor(c,c);
2462template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vec_and(a, b); }
2464template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vec_or(a, b); }
2466template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vec_xor(a, b); }
2468template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vec_and(a, vec_nor(b, b)); }
// Rounds each lane of a Packet2d to an integral value.
// `t` is a plus a constant that carries a's sign bit OR-ed with the p2ul_PREV0DOT5 bit
// pattern (the largest double below 0.5); the inline assembly then applies xvrdpiz
// (round to integer toward zero).
// NOTE(review): the asm operand list, the result variable and the return statement are
// not visible in this excerpt -- presumably `t` is the asm input; confirm.
2470template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(
const Packet2d& a)
2472 Packet2d t = vec_add(
reinterpret_cast<Packet2d
>(vec_or(vec_and(
reinterpret_cast<Packet2ul
>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
2475 __asm__(
"xvrdpiz %x0, %x1\n\t"
2481template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(
const Packet2d& a) {
return vec_ceil(a); }
2482template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(
const Packet2d& a) {
return vec_floor(a); }
// Rounds each lane of a Packet2d to an integral value using the xvrdpic instruction
// (round to integer using the current rounding mode).
// NOTE(review): the asm operand list, the result variable and the return statement are
// not visible in this excerpt -- confirm against the full file.
2483template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(
const Packet2d& a)
2487 __asm__(
"xvrdpic %x0, %x1\n\t"
2494template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(
const double* from)
2496 EIGEN_DEBUG_UNALIGNED_LOAD
2497 return vec_xl(0,
const_cast<double*
>(from));
// Loads a packet starting at `from` and returns its first lane duplicated in both lanes.
// The aligned load is used when the address is a multiple of 16, the unaligned load
// otherwise; vec_splat_dbl<0> then broadcasts lane 0.
// NOTE(review): the declaration of `p` is not visible in this excerpt.
2500template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(
const double* from)
2503 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from);
2504 else p = ploadu<Packet2d>(from);
2505 return vec_splat_dbl<0>(p);
2508template<> EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const Packet2d& from)
2510 EIGEN_DEBUG_UNALIGNED_STORE
2511 vec_xst(from, 0, to);
2514template<> EIGEN_STRONG_INLINE
void prefetch<double>(
const double* addr) { EIGEN_PPC_PREFETCH(addr); }
2516template<> EIGEN_STRONG_INLINE
double pfirst<Packet2d>(
const Packet2d& a) { EIGEN_ALIGN16
double x[2]; pstore<double>(x, a);
return x[0]; }
2518template<> EIGEN_STRONG_INLINE Packet2d preverse(
const Packet2d& a)
2520 return reinterpret_cast<Packet2d
>(vec_perm(
reinterpret_cast<Packet16uc
>(a),
reinterpret_cast<Packet16uc
>(a), p16uc_REVERSE64));
2522template<> EIGEN_STRONG_INLINE Packet2d pabs(
const Packet2d& a) {
return vec_abs(a); }
// Converts two doubles to two signed 64-bit integers.
// On the GCC versions matched by the #if below, vec_cts with scale 0 is used; the other
// visible path copies the lanes into a scalar array `tmp` and converts each element with
// static_cast<long long>.
// NOTE(review): the template header, the #else/#endif lines, the declaration of `tmp`
// and the final return are not visible in this excerpt -- confirm against the full file.
2531inline Packet2l pcast<Packet2d, Packet2l>(
const Packet2d& x) {
2532#if EIGEN_GNUC_AT_LEAST(5, 4) || \
2533 (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
2534 return vec_cts(x, 0);
2537 memcpy(tmp, &x,
sizeof(tmp));
2538 Packet2l l = {
static_cast<long long>(tmp[0]),
2539 static_cast<long long>(tmp[1]) };
// Converts two 64-bit integer lanes to two doubles by copying them into a scalar array
// and converting each element with static_cast<double>.
// NOTE(review): the lanes are read back as `unsigned long long`, so a negative Packet2l
// lane would be converted as a large positive value. The caller visible in this file
// (pfrexp_generic_get_biased_exponent) passes a logically right-shifted value, which is
// non-negative; confirm no other caller relies on signed conversion.
// NOTE(review): the template header and the return statement are not visible here.
2545inline Packet2d pcast<Packet2l, Packet2d>(
const Packet2l& x) {
2546 unsigned long long tmp[2];
2547 memcpy(tmp, &x,
sizeof(tmp));
2548 Packet2d d = {
static_cast<double>(tmp[0]),
2549 static_cast<double>(tmp[1]) };
// When __POWER8_VECTOR__ is defined, 64-bit lane shifts map directly onto vec_sl / vec_sr.
2560#ifdef __POWER8_VECTOR__
// Shifts each 64-bit lane of `a` left by N bits using vec_sl with a splatted shift count.
// NOTE(review): N is used as a compile-time value, but the `template<int N>` header is
// not visible in this excerpt -- confirm against the full file.
2563EIGEN_STRONG_INLINE Packet2l plogical_shift_left(
const Packet2l& a) {
2564 const Packet2ul shift = {
N,
N };
2565 return vec_sl(a, shift);
// Shifts each 64-bit lane of `a` right by N bits using vec_sr with a splatted shift count.
// NOTE(review): N is used as a compile-time value, but the `template<int N>` header is
// not visible in this excerpt -- confirm against the full file.
2569EIGEN_STRONG_INLINE Packet2l plogical_shift_right(
const Packet2l& a) {
2570 const Packet2ul shift = {
N,
N };
2571 return vec_sr(a, shift);
// Helper for the emulated 64-bit shifts: permutes the 32-bit words of `a` together with
// words of p4i_ZERO so that one word of every 64-bit lane moves into the neighbouring
// position and the vacated word becomes zero.
// NOTE(review): two return statements with swapped vec_perm operands are visible; they
// are presumably endianness alternatives whose #if/#else/#endif lines are not visible
// in this excerpt -- confirm against the full file.
2578EIGEN_ALWAYS_INLINE Packet4i shift_even_left(
const Packet4i& a) {
2579 static const Packet16uc perm = {
2580 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
2581 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
2583 return vec_perm(p4i_ZERO, a, perm);
2585 return vec_perm(a, p4i_ZERO, perm);
// Helper for the emulated 64-bit shifts: the counterpart of shift_even_left, moving the
// other 32-bit word of every 64-bit lane into the neighbouring position and filling the
// vacated word from p4i_ZERO.
// NOTE(review): two return statements with swapped vec_perm operands are visible; they
// are presumably endianness alternatives whose #if/#else/#endif lines are not visible
// in this excerpt -- confirm against the full file.
2591EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(
const Packet4i& a) {
2592 static const Packet16uc perm = {
2593 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
2594 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b };
2596 return vec_perm(p4i_ZERO, a, perm);
2598 return vec_perm(a, p4i_ZERO, perm);
// Primary template for the emulated 64-bit left shift; only the specializations that are
// selected through the EnableIf parameter are defined.
template<int N, typename EnableIf = void>
struct plogical_shift_left_impl;
// Emulated 64-bit left shift for 0 <= N < 32, built from 32-bit lane operations:
//  - out_hi: every 32-bit word shifted left by N;
//  - out_lo: the bits shifted out of a word (word >> (32 - N)), moved into the
//    neighbouring word by shift_even_left;
//  - the two parts are OR-ed together and reinterpreted as Packet2l.
// NOTE(review): for N == 0 the right-shift count m is 32; vec_sr uses the shift count
// modulo the element width, so this case looks like it would OR an un-shifted neighbour
// word into the result -- verify or guard if N == 0 is ever instantiated.
// NOTE(review): the `template<int N>` header is not visible in this excerpt.
2606struct plogical_shift_left_impl<
N, typename enable_if<(
N < 32) && (
N >= 0)>::type> {
2607 static EIGEN_STRONG_INLINE Packet2l run(
const Packet2l& a) {
2608 static const unsigned n =
static_cast<unsigned>(
N);
2609 const Packet4ui shift = {n, n, n, n};
2610 const Packet4i ai =
reinterpret_cast<Packet4i
>(a);
2611 static const unsigned m =
static_cast<unsigned>(32 -
N);
2612 const Packet4ui shift_right = {m, m, m, m};
2613 const Packet4i out_hi = vec_sl(ai, shift);
2614 const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right));
2615 return reinterpret_cast<Packet2l
>(por<Packet4i>(out_hi, out_lo));
// Emulated 64-bit left shift for N >= 32: every 32-bit word is shifted left by N - 32 and
// shift_even_left then moves the result into the neighbouring word, zero-filling the rest.
// NOTE(review): the `template<int N>` header is not visible in this excerpt.
2620struct plogical_shift_left_impl<
N, typename enable_if<(
N >= 32)>::type> {
2621 static EIGEN_STRONG_INLINE Packet2l run(
const Packet2l& a) {
2622 static const unsigned m =
static_cast<unsigned>(
N - 32);
2623 const Packet4ui shift = {m, m, m, m};
2624 const Packet4i ai =
reinterpret_cast<Packet4i
>(a);
2625 return reinterpret_cast<Packet2l
>(shift_even_left(vec_sl(ai, shift)));
// Logical left shift of each 64-bit lane by N bits; dispatches to the
// plogical_shift_left_impl<N> specialization.
// NOTE(review): the `template<int N>` header is not visible in this excerpt.
2630EIGEN_STRONG_INLINE Packet2l plogical_shift_left(
const Packet2l& a) {
2631 return plogical_shift_left_impl<N>::run(a);
// Primary template for the emulated 64-bit right shift; only the specializations that are
// selected through the EnableIf parameter are defined.
template<int N, typename EnableIf = void>
struct plogical_shift_right_impl;
// Emulated 64-bit logical right shift for 0 <= N < 32, built from 32-bit lane operations:
//  - out_lo: every 32-bit word shifted right by N;
//  - out_hi: the bits shifted out of a word (word << (32 - N)), moved into the
//    neighbouring word by shift_odd_right;
//  - the two parts are OR-ed together and reinterpreted as Packet2l.
// NOTE(review): for N == 0 the left-shift count m is 32; vec_sl uses the shift count
// modulo the element width, so this case looks like it would OR an un-shifted neighbour
// word into the result -- verify or guard if N == 0 is ever instantiated.
// NOTE(review): the `template<int N>` header is not visible in this excerpt.
2638struct plogical_shift_right_impl<
N, typename enable_if<(
N < 32) && (
N >= 0)>::type> {
2639 static EIGEN_STRONG_INLINE Packet2l run(
const Packet2l& a) {
2640 static const unsigned n =
static_cast<unsigned>(
N);
2641 const Packet4ui shift = {n, n, n, n};
2642 const Packet4i ai =
reinterpret_cast<Packet4i
>(a);
2643 static const unsigned m =
static_cast<unsigned>(32 -
N);
2644 const Packet4ui shift_left = {m, m, m, m};
2645 const Packet4i out_lo = vec_sr(ai, shift);
2646 const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left));
2647 return reinterpret_cast<Packet2l
>(por<Packet4i>(out_hi, out_lo));
// Emulated 64-bit logical right shift for N >= 32: every 32-bit word is shifted right by
// N - 32 and shift_odd_right then moves the result into the neighbouring word,
// zero-filling the rest.
// NOTE(review): the `template<int N>` header is not visible in this excerpt.
2652struct plogical_shift_right_impl<
N, typename enable_if<(
N >= 32)>::type> {
2653 static EIGEN_STRONG_INLINE Packet2l run(
const Packet2l& a) {
2654 static const unsigned m =
static_cast<unsigned>(
N - 32);
2655 const Packet4ui shift = {m, m, m, m};
2656 const Packet4i ai =
reinterpret_cast<Packet4i
>(a);
2657 return reinterpret_cast<Packet2l
>(shift_odd_right(vec_sr(ai, shift)));
// Logical right shift of each 64-bit lane by N bits; dispatches to the
// plogical_shift_right_impl<N> specialization.
// NOTE(review): the `template<int N>` header is not visible in this excerpt.
2662EIGEN_STRONG_INLINE Packet2l plogical_shift_right(
const Packet2l& a) {
2663 return plogical_shift_right_impl<N>::run(a);
// pldexp for Packet2d: scales `a` by 2 raised to `exponent`, lane-wise.
// Visible steps:
//  - the exponent is clamped to [-2099, 2099] and converted to 64-bit integers `e`;
//  - b = e logically shifted right by 2; c is the double whose raw bits are
//    (b + 1023) << 52, i.e. the biased value is placed in the exponent field;
//  - out = a * c * c * c;
//  - b is then replaced by e - 3*b (computed as three subtractions) and c rebuilt from it.
// NOTE(review): the remaining statements (presumably a final multiplication by c and the
// return) are not visible in this excerpt -- confirm against the full file.
2667template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(
const Packet2d& a,
const Packet2d& exponent) {
2669 const Packet2d max_exponent = pset1<Packet2d>(2099.0);
2670 const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
2673 const Packet2l bias = { 1023, 1023 };
2674 Packet2l b = plogical_shift_right<2>(e);
2675 Packet2d c =
reinterpret_cast<Packet2d
>(plogical_shift_left<52>(b + bias));
2676 Packet2d out = pmul(pmul(pmul(a, c), c), c);
2677 b = psub(psub(psub(e, b), b), b);
2678 c =
reinterpret_cast<Packet2d
>(plogical_shift_left<52>(b + bias));
// Returns, as doubles, the raw bits of |a| logically shifted right by 52 in each lane,
// i.e. the biased exponent field of each double (the sign bit is cleared by pabs first).
// NOTE(review): the lines preceding the return type (template header / inline qualifier)
// are not visible in this excerpt.
2687Packet2d pfrexp_generic_get_biased_exponent(
const Packet2d& a) {
2688 return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(
reinterpret_cast<Packet2l
>(pabs(a))));
2691template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d> (
const Packet2d& a, Packet2d& exponent) {
2692 return pfrexp_generic(a, exponent);
// Horizontal reduction of a Packet2d.
// `b` is `a` rotated by 8 bytes (vec_sld of a with itself), i.e. with its two lanes
// swapped; the first lane of `sum` is returned.
// NOTE(review): the declarations of `b` and `sum` and the statement computing `sum`
// (presumably a + b) are not visible in this excerpt -- confirm against the full file.
2695template<> EIGEN_STRONG_INLINE
double predux<Packet2d>(
const Packet2d& a)
2698 b =
reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4f
>(a),
reinterpret_cast<Packet4f
>(a), 8));
2700 return pfirst<Packet2d>(sum);
2705template<> EIGEN_STRONG_INLINE
double predux_mul<Packet2d>(
const Packet2d& a)
2707 return pfirst(pmul(a,
reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4ui
>(a),
reinterpret_cast<Packet4ui
>(a), 8))));
2711template<> EIGEN_STRONG_INLINE
double predux_min<Packet2d>(
const Packet2d& a)
2713 return pfirst(pmin(a,
reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4ui
>(a),
reinterpret_cast<Packet4ui
>(a), 8))));
2717template<> EIGEN_STRONG_INLINE
double predux_max<Packet2d>(
const Packet2d& a)
2719 return pfirst(pmax(a,
reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4ui
>(a),
reinterpret_cast<Packet4ui
>(a), 8))));
// In-place transpose of a 2x2 block of doubles.
// Both outputs are byte permutations of the two input packets, using the
// p16uc_TRANSPOSE64_HI and p16uc_TRANSPOSE64_LO patterns; the temporaries are needed
// because both permutations read both input packets before either is overwritten.
// NOTE(review): the declaration of t0/t1 is not visible in this excerpt.
2722EIGEN_DEVICE_FUNC
inline void
2723ptranspose(PacketBlock<Packet2d,2>& kernel) {
2725 t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
2726 t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
2727 kernel.packet[0] = t0;
2728 kernel.packet[1] = t1;
2731template<> EIGEN_STRONG_INLINE Packet2d pblend(
const Selector<2>& ifPacket,
const Packet2d& thenPacket,
const Packet2d& elsePacket) {
2732 Packet2l select = { ifPacket.
select[0], ifPacket.
select[1] };
2733 Packet2bl mask =
reinterpret_cast<Packet2bl
>( vec_cmpeq(
reinterpret_cast<Packet2d
>(select),
reinterpret_cast<Packet2d
>(p2l_ONE)) );
2734 return vec_sel(elsePacket, thenPacket, mask);
EIGEN_DEVICE_FUNC const Select< Derived, ThenDerived, ElseDerived > select(const DenseBase< ThenDerived > &thenMatrix, const DenseBase< ElseDerived > &elseMatrix) const
Definition Select.h:126
Base class for all dense matrices, vectors, and expressions.
Definition MatrixBase.h:50
@ Aligned16
Data pointer is aligned on a 16 bytes boundary.
Definition Constants.h:235
Namespace containing all symbols from the Eigen library.
Definition LDLT.h:16
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:74
int N
Simulate some binary data with a single categorical and single continuous predictor.
Definition logistic_regression.py:26
Definition GenericPacketMath.h:43
Definition GenericPacketMath.h:160
Definition GenericPacketMath.h:107
Definition GenericPacketMath.h:148
Definition GenericPacketMath.h:133