Medial Code Documentation
Loading...
Searching...
No Matches
PacketMath.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_PACKET_MATH_ALTIVEC_H
11#define EIGEN_PACKET_MATH_ALTIVEC_H
12
13namespace Eigen {
14
15namespace internal {
16
17#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
18#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
19#endif
20
21#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
22#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
23#endif
24
// NOTE Altivec has 32 registers; the default below advertises all 32 unless it was already defined.
26#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
27#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
28#endif
29
30typedef __vector float Packet4f;
31typedef __vector int Packet4i;
32typedef __vector unsigned int Packet4ui;
33typedef __vector __bool int Packet4bi;
34typedef __vector short int Packet8s;
35typedef __vector unsigned short int Packet8us;
36typedef __vector signed char Packet16c;
37typedef __vector unsigned char Packet16uc;
38typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf;
39
40// We don't want to write the same code all the time, but we need to reuse the constants
41// and it doesn't really work to declare them global, so we define macros instead
42#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
43 Packet4f p4f_##NAME = {X, X, X, X}
44
45#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
46 Packet4i p4i_##NAME = vec_splat_s32(X)
47
48#define _EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \
49 Packet4ui p4ui_##NAME = {X, X, X, X}
50
51#define _EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \
52 Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
53
54#define _EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \
55 Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
56
57#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
58 Packet4f p4f_##NAME = pset1<Packet4f>(X)
59
60#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
61 Packet4i p4i_##NAME = pset1<Packet4i>(X)
62
63#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
64 Packet2d p2d_##NAME = pset1<Packet2d>(X)
65
66#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
67 Packet2l p2l_##NAME = pset1<Packet2l>(X)
68
69#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
70 const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
71
72#define DST_CHAN 1
73#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
74#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
75
76// These constants are endian-agnostic
77static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
78static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
79static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
80static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
81static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
82static _EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
83static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
84static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
85static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1);
86static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
87#ifndef EIGEN_VECTORIZE_VSX
88static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
89#endif
90
91static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
92static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
93static Packet8s p8s_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
94static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
95
96static Packet16c p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
97 8, 9, 10, 11, 12, 13, 14, 15};
98static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
99 8, 9, 10, 11, 12, 13, 14, 15};
100
101static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
102static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 };
103static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 };
104
105static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
106static Packet16uc p16uc_DUPLICATE16_HI = { 0,1,0,1, 2,3,2,3, 4,5,4,5, 6,7,6,7 };
107static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 };
108static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 };
109static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 };
110
111static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 };
112
113// Handle endianness properly while loading constants
114// Define global static constants:
115#ifdef _BIG_ENDIAN
116static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
117#ifdef EIGEN_VECTORIZE_VSX
118static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
119#endif
120static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
121static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
122static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
123#else
124static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
125static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
126static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
127static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
128static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
129#endif // _BIG_ENDIAN
130
131static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
132static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
133static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
134static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
135
136static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
137
138#ifdef _BIG_ENDIAN
139static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
140#else
141static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
142#endif // _BIG_ENDIAN
143
144#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
145 #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
146#else
147 #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
148#endif
149
150template <>
152 typedef Packet4f type;
153 typedef Packet4f half;
154 enum {
155 Vectorizable = 1,
156 AlignedOnScalar = 1,
157 size = 4,
158 HasHalfPacket = 1,
159
160 HasAdd = 1,
161 HasSub = 1,
162 HasMul = 1,
163 HasDiv = 1,
164 HasMin = 1,
165 HasMax = 1,
166 HasAbs = 1,
167 HasSin = EIGEN_FAST_MATH,
168 HasCos = EIGEN_FAST_MATH,
169 HasLog = 1,
170 HasExp = 1,
171#ifdef EIGEN_VECTORIZE_VSX
172 HasSqrt = 1,
173#if !EIGEN_COMP_CLANG
174 HasRsqrt = 1,
175#else
176 HasRsqrt = 0,
177#endif
178 HasTanh = EIGEN_FAST_MATH,
179 HasErf = EIGEN_FAST_MATH,
180 HasRint = 1,
181#else
182 HasSqrt = 0,
183 HasRsqrt = 0,
184 HasTanh = EIGEN_FAST_MATH,
185 HasErf = EIGEN_FAST_MATH,
186#endif
187 HasRound = 1,
188 HasFloor = 1,
189 HasCeil = 1,
190 HasNegate = 1,
191 HasBlend = 1
192 };
193};
194template <>
196 typedef Packet8bf type;
197 typedef Packet8bf half;
198 enum {
199 Vectorizable = 1,
200 AlignedOnScalar = 1,
201 size = 8,
202 HasHalfPacket = 0,
203
204 HasAdd = 1,
205 HasSub = 1,
206 HasMul = 1,
207 HasDiv = 1,
208 HasMin = 1,
209 HasMax = 1,
210 HasAbs = 1,
211 HasSin = EIGEN_FAST_MATH,
212 HasCos = EIGEN_FAST_MATH,
213 HasLog = 1,
214 HasExp = 1,
215#ifdef EIGEN_VECTORIZE_VSX
216 HasSqrt = 1,
217#if !EIGEN_COMP_CLANG
218 HasRsqrt = 1,
219#else
220 HasRsqrt = 0,
221#endif
222 HasRint = 1,
223#else
224 HasSqrt = 0,
225 HasRsqrt = 0,
226 HasRint = 0,
227#endif
228 HasTanh = 0,
229 HasErf = 0,
230 HasRound = 1,
231 HasFloor = 1,
232 HasCeil = 1,
233 HasNegate = 1,
234 HasBlend = 1
235 };
236};
237
238template <>
240 typedef Packet4i type;
241 typedef Packet4i half;
242 enum {
243 Vectorizable = 1,
244 AlignedOnScalar = 1,
245 size = 4,
246 HasHalfPacket = 0,
247
248 HasAdd = 1,
249 HasSub = 1,
250 HasShift = 1,
251 HasMul = 1,
252 HasDiv = 0,
253 HasBlend = 1
254 };
255};
256
257template <>
259 typedef Packet8s type;
260 typedef Packet8s half;
261 enum {
262 Vectorizable = 1,
263 AlignedOnScalar = 1,
264 size = 8,
265 HasHalfPacket = 0,
266
267 HasAdd = 1,
268 HasSub = 1,
269 HasMul = 1,
270 HasDiv = 0,
271 HasBlend = 1
272 };
273};
274
275template <>
277 typedef Packet8us type;
278 typedef Packet8us half;
279 enum {
280 Vectorizable = 1,
281 AlignedOnScalar = 1,
282 size = 8,
283 HasHalfPacket = 0,
284
285 HasAdd = 1,
286 HasSub = 1,
287 HasMul = 1,
288 HasDiv = 0,
289 HasBlend = 1
290 };
291};
292
293template <>
295 typedef Packet16c type;
296 typedef Packet16c half;
297 enum {
298 Vectorizable = 1,
299 AlignedOnScalar = 1,
300 size = 16,
301 HasHalfPacket = 0,
302
303 HasAdd = 1,
304 HasSub = 1,
305 HasMul = 1,
306 HasDiv = 0,
307 HasBlend = 1
308 };
309};
310
311template <>
313 typedef Packet16uc type;
314 typedef Packet16uc half;
315 enum {
316 Vectorizable = 1,
317 AlignedOnScalar = 1,
318 size = 16,
319 HasHalfPacket = 0,
320
321 HasAdd = 1,
322 HasSub = 1,
323 HasMul = 1,
324 HasDiv = 0,
325 HasBlend = 1
326 };
327};
328
329template<> struct unpacket_traits<Packet4f>
330{
331 typedef float type;
332 typedef Packet4f half;
333 typedef Packet4i integer_packet;
334 enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
335};
336template<> struct unpacket_traits<Packet4i>
337{
338 typedef int type;
339 typedef Packet4i half;
340 enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
341};
342template<> struct unpacket_traits<Packet8s>
343{
344 typedef short int type;
345 typedef Packet8s half;
346 enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
347};
348template<> struct unpacket_traits<Packet8us>
349{
350 typedef unsigned short int type;
351 typedef Packet8us half;
352 enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
353};
354
355template<> struct unpacket_traits<Packet16c>
356{
357 typedef signed char type;
358 typedef Packet16c half;
359 enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
360};
361template<> struct unpacket_traits<Packet16uc>
362{
363 typedef unsigned char type;
364 typedef Packet16uc half;
365 enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
366};
367
368template<> struct unpacket_traits<Packet8bf>
369{
370 typedef bfloat16 type;
371 typedef Packet8bf half;
372 enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
373};
374inline std::ostream & operator <<(std::ostream & s, const Packet16c & v)
375{
376 union {
377 Packet16c v;
378 signed char n[16];
379 } vt;
380 vt.v = v;
381 for (int i=0; i< 16; i++)
382 s << vt.n[i] << ", ";
383 return s;
384}
385
386inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
387{
388 union {
389 Packet16uc v;
390 unsigned char n[16];
391 } vt;
392 vt.v = v;
393 for (int i=0; i< 16; i++)
394 s << vt.n[i] << ", ";
395 return s;
396}
397
398inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
399{
400 union {
401 Packet4f v;
402 float n[4];
403 } vt;
404 vt.v = v;
405 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
406 return s;
407}
408
409inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
410{
411 union {
412 Packet4i v;
413 int n[4];
414 } vt;
415 vt.v = v;
416 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
417 return s;
418}
419
420inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
421{
422 union {
423 Packet4ui v;
424 unsigned int n[4];
425 } vt;
426 vt.v = v;
427 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
428 return s;
429}
430
431template <typename Packet>
432EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from)
433{
434 // some versions of GCC throw "unused-but-set-parameter".
435 // ignoring these warnings for now.
436 EIGEN_UNUSED_VARIABLE(from);
437 EIGEN_DEBUG_ALIGNED_LOAD
438#ifdef EIGEN_VECTORIZE_VSX
439 return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
440#else
441 return vec_ld(0, from);
442#endif
443}
444
445// Need to define them first or we get specialization after instantiation errors
446template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
447{
448 return pload_common<Packet4f>(from);
449}
450
451template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
452{
453 return pload_common<Packet4i>(from);
454}
455
456template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from)
457{
458 return pload_common<Packet8s>(from);
459}
460
461template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from)
462{
463 return pload_common<Packet8us>(from);
464}
465
466template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from)
467{
468 return pload_common<Packet16c>(from);
469}
470
471template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from)
472{
473 return pload_common<Packet16uc>(from);
474}
475
476template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from)
477{
478 return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
479}
480
481template <typename Packet>
482EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
483 // some versions of GCC throw "unused-but-set-parameter" (float *to).
484 // ignoring these warnings for now.
485 EIGEN_UNUSED_VARIABLE(to);
486 EIGEN_DEBUG_ALIGNED_STORE
487#ifdef EIGEN_VECTORIZE_VSX
488 vec_xst(from, 0, to);
489#else
490 vec_st(from, 0, to);
491#endif
492}
493
494template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
495{
496 pstore_common<Packet4f>(to, from);
497}
498
499template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
500{
501 pstore_common<Packet4i>(to, from);
502}
503
504template<> EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from)
505{
506 pstore_common<Packet8s>(to, from);
507}
508
509template<> EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from)
510{
511 pstore_common<Packet8us>(to, from);
512}
513
514template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from)
515{
516 pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from);
517}
518
519template<> EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from)
520{
521 pstore_common<Packet16c>(to, from);
522}
523
524template<> EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from)
525{
526 pstore_common<Packet16uc>(to, from);
527}
528
529template<typename Packet>
530EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
531{
532 Packet v = {from, from, from, from};
533 return v;
534}
535
536template<typename Packet>
537EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet)& from)
538{
539 Packet v = {from, from, from, from, from, from, from, from};
540 return v;
541}
542
543template<typename Packet>
544EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet)& from)
545{
546 Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
547 return v;
548}
549
550template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
551 return pset1_size4<Packet4f>(from);
552}
553
554template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
555 return pset1_size4<Packet4i>(from);
556}
557
558template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) {
559 return pset1_size8<Packet8s>(from);
560}
561
562template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) {
563 return pset1_size8<Packet8us>(from);
564}
565
566template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) {
567 return pset1_size16<Packet16c>(from);
568}
569
570template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) {
571 return pset1_size16<Packet16uc>(from);
572}
573
574template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
575 return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
576}
577
578template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
579 return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
580}
581
582template<typename Packet> EIGEN_STRONG_INLINE void
583pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a,
584 Packet& a0, Packet& a1, Packet& a2, Packet& a3)
585{
586 a3 = pload<Packet>(a);
587 a0 = vec_splat(a3, 0);
588 a1 = vec_splat(a3, 1);
589 a2 = vec_splat(a3, 2);
590 a3 = vec_splat(a3, 3);
591}
592
593template<> EIGEN_STRONG_INLINE void
594pbroadcast4<Packet4f>(const float *a,
595 Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
596{
597 pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
598}
599template<> EIGEN_STRONG_INLINE void
600pbroadcast4<Packet4i>(const int *a,
601 Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
602{
603 pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
604}
605
606template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride)
607{
608 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
609 a[0] = from[0*stride];
610 a[1] = from[1*stride];
611 a[2] = from[2*stride];
612 a[3] = from[3*stride];
613 return pload<Packet>(a);
614}
615
616template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
617{
618 return pgather_common<Packet4f>(from, stride);
619}
620
621template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
622{
623 return pgather_common<Packet4i>(from, stride);
624}
625
626template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride)
627{
628 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
629 a[0] = from[0*stride];
630 a[1] = from[1*stride];
631 a[2] = from[2*stride];
632 a[3] = from[3*stride];
633 a[4] = from[4*stride];
634 a[5] = from[5*stride];
635 a[6] = from[6*stride];
636 a[7] = from[7*stride];
637 return pload<Packet>(a);
638}
639
640template<> EIGEN_DEVICE_FUNC inline Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
641{
642 return pgather_size8<Packet8s>(from, stride);
643}
644
645template<> EIGEN_DEVICE_FUNC inline Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
646{
647 return pgather_size8<Packet8us>(from, stride);
648}
649
650template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
651{
652 return pgather_size8<Packet8bf>(from, stride);
653}
654
655template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride)
656{
657 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
658 a[0] = from[0*stride];
659 a[1] = from[1*stride];
660 a[2] = from[2*stride];
661 a[3] = from[3*stride];
662 a[4] = from[4*stride];
663 a[5] = from[5*stride];
664 a[6] = from[6*stride];
665 a[7] = from[7*stride];
666 a[8] = from[8*stride];
667 a[9] = from[9*stride];
668 a[10] = from[10*stride];
669 a[11] = from[11*stride];
670 a[12] = from[12*stride];
671 a[13] = from[13*stride];
672 a[14] = from[14*stride];
673 a[15] = from[15*stride];
674 return pload<Packet>(a);
675}
676
677
678template<> EIGEN_DEVICE_FUNC inline Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
679{
680 return pgather_size16<Packet16c>(from, stride);
681}
682
683template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride)
684{
685 return pgather_size16<Packet16uc>(from, stride);
686}
687
688template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
689{
690 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
691 pstore<__UNPACK_TYPE__(Packet)>(a, from);
692 to[0*stride] = a[0];
693 to[1*stride] = a[1];
694 to[2*stride] = a[2];
695 to[3*stride] = a[3];
696}
697
698template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
699{
700 pscatter_size4<Packet4f>(to, from, stride);
701}
702
703template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
704{
705 pscatter_size4<Packet4i>(to, from, stride);
706}
707
708template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
709{
710 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8];
711 pstore<__UNPACK_TYPE__(Packet)>(a, from);
712 to[0*stride] = a[0];
713 to[1*stride] = a[1];
714 to[2*stride] = a[2];
715 to[3*stride] = a[3];
716 to[4*stride] = a[4];
717 to[5*stride] = a[5];
718 to[6*stride] = a[6];
719 to[7*stride] = a[7];
720}
721
722
723template<> EIGEN_DEVICE_FUNC inline void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
724{
725 pscatter_size8<Packet8s>(to, from, stride);
726}
727
728template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
729{
730 pscatter_size8<Packet8us>(to, from, stride);
731}
732
733template<> EIGEN_DEVICE_FUNC inline void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
734{
735 pscatter_size8<Packet8bf>(to, from, stride);
736}
737
738template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
739{
740 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16];
741 pstore<__UNPACK_TYPE__(Packet)>(a, from);
742 to[0*stride] = a[0];
743 to[1*stride] = a[1];
744 to[2*stride] = a[2];
745 to[3*stride] = a[3];
746 to[4*stride] = a[4];
747 to[5*stride] = a[5];
748 to[6*stride] = a[6];
749 to[7*stride] = a[7];
750 to[8*stride] = a[8];
751 to[9*stride] = a[9];
752 to[10*stride] = a[10];
753 to[11*stride] = a[11];
754 to[12*stride] = a[12];
755 to[13*stride] = a[13];
756 to[14*stride] = a[14];
757 to[15*stride] = a[15];
758}
759
760template<> EIGEN_DEVICE_FUNC inline void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
761{
762 pscatter_size16<Packet16c>(to, from, stride);
763}
764
765template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
766{
767 pscatter_size16<Packet16uc>(to, from, stride);
768}
769
770template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
771template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN; }
772template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) { return pset1<Packet8s>(a) + p8s_COUNTDOWN; }
773template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) { return pset1<Packet8us>(a) + p8us_COUNTDOWN; }
774template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) { return pset1<Packet16c>(a) + p16c_COUNTDOWN; }
775template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) { return pset1<Packet16uc>(a) + p16uc_COUNTDOWN; }
776
777template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f> (const Packet4f& a, const Packet4f& b) { return a + b; }
778template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i> (const Packet4i& a, const Packet4i& b) { return a + b; }
779template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui> (const Packet4ui& a, const Packet4ui& b) { return a + b; }
780template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s> (const Packet8s& a, const Packet8s& b) { return a + b; }
781template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us> (const Packet8us& a, const Packet8us& b) { return a + b; }
782template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c> (const Packet16c& a, const Packet16c& b) { return a + b; }
783template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a + b; }
784
785template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f> (const Packet4f& a, const Packet4f& b) { return a - b; }
786template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i> (const Packet4i& a, const Packet4i& b) { return a - b; }
787template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s> (const Packet8s& a, const Packet8s& b) { return a - b; }
788template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us> (const Packet8us& a, const Packet8us& b) { return a - b; }
789template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c> (const Packet16c& a, const Packet16c& b) { return a - b; }
790template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; }
791
792template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
793{
794#ifdef __POWER8_VECTOR__
795 return vec_neg(a);
796#else
797 return vec_xor(a, p4f_MZERO);
798#endif
799}
800template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
801{
802#ifdef __POWER8_VECTOR__
803 return vec_neg(a);
804#else
805 return p4i_ZERO - a;
806#endif
807}
808
809template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
810template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
811
812template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); }
813template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i> (const Packet4i& a, const Packet4i& b) { return a * b; }
814template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s> (const Packet8s& a, const Packet8s& b) { return vec_mul(a,b); }
815template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us> (const Packet8us& a, const Packet8us& b) { return vec_mul(a,b); }
816template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c> (const Packet16c& a, const Packet16c& b) { return vec_mul(a,b); }
817template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); }
818
819
820template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
821{
822#ifndef EIGEN_VECTORIZE_VSX // VSX actually provides a div instruction
823 Packet4f t, y_0, y_1;
824
825 // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
826 y_0 = vec_re(b);
827
828 // Do one Newton-Raphson iteration to get the needed accuracy
829 t = vec_nmsub(y_0, b, p4f_ONE);
830 y_1 = vec_madd(y_0, t, y_0);
831
832 return vec_madd(a, y_1, p4f_MZERO);
833#else
834 return vec_div(a, b);
835#endif
836}
837
838template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
839{ eigen_assert(false && "packet integer division are not supported by AltiVec");
840 return pset1<Packet4i>(0);
841}
842
// for some weird reason, it has to be overloaded for packets of integers
844template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
845template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
846template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); }
847template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); }
848
849template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
850{
851 #ifdef EIGEN_VECTORIZE_VSX
852 // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
853 Packet4f ret;
854 __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
855 return ret;
856 #else
857 return vec_min(a, b);
858 #endif
859}
860template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
861template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); }
862template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); }
863template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); }
864template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); }
865
866
867template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
868{
869 #ifdef EIGEN_VECTORIZE_VSX
870 // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
871 Packet4f ret;
872 __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
873 return ret;
874 #else
875 return vec_max(a, b);
876 #endif
877}
878template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
879template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_max(a, b); }
880template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_max(a, b); }
881template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_max(a, b); }
882template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); }
883
884template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
885// To fix bug with vec_cmplt on older versions
886#if defined(__POWER8_VECTOR__) || EIGEN_COMP_LLVM
887template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
888#endif
889template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
890template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
891 Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
892 return vec_nor(c,c);
893}
894
895#ifdef __VSX__
896template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmple(a,b)); }
897#endif
898template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmplt(a,b)); }
899template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
900#ifdef __VSX__
901template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmple(a,b)); }
902#endif
903template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmplt(a,b)); }
904template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); }
905#ifdef __VSX__
906template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmple(a,b)); }
907#endif
908template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmplt(a,b)); }
909template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); }
910#ifdef __VSX__
911template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmple(a,b)); }
912#endif
913template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmplt(a,b)); }
914template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); }
915#ifdef __VSX__
916template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmple(a,b)); }
917#endif
918template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmplt(a,b)); }
919template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmpeq(a,b)); }
920
921template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
922template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
923template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vec_and(a, b); }
924template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_and(a, b); }
925template<> EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
926 return pand<Packet8us>(a, b);
927}
928
929
930template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
931template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
932template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_or(a, b); }
933template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_or(a, b); }
934template<> EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
935 return por<Packet8us>(a, b);
936}
937
938template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
939template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
940template<> EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
941 return pxor<Packet8us>(a, b);
942}
943
944template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_andc(a, b); }
945template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_andc(a, b); }
946
947template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
948 return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
949}
950
951template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
952{
953 Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
954 Packet4f res;
955
956#ifdef EIGEN_VECTORIZE_VSX
957 __asm__("xvrspiz %x0, %x1\n\t"
958 : "=&wa" (res)
959 : "wa" (t));
960#else
961 __asm__("vrfiz %0, %1\n\t"
962 : "=v" (res)
963 : "v" (t));
964#endif
965
966 return res;
967}
968template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); }
969template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
#ifdef __VSX__
// Rounds each lane to an integral value with the VSX xvrspic instruction
// (round using the current rounding mode). Only available with __VSX__.
template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
{
  Packet4f rounded;

  __asm__("xvrspic %x0, %x1\n\t"
          : "=&wa" (rounded)
          : "wa" (a));

  return rounded;
}
#endif
982
983template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
984{
985 EIGEN_DEBUG_UNALIGNED_LOAD
986#ifdef EIGEN_VECTORIZE_VSX
987 return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
988#else
989 Packet16uc mask = vec_lvsl(0, from); // create the permute mask
990 Packet16uc MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
991 Packet16uc LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
992 //TODO: Add static_cast here
993 return (Packet) vec_perm(MSQ, LSQ, mask); // align the data
994#endif
995}
996
997template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
998{
999 return ploadu_common<Packet4f>(from);
1000}
1001template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
1002{
1003 return ploadu_common<Packet4i>(from);
1004}
1005template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from)
1006{
1007 return ploadu_common<Packet8s>(from);
1008}
1009template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from)
1010{
1011 return ploadu_common<Packet8us>(from);
1012}
1013template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from)
1014{
1015 return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1016}
1017template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from)
1018{
1019 return ploadu_common<Packet16c>(from);
1020}
1021template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from)
1022{
1023 return ploadu_common<Packet16uc>(from);
1024}
1025
1026template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from)
1027{
1028 Packet p;
1029 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet>(from);
1030 else p = ploadu<Packet>(from);
1031 return vec_perm(p, p, p16uc_DUPLICATE32_HI);
1032}
1033template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
1034{
1035 return ploaddup_common<Packet4f>(from);
1036}
1037template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
1038{
1039 return ploaddup_common<Packet4i>(from);
1040}
1041
1042template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from)
1043{
1044 Packet8s p;
1045 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from);
1046 else p = ploadu<Packet8s>(from);
1047 return vec_perm(p, p, p16uc_DUPLICATE16_HI);
1048}
1049
1050template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from)
1051{
1052 Packet8us p;
1053 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from);
1054 else p = ploadu<Packet8us>(from);
1055 return vec_perm(p, p, p16uc_DUPLICATE16_HI);
1056}
1057
1058template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from)
1059{
1060 Packet8s p;
1061 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from);
1062 else p = ploadu<Packet8s>(from);
1063 return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1064}
1065
1066template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from)
1067{
1068 Packet8us p;
1069 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from);
1070 else p = ploadu<Packet8us>(from);
1071 return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1072}
1073
1074template<> EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from)
1075{
1076 return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1077}
1078
1079template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from)
1080{
1081 Packet16c p;
1082 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16c>(from);
1083 else p = ploadu<Packet16c>(from);
1084 return vec_perm(p, p, p16uc_DUPLICATE8_HI);
1085}
1086
1087template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from)
1088{
1089 Packet16uc p;
1090 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16uc>(from);
1091 else p = ploadu<Packet16uc>(from);
1092 return vec_perm(p, p, p16uc_DUPLICATE8_HI);
1093}
1094
1095template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from)
1096{
1097 EIGEN_DEBUG_UNALIGNED_STORE
1098#ifdef EIGEN_VECTORIZE_VSX
1099 vec_xst(from, 0, to);
1100#else
1101 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
1102 // Warning: not thread safe!
1103 Packet16uc MSQ, LSQ, edges;
1104 Packet16uc edgeAlign, align;
1105
1106 MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword
1107 LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword
1108 edgeAlign = vec_lvsl(0, to); // permute map to extract edges
1109 edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges
1110 align = vec_lvsr( 0, to ); // permute map to misalign data
1111 MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ)
1112 LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
1113 vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
1114 vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second
1115#endif
1116}
1117template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
1118{
1119 pstoreu_common<Packet4f>(to, from);
1120}
1121template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
1122{
1123 pstoreu_common<Packet4i>(to, from);
1124}
1125template<> EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from)
1126{
1127 pstoreu_common<Packet8s>(to, from);
1128}
1129template<> EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from)
1130{
1131 pstoreu_common<Packet8us>(to, from);
1132}
1133template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from)
1134{
1135 pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from);
1136}
1137template<> EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from)
1138{
1139 pstoreu_common<Packet16c>(to, from);
1140}
1141template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from)
1142{
1143 pstoreu_common<Packet16uc>(to, from);
1144}
1145
1146template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
1147template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
1148
1149template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
1150template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; }
1151
1152template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
1153 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
1154 vec_ste(a, 0, &x);
1155 return x;
1156}
1157
1158template<> EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
1159 return pfirst_common<Packet8s>(a);
1160}
1161
1162template<> EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
1163 return pfirst_common<Packet8us>(a);
1164}
1165
1166template<> EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a)
1167{
1168 return pfirst_common<Packet16c>(a);
1169}
1170
1171template<> EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a)
1172{
1173 return pfirst_common<Packet16uc>(a);
1174}
1175
1176template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
1177{
1178 return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1179}
1180template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
1181{
1182 return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1183}
1184template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
1185{
1186 return reinterpret_cast<Packet8s>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1187}
1188template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
1189{
1190 return reinterpret_cast<Packet8us>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1191}
1192template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
1193{
1194 return vec_perm(a, a, p16uc_REVERSE8);
1195}
1196template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
1197{
1198 return vec_perm(a, a, p16uc_REVERSE8);
1199}
1200template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
1201{
1202 return preverse<Packet8us>(a);
1203}
1204
1205template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
1206template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
1207template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vec_abs(a); }
1208template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
1209template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); }
1210template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
1211template<> EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
1212 _EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF);
1213 return pand<Packet8us>(p8us_abs_mask, a);
1214}
1215
1216template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a)
1217{ return vec_sra(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
1218template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a)
1219{ return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
1220template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a)
1221{ return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
1222template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a)
1223{
1224 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1225 Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
1226 return reinterpret_cast<Packet4f>(r);
1227}
1228
1229template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a)
1230{
1231 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1232 Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
1233 return reinterpret_cast<Packet4f>(r);
1234}
1235
1236template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a)
1237{
1238 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1239 return vec_sr(a, p4ui_mask);
1240}
1241
1242template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a)
1243{
1244 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1245 return vec_sl(a, p4ui_mask);
1246}
1247
1248template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a)
1249{
1250 const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1251 return vec_sl(a, p8us_mask);
1252}
1253template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a)
1254{
1255 const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1256 return vec_sr(a, p8us_mask);
1257}
1258
1259EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){
1260 return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
1261}
1262
1263EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf){
1264 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1265 return pand<Packet4f>(
1266 reinterpret_cast<Packet4f>(bf.m_val),
1267 reinterpret_cast<Packet4f>(p4ui_high_mask)
1268 );
1269}
1270
1271// Simple interleaving of bool masks, prevents true values from being
1272// converted to NaNs.
1273EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
1274 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1275 Packet4f bf_odd, bf_even;
1276 bf_odd = pand(reinterpret_cast<Packet4f>(p4ui_high_mask), odd);
1277 bf_even = plogical_shift_right<16>(even);
1278 return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd));
1279}
1280
1281EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){
1282 Packet4ui input = reinterpret_cast<Packet4ui>(p4f);
1283 Packet4ui lsb = plogical_shift_right<16>(input);
1284 lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));
1285
1286 _EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu);
1287 Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
1288 input = padd<Packet4ui>(input, rounding_bias);
1289
1290 //Test NaN and Subnormal - Begin
1291 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
1292 Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));
1293
1294 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
1295 Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f));
1296
1297 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(max_exp, 0x7F800000);
1298 Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp);
1299 Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
1300
1301 Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));
1302 Packet4ui nan_selector = pandnot<Packet4ui>(
1303 reinterpret_cast<Packet4ui>(is_max_exp),
1304 reinterpret_cast<Packet4ui>(is_mant_zero)
1305 );
1306
1307 Packet4ui subnormal_selector = pandnot<Packet4ui>(
1308 reinterpret_cast<Packet4ui>(is_zero_exp),
1309 reinterpret_cast<Packet4ui>(is_mant_zero)
1310 );
1311
1312 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
1313 input = vec_sel(input, p4ui_nan, nan_selector);
1314 input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
1315 //Test NaN and Subnormal - End
1316
1317 input = plogical_shift_right<16>(input);
1318 return reinterpret_cast<Packet8us>(input);
1319}
1320
1321EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd){
1322 Packet4f bf_odd, bf_even;
1323 bf_odd = reinterpret_cast<Packet4f>(F32ToBf16(odd).m_val);
1324 bf_odd = plogical_shift_left<16>(bf_odd);
1325 bf_even = reinterpret_cast<Packet4f>(F32ToBf16(even).m_val);
1326 return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd));
1327}
// Function-body macro: widens the bfloat16 packet A into its even/odd float
// halves, applies the unary float operation OP to each half and returns the
// repacked bfloat16 result. Contains a `return`, so it must be the last
// statement of the enclosing function.
#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
  Packet4f a_even = Bf16ToF32Even(A); \
  Packet4f a_odd = Bf16ToF32Odd(A); \
  Packet4f op_even = OP(a_even); \
  Packet4f op_odd = OP(a_odd); \
  return F32ToBf16(op_even, op_odd);
1334
// Function-body macro: widens the bfloat16 packets A and B into even/odd
// float halves, applies the binary float operation OP to matching halves and
// returns the repacked bfloat16 result. Contains a `return`, so it must be
// the last statement of the enclosing function.
#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
  Packet4f a_even = Bf16ToF32Even(A); \
  Packet4f a_odd = Bf16ToF32Odd(A); \
  Packet4f b_even = Bf16ToF32Even(B); \
  Packet4f b_odd = Bf16ToF32Odd(B); \
  Packet4f op_even = OP(a_even, b_even); \
  Packet4f op_odd = OP(a_odd, b_odd); \
  return F32ToBf16(op_even, op_odd);
1343
// Same as BF16_TO_F32_BINARY_OP_WRAPPER, but the two float results are
// repacked with F32ToBf16Bool (plain bit interleaving), which is what the
// comparison wrappers use for their mask results.
#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
  Packet4f a_even = Bf16ToF32Even(A); \
  Packet4f a_odd = Bf16ToF32Odd(A); \
  Packet4f b_even = Bf16ToF32Even(B); \
  Packet4f b_odd = Bf16ToF32Odd(B); \
  Packet4f op_even = OP(a_even, b_even); \
  Packet4f op_odd = OP(a_odd, b_odd); \
  return F32ToBf16Bool(op_even, op_odd);
1352
1353template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1354 BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
1355}
1356
1357template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1358 BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
1359}
1360
1361template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1362 BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
1363}
1364
1365template<> EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
1366 BF16_TO_F32_UNARY_OP_WRAPPER(pnegate<Packet4f>, a);
1367}
1368
1369template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1370 BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
1371}
1372
1373template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
1374 return pldexp_generic(a,exponent);
1375}
1376template<> EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf> (const Packet8bf& a, const Packet8bf& exponent){
1377 BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
1378}
1379
1380template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
1381 return pfrexp_generic(a,exponent);
1382}
1383template<> EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf> (const Packet8bf& a, Packet8bf& e){
1384 Packet4f a_even = Bf16ToF32Even(a);
1385 Packet4f a_odd = Bf16ToF32Odd(a);
1386 Packet4f e_even;
1387 Packet4f e_odd;
1388 Packet4f op_even = pfrexp<Packet4f>(a_even, e_even);
1389 Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd);
1390 e = F32ToBf16(e_even, e_odd);
1391 return F32ToBf16(op_even, op_odd);
1392}
1393
1394template<> EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf> (const Packet8bf& a){
1395 BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
1396}
1397template<> EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf> (const Packet8bf& a){
1398 BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
1399}
1400template<> EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf> (const Packet8bf& a){
1401 BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
1402}
1403template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf> (const Packet8bf& a){
1404 BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
1405}
1406template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf> (const Packet8bf& a){
1407 BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
1408}
1409template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){
1410 BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
1411}
#ifdef __VSX__
// bfloat16 print: applies print<Packet4f> to the even/odd float halves and
// converts back. Only available with __VSX__, like print<Packet4f>.
template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (const Packet8bf& a)
{
  BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
}
#endif
1417template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
1418 Packet4f a_even = Bf16ToF32Even(a);
1419 Packet4f a_odd = Bf16ToF32Odd(a);
1420 Packet4f b_even = Bf16ToF32Even(b);
1421 Packet4f b_odd = Bf16ToF32Odd(b);
1422 Packet4f c_even = Bf16ToF32Even(c);
1423 Packet4f c_odd = Bf16ToF32Odd(c);
1424 Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even);
1425 Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd);
1426 return F32ToBf16(pmadd_even, pmadd_odd);
1427}
1428
1429template<> EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1430 BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
1431}
1432
1433template<> EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1434 BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
1435}
1436
1437template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
1438 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
1439}
1440template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
1441 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
1442}
1443template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
1444 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
1445}
1446template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
1447 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
1448}
1449
1450template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
1451 return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
1452}
1453
1454template<> EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from)
1455{
1456 return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1457}
1458
1459template<> EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
1460 bfloat16 countdown[8] = { bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
1461 bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7) };
1462 return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
1463}
1464
1465template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
1466{
1467 Packet4f b, sum;
1468 b = vec_sld(a, a, 8);
1469 sum = a + b;
1470 b = vec_sld(sum, sum, 4);
1471 sum += b;
1472 return pfirst(sum);
1473}
1474
1475template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
1476{
1477 Packet4i sum;
1478 sum = vec_sums(a, p4i_ZERO);
1479#ifdef _BIG_ENDIAN
1480 sum = vec_sld(sum, p4i_ZERO, 12);
1481#else
1482 sum = vec_sld(p4i_ZERO, sum, 4);
1483#endif
1484 return pfirst(sum);
1485}
1486
1487template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a)
1488{
1489 float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
1490 float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
1491 float f32_result = redux_even + redux_odd;
1492 return bfloat16(f32_result);
1493}
1494template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a)
1495{
1496 union{
1497 Packet v;
1498 __UNPACK_TYPE__(Packet) n[8];
1499 } vt;
1500 vt.v = a;
1501
1502 EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
1503 EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
1504 Packet4i first_half = pload<Packet4i>(first_loader);
1505 Packet4i second_half = pload<Packet4i>(second_loader);
1506
1507 return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
1508}
1509
1510template<> EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a)
1511{
1512 return predux_size8<Packet8s>(a);
1513}
1514
1515template<> EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a)
1516{
1517 return predux_size8<Packet8us>(a);
1518}
1519
1520template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a)
1521{
1522 union{
1523 Packet v;
1524 __UNPACK_TYPE__(Packet) n[16];
1525 } vt;
1526 vt.v = a;
1527
1528 EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
1529 EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
1530 EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] };
1531 EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] };
1532
1533 Packet4i first_quarter = pload<Packet4i>(first_loader);
1534 Packet4i second_quarter = pload<Packet4i>(second_loader);
1535 Packet4i third_quarter = pload<Packet4i>(third_loader);
1536 Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
1537
1538 return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter)
1539 + predux(third_quarter) + predux(fourth_quarter));
1540}
1541
1542template<> EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a)
1543{
1544 return predux_size16<Packet16c>(a);
1545}
1546
1547template<> EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a)
1548{
1549 return predux_size16<Packet16uc>(a);
1550}
1551
1552// Other reduction functions:
1553// mul
1554template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
1555{
1556 Packet4f prod;
1557 prod = pmul(a, vec_sld(a, a, 8));
1558 return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
1559}
1560
1561template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
1562{
1563 EIGEN_ALIGN16 int aux[4];
1564 pstore(aux, a);
1565 return aux[0] * aux[1] * aux[2] * aux[3];
1566}
1567
1568template<> EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a)
1569{
1570 Packet8s pair, quad, octo;
1571
1572 pair = vec_mul(a, vec_sld(a, a, 8));
1573 quad = vec_mul(pair, vec_sld(pair, pair, 4));
1574 octo = vec_mul(quad, vec_sld(quad, quad, 2));
1575
1576 return pfirst(octo);
1577}
1578
1579template<> EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a)
1580{
1581 Packet8us pair, quad, octo;
1582
1583 pair = vec_mul(a, vec_sld(a, a, 8));
1584 quad = vec_mul(pair, vec_sld(pair, pair, 4));
1585 octo = vec_mul(quad, vec_sld(quad, quad, 2));
1586
1587 return pfirst(octo);
1588}
1589
1590template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a)
1591{
1592 float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
1593 float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
1594 float f32_result = redux_even * redux_odd;
1595 return bfloat16(f32_result);
1596}
1597
1598
1599template<> EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a)
1600{
1601 Packet16c pair, quad, octo, result;
1602
1603 pair = vec_mul(a, vec_sld(a, a, 8));
1604 quad = vec_mul(pair, vec_sld(pair, pair, 4));
1605 octo = vec_mul(quad, vec_sld(quad, quad, 2));
1606 result = vec_mul(octo, vec_sld(octo, octo, 1));
1607
1608 return pfirst(result);
1609}
1610
1611template<> EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a)
1612{
1613 Packet16uc pair, quad, octo, result;
1614
1615 pair = vec_mul(a, vec_sld(a, a, 8));
1616 quad = vec_mul(pair, vec_sld(pair, pair, 4));
1617 octo = vec_mul(quad, vec_sld(quad, quad, 2));
1618 result = vec_mul(octo, vec_sld(octo, octo, 1));
1619
1620 return pfirst(result);
1621}
1622
1623// min
1624template<typename Packet> EIGEN_STRONG_INLINE
1625__UNPACK_TYPE__(Packet) predux_min4(const Packet& a)
1626{
1627 Packet b, res;
1628 b = vec_min(a, vec_sld(a, a, 8));
1629 res = vec_min(b, vec_sld(b, b, 4));
1630 return pfirst(res);
1631}
1632
1633
1634template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
1635{
1636 return predux_min4<Packet4f>(a);
1637}
1638
1639template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
1640{
1641 return predux_min4<Packet4i>(a);
1642}
1643
1644template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a)
1645{
1646 float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
1647 float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
1648 float f32_result = (std::min)(redux_even, redux_odd);
1649 return bfloat16(f32_result);
1650}
1651
1652template<> EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a)
1653{
1654 Packet8s pair, quad, octo;
1655
1656 //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
1657 pair = vec_min(a, vec_sld(a, a, 8));
1658
1659 //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
1660 quad = vec_min(pair, vec_sld(pair, pair, 4));
1661
1662 //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
1663 octo = vec_min(quad, vec_sld(quad, quad, 2));
1664 return pfirst(octo);
1665}
1666
1667template<> EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a)
1668{
1669 Packet8us pair, quad, octo;
1670
1671 //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
1672 pair = vec_min(a, vec_sld(a, a, 8));
1673
1674 //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
1675 quad = vec_min(pair, vec_sld(pair, pair, 4));
1676
1677 //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
1678 octo = vec_min(quad, vec_sld(quad, quad, 2));
1679 return pfirst(octo);
1680}
1681
1682template<> EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a)
1683{
1684 Packet16c pair, quad, octo, result;
1685
1686 pair = vec_min(a, vec_sld(a, a, 8));
1687 quad = vec_min(pair, vec_sld(pair, pair, 4));
1688 octo = vec_min(quad, vec_sld(quad, quad, 2));
1689 result = vec_min(octo, vec_sld(octo, octo, 1));
1690
1691 return pfirst(result);
1692}
1693
1694template<> EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a)
1695{
1696 Packet16uc pair, quad, octo, result;
1697
1698 pair = vec_min(a, vec_sld(a, a, 8));
1699 quad = vec_min(pair, vec_sld(pair, pair, 4));
1700 octo = vec_min(quad, vec_sld(quad, quad, 2));
1701 result = vec_min(octo, vec_sld(octo, octo, 1));
1702
1703 return pfirst(result);
1704}
1705// max
1706template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a)
1707{
1708 Packet b, res;
1709 b = vec_max(a, vec_sld(a, a, 8));
1710 res = vec_max(b, vec_sld(b, b, 4));
1711 return pfirst(res);
1712}
1713
1714template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
1715{
1716 return predux_max4<Packet4f>(a);
1717}
1718
1719template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
1720{
1721 return predux_max4<Packet4i>(a);
1722}
1723
1724template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a)
1725{
1726 float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
1727 float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
1728 float f32_result = (std::max)(redux_even, redux_odd);
1729 return bfloat16(f32_result);
1730}
1731
1732template<> EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a)
1733{
1734 Packet8s pair, quad, octo;
1735
1736 //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
1737 pair = vec_max(a, vec_sld(a, a, 8));
1738
1739 //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
1740 quad = vec_max(pair, vec_sld(pair, pair, 4));
1741
1742 //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
1743 octo = vec_max(quad, vec_sld(quad, quad, 2));
1744 return pfirst(octo);
1745}
1746
1747template<> EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a)
1748{
1749 Packet8us pair, quad, octo;
1750
1751 //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
1752 pair = vec_max(a, vec_sld(a, a, 8));
1753
1754 //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
1755 quad = vec_max(pair, vec_sld(pair, pair, 4));
1756
1757 //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
1758 octo = vec_max(quad, vec_sld(quad, quad, 2));
1759 return pfirst(octo);
1760}
1761
1762template<> EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a)
1763{
1764 Packet16c pair, quad, octo, result;
1765
1766 pair = vec_max(a, vec_sld(a, a, 8));
1767 quad = vec_max(pair, vec_sld(pair, pair, 4));
1768 octo = vec_max(quad, vec_sld(quad, quad, 2));
1769 result = vec_max(octo, vec_sld(octo, octo, 1));
1770
1771 return pfirst(result);
1772}
1773
1774template<> EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a)
1775{
1776 Packet16uc pair, quad, octo, result;
1777
1778 pair = vec_max(a, vec_sld(a, a, 8));
1779 quad = vec_max(pair, vec_sld(pair, pair, 4));
1780 octo = vec_max(quad, vec_sld(quad, quad, 2));
1781 result = vec_max(octo, vec_sld(octo, octo, 1));
1782
1783 return pfirst(result);
1784}
1785
1786template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
1787{
1788 return vec_any_ne(x, pzero(x));
1789}
1790
1791template <typename T> EIGEN_DEVICE_FUNC inline void
1792ptranpose_common(PacketBlock<T,4>& kernel){
1793 T t0, t1, t2, t3;
1794 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1795 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1796 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1797 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1798 kernel.packet[0] = vec_mergeh(t0, t2);
1799 kernel.packet[1] = vec_mergel(t0, t2);
1800 kernel.packet[2] = vec_mergeh(t1, t3);
1801 kernel.packet[3] = vec_mergel(t1, t3);
1802}
1803
1804EIGEN_DEVICE_FUNC inline void
1805ptranspose(PacketBlock<Packet4f,4>& kernel) {
1806 ptranpose_common<Packet4f>(kernel);
1807}
1808
1809EIGEN_DEVICE_FUNC inline void
1810ptranspose(PacketBlock<Packet4i,4>& kernel) {
1811 ptranpose_common<Packet4i>(kernel);
1812}
1813
1814EIGEN_DEVICE_FUNC inline void
1815ptranspose(PacketBlock<Packet8s,4>& kernel) {
1816 Packet8s t0, t1, t2, t3;
1817 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1818 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1819 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1820 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1821 kernel.packet[0] = vec_mergeh(t0, t2);
1822 kernel.packet[1] = vec_mergel(t0, t2);
1823 kernel.packet[2] = vec_mergeh(t1, t3);
1824 kernel.packet[3] = vec_mergel(t1, t3);
1825}
1826
1827EIGEN_DEVICE_FUNC inline void
1828ptranspose(PacketBlock<Packet8us,4>& kernel) {
1829 Packet8us t0, t1, t2, t3;
1830 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1831 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1832 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1833 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1834 kernel.packet[0] = vec_mergeh(t0, t2);
1835 kernel.packet[1] = vec_mergel(t0, t2);
1836 kernel.packet[2] = vec_mergeh(t1, t3);
1837 kernel.packet[3] = vec_mergel(t1, t3);
1838}
1839
1840
1841EIGEN_DEVICE_FUNC inline void
1842ptranspose(PacketBlock<Packet8bf,4>& kernel) {
1843 Packet8us t0, t1, t2, t3;
1844
1845 t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
1846 t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val);
1847 t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val);
1848 t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val);
1849 kernel.packet[0] = vec_mergeh(t0, t2);
1850 kernel.packet[1] = vec_mergel(t0, t2);
1851 kernel.packet[2] = vec_mergeh(t1, t3);
1852 kernel.packet[3] = vec_mergel(t1, t3);
1853}
1854
1855EIGEN_DEVICE_FUNC inline void
1856ptranspose(PacketBlock<Packet16c,4>& kernel) {
1857 Packet16c t0, t1, t2, t3;
1858 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1859 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1860 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1861 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1862 kernel.packet[0] = vec_mergeh(t0, t2);
1863 kernel.packet[1] = vec_mergel(t0, t2);
1864 kernel.packet[2] = vec_mergeh(t1, t3);
1865 kernel.packet[3] = vec_mergel(t1, t3);
1866}
1867
1868
1869EIGEN_DEVICE_FUNC inline void
1870ptranspose(PacketBlock<Packet16uc,4>& kernel) {
1871 Packet16uc t0, t1, t2, t3;
1872 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1873 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1874 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1875 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1876 kernel.packet[0] = vec_mergeh(t0, t2);
1877 kernel.packet[1] = vec_mergel(t0, t2);
1878 kernel.packet[2] = vec_mergeh(t1, t3);
1879 kernel.packet[3] = vec_mergel(t1, t3);
1880}
1881
1882EIGEN_DEVICE_FUNC inline void
1883ptranspose(PacketBlock<Packet8s,8>& kernel) {
1884 Packet8s v[8], sum[8];
1885
1886 v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
1887 v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
1888 v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
1889 v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
1890 v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
1891 v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
1892 v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
1893 v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
1894 sum[0] = vec_mergeh(v[0], v[4]);
1895 sum[1] = vec_mergel(v[0], v[4]);
1896 sum[2] = vec_mergeh(v[1], v[5]);
1897 sum[3] = vec_mergel(v[1], v[5]);
1898 sum[4] = vec_mergeh(v[2], v[6]);
1899 sum[5] = vec_mergel(v[2], v[6]);
1900 sum[6] = vec_mergeh(v[3], v[7]);
1901 sum[7] = vec_mergel(v[3], v[7]);
1902
1903 kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
1904 kernel.packet[1] = vec_mergel(sum[0], sum[4]);
1905 kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
1906 kernel.packet[3] = vec_mergel(sum[1], sum[5]);
1907 kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
1908 kernel.packet[5] = vec_mergel(sum[2], sum[6]);
1909 kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
1910 kernel.packet[7] = vec_mergel(sum[3], sum[7]);
1911}
1912
1913EIGEN_DEVICE_FUNC inline void
1914ptranspose(PacketBlock<Packet8us,8>& kernel) {
1915 Packet8us v[8], sum[8];
1916
1917 v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
1918 v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
1919 v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
1920 v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
1921 v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
1922 v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
1923 v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
1924 v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
1925 sum[0] = vec_mergeh(v[0], v[4]);
1926 sum[1] = vec_mergel(v[0], v[4]);
1927 sum[2] = vec_mergeh(v[1], v[5]);
1928 sum[3] = vec_mergel(v[1], v[5]);
1929 sum[4] = vec_mergeh(v[2], v[6]);
1930 sum[5] = vec_mergel(v[2], v[6]);
1931 sum[6] = vec_mergeh(v[3], v[7]);
1932 sum[7] = vec_mergel(v[3], v[7]);
1933
1934 kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
1935 kernel.packet[1] = vec_mergel(sum[0], sum[4]);
1936 kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
1937 kernel.packet[3] = vec_mergel(sum[1], sum[5]);
1938 kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
1939 kernel.packet[5] = vec_mergel(sum[2], sum[6]);
1940 kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
1941 kernel.packet[7] = vec_mergel(sum[3], sum[7]);
1942}
1943
1944EIGEN_DEVICE_FUNC inline void
1945ptranspose(PacketBlock<Packet8bf,8>& kernel) {
1946 Packet8bf v[8], sum[8];
1947
1948 v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
1949 v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val);
1950 v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val);
1951 v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val);
1952 v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val);
1953 v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val);
1954 v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val);
1955 v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val);
1956 sum[0] = vec_mergeh(v[0].m_val, v[4].m_val);
1957 sum[1] = vec_mergel(v[0].m_val, v[4].m_val);
1958 sum[2] = vec_mergeh(v[1].m_val, v[5].m_val);
1959 sum[3] = vec_mergel(v[1].m_val, v[5].m_val);
1960 sum[4] = vec_mergeh(v[2].m_val, v[6].m_val);
1961 sum[5] = vec_mergel(v[2].m_val, v[6].m_val);
1962 sum[6] = vec_mergeh(v[3].m_val, v[7].m_val);
1963 sum[7] = vec_mergel(v[3].m_val, v[7].m_val);
1964
1965 kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val);
1966 kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val);
1967 kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val);
1968 kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val);
1969 kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val);
1970 kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val);
1971 kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val);
1972 kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
1973}
1974
1975EIGEN_DEVICE_FUNC inline void
1976ptranspose(PacketBlock<Packet16c,16>& kernel) {
1977 Packet16c step1[16], step2[16], step3[16];
1978
1979 step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
1980 step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
1981 step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
1982 step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
1983 step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
1984 step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
1985 step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
1986 step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
1987 step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
1988 step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
1989 step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
1990 step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
1991 step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
1992 step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
1993 step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
1994 step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
1995
1996 step2[0] = vec_mergeh(step1[0], step1[8]);
1997 step2[1] = vec_mergel(step1[0], step1[8]);
1998 step2[2] = vec_mergeh(step1[1], step1[9]);
1999 step2[3] = vec_mergel(step1[1], step1[9]);
2000 step2[4] = vec_mergeh(step1[2], step1[10]);
2001 step2[5] = vec_mergel(step1[2], step1[10]);
2002 step2[6] = vec_mergeh(step1[3], step1[11]);
2003 step2[7] = vec_mergel(step1[3], step1[11]);
2004 step2[8] = vec_mergeh(step1[4], step1[12]);
2005 step2[9] = vec_mergel(step1[4], step1[12]);
2006 step2[10] = vec_mergeh(step1[5], step1[13]);
2007 step2[11] = vec_mergel(step1[5], step1[13]);
2008 step2[12] = vec_mergeh(step1[6], step1[14]);
2009 step2[13] = vec_mergel(step1[6], step1[14]);
2010 step2[14] = vec_mergeh(step1[7], step1[15]);
2011 step2[15] = vec_mergel(step1[7], step1[15]);
2012
2013 step3[0] = vec_mergeh(step2[0], step2[8]);
2014 step3[1] = vec_mergel(step2[0], step2[8]);
2015 step3[2] = vec_mergeh(step2[1], step2[9]);
2016 step3[3] = vec_mergel(step2[1], step2[9]);
2017 step3[4] = vec_mergeh(step2[2], step2[10]);
2018 step3[5] = vec_mergel(step2[2], step2[10]);
2019 step3[6] = vec_mergeh(step2[3], step2[11]);
2020 step3[7] = vec_mergel(step2[3], step2[11]);
2021 step3[8] = vec_mergeh(step2[4], step2[12]);
2022 step3[9] = vec_mergel(step2[4], step2[12]);
2023 step3[10] = vec_mergeh(step2[5], step2[13]);
2024 step3[11] = vec_mergel(step2[5], step2[13]);
2025 step3[12] = vec_mergeh(step2[6], step2[14]);
2026 step3[13] = vec_mergel(step2[6], step2[14]);
2027 step3[14] = vec_mergeh(step2[7], step2[15]);
2028 step3[15] = vec_mergel(step2[7], step2[15]);
2029
2030 kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
2031 kernel.packet[1] = vec_mergel(step3[0], step3[8]);
2032 kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
2033 kernel.packet[3] = vec_mergel(step3[1], step3[9]);
2034 kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
2035 kernel.packet[5] = vec_mergel(step3[2], step3[10]);
2036 kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
2037 kernel.packet[7] = vec_mergel(step3[3], step3[11]);
2038 kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
2039 kernel.packet[9] = vec_mergel(step3[4], step3[12]);
2040 kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
2041 kernel.packet[11] = vec_mergel(step3[5], step3[13]);
2042 kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
2043 kernel.packet[13] = vec_mergel(step3[6], step3[14]);
2044 kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
2045 kernel.packet[15] = vec_mergel(step3[7], step3[15]);
2046}
2047
2048EIGEN_DEVICE_FUNC inline void
2049ptranspose(PacketBlock<Packet16uc,16>& kernel) {
2050 Packet16uc step1[16], step2[16], step3[16];
2051
2052 step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
2053 step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
2054 step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
2055 step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
2056 step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
2057 step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
2058 step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
2059 step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
2060 step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
2061 step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
2062 step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
2063 step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
2064 step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
2065 step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
2066 step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
2067 step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
2068
2069 step2[0] = vec_mergeh(step1[0], step1[8]);
2070 step2[1] = vec_mergel(step1[0], step1[8]);
2071 step2[2] = vec_mergeh(step1[1], step1[9]);
2072 step2[3] = vec_mergel(step1[1], step1[9]);
2073 step2[4] = vec_mergeh(step1[2], step1[10]);
2074 step2[5] = vec_mergel(step1[2], step1[10]);
2075 step2[6] = vec_mergeh(step1[3], step1[11]);
2076 step2[7] = vec_mergel(step1[3], step1[11]);
2077 step2[8] = vec_mergeh(step1[4], step1[12]);
2078 step2[9] = vec_mergel(step1[4], step1[12]);
2079 step2[10] = vec_mergeh(step1[5], step1[13]);
2080 step2[11] = vec_mergel(step1[5], step1[13]);
2081 step2[12] = vec_mergeh(step1[6], step1[14]);
2082 step2[13] = vec_mergel(step1[6], step1[14]);
2083 step2[14] = vec_mergeh(step1[7], step1[15]);
2084 step2[15] = vec_mergel(step1[7], step1[15]);
2085
2086 step3[0] = vec_mergeh(step2[0], step2[8]);
2087 step3[1] = vec_mergel(step2[0], step2[8]);
2088 step3[2] = vec_mergeh(step2[1], step2[9]);
2089 step3[3] = vec_mergel(step2[1], step2[9]);
2090 step3[4] = vec_mergeh(step2[2], step2[10]);
2091 step3[5] = vec_mergel(step2[2], step2[10]);
2092 step3[6] = vec_mergeh(step2[3], step2[11]);
2093 step3[7] = vec_mergel(step2[3], step2[11]);
2094 step3[8] = vec_mergeh(step2[4], step2[12]);
2095 step3[9] = vec_mergel(step2[4], step2[12]);
2096 step3[10] = vec_mergeh(step2[5], step2[13]);
2097 step3[11] = vec_mergel(step2[5], step2[13]);
2098 step3[12] = vec_mergeh(step2[6], step2[14]);
2099 step3[13] = vec_mergel(step2[6], step2[14]);
2100 step3[14] = vec_mergeh(step2[7], step2[15]);
2101 step3[15] = vec_mergel(step2[7], step2[15]);
2102
2103 kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
2104 kernel.packet[1] = vec_mergel(step3[0], step3[8]);
2105 kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
2106 kernel.packet[3] = vec_mergel(step3[1], step3[9]);
2107 kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
2108 kernel.packet[5] = vec_mergel(step3[2], step3[10]);
2109 kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
2110 kernel.packet[7] = vec_mergel(step3[3], step3[11]);
2111 kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
2112 kernel.packet[9] = vec_mergel(step3[4], step3[12]);
2113 kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
2114 kernel.packet[11] = vec_mergel(step3[5], step3[13]);
2115 kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
2116 kernel.packet[13] = vec_mergel(step3[6], step3[14]);
2117 kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
2118 kernel.packet[15] = vec_mergel(step3[7], step3[15]);
2119}
2120
2121template<typename Packet> EIGEN_STRONG_INLINE
2122Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
2123 Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
2124 Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
2125 return vec_sel(elsePacket, thenPacket, mask);
2126}
2127
2128template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
2129 return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
2130}
2131
2132template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
2133 return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
2134}
2135
2136template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) {
2137 Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
2138 ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
2139 Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(select, p8us_ONE));
2140 Packet8s result = vec_sel(elsePacket, thenPacket, mask);
2141 return result;
2142}
2143
2144template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) {
2145 Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
2146 ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
2147 Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(reinterpret_cast<Packet8us>(select), p8us_ONE));
2148 return vec_sel(elsePacket, thenPacket, mask);
2149}
2150
2151template<> EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket, const Packet8bf& elsePacket) {
2152 return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
2153}
2154
2155template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket, const Packet16c& elsePacket) {
2156 Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
2157 ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
2158 ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
2159 ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
2160
2161 Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
2162 return vec_sel(elsePacket, thenPacket, mask);
2163}
2164
2165template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) {
2166 Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
2167 ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
2168 ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
2169 ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
2170
2171 Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
2172 return vec_sel(elsePacket, thenPacket, mask);
2173}
2174
2175template <>
2176struct type_casting_traits<float, int> {
2177 enum {
2178 VectorizedCast = 1,
2179 SrcCoeffRatio = 1,
2180 TgtCoeffRatio = 1
2181 };
2182};
2183
2184template <>
2185struct type_casting_traits<int, float> {
2186 enum {
2187 VectorizedCast = 1,
2188 SrcCoeffRatio = 1,
2189 TgtCoeffRatio = 1
2190 };
2191};
2192
2193template <>
2195 enum {
2196 VectorizedCast = 1,
2197 SrcCoeffRatio = 1,
2198 TgtCoeffRatio = 1
2199 };
2200};
2201
2202template <>
2204 enum {
2205 VectorizedCast = 1,
2206 SrcCoeffRatio = 1,
2207 TgtCoeffRatio = 1
2208 };
2209};
2210
2211template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
2212 return vec_cts(a,0);
2213}
2214
2215template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
2216 return vec_ctu(a,0);
2217}
2218
2219template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
2220 return vec_ctf(a,0);
2221}
2222
2223template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
2224 return vec_ctf(a,0);
2225}
2226
2227template<> EIGEN_STRONG_INLINE Packet8us pcast<Packet8bf, Packet8us>(const Packet8bf& a) {
2228 Packet4f float_even = Bf16ToF32Even(a);
2229 Packet4f float_odd = Bf16ToF32Odd(a);
2230 Packet4ui int_even = pcast<Packet4f, Packet4ui>(float_even);
2231 Packet4ui int_odd = pcast<Packet4f, Packet4ui>(float_odd);
2232 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
2233 Packet4ui low_even = pand<Packet4ui>(int_even, p4ui_low_mask);
2234 Packet4ui low_odd = pand<Packet4ui>(int_odd, p4ui_low_mask);
2235
2236 //Check values that are bigger than USHRT_MAX (0xFFFF)
2237 Packet4bi overflow_selector;
2238 if(vec_any_gt(int_even, p4ui_low_mask)){
2239 overflow_selector = vec_cmpgt(int_even, p4ui_low_mask);
2240 low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector);
2241 }
2242 if(vec_any_gt(int_odd, p4ui_low_mask)){
2243 overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask);
2244 low_odd = vec_sel(low_even, p4ui_low_mask, overflow_selector);
2245 }
2246
2247 low_odd = plogical_shift_left<16>(low_odd);
2248
2249 Packet4ui int_final = por<Packet4ui>(low_even, low_odd);
2250 return reinterpret_cast<Packet8us>(int_final);
2251}
2252
2253template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8us, Packet8bf>(const Packet8us& a) {
2254 //short -> int -> float -> bfloat16
2255 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
2256 Packet4ui int_cast = reinterpret_cast<Packet4ui>(a);
2257 Packet4ui int_even = pand<Packet4ui>(int_cast, p4ui_low_mask);
2258 Packet4ui int_odd = plogical_shift_right<16>(int_cast);
2259 Packet4f float_even = pcast<Packet4ui, Packet4f>(int_even);
2260 Packet4f float_odd = pcast<Packet4ui, Packet4f>(int_odd);
2261 return F32ToBf16(float_even, float_odd);
2262}
2263
2264
2265template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
2266 return reinterpret_cast<Packet4i>(a);
2267}
2268
2269template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
2270 return reinterpret_cast<Packet4f>(a);
2271}
2272
2273
2274
2275//---------- double ----------
2276#ifdef EIGEN_VECTORIZE_VSX
// Two-lane 64-bit packet types used by the VSX double-precision code below.
2277typedef __vector double Packet2d;
2278typedef __vector unsigned long long Packet2ul;
2279typedef __vector long long Packet2l;
// Boolean/mask packet type: Clang builds alias it to Packet2ul, every other
// compiler uses the native __vector __bool long.
2280#if EIGEN_COMP_CLANG
2281typedef Packet2ul Packet2bl;
2282#else
2283typedef __vector __bool long Packet2bl;
2284#endif
2285
// Constant packets shared by the Packet2d routines.
2286static Packet2l p2l_ONE = { 1, 1 };
2287static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
// Only bit 63 set in each lane.
2288static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull };
// Bit pattern one below 0x3FE0000000000000 (the encoding of 0.5), i.e. the
// largest double strictly smaller than 0.5.
2289static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull };
2290static Packet2d p2d_ONE = { 1.0, 1.0 };
2291static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
// Both lanes carry the bit pattern with only the top bit set (negative zero).
2292static Packet2d p2d_MZERO = { numext::bit_cast<double>(0x8000000000000000ull),
2293 numext::bit_cast<double>(0x8000000000000000ull) };
2294
// p2d_COUNTDOWN splices 8 bytes of p2d_ZERO with 8 bytes of p2d_ONE via vec_sld;
// the operand order is swapped between big- and little-endian builds.
// NOTE(review): presumably this makes the lanes read {0.0, 1.0} on both
// endiannesses (plset adds this constant to a broadcast value) - confirm.
2295#ifdef _BIG_ENDIAN
2296static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
2297#else
2298static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
2299#endif
2300
2301template<int index> Packet2d vec_splat_dbl(Packet2d& a)
2302{
2303 return vec_splat(a, index);
2304}
2305
// Packet traits for double: the packet type is Packet2d with two lanes, and
// the enum below lists which vectorized operations are flagged as available
// (1) or unavailable (0) for it.
2306template<> struct packet_traits<double> : default_packet_traits
2307{
2308 typedef Packet2d type;
 // The half-packet type is declared as Packet2d as well.
2309 typedef Packet2d half;
2310 enum {
2311 Vectorizable = 1,
2312 AlignedOnScalar = 1,
2313 size=2,
2314 HasHalfPacket = 1,
2315
2316 HasAdd = 1,
2317 HasSub = 1,
2318 HasMul = 1,
2319 HasDiv = 1,
2320 HasMin = 1,
2321 HasMax = 1,
2322 HasAbs = 1,
2323 HasSin = 0,
2324 HasCos = 0,
2325 HasLog = 0,
2326 HasExp = 1,
2327 HasSqrt = 1,
 // HasRsqrt is switched off when compiling with Clang (EIGEN_COMP_CLANG).
2328#if !EIGEN_COMP_CLANG
2329 HasRsqrt = 1,
2330#else
2331 HasRsqrt = 0,
2332#endif
2333 HasRound = 1,
2334 HasFloor = 1,
2335 HasCeil = 1,
2336 HasRint = 1,
2337 HasNegate = 1,
2338 HasBlend = 1
2339 };
2340};
2341
2342template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
2343
2344inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
2345{
2346 union {
2347 Packet2l v;
2348 int64_t n[2];
2349 } vt;
2350 vt.v = v;
2351 s << vt.n[0] << ", " << vt.n[1];
2352 return s;
2353}
2354
2355inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
2356{
2357 union {
2358 Packet2d v;
2359 double n[2];
2360 } vt;
2361 vt.v = v;
2362 s << vt.n[0] << ", " << vt.n[1];
2363 return s;
2364}
2365
2366// Need to define them first or we get specialization after instantiation errors
2367template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
2368{
2369 EIGEN_DEBUG_ALIGNED_LOAD
2370 return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
2371}
2372
2373template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
2374{
2375 EIGEN_DEBUG_ALIGNED_STORE
2376 vec_xst(from, 0, to);
2377}
2378
2379template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
2380 Packet2d v = {from, from};
2381 return v;
2382}
2383
2384template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
2385 Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
2386 return reinterpret_cast<Packet2d>(v);
2387}
2388
2389template<> EIGEN_STRONG_INLINE void
2390pbroadcast4<Packet2d>(const double *a,
2391 Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
2392{
2393 //This way is faster than vec_splat (at least for doubles in Power 9)
2394 a0 = pset1<Packet2d>(a[0]);
2395 a1 = pset1<Packet2d>(a[1]);
2396 a2 = pset1<Packet2d>(a[2]);
2397 a3 = pset1<Packet2d>(a[3]);
2398}
2399
2400template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
2401{
2402 EIGEN_ALIGN16 double af[2];
2403 af[0] = from[0*stride];
2404 af[1] = from[1*stride];
2405 return pload<Packet2d>(af);
2406}
2407template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
2408{
2409 EIGEN_ALIGN16 double af[2];
2410 pstore<double>(af, from);
2411 to[0*stride] = af[0];
2412 to[1*stride] = af[1];
2413}
2414
2415template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
2416
2417template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; }
2418
2419template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
2420
2421template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
2422{
2423#ifdef __POWER8_VECTOR__
2424 return vec_neg(a);
2425#else
2426 return vec_xor(a, p2d_MZERO);
2427#endif
2428}
2429
2430template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
2431
2432template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); }
2433template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }
2434
2435// for some weird reason, it has to be overloaded for packets of integers
2436template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
2437
2438template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
2439{
2440 // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
2441 Packet2d ret;
2442 __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
2443 return ret;
2444 }
2445
2446template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
2447{
2448 // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
2449 Packet2d ret;
2450 __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
2451 return ret;
2452}
2453
2454template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmple(a,b)); }
2455template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmplt(a,b)); }
2456template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmpeq(a,b)); }
2457template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
2458 Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a,b));
2459 return vec_nor(c,c);
2460}
2461
2462template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
2463
2464template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
2465
2466template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
2467
2468template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
2469
2470template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
2471{
2472 Packet2d t = vec_add(reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
2473 Packet2d res;
2474
2475 __asm__("xvrdpiz %x0, %x1\n\t"
2476 : "=&wa" (res)
2477 : "wa" (t));
2478
2479 return res;
2480}
2481template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
2482template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
2483template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
2484{
2485 Packet2d res;
2486
2487 __asm__("xvrdpic %x0, %x1\n\t"
2488 : "=&wa" (res)
2489 : "wa" (a));
2490
2491 return res;
2492}
2493
2494template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
2495{
2496 EIGEN_DEBUG_UNALIGNED_LOAD
2497 return vec_xl(0, const_cast<double*>(from));
2498}
2499
2500template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
2501{
2502 Packet2d p;
2503 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from);
2504 else p = ploadu<Packet2d>(from);
2505 return vec_splat_dbl<0>(p);
2506}
2507
2508template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
2509{
2510 EIGEN_DEBUG_UNALIGNED_STORE
2511 vec_xst(from, 0, to);
2512}
2513
2514template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
2515
2516template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
2517
2518template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
2519{
2520 return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
2521}
2522template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
2523
2524// VSX support varies between different compilers and even different
2525// versions of the same compiler. For gcc version >= 4.9.3, we can use
2526// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use
2527// a slow version that works with older compilers.
2528// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
2529// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
2530template<>
2531inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) {
2532#if EIGEN_GNUC_AT_LEAST(5, 4) || \
2533 (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
2534 return vec_cts(x, 0); // TODO: check clang version.
2535#else
2536 double tmp[2];
2537 memcpy(tmp, &x, sizeof(tmp));
2538 Packet2l l = { static_cast<long long>(tmp[0]),
2539 static_cast<long long>(tmp[1]) };
2540 return l;
2541#endif
2542}
2543
2544template<>
2545inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x) {
2546 unsigned long long tmp[2];
2547 memcpy(tmp, &x, sizeof(tmp));
2548 Packet2d d = { static_cast<double>(tmp[0]),
2549 static_cast<double>(tmp[1]) };
2550 return d;
2551}
2552
2553
2554// Packet2l shifts.
2555// For POWER8 we simply use vec_sr/l.
2556//
2557// Things are more complicated for POWER7. There is actually a
2558// vec_xxsxdi intrinsic but it is not supported by some gcc versions.
// So we need to shift by N % 32 and rearrange bytes.
2560#ifdef __POWER8_VECTOR__
2561
2562template<int N>
2563EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
2564 const Packet2ul shift = { N, N };
2565 return vec_sl(a, shift);
2566}
2567
2568template<int N>
2569EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
2570 const Packet2ul shift = { N, N };
2571 return vec_sr(a, shift);
2572}
2573
2574#else
2575
2576// Shifts [A, B, C, D] to [B, 0, D, 0].
2577// Used to implement left shifts for Packet2l.
2578EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
2579 static const Packet16uc perm = {
2580 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
2581 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
2582 #ifdef _BIG_ENDIAN
2583 return vec_perm(p4i_ZERO, a, perm);
2584 #else
2585 return vec_perm(a, p4i_ZERO, perm);
2586 #endif
2587}
2588
2589// Shifts [A, B, C, D] to [0, A, 0, C].
2590// Used to implement right shifts for Packet2l.
2591EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
2592 static const Packet16uc perm = {
2593 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
2594 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b };
2595 #ifdef _BIG_ENDIAN
2596 return vec_perm(p4i_ZERO, a, perm);
2597 #else
2598 return vec_perm(a, p4i_ZERO, perm);
2599 #endif
2600}
2601
2602template<int N, typename EnableIf = void>
2603struct plogical_shift_left_impl;
2604
2605template<int N>
2606struct plogical_shift_left_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> {
2607 static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
2608 static const unsigned n = static_cast<unsigned>(N);
2609 const Packet4ui shift = {n, n, n, n};
2610 const Packet4i ai = reinterpret_cast<Packet4i>(a);
2611 static const unsigned m = static_cast<unsigned>(32 - N);
2612 const Packet4ui shift_right = {m, m, m, m};
2613 const Packet4i out_hi = vec_sl(ai, shift);
2614 const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right));
2615 return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
2616 }
2617};
2618
2619template<int N>
2620struct plogical_shift_left_impl<N, typename enable_if<(N >= 32)>::type> {
2621 static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
2622 static const unsigned m = static_cast<unsigned>(N - 32);
2623 const Packet4ui shift = {m, m, m, m};
2624 const Packet4i ai = reinterpret_cast<Packet4i>(a);
2625 return reinterpret_cast<Packet2l>(shift_even_left(vec_sl(ai, shift)));
2626 }
2627};
2628
2629template<int N>
2630EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
2631 return plogical_shift_left_impl<N>::run(a);
2632}
2633
2634template<int N, typename EnableIf = void>
2635struct plogical_shift_right_impl;
2636
2637template<int N>
2638struct plogical_shift_right_impl<N, typename enable_if<(N < 32) && (N >= 0)>::type> {
2639 static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
2640 static const unsigned n = static_cast<unsigned>(N);
2641 const Packet4ui shift = {n, n, n, n};
2642 const Packet4i ai = reinterpret_cast<Packet4i>(a);
2643 static const unsigned m = static_cast<unsigned>(32 - N);
2644 const Packet4ui shift_left = {m, m, m, m};
2645 const Packet4i out_lo = vec_sr(ai, shift);
2646 const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left));
2647 return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
2648 }
2649};
2650
2651template<int N>
2652struct plogical_shift_right_impl<N, typename enable_if<(N >= 32)>::type> {
2653 static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
2654 static const unsigned m = static_cast<unsigned>(N - 32);
2655 const Packet4ui shift = {m, m, m, m};
2656 const Packet4i ai = reinterpret_cast<Packet4i>(a);
2657 return reinterpret_cast<Packet2l>(shift_odd_right(vec_sr(ai, shift)));
2658 }
2659};
2660
2661template<int N>
2662EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
2663 return plogical_shift_right_impl<N>::run(a);
2664}
2665#endif
2666
2667template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
2668 // Clamp exponent to [-2099, 2099]
2669 const Packet2d max_exponent = pset1<Packet2d>(2099.0);
2670 const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
2671
2672 // Split 2^e into four factors and multiply:
2673 const Packet2l bias = { 1023, 1023 };
2674 Packet2l b = plogical_shift_right<2>(e); // floor(e/4)
2675 Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
2676 Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
2677 b = psub(psub(psub(e, b), b), b); // e - 3b
2678 c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
2679 out = pmul(out, c); // a * 2^e
2680 return out;
2681}
2682
2683
2684// Extract exponent without existence of Packet2l.
2685template<>
2686EIGEN_STRONG_INLINE
2687Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
2688 return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
2689}
2690
2691template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d> (const Packet2d& a, Packet2d& exponent) {
2692 return pfrexp_generic(a, exponent);
2693}
2694
2695template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
2696{
2697 Packet2d b, sum;
2698 b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
2699 sum = a + b;
2700 return pfirst<Packet2d>(sum);
2701}
2702
2703// Other reduction functions:
2704// mul
2705template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
2706{
2707 return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
2708}
2709
2710// min
2711template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
2712{
2713 return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
2714}
2715
2716// max
2717template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
2718{
2719 return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
2720}
2721
2722EIGEN_DEVICE_FUNC inline void
2723ptranspose(PacketBlock<Packet2d,2>& kernel) {
2724 Packet2d t0, t1;
2725 t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
2726 t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
2727 kernel.packet[0] = t0;
2728 kernel.packet[1] = t1;
2729}
2730
2731template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
2732 Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
2733 Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
2734 return vec_sel(elsePacket, thenPacket, mask);
2735}
2736
2737
2738#endif // EIGEN_VECTORIZE_VSX
2739} // end namespace internal
2740
2741} // end namespace Eigen
2742
2743#endif // EIGEN_PACKET_MATH_ALTIVEC_H
EIGEN_DEVICE_FUNC const Select< Derived, ThenDerived, ElseDerived > select(const DenseBase< ThenDerived > &thenMatrix, const DenseBase< ElseDerived > &elseMatrix) const
Definition Select.h:126
Base class for all dense matrices, vectors, and expressions.
Definition MatrixBase.h:50
@ Aligned16
Data pointer is aligned on a 16 bytes boundary.
Definition Constants.h:235
Namespace containing all symbols from the Eigen library.
Definition LDLT.h:16
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:74
int N
Simulate some binary data with a single categorical and single continuous predictor.
Definition logistic_regression.py:26
Definition BFloat16.h:58
Definition GenericPacketMath.h:43
Definition GenericPacketMath.h:160
Definition GenericPacketMath.h:107
Definition GenericPacketMath.h:148
Definition GenericPacketMath.h:133