Medial Code Documentation
PacketMath.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
6// Heavily based on Gael's SSE version.
7//
8// This Source Code Form is subject to the terms of the Mozilla
9// Public License v. 2.0. If a copy of the MPL was not distributed
10// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11
12#ifndef EIGEN_PACKET_MATH_NEON_H
13#define EIGEN_PACKET_MATH_NEON_H
14
15namespace Eigen {
16
17namespace internal {
18
19#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
20#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
21#endif
22
23#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
24#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
25#endif
26
27#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
28#if EIGEN_ARCH_ARM64
29#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
30#else
31#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
32#endif
33#endif
34
35#if EIGEN_COMP_MSVC_STRICT
36
37// In MSVC's arm_neon.h header file, all NEON vector types
38// are aliases to the same underlying type __n128.
39// We thus have to wrap them to make them different C++ types.
40// (See also bug 1428)
41typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
42typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
43typedef eigen_packet_wrapper<int32_t ,2> Packet4c;
44typedef eigen_packet_wrapper<int8x8_t ,3> Packet8c;
45typedef eigen_packet_wrapper<int8x16_t ,4> Packet16c;
46typedef eigen_packet_wrapper<uint32_t ,5> Packet4uc;
47typedef eigen_packet_wrapper<uint8x8_t ,6> Packet8uc;
48typedef eigen_packet_wrapper<uint8x16_t ,7> Packet16uc;
49typedef eigen_packet_wrapper<int16x4_t ,8> Packet4s;
50typedef eigen_packet_wrapper<int16x8_t ,9> Packet8s;
51typedef eigen_packet_wrapper<uint16x4_t ,10> Packet4us;
52typedef eigen_packet_wrapper<uint16x8_t ,11> Packet8us;
53typedef eigen_packet_wrapper<int32x2_t ,12> Packet2i;
54typedef eigen_packet_wrapper<int32x4_t ,13> Packet4i;
55typedef eigen_packet_wrapper<uint32x2_t ,14> Packet2ui;
56typedef eigen_packet_wrapper<uint32x4_t ,15> Packet4ui;
57typedef eigen_packet_wrapper<int64x2_t ,16> Packet2l;
58typedef eigen_packet_wrapper<uint64x2_t ,17> Packet2ul;
59
60EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
61 float from[4] = {a, b, c, d};
62 return vld1q_f32(from);
63}
64
65EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) {
66 float from[2] = {a, b};
67 return vld1_f32(from);
68}
69
70#else
71
72typedef float32x2_t Packet2f;
73typedef float32x4_t Packet4f;
74typedef eigen_packet_wrapper<int32_t ,2> Packet4c;
75typedef int8x8_t Packet8c;
76typedef int8x16_t Packet16c;
77typedef eigen_packet_wrapper<uint32_t ,5> Packet4uc;
78typedef uint8x8_t Packet8uc;
79typedef uint8x16_t Packet16uc;
80typedef int16x4_t Packet4s;
81typedef int16x8_t Packet8s;
82typedef uint16x4_t Packet4us;
83typedef uint16x8_t Packet8us;
84typedef int32x2_t Packet2i;
85typedef int32x4_t Packet4i;
86typedef uint32x2_t Packet2ui;
87typedef uint32x4_t Packet4ui;
88typedef int64x2_t Packet2l;
89typedef uint64x2_t Packet2ul;
90
91EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
92 const Packet2f low = {a, b};
93 const Packet2f high = {c, d};
94 return vcombine_f32(low, high);
95}
96
97EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) {
98 const Packet2f result = {a, b};
99 return result;
100}
101
102#endif // EIGEN_COMP_MSVC_STRICT
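// [Editorial note, not part of the upstream file] Whichever branch above is active,
// make_packet4f/make_packet2f are the portable way to build a float packet from scalar
// lanes; for example, the sign-bit mask used further below by paddsub could be written as
//   make_packet4f(-0.0f, 0.0f, -0.0f, 0.0f).
// On MSVC the eigen_packet_wrapper<...> typedefs exist solely to keep Packet2f, Packet4f,
// Packet4i, ... distinct C++ types for overload resolution, since MSVC's arm_neon.h maps
// every NEON vector type to the same underlying __n128.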
103
104EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){
105 const float* a = reinterpret_cast<const float*>(&m);
106 Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3)));
107 return res;
108}
109
110// functionally equivalent to _mm_shuffle_ps in SSE when interleave
111// == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
112// interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
113// to enable a shared implementation for fast inversion of matrices of size 4.
114template<bool interleave>
115EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask)
116{
117 const float* a = reinterpret_cast<const float*>(&m);
118 const float* b = reinterpret_cast<const float*>(&n);
119 Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
120 return res;
121}
122
123template<>
124EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f &m, const Packet4f &n, int mask)
125{
126 const float* a = reinterpret_cast<const float*>(&m);
127 const float* b = reinterpret_cast<const float*>(&n);
128 Packet4f res = make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
129 return res;
130}
131
132EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));}
133
134EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s)
135{
136 return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
137}
138EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s)
139{
140 return shuffle2<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
141}
142EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
143{
144 return shuffle2<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1));
145}
146EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
147{
148 return shuffle2<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3));
149}
150EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
151{
152 return shuffle2<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1));
153}
154EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
155{
156 return shuffle2<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
157}
158#define vec4f_duplane(a, p) \
159 Packet4f(vdupq_lane_f32(vget_low_f32(a), p))
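// [Editorial note, not part of the upstream file] With eigen_neon_shuffle_mask(p,q,r,s),
// the helpers above reproduce the usual SSE lane shuffles, e.g.:
//   vec4f_swizzle1(a, p,q,r,s) == {a[p], a[q], a[r], a[s]}
//   vec4f_movelh(a, b)         == {a[0], a[1], b[0], b[1]}   (cf. _mm_movelh_ps)
//   vec4f_movehl(a, b)         == {b[2], b[3], a[2], a[3]}   (cf. _mm_movehl_ps)
//   vec4f_unpacklo(a, b)       == {a[0], b[0], a[1], b[1]}   (cf. _mm_unpacklo_ps)
//   vec4f_unpackhi(a, b)       == {a[2], b[2], a[3], b[3]}   (cf. _mm_unpackhi_ps)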
160
161#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
162 const Packet4f p4f_##NAME = pset1<Packet4f>(X)
163
164#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
165 const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))
166
167#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
168 const Packet4i p4i_##NAME = pset1<Packet4i>(X)
169
170#if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC
171 // __builtin_prefetch tends to do nothing on ARM64 compilers because the
172 // prefetch instructions there are too detailed for __builtin_prefetch to map
173 // meaningfully to them.
174 #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
175#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
176 #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
177#elif defined __pld
178 #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
179#elif EIGEN_ARCH_ARM
180 #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
181#else
182 // by default no explicit prefetching
183 #define EIGEN_ARM_PREFETCH(ADDR)
184#endif
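// [Editorial note, not part of the upstream file] EIGEN_ARM_PREFETCH(addr) is used as a
// plain statement; it only emits a cache-prefetch hint (PRFM/PLD via inline asm or
// __builtin_prefetch), or nothing at all, so it never changes results, only memory latency.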
185
186template <>
187struct packet_traits<float> : default_packet_traits
188{
189 typedef Packet4f type;
190 typedef Packet2f half;
191 enum
192 {
193 Vectorizable = 1,
194 AlignedOnScalar = 1,
195 size = 4,
196 HasHalfPacket = 1,
197
198 HasAdd = 1,
199 HasSub = 1,
200 HasShift = 1,
201 HasMul = 1,
202 HasNegate = 1,
203 HasAbs = 1,
204 HasArg = 0,
205 HasAbs2 = 1,
206 HasAbsDiff = 1,
207 HasMin = 1,
208 HasMax = 1,
209 HasConj = 1,
210 HasSetLinear = 0,
211 HasBlend = 0,
212
213 HasDiv = 1,
214 HasFloor = 1,
215 HasCeil = 1,
216 HasRint = 1,
217
218 HasSin = EIGEN_FAST_MATH,
219 HasCos = EIGEN_FAST_MATH,
220 HasLog = 1,
221 HasExp = 1,
222 HasSqrt = 1,
223 HasRsqrt = 1,
224 HasTanh = EIGEN_FAST_MATH,
225 HasErf = EIGEN_FAST_MATH,
226 HasBessel = 0, // Issues with accuracy.
227 HasNdtri = 0
228 };
229};
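// [Editorial note, not part of the upstream file] Each HasXxx entry advertises whether a
// vectorized packet implementation exists for this scalar type; Eigen falls back to scalar
// code when a flag is 0. Entries such as HasSin = EIGEN_FAST_MATH mean the vectorized path
// is only enabled when fast-math approximations are acceptable.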
230
231template <>
232struct packet_traits<int8_t> : default_packet_traits
233{
234 typedef Packet16c type;
235 typedef Packet8c half;
236 enum
237 {
238 Vectorizable = 1,
239 AlignedOnScalar = 1,
240 size = 16,
241 HasHalfPacket = 1,
242
243 HasAdd = 1,
244 HasSub = 1,
245 HasShift = 1,
246 HasMul = 1,
247 HasNegate = 1,
248 HasAbs = 1,
249 HasAbsDiff = 1,
250 HasArg = 0,
251 HasAbs2 = 1,
252 HasMin = 1,
253 HasMax = 1,
254 HasConj = 1,
255 HasSetLinear = 0,
256 HasBlend = 0
257 };
258};
259
260template <>
261struct packet_traits<uint8_t> : default_packet_traits
262{
263 typedef Packet16uc type;
264 typedef Packet8uc half;
265 enum
266 {
267 Vectorizable = 1,
268 AlignedOnScalar = 1,
269 size = 16,
270 HasHalfPacket = 1,
271
272 HasAdd = 1,
273 HasSub = 1,
274 HasShift = 1,
275 HasMul = 1,
276 HasNegate = 0,
277 HasAbs = 1,
278 HasAbsDiff = 1,
279 HasArg = 0,
280 HasAbs2 = 1,
281 HasMin = 1,
282 HasMax = 1,
283 HasConj = 1,
284 HasSetLinear = 0,
285 HasBlend = 0,
286
287 HasSqrt = 1
288 };
289};
290
291template <>
292struct packet_traits<int16_t> : default_packet_traits
293{
294 typedef Packet8s type;
295 typedef Packet4s half;
296 enum
297 {
298 Vectorizable = 1,
299 AlignedOnScalar = 1,
300 size = 8,
301 HasHalfPacket = 1,
302
303 HasAdd = 1,
304 HasSub = 1,
305 HasShift = 1,
306 HasMul = 1,
307 HasNegate = 1,
308 HasAbs = 1,
309 HasAbsDiff = 1,
310 HasArg = 0,
311 HasAbs2 = 1,
312 HasMin = 1,
313 HasMax = 1,
314 HasConj = 1,
315 HasSetLinear = 0,
316 HasBlend = 0
317 };
318};
319
320template <>
321struct packet_traits<uint16_t> : default_packet_traits
322{
323 typedef Packet8us type;
324 typedef Packet4us half;
325 enum
326 {
327 Vectorizable = 1,
328 AlignedOnScalar = 1,
329 size = 8,
330 HasHalfPacket = 1,
331
332 HasAdd = 1,
333 HasSub = 1,
334 HasShift = 1,
335 HasMul = 1,
336 HasNegate = 0,
337 HasAbs = 0,
338 HasAbsDiff = 1,
339 HasArg = 0,
340 HasAbs2 = 1,
341 HasMin = 1,
342 HasMax = 1,
343 HasConj = 1,
344 HasSetLinear = 0,
345 HasBlend = 0,
346 HasSqrt = 1
347 };
348};
349
350template <>
351struct packet_traits<int32_t> : default_packet_traits
352{
353 typedef Packet4i type;
354 typedef Packet2i half;
355 enum
356 {
357 Vectorizable = 1,
358 AlignedOnScalar = 1,
359 size = 4,
360 HasHalfPacket = 1,
361
362 HasAdd = 1,
363 HasSub = 1,
364 HasShift = 1,
365 HasMul = 1,
366 HasNegate = 1,
367 HasAbs = 1,
368 HasArg = 0,
369 HasAbs2 = 1,
370 HasAbsDiff = 1,
371 HasMin = 1,
372 HasMax = 1,
373 HasConj = 1,
374 HasSetLinear = 0,
375 HasBlend = 0
376 };
377};
378
379template <>
380struct packet_traits<uint32_t> : default_packet_traits
381{
382 typedef Packet4ui type;
383 typedef Packet2ui half;
384 enum
385 {
386 Vectorizable = 1,
387 AlignedOnScalar = 1,
388 size = 4,
389 HasHalfPacket = 1,
390
391 HasAdd = 1,
392 HasSub = 1,
393 HasShift = 1,
394 HasMul = 1,
395 HasNegate = 0,
396 HasAbs = 0,
397 HasArg = 0,
398 HasAbs2 = 1,
399 HasAbsDiff = 1,
400 HasMin = 1,
401 HasMax = 1,
402 HasConj = 1,
403 HasSetLinear = 0,
404 HasBlend = 0,
405
406 HasSqrt = 1
407 };
408};
409
410template <>
411struct packet_traits<int64_t> : default_packet_traits
412{
413 typedef Packet2l type;
414 typedef Packet2l half;
415 enum
416 {
417 Vectorizable = 1,
418 AlignedOnScalar = 1,
419 size = 2,
420 HasHalfPacket = 0,
421
422 HasCmp = 1,
423 HasAdd = 1,
424 HasSub = 1,
425 HasShift = 1,
426 HasMul = 1,
427 HasNegate = 1,
428 HasAbs = 1,
429 HasArg = 0,
430 HasAbs2 = 1,
431 HasAbsDiff = 1,
432 HasMin = 1,
433 HasMax = 1,
434 HasConj = 1,
435 HasSetLinear = 0,
436 HasBlend = 0
437 };
438};
439
440template <>
441struct packet_traits<uint64_t> : default_packet_traits
442{
443 typedef Packet2ul type;
444 typedef Packet2ul half;
445 enum
446 {
447 Vectorizable = 1,
448 AlignedOnScalar = 1,
449 size = 2,
450 HasHalfPacket = 0,
451
452 HasCmp = 1,
453 HasAdd = 1,
454 HasSub = 1,
455 HasShift = 1,
456 HasMul = 1,
457 HasNegate = 0,
458 HasAbs = 0,
459 HasArg = 0,
460 HasAbs2 = 1,
461 HasAbsDiff = 1,
462 HasMin = 1,
463 HasMax = 1,
464 HasConj = 1,
465 HasSetLinear = 0,
466 HasBlend = 0
467 };
468};
469
470#if EIGEN_GNUC_AT_MOST(4, 4) && !EIGEN_COMP_LLVM
471// workaround gcc 4.2, 4.3 and 4.4 compilation issue
472EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
473EIGEN_STRONG_INLINE float32x2_t vld1_f32(const float* x) { return ::vld1_f32 ((const float32_t*)x); }
474EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32(const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
475EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
476EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
477#endif
478
479template<> struct unpacket_traits<Packet2f>
480{
481 typedef float type;
482 typedef Packet2f half;
483 typedef Packet2i integer_packet;
484 enum
485 {
486 size = 2,
487 alignment = Aligned16,
488 vectorizable = true,
489 masked_load_available = false,
490 masked_store_available = false
491 };
492};
493template<> struct unpacket_traits<Packet4f>
494{
495 typedef float type;
496 typedef Packet2f half;
497 typedef Packet4i integer_packet;
498 enum
499 {
500 size = 4,
501 alignment = Aligned16,
502 vectorizable = true,
503 masked_load_available = false,
504 masked_store_available = false
505 };
506};
507template<> struct unpacket_traits<Packet4c>
508{
509 typedef int8_t type;
510 typedef Packet4c half;
511 enum
512 {
513 size = 4,
514 alignment = Unaligned,
515 vectorizable = true,
516 masked_load_available = false,
517 masked_store_available = false
518 };
519};
520template<> struct unpacket_traits<Packet8c>
521{
522 typedef int8_t type;
523 typedef Packet4c half;
524 enum
525 {
526 size = 8,
527 alignment = Aligned16,
528 vectorizable = true,
529 masked_load_available = false,
530 masked_store_available = false
531 };
532};
533template<> struct unpacket_traits<Packet16c>
534{
535 typedef int8_t type;
536 typedef Packet8c half;
537 enum
538 {
539 size = 16,
540 alignment = Aligned16,
541 vectorizable = true,
542 masked_load_available = false,
543 masked_store_available = false
544 };
545};
546template<> struct unpacket_traits<Packet4uc>
547{
548 typedef uint8_t type;
549 typedef Packet4uc half;
550 enum
551 {
552 size = 4,
553 alignment = Unaligned,
554 vectorizable = true,
555 masked_load_available = false,
556 masked_store_available = false
557 };
558};
559template<> struct unpacket_traits<Packet8uc>
560{
561 typedef uint8_t type;
562 typedef Packet4uc half;
563 enum
564 {
565 size = 8,
566 alignment = Aligned16,
567 vectorizable = true,
568 masked_load_available = false,
569 masked_store_available = false
570 };
571};
572template<> struct unpacket_traits<Packet16uc>
573{
574 typedef uint8_t type;
575 typedef Packet8uc half;
576 enum
577 {
578 size = 16,
579 alignment = Aligned16,
580 vectorizable = true,
581 masked_load_available = false,
582 masked_store_available = false};
583};
584template<> struct unpacket_traits<Packet4s>
585{
586 typedef int16_t type;
587 typedef Packet4s half;
588 enum
589 {
590 size = 4,
591 alignment = Aligned16,
592 vectorizable = true,
593 masked_load_available = false,
594 masked_store_available = false
595 };
596};
597template<> struct unpacket_traits<Packet8s>
598{
599 typedef int16_t type;
600 typedef Packet4s half;
601 enum
602 {
603 size = 8,
604 alignment = Aligned16,
605 vectorizable = true,
606 masked_load_available = false,
607 masked_store_available = false
608 };
609};
610template<> struct unpacket_traits<Packet4us>
611{
612 typedef uint16_t type;
613 typedef Packet4us half;
614 enum
615 {
616 size = 4,
617 alignment = Aligned16,
618 vectorizable = true,
619 masked_load_available = false,
620 masked_store_available = false
621 };
622};
623template<> struct unpacket_traits<Packet8us>
624{
625 typedef uint16_t type;
626 typedef Packet4us half;
627 enum
628 {
629 size = 8,
630 alignment = Aligned16,
631 vectorizable = true,
632 masked_load_available = false,
633 masked_store_available = false
634 };
635};
636template<> struct unpacket_traits<Packet2i>
637{
638 typedef int32_t type;
639 typedef Packet2i half;
640 enum
641 {
642 size = 2,
643 alignment = Aligned16,
644 vectorizable = true,
645 masked_load_available = false,
646 masked_store_available = false
647 };
648};
649template<> struct unpacket_traits<Packet4i>
650{
651 typedef int32_t type;
652 typedef Packet2i half;
653 enum
654 {
655 size = 4,
656 alignment = Aligned16,
657 vectorizable = true,
658 masked_load_available = false,
659 masked_store_available = false
660 };
661};
662template<> struct unpacket_traits<Packet2ui>
663{
664 typedef uint32_t type;
665 typedef Packet2ui half;
666 enum
667 {
668 size = 2,
669 alignment = Aligned16,
670 vectorizable = true,
671 masked_load_available = false,
672 masked_store_available = false
673 };
674};
675template<> struct unpacket_traits<Packet4ui>
676{
677 typedef uint32_t type;
678 typedef Packet2ui half;
679 enum
680 {
681 size = 4,
682 alignment = Aligned16,
683 vectorizable = true,
684 masked_load_available = false,
685 masked_store_available = false
686 };
687};
688template<> struct unpacket_traits<Packet2l>
689{
690 typedef int64_t type;
691 typedef Packet2l half;
692 enum
693 {
694 size = 2,
695 alignment = Aligned16,
696 vectorizable = true,
697 masked_load_available = false,
698 masked_store_available = false
699 };
700};
701template<> struct unpacket_traits<Packet2ul>
702{
703 typedef uint64_t type;
704 typedef Packet2ul half;
705 enum
706 {
707 size = 2,
708 alignment = Aligned16,
709 vectorizable = true,
710 masked_load_available = false,
711 masked_store_available = false
712 };
713};
714
715template<> EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(const float& from) { return vdup_n_f32(from); }
716template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
717template<> EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(const int8_t& from)
718{ return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0); }
719template<> EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(const int8_t& from) { return vdup_n_s8(from); }
720template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) { return vdupq_n_s8(from); }
721template<> EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(const uint8_t& from)
722{ return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0); }
723template<> EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(const uint8_t& from) { return vdup_n_u8(from); }
724template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) { return vdupq_n_u8(from); }
725template<> EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(const int16_t& from) { return vdup_n_s16(from); }
726template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) { return vdupq_n_s16(from); }
727template<> EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(const uint16_t& from) { return vdup_n_u16(from); }
728template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) { return vdupq_n_u16(from); }
729template<> EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(const int32_t& from) { return vdup_n_s32(from); }
730template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }
731template<> EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(const uint32_t& from) { return vdup_n_u32(from); }
732template<> EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) { return vdupq_n_u32(from); }
733template<> EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) { return vdupq_n_s64(from); }
734template<> EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) { return vdupq_n_u64(from); }
735
736template<> EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(unsigned int from)
737{ return vreinterpret_f32_u32(vdup_n_u32(from)); }
738template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from)
739{ return vreinterpretq_f32_u32(vdupq_n_u32(from)); }
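// [Editorial note, not part of the upstream file] pset1frombits replicates a raw bit
// pattern into every float lane, e.g. pset1frombits<Packet4f>(0x3f800000u) gives
// {1.f, 1.f, 1.f, 1.f} and pset1frombits<Packet4f>(0x80000000u) a packet of -0.f,
// which is how bit-exact constants and sign/exponent masks are built.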
740
741template<> EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(const float& a)
742{
743 const float c[] = {0.0f,1.0f};
744 return vadd_f32(pset1<Packet2f>(a), vld1_f32(c));
745}
746template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
747{
748 const float c[] = {0.0f,1.0f,2.0f,3.0f};
749 return vaddq_f32(pset1<Packet4f>(a), vld1q_f32(c));
750}
751template<> EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(const int8_t& a)
752{ return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0); }
753template<> EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(const int8_t& a)
754{
755 const int8_t c[] = {0,1,2,3,4,5,6,7};
756 return vadd_s8(pset1<Packet8c>(a), vld1_s8(c));
757}
758template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a)
759{
760 const int8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
761 return vaddq_s8(pset1<Packet16c>(a), vld1q_s8(c));
762}
763template<> EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(const uint8_t& a)
764{ return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0); }
765template<> EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(const uint8_t& a)
766{
767 const uint8_t c[] = {0,1,2,3,4,5,6,7};
768 return vadd_u8(pset1<Packet8uc>(a), vld1_u8(c));
769}
770template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a)
771{
772 const uint8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
773 return vaddq_u8(pset1<Packet16uc>(a), vld1q_u8(c));
774}
775template<> EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(const int16_t& a)
776{
777 const int16_t c[] = {0,1,2,3};
778 return vadd_s16(pset1<Packet4s>(a), vld1_s16(c));
779}
780template<> EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(const uint16_t& a)
781{
782 const uint16_t c[] = {0,1,2,3};
783 return vadd_u16(pset1<Packet4us>(a), vld1_u16(c));
784}
785template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a)
786{
787 const int16_t c[] = {0,1,2,3,4,5,6,7};
788 return vaddq_s16(pset1<Packet8s>(a), vld1q_s16(c));
789}
790template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a)
791{
792 const uint16_t c[] = {0,1,2,3,4,5,6,7};
793 return vaddq_u16(pset1<Packet8us>(a), vld1q_u16(c));
794}
795template<> EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(const int32_t& a)
796{
797 const int32_t c[] = {0,1};
798 return vadd_s32(pset1<Packet2i>(a), vld1_s32(c));
799}
800template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
801{
802 const int32_t c[] = {0,1,2,3};
803 return vaddq_s32(pset1<Packet4i>(a), vld1q_s32(c));
804}
805template<> EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(const uint32_t& a)
806{
807 const uint32_t c[] = {0,1};
808 return vadd_u32(pset1<Packet2ui>(a), vld1_u32(c));
809}
810template<> EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a)
811{
812 const uint32_t c[] = {0,1,2,3};
813 return vaddq_u32(pset1<Packet4ui>(a), vld1q_u32(c));
814}
815template<> EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a)
816{
817 const int64_t c[] = {0,1};
818 return vaddq_s64(pset1<Packet2l>(a), vld1q_s64(c));
819}
820template<> EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a)
821{
822 const uint64_t c[] = {0,1};
823 return vaddq_u64(pset1<Packet2ul>(a), vld1q_u64(c));
824}
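// [Editorial note, not part of the upstream file] plset produces a linear ramp starting at
// 'a', e.g. plset<Packet4f>(10.f) == {10.f, 11.f, 12.f, 13.f} and
// plset<Packet16c>(0) == {0, 1, ..., 15}; it is the packet primitive behind
// LinSpaced-style vectorized fills.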
825
826template<> EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(const Packet2f& a, const Packet2f& b) { return vadd_f32(a,b); }
827template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
828template<> EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(const Packet4c& a, const Packet4c& b)
829{
830 return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(
831 vreinterpret_s8_s32(vdup_n_s32(a)),
832 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
833}
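// [Editorial note, not part of the upstream file] Packet4c/Packet4uc pack four 8-bit lanes
// into a single 32-bit scalar, so ops like the padd above splat that scalar into a NEON D
// register (vdup_n_s32), reinterpret it as 8x int8, operate, and read lane 0 of the 32-bit
// view back out; only the low four byte lanes carry meaningful data.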
834template<> EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(const Packet8c& a, const Packet8c& b) { return vadd_s8(a,b); }
835template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) { return vaddq_s8(a,b); }
836template<> EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
837{
838 return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(
839 vreinterpret_u8_u32(vdup_n_u32(a)),
840 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
841}
842template<> EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vadd_u8(a,b); }
843template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vaddq_u8(a,b); }
844template<> EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(const Packet4s& a, const Packet4s& b) { return vadd_s16(a,b); }
845template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) { return vaddq_s16(a,b); }
846template<> EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(const Packet4us& a, const Packet4us& b) { return vadd_u16(a,b); }
847template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) { return vaddq_u16(a,b); }
848template<> EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(const Packet2i& a, const Packet2i& b) { return vadd_s32(a,b); }
849template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
850template<> EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vadd_u32(a,b); }
851template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vaddq_u32(a,b); }
852template<> EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) { return vaddq_s64(a,b); }
853template<> EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vaddq_u64(a,b); }
854
855template<> EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(const Packet2f& a, const Packet2f& b) { return vsub_f32(a,b); }
856template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
857template<> EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(const Packet4c& a, const Packet4c& b)
858{
859 return vget_lane_s32(vreinterpret_s32_s8(vsub_s8(
860 vreinterpret_s8_s32(vdup_n_s32(a)),
861 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
862}
863template<> EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(const Packet8c& a, const Packet8c& b) { return vsub_s8(a,b); }
864template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) { return vsubq_s8(a,b); }
865template<> EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
866{
867 return vget_lane_u32(vreinterpret_u32_u8(vsub_u8(
868 vreinterpret_u8_u32(vdup_n_u32(a)),
869 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
870}
871template<> EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vsub_u8(a,b); }
872template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vsubq_u8(a,b); }
873template<> EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(const Packet4s& a, const Packet4s& b) { return vsub_s16(a,b); }
874template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) { return vsubq_s16(a,b); }
875template<> EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(const Packet4us& a, const Packet4us& b) { return vsub_u16(a,b); }
876template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) { return vsubq_u16(a,b); }
877template<> EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(const Packet2i& a, const Packet2i& b) { return vsub_s32(a,b); }
878template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
879template<> EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vsub_u32(a,b); }
880template<> EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vsubq_u32(a,b); }
881template<> EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); }
882template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); }
883
884template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
885template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f & b) {
886 Packet2f mask = make_packet2f(numext::bit_cast<float>(0x80000000u), 0.0f);
887 return padd(a, pxor(mask, b));
888}
889template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
890template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
891 Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
892 return padd(a, pxor(mask, b));
893}
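// [Editorial note, not part of the upstream file] The XOR mask flips the sign bit of the
// even lanes of b, so paddsub(a, b) == {a[0]-b[0], a[1]+b[1], a[2]-b[2], a[3]+b[3]},
// mirroring the semantics of SSE3's _mm_addsub_ps.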
894
895template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); }
896template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
897template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a)
898{ return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
899template<> EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) { return vneg_s8(a); }
900template<> EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) { return vnegq_s8(a); }
901template<> EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) { return vneg_s16(a); }
902template<> EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) { return vnegq_s16(a); }
903template<> EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) { return vneg_s32(a); }
904template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
905template<> EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
906#if EIGEN_ARCH_ARM64
907 return vnegq_s64(a);
908#else
909 return vcombine_s64(
910 vdup_n_s64(-vgetq_lane_s64(a, 0)),
911 vdup_n_s64(-vgetq_lane_s64(a, 1)));
912#endif
913}
914
915template<> EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) { return a; }
916template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
917template<> EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) { return a; }
918template<> EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) { return a; }
919template<> EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) { return a; }
920template<> EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) { return a; }
921template<> EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) { return a; }
922template<> EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) { return a; }
923template<> EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) { return a; }
924template<> EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) { return a; }
925template<> EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) { return a; }
926template<> EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) { return a; }
927template<> EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) { return a; }
928template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
929template<> EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) { return a; }
930template<> EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) { return a; }
931template<> EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) { return a; }
932template<> EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) { return a; }
933
934template<> EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmul_f32(a,b); }
935template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
936template<> EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(const Packet4c& a, const Packet4c& b)
937{
938 return vget_lane_s32(vreinterpret_s32_s8(vmul_s8(
939 vreinterpret_s8_s32(vdup_n_s32(a)),
940 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
941}
942template<> EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmul_s8(a,b); }
943template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmulq_s8(a,b); }
944template<> EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
945{
946 return vget_lane_u32(vreinterpret_u32_u8(vmul_u8(
947 vreinterpret_u8_u32(vdup_n_u32(a)),
948 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
949}
950template<> EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmul_u8(a,b); }
951template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmulq_u8(a,b); }
952template<> EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmul_s16(a,b); }
953template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmulq_s16(a,b); }
954template<> EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmul_u16(a,b); }
955template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmulq_u16(a,b); }
956template<> EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmul_s32(a,b); }
957template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
958template<> EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmul_u32(a,b); }
959template<> EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmulq_u32(a,b); }
960template<> EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
961 return vcombine_s64(
962 vdup_n_s64(vgetq_lane_s64(a, 0)*vgetq_lane_s64(b, 0)),
963 vdup_n_s64(vgetq_lane_s64(a, 1)*vgetq_lane_s64(b, 1)));
964}
965template<> EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
966 return vcombine_u64(
967 vdup_n_u64(vgetq_lane_u64(a, 0)*vgetq_lane_u64(b, 0)),
968 vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1)));
969}
970
971template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/)
972{
973 eigen_assert(false && "packet integer division are not supported by NEON");
974 return pset1<Packet4c>(0);
975}
976template<> EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(const Packet8c& /*a*/, const Packet8c& /*b*/)
977{
978 eigen_assert(false && "packet integer division are not supported by NEON");
979 return pset1<Packet8c>(0);
980}
981template<> EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(const Packet16c& /*a*/, const Packet16c& /*b*/)
982{
983 eigen_assert(false && "packet integer division are not supported by NEON");
984 return pset1<Packet16c>(0);
985}
986template<> EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(const Packet4uc& /*a*/, const Packet4uc& /*b*/)
987{
988 eigen_assert(false && "packet integer division are not supported by NEON");
989 return pset1<Packet4uc>(0);
990}
991template<> EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(const Packet8uc& /*a*/, const Packet8uc& /*b*/)
992{
993 eigen_assert(false && "packet integer division are not supported by NEON");
994 return pset1<Packet8uc>(0);
995}
996template<> EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& /*a*/, const Packet16uc& /*b*/)
997{
998 eigen_assert(false && "packet integer division are not supported by NEON");
999 return pset1<Packet16uc>(0);
1000}
1001template<> EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(const Packet4s& /*a*/, const Packet4s& /*b*/)
1002{
1003 eigen_assert(false && "packet integer division are not supported by NEON");
1004 return pset1<Packet4s>(0);
1005}
1006template<> EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/)
1007{
1008 eigen_assert(false && "packet integer division are not supported by NEON");
1009 return pset1<Packet8s>(0);
1010}
1011template<> EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(const Packet4us& /*a*/, const Packet4us& /*b*/)
1012{
1013 eigen_assert(false && "packet integer division are not supported by NEON");
1014 return pset1<Packet4us>(0);
1015}
1016template<> EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/)
1017{
1018 eigen_assert(false && "packet integer division are not supported by NEON");
1019 return pset1<Packet8us>(0);
1020}
1021template<> EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(const Packet2i& /*a*/, const Packet2i& /*b*/)
1022{
1023 eigen_assert(false && "packet integer division are not supported by NEON");
1024 return pset1<Packet2i>(0);
1025}
1026template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
1027{
1028 eigen_assert(false && "packet integer division are not supported by NEON");
1029 return pset1<Packet4i>(0);
1030}
1031template<> EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(const Packet2ui& /*a*/, const Packet2ui& /*b*/)
1032{
1033 eigen_assert(false && "packet integer division are not supported by NEON");
1034 return pset1<Packet2ui>(0);
1035}
1036template<> EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/)
1037{
1038 eigen_assert(false && "packet integer division are not supported by NEON");
1039 return pset1<Packet4ui>(0);
1040}
1041template<> EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/)
1042{
1043 eigen_assert(false && "packet integer division are not supported by NEON");
1044 return pset1<Packet2l>(0LL);
1045}
1046template<> EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/)
1047{
1048 eigen_assert(false && "packet integer division are not supported by NEON");
1049 return pset1<Packet2ul>(0ULL);
1050}
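// [Editorial note, not part of the upstream file] NEON has no vector integer divide, so the
// pdiv stubs above only fire an assertion; they are never selected in practice because the
// integer packet_traits above leave HasDiv at its default value of 0.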
1051
1052#ifdef EIGEN_VECTORIZE_FMA
1053template <>
1054EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1055 return vfmaq_f32(c, a, b);
1056}
1057template <>
1058EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1059 return vfma_f32(c, a, b);
1060}
1061#else
1062template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
1063{
1064 return vmlaq_f32(c,a,b);
1065}
1066template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
1067{
1068 return vmla_f32(c,a,b);
1069}
1070#endif
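// [Editorial note, not part of the upstream file] Both branches compute
// pmadd(a, b, c) == a*b + c. With EIGEN_VECTORIZE_FMA the vfma*/vfmaq* intrinsics fuse the
// multiply-add with a single rounding, while the vmla* fallback may round the intermediate
// product, so the two paths can differ in the last ulp.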
1071
1072// No FMA instruction for int, so use MLA unconditionally.
1073template<> EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c)
1074{
1075 return vget_lane_s32(vreinterpret_s32_s8(vmla_s8(
1076 vreinterpret_s8_s32(vdup_n_s32(c)),
1077 vreinterpret_s8_s32(vdup_n_s32(a)),
1078 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1079}
1080template<> EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c)
1081{ return vmla_s8(c,a,b); }
1082template<> EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c)
1083{ return vmlaq_s8(c,a,b); }
1084template<> EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c)
1085{
1086 return vget_lane_u32(vreinterpret_u32_u8(vmla_u8(
1087 vreinterpret_u8_u32(vdup_n_u32(c)),
1088 vreinterpret_u8_u32(vdup_n_u32(a)),
1089 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1090}
1091template<> EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c)
1092{ return vmla_u8(c,a,b); }
1093template<> EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c)
1094{ return vmlaq_u8(c,a,b); }
1095template<> EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c)
1096{ return vmla_s16(c,a,b); }
1097template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c)
1098{ return vmlaq_s16(c,a,b); }
1099template<> EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c)
1100{ return vmla_u16(c,a,b); }
1101template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c)
1102{ return vmlaq_u16(c,a,b); }
1103template<> EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c)
1104{ return vmla_s32(c,a,b); }
1105template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c)
1106{ return vmlaq_s32(c,a,b); }
1107template<> EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c)
1108{ return vmla_u32(c,a,b); }
1109template<> EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c)
1110{ return vmlaq_u32(c,a,b); }
1111
1112template<> EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(const Packet2f& a, const Packet2f& b)
1113{ return vabd_f32(a,b); }
1114template<> EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b)
1115{ return vabdq_f32(a,b); }
1116template<> EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(const Packet4c& a, const Packet4c& b)
1117{
1118 return vget_lane_s32(vreinterpret_s32_s8(vabd_s8(
1119 vreinterpret_s8_s32(vdup_n_s32(a)),
1120 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1121}
1122template<> EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(const Packet8c& a, const Packet8c& b)
1123{ return vabd_s8(a,b); }
1124template<> EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b)
1125{ return vabdq_s8(a,b); }
1126template<> EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1127{
1128 return vget_lane_u32(vreinterpret_u32_u8(vabd_u8(
1129 vreinterpret_u8_u32(vdup_n_u32(a)),
1130 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1131}
1132template<> EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
1133{ return vabd_u8(a,b); }
1134template<> EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
1135{ return vabdq_u8(a,b); }
1136template<> EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(const Packet4s& a, const Packet4s& b)
1137{ return vabd_s16(a,b); }
1138template<> EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b)
1139{ return vabdq_s16(a,b); }
1140template<> EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(const Packet4us& a, const Packet4us& b)
1141{ return vabd_u16(a,b); }
1142template<> EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b)
1143{ return vabdq_u16(a,b); }
1144template<> EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(const Packet2i& a, const Packet2i& b)
1145{ return vabd_s32(a,b); }
1146template<> EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b)
1147{ return vabdq_s32(a,b); }
1148template<> EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
1149{ return vabd_u32(a,b); }
1150template<> EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
1151{ return vabdq_u32(a,b); }
1152
1153template<> EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmin_f32(a,b); }
1154template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
1155
1156#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
1157// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
1158template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vminnmq_f32(a, b); }
1159template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vminnm_f32(a, b); }
1160#endif
1161
1162template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmin<Packet4f>(a, b); }
1163
1164template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmin<Packet2f>(a, b); }
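// [Editorial note, not part of the upstream file] The mapping above works because plain
// vmin/vminq propagate NaNs (a NaN in either operand yields NaN), which is exactly the
// PropagateNaN policy, whereas vminnm/vminnmq implement IEEE minNum and return the numeric
// operand when only one input is NaN, as required by PropagateNumbers.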
1165
1166template<> EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b)
1167{
1168 return vget_lane_s32(vreinterpret_s32_s8(vmin_s8(
1169 vreinterpret_s8_s32(vdup_n_s32(a)),
1170 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1171}
1172template<> EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmin_s8(a,b); }
1173template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vminq_s8(a,b); }
1174template<> EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1175{
1176 return vget_lane_u32(vreinterpret_u32_u8(vmin_u8(
1177 vreinterpret_u8_u32(vdup_n_u32(a)),
1178 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1179}
1180template<> EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmin_u8(a,b); }
1181template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vminq_u8(a,b); }
1182template<> EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmin_s16(a,b); }
1183template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vminq_s16(a,b); }
1184template<> EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmin_u16(a,b); }
1185template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vminq_u16(a,b); }
1186template<> EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmin_s32(a,b); }
1187template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
1188template<> EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmin_u32(a,b); }
1189template<> EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vminq_u32(a,b); }
1190template<> EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
1191 return vcombine_s64(
1192 vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
1193 vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
1194}
1195template<> EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1196 return vcombine_u64(
1197 vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
1198 vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
1199}
1200
1201template<> EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmax_f32(a,b); }
1202template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
1203
1204#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
1205// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
1206template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxnmq_f32(a, b); }
1207template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vmaxnm_f32(a, b); }
1208#endif
1209
1210template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmax<Packet4f>(a, b); }
1211
1212template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmax<Packet2f>(a, b); }
1213
1214template<> EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(const Packet4c& a, const Packet4c& b)
1215{
1216 return vget_lane_s32(vreinterpret_s32_s8(vmax_s8(
1217 vreinterpret_s8_s32(vdup_n_s32(a)),
1218 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1219}
1220template<> EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmax_s8(a,b); }
1221template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmaxq_s8(a,b); }
1222template<> EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1223{
1224 return vget_lane_u32(vreinterpret_u32_u8(vmax_u8(
1225 vreinterpret_u8_u32(vdup_n_u32(a)),
1226 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1227}
1228template<> EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmax_u8(a,b); }
1229template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmaxq_u8(a,b); }
1230template<> EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmax_s16(a,b); }
1231template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmaxq_s16(a,b); }
1232template<> EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmax_u16(a,b); }
1233template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmaxq_u16(a,b); }
1234template<> EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmax_s32(a,b); }
1235template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
1236template<> EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmax_u32(a,b); }
1237template<> EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmaxq_u32(a,b); }
1238template<> EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
1239 return vcombine_s64(
1240 vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
1241 vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
1242}
1243template<> EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1244 return vcombine_u64(
1245 vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
1246 vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
1247}
1248
1249template<> EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(const Packet2f& a, const Packet2f& b)
1250{ return vreinterpret_f32_u32(vcle_f32(a,b)); }
1251template<> EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
1252{ return vreinterpretq_f32_u32(vcleq_f32(a,b)); }
1253template<> EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(const Packet4c& a, const Packet4c& b)
1254{
1255 return vget_lane_s32(vreinterpret_s32_u8(vcle_s8(
1256 vreinterpret_s8_s32(vdup_n_s32(a)),
1257 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1258}
1259template<> EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(const Packet8c& a, const Packet8c& b)
1260{ return vreinterpret_s8_u8(vcle_s8(a,b)); }
1261template<> EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b)
1262{ return vreinterpretq_s8_u8(vcleq_s8(a,b)); }
1263template<> EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1264{
1265 return vget_lane_u32(vreinterpret_u32_u8(vcle_u8(
1266 vreinterpret_u8_u32(vdup_n_u32(a)),
1267 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1268}
1269template<> EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
1270{ return vcle_u8(a,b); }
1271template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
1272{ return vcleq_u8(a,b); }
1273template<> EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(const Packet4s& a, const Packet4s& b)
1274{ return vreinterpret_s16_u16(vcle_s16(a,b)); }
1275template<> EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b)
1276{ return vreinterpretq_s16_u16(vcleq_s16(a,b)); }
1277template<> EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(const Packet4us& a, const Packet4us& b)
1278{ return vcle_u16(a,b); }
1279template<> EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b)
1280{ return vcleq_u16(a,b); }
1281template<> EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(const Packet2i& a, const Packet2i& b)
1282{ return vreinterpret_s32_u32(vcle_s32(a,b)); }
1283template<> EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b)
1284{ return vreinterpretq_s32_u32(vcleq_s32(a,b)); }
1285template<> EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
1286{ return vcle_u32(a,b); }
1287template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
1288{ return vcleq_u32(a,b); }
1289template<> EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b)
1290{
1291#if EIGEN_ARCH_ARM64
1292 return vreinterpretq_s64_u64(vcleq_s64(a,b));
1293#else
1294 return vcombine_s64(
1295 vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1296 vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1297#endif
1298}
1299template<> EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
1300{
1301#if EIGEN_ARCH_ARM64
1302 return vcleq_u64(a,b);
1303#else
1304 return vcombine_u64(
1305 vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1306 vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1307#endif
1308}
1309
1310template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(const Packet2f& a, const Packet2f& b)
1311{ return vreinterpret_f32_u32(vclt_f32(a,b)); }
1312template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
1313{ return vreinterpretq_f32_u32(vcltq_f32(a,b)); }
1314template<> EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(const Packet4c& a, const Packet4c& b)
1315{
1316 return vget_lane_s32(vreinterpret_s32_u8(vclt_s8(
1317 vreinterpret_s8_s32(vdup_n_s32(a)),
1318 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1319}
1320template<> EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(const Packet8c& a, const Packet8c& b)
1321{ return vreinterpret_s8_u8(vclt_s8(a,b)); }
1322template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b)
1323{ return vreinterpretq_s8_u8(vcltq_s8(a,b)); }
1324template<> EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1325{
1326 return vget_lane_u32(vreinterpret_u32_u8(vclt_u8(
1327 vreinterpret_u8_u32(vdup_n_u32(a)),
1328 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1329}
1330template<> EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
1331{ return vclt_u8(a,b); }
1332template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
1333{ return vcltq_u8(a,b); }
1334template<> EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(const Packet4s& a, const Packet4s& b)
1335{ return vreinterpret_s16_u16(vclt_s16(a,b)); }
1336template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b)
1337{ return vreinterpretq_s16_u16(vcltq_s16(a,b)); }
1338template<> EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(const Packet4us& a, const Packet4us& b)
1339{ return vclt_u16(a,b); }
1340template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b)
1341{ return vcltq_u16(a,b); }
1342template<> EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(const Packet2i& a, const Packet2i& b)
1343{ return vreinterpret_s32_u32(vclt_s32(a,b)); }
1344template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b)
1345{ return vreinterpretq_s32_u32(vcltq_s32(a,b)); }
1346template<> EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
1347{ return vclt_u32(a,b); }
1348template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
1349{ return vcltq_u32(a,b); }
1350template<> EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b)
1351{
1352#if EIGEN_ARCH_ARM64
1353 return vreinterpretq_s64_u64(vcltq_s64(a,b));
1354#else
1355 return vcombine_s64(
1356 vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1357 vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1358#endif
1359}
1360template<> EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
1361{
1362#if EIGEN_ARCH_ARM64
1363 return vcltq_u64(a,b);
1364#else
1365 return vcombine_u64(
1366 vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1367 vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1368#endif
1369}
1370
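// Element-wise equality, following the same all-ones/all-zeros mask convention
// as pcmp_lt above. The 64-bit variants need a scalar fallback on 32-bit ARM,
// where vceqq_s64/vceqq_u64 are unavailable.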
1371template<> EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(const Packet2f& a, const Packet2f& b)
1372{ return vreinterpret_f32_u32(vceq_f32(a,b)); }
1373template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
1374{ return vreinterpretq_f32_u32(vceqq_f32(a,b)); }
1375template<> EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(const Packet4c& a, const Packet4c& b)
1376{
1377 return vget_lane_s32(vreinterpret_s32_u8(vceq_s8(
1378 vreinterpret_s8_s32(vdup_n_s32(a)),
1379 vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1380}
1381template<> EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(const Packet8c& a, const Packet8c& b)
1382{ return vreinterpret_s8_u8(vceq_s8(a,b)); }
1383template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b)
1384{ return vreinterpretq_s8_u8(vceqq_s8(a,b)); }
1385template<> EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1386{
1387 return vget_lane_u32(vreinterpret_u32_u8(vceq_u8(
1388 vreinterpret_u8_u32(vdup_n_u32(a)),
1389 vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1390}
1391template<> EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
1392{ return vceq_u8(a,b); }
1393template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
1394{ return vceqq_u8(a,b); }
1395template<> EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(const Packet4s& a, const Packet4s& b)
1396{ return vreinterpret_s16_u16(vceq_s16(a,b)); }
1397template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b)
1398{ return vreinterpretq_s16_u16(vceqq_s16(a,b)); }
1399template<> EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(const Packet4us& a, const Packet4us& b)
1400{ return vceq_u16(a,b); }
1401template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b)
1402{ return vceqq_u16(a,b); }
1403template<> EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(const Packet2i& a, const Packet2i& b)
1404{ return vreinterpret_s32_u32(vceq_s32(a,b)); }
1405template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b)
1406{ return vreinterpretq_s32_u32(vceqq_s32(a,b)); }
1407template<> EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
1408{ return vceq_u32(a,b); }
1409template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
1410{ return vceqq_u32(a,b); }
1411template<> EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b)
1412{
1413#if EIGEN_ARCH_ARM64
1414 return vreinterpretq_s64_u64(vceqq_s64(a,b));
1415#else
1416 return vcombine_s64(
1417 vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1418 vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1419#endif
1420}
1421template<> EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
1422{
1423#if EIGEN_ARCH_ARM64
1424 return vceqq_u64(a,b);
1425#else
1426 return vcombine_u64(
1427 vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1428 vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1429#endif
1430}
1431
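// pcmp_lt_or_nan(a,b) is true when a < b or when either operand is NaN.
// It is implemented as NOT(a >= b): vcge returns false for any comparison
// involving NaN, so negating it with vmvn yields the desired mask.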
1432template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(const Packet2f& a, const Packet2f& b)
1433{ return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a,b))); }
1434template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b)
1435{ return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); }
1436
1437// Bitwise/logical operations are not supported on float vectors, so we reinterpret them as integer vectors using NEON intrinsics
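// The same reinterpret pattern is used for por/pxor/pandnot below. Packet4c
// and Packet4uc are stored in a plain 32-bit integer, so ordinary C++ bitwise
// operators are sufficient for them.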
1438template<> EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(const Packet2f& a, const Packet2f& b)
1439{ return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
1440template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
1441{ return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
1442template<> EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(const Packet4c& a, const Packet4c& b)
1443{ return a & b; }
1444template<> EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(const Packet8c& a, const Packet8c& b)
1445{ return vand_s8(a,b); }
1446template<> EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b)
1447{ return vandq_s8(a,b); }
1448template<> EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1449{ return a & b; }
1450template<> EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
1451{ return vand_u8(a,b); }
1452template<> EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
1453{ return vandq_u8(a,b); }
1454template<> EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(const Packet4s& a, const Packet4s& b) { return vand_s16(a,b); }
1455template<> EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) { return vandq_s16(a,b); }
1456template<> EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(const Packet4us& a, const Packet4us& b)
1457{ return vand_u16(a,b); }
1458template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b)
1459{ return vandq_u16(a,b); }
1460template<> EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(const Packet2i& a, const Packet2i& b) { return vand_s32(a,b); }
1461template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
1462template<> EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
1463{ return vand_u32(a,b); }
1464template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
1465{ return vandq_u32(a,b); }
1466template<> EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) { return vandq_s64(a,b); }
1467template<> EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
1468{ return vandq_u64(a,b); }
1469
1470template<> EIGEN_STRONG_INLINE Packet2f por<Packet2f>(const Packet2f& a, const Packet2f& b)
1471{ return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
1472template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
1473{ return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
1474template<> EIGEN_STRONG_INLINE Packet4c por<Packet4c>(const Packet4c& a, const Packet4c& b)
1475{ return a | b; }
1476template<> EIGEN_STRONG_INLINE Packet8c por<Packet8c>(const Packet8c& a, const Packet8c& b) { return vorr_s8(a,b); }
1477template<> EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b)
1478{ return vorrq_s8(a,b); }
1479template<> EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1480{ return a | b; }
1481template<> EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
1482{ return vorr_u8(a,b); }
1483template<> EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
1484{ return vorrq_u8(a,b); }
1485template<> EIGEN_STRONG_INLINE Packet4s por<Packet4s>(const Packet4s& a, const Packet4s& b)
1486{ return vorr_s16(a,b); }
1487template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b)
1488{ return vorrq_s16(a,b); }
1489template<> EIGEN_STRONG_INLINE Packet4us por<Packet4us>(const Packet4us& a, const Packet4us& b)
1490{ return vorr_u16(a,b); }
1491template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b)
1492{ return vorrq_u16(a,b); }
1493template<> EIGEN_STRONG_INLINE Packet2i por<Packet2i>(const Packet2i& a, const Packet2i& b) { return vorr_s32(a,b); }
1494template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
1495template<> EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
1496{ return vorr_u32(a,b); }
1497template<> EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
1498{ return vorrq_u32(a,b); }
1499template<> EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b)
1500{ return vorrq_s64(a,b); }
1501template<> EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
1502{ return vorrq_u64(a,b); }
1503
1504template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b)
1505{ return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
1506template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
1507{ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
1508template<> EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(const Packet4c& a, const Packet4c& b)
1509{ return a ^ b; }
1510template<> EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(const Packet8c& a, const Packet8c& b)
1511{ return veor_s8(a,b); }
1512template<> EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b)
1513{ return veorq_s8(a,b); }
1514template<> EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1515{ return a ^ b; }
1516template<> EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
1517{ return veor_u8(a,b); }
1518template<> EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
1519{ return veorq_u8(a,b); }
1520template<> EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(const Packet4s& a, const Packet4s& b) { return veor_s16(a,b); }
1521template<> EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) { return veorq_s16(a,b); }
1522template<> EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(const Packet4us& a, const Packet4us& b)
1523{ return veor_u16(a,b); }
1524template<> EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b)
1525{ return veorq_u16(a,b); }
1526template<> EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(const Packet2i& a, const Packet2i& b) { return veor_s32(a,b); }
1527template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
1528template<> EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
1529{ return veor_u32(a,b); }
1530template<> EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
1531{ return veorq_u32(a,b); }
1532template<> EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b)
1533{ return veorq_s64(a,b); }
1534template<> EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
1535{ return veorq_u64(a,b); }
1536
1537template<> EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(const Packet2f& a, const Packet2f& b)
1538{ return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
1539template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
1540{ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
1541template<> EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(const Packet4c& a, const Packet4c& b)
1542{ return a & ~b; }
1543template<> EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(const Packet8c& a, const Packet8c& b) { return vbic_s8(a,b); }
1544template<> EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) { return vbicq_s8(a,b); }
1545template<> EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
1546{ return a & ~b; }
1547template<> EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
1548{ return vbic_u8(a,b); }
1549template<> EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
1550{ return vbicq_u8(a,b); }
1551template<> EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(const Packet4s& a, const Packet4s& b)
1552{ return vbic_s16(a,b); }
1553template<> EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b)
1554{ return vbicq_s16(a,b); }
1555template<> EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(const Packet4us& a, const Packet4us& b)
1556{ return vbic_u16(a,b); }
1557template<> EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b)
1558{ return vbicq_u16(a,b); }
1559template<> EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(const Packet2i& a, const Packet2i& b)
1560{ return vbic_s32(a,b); }
1561template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b)
1562{ return vbicq_s32(a,b); }
1563template<> EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
1564{ return vbic_u32(a,b); }
1565template<> EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
1566{ return vbicq_u32(a,b); }
1567template<> EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b)
1568{ return vbicq_s64(a,b); }
1569template<> EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
1570{ return vbicq_u64(a,b); }
1571
1572
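// Compile-time shifts. parithmetic_shift_right propagates the sign bit (for
// unsigned packets it degenerates to a logical shift), plogical_shift_right
// fills with zeros, and plogical_shift_left shifts in zeros from the right.
// N must be a compile-time constant because vshr_n_*/vshl_n_* take an
// immediate shift amount. Illustrative (hypothetical values):
//   plogical_shift_right<2>(Packet4i{-4, 8, 12, 16}) -> {0x3FFFFFFF, 2, 3, 4}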
1573template<int N> EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a)
1574{ return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
1575template<int N> EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) { return vshr_n_s8(a,N); }
1576template<int N> EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) { return vshrq_n_s8(a,N); }
1577template<int N> EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a)
1578{ return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
1579template<int N> EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
1580template<int N> EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
1581template<int N> EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) { return vshr_n_s16(a,N); }
1582template<int N> EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) { return vshrq_n_s16(a,N); }
1583template<int N> EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
1584template<int N> EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
1585template<int N> EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) { return vshr_n_s32(a,N); }
1586template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return vshrq_n_s32(a,N); }
1587template<int N> EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
1588template<int N> EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
1589template<int N> EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) { return vshrq_n_s64(a,N); }
1590template<int N> EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
1591
1592template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a)
1593{ return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0); }
1594template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a)
1595{ return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a),N)); }
1596template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a)
1597{ return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a),N)); }
1598template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a)
1599{ return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
1600template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
1601template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
1602template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a)
1603{ return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a),N)); }
1604template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a)
1605{ return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a),N)); }
1606template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
1607template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
1608template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a)
1609{ return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a),N)); }
1610template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a)
1611{ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a),N)); }
1612template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
1613template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
1614template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a)
1615{ return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a),N)); }
1616template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
1617
1618template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a)
1619{ return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
1620template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) { return vshl_n_s8(a,N); }
1621template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) { return vshlq_n_s8(a,N); }
1622template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a)
1623{ return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
1624template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) { return vshl_n_u8(a,N); }
1625template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) { return vshlq_n_u8(a,N); }
1626template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) { return vshl_n_s16(a,N); }
1627template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) { return vshlq_n_s16(a,N); }
1628template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) { return vshl_n_u16(a,N); }
1629template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) { return vshlq_n_u16(a,N); }
1630template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) { return vshl_n_s32(a,N); }
1631template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return vshlq_n_s32(a,N); }
1632template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) { return vshl_n_u32(a,N); }
1633template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) { return vshlq_n_u32(a,N); }
1634template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) { return vshlq_n_s64(a,N); }
1635template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) { return vshlq_n_u64(a,N); }
1636
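// Aligned loads. On NEON the plain vld1 forms serve both the aligned and
// unaligned paths; the EIGEN_DEBUG_ALIGNED_LOAD markers are debug/instrumentation
// hooks only. Packet4c/Packet4uc live in a 32-bit scalar, hence the memcpy.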
1637template<> EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from)
1638{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); }
1639template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
1640{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
1641template<> EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from)
1642{
1643 Packet4c res;
1644 memcpy(&res, from, sizeof(Packet4c));
1645 return res;
1646}
1647template<> EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from)
1648{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); }
1649template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from)
1650{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); }
1651template<> EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from)
1652{
1653 Packet4uc res;
1654 memcpy(&res, from, sizeof(Packet4uc));
1655 return res;
1656}
1657template<> EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from)
1658{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); }
1659template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from)
1660{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); }
1661template<> EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from)
1662{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); }
1663template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from)
1664{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); }
1665template<> EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from)
1666{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); }
1667template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from)
1668{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); }
1669template<> EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from)
1670{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); }
1671template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from)
1672{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
1673template<> EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from)
1674{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); }
1675template<> EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from)
1676{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); }
1677template<> EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from)
1678{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); }
1679template<> EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from)
1680{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); }
1681
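// Unaligned loads. NEON's vld1 instructions have no alignment requirement, so
// these are identical to pload above apart from the debug markers.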
1682template<> EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(const float* from)
1683{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from); }
1684template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
1685{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
1686template<> EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(const int8_t* from)
1687{
1688 Packet4c res;
1689 memcpy(&res, from, sizeof(Packet4c));
1690 return res;
1691}
1692template<> EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(const int8_t* from)
1693{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from); }
1694template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from)
1695{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from); }
1696template<> EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(const uint8_t* from)
1697{
1698 Packet4uc res;
1699 memcpy(&res, from, sizeof(Packet4uc));
1700 return res;
1701}
1702template<> EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(const uint8_t* from)
1703{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from); }
1704template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from)
1705{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from); }
1706template<> EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(const int16_t* from)
1707{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from); }
1708template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from)
1709{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from); }
1710template<> EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(const uint16_t* from)
1711{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from); }
1712template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from)
1713{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from); }
1714template<> EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(const int32_t* from)
1715{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from); }
1716template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from)
1717{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
1718template<> EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(const uint32_t* from)
1719{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from); }
1720template<> EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from)
1721{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from); }
1722template<> EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from)
1723{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from); }
1724template<> EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from)
1725{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from); }
1726
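// ploaddup loads the first PacketSize/2 scalars and duplicates each one, e.g.
// ploaddup<Packet4f>(from) == {from[0], from[0], from[1], from[1]}.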
1727template<> EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(const float* from)
1728{ return vld1_dup_f32(from); }
1729template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
1730{ return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from+1)); }
1731template<> EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(const int8_t* from)
1732{
1733 const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload<Packet4c>(from)));
1734 return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a,a).val[0]), 0);
1735}
1736template<> EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(const int8_t* from)
1737{
1738 const int8x8_t a = vld1_s8(from);
1739 return vzip_s8(a,a).val[0];
1740}
1741template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from)
1742{
1743 const int8x8_t a = vld1_s8(from);
1744 const int8x8x2_t b = vzip_s8(a,a);
1745 return vcombine_s8(b.val[0], b.val[1]);
1746}
1747template<> EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(const uint8_t* from)
1748{
1749 const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload<Packet4uc>(from)));
1750 return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a,a).val[0]), 0);
1751}
1752template<> EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(const uint8_t* from)
1753{
1754 const uint8x8_t a = vld1_u8(from);
1755 return vzip_u8(a,a).val[0];
1756}
1757template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from)
1758{
1759 const uint8x8_t a = vld1_u8(from);
1760 const uint8x8x2_t b = vzip_u8(a,a);
1761 return vcombine_u8(b.val[0], b.val[1]);
1762}
1763template<> EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(const int16_t* from)
1764{
1765 return vreinterpret_s16_u32(vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)),
1766 vreinterpret_u32_s16(vld1_dup_s16(from+1))).val[0]);
1767}
1768template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from)
1769{
1770 const int16x4_t a = vld1_s16(from);
1771 const int16x4x2_t b = vzip_s16(a,a);
1772 return vcombine_s16(b.val[0], b.val[1]);
1773}
1774template<> EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(const uint16_t* from)
1775{
1776 return vreinterpret_u16_u32(vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)),
1777 vreinterpret_u32_u16(vld1_dup_u16(from+1))).val[0]);
1778}
1779template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from)
1780{
1781 const uint16x4_t a = vld1_u16(from);
1782 const uint16x4x2_t b = vzip_u16(a,a);
1783 return vcombine_u16(b.val[0], b.val[1]);
1784}
1785template<> EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(const int32_t* from)
1786{ return vld1_dup_s32(from); }
1787template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
1788{ return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from+1)); }
1789template<> EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(const uint32_t* from)
1790{ return vld1_dup_u32(from); }
1791template<> EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from)
1792{ return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from+1)); }
1793template<> EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from)
1794{ return vld1q_dup_s64(from); }
1795template<> EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from)
1796{ return vld1q_dup_u64(from); }
1797
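// ploadquad loads PacketSize/4 scalars and repeats each of them four times,
// e.g. ploadquad<Packet8s>(from) == {from[0] x4, from[1] x4}.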
1798template<> EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) { return vld1q_dup_f32(from); }
1799template<> EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(const int8_t* from)
1800{ return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0); }
1801template<> EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(const int8_t* from)
1802{
1803 return vreinterpret_s8_u32(vzip_u32(
1804 vreinterpret_u32_s8(vld1_dup_s8(from)),
1805 vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
1806}
1807template<> EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from)
1808{
1809 const int8x8_t a = vreinterpret_s8_u32(vzip_u32(
1810 vreinterpret_u32_s8(vld1_dup_s8(from)),
1811 vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
1812 const int8x8_t b = vreinterpret_s8_u32(vzip_u32(
1813 vreinterpret_u32_s8(vld1_dup_s8(from+2)),
1814 vreinterpret_u32_s8(vld1_dup_s8(from+3))).val[0]);
1815 return vcombine_s8(a,b);
1816}
1817template<> EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(const uint8_t* from)
1818{ return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0); }
1819template<> EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(const uint8_t* from)
1820{
1821 return vreinterpret_u8_u32(vzip_u32(
1822 vreinterpret_u32_u8(vld1_dup_u8(from)),
1823 vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
1824}
1825template<> EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from)
1826{
1827 const uint8x8_t a = vreinterpret_u8_u32(vzip_u32(
1828 vreinterpret_u32_u8(vld1_dup_u8(from)),
1829 vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
1830 const uint8x8_t b = vreinterpret_u8_u32(vzip_u32(
1831 vreinterpret_u32_u8(vld1_dup_u8(from+2)),
1832 vreinterpret_u32_u8(vld1_dup_u8(from+3))).val[0]);
1833 return vcombine_u8(a,b);
1834}
1835template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from)
1836{ return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from+1)); }
1837template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from)
1838{ return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from+1)); }
1839template<> EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) { return vld1q_dup_s32(from); }
1840template<> EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) { return vld1q_dup_u32(from); }
1841
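// Aligned and unaligned stores (pstore/pstoreu below) both map to vst1, again
// with Packet4c/Packet4uc written out via memcpy from their 32-bit storage.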
1842template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from)
1843{ EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to,from); }
1844template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
1845{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to,from); }
1846template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from)
1847{ memcpy(to, &from, sizeof(from)); }
1848template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from)
1849{ EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to,from); }
1850template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from)
1851{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to,from); }
1852template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from)
1853{ memcpy(to, &from, sizeof(from)); }
1854template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from)
1855{ EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to,from); }
1856template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from)
1857{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to,from); }
1858template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from)
1859{ EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to,from); }
1860template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from)
1861{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to,from); }
1862template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from)
1863{ EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to,from); }
1864template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from)
1865{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to,from); }
1866template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from)
1867{ EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to,from); }
1868template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from)
1869{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to,from); }
1870template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from)
1871{ EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to,from); }
1872template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from)
1873{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to,from); }
1874template<> EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from)
1875{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to,from); }
1876template<> EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from)
1877{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to,from); }
1878
1879template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2f& from)
1880{ EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to,from); }
1881template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
1882{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to,from); }
1883template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet4c& from)
1884{ memcpy(to, &from, sizeof(from)); }
1885template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet8c& from)
1886{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to,from); }
1887template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from)
1888{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to,from); }
1889template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet4uc& from)
1890{ memcpy(to, &from, sizeof(from)); }
1891template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet8uc& from)
1892{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to,from); }
1893template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from)
1894{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to,from); }
1895template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet4s& from)
1896{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to,from); }
1897template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from)
1898{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to,from); }
1899template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet4us& from)
1900{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to,from); }
1901template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from)
1902{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to,from); }
1903template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet2i& from)
1904{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to,from); }
1905template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from)
1906{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to,from); }
1907template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet2ui& from)
1908{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to,from); }
1909template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from)
1910{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to,from); }
1911template<> EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from)
1912{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to,from); }
1913template<> EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from)
1914{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to,from); }
1915
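// Strided gathers: element i is read from from[i*stride] using per-lane loads.
// A minimal usage sketch (hypothetical buffer):
//   float buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//   Packet4f p = pgather<float, Packet4f>(buf, 2);  // p == {0, 2, 4, 6}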
1916template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(const float* from, Index stride)
1917{
1918 Packet2f res = vld1_dup_f32(from);
1919 res = vld1_lane_f32(from + 1*stride, res, 1);
1920 return res;
1921}
1922template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride)
1923{
1924 Packet4f res = vld1q_dup_f32(from);
1925 res = vld1q_lane_f32(from + 1*stride, res, 1);
1926 res = vld1q_lane_f32(from + 2*stride, res, 2);
1927 res = vld1q_lane_f32(from + 3*stride, res, 3);
1928 return res;
1929}
1930template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride)
1931{
1932 Packet4c res;
1933 for (int i = 0; i != 4; i++)
1934 reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
1935 return res;
1936}
1937template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride)
1938{
1939 Packet8c res = vld1_dup_s8(from);
1940 res = vld1_lane_s8(from + 1*stride, res, 1);
1941 res = vld1_lane_s8(from + 2*stride, res, 2);
1942 res = vld1_lane_s8(from + 3*stride, res, 3);
1943 res = vld1_lane_s8(from + 4*stride, res, 4);
1944 res = vld1_lane_s8(from + 5*stride, res, 5);
1945 res = vld1_lane_s8(from + 6*stride, res, 6);
1946 res = vld1_lane_s8(from + 7*stride, res, 7);
1947 return res;
1948}
1949template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride)
1950{
1951 Packet16c res = vld1q_dup_s8(from);
1952 res = vld1q_lane_s8(from + 1*stride, res, 1);
1953 res = vld1q_lane_s8(from + 2*stride, res, 2);
1954 res = vld1q_lane_s8(from + 3*stride, res, 3);
1955 res = vld1q_lane_s8(from + 4*stride, res, 4);
1956 res = vld1q_lane_s8(from + 5*stride, res, 5);
1957 res = vld1q_lane_s8(from + 6*stride, res, 6);
1958 res = vld1q_lane_s8(from + 7*stride, res, 7);
1959 res = vld1q_lane_s8(from + 8*stride, res, 8);
1960 res = vld1q_lane_s8(from + 9*stride, res, 9);
1961 res = vld1q_lane_s8(from + 10*stride, res, 10);
1962 res = vld1q_lane_s8(from + 11*stride, res, 11);
1963 res = vld1q_lane_s8(from + 12*stride, res, 12);
1964 res = vld1q_lane_s8(from + 13*stride, res, 13);
1965 res = vld1q_lane_s8(from + 14*stride, res, 14);
1966 res = vld1q_lane_s8(from + 15*stride, res, 15);
1967 return res;
1968}
1969template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride)
1970{
1971 Packet4uc res;
1972 for (int i = 0; i != 4; i++)
1973 reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
1974 return res;
1975}
1976template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride)
1977{
1978 Packet8uc res = vld1_dup_u8(from);
1979 res = vld1_lane_u8(from + 1*stride, res, 1);
1980 res = vld1_lane_u8(from + 2*stride, res, 2);
1981 res = vld1_lane_u8(from + 3*stride, res, 3);
1982 res = vld1_lane_u8(from + 4*stride, res, 4);
1983 res = vld1_lane_u8(from + 5*stride, res, 5);
1984 res = vld1_lane_u8(from + 6*stride, res, 6);
1985 res = vld1_lane_u8(from + 7*stride, res, 7);
1986 return res;
1987}
1988template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride)
1989{
1990 Packet16uc res = vld1q_dup_u8(from);
1991 res = vld1q_lane_u8(from + 1*stride, res, 1);
1992 res = vld1q_lane_u8(from + 2*stride, res, 2);
1993 res = vld1q_lane_u8(from + 3*stride, res, 3);
1994 res = vld1q_lane_u8(from + 4*stride, res, 4);
1995 res = vld1q_lane_u8(from + 5*stride, res, 5);
1996 res = vld1q_lane_u8(from + 6*stride, res, 6);
1997 res = vld1q_lane_u8(from + 7*stride, res, 7);
1998 res = vld1q_lane_u8(from + 8*stride, res, 8);
1999 res = vld1q_lane_u8(from + 9*stride, res, 9);
2000 res = vld1q_lane_u8(from + 10*stride, res, 10);
2001 res = vld1q_lane_u8(from + 11*stride, res, 11);
2002 res = vld1q_lane_u8(from + 12*stride, res, 12);
2003 res = vld1q_lane_u8(from + 13*stride, res, 13);
2004 res = vld1q_lane_u8(from + 14*stride, res, 14);
2005 res = vld1q_lane_u8(from + 15*stride, res, 15);
2006 return res;
2007}
2008template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride)
2009{
2010 Packet4s res = vld1_dup_s16(from);
2011 res = vld1_lane_s16(from + 1*stride, res, 1);
2012 res = vld1_lane_s16(from + 2*stride, res, 2);
2013 res = vld1_lane_s16(from + 3*stride, res, 3);
2014 return res;
2015}
2016template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride)
2017{
2018 Packet8s res = vld1q_dup_s16(from);
2019 res = vld1q_lane_s16(from + 1*stride, res, 1);
2020 res = vld1q_lane_s16(from + 2*stride, res, 2);
2021 res = vld1q_lane_s16(from + 3*stride, res, 3);
2022 res = vld1q_lane_s16(from + 4*stride, res, 4);
2023 res = vld1q_lane_s16(from + 5*stride, res, 5);
2024 res = vld1q_lane_s16(from + 6*stride, res, 6);
2025 res = vld1q_lane_s16(from + 7*stride, res, 7);
2026 return res;
2027}
2028template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride)
2029{
2030 Packet4us res = vld1_dup_u16(from);
2031 res = vld1_lane_u16(from + 1*stride, res, 1);
2032 res = vld1_lane_u16(from + 2*stride, res, 2);
2033 res = vld1_lane_u16(from + 3*stride, res, 3);
2034 return res;
2035}
2036template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride)
2037{
2038 Packet8us res = vld1q_dup_u16(from);
2039 res = vld1q_lane_u16(from + 1*stride, res, 1);
2040 res = vld1q_lane_u16(from + 2*stride, res, 2);
2041 res = vld1q_lane_u16(from + 3*stride, res, 3);
2042 res = vld1q_lane_u16(from + 4*stride, res, 4);
2043 res = vld1q_lane_u16(from + 5*stride, res, 5);
2044 res = vld1q_lane_u16(from + 6*stride, res, 6);
2045 res = vld1q_lane_u16(from + 7*stride, res, 7);
2046 return res;
2047}
2048template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride)
2049{
2050 Packet2i res = vld1_dup_s32(from);
2051 res = vld1_lane_s32(from + 1*stride, res, 1);
2052 return res;
2053}
2054template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
2055{
2056 Packet4i res = vld1q_dup_s32(from);
2057 res = vld1q_lane_s32(from + 1*stride, res, 1);
2058 res = vld1q_lane_s32(from + 2*stride, res, 2);
2059 res = vld1q_lane_s32(from + 3*stride, res, 3);
2060 return res;
2061}
2062template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride)
2063{
2064 Packet2ui res = vld1_dup_u32(from);
2065 res = vld1_lane_u32(from + 1*stride, res, 1);
2066 return res;
2067}
2068template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride)
2069{
2070 Packet4ui res = vld1q_dup_u32(from);
2071 res = vld1q_lane_u32(from + 1*stride, res, 1);
2072 res = vld1q_lane_u32(from + 2*stride, res, 2);
2073 res = vld1q_lane_u32(from + 3*stride, res, 3);
2074 return res;
2075}
2076template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride)
2077{
2078 Packet2l res = vld1q_dup_s64(from);
2079 res = vld1q_lane_s64(from + 1*stride, res, 1);
2080 return res;
2081}
2082template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride)
2083{
2084 Packet2ul res = vld1q_dup_u64(from);
2085 res = vld1q_lane_u64(from + 1*stride, res, 1);
2086 return res;
2087}
2088
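// Strided scatters, the inverse of pgather: lane i is written to to[i*stride].
// A minimal usage sketch (hypothetical buffer):
//   float buf[8] = {};
//   pscatter<float, Packet4f>(buf, p, 2);  // writes the lanes of p to buf[0,2,4,6]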
2089template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride)
2090{
2091 vst1_lane_f32(to + stride*0, from, 0);
2092 vst1_lane_f32(to + stride*1, from, 1);
2093}
2094template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
2095{
2096 vst1q_lane_f32(to + stride*0, from, 0);
2097 vst1q_lane_f32(to + stride*1, from, 1);
2098 vst1q_lane_f32(to + stride*2, from, 2);
2099 vst1q_lane_f32(to + stride*3, from, 3);
2100}
2101template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride)
2102{
2103 for (int i = 0; i != 4; i++)
2104 *(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
2105}
2106template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride)
2107{
2108 vst1_lane_s8(to + stride*0, from, 0);
2109 vst1_lane_s8(to + stride*1, from, 1);
2110 vst1_lane_s8(to + stride*2, from, 2);
2111 vst1_lane_s8(to + stride*3, from, 3);
2112 vst1_lane_s8(to + stride*4, from, 4);
2113 vst1_lane_s8(to + stride*5, from, 5);
2114 vst1_lane_s8(to + stride*6, from, 6);
2115 vst1_lane_s8(to + stride*7, from, 7);
2116}
2117template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from, Index stride)
2118{
2119 vst1q_lane_s8(to + stride*0, from, 0);
2120 vst1q_lane_s8(to + stride*1, from, 1);
2121 vst1q_lane_s8(to + stride*2, from, 2);
2122 vst1q_lane_s8(to + stride*3, from, 3);
2123 vst1q_lane_s8(to + stride*4, from, 4);
2124 vst1q_lane_s8(to + stride*5, from, 5);
2125 vst1q_lane_s8(to + stride*6, from, 6);
2126 vst1q_lane_s8(to + stride*7, from, 7);
2127 vst1q_lane_s8(to + stride*8, from, 8);
2128 vst1q_lane_s8(to + stride*9, from, 9);
2129 vst1q_lane_s8(to + stride*10, from, 10);
2130 vst1q_lane_s8(to + stride*11, from, 11);
2131 vst1q_lane_s8(to + stride*12, from, 12);
2132 vst1q_lane_s8(to + stride*13, from, 13);
2133 vst1q_lane_s8(to + stride*14, from, 14);
2134 vst1q_lane_s8(to + stride*15, from, 15);
2135}
2136template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from, Index stride)
2137{
2138 for (int i = 0; i != 4; i++)
2139 *(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
2140}
2141template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from, Index stride)
2142{
2143 vst1_lane_u8(to + stride*0, from, 0);
2144 vst1_lane_u8(to + stride*1, from, 1);
2145 vst1_lane_u8(to + stride*2, from, 2);
2146 vst1_lane_u8(to + stride*3, from, 3);
2147 vst1_lane_u8(to + stride*4, from, 4);
2148 vst1_lane_u8(to + stride*5, from, 5);
2149 vst1_lane_u8(to + stride*6, from, 6);
2150 vst1_lane_u8(to + stride*7, from, 7);
2151}
2152template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from, Index stride)
2153{
2154 vst1q_lane_u8(to + stride*0, from, 0);
2155 vst1q_lane_u8(to + stride*1, from, 1);
2156 vst1q_lane_u8(to + stride*2, from, 2);
2157 vst1q_lane_u8(to + stride*3, from, 3);
2158 vst1q_lane_u8(to + stride*4, from, 4);
2159 vst1q_lane_u8(to + stride*5, from, 5);
2160 vst1q_lane_u8(to + stride*6, from, 6);
2161 vst1q_lane_u8(to + stride*7, from, 7);
2162 vst1q_lane_u8(to + stride*8, from, 8);
2163 vst1q_lane_u8(to + stride*9, from, 9);
2164 vst1q_lane_u8(to + stride*10, from, 10);
2165 vst1q_lane_u8(to + stride*11, from, 11);
2166 vst1q_lane_u8(to + stride*12, from, 12);
2167 vst1q_lane_u8(to + stride*13, from, 13);
2168 vst1q_lane_u8(to + stride*14, from, 14);
2169 vst1q_lane_u8(to + stride*15, from, 15);
2170}
2171template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from, Index stride)
2172{
2173 vst1_lane_s16(to + stride*0, from, 0);
2174 vst1_lane_s16(to + stride*1, from, 1);
2175 vst1_lane_s16(to + stride*2, from, 2);
2176 vst1_lane_s16(to + stride*3, from, 3);
2177}
2178template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from, Index stride)
2179{
2180 vst1q_lane_s16(to + stride*0, from, 0);
2181 vst1q_lane_s16(to + stride*1, from, 1);
2182 vst1q_lane_s16(to + stride*2, from, 2);
2183 vst1q_lane_s16(to + stride*3, from, 3);
2184 vst1q_lane_s16(to + stride*4, from, 4);
2185 vst1q_lane_s16(to + stride*5, from, 5);
2186 vst1q_lane_s16(to + stride*6, from, 6);
2187 vst1q_lane_s16(to + stride*7, from, 7);
2188}
2189template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from, Index stride)
2190{
2191 vst1_lane_u16(to + stride*0, from, 0);
2192 vst1_lane_u16(to + stride*1, from, 1);
2193 vst1_lane_u16(to + stride*2, from, 2);
2194 vst1_lane_u16(to + stride*3, from, 3);
2195}
2196template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from, Index stride)
2197{
2198 vst1q_lane_u16(to + stride*0, from, 0);
2199 vst1q_lane_u16(to + stride*1, from, 1);
2200 vst1q_lane_u16(to + stride*2, from, 2);
2201 vst1q_lane_u16(to + stride*3, from, 3);
2202 vst1q_lane_u16(to + stride*4, from, 4);
2203 vst1q_lane_u16(to + stride*5, from, 5);
2204 vst1q_lane_u16(to + stride*6, from, 6);
2205 vst1q_lane_u16(to + stride*7, from, 7);
2206}
2207template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from, Index stride)
2208{
2209 vst1_lane_s32(to + stride*0, from, 0);
2210 vst1_lane_s32(to + stride*1, from, 1);
2211}
2212template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
2213{
2214 vst1q_lane_s32(to + stride*0, from, 0);
2215 vst1q_lane_s32(to + stride*1, from, 1);
2216 vst1q_lane_s32(to + stride*2, from, 2);
2217 vst1q_lane_s32(to + stride*3, from, 3);
2218}
2219template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from, Index stride)
2220{
2221 vst1_lane_u32(to + stride*0, from, 0);
2222 vst1_lane_u32(to + stride*1, from, 1);
2223}
2224template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride)
2225{
2226 vst1q_lane_u32(to + stride*0, from, 0);
2227 vst1q_lane_u32(to + stride*1, from, 1);
2228 vst1q_lane_u32(to + stride*2, from, 2);
2229 vst1q_lane_u32(to + stride*3, from, 3);
2230}
2231template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride)
2232{
2233 vst1q_lane_s64(to + stride*0, from, 0);
2234 vst1q_lane_s64(to + stride*1, from, 1);
2235}
2236template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from, Index stride)
2237{
2238 vst1q_lane_u64(to + stride*0, from, 0);
2239 vst1q_lane_u64(to + stride*1, from, 1);
2240}
2241
2242template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
2243template<> EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2244template<> EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2245template<> EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2246template<> EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2247template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2248template<> EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2249template<> EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2250template<> EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
2251
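// pfirst extracts lane 0. For Packet4c/Packet4uc the low byte of the 32-bit
// scalar storage is returned directly.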
2252template<> EIGEN_STRONG_INLINE float pfirst<Packet2f>(const Packet2f& a) { return vget_lane_f32(a,0); }
2253template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return vgetq_lane_f32(a,0); }
2254template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(const Packet4c& a) { return static_cast<int8_t>(a & 0xff); }
2255template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(const Packet8c& a) { return vget_lane_s8(a,0); }
2256template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) { return vgetq_lane_s8(a,0); }
2257template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(const Packet4uc& a) { return static_cast<uint8_t>(a & 0xff); }
2258template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(const Packet8uc& a) { return vget_lane_u8(a,0); }
2259template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) { return vgetq_lane_u8(a,0); }
2260template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(const Packet4s& a) { return vget_lane_s16(a,0); }
2261template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) { return vgetq_lane_s16(a,0); }
2262template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(const Packet4us& a) { return vget_lane_u16(a,0); }
2263template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) { return vgetq_lane_u16(a,0); }
2264template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(const Packet2i& a) { return vget_lane_s32(a,0); }
2265template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { return vgetq_lane_s32(a,0); }
2266template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(a,0); }
2267template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { return vgetq_lane_u32(a,0); }
2268template<> EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) { return vgetq_lane_s64(a,0); }
2269template<> EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) { return vgetq_lane_u64(a,0); }
2270
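// preverse reverses the lane order. Full 128-bit packets are reversed within
// each 64-bit half by vrev64, then the halves are swapped with vcombine.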
2271template<> EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) { return vrev64_f32(a); }
2272template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
2273{
2274 const float32x4_t a_r64 = vrev64q_f32(a);
2275 return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64));
2276}
2277template<> EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a)
2278{ return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
2279template<> EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) { return vrev64_s8(a); }
2280template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
2281{
2282 const int8x16_t a_r64 = vrev64q_s8(a);
2283 return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64));
2284}
2285template<> EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a)
2286{ return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0); }
2287template<> EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) { return vrev64_u8(a); }
2288template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
2289{
2290 const uint8x16_t a_r64 = vrev64q_u8(a);
2291 return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64));
2292}
2293template<> EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) { return vrev64_s16(a); }
2294template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
2295{
2296 const int16x8_t a_r64 = vrev64q_s16(a);
2297 return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64));
2298}
2299template<> EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) { return vrev64_u16(a); }
2300template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
2301{
2302 const uint16x8_t a_r64 = vrev64q_u16(a);
2303 return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64));
2304}
2305template<> EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) { return vrev64_s32(a); }
2306template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
2307{
2308 const int32x4_t a_r64 = vrev64q_s32(a);
2309 return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64));
2310}
2311template<> EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) { return vrev64_u32(a); }
2312template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a)
2313{
2314 const uint32x4_t a_r64 = vrev64q_u32(a);
2315 return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64));
2316}
2317template<> EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a)
2318{ return vcombine_s64(vget_high_s64(a), vget_low_s64(a)); }
2319template<> EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a)
2320{ return vcombine_u64(vget_high_u64(a), vget_low_u64(a)); }
2321
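// Element-wise absolute value. Unsigned packets are returned unchanged, and
// Packet2l falls back to a scalar std::abs on 32-bit ARM, where vabsq_s64 is
// unavailable.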
2322template<> EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) { return vabs_f32(a); }
2323template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
2324template<> EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(const Packet4c& a)
2325{ return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
2326template<> EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) { return vabs_s8(a); }
2327template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vabsq_s8(a); }
2328template<> EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) { return a; }
2329template<> EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) { return a; }
2330template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
2331template<> EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) { return vabs_s16(a); }
2332template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vabsq_s16(a); }
2333template<> EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) { return a; }
2334template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
2335template<> EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) { return vabs_s32(a); }
2336template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
2337template<> EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) { return a; }
2338template<> EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; }
2339template<> EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
2340#if EIGEN_ARCH_ARM64
2341 return vabsq_s64(a);
2342#else
2343 return vcombine_s64(
2344 vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))),
2345 vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
2346#endif
2347}
2348template<> EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { return a; }
2349
2350template<> EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent)
2351{ return pfrexp_generic(a,exponent); }
2352template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent)
2353{ return pfrexp_generic(a,exponent); }
2354
2355template<> EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(const Packet2f& a, const Packet2f& exponent)
2356{ return pldexp_generic(a,exponent); }
2357template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent)
2358{ return pldexp_generic(a,exponent); }
2359
2360template<> EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) { return vget_lane_f32(vpadd_f32(a,a), 0); }
2361template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
2362{
2363 const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));
2364 return vget_lane_f32(vpadd_f32(sum, sum), 0);
2365}
2366template<> EIGEN_STRONG_INLINE int8_t predux<Packet4c>(const Packet4c& a)
2367{
2368 const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
2369 int8x8_t sum = vpadd_s8(a_dup, a_dup);
2370 sum = vpadd_s8(sum, sum);
2371 return vget_lane_s8(sum, 0);
2372}
2373template<> EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a)
2374{
2375 int8x8_t sum = vpadd_s8(a,a);
2376 sum = vpadd_s8(sum, sum);
2377 sum = vpadd_s8(sum, sum);
2378 return vget_lane_s8(sum, 0);
2379}
2380template<> EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a)
2381{
2382 int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a));
2383 sum = vpadd_s8(sum, sum);
2384 sum = vpadd_s8(sum, sum);
2385 sum = vpadd_s8(sum, sum);
2386 return vget_lane_s8(sum, 0);
2387}
2388template<> EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(const Packet4uc& a)
2389{
2390 const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
2391 uint8x8_t sum = vpadd_u8(a_dup, a_dup);
2392 sum = vpadd_u8(sum, sum);
2393 return vget_lane_u8(sum, 0);
2394}
2395template<> EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a)
2396{
2397 uint8x8_t sum = vpadd_u8(a,a);
2398 sum = vpadd_u8(sum, sum);
2399 sum = vpadd_u8(sum, sum);
2400 return vget_lane_u8(sum, 0);
2401}
2402template<> EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a)
2403{
2404 uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a));
2405 sum = vpadd_u8(sum, sum);
2406 sum = vpadd_u8(sum, sum);
2407 sum = vpadd_u8(sum, sum);
2408 return vget_lane_u8(sum, 0);
2409}
2410template<> EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a)
2411{
2412 const int16x4_t sum = vpadd_s16(a,a);
2413 return vget_lane_s16(vpadd_s16(sum, sum), 0);
2414}
2415template<> EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a)
2416{
2417 int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a));
2418 sum = vpadd_s16(sum, sum);
2419 sum = vpadd_s16(sum, sum);
2420 return vget_lane_s16(sum, 0);
2421}
2422template<> EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a)
2423{
2424 const uint16x4_t sum = vpadd_u16(a,a);
2425 return vget_lane_u16(vpadd_u16(sum, sum), 0);
2426}
2427template<> EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a)
2428{
2429 uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a));
2430 sum = vpadd_u16(sum, sum);
2431 sum = vpadd_u16(sum, sum);
2432 return vget_lane_u16(sum, 0);
2433}
2434template<> EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) { return vget_lane_s32(vpadd_s32(a,a), 0); }
2435template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
2436{
2437 const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a));
2438 return vget_lane_s32(vpadd_s32(sum, sum), 0);
2439}
2440template<> EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(vpadd_u32(a,a), 0); }
2441template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a)
2442{
2443 const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
2444 return vget_lane_u32(vpadd_u32(sum, sum), 0);
2445}
2446template<> EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a)
2447{ return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); }
2448template<> EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a)
2449{ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); }
2450
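// Note: the predux_half_dowto4 specializations below add the upper half of the
// packet to the lower half. For the 8-lane byte packets this is done by adding
// the vector to a copy whose two 32-bit words are swapped (vrev64_*32), so lane i
// of the low word ends up holding a[i] + a[i+4].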
2451template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a)
2452{
2453 return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a,
2454 vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
2455}
2456template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a)
2457{ return vadd_s8(vget_high_s8(a), vget_low_s8(a)); }
2458template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a)
2459{
2460 return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a,
2461 vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
2462}
2463template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a)
2464{ return vadd_u8(vget_high_u8(a), vget_low_u8(a)); }
2465template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a)
2466{ return vadd_s16(vget_high_s16(a), vget_low_s16(a)); }
2467template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a)
2468{ return vadd_u16(vget_high_u16(a), vget_low_u16(a)); }
2469
2470// Other reduction functions:
2471// mul
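// The multiplicative reductions below fold the packet in halves: first multiply
// the low half by the high half, then repeatedly pair neighbouring lanes with
// vrev16/vrev32 and multiply, until only two partial products remain; those are
// combined with a scalar multiply.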
2472template<> EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a)
2473{ return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); }
2474template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
2475{ return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
2476template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a)
2477{
2478 int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
2479 prod = vmul_s8(prod, vrev16_s8(prod));
2480 return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2);
2481}
2482template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a)
2483{
2484 int8x8_t prod = vmul_s8(a, vrev16_s8(a));
2485 prod = vmul_s8(prod, vrev32_s8(prod));
2486 return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
2487}
2488template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a)
2489{ return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
2490template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a)
2491{
2492 uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
2493 prod = vmul_u8(prod, vrev16_u8(prod));
2494 return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2);
2495}
2496template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a)
2497{
2498 uint8x8_t prod = vmul_u8(a, vrev16_u8(a));
2499 prod = vmul_u8(prod, vrev32_u8(prod));
2500 return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
2501}
2502template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a)
2503{ return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
2504template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a)
2505{
2506 const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
2507 return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
2508}
2509template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a)
2510{
2511 int16x4_t prod;
2512
2513 // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
2514 prod = vmul_s16(vget_low_s16(a), vget_high_s16(a));
2515 // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
2516 prod = vmul_s16(prod, vrev32_s16(prod));
2517 // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
2518 return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
2519}
2520template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(const Packet4us& a)
2521{
2522 const uint16x4_t prod = vmul_u16(a, vrev32_u16(a));
2523 return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
2524}
2525template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a)
2526{
2527 uint16x4_t prod;
2528
2529 // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
2530 prod = vmul_u16(vget_low_u16(a), vget_high_u16(a));
2531 // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
2532 prod = vmul_u16(prod, vrev32_u16(prod));
2533 // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
2534 return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
2535}
2536template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a)
2537{ return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); }
2538template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
2539{ return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
2540template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a)
2541{ return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); }
2542template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
2543{ return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
2544template<> EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a)
2545{ return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); }
2546template<> EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a)
2547{ return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1); }
2548
2549// min
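// The min reductions use NEON's pairwise minimum (vpmin): each fold halves the
// number of candidate lanes until the minimum sits in lane 0. The max reductions
// further below follow the same pattern with vpmax.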
2550template<> EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a)
2551{ return vget_lane_f32(vpmin_f32(a,a), 0); }
2552template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
2553{
2554 const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a));
2555 return vget_lane_f32(vpmin_f32(min, min), 0);
2556}
2557template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(const Packet4c& a)
2558{
2559 const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
2560 int8x8_t min = vpmin_s8(a_dup, a_dup);
2561 min = vpmin_s8(min, min);
2562 return vget_lane_s8(min, 0);
2563}
2564template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a)
2565{
2566 int8x8_t min = vpmin_s8(a,a);
2567 min = vpmin_s8(min, min);
2568 min = vpmin_s8(min, min);
2569 return vget_lane_s8(min, 0);
2570}
2571template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a)
2572{
2573 int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a));
2574 min = vpmin_s8(min, min);
2575 min = vpmin_s8(min, min);
2576 min = vpmin_s8(min, min);
2577 return vget_lane_s8(min, 0);
2578}
2579template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(const Packet4uc& a)
2580{
2581 const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
2582 uint8x8_t min = vpmin_u8(a_dup, a_dup);
2583 min = vpmin_u8(min, min);
2584 return vget_lane_u8(min, 0);
2585}
2586template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a)
2587{
2588 uint8x8_t min = vpmin_u8(a,a);
2589 min = vpmin_u8(min, min);
2590 min = vpmin_u8(min, min);
2591 return vget_lane_u8(min, 0);
2592}
2593template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a)
2594{
2595 uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a));
2596 min = vpmin_u8(min, min);
2597 min = vpmin_u8(min, min);
2598 min = vpmin_u8(min, min);
2599 return vget_lane_u8(min, 0);
2600}
2601template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a)
2602{
2603 const int16x4_t min = vpmin_s16(a,a);
2604 return vget_lane_s16(vpmin_s16(min, min), 0);
2605}
2606template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a)
2607{
2608 int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a));
2609 min = vpmin_s16(min, min);
2610 min = vpmin_s16(min, min);
2611 return vget_lane_s16(min, 0);
2612}
2613template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a)
2614{
2615 const uint16x4_t min = vpmin_u16(a,a);
2616 return vget_lane_u16(vpmin_u16(min, min), 0);
2617}
2618template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a)
2619{
2620 uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a));
2621 min = vpmin_u16(min, min);
2622 min = vpmin_u16(min, min);
2623 return vget_lane_u16(min, 0);
2624}
2625template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a)
2626{ return vget_lane_s32(vpmin_s32(a,a), 0); }
2627template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
2628{
2629 const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a));
2630 return vget_lane_s32(vpmin_s32(min, min), 0);
2631}
2632template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a)
2633{ return vget_lane_u32(vpmin_u32(a,a), 0); }
2634template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a)
2635{
2636 const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a));
2637 return vget_lane_u32(vpmin_u32(min, min), 0);
2638}
2639template<> EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a)
2640{ return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
2641template<> EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a)
2642{ return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
2643
2644// max
2645template<> EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a)
2646{ return vget_lane_f32(vpmax_f32(a,a), 0); }
2647template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
2648{
2649 const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a));
2650 return vget_lane_f32(vpmax_f32(max, max), 0);
2651}
2652template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(const Packet4c& a)
2653{
2654 const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
2655 int8x8_t max = vpmax_s8(a_dup, a_dup);
2656 max = vpmax_s8(max, max);
2657 return vget_lane_s8(max, 0);
2658}
2659template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a)
2660{
2661 int8x8_t max = vpmax_s8(a,a);
2662 max = vpmax_s8(max, max);
2663 max = vpmax_s8(max, max);
2664 return vget_lane_s8(max, 0);
2665}
2666template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a)
2667{
2668 int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a));
2669 max = vpmax_s8(max, max);
2670 max = vpmax_s8(max, max);
2671 max = vpmax_s8(max, max);
2672 return vget_lane_s8(max, 0);
2673}
2674template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(const Packet4uc& a)
2675{
2676 const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
2677 uint8x8_t max = vpmax_u8(a_dup, a_dup);
2678 max = vpmax_u8(max, max);
2679 return vget_lane_u8(max, 0);
2680}
2681template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a)
2682{
2683 uint8x8_t max = vpmax_u8(a,a);
2684 max = vpmax_u8(max, max);
2685 max = vpmax_u8(max, max);
2686 return vget_lane_u8(max, 0);
2687}
2688template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a)
2689{
2690 uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a));
2691 max = vpmax_u8(max, max);
2692 max = vpmax_u8(max, max);
2693 max = vpmax_u8(max, max);
2694 return vget_lane_u8(max, 0);
2695}
2696template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a)
2697{
2698 const int16x4_t max = vpmax_s16(a,a);
2699 return vget_lane_s16(vpmax_s16(max, max), 0);
2700}
2701template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a)
2702{
2703 int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a));
2704 max = vpmax_s16(max, max);
2705 max = vpmax_s16(max, max);
2706 return vget_lane_s16(max, 0);
2707}
2708template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a)
2709{
2710 const uint16x4_t max = vpmax_u16(a,a);
2711 return vget_lane_u16(vpmax_u16(max, max), 0);
2712}
2713template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a)
2714{
2715 uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a));
2716 max = vpmax_u16(max, max);
2717 max = vpmax_u16(max, max);
2718 return vget_lane_u16(max, 0);
2719}
2720template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a)
2721{ return vget_lane_s32(vpmax_s32(a,a), 0); }
2722template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
2723{
2724 const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a));
2725 return vget_lane_s32(vpmax_s32(max, max), 0);
2726}
2727template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a)
2728{ return vget_lane_u32(vpmax_u32(a,a), 0); }
2729template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a)
2730{
2731 const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a));
2732 return vget_lane_u32(vpmax_u32(max, max), 0);
2733}
2734template<> EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a)
2735{ return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
2736template<> EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a)
2737{ return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
2738
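// Returns true if any lane of x has a bit set: OR the low and high halves
// together, then take the pairwise maximum of the two remaining 32-bit lanes.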
2739template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
2740{
2741 uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
2742 vget_high_u32(vreinterpretq_u32_f32(x)));
2743 return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
2744}
2745
2746// Helpers for ptranspose.
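// The transposes below are built from interleave (zip) operations: transposing
// an NxN block of packets takes log2(N) rounds of zip_in_place calls, each round
// interleaving packets that are half as far apart as in the previous round.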
2747namespace detail {
2748
2749template<typename Packet>
2750void zip_in_place(Packet& p1, Packet& p2);
2751
2752template<>
2753EIGEN_ALWAYS_INLINE void zip_in_place<Packet2f>(Packet2f& p1, Packet2f& p2) {
2754 const float32x2x2_t tmp = vzip_f32(p1, p2);
2755 p1 = tmp.val[0];
2756 p2 = tmp.val[1];
2757}
2758
2759template<>
2760EIGEN_ALWAYS_INLINE void zip_in_place<Packet4f>(Packet4f& p1, Packet4f& p2) {
2761 const float32x4x2_t tmp = vzipq_f32(p1, p2);
2762 p1 = tmp.val[0];
2763 p2 = tmp.val[1];
2764}
2765
2766template<>
2767EIGEN_ALWAYS_INLINE void zip_in_place<Packet8c>(Packet8c& p1, Packet8c& p2) {
2768 const int8x8x2_t tmp = vzip_s8(p1, p2);
2769 p1 = tmp.val[0];
2770 p2 = tmp.val[1];
2771}
2772
2773template<>
2774EIGEN_ALWAYS_INLINE void zip_in_place<Packet16c>(Packet16c& p1, Packet16c& p2) {
2775 const int8x16x2_t tmp = vzipq_s8(p1, p2);
2776 p1 = tmp.val[0];
2777 p2 = tmp.val[1];
2778}
2779
2780template<>
2781EIGEN_ALWAYS_INLINE void zip_in_place<Packet8uc>(Packet8uc& p1, Packet8uc& p2) {
2782 const uint8x8x2_t tmp = vzip_u8(p1, p2);
2783 p1 = tmp.val[0];
2784 p2 = tmp.val[1];
2785}
2786
2787template<>
2788EIGEN_ALWAYS_INLINE void zip_in_place<Packet16uc>(Packet16uc& p1, Packet16uc& p2) {
2789 const uint8x16x2_t tmp = vzipq_u8(p1, p2);
2790 p1 = tmp.val[0];
2791 p2 = tmp.val[1];
2792}
2793
2794template<>
2795EIGEN_ALWAYS_INLINE void zip_in_place<Packet2i>(Packet2i& p1, Packet2i& p2) {
2796 const int32x2x2_t tmp = vzip_s32(p1, p2);
2797 p1 = tmp.val[0];
2798 p2 = tmp.val[1];
2799}
2800
2801template<>
2802EIGEN_ALWAYS_INLINE void zip_in_place<Packet4i>(Packet4i& p1, Packet4i& p2) {
2803 const int32x4x2_t tmp = vzipq_s32(p1, p2);
2804 p1 = tmp.val[0];
2805 p2 = tmp.val[1];
2806}
2807
2808template<>
2809EIGEN_ALWAYS_INLINE void zip_in_place<Packet2ui>(Packet2ui& p1, Packet2ui& p2) {
2810 const uint32x2x2_t tmp = vzip_u32(p1, p2);
2811 p1 = tmp.val[0];
2812 p2 = tmp.val[1];
2813}
2814
2815template<>
2816EIGEN_ALWAYS_INLINE void zip_in_place<Packet4ui>(Packet4ui& p1, Packet4ui& p2) {
2817 const uint32x4x2_t tmp = vzipq_u32(p1, p2);
2818 p1 = tmp.val[0];
2819 p2 = tmp.val[1];
2820}
2821
2822template<>
2823EIGEN_ALWAYS_INLINE void zip_in_place<Packet4s>(Packet4s& p1, Packet4s& p2) {
2824 const int16x4x2_t tmp = vzip_s16(p1, p2);
2825 p1 = tmp.val[0];
2826 p2 = tmp.val[1];
2827}
2828
2829template<>
2830EIGEN_ALWAYS_INLINE void zip_in_place<Packet8s>(Packet8s& p1, Packet8s& p2) {
2831 const int16x8x2_t tmp = vzipq_s16(p1, p2);
2832 p1 = tmp.val[0];
2833 p2 = tmp.val[1];
2834}
2835
2836template<>
2837EIGEN_ALWAYS_INLINE void zip_in_place<Packet4us>(Packet4us& p1, Packet4us& p2) {
2838 const uint16x4x2_t tmp = vzip_u16(p1, p2);
2839 p1 = tmp.val[0];
2840 p2 = tmp.val[1];
2841}
2842
2843template<>
2844EIGEN_ALWAYS_INLINE void zip_in_place<Packet8us>(Packet8us& p1, Packet8us& p2) {
2845 const uint16x8x2_t tmp = vzipq_u16(p1, p2);
2846 p1 = tmp.val[0];
2847 p2 = tmp.val[1];
2848}
2849
2850template<typename Packet>
2851EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
2852 zip_in_place(kernel.packet[0], kernel.packet[1]);
2853}
2854
2855template<typename Packet>
2856EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 4>& kernel) {
2857 zip_in_place(kernel.packet[0], kernel.packet[2]);
2858 zip_in_place(kernel.packet[1], kernel.packet[3]);
2859 zip_in_place(kernel.packet[0], kernel.packet[1]);
2860 zip_in_place(kernel.packet[2], kernel.packet[3]);
2861}
2862
2863template<typename Packet>
2864EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 8>& kernel) {
2865 zip_in_place(kernel.packet[0], kernel.packet[4]);
2866 zip_in_place(kernel.packet[1], kernel.packet[5]);
2867 zip_in_place(kernel.packet[2], kernel.packet[6]);
2868 zip_in_place(kernel.packet[3], kernel.packet[7]);
2869
2870 zip_in_place(kernel.packet[0], kernel.packet[2]);
2871 zip_in_place(kernel.packet[1], kernel.packet[3]);
2872 zip_in_place(kernel.packet[4], kernel.packet[6]);
2873 zip_in_place(kernel.packet[5], kernel.packet[7]);
2874
2875 zip_in_place(kernel.packet[0], kernel.packet[1]);
2876 zip_in_place(kernel.packet[2], kernel.packet[3]);
2877 zip_in_place(kernel.packet[4], kernel.packet[5]);
2878 zip_in_place(kernel.packet[6], kernel.packet[7]);
2879}
2880
2881template<typename Packet>
2882EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
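  // Generic 16-packet transpose: four rounds of zips (log2(16)); in round i the
  // loop pairs packet idx with packet idx + 2^(3-i), mirroring the unrolled
  // 4- and 8-packet versions above.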
2883 EIGEN_UNROLL_LOOP
2884 for (int i=0; i<4; ++i) {
2885 const int m = (1 << i);
2886 EIGEN_UNROLL_LOOP
2887 for (int j=0; j<m; ++j) {
2888 const int n = (1 << (3-i));
2889 EIGEN_UNROLL_LOOP
2890 for (int k=0; k<n; ++k) {
2891 const int idx = 2*j*n+k;
2892 zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
2893 }
2894 }
2895 }
2896}
2897
2898} // namespace detail
2899
2900EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2f, 2>& kernel) {
2901 detail::ptranspose_impl(kernel);
2902}
2903EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
2904 detail::ptranspose_impl(kernel);
2905}
2906
2907EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4c, 4>& kernel)
2908{
2909 const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
2910 const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));
2911
2912 const int8x8x2_t zip8 = vzip_s8(a,b);
2913 const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1]));
2914
2915 kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0);
2916 kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1);
2917 kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0);
2918 kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1);
2919}
2920EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 8>& kernel) {
2921 detail::ptranspose_impl(kernel);
2922}
2923EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 4>& kernel) {
2924 detail::ptranspose_impl(kernel);
2925}
2926EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
2927 detail::ptranspose_impl(kernel);
2928}
2929EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
2930 detail::ptranspose_impl(kernel);
2931}
2932EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
2933 detail::ptranspose_impl(kernel);
2934}
2935
2936EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4uc, 4>& kernel)
2937{
2938 const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
2939 const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));
2940
2941 const uint8x8x2_t zip8 = vzip_u8(a,b);
2942 const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1]));
2943
2944 kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0);
2945 kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1);
2946 kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0);
2947 kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1);
2948}
2949EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 8>& kernel) {
2950 detail::ptranspose_impl(kernel);
2951}
2952EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 4>& kernel) {
2953 detail::ptranspose_impl(kernel);
2954}
2955EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
2956 detail::ptranspose_impl(kernel);
2957}
2958EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
2959 detail::ptranspose_impl(kernel);
2960}
2961EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
2962 detail::ptranspose_impl(kernel);
2963}
2964
2965EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4s, 4>& kernel) {
2966 detail::ptranspose_impl(kernel);
2967}
2968EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
2969 detail::ptranspose_impl(kernel);
2970}
2971EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
2972 detail::ptranspose_impl(kernel);
2973}
2974
2975EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4us, 4>& kernel) {
2976 detail::ptranspose_impl(kernel);
2977}
2978EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
2979 detail::ptranspose_impl(kernel);
2980}
2981EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
2982 detail::ptranspose_impl(kernel);
2983}
2984
2985EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2i, 2>& kernel) {
2986 detail::ptranspose_impl(kernel);
2987}
2988EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
2989 detail::ptranspose_impl(kernel);
2990}
2991EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ui, 2>& kernel) {
2992 detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
2993}
2994EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
2995 detail::ptranspose_impl(kernel);
2996}
2997
2998EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
2999ptranspose(PacketBlock<Packet2l, 2>& kernel)
3000{
3001#if EIGEN_ARCH_ARM64
3002 const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
3003 kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
3004 kernel.packet[0] = tmp1;
3005#else
3006 const int64x1_t tmp[2][2] = {
3007 { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) },
3008 { vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1]) }
3009 };
3010
3011 kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]);
3012 kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
3013#endif
3014}
3015EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
3016ptranspose(PacketBlock<Packet2ul, 2>& kernel)
3017{
3018#if EIGEN_ARCH_ARM64
3019 const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
3020 kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
3021 kernel.packet[0] = tmp1;
3022#else
3023 const uint64x1_t tmp[2][2] = {
3024 { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) },
3025 { vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1]) }
3026 };
3027
3028 kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]);
3029 kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]);
3030#endif
3031}
3032
3033template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect(const Packet2f& mask, const Packet2f& a, const Packet2f& b)
3034{ return vbsl_f32(vreinterpret_u32_f32(mask), a, b); }
3035template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b)
3036{ return vbslq_f32(vreinterpretq_u32_f32(mask), a, b); }
3037template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b)
3038{ return vbsl_s8(vreinterpret_u8_s8(mask), a, b); }
3039template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b)
3040{ return vbslq_s8(vreinterpretq_u8_s8(mask), a, b); }
3041template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b)
3042{ return vbsl_u8(mask, a, b); }
3043template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, const Packet16uc& b)
3044{ return vbslq_u8(mask, a, b); }
3045template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b)
3046{ return vbsl_s16(vreinterpret_u16_s16(mask), a, b); }
3047template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b)
3048{ return vbslq_s16(vreinterpretq_u16_s16(mask), a, b); }
3049template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b)
3050{ return vbsl_u16(mask, a, b); }
3051template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b)
3052{ return vbslq_u16(mask, a, b); }
3053template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b)
3054{ return vbsl_s32(vreinterpret_u32_s32(mask), a, b); }
3055template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b)
3056{ return vbslq_s32(vreinterpretq_u32_s32(mask), a, b); }
3057template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b)
3058{ return vbsl_u32(mask, a, b); }
3059template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b)
3060{ return vbslq_u32(mask, a, b); }
3061template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b)
3062{ return vbslq_s64(vreinterpretq_u64_s64(mask), a, b); }
3063template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b)
3064{ return vbslq_u64(mask, a, b); }
3065
3066// Use armv8 rounding intrinsics if available.
3067#if EIGEN_ARCH_ARMV8
3068template<> EIGEN_STRONG_INLINE Packet2f print<Packet2f>(const Packet2f& a)
3069{ return vrndn_f32(a); }
3070
3071template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
3072{ return vrndnq_f32(a); }
3073
3074template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
3075{ return vrndm_f32(a); }
3076
3077template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
3078{ return vrndmq_f32(a); }
3079
3080template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
3081{ return vrndp_f32(a); }
3082
3083template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
3084{ return vrndpq_f32(a); }
3085
3086#else
3087
3088template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
3089 // Adds and subtracts signum(a) * 2^23 to force rounding.
3090 const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
3091 const Packet4f abs_a = pabs(a);
3092 Packet4f r = padd(abs_a, limit);
3093 // Prevent the compiler from optimizing away the addition and subtraction.
3094 EIGEN_OPTIMIZATION_BARRIER(r);
3095 r = psub(r, limit);
3096 // If greater than limit, simply return a. Otherwise, account for sign.
3097 r = pselect(pcmp_lt(abs_a, limit),
3098 pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
3099 return r;
3100}
3101
3102template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) {
3103 // Adds and subtracts signum(a) * 2^23 to force rounding.
3104 const Packet2f limit = pset1<Packet2f>(static_cast<float>(1<<23));
3105 const Packet2f abs_a = pabs(a);
3106 Packet2f r = padd(abs_a, limit);
3107 // Prevent the compiler from optimizing away the addition and subtraction.
3108 EIGEN_OPTIMIZATION_BARRIER(r);
3109 r = psub(r, limit);
3110 // If greater than limit, simply return a. Otherwise, account for sign.
3111 r = pselect(pcmp_lt(abs_a, limit),
3112 pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
3113 return r;
3114}
3115
3116template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
3117{
3118 const Packet4f cst_1 = pset1<Packet4f>(1.0f);
3119 Packet4f tmp = print<Packet4f>(a);
3120 // If greater, subtract one.
3121 Packet4f mask = pcmp_lt(a, tmp);
3122 mask = pand(mask, cst_1);
3123 return psub(tmp, mask);
3124}
3125
3126template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
3127{
3128 const Packet2f cst_1 = pset1<Packet2f>(1.0f);
3129 Packet2f tmp = print<Packet2f>(a);
3130 // If greater, subtract one.
3131 Packet2f mask = pcmp_lt(a, tmp);
3132 mask = pand(mask, cst_1);
3133 return psub(tmp, mask);
3134}
3135
3136template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
3137{
3138 const Packet4f cst_1 = pset1<Packet4f>(1.0f);
3139 Packet4f tmp = print<Packet4f>(a);
3140 // If smaller, add one.
3141 Packet4f mask = pcmp_lt(tmp, a);
3142 mask = pand(mask, cst_1);
3143 return padd(tmp, mask);
3144}
3145
3146template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
3147{
3148 const Packet2f cst_1 = pset1<Packet2f>(1.0f);
3149 Packet2f tmp = print<Packet2f>(a);
3150 // If smaller, add one.
3151 Packet2f mask = pcmp_lt(tmp, a);
3152 mask = pand(mask, cst_1);
3153 return padd(tmp, mask);
3154}
3155
3156#endif // EIGEN_ARCH_ARMV8
3157
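// The unsigned-integer psqrt specializations below compute floor(sqrt(a)) lane
// by lane with a digit-by-digit method: starting from the highest possible
// result bit, each bit is tentatively set and kept only while the squared
// candidate does not exceed a. Four iterations suffice for 8-bit lanes, eight
// for 16-bit lanes and sixteen for 32-bit lanes.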
3164template<> EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) {
3165 uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a));
3166 uint8x8_t res = vdup_n_u8(0);
3167 uint8x8_t add = vdup_n_u8(0x8);
3168 for (int i = 0; i < 4; i++)
3169 {
3170 const uint8x8_t temp = vorr_u8(res, add);
3171 res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res);
3172 add = vshr_n_u8(add, 1);
3173 }
3174 return vget_lane_u32(vreinterpret_u32_u8(res), 0);
3175}
3177template<> EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) {
3178 uint8x8_t res = vdup_n_u8(0);
3179 uint8x8_t add = vdup_n_u8(0x8);
3180 for (int i = 0; i < 4; i++)
3181 {
3182 const uint8x8_t temp = vorr_u8(res, add);
3183 res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res);
3184 add = vshr_n_u8(add, 1);
3185 }
3186 return res;
3187}
3189template<> EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
3190 uint8x16_t res = vdupq_n_u8(0);
3191 uint8x16_t add = vdupq_n_u8(0x8);
3192 for (int i = 0; i < 4; i++)
3193 {
3194 const uint8x16_t temp = vorrq_u8(res, add);
3195 res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res);
3196 add = vshrq_n_u8(add, 1);
3197 }
3198 return res;
3199}
3201template<> EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) {
3202 uint16x4_t res = vdup_n_u16(0);
3203 uint16x4_t add = vdup_n_u16(0x80);
3204 for (int i = 0; i < 8; i++)
3205 {
3206 const uint16x4_t temp = vorr_u16(res, add);
3207 res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res);
3208 add = vshr_n_u16(add, 1);
3209 }
3210 return res;
3211}
3213template<> EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
3214 uint16x8_t res = vdupq_n_u16(0);
3215 uint16x8_t add = vdupq_n_u16(0x80);
3216 for (int i = 0; i < 8; i++)
3217 {
3218 const uint16x8_t temp = vorrq_u16(res, add);
3219 res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res);
3220 add = vshrq_n_u16(add, 1);
3221 }
3222 return res;
3223}
3225template<> EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) {
3226 uint32x2_t res = vdup_n_u32(0);
3227 uint32x2_t add = vdup_n_u32(0x8000);
3228 for (int i = 0; i < 16; i++)
3229 {
3230 const uint32x2_t temp = vorr_u32(res, add);
3231 res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res);
3232 add = vshr_n_u32(add, 1);
3233 }
3234 return res;
3235}
3237template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
3238 uint32x4_t res = vdupq_n_u32(0);
3239 uint32x4_t add = vdupq_n_u32(0x8000);
3240 for (int i = 0; i < 16; i++)
3241 {
3242 const uint32x4_t temp = vorrq_u32(res, add);
3243 res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res);
3244 add = vshrq_n_u32(add, 1);
3245 }
3246 return res;
3247}
3248
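// vrsqrte gives a rough reciprocal-square-root estimate; each vrsqrts step below
// computes (3 - a*x*x)/2, so multiplying it back into x performs one
// Newton-Raphson refinement. Two refinements bring the estimate close to full
// single precision.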
3249EIGEN_STRONG_INLINE Packet4f prsqrt_float_unsafe(const Packet4f& a) {
3250 // Compute approximate reciprocal sqrt.
3251 // Does not correctly handle +/- 0 or +inf
3252 float32x4_t result = vrsqrteq_f32(a);
3253 result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
3254 result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
3255 return result;
3256}
3257
3258EIGEN_STRONG_INLINE Packet2f prsqrt_float_unsafe(const Packet2f& a) {
3259 // Compute approximate reciprocal sqrt.
3260 // Does not correctly handle +/- 0 or +inf
3261 float32x2_t result = vrsqrte_f32(a);
3262 result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
3263 result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
3264 return result;
3265}
3266
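// Fix up the inputs the fast path mishandles: for a == +/-0 return an infinity
// carrying a's sign bit (por with a), and for a == +inf force the result to +0
// (pandnot clears those lanes).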
3267template<typename Packet> Packet prsqrt_float_common(const Packet& a) {
3268 const Packet cst_zero = pzero(a);
3269 const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
3270 Packet return_zero = pcmp_eq(a, cst_inf);
3271 Packet return_inf = pcmp_eq(a, cst_zero);
3272 Packet result = prsqrt_float_unsafe(a);
3273 result = pselect(return_inf, por(cst_inf, a), result);
3274 result = pandnot(result, return_zero);
3275 return result;
3276}
3277
3278template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
3279 return prsqrt_float_common(a);
3280}
3281
3282template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
3283 return prsqrt_float_common(a);
3284}
3285
3286EIGEN_STRONG_INLINE Packet4f preciprocal(const Packet4f& a)
3287{
3288 // Compute approximate reciprocal.
3289 float32x4_t result = vrecpeq_f32(a);
3290 result = vmulq_f32(vrecpsq_f32(a, result), result);
3291 result = vmulq_f32(vrecpsq_f32(a, result), result);
3292 return result;
3293}
3294
3295EIGEN_STRONG_INLINE Packet2f preciprocal(const Packet2f& a)
3296{
3297 // Compute approximate reciprocal.
3298 float32x2_t result = vrecpe_f32(a);
3299 result = vmul_f32(vrecps_f32(a, result), result);
3300 result = vmul_f32(vrecps_f32(a, result), result);
3301 return result;
3302}
3303
3304// Unfortunately vsqrt_f32 is only available for A64.
3305#if EIGEN_ARCH_ARM64
3306template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { return vsqrtq_f32(a); }
3307
3308template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) { return vsqrt_f32(a); }
3309
3310template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return vdivq_f32(a, b); }
3311
3312template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) { return vdiv_f32(a, b); }
3313#else
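// Without vsqrt we compute sqrt(a) as a * rsqrt(a). Lanes where a is +0 or +inf
// would otherwise produce NaN (0 * inf), so those lanes simply return a unchanged.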
3314template<typename Packet>
3315EIGEN_STRONG_INLINE Packet psqrt_float_common(const Packet& a) {
3316 const Packet cst_zero = pzero(a);
3317 const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
3318
3319 Packet result = pmul(a, prsqrt_float_unsafe(a));
3320 Packet a_is_zero = pcmp_eq(a, cst_zero);
3321 Packet a_is_inf = pcmp_eq(a, cst_inf);
3322 Packet return_a = por(a_is_zero, a_is_inf);
3323
3324 result = pselect(return_a, a, result);
3325 return result;
3326}
3327
3328template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
3329 return psqrt_float_common(a);
3330}
3331
3332template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
3333 return psqrt_float_common(a);
3334}
3335
3336template<typename Packet>
3337EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) {
3338 // If b is large, the NEON reciprocal estimate flushes preciprocal(b) to zero.
3339 // Avoid that underflow by rescaling b before taking the reciprocal:
3340 // a / b = f * (a * reciprocal(f * b))
3341
3342 const Packet cst_one = pset1<Packet>(1.0f);
3343 const Packet cst_quarter = pset1<Packet>(0.25f);
3344 const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
3345
3346 Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
3347 Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
3348 Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
3349 return result;
3350}
3351
3352template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
3353 return pdiv_float_common(a, b);
3354}
3355
3356template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b) {
3357 return pdiv_float_common(a, b);
3358}
3359#endif
3360
3361//---------- bfloat16 ----------
3362// TODO: Add support for native armv8.6-a bfloat16_t
3363
3364// TODO: Guard if we have native bfloat16 support
3365typedef eigen_packet_wrapper<uint16x4_t, 19> Packet4bf;
3366
3367template<> struct is_arithmetic<Packet4bf> { enum { value = true }; };
3368
3369template<> struct packet_traits<bfloat16> : default_packet_traits
3370{
3371 typedef Packet4bf type;
3372 typedef Packet4bf half;
3373 enum
3374 {
3375 Vectorizable = 1,
3376 AlignedOnScalar = 1,
3377 size = 4,
3378 HasHalfPacket = 0,
3379
3380 HasCmp = 1,
3381 HasAdd = 1,
3382 HasSub = 1,
3383 HasShift = 1,
3384 HasMul = 1,
3385 HasNegate = 1,
3386 HasAbs = 1,
3387 HasArg = 0,
3388 HasAbs2 = 1,
3389 HasAbsDiff = 1,
3390 HasMin = 1,
3391 HasMax = 1,
3392 HasConj = 1,
3393 HasSetLinear = 0,
3394 HasBlend = 0,
3395 HasDiv = 1,
3396 HasFloor = 1,
3397 HasCeil = 1,
3398 HasRint = 1,
3399
3400 HasSin = EIGEN_FAST_MATH,
3401 HasCos = EIGEN_FAST_MATH,
3402 HasLog = 1,
3403 HasExp = 1,
3404 HasSqrt = 0,
3405 HasTanh = EIGEN_FAST_MATH,
3406 HasErf = EIGEN_FAST_MATH,
3407 HasBessel = 0, // Issues with accuracy.
3408 HasNdtri = 0
3409 };
3410};
3411
3412template<> struct unpacket_traits<Packet4bf>
3413{
3414 typedef bfloat16 type;
3415 typedef Packet4bf half;
3416 enum
3417 {
3418 size = 4,
3419 alignment = Aligned16,
3420 vectorizable = true,
3421 masked_load_available = false,
3422 masked_store_available = false
3423 };
3424};
3425
3426namespace detail {
3427template<>
3428EIGEN_ALWAYS_INLINE void zip_in_place<Packet4bf>(Packet4bf& p1, Packet4bf& p2) {
3429 const uint16x4x2_t tmp = vzip_u16(p1, p2);
3430 p1 = tmp.val[0];
3431 p2 = tmp.val[1];
3432}
3433} // namespace detail
3434
3435EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)
3436{
3437 // See the scalar implementation in BFloat16.h for a comprehensible explanation
3438 // of this fast rounding algorithm
3439 Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p));
3440
3441 // lsb = (input >> 16) & 1
3442 Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
3443
3444 // rounding_bias = 0x7fff + lsb
3445 Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff));
3446
3447 // input += rounding_bias
3448 input = vaddq_u32(input, rounding_bias);
3449
3450 // input = input >> 16
3451 input = vshrq_n_u32(input, 16);
3452
3453 // Replace float-nans by bfloat16-nans, that is 0x7fc0
3454 const Packet4ui bf16_nan = vdupq_n_u32(0x7fc0);
3455 const Packet4ui mask = vceqq_f32(p, p);
3456 input = vbslq_u32(mask, input, bf16_nan);
3457
3458 // output = static_cast<uint16_t>(input)
3459 return vmovn_u32(input);
3460}
3461
3462EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p)
3463{
3464 return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16)));
3465}
3466
3467EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
3468 return vmovn_u32(vreinterpretq_u32_f32(p));
3469}
3470
3471template<> EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
3472 return Packet4bf(pset1<Packet4us>(from.value));
3473}
3474
3475template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
3476 return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(Packet4us(from))));
3477}
3478
3479template<> EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from)
3480{
3481 return Packet4bf(pload<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
3482}
3483
3484template<> EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from)
3485{
3486 return Packet4bf(ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
3487}
3488
3489template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from)
3490{
3491 EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
3492}
3493
3494template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet4bf& from)
3495{
3496 EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
3497}
3498
3499template<> EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from)
3500{
3501 return Packet4bf(ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
3502}
3503
3504template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
3505 return F32ToBf16(pabs<Packet4f>(Bf16ToF32(a)));
3506}
3507
3508template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(const Packet4bf &a,
3509 const Packet4bf &b)
3510{
3511 return F32ToBf16(pmin<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3512}
3513template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(const Packet4bf &a,
3514 const Packet4bf &b)
3515{
3516 return F32ToBf16(pmin<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3517}
3518
3519template <> EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(const Packet4bf &a,
3520 const Packet4bf &b)
3521{
3522 return F32ToBf16(pmin<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3523}
3524
3525template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(const Packet4bf &a,
3526 const Packet4bf &b)
3527{
3528 return F32ToBf16(pmax<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3529}
3530template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(const Packet4bf &a,
3531 const Packet4bf &b)
3532{
3533 return F32ToBf16(pmax<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3534}
3535
3536template <> EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(const Packet4bf &a,
3537 const Packet4bf &b)
3538{
3539 return F32ToBf16(pmax<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3540}
3541
3542template<> EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(const bfloat16& a)
3543{
3544 return F32ToBf16(plset<Packet4f>(static_cast<float>(a)));
3545}
3546
3547template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) {
3548 return Packet4bf(por<Packet4us>(Packet4us(a), Packet4us(b)));
3549}
3550
3551template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) {
3552 return Packet4bf(pxor<Packet4us>(Packet4us(a), Packet4us(b)));
3553}
3554
3555template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) {
3556 return Packet4bf(pand<Packet4us>(Packet4us(a), Packet4us(b)));
3557}
3558
3559template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) {
3560 return Packet4bf(pandnot<Packet4us>(Packet4us(a), Packet4us(b)));
3561}
3562
3563template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a,
3564 const Packet4bf& b)
3565{
3566 return Packet4bf(pselect<Packet4us>(Packet4us(mask), Packet4us(a), Packet4us(b)));
3567}
3568
3569template<> EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a)
3570{
3571 return F32ToBf16(print<Packet4f>(Bf16ToF32(a)));
3572}
3573
3574template<> EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a)
3575{
3576 return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
3577}
3578
3579template<> EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(const Packet4bf& a)
3580{
3581 return F32ToBf16(pceil<Packet4f>(Bf16ToF32(a)));
3582}
3583
3584template<> EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) { return a; }
3585
3586template<> EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
3587 return F32ToBf16(padd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3588}
3589
3590template<> EIGEN_STRONG_INLINE Packet4bf psub<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
3591 return F32ToBf16(psub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3592}
3593
3594template<> EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
3595 return F32ToBf16(pmul<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3596}
3597
3598template<> EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
3599 return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3600}
3601
3602template<>
3603EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride)
3604{
3605 return Packet4bf(pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride));
3606}
3607
3608template<>
3609EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride)
3610{
3611 pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), Packet4us(from), stride);
3612}
3613
3614template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a)
3615{
3616 return static_cast<bfloat16>(predux<Packet4f>(Bf16ToF32(a)));
3617}
3618
3619template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet4bf>(const Packet4bf& a)
3620{
3621 return static_cast<bfloat16>(predux_max<Packet4f>(Bf16ToF32(a)));
3622}
3623
3624template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet4bf>(const Packet4bf& a)
3625{
3626 return static_cast<bfloat16>(predux_min<Packet4f>(Bf16ToF32(a)));
3627}
3628
3629template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(const Packet4bf& a)
3630{
3631 return static_cast<bfloat16>(predux_mul<Packet4f>(Bf16ToF32(a)));
3632}
3633
3634template<> EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a)
3635{
3636 return Packet4bf(preverse<Packet4us>(Packet4us(a)));
3637}
3638
3639EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel)
3640{
3641 detail::ptranspose_impl(kernel);
3642}
3643
3644template<> EIGEN_STRONG_INLINE Packet4bf pabsdiff<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
3645{
3646 return F32ToBf16(pabsdiff<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3647}
3648
3649template<> EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
3650{
3651 return F32MaskToBf16Mask(pcmp_eq<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3652}
3653
3654template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
3655{
3656 return F32MaskToBf16Mask(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3657}
3658
3659template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
3660{
3661 return F32MaskToBf16Mask(pcmp_lt_or_nan<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3662}
3663
3664template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
3665{
3666 return F32MaskToBf16Mask(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
3667}
3668
3669template<> EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a)
3670{
3671 return Packet4bf(pxor<Packet4us>(Packet4us(a), pset1<Packet4us>(static_cast<uint16_t>(0x8000))));
3672}
3673
3674//---------- double ----------
3675
3676// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
3677// Confirmed at least with __apple_build_version__ = 6000054.
3678#ifdef __apple_build_version__
3679// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
3680// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
3681// major toolchain updates.
3682#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
3683#else
3684#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
3685#endif
3686
3687#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
3688
3689#if EIGEN_COMP_GNUC
3690// Bug 907: workaround missing declarations of the following two functions in the ADK
3691// Defining these functions as templates ensures that if these intrinsics are
3692// already defined in arm_neon.h, then our workaround doesn't cause a conflict
3693// and has lower priority in overload resolution.
3694// This doesn't work with MSVC though, since the function names are macros.
3695template <typename T> uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; }
3696template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; }
3697#endif
3698
3699#if EIGEN_COMP_MSVC_STRICT
3700typedef eigen_packet_wrapper<float64x2_t, 18> Packet2d;
3701typedef eigen_packet_wrapper<float64x1_t, 19> Packet1d;
3702
3703EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
3704 double from[2] = {a, b};
3705 return vld1q_f64(from);
3706}
3707
3708#else
3709typedef float64x2_t Packet2d;
3710typedef float64x1_t Packet1d;
3711
3712EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
3713 double from[2] = {a, b};
3714 return vld1q_f64(from);
3715}
3716#endif
3717
3718// functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
3719// Currently used in LU/arch/InverseSize4.h to enable a shared implementation
3720// for fast inversion of matrices of size 4.
3721EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask)
3722{
3723 const double* a = reinterpret_cast<const double*>(&m);
3724 const double* b = reinterpret_cast<const double*>(&n);
3725 Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
3726 return res;
3727}
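// Example: bit 0 of the mask selects the lane taken from m and bit 1 the lane
// taken from n, so shuffle(m, n, 0) gives {m[0], n[0]} and shuffle(m, n, 3) gives
// {m[1], n[1]}, matching vec2d_unpacklo / vec2d_unpackhi below.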
3728
3729EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask)
3730{
3731 return shuffle(a, b, mask);
3732}
3733EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a,const Packet2d& b)
3734{
3735 return shuffle(a, b, 0);
3736}
3737EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b)
3738{
3739 return shuffle(a, b, 3);
3740}
3741#define vec2d_duplane(a, p) \
3742 Packet2d(vdupq_laneq_f64(a, p))
3743
3744template<> struct packet_traits<double> : default_packet_traits
3745{
3746 typedef Packet2d type;
3747 typedef Packet2d half;
3748 enum
3749 {
3750 Vectorizable = 1,
3751 AlignedOnScalar = 1,
3752 size = 2,
3753 HasHalfPacket = 0,
3754
3755 HasCmp = 1,
3756 HasAdd = 1,
3757 HasSub = 1,
3758 HasShift = 1,
3759 HasMul = 1,
3760 HasNegate = 1,
3761 HasAbs = 1,
3762 HasArg = 0,
3763 HasAbs2 = 1,
3764 HasAbsDiff = 1,
3765 HasMin = 1,
3766 HasMax = 1,
3767 HasConj = 1,
3768 HasSetLinear = 0,
3769 HasBlend = 0,
3770
3771 HasDiv = 1,
3772 HasFloor = 1,
3773 HasCeil = 1,
3774 HasRint = 1,
3775
3776 HasSin = 0,
3777 HasCos = 0,
3778 HasLog = 1,
3779 HasExp = 1,
3780 HasSqrt = 1,
3781 HasRsqrt = 1,
3782 HasTanh = 0,
3783 HasErf = 0
3784 };
3785};
3786
3787template<> struct unpacket_traits<Packet2d>
3788{
3789 typedef double type;
3790 typedef Packet2d half;
3791 typedef Packet2l integer_packet;
3792 enum
3793 {
3794 size = 2,
3795 alignment = Aligned16,
3796 vectorizable = true,
3797 masked_load_available = false,
3798 masked_store_available = false
3799 };
3800};
3801
3802template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); }
3803
3804template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
3805{
3806 const double c[] = {0.0,1.0};
3807 return vaddq_f64(pset1<Packet2d>(a), vld1q_f64(c));
3808}
3809
3810template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
3811
3812template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
3813
3814template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
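// paddsub flips the sign of b's first lane by XOR-ing it with a sign-bit mask and then adds,
// i.e. the result is {a[0] - b[0], a[1] + b[1]}.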
3815template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){
3816 const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
3817 return padd(a, pxor(mask, b));
3818}
3819
3820template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
3821
3822template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
3823
3824template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
3825
3826template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
3827
3828#ifdef EIGEN_VECTORIZE_FMA
3829// See bug 936. See above comment about FMA for float.
3830template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
3831{ return vfmaq_f64(c,a,b); }
3832#else
3833template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
3834{ return vmlaq_f64(c,a,b); }
3835#endif
3836
3837template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
3838
3839#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
3840// Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
3841template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vminnmq_f64(a, b); }
3842template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxnmq_f64(a, b); }
3843
3844#endif
3845
3846template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmin<Packet2d>(a, b); }
3847
3848template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
3849
3850
3851template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmax<Packet2d>(a, b); }
3852
3853// Logical operations are not supported for double, so we reinterpret to unsigned 64-bit vectors and use the NEON integer intrinsics
3854template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
3855{ return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
3856
3857template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
3858{ return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
3859
3860template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
3861{ return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
3862
3863template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
3864{ return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
3865
3866template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b)
3867{ return vreinterpretq_f64_u64(vcleq_f64(a,b)); }
3868
3869template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b)
3870{ return vreinterpretq_f64_u64(vcltq_f64(a,b)); }
3871
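// There is no 64-bit vmvn, so the a >= b mask is inverted through its 32-bit view; each
// 64-bit lane is either all ones or all zeros, so the result is identical.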
3872template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b)
3873{ return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a,b)))); }
3874
3875template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b)
3876{ return vreinterpretq_f64_u64(vceqq_f64(a,b)); }
3877
3878template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
3879{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
3880
3881template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
3882{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
3883
3884template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) { return vld1q_dup_f64(from); }
3885template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
3886{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to,from); }
3887
3888template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
3889{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to,from); }
3890
3891template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride)
3892{
3893 Packet2d res = pset1<Packet2d>(0.0);
3894 res = vld1q_lane_f64(from + 0*stride, res, 0);
3895 res = vld1q_lane_f64(from + 1*stride, res, 1);
3896 return res;
3897}
3898
3899template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
3900{
3901 vst1q_lane_f64(to + stride*0, from, 0);
3902 vst1q_lane_f64(to + stride*1, from, 1);
3903}
3904
3905template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
3906
3907// FIXME: only store the first 2 elements?
3908template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a,0); }
3909
3910template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
3911{ return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
3912
3913template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
3914
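// predux sums the two lanes, i.e. a[0] + a[1]; the Apple-Clang variant indexes the one-lane
// sum directly instead of calling vget_lane_f64, working around the ICE from bug 907.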
3915#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
3916// workaround ICE, see bug 907
3917template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
3918{ return (vget_low_f64(a) + vget_high_f64(a))[0]; }
3919#else
3920template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
3921{ return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
3922#endif
3923
3924// Other reduction functions:
3925// mul
3926#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
3927template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
3928{ return (vget_low_f64(a) * vget_high_f64(a))[0]; }
3929#else
3930template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
3931{ return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0); }
3932#endif
3933
3934// min
3935template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
3936{ return vgetq_lane_f64(vpminq_f64(a,a), 0); }
3937
3938// max
3939template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
3940{ return vgetq_lane_f64(vpmaxq_f64(a,a), 0); }
3941
3942
3943EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
3944ptranspose(PacketBlock<Packet2d, 2>& kernel)
3945{
3946 const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
3947 const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
3948
3949 kernel.packet[0] = tmp1;
3950 kernel.packet[1] = tmp2;
3951}
3952
3953template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b)
3954{ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
3955
3956template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
3957{ return vrndnq_f64(a); }
3958
3959template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
3960{ return vrndmq_f64(a); }
3961
3962template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
3963{ return vrndpq_f64(a); }
3964
3965template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent)
3966{ return pldexp_generic(a, exponent); }
3967
3968template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent)
3969{ return pfrexp_generic(a,exponent); }
3970
3971template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from)
3972{ return vreinterpretq_f64_u64(vdupq_n_u64(from)); }
3973
3974template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
3975 // Compute approximate reciprocal sqrt.
3976 Packet2d x = vrsqrteq_f64(a);
3977 // Do Newton iterations for 1/sqrt(a).
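  // Each iteration refines x <- x * (3 - a*x*x) / 2; vrsqrtsq_f64(p, q) computes (3 - p*q) / 2.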
3978 x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
3979 x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
3980 x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
3981 const Packet2d infinity = pset1<Packet2d>(NumTraits<double>::infinity());
3982 return pselect(pcmp_eq(a, pzero(a)), infinity, x);
3983}
3984
3985template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); }
3986
3987#endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
3988
3989// Do we have fp16 types and the supporting NEON intrinsics?
3990#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
3991typedef float16x4_t Packet4hf;
3992typedef float16x8_t Packet8hf;
3993
3994template <>
3995struct packet_traits<Eigen::half> : default_packet_traits {
3996 typedef Packet8hf type;
3997 typedef Packet4hf half;
3998 enum {
3999 Vectorizable = 1,
4000 AlignedOnScalar = 1,
4001 size = 8,
4002 HasHalfPacket = 1,
4003
4004 HasCmp = 1,
4005 HasCast = 1,
4006 HasAdd = 1,
4007 HasSub = 1,
4008 HasShift = 1,
4009 HasMul = 1,
4010 HasNegate = 1,
4011 HasAbs = 1,
4012 HasArg = 0,
4013 HasAbs2 = 1,
4014 HasAbsDiff = 0,
4015 HasMin = 1,
4016 HasMax = 1,
4017 HasConj = 1,
4018 HasSetLinear = 0,
4019 HasBlend = 0,
4020 HasInsert = 1,
4021 HasReduxp = 1,
4022 HasDiv = 1,
4023 HasFloor = 1,
4024 HasCeil = 1,
4025 HasRint = 1,
4026 HasSin = 0,
4027 HasCos = 0,
4028 HasLog = 0,
4029 HasExp = 0,
4030 HasSqrt = 1,
4031 HasRsqrt = 1,
4032 HasErf = EIGEN_FAST_MATH,
4033 HasBessel = 0, // Issues with accuracy.
4034 HasNdtri = 0
4035 };
4036};
4037
4038template <>
4039struct unpacket_traits<Packet4hf> {
4040 typedef Eigen::half type;
4041 typedef Packet4hf half;
4042 enum {
4043 size = 4,
4044 alignment = Aligned16,
4045 vectorizable = true,
4046 masked_load_available = false,
4047 masked_store_available = false
4048 };
4049};
4050
4051template <>
4052struct unpacket_traits<Packet8hf> {
4053 typedef Eigen::half type;
4054 typedef Packet4hf half;
4055 enum {
4056 size = 8,
4057 alignment = Aligned16,
4058 vectorizable = true,
4059 masked_load_available = false,
4060 masked_store_available = false
4061 };
4062};
4063
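// Reduce an 8-lane half packet to 4 lanes by adding its low and high halves.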
4064template<>
4065EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(const Packet8hf& a) {
4066 return vadd_f16(vget_low_f16(a), vget_high_f16(a));
4067}
4068
4069template <>
4070EIGEN_STRONG_INLINE Packet8hf pset1<Packet8hf>(const Eigen::half& from) {
4071 return vdupq_n_f16(from.x);
4072}
4073
4074template <>
4075EIGEN_STRONG_INLINE Packet4hf pset1<Packet4hf>(const Eigen::half& from) {
4076 return vdup_n_f16(from.x);
4077}
4078
4079template <>
4080EIGEN_STRONG_INLINE Packet8hf plset<Packet8hf>(const Eigen::half& a) {
4081 const float16_t f[] = {0, 1, 2, 3, 4, 5, 6, 7};
4082 Packet8hf countdown = vld1q_f16(f);
4083 return vaddq_f16(pset1<Packet8hf>(a), countdown);
4084}
4085
4086template <>
4087EIGEN_STRONG_INLINE Packet4hf plset<Packet4hf>(const Eigen::half& a) {
4088 const float16_t f[] = {0, 1, 2, 3};
4089 Packet4hf countdown = vld1_f16(f);
4090 return vadd_f16(pset1<Packet4hf>(a), countdown);
4091}
4092
4093template <>
4094EIGEN_STRONG_INLINE Packet8hf padd<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4095 return vaddq_f16(a, b);
4096}
4097
4098template <>
4099EIGEN_STRONG_INLINE Packet4hf padd<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4100 return vadd_f16(a, b);
4101}
4102
4103template <>
4104EIGEN_STRONG_INLINE Packet8hf psub<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4105 return vsubq_f16(a, b);
4106}
4107
4108template <>
4109EIGEN_STRONG_INLINE Packet4hf psub<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4110 return vsub_f16(a, b);
4111}
4112
4113template <>
4114EIGEN_STRONG_INLINE Packet8hf pnegate(const Packet8hf& a) {
4115 return vnegq_f16(a);
4116}
4117
4118template <>
4119EIGEN_STRONG_INLINE Packet4hf pnegate(const Packet4hf& a) {
4120 return vneg_f16(a);
4121}
4122
4123template <>
4124EIGEN_STRONG_INLINE Packet8hf pconj(const Packet8hf& a) {
4125 return a;
4126}
4127
4128template <>
4129EIGEN_STRONG_INLINE Packet4hf pconj(const Packet4hf& a) {
4130 return a;
4131}
4132
4133template <>
4134EIGEN_STRONG_INLINE Packet8hf pmul<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4135 return vmulq_f16(a, b);
4136}
4137
4138template <>
4139EIGEN_STRONG_INLINE Packet4hf pmul<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4140 return vmul_f16(a, b);
4141}
4142
4143template <>
4144EIGEN_STRONG_INLINE Packet8hf pdiv<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4145 return vdivq_f16(a, b);
4146}
4147
4148template <>
4149EIGEN_STRONG_INLINE Packet4hf pdiv<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4150 return vdiv_f16(a, b);
4151}
4152
4153template <>
4154EIGEN_STRONG_INLINE Packet8hf pmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
4155 return vfmaq_f16(c, a, b);
4156}
4157
4158template <>
4159EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
4160 return vfma_f16(c, a, b);
4161}
4162
4163template <>
4164EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4165 return vminq_f16(a, b);
4166}
4167
4168template <>
4169EIGEN_STRONG_INLINE Packet4hf pmin<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4170 return vmin_f16(a, b);
4171}
4172
4173#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
4174// Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
4175template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vminnm_f16(a, b); }
4176template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vminnmq_f16(a, b); }
4177#endif
4178
4179template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmin<Packet4hf>(a, b); }
4180
4181template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmin<Packet8hf>(a, b); }
4182
4183template <>
4184EIGEN_STRONG_INLINE Packet8hf pmax<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4185 return vmaxq_f16(a, b);
4186}
4187
4188template <>
4189EIGEN_STRONG_INLINE Packet4hf pmax<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4190 return vmax_f16(a, b);
4191}
4192
4193#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
4194// Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
4195template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vmaxnm_f16(a, b); }
4196template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vmaxnmq_f16(a, b); }
4197#endif
4198
4199template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmax<Packet4hf>(a, b); }
4200
4201template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmax<Packet8hf>(a, b); }
4202
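// The macros below generate pcmp_eq, pcmp_lt and pcmp_le by splicing the comparison name into
// the corresponding vc<name>q_f16 / vc<name>_f16 intrinsic and reinterpreting the u16 mask as f16.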
4203#define EIGEN_MAKE_ARM_FP16_CMP_8(name) \
4204 template <> \
4205 EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \
4206 return vreinterpretq_f16_u16(vc##name##q_f16(a, b)); \
4207 }
4208
4209#define EIGEN_MAKE_ARM_FP16_CMP_4(name) \
4210 template <> \
4211 EIGEN_STRONG_INLINE Packet4hf pcmp_##name(const Packet4hf& a, const Packet4hf& b) { \
4212 return vreinterpret_f16_u16(vc##name##_f16(a, b)); \
4213 }
4214
4215EIGEN_MAKE_ARM_FP16_CMP_8(eq)
4216EIGEN_MAKE_ARM_FP16_CMP_8(lt)
4217EIGEN_MAKE_ARM_FP16_CMP_8(le)
4218
4219EIGEN_MAKE_ARM_FP16_CMP_4(eq)
4220EIGEN_MAKE_ARM_FP16_CMP_4(lt)
4221EIGEN_MAKE_ARM_FP16_CMP_4(le)
4222
4223#undef EIGEN_MAKE_ARM_FP16_CMP_8
4224#undef EIGEN_MAKE_ARM_FP16_CMP_4
4225
4226template <>
4227EIGEN_STRONG_INLINE Packet8hf pcmp_lt_or_nan<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4228 return vreinterpretq_f16_u16(vmvnq_u16(vcgeq_f16(a, b)));
4229}
4230
4231template <>
4232EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4233 return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b)));
4234}
4235
4236template <>
4237EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(const Packet8hf& a)
4238{ return vrndnq_f16(a); }
4239
4240template <>
4241EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(const Packet4hf& a)
4242{ return vrndn_f16(a); }
4243
4244template <>
4245EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a)
4246{ return vrndmq_f16(a); }
4247
4248template <>
4249EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a)
4250{ return vrndm_f16(a); }
4251
4252template <>
4253EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a)
4254{ return vrndpq_f16(a); }
4255
4256template <>
4257EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a)
4258{ return vrndp_f16(a); }
4259
4260template <>
4261EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(const Packet8hf& a) {
4262 return vsqrtq_f16(a);
4263}
4264
4265template <>
4266EIGEN_STRONG_INLINE Packet4hf psqrt<Packet4hf>(const Packet4hf& a) {
4267 return vsqrt_f16(a);
4268}
4269
4270template <>
4271EIGEN_STRONG_INLINE Packet8hf pand<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4272 return vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
4273}
4274
4275template <>
4276EIGEN_STRONG_INLINE Packet4hf pand<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4277 return vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
4278}
4279
4280template <>
4281EIGEN_STRONG_INLINE Packet8hf por<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4282 return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
4283}
4284
4285template <>
4286EIGEN_STRONG_INLINE Packet4hf por<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4287 return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
4288}
4289
4290template <>
4291EIGEN_STRONG_INLINE Packet8hf pxor<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4292 return vreinterpretq_f16_u16(veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
4293}
4294
4295template <>
4296EIGEN_STRONG_INLINE Packet4hf pxor<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4297 return vreinterpret_f16_u16(veor_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
4298}
4299
4300template <>
4301EIGEN_STRONG_INLINE Packet8hf pandnot<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
4302 return vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
4303}
4304
4305template <>
4306EIGEN_STRONG_INLINE Packet4hf pandnot<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
4307 return vreinterpret_f16_u16(vbic_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
4308}
4309
4310template <>
4311EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(const Eigen::half* from) {
4312 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
4313}
4314
4315template <>
4316EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(const Eigen::half* from) {
4317 EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
4318}
4319
4320template <>
4321EIGEN_STRONG_INLINE Packet8hf ploadu<Packet8hf>(const Eigen::half* from) {
4322 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
4323}
4324
4325template <>
4326EIGEN_STRONG_INLINE Packet4hf ploadu<Packet4hf>(const Eigen::half* from) {
4327 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
4328}
4329
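// Duplicate each of the first four halves, i.e. return {h0, h0, h1, h1, h2, h2, h3, h3}.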
4330template <>
4331EIGEN_STRONG_INLINE Packet8hf ploaddup<Packet8hf>(const Eigen::half* from) {
4332 Packet8hf packet;
4333 packet[0] = from[0].x;
4334 packet[1] = from[0].x;
4335 packet[2] = from[1].x;
4336 packet[3] = from[1].x;
4337 packet[4] = from[2].x;
4338 packet[5] = from[2].x;
4339 packet[6] = from[3].x;
4340 packet[7] = from[3].x;
4341 return packet;
4342}
4343
4344template <>
4345EIGEN_STRONG_INLINE Packet4hf ploaddup<Packet4hf>(const Eigen::half* from) {
4346 float16x4_t packet;
4347 float16_t* tmp;
4348 tmp = (float16_t*)&packet;
4349 tmp[0] = from[0].x;
4350 tmp[1] = from[0].x;
4351 tmp[2] = from[1].x;
4352 tmp[3] = from[1].x;
4353 return packet;
4354}
4355
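// Replicate each of the first two halves four times, i.e. return {h0, h0, h0, h0, h1, h1, h1, h1}.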
4356template <>
4357EIGEN_STRONG_INLINE Packet8hf ploadquad<Packet8hf>(const Eigen::half* from) {
4358 Packet4hf lo, hi;
4359 lo = vld1_dup_f16(reinterpret_cast<const float16_t*>(from));
4360 hi = vld1_dup_f16(reinterpret_cast<const float16_t*>(from+1));
4361 return vcombine_f16(lo, hi);
4362}
4363
4364EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 0); }
4365
4366EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 0); }
4367
4368template <>
4369EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) {
4370 return vbslq_f16(vreinterpretq_u16_f16(mask), a, b);
4371}
4372
4373template <>
4374EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(const Packet4hf& mask, const Packet4hf& a, const Packet4hf& b) {
4375 return vbsl_f16(vreinterpret_u16_f16(mask), a, b);
4376}
4377
4378EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 7); }
4379
4380EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 3); }
4381
4382template <>
4383EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
4384 EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
4385}
4386
4387template <>
4388EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
4389 EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
4390}
4391
4392template <>
4393EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
4394 EIGEN_DEBUG_UNALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
4395}
4396
4397template <>
4398EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
4399 EIGEN_DEBUG_UNALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
4400}
4401
4402template <>
4403EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather<Eigen::half, Packet8hf>(const Eigen::half* from, Index stride) {
4404 Packet8hf res = pset1<Packet8hf>(Eigen::half(0.f));
4405 res = vsetq_lane_f16(from[0 * stride].x, res, 0);
4406 res = vsetq_lane_f16(from[1 * stride].x, res, 1);
4407 res = vsetq_lane_f16(from[2 * stride].x, res, 2);
4408 res = vsetq_lane_f16(from[3 * stride].x, res, 3);
4409 res = vsetq_lane_f16(from[4 * stride].x, res, 4);
4410 res = vsetq_lane_f16(from[5 * stride].x, res, 5);
4411 res = vsetq_lane_f16(from[6 * stride].x, res, 6);
4412 res = vsetq_lane_f16(from[7 * stride].x, res, 7);
4413 return res;
4414}
4415
4416template <>
4417EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather<Eigen::half, Packet4hf>(const Eigen::half* from, Index stride) {
4418 Packet4hf res = pset1<Packet4hf>(Eigen::half(0.f));
4419 res = vset_lane_f16(from[0 * stride].x, res, 0);
4420 res = vset_lane_f16(from[1 * stride].x, res, 1);
4421 res = vset_lane_f16(from[2 * stride].x, res, 2);
4422 res = vset_lane_f16(from[3 * stride].x, res, 3);
4423 return res;
4424}
4425
4426template <>
4427EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from, Index stride) {
4428 to[stride * 0].x = vgetq_lane_f16(from, 0);
4429 to[stride * 1].x = vgetq_lane_f16(from, 1);
4430 to[stride * 2].x = vgetq_lane_f16(from, 2);
4431 to[stride * 3].x = vgetq_lane_f16(from, 3);
4432 to[stride * 4].x = vgetq_lane_f16(from, 4);
4433 to[stride * 5].x = vgetq_lane_f16(from, 5);
4434 to[stride * 6].x = vgetq_lane_f16(from, 6);
4435 to[stride * 7].x = vgetq_lane_f16(from, 7);
4436}
4437
4438template <>
4439EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from, Index stride) {
4440 to[stride * 0].x = vget_lane_f16(from, 0);
4441 to[stride * 1].x = vget_lane_f16(from, 1);
4442 to[stride * 2].x = vget_lane_f16(from, 2);
4443 to[stride * 3].x = vget_lane_f16(from, 3);
4444}
4445
4446template <>
4447EIGEN_STRONG_INLINE void prefetch<Eigen::half>(const Eigen::half* addr) {
4448 EIGEN_ARM_PREFETCH(addr);
4449}
4450
4451template <>
4452EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8hf>(const Packet8hf& a) {
4453 float16_t x[8];
4454 vst1q_f16(x, a);
4455 Eigen::half h;
4456 h.x = x[0];
4457 return h;
4458}
4459
4460template <>
4461EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4hf>(const Packet4hf& a) {
4462 float16_t x[4];
4463 vst1_f16(x, a);
4464 Eigen::half h;
4465 h.x = x[0];
4466 return h;
4467}
4468
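// Reverse the lanes within each 64-bit half, then swap the halves to reverse all 8 lanes.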
4469template<> EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) {
4470 float16x4_t a_lo, a_hi;
4471 Packet8hf a_r64;
4472
4473 a_r64 = vrev64q_f16(a);
4474 a_lo = vget_low_f16(a_r64);
4475 a_hi = vget_high_f16(a_r64);
4476 return vcombine_f16(a_hi, a_lo);
4477}
4478
4479template <>
4480EIGEN_STRONG_INLINE Packet4hf preverse<Packet4hf>(const Packet4hf& a) {
4481 return vrev64_f16(a);
4482}
4483
4484template <>
4485EIGEN_STRONG_INLINE Packet8hf pabs<Packet8hf>(const Packet8hf& a) {
4486 return vabsq_f16(a);
4487}
4488
4489template <>
4490EIGEN_STRONG_INLINE Packet4hf pabs<Packet4hf>(const Packet4hf& a) {
4491 return vabs_f16(a);
4492}
4493
4494template <>
4495EIGEN_STRONG_INLINE Eigen::half predux<Packet8hf>(const Packet8hf& a) {
4496 float16x4_t a_lo, a_hi, sum;
4497
4498 a_lo = vget_low_f16(a);
4499 a_hi = vget_high_f16(a);
4500 sum = vpadd_f16(a_lo, a_hi);
4501 sum = vpadd_f16(sum, sum);
4502 sum = vpadd_f16(sum, sum);
4503
4504 Eigen::half h;
4505 h.x = vget_lane_f16(sum, 0);
4506 return h;
4507}
4508
4509template <>
4510EIGEN_STRONG_INLINE Eigen::half predux<Packet4hf>(const Packet4hf& a) {
4511 float16x4_t sum;
4512
4513 sum = vpadd_f16(a, a);
4514 sum = vpadd_f16(sum, sum);
4515 Eigen::half h;
4516 h.x = vget_lane_f16(sum, 0);
4517 return h;
4518}
4519
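// Multiply the low and high halves, then multiply by the reversed result; the product of the
// first two lanes then equals the product of all eight elements, combined by a final scalar multiply.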
4520template <>
4521EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8hf>(const Packet8hf& a) {
4522 float16x4_t a_lo, a_hi, prod;
4523
4524 a_lo = vget_low_f16(a);
4525 a_hi = vget_high_f16(a);
4526 prod = vmul_f16(a_lo, a_hi);
4527 prod = vmul_f16(prod, vrev64_f16(prod));
4528
4529 Eigen::half h;
4530 h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
4531 return h;
4532}
4533
4534template <>
4535EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4hf>(const Packet4hf& a) {
4536 float16x4_t prod;
4537 prod = vmul_f16(a, vrev64_f16(a));
4538 Eigen::half h;
4539 h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
4540 return h;
4541}
4542
4543template <>
4544EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8hf>(const Packet8hf& a) {
4545 float16x4_t a_lo, a_hi, min;
4546
4547 a_lo = vget_low_f16(a);
4548 a_hi = vget_high_f16(a);
4549 min = vpmin_f16(a_lo, a_hi);
4550 min = vpmin_f16(min, min);
4551 min = vpmin_f16(min, min);
4552
4553 Eigen::half h;
4554 h.x = vget_lane_f16(min, 0);
4555 return h;
4556}
4557
4558template <>
4559EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4hf>(const Packet4hf& a) {
4560 Packet4hf tmp;
4561 tmp = vpmin_f16(a, a);
4562 tmp = vpmin_f16(tmp, tmp);
4563 Eigen::half h;
4564 h.x = vget_lane_f16(tmp, 0);
4565 return h;
4566}
4567
4568template <>
4569EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8hf>(const Packet8hf& a) {
4570 float16x4_t a_lo, a_hi, max;
4571
4572 a_lo = vget_low_f16(a);
4573 a_hi = vget_high_f16(a);
4574 max = vpmax_f16(a_lo, a_hi);
4575 max = vpmax_f16(max, max);
4576 max = vpmax_f16(max, max);
4577
4578 Eigen::half h;
4579 h.x = vget_lane_f16(max, 0);
4580 return h;
4581}
4582
4583template <>
4584EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4hf>(const Packet4hf& a) {
4585 Packet4hf tmp;
4586 tmp = vpmax_f16(a, a);
4587 tmp = vpmax_f16(tmp, tmp);
4588 Eigen::half h;
4589 h.x = vget_lane_f16(tmp, 0);
4590 return h;
4591}
4592
4593EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel)
4594{
4595 const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
4596 const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
4597
4598 const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0]));
4599 const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1]));
4600
4601 kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]);
4602 kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]);
4603 kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]);
4604 kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]);
4605}
4606
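// vld4_f16 de-interleaves the 16 contiguous halves of the block, which is exactly the 4x4 transpose.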
4607EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {
4608 EIGEN_ALIGN16 float16x4x4_t tmp_x4;
4609 float16_t* tmp = (float16_t*)&kernel;
4610 tmp_x4 = vld4_f16(tmp);
4611
4612 kernel.packet[0] = tmp_x4.val[0];
4613 kernel.packet[1] = tmp_x4.val[1];
4614 kernel.packet[2] = tmp_x4.val[2];
4615 kernel.packet[3] = tmp_x4.val[3];
4616}
4617
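// 8x8 transpose built from three rounds of vuzpq_f16 de-interleaving; the final assignments
// write the intermediate results back in transposed order.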
4618EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 8>& kernel) {
4619 float16x8x2_t T_1[4];
4620
4621 T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]);
4622 T_1[1] = vuzpq_f16(kernel.packet[2], kernel.packet[3]);
4623 T_1[2] = vuzpq_f16(kernel.packet[4], kernel.packet[5]);
4624 T_1[3] = vuzpq_f16(kernel.packet[6], kernel.packet[7]);
4625
4626 float16x8x2_t T_2[4];
4627 T_2[0] = vuzpq_f16(T_1[0].val[0], T_1[1].val[0]);
4628 T_2[1] = vuzpq_f16(T_1[0].val[1], T_1[1].val[1]);
4629 T_2[2] = vuzpq_f16(T_1[2].val[0], T_1[3].val[0]);
4630 T_2[3] = vuzpq_f16(T_1[2].val[1], T_1[3].val[1]);
4631
4632 float16x8x2_t T_3[4];
4633 T_3[0] = vuzpq_f16(T_2[0].val[0], T_2[2].val[0]);
4634 T_3[1] = vuzpq_f16(T_2[0].val[1], T_2[2].val[1]);
4635 T_3[2] = vuzpq_f16(T_2[1].val[0], T_2[3].val[0]);
4636 T_3[3] = vuzpq_f16(T_2[1].val[1], T_2[3].val[1]);
4637
4638 kernel.packet[0] = T_3[0].val[0];
4639 kernel.packet[1] = T_3[2].val[0];
4640 kernel.packet[2] = T_3[1].val[0];
4641 kernel.packet[3] = T_3[3].val[0];
4642 kernel.packet[4] = T_3[0].val[1];
4643 kernel.packet[5] = T_3[2].val[1];
4644 kernel.packet[6] = T_3[1].val[1];
4645 kernel.packet[7] = T_3[3].val[1];
4646}
4647#endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
4648
4649} // end namespace internal
4650
4651} // end namespace Eigen
4652
4653#endif // EIGEN_PACKET_MATH_NEON_H