10#ifndef EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H
11#define EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H
13#include "../../InternalHeaderCheck.h"
15#if defined(__MMA__) && !EIGEN_ALTIVEC_DISABLE_MMA
16#if EIGEN_COMP_LLVM || (__GNUC__ > 10 || __GNUC_MINOR__ >= 3)
20#if !EIGEN_COMP_LLVM && (__GNUC__ == 10 && __GNUC_MINOR__ <= 3)
22#define GCC_ONE_VECTORPAIR_BUG
29#ifdef EIGEN_POWER_USE_GEMV_PREFETCH
30#define EIGEN_POWER_GEMV_PREFETCH(p) prefetch(p)
32#define EIGEN_POWER_GEMV_PREFETCH(p)
36#if !__has_builtin(__builtin_vsx_assemble_pair)
37#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
39#if !__has_builtin(__builtin_vsx_disassemble_pair)
40#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
45#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \
46 __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src2, (__vector unsigned char)src1)
49#if (__GNUC_MINOR__ > 3)
50#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \
51 __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src2, (__vector unsigned char)src1)
53#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \
54 __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src1, (__vector unsigned char)src2)
57#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \
58 __builtin_vsx_build_pair(&dst, (__vector unsigned char)src1, (__vector unsigned char)src2)
62#define GEMV_IS_COMPLEX_COMPLEX ((sizeof(LhsPacket) == 16) && (sizeof(RhsPacket) == 16))
63#define GEMV_IS_FLOAT (ResPacketSize == (16 / sizeof(float)))
64#define GEMV_IS_SCALAR (sizeof(ResPacket) != 16)
65#define GEMV_IS_COMPLEX_FLOAT (ResPacketSize == (16 / sizeof(std::complex<float>)))
68template<
typename ResPacket,
typename ResScalar>
69EIGEN_ALWAYS_INLINE
void storeMaddData(ResScalar* res, ResPacket& palpha, ResPacket& data)
71 pstoreu(res, pmadd(data, palpha, ploadu<ResPacket>(res)));
74template<
typename ResScalar>
75EIGEN_ALWAYS_INLINE
void storeMaddData(ResScalar* res, ResScalar& alpha, ResScalar& data)
77 *res += (alpha * data);
80#define GEMV_UNROLL(func, N) \
81 func(0, N) func(1, N) func(2, N) func(3, N) \
82 func(4, N) func(5, N) func(6, N) func(7, N)
84#define GEMV_UNROLL_HALF(func, N) \
85 func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N)
87#define GEMV_GETN(N) (((N) * ResPacketSize) >> 2)
89#define GEMV_LOADPACKET_COL(iter) \
90 lhs.template load<LhsPacket, LhsAlignment>(i + ((iter) * LhsPacketSize), j)
93#define GEMV_UNROLL3(func, N, which) \
94 func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) \
95 func(4, N, which) func(5, N, which) func(6, N, which) func(7, N, which)
97#define GEMV_UNUSED_VAR(iter, N, which) \
98 if (GEMV_GETN(N) <= iter) { \
99 EIGEN_UNUSED_VARIABLE(which##iter); \
102#define GEMV_UNUSED_EXTRA_VAR(iter, N, which) \
104 EIGEN_UNUSED_VARIABLE(which##iter); \
107#define GEMV_UNUSED_EXTRA(N, which) \
108 GEMV_UNROLL3(GEMV_UNUSED_EXTRA_VAR, N, which)
110#define GEMV_UNUSED(N, which) \
111 GEMV_UNROLL3(GEMV_UNUSED_VAR, N, which)
113#define GEMV_INIT_MMA(iter, N) \
114 if (GEMV_GETN(N) > iter) { \
115 __builtin_mma_xxsetaccz(&e##iter); \
119#define GEMV_LOADPAIR_COL_MMA(iter1, iter2) \
120 GEMV_BUILDPAIR_MMA(b##iter1, GEMV_LOADPACKET_COL(iter2), GEMV_LOADPACKET_COL((iter2) + 1));
122#define GEMV_LOADPAIR_COL_MMA(iter1, iter2) \
123 const LhsScalar& src##iter1 = lhs(i + ((iter1 * 32) / sizeof(LhsScalar)), j); \
124 b##iter1 = *reinterpret_cast<__vector_pair *>(const_cast<LhsScalar *>(&src##iter1));
127#define GEMV_LOAD1A_COL_MMA(iter, N) \
128 if (GEMV_GETN(N) > iter) { \
129 if (GEMV_IS_FLOAT) { \
130 g##iter = GEMV_LOADPACKET_COL(iter); \
131 EIGEN_UNUSED_VARIABLE(b##iter); \
133 GEMV_LOADPAIR_COL_MMA(iter, iter << 1) \
134 EIGEN_UNUSED_VARIABLE(g##iter); \
137 EIGEN_UNUSED_VARIABLE(b##iter); \
138 EIGEN_UNUSED_VARIABLE(g##iter); \
141#define GEMV_WORK1A_COL_MMA(iter, N) \
142 if (GEMV_GETN(N) > iter) { \
143 if (GEMV_IS_FLOAT) { \
144 pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter, a0, g##iter); \
146 pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter, b##iter, a0); \
150#define GEMV_LOAD1B_COL_MMA(iter1, iter2, iter3, N) \
151 if (GEMV_GETN(N) > iter1) { \
152 if (GEMV_IS_FLOAT) { \
153 GEMV_LOADPAIR_COL_MMA(iter2, iter2) \
154 EIGEN_UNUSED_VARIABLE(b##iter3); \
156 GEMV_LOADPAIR_COL_MMA(iter2, iter2 << 1) \
157 GEMV_LOADPAIR_COL_MMA(iter3, iter3 << 1) \
160 EIGEN_UNUSED_VARIABLE(b##iter2); \
161 EIGEN_UNUSED_VARIABLE(b##iter3); \
163 EIGEN_UNUSED_VARIABLE(g##iter2); \
164 EIGEN_UNUSED_VARIABLE(g##iter3);
166#define GEMV_WORK1B_COL_MMA(iter1, iter2, iter3, N) \
167 if (GEMV_GETN(N) > iter1) { \
168 if (GEMV_IS_FLOAT) { \
170 __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(h), &b##iter2); \
171 pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter2, a0, h[0]); \
172 pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter3, a0, h[1]); \
174 pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter2, b##iter2, a0); \
175 pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter3, b##iter3, a0); \
180#define GEMV_LOAD_COL_MMA(N) \
181 if (GEMV_GETN(N) > 1) { \
182 GEMV_UNROLL_HALF(GEMV_LOAD1B_COL_MMA, (N >> 1)) \
184 GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N) \
187#define GEMV_WORK_COL_MMA(N) \
188 if (GEMV_GETN(N) > 1) { \
189 GEMV_UNROLL_HALF(GEMV_WORK1B_COL_MMA, (N >> 1)) \
191 GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N) \
194#define GEMV_LOAD_COL_MMA(N) \
195 GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N)
197#define GEMV_WORK_COL_MMA(N) \
198 GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N)
201#define GEMV_DISASSEMBLE_MMA(iter, N) \
202 if (GEMV_GETN(N) > iter) { \
203 __builtin_mma_disassemble_acc(&result##iter.packet, &e##iter); \
204 if (!GEMV_IS_FLOAT) { \
205 result##iter.packet[0][1] = result##iter.packet[1][0]; \
206 result##iter.packet[2][1] = result##iter.packet[3][0]; \
210#define GEMV_LOADPAIR2_COL_MMA(iter1, iter2) \
211 b##iter1 = *reinterpret_cast<__vector_pair *>(res + i + ((iter2) * ResPacketSize));
213#define GEMV_LOAD2_COL_MMA(iter1, iter2, iter3, N) \
214 if (GEMV_GETN(N) > iter1) { \
215 if (GEMV_IS_FLOAT) { \
216 GEMV_LOADPAIR2_COL_MMA(iter2, iter2); \
217 EIGEN_UNUSED_VARIABLE(b##iter3); \
219 GEMV_LOADPAIR2_COL_MMA(iter2, iter2 << 1); \
220 GEMV_LOADPAIR2_COL_MMA(iter3, iter3 << 1); \
223 EIGEN_UNUSED_VARIABLE(b##iter2); \
224 EIGEN_UNUSED_VARIABLE(b##iter3); \
228#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \
229 ResPacket f##iter2[2]; \
230 __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(f##iter2), &b##iter2); \
231 f##iter2[0] = pmadd(result##iter2.packet[0], palpha, f##iter2[0]); \
232 f##iter2[1] = pmadd(result##iter3.packet[(iter2 == iter3) ? 2 : 0], palpha, f##iter2[1]); \
233 GEMV_BUILDPAIR_MMA(b##iter2, f##iter2[0], f##iter2[1]);
235#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \
236 if (GEMV_IS_FLOAT) { \
237 __asm__ ("xvmaddasp %0,%x1,%x3\n\txvmaddasp %L0,%x2,%x3" : "+&d" (b##iter2) : "wa" (result##iter3.packet[0]), "wa" (result##iter2.packet[0]), "wa" (palpha)); \
239 __asm__ ("xvmaddadp %0,%x1,%x3\n\txvmaddadp %L0,%x2,%x3" : "+&d" (b##iter2) : "wa" (result##iter2.packet[2]), "wa" (result##iter2.packet[0]), "wa" (palpha)); \
243#define GEMV_WORK2_COL_MMA(iter1, iter2, iter3, N) \
244 if (GEMV_GETN(N) > iter1) { \
245 if (GEMV_IS_FLOAT) { \
246 GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter2); \
248 GEMV_WORKPAIR2_COL_MMA(iter2, iter2, iter2 << 1); \
249 GEMV_WORKPAIR2_COL_MMA(iter3, iter3, iter3 << 1); \
253#define GEMV_STOREPAIR2_COL_MMA(iter1, iter2) \
254 *reinterpret_cast<__vector_pair *>(res + i + ((iter2) * ResPacketSize)) = b##iter1;
256#define GEMV_STORE_COL_MMA(iter, N) \
257 if (GEMV_GETN(N) > iter) { \
258 if (GEMV_IS_FLOAT) { \
259 storeMaddData<ResPacket, ResScalar>(res + i + (iter * ResPacketSize), palpha, result##iter.packet[0]); \
261 GEMV_LOADPAIR2_COL_MMA(iter, iter << 1) \
262 GEMV_WORKPAIR2_COL_MMA(iter, iter, iter << 1) \
263 GEMV_STOREPAIR2_COL_MMA(iter, iter << 1) \
267#define GEMV_STORE2_COL_MMA(iter1, iter2, iter3, N) \
268 if (GEMV_GETN(N) > iter1) { \
269 if (GEMV_IS_FLOAT) { \
270 GEMV_STOREPAIR2_COL_MMA(iter2, iter2); \
272 GEMV_STOREPAIR2_COL_MMA(iter2, iter2 << 1) \
273 GEMV_STOREPAIR2_COL_MMA(iter3, iter3 << 1) \
277#define GEMV_PROCESS_COL_ONE_MMA(N) \
278 GEMV_UNROLL(GEMV_INIT_MMA, N) \
280 __vector_pair b0, b1, b2, b3, b4, b5, b6, b7; \
282 LhsPacket g0, g1, g2, g3, g4, g5, g6, g7; \
283 RhsPacket a0 = pset1<RhsPacket>(rhs2(j, 0)); \
284 GEMV_UNROLL(GEMV_PREFETCH, N) \
285 GEMV_LOAD_COL_MMA(N) \
286 GEMV_WORK_COL_MMA(N) \
287 } while (++j < jend); \
288 GEMV_UNROLL(GEMV_DISASSEMBLE_MMA, N) \
289 if (GEMV_GETN(N) <= 1) { \
290 GEMV_UNROLL(GEMV_STORE_COL_MMA, N) \
292 GEMV_UNROLL_HALF(GEMV_LOAD2_COL_MMA, (N >> 1)) \
293 GEMV_UNROLL_HALF(GEMV_WORK2_COL_MMA, (N >> 1)) \
294 GEMV_UNROLL_HALF(GEMV_STORE2_COL_MMA, (N >> 1)) \
296 i += (ResPacketSize * N);
299#define GEMV_INIT(iter, N) \
301 c##iter = pset1<ResPacket>(ResScalar(0)); \
303 EIGEN_UNUSED_VARIABLE(c##iter); \
306#ifdef EIGEN_POWER_USE_GEMV_PREFETCH
307#define GEMV_PREFETCH(iter, N) \
308 if (GEMV_GETN(N) > ((iter >> 1) + ((N >> 1) * (iter & 1)))) { \
309 lhs.prefetch(i + (iter * LhsPacketSize) + prefetch_dist, j); \
312#define GEMV_PREFETCH(iter, N)
315#define GEMV_WORK_COL(iter, N) \
317 c##iter = pcj.pmadd(GEMV_LOADPACKET_COL(iter), a0, c##iter); \
320#define GEMV_STORE_COL(iter, N) \
322 pstoreu(res + i + (iter * ResPacketSize), pmadd(c##iter, palpha, ploadu<ResPacket>(res + i + (iter * ResPacketSize)))); \
326#define GEMV_PROCESS_COL_ONE(N) \
327 GEMV_UNROLL(GEMV_INIT, N) \
330 RhsPacket a0 = pset1<RhsPacket>(rhs2(j, 0)); \
331 GEMV_UNROLL(GEMV_PREFETCH, N) \
332 GEMV_UNROLL(GEMV_WORK_COL, N) \
333 } while (++j < jend); \
334 GEMV_UNROLL(GEMV_STORE_COL, N) \
335 i += (ResPacketSize * N);
338#define GEMV_PROCESS_COL(N) \
339 GEMV_PROCESS_COL_ONE_MMA(N)
341#define GEMV_PROCESS_COL(N) \
342 GEMV_PROCESS_COL_ONE(N)
347template<
typename LhsPacket,
typename RhsPacket,
bool accumulate>
348EIGEN_ALWAYS_INLINE
void pger_vecMMA_acc(__vector_quad* acc,
const RhsPacket& a,
const LhsPacket& b)
352 __builtin_mma_xvf32gerpp(acc, (__vector
unsigned char)a, (__vector
unsigned char)b);
356 __builtin_mma_xvf32ger(acc, (__vector
unsigned char)a, (__vector
unsigned char)b);
361template<
typename LhsPacket,
typename RhsPacket,
bool accumulate>
362EIGEN_ALWAYS_INLINE
void pger_vecMMA_acc(__vector_quad* acc, __vector_pair& a,
const LhsPacket& b)
366 __builtin_mma_xvf64gerpp(acc, a, (__vector
unsigned char)b);
370 __builtin_mma_xvf64ger(acc, a, (__vector
unsigned char)b);
375template<
typename LhsScalar,
typename LhsMapper,
typename RhsScalar,
typename RhsMapper,
typename ResScalar>
376EIGEN_STRONG_INLINE
void gemv_col(
377 Index rows, Index cols,
378 const LhsMapper& alhs,
379 const RhsMapper& rhs,
380 ResScalar* res, Index resIncr,
383 typedef gemv_traits<LhsScalar, RhsScalar> Traits;
385 typedef typename Traits::LhsPacket LhsPacket;
386 typedef typename Traits::RhsPacket RhsPacket;
387 typedef typename Traits::ResPacket ResPacket;
389 EIGEN_UNUSED_VARIABLE(resIncr);
390 eigen_internal_assert(resIncr == 1);
397 conj_helper<LhsScalar, RhsScalar, false, false> cj;
398 conj_helper<LhsPacket, RhsPacket, false, false> pcj;
400 const Index lhsStride = lhs.stride();
404 ResPacketSize = Traits::ResPacketSize,
405 LhsPacketSize = Traits::LhsPacketSize,
406 RhsPacketSize = Traits::RhsPacketSize,
409#ifndef GCC_ONE_VECTORPAIR_BUG
410 const Index n8 = rows - 8 * ResPacketSize + 1;
411 const Index n4 = rows - 4 * ResPacketSize + 1;
412 const Index n2 = rows - 2 * ResPacketSize + 1;
414 const Index n1 = rows - 1 * ResPacketSize + 1;
415#ifdef EIGEN_POWER_USE_GEMV_PREFETCH
416 const Index prefetch_dist = 64 * LhsPacketSize;
420 const Index block_cols = cols < 128 ? cols : (lhsStride *
sizeof(LhsScalar) < 16000 ? 16 : 8);
421 ResPacket palpha = pset1<ResPacket>(alpha);
423 for (Index j2 = 0; j2 < cols; j2 += block_cols)
425 Index jend = numext::mini(j2 + block_cols, cols);
427 ResPacket c0, c1, c2, c3, c4, c5, c6, c7;
429 __vector_quad e0, e1, e2, e3, e4, e5, e6, e7;
430 PacketBlock<ResPacket, 4> result0, result1, result2, result3, result4, result5, result6, result7;
432 GEMV_UNUSED(8, result)
433 GEMV_UNUSED_EXTRA(1, c)
435#ifndef GCC_ONE_VECTORPAIR_BUG
453 GEMV_PROCESS_COL_ONE(1)
460 d0 += cj.pmul(lhs(i, j), rhs2(j, 0));
461 }
while (++j < jend);
462 res[i] += alpha * d0;
467const Packet16uc p16uc_COMPLEX32_XORFLIP = { 0x44,0x55,0x66,0x77, 0x00,0x11,0x22,0x33, 0xcc,0xdd,0xee,0xff, 0x88,0x99,0xaa,0xbb };
468const Packet16uc p16uc_COMPLEX64_XORFLIP = { 0x88,0x99,0xaa,0xbb, 0xcc,0xdd,0xee,0xff, 0x00,0x11,0x22,0x33, 0x44,0x55,0x66,0x77 };
471const Packet16uc p16uc_COMPLEX32_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 };
472const Packet16uc p16uc_COMPLEX64_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
473const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
474const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
475const Packet16uc p16uc_COMPLEX32_NEGATE = { 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 };
476const Packet16uc p16uc_COMPLEX64_NEGATE = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
478const Packet16uc p16uc_COMPLEX32_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 };
479const Packet16uc p16uc_COMPLEX64_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 };
480const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = { 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00 };
481const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
482const Packet16uc p16uc_COMPLEX32_NEGATE = { 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80 };
483const Packet16uc p16uc_COMPLEX64_NEGATE = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 };
487#define COMPLEX_DELTA 0
489#define COMPLEX_DELTA 2
493EIGEN_ALWAYS_INLINE Packet2cf pconj2(
const Packet2cf& a) {
494 return Packet2cf(pxor(a.v,
reinterpret_cast<Packet4f
>(p16uc_COMPLEX32_CONJ_XOR)));
497EIGEN_ALWAYS_INLINE Packet1cd pconj2(
const Packet1cd& a) {
498 return Packet1cd(pxor(a.v,
reinterpret_cast<Packet2d
>(p16uc_COMPLEX64_CONJ_XOR)));
502EIGEN_ALWAYS_INLINE Packet2cf pconjinv(
const Packet2cf& a) {
503#ifdef __POWER8_VECTOR__
504 return Packet2cf(Packet4f(vec_neg(Packet2d(a.v))));
506 return Packet2cf(pxor(a.v,
reinterpret_cast<Packet4f
>(p16uc_COMPLEX32_CONJ_XOR2)));
510EIGEN_ALWAYS_INLINE Packet1cd pconjinv(
const Packet1cd& a) {
511 return Packet1cd(pxor(a.v,
reinterpret_cast<Packet2d
>(p16uc_COMPLEX64_CONJ_XOR2)));
514#if defined(_ARCH_PWR8) && (!EIGEN_COMP_LLVM || __clang_major__ >= 12)
519EIGEN_ALWAYS_INLINE Packet2cf pcplxflipconj(Packet2cf a)
522 return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR, p16uc_COMPLEX32_XORFLIP)));
524 return pcplxflip(pconj2(a));
528EIGEN_ALWAYS_INLINE Packet1cd pcplxflipconj(Packet1cd a)
531 return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR, p16uc_COMPLEX64_XORFLIP)));
533 return pcplxflip(pconj2(a));
538EIGEN_ALWAYS_INLINE Packet2cf pcplxconjflip(Packet2cf a)
541 return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR2, p16uc_COMPLEX32_XORFLIP)));
543 return pconj2(pcplxflip(a));
547EIGEN_ALWAYS_INLINE Packet1cd pcplxconjflip(Packet1cd a)
550 return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR2, p16uc_COMPLEX64_XORFLIP)));
552 return pconj2(pcplxflip(a));
557EIGEN_ALWAYS_INLINE Packet2cf pnegate2(Packet2cf a)
559#ifdef __POWER8_VECTOR__
560 return Packet2cf(vec_neg(a.v));
562 return Packet2cf(pxor(a.v,
reinterpret_cast<Packet4f
>(p16uc_COMPLEX32_NEGATE)));
566EIGEN_ALWAYS_INLINE Packet1cd pnegate2(Packet1cd a)
568#ifdef __POWER8_VECTOR__
569 return Packet1cd(vec_neg(a.v));
571 return Packet1cd(pxor(a.v,
reinterpret_cast<Packet2d
>(p16uc_COMPLEX64_NEGATE)));
576EIGEN_ALWAYS_INLINE Packet2cf pcplxflipnegate(Packet2cf a)
579 return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_NEGATE, p16uc_COMPLEX32_XORFLIP)));
581 return pcplxflip(pnegate2(a));
585EIGEN_ALWAYS_INLINE Packet1cd pcplxflipnegate(Packet1cd a)
588 return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_NEGATE, p16uc_COMPLEX64_XORFLIP)));
590 return pcplxflip(pnegate2(a));
595EIGEN_ALWAYS_INLINE Packet2cf pcplxflip2(Packet2cf a)
597 return Packet2cf(Packet4f(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX32_XORFLIP)));
600EIGEN_ALWAYS_INLINE Packet1cd pcplxflip2(Packet1cd a)
602#ifdef EIGEN_VECTORIZE_VSX
603 return Packet1cd(__builtin_vsx_xxpermdi(a.v, a.v, 2));
605 return Packet1cd(Packet2d(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX64_XORFLIP)));
610EIGEN_ALWAYS_INLINE Packet4f pload_complex_half(std::complex<float>* src)
613#ifdef EIGEN_VECTORIZE_VSX
615 __asm__(
"lxsdx %x0,%y1" :
"=wa" (t) :
"Z" (*src));
617 *
reinterpret_cast<std::complex<float>*
>(
reinterpret_cast<float*
>(&t) + COMPLEX_DELTA) = *src;
623template<
typename RhsScalar>
624EIGEN_ALWAYS_INLINE
void pload_realimag(RhsScalar* src, Packet4f& r, Packet4f& i)
627 __asm__(
"lxvwsx %x0,%y1" :
"=wa" (r) :
"Z" (*(reinterpret_cast<float*>(src) + 0)));
628 __asm__(
"lxvwsx %x0,%y1" :
"=wa" (i) :
"Z" (*(reinterpret_cast<float*>(src) + 1)));
630 Packet4f t = pload_complex_half(src);
631 r = vec_splat(t, COMPLEX_DELTA + 0);
632 i = vec_splat(t, COMPLEX_DELTA + 1);
636template<
typename RhsScalar>
637EIGEN_ALWAYS_INLINE
void pload_realimag(RhsScalar* src, Packet2d& r, Packet2d& i)
639#ifdef EIGEN_VECTORIZE_VSX
640 __asm__(
"lxvdsx %x0,%y1" :
"=wa" (r) :
"Z" (*(reinterpret_cast<double*>(src) + 0)));
641 __asm__(
"lxvdsx %x0,%y1" :
"=wa" (i) :
"Z" (*(reinterpret_cast<double*>(src) + 1)));
643 Packet2d t = ploadu<Packet2d>(
reinterpret_cast<double*
>(src));
649#ifndef __POWER8_VECTOR__
650const Packet16uc p16uc_MERGEE = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B };
652const Packet16uc p16uc_MERGEO = { 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F };
656template<
typename RhsScalar>
657EIGEN_ALWAYS_INLINE
void pload_realimag_row(RhsScalar* src, Packet4f& r, Packet4f& i)
659 Packet4f t = ploadu<Packet4f>(
reinterpret_cast<float*
>(src));
660#ifdef __POWER8_VECTOR__
661 r = vec_mergee(t, t);
662 i = vec_mergeo(t, t);
664 r = vec_perm(t, t, p16uc_MERGEE);
665 i = vec_perm(t, t, p16uc_MERGEO);
669template<
typename RhsScalar>
670EIGEN_ALWAYS_INLINE
void pload_realimag_row(RhsScalar* src, Packet2d& r, Packet2d& i)
672 return pload_realimag(src, r, i);
676EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine(std::complex<float>* src)
678#ifdef EIGEN_VECTORIZE_VSX
680 __asm__(
"lxvdsx %x0,%y1" :
"=wa" (ret) :
"Z" (*(reinterpret_cast<double*>(src) + 0)));
683 return Packet4f(ploaddup<Packet2d>(
reinterpret_cast<double *
>(src)));
687EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine(std::complex<double>* src)
689 return ploadu<Packet1cd>(src).v;
693EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine_row(std::complex<float>* src)
695 return ploadu<Packet2cf>(src).v;
698EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine_row(std::complex<double>* src)
700 return ploadu<Packet1cd>(src).v;
704template<
typename ResPacket>
705EIGEN_ALWAYS_INLINE Packet4f pload_complex(std::complex<float>* src)
707 if (GEMV_IS_SCALAR) {
708 return pload_complex_half(src);
712 return ploadu<Packet4f>(
reinterpret_cast<float*
>(src));
716template<
typename ResPacket>
717EIGEN_ALWAYS_INLINE Packet2d pload_complex(std::complex<double>* src)
719 return ploadu<Packet2d>(
reinterpret_cast<double*
>(src));
723template<
typename ResPacket>
724EIGEN_ALWAYS_INLINE Packet4f pload_complex(Packet2cf* src)
729template<
typename ResPacket>
730EIGEN_ALWAYS_INLINE Packet2d pload_complex(Packet1cd* src)
736EIGEN_ALWAYS_INLINE Packet4f pload_complex_full(std::complex<float>* src)
738 return Packet4f(ploaddup<Packet2d>(
reinterpret_cast<double *
>(src)));
741EIGEN_ALWAYS_INLINE Packet2d pload_complex_full(std::complex<double>* src)
743 return ploadu<Packet1cd>(src).v;
747EIGEN_ALWAYS_INLINE Packet4f pload_complex_full_row(std::complex<float>* src)
749 return ploadu<Packet2cf>(src).v;
752EIGEN_ALWAYS_INLINE Packet2d pload_complex_full_row(std::complex<double>* src)
754 return pload_complex_full(src);
758EIGEN_ALWAYS_INLINE Packet4f pload_real(
float* src)
760 return pset1<Packet4f>(*src);
763EIGEN_ALWAYS_INLINE Packet2d pload_real(
double* src)
765 return pset1<Packet2d>(*src);
768EIGEN_ALWAYS_INLINE Packet4f pload_real(Packet4f& src)
773EIGEN_ALWAYS_INLINE Packet2d pload_real(Packet2d& src)
779EIGEN_ALWAYS_INLINE Packet4f pload_real_full(
float* src)
781 Packet4f ret = ploadu<Packet4f>(src);
782 return vec_mergeh(ret, ret);
785EIGEN_ALWAYS_INLINE Packet2d pload_real_full(
double* src)
787 return pload_real(src);
790EIGEN_ALWAYS_INLINE Packet4f pload_real_full(std::complex<float>* src)
792 return pload_complex_full(src);
795EIGEN_ALWAYS_INLINE Packet2d pload_real_full(std::complex<double>* src)
797 return pload_complex_full(src);
801template<
typename ResPacket>
802EIGEN_ALWAYS_INLINE Packet4f pload_real_row(
float* src)
804 if (GEMV_IS_SCALAR) {
805 return pload_real_full(src);
808 return ploadu<Packet4f>(src);
812template<
typename ResPacket>
813EIGEN_ALWAYS_INLINE Packet2d pload_real_row(
double* src)
815 return pload_real(src);
818EIGEN_ALWAYS_INLINE Packet2cf padd(Packet2cf& a, std::complex<float>& b)
820 EIGEN_UNUSED_VARIABLE(b);
824EIGEN_ALWAYS_INLINE Packet1cd padd(Packet1cd& a, std::complex<double>& b)
826 EIGEN_UNUSED_VARIABLE(b);
831template<
typename Scalar,
typename ResScalar>
832EIGEN_ALWAYS_INLINE Scalar pset1_realimag(ResScalar& alpha,
int which,
int conj)
834 return (which) ? ((conj) ? -alpha.real() : alpha.real()) : ((conj) ? -alpha.imag() : alpha.imag());
838template<
typename Scalar,
typename ResScalar,
typename ResPacket,
int which>
839EIGEN_ALWAYS_INLINE Packet2cf pset1_complex(std::complex<float>& alpha)
842 ret.v[COMPLEX_DELTA + 0] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x01), (which & 0x04));
843 ret.v[COMPLEX_DELTA + 1] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x02), (which & 0x08));
844 ret.v[2 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 0];
845 ret.v[3 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 1];
849template<
typename Scalar,
typename ResScalar,
typename ResPacket,
int which>
850EIGEN_ALWAYS_INLINE Packet1cd pset1_complex(std::complex<double>& alpha)
853 ret.v[0] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x01), (which & 0x04));
854 ret.v[1] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x02), (which & 0x08));
859template<
typename Packet>
860EIGEN_ALWAYS_INLINE Packet pset_zero()
862 return pset1<Packet>(__UNPACK_TYPE__(Packet)(0));
866EIGEN_ALWAYS_INLINE Packet2cf pset_zero<Packet2cf>()
868 return Packet2cf(pset1<Packet4f>(
float(0)));
872EIGEN_ALWAYS_INLINE Packet1cd pset_zero<Packet1cd>()
874 return Packet1cd(pset1<Packet2d>(
double(0)));
878template<
typename Packet,
typename LhsPacket,
typename RhsPacket>
879EIGEN_ALWAYS_INLINE Packet pset_init(Packet& c1)
881 if (GEMV_IS_COMPLEX_COMPLEX) {
882 EIGEN_UNUSED_VARIABLE(c1);
883 return pset_zero<Packet>();
891template<
typename PResPacket,
typename ResPacket,
typename ResScalar,
typename Scalar>
895 separate.r = pset1_complex<Scalar, ResScalar, ResPacket, 0x3>(alpha);
896 separate.i = pset1_complex<Scalar, ResScalar, ResPacket, 0x0>(alpha);
905template<
typename ScalarPacket,
typename AlphaData>
906EIGEN_ALWAYS_INLINE ScalarPacket pmadd_complex(ScalarPacket& c0, ScalarPacket& c2, ScalarPacket& c4, AlphaData& b0)
908 return pmadd(c2, b0.separate.i.v, pmadd(c0, b0.separate.r.v, c4));
912template<
typename Scalar,
typename ScalarPacket,
typename PResPacket,
typename ResPacket,
typename ResScalar,
typename AlphaData>
913EIGEN_ALWAYS_INLINE
void pstoreu_pmadd_complex(PResPacket& c0, AlphaData& b0, ResScalar* res)
915 PResPacket c2 = pcplxflipconj(c0);
916 if (GEMV_IS_SCALAR) {
917 ScalarPacket c4 = ploadu<ScalarPacket>(
reinterpret_cast<Scalar*
>(res));
918 ScalarPacket c3 = pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0);
919 pstoreu(
reinterpret_cast<Scalar*
>(res), c3);
921 ScalarPacket c4 = pload_complex<ResPacket>(res);
922 PResPacket c3 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
927template<
typename ScalarPacket,
typename PResPacket,
typename ResPacket,
typename ResScalar,
typename AlphaData, Index ResPacketSize, Index iter2>
928EIGEN_ALWAYS_INLINE
void pstoreu_pmadd_complex(PResPacket& c0, PResPacket& c1, AlphaData& b0, ResScalar* res)
930 PResPacket c2 = pcplxflipconj(c0);
931 PResPacket c3 = pcplxflipconj(c1);
932#if !defined(_ARCH_PWR10)
933 ScalarPacket c4 = pload_complex<ResPacket>(res + (iter2 * ResPacketSize));
934 ScalarPacket c5 = pload_complex<ResPacket>(res + ((iter2 + 1) * ResPacketSize));
935 PResPacket c6 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
936 PResPacket c7 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c1.v, c3.v, c5, b0));
937 pstoreu(res + (iter2 * ResPacketSize), c6);
938 pstoreu(res + ((iter2 + 1) * ResPacketSize), c7);
940 __vector_pair a = *
reinterpret_cast<__vector_pair *
>(res + (iter2 * ResPacketSize));
943 __builtin_vsx_disassemble_pair(
reinterpret_cast<void*
>(c6), &a);
944 c6[0] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c6[0].v, b0));
945 c6[1] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c1.v, c3.v, c6[1].v, b0));
946 GEMV_BUILDPAIR_MMA(a, c6[0].v, c6[1].v);
948 if (GEMV_IS_COMPLEX_FLOAT) {
949 __asm__ (
"xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" :
"+&d" (a) :
"wa" (b0.separate.r.v),
"wa" (c0.v),
"wa" (c1.v));
950 __asm__ (
"xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" :
"+&d" (a) :
"wa" (b0.separate.i.v),
"wa" (c2.v),
"wa" (c3.v));
952 __asm__ (
"xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" :
"+&d" (a) :
"wa" (b0.separate.r.v),
"wa" (c0.v),
"wa" (c1.v));
953 __asm__ (
"xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" :
"+&d" (a) :
"wa" (b0.separate.i.v),
"wa" (c2.v),
"wa" (c3.v));
956 *
reinterpret_cast<__vector_pair *
>(res + (iter2 * ResPacketSize)) = a;
961template<
typename Scalar,
typename LhsScalar,
typename LhsMapper,
typename LhsPacket>
962EIGEN_ALWAYS_INLINE LhsPacket loadLhsPacket(LhsMapper& lhs, Index i, Index j)
964 if (
sizeof(Scalar) ==
sizeof(LhsScalar)) {
965 const LhsScalar& src = lhs(i + 0, j);
966 return LhsPacket(pload_real_full(
const_cast<LhsScalar*
>(&src)));
968 return lhs.template load<LhsPacket, Unaligned>(i + 0, j);
972template<
typename ComplexPacket,
typename RealPacket,
bool ConjugateLhs,
bool ConjugateRhs,
bool Negate>
973EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_complex(RealPacket& a, RealPacket& b, RealPacket& c)
975 if (ConjugateLhs && ConjugateRhs) {
976 return vec_madd(a, pconj2(ComplexPacket(b)).v, c);
978 else if (Negate && !ConjugateLhs && ConjugateRhs) {
979 return vec_nmsub(a, b, c);
982 return vec_madd(a, b, c);
987template<
typename ComplexPacket,
typename RealPacket,
bool Conjugate>
988EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_real(RealPacket& a, RealPacket& b, RealPacket& c)
991 return vec_madd(a, pconj2(ComplexPacket(b)).v, c);
994 return vec_madd(a, b, c);
998template<
typename LhsPacket,
typename RhsScalar,
typename RhsPacket,
typename PResPacket,
bool ConjugateLhs,
bool ConjugateRhs,
int StorageOrder>
999EIGEN_ALWAYS_INLINE
void gemv_mult_generic(LhsPacket& a0, RhsScalar* b, PResPacket& c0)
1001 conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
1003 if (StorageOrder == ColMajor) {
1004 b0 = pset1<RhsPacket>(*b);
1007 b0 = ploadu<RhsPacket>(b);
1009 c0 = pcj.pmadd(a0, b0, c0);
1013template<
typename ScalarPacket,
typename LhsPacket,
typename RhsScalar,
typename RhsPacket,
typename PResPacket,
typename ResPacket,
bool ConjugateLhs,
bool ConjugateRhs,
int StorageOrder>
1014EIGEN_ALWAYS_INLINE
void gemv_mult_complex_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0, ResPacket& c1)
1016 ScalarPacket br, bi;
1017 if (StorageOrder == ColMajor) {
1018 pload_realimag<RhsScalar>(b, br, bi);
1021 pload_realimag_row<RhsScalar>(b, br, bi);
1023 if (ConjugateLhs && !ConjugateRhs) a0 = pconj2(a0);
1024 LhsPacket a1 = pcplxflipconj(a0);
1025 ScalarPacket cr = pmadd_complex_complex<LhsPacket, ScalarPacket, ConjugateLhs, ConjugateRhs, false>(a0.v, br, c0.v);
1026 ScalarPacket ci = pmadd_complex_complex<LhsPacket, ScalarPacket, ConjugateLhs, ConjugateRhs, true>(a1.v, bi, c1.v);
1028 c0 = PResPacket(cr);
1032template<
typename ScalarPacket,
typename LhsPacket,
typename RhsScalar,
typename RhsPacket,
typename PResPacket,
typename ResPacket,
bool ConjugateLhs,
bool ConjugateRhs,
int StorageOrder>
1033EIGEN_ALWAYS_INLINE
void gemv_mult_real_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0)
1036 if (StorageOrder == ColMajor) {
1037 b0 = pload_complex_full(b);
1040 b0 = pload_complex_full_row(b);
1042 ScalarPacket cri = pmadd_complex_real<PResPacket, ScalarPacket, ConjugateRhs>(a0, b0, c0.v);
1043 c0 = PResPacket(cri);
1047template<
typename ScalarPacket,
typename LhsPacket,
typename RhsScalar,
typename RhsPacket,
typename PResPacket,
typename ResPacket,
bool ConjugateLhs,
bool ConjugateRhs,
int StorageOrder>
1048EIGEN_ALWAYS_INLINE
void gemv_mult_complex_real(LhsPacket& a0, RhsScalar* b, PResPacket& c0)
1050 ScalarPacket a1 = pload_complex<ResPacket>(&a0);
1052 if (StorageOrder == ColMajor) {
1056 b0 = pload_real_row<ResPacket>(b);
1058 ScalarPacket cri = pmadd_complex_real<PResPacket, ScalarPacket, ConjugateLhs>(a1, b0, c0.v);
1059 c0 = PResPacket(cri);
1062#define GEMV_MULT_COMPLEX_COMPLEX(LhsType, RhsType, ResType) \
1063template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
1064EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, ResType& c1) \
1066 gemv_mult_complex_complex<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0, c1); \
1069GEMV_MULT_COMPLEX_COMPLEX(Packet2cf, std::complex<float>, Packet2cf)
1070GEMV_MULT_COMPLEX_COMPLEX(Packet1cd, std::complex<double>, Packet1cd)
1072#define GEMV_MULT_REAL_COMPLEX(LhsType, RhsType, ResType) \
1073template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
1074EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, RhsType&) \
1076 gemv_mult_real_complex<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
1079GEMV_MULT_REAL_COMPLEX(
float, std::complex<float>, Packet2cf)
1080GEMV_MULT_REAL_COMPLEX(
double, std::complex<double>, Packet1cd)
1081GEMV_MULT_REAL_COMPLEX(Packet4f, std::complex<float>, Packet2cf)
1082GEMV_MULT_REAL_COMPLEX(Packet2d, std::complex<double>, Packet1cd)
1084#define GEMV_MULT_COMPLEX_REAL(LhsType, RhsType, ResType1, ResType2) \
1085template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
1086EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType1& c0, ResType2&) \
1088 gemv_mult_complex_real<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
1091GEMV_MULT_COMPLEX_REAL(Packet2cf,
float, Packet2cf, std::complex<float>)
1092GEMV_MULT_COMPLEX_REAL(Packet1cd,
double, Packet1cd, std::complex<double>)
1093GEMV_MULT_COMPLEX_REAL(std::complex<float>,
float, Packet2cf, std::complex<float>)
1094GEMV_MULT_COMPLEX_REAL(std::complex<double>,
double, Packet1cd, std::complex<double>)
1099EIGEN_ALWAYS_INLINE T convertReal(T a)
1104EIGEN_ALWAYS_INLINE Packet4f convertReal(Packet2cf a)
1109EIGEN_ALWAYS_INLINE Packet2d convertReal(Packet1cd a)
1116EIGEN_ALWAYS_INLINE T convertComplex(T a)
1121EIGEN_ALWAYS_INLINE Packet2cf convertComplex(Packet4f a)
1123 return Packet2cf(a);
1126EIGEN_ALWAYS_INLINE Packet1cd convertComplex(Packet2d a)
1128 return Packet1cd(a);
1132template<
typename ScalarPacket,
typename LhsPacket,
typename SLhsPacket,
typename ResPacket>
1133EIGEN_ALWAYS_INLINE
void pload_complex_MMA(SLhsPacket& a)
1135 a = SLhsPacket(pload_complex<ResPacket>(&a));
1138template<
typename ScalarPacket,
typename LhsPacket,
typename SLhsPacket,
typename ResPacket>
1139EIGEN_ALWAYS_INLINE
void pload_complex_MMA(__vector_pair&)
1145template<
typename LhsPacket,
typename RhsPacket,
bool NegativeAccumulate>
1146EIGEN_ALWAYS_INLINE
void pger_vecMMA(__vector_quad* acc, RhsPacket& a, LhsPacket& b)
1148 if (NegativeAccumulate)
1150 __builtin_mma_xvf32gernp(acc, (__vector
unsigned char)a, (__vector
unsigned char)b);
1153 __builtin_mma_xvf32gerpp(acc, (__vector
unsigned char)a, (__vector
unsigned char)b);
1158template<
typename LhsPacket,
typename RhsPacket,
bool NegativeAccumulate>
1159EIGEN_ALWAYS_INLINE
void pger_vecMMA(__vector_quad* acc, __vector_pair& a, Packet2d& b)
1161 if (NegativeAccumulate)
1163 __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector
unsigned char)b);
1166 __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector
unsigned char)b);
1170template<
typename LhsPacket,
typename RhsPacket,
bool NegativeAccumulate>
1171EIGEN_ALWAYS_INLINE
void pger_vecMMA(__vector_quad*, __vector_pair&, Packet4f&)
1177template<
typename RealPacket,
typename LhsPacket,
bool ConjugateLhs,
bool ConjugateRhs,
bool Negate>
1178EIGEN_ALWAYS_INLINE
void pmadd_complex_complex_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c)
1180 if (ConjugateLhs && ConjugateRhs) {
1181 RealPacket b2 = pconj2(convertComplex(b)).v;
1182 return pger_vecMMA<RealPacket, RealPacket, false>(c, b2, a.v);
1184 else if (Negate && !ConjugateLhs && ConjugateRhs) {
1185 return pger_vecMMA<RealPacket, RealPacket, true>(c, b, a.v);
1188 return pger_vecMMA<RealPacket, RealPacket, false>(c, b, a.v);
1192template<
typename RealPacket,
typename LhsPacket,
bool ConjugateLhs,
bool ConjugateRhs,
bool Negate>
1193EIGEN_ALWAYS_INLINE
void pmadd_complex_complex_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c)
1195 if (ConjugateLhs && ConjugateRhs) {
1196 RealPacket b2 = pconj2(convertComplex(b)).v;
1197 return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b2);
1199 else if (Negate && !ConjugateLhs && ConjugateRhs) {
1200 return pger_vecMMA<RealPacket, __vector_pair, true>(c, a, b);
1203 return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b);
1208template<
typename RealPacket,
typename LhsPacket,
bool Conjugate,
int StorageOrder>
1209EIGEN_ALWAYS_INLINE
void pmadd_complex_real_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c)
1211 RealPacket a2 = convertReal(a);
1213 RealPacket b2 = pconj2(convertComplex(b)).v;
1214 if (StorageOrder == ColMajor) {
1215 return pger_vecMMA<RealPacket, RealPacket, false>(c, b2, a2);
1217 return pger_vecMMA<RealPacket, RealPacket, false>(c, a2, b2);
1221 if (StorageOrder == ColMajor) {
1222 return pger_vecMMA<RealPacket, RealPacket, false>(c, b, a2);
1224 return pger_vecMMA<RealPacket, RealPacket, false>(c, a2, b);
1230template<
typename RealPacket,
typename LhsPacket,
bool Conjugate,
int StorageOrder>
1231EIGEN_ALWAYS_INLINE
void pmadd_complex_real_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c)
1234 RealPacket b2 = pconj2(convertComplex(b)).v;
1235 return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b2);
1238 return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b);
1243template<
typename ScalarPacket,
typename LhsPacket,
typename SLhsPacket,
typename RhsScalar,
typename ResPacket,
bool ConjugateLhs,
bool ConjugateRhs,
int StorageOrder>
1244EIGEN_ALWAYS_INLINE
void gemv_mult_complex_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0)
1247 if (StorageOrder == ColMajor) {
1248 b0 = pload_realimag_combine(b);
1250 b0 = pload_realimag_combine_row(b);
1252 pmadd_complex_complex_MMA<ScalarPacket, LhsPacket, ConjugateLhs, ConjugateRhs, false>(a0, b0, c0);
1256template<
typename ScalarPacket,
typename LhsPacket,
typename SLhsPacket,
typename RhsScalar,
typename ResPacket,
bool ConjugateLhs,
bool ConjugateRhs,
int StorageOrder>
1257EIGEN_ALWAYS_INLINE
void gemv_mult_complex_real_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0)
1259 pload_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, ResPacket>(a0);
1261 if (StorageOrder == ColMajor) {
1265 b0 = pload_real_row<ResPacket>(b);
1267 pmadd_complex_real_MMA<ScalarPacket, LhsPacket, ConjugateLhs, ColMajor>(a0, b0, c0);
1271template<
typename ScalarPacket,
typename LhsPacket,
typename SLhsPacket,
typename RhsScalar,
typename ResPacket,
bool ConjugateLhs,
bool ConjugateRhs,
int StorageOrder>
1272EIGEN_ALWAYS_INLINE
void gemv_mult_real_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0)
1275 if (StorageOrder == ColMajor) {
1276 b0 = pload_complex_full(b);
1279 b0 = pload_complex_full_row(b);
1281 pmadd_complex_real_MMA<ScalarPacket, LhsPacket, ConjugateRhs, (
sizeof(RhsScalar) ==
sizeof(std::complex<float>)) ? StorageOrder :
ColMajor>(a0, b0, c0);
1284#define GEMV_MULT_COMPLEX_COMPLEX_MMA(LhsType, RhsType) \
1285template<typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
1286EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \
1288 gemv_mult_complex_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
1291GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet2cf, std::complex<float>)
1292GEMV_MULT_COMPLEX_COMPLEX_MMA(__vector_pair, std::complex<float>)
1293GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet1cd, std::complex<double>)
1296template<
typename ScalarPacket,
typename LhsScalar,
typename LhsPacket,
typename SLhsPacket,
typename RhsScalar,
typename RhsPacket,
typename ResPacket,
bool ConjugateLhs,
bool ConjugateRhs,
int StorageOrder>
1297EIGEN_ALWAYS_INLINE
void gemv_mult_complex_MMA(__vector_pair& a0, std::complex<double>* b, __vector_quad* c0)
1299 if (
sizeof(LhsScalar) == 16) {
1300 gemv_mult_complex_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0);
1303 gemv_mult_real_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0);
1307#define GEMV_MULT_REAL_COMPLEX_MMA(LhsType, RhsType) \
1308template<typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
1309EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \
1311 gemv_mult_real_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
1314GEMV_MULT_REAL_COMPLEX_MMA(Packet4f, std::complex<float>)
1315GEMV_MULT_REAL_COMPLEX_MMA(Packet2d, std::complex<double>)
1317#define GEMV_MULT_COMPLEX_REAL_MMA(LhsType, RhsType) \
1318template<typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
1319EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \
1321 gemv_mult_complex_real_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
1324GEMV_MULT_COMPLEX_REAL_MMA(Packet2cf,
float)
1325GEMV_MULT_COMPLEX_REAL_MMA(Packet1cd,
double)
1326GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair,
float)
1327GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair,
double)
1330template <
typename Scalar,
typename ScalarPacket,
typename LhsPacket,
typename RhsPacket,
bool ConjugateLhs,
bool ConjugateRhs>
1331EIGEN_ALWAYS_INLINE
void disassembleResults2(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0)
1333 __builtin_mma_disassemble_acc(&result0.packet, c0);
1334 if (
sizeof(LhsPacket) == 16) {
1335 if (
sizeof(RhsPacket) == 16) {
1336 ScalarPacket tmp0, tmp2;
1337 tmp2 = vec_mergeh(result0.packet[2], result0.packet[3]);
1338 tmp0 = vec_mergeh(result0.packet[0], result0.packet[1]);
1339 result0.packet[3] = vec_mergel(result0.packet[3], result0.packet[2]);
1340 result0.packet[1] = vec_mergel(result0.packet[1], result0.packet[0]);
1341 result0.packet[2] = tmp2;
1342 result0.packet[0] = tmp0;
1345 result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
1346 result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v;
1347 }
else if (ConjugateRhs) {
1348 result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v;
1349 result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v;
1351 result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v;
1352 result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v;
1354 result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
1355 result0.packet[2] = vec_add(result0.packet[2], result0.packet[3]);
1357 result0.packet[0][1] = result0.packet[1][1];
1358 result0.packet[2][1] = result0.packet[3][1];
1363template <
typename Scalar,
typename ScalarPacket,
typename LhsPacket,
typename RhsPacket,
bool ConjugateLhs,
bool ConjugateRhs>
1364EIGEN_ALWAYS_INLINE
void disassembleResults4(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0)
1366 __builtin_mma_disassemble_acc(&result0.packet, c0);
1367 if (GEMV_IS_COMPLEX_COMPLEX) {
1369 result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
1370 result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v;
1373 result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v;
1375 result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v;
1378 result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
1379 }
else if (
sizeof(LhsPacket) ==
sizeof(std::complex<float>)) {
1381 result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
1384 result0.packet[0] = vec_mergee(result0.packet[0], result0.packet[1]);
1388template <
typename Scalar,
typename ScalarPacket,
int ResPacketSize,
typename LhsPacket,
typename RhsPacket,
bool ConjugateLhs,
bool ConjugateRhs>
1389EIGEN_ALWAYS_INLINE
void disassembleResults(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0)
1391 if (!GEMV_IS_COMPLEX_FLOAT) {
1392 disassembleResults2<Scalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(c0, result0);
1394 disassembleResults4<Scalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(c0, result0);
1399#define GEMV_GETN_COMPLEX(N) (((N) * ResPacketSize) >> 1)
1401#define GEMV_LOADPACKET_COL_COMPLEX(iter) \
1402 loadLhsPacket<Scalar, LhsScalar, LhsMapper, PLhsPacket>(lhs, i + ((iter) * ResPacketSize), j)
1404#define GEMV_LOADPACKET_COL_COMPLEX_DATA(iter) \
1405 convertReal(GEMV_LOADPACKET_COL_COMPLEX(iter))
1408#define GEMV_INIT_COL_COMPLEX_MMA(iter, N) \
1409 if (GEMV_GETN_COMPLEX(N) > iter) { \
1410 __builtin_mma_xxsetaccz(&e0##iter); \
1414#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2) \
1415 GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1)); \
1416 EIGEN_UNUSED_VARIABLE(f##iter1);
1418#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2) \
1419 if (sizeof(LhsPacket) == 16) { \
1420 const LhsScalar& src = lhs(i + ((32 * iter1) / sizeof(LhsScalar)), j); \
1421 a##iter1 = *reinterpret_cast<__vector_pair *>(const_cast<LhsScalar *>(&src)); \
1422 EIGEN_UNUSED_VARIABLE(f##iter1); \
1424 f##iter1 = lhs.template load<PLhsPacket, Unaligned>(i + ((iter2) * ResPacketSize), j); \
1425 GEMV_BUILDPAIR_MMA(a##iter1, vec_splat(convertReal(f##iter1), 0), vec_splat(convertReal(f##iter1), 1)); \
1429#define GEMV_LOAD1_COL_COMPLEX_MMA(iter, N) \
1430 if (GEMV_GETN_COMPLEX(N) > iter) { \
1431 if (GEMV_IS_COMPLEX_FLOAT) { \
1432 f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \
1433 EIGEN_UNUSED_VARIABLE(a##iter); \
1435 GEMV_LOADPAIR_COL_COMPLEX_MMA(iter, iter << 1) \
1438 EIGEN_UNUSED_VARIABLE(a##iter); \
1439 EIGEN_UNUSED_VARIABLE(f##iter); \
1442#define GEMV_WORK1_COL_COMPLEX_MMA(iter, N) \
1443 if (GEMV_GETN_COMPLEX(N) > iter) { \
1444 if (GEMV_IS_COMPLEX_FLOAT) { \
1445 gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(f##iter, b, &e0##iter); \
1447 gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(a##iter, b, &e0##iter); \
1451#define GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter1, iter2) \
1452 GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1));
1454#define GEMV_LOAD2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \
1455 if (GEMV_GETN_COMPLEX(N) > iter1) { \
1456 if (GEMV_IS_COMPLEX_FLOAT) { \
1457 GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2); \
1458 EIGEN_UNUSED_VARIABLE(a##iter3) \
1460 GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2 << 1); \
1461 GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter3, iter3 << 1); \
1464 EIGEN_UNUSED_VARIABLE(a##iter2); \
1465 EIGEN_UNUSED_VARIABLE(a##iter3); \
1467 EIGEN_UNUSED_VARIABLE(f##iter2); \
1468 EIGEN_UNUSED_VARIABLE(f##iter3);
1470#define GEMV_WORK2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \
1471 if (GEMV_GETN_COMPLEX(N) > iter1) { \
1472 if (GEMV_IS_COMPLEX_FLOAT) { \
1474 __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(g), &a##iter2); \
1475 gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(g[0], b, &e0##iter2); \
1476 gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(g[1], b, &e0##iter3); \
1478 gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(a##iter2, b, &e0##iter2); \
1479 gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(a##iter3, b, &e0##iter3); \
1484#define GEMV_LOAD_COL_COMPLEX_MMA(N) \
1485 if (GEMV_GETN_COMPLEX(N) > 1) { \
1486 GEMV_UNROLL_HALF(GEMV_LOAD2_COL_COMPLEX_MMA, (N >> 1)) \
1488 GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N) \
1491#define GEMV_WORK_COL_COMPLEX_MMA(N) \
1492 if (GEMV_GETN_COMPLEX(N) > 1) { \
1493 GEMV_UNROLL_HALF(GEMV_WORK2_COL_COMPLEX_MMA, (N >> 1)) \
1495 GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N) \
1498#define GEMV_LOAD_COL_COMPLEX_MMA(N) \
1499 GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N)
1501#define GEMV_WORK_COL_COMPLEX_MMA(N) \
1502 GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N)
1505#define GEMV_DISASSEMBLE_COMPLEX_MMA(iter) \
1506 disassembleResults<Scalar, ScalarPacket, ResPacketSize, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter, result0##iter);
1508#define GEMV_STORE_COL_COMPLEX_MMA(iter, N) \
1509 if (GEMV_GETN_COMPLEX(N) > iter) { \
1510 GEMV_DISASSEMBLE_COMPLEX_MMA(iter); \
1511 c0##iter = PResPacket(result0##iter.packet[0]); \
1512 if (GEMV_IS_COMPLEX_FLOAT) { \
1513 pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>(c0##iter, alpha_data, res + i + (iter * ResPacketSize)); \
1515 pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>(c0##iter, alpha_data, res + i + ((iter << 1) * ResPacketSize)); \
1516 c0##iter = PResPacket(result0##iter.packet[2]); \
1517 pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>(c0##iter, alpha_data, res + i + (((iter << 1) + 1) * ResPacketSize)); \
1521#define GEMV_STORE2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \
1522 if (GEMV_GETN_COMPLEX(N) > iter1) { \
1523 GEMV_DISASSEMBLE_COMPLEX_MMA(iter2); \
1524 GEMV_DISASSEMBLE_COMPLEX_MMA(iter3); \
1525 c0##iter2 = PResPacket(result0##iter2.packet[0]); \
1526 if (GEMV_IS_COMPLEX_FLOAT) { \
1527 c0##iter3 = PResPacket(result0##iter3.packet[0]); \
1528 pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2>(c0##iter2, c0##iter3, alpha_data, res + i); \
1530 c0##iter3 = PResPacket(result0##iter2.packet[2]); \
1531 pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2 << 1>(c0##iter2, c0##iter3, alpha_data, res + i); \
1532 c0##iter2 = PResPacket(result0##iter3.packet[0]); \
1533 c0##iter3 = PResPacket(result0##iter3.packet[2]); \
1534 pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter3 << 1>(c0##iter2, c0##iter3, alpha_data, res + i); \
1538#define GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) \
1539 GEMV_UNROLL(GEMV_INIT_COL_COMPLEX_MMA, N) \
1542 const RhsScalar& b1 = rhs2(j, 0); \
1543 RhsScalar* b = const_cast<RhsScalar *>(&b1); \
1544 GEMV_UNROLL(GEMV_PREFETCH, N) \
1545 GEMV_LOAD_COL_COMPLEX_MMA(N) \
1546 GEMV_WORK_COL_COMPLEX_MMA(N) \
1547 } while (++j < jend); \
1548 if (GEMV_GETN(N) <= 2) { \
1549 GEMV_UNROLL(GEMV_STORE_COL_COMPLEX_MMA, N) \
1551 GEMV_UNROLL_HALF(GEMV_STORE2_COL_COMPLEX_MMA, (N >> 1)) \
1553 i += (ResPacketSize * N);
1556#define GEMV_INIT_COMPLEX(iter, N) \
1558 c0##iter = pset_zero<PResPacket>(); \
1559 c1##iter = pset_init<ResPacket, LhsPacket, RhsPacket>(c1##iter); \
1561 EIGEN_UNUSED_VARIABLE(c0##iter); \
1562 EIGEN_UNUSED_VARIABLE(c1##iter); \
1565#define GEMV_WORK_COL_COMPLEX(iter, N) \
1567 f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \
1568 gemv_mult_complex<ScalarPacket, PLhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(f##iter, b, c0##iter, c1##iter); \
1570 EIGEN_UNUSED_VARIABLE(f##iter); \
1573#define GEMV_STORE_COL_COMPLEX(iter, N) \
1575 if (GEMV_IS_COMPLEX_COMPLEX) { \
1576 c0##iter = padd(c0##iter, c1##iter); \
1578 pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>(c0##iter, alpha_data, res + i + (iter * ResPacketSize)); \
1582#define GEMV_PROCESS_COL_COMPLEX_ONE(N) \
1583 GEMV_UNROLL(GEMV_INIT_COMPLEX, N) \
1586 const RhsScalar& b1 = rhs2(j, 0); \
1587 RhsScalar* b = const_cast<RhsScalar *>(&b1); \
1588 GEMV_UNROLL(GEMV_PREFETCH, N) \
1589 GEMV_UNROLL(GEMV_WORK_COL_COMPLEX, N) \
1590 } while (++j < jend); \
1591 GEMV_UNROLL(GEMV_STORE_COL_COMPLEX, N) \
1592 i += (ResPacketSize * N);
1594#if defined(USE_GEMV_MMA) && (EIGEN_COMP_LLVM || defined(USE_SLOWER_GEMV_MMA))
1595#define USE_GEMV_COL_COMPLEX_MMA
1598#ifdef USE_GEMV_COL_COMPLEX_MMA
1599#define GEMV_PROCESS_COL_COMPLEX(N) \
1600 GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N)
1602#if defined(USE_GEMV_MMA) && (__GNUC__ > 10)
1603#define GEMV_PROCESS_COL_COMPLEX(N) \
1604 if (sizeof(Scalar) != sizeof(LhsPacket)) { \
1605 GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) \
1607 GEMV_PROCESS_COL_COMPLEX_ONE(N) \
1610#define GEMV_PROCESS_COL_COMPLEX(N) \
1611 GEMV_PROCESS_COL_COMPLEX_ONE(N)
1615template<
typename Scalar,
typename LhsScalar,
typename LhsMapper,
bool ConjugateLhs,
bool LhsIsReal,
typename RhsScalar,
typename RhsMapper,
bool ConjugateRhs,
bool RhsIsReal,
typename ResScalar>
1616EIGEN_STRONG_INLINE
void gemv_complex_col(
1617 Index rows, Index cols,
1618 const LhsMapper& alhs,
1619 const RhsMapper& rhs,
1620 ResScalar* res, Index resIncr,
1623 typedef gemv_traits<LhsScalar, RhsScalar> Traits;
1625 typedef typename Traits::LhsPacket LhsPacket;
1626 typedef typename Traits::RhsPacket RhsPacket;
1627 typedef typename Traits::ResPacket ResPacket;
1629 typedef typename packet_traits<Scalar>::type ScalarPacket;
1630 typedef typename packet_traits<LhsScalar>::type PLhsPacket;
1631 typedef typename packet_traits<ResScalar>::type PResPacket;
1632 typedef gemv_traits<ResPacket, ResPacket> PTraits;
1634 EIGEN_UNUSED_VARIABLE(resIncr);
1635 eigen_internal_assert(resIncr == 1);
1639 LhsMapper lhs(alhs);
1640 RhsMapper rhs2(rhs);
1642 conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
1644 const Index lhsStride = lhs.stride();
1648 ResPacketSize = PTraits::ResPacketSize,
1649 LhsPacketSize = PTraits::LhsPacketSize,
1650 RhsPacketSize = PTraits::RhsPacketSize,
1652#ifdef EIGEN_POWER_USE_GEMV_PREFETCH
1653 const Index prefetch_dist = 64 * LhsPacketSize;
1656#ifndef GCC_ONE_VECTORPAIR_BUG
1657 const Index n8 = rows - 8 * ResPacketSize + 1;
1658 const Index n4 = rows - 4 * ResPacketSize + 1;
1659 const Index n2 = rows - 2 * ResPacketSize + 1;
1661 const Index n1 = rows - 1 * ResPacketSize + 1;
1664 const Index block_cols = cols < 128 ? cols : (lhsStride *
sizeof(LhsScalar) < 16000 ? 16 : 8);
1667 AlphaData alpha_data(alpha);
1669 for (Index j2 = 0; j2 < cols; j2 += block_cols)
1671 Index jend = numext::mini(j2 + block_cols, cols);
1673 PResPacket c00, c01, c02, c03, c04, c05, c06, c07;
1674 ResPacket c10, c11, c12, c13, c14, c15, c16, c17;
1675 PLhsPacket f0, f1, f2, f3, f4, f5, f6, f7;
1677 __vector_quad e00, e01, e02, e03, e04, e05, e06, e07;
1678 __vector_pair a0, a1, a2, a3, a4, a5, a6, a7;
1679 PacketBlock<ScalarPacket, 4> result00, result01, result02, result03, result04, result05, result06, result07;
1681 GEMV_UNUSED(8, result0)
1684#if !defined(GCC_ONE_VECTORPAIR_BUG) && defined(USE_GEMV_COL_COMPLEX_MMA)
1685 if (GEMV_IS_COMPLEX_COMPLEX || !GEMV_IS_COMPLEX_FLOAT)
1688#ifndef GCC_ONE_VECTORPAIR_BUG
1692 GEMV_PROCESS_COL_COMPLEX(8)
1697 GEMV_PROCESS_COL_COMPLEX(4)
1701 GEMV_PROCESS_COL_COMPLEX(2)
1708 GEMV_PROCESS_COL_COMPLEX_ONE(1)
1715 d0 += cj.pmul(lhs(i, j), rhs2(j, 0));
1716 }
while (++j < jend);
1717 res[i] += alpha * d0;
1727static Packet16uc p16uc_ELEMENT_3 = { 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f, 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f };
1730template<
typename ResScalar,
typename ResPacket>
1733 PacketBlock<ResPacket, 4> result0, result1;
1734 __builtin_mma_disassemble_acc(&result0.packet, acc0);
1735 __builtin_mma_disassemble_acc(&result1.packet, acc1);
1736 result0.packet[0] = vec_mergeh(result0.packet[0], result1.packet[0]);
1737 result0.packet[1] = vec_mergeo(result0.packet[1], result1.packet[1]);
1738 result0.packet[2] = vec_mergel(result0.packet[2], result1.packet[2]);
1739 result0.packet[3] = vec_perm(result0.packet[3], result1.packet[3], p16uc_ELEMENT_3);
1740 result0.packet[0] = vec_add(vec_add(result0.packet[0], result0.packet[2]), vec_add(result0.packet[1], result0.packet[3]));
1745EIGEN_ALWAYS_INLINE
ScalarBlock<double, 2> predux_real<double, Packet2d>(__vector_quad* acc0, __vector_quad* acc1)
1747 PacketBlock<Packet2d, 4> result0, result1;
1748 __builtin_mma_disassemble_acc(&result0.packet, acc0);
1749 __builtin_mma_disassemble_acc(&result1.packet, acc1);
1750 result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result1.packet[0]), vec_mergel(result0.packet[1], result1.packet[1]));
1755template<
typename LhsPacket,
typename RhsPacket,
bool ConjugateLhs,
bool ConjugateRhs>
1759 result0.packet[0] =
reinterpret_cast<Packet4f
>(vec_mergeh(
reinterpret_cast<Packet2d
>(result0.packet[0]),
reinterpret_cast<Packet2d
>(result1.packet[0])));
1760 result0.packet[2] =
reinterpret_cast<Packet4f
>(vec_mergel(
reinterpret_cast<Packet2d
>(result0.packet[2]),
reinterpret_cast<Packet2d
>(result1.packet[2])));
1761 result0.packet[0] = vec_add(result0.packet[0], result0.packet[2]);
1762 if (GEMV_IS_COMPLEX_COMPLEX) {
1763 result0.packet[1] =
reinterpret_cast<Packet4f
>(vec_mergeh(
reinterpret_cast<Packet2d
>(result0.packet[1]),
reinterpret_cast<Packet2d
>(result1.packet[1])));
1764 result0.packet[3] =
reinterpret_cast<Packet4f
>(vec_mergel(
reinterpret_cast<Packet2d
>(result0.packet[3]),
reinterpret_cast<Packet2d
>(result1.packet[3])));
1765 result0.packet[1] = vec_add(result0.packet[1], result0.packet[3]);
1767 result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
1768 result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v;
1769 }
else if (ConjugateRhs) {
1770 result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v;
1772 result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v;
1774 result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
1776 if (ConjugateLhs && (
sizeof(LhsPacket) ==
sizeof(std::complex<float>))) {
1777 result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
1780 cc0.scalar[0].real(result0.packet[0][0]);
1781 cc0.scalar[0].imag(result0.packet[0][1]);
1782 cc0.scalar[1].real(result0.packet[0][2]);
1783 cc0.scalar[1].imag(result0.packet[0][3]);
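// Context note (added): the lane fix-ups above correspond to the standard complex
// multiplication identities, applied once per reduction rather than once per multiply-add.
// With (a + bi)(c + di) = (ac - bd) + (ad + bc)i, conjugating an operand only changes the
// signs of the cross terms:
//
//   conj(a + bi) * (c + di) = (ac + bd) + (ad - bc)i
//   (a + bi) * conj(c + di) = (ac + bd) + (bc - ad)i
//
// so the pconj2 / pcplxflip2 / pcplxflipconj / pcplxconjflip calls only have to flip signs
// and swap real/imaginary lanes of the already-accumulated packets.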
1787template<typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
1791 EIGEN_UNUSED_VARIABLE(cc0);
1796template<typename ResScalar, typename ResPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
1799 PacketBlock<ResPacket, 4> result0, result1;
1800 __builtin_mma_disassemble_acc(&result0.packet, acc0);
1801 __builtin_mma_disassemble_acc(&result1.packet, acc1);
1802 return addComplexResults<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(result0, result1);
1805template<typename ResScalar, typename ResPacket>
1808 PacketBlock<ResPacket, 4> result0;
1809 __builtin_mma_disassemble_acc(&result0.packet, acc0);
1810 result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result0.packet[2]), vec_mergel(result0.packet[1], result0.packet[3]));
1814template<typename ResScalar, typename ResPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
1818 PacketBlock<ResPacket, 4> result0;
1819 __builtin_mma_disassemble_acc(&result0.packet, acc0);
1820 if (GEMV_IS_COMPLEX_COMPLEX) {
1822 result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v;
1823 result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v;
1824 } else if (ConjugateRhs) {
1825 result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
1826 result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v;
1828 result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v;
1829 result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v;
1831 result0.packet[0] = vec_add(result0.packet[0], __builtin_vsx_xxpermdi(result0.packet[1], result0.packet[1], 2));
1832 result0.packet[2] = vec_add(result0.packet[2], __builtin_vsx_xxpermdi(result0.packet[3], result0.packet[3], 2));
1834 result0.packet[0] = __builtin_vsx_xxpermdi(result0.packet[0], result0.packet[1], 1);
1835 result0.packet[2] = __builtin_vsx_xxpermdi(result0.packet[2], result0.packet[3], 1);
1837 cc0.scalar[0].real(result0.packet[0][0]);
1838 cc0.scalar[0].imag(result0.packet[0][1]);
1839 cc0.scalar[1].real(result0.packet[2][0]);
1840 cc0.scalar[1].imag(result0.packet[2][1]);
1845template<typename ResScalar, typename ResPacket>
1849 cc0.scalar[0] = predux(a);
1850 cc0.scalar[1] = predux(b);
1854template<typename ResScalar, typename ResPacket>
1857 return predux_real<ResScalar, ResPacket>(a, b);
1860#define GEMV_UNROLL_ROW(func, N) \
1861 func(0, N) func(1, N) func(2, N) func(3, N) func(4, N) func(5, N) func(6, N) func(7, N)
1863#define GEMV_UNROLL_ROW_HALF(func, N) \
1864 func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N)
1866#define GEMV_LOADPACKET_ROW(iter) \
1867 lhs.template load<LhsPacket, Unaligned>(i + (iter), j)
1870#define GEMV_UNROLL3_ROW(func, N, which) \
1871 func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) \
1872 func(4, N, which) func(5, N, which) func(6, N, which) func(7, N, which)
1874#define GEMV_UNUSED_ROW(N, which) \
1875 GEMV_UNROLL3_ROW(GEMV_UNUSED_VAR, N, which)
1877#define GEMV_INIT_ROW(iter, N) \
1878 if (GEMV_GETN(N) > iter) { \
1879 __builtin_mma_xxsetaccz(&c##iter); \
1882#define GEMV_LOADPAIR_ROW(iter1, iter2) \
1883 GEMV_BUILDPAIR_MMA(b##iter1, GEMV_LOADPACKET_ROW(iter2), GEMV_LOADPACKET_ROW((iter2) + 1));
1885#define GEMV_WORK_ROW(iter, N) \
1886 if (GEMV_GETN(N) > iter) { \
1887 if (GEMV_IS_FLOAT) { \
1888 pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&c##iter, a0, GEMV_LOADPACKET_ROW(iter)); \
1890 __vector_pair b##iter; \
1891 GEMV_LOADPAIR_ROW(iter, iter << 1) \
1892 pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&c##iter, b##iter, a0); \
1896#define GEMV_PREDUX2(iter1, iter2, iter3, N) \
1898 if (GEMV_IS_FLOAT) { \
1899 cc##iter1 = predux_real<ResScalar, ResPacket>(&c##iter2, &c##iter3); \
1901 cc##iter1 = predux_real<ResScalar, ResPacket>(&c##iter1); \
1904 EIGEN_UNUSED_VARIABLE(cc##iter1); \
1907#define GEMV_INIT_ROW(iter, N) \
1909 c##iter = pset1<ResPacket>(ResScalar(0)); \
1911 EIGEN_UNUSED_VARIABLE(c##iter); \
1914#define GEMV_WORK_ROW(iter, N) \
1916 c##iter = pcj.pmadd(GEMV_LOADPACKET_ROW(iter), a0, c##iter); \
1919#define GEMV_PREDUX2(iter1, iter2, iter3, N) \
1921 cc##iter1 = predux_real<ResScalar, ResPacket>(c##iter2, c##iter3); \
1923 EIGEN_UNUSED_VARIABLE(cc##iter1); \
1927#define GEMV_MULT(iter1, iter2, iter3, N) \
1929 cc##iter1.scalar[0] += cj.pmul(lhs(i + iter2, j), a0); \
1930 cc##iter1.scalar[1] += cj.pmul(lhs(i + iter3, j), a0); \
1933#define GEMV_STORE_ROW(iter1, iter2, iter3, N) \
1935 storeMaddData<ResScalar>(res + ((i + iter2) * resIncr), alpha, cc##iter1.scalar[0]); \
1936 storeMaddData<ResScalar>(res + ((i + iter3) * resIncr), alpha, cc##iter1.scalar[1]); \
1940#define GEMV_PROCESS_ROW(N) \
1941 for (; i < n##N; i += N) { \
1942 GEMV_UNROLL_ROW(GEMV_INIT_ROW, N) \
1944 for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
1945 RhsPacket a0 = rhs2.template load<RhsPacket, Unaligned>(j); \
1946 GEMV_UNROLL_ROW(GEMV_WORK_ROW, N) \
1948 GEMV_UNROLL_ROW_HALF(GEMV_PREDUX2, (N >> 1)) \
1949 for (; j < cols; ++j) { \
1950 RhsScalar a0 = rhs2(j); \
1951 GEMV_UNROLL_ROW_HALF(GEMV_MULT, (N >> 1)) \
1953 GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW, (N >> 1)) \
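// Rough hand-expansion (added, illustrative only) of GEMV_PROCESS_ROW(2) for the non-MMA
// real path, with the dead unroll iterations removed, to show the loop structure these
// macros generate:
//
//   for (; i < n2; i += 2) {
//     ResPacket c0 = pset1<ResPacket>(ResScalar(0));
//     ResPacket c1 = pset1<ResPacket>(ResScalar(0));
//     Index j = 0;
//     for (; j + LhsPacketSize <= cols; j += LhsPacketSize) {
//       RhsPacket a0 = rhs2.template load<RhsPacket, Unaligned>(j);
//       c0 = pcj.pmadd(lhs.template load<LhsPacket, Unaligned>(i + 0, j), a0, c0);
//       c1 = pcj.pmadd(lhs.template load<LhsPacket, Unaligned>(i + 1, j), a0, c1);
//     }
//     ScalarBlock<ResScalar, 2> cc0 = predux_real<ResScalar, ResPacket>(c0, c1);
//     for (; j < cols; ++j) {                        // scalar column tail
//       RhsScalar a0 = rhs2(j);
//       cc0.scalar[0] += cj.pmul(lhs(i + 0, j), a0);
//       cc0.scalar[1] += cj.pmul(lhs(i + 1, j), a0);
//     }
//     storeMaddData<ResScalar>(res + ((i + 0) * resIncr), alpha, cc0.scalar[0]);
//     storeMaddData<ResScalar>(res + ((i + 1) * resIncr), alpha, cc0.scalar[1]);
//   }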
1956template<typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
1957EIGEN_STRONG_INLINE void gemv_row(
1958 Index rows, Index cols,
1959 const LhsMapper& alhs,
1960 const RhsMapper& rhs,
1961 ResScalar* res, Index resIncr,
1964 typedef gemv_traits<LhsScalar, RhsScalar> Traits;
1966 typedef typename Traits::LhsPacket LhsPacket;
1967 typedef typename Traits::RhsPacket RhsPacket;
1968 typedef typename Traits::ResPacket ResPacket;
1972 LhsMapper lhs(alhs);
1973 typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
1975 eigen_internal_assert(rhs.stride() == 1);
1976 conj_helper<LhsScalar, RhsScalar, false, false> cj;
1977 conj_helper<LhsPacket, RhsPacket, false, false> pcj;
1981#ifndef GCC_ONE_VECTORPAIR_BUG
1982 const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7);
1983 const Index n4 = rows - 3;
1984 const Index n2 = rows - 1;
1990 ResPacketSize = Traits::ResPacketSize,
1991 LhsPacketSize = Traits::LhsPacketSize,
1992 RhsPacketSize = Traits::RhsPacketSize,
1997 __vector_quad c0, c1, c2, c3, c4, c5, c6, c7;
1998 GEMV_UNUSED_ROW(8, c)
2000 ResPacket c0, c1, c2, c3, c4, c5, c6, c7;
2002#ifndef GCC_ONE_VECTORPAIR_BUG
2008 for (; i < rows; ++i)
2010 ResPacket d0 = pset1<ResPacket>(ResScalar(0));
2012 for (; j + LhsPacketSize <= cols; j += LhsPacketSize)
2014 RhsPacket b0 = rhs2.template load<RhsPacket, Unaligned>(j);
2016 d0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, d0);
2018 ResScalar dd0 = predux(d0);
2019 for (; j < cols; ++j)
2021 dd0 += cj.pmul(lhs(i, j), rhs2(j));
2023 res[i * resIncr] += alpha * dd0;
2027#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(Scalar) \
2028template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
2029struct general_matrix_vector_product<Index, Scalar, LhsMapper, ColMajor, ConjugateLhs, Scalar, RhsMapper, ConjugateRhs, Version> \
2031 typedef typename ScalarBinaryOpTraits<Scalar, Scalar>::ReturnType ResScalar; \
2033 EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
2034 Index rows, Index cols, \
2035 const LhsMapper& lhs, \
2036 const RhsMapper& rhs, \
2037 ResScalar* res, Index resIncr, \
2038 ResScalar alpha) { \
2039 gemv_col<Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
2043#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(Scalar) \
2044template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
2045struct general_matrix_vector_product<Index, Scalar, LhsMapper, RowMajor, ConjugateLhs, Scalar, RhsMapper, ConjugateRhs, Version> \
2047 typedef typename ScalarBinaryOpTraits<Scalar, Scalar>::ReturnType ResScalar; \
2049 EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
2050 Index rows, Index cols, \
2051 const LhsMapper& lhs, \
2052 const RhsMapper& rhs, \
2053 ResScalar* res, Index resIncr, \
2054 ResScalar alpha) { \
2055 gemv_row<Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
2059EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(float)
2060EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(double)
2061EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(float)
2062EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(double)
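// Usage sketch (added, not part of the original source): the specializations above are
// what route an ordinary Eigen matrix*vector product to gemv_col / gemv_row on POWER
// builds where this header is active, e.g.
//
//   #include <Eigen/Dense>
//   void demo_real_gemv() {
//     Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> A(64, 128);
//     Eigen::VectorXf x(128), y(64);
//     A.setRandom(); x.setRandom();
//     y.noalias() = A * x;   // row-major float lhs -> gemv_row<float, ...>
//   }
//
// A column-major matrix (Eigen's default) would take the gemv_col path instead.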
2064template<typename ResScalar, typename PResPacket, typename ResPacket, typename LhsPacket, typename RhsPacket>
2065EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(PResPacket& a0, PResPacket& b0, ResPacket& a1, ResPacket& b1)
2067 if (GEMV_IS_COMPLEX_COMPLEX) {
2071 return predux_complex<ResScalar, PResPacket>(a0, b0);
2074#define GEMV_LOADPACKET_ROW_COMPLEX(iter) \
2075 loadLhsPacket<Scalar, LhsScalar, LhsMapper, PLhsPacket>(lhs, i + (iter), j)
2077#define GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter) \
2078 convertReal(GEMV_LOADPACKET_ROW_COMPLEX(iter))
2080#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(which, N) \
2082 for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
2083 const RhsScalar& b1 = rhs2(j); \
2084 RhsScalar* b = const_cast<RhsScalar *>(&b1); \
2085 GEMV_UNROLL_ROW(which, N) \
2088#define GEMV_PROCESS_END_ROW_COMPLEX(N) \
2089 for (; j < cols; ++j) { \
2090 RhsScalar b0 = rhs2(j); \
2091 GEMV_UNROLL_ROW_HALF(GEMV_MULT_COMPLEX, (N >> 1)) \
2093 GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW_COMPLEX, (N >> 1))
2096#define GEMV_INIT_ROW_COMPLEX_MMA(iter, N) \
2097 if (GEMV_GETN_COMPLEX(N) > iter) { \
2098 __builtin_mma_xxsetaccz(&e0##iter); \
2101#define GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter1, iter2) \
2102 GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter2), GEMV_LOADPACKET_ROW_COMPLEX_DATA((iter2) + 1));
2104#define GEMV_WORK_ROW_COMPLEX_MMA(iter, N) \
2105 if (GEMV_GETN_COMPLEX(N) > iter) { \
2106 if (GEMV_IS_COMPLEX_FLOAT) { \
2107 PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter); \
2108 gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, &e0##iter); \
2110 __vector_pair a##iter; \
2111 GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter, iter << 1) \
2112 gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, &e0##iter); \
2116#define GEMV_PREDUX4_COMPLEX_MMA(iter1, iter2, iter3, N) \
2118 if (GEMV_IS_COMPLEX_FLOAT) { \
2119 cc##iter1 = predux_complex<ResScalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter2, &e0##iter3); \
2121 cc##iter1 = predux_complex<ResScalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter1); \
2124 EIGEN_UNUSED_VARIABLE(cc##iter1); \
2127#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \
2128 GEMV_UNROLL_ROW(GEMV_INIT_ROW_COMPLEX_MMA, N) \
2129 GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(GEMV_WORK_ROW_COMPLEX_MMA, N)
2131#define GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N) \
2132 for (; i < n##N; i += N) { \
2133 GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \
2134 GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX_MMA, (N >> 1)) \
2135 GEMV_PROCESS_END_ROW_COMPLEX(N); \
2139#define GEMV_WORK_ROW_COMPLEX(iter, N) \
2141 PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter); \
2142 gemv_mult_complex<ScalarPacket, PLhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, c0##iter, c1##iter); \
2145#define GEMV_PREDUX4_COMPLEX(iter1, iter2, iter3, N) \
2147 cc##iter1 = predux_complex<ResScalar, PResPacket, ResPacket, LhsPacket, RhsPacket>(c0##iter2, c0##iter3, c1##iter2, c1##iter3); \
2149 EIGEN_UNUSED_VARIABLE(cc##iter1); \
2152#define GEMV_MULT_COMPLEX(iter1, iter2, iter3, N) \
2154 cc##iter1.scalar[0] += cj.pmul(lhs(i + iter2, j), b0); \
2155 cc##iter1.scalar[1] += cj.pmul(lhs(i + iter3, j), b0); \
2158#define GEMV_STORE_ROW_COMPLEX(iter1, iter2, iter3, N) \
2160 storeMaddData<ResScalar>(res + ((i + iter2) * resIncr), alpha, cc##iter1.scalar[0]); \
2161 storeMaddData<ResScalar>(res + ((i + iter3) * resIncr), alpha, cc##iter1.scalar[1]); \
2164#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \
2165 GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX, N) \
2166 GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(GEMV_WORK_ROW_COMPLEX, N)
2169#define GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) \
2170 for (; i < n##N; i += N) { \
2171 GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \
2172 GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX, (N >> 1)) \
2173 GEMV_PROCESS_END_ROW_COMPLEX(N); \
2176#define GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) \
2177 if (GEMV_IS_COMPLEX_COMPLEX) { \
2178 c0##iter = padd(c0##iter, c1##iter); \
2180 dd0 = predux(c0##iter);
2183#define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) \
2184 GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N)
2186#define GEMV_PROCESS_ROW_COMPLEX_ONE(N) \
2187 GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N)
2189#define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) \
2190 GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter)
2195#define GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter) \
2196 lhs.template load<LhsPacket, LhsAlignment>(i + (iter), j)
2198#define GEMV_INIT_COMPLEX_OLD(iter, N) \
2199 EIGEN_UNUSED_VARIABLE(c0##iter); \
2201 c1##iter = pset_zero<ResPacket>(); \
2203 EIGEN_UNUSED_VARIABLE(c1##iter); \
2206#define GEMV_WORK_ROW_COMPLEX_OLD(iter, N) \
2208 LhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter); \
2209 c1##iter = pcj.pmadd(a##iter, b0, c1##iter); \
2212#define GEMV_PREDUX4_COMPLEX_OLD(iter1, iter2, iter3, N) \
2214 cc##iter1.scalar[0] = predux(c1##iter2); \
2215 cc##iter1.scalar[1] = predux(c1##iter3); \
2217 EIGEN_UNUSED_VARIABLE(cc##iter1); \
2220#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \
2221 GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX_OLD, N) \
2223 for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
2224 RhsPacket b0 = rhs2.template load<RhsPacket, Unaligned>(j); \
2225 GEMV_UNROLL_ROW(GEMV_WORK_ROW_COMPLEX_OLD, N) \
2228#define GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N) \
2229 for (; i < n##N; i += N) { \
2230 GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \
2231 GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX_OLD, (N >> 1)) \
2232 GEMV_PROCESS_END_ROW_COMPLEX(N) \
2235#define GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) \
2236 dd0 = predux(c1##iter);
2239#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW 1
2241#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW \
2242 (sizeof(Scalar) == sizeof(float)) || GEMV_IS_COMPLEX_COMPLEX
2245#define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) \
2246 if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \
2247 GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \
2249 GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \
2252#define GEMV_PROCESS_ROW_COMPLEX_ONE(N) \
2253 if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \
2254 GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) \
2256 GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N) \
2259#define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) \
2260 if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \
2261 GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) \
2263 GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) \
2268#define GEMV_PROCESS_ROW_COMPLEX(N) \
2269 GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N)
2271#define GEMV_PROCESS_ROW_COMPLEX(N) \
2272 GEMV_PROCESS_ROW_COMPLEX_ONE(N)
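// Path selection (added note, inferred; some preprocessor guards are elided in this
// listing): GEMV_PROCESS_ROW_COMPLEX resolves to the MMA kernel when MMA is usable,
// otherwise to the VSX kernel, which takes the packetized "new" path for float and
// complex*complex cases and keeps the older pcj.pmadd path for the remaining double
// mixed cases:
//
//   MMA available                                    -> GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N)
//   else, sizeof(Scalar) == sizeof(float)
//         || GEMV_IS_COMPLEX_COMPLEX                 -> GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N)
//   else                                             -> GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N)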
2275template<typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
2276EIGEN_STRONG_INLINE void gemv_complex_row(
2277 Index rows, Index cols,
2278 const LhsMapper& alhs,
2279 const RhsMapper& rhs,
2280 ResScalar* res, Index resIncr,
2283 typedef gemv_traits<LhsScalar, RhsScalar> Traits;
2285 typedef typename Traits::LhsPacket LhsPacket;
2286 typedef typename Traits::RhsPacket RhsPacket;
2287 typedef typename Traits::ResPacket ResPacket;
2289 typedef typename packet_traits<Scalar>::type ScalarPacket;
2290 typedef typename packet_traits<LhsScalar>::type PLhsPacket;
2291 typedef typename packet_traits<ResScalar>::type PResPacket;
2292 typedef gemv_traits<ResPacket, ResPacket> PTraits;
2296 LhsMapper lhs(alhs);
2297 typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
2299 eigen_internal_assert(rhs.stride() == 1);
2300 conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
2302 conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
2307#ifndef GCC_ONE_VECTORPAIR_BUG
2308 const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7);
2309 const Index n4 = rows - 3;
2310 const Index n2 = rows - 1;
2316 ResPacketSize = PTraits::ResPacketSize,
2317 LhsPacketSize = PTraits::LhsPacketSize,
2318 RhsPacketSize = PTraits::RhsPacketSize,
2322 PResPacket c00, c01, c02, c03, c04, c05, c06, c07;
2323 ResPacket c10, c11, c12, c13, c14, c15, c16, c17;
2325 __vector_quad e00, e01, e02, e03, e04, e05, e06, e07;
2326 GEMV_UNUSED_ROW(8, e0)
2327 GEMV_UNUSED_EXTRA(1, c0)
2328 GEMV_UNUSED_EXTRA(1, c1)
2331#ifndef GCC_ONE_VECTORPAIR_BUG
2334 if (!GEMV_IS_COMPLEX_COMPLEX)
2337 GEMV_PROCESS_ROW_COMPLEX(8)
2339 GEMV_PROCESS_ROW_COMPLEX(4)
2340 GEMV_PROCESS_ROW_COMPLEX(2)
2342 for (; i < rows; ++i)
2344 GEMV_PROCESS_ROW_COMPLEX_SINGLE(1)
2345 GEMV_PROCESS_ROW_COMPLEX_PREDUX(0)
2346 for (; j < cols; ++j)
2348 dd0 += cj.pmul(lhs(i, j), rhs2(j));
2350 res[i * resIncr] += alpha * dd0;
2354#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(Scalar, LhsScalar, RhsScalar) \
2355template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
2356struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version> \
2358 typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; \
2360 EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
2361 Index rows, Index cols, \
2362 const LhsMapper& lhs, \
2363 const RhsMapper& rhs, \
2364 ResScalar* res, Index resIncr, \
2365 ResScalar alpha) { \
2366 gemv_complex_col<Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar, RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
2370#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(Scalar, LhsScalar, RhsScalar) \
2371template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
2372struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version> \
2374 typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; \
2376 EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
2377 Index rows, Index cols, \
2378 const LhsMapper& lhs, \
2379 const RhsMapper& rhs, \
2380 ResScalar* res, Index resIncr, \
2381 ResScalar alpha) { \
2382 gemv_complex_row<Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar, RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
2386EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, float, std::complex<float>)
2387EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex<float>, float)
2388EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex<float>, std::complex<float>)
2389EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, double, std::complex<double>)
2390EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, std::complex<double>, double)
2391EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, std::complex<double>, std::complex<double>)
2392EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, float, std::complex<float>)
2393EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex<float>, float)
2394EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex<float>, std::complex<float>)
2395EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, double, std::complex<double>)
2396EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, std::complex<double>, double)
2397EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, std::complex<double>, std::complex<double>)
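// Usage sketch (added, not part of the original source): the complex specializations above
// cover complex*complex as well as the mixed real/complex combinations, e.g.
//
//   #include <Eigen/Dense>
//   void demo_complex_gemv() {
//     Eigen::MatrixXcf A(32, 48);
//     Eigen::VectorXcf x(48), y(32);
//     A.setRandom(); x.setRandom();
//     y.noalias() = A * x;    // complex lhs * complex rhs
//
//     Eigen::MatrixXf B(32, 48);
//     B.setRandom();
//     y.noalias() = B * x;    // real lhs * complex rhs (also specialized above)
//   }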