#ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H
#define EIGEN_MATRIX_PRODUCT_ALTIVEC_H

#ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK
#define EIGEN_ALTIVEC_USE_CUSTOM_PACK 1
#endif

#include "MatrixProductCommon.h"

#if !defined(EIGEN_ALTIVEC_DISABLE_MMA)
#define EIGEN_ALTIVEC_DISABLE_MMA 0
#endif

// Check for MMA builtin support.
#if !EIGEN_ALTIVEC_DISABLE_MMA && defined(__has_builtin)
#if __has_builtin(__builtin_mma_assemble_acc)
  #define EIGEN_ALTIVEC_MMA_SUPPORT
#endif
#endif

#if defined(EIGEN_ALTIVEC_MMA_SUPPORT)

#if !defined(EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH)
#define EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH 0
#endif

// Dynamic dispatch is not supported by LLVM.
#if EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH && !EIGEN_COMP_LLVM
#define EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH 1
#elif defined(__MMA__)
#define EIGEN_ALTIVEC_MMA_ONLY 1
#endif

#endif // EIGEN_ALTIVEC_MMA_SUPPORT

#if defined(EIGEN_ALTIVEC_MMA_ONLY) || defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
  #include "MatrixProductMMA.h"
#endif
template<typename Scalar>
struct quad_traits
{
  typedef typename packet_traits<Scalar>::type vectortype;
  typedef vectortype                           rhstype;
  enum { vectorsize = packet_traits<Scalar>::size, size = 4, rows = 4 };
};
// Permute masks used to split packed complex vectors into their real and imaginary halves.
const static Packet16uc p16uc_GETREAL32 = {  0,  1,  2,  3,  8,  9, 10, 11,
                                            16, 17, 18, 19, 24, 25, 26, 27};
const static Packet16uc p16uc_GETIMAG32 = {  4,  5,  6,  7, 12, 13, 14, 15,
                                            20, 21, 22, 23, 28, 29, 30, 31};
const static Packet16uc p16uc_GETREAL64 = {  0,  1,  2,  3,  4,  5,  6,  7,
                                            16, 17, 18, 19, 20, 21, 22, 23};
const static Packet16uc p16uc_GETIMAG64 = {  8,  9, 10, 11, 12, 13, 14, 15,
                                            24, 25, 26, 27, 28, 29, 30, 31};
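
// For illustration: vec_perm(v0, v1, p16uc_GETREAL32) applied to two vectors holding the
// interleaved complex floats {a_re, a_im, b_re, b_im} and {c_re, c_im, d_re, d_im} gathers the
// bytes of the even floats and yields {a_re, b_re, c_re, d_re}; p16uc_GETIMAG32 gathers the odd
// floats, {a_im, b_im, c_im, d_im}. The 64-bit variants do the same for pairs of complex doubles.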
template<typename Scalar, typename Index, int StorageOrder>
EIGEN_ALWAYS_INLINE std::complex<Scalar> getAdjointVal(Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt)
{
  std::complex<Scalar> v;
  if(i < j)
  {
    v.real( dt(j,i).real());
    v.imag(-dt(j,i).imag());
  } else if(i > j)
  {
    v.real( dt(i,j).real());
    v.imag( dt(i,j).imag());
  } else {
    v.real( dt(i,j).real());
    v.imag((Scalar)0.0);
  }
  return v;
}
template<typename Scalar, typename Index, int StorageOrder, int N>
EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex<Scalar>* blockB, const std::complex<Scalar>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
{
  const Index depth = k2 + rows;
  const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> rhs(_rhs, rhsStride);
  const Index vectorSize = N*quad_traits<Scalar>::vectorsize;
  const Index vectorDelta = vectorSize * rows;
  Scalar* blockBf = reinterpret_cast<Scalar *>(blockB);

  Index rir = 0, rii, j = 0;
  for(; j + vectorSize <= cols; j+=vectorSize)
  {
    rii = rir + vectorDelta;

    for(Index i = k2; i < depth; i++)
    {
      for(Index k = 0; k < vectorSize; k++)
      {
        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, j + k, rhs);

        blockBf[rir + k] = v.real();
        blockBf[rii + k] = v.imag();
      }
      rir += vectorSize;
      rii += vectorSize;
    }

    rir += vectorDelta;
  }

  for(; j < cols; j++)
  {
    rii = rir + rows;

    for(Index i = k2; i < depth; i++)
    {
      std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, j, rhs);

      blockBf[rir] = v.real();
      blockBf[rii] = v.imag();

      rir += 1;
      rii += 1;
    }

    rir += rows;
  }
}
template<typename Scalar, typename Index, int StorageOrder>
EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex<Scalar>* blockA, const std::complex<Scalar>* _lhs, Index lhsStride, Index cols, Index rows)
{
  const Index depth = cols;
  const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> lhs(_lhs, lhsStride);
  const Index vectorSize = quad_traits<Scalar>::vectorsize;
  const Index vectorDelta = vectorSize * depth;
  Scalar* blockAf = (Scalar *)(blockA);

  Index rir = 0, rii, j = 0;
  for(; j + vectorSize <= rows; j+=vectorSize)
  {
    rii = rir + vectorDelta;

    for(Index i = 0; i < depth; i++)
    {
      for(Index k = 0; k < vectorSize; k++)
      {
        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(j+k, i, lhs);

        blockAf[rir + k] = v.real();
        blockAf[rii + k] = v.imag();
      }
      rir += vectorSize;
      rii += vectorSize;
    }

    rir += vectorDelta;
  }

  if (j < rows)
  {
    rii = rir + ((rows - j) * depth);

    for(Index i = 0; i < depth; i++)
    {
      Index k = j;
      for(; k < rows; k++)
      {
        std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(k, i, lhs);

        blockAf[rir] = v.real();
        blockAf[rii] = v.imag();

        rir += 1;
        rii += 1;
      }
    }
  }
}
template<typename Scalar, typename Index, int StorageOrder, int N>
EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
{
  const Index depth = k2 + rows;
  const_blas_data_mapper<Scalar, Index, StorageOrder> rhs(_rhs, rhsStride);
  const Index vectorSize = quad_traits<Scalar>::vectorsize;

  Index ri = 0, j = 0;
  for(; j + N*vectorSize <= cols; j+=N*vectorSize)
  {
    Index i = k2;
    for(; i < depth; i++)
    {
      for(Index k = 0; k < N*vectorSize; k++)
      {
        if(i <= j+k)
          blockB[ri + k] = rhs(j+k, i);
        else
          blockB[ri + k] = rhs(i, j+k);
      }
      ri += N*vectorSize;
    }
  }

  for(; j < cols; j++)
  {
    for(Index i = k2; i < depth; i++)
    {
      if(j <= i)
        blockB[ri] = rhs(i, j);
      else
        blockB[ri] = rhs(j, i);
      ri += 1;
    }
  }
}
template<typename Scalar, typename Index, int StorageOrder>
EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
{
  const Index depth = cols;
  const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs, lhsStride);
  const Index vectorSize = quad_traits<Scalar>::vectorsize;

  Index ri = 0, j = 0;
  for(; j + vectorSize <= rows; j+=vectorSize)
  {
    Index i = 0;
    for(; i < depth; i++)
    {
      for(Index k = 0; k < vectorSize; k++)
      {
        if(i <= j+k)
          blockA[ri + k] = lhs(j+k, i);
        else
          blockA[ri + k] = lhs(i, j+k);
      }
      ri += vectorSize;
    }
  }

  if (j < rows)
  {
    for(Index i = 0; i < depth; i++)
    {
      Index k = j;
      for(; k < rows; k++)
      {
        if(i <= k)
          blockA[ri] = lhs(k, i);
        else
          blockA[ri] = lhs(i, k);
        ri += 1;
      }
    }
  }
}
template<typename Index, int nr, int StorageOrder>

template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>

template<typename Index, int nr, int StorageOrder>

template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>

template<typename Index, int nr, int StorageOrder>

template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>

template<typename Index, int nr, int StorageOrder>

template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
template<typename Scalar, typename Packet, typename Index, int N>
  const Index size = 16 / sizeof(Scalar);

template<typename Scalar, typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode, bool UseLhs>
  Scalar* blockAt = reinterpret_cast<Scalar *>(blockA);

  for(; i < depth; i++)
        cblock.packet[0] = pload2(lhs(j + 0, i), lhs(j + 1, i));
        cblock.packet[1] = pload2(lhs(j + 2, i), lhs(j + 3, i));
        cblock.packet[0] = pload2(lhs(i, j + 0), lhs(i, j + 1));
        cblock.packet[1] = pload2(lhs(i, j + 2), lhs(i, j + 3));

    for(Index i = 0; i < depth; i++)

    for(Index i = 0; i < depth; i++)

template<typename Scalar, typename Index, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode, bool UseLhs>
  for(; i < depth; i++)
        blockA[ri+0] = lhs(j+0, i);
        blockA[ri+1] = lhs(j+1, i);
        blockA[ri+2] = lhs(j+2, i);
        blockA[ri+3] = lhs(j+3, i);
        blockA[ri+0] = lhs(i, j+0);
        blockA[ri+1] = lhs(i, j+1);
        blockA[ri+2] = lhs(i, j+2);
        blockA[ri+3] = lhs(i, j+3);

    for(Index i = 0; i < depth; i++)
      blockA[ri] = lhs(i, j);

    for(Index i = 0; i < depth; i++)
        blockA[ri] = lhs(k, i);

template<typename Index, typename DataMapper, int StorageOrder, bool PanelMode>
  for(; i < depth; i++)
      blockA[ri+0] = lhs(j+0, i);
      blockA[ri+1] = lhs(j+1, i);

    for(Index i = 0; i < depth; i++)
        blockA[ri] = lhs(k, i);

template<typename Index, typename DataMapper, int StorageOrder, bool PanelMode>
  for(; i < depth; i++)
      blockB[ri+0] = rhs(i, j+0);
      blockB[ri+1] = rhs(i, j+1);
      blockB[ri+0] = rhs(i, j+2);
      blockB[ri+1] = rhs(i, j+3);

    for(Index i = 0; i < depth; i++)
      blockB[ri] = rhs(i, j);

template<typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
  double* blockAt = reinterpret_cast<double *>(blockA);

  for(; i < depth; i++)

    for(Index i = 0; i < depth; i++)

template<typename Index, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
  double* blockBt = reinterpret_cast<double *>(blockB);

  for(; i < depth; i++)

    for(Index i = 0; i < depth; i++)
// General rank-1 update of a PacketBlock accumulator.
template<typename Packet, bool NegativeAccumulate, int N>
EIGEN_ALWAYS_INLINE void pger_common(PacketBlock<Packet,N>* acc, const Packet& lhsV, const Packet* rhsV)
{
  if (NegativeAccumulate)
  {
    acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
    if (N > 1) acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]);
    if (N > 2) acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]);
    if (N > 3) acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]);
  } else {
    acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
    if (N > 1) acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]);
    if (N > 2) acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]);
    if (N > 3) acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]);
  }
}
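
// For illustration: one pger_common<Packet, false, 4> call is a rank-1 update of a 4-column
// accumulator block; for each column j it performs acc->packet[j] += lhsV * rhsV[j] as an
// elementwise fused multiply-add, or a fused multiply-subtract when NegativeAccumulate is set.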
1045template<
int N,
typename Scalar,
typename Packet,
bool NegativeAccumulate>
1046EIGEN_ALWAYS_INLINE
void pger(PacketBlock<Packet,N>* acc,
const Scalar* lhs,
const Packet* rhsV)
1048 Packet lhsV = pload<Packet>(lhs);
1050 pger_common<Packet, NegativeAccumulate, N>(acc, lhsV, rhsV);
template<typename Scalar, typename Packet, typename Index, const Index remaining_rows>
EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV)
{
#ifdef _ARCH_PWR9
  lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar));
#else
  Index i = 0;
  do {
    lhsV[i] = lhs[i];
  } while (++i < remaining_rows);
#endif
}
template<int N, typename Scalar, typename Packet, typename Index, bool NegativeAccumulate, const Index remaining_rows>
EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV)
{
  Packet lhsV;
  loadPacketRemaining<Scalar, Packet, Index, remaining_rows>(lhs, lhsV);

  pger_common<Packet, NegativeAccumulate, N>(acc, lhsV, rhsV);
}
// Rank-1 update of a complex accumulator held as decoupled real/imaginary PacketBlocks.
// It also takes care of the mixed real*complex and complex*real cases.
template<int N, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi)
{
  pger_common<Packet, false, N>(accReal, lhsV, rhsV);
  if(LhsIsReal)
  {
    pger_common<Packet, ConjugateRhs, N>(accImag, lhsV, rhsVi);
    EIGEN_UNUSED_VARIABLE(lhsVi);
  } else {
    if(!RhsIsReal)
    {
      pger_common<Packet, ConjugateLhs == ConjugateRhs, N>(accReal, lhsVi, rhsVi);
      pger_common<Packet, ConjugateRhs, N>(accImag, lhsV, rhsVi);
    } else {
      EIGEN_UNUSED_VARIABLE(rhsVi);
    }
    pger_common<Packet, ConjugateLhs, N>(accImag, lhsVi, rhsV);
  }
}
template<int N, typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi)
{
  Packet lhsV = ploadLhs<Scalar, Packet>(lhs_ptr);
  Packet lhsVi;
  if(!LhsIsReal) lhsVi = ploadLhs<Scalar, Packet>(lhs_ptr_imag);
  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);

  pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
}
template<typename Scalar, typename Packet, typename Index, bool LhsIsReal, const Index remaining_rows>
EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi)
{
#ifdef _ARCH_PWR9
  lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows * sizeof(Scalar));
  if(!LhsIsReal) lhsVi = vec_xl_len((Scalar *)lhs_ptr_imag, remaining_rows * sizeof(Scalar));
  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
#else
  Index i = 0;
  do {
    lhsV[i] = lhs_ptr[i];
    if(!LhsIsReal) lhsVi[i] = lhs_ptr_imag[i];
  } while (++i < remaining_rows);
  if(LhsIsReal) EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
#endif
}
template<int N, typename Scalar, typename Packet, typename Index, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index remaining_rows>
EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi)
{
  Packet lhsV, lhsVi;
  loadPacketRemaining<Scalar, Packet, Index, LhsIsReal, remaining_rows>(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi);

  pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
}
template<typename Scalar, typename Packet>
EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs)
{
  return ploadu<Packet>(lhs);
}
// Zero a PacketBlock accumulator.
template<typename Scalar, typename Packet, int N>
EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock<Packet,N>& acc)
{
  acc.packet[0] = pset1<Packet>((Scalar)0);
  if (N > 1) acc.packet[1] = pset1<Packet>((Scalar)0);
  if (N > 2) acc.packet[2] = pset1<Packet>((Scalar)0);
  if (N > 3) acc.packet[3] = pset1<Packet>((Scalar)0);
}
// Scale the accumulator by alpha and add it into acc.
template<typename Packet, int N>
EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha)
{
  acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
  if (N > 1) acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]);
  if (N > 2) acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]);
  if (N > 3) acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]);
}
template<typename Packet, int N>
EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha)
{
  acc.packet[0] = pmul<Packet>(accZ.packet[0], pAlpha);
  if (N > 1) acc.packet[1] = pmul<Packet>(accZ.packet[1], pAlpha);
  if (N > 2) acc.packet[2] = pmul<Packet>(accZ.packet[2], pAlpha);
  if (N > 3) acc.packet[3] = pmul<Packet>(accZ.packet[3], pAlpha);
}
// Complex version of PacketBlock scaling.
template<typename Packet, int N>
EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag)
{
  bscalec_common<Packet, N>(cReal, aReal, bReal);

  bscalec_common<Packet, N>(cImag, aImag, bReal);

  pger_common<Packet, true, N>(&cReal, bImag, aImag.packet);

  pger_common<Packet, false, N>(&cImag, bImag, aReal.packet);
}
template<typename Packet, int N>
EIGEN_ALWAYS_INLINE void band(PacketBlock<Packet,N>& acc, const Packet& pMask)
{
  acc.packet[0] = pand(acc.packet[0], pMask);
  if (N > 1) acc.packet[1] = pand(acc.packet[1], pMask);
  if (N > 2) acc.packet[2] = pand(acc.packet[2], pMask);
  if (N > 3) acc.packet[3] = pand(acc.packet[3], pMask);
}
template<typename Packet, int N>
EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag, const Packet& pMask)
{
  band<Packet, N>(aReal, pMask);
  band<Packet, N>(aImag, pMask);

  bscalec<Packet,N>(aReal, aImag, bReal, bImag, cReal, cImag);
}
// Load a PacketBlock of the result; the complex case loads a second half offset by accCols.
template<typename DataMapper, typename Packet, typename Index, const Index accCols, int StorageOrder, bool Complex, int N>
EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index col)
{
  if (StorageOrder == RowMajor) {
    acc.packet[0] = res.template loadPacket<Packet>(row + 0, col);
    if (N > 1) acc.packet[1] = res.template loadPacket<Packet>(row + 1, col);
    if (N > 2) acc.packet[2] = res.template loadPacket<Packet>(row + 2, col);
    if (N > 3) acc.packet[3] = res.template loadPacket<Packet>(row + 3, col);
    if (Complex) {
      acc.packet[0+N] = res.template loadPacket<Packet>(row + 0, col + accCols);
      if (N > 1) acc.packet[1+N] = res.template loadPacket<Packet>(row + 1, col + accCols);
      if (N > 2) acc.packet[2+N] = res.template loadPacket<Packet>(row + 2, col + accCols);
      if (N > 3) acc.packet[3+N] = res.template loadPacket<Packet>(row + 3, col + accCols);
    }
  } else {
    acc.packet[0] = res.template loadPacket<Packet>(row, col + 0);
    if (N > 1) acc.packet[1] = res.template loadPacket<Packet>(row, col + 1);
    if (N > 2) acc.packet[2] = res.template loadPacket<Packet>(row, col + 2);
    if (N > 3) acc.packet[3] = res.template loadPacket<Packet>(row, col + 3);
    if (Complex) {
      acc.packet[0+N] = res.template loadPacket<Packet>(row + accCols, col + 0);
      if (N > 1) acc.packet[1+N] = res.template loadPacket<Packet>(row + accCols, col + 1);
      if (N > 2) acc.packet[2+N] = res.template loadPacket<Packet>(row + accCols, col + 2);
      if (N > 3) acc.packet[3+N] = res.template loadPacket<Packet>(row + accCols, col + 3);
    }
  }
}
const static Packet4i mask41 = { -1, 0, 0, 0 };
const static Packet4i mask42 = { -1, -1, 0, 0 };
const static Packet4i mask43 = { -1, -1, -1, 0 };

const static Packet2l mask21 = { -1, 0 };
template<typename Packet>
EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows)
{
  if (remaining_rows == 0) {
    return pset1<Packet>(float(0.0));
  } else {
    switch (remaining_rows) {
      case 1:  return Packet(mask41);
      case 2:  return Packet(mask42);
      default: return Packet(mask43);
    }
  }
}

template<>
EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(const int remaining_rows)
{
  if (remaining_rows == 0) {
    return pset1<Packet2d>(double(0.0));
  } else {
    return Packet2d(mask21);
  }
}
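
// For illustration: with remaining_rows == 2 the float path returns mask42 = {-1,-1,0,0}
// reinterpreted as the packet type; band() ANDs each accumulator packet with it, keeping only the
// first two lanes so a partial row block can reuse the full-width kernel. Double packets have a
// single possible partial width, hence the lone mask21 = {-1,0}.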
template<typename Packet, int N>
EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha, const Packet& pMask)
{
  band<Packet, N>(accZ, pMask);

  bscale<Packet, N>(acc, accZ, pAlpha);
}
template<typename Packet, int N> EIGEN_ALWAYS_INLINE void
pbroadcastN_old(const __UNPACK_TYPE__(Packet) *a,
                Packet& a0, Packet& a1, Packet& a2, Packet& a3)
{
  a0 = pset1<Packet>(a[0]);
  if (N > 1) {
    a1 = pset1<Packet>(a[1]);
  } else {
    EIGEN_UNUSED_VARIABLE(a1);
  }
  if (N > 2) {
    a2 = pset1<Packet>(a[2]);
  } else {
    EIGEN_UNUSED_VARIABLE(a2);
  }
  if (N > 3) {
    a3 = pset1<Packet>(a[3]);
  } else {
    EIGEN_UNUSED_VARIABLE(a3);
  }
}
template<> EIGEN_ALWAYS_INLINE void pbroadcastN_old<Packet4f,4>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
  pbroadcast4<Packet4f>(a, a0, a1, a2, a3);
}

template<> EIGEN_ALWAYS_INLINE void pbroadcastN_old<Packet2d,4>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
  a1 = pload<Packet2d>(a);
  a3 = pload<Packet2d>(a + 2);
  a0 = vec_splat(a1, 0);
  a1 = vec_splat(a1, 1);
  a2 = vec_splat(a3, 0);
  a3 = vec_splat(a3, 1);
}
template<typename Packet, int N> EIGEN_ALWAYS_INLINE void
pbroadcastN(const __UNPACK_TYPE__(Packet) *a,
            Packet& a0, Packet& a1, Packet& a2, Packet& a3)
{
  a0 = pset1<Packet>(a[0]);
  if (N > 1) {
    a1 = pset1<Packet>(a[1]);
  } else {
    EIGEN_UNUSED_VARIABLE(a1);
  }
  if (N > 2) {
    a2 = pset1<Packet>(a[2]);
  } else {
    EIGEN_UNUSED_VARIABLE(a2);
  }
  if (N > 3) {
    a3 = pset1<Packet>(a[3]);
  } else {
    EIGEN_UNUSED_VARIABLE(a3);
  }
}
template<> EIGEN_ALWAYS_INLINE void
pbroadcastN<Packet4f,4>(const float *a,
                        Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
  a3 = pload<Packet4f>(a);
  a0 = vec_splat(a3, 0);
  a1 = vec_splat(a3, 1);
  a2 = vec_splat(a3, 2);
  a3 = vec_splat(a3, 3);
}
#define MICRO_UNROLL_PEEL(func) \
  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)

#define MICRO_ZERO_PEEL(peel) \
  if ((PEEL_ROW > peel) && (peel != 0)) { \
    bsetzero<Scalar, Packet, accRows>(accZero##peel); \
  } else { \
    EIGEN_UNUSED_VARIABLE(accZero##peel); \
  }

#define MICRO_ZERO_PEEL_ROW \
  MICRO_UNROLL_PEEL(MICRO_ZERO_PEEL);

#define MICRO_WORK_PEEL(peel) \
  if (PEEL_ROW > peel) { \
    pbroadcastN<Packet,accRows>(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
    pger<accRows, Scalar, Packet, false>(&accZero##peel, lhs_ptr + (remaining_rows * peel), rhsV##peel); \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
  }

#define MICRO_WORK_PEEL_ROW \
  Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4], rhsV4[4], rhsV5[4], rhsV6[4], rhsV7[4]; \
  MICRO_UNROLL_PEEL(MICRO_WORK_PEEL); \
  lhs_ptr += (remaining_rows * PEEL_ROW); \
  rhs_ptr += (accRows * PEEL_ROW);

#define MICRO_ADD_PEEL(peel, sum) \
  if (PEEL_ROW > peel) { \
    for (Index i = 0; i < accRows; i++) { \
      accZero##sum.packet[i] += accZero##peel.packet[i]; \
    } \
  }

#define MICRO_ADD_PEEL_ROW \
  MICRO_ADD_PEEL(4, 0) MICRO_ADD_PEEL(5, 1) MICRO_ADD_PEEL(6, 2) MICRO_ADD_PEEL(7, 3) \
  MICRO_ADD_PEEL(2, 0) MICRO_ADD_PEEL(3, 1) MICRO_ADD_PEEL(1, 0)
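
// For illustration: the row-remainder kernel keeps up to eight independent accumulators
// (accZero0..accZero7), one per peeled depth step, so the FMAs of successive k iterations do not
// serialize on a single register. MICRO_ADD_PEEL_ROW then folds them back together with a small
// reduction tree (4->0, 5->1, 6->2, 7->3, then 2->0, 3->1, finally 1->0) before the result is
// scaled by alpha and stored.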
template<typename Scalar, typename Packet, typename Index, const Index accRows, const Index remaining_rows>
EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW(
  const Scalar* &lhs_ptr,
  const Scalar* &rhs_ptr,
  PacketBlock<Packet,accRows> &accZero)
{
  Packet rhsV[4];
  pbroadcastN<Packet,accRows>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
  pger<accRows, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
  lhs_ptr += remaining_rows;
  rhs_ptr += accRows;
}
template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols, const Index remaining_rows>
EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index offsetA,
  Index row,
  Index col,
  Index rows,
  Index cols,
  const Packet& pAlpha,
  const Packet& pMask)
{
  const Scalar* rhs_ptr = rhs_base;
  const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
  PacketBlock<Packet,accRows> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7, acc;

  bsetzero<Scalar, Packet, accRows>(accZero0);

  Index remaining_depth = (col + quad_traits<Scalar>::rows < cols) ? depth : (depth & -quad_traits<Scalar>::rows);
  Index k = 0;
  if (remaining_depth >= PEEL_ROW) {
    MICRO_ZERO_PEEL_ROW
    do
    {
      EIGEN_POWER_PREFETCH(rhs_ptr);
      EIGEN_POWER_PREFETCH(lhs_ptr);
      MICRO_WORK_PEEL_ROW
    } while ((k += PEEL_ROW) + PEEL_ROW <= remaining_depth);
    MICRO_ADD_PEEL_ROW
  }
  for(; k < remaining_depth; k++)
  {
    MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows, remaining_rows>(lhs_ptr, rhs_ptr, accZero0);
  }

  if ((remaining_depth == depth) && (rows >= accCols))
  {
    bload<DataMapper, Packet, Index, 0, ColMajor, false, accRows>(acc, res, row, 0);
    bscale<Packet,accRows>(acc, accZero0, pAlpha, pMask);
    res.template storePacketBlock<Packet,accRows>(row, 0, acc);
  } else {
    for(; k < depth; k++)
    {
      Packet rhsV[4];
      pbroadcastN<Packet,accRows>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
      pger<accRows, Scalar, Packet, Index, false, remaining_rows>(&accZero0, lhs_ptr, rhsV);
      lhs_ptr += remaining_rows;
      rhs_ptr += accRows;
    }

    for(Index j = 0; j < accRows; j++) {
      accZero0.packet[j] = vec_mul(pAlpha, accZero0.packet[j]);
      for(Index i = 0; i < remaining_rows; i++) {
        res(row + i, j) += accZero0.packet[j][i];
      }
    }
  }
}
template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
EIGEN_ALWAYS_INLINE void gemm_extra_row(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index offsetA,
  Index row,
  Index col,
  Index rows,
  Index cols,
  Index remaining_rows,
  const Packet& pAlpha,
  const Packet& pMask)
{
  switch(remaining_rows) {
    case 1:
      gemm_unrolled_row_iteration<Scalar, Packet, DataMapper, Index, accRows, accCols, 1>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask);
      break;
    case 2:
      if (sizeof(Scalar) == sizeof(float)) {
        gemm_unrolled_row_iteration<Scalar, Packet, DataMapper, Index, accRows, accCols, 2>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask);
      }
      break;
    default:
      if (sizeof(Scalar) == sizeof(float)) {
        gemm_unrolled_row_iteration<Scalar, Packet, DataMapper, Index, accRows, accCols, 3>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask);
      }
      break;
  }
}
#define MICRO_UNROLL(func) \
  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)

#define MICRO_UNROLL_WORK(func, func2, peel) \
  MICRO_UNROLL(func2); \
  func(0,peel) func(1,peel) func(2,peel) func(3,peel) \
  func(4,peel) func(5,peel) func(6,peel) func(7,peel)

#define MICRO_LOAD_ONE(iter) \
  if (unroll_factor > iter) { \
    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr##iter); \
    lhs_ptr##iter += accCols; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
  }

#define MICRO_WORK_ONE(iter, peel) \
  if (unroll_factor > iter) { \
    pger_common<Packet, false, accRows>(&accZero##iter, lhsV##iter, rhsV##peel); \
  }

#define MICRO_TYPE_PEEL4(func, func2, peel) \
  if (PEEL > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
    pbroadcastN<Packet,accRows>(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
    MICRO_UNROLL_WORK(func, func2, peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
  }

#define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \
  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M]; \
  func(func1,func2,0); func(func1,func2,1); \
  func(func1,func2,2); func(func1,func2,3); \
  func(func1,func2,4); func(func1,func2,5); \
  func(func1,func2,6); func(func1,func2,7);

#define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \
  Packet rhsV0[M]; \
  func(func1,func2,0);

#define MICRO_ONE_PEEL4 \
  MICRO_UNROLL_TYPE_PEEL(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
  rhs_ptr += (accRows * PEEL);

#define MICRO_ONE4 \
  MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
  rhs_ptr += accRows;

#define MICRO_DST_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    bsetzero<Scalar, Packet, accRows>(accZero##iter); \
  } else { \
    EIGEN_UNUSED_VARIABLE(accZero##iter); \
  }

#define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE)

#define MICRO_SRC_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
  }

#define MICRO_SRC_PTR MICRO_UNROLL(MICRO_SRC_PTR_ONE)

#define MICRO_PREFETCH_ONE(iter) \
  if (unroll_factor > iter) { \
    EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
  }

#define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE)

#define MICRO_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
    bload<DataMapper, Packet, Index, 0, ColMajor, false, accRows>(acc, res, row + iter*accCols, 0); \
    bscale<Packet,accRows>(acc, accZero##iter, pAlpha); \
    res.template storePacketBlock<Packet,accRows>(row + iter*accCols, 0, acc); \
  }

#define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)
template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
EIGEN_STRONG_INLINE void gemm_unrolled_iteration(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index& row,
  const Packet& pAlpha)
{
  const Scalar* rhs_ptr = rhs_base;
  const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
  PacketBlock<Packet,accRows> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
  PacketBlock<Packet,accRows> acc;

  MICRO_SRC_PTR
  MICRO_DST_PTR

  Index k = 0;
  for(; k + PEEL <= depth; k+= PEEL)
  {
    EIGEN_POWER_PREFETCH(rhs_ptr);
    MICRO_PREFETCH
    MICRO_ONE_PEEL4
  }
  for(; k < depth; k++)
  {
    MICRO_ONE4
  }
  MICRO_STORE

  row += unroll_factor*accCols;
}
template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
EIGEN_ALWAYS_INLINE void gemm_cols(
  const DataMapper& res,
  const Scalar* blockA,
  const Scalar* blockB,
  Index depth,
  Index strideA,
  Index offsetA,
  Index strideB,
  Index offsetB,
  Index col,
  Index rows,
  Index cols,
  Index remaining_rows,
  const Packet& pAlpha,
  const Packet& pMask)
{
  const DataMapper res3 = res.getSubMapper(0, col);

  const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
  const Scalar* lhs_base = blockA + accCols*offsetA;
  Index row = 0;

#define MAX_UNROLL 7
  while(row + MAX_UNROLL*accCols <= rows) {
    gemm_unrolled_iteration<MAX_UNROLL, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
  }
  switch( (rows-row)/accCols ) {
    case 7:
      gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
      break;
    case 6:
      gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
      break;
    case 5:
      gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
      break;
    case 4:
      gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
      break;
    case 3:
      gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
      break;
    case 2:
      gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
      break;
    case 1:
      gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
      break;
    default:
      break;
  }
#undef MAX_UNROLL

  if(remaining_rows > 0)
  {
    gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask);
  }
}
template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
EIGEN_STRONG_INLINE void gemm_extra_cols(
  const DataMapper& res,
  const Scalar* blockA,
  const Scalar* blockB,
  Index depth,
  Index strideA,
  Index offsetA,
  Index strideB,
  Index offsetB,
  Index col,
  Index rows,
  Index cols,
  Index remaining_rows,
  const Packet& pAlpha,
  const Packet& pMask)
{
  for (; col < cols; col++) {
    gemm_cols<Scalar, Packet, DataMapper, Index, 1, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
  }
}
template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  const Index remaining_rows = rows % accCols;

  if( strideA == -1 ) strideA = depth;
  if( strideB == -1 ) strideB = depth;

  const Packet pAlpha = pset1<Packet>(alpha);
  const Packet pMask  = bmask<Packet>((const int)(remaining_rows));

  Index col = 0;
  for(; col + accRows <= cols; col += accRows)
  {
    gemm_cols<Scalar, Packet, DataMapper, Index, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
  }

  gemm_extra_cols<Scalar, Packet, DataMapper, Index, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
}
#define accColsC (accCols / 2)
#define advanceRows ((LhsIsReal) ? 1 : 2)
#define advanceCols ((RhsIsReal) ? 1 : 2)

#define PEEL_COMPLEX 3
#define PEEL_COMPLEX_ROW 3
#define MICRO_COMPLEX_UNROLL_PEEL(func) \
  func(0) func(1) func(2) func(3)

#define MICRO_COMPLEX_ZERO_PEEL(peel) \
  if ((PEEL_COMPLEX_ROW > peel) && (peel != 0)) { \
    bsetzero<Scalar, Packet, accRows>(accReal##peel); \
    bsetzero<Scalar, Packet, accRows>(accImag##peel); \
  } else { \
    EIGEN_UNUSED_VARIABLE(accReal##peel); \
    EIGEN_UNUSED_VARIABLE(accImag##peel); \
  }

#define MICRO_COMPLEX_ZERO_PEEL_ROW \
  MICRO_COMPLEX_UNROLL_PEEL(MICRO_COMPLEX_ZERO_PEEL);

#define MICRO_COMPLEX_WORK_PEEL(peel) \
  if (PEEL_COMPLEX_ROW > peel) { \
    pbroadcastN_old<Packet,accRows>(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
    if(!RhsIsReal) pbroadcastN_old<Packet,accRows>(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \
    pgerc<accRows, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##peel, &accImag##peel, lhs_ptr_real + (remaining_rows * peel), lhs_ptr_imag + (remaining_rows * peel), rhsV##peel, rhsVi##peel); \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
  }

#define MICRO_COMPLEX_WORK_PEEL_ROW \
  Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4]; \
  Packet rhsVi0[4], rhsVi1[4], rhsVi2[4], rhsVi3[4]; \
  MICRO_COMPLEX_UNROLL_PEEL(MICRO_COMPLEX_WORK_PEEL); \
  lhs_ptr_real += (remaining_rows * PEEL_COMPLEX_ROW); \
  if(!LhsIsReal) lhs_ptr_imag += (remaining_rows * PEEL_COMPLEX_ROW); \
  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); \
  rhs_ptr_real += (accRows * PEEL_COMPLEX_ROW); \
  if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX_ROW); \
  else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);

#define MICRO_COMPLEX_ADD_PEEL(peel, sum) \
  if (PEEL_COMPLEX_ROW > peel) { \
    for (Index i = 0; i < accRows; i++) { \
      accReal##sum.packet[i] += accReal##peel.packet[i]; \
      accImag##sum.packet[i] += accImag##peel.packet[i]; \
    } \
  }

#define MICRO_COMPLEX_ADD_PEEL_ROW \
  MICRO_COMPLEX_ADD_PEEL(2, 0) MICRO_COMPLEX_ADD_PEEL(3, 1) \
  MICRO_COMPLEX_ADD_PEEL(1, 0)
template<typename Scalar, typename Packet, typename Index, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index remaining_rows>
EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW(
  const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag,
  const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag,
  PacketBlock<Packet,accRows> &accReal, PacketBlock<Packet,accRows> &accImag)
{
  Packet rhsV[4], rhsVi[4];
  pbroadcastN_old<Packet,accRows>(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
  if(!RhsIsReal) pbroadcastN_old<Packet,accRows>(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]);
  pgerc<accRows, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
  lhs_ptr_real += remaining_rows;
  if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
  rhs_ptr_real += accRows;
  if(!RhsIsReal) rhs_ptr_imag += accRows;
  else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
}
template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index remaining_rows>
EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index offsetA,
  Index strideB,
  Index row,
  Index col,
  Index rows,
  Index cols,
  const Packet& pAlphaReal,
  const Packet& pAlphaImag,
  const Packet& pMask)
{
  const Scalar* rhs_ptr_real = rhs_base;
  const Scalar* rhs_ptr_imag = NULL;
  if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB;
  else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
  const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA;
  const Scalar* lhs_ptr_imag = NULL;
  if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
  PacketBlock<Packet,accRows> accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;
  PacketBlock<Packet,accRows> taccReal, taccImag;
  PacketBlock<Packetc,accRows> acc0, acc1;
  PacketBlock<Packetc,accRows*2> tRes;

  bsetzero<Scalar, Packet, accRows>(accReal0);
  bsetzero<Scalar, Packet, accRows>(accImag0);

  Index remaining_depth = (col + quad_traits<Scalar>::rows < cols) ? depth : (depth & -quad_traits<Scalar>::rows);
  Index k = 0;
  if (remaining_depth >= PEEL_COMPLEX_ROW) {
    MICRO_COMPLEX_ZERO_PEEL_ROW
    do
    {
      EIGEN_POWER_PREFETCH(rhs_ptr_real);
      if(!RhsIsReal) {
        EIGEN_POWER_PREFETCH(rhs_ptr_imag);
      }
      EIGEN_POWER_PREFETCH(lhs_ptr_real);
      if(!LhsIsReal) {
        EIGEN_POWER_PREFETCH(lhs_ptr_imag);
      }
      MICRO_COMPLEX_WORK_PEEL_ROW
    } while ((k += PEEL_COMPLEX_ROW) + PEEL_COMPLEX_ROW <= remaining_depth);
    MICRO_COMPLEX_ADD_PEEL_ROW
  }
  for(; k < remaining_depth; k++)
  {
    MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, remaining_rows>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal0, accImag0);
  }

  if ((remaining_depth == depth) && (rows >= accCols))
  {
    bload<DataMapper, Packetc, Index, accColsC, ColMajor, true, accRows>(tRes, res, row, 0);
    bscalec<Packet,accRows>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
    bcouple<Packet, Packetc, accRows>(taccReal, taccImag, tRes, acc0, acc1);
    res.template storePacketBlock<Packetc,accRows>(row + 0, 0, acc0);
    res.template storePacketBlock<Packetc,accRows>(row + accColsC, 0, acc1);
  } else {
    for(; k < depth; k++)
    {
      Packet rhsV[4], rhsVi[4];
      pbroadcastN_old<Packet,accRows>(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
      if(!RhsIsReal) pbroadcastN_old<Packet,accRows>(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]);
      pgerc<accRows, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, remaining_rows>(&accReal0, &accImag0, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
      lhs_ptr_real += remaining_rows;
      if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
      rhs_ptr_real += accRows;
      if(!RhsIsReal) rhs_ptr_imag += accRows;
    }

    bscalec<Packet,accRows>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag);
    bcouple_common<Packet, Packetc, accRows>(taccReal, taccImag, acc0, acc1);

    if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1))
    {
      for(Index j = 0; j < accRows; j++) {
        res(row + 0, j) += pfirst<Packetc>(acc0.packet[j]);
      }
    } else {
      for(Index j = 0; j < accRows; j++) {
        PacketBlock<Packetc,1> acc2;
        acc2.packet[0] = res.template loadPacket<Packetc>(row + 0, j) + acc0.packet[j];
        res.template storePacketBlock<Packetc,1>(row + 0, j, acc2);
        if(remaining_rows > accColsC) {
          res(row + accColsC, j) += pfirst<Packetc>(acc1.packet[j]);
        }
      }
    }
  }
}
template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index offsetA,
  Index strideB,
  Index row,
  Index col,
  Index rows,
  Index cols,
  Index remaining_rows,
  const Packet& pAlphaReal,
  const Packet& pAlphaImag,
  const Packet& pMask)
{
  switch(remaining_rows) {
    case 1:
      gemm_unrolled_complex_row_iteration<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, 1>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask);
      break;
    case 2:
      if (sizeof(Scalar) == sizeof(float)) {
        gemm_unrolled_complex_row_iteration<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, 2>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask);
      }
      break;
    default:
      if (sizeof(Scalar) == sizeof(float)) {
        gemm_unrolled_complex_row_iteration<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, 3>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask);
      }
      break;
  }
}
#define MICRO_COMPLEX_UNROLL(func) \
  func(0) func(1) func(2) func(3)

#define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
  MICRO_COMPLEX_UNROLL(func2); \
  func(0,peel) func(1,peel) func(2,peel) func(3,peel)

#define MICRO_COMPLEX_LOAD_ONE(iter) \
  if (unroll_factor > iter) { \
    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \
    if(!LhsIsReal) { \
      lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter + imag_delta); \
    } else { \
      EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
    } \
    lhs_ptr_real##iter += accCols; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
    EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
  }

#define MICRO_COMPLEX_WORK_ONE4(iter, peel) \
  if (unroll_factor > iter) { \
    pgerc_common<accRows, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
  }

#define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \
  if (PEEL_COMPLEX > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3; \
    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
    pbroadcastN_old<Packet,accRows>(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
    if(!RhsIsReal) { \
      pbroadcastN_old<Packet,accRows>(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \
    } else { \
      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
    } \
    MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
  }

#define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \
  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M]; \
  Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M]; \
  func(func1,func2,0); func(func1,func2,1); \
  func(func1,func2,2); func(func1,func2,3);

#define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \
  Packet rhsV0[M], rhsVi0[M];\
  func(func1,func2,0);

#define MICRO_COMPLEX_ONE_PEEL4 \
  MICRO_COMPLEX_UNROLL_TYPE_PEEL(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \
  rhs_ptr_real += (accRows * PEEL_COMPLEX); \
  if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX);

#define MICRO_COMPLEX_ONE4 \
  MICRO_COMPLEX_UNROLL_TYPE_ONE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \
  rhs_ptr_real += accRows; \
  if(!RhsIsReal) rhs_ptr_imag += accRows;

#define MICRO_COMPLEX_DST_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    bsetzero<Scalar, Packet, accRows>(accReal##iter); \
    bsetzero<Scalar, Packet, accRows>(accImag##iter); \
  } else { \
    EIGEN_UNUSED_VARIABLE(accReal##iter); \
    EIGEN_UNUSED_VARIABLE(accImag##iter); \
  }

#define MICRO_COMPLEX_DST_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_DST_PTR_ONE)

#define MICRO_COMPLEX_SRC_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \
  }

#define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE)

#define MICRO_COMPLEX_PREFETCH_ONE(iter) \
  if (unroll_factor > iter) { \
    EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \
  }

#define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)

#define MICRO_COMPLEX_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
    bload<DataMapper, Packetc, Index, accColsC, ColMajor, true, accRows>(tRes, res, row + iter*accCols, 0); \
    bscalec<Packet,accRows>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \
    bcouple<Packet, Packetc, accRows>(taccReal, taccImag, tRes, acc0, acc1); \
    res.template storePacketBlock<Packetc,accRows>(row + iter*accCols + 0, 0, acc0); \
    res.template storePacketBlock<Packetc,accRows>(row + iter*accCols + accColsC, 0, acc1); \
  }

#define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE)
template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index strideB,
  Index& row,
  const Packet& pAlphaReal,
  const Packet& pAlphaImag)
{
  const Scalar* rhs_ptr_real = rhs_base;
  const Scalar* rhs_ptr_imag = NULL;
  const Index imag_delta = accCols*strideA;
  if(!RhsIsReal) {
    rhs_ptr_imag = rhs_base + accRows*strideB;
  } else {
    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
  }
  const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL;
  const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL;
  PacketBlock<Packet,accRows> accReal0, accImag0, accReal1, accImag1;
  PacketBlock<Packet,accRows> accReal2, accImag2, accReal3, accImag3;
  PacketBlock<Packet,accRows> taccReal, taccImag;
  PacketBlock<Packetc,accRows> acc0, acc1;
  PacketBlock<Packetc,accRows*2> tRes;

  MICRO_COMPLEX_SRC_PTR
  MICRO_COMPLEX_DST_PTR

  Index k = 0;
  for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
  {
    EIGEN_POWER_PREFETCH(rhs_ptr_real);
    if(!RhsIsReal) {
      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
    }
    MICRO_COMPLEX_PREFETCH
    MICRO_COMPLEX_ONE_PEEL4
  }
  for(; k < depth; k++)
  {
    MICRO_COMPLEX_ONE4
  }
  MICRO_COMPLEX_STORE

  row += unroll_factor*accCols;
}
template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
EIGEN_ALWAYS_INLINE void gemm_complex_cols(
  const DataMapper& res,
  const Scalar* blockA,
  const Scalar* blockB,
  Index depth,
  Index strideA,
  Index offsetA,
  Index strideB,
  Index offsetB,
  Index col,
  Index rows,
  Index cols,
  Index remaining_rows,
  const Packet& pAlphaReal,
  const Packet& pAlphaImag,
  const Packet& pMask)
{
  const DataMapper res3 = res.getSubMapper(0, col);

  const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
  const Scalar* lhs_base = blockA + accCols*offsetA;
  Index row = 0;

#define MAX_COMPLEX_UNROLL 3
  while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
    gemm_complex_unrolled_iteration<MAX_COMPLEX_UNROLL, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
  }
  switch( (rows-row)/accCols ) {
#if MAX_COMPLEX_UNROLL > 4
    case 4:
      gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
      break;
#endif
#if MAX_COMPLEX_UNROLL > 3
    case 3:
      gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
      break;
#endif
#if MAX_COMPLEX_UNROLL > 2
    case 2:
      gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
      break;
#endif
#if MAX_COMPLEX_UNROLL > 1
    case 1:
      gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
      break;
#endif
    default:
      break;
  }
#undef MAX_COMPLEX_UNROLL

  if(remaining_rows > 0)
  {
    gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
  }
}
template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
EIGEN_STRONG_INLINE void gemm_complex_extra_cols(
  const DataMapper& res,
  const Scalar* blockA,
  const Scalar* blockB,
  Index depth,
  Index strideA,
  Index offsetA,
  Index strideB,
  Index offsetB,
  Index col,
  Index rows,
  Index cols,
  Index remaining_rows,
  const Packet& pAlphaReal,
  const Packet& pAlphaImag,
  const Packet& pMask)
{
  for (; col < cols; col++) {
    gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, Index, 1, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
  }
}
template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  const Index remaining_rows = rows % accCols;

  if( strideA == -1 ) strideA = depth;
  if( strideB == -1 ) strideB = depth;

  const Packet pAlphaReal = pset1<Packet>(alpha.real());
  const Packet pAlphaImag = pset1<Packet>(alpha.imag());
  const Packet pMask = bmask<Packet>((const int)(remaining_rows));

  const Scalar* blockA = (Scalar *) blockAc;
  const Scalar* blockB = (Scalar *) blockBc;

  Index col = 0;
  for(; col + accRows <= cols; col += accRows)
  {
    gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
  }

  gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
}
template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
  pack(blockA, lhs, depth, rows, stride, offset);

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
  pack(blockA, lhs, depth, rows, stride, offset);

#if EIGEN_ALTIVEC_USE_CUSTOM_PACK
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
  pack(blockB, rhs, depth, cols, stride, offset);

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
  pack(blockB, rhs, depth, cols, stride, offset);

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
  pack(blockA, lhs, depth, rows, stride, offset);

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
  pack(blockA, lhs, depth, rows, stride, offset);

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
  pack(blockA, lhs, depth, rows, stride, offset);

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
  pack(blockA, lhs, depth, rows, stride, offset);

#if EIGEN_ALTIVEC_USE_CUSTOM_PACK
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
  pack(blockB, rhs, depth, cols, stride, offset);

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
  pack(blockB, rhs, depth, cols, stride, offset);

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
  pack(blockB, rhs, depth, cols, stride, offset);

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
  pack(blockB, rhs, depth, cols, stride, offset);

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
  pack(blockA, lhs, depth, rows, stride, offset);

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>

template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
  pack(blockA, lhs, depth, rows, stride, offset);

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
  pack(blockB, rhs, depth, cols, stride, offset);

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>

template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
  pack(blockB, rhs, depth, cols, stride, offset);
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef typename quad_traits<float>::vectortype Packet;
  typedef typename quad_traits<float>::rhstype    RhsPacket;

  void operator()(const DataMapper& res, const float* blockA, const float* blockB,
                  Index rows, Index depth, Index cols, float alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const float* blockA, const float* blockB,
               Index rows, Index depth, Index cols, float alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
  {
    const Index accRows = quad_traits<float>::rows;
    const Index accCols = quad_traits<float>::size;
    void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index);

    #if defined(EIGEN_ALTIVEC_MMA_ONLY)
      gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
    #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
      if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) {
        gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
      } else {
        gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
      }
    #else
      gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
    #endif
    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
  }
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef Packet4f  Packet;
  typedef Packet2cf Packetc;
  typedef Packet4f  RhsPacket;

  void operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
                  Index rows, Index depth, Index cols, std::complex<float> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
               Index rows, Index depth, Index cols, std::complex<float> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
  {
    const Index accRows = quad_traits<float>::rows;
    const Index accCols = quad_traits<float>::size;
    void (*gemm_function)(const DataMapper&, const std::complex<float>*, const std::complex<float>*,
          Index, Index, Index, std::complex<float>, Index, Index, Index, Index);

    #if defined(EIGEN_ALTIVEC_MMA_ONLY)
      gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
    #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
      if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) {
        gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
      } else {
        gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
      }
    #else
      gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
    #endif
    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
  }
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef Packet4f  Packet;
  typedef Packet2cf Packetc;
  typedef Packet4f  RhsPacket;

  void operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
                  Index rows, Index depth, Index cols, std::complex<float> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
               Index rows, Index depth, Index cols, std::complex<float> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
  {
    const Index accRows = quad_traits<float>::rows;
    const Index accCols = quad_traits<float>::size;
    void (*gemm_function)(const DataMapper&, const float*, const std::complex<float>*,
          Index, Index, Index, std::complex<float>, Index, Index, Index, Index);

    #if defined(EIGEN_ALTIVEC_MMA_ONLY)
      gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
    #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
      if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) {
        gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
      } else {
        gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
      }
    #else
      gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
    #endif
    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
  }
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef Packet4f  Packet;
  typedef Packet2cf Packetc;
  typedef Packet4f  RhsPacket;

  void operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
                  Index rows, Index depth, Index cols, std::complex<float> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
               Index rows, Index depth, Index cols, std::complex<float> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
  {
    const Index accRows = quad_traits<float>::rows;
    const Index accCols = quad_traits<float>::size;
    void (*gemm_function)(const DataMapper&, const std::complex<float>*, const float*,
          Index, Index, Index, std::complex<float>, Index, Index, Index, Index);

    #if defined(EIGEN_ALTIVEC_MMA_ONLY)
      gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
    #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
      if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) {
        gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
      } else {
        gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
      }
    #else
      gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
    #endif
    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
  }
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef typename quad_traits<double>::vectortype Packet;
  typedef typename quad_traits<double>::rhstype    RhsPacket;

  void operator()(const DataMapper& res, const double* blockA, const double* blockB,
                  Index rows, Index depth, Index cols, double alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const double* blockA, const double* blockB,
               Index rows, Index depth, Index cols, double alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
  {
    const Index accRows = quad_traits<double>::rows;
    const Index accCols = quad_traits<double>::size;
    void (*gemm_function)(const DataMapper&, const double*, const double*,
                          Index, Index, Index, double, Index, Index, Index, Index);

    #if defined(EIGEN_ALTIVEC_MMA_ONLY)
      gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
    #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
      if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"))
        gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
      else
        gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
    #else
      gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
    #endif
    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
  }
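// Illustrative sketch (not part of the original source): Eigen's blocking code
// invokes a specialization roughly like this, with blockA/blockB produced by the
// matching gemm_pack_lhs/gemm_pack_rhs packers:
//
//   gebp_kernel<double, double, Index, DataMapper, mr, nr, false, false> gebp;
//   gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);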
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef quad_traits<double>::vectortype Packet;
  typedef Packet1cd                       Packetc;
  typedef quad_traits<double>::rhstype    RhsPacket;

  void operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
                  Index rows, Index depth, Index cols, std::complex<double> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
               Index rows, Index depth, Index cols, std::complex<double> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
  {
    const Index accRows = quad_traits<double>::rows;
    const Index accCols = quad_traits<double>::size;
    void (*gemm_function)(const DataMapper&, const std::complex<double>*, const std::complex<double>*,
                          Index, Index, Index, std::complex<double>, Index, Index, Index, Index);

    #if defined(EIGEN_ALTIVEC_MMA_ONLY)
      gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double,
        Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
    #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
      if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"))
        gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double,
          Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
      else
        gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double,
          Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
    #else
      gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double,
        Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
    #endif
    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
  }
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef quad_traits<double>::vectortype Packet;
  typedef Packet1cd                       Packetc;
  typedef quad_traits<double>::rhstype    RhsPacket;

  void operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
                  Index rows, Index depth, Index cols, std::complex<double> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
               Index rows, Index depth, Index cols, std::complex<double> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
  {
    const Index accRows = quad_traits<double>::rows;
    const Index accCols = quad_traits<double>::size;
    void (*gemm_function)(const DataMapper&, const std::complex<double>*, const double*,
                          Index, Index, Index, std::complex<double>, Index, Index, Index, Index);

    #if defined(EIGEN_ALTIVEC_MMA_ONLY)
      gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double,
        Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
    #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
      if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"))
        gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double,
          Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
      else
        gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double,
          Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
    #else
      gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double,
        Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
    #endif
    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
  }
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{
  typedef quad_traits<double>::vectortype Packet;
  typedef Packet1cd                       Packetc;
  typedef quad_traits<double>::rhstype    RhsPacket;

  void operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
                  Index rows, Index depth, Index cols, std::complex<double> alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};

template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
void gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
  ::operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
               Index rows, Index depth, Index cols, std::complex<double> alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
  {
    const Index accRows = quad_traits<double>::rows;
    const Index accCols = quad_traits<double>::size;
    void (*gemm_function)(const DataMapper&, const double*, const std::complex<double>*,
                          Index, Index, Index, std::complex<double>, Index, Index, Index, Index);

    #if defined(EIGEN_ALTIVEC_MMA_ONLY)
      gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double,
        Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
    #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
      if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"))
        gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double,
          Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
      else
        gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double,
          Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
    #else
      gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double,
        Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
    #endif
    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
  }
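// Illustrative build note (an assumption, not taken from this file): on a Power10
// toolchain the MMA kernels are typically enabled with something like
//   g++ -O2 -mcpu=power10 -mmma ...
// while older POWER targets fall back to the plain VSX kernels selected above.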