10#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
11#define EIGEN_GENERAL_BLOCK_PANEL_H
18enum GEBPPacketSizeType {
24template<
typename _LhsScalar,
typename _RhsScalar,
bool _ConjLhs=false,
bool _ConjRhs=false,
int Arch=Architecture::Target,
int _PacketSize=GEBPPacketFull>
29inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
34#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
35#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
37#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
40#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
41#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
43#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
46#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
47#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
49#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
52#if EIGEN_ARCH_i386_OR_x86_64
53const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
54const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
55const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
57const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
58const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
59const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
61const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
62const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
63const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
66#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
67#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
68#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
75 m_l1 = manage_caching_sizes_helper(
l1CacheSize, defaultL1CacheSize);
76 m_l2 = manage_caching_sizes_helper(
l2CacheSize, defaultL2CacheSize);
77 m_l3 = manage_caching_sizes_helper(
l3CacheSize, defaultL3CacheSize);
86inline void manage_caching_sizes(Action action, std::ptrdiff_t*
l1, std::ptrdiff_t*
l2, std::ptrdiff_t*
l3)
93 eigen_internal_assert(
l1!=0 &&
l2!=0);
98 else if(action==GetAction)
100 eigen_internal_assert(
l1!=0 &&
l2!=0);
107 eigen_internal_assert(
false);
123template<
typename LhsScalar,
typename RhsScalar,
int KcFactor,
typename Index>
126 typedef gebp_traits<LhsScalar,RhsScalar> Traits;
133 std::ptrdiff_t l1, l2, l3;
134 manage_caching_sizes(GetAction, &l1, &l2, &l3);
135 #ifdef EIGEN_VECTORIZE_AVX512
146 if (num_threads > 1) {
147 typedef typename Traits::ResScalar ResScalar;
149 kdiv = KcFactor * (Traits::mr *
sizeof(LhsScalar) + Traits::nr *
sizeof(RhsScalar)),
150 ksub = Traits::mr * Traits::nr *
sizeof(ResScalar),
160 const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
162 k = k_cache - (k_cache % kr);
163 eigen_internal_assert(k > 0);
166 const Index n_cache = (l2-l1) / (nr *
sizeof(RhsScalar) * k);
167 const Index n_per_thread = numext::div_ceil(n, num_threads);
168 if (n_cache <= n_per_thread) {
170 eigen_internal_assert(n_cache >=
static_cast<Index>(nr));
171 n = n_cache - (n_cache % nr);
172 eigen_internal_assert(n > 0);
174 n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
179 const Index m_cache = (l3-l2) / (
sizeof(LhsScalar) * k * num_threads);
180 const Index m_per_thread = numext::div_ceil(m, num_threads);
181 if(m_cache < m_per_thread && m_cache >=
static_cast<Index>(mr)) {
182 m = m_cache - (m_cache % mr);
183 eigen_internal_assert(m > 0);
185 m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
192#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
202 if((numext::maxi)(k,(numext::maxi)(m,n))<48)
205 typedef typename Traits::ResScalar ResScalar;
208 k_div = KcFactor * (Traits::mr *
sizeof(LhsScalar) + Traits::nr *
sizeof(RhsScalar)),
209 k_sub = Traits::mr * Traits::nr *
sizeof(ResScalar)
219 const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
220 const Index old_k = k;
226 k = (k%max_kc)==0 ? max_kc
227 : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
229 eigen_internal_assert(((old_k/k) == (old_k/max_kc)) &&
"the number of sweeps has to remain the same");
238 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
239 const Index actual_l2 = l3;
241 const Index actual_l2 = 1572864;
251 const Index lhs_bytes = m * k *
sizeof(LhsScalar);
252 const Index remaining_l1 = l1- k_sub - lhs_bytes;
253 if(remaining_l1 >=
Index(Traits::nr*
sizeof(RhsScalar))*k)
256 max_nc = remaining_l1 / (k*
sizeof(RhsScalar));
261 max_nc = (3*actual_l2)/(2*2*max_kc*
sizeof(RhsScalar));
264 Index nc = numext::mini<Index>(actual_l2/(2*k*
sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
272 : (nc - Traits::nr * ((nc-(n%nc))/(Traits::nr*(n/nc+1))));
279 Index problem_size = k*n*
sizeof(LhsScalar);
280 Index actual_lm = actual_l2;
282 if(problem_size<=1024)
288 else if(l3!=0 && problem_size<=32768)
293 max_mc = (numext::mini<Index>)(576,max_mc);
295 Index mc = (numext::mini<Index>)(actual_lm/(3*k*
sizeof(LhsScalar)), max_mc);
296 if (mc > Traits::mr) mc -= mc % Traits::mr;
297 else if (mc==0)
return;
299 : (mc - Traits::mr * ((mc-(m%mc))/(Traits::mr*(m/mc+1))));
304template <
typename Index>
307#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
308 if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
309 k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
310 m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
311 n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
315 EIGEN_UNUSED_VARIABLE(k)
316 EIGEN_UNUSED_VARIABLE(m)
317 EIGEN_UNUSED_VARIABLE(n)
338template<
typename LhsScalar,
typename RhsScalar,
int KcFactor,
typename Index>
341 if (!useSpecificBlockingSizes(k, m, n)) {
342 evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
346template<
typename LhsScalar,
typename RhsScalar,
typename Index>
349 computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
352template <
typename RhsPacket,
typename RhsPacketx4,
int registers_taken>
355 static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -
registers_taken;
360template <
typename Packet>
370template <
int N,
typename T1,
typename T2,
typename T3>
373template <
typename T1,
typename T2,
typename T3>
376template <
typename T1,
typename T2,
typename T3>
// Declares the type `<prefix><name>Packet` for the scalar type `<name>Scalar`,
// selecting the full, half, or quarter-size SIMD packet according to the
// compile-time `packet_size` selector (see GEBPPacketSizeType above).
// The quarter packet is obtained as the "half of the half" packet via
// unpacket_traits, matching the packet_conditional helper declared earlier.
#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
  typedef typename packet_conditional<packet_size, \
                                      typename packet_traits<name ## Scalar>::type, \
                                      typename packet_traits<name ## Scalar>::half, \
                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
  prefix ## name ## Packet
386#define PACKET_DECL_COND(name, packet_size) \
387 typedef typename packet_conditional<packet_size, \
388 typename packet_traits<name ## Scalar>::type, \
389 typename packet_traits<name ## Scalar>::half, \
390 typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
// Same as PACKET_DECL_COND_PREFIX, but for the unqualified `Scalar` type:
// declares `<prefix>ScalarPacket` as the full, half, or quarter packet of
// `Scalar` depending on the compile-time `packet_size` selector.
#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \
  typedef typename packet_conditional<packet_size, \
                                      typename packet_traits<Scalar>::type, \
                                      typename packet_traits<Scalar>::half, \
                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
  prefix ## ScalarPacket
400#define PACKET_DECL_COND_SCALAR(packet_size) \
401 typedef typename packet_conditional<packet_size, \
402 typename packet_traits<Scalar>::type, \
403 typename packet_traits<Scalar>::half, \
404 typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
417template<
typename _LhsScalar,
typename _RhsScalar,
bool _ConjLhs,
bool _ConjRhs,
int Arch,
int _PacketSize>
437 NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
443 default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
445 && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
450 mr = Vectorizable ? 3*LhsPacketSize : default_mr,
455 LhsProgress = LhsPacketSize,
468 EIGEN_STRONG_INLINE
void initAcc(
AccPacket& p)
473 template<
typename RhsPacketType>
481 pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
484 template<
typename RhsPacketType>
499 template<
typename LhsPacketType>
505 template<
typename LhsPacketType>
511 template<
typename LhsPacketType,
typename RhsPacketType,
typename AccPacketType,
typename LaneIdType>
519#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
520 EIGEN_UNUSED_VARIABLE(tmp);
523 tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
527 template<
typename LhsPacketType,
typename AccPacketType,
typename LaneIdType>
535 r = pmadd(c,alpha,r);
538 template<
typename ResPacketHalf>
539 EIGEN_STRONG_INLINE
void acc(
const ResPacketHalf& c,
const ResPacketHalf& alpha, ResPacketHalf& r)
const
541 r = pmadd(c,alpha,r);
546template<
typename RealScalar,
bool _ConjLhs,
int Arch,
int _PacketSize>
550 typedef std::complex<RealScalar> LhsScalar;
551 typedef RealScalar RhsScalar;
566 NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
568#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
570 mr = 3*LhsPacketSize,
572 mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
575 LhsProgress = LhsPacketSize,
588 EIGEN_STRONG_INLINE
void initAcc(
AccPacket& p)
593 template<
typename RhsPacketType>
594 EIGEN_STRONG_INLINE
void loadRhs(
const RhsScalar* b,
RhsPacketType& dest)
const
599 EIGEN_STRONG_INLINE
void loadRhs(
const RhsScalar* b,
RhsPacketx4& dest)
const
601 pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
604 template<
typename RhsPacketType>
605 EIGEN_STRONG_INLINE
void updateRhs(
const RhsScalar* b,
RhsPacketType& dest)
const
610 EIGEN_STRONG_INLINE
void updateRhs(
const RhsScalar*,
RhsPacketx4&)
const
613 EIGEN_STRONG_INLINE
void loadRhsQuad(
const RhsScalar* b,
RhsPacket& dest)
const
618 EIGEN_STRONG_INLINE
void loadRhsQuad_impl(
const RhsScalar* b,
RhsPacket& dest,
const true_type&)
const
622 RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
626 EIGEN_STRONG_INLINE
void loadRhsQuad_impl(
const RhsScalar* b,
RhsPacket& dest,
const false_type&)
const
628 eigen_internal_assert(RhsPacketSize<=8);
632 EIGEN_STRONG_INLINE
void loadLhs(
const LhsScalar* a,
LhsPacket& dest)
const
637 template<
typename LhsPacketType>
638 EIGEN_STRONG_INLINE
void loadLhsUnaligned(
const LhsScalar* a,
LhsPacketType& dest)
const
643 template <
typename LhsPacketType,
typename RhsPacketType,
typename AccPacketType,
typename LaneIdType>
649 template <
typename LhsPacketType,
typename RhsPacketType,
typename AccPacketType>
652#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
653 EIGEN_UNUSED_VARIABLE(tmp);
654 c.v = pmadd(a.v,b,c.v);
656 tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
660 EIGEN_STRONG_INLINE
void madd_impl(
const LhsScalar& a,
const RhsScalar& b, ResScalar& c, RhsScalar& ,
const false_type&)
const
665 template<
typename LhsPacketType,
typename AccPacketType,
typename LaneIdType>
671 template <
typename ResPacketType,
typename AccPacketType>
675 r = cj.pmadd(c,alpha,r);
681template<
typename Packet>
688template<
typename Packet>
692 res.first = padd(a.first, b.first);
693 res.second = padd(a.second,b.second);
701template<
typename Packet>
704 typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
709template<
typename Packet>
710DoublePacket<typename unpacket_traits<Packet>::half>
711predux_half_dowto4(
const DoublePacket<Packet> &a,
712 typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
715 DoublePacket<typename unpacket_traits<Packet>::half> res;
716 typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
717 typedef typename packet_traits<Cplx>::type CplxPacket;
718 res.first = predux_half_dowto4(CplxPacket(a.first)).v;
719 res.second = predux_half_dowto4(CplxPacket(a.second)).v;
724template<
typename Scalar,
typename RealPacket>
725void loadQuadToDoublePacket(
const Scalar* b, DoublePacket<RealPacket>& dest,
726 typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
728 dest.first = pset1<RealPacket>(numext::real(*b));
729 dest.second = pset1<RealPacket>(numext::imag(*b));
732template<
typename Scalar,
typename RealPacket>
733void loadQuadToDoublePacket(
const Scalar* b, DoublePacket<RealPacket>& dest,
734 typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
737 typedef typename NumTraits<Scalar>::Real RealScalar;
738 RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
739 RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
740 dest.first = ploadquad<RealPacket>(r);
741 dest.second = ploadquad<RealPacket>(i);
757template<
typename RealScalar,
bool _ConjLhs,
bool _ConjRhs,
int Arch,
int _PacketSize>
761 typedef std::complex<RealScalar> Scalar;
762 typedef std::complex<RealScalar> LhsScalar;
763 typedef std::complex<RealScalar> RhsScalar;
764 typedef std::complex<RealScalar> ResScalar;
786 LhsProgress = ResPacketSize,
801 EIGEN_STRONG_INLINE
void initAcc(Scalar& p) { p = Scalar(0); }
810 EIGEN_STRONG_INLINE
void loadRhs(
const RhsScalar* b,
ScalarPacket& dest)
const
816 template<
typename RealPacketType>
823 EIGEN_STRONG_INLINE
void loadRhs(
const RhsScalar* b,
RhsPacketx4& dest)
const
825 loadRhs(b, dest.B_0);
826 loadRhs(b + 1, dest.B1);
827 loadRhs(b + 2, dest.B2);
828 loadRhs(b + 3, dest.B3);
832 EIGEN_STRONG_INLINE
void updateRhs(
const RhsScalar* b,
ScalarPacket& dest)
const
838 template<
typename RealPacketType>
844 EIGEN_STRONG_INLINE
void updateRhs(
const RhsScalar*,
RhsPacketx4&)
const {}
846 EIGEN_STRONG_INLINE
void loadRhsQuad(
const RhsScalar* b,
ResPacket& dest)
const
850 EIGEN_STRONG_INLINE
void loadRhsQuad(
const RhsScalar* b,
DoublePacketType& dest)
const
852 loadQuadToDoublePacket(b,dest);
856 EIGEN_STRONG_INLINE
void loadLhs(
const LhsScalar* a,
LhsPacket& dest)
const
861 template<
typename LhsPacketType>
862 EIGEN_STRONG_INLINE
void loadLhsUnaligned(
const LhsScalar* a,
LhsPacketType& dest)
const
867 template<
typename LhsPacketType,
typename RhsPacketType,
typename ResPacketType,
typename TmpType,
typename LaneIdType>
872 c.first = padd(pmul(a,b.first), c.first);
873 c.second = padd(pmul(a,b.second),c.second);
876 template<
typename LaneIdType>
882 template<
typename LhsPacketType,
typename AccPacketType,
typename LaneIdType>
888 EIGEN_STRONG_INLINE
void acc(
const Scalar& c,
const Scalar& alpha, Scalar& r)
const { r += alpha * c; }
890 template<
typename RealPacketType,
typename ResPacketType>
895 if((!ConjLhs)&&(!ConjRhs))
900 else if((!ConjLhs)&&(ConjRhs))
905 else if((ConjLhs)&&(!ConjRhs))
910 else if((ConjLhs)&&(ConjRhs))
916 r = pmadd(tmp,alpha,r);
923template<
typename RealScalar,
bool _ConjRhs,
int Arch,
int _PacketSize>
927 typedef std::complex<RealScalar> Scalar;
928 typedef RealScalar LhsScalar;
929 typedef Scalar RhsScalar;
930 typedef Scalar ResScalar;
938#undef PACKET_DECL_COND_SCALAR_PREFIX
939#undef PACKET_DECL_COND_PREFIX
940#undef PACKET_DECL_COND_SCALAR
941#undef PACKET_DECL_COND
952 NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
955 mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
957 LhsProgress = ResPacketSize,
968 EIGEN_STRONG_INLINE
void initAcc(
AccPacket& p)
973 template<
typename RhsPacketType>
974 EIGEN_STRONG_INLINE
void loadRhs(
const RhsScalar* b,
RhsPacketType& dest)
const
979 EIGEN_STRONG_INLINE
void loadRhs(
const RhsScalar* b,
RhsPacketx4& dest)
const
981 pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
984 template<
typename RhsPacketType>
985 EIGEN_STRONG_INLINE
void updateRhs(
const RhsScalar* b,
RhsPacketType& dest)
const
990 EIGEN_STRONG_INLINE
void updateRhs(
const RhsScalar*,
RhsPacketx4&)
const
993 EIGEN_STRONG_INLINE
void loadLhs(
const LhsScalar* a,
LhsPacket& dest)
const
998 EIGEN_STRONG_INLINE
void loadRhsQuad(
const RhsScalar* b,
RhsPacket& dest)
const
1003 template<
typename LhsPacketType>
1004 EIGEN_STRONG_INLINE
void loadLhsUnaligned(
const LhsScalar* a,
LhsPacketType& dest)
const
1009 template <
typename LhsPacketType,
typename RhsPacketType,
typename AccPacketType,
typename LaneIdType>
1015 template <
typename LhsPacketType,
typename RhsPacketType,
typename AccPacketType>
1018#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
1019 EIGEN_UNUSED_VARIABLE(tmp);
1020 c.v = pmadd(a,b.v,c.v);
1022 tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
1027 EIGEN_STRONG_INLINE
void madd_impl(
const LhsScalar& a,
const RhsScalar& b, ResScalar& c, RhsScalar& ,
const false_type&)
const
1032 template<
typename LhsPacketType,
typename AccPacketType,
typename LaneIdType>
1035 madd(a, b.get(
lane), c, tmp,
lane);
1038 template <
typename ResPacketType,
typename AccPacketType>
1042 r = cj.pmadd(alpha,c,r);
1056template<
typename LhsScalar,
typename RhsScalar,
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
1063 typedef typename Traits::ResScalar ResScalar;
1070 typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
1074 typedef typename SwappedTraits::ResScalar SResScalar;
1075 typedef typename SwappedTraits::LhsPacket SLhsPacket;
1076 typedef typename SwappedTraits::RhsPacket SRhsPacket;
1077 typedef typename SwappedTraits::ResPacket SResPacket;
1078 typedef typename SwappedTraits::AccPacket SAccPacket;
1080 typedef typename HalfTraits::LhsPacket LhsPacketHalf;
1081 typedef typename HalfTraits::RhsPacket RhsPacketHalf;
1082 typedef typename HalfTraits::ResPacket ResPacketHalf;
1083 typedef typename HalfTraits::AccPacket AccPacketHalf;
1085 typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
1086 typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
1087 typedef typename QuarterTraits::ResPacket ResPacketQuarter;
1088 typedef typename QuarterTraits::AccPacket AccPacketQuarter;
1090 typedef typename DataMapper::LinearMapper LinearMapper;
1093 Vectorizable = Traits::Vectorizable,
1094 LhsProgress = Traits::LhsProgress,
1095 LhsProgressHalf = HalfTraits::LhsProgress,
1096 LhsProgressQuarter = QuarterTraits::LhsProgress,
1097 RhsProgress = Traits::RhsProgress,
1098 RhsProgressHalf = HalfTraits::RhsProgress,
1099 RhsProgressQuarter = QuarterTraits::RhsProgress,
1100 ResPacketSize = Traits::ResPacketSize
1104 void operator()(
const DataMapper& res,
const LhsScalar* blockA,
const RhsScalar* blockB,
1116 typedef typename Traits::ResScalar ResScalar;
1117 typedef typename SwappedTraits::LhsPacket SLhsPacket;
1118 typedef typename SwappedTraits::RhsPacket SRhsPacket;
1119 typedef typename SwappedTraits::ResPacket SResPacket;
1120 typedef typename SwappedTraits::AccPacket SAccPacket;
1124 ResScalar alpha, SAccPacket &
C0)
1126 EIGEN_UNUSED_VARIABLE(res);
1127 EIGEN_UNUSED_VARIABLE(
straits);
1128 EIGEN_UNUSED_VARIABLE(
blA);
1129 EIGEN_UNUSED_VARIABLE(
blB);
1130 EIGEN_UNUSED_VARIABLE(depth);
1131 EIGEN_UNUSED_VARIABLE(
endk);
1132 EIGEN_UNUSED_VARIABLE(i);
1133 EIGEN_UNUSED_VARIABLE(
j2);
1134 EIGEN_UNUSED_VARIABLE(alpha);
1135 EIGEN_UNUSED_VARIABLE(
C0);
1140template<
typename LhsScalar,
typename RhsScalar,
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
1145 typedef typename Traits::ResScalar ResScalar;
1146 typedef typename SwappedTraits::LhsPacket SLhsPacket;
1147 typedef typename SwappedTraits::RhsPacket SRhsPacket;
1148 typedef typename SwappedTraits::ResPacket SResPacket;
1149 typedef typename SwappedTraits::AccPacket SAccPacket;
1153 ResScalar alpha, SAccPacket &
C0)
1163 if (depth -
endk > 0)
1176 blB += SwappedTraits::LhsProgress/4;
1185 res.scatterPacket(i,
j2, R);
1189template<
int nr, Index LhsProgress, Index RhsProgress,
typename LhsScalar,
typename RhsScalar,
typename ResScalar,
typename AccPacket,
typename LhsPacket,
typename RhsPacket,
typename ResPacket,
typename GEBPTraits,
typename LinearMapper,
typename DataMapper>
1192 typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
1194 EIGEN_STRONG_INLINE
void peeled_kc_onestep(
Index K,
const LhsScalar*
blA,
const RhsScalar*
blB,
GEBPTraits traits, LhsPacket *
A0, RhsPacketx4 *
rhs_panel, RhsPacket *
T0, AccPacket *
C0, AccPacket *
C1, AccPacket *
C2, AccPacket *
C3)
1196 EIGEN_ASM_COMMENT(
"begin step of gebp micro kernel 1X4");
1197 EIGEN_ASM_COMMENT(
"Note: these asm comments work around bug 935!");
1204 #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1207 EIGEN_ASM_COMMENT(
"end step of gebp micro kernel 1X4");
1210 EIGEN_STRONG_INLINE
void operator()(
1211 const DataMapper& res,
const LhsScalar* blockA,
const RhsScalar* blockB, ResScalar alpha,
1247 LinearMapper
r0 = res.getLinearMapper(i,
j2 + 0);
1248 LinearMapper
r1 = res.getLinearMapper(i,
j2 + 1);
1249 LinearMapper
r2 = res.getLinearMapper(i,
j2 + 2);
1250 LinearMapper
r3 = res.getLinearMapper(i,
j2 + 3);
1264 EIGEN_ASM_COMMENT(
"begin gebp micro kernel 1/half/quarterX4");
1268 internal::prefetch(
blB+(48+0));
1269 peeled_kc_onestep(0,
blA,
blB,
traits, &
A0, &
rhs_panel, &
T0, &
C0, &
C1, &
C2, &
C3);
1270 peeled_kc_onestep(1,
blA,
blB,
traits, &
A1, &
rhs_panel, &
T0, &
D0, &
D1, &
D2, &
D3);
1271 peeled_kc_onestep(2,
blA,
blB,
traits, &
A0, &
rhs_panel, &
T0, &
C0, &
C1, &
C2, &
C3);
1272 peeled_kc_onestep(3,
blA,
blB,
traits, &
A1, &
rhs_panel, &
T0, &
D0, &
D1, &
D2, &
D3);
1273 internal::prefetch(
blB+(48+16));
1274 peeled_kc_onestep(4,
blA,
blB,
traits, &
A0, &
rhs_panel, &
T0, &
C0, &
C1, &
C2, &
C3);
1275 peeled_kc_onestep(5,
blA,
blB,
traits, &
A1, &
rhs_panel, &
T0, &
D0, &
D1, &
D2, &
D3);
1276 peeled_kc_onestep(6,
blA,
blB,
traits, &
A0, &
rhs_panel, &
T0, &
C0, &
C1, &
C2, &
C3);
1277 peeled_kc_onestep(7,
blA,
blB,
traits, &
A1, &
rhs_panel, &
T0, &
D0, &
D1, &
D2, &
D3);
1279 blB +=
pk*4*RhsProgress;
1280 blA +=
pk*LhsProgress;
1282 EIGEN_ASM_COMMENT(
"end gebp micro kernel 1/half/quarterX4");
1294 peeled_kc_onestep(0,
blA,
blB,
traits, &
A0, &
rhs_panel, &
T0, &
C0, &
C1, &
C2, &
C3);
1295 blB += 4*RhsProgress;
1306 r0.storePacket(0,
R0);
1307 r1.storePacket(0,
R1);
1313 r2.storePacket(0,
R0);
1314 r3.storePacket(0,
R1);
1328 LinearMapper
r0 = res.getLinearMapper(i,
j2);
1336 EIGEN_ASM_COMMENT(
"begin gebp micro kernel 1/half/quarterX1");
1339#define EIGEN_GEBGP_ONESTEP(K) \
1341 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
1342 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1344 traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
1345 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1346 traits.madd(A0, B_0, C0, B_0, fix<0>); \
1347 EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
1350 EIGEN_GEBGP_ONESTEP(0);
1351 EIGEN_GEBGP_ONESTEP(1);
1352 EIGEN_GEBGP_ONESTEP(2);
1353 EIGEN_GEBGP_ONESTEP(3);
1354 EIGEN_GEBGP_ONESTEP(4);
1355 EIGEN_GEBGP_ONESTEP(5);
1356 EIGEN_GEBGP_ONESTEP(6);
1357 EIGEN_GEBGP_ONESTEP(7);
1359 blB +=
pk*RhsProgress;
1360 blA +=
pk*LhsProgress;
1362 EIGEN_ASM_COMMENT(
"end gebp micro kernel 1/half/quarterX1");
1369 EIGEN_GEBGP_ONESTEP(0);
1373#undef EIGEN_GEBGP_ONESTEP
1378 r0.storePacket(0,
R0);
1384template<
int nr, Index LhsProgress, Index RhsProgress,
typename LhsScalar,
typename RhsScalar,
typename ResScalar,
typename AccPacket,
typename LhsPacket,
typename RhsPacket,
typename ResPacket,
typename GEBPTraits,
typename LinearMapper,
typename DataMapper>
1385struct lhs_process_fraction_of_packet :
lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
1388EIGEN_STRONG_INLINE
void peeled_kc_onestep(
Index K,
const LhsScalar*
blA,
const RhsScalar*
blB,
GEBPTraits traits, LhsPacket *
A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *
C0, AccPacket *
C1, AccPacket *
C2, AccPacket *
C3)
1390 EIGEN_ASM_COMMENT(
"begin step of gebp micro kernel 1X4");
1391 EIGEN_ASM_COMMENT(
"Note: these asm comments work around bug 935!");
1392 traits.loadLhsUnaligned(&
blA[(0+1*K)*(LhsProgress)], *
A0);
1393 traits.broadcastRhs(&
blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
1398 EIGEN_ASM_COMMENT(
"end step of gebp micro kernel 1X4");
1402template<
typename LhsScalar,
typename RhsScalar,
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
1416 const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
1429 if(mr>=3*Traits::LhsProgress)
1436 const Index l1 = defaultL1CacheSize;
1440 const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (
l1 -
sizeof(ResScalar)*mr*nr - depth*nr*
sizeof(RhsScalar)) / (depth *
sizeof(LhsScalar) * 3*LhsProgress) ));
1459 traits.initAcc(
C0); traits.initAcc(
C1); traits.initAcc(
C2); traits.initAcc(
C3);
1460 traits.initAcc(
C4); traits.initAcc(
C5); traits.initAcc(
C6); traits.initAcc(
C7);
1461 traits.initAcc(
C8); traits.initAcc(
C9); traits.initAcc(
C10); traits.initAcc(
C11);
1463 LinearMapper
r0 = res.getLinearMapper(i,
j2 + 0);
1464 LinearMapper
r1 = res.getLinearMapper(i,
j2 + 1);
1465 LinearMapper
r2 = res.getLinearMapper(i,
j2 + 2);
1466 LinearMapper
r3 = res.getLinearMapper(i,
j2 + 3);
1480 EIGEN_ASM_COMMENT(
"begin gebp micro kernel 3pX4");
1485 #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
1489 #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
1491 #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
1493#define EIGEN_GEBP_ONESTEP(K) \
1495 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1496 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1497 internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
1498 if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
1499 internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
1501 traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1502 traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1503 traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1504 EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
1505 traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
1506 traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1507 traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1508 traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
1509 traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
1510 traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1511 traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1512 traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
1513 traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
1514 traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1515 traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1516 traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
1517 traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
1518 traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1519 traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1520 traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
1521 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1524 internal::prefetch(
blB);
1525 EIGEN_GEBP_ONESTEP(0);
1526 EIGEN_GEBP_ONESTEP(1);
1527 EIGEN_GEBP_ONESTEP(2);
1528 EIGEN_GEBP_ONESTEP(3);
1529 EIGEN_GEBP_ONESTEP(4);
1530 EIGEN_GEBP_ONESTEP(5);
1531 EIGEN_GEBP_ONESTEP(6);
1532 EIGEN_GEBP_ONESTEP(7);
1534 blB +=
pk*4*RhsProgress;
1535 blA +=
pk*3*Traits::LhsProgress;
1537 EIGEN_ASM_COMMENT(
"end gebp micro kernel 3pX4");
1540 for(
Index k=peeled_kc; k<depth; k++)
1542 RhsPanel15 rhs_panel;
1545 EIGEN_GEBP_ONESTEP(0);
1546 blB += 4*RhsProgress;
1547 blA += 3*Traits::LhsProgress;
1550#undef EIGEN_GEBP_ONESTEP
1552 ResPacket R0, R1, R2;
1553 ResPacket alphav = pset1<ResPacket>(alpha);
1555 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1556 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1557 R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1558 traits.acc(C0, alphav, R0);
1559 traits.acc(C4, alphav, R1);
1560 traits.acc(C8, alphav, R2);
1561 r0.storePacket(0 * Traits::ResPacketSize, R0);
1562 r0.storePacket(1 * Traits::ResPacketSize, R1);
1563 r0.storePacket(2 * Traits::ResPacketSize, R2);
1565 R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1566 R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1567 R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1568 traits.acc(C1, alphav, R0);
1569 traits.acc(C5, alphav, R1);
1570 traits.acc(C9, alphav, R2);
1571 r1.storePacket(0 * Traits::ResPacketSize, R0);
1572 r1.storePacket(1 * Traits::ResPacketSize, R1);
1573 r1.storePacket(2 * Traits::ResPacketSize, R2);
1575 R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1576 R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1577 R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1578 traits.acc(C2, alphav, R0);
1579 traits.acc(C6, alphav, R1);
1580 traits.acc(C10, alphav, R2);
1581 r2.storePacket(0 * Traits::ResPacketSize, R0);
1582 r2.storePacket(1 * Traits::ResPacketSize, R1);
1583 r2.storePacket(2 * Traits::ResPacketSize, R2);
1585 R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1586 R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1587 R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1588 traits.acc(C3, alphav, R0);
1589 traits.acc(C7, alphav, R1);
1590 traits.acc(C11, alphav, R2);
1591 r3.storePacket(0 * Traits::ResPacketSize, R0);
1592 r3.storePacket(1 * Traits::ResPacketSize, R1);
1593 r3.storePacket(2 * Traits::ResPacketSize, R2);
1598 for(
Index j2=packet_cols4; j2<cols; j2++)
1600 for(
Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1603 const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
1607 AccPacket C0, C4, C8;
1612 LinearMapper r0 = res.getLinearMapper(i, j2);
1616 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1617 LhsPacket A0, A1, A2;
1619 for(
Index k=0; k<peeled_kc; k+=pk)
1621 EIGEN_ASM_COMMENT(
"begin gebp micro kernel 3pX1");
1623#define EIGEN_GEBGP_ONESTEP(K) \
1625 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1626 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1627 traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1628 traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1629 traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1630 traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1631 traits.madd(A0, B_0, C0, B_0, fix<0>); \
1632 traits.madd(A1, B_0, C4, B_0, fix<0>); \
1633 traits.madd(A2, B_0, C8, B_0, fix<0>); \
1634 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1637 EIGEN_GEBGP_ONESTEP(0);
1638 EIGEN_GEBGP_ONESTEP(1);
1639 EIGEN_GEBGP_ONESTEP(2);
1640 EIGEN_GEBGP_ONESTEP(3);
1641 EIGEN_GEBGP_ONESTEP(4);
1642 EIGEN_GEBGP_ONESTEP(5);
1643 EIGEN_GEBGP_ONESTEP(6);
1644 EIGEN_GEBGP_ONESTEP(7);
1646 blB += int(pk) * int(RhsProgress);
1647 blA += int(pk) * 3 * int(Traits::LhsProgress);
1649 EIGEN_ASM_COMMENT(
"end gebp micro kernel 3pX1");
1653 for(
Index k=peeled_kc; k<depth; k++)
1656 EIGEN_GEBGP_ONESTEP(0);
1658 blA += 3*Traits::LhsProgress;
1660#undef EIGEN_GEBGP_ONESTEP
1661 ResPacket R0, R1, R2;
1662 ResPacket alphav = pset1<ResPacket>(alpha);
1664 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1665 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1666 R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1667 traits.acc(C0, alphav, R0);
1668 traits.acc(C4, alphav, R1);
1669 traits.acc(C8, alphav, R2);
1670 r0.storePacket(0 * Traits::ResPacketSize, R0);
1671 r0.storePacket(1 * Traits::ResPacketSize, R1);
1672 r0.storePacket(2 * Traits::ResPacketSize, R2);
1679 if(mr>=2*Traits::LhsProgress)
1681 const Index l1 = defaultL1CacheSize;
1685 Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 -
sizeof(ResScalar)*mr*nr - depth*nr*
sizeof(RhsScalar)) / (depth *
sizeof(LhsScalar) * 2*LhsProgress) ));
1687 for(
Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
1689 Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
1690 for(
Index j2=0; j2<packet_cols4; j2+=nr)
1692 for(
Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1698 const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1702 AccPacket C0, C1, C2, C3,
1704 traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1705 traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1707 LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1708 LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1709 LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1710 LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1712 r0.prefetch(prefetch_res_offset);
1713 r1.prefetch(prefetch_res_offset);
1714 r2.prefetch(prefetch_res_offset);
1715 r3.prefetch(prefetch_res_offset);
1718 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1722 for(
Index k=0; k<peeled_kc; k+=pk)
1724 EIGEN_ASM_COMMENT(
"begin gebp micro kernel 2pX4");
1725 RhsPacketx4 rhs_panel;
1730 #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1731 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
1733 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
1735#define EIGEN_GEBGP_ONESTEP(K) \
1737 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1738 traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
1739 traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
1740 traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
1741 traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1742 traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1743 traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1744 traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1745 traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1746 traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1747 traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1748 traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1749 EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1750 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1753 internal::prefetch(blB+(48+0));
1754 EIGEN_GEBGP_ONESTEP(0);
1755 EIGEN_GEBGP_ONESTEP(1);
1756 EIGEN_GEBGP_ONESTEP(2);
1757 EIGEN_GEBGP_ONESTEP(3);
1758 internal::prefetch(blB+(48+16));
1759 EIGEN_GEBGP_ONESTEP(4);
1760 EIGEN_GEBGP_ONESTEP(5);
1761 EIGEN_GEBGP_ONESTEP(6);
1762 EIGEN_GEBGP_ONESTEP(7);
1764 blB += pk*4*RhsProgress;
1765 blA += pk*(2*Traits::LhsProgress);
1767 EIGEN_ASM_COMMENT(
"end gebp micro kernel 2pX4");
1770 for(
Index k=peeled_kc; k<depth; k++)
1772 RhsPacketx4 rhs_panel;
1774 EIGEN_GEBGP_ONESTEP(0);
1775 blB += 4*RhsProgress;
1776 blA += 2*Traits::LhsProgress;
1778#undef EIGEN_GEBGP_ONESTEP
1780 ResPacket R0, R1, R2, R3;
1781 ResPacket alphav = pset1<ResPacket>(alpha);
1783 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1784 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1785 R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1786 R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1787 traits.acc(C0, alphav, R0);
1788 traits.acc(C4, alphav, R1);
1789 traits.acc(C1, alphav, R2);
1790 traits.acc(C5, alphav, R3);
1791 r0.storePacket(0 * Traits::ResPacketSize, R0);
1792 r0.storePacket(1 * Traits::ResPacketSize, R1);
1793 r1.storePacket(0 * Traits::ResPacketSize, R2);
1794 r1.storePacket(1 * Traits::ResPacketSize, R3);
1796 R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1797 R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1798 R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1799 R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1800 traits.acc(C2, alphav, R0);
1801 traits.acc(C6, alphav, R1);
1802 traits.acc(C3, alphav, R2);
1803 traits.acc(C7, alphav, R3);
1804 r2.storePacket(0 * Traits::ResPacketSize, R0);
1805 r2.storePacket(1 * Traits::ResPacketSize, R1);
1806 r3.storePacket(0 * Traits::ResPacketSize, R2);
1807 r3.storePacket(1 * Traits::ResPacketSize, R3);
1812 for(
Index j2=packet_cols4; j2<cols; j2++)
1814 for(
Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1817 const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1825 LinearMapper r0 = res.getLinearMapper(i, j2);
1826 r0.prefetch(prefetch_res_offset);
1829 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1832 for(
Index k=0; k<peeled_kc; k+=pk)
1834 EIGEN_ASM_COMMENT(
"begin gebp micro kernel 2pX1");
1837#define EIGEN_GEBGP_ONESTEP(K) \
1839 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
1840 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1841 traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1842 traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1843 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1844 traits.madd(A0, B_0, C0, B1, fix<0>); \
1845 traits.madd(A1, B_0, C4, B_0, fix<0>); \
1846 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1849 EIGEN_GEBGP_ONESTEP(0);
1850 EIGEN_GEBGP_ONESTEP(1);
1851 EIGEN_GEBGP_ONESTEP(2);
1852 EIGEN_GEBGP_ONESTEP(3);
1853 EIGEN_GEBGP_ONESTEP(4);
1854 EIGEN_GEBGP_ONESTEP(5);
1855 EIGEN_GEBGP_ONESTEP(6);
1856 EIGEN_GEBGP_ONESTEP(7);
1858 blB += int(pk) * int(RhsProgress);
1859 blA += int(pk) * 2 * int(Traits::LhsProgress);
1861 EIGEN_ASM_COMMENT(
"end gebp micro kernel 2pX1");
1865 for(
Index k=peeled_kc; k<depth; k++)
1868 EIGEN_GEBGP_ONESTEP(0);
1870 blA += 2*Traits::LhsProgress;
1872#undef EIGEN_GEBGP_ONESTEP
1874 ResPacket alphav = pset1<ResPacket>(alpha);
1876 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1877 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1878 traits.acc(C0, alphav, R0);
1879 traits.acc(C4, alphav, R1);
1880 r0.storePacket(0 * Traits::ResPacketSize, R0);
1881 r0.storePacket(1 * Traits::ResPacketSize, R1);
1887 if(mr>=1*Traits::LhsProgress)
1889 lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
1890 p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1893 if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
1895 lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
1896 p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1899 if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
1901 lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
1902 p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1905 if(peeled_mc_quarter<rows)
1908 for(
Index j2=0; j2<packet_cols4; j2+=nr)
1911 for(
Index i=peeled_mc_quarter; i<rows; i+=1)
1913 const LhsScalar* blA = &blockA[i*strideA+offsetA];
1915 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1920 const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
1921 const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
1922 if ((SwappedTraits::LhsProgress % 4) == 0 &&
1923 (SwappedTraits::LhsProgress<=16) &&
1924 (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
1925 (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
1927 SAccPacket C0, C1, C2, C3;
1928 straits.initAcc(C0);
1929 straits.initAcc(C1);
1930 straits.initAcc(C2);
1931 straits.initAcc(C3);
1933 const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
1934 const Index endk = (depth/spk)*spk;
1935 const Index endk4 = (depth/(spk*4))*(spk*4);
1938 for(; k<endk4; k+=4*spk)
1943 straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
1944 straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
1946 straits.loadRhsQuad(blA+0*spk, B_0);
1947 straits.loadRhsQuad(blA+1*spk, B_1);
1948 straits.madd(A0,B_0,C0,B_0, fix<0>);
1949 straits.madd(A1,B_1,C1,B_1, fix<0>);
1951 straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
1952 straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
1953 straits.loadRhsQuad(blA+2*spk, B_0);
1954 straits.loadRhsQuad(blA+3*spk, B_1);
1955 straits.madd(A0,B_0,C2,B_0, fix<0>);
1956 straits.madd(A1,B_1,C3,B_1, fix<0>);
1958 blB += 4*SwappedTraits::LhsProgress;
1961 C0 = padd(padd(C0,C1),padd(C2,C3));
1962 for(; k<endk; k+=spk)
1967 straits.loadLhsUnaligned(blB, A0);
1968 straits.loadRhsQuad(blA, B_0);
1969 straits.madd(A0,B_0,C0,B_0, fix<0>);
1971 blB += SwappedTraits::LhsProgress;
1974 if(SwappedTraits::LhsProgress==8)
1977 typedef typename conditional<SwappedTraits::LhsProgress>=8,
typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
1978 typedef typename conditional<SwappedTraits::LhsProgress>=8,
typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
1979 typedef typename conditional<SwappedTraits::LhsProgress>=8,
typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1980 typedef typename conditional<SwappedTraits::LhsProgress>=8,
typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
1982 SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
1983 SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
1990 straits.loadLhsUnaligned(blB, a0);
1991 straits.loadRhs(blA, b0);
1992 SAccPacketHalf c0 = predux_half_dowto4(C0);
1993 straits.madd(a0,b0,c0,b0, fix<0>);
1994 straits.acc(c0, alphav, R);
1998 straits.acc(predux_half_dowto4(C0), alphav, R);
2000 res.scatterPacket(i, j2, R);
2002 else if (SwappedTraits::LhsProgress==16)
2008 last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
2009 p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
2013 SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
2014 SResPacket alphav = pset1<SResPacket>(alpha);
2015 straits.acc(C0, alphav, R);
2016 res.scatterPacket(i, j2, R);
2022 ResScalar C0(0), C1(0), C2(0), C3(0);
2024 for(
Index k=0; k<depth; k++)
2033 C0 = cj.pmadd(A0,B_0,C0);
2034 C1 = cj.pmadd(A0,B_1,C1);
2038 C2 = cj.pmadd(A0,B_0,C2);
2039 C3 = cj.pmadd(A0,B_1,C3);
2043 res(i, j2 + 0) += alpha * C0;
2044 res(i, j2 + 1) += alpha * C1;
2045 res(i, j2 + 2) += alpha * C2;
2046 res(i, j2 + 3) += alpha * C3;
2051 for(
Index j2=packet_cols4; j2<cols; j2++)
2054 for(
Index i=peeled_mc_quarter; i<rows; i+=1)
2056 const LhsScalar* blA = &blockA[i*strideA+offsetA];
2060 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
2061 for(
Index k=0; k<depth; k++)
2063 LhsScalar A0 = blA[k];
2064 RhsScalar B_0 = blB[k];
2065 C0 = cj.pmadd(A0, B_0, C0);
2067 res(i, j2) += alpha * C0;
2088template<
typename Scalar,
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2091 typedef typename DataMapper::LinearMapper LinearMapper;
2095template<
typename Scalar,
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2104 HasHalf = (int)HalfPacketSize < (
int)PacketSize,
2105 HasQuarter = (int)QuarterPacketSize < (
int)HalfPacketSize};
2107 EIGEN_ASM_COMMENT(
"EIGEN PRODUCT PACK LHS");
2108 EIGEN_UNUSED_VARIABLE(stride);
2109 EIGEN_UNUSED_VARIABLE(offset);
2110 eigen_assert(((!
PanelMode) && stride==0 && offset==0) || (
PanelMode && stride>=depth && offset<=stride));
2111 eigen_assert( ((
Pack1%PacketSize)==0 &&
Pack1<=4*PacketSize) || (
Pack1<=4) );
2127 if(
Pack1>=3*PacketSize)
2131 if(
PanelMode) count += (3*PacketSize) * offset;
2133 for(
Index k=0; k<depth; k++)
2139 pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2140 pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2141 pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
2143 if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
2147 if(Pack1>=2*PacketSize)
2149 for(; i<peeled_mc2; i+=2*PacketSize)
2151 if(PanelMode) count += (2*PacketSize) * offset;
2153 for(
Index k=0; k<depth; k++)
2156 A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2157 B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2158 pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2159 pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2161 if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
2165 if(Pack1>=1*PacketSize)
2167 for(; i<peeled_mc1; i+=1*PacketSize)
2169 if(PanelMode) count += (1*PacketSize) * offset;
2171 for(
Index k=0; k<depth; k++)
2174 A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2175 pstore(blockA+count, cj.pconj(A));
2178 if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
2182 if(HasHalf && Pack1>=HalfPacketSize)
2184 for(; i<peeled_mc_half; i+=HalfPacketSize)
2186 if(PanelMode) count += (HalfPacketSize) * offset;
2188 for(
Index k=0; k<depth; k++)
2191 A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
2192 pstoreu(blockA+count, cj.pconj(A));
2193 count+=HalfPacketSize;
2195 if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
2199 if(HasQuarter && Pack1>=QuarterPacketSize)
2201 for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
2203 if(PanelMode) count += (QuarterPacketSize) * offset;
2205 for(
Index k=0; k<depth; k++)
2208 A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
2209 pstoreu(blockA+count, cj.pconj(A));
2210 count+=QuarterPacketSize;
2212 if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
2221 if(Pack2<PacketSize && Pack2>1)
2223 for(; i<peeled_mc0; i+=last_lhs_progress)
2225 if(PanelMode) count += last_lhs_progress * offset;
2227 for(
Index k=0; k<depth; k++)
2228 for(
Index w=0; w<last_lhs_progress; w++)
2229 blockA[count++] = cj(lhs(i+w, k));
2231 if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
2237 if(PanelMode) count += offset;
2238 for(
Index k=0; k<depth; k++)
2239 blockA[count++] = cj(lhs(i, k));
2240 if(PanelMode) count += (stride-offset-depth);
2244template<
typename Scalar,
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2247 typedef typename DataMapper::LinearMapper LinearMapper;
2251template<
typename Scalar,
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2260 HasHalf = (int)HalfPacketSize < (
int)PacketSize,
2261 HasQuarter = (int)QuarterPacketSize < (
int)HalfPacketSize};
2263 EIGEN_ASM_COMMENT(
"EIGEN PRODUCT PACK LHS");
2264 EIGEN_UNUSED_VARIABLE(stride);
2265 EIGEN_UNUSED_VARIABLE(offset);
2266 eigen_assert(((!
PanelMode) && stride==0 && offset==0) || (
PanelMode && stride>=depth && offset<=stride));
2284 if(pack>=
psize &&
psize >= QuarterPacketSize)
2291 if (
psize == PacketSize) {
2295 for (
Index p = 0; p <
psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
2296 }
else if (HasHalf && psize == HalfPacketSize) {
2298 PacketBlock<HalfPacket> kernel_half;
2299 for (
Index p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
2300 ptranspose(kernel_half);
2301 for (
Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
2302 }
else if (HasQuarter && psize == QuarterPacketSize) {
2303 gone_quarter =
true;
2304 PacketBlock<QuarterPacket> kernel_quarter;
2305 for (
Index p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
2306 ptranspose(kernel_quarter);
2307 for (
Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
2310 count += psize*pack;
2317 for(; w<pack-3; w+=4)
2319 Scalar a(cj(lhs(i+w+0, k))),
2320 b(cj(lhs(i+w+1, k))),
2321 c(cj(lhs(i+w+2, k))),
2322 d(cj(lhs(i+w+3, k)));
2323 blockA[count++] = a;
2324 blockA[count++] = b;
2325 blockA[count++] = c;
2326 blockA[count++] = d;
2330 blockA[count++] = cj(lhs(i+w, k));
2333 if(PanelMode) count += pack * (stride-offset-depth);
2337 Index left = rows - i;
2340 (starting_pos == i || left >= psize/2 || left >= psize/4) &&
2341 ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
2342 (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
2353 if (Pack2 < PacketSize && !gone_last) {
2355 psize = pack = left & ~1;
2362 if(PanelMode) count += offset;
2363 for(
Index k=0; k<depth; k++)
2364 blockA[count++] = cj(lhs(i, k));
2365 if(PanelMode) count += (stride-offset-depth);
2376template<
typename Scalar,
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2379 typedef typename packet_traits<Scalar>::type Packet;
2380 typedef typename DataMapper::LinearMapper LinearMapper;
2385template<
typename Scalar,
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2389 EIGEN_ASM_COMMENT(
"EIGEN PRODUCT PACK RHS COLMAJOR");
2390 EIGEN_UNUSED_VARIABLE(stride);
2391 EIGEN_UNUSED_VARIABLE(offset);
2392 eigen_assert(((!
PanelMode) && stride==0 && offset==0) || (
PanelMode && stride>=depth && offset<=stride));
2450 const LinearMapper
dm0 = rhs.getLinearMapper(0,
j2 + 0);
2451 const LinearMapper
dm1 = rhs.getLinearMapper(0,
j2 + 1);
2452 const LinearMapper
dm2 = rhs.getLinearMapper(0,
j2 + 2);
2453 const LinearMapper
dm3 = rhs.getLinearMapper(0,
j2 + 3);
2456 if((PacketSize%4)==0)
2465 pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
2466 pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
2467 pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
2468 pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
2469 count+=4*PacketSize;
2474 blockB[count+0] = cj(dm0(k));
2475 blockB[count+1] = cj(dm1(k));
2476 blockB[count+2] = cj(dm2(k));
2477 blockB[count+3] = cj(dm3(k));
2481 if(PanelMode) count += 4 * (stride-offset-depth);
2486 for(
Index j2=packet_cols4; j2<cols; ++j2)
2488 if(PanelMode) count += offset;
2489 const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
2490 for(
Index k=0; k<depth; k++)
2492 blockB[count] = cj(dm0(k));
2495 if(PanelMode) count += (stride-offset-depth);
2500template<
typename Scalar,
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2503 typedef typename packet_traits<Scalar>::type Packet;
2506 typedef typename DataMapper::LinearMapper LinearMapper;
2512 EIGEN_ASM_COMMENT(
"EIGEN PRODUCT PACK RHS ROWMAJOR");
2513 EIGEN_UNUSED_VARIABLE(stride);
2514 EIGEN_UNUSED_VARIABLE(offset);
2515 eigen_assert(((!
PanelMode) && stride==0 && offset==0) || (
PanelMode && stride>=depth && offset<=stride));
2516 const bool HasHalf = (int)HalfPacketSize < (
int)PacketSize;
2517 const bool HasQuarter = (int)QuarterPacketSize < (
int)HalfPacketSize;
2562 for(
Index k=0; k<depth; k++)
2564 if (PacketSize==4) {
2566 pstoreu(blockB+count, cj.pconj(A));
2567 count += PacketSize;
2568 }
else if (
HasHalf && HalfPacketSize==4) {
2570 pstoreu(blockB+count, cj.pconj(A));
2571 count += HalfPacketSize;
2572 }
else if (
HasQuarter && QuarterPacketSize==4) {
2574 pstoreu(blockB+count, cj.pconj(A));
2575 count += QuarterPacketSize;
2577 const LinearMapper
dm0 = rhs.getLinearMapper(k,
j2);
2578 blockB[count+0] = cj(
dm0(0));
2579 blockB[count+1] = cj(
dm0(1));
2580 blockB[count+2] = cj(
dm0(2));
2581 blockB[count+3] = cj(
dm0(3));
2586 if(
PanelMode) count += 4 * (stride-offset-depth);
2593 for(
Index k=0; k<depth; k++)
2595 blockB[count] = cj(rhs(k,
j2));
2598 if(
PanelMode) count += stride-offset-depth;
2609 std::ptrdiff_t
l1,
l2,
l3;
2610 internal::manage_caching_sizes(GetAction, &
l1, &
l2, &
l3);
2618 std::ptrdiff_t
l1,
l2,
l3;
2619 internal::manage_caching_sizes(GetAction, &
l1, &
l2, &
l3);
2628 std::ptrdiff_t
l1,
l2,
l3;
2629 internal::manage_caching_sizes(GetAction, &
l1, &
l2, &
l3);
2640 internal::manage_caching_sizes(SetAction, &
l1, &
l2, &
l3);
Definition ForwardDeclarations.h:87
Base class for all dense matrices, vectors, and expressions.
Definition MatrixBase.h:50
Definition GeneralBlockPanelKernel.h:419
@ ColMajor
Storage order is column major (see TopicStorageOrders).
Definition Constants.h:319
@ RowMajor
Storage order is row major (see TopicStorageOrders).
Definition Constants.h:321
Namespace containing all symbols from the Eigen library.
Definition LDLT.h:16
std::ptrdiff_t l1CacheSize()
Definition GeneralBlockPanelKernel.h:2607
std::ptrdiff_t l2CacheSize()
Definition GeneralBlockPanelKernel.h:2616
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:74
std::ptrdiff_t l3CacheSize()
Definition GeneralBlockPanelKernel.h:2626
void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
Set the cpu L1 and L2 cache sizes (in bytes).
Definition GeneralBlockPanelKernel.h:2638
Determines whether the given binary operation of two numeric types is allowed and what the scalar ret...
Definition XprHelper.h:806
Definition GeneralBlockPanelKernel.h:71
Definition GeneralBlockPanelKernel.h:683
Definition GenericPacketMath.h:1014
Definition GeneralBlockPanelKernel.h:362
Definition GeneralBlockPanelKernel.h:353
Definition GeneralBlockPanelKernel.h:1058
Definition GeneralBlockPanelKernel.h:1112
Definition GeneralBlockPanelKernel.h:1386
Definition GeneralBlockPanelKernel.h:1191
Definition GeneralBlockPanelKernel.h:371
Definition GenericPacketMath.h:107
Definition ForwardDeclarations.h:17
Definition GenericPacketMath.h:133
Definition PacketMath.h:47