Medial Code Documentation
Loading...
Searching...
No Matches
MathFunctions.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2016 Pedro Gonnet (pedro.gonnet@gmail.com)
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
11#define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
12
13namespace Eigen {
14
15namespace internal {
16
17#if EIGEN_HAS_AVX512_MATH
18
19#define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
20 const Packet16f p16f_##NAME = pset1<Packet16f>(X)
21
22#define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \
23 const Packet16f p16f_##NAME = preinterpret<Packet16f,Packet16i>(pset1<Packet16i>(X))
24
25#define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \
26 const Packet8d p8d_##NAME = pset1<Packet8d>(X)
27
28#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
29 const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
30
31#define _EIGEN_DECLARE_CONST_Packet16bf(NAME, X) \
32 const Packet16bf p16bf_##NAME = pset1<Packet16bf>(X)
33
34#define _EIGEN_DECLARE_CONST_Packet16bf_FROM_INT(NAME, X) \
35 const Packet16bf p16bf_##NAME = preinterpret<Packet16bf,Packet16i>(pset1<Packet16i>(X))
36
37template <>
38EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
39plog<Packet16f>(const Packet16f& _x) {
40 return plog_float(_x);
41}
42
43template <>
44EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
45plog<Packet8d>(const Packet8d& _x) {
46 return plog_double(_x);
47}
48
49F16_PACKET_FUNCTION(Packet16f, Packet16h, plog)
50BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog)
51
52template <>
53EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
54plog2<Packet16f>(const Packet16f& _x) {
55 return plog2_float(_x);
56}
57
58template <>
59EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
60plog2<Packet8d>(const Packet8d& _x) {
61 return plog2_double(_x);
62}
63
64F16_PACKET_FUNCTION(Packet16f, Packet16h, plog2)
65BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog2)
66
67// Exponential function. Works by writing "x = m*log(2) + r" where
68// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
69// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
70template <>
71EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
72pexp<Packet16f>(const Packet16f& _x) {
73 _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
74 _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
75 _EIGEN_DECLARE_CONST_Packet16f(127, 127.0f);
76
77 _EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950f);
78 _EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949f);
79
80 _EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341f);
81
82 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500E-4f);
83 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507E-3f);
84 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073E-3f);
85 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894E-2f);
86 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459E-1f);
87 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201E-1f);
88
89 // Clamp x.
90 Packet16f x = pmax(pmin(_x, p16f_exp_hi), p16f_exp_lo);
91
92 // Express exp(x) as exp(m*ln(2) + r), start by extracting
93 // m = floor(x/ln(2) + 0.5).
94 Packet16f m = _mm512_floor_ps(pmadd(x, p16f_cephes_LOG2EF, p16f_half));
95
96 // Get r = x - m*ln(2). Note that we can do this without losing more than one
97 // ulp precision due to the FMA instruction.
98 _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f);
99 Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x);
100 Packet16f r2 = pmul(r, r);
101 Packet16f r3 = pmul(r2, r);
102
103 // Evaluate the polynomial approximant,improved by instruction-level parallelism.
104 Packet16f y, y1, y2;
105 y = pmadd(p16f_cephes_exp_p0, r, p16f_cephes_exp_p1);
106 y1 = pmadd(p16f_cephes_exp_p3, r, p16f_cephes_exp_p4);
107 y2 = padd(r, p16f_1);
108 y = pmadd(y, r, p16f_cephes_exp_p2);
109 y1 = pmadd(y1, r, p16f_cephes_exp_p5);
110 y = pmadd(y, r3, y1);
111 y = pmadd(y, r2, y2);
112
113 // Build emm0 = 2^m.
114 Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127));
115 emm0 = _mm512_slli_epi32(emm0, 23);
116
117 // Return 2^m * exp(r).
118 return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x);
119}
120
121template <>
122EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
123pexp<Packet8d>(const Packet8d& _x) {
124 return pexp_double(_x);
125}
126
127F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp)
128BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp)
129
130template <>
131EIGEN_STRONG_INLINE Packet16h pfrexp(const Packet16h& a, Packet16h& exponent) {
132 Packet16f fexponent;
133 const Packet16h out = float2half(pfrexp<Packet16f>(half2float(a), fexponent));
134 exponent = float2half(fexponent);
135 return out;
136}
137
138template <>
139EIGEN_STRONG_INLINE Packet16h pldexp(const Packet16h& a, const Packet16h& exponent) {
140 return float2half(pldexp<Packet16f>(half2float(a), half2float(exponent)));
141}
142
143template <>
144EIGEN_STRONG_INLINE Packet16bf pfrexp(const Packet16bf& a, Packet16bf& exponent) {
145 Packet16f fexponent;
146 const Packet16bf out = F32ToBf16(pfrexp<Packet16f>(Bf16ToF32(a), fexponent));
147 exponent = F32ToBf16(fexponent);
148 return out;
149}
150
151template <>
152EIGEN_STRONG_INLINE Packet16bf pldexp(const Packet16bf& a, const Packet16bf& exponent) {
153 return F32ToBf16(pldexp<Packet16f>(Bf16ToF32(a), Bf16ToF32(exponent)));
154}
155
156// Functions for sqrt.
157// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
158// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
159// exact solution. The main advantage of this approach is not just speed, but
160// also the fact that it can be inlined and pipelined with other computations,
161// further reducing its effective latency.
162#if EIGEN_FAST_MATH
163template <>
164EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
165psqrt<Packet16f>(const Packet16f& _x) {
166 Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
167 __mmask16 denormal_mask = _mm512_kand(
168 _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
169 _CMP_LT_OQ),
170 _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
171
172 Packet16f x = _mm512_rsqrt14_ps(_x);
173
174 // Do a single step of Newton's iteration.
175 x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
176
177 // Flush results for denormals to zero.
178 return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
179}
180
181template <>
182EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
183psqrt<Packet8d>(const Packet8d& _x) {
184 Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5));
185 __mmask16 denormal_mask = _mm512_kand(
186 _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
187 _CMP_LT_OQ),
188 _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
189
190 Packet8d x = _mm512_rsqrt14_pd(_x);
191
192 // Do a single step of Newton's iteration.
193 x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
194
195 // Do a second step of Newton's iteration.
196 x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
197
198 return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
199}
200#else
201template <>
202EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(const Packet16f& x) {
203 return _mm512_sqrt_ps(x);
204}
205
206template <>
207EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
208 return _mm512_sqrt_pd(x);
209}
210#endif
211
212F16_PACKET_FUNCTION(Packet16f, Packet16h, psqrt)
213BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psqrt)
214
215// prsqrt for float.
216#if defined(EIGEN_VECTORIZE_AVX512ER)
217
218template <>
219EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
220 return _mm512_rsqrt28_ps(x);
221}
222#elif EIGEN_FAST_MATH
223
224template <>
225EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
226prsqrt<Packet16f>(const Packet16f& _x) {
227 _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
228 _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
229 _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
230
231 Packet16f neg_half = pmul(_x, p16f_minus_half);
232
233 // Identity infinite, negative and denormal arguments.
234 __mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ);
235 __mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ);
236 __mmask16 not_finite_pos_mask = not_pos_mask | inf_mask;
237
238 // Compute an approximate result using the rsqrt intrinsic, forcing +inf
239 // for denormals for consistency with AVX and SSE implementations.
240 Packet16f y_approx = _mm512_rsqrt14_ps(_x);
241
242 // Do a single step of Newton-Raphson iteration to improve the approximation.
243 // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
244 // It is essential to evaluate the inner term like this because forming
245 // y_n^2 may over- or underflow.
246 Packet16f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p16f_one_point_five));
247
248 // Select the result of the Newton-Raphson step for positive finite arguments.
249 // For other arguments, choose the output of the intrinsic. This will
250 // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
251 return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx);
252}
253#else
254
255template <>
256EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
257 _EIGEN_DECLARE_CONST_Packet16f(one, 1.0f);
258 return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x));
259}
260#endif
261
262F16_PACKET_FUNCTION(Packet16f, Packet16h, prsqrt)
263BF16_PACKET_FUNCTION(Packet16f, Packet16bf, prsqrt)
264
265// prsqrt for double.
266#if EIGEN_FAST_MATH
267template <>
268EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
269prsqrt<Packet8d>(const Packet8d& _x) {
270 _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
271 _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
272 _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
273
274 Packet8d neg_half = pmul(_x, p8d_minus_half);
275
276 // Identity infinite, negative and denormal arguments.
277 __mmask8 inf_mask = _mm512_cmp_pd_mask(_x, p8d_inf, _CMP_EQ_OQ);
278 __mmask8 not_pos_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LE_OQ);
279 __mmask8 not_finite_pos_mask = not_pos_mask | inf_mask;
280
281 // Compute an approximate result using the rsqrt intrinsic, forcing +inf
282 // for denormals for consistency with AVX and SSE implementations.
283#if defined(EIGEN_VECTORIZE_AVX512ER)
284 Packet8d y_approx = _mm512_rsqrt28_pd(_x);
285#else
286 Packet8d y_approx = _mm512_rsqrt14_pd(_x);
287#endif
288 // Do one or two steps of Newton-Raphson's to improve the approximation, depending on the
289 // starting accuracy (either 2^-14 or 2^-28, depending on whether AVX512ER is available).
290 // The Newton-Raphson algorithm has quadratic convergence and roughly doubles the number
291 // of correct digits for each step.
292 // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
293 // It is essential to evaluate the inner term like this because forming
294 // y_n^2 may over- or underflow.
295 Packet8d y_newton = pmul(y_approx, pmadd(neg_half, pmul(y_approx, y_approx), p8d_one_point_five));
296#if !defined(EIGEN_VECTORIZE_AVX512ER)
297 y_newton = pmul(y_newton, pmadd(y_newton, pmul(neg_half, y_newton), p8d_one_point_five));
298#endif
299 // Select the result of the Newton-Raphson step for positive finite arguments.
300 // For other arguments, choose the output of the intrinsic. This will
301 // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
302 return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx);
303}
304#else
305template <>
306EIGEN_STRONG_INLINE Packet8d prsqrt<Packet8d>(const Packet8d& x) {
307 _EIGEN_DECLARE_CONST_Packet8d(one, 1.0f);
308 return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(x));
309}
310#endif
311
312template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
313Packet16f plog1p<Packet16f>(const Packet16f& _x) {
314 return generic_plog1p(_x);
315}
316
317F16_PACKET_FUNCTION(Packet16f, Packet16h, plog1p)
318BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog1p)
319
320template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
321Packet16f pexpm1<Packet16f>(const Packet16f& _x) {
322 return generic_expm1(_x);
323}
324
325F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1)
326BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1)
327
328#endif // EIGEN_HAS_AVX512_MATH
329
330
331template <>
332EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
333psin<Packet16f>(const Packet16f& _x) {
334 return psin_float(_x);
335}
336
337template <>
338EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
339pcos<Packet16f>(const Packet16f& _x) {
340 return pcos_float(_x);
341}
342
343template <>
344EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
345ptanh<Packet16f>(const Packet16f& _x) {
346 return internal::generic_fast_tanh_float(_x);
347}
348
349F16_PACKET_FUNCTION(Packet16f, Packet16h, psin)
350F16_PACKET_FUNCTION(Packet16f, Packet16h, pcos)
351F16_PACKET_FUNCTION(Packet16f, Packet16h, ptanh)
352
353BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psin)
354BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pcos)
355BF16_PACKET_FUNCTION(Packet16f, Packet16bf, ptanh)
356
357} // end namespace internal
358
359} // end namespace Eigen
360
361#endif // THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
Namespace containing all symbols from the Eigen library.
Definition LDLT.h:16