12 #ifndef MFEM_KERNELS_HPP
13 #define MFEM_KERNELS_HPP
16 #define _USE_MATH_DEFINES
20 #include "../config/config.hpp"
21 #include "../general/cuda.hpp"
22 #include "../general/globals.hpp"
45 MFEM_HOST_DEVICE
inline
46 double Norml2(
const int size,
const T *data)
48 if (0 == size) {
return 0.0; }
49 if (1 == size) {
return std::abs(data[0]); }
52 for (
int i = 0; i < size; i++)
56 const T absdata = fabs(data[i]);
59 const T sqr_arg = scale / absdata;
60 sum = 1.0 + sum * (sqr_arg * sqr_arg);
64 const T sqr_arg = absdata / scale;
65 sum += (sqr_arg * sqr_arg);
68 return scale * sqrt(sum);
74 template<
typename TA,
typename TX,
typename TY>
75 MFEM_HOST_DEVICE
inline
76 void Mult(
const int height,
const int width, TA *data,
const TX *x, TY *y)
80 for (
int row = 0; row < height; row++)
88 for (
int row = 0; row < height; row++)
90 y[row] = x_col*d_col[row];
93 for (
int col = 1; col < width; col++)
96 for (
int row = 0; row < height; row++)
98 y[row] += x_col*d_col[row];
106 MFEM_HOST_DEVICE
inline
109 for (
int i = 0; i < size; i++)
111 for (
int j = 0; j < i; j++)
113 const T
a = 0.5 * (data[i*size+j] + data[j*size+i]);
114 data[j*size+i] = data[i*size+j] =
a;
120 template<
int dim,
typename T>
121 MFEM_HOST_DEVICE
inline T
Det(
const T *data)
128 template<
int dim,
typename T>
129 MFEM_HOST_DEVICE
inline
133 const T det = TAdjDet<T>(layout_t(), data, layout_t(), inv_data);
134 TAssign<AssignOp::Mult>(layout_t(), inv_data,
static_cast<T
>(1.0)/det);
139 template<
typename TALPHA,
typename TA,
typename TB,
typename TC>
140 MFEM_HOST_DEVICE
inline
141 void Add(
const int height,
const int width,
const TALPHA
alpha,
142 const TA *Adata,
const TB *Bdata, TC *Cdata)
144 for (
int j = 0; j < width; j++)
146 for (
int i = 0; i < height; i++)
148 const int n = i*width+j;
149 Cdata[n] = Adata[n] + alpha * Bdata[n];
157 template<
typename TA,
typename TB,
typename TC>
158 MFEM_HOST_DEVICE
inline
159 void Mult(
const int Aheight,
const int Awidth,
const int Bwidth,
160 const TB *Bdata,
const TC *Cdata, TA *Adata)
162 const int ah_x_aw = Aheight * Awidth;
163 for (
int i = 0; i < ah_x_aw; i++) { Adata[i] = 0.0; }
164 for (
int j = 0; j < Awidth; j++)
166 for (
int k = 0; k < Bwidth; k++)
168 for (
int i = 0; i < Aheight; i++)
170 Adata[i+j*Aheight] += Bdata[i+k*Aheight] * Cdata[k+j*Bwidth];
179 template<
typename TA,
typename TB,
typename TC>
180 MFEM_HOST_DEVICE
inline
181 void MultABt(
const int Aheight,
const int Awidth,
const int Bheight,
182 const TA *Adata,
const TB *Bdata, TC *ABtdata)
184 const int ah_x_bh = Aheight * Bheight;
185 for (
int i = 0; i < ah_x_bh; i++) { ABtdata[i] = 0.0; }
186 for (
int k = 0; k < Awidth; k++)
189 for (
int j = 0; j < Bheight; j++)
191 const double bjk = Bdata[j];
192 for (
int i = 0; i < Aheight; i++)
194 c[i] += Adata[i] * bjk;
220 MFEM_HOST_DEVICE
static inline
231 MFEM_HOST_DEVICE
static inline
232 void Eigenvalues2S(
const double &d12,
double &d1,
double &d2)
234 const double sqrt_1_eps = sqrt(1./Epsilon);
239 const double zeta = (d2 - d1)/(2*d12);
240 if (fabs(zeta) < sqrt_1_eps)
242 t = d12*copysign(1./(fabs(zeta) + sqrt(1. + zeta*zeta)), zeta);
246 t = d12*copysign(0.5/fabs(zeta), zeta);
254 MFEM_HOST_DEVICE
static inline
255 void Eigensystem2S(
const double &d12,
double &d1,
double &d2,
256 double &c,
double &s)
258 const double sqrt_1_eps = sqrt(1./Epsilon);
268 const double zeta = (d2 - d1)/(2*d12);
269 const double azeta = fabs(zeta);
270 if (azeta < sqrt_1_eps)
272 t = copysign(1./(azeta + sqrt(1. + zeta*zeta)), zeta);
276 t = copysign(0.5/azeta, zeta);
278 c = sqrt(1./(1. + t*t));
288 MFEM_HOST_DEVICE
static inline
289 void GetScalingFactor(
const double &d_max,
double &mult)
294 mult = frexp(d_max, &d_exp);
295 if (d_exp == std::numeric_limits<double>::max_exponent)
297 mult *= std::numeric_limits<double>::radix;
310 MFEM_HOST_DEVICE
static inline
311 bool KernelVector2G(
const int &mode,
312 double &d1,
double &d12,
double &d21,
double &d2)
323 double n1 = fabs(d1) + fabs(d21);
324 double n2 = fabs(d2) + fabs(d12);
326 bool swap_columns = (n2 > n1);
338 if (fabs(d1) > fabs(d21))
346 if (fabs(d1) < fabs(d21))
358 if (fabs(d12) > fabs(d2))
371 if (fabs(d12) < fabs(d2))
390 mu = copysign(n1, d1);
391 n1 = -d21*(d21/(d1 + mu));
395 if (fabs(n1) <= fabs(d21))
399 mu = (2./(1. + n1*n1))*(n1*d12 + d2);
407 mu = (2./(1. + n2*n2))*(d12 + n2*d2);
429 n2 = 1./(1. + fabs(mu));
431 if (fabs(d1) <= n2*fabs(d2))
452 MFEM_HOST_DEVICE
static inline
453 void Vec_normalize3_aux(
const double &x1,
const double &x2,
455 double &n1,
double &n2,
double &n3)
459 const double m = fabs(x1);
463 t = sqrt(1./(t + r*r));
464 n1 = copysign(t, x1);
471 MFEM_HOST_DEVICE
static inline
472 void Vec_normalize3(
const double &x1,
const double &x2,
const double &x3,
473 double &n1,
double &n2,
double &n3)
476 if (fabs(x1) >= fabs(x2))
478 if (fabs(x1) >= fabs(x3))
482 Vec_normalize3_aux(x1, x2, x3, n1, n2, n3);
491 else if (fabs(x2) >= fabs(x3))
493 Vec_normalize3_aux(x2, x1, x3, n2, n1, n3);
496 Vec_normalize3_aux(x3, x1, x2, n3, n1, n2);
500 MFEM_HOST_DEVICE
static inline
501 int KernelVector3G_aux(
const int &mode,
502 double &d1,
double &d2,
double &d3,
503 double &c12,
double &c13,
double &c23,
504 double &c21,
double &c31,
double &c32)
507 double mu, n1, n2, n3, s1, s2, s3;
509 s1 = hypot(c21, c31);
516 mu = copysign(n1, d1);
517 n1 = -s1*(s1/(d1 + mu));
522 if (fabs(n1) >= fabs(c21))
524 if (fabs(n1) >= fabs(c31))
529 mu = 2./(1. + s2*s2 + s3*s3);
530 n2 = mu*(c12 + s2*d2 + s3*c32);
531 n3 = mu*(c13 + s2*c23 + s3*d3);
541 else if (fabs(c21) >= fabs(c31))
546 mu = 2./(1. + s1*s1 + s3*s3);
547 n2 = mu*(s1*c12 + d2 + s3*c32);
548 n3 = mu*(s1*c13 + c23 + s3*d3);
560 mu = 2./(1. + s1*s1 + s2*s2);
561 n2 = mu*(s1*c12 + s2*d2 + c32);
562 n3 = mu*(s1*c13 + s2*c23 + d3);
576 if (KernelVector2G(mode, d2, c23, c32, d3))
597 d1 = -(c12*d2 + c13*d3)/d1;
601 Vec_normalize3(d1, d2, d3, d1, d2, d3);
607 MFEM_HOST_DEVICE
static inline
608 int KernelVector3S(
const int &mode,
const double &d12,
609 const double &d13,
const double &d23,
610 double &d1,
double &d2,
double &d3)
623 double c12 = d12, c13 = d13, c23 = d23;
624 double c21, c31, c32;
628 c32 = fabs(d1) + fabs(c12) + fabs(c13);
629 c31 = fabs(d2) + fabs(c12) + fabs(c23);
630 c21 = fabs(d3) + fabs(c13) + fabs(c23);
635 col = (c32 >= c31) ? 1 : 2;
639 col = (c31 >= c21) ? 2 : 3;
671 if (fabs(d1) <= fabs(c13))
673 row = (fabs(d1) <= fabs(c12)) ? 1 : 2;
677 row = (fabs(c12) <= fabs(c13)) ? 2 : 3;
682 if (fabs(d1) >= fabs(c13))
684 row = (fabs(d1) >= fabs(c12)) ? 1 : 2;
688 row = (fabs(c12) >= fabs(c13)) ? 2 : 3;
719 row = KernelVector3G_aux(mode, d1, d2, d3, c12, c13, c23, c21, c31, c32);
735 MFEM_HOST_DEVICE
static inline
736 int Reduce3S(
const int &mode,
737 double &d1,
double &d2,
double &d3,
738 double &d12,
double &d13,
double &d23,
739 double &z1,
double &z2,
double &z3,
740 double &v1,
double &v2,
double &v3,
760 double s, w1, w2, w3;
766 if (fabs(z1) <= fabs(z3))
768 k = (fabs(z1) <= fabs(z2)) ? 1 : 2;
772 k = (fabs(z2) <= fabs(z3)) ? 2 : 3;
778 if (fabs(z1) >= fabs(z3))
780 k = (fabs(z1) >= fabs(z2)) ? 1 : 2;
784 k = (fabs(z2) >= fabs(z3)) ? 2 : 3;
811 g = copysign(1., z1);
812 v1 = -s*(s/(z1 + g));
815 if (fabs(z2) > g) { g = fabs(z2); }
816 if (fabs(z3) > g) { g = fabs(z3); }
820 g = 2./(v1*v1 + v2*v2 + v3*v3);
825 w1 = g*( d1*v1 + d12*v2 + d13*v3);
826 w2 = g*(d12*v1 + d2*v2 + d23*v3);
827 w3 = g*(d13*v1 + d23*v2 + d3*v3);
829 s = (g/2)*(v1*w1 + v2*w2 + v3*w3);
836 d23 -= v2*w3 + v3*w2;
841 s = d12 - v1*w2 - v2*w1;
842 s = d13 - v1*w3 - v3*w1;
865 template<> MFEM_HOST_DEVICE
inline
872 internal::Eigensystem2S(d2, d0, d3, c, s);
896 template<> MFEM_HOST_DEVICE
inline
899 double d11 = data[0];
900 double d12 = data[3];
901 double d22 = data[4];
902 double d13 = data[6];
903 double d23 = data[7];
904 double d33 = data[8];
908 double d_max = fabs(d11);
909 if (d_max < fabs(d22)) { d_max = fabs(d22); }
910 if (d_max < fabs(d33)) { d_max = fabs(d33); }
911 if (d_max < fabs(d12)) { d_max = fabs(d12); }
912 if (d_max < fabs(d13)) { d_max = fabs(d13); }
913 if (d_max < fabs(d23)) { d_max = fabs(d23); }
915 internal::GetScalingFactor(d_max, mult);
918 d11 /= mult; d22 /= mult; d33 /= mult;
919 d12 /= mult; d13 /= mult; d23 /= mult;
921 double aa = (d11 + d22 + d33)/3;
922 double c1 = d11 - aa;
923 double c2 = d22 - aa;
924 double c3 = d33 - aa;
928 Q = (2*(d12*d12 + d13*d13 + d23*d23) + c1*c1 + c2*c2 + c3*c3)/6;
929 R = (c1*(d23*d23 - c2*c3)+ d12*(d12*c3 - 2*d13*d23) + d13*d13*c2)/2;
933 lambda[0] = lambda[1] = lambda[2] = aa;
934 vec[0] = 1.; vec[3] = 0.; vec[6] = 0.;
935 vec[1] = 0.; vec[4] = 1.; vec[7] = 0.;
936 vec[2] = 0.; vec[5] = 0.; vec[8] = 1.;
940 double sqrtQ = sqrt(Q);
941 double sqrtQ3 = Q*sqrtQ;
945 if (fabs(R) >= sqrtQ3)
964 r = -2*sqrtQ*cos((acos(R) + 2.0*M_PI)/3);
968 r = -2*sqrtQ*cos(acos(R)/3);
992 switch (internal::KernelVector3S(mode, d12, d13, d23, c1, c2, c3))
996 lambda[0] = lambda[1] = lambda[2] = aa;
997 vec[0] = 1.; vec[3] = 0.; vec[6] = 0.;
998 vec[1] = 0.; vec[4] = 1.; vec[7] = 0.;
999 vec[2] = 0.; vec[5] = 0.; vec[8] = 1.;
1013 double v1, v2, v3, g;
1014 int k = internal::Reduce3S(mode, d11, d22, d33, d12, d13, d23,
1015 c1, c2, c3, v1, v2, v3, g);
1023 internal::Eigensystem2S(d23, d22, d33, c, s);
1026 double *vec_1, *vec_2, *vec_3;
1031 lambda[0] = d11; vec_1 = vec;
1032 lambda[1] = d22; vec_2 = vec + 3;
1033 lambda[2] = d33; vec_3 = vec + 6;
1035 else if (d11 <= d33)
1037 lambda[0] = d11; vec_1 = vec;
1038 lambda[1] = d33; vec_3 = vec + 3;
1039 lambda[2] = d22; vec_2 = vec + 6;
1043 lambda[0] = d33; vec_3 = vec;
1044 lambda[1] = d11; vec_1 = vec + 3;
1045 lambda[2] = d22; vec_2 = vec + 6;
1052 lambda[0] = d22; vec_2 = vec;
1053 lambda[1] = d11; vec_1 = vec + 3;
1054 lambda[2] = d33; vec_3 = vec + 6;
1056 else if (d22 <= d33)
1058 lambda[0] = d22; vec_2 = vec;
1059 lambda[1] = d33; vec_3 = vec + 3;
1060 lambda[2] = d11; vec_1 = vec + 6;
1064 lambda[0] = d33; vec_3 = vec;
1065 lambda[1] = d22; vec_2 = vec + 3;
1066 lambda[2] = d11; vec_1 = vec + 6;
1073 d22 = g*(v2*c - v3*s);
1074 d33 = g*(v2*s + v3*c);
1075 vec_2[0] = - v1*d22; vec_3[0] = - v1*d33;
1076 vec_2[1] = c - v2*d22; vec_3[1] = s - v2*d33;
1077 vec_2[2] = -s - v3*d22; vec_3[2] = c - v3*d33;
1098 template<> MFEM_HOST_DEVICE
inline
1101 double d0, d1, d2, d3;
1109 double d_max = fabs(d0);
1110 if (d_max < fabs(d1)) { d_max = fabs(d1); }
1111 if (d_max < fabs(d2)) { d_max = fabs(d2); }
1112 if (d_max < fabs(d3)) { d_max = fabs(d3); }
1113 internal::GetScalingFactor(d_max, mult);
1121 double t = 0.5*((d0+d2)*(d0-d2)+(d1-d3)*(d1+d3));
1122 double s = d0*d2 + d1*d3;
1123 s = sqrt(0.5*(d0*d0 + d1*d1 + d2*d2 + d3*d3) + sqrt(t*t + s*s));
1129 t = fabs(d0*d3 - d1*d2) / s;
1146 template<> MFEM_HOST_DEVICE
inline
1149 double d0, d1, d2, d3, d4, d5, d6, d7, d8;
1150 d0 = data[0]; d3 = data[3]; d6 = data[6];
1151 d1 = data[1]; d4 = data[4]; d7 = data[7];
1152 d2 = data[2]; d5 = data[5]; d8 = data[8];
1155 double d_max = fabs(d0);
1156 if (d_max < fabs(d1)) { d_max = fabs(d1); }
1157 if (d_max < fabs(d2)) { d_max = fabs(d2); }
1158 if (d_max < fabs(d3)) { d_max = fabs(d3); }
1159 if (d_max < fabs(d4)) { d_max = fabs(d4); }
1160 if (d_max < fabs(d5)) { d_max = fabs(d5); }
1161 if (d_max < fabs(d6)) { d_max = fabs(d6); }
1162 if (d_max < fabs(d7)) { d_max = fabs(d7); }
1163 if (d_max < fabs(d8)) { d_max = fabs(d8); }
1164 internal::GetScalingFactor(d_max, mult);
1167 d0 /= mult; d1 /= mult; d2 /= mult;
1168 d3 /= mult; d4 /= mult; d5 /= mult;
1169 d6 /= mult; d7 /= mult; d8 /= mult;
1171 double b11 = d0*d0 + d1*d1 + d2*d2;
1172 double b12 = d0*d3 + d1*d4 + d2*d5;
1173 double b13 = d0*d6 + d1*d7 + d2*d8;
1174 double b22 = d3*d3 + d4*d4 + d5*d5;
1175 double b23 = d3*d6 + d4*d7 + d5*d8;
1176 double b33 = d6*d6 + d7*d7 + d8*d8;
1195 double aa = (b11 + b22 + b33)/3;
1201 double b11_b22 = ((d0-d3)*(d0+d3)+(d1-d4)*(d1+d4)+(d2-d5)*(d2+d5));
1202 double b22_b33 = ((d3-d6)*(d3+d6)+(d4-d7)*(d4+d7)+(d5-d8)*(d5+d8));
1203 double b33_b11 = ((d6-d0)*(d6+d0)+(d7-d1)*(d7+d1)+(d8-d2)*(d8+d2));
1204 c1 = (b11_b22 - b33_b11)/3;
1205 c2 = (b22_b33 - b11_b22)/3;
1206 c3 = (b33_b11 - b22_b33)/3;
1209 Q = (2*(b12*b12 + b13*b13 + b23*b23) + c1*c1 + c2*c2 + c3*c3)/6;
1210 R = (c1*(b23*b23 - c2*c3)+ b12*(b12*c3 - 2*b13*b23) +b13*b13*c2)/2;
1243 double sqrtQ = sqrt(Q);
1244 double sqrtQ3 = Q*sqrtQ;
1249 if (fabs(R) >= sqrtQ3)
1271 aa -= 2*sqrtQ*cos(acos(R)/3);
1275 aa -= 2*sqrtQ*cos((acos(R) + 2.0*M_PI)/3);
1279 aa -= 2*sqrtQ*cos((acos(R) - 2.0*M_PI)/3);
1286 r = -2*sqrtQ*cos((acos(R) + 2.0*M_PI)/3);
1295 r = -2*sqrtQ*cos(acos(R)/3);
1326 switch (internal::KernelVector3S(mode, b12, b13, b23, c1, c2, c3))
1342 double v1, v2, v3, g;
1343 internal::Reduce3S(mode, b11, b22, b33, b12, b13, b23,
1344 c1, c2, c3, v1, v2, v3, g);
1351 internal::Eigenvalues2S(b23, b22, b33);
1355 aa = fmin(fmin(b11, b22), b33);
1361 aa = (b22 <= b33) ? b22 : fmax(b11, b33);
1365 aa = (b11 <= b33) ? b11 : fmax(b33, b22);
1370 aa = fmax(fmax(b11, b22), b33);
1376 return sqrt(fabs(aa))*mult;
1383 #endif // MFEM_KERNELS_HPP
MFEM_HOST_DEVICE T Det(const T *data)
Compute the determinant of a square matrix of size dim with given data.
MFEM_HOST_DEVICE void MultABt(const int Aheight, const int Awidth, const int Bheight, const TA *Adata, const TB *Bdata, TC *ABtdata)
Multiply a matrix of size Aheight x Awidth and data Adata with the transpose of a matrix of size Bhei...
MFEM_HOST_DEVICE void Add(const int height, const int width, const TALPHA alpha, const TA *Adata, const TB *Bdata, TC *Cdata)
Compute C = A + alpha*B, where the matrices A, B and C are of size height x width with data Adata...
double CalcSingularvalue(const double *data, const int i)
Return the i'th singular value of the matrix of size dim with given data.
MFEM_HOST_DEVICE double CalcSingularvalue< 3 >(const double *data, const int i)
Return the i'th singular value of the matrix of size 3 with given data.
MFEM_HOST_DEVICE void CalcEigenvalues< 2 >(const double *data, double *lambda, double *vec)
MFEM_HOST_DEVICE double Norml2(const int size, const T *data)
Returns the l2 norm of the Vector with given size and data.
void Swap(Array< T > &, Array< T > &)
MFEM_HOST_DEVICE double CalcSingularvalue< 2 >(const double *data, const int i)
Return the i'th singular value of the matrix of size 2 with given data.
MFEM_HOST_DEVICE void CalcEigenvalues< 3 >(const double *data, double *lambda, double *vec)
MFEM_HOST_DEVICE void CalcInverse(const T *data, T *inv_data)
Return the inverse a matrix with given size and data into the matrix with data inv_data.
double epsilon(const Vector &x)
MFEM_HOST_DEVICE void Mult(const int height, const int width, TA *data, const TX *x, TY *y)
Matrix vector multiplication: y = A x, where the matrix A is of size height x width with given data...
void CalcEigenvalues(const double *data, double *lambda, double *vec)
MFEM_HOST_DEVICE void Symmetrize(const int size, T *data)
Symmetrize a square matrix with given size and data: A -> (A+A^T)/2.