20#ifdef MFEM_HAVE_GCC_PRAGMA_DIAGNOSTIC
21#pragma GCC diagnostic push
22#pragma GCC diagnostic ignored "-Wunused-function"
25#ifndef GSLIB_RELEASE_VERSION
26#define GSLIB_RELEASE_VERSION 10007
28#ifdef MFEM_HAVE_GCC_PRAGMA_DIAGNOSTIC
29#pragma GCC diagnostic pop
33#if GSLIB_RELEASE_VERSION >= 10009
34#define CODE_INTERNAL 0
36#define CODE_NOT_FOUND 2
43 double x[
DIM], r[
DIM], oldr[
DIM], dist2, dist2p, tr;
49 double *x[
DIM], *dxdn[
DIM];
73struct findptsLocalHashData_t
84static MFEM_HOST_DEVICE
inline void lag_eval_first_der(
double *p0,
double x,
85 int i,
const double *z,
89 double u0 = 1, u1 = 0;
90 for (
int j = 0; j < pN; ++j)
94 double d_j = 2*(x-z[j]);
100 p0[pN+i] = 2.0*lCoeff[i]*u1;
105static MFEM_HOST_DEVICE
inline void lag_eval_second_der(
double *p0,
double x,
106 int i,
const double *z,
107 const double *lCoeff,
110 double u0 = 1, u1 = 0, u2 = 0;
111 for (
int j = 0; j < pN; ++j)
115 double d_j = 2*(x-z[j]);
121 p0[i] = lCoeff[i]*u0;
122 p0[pN+i] = 2.0*lCoeff[i]*u1;
123 p0[2*pN+i] = 8.0*lCoeff[i]*u2;
127static MFEM_HOST_DEVICE
inline double AABB_test(
const obbox_t *
const b,
131 for (
int d = 0; d < 3; ++d)
133 b_d = (x[d]-
b->x[d].min)*(
b->x[d].max-x[d]);
134 if (b_d < 0) {
return b_d; }
140static MFEM_HOST_DEVICE
inline double bbox_test(
const obbox_t *
const b,
143 const double bxyz = AABB_test(
b, x);
151 for (
int d = 0; d < 3; ++d)
153 dxyz[d] = x[d]-
b->c0[d];
156 for (
int d = 0; d < 3; ++d)
159 for (
int e = 0; e < 3; ++e)
161 rst +=
b->A[d*3+e]*dxyz[e];
163 double brst = (rst+1)*(1-rst);
164 test = test < 0 ? test : brst;
171static MFEM_HOST_DEVICE
inline int hash_index(
const findptsLocalHashData_t *
p,
174 const int n =
p->hash_n;
176 for (
int d = 3-1; d >= 0; --d)
179 int i = (int)floor((x[d]-
p->bnd[d].min)*
p->fac[d]);
180 sum += i < 0 ? 0 : (n-1 < i ? n-1 : i);
186static MFEM_HOST_DEVICE
inline void lin_solve_3(
double x[3],
const double A[9],
189 const double a = A[4]*A[8]-A[5]*A[7],
b = A[5]*A[6]-A[3]*A[8],
190 c = A[3]*A[7]-A[4]*A[6],
191 idet = 1 / (A[0]*
a+A[1]*
b+A[2]*c);
192 const double inv0 =
a, inv1 = A[2]*A[7]-A[1]*A[8],
193 inv2 = A[1]*A[5]-A[2]*A[4], inv3 =
b,
194 inv4 = A[0]*A[8]-A[2]*A[6], inv5 = A[2]*A[3]-A[0]*A[5],
195 inv6 = c, inv7 = A[1]*A[6]-A[0]*A[7],
196 inv8 = A[0]*A[4]-A[1]*A[3];
197 x[0] = idet*(inv0*y[0]+inv1*y[1]+inv2*y[2]);
198 x[1] = idet*(inv3*y[0]+inv4*y[1]+inv5*y[2]);
199 x[2] = idet*(inv6*y[0]+inv7*y[1]+inv8*y[2]);
203static MFEM_HOST_DEVICE
inline void lin_solve_sym_2(
double x[2],
207 const double idet = 1 / (A[0]*A[2]-A[1]*A[1]);
208 x[0] = idet*(A[2]*y[0]-A[1]*y[1]);
209 x[1] = idet*(A[0]*y[1]-A[1]*y[0]);
213static MFEM_HOST_DEVICE
inline double l2norm2(
const double x[3])
215 return x[0]*x[0]+x[1]*x[1]+x[2]*x[2];
226#define CONVERGED_FLAG (1u << 6)
227#define FLAG_MASK 0x7fu
230static MFEM_HOST_DEVICE
inline int num_constrained(
const int flags)
232 const int y = flags | flags >> 1;
233 return (y & 1u)+(y >> 2 & 1u)+(((3 == 2) | y >> 4) & 1u);
237static MFEM_HOST_DEVICE
inline int plus_1_mod_3(
const int x)
239 return ((x | x >> 1)+1) & 3u;
241static MFEM_HOST_DEVICE
inline int plus_2_mod_3(
const int x)
243 const int y = (x-1) & 3u;
248static MFEM_HOST_DEVICE
inline int which_bit(
const int x)
250 const int y = x & 7u;
251 return (y-(y >> 2)) | ((x-1) & 4u) | (x >> 4);
255static MFEM_HOST_DEVICE
inline int face_index(
const int x)
257 return which_bit(x)-1;
261static MFEM_HOST_DEVICE
inline int edge_index(
const int x)
263 const int y = ~((x >> 1) | x);
264 const int RTSR = ((x >> 1) & 1u) | ((x >> 2) & 2u) |
265 ((x >> 3) & 4u) | ((x << 2) & 8u);
266 const int re = RTSR >> 1;
267 const int se = 4u | RTSR >> 2;
268 const int te = 8u | (RTSR & 3u);
269 return ((0
u-(y & 1u)) & re) | ((0
u-((y >> 2) & 1u)) & se) |
270 ((0
u-((y >> 4) & 1u)) & te);
274static MFEM_HOST_DEVICE
inline int point_index(
const int x)
276 return ((x >> 1) & 1u) | ((x >> 2) & 2u) | ((x >> 3) & 4u);
281static MFEM_HOST_DEVICE
inline findptsElemFace
282get_face(
const double *elx[3],
const double *wtend,
int fi,
double *workspace,
283 int &side_init,
int jidx,
int pN)
285 const int dn = fi >> 1, d1 = plus_1_mod_3(dn), d2 = plus_2_mod_3(dn);
286 const int side_n = fi & 1;
287 const int p_Nfr = pN*pN;
288 findptsElemFace face;
289 const int jj = jidx % pN;
290 const int dd = jidx / pN;
291 for (
int d = 0; d < 3; ++d)
293 face.x[d] = workspace+d*p_Nfr;
294 face.dxdn[d] = workspace+(3+d)*p_Nfr;
297 if (side_init != (1u << fi))
299 const int e_stride[3] = {1, pN, pN*pN};
300#define ELX(d, j, k, l) elx[d][j*e_stride[d1]+k*e_stride[d2]+l*e_stride[dn]]
301 for (
int k = 0; k < pN; ++k)
304 face.x[dd][jj+k*pN] = ELX(dd, jj, k, side_n*(pN-1));
308 for (
int l = 0; l < pN; ++l)
310 sum_l += wtend[pN+l]*ELX(dd, jj, k, l);
312 face.dxdn[dd][jj+k*pN] = sum_l;
321static MFEM_HOST_DEVICE
inline findptsElemEdge
322get_edge(
const double *elx[3],
const double *wtend,
int ei,
double *workspace,
323 int &side_init,
int jidx,
int pN)
325 findptsElemEdge edge;
326 const int de = ei >> 2, dn1 = plus_1_mod_3(de), dn2 = plus_2_mod_3(de);
327 const int side_n1 = ei & 1, side_n2 = (ei & 2) >> 1;
329 const int in1 = side_n1*(pN-1), in2 = side_n2*(pN-1);
330 const double *wt1 = wtend+side_n1*pN*3;
331 const double *wt2 = wtend+side_n2*pN*3;
332 const int jj = jidx % pN;
333 const int dd = jidx / pN;
334 for (
int d = 0; d < 3; ++d)
336 edge.x[d] = workspace+d*pN;
337 edge.dxdn1[d] = workspace+(3+d)*pN;
338 edge.dxdn2[d] = workspace+(6+d)*pN;
339 edge.d2xdn1[d] = workspace+(9+d)*pN;
340 edge.d2xdn2[d] = workspace+(12+d)*pN;
343 if (jidx >= 3*pN) {
return edge; }
345 if (side_init != (64u << ei))
347 const int e_stride[3] = {1, pN, pN*pN};
348#define ELX(d, j, k, l) elx[d][j*e_stride[de]+k*e_stride[dn1]+l*e_stride[dn2]]
350 edge.x[dd][jj] = ELX(dd, jj, in1, in2);
353 double sums_k[2] = {0, 0};
354 for (
int k = 0; k < pN; ++k)
356 sums_k[0] += wt1[pN+k]*ELX(dd, jj, k, in2);
357 sums_k[1] += wt1[2*pN+k]*ELX(dd, jj, k, in2);
359 edge.dxdn1[dd][jj] = sums_k[0];
360 edge.d2xdn1[dd][jj] = sums_k[1];
363 sums_k[0] = 0, sums_k[1] = 0;
364 for (
int k = 0; k < pN; ++k)
366 sums_k[0] += wt2[pN+k]*ELX(dd, jj, in1, k);
367 sums_k[1] += wt2[2*pN+k]*ELX(dd, jj, in1, k);
369 edge.dxdn2[dd][jj] = sums_k[0];
370 edge.d2xdn2[dd][jj] = sums_k[1];
377static MFEM_HOST_DEVICE
inline findptsElemPt get_pt(
const double *elx[3],
381 const int side_n1 = pi & 1, side_n2 = (pi >> 1) & 1, side_n3 = (pi >> 2) & 1;
382 const int in1 = side_n1*(pN-1), in2 = side_n2*(pN-1),
383 in3 = side_n3*(pN-1);
384 const int hes_stride = (3+1)*3 / 2;
387#define ELX(d, j, k, l) elx[d][j+k*pN+l*pN*pN]
388 for (
int d = 0; d < 3; ++d)
390 pt.x[d] = ELX(d, side_n1*(pN-1), side_n2*(pN-1),
393 const double *wt1 = wtend+pN*(1+3*side_n1);
394 const double *wt2 = wtend+pN*(1+3*side_n2);
395 const double *wt3 = wtend+pN*(1+3*side_n3);
397 for (
int i = 0; i < 3; ++i)
401 for (
int i = 0; i < hes_stride; ++i)
403 pt.hes[hes_stride*d+i] = 0;
406 for (
int j = 0; j < pN; ++j)
408 pt.jac[3*d+0] += wt1[j]*ELX(d, j, in2, in3);
409 pt.hes[hes_stride*d] += wt1[pN+j]*ELX(d, j, in2, in3);
412 const int hes_off = hes_stride*d+hes_stride / 2;
413 for (
int k = 0; k < pN; ++k)
415 pt.jac[3*d+1] += wt2[k]*ELX(d, in1, k, in3);
416 pt.hes[hes_off] += wt2[pN+k]*ELX(d, in1, k, in3);
419 for (
int l = 0; l < pN; ++l)
421 pt.jac[3*d+2] += wt3[l]*ELX(d, in1, in2, l);
422 pt.hes[hes_stride*d+5] += wt3[pN+l]*ELX(d, in1, in2, l);
425 for (
int l = 0; l < pN; ++l)
427 double sum_k = 0, sum_j = 0;
428 for (
int k = 0; k < pN; ++k)
430 sum_k += wt2[k]*ELX(d, in1, k, l);
432 for (
int j = 0; j < pN; ++j)
434 sum_j += wt1[j]*ELX(d, j, in2, l);
436 pt.hes[hes_stride*d+2] += wt3[l]*sum_j;
437 pt.hes[hes_stride*d+4] += wt3[l]*sum_k;
439 for (
int k = 0; k < pN; ++k)
442 for (
int j = 0; j < pN; ++j)
444 sum_j += wt1[j]*ELX(d, j, k, in3);
446 pt.hes[hes_stride*d+1] += wt2[k]*sum_j;
457static MFEM_HOST_DEVICE
bool reject_prior_step_q(findptsPt *res,
458 const double resid[3],
462 const double dist2 = l2norm2(resid);
463 const double decr =
p->dist2-dist2;
464 const double pred =
p->dist2p;
465 for (
int d = 0; d < 3; ++d)
468 res->oldr[d] =
p->r[d];
471 if (decr >= 0.01*pred)
473 if (decr >= 0.9*pred)
491 double v0 = fabs(
p->r[0]-
p->oldr[0]);
492 double v1 = fabs(
p->r[1]-
p->oldr[1]);
493 double v2 = fabs(
p->r[2]-
p->oldr[2]);
494 res->tr = (v1 > v2 ? (v0 > v1 ? v0 : v1) : (v0 > v2 ? v0 : v2)) / 4;
495 res->dist2 =
p->dist2;
496 for (
int d = 0; d < 3; ++d)
498 res->r[d] =
p->oldr[d];
500 res->flags =
p->flags >> 7;
501 res->dist2p = -HUGE_VAL;
502 if (pred < dist2*tol)
504 res->flags |= CONVERGED_FLAG;
512static MFEM_HOST_DEVICE
void newton_vol(findptsPt *
const res,
514 const double resid[3],
515 const findptsPt *
const p,
518 const double tr =
p->tr;
519 double bnd[6] = {-1, 1, -1, 1, -1, 1};
523 r0[0] =
p->r[0], r0[1] =
p->r[1], r0[2] =
p->r[2];
526 for (d = 0; d < 3; ++d)
530 bnd[2*d] = r0[d]-tr, mask ^= 1u << (2*d);
534 bnd[2*d+1] = r0[d]+tr, mask ^= 2u << (2*d);
538 lin_solve_3(dr, jac, resid);
541 for (d = 0; d < 3; ++d)
543 double nr = r0[d]+dr[d];
544 if ((nr-bnd[2*d])*(bnd[2*d+1]-nr) >= 0)
550 double f = (bnd[2*d]-r0[d]) / dr[d];
553 fac =
f, flags = 1u << (2*d);
558 double f = (bnd[2*d+1]-r0[d]) / dr[d];
561 fac =
f, flags = 2u << (2*d);
571 for (d = 0; d < 3; ++d)
578 const int fi = face_index(flags);
579 const int dn = fi >> 1, d1 = plus_1_mod_3(dn), d2 = plus_2_mod_3(dn);
580 double drc[2], facc = 1;
582 double ress[3], y[2], JtJ[3];
583 ress[0] = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]+jac[2]*dr[2]);
584 ress[1] = resid[1]-(jac[3]*dr[0]+jac[4]*dr[1]+jac[5]*dr[2]);
585 ress[2] = resid[2]-(jac[6]*dr[0]+jac[7]*dr[1]+jac[8]*dr[2]);
587 y[0] = jac[d1]*ress[0]+jac[3+d1]*ress[1]+jac[6+d1]*ress[2];
588 y[1] = jac[d2]*ress[0]+jac[3+d2]*ress[1]+jac[6+d2]*ress[2];
590 JtJ[0] = jac[d1]*jac[d1]+jac[3+d1]*jac[3+d1] +
591 jac[6+d1]*jac[6 +d1];
592 JtJ[1] = jac[d1]*jac[d2]+jac[3+d1]*jac[3+d2] +
594 JtJ[2] = jac[d2]*jac[d2]+jac[3+d2]*jac[3+d2] +
596 lin_solve_sym_2(drc, JtJ, y);
597#define CHECK_CONSTRAINT(drcd, d3) \
599 const double rz = r0[d3]+dr[d3], lb = bnd[2*d3], ub = bnd[2*d3+1]; \
600 const double delta = drcd, nr = r0[d3]+(dr[d3]+delta); \
601 if ((nr-lb)*(ub-nr) < 0) { \
603 double f = (lb-rz) / delta; \
605 fac = f; new_flags = 1u << (2*d3); \
609 double f = (ub-rz) / delta; \
611 fac = f; new_flags = 2u << (2*d3); \
616 CHECK_CONSTRAINT(drc[0], d1);
617 CHECK_CONSTRAINT(drc[1], d2);
618 dr[d1] += facc*drc[0], dr[d2] += facc*drc[1];
628 const int ei = edge_index(flags);
629 const int de = ei >> 2;
632 double ress[3], y, JtJ, drc;
633 ress[0] = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]+jac[2]*dr[2]);
634 ress[1] = resid[1]-(jac[3]*dr[0]+jac[4]*dr[1]+jac[5]*dr[2]);
635 ress[2] = resid[2]-(jac[6]*dr[0]+jac[7]*dr[1]+jac[8]*dr[2]);
637 y = jac[de]*ress[0]+jac[3+de]*ress[1]+jac[6+de]*ress[2];
639 JtJ = jac[de]*jac[de]+jac[3+de]*jac[3+de] +
642 CHECK_CONSTRAINT(drc, de);
643#undef CHECK_CONSTRAINT
646 goto newton_vol_relax;
652 const int old_flags = flags;
653 double ress[3], y[3];
655 ress[0] = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]+jac[2]*dr[2]);
656 ress[1] = resid[1]-(jac[3]*dr[0]+jac[4]*dr[1]+jac[5]*dr[2]);
657 ress[2] = resid[2]-(jac[6]*dr[0]+jac[7]*dr[1]+jac[8]*dr[2]);
659 y[0] = jac[0]*ress[0]+jac[3]*ress[1]+jac[6]*ress[2];
660 y[1] = jac[1]*ress[0]+jac[4]*ress[1]+jac[7]*ress[2];
661 y[2] = jac[2]*ress[0]+jac[5]*ress[1]+jac[8]*ress[2];
662 for (
int dd = 0; dd < 3; ++dd)
664 int f = flags >> (2*dd) & 3u;
667 dr[dd] = bnd[2*dd+(
f-1)]-r0[dd];
668 if (dr[dd]*y[dd] < 0)
670 flags &= ~(3u << (2*dd));
674 if (flags == old_flags)
678 switch (num_constrained(flags))
681 goto newton_vol_face;
683 goto newton_vol_edge;
689 if (fabs(dr[0])+fabs(dr[1])+fabs(dr[2]) < tol)
691 flags |= CONVERGED_FLAG;
694 const double res0 = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1] +
696 const double res1 = resid[1]-(jac[3]*dr[0]+jac[4]*dr[1] +
698 const double res2 = resid[2]-(jac[6]*dr[0]+jac[7]*dr[1] +
700 res->dist2p = resid[0]*resid[0]+resid[1]*resid[1] +
702 (res0*res0+res1*res1+res2*res2);
704 for (
int dd = 0; dd < 3; ++dd)
706 int f = flags >> (2*dd) & 3u;
707 res->r[dd] =
f == 0 ? r0[dd]+dr[dd] : (
f == 1 ? -1 : 1);
709 res->flags = flags | (
p->flags << 7);
713static MFEM_HOST_DEVICE
void newton_face(findptsPt *
const res,
715 const double rhes[3],
716 const double resid[3],
721 const findptsPt *
const p,
724 const double tr =
p->tr;
726 double r[2], dr[2] = {0, 0};
730 double A[3], y[2], r0[2];
732 A[0] = jac[d1]*jac[d1]+jac[3+d1]*jac[3+d1] +
733 jac[6+d1]*jac[6+d1]-rhes[0];
734 A[1] = jac[d1]*jac[d2]+jac[3+d1]*jac[3+d2] +
735 jac[6+d1]*jac[6+d2]-rhes[1];
736 A[2] = jac[d2]*jac[d2]+jac[3+d2]*jac[3+d2] +
737 jac[6+d2]*jac[6+d2]-rhes[2];
739 y[0] = jac[d1]*resid[0]+jac[3+d1]*resid[1]+jac[6+d1]*resid[2];
740 y[1] = jac[d2]*resid[0]+jac[3+d2]*resid[1]+jac[6+d2]*resid[2];
783 if (A[0]+A[2] <= 0 || A[0]*A[2] <= A[1]*A[1])
785 goto newton_face_constrained;
787 lin_solve_sym_2(dr, A, y);
789#define EVAL(r, s) -(y[0]*r+y[1]*s)+(r*A[0]*r+(2*r*A[1]+s*A[2])*s) / 2
790 if ((dr[0]-bnd[0])*(bnd[1]-dr[0]) >= 0 &&
791 (dr[1]-bnd[2])*(bnd[3]-dr[1]) >= 0)
793 r[0] = r0[0]+dr[0], r[1] = r0[1]+dr[1];
794 v = EVAL(dr[0], dr[1]);
795 goto newton_face_fin;
797newton_face_constrained:
798 v = EVAL(bnd[0], bnd[2]);
800 tv = EVAL(bnd[1], bnd[2]);
806 tv = EVAL(bnd[0], bnd[3]);
812 tv = EVAL(bnd[1], bnd[3]);
821 drc = (y[0]-A[1]*bnd[2]) / A[0];
822 if ((drc-bnd[0])*(bnd[1]-drc) >= 0 && (tv = EVAL(drc, bnd[2])) < v)
828 drc = (y[0]-A[1]*bnd[3]) / A[0];
829 if ((drc-bnd[0])*(bnd[1]-drc) >= 0 && (tv = EVAL(drc, bnd[3])) < v)
839 drc = (y[1]-A[1]*bnd[0]) / A[2];
840 if ((drc-bnd[2])*(bnd[3]-drc) >= 0 && (tv = EVAL(bnd[0], drc)) < v)
846 drc = (y[1]-A[1]*bnd[1]) / A[2];
847 if ((drc-bnd[2])*(bnd[3]-drc) >= 0 && (tv = EVAL(bnd[1], drc)) < v)
860 for (
int d = 0; d < 2; ++d)
862 const int f = i >> (2*d) & 3u;
869 if ((
f & (mask >> (2*d))) == 0)
871 r[d] = r0[d]+(
f == 1 ? -tr : tr);
875 r[d] = (
f == 1 ? -1 : 1);
876 new_flags |=
f << (2*dir[d]);
883 dr[0] = r[0]-
p->r[d1];
884 dr[1] = r[1]-
p->r[d2];
885 if (fabs(dr[0])+fabs(dr[1]) < tol)
887 new_flags |= CONVERGED_FLAG;
889 res->r[dn] =
p->r[dn];
892 res->flags = new_flags | (
p->flags << 7);
896static MFEM_HOST_DEVICE
inline void newton_edge(findptsPt *
const res,
899 const double resid[3],
904 const findptsPt *
const p,
907 const double tr =
p->tr;
909 const double A = jac[de]*jac[de]+jac[3+de]*jac[3+de]+jac[6+de] *
912 const double y = jac[de]*resid[0]+jac[3+de]*resid[1]+jac[6+de] *
915 const double oldr =
p->r[de];
916 double dr, nr, tdr, tnr;
918 int new_flags = 0, tnew_flags = 0;
920#define EVAL(dr) (dr*A-2*y)*dr
927 if (fabs(dr) < tr && fabs(nr) < 1)
930 goto newton_edge_fin;
934 if ((nr = oldr-tr) > -1)
942 new_flags = flags | 1u << (2*de);
946 if ((tnr = oldr+tr) < 1)
954 tnew_flags = flags | 2u << (2*de);
963 new_flags = tnew_flags;
970 new_flags |= CONVERGED_FLAG;
973 res->r[dn1] =
p->r[dn1];
974 res->r[dn2] =
p->r[dn2];
976 res->flags = flags | new_flags | (
p->flags << 7);
980static MFEM_HOST_DEVICE
void seed_j(
const double *elx[3],
991 for (
int l = 0; l < pN; ++l)
993 const double zt = z[l];
994 for (
int k = 0; k < pN; ++k)
998 const int jkl = j+k*pN+l*pN*pN;
1000 for (
int d = 0; d < 3; ++d)
1002 dx[d] = x[d]-elx[d][jkl];
1004 const double dist2_jkl = l2norm2(dx);
1005 if (dist2[j] > dist2_jkl)
1007 dist2[j] = dist2_jkl;
1018static MFEM_HOST_DEVICE
double tensor_ig3_j(
double *g_partials,
1032 for (
int k = 0; k < pN; ++k)
1036 for (
int l = 0; l < pN; ++l)
1038 uJt +=
u[j+k*pN+l*pN*pN]*Jt[l];
1039 uDt +=
u[j+k*pN+l*pN*pN]*Dt[l];
1047 g_partials[0] = uJtJs*Dr[j];
1048 g_partials[1] = uJtDs*Jr[j];
1049 g_partials[2] = uDtJs*Jr[j];
1053template<
int T_D1D = 0>
1054static void FindPointsLocal3DKernel(
const int npt,
1057 const int point_pos_ordering,
1058 const double *xElemCoord,
1060 const double *wtend,
1061 const double *boxinfo,
1063 const double *hashMin,
1064 const double *hashFac,
1065 unsigned int *hashOffset,
1066 unsigned int *
const code_base,
1067 unsigned int *
const el_base,
1068 double *
const r_base,
1069 double *
const dist2_base,
1070 const double *gll1D,
1071 const double *lagcoeff,
1074 const int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
1075 const int D1D = T_D1D ? T_D1D : pN;
1076 const int p_NE = D1D*D1D*D1D;
1077 MFEM_VERIFY(MD1 <= DofQuadLimits::MAX_D1D,
1078 "Increase Max allowable polynomial order.");
1079 MFEM_VERIFY(D1D != 0,
"Polynomial order not specified.");
1080#define MAXC(a, b) (((a) > (b)) ? (a) : (b))
1081 const int nThreads = MAXC(D1D*
DIM, 15);
1086 constexpr int size1 = 21*MD1+15;
1088 constexpr int size2 = MAXC(MD1*MD1*6, MD1*3*5);
1090 constexpr int size3 = MD1*MD1*MD1*
DIM;
1092 MFEM_SHARED
double r_workspace[size1];
1093 MFEM_SHARED findptsPt el_pts[2];
1095 MFEM_SHARED
double constraint_workspace[size2];
1096 MFEM_SHARED
int face_edge_init;
1098 MFEM_SHARED
double elem_coords[MD1 <= 6 ? size3 : 1];
1100 double *r_workspace_ptr;
1101 findptsPt *fpt, *tmp;
1102 MFEM_FOREACH_THREAD(j,x,nThreads)
1104 r_workspace_ptr = r_workspace;
1110 int id_x = point_pos_ordering == 0 ? i : i*
DIM;
1111 int id_y = point_pos_ordering == 0 ? i+npt : i*
DIM+1;
1112 int id_z = point_pos_ordering == 0 ? i+2*npt : i*
DIM+2;
1113 double x_i[3] = {x[id_x], x[id_y], x[id_z]};
1115 unsigned int *code_i = code_base+i;
1116 double *dist2_i = dist2_base+i;
1119 findptsLocalHashData_t hash;
1120 for (
int d = 0; d <
DIM; ++d)
1122 hash.bnd[d].min = hashMin[d];
1123 hash.fac[d] = hashFac[d];
1125 hash.hash_n = hash_n;
1126 hash.offset = hashOffset;
1127 const unsigned int hi = hash_index(&hash, x_i);
1128 const unsigned int *elp = hash.offset+hash.offset[hi],
1129 *
const ele = hash.offset+hash.offset[hi+1];
1130 *code_i = CODE_NOT_FOUND;
1131 *dist2_i = HUGE_VAL;
1133 for (; elp != ele; ++elp)
1137 const int el = *elp;
1141 int n_box_ents = 3*
DIM+DIM2;
1142 for (
int idx = 0; idx <
DIM; ++idx)
1144 box.c0[idx] = boxinfo[n_box_ents*el+idx];
1145 box.x[idx].min = boxinfo[n_box_ents*el+
DIM+idx];
1146 box.x[idx].max = boxinfo[n_box_ents*el+2*
DIM+idx];
1149 for (
int idx = 0; idx < DIM2; ++idx)
1151 box.A[idx] = boxinfo[n_box_ents*el+3*
DIM+idx];
1154 if (bbox_test(&box, x_i) < 0) {
continue; }
1161 MFEM_FOREACH_THREAD(j,x,D1D*
DIM)
1163 const int qp = j % D1D;
1164 const int d = j / D1D;
1165 for (
int l = 0; l < D1D; ++l)
1167 for (
int k = 0; k < D1D; ++k)
1169 const int jkl = qp+k*D1D+l*D1D*D1D;
1170 elem_coords[jkl+d*p_NE] =
1171 xElemCoord[jkl+el*p_NE+d*nel*p_NE];
1178 const double *elx[
DIM];
1179 for (
int d = 0; d <
DIM; d++)
1181 elx[d] = MD1<= 6 ? &elem_coords[d*p_NE] :
1182 xElemCoord+d*nel*p_NE+el*p_NE;
1188 MFEM_FOREACH_THREAD(j,x,1)
1190 fpt->dist2 = HUGE_VAL;
1195 MFEM_FOREACH_THREAD(j,x,
DIM)
1203 double *dist2_temp = r_workspace_ptr;
1204 double *r_temp[
DIM];
1205 for (
int d = 0; d <
DIM; ++d)
1207 r_temp[d] = dist2_temp+(1+d)*D1D;
1210 MFEM_FOREACH_THREAD(j,x,D1D)
1212 seed_j(elx, x_i, gll1D, dist2_temp, r_temp, j, D1D);
1216 MFEM_FOREACH_THREAD(j,x,1)
1218 fpt->dist2 = HUGE_VAL;
1219 for (
int jj = 0; jj < D1D; ++jj)
1221 if (dist2_temp[jj] < fpt->dist2)
1223 fpt->dist2 = dist2_temp[jj];
1224 for (
int d = 0; d <
DIM; ++d)
1226 fpt->r[d] = r_temp[d][jj];
1234 MFEM_FOREACH_THREAD(j,x,1)
1236 tmp->dist2 = HUGE_VAL;
1241 MFEM_FOREACH_THREAD(j,x,
DIM)
1243 tmp->x[j] = fpt->x[j];
1244 tmp->r[j] = fpt->r[j];
1248 for (
int step = 0; step < 50; step++)
1250 switch (num_constrained(tmp->flags & FLAG_MASK))
1254 double *wtr = r_workspace_ptr;
1256 double *resid = wtr+6*D1D;
1257 double *jac = resid+3;
1258 double *resid_temp = jac+9;
1259 double *jac_temp = resid_temp+3*D1D;
1261 MFEM_FOREACH_THREAD(j,x,D1D*
DIM)
1263 const int qp = j % D1D;
1264 const int d = j / D1D;
1265 lag_eval_first_der(wtr+2*d*D1D, tmp->r[d], qp,
1266 gll1D, lagcoeff, D1D);
1270 MFEM_FOREACH_THREAD(j,x,D1D*
DIM)
1272 const int qp = j % D1D;
1273 const int d = j / D1D;
1274 double *idx = jac_temp+3*d+9*qp;
1275 resid_temp[d+qp*3] = tensor_ig3_j(idx,
1287 MFEM_FOREACH_THREAD(l,x,3)
1289 resid[l] = tmp->x[l];
1290 for (
int j = 0; j < D1D; ++j)
1292 resid[l] -= resid_temp[l+j*3];
1295 MFEM_FOREACH_THREAD(l,x,9)
1298 for (
int j = 0; j < D1D; ++j)
1300 jac[l] += jac_temp[l+j*9];
1305 MFEM_FOREACH_THREAD(l,x,1)
1309 if (!reject_prior_step_q(fpt, resid, tmp, tol))
1311 newton_vol(fpt, jac, resid, tmp, tol);
1320 const int fi = face_index(tmp->flags & FLAG_MASK);
1321 const int dn = fi >> 1;
1322 const int d1 = plus_1_mod_3(dn), d2 = plus_2_mod_3(dn);
1324 double *wt1 = r_workspace_ptr;
1325 double *resid = wt1+6*D1D;
1326 double *jac = resid+3;
1327 double *resid_temp = jac+9;
1328 double *jac_temp = resid_temp+3*D1D;
1329 double *hes = jac_temp+9*D1D;
1330 double *hes_temp = hes+3;
1333 MFEM_FOREACH_THREAD(j,x,D1D*2)
1338 const int d = j / D1D;
1339 const int qp = j % D1D;
1340 lag_eval_second_der(wt1+3*d*D1D, tmp->r[dd[d]],
1341 qp, gll1D, lagcoeff, D1D);
1345 double *J1 = wt1, *D1 = wt1+D1D;
1346 double *J2 = wt1+3*D1D, *D2 = J2+D1D;
1347 double *DD1 = D1+D1D, *DD2 = D2+D1D;
1348 findptsElemFace face;
1350 MFEM_FOREACH_THREAD(j,x,D1D*
DIM)
1353 face = get_face(elx, wtend, fi, constraint_workspace,
1354 face_edge_init, j, D1D);
1358 MFEM_FOREACH_THREAD(j,x,D1D*
DIM)
1360 if (j == 0) { face_edge_init = (1 << fi); }
1361 const int qp = j % D1D;
1362 const int d = j / D1D;
1363 const double *
u = face.x[d];
1364 const double *du = face.dxdn[d];
1365 double sums_k[4] = {0.0, 0.0, 0.0, 0.0};
1366 for (
int k = 0; k < D1D; ++k)
1368 sums_k[0] +=
u[qp+k*D1D]*J2[k];
1369 sums_k[1] +=
u[qp+k*D1D]*D2[k];
1370 sums_k[2] +=
u[qp+k*D1D]*DD2[k];
1371 sums_k[3] += du[qp+k*D1D]*J2[k];
1374 resid_temp[3*qp+d] = sums_k[0]*J1[qp];
1375 jac_temp[9*qp+3*d+d1] = sums_k[0]*D1[qp];
1376 jac_temp[9*qp+3*d+d2] = sums_k[1]*J1[qp];
1377 jac_temp[9*qp+3*d+dn] = sums_k[3]*J1[qp];
1380 hes_temp[3*qp] = sums_k[0]*DD1[qp];
1381 hes_temp[3*qp+1] = sums_k[1]*D1[qp];
1382 hes_temp[3*qp+2] = sums_k[2]*J1[qp];
1387 MFEM_FOREACH_THREAD(l,x,3)
1389 resid[l] = fpt->x[l];
1391 for (
int j = 0; j < D1D; ++j)
1393 resid[l] -= resid_temp[l+j*3];
1394 hes[l] += hes_temp[l+3*j];
1399 MFEM_FOREACH_THREAD(l,x,9)
1402 for (
int j = 0; j < D1D; ++j)
1404 jac[l] += jac_temp[l+j*9];
1409 MFEM_FOREACH_THREAD(l,x,1)
1411 if (!reject_prior_step_q(fpt, resid, tmp, tol))
1413 const double steep = resid[0]*jac[dn]+
1416 if (steep*tmp->r[dn] < 0)
1419 newton_vol(fpt, jac, resid, tmp, tol);
1423 newton_face(fpt, jac, hes, resid, d1,
1424 d2, dn, tmp->flags&FLAG_MASK,
1434 const int ei = edge_index(tmp->flags & FLAG_MASK);
1435 const int de = ei >> 2,
1436 dn1 = plus_1_mod_3(de),
1437 dn2 = plus_2_mod_3(de);
1442 const int hes_count = 2*3-1;
1444 double *wt = r_workspace_ptr;
1445 double *resid = wt+3*D1D;
1446 double *jac = resid+3;
1447 double *hes_T = jac+9;
1448 double *hes = hes_T+hes_count*3;
1449 findptsElemEdge edge;
1451 MFEM_FOREACH_THREAD(j,x,nThreads)
1454 edge = get_edge(elx, wtend, ei,
1455 constraint_workspace,
1461 const double *
const *e_x[3+3] = {edge.x, edge.x,
1468 MFEM_FOREACH_THREAD(j,x,D1D)
1470 if (j == 0) { face_edge_init = (64 << ei); }
1471 lag_eval_second_der(wt, tmp->r[de], j, gll1D,
1476 MFEM_FOREACH_THREAD(j,x,hes_count*3)
1478 const int d = j % 3;
1479 const int row = j / 3;
1484 double *wt_j = wt+(row == 1 ? D1D : 0);
1485 const double *x = e_x[row][d];
1487 for (
int k = 0; k < D1D; ++k)
1490 sum += wt_j[k]*x[k];
1494 resid[j] = tmp->x[j]-sum;
1498 jac[d*3+d_j[row-1]] = sum;
1506 double *wt_j = wt+D1D*(2-(row+1) / 2);
1507 const double *x = e_x[row+1][d];
1509 for (
int k = 0; k < D1D; ++k)
1511 hes_T[j] += wt_j[k]*x[k];
1517 MFEM_FOREACH_THREAD(j,x,hes_count)
1520 for (
int d = 0; d < 3; ++d)
1522 hes[j] += resid[d]*hes_T[j*3+d];
1528 MFEM_FOREACH_THREAD(l,x,1)
1531 if (!reject_prior_step_q(fpt, resid, tmp, tol))
1535 for (
int k = 0; k < 3-1; ++k)
1539 for (
int d = 0; d < 3; ++d)
1541 steep[k] += jac[dn+d*3]*resid[d];
1543 steep[k] *= tmp->r[dn];
1549 newton_vol(fpt, jac, resid, tmp, tol);
1557 newton_face(fpt, jac, rh, resid, de,
1559 tmp->flags&(3u<<(dn2*2)),
1571 newton_face(fpt, jac, rh, resid, dn2,
1573 tmp->flags&(3u<<(dn1*2)),
1578 newton_edge(fpt, jac, hes[0], resid,
1580 tmp->flags & FLAG_MASK,
1591 MFEM_FOREACH_THREAD(j,x,1)
1594 const int pi=point_index(tmp->flags & FLAG_MASK);
1595 const findptsElemPt gpt=get_pt(elx,wtend,pi,D1D);
1596 const double *
const pt_x = gpt.x;
1597 const double *
const jac = gpt.jac;
1598 const double *
const hes = gpt.hes;
1600 double resid[3], steep[3];
1601 for (
int d = 0; d < 3; ++d)
1603 resid[d] = fpt->x[d]-pt_x[d];
1605 if (!reject_prior_step_q(fpt, resid, tmp, tol))
1607 for (
int d = 0; d < 3; ++d)
1610 for (
int e = 0; e < 3; ++e)
1612 steep[d] += jac[d+e*3]*resid[e];
1614 steep[d] *= tmp->r[d];
1616 int de, dn1, dn2, d1, d2, dn, hi0, hi1, hi2;
1623 newton_vol(fpt,jac,resid,tmp,tol);
1627 d1 = 0; d2 = 1; dn = 2; hi0 = 0;
1630 rh[0] = resid[0]*hes[hi0] +
1631 resid[1]*hes[6+hi0] +
1632 resid[2]*hes[12+hi0];
1633 rh[1] = resid[0]*hes[hi1] +
1634 resid[1]*hes[6+hi1] +
1635 resid[2]*hes[12+hi1];
1636 rh[2] = resid[0]*hes[hi2] +
1637 resid[1]*hes[6+hi2] +
1638 resid[2]*hes[12+hi2];
1639 newton_face(fpt, jac, rh, resid,
1641 (tmp->flags)&(3u<<(2*dn)),
1649 d1 = 2; d2 = 0; dn = 1; hi0 = 5;
1652 rh[0] = resid[0]*hes[hi0] +
1653 resid[1]*hes[6+hi0] +
1654 resid[2]*hes[12+hi0];
1655 rh[1] = resid[0]*hes[hi1] +
1656 resid[1]*hes[6+hi1] +
1657 resid[2]*hes[12+hi1];
1658 rh[2] = resid[0]*hes[hi2] +
1659 resid[1]*hes[6+hi2] +
1660 resid[2]*hes[12+hi2];
1661 newton_face(fpt, jac, rh, resid,
1663 (tmp->flags)&(3u<<(2*dn)),
1668 de = 0, dn1 = 1, dn2 = 2, hi0 = 0;
1671 resid[1]*hes[6+hi0] +
1672 resid[2]*hes[12+hi0];
1673 newton_edge(fpt, jac, rh, resid,
1675 tmp->flags&(~(3u<<(2*de))),
1686 d1 = 1, d2 = 2, dn = 0;
1687 hi0 = 3, hi1 = 4, hi2 = 5;
1689 rh[0] = resid[0]*hes[hi0] +
1690 resid[1]*hes[6+hi0] +
1691 resid[2]*hes[12+hi0];
1692 rh[1] = resid[0]*hes[hi1] +
1693 resid[1]*hes[6+hi1] +
1694 resid[2]*hes[12+hi1];
1695 rh[2] = resid[0]*hes[hi2] +
1696 resid[1]*hes[6+hi2] +
1697 resid[2]*hes[12+hi2];
1698 newton_face(fpt, jac, rh, resid,
1700 (tmp->flags)&(3u<<(2*dn)),
1705 de = 1, dn1 = 2, dn2 = 0, hi0 = 3;
1708 resid[1]*hes[6+hi0] +
1709 resid[2]*hes[12+hi0];
1710 newton_edge(fpt, jac, rh, resid,
1712 tmp->flags&(~(3u<<(2*de))),
1720 de = 2, dn1 = 0, dn2 = 1, hi0 = 5;
1723 resid[1]*hes[6+hi0] +
1724 resid[2]*hes[12+hi0];
1725 newton_edge(fpt, jac, rh, resid,
1727 tmp->flags&(~(3u<<(2*de))),
1732 fpt->r[0] = tmp->r[0];
1733 fpt->r[1] = tmp->r[1];
1734 fpt->r[2] = tmp->r[2];
1736 fpt->flags =tmp->flags|CONVERGED_FLAG;
1746 if (fpt->flags & CONVERGED_FLAG)
1751 MFEM_FOREACH_THREAD(j,x,1)
1759 bool converged_internal = (fpt->flags&FLAG_MASK)==CONVERGED_FLAG;
1760 if (*code_i == CODE_NOT_FOUND || converged_internal ||
1761 fpt->dist2 < *dist2_i)
1763 MFEM_FOREACH_THREAD(j,x,1)
1766 *code_i = converged_internal ? CODE_INTERNAL :
1768 *dist2_i = fpt->dist2;
1770 MFEM_FOREACH_THREAD(j,x,
DIM)
1772 *(r_base+
DIM*i+j) = fpt->r[j];
1775 if (converged_internal)
1786 int point_pos_ordering,
1795 auto pp = point_pos.
Read();
1797 auto pwt =
DEV.wtend.Read();
1798 auto pbb =
DEV.bb.Read();
1799 auto plhm =
DEV.loc_hash_min.Read();
1800 auto plhf =
DEV.loc_hash_fac.Read();
1801 auto plho =
DEV.loc_hash_offset.ReadWrite();
1802 auto pcode = code.
Write();
1803 auto pelem = elem.
Write();
1804 auto pref = ref.
Write();
1805 auto pdist = dist.
Write();
1806 auto pgll1d =
DEV.gll1d.ReadWrite();
1807 auto plc =
DEV.lagcoeff.Read();
1811 FindPointsLocal3DKernel<2>(npt,
DEV.newt_tol, pp, point_pos_ordering,
1813 plhf, plho, pcode, pelem, pref, pdist, pgll1d,
1817 FindPointsLocal3DKernel<3>(npt,
DEV.newt_tol, pp, point_pos_ordering,
1819 plhf, plho, pcode, pelem, pref, pdist, pgll1d,
1823 FindPointsLocal3DKernel<4>(npt,
DEV.newt_tol, pp, point_pos_ordering,
1825 plhf, plho, pcode, pelem, pref, pdist, pgll1d,
1829 FindPointsLocal3DKernel<5>(npt,
DEV.newt_tol, pp, point_pos_ordering,
1831 plhf, plho, pcode, pelem, pref, pdist, pgll1d,
1835 FindPointsLocal3DKernel(npt,
DEV.newt_tol, pp, point_pos_ordering, pgslm,
1837 plho, pcode, pelem, pref, pdist, pgll1d, plc,
1846#undef CODE_NOT_FOUND
1849 int point_pos_ordering,
1852 Vector &dist,
int npt) {};
T * Write(bool on_dev=true)
Shortcut for mfem::Write(a.GetMemory(), a.Size(), on_dev).
struct mfem::FindPointsGSLIB::@24 DEV
void FindPointsLocal3(const Vector &point_pos, int point_pos_ordering, Array< unsigned int > &gsl_code_dev_l, Array< unsigned int > &gsl_elem_dev_l, Vector &gsl_ref_l, Vector &gsl_dist_l, int npt)
virtual const real_t * Read(bool on_dev=true) const
Shortcut for mfem::Read(vec.GetMemory(), vec.Size(), on_dev).
virtual real_t * Write(bool on_dev=true)
Shortcut for mfem::Write(vec.GetMemory(), vec.Size(), on_dev).
real_t u(const Vector &xvec)
void forall_2D(int N, int X, int Y, lambda &&body)
std::function< real_t(const Vector &)> f(real_t mass_coeff)
real_t p(const Vector &x, real_t t)