MFEM v4.8.0
Finite element discretization library
Loading...
Searching...
No Matches
findpts_local_3.cpp
Go to the documentation of this file.
1// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
2// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3// LICENSE and NOTICE for details. LLNL-CODE-806117.
4//
5// This file is part of the MFEM library. For more information and source code
6// availability visit https://mfem.org.
7//
8// MFEM is free software; you can redistribute it and/or modify it under the
9// terms of the BSD-3 license. We welcome feedback and contributions, see file
10// CONTRIBUTING.md for details.
11
12#include "../gslib.hpp"
15
16#include <climits>
17
18#ifdef MFEM_USE_GSLIB
19
20#ifdef MFEM_HAVE_GCC_PRAGMA_DIAGNOSTIC
21#pragma GCC diagnostic push
22#pragma GCC diagnostic ignored "-Wunused-function"
23#endif
24#include "gslib.h"
25#ifndef GSLIB_RELEASE_VERSION //gslib v1.0.7
26#define GSLIB_RELEASE_VERSION 10007
27#endif
28#ifdef MFEM_HAVE_GCC_PRAGMA_DIAGNOSTIC
29#pragma GCC diagnostic pop
30#endif
31namespace mfem
32{
33#if GSLIB_RELEASE_VERSION >= 10009
34#define CODE_INTERNAL 0
35#define CODE_BORDER 1
36#define CODE_NOT_FOUND 2
37#define DIM 3
38#define DIM2 DIM*DIM
39#define pMax 10
40
41struct findptsPt
42{
43 double x[DIM], r[DIM], oldr[DIM], dist2, dist2p, tr;
44 int flags;
45};
46
47struct findptsElemFace
48{
49 double *x[DIM], *dxdn[DIM];
50};
51
52struct findptsElemEdge
53{
54 double *x[DIM], *dxdn1[DIM], *dxdn2[DIM], *d2xdn1[DIM], *d2xdn2[DIM];
55};
56
57struct findptsElemPt
58{
59 double x[DIM], jac[DIM * DIM], hes[18];
60};
61
62struct dbl_range_t
63{
64 double min, max;
65};
66
67struct obbox_t
68{
69 double c0[DIM], A[DIM * DIM];
70 dbl_range_t x[DIM];
71};
72
73struct findptsLocalHashData_t
74{
75 int hash_n;
76 dbl_range_t bnd[DIM];
77 double fac[DIM];
78 unsigned int *offset;
79 // int max;
80};
81
82// Eval the ith Lagrange interpolant and its first derivative at x.
83// Note: lCoeff stores pre-computed coefficients for fast evaluation.
84static MFEM_HOST_DEVICE inline void lag_eval_first_der(double *p0, double x,
85 int i, const double *z,
86 const double *lCoeff,
87 int pN)
88{
89 double u0 = 1, u1 = 0;
90 for (int j = 0; j < pN; ++j)
91 {
92 if (i != j)
93 {
94 double d_j = 2*(x-z[j]);
95 u1 = d_j*u1+u0;
96 u0 = d_j*u0;
97 }
98 }
99 p0[i] = lCoeff[i]*u0;
100 p0[pN+i] = 2.0*lCoeff[i]*u1;
101}
102
103// Eval the ith Lagrange interpolant and its first and second derivative at x.
104// Note: lCoeff stores pre-computed coefficients for fast evaluation.
105static MFEM_HOST_DEVICE inline void lag_eval_second_der(double *p0, double x,
106 int i, const double *z,
107 const double *lCoeff,
108 int pN)
109{
110 double u0 = 1, u1 = 0, u2 = 0;
111 for (int j = 0; j < pN; ++j)
112 {
113 if (i != j)
114 {
115 double d_j = 2*(x-z[j]);
116 u2 = d_j*u2+u1;
117 u1 = d_j*u1+u0;
118 u0 = d_j*u0;
119 }
120 }
121 p0[i] = lCoeff[i]*u0;
122 p0[pN+i] = 2.0*lCoeff[i]*u1;
123 p0[2*pN+i] = 8.0*lCoeff[i]*u2;
124}
125
126// Axis-aligned bounding box test.
127static MFEM_HOST_DEVICE inline double AABB_test(const obbox_t *const b,
128 const double x[3])
129{
130 double b_d;
131 for (int d = 0; d < 3; ++d)
132 {
133 b_d = (x[d]-b->x[d].min)*(b->x[d].max-x[d]);
134 if (b_d < 0) { return b_d; }
135 }
136 return b_d;
137}
138
139// Axis-aligned bounding box test followed by oriented bounding-box test.
140static MFEM_HOST_DEVICE inline double bbox_test(const obbox_t *const b,
141 const double x[3])
142{
143 const double bxyz = AABB_test(b, x);
144 if (bxyz < 0)
145 {
146 return bxyz;
147 }
148 else
149 {
150 double dxyz[3];
151 for (int d = 0; d < 3; ++d)
152 {
153 dxyz[d] = x[d]-b->c0[d];
154 }
155 double test = 1;
156 for (int d = 0; d < 3; ++d)
157 {
158 double rst = 0;
159 for (int e = 0; e < 3; ++e)
160 {
161 rst += b->A[d*3+e]*dxyz[e];
162 }
163 double brst = (rst+1)*(1-rst);
164 test = test < 0 ? test : brst;
165 }
166 return test;
167 }
168}
169
170// Element index corresponding to hash mesh that the point is located in.
171static MFEM_HOST_DEVICE inline int hash_index(const findptsLocalHashData_t *p,
172 const double x[3])
173{
174 const int n = p->hash_n;
175 int sum = 0;
176 for (int d = 3-1; d >= 0; --d)
177 {
178 sum *= n;
179 int i = (int)floor((x[d]-p->bnd[d].min)*p->fac[d]);
180 sum += i < 0 ? 0 : (n-1 < i ? n-1 : i);
181 }
182 return sum;
183}
184
185// Solve Ax=y. A is row-major.
186static MFEM_HOST_DEVICE inline void lin_solve_3(double x[3], const double A[9],
187 const double y[3])
188{
189 const double a = A[4]*A[8]-A[5]*A[7], b = A[5]*A[6]-A[3]*A[8],
190 c = A[3]*A[7]-A[4]*A[6],
191 idet = 1 / (A[0]*a+A[1]*b+A[2]*c);
192 const double inv0 = a, inv1 = A[2]*A[7]-A[1]*A[8],
193 inv2 = A[1]*A[5]-A[2]*A[4], inv3 = b,
194 inv4 = A[0]*A[8]-A[2]*A[6], inv5 = A[2]*A[3]-A[0]*A[5],
195 inv6 = c, inv7 = A[1]*A[6]-A[0]*A[7],
196 inv8 = A[0]*A[4]-A[1]*A[3];
197 x[0] = idet*(inv0*y[0]+inv1*y[1]+inv2*y[2]);
198 x[1] = idet*(inv3*y[0]+inv4*y[1]+inv5*y[2]);
199 x[2] = idet*(inv6*y[0]+inv7*y[1]+inv8*y[2]);
200}
201
202// Solve Ax=y. A is a symmetric 2x2 matrix.
203static MFEM_HOST_DEVICE inline void lin_solve_sym_2(double x[2],
204 const double A[3],
205 const double y[2])
206{
207 const double idet = 1 / (A[0]*A[2]-A[1]*A[1]);
208 x[0] = idet*(A[2]*y[0]-A[1]*y[1]);
209 x[1] = idet*(A[0]*y[1]-A[1]*y[0]);
210}
211
212// L2 norm.
213static MFEM_HOST_DEVICE inline double l2norm2(const double x[3])
214{
215 return x[0]*x[0]+x[1]*x[1]+x[2]*x[2];
216}
217
218/* the bit structure of flags is CTTSSRR
219 the C bit --- 1<<6 --- is set when the point is converged
220 RR is 0 = 00b if r is unconstrained,
221 1 = 01b if r is constrained at -1
222 2 = 10b if r is constrained at +1
223 SS, TT are similarly for s and t constraints
224 TT is ignored, but treated as set when 3==2
225*/
226#define CONVERGED_FLAG (1u << 6)
227#define FLAG_MASK 0x7fu // set all 7 bits to 1
228
229// Determine number of constraints based on the CTTSSRR bits.
230static MFEM_HOST_DEVICE inline int num_constrained(const int flags)
231{
232 const int y = flags | flags >> 1;
233 return (y & 1u)+(y >> 2 & 1u)+(((3 == 2) | y >> 4) & 1u);
234}
235
236// Helper functions. Assumes x = 0, 1, or 2.
237static MFEM_HOST_DEVICE inline int plus_1_mod_3(const int x)
238{
239 return ((x | x >> 1)+1) & 3u;
240}
241static MFEM_HOST_DEVICE inline int plus_2_mod_3(const int x)
242{
243 const int y = (x-1) & 3u;
244 return y ^ (y >> 1);
245}
246
247// Assumes x = 1 << i, with i < 6, returns i+1.
248static MFEM_HOST_DEVICE inline int which_bit(const int x)
249{
250 const int y = x & 7u;
251 return (y-(y >> 2)) | ((x-1) & 4u) | (x >> 4);
252}
253
254// Get face index based on the TTSSRR bits.
255static MFEM_HOST_DEVICE inline int face_index(const int x)
256{
257 return which_bit(x)-1;
258}
259
260// Get edge index based on the TTSSRR bits.
261static MFEM_HOST_DEVICE inline int edge_index(const int x)
262{
263 const int y = ~((x >> 1) | x);
264 const int RTSR = ((x >> 1) & 1u) | ((x >> 2) & 2u) |
265 ((x >> 3) & 4u) | ((x << 2) & 8u);
266 const int re = RTSR >> 1;
267 const int se = 4u | RTSR >> 2;
268 const int te = 8u | (RTSR & 3u);
269 return ((0u-(y & 1u)) & re) | ((0u-((y >> 2) & 1u)) & se) |
270 ((0u-((y >> 4) & 1u)) & te);
271}
272
273// Get point index based on the TTSSRR bits.
274static MFEM_HOST_DEVICE inline int point_index(const int x)
275{
276 return ((x >> 1) & 1u) | ((x >> 2) & 2u) | ((x >> 3) & 4u);
277}
278
279// Gets mesh nodal coordinates and normal derivative in reference-space at the
280// given face.
281static MFEM_HOST_DEVICE inline findptsElemFace
282get_face(const double *elx[3], const double *wtend, int fi, double *workspace,
283 int &side_init, int jidx, int pN) // jidx < 3*pN
284{
285 const int dn = fi >> 1, d1 = plus_1_mod_3(dn), d2 = plus_2_mod_3(dn);
286 const int side_n = fi & 1;
287 const int p_Nfr = pN*pN;
288 findptsElemFace face;
289 const int jj = jidx % pN;
290 const int dd = jidx / pN;
291 for (int d = 0; d < 3; ++d)
292 {
293 face.x[d] = workspace+d*p_Nfr;
294 face.dxdn[d] = workspace+(3+d)*p_Nfr;
295 }
296
297 if (side_init != (1u << fi))
298 {
299 const int e_stride[3] = {1, pN, pN*pN};
300#define ELX(d, j, k, l) elx[d][j*e_stride[d1]+k*e_stride[d2]+l*e_stride[dn]]
301 for (int k = 0; k < pN; ++k)
302 {
303 // copy first/last entries in normal direction
304 face.x[dd][jj+k*pN] = ELX(dd, jj, k, side_n*(pN-1));
305
306 // tensor product between elx and derivative in normal direction
307 double sum_l = 0;
308 for (int l = 0; l < pN; ++l)
309 {
310 sum_l += wtend[pN+l]*ELX(dd, jj, k, l);
311 }
312 face.dxdn[dd][jj+k*pN] = sum_l;
313 }
314#undef ELX
315 }
316 return face;
317}
318
319// Gets mesh nodal coordinates and normal derivatives in reference-space at the
320// given edge.
321static MFEM_HOST_DEVICE inline findptsElemEdge
322get_edge(const double *elx[3], const double *wtend, int ei, double *workspace,
323 int &side_init, int jidx, int pN)
324{
325 findptsElemEdge edge;
326 const int de = ei >> 2, dn1 = plus_1_mod_3(de), dn2 = plus_2_mod_3(de);
327 const int side_n1 = ei & 1, side_n2 = (ei & 2) >> 1;
328
329 const int in1 = side_n1*(pN-1), in2 = side_n2*(pN-1);
330 const double *wt1 = wtend+side_n1*pN*3;
331 const double *wt2 = wtend+side_n2*pN*3;
332 const int jj = jidx % pN;
333 const int dd = jidx / pN;
334 for (int d = 0; d < 3; ++d)
335 {
336 edge.x[d] = workspace+d*pN;
337 edge.dxdn1[d] = workspace+(3+d)*pN;
338 edge.dxdn2[d] = workspace+(6+d)*pN;
339 edge.d2xdn1[d] = workspace+(9+d)*pN;
340 edge.d2xdn2[d] = workspace+(12+d)*pN;
341 }
342
343 if (jidx >= 3*pN) { return edge; }
344
345 if (side_init != (64u << ei))
346 {
347 const int e_stride[3] = {1, pN, pN*pN};
348#define ELX(d, j, k, l) elx[d][j*e_stride[de]+k*e_stride[dn1]+l*e_stride[dn2]]
349 // copy first/last entries in normal directions
350 edge.x[dd][jj] = ELX(dd, jj, in1, in2);
351 // tensor product between elx (w/ first/last entries in second direction)
352 // and the derivatives in the first normal direction
353 double sums_k[2] = {0, 0};
354 for (int k = 0; k < pN; ++k)
355 {
356 sums_k[0] += wt1[pN+k]*ELX(dd, jj, k, in2);
357 sums_k[1] += wt1[2*pN+k]*ELX(dd, jj, k, in2);
358 }
359 edge.dxdn1[dd][jj] = sums_k[0];
360 edge.d2xdn1[dd][jj] = sums_k[1];
361 // tensor product between elx (w/ first/last entries in first direction)
362 // and the derivatives in the second normal direction
363 sums_k[0] = 0, sums_k[1] = 0;
364 for (int k = 0; k < pN; ++k)
365 {
366 sums_k[0] += wt2[pN+k]*ELX(dd, jj, in1, k);
367 sums_k[1] += wt2[2*pN+k]*ELX(dd, jj, in1, k);
368 }
369 edge.dxdn2[dd][jj] = sums_k[0];
370 edge.d2xdn2[dd][jj] = sums_k[1];
371#undef ELX
372 }
373 return edge;
374}
375
376// Gets nodal coordinate and derivatives at the given vertex of the element.
377static MFEM_HOST_DEVICE inline findptsElemPt get_pt(const double *elx[3],
378 const double *wtend,
379 int pi, int pN)
380{
381 const int side_n1 = pi & 1, side_n2 = (pi >> 1) & 1, side_n3 = (pi >> 2) & 1;
382 const int in1 = side_n1*(pN-1), in2 = side_n2*(pN-1),
383 in3 = side_n3*(pN-1);
384 const int hes_stride = (3+1)*3 / 2;
385 findptsElemPt pt;
386
387#define ELX(d, j, k, l) elx[d][j+k*pN+l*pN*pN]
388 for (int d = 0; d < 3; ++d)
389 {
390 pt.x[d] = ELX(d, side_n1*(pN-1), side_n2*(pN-1),
391 side_n3*(pN-1));
392
393 const double *wt1 = wtend+pN*(1+3*side_n1);
394 const double *wt2 = wtend+pN*(1+3*side_n2);
395 const double *wt3 = wtend+pN*(1+3*side_n3);
396
397 for (int i = 0; i < 3; ++i)
398 {
399 pt.jac[3*d+i] = 0;
400 }
401 for (int i = 0; i < hes_stride; ++i)
402 {
403 pt.hes[hes_stride*d+i] = 0;
404 }
405
406 for (int j = 0; j < pN; ++j)
407 {
408 pt.jac[3*d+0] += wt1[j]*ELX(d, j, in2, in3);
409 pt.hes[hes_stride*d] += wt1[pN+j]*ELX(d, j, in2, in3);
410 }
411
412 const int hes_off = hes_stride*d+hes_stride / 2;
413 for (int k = 0; k < pN; ++k)
414 {
415 pt.jac[3*d+1] += wt2[k]*ELX(d, in1, k, in3);
416 pt.hes[hes_off] += wt2[pN+k]*ELX(d, in1, k, in3);
417 }
418
419 for (int l = 0; l < pN; ++l)
420 {
421 pt.jac[3*d+2] += wt3[l]*ELX(d, in1, in2, l);
422 pt.hes[hes_stride*d+5] += wt3[pN+l]*ELX(d, in1, in2, l);
423 }
424
425 for (int l = 0; l < pN; ++l)
426 {
427 double sum_k = 0, sum_j = 0;
428 for (int k = 0; k < pN; ++k)
429 {
430 sum_k += wt2[k]*ELX(d, in1, k, l);
431 }
432 for (int j = 0; j < pN; ++j)
433 {
434 sum_j += wt1[j]*ELX(d, j, in2, l);
435 }
436 pt.hes[hes_stride*d+2] += wt3[l]*sum_j;
437 pt.hes[hes_stride*d+4] += wt3[l]*sum_k;
438 }
439 for (int k = 0; k < pN; ++k)
440 {
441 double sum_j = 0;
442 for (int j = 0; j < pN; ++j)
443 {
444 sum_j += wt1[j]*ELX(d, j, k, in3);
445 }
446 pt.hes[hes_stride*d+1] += wt2[k]*sum_j;
447 }
448#undef ELX
449 }
450 return pt;
451}
452
453/* Check reduction in objective against prediction, and adjust trust region
454 radius (p->tr) accordingly. May reject the prior step, returning 1; otherwise
455 returns 0 sets res->dist2, res->index, res->x, res->oldr in any event,
456 leaving res->r, res->dr, res->flags to be set when returning 0 */
457static MFEM_HOST_DEVICE bool reject_prior_step_q(findptsPt *res,
458 const double resid[3],
459 const findptsPt *p,
460 const double tol)
461{
462 const double dist2 = l2norm2(resid);
463 const double decr = p->dist2-dist2;
464 const double pred = p->dist2p;
465 for (int d = 0; d < 3; ++d)
466 {
467 res->x[d] = p->x[d];
468 res->oldr[d] = p->r[d];
469 }
470 res->dist2 = dist2;
471 if (decr >= 0.01*pred)
472 {
473 if (decr >= 0.9*pred)
474 {
475 // very good iteration
476 res->tr = p->tr*2;
477 }
478 else
479 {
480 // good iteration
481 res->tr = p->tr;
482 }
483 return false;
484 }
485 else
486 {
487 /* reject step; note: the point will pass through this routine
488 again, and we set things up here so it gets classed as a
489 "very good iteration" --- this doubles the trust radius,
490 which is why we divide by 4 below */
491 double v0 = fabs(p->r[0]-p->oldr[0]);
492 double v1 = fabs(p->r[1]-p->oldr[1]);
493 double v2 = fabs(p->r[2]-p->oldr[2]);
494 res->tr = (v1 > v2 ? (v0 > v1 ? v0 : v1) : (v0 > v2 ? v0 : v2)) / 4;
495 res->dist2 = p->dist2;
496 for (int d = 0; d < 3; ++d)
497 {
498 res->r[d] = p->oldr[d];
499 }
500 res->flags = p->flags >> 7;
501 res->dist2p = -HUGE_VAL;
502 if (pred < dist2*tol)
503 {
504 res->flags |= CONVERGED_FLAG;
505 }
506 return true;
507 }
508}
509
510/* minimize 0.5||x* - x(r)||^2_2 using gradient-descent, with
511 |dr| <= tr and |r0+dr|<=1*/
512static MFEM_HOST_DEVICE void newton_vol(findptsPt *const res,
513 const double jac[9],
514 const double resid[3],
515 const findptsPt *const p,
516 const double tol)
517{
518 const double tr = p->tr;
519 double bnd[6] = {-1, 1, -1, 1, -1, 1};
520 double r0[3];
521 double dr[3], fac;
522 int d, mask, flags;
523 r0[0] = p->r[0], r0[1] = p->r[1], r0[2] = p->r[2];
524
525 mask = 0x3fu;
526 for (d = 0; d < 3; ++d)
527 {
528 if (r0[d]-tr > -1)
529 {
530 bnd[2*d] = r0[d]-tr, mask ^= 1u << (2*d);
531 }
532 if (r0[d]+tr < 1)
533 {
534 bnd[2*d+1] = r0[d]+tr, mask ^= 2u << (2*d);
535 }
536 }
537
538 lin_solve_3(dr, jac, resid);
539
540 fac = 1, flags = 0;
541 for (d = 0; d < 3; ++d)
542 {
543 double nr = r0[d]+dr[d];
544 if ((nr-bnd[2*d])*(bnd[2*d+1]-nr) >= 0)
545 {
546 continue;
547 }
548 if (nr < bnd[2*d])
549 {
550 double f = (bnd[2*d]-r0[d]) / dr[d];
551 if (f < fac)
552 {
553 fac = f, flags = 1u << (2*d);
554 }
555 }
556 else
557 {
558 double f = (bnd[2*d+1]-r0[d]) / dr[d];
559 if (f < fac)
560 {
561 fac = f, flags = 2u << (2*d);
562 }
563 }
564 }
565
566 if (flags == 0)
567 {
568 goto newton_vol_fin;
569 }
570
571 for (d = 0; d < 3; ++d)
572 {
573 dr[d] *= fac;
574 }
575
576newton_vol_face :
577 {
578 const int fi = face_index(flags);
579 const int dn = fi >> 1, d1 = plus_1_mod_3(dn), d2 = plus_2_mod_3(dn);
580 double drc[2], facc = 1;
581 int new_flags = 0;
582 double ress[3], y[2], JtJ[3];
583 ress[0] = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]+jac[2]*dr[2]);
584 ress[1] = resid[1]-(jac[3]*dr[0]+jac[4]*dr[1]+jac[5]*dr[2]);
585 ress[2] = resid[2]-(jac[6]*dr[0]+jac[7]*dr[1]+jac[8]*dr[2]);
586 /* y = J_u^T res */
587 y[0] = jac[d1]*ress[0]+jac[3+d1]*ress[1]+jac[6+d1]*ress[2];
588 y[1] = jac[d2]*ress[0]+jac[3+d2]*ress[1]+jac[6+d2]*ress[2];
589 /* JtJ = J_u^T J_u */
590 JtJ[0] = jac[d1]*jac[d1]+jac[3+d1]*jac[3+d1] +
591 jac[6+d1]*jac[6 +d1];
592 JtJ[1] = jac[d1]*jac[d2]+jac[3+d1]*jac[3+d2] +
593 jac[6+d1]*jac[6+d2];
594 JtJ[2] = jac[d2]*jac[d2]+jac[3+d2]*jac[3+d2] +
595 jac[6+d2]*jac[6+d2];
596 lin_solve_sym_2(drc, JtJ, y);
597#define CHECK_CONSTRAINT(drcd, d3) \
598{ \
599 const double rz = r0[d3]+dr[d3], lb = bnd[2*d3], ub = bnd[2*d3+1]; \
600 const double delta = drcd, nr = r0[d3]+(dr[d3]+delta); \
601 if ((nr-lb)*(ub-nr) < 0) { \
602 if (nr < lb) { \
603 double f = (lb-rz) / delta; \
604 if (f < fac) { \
605 fac = f; new_flags = 1u << (2*d3); \
606 } \
607 } \
608 else { \
609 double f = (ub-rz) / delta; \
610 if (f < fac) { \
611 fac = f; new_flags = 2u << (2*d3); \
612 } \
613 } \
614 } \
615}
616 CHECK_CONSTRAINT(drc[0], d1);
617 CHECK_CONSTRAINT(drc[1], d2);
618 dr[d1] += facc*drc[0], dr[d2] += facc*drc[1];
619 if (new_flags == 0)
620 {
621 goto newton_vol_fin;
622 }
623 flags |= new_flags;
624 }
625
626newton_vol_edge :
627 {
628 const int ei = edge_index(flags);
629 const int de = ei >> 2;
630 double facc = 1;
631 int new_flags = 0;
632 double ress[3], y, JtJ, drc;
633 ress[0] = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]+jac[2]*dr[2]);
634 ress[1] = resid[1]-(jac[3]*dr[0]+jac[4]*dr[1]+jac[5]*dr[2]);
635 ress[2] = resid[2]-(jac[6]*dr[0]+jac[7]*dr[1]+jac[8]*dr[2]);
636 /* y = J_u^T res */
637 y = jac[de]*ress[0]+jac[3+de]*ress[1]+jac[6+de]*ress[2];
638 /* JtJ = J_u^T J_u */
639 JtJ = jac[de]*jac[de]+jac[3+de]*jac[3+de] +
640 jac[6+de]*jac[6+de];
641 drc = y / JtJ;
642 CHECK_CONSTRAINT(drc, de);
643#undef CHECK_CONSTRAINT
644 dr[de] += facc*drc;
645 flags |= new_flags;
646 goto newton_vol_relax;
647 }
648
649 /* check and possibly relax constraints */
650newton_vol_relax :
651 {
652 const int old_flags = flags;
653 double ress[3], y[3];
654 /* res := res_0-J dr */
655 ress[0] = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1]+jac[2]*dr[2]);
656 ress[1] = resid[1]-(jac[3]*dr[0]+jac[4]*dr[1]+jac[5]*dr[2]);
657 ress[2] = resid[2]-(jac[6]*dr[0]+jac[7]*dr[1]+jac[8]*dr[2]);
658 /* y := J^T res */
659 y[0] = jac[0]*ress[0]+jac[3]*ress[1]+jac[6]*ress[2];
660 y[1] = jac[1]*ress[0]+jac[4]*ress[1]+jac[7]*ress[2];
661 y[2] = jac[2]*ress[0]+jac[5]*ress[1]+jac[8]*ress[2];
662 for (int dd = 0; dd < 3; ++dd)
663 {
664 int f = flags >> (2*dd) & 3u;
665 if (f)
666 {
667 dr[dd] = bnd[2*dd+(f-1)]-r0[dd];
668 if (dr[dd]*y[dd] < 0)
669 {
670 flags &= ~(3u << (2*dd));
671 }
672 }
673 }
674 if (flags == old_flags)
675 {
676 goto newton_vol_fin;
677 }
678 switch (num_constrained(flags))
679 {
680 case 1:
681 goto newton_vol_face;
682 case 2:
683 goto newton_vol_edge;
684 }
685 }
686
687newton_vol_fin:
688 flags &= mask;
689 if (fabs(dr[0])+fabs(dr[1])+fabs(dr[2]) < tol)
690 {
691 flags |= CONVERGED_FLAG;
692 }
693 {
694 const double res0 = resid[0]-(jac[0]*dr[0]+jac[1]*dr[1] +
695 jac[2]*dr[2]);
696 const double res1 = resid[1]-(jac[3]*dr[0]+jac[4]*dr[1] +
697 jac[5]*dr[2]);
698 const double res2 = resid[2]-(jac[6]*dr[0]+jac[7]*dr[1] +
699 jac[8]*dr[2]);
700 res->dist2p = resid[0]*resid[0]+resid[1]*resid[1] +
701 resid[2]*resid[2] -
702 (res0*res0+res1*res1+res2*res2);
703 }
704 for (int dd = 0; dd < 3; ++dd)
705 {
706 int f = flags >> (2*dd) & 3u;
707 res->r[dd] = f == 0 ? r0[dd]+dr[dd] : (f == 1 ? -1 : 1);
708 }
709 res->flags = flags | (p->flags << 7);
710}
711
712// Full Newton solve on the face. One of r/s/t is constrained.
713static MFEM_HOST_DEVICE void newton_face(findptsPt *const res,
714 const double jac[9],
715 const double rhes[3],
716 const double resid[3],
717 const int d1,
718 const int d2,
719 const int dn,
720 const int flags,
721 const findptsPt *const p,
722 const double tol)
723{
724 const double tr = p->tr;
725 double bnd[4];
726 double r[2], dr[2] = {0, 0};
727 int mask, new_flags;
728 double v, tv;
729 int i;
730 double A[3], y[2], r0[2];
731 /* A = J^T J-resid_d H_d */
732 A[0] = jac[d1]*jac[d1]+jac[3+d1]*jac[3+d1] +
733 jac[6+d1]*jac[6+d1]-rhes[0];
734 A[1] = jac[d1]*jac[d2]+jac[3+d1]*jac[3+d2] +
735 jac[6+d1]*jac[6+d2]-rhes[1];
736 A[2] = jac[d2]*jac[d2]+jac[3+d2]*jac[3+d2] +
737 jac[6+d2]*jac[6+d2]-rhes[2];
738 /* y = J^T r */
739 y[0] = jac[d1]*resid[0]+jac[3+d1]*resid[1]+jac[6+d1]*resid[2];
740 y[1] = jac[d2]*resid[0]+jac[3+d2]*resid[1]+jac[6+d2]*resid[2];
741 r0[0] = p->r[d1];
742 r0[1] = p->r[d2];
743
744 new_flags = flags;
745 mask = 0x3fu;
746 if (r0[0]-tr > -1)
747 {
748 bnd[0] = -tr;
749 mask ^= 1u;
750 }
751 else
752 {
753 bnd[0] = -1-r0[0];
754 }
755 if (r0[0]+tr < 1)
756 {
757 bnd[1] = tr;
758 mask ^= 2u;
759 }
760 else
761 {
762 bnd[1] = 1-r0[0];
763 }
764 if (r0[1]-tr > -1)
765 {
766 bnd[2] = -tr;
767 mask ^= 1u << 2;
768 }
769 else
770 {
771 bnd[2] = -1-r0[1];
772 }
773 if (r0[1]+tr < 1)
774 {
775 bnd[3] = tr;
776 mask ^= 2u << 2;
777 }
778 else
779 {
780 bnd[3] = 1-r0[1];
781 }
782
783 if (A[0]+A[2] <= 0 || A[0]*A[2] <= A[1]*A[1])
784 {
785 goto newton_face_constrained;
786 }
787 lin_solve_sym_2(dr, A, y);
788
789#define EVAL(r, s) -(y[0]*r+y[1]*s)+(r*A[0]*r+(2*r*A[1]+s*A[2])*s) / 2
790 if ((dr[0]-bnd[0])*(bnd[1]-dr[0]) >= 0 &&
791 (dr[1]-bnd[2])*(bnd[3]-dr[1]) >= 0)
792 {
793 r[0] = r0[0]+dr[0], r[1] = r0[1]+dr[1];
794 v = EVAL(dr[0], dr[1]);
795 goto newton_face_fin;
796 }
797newton_face_constrained:
798 v = EVAL(bnd[0], bnd[2]);
799 i = 1u | (1u << 2);
800 tv = EVAL(bnd[1], bnd[2]);
801 if (tv < v)
802 {
803 v = tv;
804 i = 2u | (1u << 2);
805 }
806 tv = EVAL(bnd[0], bnd[3]);
807 if (tv < v)
808 {
809 v = tv;
810 i = 1u | (2u << 2);
811 }
812 tv = EVAL(bnd[1], bnd[3]);
813 if (tv < v)
814 {
815 v = tv;
816 i = 2u | (2u << 2);
817 }
818 if (A[0] > 0)
819 {
820 double drc;
821 drc = (y[0]-A[1]*bnd[2]) / A[0];
822 if ((drc-bnd[0])*(bnd[1]-drc) >= 0 && (tv = EVAL(drc, bnd[2])) < v)
823 {
824 v = tv;
825 i = 1u << 2;
826 dr[0] = drc;
827 }
828 drc = (y[0]-A[1]*bnd[3]) / A[0];
829 if ((drc-bnd[0])*(bnd[1]-drc) >= 0 && (tv = EVAL(drc, bnd[3])) < v)
830 {
831 v = tv;
832 i = 2u << 2;
833 dr[0] = drc;
834 }
835 }
836 if (A[2] > 0)
837 {
838 double drc;
839 drc = (y[1]-A[1]*bnd[0]) / A[2];
840 if ((drc-bnd[2])*(bnd[3]-drc) >= 0 && (tv = EVAL(bnd[0], drc)) < v)
841 {
842 v = tv;
843 i = 1u;
844 dr[1] = drc;
845 }
846 drc = (y[1]-A[1]*bnd[1]) / A[2];
847 if ((drc-bnd[2])*(bnd[3]-drc) >= 0 && (tv = EVAL(bnd[1], drc)) < v)
848 {
849 v = tv;
850 i = 2u;
851 dr[1] = drc;
852 }
853 }
854#undef EVAL
855
856 {
857 int dir[2];
858 dir[0] = d1;
859 dir[1] = d2;
860 for (int d = 0; d < 2; ++d)
861 {
862 const int f = i >> (2*d) & 3u;
863 if (f == 0)
864 {
865 r[d] = r0[d]+dr[d];
866 }
867 else
868 {
869 if ((f & (mask >> (2*d))) == 0)
870 {
871 r[d] = r0[d]+(f == 1 ? -tr : tr);
872 }
873 else
874 {
875 r[d] = (f == 1 ? -1 : 1);
876 new_flags |= f << (2*dir[d]);
877 }
878 }
879 }
880 }
881newton_face_fin:
882 res->dist2p = -2*v;
883 dr[0] = r[0]-p->r[d1];
884 dr[1] = r[1]-p->r[d2];
885 if (fabs(dr[0])+fabs(dr[1]) < tol)
886 {
887 new_flags |= CONVERGED_FLAG;
888 }
889 res->r[dn] = p->r[dn];
890 res->r[d1] = r[0];
891 res->r[d2] = r[1];
892 res->flags = new_flags | (p->flags << 7);
893}
894
895// Full Newton solve on the edge. Two of r/s/t are constrained.
896static MFEM_HOST_DEVICE inline void newton_edge(findptsPt *const res,
897 const double jac[9],
898 const double rhes,
899 const double resid[3],
900 const int de,
901 const int dn1,
902 const int dn2,
903 int flags,
904 const findptsPt *const p,
905 const double tol)
906{
907 const double tr = p->tr;
908 /* A = J^T J-resid_d H_d */
909 const double A = jac[de]*jac[de]+jac[3+de]*jac[3+de]+jac[6+de] *
910 jac[6+de]-rhes;
911 /* y = J^T r */
912 const double y = jac[de]*resid[0]+jac[3+de]*resid[1]+jac[6+de] *
913 resid[2];
914
915 const double oldr = p->r[de];
916 double dr, nr, tdr, tnr;
917 double v, tv;
918 int new_flags = 0, tnew_flags = 0;
919
920#define EVAL(dr) (dr*A-2*y)*dr
921
922 /* if A is not SPD, quadratic model has no minimum */
923 if (A > 0)
924 {
925 dr = y / A;
926 nr = oldr+dr;
927 if (fabs(dr) < tr && fabs(nr) < 1)
928 {
929 v = EVAL(dr);
930 goto newton_edge_fin;
931 }
932 }
933
934 if ((nr = oldr-tr) > -1)
935 {
936 dr = -tr;
937 }
938 else
939 {
940 nr = -1;
941 dr = -1-oldr;
942 new_flags = flags | 1u << (2*de);
943 }
944 v = EVAL(dr);
945
946 if ((tnr = oldr+tr) < 1)
947 {
948 tdr = tr;
949 }
950 else
951 {
952 tnr = 1;
953 tdr = 1-oldr;
954 tnew_flags = flags | 2u << (2*de);
955 }
956 tv = EVAL(tdr);
957
958 if (tv < v)
959 {
960 nr = tnr;
961 dr = tdr;
962 v = tv;
963 new_flags = tnew_flags;
964 }
965
966newton_edge_fin:
967 /* check convergence */
968 if (fabs(dr) < tol)
969 {
970 new_flags |= CONVERGED_FLAG;
971 }
972 res->r[de] = nr;
973 res->r[dn1] = p->r[dn1];
974 res->r[dn2] = p->r[dn2];
975 res->dist2p = -v;
976 res->flags = flags | new_flags | (p->flags << 7);
977}
978
979// Find closest mesh node to the sought point.
980static MFEM_HOST_DEVICE void seed_j(const double *elx[3],
981 const double x[3],
982 const double *z,//GLL point locations [-1, 1]
983 double *dist2,
984 double *r[3],
985 const int j,
986 const int pN) // assumes j < pN
987{
988 dist2[j] = HUGE_VAL;
989
990 double zr = z[j];
991 for (int l = 0; l < pN; ++l)
992 {
993 const double zt = z[l];
994 for (int k = 0; k < pN; ++k)
995 {
996 double zs = z[k];
997
998 const int jkl = j+k*pN+l*pN*pN;
999 double dx[3];
1000 for (int d = 0; d < 3; ++d)
1001 {
1002 dx[d] = x[d]-elx[d][jkl];
1003 }
1004 const double dist2_jkl = l2norm2(dx);
1005 if (dist2[j] > dist2_jkl)
1006 {
1007 dist2[j] = dist2_jkl;
1008 r[0][j] = zr;
1009 r[1][j] = zs;
1010 r[2][j] = zt;
1011 }
1012 }
1013 }
1014}
1015
1016/* Compute contribution towards function value and its derivatives in each
1017 reference direction. */
1018static MFEM_HOST_DEVICE double tensor_ig3_j(double *g_partials,
1019 const double *Jr,
1020 const double *Dr,
1021 const double *Js,
1022 const double *Ds,
1023 const double *Jt,
1024 const double *Dt,
1025 const double *u,
1026 const int j,
1027 const int pN)
1028{
1029 double uJtJs = 0.0;
1030 double uDtJs = 0.0;
1031 double uJtDs = 0.0;
1032 for (int k = 0; k < pN; ++k)
1033 {
1034 double uJt = 0.0;
1035 double uDt = 0.0;
1036 for (int l = 0; l < pN; ++l)
1037 {
1038 uJt += u[j+k*pN+l*pN*pN]*Jt[l];
1039 uDt += u[j+k*pN+l*pN*pN]*Dt[l];
1040 }
1041
1042 uJtJs += uJt*Js[k];
1043 uJtDs += uJt*Ds[k];
1044 uDtJs += uDt*Js[k];
1045 }
1046
1047 g_partials[0] = uJtJs*Dr[j];
1048 g_partials[1] = uJtDs*Jr[j];
1049 g_partials[2] = uDtJs*Jr[j];
1050 return uJtJs*Jr[j];
1051}
1052
1053template<int T_D1D = 0>
1054static void FindPointsLocal3DKernel(const int npt,
1055 const double tol,
1056 const double *x,
1057 const int point_pos_ordering,
1058 const double *xElemCoord,
1059 const int nel,
1060 const double *wtend,
1061 const double *boxinfo,
1062 const int hash_n,
1063 const double *hashMin,
1064 const double *hashFac,
1065 unsigned int *hashOffset,
1066 unsigned int *const code_base,
1067 unsigned int *const el_base,
1068 double *const r_base,
1069 double *const dist2_base,
1070 const double *gll1D,
1071 const double *lagcoeff,
1072 const int pN = 0)
1073{
1074 const int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
1075 const int D1D = T_D1D ? T_D1D : pN;
1076 const int p_NE = D1D*D1D*D1D;
1077 MFEM_VERIFY(MD1 <= DofQuadLimits::MAX_D1D,
1078 "Increase Max allowable polynomial order.");
1079 MFEM_VERIFY(D1D != 0, "Polynomial order not specified.");
1080#define MAXC(a, b) (((a) > (b)) ? (a) : (b))
1081 const int nThreads = MAXC(D1D*DIM, 15);
1082
1083 mfem::forall_2D(npt, nThreads, 1, [=] MFEM_HOST_DEVICE (int i)
1084 {
1085 // 4 D1D for seed, 18D1D+12 for vol, 21D1D+15 for face, 3D1D+32 for edge.
1086 constexpr int size1 = 21*MD1+15;
1087 // 6D1D^2 for face, 15D1D for edge.
1088 constexpr int size2 = MAXC(MD1*MD1*6, MD1*3*5);
1089 //size depends on max of info for faces and edges
1090 constexpr int size3 = MD1*MD1*MD1*DIM; // local element coordinates
1091
1092 MFEM_SHARED double r_workspace[size1];
1093 MFEM_SHARED findptsPt el_pts[2];
1094
1095 MFEM_SHARED double constraint_workspace[size2];
1096 MFEM_SHARED int face_edge_init;
1097
1098 MFEM_SHARED double elem_coords[MD1 <= 6 ? size3 : 1];
1099
1100 double *r_workspace_ptr;
1101 findptsPt *fpt, *tmp;
1102 MFEM_FOREACH_THREAD(j,x,nThreads)
1103 {
1104 r_workspace_ptr = r_workspace;
1105 fpt = el_pts+0;
1106 tmp = el_pts+1;
1107 }
1108 MFEM_SYNC_THREAD;
1109
1110 int id_x = point_pos_ordering == 0 ? i : i*DIM;
1111 int id_y = point_pos_ordering == 0 ? i+npt : i*DIM+1;
1112 int id_z = point_pos_ordering == 0 ? i+2*npt : i*DIM+2;
1113 double x_i[3] = {x[id_x], x[id_y], x[id_z]};
1114
1115 unsigned int *code_i = code_base+i;
1116 double *dist2_i = dist2_base+i;
1117
1118 //// map_points_to_els ////
1119 findptsLocalHashData_t hash;
1120 for (int d = 0; d < DIM; ++d)
1121 {
1122 hash.bnd[d].min = hashMin[d];
1123 hash.fac[d] = hashFac[d];
1124 }
1125 hash.hash_n = hash_n;
1126 hash.offset = hashOffset;
1127 const unsigned int hi = hash_index(&hash, x_i);
1128 const unsigned int *elp = hash.offset+hash.offset[hi],
1129 *const ele = hash.offset+hash.offset[hi+1];
1130 *code_i = CODE_NOT_FOUND;
1131 *dist2_i = HUGE_VAL;
1132
1133 for (; elp != ele; ++elp)
1134 {
1135 //elp
1136
1137 const int el = *elp;
1138
1139 // construct obbox_t on the fly from data
1140 obbox_t box;
1141 int n_box_ents = 3*DIM+DIM2;
1142 for (int idx = 0; idx < DIM; ++idx)
1143 {
1144 box.c0[idx] = boxinfo[n_box_ents*el+idx];
1145 box.x[idx].min = boxinfo[n_box_ents*el+DIM+idx];
1146 box.x[idx].max = boxinfo[n_box_ents*el+2*DIM+idx];
1147 }
1148
1149 for (int idx = 0; idx < DIM2; ++idx)
1150 {
1151 box.A[idx] = boxinfo[n_box_ents*el+3*DIM+idx];
1152 }
1153
1154 if (bbox_test(&box, x_i) < 0) { continue; }
1155
1156 //// findpts_local ////
1157 {
1158 // read element coordinates into shared memory
1159 if (MD1 <= 6)
1160 {
1161 MFEM_FOREACH_THREAD(j,x,D1D*DIM)
1162 {
1163 const int qp = j % D1D;
1164 const int d = j / D1D;
1165 for (int l = 0; l < D1D; ++l)
1166 {
1167 for (int k = 0; k < D1D; ++k)
1168 {
1169 const int jkl = qp+k*D1D+l*D1D*D1D;
1170 elem_coords[jkl+d*p_NE] =
1171 xElemCoord[jkl+el*p_NE+d*nel*p_NE];
1172 }
1173 }
1174 }
1175 MFEM_SYNC_THREAD;
1176 }
1177
1178 const double *elx[DIM];
1179 for (int d = 0; d < DIM; d++)
1180 {
1181 elx[d] = MD1<= 6 ? &elem_coords[d*p_NE] :
1182 xElemCoord+d*nel*p_NE+el*p_NE;
1183 }
1184
1185 //// findpts_el ////
1186 {
1187 MFEM_SYNC_THREAD;
1188 MFEM_FOREACH_THREAD(j,x,1)
1189 {
1190 fpt->dist2 = HUGE_VAL;
1191 fpt->dist2p = 0;
1192 fpt->tr = 1;
1193 face_edge_init = 0;
1194 }
1195 MFEM_FOREACH_THREAD(j,x,DIM)
1196 {
1197 fpt->x[j] = x_i[j];
1198 }
1199 MFEM_SYNC_THREAD;
1200
1201 //// seed ////
1202 {
1203 double *dist2_temp = r_workspace_ptr;
1204 double *r_temp[DIM];
1205 for (int d = 0; d < DIM; ++d)
1206 {
1207 r_temp[d] = dist2_temp+(1+d)*D1D;
1208 }
1209
1210 MFEM_FOREACH_THREAD(j,x,D1D)
1211 {
1212 seed_j(elx, x_i, gll1D, dist2_temp, r_temp, j, D1D);
1213 }
1214 MFEM_SYNC_THREAD;
1215
1216 MFEM_FOREACH_THREAD(j,x,1)
1217 {
1218 fpt->dist2 = HUGE_VAL;
1219 for (int jj = 0; jj < D1D; ++jj)
1220 {
1221 if (dist2_temp[jj] < fpt->dist2)
1222 {
1223 fpt->dist2 = dist2_temp[jj];
1224 for (int d = 0; d < DIM; ++d)
1225 {
1226 fpt->r[d] = r_temp[d][jj];
1227 }
1228 }
1229 }
1230 }
1231 MFEM_SYNC_THREAD;
1232 } //seed done
1233
1234 MFEM_FOREACH_THREAD(j,x,1)
1235 {
1236 tmp->dist2 = HUGE_VAL;
1237 tmp->dist2p = 0;
1238 tmp->tr = 1;
1239 tmp->flags = 0; // we do newton_vol regardless of seed.
1240 }
1241 MFEM_FOREACH_THREAD(j,x,DIM)
1242 {
1243 tmp->x[j] = fpt->x[j];
1244 tmp->r[j] = fpt->r[j];
1245 }
1246 MFEM_SYNC_THREAD;
1247
1248 for (int step = 0; step < 50; step++)
1249 {
1250 switch (num_constrained(tmp->flags & FLAG_MASK))
1251 {
1252 case 0: // findpt_vol
1253 {
1254 double *wtr = r_workspace_ptr;
1255
1256 double *resid = wtr+6*D1D;
1257 double *jac = resid+3;
1258 double *resid_temp = jac+9;
1259 double *jac_temp = resid_temp+3*D1D;
1260
1261 MFEM_FOREACH_THREAD(j,x,D1D*DIM)
1262 {
1263 const int qp = j % D1D;
1264 const int d = j / D1D;
1265 lag_eval_first_der(wtr+2*d*D1D, tmp->r[d], qp,
1266 gll1D, lagcoeff, D1D);
1267 }
1268 MFEM_SYNC_THREAD;
1269
1270 MFEM_FOREACH_THREAD(j,x,D1D*DIM)
1271 {
1272 const int qp = j % D1D;
1273 const int d = j / D1D;
1274 double *idx = jac_temp+3*d+9*qp;
1275 resid_temp[d+qp*3] = tensor_ig3_j(idx,
1276 wtr,
1277 wtr+D1D,
1278 wtr+2*D1D,
1279 wtr+3*D1D,
1280 wtr+4*D1D,
1281 wtr+5*D1D,
1282 elx[d],
1283 qp, D1D);
1284 }
1285 MFEM_SYNC_THREAD;
1286
1287 MFEM_FOREACH_THREAD(l,x,3)
1288 {
1289 resid[l] = tmp->x[l];
1290 for (int j = 0; j < D1D; ++j)
1291 {
1292 resid[l] -= resid_temp[l+j*3];
1293 }
1294 }
1295 MFEM_FOREACH_THREAD(l,x,9)
1296 {
1297 jac[l] = 0;
1298 for (int j = 0; j < D1D; ++j)
1299 {
1300 jac[l] += jac_temp[l+j*9];
1301 }
1302 }
1303 MFEM_SYNC_THREAD;
1304
1305 MFEM_FOREACH_THREAD(l,x,1)
1306 {
1307 // if (l == 0)
1308 {
1309 if (!reject_prior_step_q(fpt, resid, tmp, tol))
1310 {
1311 newton_vol(fpt, jac, resid, tmp, tol);
1312 }
1313 }
1314 }
1315 MFEM_SYNC_THREAD;
1316 break;
1317 } //case 0
1318 case 1: // findpt_face
1319 {
1320 const int fi = face_index(tmp->flags & FLAG_MASK);
1321 const int dn = fi >> 1;
1322 const int d1 = plus_1_mod_3(dn), d2 = plus_2_mod_3(dn);
1323
1324 double *wt1 = r_workspace_ptr;
1325 double *resid = wt1+6*D1D;
1326 double *jac = resid+3;
1327 double *resid_temp = jac+9;
1328 double *jac_temp = resid_temp+3*D1D;
1329 double *hes = jac_temp+9*D1D;
1330 double *hes_temp = hes+3;
1331 MFEM_SYNC_THREAD;
1332
1333 MFEM_FOREACH_THREAD(j,x,D1D*2)
1334 {
1335 int dd[2];
1336 dd[0] = d1;
1337 dd[1] = d2;
1338 const int d = j / D1D;
1339 const int qp = j % D1D;
1340 lag_eval_second_der(wt1+3*d*D1D, tmp->r[dd[d]],
1341 qp, gll1D, lagcoeff, D1D);
1342 }
1343 MFEM_SYNC_THREAD;
1344
1345 double *J1 = wt1, *D1 = wt1+D1D;
1346 double *J2 = wt1+3*D1D, *D2 = J2+D1D;
1347 double *DD1 = D1+D1D, *DD2 = D2+D1D;
1348 findptsElemFace face;
1349
1350 MFEM_FOREACH_THREAD(j,x,D1D*DIM)
1351 {
1352 // utilizes first 3*D1D threads
1353 face = get_face(elx, wtend, fi, constraint_workspace,
1354 face_edge_init, j, D1D);
1355 }
1356 MFEM_SYNC_THREAD;
1357
1358 MFEM_FOREACH_THREAD(j,x,D1D*DIM)
1359 {
1360 if (j == 0) { face_edge_init = (1 << fi); }
1361 const int qp = j % D1D;
1362 const int d = j / D1D;
1363 const double *u = face.x[d];
1364 const double *du = face.dxdn[d];
1365 double sums_k[4] = {0.0, 0.0, 0.0, 0.0};
1366 for (int k = 0; k < D1D; ++k)
1367 {
1368 sums_k[0] += u[qp+k*D1D]*J2[k];
1369 sums_k[1] += u[qp+k*D1D]*D2[k];
1370 sums_k[2] += u[qp+k*D1D]*DD2[k];
1371 sums_k[3] += du[qp+k*D1D]*J2[k];
1372 }
1373
1374 resid_temp[3*qp+d] = sums_k[0]*J1[qp];
1375 jac_temp[9*qp+3*d+d1] = sums_k[0]*D1[qp];
1376 jac_temp[9*qp+3*d+d2] = sums_k[1]*J1[qp];
1377 jac_temp[9*qp+3*d+dn] = sums_k[3]*J1[qp];
1378 if (d == 0)
1379 {
1380 hes_temp[3*qp] = sums_k[0]*DD1[qp];
1381 hes_temp[3*qp+1] = sums_k[1]*D1[qp];
1382 hes_temp[3*qp+2] = sums_k[2]*J1[qp];
1383 }
1384 }
1385 MFEM_SYNC_THREAD;
1386
1387 MFEM_FOREACH_THREAD(l,x,3)
1388 {
1389 resid[l] = fpt->x[l];
1390 hes[l] = 0;
1391 for (int j = 0; j < D1D; ++j)
1392 {
1393 resid[l] -= resid_temp[l+j*3];
1394 hes[l] += hes_temp[l+3*j];
1395 }
1396 hes[l] *= resid[l];
1397 }
1398
1399 MFEM_FOREACH_THREAD(l,x,9)
1400 {
1401 jac[l] = 0;
1402 for (int j = 0; j < D1D; ++j)
1403 {
1404 jac[l] += jac_temp[l+j*9];
1405 }
1406 }
1407 MFEM_SYNC_THREAD;
1408
1409 MFEM_FOREACH_THREAD(l,x,1)
1410 {
1411 if (!reject_prior_step_q(fpt, resid, tmp, tol))
1412 {
1413 const double steep = resid[0]*jac[dn]+
1414 resid[1]*jac[3+dn]+
1415 resid[2]*jac[6+dn];
1416 if (steep*tmp->r[dn] < 0)
1417 {
1418 // relax constraint //
1419 newton_vol(fpt, jac, resid, tmp, tol);
1420 }
1421 else
1422 {
1423 newton_face(fpt, jac, hes, resid, d1,
1424 d2, dn, tmp->flags&FLAG_MASK,
1425 tmp, tol);
1426 }
1427 }
1428 }
1429 MFEM_SYNC_THREAD;
1430 break;
1431 }
1432 case 2: // findpt_edge
1433 {
1434 const int ei = edge_index(tmp->flags & FLAG_MASK);
1435 const int de = ei >> 2,
1436 dn1 = plus_1_mod_3(de),
1437 dn2 = plus_2_mod_3(de);
1438 int d_j[3];
1439 d_j[0] = de;
1440 d_j[1] = dn1;
1441 d_j[2] = dn2;
1442 const int hes_count = 2*3-1;
1443
1444 double *wt = r_workspace_ptr;
1445 double *resid = wt+3*D1D;
1446 double *jac = resid+3;
1447 double *hes_T = jac+9;
1448 double *hes = hes_T+hes_count*3;
1449 findptsElemEdge edge;
1450
1451 MFEM_FOREACH_THREAD(j,x,nThreads)
1452 {
1453 // utilizes first 3*D1D threads f
1454 edge = get_edge(elx, wtend, ei,
1455 constraint_workspace,
1456 face_edge_init, j,
1457 D1D);
1458 }
1459 MFEM_SYNC_THREAD;
1460
1461 const double *const *e_x[3+3] = {edge.x, edge.x,
1462 edge.dxdn1,
1463 edge.dxdn2,
1464 edge.d2xdn1,
1465 edge.d2xdn2
1466 };
1467
1468 MFEM_FOREACH_THREAD(j,x,D1D)
1469 {
1470 if (j == 0) { face_edge_init = (64 << ei); }
1471 lag_eval_second_der(wt, tmp->r[de], j, gll1D,
1472 lagcoeff, D1D);
1473 }
1474 MFEM_SYNC_THREAD;
1475
1476 MFEM_FOREACH_THREAD(j,x,hes_count*3)
1477 {
1478 const int d = j % 3;
1479 const int row = j / 3;
1480 if (j < (3+1)*3)
1481 {
1482 // resid and jac_T
1483 // [0, 1, 0, 0]
1484 double *wt_j = wt+(row == 1 ? D1D : 0);
1485 const double *x = e_x[row][d];
1486 double sum = 0.0;
1487 for (int k = 0; k < D1D; ++k)
1488 {
1489 // resid+3 == jac_T
1490 sum += wt_j[k]*x[k];
1491 }
1492 if (j < 3)
1493 {
1494 resid[j] = tmp->x[j]-sum;
1495 }
1496 else
1497 {
1498 jac[d*3+d_j[row-1]] = sum;
1499 }
1500 }
1501
1502 {
1503 // Hes_T is transposed version (i.e. in col major)
1504 // n1*[2, 1, 1, 0, 0]
1505 // j==1 => wt_j = wt+n1
1506 double *wt_j = wt+D1D*(2-(row+1) / 2);
1507 const double *x = e_x[row+1][d];
1508 hes_T[j] = 0.0;
1509 for (int k = 0; k < D1D; ++k)
1510 {
1511 hes_T[j] += wt_j[k]*x[k];
1512 }
1513 }
1514 }
1515 MFEM_SYNC_THREAD;
1516
1517 MFEM_FOREACH_THREAD(j,x,hes_count)
1518 {
1519 hes[j] = 0.0;
1520 for (int d = 0; d < 3; ++d)
1521 {
1522 hes[j] += resid[d]*hes_T[j*3+d];
1523 }
1524 }
1525
1526 MFEM_SYNC_THREAD;
1527
1528 MFEM_FOREACH_THREAD(l,x,1)
1529 {
1530 // check prior step //
1531 if (!reject_prior_step_q(fpt, resid, tmp, tol))
1532 {
1533 // check constraint //
1534 double steep[3-1];
1535 for (int k = 0; k < 3-1; ++k)
1536 {
1537 int dn = d_j[k+1];
1538 steep[k] = 0;
1539 for (int d = 0; d < 3; ++d)
1540 {
1541 steep[k] += jac[dn+d*3]*resid[d];
1542 }
1543 steep[k] *= tmp->r[dn];
1544 }
1545 if (steep[0] < 0)
1546 {
1547 if (steep[1] < 0)
1548 {
1549 newton_vol(fpt, jac, resid, tmp, tol);
1550 }
1551 else
1552 {
1553 double rh[3];
1554 rh[0] = hes[0];
1555 rh[1] = hes[1];
1556 rh[2] = hes[3];
1557 newton_face(fpt, jac, rh, resid, de,
1558 dn1, dn2,
1559 tmp->flags&(3u<<(dn2*2)),
1560 tmp, tol);
1561 }
1562 }
1563 else
1564 {
1565 if (steep[1] < 0)
1566 {
1567 double rh[3];
1568 rh[0] = hes[4];
1569 rh[1] = hes[2];
1570 rh[2] = hes[0];
1571 newton_face(fpt, jac, rh, resid, dn2,
1572 de, dn1,
1573 tmp->flags&(3u<<(dn1*2)),
1574 tmp, tol);
1575 }
1576 else
1577 {
1578 newton_edge(fpt, jac, hes[0], resid,
1579 de, dn1, dn2,
1580 tmp->flags & FLAG_MASK,
1581 tmp, tol);
1582 }
1583 }
1584 }
1585 }
1586 MFEM_SYNC_THREAD;
1587 break;
1588 }
1589 case 3: // findpts_pt
1590 {
1591 MFEM_FOREACH_THREAD(j,x,1)
1592 {
1593 // if (j == 0)
1594 const int pi=point_index(tmp->flags & FLAG_MASK);
1595 const findptsElemPt gpt=get_pt(elx,wtend,pi,D1D);
1596 const double *const pt_x = gpt.x;
1597 const double *const jac = gpt.jac;
1598 const double *const hes = gpt.hes;
1599
1600 double resid[3], steep[3];
1601 for (int d = 0; d < 3; ++d)
1602 {
1603 resid[d] = fpt->x[d]-pt_x[d];
1604 }
1605 if (!reject_prior_step_q(fpt, resid, tmp, tol))
1606 {
1607 for (int d = 0; d < 3; ++d)
1608 {
1609 steep[d] = 0;
1610 for (int e = 0; e < 3; ++e)
1611 {
1612 steep[d] += jac[d+e*3]*resid[e];
1613 }
1614 steep[d] *= tmp->r[d];
1615 }
1616 int de, dn1, dn2, d1, d2, dn, hi0, hi1, hi2;
1617 if (steep[0] < 0)
1618 {
1619 if (steep[1] < 0)
1620 {
1621 if (steep[2] < 0)
1622 {
1623 newton_vol(fpt,jac,resid,tmp,tol);
1624 }
1625 else
1626 {
1627 d1 = 0; d2 = 1; dn = 2; hi0 = 0;
1628 hi1 = 1, hi2 = 3;
1629 double rh[3];
1630 rh[0] = resid[0]*hes[hi0] +
1631 resid[1]*hes[6+hi0] +
1632 resid[2]*hes[12+hi0];
1633 rh[1] = resid[0]*hes[hi1] +
1634 resid[1]*hes[6+hi1] +
1635 resid[2]*hes[12+hi1];
1636 rh[2] = resid[0]*hes[hi2] +
1637 resid[1]*hes[6+hi2] +
1638 resid[2]*hes[12+hi2];
1639 newton_face(fpt, jac, rh, resid,
1640 d1, d2, dn,
1641 (tmp->flags)&(3u<<(2*dn)),
1642 tmp, tol);
1643 }
1644 }
1645 else
1646 {
1647 if (steep[2] < 0)
1648 {
1649 d1 = 2; d2 = 0; dn = 1; hi0 = 5;
1650 hi1 = 2, hi2 = 0;
1651 double rh[3];
1652 rh[0] = resid[0]*hes[hi0] +
1653 resid[1]*hes[6+hi0] +
1654 resid[2]*hes[12+hi0];
1655 rh[1] = resid[0]*hes[hi1] +
1656 resid[1]*hes[6+hi1] +
1657 resid[2]*hes[12+hi1];
1658 rh[2] = resid[0]*hes[hi2] +
1659 resid[1]*hes[6+hi2] +
1660 resid[2]*hes[12+hi2];
1661 newton_face(fpt, jac, rh, resid,
1662 d1, d2, dn,
1663 (tmp->flags)&(3u<<(2*dn)),
1664 tmp, tol);
1665 }
1666 else
1667 {
1668 de = 0, dn1 = 1, dn2 = 2, hi0 = 0;
1669 const double rh =
1670 resid[0]*hes[hi0] +
1671 resid[1]*hes[6+hi0] +
1672 resid[2]*hes[12+hi0];
1673 newton_edge(fpt, jac, rh, resid,
1674 de, dn1, dn2,
1675 tmp->flags&(~(3u<<(2*de))),
1676 tmp, tol);
1677 }
1678 }
1679 }
1680 else
1681 {
1682 if (steep[1] < 0)
1683 {
1684 if (steep[2] < 0)
1685 {
1686 d1 = 1, d2 = 2, dn = 0;
1687 hi0 = 3, hi1 = 4, hi2 = 5;
1688 double rh[3];
1689 rh[0] = resid[0]*hes[hi0] +
1690 resid[1]*hes[6+hi0] +
1691 resid[2]*hes[12+hi0];
1692 rh[1] = resid[0]*hes[hi1] +
1693 resid[1]*hes[6+hi1] +
1694 resid[2]*hes[12+hi1];
1695 rh[2] = resid[0]*hes[hi2] +
1696 resid[1]*hes[6+hi2] +
1697 resid[2]*hes[12+hi2];
1698 newton_face(fpt, jac, rh, resid,
1699 d1, d2, dn,
1700 (tmp->flags)&(3u<<(2*dn)),
1701 tmp, tol);
1702 }
1703 else
1704 {
1705 de = 1, dn1 = 2, dn2 = 0, hi0 = 3;
1706 const double rh =
1707 resid[0]*hes[hi0] +
1708 resid[1]*hes[6+hi0] +
1709 resid[2]*hes[12+hi0];
1710 newton_edge(fpt, jac, rh, resid,
1711 de, dn1, dn2,
1712 tmp->flags&(~(3u<<(2*de))),
1713 tmp, tol);
1714 }
1715 }
1716 else
1717 {
1718 if (steep[2] < 0)
1719 {
1720 de = 2, dn1 = 0, dn2 = 1, hi0 = 5;
1721 const double rh =
1722 resid[0]*hes[hi0] +
1723 resid[1]*hes[6+hi0] +
1724 resid[2]*hes[12+hi0];
1725 newton_edge(fpt, jac, rh, resid,
1726 de, dn1, dn2,
1727 tmp->flags&(~(3u<<(2*de))),
1728 tmp, tol);
1729 }
1730 else
1731 {
1732 fpt->r[0] = tmp->r[0];
1733 fpt->r[1] = tmp->r[1];
1734 fpt->r[2] = tmp->r[2];
1735 fpt->dist2p = 0;
1736 fpt->flags =tmp->flags|CONVERGED_FLAG;
1737 }
1738 }
1739 }
1740 }
1741 }
1742 MFEM_SYNC_THREAD;
1743 break;
1744 } //case 3
1745 } //switch
1746 if (fpt->flags & CONVERGED_FLAG)
1747 {
1748 break;
1749 }
1750 MFEM_SYNC_THREAD;
1751 MFEM_FOREACH_THREAD(j,x,1)
1752 {
1753 *tmp = *fpt;
1754 }
1755 MFEM_SYNC_THREAD;
1756 } //for int step < 50
1757 } //findpts_el
1758
1759 bool converged_internal = (fpt->flags&FLAG_MASK)==CONVERGED_FLAG;
1760 if (*code_i == CODE_NOT_FOUND || converged_internal ||
1761 fpt->dist2 < *dist2_i)
1762 {
1763 MFEM_FOREACH_THREAD(j,x,1)
1764 {
1765 *(el_base+i) = el;
1766 *code_i = converged_internal ? CODE_INTERNAL :
1767 CODE_BORDER;
1768 *dist2_i = fpt->dist2;
1769 }
1770 MFEM_FOREACH_THREAD(j,x,DIM)
1771 {
1772 *(r_base+DIM*i+j) = fpt->r[j];
1773 }
1774 MFEM_SYNC_THREAD;
1775 if (converged_internal)
1776 {
1777 break;
1778 }
1779 }
1780 } //findpts_local
1781 } //elp
1782 });
1783}
1784
1786 int point_pos_ordering,
1787 Array<unsigned int> &code,
1788 Array<unsigned int> &elem, Vector &ref,
1789 Vector &dist, int npt)
1790{
1791 if (npt == 0)
1792 {
1793 return;
1794 }
1795 auto pp = point_pos.Read();
1796 auto pgslm = gsl_mesh.Read();
1797 auto pwt = DEV.wtend.Read();
1798 auto pbb = DEV.bb.Read();
1799 auto plhm = DEV.loc_hash_min.Read();
1800 auto plhf = DEV.loc_hash_fac.Read();
1801 auto plho = DEV.loc_hash_offset.ReadWrite();
1802 auto pcode = code.Write();
1803 auto pelem = elem.Write();
1804 auto pref = ref.Write();
1805 auto pdist = dist.Write();
1806 auto pgll1d = DEV.gll1d.ReadWrite();
1807 auto plc = DEV.lagcoeff.Read();
1808 switch (DEV.dof1d)
1809 {
1810 case 2:
1811 FindPointsLocal3DKernel<2>(npt, DEV.newt_tol, pp, point_pos_ordering,
1812 pgslm, NE_split_total, pwt, pbb, DEV.h_nx, plhm,
1813 plhf, plho, pcode, pelem, pref, pdist, pgll1d,
1814 plc);
1815 break;
1816 case 3:
1817 FindPointsLocal3DKernel<3>(npt, DEV.newt_tol, pp, point_pos_ordering,
1818 pgslm, NE_split_total, pwt, pbb, DEV.h_nx, plhm,
1819 plhf, plho, pcode, pelem, pref, pdist, pgll1d,
1820 plc);
1821 break;
1822 case 4:
1823 FindPointsLocal3DKernel<4>(npt, DEV.newt_tol, pp, point_pos_ordering,
1824 pgslm, NE_split_total, pwt, pbb, DEV.h_nx, plhm,
1825 plhf, plho, pcode, pelem, pref, pdist, pgll1d,
1826 plc);
1827 break;
1828 case 5:
1829 FindPointsLocal3DKernel<5>(npt, DEV.newt_tol, pp, point_pos_ordering,
1830 pgslm, NE_split_total, pwt, pbb, DEV.h_nx, plhm,
1831 plhf, plho, pcode, pelem, pref, pdist, pgll1d,
1832 plc);
1833 break;
1834 default:
1835 FindPointsLocal3DKernel(npt, DEV.newt_tol, pp, point_pos_ordering, pgslm,
1836 NE_split_total, pwt, pbb, DEV.h_nx, plhm, plhf,
1837 plho, pcode, pelem, pref, pdist, pgll1d, plc,
1838 DEV.dof1d);
1839 }
1840}
1841#undef pMax
1842#undef DIM2
1843#undef DIM
1844#undef CODE_INTERNAL
1845#undef CODE_BORDER
1846#undef CODE_NOT_FOUND
1847#else
1848void FindPointsGSLIB::FindPointsLocal3(const Vector &point_pos,
1849 int point_pos_ordering,
1850 Array<unsigned int> &code,
1851 Array<unsigned int> &elem, Vector &ref,
1852 Vector &dist, int npt) {};
1853#endif
1854} // namespace mfem
1855#endif //ifdef MFEM_USE_GSLIB
T * Write(bool on_dev=true)
Shortcut for mfem::Write(a.GetMemory(), a.Size(), on_dev).
Definition array.hpp:345
struct mfem::FindPointsGSLIB::@24 DEV
void FindPointsLocal3(const Vector &point_pos, int point_pos_ordering, Array< unsigned int > &gsl_code_dev_l, Array< unsigned int > &gsl_elem_dev_l, Vector &gsl_ref_l, Vector &gsl_dist_l, int npt)
Vector data type.
Definition vector.hpp:82
virtual const real_t * Read(bool on_dev=true) const
Shortcut for mfem::Read(vec.GetMemory(), vec.Size(), on_dev).
Definition vector.hpp:494
virtual real_t * Write(bool on_dev=true)
Shortcut for mfem::Write(vec.GetMemory(), vec.Size(), on_dev).
Definition vector.hpp:502
real_t b
Definition lissajous.cpp:42
real_t a
Definition lissajous.cpp:41
constexpr int DIM
real_t u(const Vector &xvec)
Definition lor_mms.hpp:22
void forall_2D(int N, int X, int Y, lambda &&body)
Definition forall.hpp:762
std::function< real_t(const Vector &)> f(real_t mass_coeff)
Definition lor_mms.hpp:30
real_t p(const Vector &x, real_t t)