// // Created by jundafeng on 3/23/22. // #ifndef DGTD_KERNELS_CU #define DGTD_KERNELS_CU #include "kernels.cuh" /* Basic kernel: one tetra per thread * Opt1: one block per tetra, one entry per thread, registers can be shuffled. * New kernels can be parallelized in a smarter way: * 1. volume integral can be parallelized in (12,12) * 2. surface integral can be parallelized in (6,6) * 3. coupling integral can be parallelized in (6,6) * 4. excitation can be parallelized in (12,1) */ //-----------------------CONSTANTS-------------------------------------------------------------------------------------------- __device__ constexpr int GAUSS_POINT_NUM[4] = {6, 9, 9, 9}; __device__ constexpr int TetPolyOrderDim[4] = {6, 12, 30, 45}; __device__ constexpr int FacePolyOrderDim[4] = {3, 6, 12, 18}; #define g6_a 0.816847572980459 #define g6_b (1.0 - g6_a) / 2.0 #define g6_c 0.108103018168070 #define g6_d (1.0 - g6_c) / 2.0 #define g6_W1 0.109951743655322 #define g6_W2 0.223381589678011 #define g9_a 0.437525248383384 #define g9_b 1.0 - 2.0 * g9_a #define g9_c 0.797112651860071 #define g9_d 0.165409927389841 #define g9_e 1.0 - g9_c - g9_d #define g9_W1 0.205950504760887 #define g9_W2 0.063691414286223 #if defined(DGTD_USE_CUDA) #include #include #include #include "cuda_utils.h" __device__ fp_t_ts g2d_6[6][4] = { g6_a, g6_b, g6_b, g6_W1, g6_b, g6_a, g6_b, g6_W1, g6_b, g6_b, g6_a, g6_W1, g6_c, g6_d, g6_d, g6_W2, g6_d, g6_c, g6_d, g6_W2, g6_d, g6_d, g6_c, g6_W2}; // added by Seung-Cheol Lee & Marinos N. Vouvakis 08/12/02 // Order = 5 __device__ fp_t_ts g2d_9[9][4] = { g9_b, g9_a, g9_a, g9_W1, g9_a, g9_b, g9_a, g9_W1, g9_a, g9_a, g9_b, g9_W1, g9_c, g9_d, g9_e, g9_W2, g9_c, g9_e, g9_d, g9_W2, g9_d, g9_c, g9_e, g9_W2, g9_d, g9_e, g9_c, g9_W2, g9_e, g9_c, g9_d, g9_W2, g9_e, g9_d, g9_c, g9_W2}; // __device__ const int FACE_BASIS_IJK[6][2] = { // {1, 2}, // {0, 2}, // {0, 1}, // {1, 2}, // {0, 2}, // {0, 1}, // }; __device__ const int FACE_BASIS_IJK[18][3] = { {1, 2, -1}, {0, 2, -1}, {0, 1, -1}, {1, 2, -1}, {0, 2, -1}, {0, 1, -1}, {0, 1, 2}, {0, 1, 2}, {1, 2, -1}, {0, 2, -1}, {0, 1, -1}, {0, 1, 2}, {0, 1, 2}, {0, 1, 2}, {0, 1, 2}, {0, 1, 2}, {0, 1, 2}, {0, 1, 2} }; __device__ const int FAC2TET[4][18] = { {5, 4, 3, 11, 10, 9, 12, 13, 25, 24, 23, 26, 30, 31, 32, 42, 43, 44}, {5, 2, 1, 11, 8, 7, 14, 15, 25, 22, 21, 27, 33, 34, 35, 42, 43, 44}, {4, 2, 0, 10, 8, 6, 16, 17, 24, 22, 20, 28, 36, 37, 38, 42, 43, 44}, {3, 1, 0, 9, 7, 6, 18, 19, 23, 21, 20, 29, 39, 40, 41, 42, 43, 44} }; #endif //---------------------------------------------------------------------------------------------------------------------------- #if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL) __global__ void makeNeighField(int* MappingArray, fp_t_ts* Field, fp_t_ts* auxiliar, int elements) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if(idx < elements){ auxiliar[idx] = Field[MappingArray[idx]]; // printf("%d\n", MappingArray[idx]); } } __global__ void makeNeighFieldRegular(int* MappingArray, fp_t_ts* Field, fp_t_ts* auxiliar, int elements, int polyOrder){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if(idx < NumOfFaces * FacePolyOrderDim[polyOrder] * elements){ int blockId = idx / FacePolyOrderDim[polyOrder]; int basisId = idx % FacePolyOrderDim[polyOrder]; int tetId = blockId / NumOfFaces; int faceId = blockId % NumOfFaces; auxiliar[(faceId * elements + tetId) * FacePolyOrderDim[polyOrder] + basisId] = Field[MappingArray[idx]]; // for(int i = 0; i < NumOfFaces; i++){ // auxiliar[idx + i * elements] = Field[MappingArray[idx + i * FacePolyOrderDim[polyOrder]]]; // } } } __global__ void makeNeighFieldRegular_custom(int* MappingArray, fp_t_ts* Field, fp_t_ts* auxiliar, int elements, int polyOrder, int numFaces){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if(idx < numFaces * FacePolyOrderDim[polyOrder] * elements){ int blockId = idx / FacePolyOrderDim[polyOrder]; int basisId = idx % FacePolyOrderDim[polyOrder]; int tetId = blockId / numFaces; int faceId = blockId % numFaces; auxiliar[(faceId * elements + tetId) * FacePolyOrderDim[polyOrder] + basisId] = Field[MappingArray[idx]]; } } __global__ void addCouplingResults(fp_t_ts* Field, fp_t_ts* auxiliar, int* neighs, int* neighsOffset, int elements){ int tet = blockIdx.x * blockDim.y + threadIdx.y; if(tet < elements){ int basis = blockDim.x; fp_t_ts fieldAux = 0.0; int offset = (neighsOffset[tet] - neighsOffset[0]) * basis; for(int i = 0; i < neighs[tet]; i++){ fieldAux += auxiliar[offset + basis * i + threadIdx.x]; } Field[tet * basis + threadIdx.x] += fieldAux; } } __global__ void addCouplingResultsRegular(fp_t_ts* Field, fp_t_ts* auxiliar, int elements){ int tet = blockIdx.x * blockDim.y + threadIdx.y; if(tet < elements){ int basis = blockDim.x; fp_t_ts fieldAux = 0.0; for(int i = 0; i < NumOfFaces; i++){ fieldAux += auxiliar[tet * basis + elements * basis * i + threadIdx.x]; } Field[tet * basis + threadIdx.x] += fieldAux; } } __global__ void addCouplingResultsRegular_custom(fp_t_ts* Field, fp_t_ts* auxiliar, int elements, int numFaces){ int tet = blockIdx.x * blockDim.y + threadIdx.y; if(tet < elements) { int basis = blockDim.x; fp_t_ts fieldAux = 0.0; for(int i = 0; i < numFaces; i++) { fieldAux += auxiliar[tet * basis + elements * basis * i + threadIdx.x]; } Field[tet * basis + threadIdx.x] += fieldAux; } } __global__ void fill_ones_cols(fp_t_ts* __restrict__ M, int rows, int cols, int ld) { int r = blockIdx.x * blockDim.x + threadIdx.x; int c = blockIdx.y * blockDim.y + threadIdx.y; if (r < rows && c < cols) M[(size_t)c * ld + r] = 1.0f; } // 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 // // 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 // // 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 // __device__ fp_t_ts dotP3131(const fp_t_ts *v1, const fp_t_ts *v2){ fp_t_ts result; result = v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]; return result; } __device__ void crossP3(const fp_t_ts *v1, const fp_t_ts *v2, fp_t_ts *result){ result[0] = v1[1] * v2[2] - v1[2] * v2[1]; result[1] = -(v1[0] * v2[2] - v1[2] * v2[0]); result[2] = v1[0] * v2[1] - v1[1] * v2[0]; } __device__ void scale3(fp_t_ts *v1, const fp_t_ts factor){ v1[0] = v1[0] * factor; v1[1] = v1[1] * factor; v1[2] = v1[2] * factor; } __device__ void scale3(const fp_t_ts *v1, const fp_t_ts factor, fp_t_ts *v2){ v2[0] = v1[0] * factor; v2[1] = v1[1] * factor; v2[2] = v1[2] * factor; } __device__ void unit3(fp_t_ts *v1){ fp_t_ts work[3]; fp_t_ts mag = sqrt(v1[0] * v1[0] + v1[1] * v1[1] + v1[2] * v1[2]); work[0] = v1[0] / mag; work[1] = v1[1] / mag; work[2] = v1[2] / mag; v1[0] = work[0]; v1[1] = work[1]; v1[2] = work[2]; } __device__ void plus3(fp_t_ts *v1, fp_t_ts *v2){ v1[0] = v1[0] + v2[0]; v1[1] = v1[1] + v2[1]; v1[2] = v1[2] + v2[2]; } __device__ void set3(const fp_t_ts *v1, fp_t_ts *v2){ v2[0] = v1[0]; v2[1] = v1[1]; v2[2] = v1[2]; } __device__ void plus3(const fp_t_ts *v1, const fp_t_ts *v2, fp_t_ts *result){ result[0] = v1[0] + v2[0]; result[1] = v1[1] + v2[1]; result[2] = v1[2] + v2[2]; } __device__ void plus33(const fp_t_ts *v1, const fp_t_ts *v2, const fp_t_ts *v3, fp_t_ts *result){ result[0] = v1[0] + v2[0] + v3[0]; result[1] = v1[1] + v2[1] + v3[1]; result[2] = v1[2] + v2[2] + v3[2]; } __device__ void minus3(const fp_t_ts *v1, const fp_t_ts *v2, fp_t_ts *result){ result[0] = v1[0] - v2[0]; result[1] = v1[1] - v2[1]; result[2] = v1[2] - v2[2]; } __device__ void reset3(fp_t_ts *result){ result[0] = 0.0; result[1] = 0.0; result[2] = 0.0; } __device__ void geometry3(const fp_t_ts *nd_coord, fp_t_ts lvtr[][3], fp_t_ts avtr[][3], fp_t_ts *vol){ int i, i1, i2, i3; fp_t_ts vv[3], tt[3]; for (i = 0; i < 3; i++){ minus3(&nd_coord[(i + 1) * 3], &nd_coord[0], lvtr[i]); } for (i = 0; i < 3; i++){ i1 = (i + 1) % 4; i2 = (i + 2) % 4; i3 = (i + 3) % 4; minus3(&nd_coord[i2 * 3], &nd_coord[i1 * 3], tt); minus3(&nd_coord[i3 * 3], &nd_coord[i1 * 3], vv); crossP3(tt, vv, avtr[i]); scale3(avtr[i], 0.5); minus3(&nd_coord[i * 3], &nd_coord[i1 * 3], tt); if (dotP3131(avtr[i], tt) < 0.0) scale3(avtr[i], -1); ; } reset3(avtr[3]); plus33(avtr[0], avtr[1], avtr[2], avtr[3]); scale3(avtr[3], -1); (*vol) = dotP3131(avtr[2], lvtr[1]) / 3.0; } __device__ fp_t_ts magnitude3(const fp_t_ts *v1){ return sqrt(v1[0] * v1[0] + v1[1] * v1[1] + v1[2] * v1[2]); } __device__ void tet_center(const fp_t_ts *tet_nd_coord, fp_t_ts *tet_center){ tet_center[0] = (tet_nd_coord[0 * 3 + 0] + tet_nd_coord[1 * 3 + 0] + tet_nd_coord[2 * 3 + 0] + tet_nd_coord[3 * 3 + 0]) * 0.25; tet_center[1] = (tet_nd_coord[0 * 3 + 1] + tet_nd_coord[1 * 3 + 1] + tet_nd_coord[2 * 3 + 1] + tet_nd_coord[3 * 3 + 1]) * 0.25; tet_center[2] = (tet_nd_coord[0 * 3 + 2] + tet_nd_coord[1 * 3 + 2] + tet_nd_coord[2 * 3 + 2] + tet_nd_coord[3 * 3 + 2]) * 0.25; } // have to use 2d arrays in the device functions, using 1d array with offsets will result in erroneous values __device__ void geometry2(const fp_t_ts *nd_coord, const fp_t_ts *tet0_nd_coord, fp_t_ts *normal, fp_t_ts nvtr[][3], fp_t_ts *area){ fp_t_ts t0[3], t1[3], tv[3], tet0center[3]; minus3(&nd_coord[2 * 3], &nd_coord[1 * 3], t0); minus3(&nd_coord[0 * 3], &nd_coord[2 * 3], t1); crossP3(t0, t1, normal); (*area) = 0.5 * magnitude3(normal); scale3(normal, 1 / magnitude3(normal)); crossP3(normal, t0, nvtr[0]); crossP3(normal, t1, nvtr[1]); if (dotP3131(nvtr[0], t1) < 0.0) scale3(nvtr[0], -1); if (dotP3131(nvtr[1], t0) > 0.0) scale3(nvtr[1], -1); tet_center(tet0_nd_coord, tet0center); minus3(tet0center, &nd_coord[0 * 3], tv); if (dotP3131(tv, normal) > 0.0) scale3(normal, -1); } __device__ void geometry2(const fp_t_ts *nd_coord, fp_t_ts nvtr[][3], fp_t_ts *area){ fp_t_ts t0[3], t1[3], normal[3]; minus3(&nd_coord[2 * 3], &nd_coord[1 * 3], t0); minus3(&nd_coord[0 * 3], &nd_coord[2 * 3], t1); crossP3(t0, t1, normal); (*area) = 0.5 * magnitude3(normal); scale3(normal, 1 / magnitude3(normal)); crossP3(normal, t0, nvtr[0]); crossP3(normal, t1, nvtr[1]); if (dotP3131(nvtr[0], t1) < 0.0) scale3(nvtr[0], -1); if (dotP3131(nvtr[1], t0) > 0.0) scale3(nvtr[1], -1); } __device__ void dotP3331(const fp_t_ts *tensor, const fp_t_ts *v1, fp_t_ts *result){ result[0] = v1[0] * tensor[0] + v1[1] * tensor[1] + v1[2] * tensor[2]; result[1] = v1[0] * tensor[3] + v1[1] * tensor[4] + v1[2] * tensor[5]; result[2] = v1[0] * tensor[6] + v1[1] * tensor[7] + v1[2] * tensor[8]; } __device__ void makeCoeff33(const fp_t_ts avtr[][3], const fp_t_ts *epsr, fp_t_ts coeffA[][3]){ int i, j; fp_t_ts temp[3]; for (i = 0; i < 3; i++){ for (j = 0; j < 3; j++){ dotP3331(epsr, avtr[j], temp); coeffA[i][j] = dotP3131(temp, avtr[i]); } } } __device__ void quadObtention(int np, int id, fp_t_ts &zeta0, fp_t_ts &zeta1, fp_t_ts &zeta2, fp_t_ts &weight){ if (id < 0 || id >= np){ printf("Error in quadObtention"); return; } switch (np){ case 6: zeta0 = g2d_6[id][0]; zeta1 = g2d_6[id][1]; zeta2 = g2d_6[id][2]; weight = g2d_6[id][3]; break; case 9: zeta0 = g2d_9[id][0]; zeta1 = g2d_9[id][1]; zeta2 = g2d_9[id][2]; weight = g2d_9[id][3]; break; default: break; } } __device__ void faceBasisW(const fp_t_ts *node_coords, const fp_t_ts *zeta, const int p, fp_t_ts *W){ fp_t_ts gradZeta[3][3]; fp_t_ts area; geometry2(node_coords, gradZeta, &area); scale3(gradZeta[0], 0.5 / area); scale3(gradZeta[1], 0.5 / area); gradZeta[2][0] = 0; gradZeta[2][1] = 0; gradZeta[2][2] = 0; minus3(gradZeta[2], gradZeta[0], gradZeta[2]); minus3(gradZeta[2], gradZeta[1], gradZeta[2]); int i = FACE_BASIS_IJK[p][0]; int j = FACE_BASIS_IJK[p][1]; int k = FACE_BASIS_IJK[p][2]; switch (p){ case 0: scale3(gradZeta[j], zeta[i]); scale3(gradZeta[i], zeta[j]); minus3(gradZeta[j], gradZeta[i], W); break; case 1: scale3(gradZeta[j], zeta[i]); scale3(gradZeta[i], zeta[j]); minus3(gradZeta[j], gradZeta[i], W); break; case 2: scale3(gradZeta[j], zeta[i]); scale3(gradZeta[i], zeta[j]); minus3(gradZeta[j], gradZeta[i], W); break; case 3: scale3(gradZeta[j], zeta[i]); scale3(gradZeta[i], zeta[j]); plus3(gradZeta[j], gradZeta[i], W); scale3(W, 4.); break; case 4: scale3(gradZeta[j], zeta[i]); scale3(gradZeta[i], zeta[j]); plus3(gradZeta[j], gradZeta[i], W); scale3(W, 4.); break; case 5: scale3(gradZeta[j], zeta[i]); scale3(gradZeta[i], zeta[j]); plus3(gradZeta[j], gradZeta[i], W); scale3(W, 4.); break; case 6: scale3(gradZeta[k], zeta[i]); scale3(gradZeta[i], zeta[k]); minus3(gradZeta[k], gradZeta[i], W); scale3(W, 4 * zeta[j]); break; case 7: scale3(gradZeta[j], zeta[i]); scale3(gradZeta[i], zeta[j]); minus3(gradZeta[j], gradZeta[i], W); scale3(W, 4. * zeta[k]); break; case 8: scale3(gradZeta[j], zeta[i] * zeta[i] - 2. * zeta[i] * zeta[j]); scale3(gradZeta[i], zeta[j] * zeta[j] - 2. * zeta[i] * zeta[j]); minus3(gradZeta[j], gradZeta[i], W); scale3(W, 4.); break; case 9: scale3(gradZeta[j], zeta[i] * zeta[i] - 2. * zeta[i] * zeta[j]); scale3(gradZeta[i], zeta[j] * zeta[j] - 2. * zeta[i] * zeta[j]); minus3(gradZeta[j], gradZeta[i], W); scale3(W, 4.); break; case 10: scale3(gradZeta[j], zeta[i] * zeta[i] - 2. * zeta[i] * zeta[j]); scale3(gradZeta[i], zeta[j] * zeta[j] - 2. * zeta[i] * zeta[j]); minus3(gradZeta[j], gradZeta[i], W); scale3(W, 4.); break; case 11: scale3(gradZeta[k], zeta[i] * zeta[j]); scale3(gradZeta[j], zeta[i] * zeta[k]); scale3(gradZeta[i], zeta[k] * zeta[j]); plus33(gradZeta[k], gradZeta[j], gradZeta[i], W); scale3(W, 8.); break; case 12: scale3(gradZeta[k], zeta[j]); scale3(gradZeta[j], zeta[k]); minus3(gradZeta[k], gradZeta[j], W); scale3(W, 4. * zeta[i] * zeta[i]); break; case 13: scale3(gradZeta[k], zeta[i]); scale3(gradZeta[i], zeta[k]); minus3(gradZeta[k], gradZeta[i], W); scale3(W, 4. * zeta[j] * zeta[j]); break; case 14: scale3(gradZeta[j], zeta[i]); scale3(gradZeta[i], zeta[j]); minus3(gradZeta[j], gradZeta[i], W); scale3(W, 4. * zeta[k] * zeta[k]); break; case 15: W[0] = W[1] = W[2] = 0; break; case 16: W[0] = W[1] = W[2] = 0; break; case 17: W[0] = W[1] = W[2] = 0; break; default: W[0] = W[1] = W[2] = 0; printf("Check faceBasisW: you are asking for somthing out of scope"); break; } } __device__ void getCartesianCoord(const fp_t_ts *node_coords, const fp_t_ts *zeta, fp_t_ts *pnt){ pnt[0] = node_coords[0 * 3 + 0] * zeta[0] + node_coords[1 * 3 + 0] * zeta[1] + node_coords[2 * 3 + 0] * zeta[2]; pnt[1] = node_coords[0 * 3 + 1] * zeta[0] + node_coords[1 * 3 + 1] * zeta[1] + node_coords[2 * 3 + 1] * zeta[2]; pnt[2] = node_coords[0 * 3 + 2] * zeta[0] + node_coords[1 * 3 + 2] * zeta[1] + node_coords[2 * 3 + 2] * zeta[2]; } __device__ void PlaneWaveFields(int ExcitationFlag, fp_t_ts t, fp_t_ts to, fp_t_ts tau, const fp_t_ts *r, const fp_t_ts *ro, fp_t_ts freq_m, fp_t_ts *kvtr, fp_t_ts eta, fp_t_ts Emagnitude, fp_t_ts &IncidExcit_E, fp_t_ts &IncidExcit_H) { fp_t_ts omega = 2.0 * Pi * freq_m * MEGA; fp_t_ts Exponent; fp_t_ts SinModul; fp_t_ts Neuman; fp_t_ts work[3]; // printf("%d", ExcitationFlag); switch (ExcitationFlag){ case 0: minus3(r, ro, work); Exponent = t - to - dotP3131(kvtr, work) / Vo; // Time Harmonic if (Exponent >= 0.0){ SinModul = cos(omega * Exponent); IncidExcit_E = Emagnitude * SinModul; IncidExcit_H = (Emagnitude / eta) * SinModul; }else{ IncidExcit_E = 0.0; IncidExcit_H = 0.0; } break; case 1: // Gauss Pulse minus3(r, ro, work); Exponent = t - to - dotP3131(kvtr, work) / Vo; SinModul = cos(omega * Exponent); IncidExcit_E = Emagnitude * SinModul * exp(-(Exponent * Exponent) / (tau * tau)); IncidExcit_H = (Emagnitude / eta) * SinModul * exp(-(Exponent * Exponent) / (tau * tau)); break; case 2: // Neuman Pulse minus3(r, ro, work); Exponent = t - to - dotP3131(kvtr, work) / Vo; Neuman = (2.0 * Exponent) / (tau * tau); IncidExcit_E = (Emagnitude * Neuman) * exp(-(Exponent * Exponent) / (tau * tau)); IncidExcit_H = ((Emagnitude / eta) * Neuman) * exp(-(Exponent * Exponent) / (tau * tau)); break; case 3: { fp_t_ts work[3]; minus3(r, ro, work); // work = r - ro Exponent = t - dotP3131(kvtr, work) / Vo; if (Exponent >= 0.0 && Exponent <= to) { // Chirp parameters fp_t_ts f_end = freq_m * MEGA; // Center frequency in Hz fp_t_ts B = tau * MEGA; // Bandwidth in Hz fp_t_ts f0 = f_end - B; // Start frequency fp_t_ts f1 = f_end; // End frequency fp_t_ts Tchirp = to; // Chirp duration // Chirp phase fp_t_ts chirpArg = 2.0 * Pi * f0 * Exponent + Pi * (f1 - f0) / Tchirp * Exponent * Exponent; // Final tapered chirp signal fp_t_ts chirpSignal = sin(chirpArg); IncidExcit_E = Emagnitude * chirpSignal; IncidExcit_H = (Emagnitude / eta) * chirpSignal; } else { IncidExcit_E = 0.0; IncidExcit_H = 0.0; } break; } default: break; } } __global__ void CalculatePadeFreq(fp_t* a_k, fp_t* b_k, int M_global, int N, int* constantM, cuDoubleComplex* Hf){ int m = blockIdx.x * blockDim.x + threadIdx.x; if(m < M_global){ fp_t sum_real_a = 0.0; fp_t sum_imag_a = 0.0; fp_t sum_real_b = 0.0; fp_t sum_imag_b = 0.0; for (int n = 0; n < N; n++) { fp_t angle = -2.0f * Pi * constantM[m] * n / M_global; fp_t cos_val, sin_val; sincos(angle, &sin_val, &cos_val); sum_real_a += a_k[n] * cos_val; sum_imag_a += a_k[n] * sin_val; sum_real_b += b_k[n] * cos_val; sum_imag_b += b_k[n] * sin_val; } Hf[m] = cuCdiv(make_cuDoubleComplex(sum_real_a, sum_imag_a), make_cuDoubleComplex(sum_real_b, sum_imag_b)); } return; } // === Utility Functions for Double Precision === __device__ __forceinline__ fp_t_ts cleanSmall(fp_t_ts val, fp_t_ts eps = 1e-12) { return fabs(val) < eps ? 0.0 : val; } __device__ __forceinline__ void cleanVec3(fp_t_ts* v, fp_t_ts eps = 1e-12) { for (int i = 0; i < 3; ++i) v[i] = cleanSmall(v[i], eps); } // -------------------------------------------------------------------------------------- // Plane Wave Exciation Update // -------------------------------------------------------------------------------------- __device__ void E_updateinc_PW(const fp_t_ts t, const fp_t_ts to, const fp_t_ts tau, const fp_t_ts freq_m, const fp_t_ts eta, const fp_t_ts Emagnitude, const fp_t_ts *ro, const fp_t_ts theta, const fp_t_ts phi, const fp_t_ts *Epol, const int ExcitationFlag, const fp_t_ts *r, const fp_t_ts *normalvtr, fp_t_ts *nxHinc, fp_t_ts *nxnxEinc) { fp_t_ts IncidExcit_E, IncidExcit_H; fp_t_ts theta_in_rad = theta * (fp_t_ts)Pi / (fp_t_ts)180.0; fp_t_ts phi_in_rad = phi * (fp_t_ts)Pi / (fp_t_ts)180.0; fp_t_ts kvtr[3], Hpol[3], work[3]; kvtr[0] = sin(theta_in_rad) * cos(phi_in_rad); kvtr[1] = sin(theta_in_rad) * sin(phi_in_rad); kvtr[2] = cos(theta_in_rad); // Fields in Time Domain PlaneWaveFields(ExcitationFlag, t, to, tau, r, ro, freq_m, kvtr, eta, Emagnitude, IncidExcit_E, IncidExcit_H); // EField crossP3(normalvtr, Epol, work); crossP3(normalvtr, work, nxnxEinc); scale3(nxnxEinc, IncidExcit_E); // HField crossP3(kvtr, Epol, work); crossP3(normalvtr, work, Hpol); scale3(Hpol, IncidExcit_H, nxHinc); /* if (r[2] == 0.0 && r[1] < 1e-4 && r[1] > 0.0 && r[0] < 5e-4 && r[0] > 0.0) { printf( "r=(%e,%e,%e) | nxnxE =(%e,%e,%e) | nxH =(%e,%e,%e) \n", (double)r[0], (double)r[1], (double)r[2], (double)nxnxEinc[0], (double)nxnxEinc[1], (double)nxnxEinc[2], (double)nxHinc[0], (double)nxHinc[1], (double)nxHinc[2] ); } */ for (int i = 0; i < 3; ++i) { if (fabs(nxHinc[i]) < (fp_t_ts)1e-12) nxHinc[i] = 0.0; if (fabs(nxnxEinc[i]) < (fp_t_ts)1e-12) nxnxEinc[i] = 0.0; } } __device__ void H_updateinc_PW(const fp_t_ts t, const fp_t_ts to, const fp_t_ts tau, const fp_t_ts freq_m, const fp_t_ts eta, const fp_t_ts Emagnitude, const fp_t_ts *ro, const fp_t_ts theta, const fp_t_ts phi, const fp_t_ts *Epol, const int ExcitationFlag, const fp_t_ts *r, const fp_t_ts *normalvtr, fp_t_ts *nxEinc, fp_t_ts *nxnxHinc) { fp_t_ts IncidExcit_E, IncidExcit_H; fp_t_ts theta_in_rad = theta * (fp_t_ts)Pi / (fp_t_ts)180.0; fp_t_ts phi_in_rad = phi * (fp_t_ts)Pi / (fp_t_ts)180.0; fp_t_ts kvtr[3], Hpol[3], work[3]; kvtr[0] = sin(theta_in_rad) * cos(phi_in_rad); kvtr[1] = sin(theta_in_rad) * sin(phi_in_rad); kvtr[2] = cos(theta_in_rad); PlaneWaveFields(ExcitationFlag, t, to, tau, r, ro, freq_m, kvtr, eta, Emagnitude, IncidExcit_E, IncidExcit_H); // EField crossP3(normalvtr, Epol, work); scale3(work, IncidExcit_E, nxEinc); // HField crossP3(kvtr, Epol, Hpol); crossP3(normalvtr, Hpol, work); crossP3(normalvtr, work, nxnxHinc); scale3(nxnxHinc, IncidExcit_H); for (int i = 0; i < 3; ++i) { if (fabs(nxEinc[i]) < (fp_t_ts)1e-12) nxEinc[i] = 0.0; if (fabs(nxnxHinc[i]) < (fp_t_ts)1e-12) nxnxHinc[i] = 0.0; } } __global__ void addExcitationH_PML( int* ExcitationFacesCnt, int* ExcitationFacesOffset, int* ExcitationFacesNum, int excitationCNT, int excitationPMLCNT, int8_t* map, ExcitationProp exci_prop, int PolyFlag, fp_t_ts dt_muo, fp_t_ts t, fp_t_ts* nd_coords_tet, fp_t_ts* nd_coords_face, fp_t_ts* Z_face, fp_t_ts* InvMat, fp_t_ts* Field) { int tet_id = blockIdx.x * blockDim.x + threadIdx.x; if (tet_id < excitationCNT) { //-----TODO: Make this part dynamic---------------------------- const int TetPolyOrderDim2 = TetPolyOrderDim[3]; const int FacePolyOrderDim2 = FacePolyOrderDim[3]; fp_t_ts nxe_inc[TetPolyOrderDim2] = {0}; fp_t_ts nxnxh_inc[TetPolyOrderDim2] = {0}; //------------------------------------------------------------- fp_t_ts Z = 0.0; for (int i = 0; i < ExcitationFacesCnt[tet_id]; i++){ int face_id = ExcitationFacesOffset[tet_id] + i; Z = Z_face[face_id]; fp_t_ts nvtr[2][3]; fp_t_ts normvtr[3] = {0}; fp_t_ts area = 0; fp_t_ts nxentry; fp_t_ts nxnxentry; fp_t_ts z[3], w[3], r[3]; fp_t_ts wgt; //-----------------TODO: Make this part dynamic-------------------------------------------- fp_t_ts nxelem[FacePolyOrderDim2] = {0}; fp_t_ts nxnxelem[FacePolyOrderDim2] = {0}; //----------------------------------------------------------------------------------------- fp_t_ts nxE_inc[3]; fp_t_ts nxnxH_inc[3]; geometry2(&nd_coords_face[3 * 3 * face_id], &nd_coords_tet[4 * 3 * tet_id], normvtr, nvtr, &area); for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){ nxentry = 0; nxnxentry = 0; for (int k = 0; k < GAUSS_POINT_NUM[PolyFlag]; k++){ quadObtention(GAUSS_POINT_NUM[PolyFlag], k, z[0], z[1], z[2], wgt); faceBasisW(&nd_coords_face[3 * 3 * face_id], z, j, w); getCartesianCoord(&nd_coords_face[3 * 3 * face_id], z, r); H_updateinc_PW(t, exci_prop.to, exci_prop.tau, exci_prop.freq_m, Z, exci_prop.Emagnitude, exci_prop.ro, exci_prop.theta, exci_prop.phi, exci_prop.Epol, exci_prop.ExcitationFlag, r, normvtr, nxE_inc, nxnxH_inc); if (!(isnan(nxE_inc[0]))){ nxentry += area * wgt * dotP3131(w, nxE_inc); nxnxentry += area * wgt * dotP3131(w, nxnxH_inc); } } //nxelem[j] += nxentry; //nxnxelem[j] += nxnxentry; nxelem[j] += cleanSmall(nxentry); nxnxelem[j] += cleanSmall(nxnxentry); } for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){ // printf("%d\n", ExcitationFacesNum[face_id]); nxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxelem[j]; nxnxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxnxelem[j]; } } for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){ nxe_inc[i] *= -dt_muo; //nxe_inc[i] += (-Z * dt_muo) * nxnxh_inc[i]; nxe_inc[i] *= map[tet_id * TetPolyOrderDim[PolyFlag] + i]; } fp_t_ts sum; for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){ sum = 0.0; for (int j = 0; j < TetPolyOrderDim[PolyFlag]; j++){ sum += InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i] * nxe_inc[j] * map[tet_id * TetPolyOrderDim[PolyFlag] + i]; } sum = cleanSmall(sum); if (tet_id < excitationPMLCNT) { Field[tet_id * TetPolyOrderDim[PolyFlag] + i] -= 0.5 * sum; // Scattered Region } else { Field[tet_id * TetPolyOrderDim[PolyFlag] + i] += 0.5 * sum; // Total Region } } } } __global__ void addExcitationE_PML( int* ExcitationFacesCnt, int* ExcitationFacesOffset, int* ExcitationFacesNum, int excitationCNT, int excitationPMLCNT, int8_t* map, ExcitationProp exci_prop, int PolyFlag, fp_t_ts dt_epsilon, fp_t_ts t, fp_t_ts* nd_coords_tet, fp_t_ts* nd_coords_face, fp_t_ts* Z_face, fp_t_ts* InvMat, fp_t_ts* Field){ int tet_id = blockIdx.x * blockDim.x + threadIdx.x; if (tet_id < excitationCNT){ //-----TODO: Make this part dynamic---------------------------- const int TetPolyOrderDim2 = TetPolyOrderDim[3]; const int FacePolyOrderDim2 = FacePolyOrderDim[3]; fp_t_ts nxh_inc[TetPolyOrderDim2] = {0}; fp_t_ts nxnxe_inc[TetPolyOrderDim2] = {0}; //------------------------------------------------------------- fp_t_ts Y = 0.0; for (int i = 0; i < ExcitationFacesCnt[tet_id]; i++){ int face_id = ExcitationFacesOffset[tet_id] + i; Y = 1 / Z_face[face_id]; fp_t_ts nvtr[2][3]; fp_t_ts normvtr[3] = {0}; fp_t_ts area = 0; fp_t_ts nxentry; fp_t_ts nxnxentry; fp_t_ts z[3], w[3], r[3]; fp_t_ts wgt; //-----------------TODO: Make this part dynamic-------------------------------------------- fp_t_ts nxelem[FacePolyOrderDim2] = {0}; fp_t_ts nxnxelem[FacePolyOrderDim2] = {0}; //----------------------------------------------------------------------------------------- fp_t_ts nxH_inc[3]; fp_t_ts nxnxE_inc[3]; geometry2(&nd_coords_face[3 * 3 * face_id], &nd_coords_tet[4 * 3 * tet_id], normvtr, nvtr, &area); for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){ nxentry = 0; nxnxentry = 0; for (int k = 0; k < GAUSS_POINT_NUM[PolyFlag]; k++){ quadObtention(GAUSS_POINT_NUM[PolyFlag], k, z[0], z[1], z[2], wgt); faceBasisW(&nd_coords_face[3 * 3 * face_id], z, j, w); getCartesianCoord(&nd_coords_face[3 * 3 * face_id], z, r); E_updateinc_PW(t, exci_prop.to, exci_prop.tau, exci_prop.freq_m, Z_face[face_id], exci_prop.Emagnitude, exci_prop.ro, exci_prop.theta, exci_prop.phi, exci_prop.Epol, exci_prop.ExcitationFlag, r, normvtr, nxH_inc, nxnxE_inc); if (!(isnan(nxH_inc[0]))){ nxentry += area * wgt * dotP3131(w, nxH_inc); nxnxentry += area * wgt * dotP3131(w, nxnxE_inc); } } //nxelem[j] += nxentry; //nxnxelem[j] += nxnxentry; nxelem[j] += cleanSmall(nxentry); nxnxelem[j] += cleanSmall(nxnxentry); } for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){ // printf("%d\n", ExcitationFacesNum[face_id]); nxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxelem[j]; nxnxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxnxelem[j]; } } for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){ nxh_inc[i] *= dt_epsilon; //nxh_inc[i] += (-Y * dt_epsilon) * nxnxe_inc[i]; nxh_inc[i] *= map[tet_id * TetPolyOrderDim[PolyFlag] + i]; } fp_t_ts sum; for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){ sum = 0.0; for (int j = 0; j < TetPolyOrderDim[PolyFlag]; j++) { sum += InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i] * nxh_inc[j] * map[tet_id * TetPolyOrderDim[PolyFlag] + i]; } sum = cleanSmall(sum); if (tet_id < excitationPMLCNT) { Field[tet_id * TetPolyOrderDim[PolyFlag] + i] -= 0.5 * sum; // Scattered Region } else { Field[tet_id * TetPolyOrderDim[PolyFlag] + i] += 0.5 * sum; // Total Region } } } } __global__ void addExcitationE( int* ExcitationFacesCnt, int* ExcitationFacesOffset, int* ExcitationFacesNum, int excitationCNT, int8_t* map, ExcitationProp exci_prop, int PolyFlag, fp_t_ts dt_epsilon, fp_t_ts t, fp_t_ts* nd_coords_tet, fp_t_ts* nd_coords_face, fp_t_ts* Z_face, fp_t_ts* InvMat, fp_t_ts* Field) { int tet_id = blockIdx.x * blockDim.x + threadIdx.x; if (tet_id < excitationCNT) { //-----TODO: Make this part dynamic---------------------------- const int TetPolyOrderDim2 = TetPolyOrderDim[3]; const int FacePolyOrderDim2 = FacePolyOrderDim[3]; fp_t_ts nxh_inc[TetPolyOrderDim2] = {0}; fp_t_ts nxnxe_inc[TetPolyOrderDim2] = {0}; //------------------------------------------------------------- fp_t_ts Y = 0.0; for (int i = 0; i < ExcitationFacesCnt[tet_id]; i++) { int face_id = ExcitationFacesOffset[tet_id] + i; Y = 1 / Z_face[face_id]; fp_t_ts nvtr[2][3]; fp_t_ts normvtr[3] = {0}; fp_t_ts area = 0; fp_t_ts nxentry; fp_t_ts nxnxentry; fp_t_ts z[3], w[3], r[3]; fp_t_ts wgt; //-----------------TODO: Make this part dynamic-------------------------------------------- fp_t_ts nxelem[FacePolyOrderDim2] = {0}; fp_t_ts nxnxelem[FacePolyOrderDim2] = {0}; //----------------------------------------------------------------------------------------- fp_t_ts nxH_inc[3]; fp_t_ts nxnxE_inc[3]; geometry2(&nd_coords_face[3 * 3 * face_id], &nd_coords_tet[4 * 3 * tet_id], normvtr, nvtr, &area); for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++) { nxentry = 0; nxnxentry = 0; for (int k = 0; k < GAUSS_POINT_NUM[PolyFlag]; k++) { quadObtention(GAUSS_POINT_NUM[PolyFlag], k, z[0], z[1], z[2], wgt); faceBasisW(&nd_coords_face[3 * 3 * face_id], z, j, w); getCartesianCoord(&nd_coords_face[3 * 3 * face_id], z, r); E_updateinc_PW(t, exci_prop.to, exci_prop.tau, exci_prop.freq_m, Z_face[face_id], exci_prop.Emagnitude, exci_prop.ro, exci_prop.theta, exci_prop.phi,exci_prop.Epol, exci_prop.ExcitationFlag,r,normvtr,nxH_inc,nxnxE_inc); if (!(isnan(nxH_inc[0]))) { nxentry += area * wgt * dotP3131(w, nxH_inc); nxnxentry += area * wgt * dotP3131(w, nxnxE_inc); } /* if (r[2] == 0.0) { printf("r = (%.6e, %.6e, %.6e) | face_id=%d | q=%d | nxH_inc=(%.6e, %.6e, %.6e) | nxnxE_inc=(%.6e, %.6e, %.6e) | nxentry=%.6e | nxnxentry=%.6e\n", (double)r[0], (double)r[1], (double)r[2], face_id, k, (double)nxH_inc[0], (double)nxH_inc[1], (double)nxH_inc[2], (double)nxnxE_inc[0], (double)nxnxE_inc[1], (double)nxnxE_inc[2], (double)nxentry, (double)nxnxentry ); } */ } //printf("face_id=%d j=%d nxentry=%.6e nxnxentry=%.6e\n", face_id, j, (double)nxentry, (double)nxnxentry); nxelem[j] += nxentry; nxnxelem[j] += nxnxentry; } for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){ //printf("%d\n", ExcitationFacesNum[face_id]); nxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxelem[j]; nxnxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxnxelem[j]; //printf("tet_id=%d face_id=%d j=%d TetNode=%d nxh_inc=%.6e nxnxe_inc=%.6e\n", tet_id, face_id, j, FAC2TET[ExcitationFacesNum[face_id]][j], (double)nxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]], (double)nxnxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] ); } } for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){ nxh_inc[i] *= dt_epsilon; //nxh_inc[i] += (-Y * dt_epsilon) * nxnxe_inc[i]; nxh_inc[i] += (-Y * dt_epsilon) * nxnxe_inc[i]; nxh_inc[i] *= map[tet_id * TetPolyOrderDim[PolyFlag] + i]; //printf("tet_id=%d i=%d nxh_inc=%.6e nxnxe_inc=%6e Y=%6e dt_epsilon=%6e \n", tet_id, i, (double)nxh_inc[i], (double)nxnxe_inc[i], (double)Y, (double)dt_epsilon ); } fp_t_ts sum; for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){ sum = 0.0; for (int j = 0; j < TetPolyOrderDim[PolyFlag]; j++){ sum += InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i] * nxh_inc[j] * map[tet_id * TetPolyOrderDim[PolyFlag] + i]; //printf("InvMat[%d][%d]=%.6e | nxh_inc[%d]=%.6e | map=%d\n", j, i, (double)InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i], j, (double)nxh_inc[j], map[tet_id * TetPolyOrderDim[PolyFlag] + i]); } //printf("E-field update: tet_id=%d i=%d sum=%.6e\n", tet_id, i, (double)sum); Field[tet_id * TetPolyOrderDim[PolyFlag] + i] += sum; } } } __global__ void addExcitationH( int* ExcitationFacesCnt, int* ExcitationFacesOffset, int* ExcitationFacesNum, int excitationCNT, int8_t* map, ExcitationProp exci_prop, int PolyFlag, fp_t_ts dt_muo, fp_t_ts t, fp_t_ts* nd_coords_tet, fp_t_ts* nd_coords_face, fp_t_ts* Z_face, fp_t_ts* InvMat, fp_t_ts* Field) { int tet_id = blockIdx.x * blockDim.x + threadIdx.x; if (tet_id < excitationCNT) { //-----TODO: Make this part dynamic---------------------------- const int TetPolyOrderDim2 = TetPolyOrderDim[3]; const int FacePolyOrderDim2 = FacePolyOrderDim[3]; fp_t_ts nxe_inc[TetPolyOrderDim2] = {0}; fp_t_ts nxnxh_inc[TetPolyOrderDim2] = {0}; //------------------------------------------------------------- fp_t_ts Z = 0.0; for (int i = 0; i < ExcitationFacesCnt[tet_id]; i++){ int face_id = ExcitationFacesOffset[tet_id] + i; Z = Z_face[face_id]; fp_t_ts nvtr[2][3]; fp_t_ts normvtr[3] = {0}; fp_t_ts area = 0; fp_t_ts nxentry; fp_t_ts nxnxentry; fp_t_ts z[3], w[3], r[3]; fp_t_ts wgt; //-----------------TODO: Make this part dynamic-------------------------------------------- fp_t_ts nxelem[FacePolyOrderDim2] = {0}; fp_t_ts nxnxelem[FacePolyOrderDim2] = {0}; //----------------------------------------------------------------------------------------- fp_t_ts nxE_inc[3]; fp_t_ts nxnxH_inc[3]; geometry2(&nd_coords_face[3 * 3 * face_id], &nd_coords_tet[4 * 3 * tet_id], normvtr, nvtr, &area); for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++) { nxentry = 0; nxnxentry = 0; for (int k = 0; k < GAUSS_POINT_NUM[PolyFlag]; k++){ quadObtention(GAUSS_POINT_NUM[PolyFlag], k, z[0], z[1], z[2], wgt); faceBasisW(&nd_coords_face[3 * 3 * face_id], z, j, w); getCartesianCoord(&nd_coords_face[3 * 3 * face_id], z, r); H_updateinc_PW(t, exci_prop.to, exci_prop.tau, exci_prop.freq_m, Z, exci_prop.Emagnitude, exci_prop.ro, exci_prop.theta, exci_prop.phi,exci_prop.Epol, exci_prop.ExcitationFlag,r,normvtr,nxE_inc,nxnxH_inc); if (!(isnan(nxE_inc[0]))){ nxentry += area * wgt * dotP3131(w, nxE_inc); nxnxentry += area * wgt * dotP3131(w, nxnxH_inc); } } nxelem[j] += nxentry; nxnxelem[j] += nxnxentry; } for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){ // printf("%d\n", ExcitationFacesNum[face_id]); nxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxelem[j]; nxnxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxnxelem[j]; } } for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){ nxe_inc[i] *= -dt_muo; nxe_inc[i] += (-Z * dt_muo) * nxnxh_inc[i]; nxe_inc[i] *= map[tet_id * TetPolyOrderDim[PolyFlag] + i]; } fp_t_ts sum; for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){ sum = 0.0; for (int j = 0; j < TetPolyOrderDim[PolyFlag]; j++){ sum += InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i] * nxe_inc[j] * map[tet_id * TetPolyOrderDim[PolyFlag] + i]; } Field[tet_id * TetPolyOrderDim[PolyFlag] + i] += sum; } } } __global__ void checker(fp_t_ts* a, fp_t_ts* b, int length){ int idx = blockIdx.x * blockDim.x + threadIdx.x; if(idx < length){ if(a[idx] != b[idx]){ printf("ERROR %.10f\n", a[idx] - b[idx]); } } } // ---- math helpers for fp_t_ts ---- __device__ inline fp_t_ts my_abs(fp_t_ts x) { #if defined(__CUDACC__) || defined(__CUDA_ARCH__) // CUDA device intrinsics return fabsf(x); // if fp_t_ts=float // return fabs(x); // if fp_t_ts=double #else return std::fabs(x); #endif } // max |a[i]-b[i]|; optionally return index of worst component __device__ inline fp_t_ts max_abs_diff3(const fp_t_ts a[3], const fp_t_ts b[3], int* idx_out) { fp_t_ts d0 = my_abs(a[0] - b[0]); fp_t_ts d1 = my_abs(a[1] - b[1]); fp_t_ts d2 = my_abs(a[2] - b[2]); fp_t_ts md = d0; int mi = 0; if (d1 > md) { md = d1; mi = 1; } if (d2 > md) { md = d2; mi = 2; } if (idx_out) *idx_out = mi; return md; } // Print both vectors side by side (cast to double for printf) __device__ inline void print_vec3_same_line(const char* label, const fp_t_ts comp[3], const fp_t_ts ref[3]) { printf("%s: comp=(% .6e,% .6e,% .6e) | " "ref=(% .6e,% .6e,% .6e) | " "diff=(% .3e,% .3e,% .3e)\n", label, (double)comp[0], (double)comp[1], (double)comp[2], (double)ref[0], (double)ref[1], (double)ref[2], (double)(comp[0]-ref[0]), (double)(comp[1]-ref[1]), (double)(comp[2]-ref[2])); } // -------------------------------------------------------------------------------------- // Antenna Port Update // Implement the port excitation using analytical solution // TEM Port // -------------------------------------------------------------------------------------- // Port-only version of H-field excitation __global__ void addExcitationH_port( int* ExcitationFacesCnt, int* ExcitationFacesOffset, int* ExcitationFacesNum, int excitationCNT, int8_t* map, ExcitationProp* exci_prop_vec, int* PortFacePidx_d, int PolyFlag, fp_t_ts dt_muo, fp_t_ts t, fp_t_ts* nd_coords_tet, fp_t_ts* nd_coords_face, fp_t_ts* Z_face, fp_t_ts* InvMat, fp_t_ts* Field, fp_t_ts* Etan_qp_d, fp_t_ts* Htan_qp_d) { int tet_id = blockIdx.x * blockDim.x + threadIdx.x; if (tet_id < excitationCNT) { //-----TODO: Make this part dynamic---------------------------- const int TetPolyOrderDim2 = TetPolyOrderDim[3]; const int FacePolyOrderDim2 = FacePolyOrderDim[3]; fp_t_ts nxe_inc[TetPolyOrderDim2] = {0}; fp_t_ts nxnxh_inc[TetPolyOrderDim2] = {0}; //------------------------------------------------------------- fp_t_ts Z = 0.0; for (int i = 0; i < ExcitationFacesCnt[tet_id]; i++) { int face_id = ExcitationFacesOffset[tet_id] + i; //printf("face_id = %d\n",face_id); Z = Z_face[face_id]; int port_id = PortFacePidx_d[face_id]; ExcitationProp exci_prop = exci_prop_vec[port_id]; fp_t_ts nvtr[2][3]; fp_t_ts normvtr[3] = {0}; fp_t_ts area = 0; fp_t_ts nxentry; fp_t_ts nxnxentry; fp_t_ts z[3], w[3], r[3]; fp_t_ts wgt; //-----------------TODO: Make this part dynamic-------------------------------------------- fp_t_ts nxelem[FacePolyOrderDim2] = {0}; fp_t_ts nxnxelem[FacePolyOrderDim2] = {0}; //----------------------------------------------------------------------------------------- fp_t_ts nxE_inc_[3]; fp_t_ts nxnxH_inc_[3]; fp_t_ts Hvec[3]; fp_t_ts Evec[3]; geometry2(&nd_coords_face[3 * 3 * face_id], &nd_coords_tet[4 * 3 * tet_id], normvtr, nvtr, &area); int Q = GAUSS_POINT_NUM[PolyFlag]; for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++) { nxentry = 0; nxnxentry = 0; for (int k = 0; k < Q; k++) { quadObtention(Q, k, z[0], z[1], z[2], wgt); faceBasisW(&nd_coords_face[3 * 3 * face_id], z, j, w); getCartesianCoord(&nd_coords_face[3 * 3 * face_id], z, r); int qbase = (face_id * Q + k) * 3; Hvec[0] = Htan_qp_d[qbase + 0]; Hvec[1] = Htan_qp_d[qbase + 1]; Hvec[2] = Htan_qp_d[qbase + 2]; Evec[0] = Etan_qp_d[qbase + 0]; Evec[1] = Etan_qp_d[qbase + 1]; Evec[2] = Etan_qp_d[qbase + 2]; H_updateinc_PORT( t, exci_prop.to, exci_prop.tau, exci_prop.freq_m, exci_prop.TimeDistributionFlag, exci_prop.PortDirection, Evec, Hvec, exci_prop.CHIRP_BW_MHZ, nxE_inc_, nxnxH_inc_); nxentry += area * wgt * dotP3131(w, nxE_inc_); nxnxentry += area * wgt * dotP3131(w, nxnxH_inc_); } nxelem[j] += nxentry; nxnxelem[j] += nxnxentry; } for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++) { // printf("%d\n", ExcitationFacesNum[face_id]); nxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxelem[j]; nxnxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxnxelem[j]; } } for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){ nxe_inc[i] *= -dt_muo; nxe_inc[i] += (-Z * dt_muo) * nxnxh_inc[i]; nxe_inc[i] *= map[tet_id * TetPolyOrderDim[PolyFlag] + i]; } fp_t_ts sum; for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++) { sum = 0.0; for (int j = 0; j < TetPolyOrderDim[PolyFlag]; j++){ sum += InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i] * nxe_inc[j] * map[tet_id * TetPolyOrderDim[PolyFlag] + i]; } Field[tet_id * TetPolyOrderDim[PolyFlag] + i] += sum; } } } __device__ void H_updateinc_PORT( fp_t_ts t, fp_t_ts to, fp_t_ts tau, fp_t_ts freq_m, int TimeDistributionFlag, fp_t_ts* PortDirection, // unit propagation dir fp_t_ts* Et_pre, // precomputed tangential E at quad point fp_t_ts* Ht_pre, // precomputed tangential H at quad point fp_t_ts CHIRP_BW_MHZ, // bandwidth for chirp fp_t_ts* nxEinc, // out: n × E_inc fp_t_ts* nxnxHinc) // out: n × (n × H_inc) { // Time envelopes fp_t_ts IncidExcit = 1.0f; fp_t_ts Emag = 1.0f; TimeModulationInc(TimeDistributionFlag, t, to, freq_m, Emag, tau, CHIRP_BW_MHZ, IncidExcit); // === Your required sequence === fp_t_ts work[3]; fp_t_ts field[3]; field[0] = Et_pre[0]; field[1] = Et_pre[1]; field[2] = Et_pre[2]; // EField crossP3(PortDirection, field, work); scale3(work, IncidExcit, nxEinc); field[0] = Ht_pre[0]; field[1] = Ht_pre[1]; field[2] = Ht_pre[2]; // HField crossP3(PortDirection, field, work); crossP3(PortDirection, work, nxnxHinc); scale3(nxnxHinc, IncidExcit); for (int i = 0; i < 3; ++i) { if (fabs(nxEinc[i]) < (fp_t_ts)1e-12) nxEinc[i] = 0.0; if (fabs(nxnxHinc[i]) < (fp_t_ts)1e-12) nxnxHinc[i] = 0.0; } } __device__ void TimeModulationInc(int TimeDistributionFlag, fp_t_ts t, fp_t_ts to, fp_t_ts freq_m, fp_t_ts Emag, fp_t_ts tau, fp_t_ts CHIRP_BW_MHZ, fp_t_ts &IncidExcit) { fp_t_ts Exponent; fp_t_ts omega = 2.0 * Pi * freq_m * MEGA; fp_t_ts Neuman; fp_t_ts SinModul; fp_t_ts kr; fp_t_ts work[3]; switch (TimeDistributionFlag){ // Time Harmonic case 0: IncidExcit = Emag * sin(- omega * (t)); break; // Gauss Pulse case 1: Exponent = t - to; SinModul = cos(omega * (t - to)); IncidExcit = Emag * SinModul * exp(-(Exponent * Exponent) / (tau * tau)); break; // Neuman Pulse case 2: Exponent = t - to; Neuman = (2.0 * Exponent) / (tau * tau); IncidExcit = (Emag * Neuman) * exp(-(Exponent * Exponent) / (tau * tau)); break; // 3) Linear FM chirp starting at 'to', duration 'tau', bandwidth CHIRP_BW_MHZ, // Hann-tapered on [0, tau] to reduce spectral sidelobes. case 3: { // Time since chirp start (seconds) fp_t_ts tt = t - to; // Gate to [0, tau] if (tt >= (fp_t_ts)0.0 && tt <= tau) { // Sweep parameters const fp_t_ts B = (fp_t_ts)(CHIRP_BW_MHZ * MEGA); // Hz const fp_t_ts fc = (fp_t_ts)(freq_m * MEGA); // Hz const fp_t_ts f0 = fc - (fp_t_ts)0.5 * B; // start freq [Hz] const fp_t_ts k = (tau != (fp_t_ts)0 ? B / tau : (fp_t_ts)0); // Hz/s // Phase: 2π( f0*tt + 0.5*k*tt^2 ) const fp_t_ts phase = (fp_t_ts)(2.0 * Pi) * (f0 * tt + (fp_t_ts)0.5 * k * tt * tt); // Hann window on [0, tau]: w = 0.5*(1 - cos(2π*tt/tau)) const fp_t_ts w = (fp_t_ts)0.5 * ((fp_t_ts)1.0 - cos((fp_t_ts)(2.0 * Pi) * (tt / tau))); const fp_t_ts s = sin(phase) * w; IncidExcit = Emag * s; } else { // Outside the chirp window IncidExcit = (fp_t_ts)0.0; } } break; // 4) Linear FM chirp starting at 'to', duration 'tau', bandwidth CHIRP_BW_MHZ, case 4: { // Time since chirp start (seconds) fp_t_ts tt = t - to; // Gate to [0, tau] if (tt >= (fp_t_ts)0.0 && tt <= tau) { // Sweep parameters const fp_t_ts B = (fp_t_ts)(CHIRP_BW_MHZ * MEGA); // Hz const fp_t_ts fc = (fp_t_ts)(freq_m * MEGA); // Hz const fp_t_ts f0 = fc - (fp_t_ts)0.5 * B; // start freq [Hz] const fp_t_ts k = (tau != (fp_t_ts)0 ? B / tau : (fp_t_ts)0); // Hz/s // Phase: 2π( f0*tt + 0.5*k*tt^2 ) const fp_t_ts phase = (fp_t_ts)(2.0 * Pi) * (f0 * tt + (fp_t_ts)0.5 * k * tt * tt); const fp_t_ts s = sin(phase); IncidExcit = Emag * s; } else { // Outside the chirp window IncidExcit = (fp_t_ts)0.0; } } break; default: break; } } // ============================================================ // Device helper: build n×H_inc and n×(n×E_inc) from precomputed Et/Ht // ============================================================ __device__ void E_updateinc_PORT( fp_t_ts t, fp_t_ts to, fp_t_ts tau, fp_t_ts freq_m, int TimeDistributionFlag, fp_t_ts* PortDirection, // unit propagation dir fp_t_ts* Et_pre, // precomputed tangential E at quad point fp_t_ts* Ht_pre, // precomputed tangential H at quad point fp_t_ts CHIRP_BW_MHZ, // bandwidth for chirp fp_t_ts* nxHinc, // out: n × H_inc fp_t_ts* nxnxEinc) // out: n × (n × E_inc) { // Time envelopes fp_t_ts IncidExcit = 1.0f; fp_t_ts Emag = 1.0f; TimeModulationInc(TimeDistributionFlag, t, to, freq_m, Emag, tau, CHIRP_BW_MHZ, IncidExcit); // Build the required quantities fp_t_ts work[3]; // n × H_inc crossP3(PortDirection, Ht_pre, work); scale3(work, IncidExcit, nxHinc); // n × (n × E_inc) crossP3(PortDirection, Et_pre, work); crossP3(PortDirection, work, nxnxEinc); scale3(nxnxEinc, IncidExcit); for (int i = 0; i < 3; ++i) { if (fabs(nxHinc[i]) < (fp_t_ts)1e-12) nxHinc[i] = 0.0; if (fabs(nxnxEinc[i]) < (fp_t_ts)1e-12) nxnxEinc[i] = 0.0; } } // ============================================================ // Kernel: Port-only version of E-field excitation // Uses precomputed Etan_qp_d / Htan_qp_d // ============================================================ // Tiny helper to print a 3-vector __device__ inline void dprint3(const char* tag, fp_t_ts v0, fp_t_ts v1, fp_t_ts v2) { printf("%s(%.6e, %.6e, %.6e)\n", tag, (double)v0, (double)v1, (double)v2); } __global__ void addExcitationE_port( int* ExcitationFacesCnt, int* ExcitationFacesOffset, int* ExcitationFacesNum, int excitationCNT, int8_t* map, ExcitationProp* exci_prop_vec, int* PortFacePidx_d, int PolyFlag, fp_t_ts dt_epsilon, fp_t_ts t, fp_t_ts* nd_coords_tet, fp_t_ts* nd_coords_face, fp_t_ts* Z_face, fp_t_ts* InvMat, fp_t_ts* Field, fp_t_ts* Etan_qp_d, fp_t_ts* Htan_qp_d) { int tet_id = blockIdx.x * blockDim.x + threadIdx.x; if (tet_id < excitationCNT) { //-----TODO: Make this part dynamic---------------------------- const int TetPolyOrderDim2 = TetPolyOrderDim[3]; const int FacePolyOrderDim2 = FacePolyOrderDim[3]; fp_t_ts nxh_inc[TetPolyOrderDim2] = {0}; fp_t_ts nxnxe_inc[TetPolyOrderDim2] = {0}; //------------------------------------------------------------- fp_t_ts Y = 0.0; for (int i = 0; i < ExcitationFacesCnt[tet_id]; i++) { int face_id = ExcitationFacesOffset[tet_id] + i; Y = 1 / Z_face[face_id]; int port_id = PortFacePidx_d[face_id]; ExcitationProp exci_prop = exci_prop_vec[port_id]; fp_t_ts nvtr[2][3]; fp_t_ts normvtr[3] = {0}; fp_t_ts area = 0; fp_t_ts nxentry; fp_t_ts nxnxentry; fp_t_ts z[3], w[3], r[3]; fp_t_ts wgt; //-----------------TODO: Make this part dynamic-------------------------------------------- fp_t_ts nxelem[FacePolyOrderDim2] = {0}; fp_t_ts nxnxelem[FacePolyOrderDim2] = {0}; //----------------------------------------------------------------------------------------- fp_t_ts nxH_inc_[3]; fp_t_ts nxnxE_inc_[3]; fp_t_ts Hvec[3]; fp_t_ts Evec[3]; geometry2(&nd_coords_face[3 * 3 * face_id], &nd_coords_tet[4 * 3 * tet_id], normvtr, nvtr, &area); int Q = GAUSS_POINT_NUM[PolyFlag]; for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++) { nxentry = 0; nxnxentry = 0; for (int k = 0; k < Q; k++) { quadObtention(Q, k, z[0], z[1], z[2], wgt); faceBasisW(&nd_coords_face[3 * 3 * face_id], z, j, w); getCartesianCoord(&nd_coords_face[3 * 3 * face_id], z, r); int qbase = (face_id * Q + k) * 3; Hvec[0] = Htan_qp_d[qbase + 0]; Hvec[1] = Htan_qp_d[qbase + 1]; Hvec[2] = Htan_qp_d[qbase + 2]; Evec[0] = Etan_qp_d[qbase + 0]; Evec[1] = Etan_qp_d[qbase + 1]; Evec[2] = Etan_qp_d[qbase + 2]; E_updateinc_PORT( t, exci_prop.to, exci_prop.tau, exci_prop.freq_m, exci_prop.TimeDistributionFlag, exci_prop.PortDirection, Evec, Hvec, exci_prop.CHIRP_BW_MHZ, nxH_inc_, nxnxE_inc_); nxentry += area * wgt * dotP3131(w, nxH_inc_); nxnxentry += area * wgt * dotP3131(w, nxnxE_inc_); } nxelem[j] += nxentry; nxnxelem[j] += nxnxentry; } for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++) { nxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxelem[j]; nxnxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxnxelem[j]; } } for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++) { nxh_inc[i] *= dt_epsilon; nxh_inc[i] += (-Y * dt_epsilon) * nxnxe_inc[i]; nxh_inc[i] *= map[tet_id * TetPolyOrderDim[PolyFlag] + i]; } fp_t_ts sum; for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++) { sum = 0.0; for (int j = 0; j < TetPolyOrderDim[PolyFlag]; j++) { sum += InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i] * nxh_inc[j] * map[tet_id * TetPolyOrderDim[PolyFlag] + i]; } Field[tet_id * TetPolyOrderDim[PolyFlag] + i] += sum; } } } __global__ void fillOnes(fp_t_ts* arr, size_t N) { size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < N) arr[i] = (fp_t_ts)1.0; } #endif #endif // DGTD_KERNELS_CU