You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1392 lines
45 KiB
1392 lines
45 KiB
//
|
|
// Created by jundafeng on 3/23/22.
|
|
//
|
|
|
|
#ifndef DGTD_KERNELS_CU
|
|
#define DGTD_KERNELS_CU
|
|
#include "kernels.cuh"
|
|
/* Basic kernel: one tetra per thread
|
|
* Opt1: one block per tetra, one entry per thread, registers can be shuffled.
|
|
* New kernels can be parallelized in a smarter way:
|
|
* 1. volume integral can be parallelized in (12,12)
|
|
* 2. surface integral can be parallelized in (6,6)
|
|
* 3. coupling integral can be parallelized in (6,6)
|
|
* 4. excitation can be parallelized in (12,1)
|
|
*/
|
|
|
|
//-----------------------CONSTANTS--------------------------------------------------------------------------------------------
|
|
|
|
__device__ constexpr int GAUSS_POINT_NUM[4] = {6, 9, 9, 9};
|
|
__device__ constexpr int TetPolyOrderDim[4] = {6, 12, 30, 45};
|
|
__device__ constexpr int FacePolyOrderDim[4] = {3, 6, 12, 18};
|
|
|
|
#define g6_a 0.816847572980459
|
|
#define g6_b (1.0 - g6_a) / 2.0
|
|
#define g6_c 0.108103018168070
|
|
#define g6_d (1.0 - g6_c) / 2.0
|
|
#define g6_W1 0.109951743655322
|
|
#define g6_W2 0.223381589678011
|
|
|
|
#define g9_a 0.437525248383384
|
|
#define g9_b 1.0 - 2.0 * g9_a
|
|
#define g9_c 0.797112651860071
|
|
#define g9_d 0.165409927389841
|
|
#define g9_e 1.0 - g9_c - g9_d
|
|
#define g9_W1 0.205950504760887
|
|
#define g9_W2 0.063691414286223
|
|
|
|
#if defined(DGTD_USE_CUDA)
|
|
|
|
#include <cuda.h>
|
|
#include <cuda_runtime.h>
|
|
#include <cooperative_groups.h>
|
|
#include "cuda_utils.h"
|
|
|
|
__device__ fp_t_ts g2d_6[6][4] = {
|
|
g6_a, g6_b, g6_b, g6_W1,
|
|
g6_b, g6_a, g6_b, g6_W1,
|
|
g6_b, g6_b, g6_a, g6_W1,
|
|
g6_c, g6_d, g6_d, g6_W2,
|
|
g6_d, g6_c, g6_d, g6_W2,
|
|
g6_d, g6_d, g6_c, g6_W2};
|
|
|
|
// added by Seung-Cheol Lee & Marinos N. Vouvakis 08/12/02
|
|
// Order = 5
|
|
__device__ fp_t_ts g2d_9[9][4] = {
|
|
g9_b, g9_a, g9_a, g9_W1,
|
|
g9_a, g9_b, g9_a, g9_W1,
|
|
g9_a, g9_a, g9_b, g9_W1,
|
|
|
|
g9_c, g9_d, g9_e, g9_W2,
|
|
g9_c, g9_e, g9_d, g9_W2,
|
|
g9_d, g9_c, g9_e, g9_W2,
|
|
g9_d, g9_e, g9_c, g9_W2,
|
|
g9_e, g9_c, g9_d, g9_W2,
|
|
g9_e, g9_d, g9_c, g9_W2};
|
|
|
|
// __device__ const int FACE_BASIS_IJK[6][2] = {
|
|
// {1, 2},
|
|
// {0, 2},
|
|
// {0, 1},
|
|
// {1, 2},
|
|
// {0, 2},
|
|
// {0, 1},
|
|
// };
|
|
|
|
__device__ const int FACE_BASIS_IJK[18][3] = {
|
|
{1, 2, -1},
|
|
{0, 2, -1},
|
|
{0, 1, -1},
|
|
{1, 2, -1},
|
|
{0, 2, -1},
|
|
{0, 1, -1},
|
|
{0, 1, 2},
|
|
{0, 1, 2},
|
|
{1, 2, -1},
|
|
{0, 2, -1},
|
|
{0, 1, -1},
|
|
{0, 1, 2},
|
|
{0, 1, 2},
|
|
{0, 1, 2},
|
|
{0, 1, 2},
|
|
{0, 1, 2},
|
|
{0, 1, 2},
|
|
{0, 1, 2}
|
|
};
|
|
|
|
__device__ const int FAC2TET[4][18] = {
|
|
{5, 4, 3, 11, 10, 9, 12, 13, 25, 24, 23, 26, 30, 31, 32, 42, 43, 44},
|
|
{5, 2, 1, 11, 8, 7, 14, 15, 25, 22, 21, 27, 33, 34, 35, 42, 43, 44},
|
|
{4, 2, 0, 10, 8, 6, 16, 17, 24, 22, 20, 28, 36, 37, 38, 42, 43, 44},
|
|
{3, 1, 0, 9, 7, 6, 18, 19, 23, 21, 20, 29, 39, 40, 41, 42, 43, 44}
|
|
};
|
|
|
|
|
|
#endif
|
|
|
|
//----------------------------------------------------------------------------------------------------------------------------
|
|
|
|
#if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL)
|
|
|
|
__global__ void makeNeighField(int* MappingArray, fp_t_ts* Field, fp_t_ts* auxiliar, int elements)
|
|
{
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
if(idx < elements){
|
|
auxiliar[idx] = Field[MappingArray[idx]];
|
|
// printf("%d\n", MappingArray[idx]);
|
|
}
|
|
}
|
|
|
|
__global__ void makeNeighFieldRegular(int* MappingArray, fp_t_ts* Field, fp_t_ts* auxiliar, int elements, int polyOrder){
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
if(idx < NumOfFaces * FacePolyOrderDim[polyOrder] * elements){
|
|
|
|
int blockId = idx / FacePolyOrderDim[polyOrder];
|
|
|
|
int basisId = idx % FacePolyOrderDim[polyOrder];
|
|
int tetId = blockId / NumOfFaces;
|
|
int faceId = blockId % NumOfFaces;
|
|
|
|
auxiliar[(faceId * elements + tetId) * FacePolyOrderDim[polyOrder] + basisId] = Field[MappingArray[idx]];
|
|
|
|
// for(int i = 0; i < NumOfFaces; i++){
|
|
// auxiliar[idx + i * elements] = Field[MappingArray[idx + i * FacePolyOrderDim[polyOrder]]];
|
|
// }
|
|
}
|
|
}
|
|
|
|
|
|
__global__ void makeNeighFieldRegular_custom(int* MappingArray, fp_t_ts* Field, fp_t_ts* auxiliar, int elements, int polyOrder, int numFaces){
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
if(idx < numFaces * FacePolyOrderDim[polyOrder] * elements){
|
|
|
|
int blockId = idx / FacePolyOrderDim[polyOrder];
|
|
|
|
int basisId = idx % FacePolyOrderDim[polyOrder];
|
|
int tetId = blockId / numFaces;
|
|
int faceId = blockId % numFaces;
|
|
|
|
auxiliar[(faceId * elements + tetId) * FacePolyOrderDim[polyOrder] + basisId] = Field[MappingArray[idx]];
|
|
|
|
}
|
|
}
|
|
|
|
|
|
__global__ void addCouplingResults(fp_t_ts* Field, fp_t_ts* auxiliar, int* neighs, int* neighsOffset, int elements){
|
|
int tet = blockIdx.x * blockDim.y + threadIdx.y;
|
|
|
|
if(tet < elements){
|
|
int basis = blockDim.x;
|
|
|
|
fp_t_ts fieldAux = 0.0;
|
|
int offset = (neighsOffset[tet] - neighsOffset[0]) * basis;
|
|
|
|
for(int i = 0; i < neighs[tet]; i++){
|
|
fieldAux += auxiliar[offset + basis * i + threadIdx.x];
|
|
}
|
|
|
|
Field[tet * basis + threadIdx.x] += fieldAux;
|
|
}
|
|
}
|
|
|
|
__global__ void addCouplingResultsRegular(fp_t_ts* Field, fp_t_ts* auxiliar, int elements){
|
|
int tet = blockIdx.x * blockDim.y + threadIdx.y;
|
|
|
|
if(tet < elements){
|
|
int basis = blockDim.x;
|
|
|
|
fp_t_ts fieldAux = 0.0;
|
|
|
|
for(int i = 0; i < NumOfFaces; i++){
|
|
fieldAux += auxiliar[tet * basis + elements * basis * i + threadIdx.x];
|
|
}
|
|
|
|
Field[tet * basis + threadIdx.x] += fieldAux;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
__global__ void addCouplingResultsRegular_custom(fp_t_ts* Field, fp_t_ts* auxiliar, int elements, int numFaces){
|
|
|
|
int tet = blockIdx.x * blockDim.y + threadIdx.y;
|
|
|
|
if(tet < elements)
|
|
{
|
|
int basis = blockDim.x;
|
|
|
|
fp_t_ts fieldAux = 0.0;
|
|
|
|
for(int i = 0; i < numFaces; i++)
|
|
{
|
|
fieldAux += auxiliar[tet * basis + elements * basis * i + threadIdx.x];
|
|
}
|
|
|
|
Field[tet * basis + threadIdx.x] += fieldAux;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__global__ void fill_ones_cols(fp_t_ts* __restrict__ M, int rows, int cols, int ld)
|
|
{
|
|
int r = blockIdx.x * blockDim.x + threadIdx.x;
|
|
int c = blockIdx.y * blockDim.y + threadIdx.y;
|
|
if (r < rows && c < cols) M[(size_t)c * ld + r] = 1.0f;
|
|
}
|
|
|
|
|
|
// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //
|
|
// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //
|
|
// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //
|
|
|
|
__device__ fp_t_ts dotP3131(const fp_t_ts *v1, const fp_t_ts *v2){
|
|
fp_t_ts result;
|
|
result = v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
|
|
return result;
|
|
}
|
|
|
|
__device__ void crossP3(const fp_t_ts *v1, const fp_t_ts *v2, fp_t_ts *result){
|
|
result[0] = v1[1] * v2[2] - v1[2] * v2[1];
|
|
result[1] = -(v1[0] * v2[2] - v1[2] * v2[0]);
|
|
result[2] = v1[0] * v2[1] - v1[1] * v2[0];
|
|
}
|
|
|
|
__device__ void scale3(fp_t_ts *v1, const fp_t_ts factor){
|
|
v1[0] = v1[0] * factor;
|
|
v1[1] = v1[1] * factor;
|
|
v1[2] = v1[2] * factor;
|
|
}
|
|
|
|
__device__ void scale3(const fp_t_ts *v1, const fp_t_ts factor, fp_t_ts *v2){
|
|
v2[0] = v1[0] * factor;
|
|
v2[1] = v1[1] * factor;
|
|
v2[2] = v1[2] * factor;
|
|
}
|
|
|
|
__device__ void unit3(fp_t_ts *v1){
|
|
fp_t_ts work[3];
|
|
fp_t_ts mag = sqrt(v1[0] * v1[0] + v1[1] * v1[1] + v1[2] * v1[2]);
|
|
|
|
work[0] = v1[0] / mag;
|
|
work[1] = v1[1] / mag;
|
|
work[2] = v1[2] / mag;
|
|
|
|
v1[0] = work[0];
|
|
v1[1] = work[1];
|
|
v1[2] = work[2];
|
|
}
|
|
|
|
__device__ void plus3(fp_t_ts *v1, fp_t_ts *v2){
|
|
v1[0] = v1[0] + v2[0];
|
|
v1[1] = v1[1] + v2[1];
|
|
v1[2] = v1[2] + v2[2];
|
|
}
|
|
|
|
__device__ void set3(const fp_t_ts *v1, fp_t_ts *v2){
|
|
v2[0] = v1[0];
|
|
v2[1] = v1[1];
|
|
v2[2] = v1[2];
|
|
}
|
|
|
|
__device__ void plus3(const fp_t_ts *v1, const fp_t_ts *v2, fp_t_ts *result){
|
|
result[0] = v1[0] + v2[0];
|
|
result[1] = v1[1] + v2[1];
|
|
result[2] = v1[2] + v2[2];
|
|
}
|
|
|
|
__device__ void plus33(const fp_t_ts *v1, const fp_t_ts *v2, const fp_t_ts *v3, fp_t_ts *result){
|
|
result[0] = v1[0] + v2[0] + v3[0];
|
|
result[1] = v1[1] + v2[1] + v3[1];
|
|
result[2] = v1[2] + v2[2] + v3[2];
|
|
}
|
|
|
|
__device__ void minus3(const fp_t_ts *v1, const fp_t_ts *v2, fp_t_ts *result){
|
|
result[0] = v1[0] - v2[0];
|
|
result[1] = v1[1] - v2[1];
|
|
result[2] = v1[2] - v2[2];
|
|
}
|
|
|
|
__device__ void reset3(fp_t_ts *result){
|
|
result[0] = 0.0;
|
|
result[1] = 0.0;
|
|
result[2] = 0.0;
|
|
}
|
|
|
|
__device__ void geometry3(const fp_t_ts *nd_coord, fp_t_ts lvtr[][3], fp_t_ts avtr[][3], fp_t_ts *vol){
|
|
int i, i1, i2, i3;
|
|
fp_t_ts vv[3], tt[3];
|
|
|
|
for (i = 0; i < 3; i++){
|
|
minus3(&nd_coord[(i + 1) * 3], &nd_coord[0], lvtr[i]);
|
|
}
|
|
|
|
for (i = 0; i < 3; i++){
|
|
i1 = (i + 1) % 4;
|
|
i2 = (i + 2) % 4;
|
|
i3 = (i + 3) % 4;
|
|
|
|
minus3(&nd_coord[i2 * 3], &nd_coord[i1 * 3], tt);
|
|
minus3(&nd_coord[i3 * 3], &nd_coord[i1 * 3], vv);
|
|
crossP3(tt, vv, avtr[i]);
|
|
scale3(avtr[i], 0.5);
|
|
|
|
minus3(&nd_coord[i * 3], &nd_coord[i1 * 3], tt);
|
|
if (dotP3131(avtr[i], tt) < 0.0)
|
|
scale3(avtr[i], -1);
|
|
;
|
|
}
|
|
|
|
reset3(avtr[3]);
|
|
plus33(avtr[0], avtr[1], avtr[2], avtr[3]);
|
|
scale3(avtr[3], -1);
|
|
|
|
(*vol) = dotP3131(avtr[2], lvtr[1]) / 3.0;
|
|
}
|
|
|
|
__device__ fp_t_ts magnitude3(const fp_t_ts *v1){
|
|
return sqrt(v1[0] * v1[0] + v1[1] * v1[1] + v1[2] * v1[2]);
|
|
}
|
|
|
|
__device__ void tet_center(const fp_t_ts *tet_nd_coord, fp_t_ts *tet_center){
|
|
tet_center[0] = (tet_nd_coord[0 * 3 + 0] + tet_nd_coord[1 * 3 + 0] + tet_nd_coord[2 * 3 + 0] + tet_nd_coord[3 * 3 + 0]) * 0.25;
|
|
tet_center[1] = (tet_nd_coord[0 * 3 + 1] + tet_nd_coord[1 * 3 + 1] + tet_nd_coord[2 * 3 + 1] + tet_nd_coord[3 * 3 + 1]) * 0.25;
|
|
tet_center[2] = (tet_nd_coord[0 * 3 + 2] + tet_nd_coord[1 * 3 + 2] + tet_nd_coord[2 * 3 + 2] + tet_nd_coord[3 * 3 + 2]) * 0.25;
|
|
}
|
|
|
|
// have to use 2d arrays in the device functions, using 1d array with offsets will result in erroneous values
|
|
__device__ void geometry2(const fp_t_ts *nd_coord, const fp_t_ts *tet0_nd_coord, fp_t_ts *normal, fp_t_ts nvtr[][3], fp_t_ts *area){
|
|
fp_t_ts t0[3], t1[3], tv[3], tet0center[3];
|
|
|
|
minus3(&nd_coord[2 * 3], &nd_coord[1 * 3], t0);
|
|
minus3(&nd_coord[0 * 3], &nd_coord[2 * 3], t1);
|
|
|
|
crossP3(t0, t1, normal);
|
|
|
|
(*area) = 0.5 * magnitude3(normal);
|
|
scale3(normal, 1 / magnitude3(normal));
|
|
|
|
crossP3(normal, t0, nvtr[0]);
|
|
crossP3(normal, t1, nvtr[1]);
|
|
|
|
if (dotP3131(nvtr[0], t1) < 0.0)
|
|
scale3(nvtr[0], -1);
|
|
if (dotP3131(nvtr[1], t0) > 0.0)
|
|
scale3(nvtr[1], -1);
|
|
|
|
tet_center(tet0_nd_coord, tet0center);
|
|
minus3(tet0center, &nd_coord[0 * 3], tv);
|
|
|
|
if (dotP3131(tv, normal) > 0.0)
|
|
scale3(normal, -1);
|
|
}
|
|
|
|
__device__ void geometry2(const fp_t_ts *nd_coord, fp_t_ts nvtr[][3], fp_t_ts *area){
|
|
fp_t_ts t0[3], t1[3], normal[3];
|
|
minus3(&nd_coord[2 * 3], &nd_coord[1 * 3], t0);
|
|
minus3(&nd_coord[0 * 3], &nd_coord[2 * 3], t1);
|
|
|
|
crossP3(t0, t1, normal);
|
|
|
|
(*area) = 0.5 * magnitude3(normal);
|
|
scale3(normal, 1 / magnitude3(normal));
|
|
|
|
crossP3(normal, t0, nvtr[0]);
|
|
crossP3(normal, t1, nvtr[1]);
|
|
|
|
if (dotP3131(nvtr[0], t1) < 0.0)
|
|
scale3(nvtr[0], -1);
|
|
if (dotP3131(nvtr[1], t0) > 0.0)
|
|
scale3(nvtr[1], -1);
|
|
}
|
|
|
|
__device__ void dotP3331(const fp_t_ts *tensor, const fp_t_ts *v1, fp_t_ts *result){
|
|
result[0] = v1[0] * tensor[0] + v1[1] * tensor[1] + v1[2] * tensor[2];
|
|
result[1] = v1[0] * tensor[3] + v1[1] * tensor[4] + v1[2] * tensor[5];
|
|
result[2] = v1[0] * tensor[6] + v1[1] * tensor[7] + v1[2] * tensor[8];
|
|
}
|
|
|
|
__device__ void makeCoeff33(const fp_t_ts avtr[][3], const fp_t_ts *epsr, fp_t_ts coeffA[][3]){
|
|
int i, j;
|
|
fp_t_ts temp[3];
|
|
for (i = 0; i < 3; i++){
|
|
for (j = 0; j < 3; j++){
|
|
dotP3331(epsr, avtr[j], temp);
|
|
coeffA[i][j] = dotP3131(temp, avtr[i]);
|
|
}
|
|
}
|
|
}
|
|
|
|
__device__ void quadObtention(int np, int id, fp_t_ts &zeta0, fp_t_ts &zeta1, fp_t_ts &zeta2, fp_t_ts &weight){
|
|
if (id < 0 || id >= np){
|
|
printf("Error in quadObtention");
|
|
return;
|
|
}
|
|
|
|
switch (np){
|
|
case 6:
|
|
zeta0 = g2d_6[id][0];
|
|
zeta1 = g2d_6[id][1];
|
|
zeta2 = g2d_6[id][2];
|
|
weight = g2d_6[id][3];
|
|
break;
|
|
case 9:
|
|
zeta0 = g2d_9[id][0];
|
|
zeta1 = g2d_9[id][1];
|
|
zeta2 = g2d_9[id][2];
|
|
weight = g2d_9[id][3];
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
__device__ void faceBasisW(const fp_t_ts *node_coords, const fp_t_ts *zeta, const int p, fp_t_ts *W){
|
|
fp_t_ts gradZeta[3][3];
|
|
fp_t_ts area;
|
|
geometry2(node_coords, gradZeta, &area);
|
|
|
|
scale3(gradZeta[0], 0.5 / area);
|
|
scale3(gradZeta[1], 0.5 / area);
|
|
gradZeta[2][0] = 0;
|
|
gradZeta[2][1] = 0;
|
|
gradZeta[2][2] = 0;
|
|
minus3(gradZeta[2], gradZeta[0], gradZeta[2]);
|
|
minus3(gradZeta[2], gradZeta[1], gradZeta[2]);
|
|
|
|
int i = FACE_BASIS_IJK[p][0];
|
|
int j = FACE_BASIS_IJK[p][1];
|
|
int k = FACE_BASIS_IJK[p][2];
|
|
|
|
switch (p){
|
|
case 0:
|
|
scale3(gradZeta[j], zeta[i]);
|
|
scale3(gradZeta[i], zeta[j]);
|
|
minus3(gradZeta[j], gradZeta[i], W);
|
|
break;
|
|
case 1:
|
|
scale3(gradZeta[j], zeta[i]);
|
|
scale3(gradZeta[i], zeta[j]);
|
|
minus3(gradZeta[j], gradZeta[i], W);
|
|
break;
|
|
case 2:
|
|
scale3(gradZeta[j], zeta[i]);
|
|
scale3(gradZeta[i], zeta[j]);
|
|
minus3(gradZeta[j], gradZeta[i], W);
|
|
break;
|
|
case 3:
|
|
scale3(gradZeta[j], zeta[i]);
|
|
scale3(gradZeta[i], zeta[j]);
|
|
plus3(gradZeta[j], gradZeta[i], W);
|
|
scale3(W, 4.);
|
|
break;
|
|
case 4:
|
|
scale3(gradZeta[j], zeta[i]);
|
|
scale3(gradZeta[i], zeta[j]);
|
|
plus3(gradZeta[j], gradZeta[i], W);
|
|
scale3(W, 4.);
|
|
break;
|
|
case 5:
|
|
scale3(gradZeta[j], zeta[i]);
|
|
scale3(gradZeta[i], zeta[j]);
|
|
plus3(gradZeta[j], gradZeta[i], W);
|
|
scale3(W, 4.);
|
|
break;
|
|
case 6:
|
|
scale3(gradZeta[k], zeta[i]);
|
|
scale3(gradZeta[i], zeta[k]);
|
|
minus3(gradZeta[k], gradZeta[i], W);
|
|
scale3(W, 4 * zeta[j]);
|
|
break;
|
|
case 7:
|
|
scale3(gradZeta[j], zeta[i]);
|
|
scale3(gradZeta[i], zeta[j]);
|
|
minus3(gradZeta[j], gradZeta[i], W);
|
|
scale3(W, 4. * zeta[k]);
|
|
break;
|
|
case 8:
|
|
scale3(gradZeta[j], zeta[i] * zeta[i] - 2. * zeta[i] * zeta[j]);
|
|
scale3(gradZeta[i], zeta[j] * zeta[j] - 2. * zeta[i] * zeta[j]);
|
|
minus3(gradZeta[j], gradZeta[i], W);
|
|
scale3(W, 4.);
|
|
break;
|
|
case 9:
|
|
scale3(gradZeta[j], zeta[i] * zeta[i] - 2. * zeta[i] * zeta[j]);
|
|
scale3(gradZeta[i], zeta[j] * zeta[j] - 2. * zeta[i] * zeta[j]);
|
|
minus3(gradZeta[j], gradZeta[i], W);
|
|
scale3(W, 4.);
|
|
break;
|
|
case 10:
|
|
scale3(gradZeta[j], zeta[i] * zeta[i] - 2. * zeta[i] * zeta[j]);
|
|
scale3(gradZeta[i], zeta[j] * zeta[j] - 2. * zeta[i] * zeta[j]);
|
|
minus3(gradZeta[j], gradZeta[i], W);
|
|
scale3(W, 4.);
|
|
break;
|
|
case 11:
|
|
scale3(gradZeta[k], zeta[i] * zeta[j]);
|
|
scale3(gradZeta[j], zeta[i] * zeta[k]);
|
|
scale3(gradZeta[i], zeta[k] * zeta[j]);
|
|
plus33(gradZeta[k], gradZeta[j], gradZeta[i], W);
|
|
scale3(W, 8.);
|
|
break;
|
|
case 12:
|
|
scale3(gradZeta[k], zeta[j]);
|
|
scale3(gradZeta[j], zeta[k]);
|
|
minus3(gradZeta[k], gradZeta[j], W);
|
|
scale3(W, 4. * zeta[i] * zeta[i]);
|
|
break;
|
|
case 13:
|
|
scale3(gradZeta[k], zeta[i]);
|
|
scale3(gradZeta[i], zeta[k]);
|
|
minus3(gradZeta[k], gradZeta[i], W);
|
|
scale3(W, 4. * zeta[j] * zeta[j]);
|
|
break;
|
|
case 14:
|
|
scale3(gradZeta[j], zeta[i]);
|
|
scale3(gradZeta[i], zeta[j]);
|
|
minus3(gradZeta[j], gradZeta[i], W);
|
|
scale3(W, 4. * zeta[k] * zeta[k]);
|
|
break;
|
|
case 15:
|
|
W[0] = W[1] = W[2] = 0;
|
|
break;
|
|
case 16:
|
|
W[0] = W[1] = W[2] = 0;
|
|
break;
|
|
case 17:
|
|
W[0] = W[1] = W[2] = 0;
|
|
break;
|
|
default:
|
|
W[0] = W[1] = W[2] = 0;
|
|
printf("Check faceBasisW: you are asking for somthing out of scope");
|
|
break;
|
|
}
|
|
}
|
|
|
|
__device__ void getCartesianCoord(const fp_t_ts *node_coords, const fp_t_ts *zeta, fp_t_ts *pnt){
|
|
pnt[0] = node_coords[0 * 3 + 0] * zeta[0] + node_coords[1 * 3 + 0] * zeta[1] + node_coords[2 * 3 + 0] * zeta[2];
|
|
pnt[1] = node_coords[0 * 3 + 1] * zeta[0] + node_coords[1 * 3 + 1] * zeta[1] + node_coords[2 * 3 + 1] * zeta[2];
|
|
pnt[2] = node_coords[0 * 3 + 2] * zeta[0] + node_coords[1 * 3 + 2] * zeta[1] + node_coords[2 * 3 + 2] * zeta[2];
|
|
}
|
|
|
|
__device__ void PlaneWaveFields(int ExcitationFlag, fp_t_ts t, fp_t_ts to, fp_t_ts tau, const fp_t_ts *r, const fp_t_ts *ro, fp_t_ts freq_m, fp_t_ts *kvtr, fp_t_ts eta, fp_t_ts Emagnitude, fp_t_ts &IncidExcit_E, fp_t_ts &IncidExcit_H)
|
|
{
|
|
|
|
fp_t_ts omega = 2.0 * Pi * freq_m * MEGA;
|
|
fp_t_ts Exponent;
|
|
fp_t_ts SinModul;
|
|
fp_t_ts Neuman;
|
|
fp_t_ts work[3];
|
|
// printf("%d", ExcitationFlag);
|
|
switch (ExcitationFlag){
|
|
case 0:
|
|
minus3(r, ro, work);
|
|
Exponent = t - to - dotP3131(kvtr, work) / Vo;
|
|
// Time Harmonic
|
|
if (Exponent >= 0.0){
|
|
SinModul = cos(omega * Exponent);
|
|
IncidExcit_E = Emagnitude * SinModul;
|
|
IncidExcit_H = (Emagnitude / eta) * SinModul;
|
|
}else{
|
|
IncidExcit_E = 0.0;
|
|
IncidExcit_H = 0.0;
|
|
}
|
|
break;
|
|
|
|
case 1:
|
|
// Gauss Pulse
|
|
minus3(r, ro, work);
|
|
Exponent = t - to - dotP3131(kvtr, work) / Vo;
|
|
SinModul = cos(omega * Exponent);
|
|
IncidExcit_E = Emagnitude * SinModul * exp(-(Exponent * Exponent) / (tau * tau));
|
|
IncidExcit_H = (Emagnitude / eta) * SinModul * exp(-(Exponent * Exponent) / (tau * tau));
|
|
break;
|
|
|
|
case 2:
|
|
// Neuman Pulse
|
|
minus3(r, ro, work);
|
|
Exponent = t - to - dotP3131(kvtr, work) / Vo;
|
|
Neuman = (2.0 * Exponent) / (tau * tau);
|
|
IncidExcit_E = (Emagnitude * Neuman) * exp(-(Exponent * Exponent) / (tau * tau));
|
|
IncidExcit_H = ((Emagnitude / eta) * Neuman) * exp(-(Exponent * Exponent) / (tau * tau));
|
|
break;
|
|
|
|
case 3:
|
|
{
|
|
// DC-Free Hann-Modulated Cosine Pulse with time delay (tdelay = to)
|
|
fp_t tdelay = to;
|
|
minus3(r, ro, work);
|
|
fp_t t_rel, Exponent, window, SinModul;
|
|
|
|
Exponent = t - tdelay - dotP3131(kvtr, work) / Vo;
|
|
|
|
if (Exponent >= 0.0 && Exponent <= tau)
|
|
{
|
|
// Shift Exponent to center cosine inside the window
|
|
t_rel = Exponent - tau / 2.0;
|
|
|
|
// Hann window centered on [tdelay, tdelay + tau]
|
|
window = 0.5 * (1.0 - cos(2.0 * M_PI * Exponent / tau));
|
|
SinModul = cos(omega * t_rel);
|
|
|
|
IncidExcit_E = Emagnitude * SinModul * window;
|
|
IncidExcit_H = (Emagnitude / eta) * SinModul * window;
|
|
}
|
|
else
|
|
{
|
|
IncidExcit_E = 0.0;
|
|
IncidExcit_H = 0.0;
|
|
}
|
|
break;
|
|
}
|
|
|
|
case 4:
|
|
{
|
|
// Linear Chirp Excitation using sin to start at zero and Hann window to taper
|
|
fp_t_ts work[3];
|
|
minus3(r, ro, work); // work = r - ro
|
|
|
|
Exponent = t - dotP3131(kvtr, work) / Vo;
|
|
|
|
if (Exponent >= 0.0 && Exponent <= to)
|
|
{
|
|
// Chirp parameters
|
|
fp_t_ts f_end = freq_m * MEGA; // Center frequency in Hz
|
|
fp_t_ts B = tau * MEGA; // Bandwidth in Hz
|
|
fp_t_ts f0 = f_end - B; // Start frequency
|
|
fp_t_ts f1 = f_end; // End frequency
|
|
fp_t_ts Tchirp = to; // Chirp duration
|
|
|
|
// Chirp phase
|
|
fp_t_ts chirpArg = 2.0 * Pi * f0 * Exponent
|
|
+ Pi * (f1 - f0) / Tchirp * Exponent * Exponent;
|
|
|
|
// Hann window for smooth tapering
|
|
fp_t_ts window = 0.5 * (1.0 - cos(2.0 * Pi * Exponent / Tchirp));
|
|
|
|
// Final tapered chirp signal
|
|
fp_t_ts chirpSignal = sin(chirpArg) * window;
|
|
|
|
IncidExcit_E = Emagnitude * chirpSignal;
|
|
IncidExcit_H = (Emagnitude / eta) * chirpSignal;
|
|
}
|
|
else
|
|
{
|
|
IncidExcit_E = 0.0;
|
|
IncidExcit_H = 0.0;
|
|
}
|
|
break;
|
|
}
|
|
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
__device__ void PortFields(int PortFlag, const fp_t_ts *PortDirection, fp_t_ts Emagnitude, fp_t_ts &Emag, fp_t_ts &Hmag, fp_t_ts PortImpedance, fp_t_ts *Epol, fp_t_ts *Hpol, const fp_t_ts *zeta, const fp_t_ts *nodes, const int index_, const fp_t_ts *E_coeff_d, const fp_t_ts *H_coeff_d, int face_id)
|
|
{
|
|
int TriNumBas;
|
|
switch (PortFlag){
|
|
case 1:
|
|
fp_t_ts basis_k[3];
|
|
TriNumBas = 6;
|
|
|
|
// ------------------------------------------------
|
|
// E Fields
|
|
reset3(Epol);
|
|
|
|
for (int k = 0; k < TriNumBas; k++){
|
|
faceBasisW(&nodes[3 * 3 * face_id], zeta, k, basis_k);
|
|
scale3(basis_k, E_coeff_d[index_ + k]);
|
|
plus3(Epol, basis_k);
|
|
}
|
|
|
|
Emag = magnitude3(Epol) * Emagnitude * 2.0;
|
|
unit3(Epol);
|
|
// ------------------------------------------------
|
|
// H Fields
|
|
reset3(Hpol);
|
|
|
|
for (int k = 0; k < TriNumBas; k++)
|
|
{
|
|
faceBasisW(&nodes[3 * 3 * face_id], zeta, k, basis_k);
|
|
scale3(basis_k, H_coeff_d[index_ + k]);
|
|
|
|
plus3(Hpol, basis_k);
|
|
}
|
|
|
|
Hmag = magnitude3(Hpol) * Emagnitude / PortImpedance * 2.0;
|
|
unit3(Hpol);
|
|
// ------------------------------------------------
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
__device__ void TimeModulationInc(int TimeDistributionFlag, fp_t_ts t, fp_t_ts to, const fp_t_ts *PortDirection, const fp_t_ts *r, const fp_t_ts *ro, fp_t_ts freq_m, fp_t_ts Emag, fp_t_ts Hmag, fp_t_ts tau, fp_t_ts &IncidExcit_E, fp_t_ts &IncidExcit_H)
|
|
{
|
|
fp_t_ts Exponent;
|
|
fp_t_ts omega = 2.0 * Pi * freq_m * MEGA;
|
|
fp_t_ts Neuman;
|
|
fp_t_ts SinModul;
|
|
fp_t_ts kr;
|
|
|
|
fp_t_ts work[3];
|
|
|
|
switch (TimeDistributionFlag){
|
|
// Time Harmonic
|
|
case 0:
|
|
minus3(r, ro, work);
|
|
kr = dotP3131(PortDirection, work) * omega / Vo;
|
|
IncidExcit_E = Emag * sin(kr - omega * (t));
|
|
IncidExcit_H = Hmag * sin(kr - omega * (t));
|
|
break;
|
|
|
|
// Gauss Pulse
|
|
case 1:
|
|
minus3(r, ro, work);
|
|
Exponent = t - to - dotP3131(PortDirection, work) / Vo;
|
|
SinModul = cos(omega * (t - to));
|
|
IncidExcit_E = Emag * SinModul * exp(-(Exponent * Exponent) / (tau * tau));
|
|
IncidExcit_H = Hmag * SinModul * exp(-(Exponent * Exponent) / (tau * tau));
|
|
break;
|
|
|
|
// Neuman Pulse
|
|
case 2:
|
|
minus3(r, ro, work);
|
|
Exponent = t - to - dotP3131(PortDirection, work) / Vo;
|
|
Neuman = (2.0 * Exponent) / (tau * tau);
|
|
IncidExcit_E = (Emag * Neuman) * exp(-(Exponent * Exponent) / (tau * tau));
|
|
IncidExcit_H = (Hmag * Neuman) * exp(-(Exponent * Exponent) / (tau * tau));
|
|
break;
|
|
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
__device__ void E_updateinc_PW(const fp_t_ts t, const fp_t_ts to, const fp_t_ts tau, const fp_t_ts freq_m, const fp_t_ts eta, const fp_t_ts Emagnitude, const fp_t_ts *ro, const fp_t_ts theta, const fp_t_ts phi, const fp_t_ts *Epol, const int ExcitationFlag, const fp_t_ts *r, const fp_t_ts *normalvtr, fp_t_ts *nxHinc, fp_t_ts *nxnxEinc)
|
|
{
|
|
fp_t_ts IncidExcit_E, IncidExcit_H;
|
|
|
|
fp_t_ts theta_in_rad = theta * Pi / 180.0;
|
|
fp_t_ts phi_in_rad = phi * Pi / 180.0;
|
|
fp_t_ts kvtr[3], Hpol[3], work[3];
|
|
kvtr[0] = sin(theta_in_rad) * cos(phi_in_rad);
|
|
kvtr[1] = sin(theta_in_rad) * sin(phi_in_rad);
|
|
kvtr[2] = cos(theta_in_rad);
|
|
|
|
PlaneWaveFields(ExcitationFlag, t, to, tau, r, ro, freq_m, kvtr, eta, Emagnitude, IncidExcit_E, IncidExcit_H);
|
|
|
|
// EField
|
|
crossP3(normalvtr, Epol, work);
|
|
crossP3(normalvtr, work, nxnxEinc);
|
|
scale3(nxnxEinc, IncidExcit_E);
|
|
|
|
// HField
|
|
crossP3(kvtr, Epol, work);
|
|
crossP3(normalvtr, work, Hpol);
|
|
scale3(Hpol, IncidExcit_H, nxHinc);
|
|
|
|
for (int i = 0; i < 3; ++i)
|
|
{
|
|
if (fabs(nxHinc[i]) < 1e-6) nxHinc[i] = 0.0;
|
|
if (fabs(nxnxEinc[i]) < 1e-6) nxnxEinc[i] = 0.0;
|
|
}
|
|
}
|
|
|
|
|
|
__device__ void H_updateinc_PW(const fp_t_ts t, const fp_t_ts to, const fp_t_ts tau, const fp_t_ts freq_m, const fp_t_ts eta, const fp_t_ts Emagnitude, const fp_t_ts *ro, const fp_t_ts theta, const fp_t_ts phi, const fp_t_ts *Epol, const int ExcitationFlag, const fp_t_ts *r, const fp_t_ts *normalvtr, fp_t_ts *nxEinc, fp_t_ts *nxnxHinc)
|
|
{
|
|
fp_t_ts IncidExcit_E, IncidExcit_H;
|
|
fp_t_ts theta_in_rad = theta * Pi / 180.0;
|
|
fp_t_ts phi_in_rad = phi * Pi / 180.0;
|
|
fp_t_ts kvtr[3], Hpol[3], work[3];
|
|
kvtr[0] = sin(theta_in_rad) * cos(phi_in_rad);
|
|
kvtr[1] = sin(theta_in_rad) * sin(phi_in_rad);
|
|
kvtr[2] = cos(theta_in_rad);
|
|
|
|
PlaneWaveFields(ExcitationFlag, t, to, tau, r, ro, freq_m, kvtr, eta, Emagnitude, IncidExcit_E, IncidExcit_H);
|
|
|
|
// EField
|
|
crossP3(normalvtr, Epol, work);
|
|
scale3(work, IncidExcit_E, nxEinc);
|
|
|
|
// HField
|
|
crossP3(kvtr, Epol, Hpol);
|
|
crossP3(normalvtr, Hpol, work);
|
|
crossP3(normalvtr, work, nxnxHinc);
|
|
scale3(nxnxHinc, IncidExcit_H);
|
|
|
|
for (int i = 0; i < 3; ++i)
|
|
{
|
|
if (fabs(nxEinc[i]) < 1e-6) nxEinc[i] = 0.0;
|
|
if (fabs(nxnxHinc[i]) < 1e-6) nxnxHinc[i] = 0.0;
|
|
}
|
|
}
|
|
*/
|
|
|
|
__device__ void E_updateinc_PORT(const fp_t_ts t, const fp_t_ts to, const fp_t_ts tau, const fp_t_ts freq_m, const fp_t_ts eta, const fp_t_ts Emagnitude, const fp_t_ts *ro, const fp_t_ts *r1,
|
|
const fp_t_ts *PortDirection, const fp_t_ts *zeta, const fp_t_ts *nodes, const fp_t_ts *E_coeff_d, const fp_t_ts *H_coeff_d, const int face_id,
|
|
const int index_, const int PortFlag, const int TimeDistributionFlag, const fp_t_ts PortImpedance, const fp_t_ts *r, const fp_t_ts *normalvtr, fp_t_ts *nxHinc, fp_t_ts *nxnxEinc)
|
|
{
|
|
fp_t_ts IncidExcit_E, IncidExcit_H;
|
|
fp_t_ts Emag, Hmag;
|
|
fp_t_ts Epol[3], Hpol[3];
|
|
fp_t_ts work[3];
|
|
|
|
PortFields(PortFlag, PortDirection, Emagnitude, Emag, Hmag, PortImpedance, Epol, Hpol, zeta, nodes, index_, E_coeff_d, H_coeff_d, face_id);
|
|
|
|
TimeModulationInc(TimeDistributionFlag, t, to, PortDirection, r, ro, freq_m, Emag, Hmag, tau, IncidExcit_E, IncidExcit_H);
|
|
|
|
// EField
|
|
crossP3(normalvtr, Epol, work);
|
|
crossP3(normalvtr, work, nxnxEinc);
|
|
scale3(nxnxEinc, IncidExcit_E);
|
|
|
|
// HField
|
|
crossP3(PortDirection, Epol, work);
|
|
crossP3(normalvtr, work, Hpol);
|
|
scale3(Hpol, IncidExcit_H, nxHinc);
|
|
}
|
|
|
|
__device__ void H_updateinc_PORT(const fp_t_ts t, const fp_t_ts to, const fp_t_ts tau, const fp_t_ts freq_m, const fp_t_ts eta, const fp_t_ts Emagnitude, const fp_t_ts *ro, const fp_t_ts *r1,
|
|
const fp_t_ts *PortDirection, const fp_t_ts *zeta, const fp_t_ts *nodes, const fp_t_ts *E_coeff_d, const fp_t_ts *H_coeff_d, const int face_id, const int index_,
|
|
const int PortFlag, const int TimeDistributionFlag, const fp_t_ts PortImpedance, const fp_t_ts *r, const fp_t_ts *normalvtr, fp_t_ts *nxEinc, fp_t_ts *nxnxHinc)
|
|
{
|
|
fp_t_ts IncidExcit_E, IncidExcit_H;
|
|
fp_t_ts Emag, Hmag;
|
|
fp_t_ts Epol[3], Hpol[3];
|
|
fp_t_ts work[3];
|
|
|
|
PortFields(PortFlag, PortDirection, Emagnitude, Emag, Hmag, PortImpedance, Epol, Hpol, zeta, nodes, index_, E_coeff_d, H_coeff_d, face_id);
|
|
|
|
TimeModulationInc(TimeDistributionFlag, t, to, PortDirection, r, ro, freq_m, Emag, Hmag, tau, IncidExcit_E, IncidExcit_H);
|
|
|
|
// EField
|
|
crossP3(normalvtr, Epol, work);
|
|
scale3(work, IncidExcit_E, nxEinc);
|
|
|
|
// HField
|
|
crossP3(PortDirection, Epol, Hpol);
|
|
crossP3(normalvtr, Hpol, work);
|
|
crossP3(normalvtr, work, nxnxHinc);
|
|
scale3(nxnxHinc, IncidExcit_H);
|
|
}
|
|
|
|
__global__ void CalculatePadeFreq(fp_t* a_k, fp_t* b_k, int M_global, int N, int* constantM, cuDoubleComplex* Hf){
|
|
|
|
int m = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
if(m < M_global){
|
|
fp_t sum_real_a = 0.0;
|
|
fp_t sum_imag_a = 0.0;
|
|
fp_t sum_real_b = 0.0;
|
|
fp_t sum_imag_b = 0.0;
|
|
|
|
for (int n = 0; n < N; n++) {
|
|
fp_t angle = -2.0f * Pi * constantM[m] * n / M_global;
|
|
fp_t cos_val, sin_val;
|
|
|
|
sincos(angle, &sin_val, &cos_val);
|
|
|
|
sum_real_a += a_k[n] * cos_val;
|
|
sum_imag_a += a_k[n] * sin_val;
|
|
|
|
sum_real_b += b_k[n] * cos_val;
|
|
sum_imag_b += b_k[n] * sin_val;
|
|
}
|
|
|
|
Hf[m] = cuCdiv(make_cuDoubleComplex(sum_real_a, sum_imag_a), make_cuDoubleComplex(sum_real_b, sum_imag_b));
|
|
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
|
|
// === Utility Functions for Double Precision ===
|
|
__device__ __forceinline__ fp_t_ts cleanSmall(fp_t_ts val, fp_t_ts eps = 1e-12) {
|
|
return fabs(val) < eps ? 0.0 : val;
|
|
}
|
|
|
|
__device__ __forceinline__ void cleanVec3(fp_t_ts* v, fp_t_ts eps = 1e-12) {
|
|
for (int i = 0; i < 3; ++i)
|
|
v[i] = cleanSmall(v[i], eps);
|
|
}
|
|
|
|
__device__ void E_updateinc_PW(const fp_t_ts t, const fp_t_ts to, const fp_t_ts tau, const fp_t_ts freq_m, const fp_t_ts eta, const fp_t_ts Emagnitude, const fp_t_ts *ro, const fp_t_ts theta, const fp_t_ts phi, const fp_t_ts *Epol, const int ExcitationFlag, const fp_t_ts *r, const fp_t_ts *normalvtr, fp_t_ts *nxHinc, fp_t_ts *nxnxEinc)
|
|
{
|
|
fp_t_ts IncidExcit_E, IncidExcit_H;
|
|
|
|
fp_t_ts theta_in_rad = theta * (fp_t_ts)Pi / (fp_t_ts)180.0;
|
|
fp_t_ts phi_in_rad = phi * (fp_t_ts)Pi / (fp_t_ts)180.0;
|
|
fp_t_ts kvtr[3], Hpol[3], work[3];
|
|
kvtr[0] = sin(theta_in_rad) * cos(phi_in_rad);
|
|
kvtr[1] = sin(theta_in_rad) * sin(phi_in_rad);
|
|
kvtr[2] = cos(theta_in_rad);
|
|
|
|
PlaneWaveFields(ExcitationFlag, t, to, tau, r, ro, freq_m, kvtr, eta, Emagnitude, IncidExcit_E, IncidExcit_H);
|
|
|
|
// EField
|
|
crossP3(normalvtr, Epol, work);
|
|
crossP3(normalvtr, work, nxnxEinc);
|
|
scale3(nxnxEinc, IncidExcit_E);
|
|
|
|
// HField
|
|
crossP3(kvtr, Epol, work);
|
|
crossP3(normalvtr, work, Hpol);
|
|
scale3(Hpol, IncidExcit_H, nxHinc);
|
|
|
|
for (int i = 0; i < 3; ++i)
|
|
{
|
|
if (fabs(nxHinc[i]) < (fp_t_ts)1e-12) nxHinc[i] = 0.0;
|
|
if (fabs(nxnxEinc[i]) < (fp_t_ts)1e-12) nxnxEinc[i] = 0.0;
|
|
}
|
|
}
|
|
|
|
|
|
__device__ void H_updateinc_PW(const fp_t_ts t, const fp_t_ts to, const fp_t_ts tau, const fp_t_ts freq_m, const fp_t_ts eta, const fp_t_ts Emagnitude, const fp_t_ts *ro, const fp_t_ts theta, const fp_t_ts phi, const fp_t_ts *Epol, const int ExcitationFlag, const fp_t_ts *r, const fp_t_ts *normalvtr, fp_t_ts *nxEinc, fp_t_ts *nxnxHinc)
|
|
{
|
|
fp_t_ts IncidExcit_E, IncidExcit_H;
|
|
fp_t_ts theta_in_rad = theta * (fp_t_ts)Pi / (fp_t_ts)180.0;
|
|
fp_t_ts phi_in_rad = phi * (fp_t_ts)Pi / (fp_t_ts)180.0;
|
|
fp_t_ts kvtr[3], Hpol[3], work[3];
|
|
kvtr[0] = sin(theta_in_rad) * cos(phi_in_rad);
|
|
kvtr[1] = sin(theta_in_rad) * sin(phi_in_rad);
|
|
kvtr[2] = cos(theta_in_rad);
|
|
|
|
PlaneWaveFields(ExcitationFlag, t, to, tau, r, ro, freq_m, kvtr, eta, Emagnitude, IncidExcit_E, IncidExcit_H);
|
|
|
|
// EField
|
|
crossP3(normalvtr, Epol, work);
|
|
scale3(work, IncidExcit_E, nxEinc);
|
|
|
|
// HField
|
|
crossP3(kvtr, Epol, Hpol);
|
|
crossP3(normalvtr, Hpol, work);
|
|
crossP3(normalvtr, work, nxnxHinc);
|
|
scale3(nxnxHinc, IncidExcit_H);
|
|
|
|
for (int i = 0; i < 3; ++i)
|
|
{
|
|
if (fabs(nxEinc[i]) < (fp_t_ts)1e-12) nxEinc[i] = 0.0;
|
|
if (fabs(nxnxHinc[i]) < (fp_t_ts)1e-12) nxnxHinc[i] = 0.0;
|
|
}
|
|
}
|
|
|
|
__global__ void addExcitationH_PML( int* ExcitationFacesCnt,
|
|
int* ExcitationFacesOffset,
|
|
int* ExcitationFacesNum,
|
|
int excitationCNT,
|
|
int excitationPMLCNT,
|
|
int8_t* map,
|
|
ExcitationProp exci_prop,
|
|
int PolyFlag,
|
|
fp_t_ts dt_muo, fp_t_ts t,
|
|
fp_t_ts* nd_coords_tet,
|
|
fp_t_ts* nd_coords_face,
|
|
fp_t_ts* Z_face,
|
|
fp_t_ts* InvMat,
|
|
fp_t_ts* Field){
|
|
|
|
int tet_id = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
if (tet_id < excitationCNT){
|
|
//-----TODO: Make this part dynamic----------------------------
|
|
const int TetPolyOrderDim2 = TetPolyOrderDim[3];
|
|
const int FacePolyOrderDim2 = FacePolyOrderDim[3];
|
|
|
|
fp_t_ts nxe_inc[TetPolyOrderDim2] = {0};
|
|
fp_t_ts nxnxh_inc[TetPolyOrderDim2] = {0};
|
|
//-------------------------------------------------------------
|
|
fp_t_ts Z = 0.0;
|
|
for (int i = 0; i < ExcitationFacesCnt[tet_id]; i++){
|
|
int face_id = ExcitationFacesOffset[tet_id] + i;
|
|
Z = Z_face[face_id];
|
|
|
|
fp_t_ts nvtr[2][3];
|
|
fp_t_ts normvtr[3] = {0};
|
|
fp_t_ts area = 0;
|
|
fp_t_ts nxentry;
|
|
fp_t_ts nxnxentry;
|
|
fp_t_ts z[3], w[3], r[3];
|
|
fp_t_ts wgt;
|
|
|
|
//-----------------TODO: Make this part dynamic--------------------------------------------
|
|
fp_t_ts nxelem[FacePolyOrderDim2] = {0};
|
|
fp_t_ts nxnxelem[FacePolyOrderDim2] = {0};
|
|
//-----------------------------------------------------------------------------------------
|
|
|
|
fp_t_ts nxE_inc[3];
|
|
fp_t_ts nxnxH_inc[3];
|
|
|
|
geometry2(&nd_coords_face[3 * 3 * face_id], &nd_coords_tet[4 * 3 * tet_id], normvtr, nvtr, &area);
|
|
|
|
for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){
|
|
nxentry = 0;
|
|
nxnxentry = 0;
|
|
for (int k = 0; k < GAUSS_POINT_NUM[PolyFlag]; k++){
|
|
quadObtention(GAUSS_POINT_NUM[PolyFlag], k, z[0], z[1], z[2], wgt);
|
|
faceBasisW(&nd_coords_face[3 * 3 * face_id], z, j, w);
|
|
getCartesianCoord(&nd_coords_face[3 * 3 * face_id], z, r);
|
|
|
|
H_updateinc_PW(t, exci_prop.to, exci_prop.tau, exci_prop.freq_m, Z, exci_prop.Emagnitude, exci_prop.ro, exci_prop.theta, exci_prop.phi, exci_prop.Epol, exci_prop.ExcitationFlag, r, normvtr, nxE_inc, nxnxH_inc);
|
|
|
|
if (!(isnan(nxE_inc[0]))){
|
|
nxentry += area * wgt * dotP3131(w, nxE_inc);
|
|
nxnxentry += area * wgt * dotP3131(w, nxnxH_inc);
|
|
}
|
|
}
|
|
//nxelem[j] += nxentry;
|
|
//nxnxelem[j] += nxnxentry;
|
|
nxelem[j] += cleanSmall(nxentry);
|
|
nxnxelem[j] += cleanSmall(nxnxentry);
|
|
}
|
|
for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){
|
|
// printf("%d\n", ExcitationFacesNum[face_id]);
|
|
nxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxelem[j];
|
|
nxnxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxnxelem[j];
|
|
}
|
|
}
|
|
|
|
for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
|
|
nxe_inc[i] *= -dt_muo;
|
|
//nxe_inc[i] += (-Z * dt_muo) * nxnxh_inc[i];
|
|
nxe_inc[i] *= map[tet_id * TetPolyOrderDim[PolyFlag] + i];
|
|
}
|
|
fp_t_ts sum;
|
|
for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
|
|
sum = 0.0;
|
|
for (int j = 0; j < TetPolyOrderDim[PolyFlag]; j++){
|
|
sum += InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i] * nxe_inc[j] * map[tet_id * TetPolyOrderDim[PolyFlag] + i];
|
|
}
|
|
sum = cleanSmall(sum);
|
|
|
|
if (tet_id < excitationPMLCNT)
|
|
{
|
|
Field[tet_id * TetPolyOrderDim[PolyFlag] + i] -= 0.5 * sum; // Scattered Region
|
|
}
|
|
else
|
|
{
|
|
Field[tet_id * TetPolyOrderDim[PolyFlag] + i] += 0.5 * sum; // Total Region
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
__global__ void addExcitationE_PML( int* ExcitationFacesCnt,
|
|
int* ExcitationFacesOffset,
|
|
int* ExcitationFacesNum,
|
|
int excitationCNT,
|
|
int excitationPMLCNT,
|
|
int8_t* map,
|
|
ExcitationProp exci_prop,
|
|
int PolyFlag,
|
|
fp_t_ts dt_epsilon, fp_t_ts t,
|
|
fp_t_ts* nd_coords_tet,
|
|
fp_t_ts* nd_coords_face,
|
|
fp_t_ts* Z_face,
|
|
fp_t_ts* InvMat,
|
|
fp_t_ts* Field){
|
|
|
|
int tet_id = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
if (tet_id < excitationCNT){
|
|
//-----TODO: Make this part dynamic----------------------------
|
|
const int TetPolyOrderDim2 = TetPolyOrderDim[3];
|
|
const int FacePolyOrderDim2 = FacePolyOrderDim[3];
|
|
|
|
fp_t_ts nxh_inc[TetPolyOrderDim2] = {0};
|
|
fp_t_ts nxnxe_inc[TetPolyOrderDim2] = {0};
|
|
//-------------------------------------------------------------
|
|
fp_t_ts Y = 0.0;
|
|
for (int i = 0; i < ExcitationFacesCnt[tet_id]; i++){
|
|
int face_id = ExcitationFacesOffset[tet_id] + i;
|
|
Y = 1 / Z_face[face_id];
|
|
|
|
fp_t_ts nvtr[2][3];
|
|
fp_t_ts normvtr[3] = {0};
|
|
fp_t_ts area = 0;
|
|
fp_t_ts nxentry;
|
|
fp_t_ts nxnxentry;
|
|
fp_t_ts z[3], w[3], r[3];
|
|
fp_t_ts wgt;
|
|
|
|
//-----------------TODO: Make this part dynamic--------------------------------------------
|
|
fp_t_ts nxelem[FacePolyOrderDim2] = {0};
|
|
fp_t_ts nxnxelem[FacePolyOrderDim2] = {0};
|
|
//-----------------------------------------------------------------------------------------
|
|
|
|
fp_t_ts nxH_inc[3];
|
|
fp_t_ts nxnxE_inc[3];
|
|
|
|
geometry2(&nd_coords_face[3 * 3 * face_id], &nd_coords_tet[4 * 3 * tet_id], normvtr, nvtr, &area);
|
|
|
|
for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){
|
|
nxentry = 0;
|
|
nxnxentry = 0;
|
|
for (int k = 0; k < GAUSS_POINT_NUM[PolyFlag]; k++){
|
|
quadObtention(GAUSS_POINT_NUM[PolyFlag], k, z[0], z[1], z[2], wgt);
|
|
faceBasisW(&nd_coords_face[3 * 3 * face_id], z, j, w);
|
|
getCartesianCoord(&nd_coords_face[3 * 3 * face_id], z, r);
|
|
|
|
E_updateinc_PW(t, exci_prop.to, exci_prop.tau, exci_prop.freq_m, Z_face[face_id], exci_prop.Emagnitude, exci_prop.ro,
|
|
exci_prop.theta, exci_prop.phi, exci_prop.Epol, exci_prop.ExcitationFlag, r, normvtr, nxH_inc, nxnxE_inc);
|
|
|
|
|
|
if (!(isnan(nxH_inc[0]))){
|
|
nxentry += area * wgt * dotP3131(w, nxH_inc);
|
|
nxnxentry += area * wgt * dotP3131(w, nxnxE_inc);
|
|
}
|
|
}
|
|
|
|
//nxelem[j] += nxentry;
|
|
//nxnxelem[j] += nxnxentry;
|
|
nxelem[j] += cleanSmall(nxentry);
|
|
nxnxelem[j] += cleanSmall(nxnxentry);
|
|
}
|
|
for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){
|
|
// printf("%d\n", ExcitationFacesNum[face_id]);
|
|
nxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxelem[j];
|
|
nxnxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxnxelem[j];
|
|
}
|
|
|
|
}
|
|
|
|
for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
|
|
nxh_inc[i] *= dt_epsilon;
|
|
//nxh_inc[i] += (-Y * dt_epsilon) * nxnxe_inc[i];
|
|
nxh_inc[i] *= map[tet_id * TetPolyOrderDim[PolyFlag] + i];
|
|
}
|
|
fp_t_ts sum;
|
|
for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
|
|
sum = 0.0;
|
|
for (int j = 0; j < TetPolyOrderDim[PolyFlag]; j++)
|
|
{
|
|
sum += InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i] * nxh_inc[j] * map[tet_id * TetPolyOrderDim[PolyFlag] + i];
|
|
}
|
|
|
|
sum = cleanSmall(sum);
|
|
|
|
if (tet_id < excitationPMLCNT)
|
|
{
|
|
Field[tet_id * TetPolyOrderDim[PolyFlag] + i] -= 0.5 * sum; // Scattered Region
|
|
}
|
|
else
|
|
{
|
|
Field[tet_id * TetPolyOrderDim[PolyFlag] + i] += 0.5 * sum; // Total Region
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
__global__ void addExcitationH( int* ExcitationFacesCnt,
|
|
int* ExcitationFacesOffset,
|
|
int* ExcitationFacesNum,
|
|
int excitationCNT,
|
|
int8_t* map,
|
|
ExcitationProp exci_prop,
|
|
int PolyFlag,
|
|
fp_t_ts dt_muo, fp_t_ts t,
|
|
fp_t_ts* nd_coords_tet,
|
|
fp_t_ts* nd_coords_face,
|
|
fp_t_ts* Z_face,
|
|
fp_t_ts* InvMat,
|
|
fp_t_ts* Field){
|
|
|
|
int tet_id = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
if (tet_id < excitationCNT){
|
|
//-----TODO: Make this part dynamic----------------------------
|
|
const int TetPolyOrderDim2 = TetPolyOrderDim[3];
|
|
const int FacePolyOrderDim2 = FacePolyOrderDim[3];
|
|
|
|
fp_t_ts nxe_inc[TetPolyOrderDim2] = {0};
|
|
fp_t_ts nxnxh_inc[TetPolyOrderDim2] = {0};
|
|
//-------------------------------------------------------------
|
|
fp_t_ts Z = 0.0;
|
|
for (int i = 0; i < ExcitationFacesCnt[tet_id]; i++){
|
|
int face_id = ExcitationFacesOffset[tet_id] + i;
|
|
Z = Z_face[face_id];
|
|
|
|
fp_t_ts nvtr[2][3];
|
|
fp_t_ts normvtr[3] = {0};
|
|
fp_t_ts area = 0;
|
|
fp_t_ts nxentry;
|
|
fp_t_ts nxnxentry;
|
|
fp_t_ts z[3], w[3], r[3];
|
|
fp_t_ts wgt;
|
|
|
|
//-----------------TODO: Make this part dynamic--------------------------------------------
|
|
fp_t_ts nxelem[FacePolyOrderDim2] = {0};
|
|
fp_t_ts nxnxelem[FacePolyOrderDim2] = {0};
|
|
//-----------------------------------------------------------------------------------------
|
|
|
|
fp_t_ts nxE_inc[3];
|
|
fp_t_ts nxnxH_inc[3];
|
|
|
|
geometry2(&nd_coords_face[3 * 3 * face_id], &nd_coords_tet[4 * 3 * tet_id], normvtr, nvtr, &area);
|
|
|
|
for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){
|
|
nxentry = 0;
|
|
nxnxentry = 0;
|
|
for (int k = 0; k < GAUSS_POINT_NUM[PolyFlag]; k++){
|
|
quadObtention(GAUSS_POINT_NUM[PolyFlag], k, z[0], z[1], z[2], wgt);
|
|
faceBasisW(&nd_coords_face[3 * 3 * face_id], z, j, w);
|
|
getCartesianCoord(&nd_coords_face[3 * 3 * face_id], z, r);
|
|
|
|
H_updateinc_PW(t, exci_prop.to, exci_prop.tau, exci_prop.freq_m, Z, exci_prop.Emagnitude, exci_prop.ro, exci_prop.theta, exci_prop.phi, exci_prop.Epol, exci_prop.ExcitationFlag, r, normvtr, nxE_inc, nxnxH_inc);
|
|
|
|
if (!(isnan(nxE_inc[0]))){
|
|
nxentry += area * wgt * dotP3131(w, nxE_inc);
|
|
nxnxentry += area * wgt * dotP3131(w, nxnxH_inc);
|
|
}
|
|
}
|
|
nxelem[j] += nxentry;
|
|
nxnxelem[j] += nxnxentry;
|
|
}
|
|
for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){
|
|
// printf("%d\n", ExcitationFacesNum[face_id]);
|
|
nxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxelem[j];
|
|
nxnxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxnxelem[j];
|
|
}
|
|
}
|
|
|
|
for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
|
|
nxe_inc[i] *= -dt_muo;
|
|
nxe_inc[i] += (-Z * dt_muo) * nxnxh_inc[i];
|
|
nxe_inc[i] *= map[tet_id * TetPolyOrderDim[PolyFlag] + i];
|
|
}
|
|
fp_t_ts sum;
|
|
for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
|
|
sum = 0.0;
|
|
for (int j = 0; j < TetPolyOrderDim[PolyFlag]; j++){
|
|
sum += InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i] * nxe_inc[j] * map[tet_id * TetPolyOrderDim[PolyFlag] + i];
|
|
}
|
|
|
|
Field[tet_id * TetPolyOrderDim[PolyFlag] + i] += sum;
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
__global__ void addExcitationE( int* ExcitationFacesCnt,
|
|
int* ExcitationFacesOffset,
|
|
int* ExcitationFacesNum,
|
|
int excitationCNT,
|
|
int8_t* map,
|
|
ExcitationProp exci_prop,
|
|
int PolyFlag,
|
|
fp_t_ts dt_epsilon, fp_t_ts t,
|
|
fp_t_ts* nd_coords_tet,
|
|
fp_t_ts* nd_coords_face,
|
|
fp_t_ts* Z_face,
|
|
fp_t_ts* InvMat,
|
|
fp_t_ts* Field){
|
|
|
|
int tet_id = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
if (tet_id < excitationCNT){
|
|
//-----TODO: Make this part dynamic----------------------------
|
|
const int TetPolyOrderDim2 = TetPolyOrderDim[3];
|
|
const int FacePolyOrderDim2 = FacePolyOrderDim[3];
|
|
|
|
fp_t_ts nxh_inc[TetPolyOrderDim2] = {0};
|
|
fp_t_ts nxnxe_inc[TetPolyOrderDim2] = {0};
|
|
//-------------------------------------------------------------
|
|
fp_t_ts Y = 0.0;
|
|
for (int i = 0; i < ExcitationFacesCnt[tet_id]; i++){
|
|
int face_id = ExcitationFacesOffset[tet_id] + i;
|
|
Y = 1 / Z_face[face_id];
|
|
|
|
fp_t_ts nvtr[2][3];
|
|
fp_t_ts normvtr[3] = {0};
|
|
fp_t_ts area = 0;
|
|
fp_t_ts nxentry;
|
|
fp_t_ts nxnxentry;
|
|
fp_t_ts z[3], w[3], r[3];
|
|
fp_t_ts wgt;
|
|
|
|
//-----------------TODO: Make this part dynamic--------------------------------------------
|
|
fp_t_ts nxelem[FacePolyOrderDim2] = {0};
|
|
fp_t_ts nxnxelem[FacePolyOrderDim2] = {0};
|
|
//-----------------------------------------------------------------------------------------
|
|
|
|
fp_t_ts nxH_inc[3];
|
|
fp_t_ts nxnxE_inc[3];
|
|
|
|
geometry2(&nd_coords_face[3 * 3 * face_id], &nd_coords_tet[4 * 3 * tet_id], normvtr, nvtr, &area);
|
|
|
|
for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){
|
|
nxentry = 0;
|
|
nxnxentry = 0;
|
|
for (int k = 0; k < GAUSS_POINT_NUM[PolyFlag]; k++){
|
|
quadObtention(GAUSS_POINT_NUM[PolyFlag], k, z[0], z[1], z[2], wgt);
|
|
faceBasisW(&nd_coords_face[3 * 3 * face_id], z, j, w);
|
|
getCartesianCoord(&nd_coords_face[3 * 3 * face_id], z, r);
|
|
|
|
E_updateinc_PW(t, exci_prop.to, exci_prop.tau, exci_prop.freq_m, Z_face[face_id], exci_prop.Emagnitude, exci_prop.ro, exci_prop.theta, exci_prop.phi, exci_prop.Epol, exci_prop.ExcitationFlag, r, normvtr, nxH_inc, nxnxE_inc);
|
|
|
|
if (!(isnan(nxH_inc[0]))){
|
|
nxentry += area * wgt * dotP3131(w, nxH_inc);
|
|
nxnxentry += area * wgt * dotP3131(w, nxnxE_inc);
|
|
}
|
|
}
|
|
|
|
nxelem[j] += nxentry;
|
|
nxnxelem[j] += nxnxentry;
|
|
}
|
|
for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){
|
|
// printf("%d\n", ExcitationFacesNum[face_id]);
|
|
nxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxelem[j];
|
|
nxnxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxnxelem[j];
|
|
}
|
|
|
|
}
|
|
|
|
for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
|
|
nxh_inc[i] *= dt_epsilon;
|
|
nxh_inc[i] += (-Y * dt_epsilon) * nxnxe_inc[i];
|
|
nxh_inc[i] *= map[tet_id * TetPolyOrderDim[PolyFlag] + i];
|
|
}
|
|
fp_t_ts sum;
|
|
for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
|
|
sum = 0.0;
|
|
for (int j = 0; j < TetPolyOrderDim[PolyFlag]; j++){
|
|
sum += InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i] * nxh_inc[j] * map[tet_id * TetPolyOrderDim[PolyFlag] + i];
|
|
|
|
}
|
|
|
|
|
|
Field[tet_id * TetPolyOrderDim[PolyFlag] + i] += sum;
|
|
|
|
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
__global__ void checker(fp_t_ts* a, fp_t_ts* b, int length){
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if(idx < length){
|
|
if(a[idx] != b[idx]){
|
|
printf("ERROR %.10f\n", a[idx] - b[idx]);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
__global__ void fillOnes(fp_t_ts* arr, size_t N) {
|
|
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (i < N) arr[i] = (fp_t_ts)1.0;
|
|
}
|
|
|
|
|
|
#endif
|
|
#endif // DGTD_KERNELS_CU
|
|
|