This repository serves as a backup for my Maxwell-TD code
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

10430 lines
390 KiB

#include <cstdlib>
#include <cmath>
#include <iostream>
#include "femgrp.h"
#include "matconv.h"
#include "Constants.h"
#include "vtkwriter.h"
#ifdef _OPENMP
#include <omp.h>
#endif
#include <map>
#include "MeshPartition_METIS5.h"
#include <vector>
#include "debug.hpp"
#include "vtk-5.0/vtkTetra.h"
#include "rapidcsv.h"
#include <string>
#include <sys/stat.h>
#include <sys/types.h>
#include <errno.h>
#include <cstring> // For strerror
#include <cstdio> // For perror or printf
#include <algorithm> // for std::max
#include <fstream>
#include <string>
#include <iomanip>
#include <filesystem> // at top of file
// Diagnostic check for a pointer that is about to be passed to a kernel.
//
//   p    : pointer to inspect
//   name : label used in the diagnostic messages
//
// Returns false only when `p` is NULL. For a non-NULL pointer the CUDA
// runtime is queried (CUDART_VERSION >= 10000) and a warning is printed when
// the query fails or the pointer is not reported as device memory; in both of
// those cases the function still returns true so the caller may proceed.
auto check_dev_ptr = [](const void* p, const char* name) -> bool {
    if (!p) {
        fprintf(stderr, "[addExcitationE_port] ❌ NULL pointer: %s\n", name);
        return false;
    }
#if CUDART_VERSION >= 10000
    cudaPointerAttributes attr;
    const cudaError_t perr = cudaPointerGetAttributes(&attr, p);
    if (perr != cudaSuccess) {
        fprintf(stderr, "[addExcitationE_port] ⚠ cudaPointerGetAttributes failed for %s: %s\n",
                name, cudaGetErrorString(perr));
        // The failed query is recorded as the runtime's "last error". Clear it
        // so that a cudaGetLastError() issued after the following kernel launch
        // does not report this diagnostic failure as a launch failure.
        (void)cudaGetLastError();
        // Still allow launch; you can change to 'return false;' if you prefer.
    } else {
        // Older runtimes expose the memory kind as attr.memoryType; newer ones
        // (>= 11.0) expose it as attr.type.
#if CUDART_VERSION >= 11000
        const int mem_kind = (int)attr.type;
#else
        const int mem_kind = (int)attr.memoryType;
#endif
        if (mem_kind != (int)cudaMemoryTypeDevice) {
            fprintf(stderr, "[addExcitationE_port] ⚠ %s is NOT a device pointer (type=%d)\n",
                    name, mem_kind);
        }
    }
#endif
    return true;
};
// ======================================
// Interpolation Quadrature Points (host tables)
// Host-side triangle quadrature constants.
// Every derived constant is fully parenthesised so that it expands correctly
// inside any surrounding expression (the former `(1.0 - a) / 2.0` form would
// mis-evaluate in e.g. `2.0 / g6_b_h` or `x * g6_b_h`).
//
// 6-point rule: two orbits of three points each.
#define g6_a_h 0.816847572980459
#define g6_b_h ((1.0 - g6_a_h) / 2.0)
#define g6_c_h 0.108103018168070
#define g6_d_h ((1.0 - g6_c_h) / 2.0)
#define g6_W1_h 0.109951743655322
#define g6_W2_h 0.223381589678011
// 9-point rule: one orbit of three points and one orbit of six points.
#define g9_a_h 0.437525248383384
#define g9_b_h (1.0 - 2.0 * g9_a_h)
#define g9_c_h 0.797112651860071
#define g9_d_h 0.165409927389841
#define g9_e_h (1.0 - g9_c_h - g9_d_h)
#define g9_W1_h 0.205950504760887
#define g9_W2_h 0.063691414286223
// Row layout for both tables: {z0, z1, z2, weight}. The three coordinates of
// each row sum to 1 and the weights of each rule sum to 1.
fp_t_ts g2d_6_h[6][4] = {
    {g6_a_h, g6_b_h, g6_b_h, g6_W1_h},
    {g6_b_h, g6_a_h, g6_b_h, g6_W1_h},
    {g6_b_h, g6_b_h, g6_a_h, g6_W1_h},
    {g6_c_h, g6_d_h, g6_d_h, g6_W2_h},
    {g6_d_h, g6_c_h, g6_d_h, g6_W2_h},
    {g6_d_h, g6_d_h, g6_c_h, g6_W2_h}
};
fp_t_ts g2d_9_h[9][4] = {
    {g9_b_h, g9_a_h, g9_a_h, g9_W1_h},
    {g9_a_h, g9_b_h, g9_a_h, g9_W1_h},
    {g9_a_h, g9_a_h, g9_b_h, g9_W1_h},
    {g9_c_h, g9_d_h, g9_e_h, g9_W2_h},
    {g9_c_h, g9_e_h, g9_d_h, g9_W2_h},
    {g9_d_h, g9_c_h, g9_e_h, g9_W2_h},
    {g9_d_h, g9_e_h, g9_c_h, g9_W2_h},
    {g9_e_h, g9_c_h, g9_d_h, g9_W2_h},
    {g9_e_h, g9_d_h, g9_c_h, g9_W2_h}
};
// Number of quadrature points, indexed by PolyFlag (valid indices 0..3).
const int GAUSS_POINT_NUM_h[4] = {6, 9, 9, 9};
// ---- Shapes for quadratic triangle (P2) at barycentric l=(l0,l1,l2) ----
// Evaluate the six quadratic (P2) triangle shape functions at the barycentric
// point l = (l0, l1, l2).
//   N[0..2] : vertex functions  l_i * (2 l_i - 1)
//   N[3]    : 4 l1 l2   (edge between vertices 1 and 2)
//   N[4]    : 4 l0 l2   (edge between vertices 0 and 2)
//   N[5]    : 4 l0 l1   (edge between vertices 0 and 1)
static inline void triP2_shapes(const double l[3], double N[6]) {
    // Snapshot the inputs before any output is written.
    const double bary[3] = { l[0], l[1], l[2] };

    for (int v = 0; v < 3; ++v) {
        N[v] = bary[v] * (2.0 * bary[v] - 1.0);
    }

    N[3] = 4.0 * bary[1] * bary[2];
    N[4] = 4.0 * bary[0] * bary[2];
    N[5] = 4.0 * bary[0] * bary[1];
}
// ---- One normal + area from 3 points (xyz9 = x0,y0,z0, x1,y1,z1, x2,y2,z2) ----
// Compute the unit normal and the area of a triangle.
//   xyz9 : nine values, the three vertices as (x0,y0,z0, x1,y1,z1, x2,y2,z2)
//   n    : receives (v1 - v0) x (v2 - v0), normalised when its length is > 0;
//          for a degenerate triangle it is left as the raw (zero) cross product
//   area : receives half the length of the cross product
static inline void face_geometry9_host(const fp_t_ts* xyz9, double n[3], double& area) {
    // Edge vectors from vertex 0, evaluated in double precision.
    double edge01[3];
    double edge02[3];
    for (int c = 0; c < 3; ++c) {
        const double origin = xyz9[c];
        const double p1 = xyz9[3 + c];
        const double p2 = xyz9[6 + c];
        edge01[c] = p1 - origin;
        edge02[c] = p2 - origin;
    }

    // Cross product edge01 x edge02.
    n[0] = edge01[1]*edge02[2] - edge01[2]*edge02[1];
    n[1] = edge01[2]*edge02[0] - edge01[0]*edge02[2];
    n[2] = edge01[0]*edge02[1] - edge01[1]*edge02[0];

    const double len = sqrt(n[0]*n[0] + n[1]*n[1] + n[2]*n[2]);
    area = 0.5*len;

    if (len > 0) {
        for (int c = 0; c < 3; ++c) {
            n[c] /= len;
        }
    }
}
// ---- Project vector to tangential plane (in-place) ----
// Remove from v its component along n, in place: v <- v - (v . n) n.
// The result is tangential to the plane with normal n when n has unit length.
static inline void proj_tangent(double v[3], const double n[3])
{
    const double normalPart = v[0]*n[0] + v[1]*n[1] + v[2]*n[2];
    for (int c = 0; c < 3; ++c) {
        v[c] -= normalPart*n[c];
    }
}
// ---- Host quadrature accessor using your *_h tables ----
// Fetch quadrature point q of the Q-point triangle rule from the host tables.
//   Q          : number of points of the rule (6 or 9 are supported)
//   q          : point index, expected in [0, Q)
//   z0, z1, z2 : receive the three coordinates of the point
//   w          : receives the weight
// For an unsupported Q, or a q outside [0, Q), all four outputs are set to 0
// instead of reading past the end of a table.
static inline void tri_gauss_host(int Q, int q, fp_t& z0, fp_t& z1, fp_t& z2, fp_t& w) {
    const fp_t_ts* row = nullptr;
    if (q >= 0 && q < Q) {
        if (Q == 6) {
            row = g2d_6_h[q];
        } else if (Q == 9) {
            row = g2d_9_h[q];
        }
        // add more orders if you enable them
    }
    if (row == nullptr) {
        z0 = z1 = z2 = w = 0;
        return;
    }
    z0 = row[0];
    z1 = row[1];
    z2 = row[2];
    w  = row[3];
}
// ---- Interpolate E/H to Q quadrature points and project tangential ----
// Interpolate the P2 nodal E and H vectors of one face to every quadrature
// point of the rule selected by PolyFlag, strip the component along the face
// normal, and scale by port_excitation_magnitude.
//   xyz9      : face vertices (x0 y0 z0 x1 y1 z1 x2 y2 z2)
//   evtr/hvtr : six nodal vectors each, in the order used by triP2_shapes
//   PolyFlag  : index into GAUSS_POINT_NUM_h
//   Etan_out / Htan_out : receive Q*3 values each, laid out as [q*3 + component]
static inline void interp_port_fields_to_quads(
    const fp_t_ts* xyz9,
    const vtr evtr[6],
    const vtr hvtr[6],
    int PolyFlag,
    fp_t_ts* Etan_out,
    fp_t_ts* Htan_out,
    fp_t_ts port_excitation_magnitude)
{
    const int numQuad = GAUSS_POINT_NUM_h[PolyFlag];

    double normal[3];
    double faceArea;   // produced by the geometry helper; not used here
    face_geometry9_host(xyz9, normal, faceArea);

    int q = 0;
    while (q < numQuad)
    {
        fp_t zeta0, zeta1, zeta2, weight;
        tri_gauss_host(numQuad, q, zeta0, zeta1, zeta2, weight);

        double bary[3] = { (double)zeta0, (double)zeta1, (double)zeta2 };
        double shape[6];
        triP2_shapes(bary, shape);

        double Eq[3] = {0, 0, 0};
        double Hq[3] = {0, 0, 0};
        for (int node = 0; node < 6; ++node)
        {
            const double s = shape[node];
            const vtr& eNode = evtr[node];
            Eq[0] += s*eNode.getx();
            Eq[1] += s*eNode.gety();
            Eq[2] += s*eNode.getz();
            const vtr& hNode = hvtr[node];
            Hq[0] += s*hNode.getx();
            Hq[1] += s*hNode.gety();
            Hq[2] += s*hNode.getz();
        }

        proj_tangent(Eq, normal);
        proj_tangent(Hq, normal);

        // Convert to the storage precision first, then scale.
        fp_t_ts* const eDst = Etan_out + q*3;
        fp_t_ts* const hDst = Htan_out + q*3;
        for (int c = 0; c < 3; ++c) {
            eDst[c] = (fp_t_ts)Eq[c] * port_excitation_magnitude;
        }
        for (int c = 0; c < 3; ++c) {
            hDst[c] = (fp_t_ts)Hq[c] * port_excitation_magnitude;
        }

        ++q;
    }
}
// Write port quadrature fields to CSV
// Columns: face_idx,global_face_id,tet_id,port_idx,q,z0,z1,z2,w,x,y,z,Et_x,Et_y,Et_z,Ht_x,Ht_y,Ht_z
// Write the tangential E/H fields at the quadrature points of every port face
// to a CSV file, one row per (face, quadrature point).
//
//   out_path            : output file path
//   PolyFlag            : index into GAUSS_POINT_NUM_h, selects the rule size Q
//   excitationFaces     : number of faces described by the arrays below
//   PortFacePidx_h      : [excitationFaces] port index per face; negative
//                         entries are skipped
//   FaceID_excitation_h : [excitationFaces] optional (may be nullptr; -1 is
//                         written instead)
//   TetID_excitation_h  : [excitationFaces] optional (may be nullptr; -1 is
//                         written instead)
//   nd_coords_face_h    : [excitationFaces * 9] vertex coordinates per face
//   Etan_qp_h/Htan_qp_h : [excitationFaces * Q * 3] field samples
//
// Returns false when a mandatory pointer is null, PolyFlag is out of range,
// the file cannot be opened, or writing/closing the file fails; true otherwise.
bool write_port_quadrature_csv(
    const char* out_path,
    int PolyFlag,
    int excitationFaces,
    const int* PortFacePidx_h, // length = excitationFaces; -1 for non-port faces
    const int* FaceID_excitation_h, // length = excitationFaces (optional; can pass nullptr)
    const int* TetID_excitation_h, // length = excitationFaces (optional; can pass nullptr)
    const fp_t_ts* nd_coords_face_h, // length = excitationFaces * 9
    const fp_t_ts* Etan_qp_h, // length = excitationFaces * Q * 3
    const fp_t_ts* Htan_qp_h // length = excitationFaces * Q * 3
) {
    if (!out_path || !nd_coords_face_h || !Etan_qp_h || !Htan_qp_h || !PortFacePidx_h) {
        fprintf(stderr, "write_port_quadrature_csv: null pointer argument.\n");
        return false;
    }

    // Reject a PolyFlag that would index past the end of the point-count table.
    const int numPolyFlags = (int)(sizeof(GAUSS_POINT_NUM_h) / sizeof(GAUSS_POINT_NUM_h[0]));
    if (PolyFlag < 0 || PolyFlag >= numPolyFlags) {
        fprintf(stderr, "write_port_quadrature_csv: invalid PolyFlag %d\n", PolyFlag);
        return false;
    }
    const int Q = GAUSS_POINT_NUM_h[PolyFlag];

    std::ofstream ofs(out_path);
    if (!ofs) {
        fprintf(stderr, "write_port_quadrature_csv: failed to open %s\n", out_path);
        return false;
    }
    ofs.setf(std::ios::scientific);
    ofs << std::setprecision(9);

    // Header
    ofs << "face_idx,global_face_id,tet_id,port_idx,q,"
           "z0,z1,z2,w,x,y,z,Et_x,Et_y,Et_z,Ht_x,Ht_y,Ht_z\n";

    for (int f = 0; f < excitationFaces; ++f) {
        const int pidx = PortFacePidx_h[f];
        if (pidx < 0) continue; // skip non-port faces

        const int global_face_id = FaceID_excitation_h ? FaceID_excitation_h[f] : -1;
        const int tet_id = TetID_excitation_h ? TetID_excitation_h[f] : -1;

        // Triangle vertices (offset computed in size_t to avoid int overflow)
        const fp_t_ts* xyz9 = &nd_coords_face_h[(size_t)f * 9];
        const double Ax = (double)xyz9[0], Ay = (double)xyz9[1], Az = (double)xyz9[2];
        const double Bx = (double)xyz9[3], By = (double)xyz9[4], Bz = (double)xyz9[5];
        const double Cx = (double)xyz9[6], Cy = (double)xyz9[7], Cz = (double)xyz9[8];

        // Fields
        const fp_t_ts* Eface = &Etan_qp_h[(size_t)f * Q * 3];
        const fp_t_ts* Hface = &Htan_qp_h[(size_t)f * Q * 3];

        for (int q = 0; q < Q; ++q) {
            fp_t z0, z1, z2, w;
            tri_gauss_host(Q, q, z0, z1, z2, w);

            // Quadrature point physical coords: barycentric blend of the vertices
            const double x = z0 * Ax + z1 * Bx + z2 * Cx;
            const double y = z0 * Ay + z1 * By + z2 * Cy;
            const double z = z0 * Az + z1 * Bz + z2 * Cz;

            ofs << f << ','
                << global_face_id << ','
                << tet_id << ','
                << pidx << ','
                << q << ','
                << (double)z0 << ','
                << (double)z1 << ','
                << (double)z2 << ','
                << (double)w << ','
                << x << ',' << y << ',' << z << ','
                << (double)Eface[q*3+0] << ','
                << (double)Eface[q*3+1] << ','
                << (double)Eface[q*3+2] << ','
                << (double)Hface[q*3+0] << ','
                << (double)Hface[q*3+1] << ','
                << (double)Hface[q*3+2] << '\n';
        }
    }

    // close() flushes; a failed write or close leaves the stream in a failed state.
    ofs.close();
    if (ofs.fail()) {
        fprintf(stderr, "write_port_quadrature_csv: write error on %s\n", out_path);
        return false;
    }
    return true;
}
// Evaluate at centroid
// Interpolate the P2 nodal E and H vectors of one face to the face centroid
// (barycentrics 1/3, 1/3, 1/3) and strip the component along the face normal.
//   xyz9      : face vertices (x0 y0 z0 x1 y1 z1 x2 y2 z2)
//   evtr/hvtr : six nodal vectors each, in the order used by triP2_shapes
//   Etan_out / Htan_out : receive the three components of the projected fields
static inline void interp_port_fields_to_centroid(
    const fp_t_ts* xyz9,
    const vtr evtr[6],
    const vtr hvtr[6],
    fp_t_ts Etan_out[3],
    fp_t_ts Htan_out[3])
{
    // Face normal for the tangential projection; the area is not used here.
    double normal[3];
    double faceArea;
    face_geometry9_host(xyz9, normal, faceArea);

    // Shape functions evaluated at the centroid.
    const double centroid[3] = { 1.0/3.0, 1.0/3.0, 1.0/3.0 };
    double shape[6];
    triP2_shapes(centroid, shape);

    // Weighted sum of the nodal vectors.
    double Ec[3] = {0.0, 0.0, 0.0};
    double Hc[3] = {0.0, 0.0, 0.0};
    int node = 0;
    while (node < 6) {
        const double s = shape[node];
        const vtr& eNode = evtr[node];
        Ec[0] += s * eNode.getx();
        Ec[1] += s * eNode.gety();
        Ec[2] += s * eNode.getz();
        const vtr& hNode = hvtr[node];
        Hc[0] += s * hNode.getx();
        Hc[1] += s * hNode.gety();
        Hc[2] += s * hNode.getz();
        ++node;
    }

    proj_tangent(Ec, normal);
    proj_tangent(Hc, normal);

    // Store in the output precision.
    for (int c = 0; c < 3; ++c) {
        Etan_out[c] = (fp_t_ts)Ec[c];
    }
    for (int c = 0; c < 3; ++c) {
        Htan_out[c] = (fp_t_ts)Hc[c];
    }
}
// ======================================
// ---- centroid helper (assumes tet->nd[0..3] exist and have getCoord().getx/y/z()) ----
// Create the directory `path` (mode 0755) unless it already exists.
// Problems are reported on stderr; nothing is returned.
//   - null or empty path: reported, nothing is created
//   - creation fails for a reason other than "already exists": perror
//   - something other than a directory already occupies `path`: reported
// The directory is created first and only inspected on EEXIST, so there is no
// window between "check" and "create" in which a concurrent creator could
// trigger a spurious "mkdir failed" message.
void make_dir_if_not_exist(const char* path) {
    if (path == nullptr || path[0] == '\0') {
        fprintf(stderr, "make_dir_if_not_exist: empty path\n");
        return;
    }
    if (mkdir(path, 0755) == 0) {
        return; // freshly created
    }
    if (errno != EEXIST) {
        perror("mkdir failed");
        return;
    }
    // Something already exists at `path`; make sure it is a directory.
    struct stat st;
    if (stat(path, &st) != 0 || !S_ISDIR(st.st_mode)) {
        fprintf(stderr, "%s exists but is not a directory\n", path);
    }
}
// Dump three integer arrays to text files in the current working directory,
// one value per line:
//   NeighMap.txt         <- NeighMap_h[0 .. neighMapSize)
//   NeighClass.txt       <- NeighClass_h[0 .. N_class)
//   NeighClassOffset.txt <- NeighClassOffset_h[0 .. N_class)
void exportNeighData(
    int* NeighMap_h, int neighMapSize,
    int* NeighClass_h, int N_class,
    int* NeighClassOffset_h)
{
    // Writes `count` values to `fileName`; the stream closes when it goes out of scope.
    auto dumpColumn = [](const char* fileName, const int* values, int count) {
        std::ofstream out(fileName);
        for (int idx = 0; idx < count; idx++) {
            out << values[idx] << "\n";
        }
    };

    dumpColumn("NeighMap.txt", NeighMap_h, neighMapSize);
    dumpColumn("NeighClass.txt", NeighClass_h, N_class);
    dumpColumn("NeighClassOffset.txt", NeighClassOffset_h, N_class);
}
// ---- Safe CUDA helpers -------------------------------------------------------
// cudaMalloc wrapper that tolerates a zero-byte request: for nbytes == 0 it
// stores nullptr in *p and reports success without calling the runtime;
// otherwise it forwards to cudaMalloc and returns its status.
inline cudaError_t SafeCudaMalloc(void** p, size_t nbytes)
{
    if (nbytes != 0) {
        return cudaMalloc(p, nbytes);
    }
    *p = nullptr;
    return cudaSuccess;
}
// Host-to-device cudaMemcpy wrapper. When there is nothing to copy
// (nbytes == 0) or either pointer is null, it does nothing and reports
// success; otherwise it returns the status of cudaMemcpy.
inline cudaError_t SafeCudaMemcpyH2D(void* dst, const void* src, size_t nbytes)
{
    const bool skipCopy = (nbytes == 0) || (dst == nullptr) || (src == nullptr);
    if (skipCopy) {
        return cudaSuccess;
    }
    return cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
}
// Zero-fill wrapper around cudaMemset. A null destination or a zero byte
// count is treated as a no-op that reports success; otherwise the status of
// cudaMemset(dst, 0, nbytes) is returned.
inline cudaError_t SafeCudaMemset0(void* dst, size_t nbytes)
{
    const bool skipFill = (dst == nullptr) || (nbytes == 0);
    if (skipFill) {
        return cudaSuccess;
    }
    // A byte-wise fill with 0 yields zero for every element type.
    return cudaMemset(dst, 0, nbytes);
}
// Convenience macros built on the Safe* wrappers defined in this file.
// Each CUDA_SAFE_* macro passes the wrapper's cudaError_t to CUDA_SAFE_CALL,
// which is not defined in this part of the file.
// BYTES(T, count): size in bytes of `count` elements of type T, computed in size_t.
#define BYTES(T, count) (static_cast<size_t>(count) * sizeof(T))
// CUDA_SAFE_MALLOC(ptr, bytes): allocate into `ptr` via SafeCudaMalloc (takes the address of ptr).
#define CUDA_SAFE_MALLOC(ptr, bytes) CUDA_SAFE_CALL(SafeCudaMalloc((void**)&(ptr), (bytes)))
// CUDA_SAFE_COPY(dst, src, bytes): host-to-device copy via SafeCudaMemcpyH2D.
#define CUDA_SAFE_COPY(dst, src, bytes) CUDA_SAFE_CALL(SafeCudaMemcpyH2D((dst), (src), (bytes)))
// CUDA_SAFE_ZERO(dst, bytes): zero-fill via SafeCudaMemset0.
#define CUDA_SAFE_ZERO(dst, bytes) CUDA_SAFE_CALL(SafeCudaMemset0((dst), (bytes)))
// ---- Safe CUDA helpers -------------------------------------------------------
// File-scope state used by the CUDA build.
#if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL)
#include "kernels.cuh"
// Streams created in the FemGrp constructor (only under DGTD_USE_CUDA there).
cudaStream_t stream_E, stream_H;
cudaStream_t stream_Pade;
// Excitation description filled in FemGrp::readBC (plane-wave fields, ExcitationFlag).
ExcitationProp excitationProp;
// One entry is appended per port boundary condition parsed in FemGrp::readBC.
std::vector<ExcitationProp> portExcitations;
// NOTE(review): presumably the device-side copy of the port excitations ("_d" suffix) — confirm where it is allocated.
ExcitationProp* ExcitationProps_d;
#endif
using namespace ClipperLib;
using namespace std;
// NOTE(review): presumably the number of basis functions per triangle — confirm against its users.
int TriNumBas = 6;
bool ModuleFlag = true;
// Barycentric coordinates of the second-order tetrahedron nodes: the first
// four rows are unit vectors (one per vertex), the remaining six rows have two
// entries equal to 0.5 (midpoint between two vertices).
static fp_t BaryCoord[SecondOrderNodes][4] = {
{1.0, 0.0, 0.0, 0.0},
{0.0, 1.0, 0.0, 0.0},
{0.0, 0.0, 1.0, 0.0},
{0.0, 0.0, 0.0, 1.0},
{0.5, 0.5, 0.0, 0.0},
{0.5, 0.0, 0.5, 0.0},
{0.5, 0.0, 0.0, 0.5},
{0.0, 0.5, 0.5, 0.0},
{0.0, 0.5, 0.0, 0.5},
{0.0, 0.0, 0.5, 0.5}
};
// One row of 18 indices per tetrahedron face.
// NOTE(review): presumably maps the face-local basis-function numbering to the
// tetrahedron-local numbering — confirm against the code that indexes it. The
// last three entries (42, 43, 44) are the same for all four faces.
static int fac2tet[4][18] = {
{5, 4, 3, 11, 10, 9, 12, 13, 25, 24, 23, 26, 30, 31, 32, 42, 43, 44},
{5, 2, 1, 11, 8, 7, 14, 15, 25, 22, 21, 27, 33, 34, 35, 42, 43, 44},
{4, 2, 0, 10, 8, 6, 16, 17, 24, 22, 20, 28, 36, 37, 38, 42, 43, 44},
{3, 1, 0, 9, 7, 6, 18, 19, 23, 21, 20, 29, 39, 40, 41, 42, 43, 44}
};
// NOTE(review): the value 3 appears twice (positions 4 and 10) while every
// other value in 1..14 appears at most once — confirm whether the second 3 is
// intentional.
int faceExcitationOrder[15] = {
1, 2, 4, 8, 3, 5, 6, 9, 10, 12, 3, 7, 11, 13, 14
};
// Row k lists the two indices of {0, 1, 2} other than k.
int First2Second[3][2] = {
{1, 2},
{0, 2},
{0, 1}
};
// Save the leading dim x dim entries of a dense matrix to a CSV file through
// rapidcsv.
//   filename : output path passed to rapidcsv::Document::Save
//   mat      : matrix read through getEntry(row, col)
//   dim      : number of rows and columns to export
template<typename T>
void writeDenseMatrixToCSV_rapidcsv(const std::string& filename, denseMat<T>* mat, int dim)
{
    // The document is filled one column at a time, so gather the entries
    // directly into per-column vectors: columns[col][row].
    std::vector<std::vector<T>> columns(dim, std::vector<T>(dim));
    for (int row = 0; row < dim; ++row) {
        for (int col = 0; col < dim; ++col) {
            columns[col][row] = mat->getEntry(row, col);
        }
    }

    rapidcsv::Document csv;
    int colIdx = 0;
    for (const std::vector<T>& column : columns) {
        csv.SetColumn<T>(colIdx, column);
        ++colIdx;
    }
    csv.Save(filename);
}
// Build a new dim x dim denseMat<T_out> from a flat row-major array of T_in
// (element (r, c) is data[r * dim + c]), converting every entry with
// static_cast. The matrix is allocated with `new`; the caller owns the result.
template<typename T_in, typename T_out>
denseMat<T_out>* wrapFlatMatrixConvert(const T_in* data, int dim) {
    denseMat<T_out>* result = new denseMat<T_out>(dim, dim);
    for (int r = 0; r < dim; ++r) {
        const T_in* rowData = data + r * dim;
        for (int c = 0; c < dim; ++c) {
            result->setEntry(r, c, static_cast<T_out>(rowData[c]));
        }
    }
    return result;
}
// Default constructor: resets counters, pointers, flags and time-stepping
// parameters to their initial values, sets the local coordinate system to the
// canonical axes, and (CUDA build) creates the three file-scope streams.
// A stream that cannot be created is reported on stderr and set to nullptr.
FemGrp::FemGrp(){
    // Mesh entity counters
    nodeCNT = 0;
    edgeCNT = 0;
    faceCNT = 0;
    tetraCNT = 0;
    bcCNT = 0;
    regularCNT = 1; //at least there is a non regular group
    regularTetraCNT = 0;

    // Mesh storage
    ndARRAY = nullptr;
    tetARRAY = nullptr;
    edgeARRAY = nullptr;
    faceARRAY = nullptr;
    regularReferenceARRAY = nullptr;

    // Materials
    objProp = nullptr;
    totalObjNum = 0;

    // Pade / non-conformal settings
    usePade = false;
    padeTime = -1;
    padeCNT = 0;
    tsSource = 0;
    nonConformalCase = false;
    nonConformalCNT = 0;
    neighCNT = 0;
    writeWhilePade = false;
    writePadeTD = false;

    // Local coordinate system: origin plus canonical axes
    Coord.setO(0.0, 0.0, 0.0);
    Coord.setx_axis(1.0, 0.0, 0.0);
    Coord.sety_axis(0.0, 1.0, 0.0);
    Coord.setz_axis(0.0, 0.0, 1.0);
    freq = 0.0;

    // Added for DGTD
    TimeStep_dt = 0.0;
    ClassMul = 0;
    dt_min = 0.0;
    dt_max = 0.0;
    dimE = 0;
    dimH = 0;
    N_class = 0;
    NtimeSteps = 0;
    LocTimeSteps = nullptr;
    LocalExciIndexE = nullptr;
    LocalExciIndexH = nullptr;
    ClassTetraCnt = nullptr;
    ClassTetraIndex = nullptr;
    ClassTetraOffset = nullptr;
    planeWaveMesh = nullptr;
    InterSurfMesh = nullptr;
    SurfMesh = nullptr;
    To = 0.0;
    Tau = 0.0;
    SamplingRate = 1.0;
    FinalTime = 0.0;
    TimeDistFlag = 0; // Port
    ExcitFlag = 0; // Scattering
    regularRegionFlag = false;
    PlaneWaveBCFlag = false;
    PortBCFlag = false;
    fieldEnergy = 0.0;
    maxFieldEnergy = 0.0;
    energyDecayFactor = 0.0;
    numberOfEnergyPoints = 0;
    UseQuadratureMatrices = true;
#if defined(DGTD_USE_CUDA)
    // Create the file-scope streams and report failures instead of ignoring
    // the return codes.
    // NOTE(review): the streams are file-scope objects, so every FemGrp
    // instance overwrites them — confirm that only one instance is created.
    cudaStream_t* const streamSlots[3] = { &stream_E, &stream_H, &stream_Pade };
    const char* const streamNames[3] = { "stream_E", "stream_H", "stream_Pade" };
    for (int s = 0; s < 3; ++s) {
        const cudaError_t err = cudaStreamCreate(streamSlots[s]);
        if (err != cudaSuccess) {
            fprintf(stderr, "FemGrp::FemGrp: cudaStreamCreate(%s) failed: %s\n",
                    streamNames[s], cudaGetErrorString(err));
            // Do not leave an unusable handle behind.
            *streamSlots[s] = nullptr;
        }
    }
    En_d = nullptr;
    Hn12_d = nullptr;
    En1_d = nullptr;
    Hn32_d = nullptr;
#endif
}
// Destructor: intentionally empty in this file.
// NOTE(review): nothing is released here — the arrays allocated with new[] by
// the read* methods (ndARRAY, tetARRAY, bcMap, objProp) and the CUDA streams
// created in the constructor are not freed by this destructor. Confirm whether
// they are released elsewhere before adding cleanup.
FemGrp::~FemGrp(){
}
// Read the node file "<fname>.node".
// Layout consumed: <unit> <nodeCount>, then for every node
//   <pType> <Priority> <singORDER> <x> <y> <z>
// Coordinates are multiplied by `unit` before being stored. When usePade is
// set, the bounding box is tracked through setMaxMinPoints.
// The program exits with status 1 when the file is missing, its header cannot
// be parsed, or it ends before all announced nodes have been read.
void FemGrp::readNODE(){
    char nname[StrLenShort];
    int pType;
    fp_t singORDER, Priority, x, y, z;

    // Bounded formatting: a long base name can no longer overflow nname.
    snprintf(nname, sizeof(nname), "%s.node", fname);
    ifstream nodefile(nname, ios::in);
    if(!nodefile){
        cout << "File " << nname << " does NOT exist " << endl;
        exit(1);
    }
    if(usePade){
        initializeMaxMinPoints();
    }

    int nodeTotal = 0;
    if(!(nodefile >> unit >> nodeTotal)){
        cout << "File " << nname << " has an invalid header " << endl;
        exit(1);
    }
    nodeCNT = nodeTotal; // only one domain, global = local

    if(nodeCNT >= 1){
        ndARRAY = new node[nodeCNT];
        for(int k = 0; k < nodeTotal; k ++){
            if(!(nodefile >> pType >> Priority >> singORDER >> x >> y >> z)){
                cout << "File " << nname << " is truncated or malformed at node " << k << endl;
                exit(1);
            }
            ndARRAY[k].set_globalId(k);
            ndARRAY[k].set_n(k);
            ndARRAY[k].set_pType(pType);
            ndARRAY[k].setPType(static_cast<PointType>(pType));
            ndARRAY[k].set_singORDER(singORDER);
            ndARRAY[k].set_coord(x * unit, y * unit, z * unit);
            if(usePade){
                setMaxMinPoints(x * unit, y * unit, z * unit);
            }
        }
        // NOTE(review): these are printed even when usePade is false, in which
        // case this function has not updated maxPoint/minPoint.
        cout << "MaxPoint = (" << maxPoint.getx() << ", " << maxPoint.gety() << ", " << maxPoint.getz() << ") " << endl;
        cout << "MinPoint = (" << minPoint.getx() << ", " << minPoint.gety() << ", " << minPoint.getz() << ") " << endl;
    }
}
// Read the tetrahedron file "<fname>.tetra" (after loading the surface map
// through readBcMap).
// Layout consumed: <tetraCount>, then for every tetrahedron
//   <objNum> <n0> <n1> <n2> <n3> <s0> <s1> <s2> <s3>
// where n* are node indices into ndARRAY and s* are surface numbers that are
// translated to boundary-condition types through bcMap.
// totalObjNum is raised to the largest objNum seen.
// The program exits with status 1 when the file is missing or malformed, when
// a node index is outside [0, nodeCNT), or when a surface number is negative.
void FemGrp::readTETRA(){
    int i, j, objNum, ndid[NumOfNodes], bcd[NumOfFaces], sNum[NumOfFaces];
    node *nd[NumOfNodes];
    char tname[StrLenShort];

    readBcMap(); // read in surface-btype map

    // Bounded formatting: a long base name can no longer overflow tname.
    snprintf(tname, sizeof(tname), "%s.tetra", fname);
    ifstream tetrafile(tname, ios::in);
    if(!tetrafile){
        cout << "File " << tname << " does NOT exist " << endl;
        exit(1);
    }

    int tetraTotal = 0;
    if(!(tetrafile >> tetraTotal)){
        cout << "File " << tname << " has an invalid header " << endl;
        exit(1);
    }
    // Only one domain exists
    tetraCNT = tetraTotal;

    if(tetraCNT >= 1){
        tetARRAY = new tetra[tetraCNT];
        for(i = 0; i < tetraTotal; i ++){
            tetrafile >> objNum;
            tetrafile >> ndid[0] >> ndid[1] >> ndid[2] >> ndid[3]; //get the ids of the nodes
            tetrafile >> sNum[0] >> sNum[1] >> sNum[2] >> sNum[3]; //get the bc number of the faces
            if(!tetrafile){
                cout << "File " << tname << " is truncated or malformed at tetra " << i << endl;
                exit(1);
            }
            if(objNum > totalObjNum)
                totalObjNum = objNum;

            for(j = 0; j < 4; j++){
                // Both values are used as array indices below; reject anything
                // that would address memory outside the arrays.
                if(ndid[j] < 0 || ndid[j] >= nodeCNT){
                    cout << "File " << tname << ": tetra " << i << " references invalid node " << ndid[j] << endl;
                    exit(1);
                }
                // NOTE(review): the upper bound of bcMap is not recorded by
                // readBcMap, so only the lower bound can be checked here.
                if(sNum[j] < 0){
                    cout << "File " << tname << ": tetra " << i << " references invalid surface " << sNum[j] << endl;
                    exit(1);
                }
                nd[j] = &(ndARRAY[ndid[j]]);
                bcd[j] = bcMap[sNum[j]];
            }
            tetARRAY[i].set_objNum(objNum);
            tetARRAY[i].set_node(nd[0], nd[1], nd[2], nd[3]);
            tetARRAY[i].set_nbc(bcd[0], bcd[1], bcd[2], bcd[3]);
            tetARRAY[i].reArrange(); //set the nodes and bc from smallest to greatest id
            tetARRAY[i].setcnt(i);
        }
    }
}
// Read the surface-to-boundary map "<fname>.bcmap".
// Layout consumed: <surfCount>, then <surfCount> pairs <sNum> <bNum>.
// The result is stored in bcMap, an array of surfCount + 1 ints indexed by
// surface number; entry 0 and every surface number that is not listed in the
// file map to 0.
// The program exits with status 1 when the file is missing, a pair cannot be
// read, or a surface number is outside [0, surfCount].
// NOTE(review): both readTETRA and readBC call this function and each call
// allocates a fresh array without releasing the previous one. The old array is
// not deleted here because the initial value of bcMap is not established in
// this file — confirm and free it where ownership is clear.
void FemGrp::readBcMap(){
    char name[StrLenShort];
    int i, surfCNT = 0, sNum, bNum;

    // Bounded formatting: a long base name can no longer overflow name.
    snprintf(name, sizeof(name), "%s.bcmap", fname);
    ifstream foo(name, ios::in);
    if(!foo){
        cout << "File " << name << " does NOT exist " << endl;
        exit(1);
    }

    foo >> surfCNT;
    if(surfCNT > 0){
        // Value-initialised: unlisted surfaces map to 0 instead of garbage.
        bcMap = new int[surfCNT + 1]();
        bcMap[0] = 0;
        for(i = 0; i < surfCNT; i ++){
            if(!(foo >> sNum >> bNum)){
                cout << "File " << name << " is truncated or malformed at entry " << i << endl;
                exit(1);
            }
            // sNum is used as an index; anything outside the array would
            // corrupt memory.
            if(sNum < 0 || sNum > surfCNT){
                cout << "File " << name << " has surface number " << sNum << " out of range " << endl;
                exit(1);
            }
            bcMap[sNum] = bNum;
        }
    }
}
// Read one real number from a material file, using the conversion that
// matches the build precision (DGTD_USE_DOUBLE selects "%le", otherwise "%e").
// Trailing whitespace is consumed. Returns true when a value was stored; on
// failure `value` is left untouched.
static inline bool femgrp_readMatScalar(FILE* fp, fp_t& value)
{
#ifdef DGTD_USE_DOUBLE
    return fscanf(fp, "%le ", &value) == 1;
#else
    return fscanf(fp, "%e ", &value) == 1;
#endif
}

// Read the material list "<fname>.prop" and the material files it refers to.
//
// The .prop file starts with the directory that holds the material files,
// followed by one material name per object. For each object the file
// "<dir>/<material>.m" is read in this order:
//   - one leading token (skipped)
//   - relative permittivity : 3x3 entries, each a (real, imaginary) pair
//   - relative permeability : 3x3 entries, each a (real, imaginary) pair
//   - conductivity          : 3x3 real entries
// Only the real part of each pair is stored. The inverse permeability is
// stored in `rum`.
// A material whose name starts with "scattering" is tagged as scattering
// region. A material whose name starts with "pml" additionally provides, in
// order: PML order, thickness, ellipse radii (Rx, Ry, Rz) and the box
// xmin xmax ymin ymax zmin zmax stored in the planewave_* members.
//
// totalObjNum is incremented by one before objProp is allocated.
// The program exits with status 1 when the .prop file or a material file
// cannot be opened, or when a material path does not fit its buffer. A
// material file with missing or malformed numbers produces a warning and
// processing continues with the values read so far.
void FemGrp::readMaterial(){
    char name[StrLenShort], matName[StrLenShort], dirName[StrLenShort], materialName[StrLenShort];
    int i, j, k;
    fp_t real = 0, imaginary = 0, temp = 0;
    FILE *matFILE;

    totalObjNum ++;
    objProp = new Material[totalObjNum];

    snprintf(name, sizeof(name), "%s.prop", fname);
    ifstream foo(name, ios::in);
    if(!foo){
        cout << "File " << name << " does NOT exist " << endl;
        exit(1);
    }

    // setw bounds the extraction to the buffer size (terminator included).
    foo >> setw((int)sizeof(dirName)) >> dirName; //directory where the materials are stored
    DEBUG_INFO("totalObjNum: " + to_string(totalObjNum));

    //TODO: it only takes the real part
    for(i = 0; i < totalObjNum; i++)
    {
        foo >> setw((int)sizeof(materialName)) >> materialName;

        const int pathLen = snprintf(matName, sizeof(matName), "%s/%s.m", dirName, materialName);
        if(pathLen < 0 || pathLen >= (int)sizeof(matName)){
            cout << "Material path too long: " << dirName << "/" << materialName << ".m" << endl;
            exit(1);
        }

        matFILE = fopen(matName, "r");
        if(matFILE == NULL){
            // Previously a missing file led to fscanf on a NULL stream.
            cout << "File " << matName << " does NOT exist " << endl;
            exit(1);
        }
        cout << "Reading material properties from file: " << materialName << endl;

        // The first token of the file is not used; skip it without storing it.
        fscanf(matFILE, "%*s");

        bool complete = true; // cleared when any expected number is missing

        // relative dielectric constant
        for(j = 0; j < NumOfUnitaryVectors; j ++){
            for(k = 0; k < NumOfUnitaryVectors; k ++){
                const bool gotPair = femgrp_readMatScalar(matFILE, real)
                                  && femgrp_readMatScalar(matFILE, imaginary);
                complete = complete && gotPair;
                objProp[i].epsr.setEntry(j, k, real);
            }
        }
        // relative permeability
        for(j = 0; j < NumOfUnitaryVectors; j ++){
            for(k = 0; k < NumOfUnitaryVectors; k ++){
                const bool gotPair = femgrp_readMatScalar(matFILE, real)
                                  && femgrp_readMatScalar(matFILE, imaginary);
                complete = complete && gotPair;
                objProp[i].mur.setEntry(j, k, real);
            }
        }
        // conductivity
        for(j = 0; j < NumOfUnitaryVectors; j ++){
            for(k = 0; k < NumOfUnitaryVectors; k ++){
                const bool gotValue = femgrp_readMatScalar(matFILE, real);
                complete = complete && gotValue;
                objProp[i].sigma.setEntry(j, k, real);
            }
        }
        objProp[i].rum = objProp[i].mur.inverse();

        // Tag Scattering Region
        objProp[i].scattering_region = (strncmp(materialName, "scattering", 10) == 0);

        // PML
        if (strncmp(materialName, "pml", 3) == 0)
        {
            PML_flag = true;
            // Set Tetrahedron PML type true
            objProp[i].set_PML_Flag(1);
            cout << "PML Material Properties: " << endl;

            // PML Max Conductivity
            fp_t conductivity_PML = objProp[i].sigma.getEntry(0, 0);
            cout << "conductivity_PML = " << conductivity_PML << endl;

            // PML Order
            complete = femgrp_readMatScalar(matFILE, temp) && complete;
            PML_conductivity_order = temp;
            objProp[i].set_PML_m_ord(PML_conductivity_order);
            cout << "PML_m_ord: " << objProp[i].get_PML_m_ord() << endl;

            // PML Thickness
            complete = femgrp_readMatScalar(matFILE, temp) && complete;
            PML_thickness = temp;
            objProp[i].set_PML_thick(PML_thickness);
            cout << "PML_thickness: " << objProp[i].get_PML_thick() << endl;

            // PML Geometry
            complete = femgrp_readMatScalar(matFILE, temp) && complete;
            Ellipse_Rx = temp;
            complete = femgrp_readMatScalar(matFILE, temp) && complete;
            Ellipse_Ry = temp;
            complete = femgrp_readMatScalar(matFILE, temp) && complete;
            Ellipse_Rz = temp;
            cout << "Ellipse_Rx: " << Ellipse_Rx << endl;
            cout << "Ellipse_Ry: " << Ellipse_Ry << endl;
            cout << "Ellipse_Rz: " << Ellipse_Rz << endl;

            // Region box
            complete = femgrp_readMatScalar(matFILE, temp) && complete;
            planewave_xmin = temp;
            complete = femgrp_readMatScalar(matFILE, temp) && complete;
            planewave_xmax = temp;
            complete = femgrp_readMatScalar(matFILE, temp) && complete;
            planewave_ymin = temp;
            complete = femgrp_readMatScalar(matFILE, temp) && complete;
            planewave_ymax = temp;
            complete = femgrp_readMatScalar(matFILE, temp) && complete;
            planewave_zmin = temp;
            complete = femgrp_readMatScalar(matFILE, temp) && complete;
            planewave_zmax = temp;
            cout << "PML Region:\n";
            cout << " x: [" << planewave_xmin << ", " << planewave_xmax << "]\n";
            cout << " y: [" << planewave_ymin << ", " << planewave_ymax << "]\n";
            cout << " z: [" << planewave_zmin << ", " << planewave_zmax << "]\n";
        }
        else
        {
            // Set Tetrahedron PML type false
            objProp[i].set_PML_Flag(0);
        }

        if(!complete){
            cout << "Warning: material file " << matName << " is incomplete or malformed" << endl;
        }
        fclose(matFILE);
    }
}
void FemGrp::readBC()
{
char name[StrLenShort], bcName[StrLenShort], portName[StrLenShort];
int i, pNum, bNum, bType;
fp_t impR, impI, magE;
fp_t x, y, z;
fp_t theta, phi;
fp_t rox, roy, roz;
fp_t r1x, r1y, r1z;
int PortFlag;
fp_t CHIRP_BW_MHZ;
fp_t phaseE;
fp_t port_dx, port_dy, port_dz;
fp_t vpath_x, vpath_y, vpath_z;
fp_t epr, mur;
PEC_PMC_port_flag = 0;
// For ports
bcNumToPnum.clear();
pnumToBcNum.clear();
// For PML
int pmlMode; // 0->radiation(port) , 1->scattering
fp_t pol_x, pol_y, pol_z;
PML_flag = false;
readBcMap();
sprintf(name, "%s.bc", fname);
ifstream foo(name, ios::in);
if(!foo){
cout << "File " << name << " does NOT exist " << endl;
exit(1);
}
foo >> bcCNT;
bcARRAY = new bc[bcCNT];
portCNT = 0;
nonConformalCNT = 0;
for(i = 0; i < bcCNT; i ++)
{
foo >> bNum >> bcName;
bcARRAY[i].set_bNum(bNum); // id in file
bcARRAY[i].set_name(bcName); // name in file
bType = bcTypeConvert(bcName);
bcARRAY[i].set_bType(bType);
switch (bType)
{
case 0: // none
{
break;
}
case pmcType: // pmc
{
break;
}
case fieldPlaneType:
{
break; // fieldPlane
}
case outputSurfType:
{
cout << "outputSurfType" << endl;
break;
}
case abcType:
{
foo >> impR; //abc
bcARRAY[i].set_rval(impR * No);
break;
}
case constE:
{
foo >> x >> y >> z; // constE
bcARRAY[i].SETFIELD(x, y, z);
break;
}
case pecType:
{
break; // pec
}
case impType:
{
foo >> impR >> impI; //original
bcARRAY[i].set_cval(impR, impI);
break;
}
case portType:
{
// (1) TEM rectangular port
// port <name> <pNum> 1 <impR> <impI> <magE> <dx> <dy> <dz> <BW> <epr> <mur> <vpath_x> <vpath_y> <vpath_z>
// (2) TEM coaxial port
// port <name> <pNum> 2 <impR> <impI> <magE> <dx> <dy> <dz> <BW> <epr> <mur> <r0x> <r0y> <r0z> <r1x> <r1y> <r1z> <r2x> <r2y> <r2z>
// (3) TE rectangular port (a is along height and b is along width)
// port <name> <pNum> 3 <impR> <impI> <magE> <dx> <dy> <dz> <BW> <epr> <mur> <a> <b> <m> <n> <uv0x> <uv0y> <uv0z> <vpx> <vpy> <vpz>
if (!(foo >> portName >> pNum >> PortFlag))
{
std::cerr << "[PORT] Failed to read <name pNum PortFlag>\n";
break;
}
cout << "pNum = " << pNum << endl;
cout << "portField = " << PortFlag << endl;
// Initialization of the variables
impR=0.0, impI=0.0, magE=1.0;
port_dx=0.0, port_dy=0.0, port_dz=1.0;
CHIRP_BW_MHZ=0.0, epr=1.0, mur=1.0;
if (!(foo >> impR >> impI >> magE >> port_dx >> port_dy >> port_dz >> CHIRP_BW_MHZ >> epr >> mur))
{
std::cerr << "[PORT] Failed to read common fields for port " << portName << "\n";
break;
}
// Book-keeping
bcARRAY[i].set_name(portName);
bcARRAY[i].set_cval(impR, impI);
bcARRAY[i].set_rval(impR);
bcARRAY[i].set_pNum(pNum);
bcARRAY[i].set_PortFlag(PortFlag);
portCNT++;
PWorPort = 1;
PortBCFlag = true;
// If user gives impR==0, let device compute eta
const double MU0 = 1.2566370614359173e-6; // 4π·1e-7
const double EPS0 = 8.854187817e-12;
const double PI = 3.14159265358979323846;
if (epr <= 0.0) epr = 1.0;
if (mur <= 0.0) mur = 1.0;
const double mu = mur * MU0;
const double eps = epr * EPS0;
#if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL)
excitationProp.ExcitationFlag = ExcitFlag;
ExcitationProp portEx{};
portEx.portNum = pNum;
portEx.BCNum = i;
// Timing / envelope
portEx.TimeDistributionFlag = getTimeDist();
portEx.to = To;
portEx.tau = Tau;
portEx.freq_m = (fp_t_ts)freq; // MHz
portEx.CHIRP_BW_MHZ = (fp_t_ts)CHIRP_BW_MHZ;
// Medium / amplitude
portEx.epr = (fp_t_ts)((epr>0.0)? epr : 1.0);
portEx.mur = (fp_t_ts)((mur>0.0)? mur : 1.0);
portEx.Emagnitude = (fp_t_ts)magE;
// Direction vector (diagnostic; geometry gives unit normal)
portEx.PortDirection[0] = (fp_t_ts)port_dx;
portEx.PortDirection[1] = (fp_t_ts)port_dy;
portEx.PortDirection[2] = (fp_t_ts)port_dz;
// E/H field impedance (if 0, device computes implicitly)
portEx.PortImpedance = (fp_t_ts)impR;
portEx.PortFlag = PortFlag;
// Map BC <-> Port
bcNumToPnum[portEx.BCNum] = portEx.portNum;
pnumToBcNum[portEx.portNum] = portEx.BCNum;
// ---- Branch by PortFlag for extra fields ----
switch (PortFlag)
{
case 1: // TEM rectangular: needs vpath
{
double vpx=0, vpy=0, vpz=0;
if (!(foo >> vpx >> vpy >> vpz))
{
std::cerr << "[PORT] TEM-rect missing <vpath_x vpath_y vpath_z> for " << portName << "\n";
// Default vpath to PortDirection if absent
vpx = port_dx; vpy = port_dy; vpz = port_dz;
}
portEx.vpath[0] = (fp_t_ts)vpx;
portEx.vpath[1] = (fp_t_ts)vpy;
portEx.vpath[2] = (fp_t_ts)vpz;
if (impR == 0.0 && impI == 0.0)
{
double eta = std::sqrt(mu/eps); // η = sqrt(μ/ε)
portEx.PortImpedance = (fp_t_ts)eta;
bcARRAY[i].set_rval(eta);
}
else
{
portEx.PortImpedance = (fp_t_ts)impR;
bcARRAY[i].set_rval(impR);
}
break;
}
case 2: // TEM coax: needs r0 (center), r1 (inner), r2 (outer)
{
double r0x, r0y, r0z, r1x, r1y, r1z, r2x, r2y, r2z;
if (!(foo >> r0x >> r0y >> r0z
>> r1x >> r1y >> r1z
>> r2x >> r2y >> r2z)) {
std::cerr << "[PORT] TEM-coax missing r0/r1/r2 for " << portName << "\n";
// Provide safe defaults (degenerate; will inject 0)
r0x=r0y=r0z=0; r1x=1e-3; r1y=r1z=0; r2x=4e-3; r2y=r2z=0;
}
portEx.r0_port[0]=(fp_t_ts)r0x; portEx.r0_port[1]=(fp_t_ts)r0y; portEx.r0_port[2]=(fp_t_ts)r0z;
portEx.r1_port[0]=(fp_t_ts)r1x; portEx.r1_port[1]=(fp_t_ts)r1y; portEx.r1_port[2]=(fp_t_ts)r1z;
portEx.r2_port[0]=(fp_t_ts)r2x; portEx.r2_port[1]=(fp_t_ts)r2y; portEx.r2_port[2]=(fp_t_ts)r2z;
std::array<double,3> v10 = { r1x - r0x, r1y - r0y, r1z - r0z };
std::array<double,3> v20 = { r2x - r0x, r2y - r0y, r2z - r0z };
const double a = std::sqrt(v10[0]*v10[0] + v10[1]*v10[1] + v10[2]*v10[2]);
const double b = std::sqrt(v20[0]*v20[0] + v20[1]*v20[1] + v20[2]*v20[2]);
if (impR == 0.0 && impI == 0.0)
{
double eta = std::sqrt(mu/eps); // η = sqrt(μ/ε)
// Characteristic (V/I) line impedance of the coax
double Z0_line = std::numeric_limits<double>::quiet_NaN();
bool geom_ok = (a > 0.0) && (b > a);
if (geom_ok)
{
Z0_line = (eta / (2.0*PI)) * std::log(b/a);
}
else
{
std::cerr << "[PORT] TEM-coax invalid radii (a=" << a << ", b=" << b
<< "). Using only field impedance eta for BC.\n";
}
portEx.PortImpedance = (fp_t_ts)Z0_line;
bcARRAY[i].set_rval(Z0_line);
}
else
{
portEx.PortImpedance = (fp_t_ts)impR;
bcARRAY[i].set_rval(impR);
}
break;
}
case 3: // TE_mn rectangular: needs rect_a rect_b m n uv0x uv0y uv0z vpx vpy vpz
{
double rect_a, rect_b;
int m, n;
double uv0x, uv0y, uv0z;
double vpx, vpy, vpz;
if (!(foo >> rect_a >> rect_b >> m >> n >> uv0x >> uv0y >> uv0z >> vpx >> vpy >> vpz))
{
std::cerr << "[PORT] TE_mn missing <a b m n uv0x uv0y uv0z vpx vpy vpz> for " << portName << "\n";
// Safe defaults (device clamps tiny a/b)
rect_a = 1.0; rect_b = 1.0; m = 1; n = 0;
uv0x = uv0y = uv0z = 0.0;
// use PortDirection as fallback vpath
vpx = port_dx; vpy = port_dy; vpz = port_dz;
}
portEx.rect_a = (fp_t_ts)rect_a;
portEx.rect_b = (fp_t_ts)rect_b;
portEx.m = m;
portEx.n = n;
portEx.uv0[0]=(fp_t_ts)uv0x;
portEx.uv0[1]=(fp_t_ts)uv0y;
portEx.uv0[2]=(fp_t_ts)uv0z;
// store the raw vpath too (optional, but handy for logging/diagnostics)
portEx.vpath[0] = (fp_t_ts)vpx;
portEx.vpath[1] = (fp_t_ts)vpy;
portEx.vpath[2] = (fp_t_ts)vpz;
// ---- Build t1, t2 from vpath and PortDirection (n) ----
// n = normalized PortDirection
double nx = port_dx, ny = port_dy, nz = port_dz;
double nrm = std::sqrt(nx*nx + ny*ny + nz*nz);
if (nrm < 1e-14) { nx = 0.0; ny = 0.0; nz = 1.0; nrm = 1.0; }
nx /= nrm; ny /= nrm; nz /= nrm;
double t1x = vpx;
double t1y = vpy;
double t1z = vpz;
// t2 = n × t1
double t2x = ny*t1z - nz*t1y;
double t2y = nz*t1x - nx*t1z;
double t2z = nx*t1y - ny*t1x;
double t2n = std::sqrt(t2x*t2x + t2y*t2y + t2z*t2z);
t2x /= t2n; t2y /= t2n; t2z /= t2n;
// store in the excitation
portEx.t1[0] = (fp_t_ts)t1x; portEx.t1[1] = (fp_t_ts)t1y; portEx.t1[2] = (fp_t_ts)t1z;
portEx.t2[0] = (fp_t_ts)t2x; portEx.t2[1] = (fp_t_ts)t2y; portEx.t2[2] = (fp_t_ts)t2z;
if (impR == 0.0 && impI == 0.0)
{
// Geometry (meters) & mode indices already read into rect_a, rect_b, m, n
const double a = (rect_a > 0.0) ? rect_a : 1e-12;
const double b = (rect_b > 0.0) ? rect_b : 1e-12;
// Frequency (MHz in your code)
const double omega = 2.0 * PI * freq * 1.0e6;
const double kc2 = std::pow(m*PI/a, 2.0) + std::pow(n*PI/b, 2.0); // k_cutoff^2
const double k2 = omega*omega * mu * eps; // k^2
double Z_TE_real = std::numeric_limits<double>::quiet_NaN();
double Z_TE_imag = 0.0;
if (k2 <= kc2)
{
// Below cutoff: Z_TE = -j*(ωμ/α), purely reactive
const double alpha = std::sqrt(kc2 - k2);
Z_TE_imag = -(omega * mu) / alpha;
Z_TE_real = 1e12; // large real placeholder for BC scalar
std::cerr << "[PORT] TE_mn below cutoff (a=" << a << ", b=" << b
<< ", m=" << m << ", n=" << n << "). Using large real Z for BC, "
<< "Im{Z_TE}=" << Z_TE_imag << " Ohm.\n";
}
else
{
// Above cutoff: Z_TE is real and positive
const double beta = std::sqrt(k2 - kc2);
Z_TE_real = (omega * mu) / beta;
}
// User asked us to determine impedance → store TE wave impedance
portEx.PortImpedance = (fp_t_ts)Z_TE_real;
bcARRAY[i].set_rval(Z_TE_real);
bcARRAY[i].set_cval(Z_TE_real, Z_TE_imag);
}
else
{
// User-specified
portEx.PortImpedance = (fp_t_ts)impR;
bcARRAY[i].set_rval(impR);
bcARRAY[i].set_cval(impR, impI);
}
break;
}
default:
{
std::cerr << "[PORT] Unknown PortFlag=" << PortFlag << " for " << portName
<< ". Defaulting to TEM-rect with vpath=PortDirection.\n";
portEx.PortFlag = 1;
portEx.vpath[0] = (fp_t_ts)port_dx;
portEx.vpath[1] = (fp_t_ts)port_dy;
portEx.vpath[2] = (fp_t_ts)port_dz;
if (impR == 0.0 && impI == 0.0) portEx.PortImpedance = (fp_t_ts)0.0;
break;
}
}
portExcitations.push_back(portEx);
// Log summary
std::cout << "\n=========================\n"
<< " PORT BOUNDARY CONDITION \n"
<< "=========================\n"
<< "PortName : " << portName << "\n"
<< "PortNum : " << (portEx.portNum - 1) << "\n"
<< "PortFlag : " << portEx.PortFlag << " (1=TEM-rect, 2=TEM-coax, 3=TE_mn)\n"
<< "E/H Zport : " << portEx.PortImpedance << " + j" << impI << " (0 => implicit)\n"
<< "magE : " << portEx.Emagnitude << "\n"
<< "PortDir : (" << port_dx << ", " << port_dy << ", " << port_dz << ")\n"
<< "epr, mur : " << epr << ", " << mur << "\n";
if (portEx.PortFlag == 1)
{
std::cout << "vpath : (" << portEx.vpath[0] << ", " << portEx.vpath[1] << ", " << portEx.vpath[2] << ")\n";
}
else if (portEx.PortFlag == 2)
{
std::cout << "r0 : (" << portEx.r0_port[0] << ", " << portEx.r0_port[1] << ", " << portEx.r0_port[2] << ")\n"
<< "r1(inner) : (" << portEx.r1_port[0] << ", " << portEx.r1_port[1] << ", " << portEx.r1_port[2] << ")\n"
<< "r2(outer) : (" << portEx.r2_port[0] << ", " << portEx.r2_port[1] << ", " << portEx.r2_port[2] << ")\n";
}
else if (portEx.PortFlag == 3)
{
std::cout << "rect(a,b) : " << portEx.rect_a << ", " << portEx.rect_b << "\n"
<< "m,n : " << portEx.m << ", " << portEx.n << "\n"
<< "uv0 : (" << portEx.uv0[0] << ", " << portEx.uv0[1] << ", " << portEx.uv0[2] << ")\n";
}
std::cout << "=========================\n\n";
#endif
break;
}
case planeWaveType: // planeWave (theta, phi, ex, ey, ez)
{
char typeName[StrLenShort];
foo >> typeName >> magE >> theta >> phi >> x >> y >> z >> rox >> roy >> roz;
cout << " " << endl;
cout << "====================================================================================================" << endl;
cout << " PLANEWAVE BOUNDARY CONDITION " << endl;
cout << "====================================================================================================" << endl;
printf(" PlaneWaveType : %f %f %f %f %f %f %f %f %f\n", magE, theta, phi, x, y, z, rox, roy, roz);
printf(" Unit : %f\n", unit);
bcARRAY[i].set_name(typeName);
bcARRAY[i].set_magE(magE);
bcARRAY[i].setTheta(theta);
bcARRAY[i].setPhi(phi);
bcARRAY[i].set_cval(No, 0.0);
bcARRAY[i].SETFIELD(x, y, z);
bcARRAY[i].setPW_ro(rox * unit, roy * unit, roz * unit);
cout << " Name : " << typeName << endl;
cout << " magE : " << magE << endl;
cout << " Theta : " << theta << endl;
cout << " Phi : " << phi << endl;
cout << " POL : " << "(" << x << ", " << y << ", " << z << ")" << endl;
cout << " r0 : " << "(" << rox << ", " << roy << ", " << roz << ")" << endl;
cout << "====================================================================================================" << endl;
cout << " " << endl;
PWorPort = 0;
#if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL)
// for cuda kernel
excitationProp.ro[0] = rox * unit;
excitationProp.ro[1] = roy * unit;
excitationProp.ro[2] = roz * unit;
excitationProp.Emagnitude = magE;
excitationProp.Epol[0] = x;
excitationProp.Epol[1] = y;
excitationProp.Epol[2] = z;
excitationProp.ExcitationFlag = ExcitFlag;
excitationProp.freq_m = freq;
excitationProp.to = To;
excitationProp.tau = Tau;
excitationProp.phi = phi;
excitationProp.theta = theta;
#endif
interior_excitation_flag = false;
planeWaveMesh = new PlaneWaveMesh;
planeWaveMesh->setName(typeName);
PlaneWaveBCFlag = true;
break;
}
case nonConformal:
{
nonConformalCase = true;
break;
}
// Excitation Mode (PlaneWave into PML region)
case pmlType:
{
foo >> pmlMode >> portName >> magE >> theta >> phi >> pol_x >> pol_y >> pol_z >> rox >> roy >> roz;
PWorPort = 0;
std::cout << "\n";
std::cout << "====================================================================================================" << std::endl;
std::cout << " PML EXCITATION BOUNDARY CONDITION " << std::endl;
std::cout << "====================================================================================================" << std::endl;
printf(" PML Mode : %d\n", pmlMode);
printf(" Port Name : %s\n", portName);
printf(" magE : %f\n", magE);
printf(" Theta : %f\n", theta);
printf(" Phi : %f\n", phi);
printf(" POL : (%f, %f, %f)\n", pol_x, pol_y, pol_z);
printf(" r0 : (%f, %f, %f)\n", rox, roy, roz);
printf(" Unit : %f\n", unit); // Make sure `unit` is defined
std::cout << "====================================================================================================" << std::endl;
std::cout << "\n";
// Apply to BC object
bcARRAY[i].set_name(portName);
bcARRAY[i].set_magE(magE);
bcARRAY[i].setTheta(theta);
bcARRAY[i].setPhi(phi);
bcARRAY[i].set_cval(No, 0.0);
bcARRAY[i].SETFIELD(pol_x, pol_y, pol_z); // Assuming SETFIELD is for polarization
bcARRAY[i].setPW_ro(rox * unit, roy * unit, roz * unit);
#if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL)
excitationProp.ro[0] = rox * unit;
excitationProp.ro[1] = roy * unit;
excitationProp.ro[2] = roz * unit;
excitationProp.Emagnitude = magE;
excitationProp.Epol[0] = pol_x;
excitationProp.Epol[1] = pol_y;
excitationProp.Epol[2] = pol_z;
excitationProp.ExcitationFlag = ExcitFlag; // Must be defined
excitationProp.freq_m = freq; // Must be defined
excitationProp.to = To; // Must be defined
excitationProp.tau = Tau; // Must be defined
excitationProp.phi = phi;
excitationProp.theta = theta;
#endif
if (pmlMode == 1)
{
interior_excitation_flag = true;
planeWaveMesh = new PlaneWaveMesh;
planeWaveMesh->setName(portName);
PlaneWaveBCFlag = true;
}
break;
}
}
}
}
/**
Make the edge and face arrays
*/
/**
 * Boundary type of one local edge of a tetrahedron.
 *
 * Each local edge is associated with two entries of nbc (the per-face
 * boundary types of the tetrahedron); the edge takes the larger of the two.
 *
 * @param n    local edge index, expected in 0..5
 * @param nbc  per-face boundary types; entries 0..3 are read
 * @return     the larger of the two associated face boundary types, or 0 when
 *             n is outside 0..5 (0 is also what FemGrp::bcArrange returns
 *             for an unknown marker)
 */
int localEdgebType(int n, int nbc[]){
    // facePair[e] holds the two nbc indices consulted for local edge e.
    static const int facePair[6][2] = {
        {2, 3}, {1, 3}, {1, 2}, {0, 3}, {0, 2}, {0, 1}
    };
    // Guard: the original switch had no default and read uninitialized
    // locals for an out-of-range edge index.
    if (n < 0 || n >= 6)
        return 0;
    const int first = nbc[facePair[n][0]];
    const int second = nbc[facePair[n][1]];
    return (first > second) ? first : second;
}
void FemGrp::makeEdgeArray(){
int i, j;
// oversized array for edge BCs
int* edgeBcs = new int[tetraCNT * 6];
// store global edge ids for set/array use
int** edgeIds = new int*[tetraCNT];
for(i = 0; i < tetraCNT; i++)
edgeIds[i] = new int[NumOfEdges];
int nbc[NumOfFaces];
list<edge*> edgeList;
list<edge*>::iterator edgeListIter;
edgeSetPtr = new set<edge>;
set<edge>::iterator edgeSetIter;
int index = 0;
for(i = 0; i < tetraCNT; i++){
tetra* tet = &(tetARRAY[i]);
for(j = 0; j < NumOfFaces; j++)
nbc[j] = bcArrange(tet->getbc(j)); //return the bc (the number define for the material) of each face
for(j = 0; j < NumOfEdges; j++){
int n0 = edgeMAP[j][0];
int n1 = edgeMAP[j][1];
int bType = localEdgebType(j, nbc); //return the most important bc of the edge checking both faces
node* nd0 = tet->getNode(n0);
node* nd1 = tet->getNode(n1);
edge* eg = new edge;
eg->setEdge(nd0, nd1);
//add each edge just once
edgeSetIter = edgeSetPtr->find(*eg);
if(edgeSetIter == edgeSetPtr->end()){
// new edge
eg->setGlobalCnt(index);
edgeIds[i][j] = index;
eg->setbType(bType);
edgeBcs[index] = bType;
edgeSetPtr->insert(*eg);
edgeList.push_back(eg);
index++;
}else{
// set the boundary condicion of higher value if the edge was already set
delete eg;
edgeIds[i][j] = edgeSetIter->getGlobalCnt();
if(bType > edgeSetIter->getbType()){
edgeBcs[edgeIds[i][j]] = bType;
(const_cast<edge&>(*edgeSetIter)).setbType(bType);
}
}
}
}
// convert the list into an array
edgeCNT = edgeList.size();
cout << " edgeCNT == " << edgeCNT << endl;
edgeARRAY = new edge*[edgeCNT];
index = 0;
for(edgeListIter = edgeList.begin(); edgeListIter != edgeList.end(); edgeListIter++)
edgeARRAY[index++] = *edgeListIter;
// set the boundary conditions
for(i = 0; i < edgeCNT; i++)
edgeARRAY[i]->setbType(edgeBcs[i]);
delete [] edgeBcs;
// get tetra-edge linkage
for(i = 0; i < tetraCNT; i++){
for(j = 0; j < NumOfEdges; j++)
tetARRAY[i].setEdge(edgeARRAY[edgeIds[i][j]], j);
}
for(i = 0; i < tetraCNT; i++)
delete [] edgeIds[i];
delete [] edgeIds;
}
// Collect the `cnt` of every tetrahedron flagged as non-conformal into ncARRAY.
// ncARRAY is sized from nonConformalCNT; if the number of flagged tetrahedra
// disagrees with that count an error is printed. Entries beyond the allocated
// size are counted but not stored, so a mismatch can no longer write past the
// end of the array.
void FemGrp::makeNonConformalArray(){
    ncARRAY = new int[nonConformalCNT];
    int found = 0;
    for(int i = 0; i < tetraCNT; i++){
        tetra* tet = &(tetARRAY[i]);
        if(!tet->getIsNC())
            continue;
        // only store while there is room; keep counting so the mismatch is reported
        if(found < nonConformalCNT)
            ncARRAY[found] = tet->cnt;
        found++;
    }
    if(nonConformalCNT != found)
        cout << "ERROR in makeNonConformalArray" << endl;
}
void FemGrp::makeFaceArray()
{
int i, j;
// oversized arrays for face BCs and a map from global IDs with PEC faces to IDs without PEC face
int* faceBcs = new int[tetraCNT * NumOfFaces];
int* indexMap = new int[tetraCNT * NumOfFaces]; //TODO: review what's the use of this array
memset(faceBcs, 0, tetraCNT * NumOfFaces * sizeof(int));
memset(indexMap, 0, tetraCNT * NumOfFaces * sizeof(int));
// store global face ids for set/array use
int** faceIds = new int*[tetraCNT];
for(i = 0; i < tetraCNT; i++){
faceIds[i] = new int[NumOfFaces];
memset(faceIds[i], 0, NumOfFaces * sizeof(int));
}
edge eg;
list<face*> faceList;
vector<face*> faceListVector;
list<face*>::iterator faceListIter;
faceSetPtr = new set<face>;
set<face>::iterator faceSetIter;
int index = 0;
int indexNoPec = 0; //TODO: review what's the use of this variable
for(i = 0; i < tetraCNT; i++){
tetra* tet = &(tetARRAY[i]);
for(j = 0; j < NumOfFaces; j++){
int bcNum = tet->getbc(j); // marker
int bType = bcArrange(bcNum); // bc type in the defines
bc* bcPtr = getbcPtr(bcNum); // pointer to the bc
if(bType == nonConformal && !(tet->isNonConformal)){
nonConformalCNT++;
tet->setIsNC(true);
}
node* nd0 = tet->getNode(faceMAP[j][0]);
node* nd1 = tet->getNode(faceMAP[j][1]);
node* nd2 = tet->getNode(faceMAP[j][2]);
face* fc = new face;
fc->setFace(nd0, nd1, nd2); //set a face with the nodes ordered from smallest to greatest id
faceSetIter = faceSetPtr->find(*fc);
if(faceSetIter == faceSetPtr->end()){
// new face
fc->setcnt(index);
faceIds[i][j] = index;
if(bType != pecType)
indexMap[index] = indexNoPec++;
faceBcs[index] = bType;
fc->setbcPtr(bcPtr);
// set up face-edge linkage
eg.setEdge(nd1, nd2);
fc->setEdge(edgeARRAY[edgeSetPtr->find(eg)->getGlobalCnt()], 0);
eg.setEdge(nd0, nd2);
fc->setEdge(edgeARRAY[edgeSetPtr->find(eg)->getGlobalCnt()], 1);
eg.setEdge(nd1, nd0);
fc->setEdge(edgeARRAY[edgeSetPtr->find(eg)->getGlobalCnt()], 2);
index++;
faceSetPtr->insert(*fc);
faceList.push_back(fc);
faceListVector.push_back(fc);
}else{
delete fc;
faceIds[i][j] = faceSetIter->getcnt(); // the j-th local face of tetra i is an old face
if(bType > faceSetIter->getbType()){ // choose btype with a larger value
faceBcs[faceIds[i][j]] = bType;
(const_cast<face&>(*faceSetIter)).setbType(bType);
face* f = faceListVector[faceIds[i][j]];
f->setbType(bType);
f->setbcPtr(bcPtr);
}
}
}
}
// convert the list into an array
int totalFaceCount = faceList.size();
cout << " totalFaceCount == " << totalFaceCount << endl;
face** totalFaceArray = new face*[totalFaceCount];
index = 0;
for(faceListIter = faceList.begin(); faceListIter != faceList.end(); faceListIter++)
totalFaceArray[index++] = *faceListIter;
// set the boundary conditions
for(i = 0; i < totalFaceCount; i++){
totalFaceArray[i]->setbType(faceBcs[i]);
}
// set tetra-face linkage
for(i = 0; i < tetraCNT; i++){
tetra* tet = &(tetARRAY[i]);
for(j = 0; j < 4; j++){
face* fc = totalFaceArray[faceIds[i][j]];
tet->setFace(fc, j);
if(fc->hydra[0] == nullptr){ // newly found face linkage
fc->hydra[0] = tet;
} else { // already existed, half-linked
fc->hydra[1] = tet;
fc->tetraArrange(); //order hydra[0] < hydra[1]
}
}
}
for(i = 0; i < tetraCNT; i++)
delete [] faceIds[i];
delete [] faceIds;
delete [] totalFaceArray;
delete [] faceBcs;
delete [] indexMap;
// convert the reduced list into an array
faceCNT = faceList.size();
faceARRAY = new face*[faceCNT];
indexNoPec = 0;
for(faceListIter = faceList.begin(); faceListIter != faceList.end(); faceListIter++)
faceARRAY[indexNoPec++] = *faceListIter;
while (faceSetIter != faceSetPtr->end()){
set<face>::iterator tmpIter = faceSetIter;
faceSetIter++;
faceSetPtr->erase(tmpIter);
}
faceSetPtr->clear();
delete faceSetPtr;
set<edge>::iterator edgeSetIter = edgeSetPtr->begin();
while(edgeSetIter != edgeSetPtr->end()){
set<edge>::iterator tmpIter = edgeSetIter;
edgeSetIter++;
edgeSetPtr->erase(tmpIter);
}
edgeSetPtr->clear();
delete edgeSetPtr;
}
// Translate a boundary marker (as given in the input file) into the boundary
// type stored on the matching bcARRAY entry. Returns 0 when no entry carries
// that marker.
int FemGrp::bcArrange(int bNum){
    int slot = 0;
    while(slot < bcCNT){
        if(bcARRAY[slot].getbNum() == bNum){
            return bcARRAY[slot].getbType();
        }
        ++slot;
    }
    return 0;
}
// Find the boundary-condition object whose marker equals bNum.
// Returns a pointer into bcARRAY, or nullptr when no entry matches.
bc *FemGrp::getbcPtr(int bNum){
    int slot = 0;
    while(slot < bcCNT){
        if(bcARRAY[slot].getbNum() == bNum){
            return &(bcARRAY[slot]);
        }
        ++slot;
    }
    return nullptr;
}
void FemGrp::AssignExcitParamToFace(){
for(int i = 0; i < faceCNT; i++){
faceARRAY[i]->setTo(To);
faceARRAY[i]->setTau(Tau);
faceARRAY[i]->setTimeDist(TimeDistFlag);
faceARRAY[i]->setExciFlag(ExcitFlag);
faceARRAY[i]->setFrequency(freq);
}
}
void FemGrp::AssignMaterialProperties(){
int i;
tetra *tet;
for(i = 0; i < tetraCNT; i++)
{
tet = &(tetARRAY[i]);
tet->SetFacePEC();
tet->SetFacePMC();
tet->set_mat(&(objProp[tet->getobjNum()]));
tet->set_ConductivityFlag();
// Additional routine for scattering region
if (tet->getMat()->scattering_region)
{
tet->scattering_region = true;
}
// Additional routine for PML
if (tet->getMat()->get_PML_Flag() == 1)
{
tet->set_PML_Flag(1);
}
else
{
tet->set_PML_Flag(0);
}
if (tet->get_PML_Flag() == -1) cout << "PML_Flag() not set " << endl;
}
}
// Classify every tetrahedron and build the excitation index list.
//   1. set_TetrahedronFlag() on every tetrahedron (parallel)
//   2. set_NeighborTetra() on every tetrahedron (parallel)
//   3. set the polynomial order and count tetrahedra with TetrahedronFlag 0
//      ("Interior"), TetrahedronFlag 1 ("AbcCount") and ExcitationFlag 1
//      ("Port/PlaneWave") - serial, because of the shared counters
//   4. store the minimum polynomial order on every tetrahedron
//   5. fill TetExcitIndexArray with the indices of the excitation tetrahedra
// The per-iteration tetra pointer is local to each loop body: a pointer
// declared outside an OpenMP parallel loop is shared between threads, which
// lets one thread call the setter on another thread's tetrahedron.
void FemGrp::AssignTetraFlags(){
    int AbcCount = 0;
    int InterCount = 0;
    int PortCount = 0;

    cout << " " << endl;
    cout << "======================================================" << endl;
    cout << " Total number of TetraHedra " << endl;
    cout << "======================================================" << endl;
    cout << " Total number of TetraHedra := " << tetraCNT << endl;

    // Parallelized by Qi Jian
    #pragma omp parallel for
    for(int i = 0; i < tetraCNT; i++)
    {
        tetra* tet = &(tetARRAY[i]);
        tet->set_TetrahedronFlag();
    }

    // 3e8 / (freq * 1e6) / 10: presumably a tenth of the free-space wavelength
    // with freq given in MHz - TODO confirm the unit of freq.
    const double min_AABB_size = 3e8 / (freq * 1e6) / 10.0;

    // For every tetrahedron, set the neighbor tetrahedra
    #pragma omp parallel for
    for(int i = 0; i < tetraCNT; i++)
    {
        tetra* tet = &(tetARRAY[i]);
        tet->set_NeighborTetra(tetARRAY, ncARRAY, nonConformalCNT, &octree_object, min_AABB_size);
    }

    for(int i = 0; i < tetraCNT; i++)
    {
        tetra* tet = &(tetARRAY[i]);
        tet->set_PolyOrderFlagDebug(PolyFlag);
        // The following code is not thread safe (shared counters).
        if (tet->TetrahedronFlag == 0) InterCount++;
        if (tet->TetrahedronFlag == 1) AbcCount++;
        if (tet->ExcitationFlag == 1) PortCount++;
    }

    cout << " Total number of P" << PolyFlag << " TetraHedra := " << tetraCNT << endl;
    cout << " Total number of Interior TetraHedra := " << InterCount << endl;
    cout << " Total number of AbcCount TetraHedra := " << AbcCount << endl;
    cout << " Total number of Port/PlaneWave TetraHedra := " << PortCount << endl;
    cout << "======================================================" << endl;
    cout << " " << endl;

    // Minimum polynomial order over the mesh; skipped for an empty mesh so
    // tetARRAY[0] is never read when there are no tetrahedra.
    if(tetraCNT > 0){
        int min_poly = tetARRAY[0].get_PolyOrderFlag();
        for(int i = 1; i < tetraCNT; i++){
            if(tetARRAY[i].get_PolyOrderFlag() < min_poly)
                min_poly = tetARRAY[i].get_PolyOrderFlag();
        }
        for(int i = 0; i < tetraCNT; i++)
            tetARRAY[i].set_MinimumPoly(min_poly);
    }

    // Define Excitation tetrahedral
    TetExcitIndexArraySize = PortCount;
    TetExcitIndexArray = (int*)malloc(sizeof(int) * TetExcitIndexArraySize);
    if(TetExcitIndexArray == nullptr && PortCount > 0){
        // allocation failed: do not write through a null pointer
        std::cerr << "ERROR in AssignTetraFlags: could not allocate TetExcitIndexArray" << std::endl;
        TetExcitIndexArraySize = 0;
        return;
    }
    int index = 0;
    for(int i = 0; i < tetraCNT; i++){
        if(tetARRAY[i].ExcitationFlag == 1){
            TetExcitIndexArray[index] = i;
            index++;
        }
    }
}
void FemGrp::makePlaneWaveMesh(){
int i, j;
set<int> meshNodeIds;
// count the number of plane wave faces
int pwFaceNum = 0;
for(i = 0; i < faceCNT; i++){
if(faceARRAY[i]->getbType() == planeWaveType || faceARRAY[i]->getbType() == pmlType)
pwFaceNum++;
}
// set planeWaveMesh_'s faceCnt_ and allocate its faceArray_
planeWaveMesh->setFaceCnt(pwFaceNum);
cout << " pwFaceNum == " << pwFaceNum << endl;
cout << " planeWaveMesh->faceCNT == " << planeWaveMesh->faceCNT << endl;
// populate faceArray_
int index = 0;
for(i = 0; i < faceCNT; i++){
if(faceARRAY[i]->getbType() == planeWaveType || faceARRAY[i]->getbType() == pmlType){
planeWaveMesh->setFace(faceARRAY[i], index);
index++;
// add unique node ids
for(j = 0; j < NumOfNodesPerFace; j++)
meshNodeIds.insert(faceARRAY[i]->getNode(j)->getid());
}
}
// allocate and add node pointers to array keep local mapping
int nodeNum = meshNodeIds.size();
planeWaveMesh->setNodeCnt(nodeNum);
cout << " nodeNum == " << nodeNum << endl;
cout << " planeWaveMesh->nodeCNT == " << planeWaveMesh->nodeCNT << endl;
planeWaveMesh->allocGlobToLocMap();
node** PlaneWaveNodeArray = planeWaveMesh->getNodeArray();
map<int, int>& globToLocMap = planeWaveMesh->getGlobToLocMap();
set<int>::iterator meshNodeIdIter;
int nodeCount = 0;
for(meshNodeIdIter = meshNodeIds.begin(); meshNodeIdIter != meshNodeIds.end(); meshNodeIdIter++){
PlaneWaveNodeArray[nodeCount] = &(ndARRAY[*meshNodeIdIter]);
globToLocMap[ndARRAY[*meshNodeIdIter].getid()] = nodeCount++;
}
// Set the bounding box coordinates for the Planewave mesh
// Useful for PML
/*
planeWaveMesh->computeBoundingBox();
planewave_xmin = planeWaveMesh->getXmin();
planewave_xmax = planeWaveMesh->getXmax();
planewave_ymin = planeWaveMesh->getYmin();
planewave_ymax = planeWaveMesh->getYmax();
planewave_zmin = planeWaveMesh->getZmin();
planewave_zmax = planeWaveMesh->getZmax();
cout << "Planewave bounding box coordinates: " << std::endl;
cout << "xmin: " << planewave_xmin << ", xmax: " << planewave_xmax << std::endl;
cout << "ymin: " << planewave_ymin << ", ymax: " << planewave_ymax << std::endl;
cout << "zmin: " << planewave_zmin << ", zmax: " << planewave_zmax << std::endl;
*/
}
// Single BC_ID
void FemGrp::makeInterSurfMesh(int BC_id){
cout << " Generating InterSurf Mesh with " << BC_id << endl;
InterSurfMesh = new PlaneWaveMesh;
int i, j;
set<int> InterSurfNodeIds;
// count the number of faces
int InterFaceNum = 0;
int* FaceMap = new int[faceCNT];
for(i = 0; i < faceCNT; i++)
FaceMap[i] = -1;
// Find the faces
for(i = 0; i < faceCNT; i++){
if(faceARRAY[i]->getbcPtr()->getbType() == BC_id){ //change
InterFaceNum++;
FaceMap[i] = i;
}
}
if(InterFaceNum == 0)
return;
// set InterSurfMesh_'s faceCnt_ and allocate its faceArray_
cout << " InterFaceNum == " << InterFaceNum << endl;
InterSurfMesh->setFaceCnt(InterFaceNum);
cout << " FaceNum == " << InterFaceNum << endl;
cout << " ->faceCNT == " << InterSurfMesh->faceCNT << endl;
// populate faceArray_
int index = 0;
for(i = 0; i < faceCNT; i++){
if(FaceMap[i] > 0){
InterSurfMesh->setFace(faceARRAY[i], index);
index++;
// add unique node ids
for(j = 0; j < NumOfNodesPerFace; j++)
InterSurfNodeIds.insert(faceARRAY[i]->getNode(j)->getid());
}
}
// allocate and add node pointers to array
// keep local mapping
int nodeNum = InterSurfNodeIds.size();
InterSurfMesh->setNodeCnt(nodeNum);
cout << " nodeNum == " << nodeNum << endl;
cout << " ->nodeCNT == " << InterSurfMesh->nodeCNT << endl;
InterSurfMesh->allocGlobToLocMap();
node** InterSurfNodeArray = InterSurfMesh->getNodeArray();
map<int, int>& globToLocMap = InterSurfMesh->getGlobToLocMap();
set<int>::iterator meshNodeIdIter;
int nodeCount = 0;
for(meshNodeIdIter = InterSurfNodeIds.begin(); meshNodeIdIter != InterSurfNodeIds.end(); meshNodeIdIter++){
InterSurfNodeArray[nodeCount] = &(ndARRAY[*meshNodeIdIter]);
globToLocMap[ndARRAY[*meshNodeIdIter].getid()] = nodeCount++;
}
//write file
char Currents_vtkFile[StrOutput];
sprintf(Currents_vtkFile, "SurfBC_%s_%d", fname, BC_id);
node** locNodeArray = new node*[InterSurfMesh->nodeCNT];
for(i = 0; i < InterSurfMesh->nodeCNT; i++){
node& Node = *(InterSurfMesh->ndArray[i]);
index = InterSurfMesh->globToLocMap_->find(Node.getid())->second;
locNodeArray[index] = new node(index,
Node.getPType(),
Node.getSingOrder(),
Node.getCoord().getx(),
Node.getCoord().gety(),
Node.getCoord().getz());
}
face** locFaceArray = new face*[InterSurfMesh->faceCNT];
for(i = 0; i < InterSurfMesh->faceCNT; i++){
face& Face = *(InterSurfMesh->fcArray[i]);
locFaceArray[i] = new face(Face);
locFaceArray[i]->setFace(locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(0)->getid())->second],
locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(1)->getid())->second],
locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(2)->getid())->second]);
}
//TODO: check why unit is 1. instead of unit
VtkWriter vtkWriter(1.);
//TODO: check why order is 1. instead of order
vtkWriter.writeTriUg(Currents_vtkFile, InterSurfMesh->nodeCNT, locNodeArray, InterSurfMesh->faceCNT, locFaceArray, 1);
for(i = 0; i < InterSurfMesh->nodeCNT; i++)
delete locNodeArray[i];
delete [] locNodeArray;
for(i = 0; i < InterSurfMesh->faceCNT; i++)
delete locFaceArray[i];
delete [] locFaceArray;
}
// Double BC_ID
void FemGrp::makeInterSurfMesh(int BC_id1,int BC_id2){
InterSurfMesh = new PlaneWaveMesh;
int i, j;
set<int> InterSurfNodeIds;
// count the number of faces
int InterFaceNum = 0;
int* FaceMap = new int[faceCNT];
for(i = 0; i < faceCNT; i++) FaceMap[i] = -1;
// Find the faces
for(i = 0; i < faceCNT; i++){
if((faceARRAY[i]->getbcPtr()->getbType() == BC_id1) || (faceARRAY[i]->getbcPtr()->getbType() == BC_id2)){
InterFaceNum++;
FaceMap[i] = i;
}
}
if(InterFaceNum == 0) return;
// set InterSurfMesh_'s faceCnt_ and allocate its faceArray_
cout << "== InterFaceNum == " << InterFaceNum << endl;
InterSurfMesh->setFaceCnt(InterFaceNum);
cout << "== FaceNum == " << InterFaceNum << endl;
cout << "== ->faceCNT == " << InterSurfMesh->faceCNT << endl;
// populate faceArray_
int index = 0;
for(i = 0; i < faceCNT; i++){
if(FaceMap[i] > 0){
InterSurfMesh->setFace(faceARRAY[i], index);
index++;
// add unique node ids
for(j = 0; j < NumOfNodesPerFace; j++)
InterSurfNodeIds.insert(faceARRAY[i]->getNode(j)->getid());
}
}
// allocate and add node pointers to array
// keep local mapping
int nodeNum = InterSurfNodeIds.size();
InterSurfMesh->setNodeCnt(nodeNum);
cout << "== nodeNum == " << nodeNum << endl;
cout << "== ->nodeCNT == " << InterSurfMesh->nodeCNT << endl;
InterSurfMesh->allocGlobToLocMap();
node** InterSurfNodeArray = InterSurfMesh->getNodeArray();
map<int, int>& globToLocMap = InterSurfMesh->getGlobToLocMap();
set<int>::iterator meshNodeIdIter;
int nodeCount = 0;
for(meshNodeIdIter = InterSurfNodeIds.begin(); meshNodeIdIter != InterSurfNodeIds.end(); meshNodeIdIter++){
InterSurfNodeArray[nodeCount] = &(ndARRAY[*meshNodeIdIter]);
globToLocMap[ndARRAY[*meshNodeIdIter].getid()] = nodeCount++;
}
//write file
char Currents_vtkFile[StrOutput];
sprintf(Currents_vtkFile, "SurfBC_%s", fname);
node** locNodeArray = new node*[InterSurfMesh->nodeCNT];
for(i = 0; i < InterSurfMesh->nodeCNT; i++){
node& Node = *(InterSurfMesh->ndArray[i]);
index = InterSurfMesh->globToLocMap_->find(Node.getid())->second;
locNodeArray[index] = new node(index, Node.getPType(), Node.getSingOrder(), Node.getCoord().getx(), Node.getCoord().gety(), Node.getCoord().getz());
}
face** locFaceArray = new face*[InterSurfMesh->faceCNT];
for(i = 0; i < InterSurfMesh->faceCNT; i++){
face& Face = *(InterSurfMesh->fcArray[i]);
locFaceArray[i] = new face(Face);
locFaceArray[i]->setFace(
locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(0)->getid())->second],
locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(1)->getid())->second],
locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(2)->getid())->second]);
}
//TODO: check why unit is 1. instead of unit
VtkWriter vtkWriter(1.);
//TODO: check why order is 1. instead of order
vtkWriter.writeTriUg(Currents_vtkFile, InterSurfMesh->nodeCNT, locNodeArray, InterSurfMesh->faceCNT, locFaceArray, 1);
for(i = 0; i < InterSurfMesh->nodeCNT; i++)
delete locNodeArray[i];
delete [] locNodeArray;
for(i = 0; i < InterSurfMesh->faceCNT; i++)
delete locFaceArray[i];
delete [] locFaceArray;
}
void FemGrp::makeSurfMesh(int BC_id){
cout << "Generating Surf Mesh with " << BC_id << endl;
SurfMesh = new PlaneWaveMesh;
int i, j;
set<int> InterSurfNodeIds;
// count the number of faces
int InterFaceNum = 0;
int* FaceMap = new int[faceCNT];
for(i = 0; i < faceCNT; i++)
FaceMap[i] = -1;
// Find the faces
for(i = 0; i < faceCNT; i++){
if(faceARRAY[i]->getbcPtr()->getbType() == BC_id){ //change
InterFaceNum++;
FaceMap[i] = i;
}
}
if(InterFaceNum == 0)
return;
// set SurfMesh_'s faceCnt_ and allocate its faceArray_
cout << "== InterFaceNum == " << InterFaceNum << endl;
SurfMesh->setFaceCnt(InterFaceNum);
cout << "== FaceNum == " << InterFaceNum << endl;
cout << "== ->faceCNT == " << SurfMesh->faceCNT << endl;
// populate faceArray_
int index = 0;
for(i = 0; i < faceCNT; i++){
if(FaceMap[i] > 0){
SurfMesh->setFace(faceARRAY[i], index);
index++;
// add unique node ids
for(j = 0; j < NumOfNodesPerFace; j++)
InterSurfNodeIds.insert(faceARRAY[i]->getNode(j)->getid());
}
}
// allocate and add node pointers to array
// keep local mapping
int nodeNum = InterSurfNodeIds.size();
SurfMesh->setNodeCnt(nodeNum);
cout << "== nodeNum == " << nodeNum << endl;
cout << "== ->nodeCNT == " << SurfMesh->nodeCNT << endl;
SurfMesh->allocGlobToLocMap();
node** InterSurfNodeArray = SurfMesh->getNodeArray();
map<int, int>& globToLocMap = SurfMesh->getGlobToLocMap();
set<int>::iterator meshNodeIdIter;
int nodeCount = 0;
for(meshNodeIdIter = InterSurfNodeIds.begin(); meshNodeIdIter != InterSurfNodeIds.end(); meshNodeIdIter++){
InterSurfNodeArray[nodeCount] = &(ndARRAY[*meshNodeIdIter]);
globToLocMap[ndARRAY[*meshNodeIdIter].getid()] = nodeCount++;
}
//write file
char Currents_vtkFile[StrOutput];
sprintf(Currents_vtkFile, "SurfBC_%s_%d", fname, BC_id);
node** locNodeArray = new node*[SurfMesh->nodeCNT];
for(i = 0; i < SurfMesh->nodeCNT; i++){
node& Node = *(SurfMesh->ndArray[i]);
index = SurfMesh->globToLocMap_->find(Node.getid())->second;
locNodeArray[index] = new node(index, Node.getPType(), Node.getSingOrder(), Node.getCoord().getx(), Node.getCoord().gety(), Node.getCoord().getz());
}
face** locFaceArray = new face*[SurfMesh->faceCNT];
for(i = 0; i < SurfMesh->faceCNT; i++){
face& Face = *(SurfMesh->fcArray[i]);
locFaceArray[i] = new face(Face);
locFaceArray[i]->setFace(locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(0)->getid())->second],
locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(1)->getid())->second],
locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(2)->getid())->second]);
}
//TODO: check why unit is 1. instead of unit (it may be because the node coordinates are already scaled after readin. So they are true unit of the geometry)
VtkWriter vtkWriter(1.);
vtkWriter.writeTriUg(Currents_vtkFile, SurfMesh->nodeCNT, locNodeArray, SurfMesh->faceCNT, locFaceArray, 1); //The one is because we only work with first order geometry (modify if we want to work with higher order structures)
for(i = 0; i < SurfMesh->nodeCNT; i++)
delete locNodeArray[i];
delete [] locNodeArray;
for(i = 0; i < SurfMesh->faceCNT; i++)
delete locFaceArray[i];
delete [] locFaceArray;
}
// Double BC_ID
void FemGrp::makeSurfMesh(int BC_id1,int BC_id2){
SurfMesh = new PlaneWaveMesh;
int i, j;
set<int> InterSurfNodeIds;
// count the number of faces
int InterFaceNum = 0;
int* FaceMap = new int[faceCNT];
for(i = 0; i < faceCNT; i++)
FaceMap[i] = -1;
// Find the faces
for(i = 0; i < faceCNT; i++){
if((faceARRAY[i]->getbcPtr()->getbType() == BC_id1) || (faceARRAY[i]->getbcPtr()->getbType() == BC_id2)){
InterFaceNum++;
FaceMap[i] = i;
}
}
if(InterFaceNum == 0)
return;
// set SurfMesh_'s faceCnt_ and allocate its faceArray_
cout << "== InterFaceNum == " << InterFaceNum << endl;
SurfMesh->setFaceCnt(InterFaceNum);
cout << "== FaceNum == " << InterFaceNum << endl;
cout << "== ->faceCNT == " << SurfMesh->faceCNT << endl;
// populate faceArray_
int index = 0;
for(i = 0; i < faceCNT; i++){
if(FaceMap[i] > 0){
SurfMesh->setFace(faceARRAY[i], index);
index++;
// add unique node ids
for(j = 0; j < NumOfNodesPerFace; j++)
InterSurfNodeIds.insert(faceARRAY[i]->getNode(j)->getid());
}
}
// allocate and add node pointers to array
// keep local mapping
int nodeNum = InterSurfNodeIds.size();
SurfMesh->setNodeCnt(nodeNum);
cout << "== nodeNum == " << nodeNum << endl;
cout << "== ->nodeCNT == " << SurfMesh->nodeCNT << endl;
SurfMesh->allocGlobToLocMap();
node** InterSurfNodeArray = SurfMesh->getNodeArray();
map<int, int>& globToLocMap = SurfMesh->getGlobToLocMap();
set<int>::iterator meshNodeIdIter;
int nodeCount = 0;
for(meshNodeIdIter = InterSurfNodeIds.begin(); meshNodeIdIter != InterSurfNodeIds.end(); meshNodeIdIter++){
InterSurfNodeArray[nodeCount] = &(ndARRAY[*meshNodeIdIter]);
globToLocMap[ndARRAY[*meshNodeIdIter].getid()] = nodeCount++;
}
//write file
char Currents_vtkFile[StrOutput];
sprintf(Currents_vtkFile, "SurfBC_%s", fname);
node** locNodeArray = new node*[SurfMesh->nodeCNT];
for(i = 0; i < SurfMesh->nodeCNT; i++){
node& Node = *(SurfMesh->ndArray[i]);
index = SurfMesh->globToLocMap_->find(Node.getid())->second;
locNodeArray[index] = new node(index, Node.getPType(), Node.getSingOrder(), Node.getCoord().getx(), Node.getCoord().gety(), Node.getCoord().getz());
}
face** locFaceArray = new face*[SurfMesh->faceCNT];
for(i = 0; i < SurfMesh->faceCNT; i++){
face& Face = *(SurfMesh->fcArray[i]);
locFaceArray[i] = new face(Face);
locFaceArray[i]->setFace(
locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(0)->getid())->second],
locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(1)->getid())->second],
locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(2)->getid())->second]);
}
//TODO: check why unit is 1. instead of unit
VtkWriter vtkWriter(1.);
//TODO: check why order is 1. instead of order
vtkWriter.writeTriUg(Currents_vtkFile, SurfMesh->nodeCNT, locNodeArray, SurfMesh->faceCNT, locFaceArray, 1);
for(i = 0; i < SurfMesh->nodeCNT; i++)
delete locNodeArray[i];
delete [] locNodeArray;
for(i = 0; i < SurfMesh->faceCNT; i++)
delete locFaceArray[i];
delete [] locFaceArray;
}
// Set up the tet mass matrices and also the local inverses
// If non-matrix free is used also precompute and store the update matrices
//
// Loops over all tetraCNT tetrahedra in an OpenMP parallel loop (dynamic
// schedule; tet and i are private per thread) and calls the per-tetra matrix
// routines in a fixed order. The whole loop is bracketed by
// timer_start("CPU Matrices Evaluation") / timer_stop.
//
// With DGTD_USE_CUDA defined:
//   - a tetra is processed only if regularRegionFlag is false or
//     regularReferenceARRAY[i] == i; all other tetrahedra are skipped here
//   - tetrahedra whose PML flag is non-zero additionally get a conductivity
//     profile and the material mass matrices (epA/epB/epC/D/F for E,
//     muA/muB/muC/D/F for H) and use the *_PML face-map setup
//   - UseQuadratureMatrices selects the *_Numeric variants of the routines
// Without DGTD_USE_CUDA every tetra takes the plain (non-Numeric, non-PML) path.
void FemGrp::GetMatrices(){
int i;
tetra *tet;
timer_start("CPU Matrices Evaluation",'u');
// this gets the mass matrices for the local tets only
cout << "tetraCNT = " << tetraCNT << endl;
//std::vector<fp_t> vec_x1, vec_y1, vec_z1;
//std::vector<fp_t> vec_A2x, vec_A2y, vec_A2z;
//fp_t cutoff_freq = freq * 1e6; // Convert MHz to Hz
// tet and i are listed as private so each thread works on its own tetra pointer
#pragma omp parallel for schedule(dynamic) private(tet,i)
for(i = 0; i < tetraCNT; i ++)
{
#if defined(DGTD_USE_CUDA)
//cout << "regularRegionFlag = " << regularRegionFlag << endl;
//cout << "regularReferenceARRAY[" << i << "] = " << regularReferenceARRAY[i] << endl;
// The order of the two tests matters: regularReferenceARRAY is only indexed
// when regularRegionFlag is set, which avoids touching a null pointer.
if(!regularRegionFlag || regularReferenceARRAY[i] == i)
{
tet = &(tetARRAY[i]);
tet->set_flux_GAMMA(factor_Flux);
// any non-zero PML flag selects the PML path
bool isPML = tet->get_PML_Flag();
// PML tetrahedra
// -------------------------------------------------------------------------------
if (isPML)
{
// conductivity profile from the planewave_* min/max bounds
tet->set_Conductivity_Profile_Planar(planewave_xmin, planewave_ymin, planewave_zmin,
planewave_xmax, planewave_ymax, planewave_zmax);
if (UseQuadratureMatrices)
{
// identity tensor passed in place of epsr/mur for the D and F matrices
tensor identity(1.0, 0.0, 0.0,
0.0, 1.0, 0.0,
0.0, 0.0, 1.0);
// --- E field: mass, inverse mass, ABC, material mass matrices ---
tet->Calculate_M_Matrix_E_Numeric();
tet->Calculate_M_Matrix_I_E_Numeric();
tet->Calculate_ABC_E_Numeric();
tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_epA_E, tet->matA, tet->mat->epsr, true,
"A", planewave_xmin, planewave_ymin, planewave_zmin,
planewave_xmax, planewave_ymax, planewave_zmax,
Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // epA
tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_epB_E, tet->matB, tet->mat->epsr, true,
"B", planewave_xmin, planewave_ymin, planewave_zmin,
planewave_xmax, planewave_ymax, planewave_zmax,
Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // epB
tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_epC_E, tet->matC, tet->mat->epsr, true,
"C", planewave_xmin, planewave_ymin, planewave_zmin,
planewave_xmax, planewave_ymax, planewave_zmax,
Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // epC
tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_D_E, tet->matD, identity, true,
"D", planewave_xmin, planewave_ymin, planewave_zmin,
planewave_xmax, planewave_ymax, planewave_zmax,
Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // D
tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_F_E, tet->matF, identity, true,
"F", planewave_xmin, planewave_ymin, planewave_zmin,
planewave_xmax, planewave_ymax, planewave_zmax,
Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // F
// --- E field: Bii/Bij/S/Fii/Fij matrices, then the PML face-map setup ---
tet->Calculate_Bii_Matrix_E_Numeric();
tet->Calculate_Bij_Matrix_E_Numeric();
tet->Calculate_S_Matrix_E_Numeric();
tet->Calculate_Fii_Matrix_E_Numeric();
tet->Calculate_Fij_Matrix_E_Numeric();
tet->SetUp_LocalFaceToTetraMapE_NMF1_PML(tet->Class_dt);
// --- H field: same sequence with mur instead of epsr ---
tet->Calculate_M_Matrix_H_Numeric();
tet->Calculate_M_Matrix_I_H_Numeric();
tet->Calculate_ABC_H_Numeric();
tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_muA_H, tet->matA, tet->mat->mur, false,
"A", planewave_xmin, planewave_ymin, planewave_zmin,
planewave_xmax, planewave_ymax, planewave_zmax,
Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // muA
tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_muB_H, tet->matB, tet->mat->mur, false,
"B", planewave_xmin, planewave_ymin, planewave_zmin,
planewave_xmax, planewave_ymax, planewave_zmax,
Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // muB
tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_muC_H, tet->matC, tet->mat->mur, false,
"C", planewave_xmin, planewave_ymin, planewave_zmin,
planewave_xmax, planewave_ymax, planewave_zmax,
Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // muC
tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_D_H, tet->matD, identity, false,
"D", planewave_xmin, planewave_ymin, planewave_zmin,
planewave_xmax, planewave_ymax, planewave_zmax,
Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // D
tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_F_H, tet->matF, identity, false,
"F", planewave_xmin, planewave_ymin, planewave_zmin,
planewave_xmax, planewave_ymax, planewave_zmax,
Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // F
tet->Calculate_Bii_Matrix_H_Numeric();
tet->Calculate_Bij_Matrix_H_Numeric();
tet->Calculate_S_Matrix_H_Numeric();
tet->Calculate_Fii_Matrix_H_Numeric();
tet->Calculate_Fij_Matrix_H_Numeric();
tet->SetUp_LocalFaceToTetraMapH_NMF1_PML(tet->Class_dt);
}
else
{
// PML tetra, non-Numeric routines: same call sequence as above
// identity tensor passed in place of epsr/mur for the D and F matrices
tensor identity(1.0, 0.0, 0.0,
0.0, 1.0, 0.0,
0.0, 0.0, 1.0);
// --- E field ---
tet->Calculate_M_Matrix_E();
tet->Calculate_M_Matrix_I_E();
tet->Calculate_ABC_E();
tet->Calculate_Mass_Material_Matrix( tet->Mass_epA_E, tet->matA, tet->mat->epsr, true); // epA
tet->Calculate_Mass_Material_Matrix( tet->Mass_epB_E, tet->matB, tet->mat->epsr, true); // epB
tet->Calculate_Mass_Material_Matrix( tet->Mass_epC_E, tet->matC, tet->mat->epsr, true); // epC
tet->Calculate_Mass_Material_Matrix( tet->Mass_D_E, tet->matD, identity, true); // D
tet->Calculate_Mass_Material_Matrix( tet->Mass_F_E, tet->matF, identity, true); // F
tet->Calculate_Bii_Matrix_E();
tet->Calculate_Bij_Matrix_E();
tet->Calculate_S_Matrix_E();
tet->Calculate_Fii_Matrix_E();
tet->Calculate_Fij_Matrix_E();
tet->SetUp_LocalFaceToTetraMapE_NMF1_PML(tet->Class_dt);
// --- H field ---
tet->Calculate_M_Matrix_H();
tet->Calculate_M_Matrix_I_H();
tet->Calculate_ABC_H();
tet->Calculate_Mass_Material_Matrix( tet->Mass_muA_H, tet->matA, tet->mat->mur, false); // muA
tet->Calculate_Mass_Material_Matrix( tet->Mass_muB_H, tet->matB, tet->mat->mur, false); // muB
tet->Calculate_Mass_Material_Matrix( tet->Mass_muC_H, tet->matC, tet->mat->mur, false); // muC
tet->Calculate_Mass_Material_Matrix( tet->Mass_D_H, tet->matD, identity, false); // D
tet->Calculate_Mass_Material_Matrix( tet->Mass_F_H, tet->matF, identity, false); // F
tet->Calculate_Bii_Matrix_H();
tet->Calculate_Bij_Matrix_H();
tet->Calculate_S_Matrix_H();
tet->Calculate_Fii_Matrix_H();
tet->Calculate_Fij_Matrix_H();
tet->SetUp_LocalFaceToTetraMapH_NMF1_PML(tet->Class_dt);
}
}
// Non-PML tetrahedra
// -------------------------------------------------------------------------------
else
{
if (UseQuadratureMatrices)
{
// Numeric routines: mass matrices first, then E and H matrices and face maps
tet->Calculate_M_Matrix_E_Numeric();
tet->Calculate_M_Matrix_H_Numeric();
tet->Calculate_Bii_Matrix_E_Numeric();
tet->Calculate_Bij_Matrix_E_Numeric();
tet->Calculate_S_Matrix_E_Numeric();
tet->Calculate_Fii_Matrix_E_Numeric();
tet->Calculate_Fij_Matrix_E_Numeric();
tet->SetUp_LocalFaceToTetraMapE_NMF1_Numeric(tet->Class_dt);
tet->Calculate_Bii_Matrix_H_Numeric();
tet->Calculate_Bij_Matrix_H_Numeric();
tet->Calculate_S_Matrix_H_Numeric();
tet->Calculate_Fii_Matrix_H_Numeric();
tet->Calculate_Fij_Matrix_H_Numeric();
tet->SetUp_LocalFaceToTetraMapH_NMF1_Numeric(tet->Class_dt);
}
else
{
// non-Numeric routines, same call sequence
tet->Calculate_M_Matrix_E();
tet->Calculate_M_Matrix_H();
tet->Calculate_Bii_Matrix_E();
tet->Calculate_Bij_Matrix_E();
tet->Calculate_S_Matrix_E();
tet->Calculate_Fii_Matrix_E();
tet->Calculate_Fij_Matrix_E();
tet->SetUp_LocalFaceToTetraMapE_NMF1(tet->Class_dt);
tet->Calculate_Bii_Matrix_H();
tet->Calculate_Bij_Matrix_H();
tet->Calculate_S_Matrix_H();
tet->Calculate_Fii_Matrix_H();
tet->Calculate_Fij_Matrix_H();
tet->SetUp_LocalFaceToTetraMapH_NMF1(tet->Class_dt);
}
}
}
#else
// Build without DGTD_USE_CUDA: every tetra takes the plain path
tet = &(tetARRAY[i]);
tet->set_flux_GAMMA(factor_Flux);
tet->Calculate_M_Matrix_E();
tet->Calculate_M_Matrix_H();
// E-field matrices and face map
tet->Calculate_Bii_Matrix_E();
tet->Calculate_Bij_Matrix_E();
tet->Calculate_S_Matrix_E();
tet->Calculate_Fii_Matrix_E();
tet->Calculate_Fij_Matrix_E();
tet->SetUp_LocalFaceToTetraMapE_NMF1(tet->Class_dt);
// H-field matrices and face map
tet->Calculate_Bii_Matrix_H();
tet->Calculate_Bij_Matrix_H();
tet->Calculate_S_Matrix_H();
tet->Calculate_Fii_Matrix_H();
tet->Calculate_Fij_Matrix_H();
tet->SetUp_LocalFaceToTetraMapH_NMF1(tet->Class_dt);
#endif
}
timer_stop('u');
}
/**
 * Publishes the global E/H dimensions, allocates the CPU field vectors
 * (non-CUDA builds only) and lets every face and tetrahedron build its
 * matrix-free data.
 */
void FemGrp::SetUpMatrixVector(){
    DimE = dimE;
    DimH = dimH;

#if !defined(DGTD_USE_CUDA)
    // CPU build: byte sizes and the four leap-frog field vectors.
    // (In CUDA builds the equivalent pinned-host allocations are currently
    //  disabled in the original source, so nothing is allocated here.)
    MemSizeE = DimE * sizeof(fp_t);
    MemSizeH = DimH * sizeof(fp_t);
    en    = new ArrayFP<fp_t>(DimE);
    hn_12 = new ArrayFP<fp_t>(DimH);
    en_1  = new ArrayFP<fp_t>(DimE);
    hn_32 = new ArrayFP<fp_t>(DimH);
#endif

    // Pre-compute the facial matrices required for coupling.
    #pragma omp parallel for schedule(static)
    for (int faceIdx = 0; faceIdx < faceCNT; ++faceIdx) {
        faceARRAY[faceIdx]->SetUpMatrixFree();
    }

    // Per-element set-up, scheduled dynamically.
    #pragma omp parallel for schedule(dynamic)
    for (int tetIdx = 0; tetIdx < tetraCNT; ++tetIdx) {
        tetARRAY[tetIdx].SetUpMatrixFree();
    }
}
void FemGrp::DG_AssignOffsets(){
int i;
int OffsetE = 0;
int OffsetH = 0;
tetra* tet;
for(i = 0; i < tetraCNT; i ++){
tet = &(tetARRAY[i]);
tet->CountDOF_E();
tet->CountDOF_H();
dimE = dimE + tet->LocalEDOF;
dimH = dimH + tet->LocalHDOF;
tet->set_LocalOffsetE(OffsetE);
OffsetE = OffsetE + tet->LocalEDOF;
tet->set_LocalOffsetH(OffsetH);
OffsetH = OffsetH + tet->LocalHDOF;
}
cout << " " << endl;
cout << "=================" << endl;
cout << " Dimensions " << endl;
cout << "=================" << endl;
cout << " dimE = " << dimE << endl;
cout << " dimH = " << dimH << endl;
cout << "=================" << endl;
cout << " " << endl;
}
/**
 * Estimates the stable time step of every tetrahedron, stores it on the
 * element, and records the smallest / largest value in dt_min / dt_max.
 *
 * Progress is reported roughly every 10% of the elements.
 */
void FemGrp::Get_dt_min_max(){
    // Progress stride. Clamped to 1 so that meshes with fewer than 10 elements
    // do not trigger a modulo-by-zero in the progress check below.
    int printSc = tetraCNT / 10;
    if (printSc < 1)
        printSc = 1;

    fp_t V_P;
    fp_t LocalDt;
    fp_t LocaldtMin = 1.0 * 1e6;   // starting upper bound for the minimum search
    fp_t LocaldtMax = 0.0;

    // NOTE: this loop is serial. If it is ever parallelised, use
    // reduction(min:...) / reduction(max:...) rather than atomic writes:
    // a compare followed by an atomic store is not an atomic min/max.
    for(int i = 0; i < tetraCNT; i ++){
        tetra* tet = &(tetARRAY[i]);
        tet->TimeStepEstimate(LocalDt, V_P);
        tet->set_Stability_dt(LocalDt); // May 5 2011

        if(LocalDt < LocaldtMin){
            LocaldtMin = LocalDt;
        }
        if(LocalDt > LocaldtMax){
            LocaldtMax = LocalDt;
        }

        if(i % printSc == 0)
            DEBUG_INFO(" Finished: " + to_string(i / (fp_t)tetraCNT * 100.0) + " %");
    }

    dt_min = LocaldtMin;
    dt_max = LocaldtMax;
}
void FemGrp::LocalTimeSteppingClassPartioning()
{
cout.setf(ios::scientific,ios_base::floatfield);
cout.precision(20);
cout << " " << endl;
cout << "========================================================" << endl;
cout << " LocalTimeSteppingClassPartioning " << endl << flush;
cout << "========================================================" << endl;
//////////////////////////////////////////////////////////////////////////////////////
// In this part we calculate the minimum and maximum time-step, with these //
// values, we calculate the number of classes and the ttime-step of each class as: //
// dt_k = (2.0 * m + 1)^k * dt_min //
// - m = class factor //
// - k = number of the class(starts in 0) //
// - dt_k = timestep of class k //
// - dt_min = minimun timestep //
// we also assign to each tetra the class they belong to //
//////////////////////////////////////////////////////////////////////////////////////
int ClassCnt = 0;
int PMLClassCnt = 0; // For PML
setClassMul(1);// this is actually the m not (2m+1)
fp_t m = getClassMul();
cout << " Class Factor: (2m + 1), m = " << m << " " << endl << flush;
cout << " " << endl;
fp_t LocalDt;
fp_t LocalDt_down;
fp_t LocalDt_up;
tetra *tet;
cout << " Calculating Time steps " << endl;
Get_dt_min_max();
cout << " " << endl;
cout << " Get_dt_min = " << dt_min << endl;
cout << " Get_dt_max = " << dt_max << endl;
cout << " " << endl;
cout.setf(ios::scientific,ios_base::floatfield);
cout.precision(8);
cout << " Starting class partitioning" << endl;
N_class = (int)ceil(log((dt_max / dt_min)) / log(2.0 * m + 1.0));
if(scalbSty == 1 || N_class == 0) //only 1 if DGTD_USE_LTS is NOT defined
N_class = 1;
LocTimeSteps = new double[N_class];
ClassTetraCnt = new int[N_class];
ClassPMLTetraCnt = new int[N_class];
for(int i = 0 ; i < N_class; i++)
{
ClassTetraCnt[i] = 0;
ClassPMLTetraCnt[i] = 0;
}
cout << " " << endl;
cout << " N_class: " << N_class << endl;
if(scalbSty)
TimeStep_dt = dt_min;
numberPML = 0;
for(int i = 0 ; i < N_class; i++)
{
LocalDt_down = pow((2.0 * m + 1.0), i) * dt_min;
LocalDt_up = pow((2.0 * m + 1.0), (i + 1)) * dt_min;
LocTimeSteps[i] = 1.0 * LocalDt_down;
#pragma omp parallel for schedule(dynamic) shared(ClassCnt,PMLClassCnt) private(tet, LocalDt)
for(int j = 0; j < tetraCNT; j ++)
{
tet = &(tetARRAY[j]);
if(scalbSty)
{
tet->set_LTS_Flag(i);
tet->set_Class_dt(1.0 * LocalDt_down);
bool isExcitation = tet->get_ExcitationFlag();
#pragma omp atomic
ClassCnt++;
if (tet->get_PML_Flag() && !isExcitation)
{
#pragma omp atomic
PMLClassCnt++;
}
else
{
// Increment the count of tetrahedra in this class
#pragma omp atomic
ClassCnt++;
}
}
else
{
LocalDt = tet->get_Stability_dt();
//LocalDt = 0.93 * LocalDt;
if(LocalDt_down <= LocalDt && (LocalDt < LocalDt_up || i == N_class - 1))
{
tet->set_LTS_Flag(i);
tet->set_Class_dt(1.0 * LocalDt_down);
bool isExcitation = tet->get_ExcitationFlag();
if (tet->get_PML_Flag() && !isExcitation)
{
#pragma omp atomic
PMLClassCnt++;
}
else
{
// Increment the count of tetrahedra in this class
#pragma omp atomic
ClassCnt++;
}
}
}
}
ClassTetraCnt[i] = ClassCnt;
ClassPMLTetraCnt[i] = PMLClassCnt;
numberPML += PMLClassCnt;
cout << " Number of Tetra in class: " << i << " = " << ClassTetraCnt[i] << endl;
cout << " Number of PML Tetra in class: " << i << " = " << ClassPMLTetraCnt[i] << std::endl;
cout << "-------------------------------------------------------------" << endl;
ClassCnt = 0;
PMLClassCnt = 0;
}
cout << "Total Number of PML Tetras = " << numberPML << endl;
////////////////////////////////////////////////////////////////////////////////////
// In this part we check if there is enough elements in one class to be efficient //
// if not, those elements will be moved to the previous class //
////////////////////////////////////////////////////////////////////////////////////
if(N_class > 1)
{
bool reduceN_class = false;
bool balanced = false;
for(int i = 0; i < N_class - 1; i++)
{
int classN = (N_class - 1) - i;
fp_t number_of_tetra_in_classN = (fp_t)ClassTetraCnt[classN] + (fp_t)ClassPMLTetraCnt[classN];
fp_t relClassCnt = number_of_tetra_in_classN / tetraCNT;
fp_t previousClassDt = pow((2.0 * m + 1.0), classN - 1) * dt_min;
if (relClassCnt < ClassRelMinCNT && number_of_tetra_in_classN < ClassMinCNT)
{
if(i == 0)
{
reduceN_class = true;
}
balanced = true;
ClassTetraCnt[classN - 1] += ClassTetraCnt[classN];
ClassTetraCnt[classN] = 0;
ClassPMLTetraCnt[classN - 1] += ClassPMLTetraCnt[classN];
ClassPMLTetraCnt[classN] = 0;
#pragma omp parallel for schedule(dynamic) private(tet)
for(int j = 0; j < tetraCNT; j ++)
{
tet = &(tetARRAY[j]);
if(tetARRAY[j].get_LTS_Flag() == classN)
{
tet->set_LTS_Flag(classN - 1);
tet->set_Class_dt(1.0 * previousClassDt);
}
}
}
}
if(reduceN_class)
{
N_class -= 1;
}
if(balanced)
{
cout << "=================================" << endl;
cout << "Classes have been balanced\n";
for (int i = 0; i < N_class; i++)
{
cout << " Number of Tetra in class: " << i << " = " << ClassTetraCnt[i] << std::endl;
cout << " Number of PML Tetra in class: " << i << " = " << ClassPMLTetraCnt[i] << std::endl << endl;
}
cout << "=================================" << endl;
}
}
// Check that all the elements are associated with a class
for(int j = 0; j < tetraCNT; j ++)
{
if(tetARRAY[j].get_LTS_Flag() < 0)
cout << " tet " << tetARRAY[j].getcnt() << " has LTS_flag = " << tetARRAY[j].get_LTS_Flag() << " and LTS time step " << tetARRAY[j].get_Class_dt() << endl;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// In this part we order the tetras in the most efficient way for the GPU //
// - 1st: we order by class, from smaller time-step to larger //
// - 2nd: each class is ordered by nonConformal tetras 1st and then conformal ones //
// - 3rd: we order the nonconformal ones as: excitation (ordered by number of exciting faces 1-2-3) - nonExcitation //
// - 4th: we order the conformal ones as: nonRegular - Reg1 - Reg2 - ... //
// //
// *** NOTE: in nonConformal we also include any tetra with a face without neighbor *** //
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// -----------------------------------------------------------------------------------------------------
// Determine cutoff between Normal-regular groups and Regular-PML groups.
// Assumptions:
// - regularGroup == 0 -> Irregular (both non-PML and PML)
// - regularGroup > 0 -> Regular
// - Groups are assigned so that all non-PML regular groups use smaller IDs
// than any PML regular groups (i.e., there exists a clean cutoff).
//
// Outputs:
// regularCNT_Normal : number of regular groups used by non-PML (g in [1 .. cutoff-1])
// regularCNT_PML : number of regular groups used by PML (g in [cutoff .. regularCNT-1])
// -----------------------------------------------------------------------------------------------------
cout << "-----------------------" << endl;
if (regularCNT > 1)
{
regularCNT_Normal = 0;
for(int j = 0; j < tetraCNT; j ++)
{
tet = &(tetARRAY[j]);
int groupID = tet->getRegularGroup();
bool isPML = tet->get_PML_Flag();
if (!isPML)
{
if ((groupID > regularCNT_Normal))
{
regularCNT_Normal = groupID;
}
}
}
regularCNT_PML = regularCNT - regularCNT_Normal - 1;
}
else
{
regularCNT_Normal = 0;
regularCNT_PML = 0;
}
cout << "regularCNT = " << regularCNT << endl;
std::cout << "regularCNT_Normal = " << regularCNT_Normal << "\n";
std::cout << "regularCNT_PML = " << regularCNT_PML << "\n";
int NumGroups = regularCNT + 4 + portCNT;
cout << "NumGroups = " << NumGroups << endl;
// -----------------------
// Populate the TetraIndex
// -----------------------
// ----------------------------------------------------------------- //
// Store the tetrahedra in the ClassTetraIndexAux array //
// ----------------------------------------------------------------- //
list<int>* ClassTetraIndexAux = new list<int>[NumGroups];
ClassTetraIndex = new int*[N_class];
ClassExcitationCount = new int[N_class];
ClassExcitationOffset = new int[N_class];
ClassExcitation_sc_CNT = new int[N_class];
list<int> ClassExcitationPerFaceList[(int)pow(2, NumOfFaces) - 1];
if (portCNT > 0)
{
ClassPortCnt_h = new int[N_class * portCNT];
ClassPortOffset_h = new int[N_class * portCNT];
ClassPortNum_h = new int[N_class * portCNT];
}
for(int i = 0 ; i < N_class; i++)
{
ClassTetraIndex[i] = new int[ClassTetraCnt[i] + ClassPMLTetraCnt[i]];
ClassExcitationCount[i] = 0;
ClassExcitationOffset[i] = 0;
ClassExcitation_sc_CNT[i] = 0;
}
int PML_Case = NumGroups - 1;
int Scattering_Excited_Case = NumGroups - 2;
int Total_Excited_Case = NumGroups - 3;
int NC_Case = NumGroups - 4;
int Port_Case = NumGroups - 4 - portCNT; // First port case
int Conformal_Case = 0;
int index;
int DGface_bc;
int auxCNT = 0;
excitationFaces = 0;
int ClassOffSet = 0;
ClassTetraOffset = new int[N_class];
ClassPMLTetraOffset = new int[N_class];
for(int i = 0 ; i < N_class; i++)
{
for(int j = 0; j < tetraCNT; j ++)
{
tet = &(tetARRAY[j]);
bool isExcite = tet->ExcitationFlag;
bool isPML = tet->get_PML_Flag();
bool isNC = tet->getIsNC();
if(tet->LTS_Flag == i)
{
if(tet->getRegularGroup() > 0)
ClassTetraIndexAux[tet->getRegularGroup()].push_back(tet->getcnt());
else if(!isNC && tet->get_NeighNum() == 4 && !isPML && !isExcite)
ClassTetraIndexAux[Conformal_Case].push_back(tet->getcnt());
else if (isPML)
ClassTetraIndexAux[PML_Case].push_back(tet->getcnt());
else
{
if(isExcite)
{
ClassExcitationCount[i]++;
int face = 0;
for(int k = 0; k < NumOfFaces; k++)
{
if (!tet->fc[k] || !tet->fc[k]->bcPtr) continue; // optional null guard
DGface_bc = tet->fc[k]->bcPtr->getbType();
if(DGface_bc == planeWaveType || DGface_bc == portType || DGface_bc == pmlType)
{
face += (1 << k);
excitationFaces++;
}
}
if (face > 0)
ClassExcitationPerFaceList[face - 1].push_back(tet->getcnt());
}
else
{
ClassTetraIndexAux[NC_Case].push_back(tet->getcnt());
}
}
}
}
// ----------------------------------------------------------------- //
// Excitation //
// ----------------------------------------------------------------- //
ClassExcitationOffset[i] = auxCNT;
auxCNT += ClassExcitationCount[i];
for(int j = (1 << NumOfFaces) - 2; j >= 0; j--)
{
int listIndex = faceExcitationOrder[j] - 1;
int auxSize = ClassExcitationPerFaceList[listIndex].size();
for(int k = 0; k < auxSize; k++)
{
int tet_id = ClassExcitationPerFaceList[listIndex].back();
tet = &(tetARRAY[tet_id]);
if (PlaneWaveBCFlag)
{
if (tet->scattering_region)
ClassTetraIndexAux[Scattering_Excited_Case].push_back(tet_id);
else
ClassTetraIndexAux[Total_Excited_Case].push_front(tet_id);
}
else
{
int port_id = -1;
for (int k=0; k<NumOfFaces; k++)
{
int bc_number = tet->getbc(k);
if (tet->fc[k]->bcPtr->getbType() == portType)
{
int pnum = bcNumToPnum[bc_number];
ClassTetraIndexAux[Port_Case+pnum].push_front(tet_id);
break;
}
}
}
ClassExcitationPerFaceList[listIndex].pop_back();
}
}
// ----------------------------------------------------------------- //
// Store the tetrahedra in the ClassTetraIndex array //
// ----------------------------------------------------------------- //
index = 0;
auto addGroupToIndex = [&](int group) {
int size = ClassTetraIndexAux[group].size();
for (int l = 0; l < size; l++)
{
ClassTetraIndex[i][index++] = ClassTetraIndexAux[group].front();
ClassTetraIndexAux[group].pop_front();
}
};
// -----------------------------------------------------------------------------------------------
// Order: Scattered Field Excited, Total Field Excited, NC, Conformal, Regular, PML, Regular PML
// -----------------------------------------------------------------------------------------------
if (PlaneWaveBCFlag)
{
addGroupToIndex(Scattering_Excited_Case);
ClassExcitation_sc_CNT[i] = index;
addGroupToIndex(Total_Excited_Case);
}
else
{
for(int p = 0; p < portCNT; p++)
{
ClassPortOffset_h[i * portCNT + p] = index;
addGroupToIndex(Port_Case + p);
ClassPortCnt_h[i * portCNT + p] = index - ClassPortOffset_h[i * portCNT + p];
ClassPortNum_h[i * portCNT + p] = p;
}
}
addGroupToIndex(NC_Case);
addGroupToIndex(Conformal_Case);
// Add Regular Tetrahedra
// WE assume that there are only 6 regular tetrehedron that are non-PML
if ( regularCNT > 1)
{
for (int k = 1; k <= regularCNT_Normal; k++)
{
addGroupToIndex(k);
}
}
cout << "Class " << i << " | PML index = " << index << endl;
addGroupToIndex(PML_Case);
// Add PML Regular Tetrahedra
if ( regularCNT > 6)
{
for (int k = regularCNT_Normal; k < regularCNT; k++)
{
addGroupToIndex(k);
}
}
ClassTetraOffset[i] = ClassOffSet;
ClassOffSet += ClassTetraCnt[i] + ClassPMLTetraCnt[i];
ClassPMLTetraOffset[i] = ClassOffSet - ClassPMLTetraCnt[i];
}
for(int i = 0; i < N_class; i++)
{
std::cout << " ClassExcitationCount[" << i << "] = " << ClassExcitationCount[i] << std::endl;
std::cout << " ClassTetraOffset[" << i << "] = " << ClassTetraOffset[i] << std::endl;
std::cout << " ClassPMLTetraOffset[" << i << "] = " << ClassPMLTetraOffset[i] << std::endl;
}
std::cout << "excitationFaces = " << excitationFaces << std::endl;
std::cout << "========================================================" << std::endl;
}
/**
OpenMP Local Time-Stepping for matrix free Recursive
Explained in "Dissipative terms and local time-stepping improvements
in a spatial high order Discontinuous Galerkin scheme
for the time-domain Maxwell’s equations" by E. Montseny
*/
// Recursive E update for LTS class `class_i`: advance this class once, then
// (for every class other than 0) run an E-H-E sequence on the next finer
// class. The step size is always read from LocTimeSteps; `dt_i` is not used.
void FemGrp::ComputeE_MatrixFree(int class_i, fp_t dt_i){
    LeapFrogE(class_i, LocTimeSteps[class_i]);

    if (class_i != 0) {
        const int finer = class_i - 1;
        ComputeE_MatrixFree(finer, LocTimeSteps[finer]);
        ComputeH_MatrixFree(finer, LocTimeSteps[finer]);
        ComputeE_MatrixFree(finer, LocTimeSteps[finer]);
    }
}
// Recursive H update for LTS class `class_i`: advance this class once, then
// (for every class other than 0) run an H-E-H sequence on the next finer
// class. The step size is always read from LocTimeSteps; `dt_i` is not used.
void FemGrp::ComputeH_MatrixFree(int class_i, fp_t dt_i){
    LeapFrogH(class_i, LocTimeSteps[class_i]);

    if (class_i != 0) {
        const int finer = class_i - 1;
        ComputeH_MatrixFree(finer, LocTimeSteps[finer]);
        ComputeE_MatrixFree(finer, LocTimeSteps[finer]);
        ComputeH_MatrixFree(finer, LocTimeSteps[finer]);
    }
}
void FemGrp::LeapFrogE(int class_i, fp_t dt_i){
int i;
int n;
fp_t InitTime = 0.0;
n = LocalExciIndexE[class_i];
#pragma omp parallel for schedule(dynamic) private(i)
for(i = 0; i < ClassTetraCnt[class_i]; i++){
tetra* tet = &(tetARRAY[ClassTetraIndex[class_i][i]]);
tet->LocalFaceToTetraMapE_NMF1(*en_1, *en, *hn_12, dt_i, InitTime + (n + 0.5) * dt_i);
}
#pragma omp parallel for schedule(dynamic) private(i)
for(i = 0 ; i < DimE; i++){
en->setentry(i, en_1->getentry(i));
}
LocalExciIndexE[class_i] = LocalExciIndexE[class_i] + 1;
}
void FemGrp::LeapFrogH(int class_i, fp_t dt_i){
int i;
int n;
fp_t InitTime = 0.0;
n = LocalExciIndexH[class_i];
#pragma omp parallel for schedule(dynamic) private(i)
for(i = 0; i < ClassTetraCnt[class_i]; i ++){
tetra* tet = &(tetARRAY[ClassTetraIndex[class_i][i]]);
tet->LocalFaceToTetraMapH_NMF1(*hn_32, *en_1, *hn_12, dt_i, InitTime + (n + 1.0) * dt_i);
}
#pragma omp parallel for schedule(dynamic) private(i)
for(i = 0 ; i < DimH ; i++){
hn_12->setentry(i, hn_32->getentry(i));
}
LocalExciIndexH[class_i] = LocalExciIndexH[class_i] + 1;
}
/**
Local Time-Stepping Update
*/
/**
 * Drives the CPU local-time-stepping loop.
 *
 * The global step is the step of the coarsest class, LocTimeSteps[N_class - 1].
 * Every `postProcIters` global steps the configured outputs (analytical
 * probes / L2 errors, field probes, global fields, S-parameters) are produced
 * and timing information is logged.
 */
void FemGrp::LTS_TimeUpdateGlobal_MatrixFree(){
    int i, n;
    fp_t InitTime = 0.0;

    // Per-class step counters used by LeapFrogE / LeapFrogH.
    LocalExciIndexE = new int[N_class];
    LocalExciIndexH = new int[N_class];
    for(i = 0; i < N_class; i++){
        LocalExciIndexE[i] = 0;
        LocalExciIndexH[i] = 0;
    }

    NtimeSteps = (int)ceil((FinalTime - InitTime) / LocTimeSteps[N_class -1]);
    cout.setf(ios::scientific,ios_base::floatfield);
    cout.precision(15);
    cout << "Start Time Stepping " << endl;
    cout << "FinalTime = " << FinalTime << endl;
    cout << "TimeStep_dt = " << LocTimeSteps[N_class -1] << endl;
    cout << "tetraCNT = " << tetraCNT << endl;
    cout << "NtimeSteps = " << NtimeSteps << endl;
    timer_start("Time Stepping", ' ');

    fp_t Frequency = freq;
    fp_t dt_nyquist = 1.0 / (2.0 * Frequency * MEGA);
    // NOTE(review): if SamplingRate is an integral type, (1 / SamplingRate) is
    // an integer division and yields 0 for SamplingRate > 1 - confirm its type.
    fp_t dt_sample = (1 / SamplingRate) * dt_nyquist;
    int postProcIters = (int)ceil(dt_sample / LocTimeSteps[N_class - 1]);
    // postProcIters is used as a modulus below; a zero (or negative) value
    // from a degenerate dt_sample would be a division by zero.
    if (postProcIters < 1)
        postProcIters = 1;
    int printScreenIters = 2 * postProcIters;
    Write_TD_Data(postProcIters, NtimeSteps);
    cout << "dt_nyquist = " << dt_nyquist << endl;
    cout << "dt_sample = " << dt_sample << endl;
    cout << "printScreenIters = " << printScreenIters << endl;
    cout << "postProcIters = " << postProcIters << endl;
    cout << "N_class = " << N_class <<endl;

    size_t total_time = 0;
    fp_t current_time = 0;
    // Start one sample early so the first report (n == 0) shows time 0.
    current_time -= (double)dt_sample * 1e9;
    SYSTEM_MEM_USAGE();
    timer_start("Start Time Stepping", ' ');

    for(n = 0 ; n < NtimeSteps ; n++)
    {
        ComputeE_MatrixFree(N_class - 1 , LocTimeSteps[N_class - 1]);
        ComputeH_MatrixFree(N_class - 1 , LocTimeSteps[N_class - 1]);

        if(n % postProcIters == 0)
        {
            if(write_AnalyticalIncidentProbes)
            {
                if(probeCNT > 0)
                {
                    CalculateL2Error(n, LocTimeSteps[N_class - 1], ExcitFlag);
                    CalculateL2ErrorProbes(n, LocTimeSteps[N_class - 1], ExcitFlag);
                }
                writeAnalyticalIncidentPWProbes(n);
            }
            if(write_probes && probeCNT > 0)
            {
                writeFieldProbe(n);
            }
            if(write_fields)
            {
                writeFieldGlobal(n);
            }
            if(portCNT != 0)
            {
                EvaluateSparametersGlobal(n, LocTimeSteps[N_class -1], true);
            }
            cout << "E field norm " << en_1->magnitude() << endl;
            //cout << "H field norm " << hn_32->magnitude() << endl;

            // Close the timer of the block just finished and open the next one.
            total_time += timer_stop(' ');
            timer_start(to_string(postProcIters)+" steps ", ' ');
            DEBUG_INFO("Percentage Completed :" + to_string((double)n / (double)NtimeSteps * 100.0) + "%");
            current_time += (double)dt_sample * 1e9;
            DEBUG_INFO("Current Time : " + to_string(current_time) + "ns");
            DEBUG_INFO("Average iteration time : "+ to_string(((double)total_time / (double)(n + 1.0))) + " sec");
        }
    }
    DEBUG_INFO("Total iteration time: "+ to_string(((double)total_time)) + " sec");
    timer_stop(' ');
}
//*****************
void FemGrp::Write_TD_Data(int tsPerSample, int nTimeSteps){
// fp_t to = 4.0 * pow(10.0, -9.0);
// fp_t tau = 0.8 * pow(10.0, -9.0);
char TD_data[180];
sprintf(TD_data, "./PROBES/%s.TD_Data", fname);
ofstream TD_datafile(TD_data, ios_base::out);
if(!TD_datafile){
cout << "Error in opening file: " << TD_data << "for write"<< endl;
}
TD_datafile << LocTimeSteps[N_class -1] << endl;
TD_datafile << nTimeSteps << endl;
TD_datafile << To << endl;
TD_datafile << Tau << endl;
TD_datafile << tsPerSample << endl;
TD_datafile << probeCNT << endl;
}
// Modifed by qi jian to use octree to store the probes barycentric coordinates
void FemGrp::readPROBE()
{
// Read only the nodes belonging to this subdomain and neighbors
char nname[StrLenShort];
// Read the probe file
sprintf(nname, "%s.probe", fname);
rapidcsv::Document probe_doc(nname);
std::vector<double> x_col = probe_doc.GetColumn<double>("X");
std::vector<double> y_col = probe_doc.GetColumn<double>("Y");
std::vector<double> z_col = probe_doc.GetColumn<double>("Z");
// Check that all the columns have the same size
assert(x_col.size() == y_col.size());
assert(y_col.size() == z_col.size());
assert(z_col.size() == x_col.size());
probeCNT = x_col.size();
if(padeCNT > probeCNT)
{
padeCNT = probeCNT;
cout << "Pade Number Of Elements REDUCED to " << probeCNT << endl;
}
probes_bary.resize(probeCNT);
std::cout << "Compute the Barycentric coordinates of the Probes" << std::endl;
const double tol = 1e-8;
//#pragma omp parallel for schedule(dynamic)
for (int node_id = 0; node_id < probeCNT; ++node_id)
{
double probe_xyz[3] = {x_col[node_id] * unit, y_col[node_id] * unit, z_col[node_id] * unit};
std::vector<std::pair<int, std::array<double, 4>>> found_tets;
bool success = octree_object.findTetraInOctree(probe_xyz, found_tets, tol);
if (success)
{
probes_bary[node_id].first = static_cast<int>(found_tets.size());
probes_bary[node_id].second = found_tets;
}
else
{
probes_bary[node_id].first = -1;
}
}
// Report and verify
bool error_flag = false;
for (int i = 0; i < probeCNT; ++i)
{
if (probes_bary[i].first < 0)
{
std::cerr << "Node " << i << " not found in simulation domain" << std::endl;
double probe_xyz[3] = {x_col[i] * unit, y_col[i] * unit, z_col[i] * unit};
std::cerr << probe_xyz[0] << " " << probe_xyz[1] << " " << probe_xyz[2] << std::endl;
error_flag = true;
}
}
if (error_flag)
{
std::cerr << "Error: Some nodes were not found in the simulation domain. Exiting." << std::endl;
std::exit(EXIT_FAILURE);
}
}
// Dot product of two 3-component vectors.
auto v3_dot (const fp_t_ts* a, const fp_t_ts* b)
{
    const auto xx = a[0]*b[0];
    const auto yy = a[1]*b[1];
    const auto zz = a[2]*b[2];
    return xx + yy + zz;
}
// Euclidean length of a 3-component vector.
auto v3_norm(const fp_t_ts* a)
{
    const auto squaredLength = v3_dot(a,a);
    return static_cast<fp_t_ts>(std::sqrt(squaredLength));
}
// Writes a / |a| into out; writes the zero vector when |a| is not positive.
auto v3_normed(fp_t_ts* out, const fp_t_ts* a)
{
    const fp_t_ts len = v3_norm(a);
    if (!(len > 0)) {
        out[0] = out[1] = out[2] = 0;
        return;
    }
    for (int c = 0; c < 3; ++c) {
        out[c] = a[c]/len;
    }
}
// Cross product out = x × y, components written in order 0, 1, 2.
auto v3_cross(fp_t_ts* out, const fp_t_ts* x, const fp_t_ts* y)
{
    for (int c = 0; c < 3; ++c) {
        const int n1 = (c + 1) % 3;   // next index, cyclically
        const int n2 = (c + 2) % 3;   // index after that
        out[c] = x[n1]*y[n2]-x[n2]*y[n1];
    }
}
// In-place component-wise addition: a += b.
auto v3_add(fp_t_ts* a, const fp_t_ts* b)
{
    for (int c = 0; c < 3; ++c) {
        a[c] += b[c];
    }
}
// Component-wise difference: out = a - b.
auto v3_sub(fp_t_ts* out, const fp_t_ts* a, const fp_t_ts* b)
{
    for (int c = 0; c < 3; ++c) {
        out[c] = a[c]-b[c];
    }
}
void FemGrp::prepPortPROBE()
{
const int prec = std::numeric_limits<fp_t_ts>::max_digits10; // round-trip safe
std::cout << std::scientific << std::setprecision(prec);
// Perform this for each port
int port_index_count = 0;
for (int p = 0; p < portCNT; p++)
{
// Port type
int port_type = portExcitations[p].PortFlag;
PortProbeOffset_h[p] = port_index_count;
// Rectangular ports
if (port_type == 1 || port_type == 3)
{
// TEM Rectangular Port
// ----------------------------------------------------------------------------------------------------------
// (1) Collect faces of this port, compute port centroid
fp_t_ts Cport[3] = {0,0,0};
fp_t_ts face_count = 0;
for (int f = 0; f < excitationFaces; ++f)
{
const int portID = PortFacePidx_h[f];
if (portID == p)
{
fp_t_ts local_port_coord[3] = {0,0,0};
//cout << "portID = " << portID << endl;
port_index_count++;
const int base = NumOfUnitaryVectors * (NumOfNodesPerFace * f);
fp_t_ts P0[3] = {nd_coords_face_h[base + 0],nd_coords_face_h[base + 1],nd_coords_face_h[base + 2]};
fp_t_ts P1[3] = {nd_coords_face_h[base + NumOfUnitaryVectors*1],nd_coords_face_h[base + NumOfUnitaryVectors*1+1],nd_coords_face_h[base + NumOfUnitaryVectors*1+2]};
fp_t_ts P2[3] = {nd_coords_face_h[base + NumOfUnitaryVectors*2],nd_coords_face_h[base + NumOfUnitaryVectors*2+1],nd_coords_face_h[base + NumOfUnitaryVectors*2+2]};
//cout << "P0 = " << P0[0] << " , " << P0[1] << " , " << P0[2] << endl;
//cout << "P1 = " << P1[0] << " , " << P1[1] << " , " << P1[2] << endl;
//cout << "P2 = " << P2[0] << " , " << P2[1] << " , " << P2[2] << endl;
face_count += 1.0;
local_port_coord[0] = (P0[0] + P1[0] + P2[0]) / 3.0;
local_port_coord[1] = (P0[1] + P1[1] + P2[1]) / 3.0;
local_port_coord[2] = (P0[2] + P1[2] + P2[2]) / 3.0;
Cport[0] += local_port_coord[0];
Cport[1] += local_port_coord[1];
Cport[2] += local_port_coord[2];
}
}
Cport[0] /= face_count;
Cport[1] /= face_count;
Cport[2] /= face_count;
cout << "Port " << p << " | Centroid = " << Cport[0] << " , " << Cport[1] << " , " << Cport[2] << endl;
// ----------------------------------------------------------------------------------------------------------
// (2) Port centroid of all the centroids of the faces in each port
fp_t_ts vpath[3] = {portExcitations[p].vpath[0],portExcitations[p].vpath[1],portExcitations[p].vpath[2]};
fp_t_ts L_pos = 0.0;
for (int f = 0; f < excitationFaces; ++f)
{
const int portID = PortFacePidx_h[f];
if (portID == p)
{
const int base = NumOfUnitaryVectors * (NumOfNodesPerFace * f);
fp_t_ts P0[3] = {nd_coords_face_h[base + 0],nd_coords_face_h[base + 1],nd_coords_face_h[base + 2]};
fp_t_ts P1[3] = {nd_coords_face_h[base + NumOfUnitaryVectors*1],nd_coords_face_h[base + NumOfUnitaryVectors*1+1],nd_coords_face_h[base + NumOfUnitaryVectors*1+2]};
fp_t_ts P2[3] = {nd_coords_face_h[base + NumOfUnitaryVectors*2],nd_coords_face_h[base + NumOfUnitaryVectors*2+1],nd_coords_face_h[base + NumOfUnitaryVectors*2+2]};
fp_t_ts d[3];
d[0] = P0[0] - Cport[0];
d[1] = P0[1] - Cport[1];
d[2] = P0[2] - Cport[2];
//cout << d[0] << " , " << d[1] << " , " << d[2] << endl;
//cout << P0[0] << " , " << P0[1] << " , " << P0[2] << endl;
//cout << Cport[0] << " , " << Cport[1] << " , " << Cport[2] << endl;
fp_t_ts dpar = v3_dot(d, vpath);
dpar = abs(dpar);
if (dpar > L_pos) L_pos = dpar;
//cout << dpar << endl;
d[0] = P1[0] - Cport[0];
d[1] = P1[1] - Cport[1];
d[2] = P1[2] - Cport[2];
//cout << d[0] << " , " << d[1] << " , " << d[2] << endl;
//cout << P1[0] << " , " << P1[1] << " , " << P1[2] << endl;
//cout << Cport[0] << " , " << Cport[1] << " , " << Cport[2] << endl;
dpar = v3_dot(d, vpath);
dpar = abs(dpar);
if (dpar > L_pos) L_pos = dpar;
//cout << dpar << endl;
d[0] = P2[0] - Cport[0];
d[1] = P2[1] - Cport[1];
d[2] = P2[2] - Cport[2];
//cout << d[0] << " , " << d[1] << " , " << d[2] << endl;
//cout << P2[0] << " , " << P2[1] << " , " << P2[2] << endl;
//cout << Cport[0] << " , " << Cport[1] << " , " << Cport[2] << endl;
dpar = v3_dot(d, vpath);
dpar = abs(dpar);
if (dpar > L_pos) L_pos = dpar;
//cout << dpar << endl;
}
}
if (!(L_pos > (fp_t_ts)0)) L_pos = (fp_t_ts)1e-6; // avoid zero length
// Create some slack to avoid sampling at the edges
L_pos *= 0.98;
cout << "L_pos = " << L_pos << endl;
cout << "VPATH = " << vpath[0] << " , " << vpath[1] << " , " << vpath[2] << endl;
// ----------------------------------------------------------------------------------------------------------
// (3)) Build probe points for this port line and write SAME line to each face of the port
int numProbes = 3;
for (int q = 0; q < numProbes; ++q)
{
const fp_t_ts t = (numProbes==1) ? (fp_t_ts)0.5 : (fp_t_ts)q/(fp_t_ts)(numProbes-1);
PortProbes_h[p*3*numProbes + 3*q+0] = Cport[0] + t*L_pos*vpath[0];
PortProbes_h[p*3*numProbes + 3*q+1] = Cport[1] + t*L_pos*vpath[1];
PortProbes_h[p*3*numProbes +3*q+2] = Cport[2] + t*L_pos*vpath[2];
}
}
else if(port_type == 2)
{
// 0) Populate the offsets
for (int f = 0; f < excitationFaces; ++f)
{
if (PortFacePidx_h[f] != p+1) continue;
port_index_count++;
}
// ----------------------------------------------------------------------------------------------------------
// 1) Coax geometry
const fp_t_ts* r0 = portExcitations[p].r0_port; // center
const fp_t_ts* r1 = portExcitations[p].r1_port; // on inner circle
const fp_t_ts* r2 = portExcitations[p].r2_port; // on outer circle
// Vectors from center to inner/outer points
fp_t_ts a_vec[3], b_vec[3];
v3_sub(a_vec, r1, r0);
v3_sub(b_vec, r2, r0);
fp_t_ts a = v3_norm(a_vec); // inner radius
fp_t_ts b = v3_norm(b_vec); // outer radius
// Radial unit r̂ (prefer toward r2)
fp_t_ts r_hat[3] = {0,0,0};
v3_sub(r_hat, b_vec, a_vec);
fp_t_ts L_pos = v3_norm(r_hat);
v3_normed(r_hat, b_vec);
// Create some slack to avoid sampling at the edges
L_pos *= 0.95;
cout << "L_pos = " << L_pos << endl;
// ----------------------------------------------------------------------------------------------------------
// (2) Build probe points for this port line and write SAME line to each face of the port
int numProbes = 3;
for (int q = 0; q < numProbes; ++q)
{
const fp_t_ts t = (numProbes==1) ? (fp_t_ts)0.5 : (fp_t_ts)q/(fp_t_ts)(numProbes-1);
PortProbes_h[p*3*numProbes + 3*q+0] = r_hat[0]*0.02 + a_vec[0] + t*L_pos*r_hat[0];
PortProbes_h[p*3*numProbes + 3*q+1] = r_hat[1]*0.02 + a_vec[1] + t*L_pos*r_hat[1];
PortProbes_h[p*3*numProbes + 3*q+2] = r_hat[2]*0.02 + a_vec[2] + t*L_pos*r_hat[2];
}
}
if (p == 0)
{
PortProbeCount_h[p] = port_index_count;
}
else
{
PortProbeCount_h[p] = port_index_count - PortProbeOffset_h[p];
}
cout << "PortProbeCount_h[" << p << "] = " << PortProbeCount_h[p] << endl;
cout << "PortProbeOffset_h[" << p << "] = " << PortProbeOffset_h[p] << endl;
cout << "---------------------------------------------------------------" << endl;
}
int numProbes = 3;
for (int p = 0; p < portCNT; p++)
{
cout << "Probes for port " << p << endl;
for (int q = 0; q < numProbes; ++q)
{
cout << PortProbes_h[p*3*numProbes + 3*q+0] << " , " << PortProbes_h[p*3*numProbes + 3*q+1] << " , " << PortProbes_h[p*3*numProbes + 3*q+2] << endl;
}
}
cout << "============================================" << endl;
const double tol = 1e-3;
std::cout << "Compute the Barycentric coordinates of Probes on Ports" << std::endl;
port_bary.clear();
port_bary.resize(excitationFaces);
int not_found = 0;
int done = 0;
long long total_found_tets = 0; // sum of found_tets.size() over successes
int success_faces = 0; // number of faces with success==true
// Perform this for each port
for (int p = 0; p < portCNT; p++)
{
cout << "For PORT " << p << endl;
int offset = PortProbeOffset_h[p];
int count = PortProbeCount_h[p];
for (int q = 0; q < numProbes; ++q)
{
int index = p*3*numProbes + 3*q;
double xyz[3] = { (double)PortProbes_h[index],
(double)PortProbes_h[index+1],
(double)PortProbes_h[index+2] };
std::vector<std::pair<int, std::array<double,4>>> found_tets;
bool success = octree_object.findTetraInOctree(xyz, found_tets, tol);
if (success)
{
port_bary[index].first = static_cast<int>(found_tets.size());
port_bary[index].second = std::move(found_tets);
++done;
// [NEW] accumulate for average
total_found_tets += port_bary[index].first;
++success_faces;
}
else
{
port_bary[index].first = -1;
++not_found;
// Debug print (can be silenced)
std::cerr << "Probe NOT found at " << xyz[0] << " " << xyz[1] << " " << xyz[2] << "\n";
}
}
}
std::cout << "Located " << done << " | " << not_found << " not found.\n";
if (not_found == 0 && success_faces > 0) {
const double avg = static_cast<double>(total_found_tets) / static_cast<double>(success_faces);
std::cout << "Average owning tets per probe = " << avg
<< " (over " << success_faces << " faces)\n";
}
// Hard error if any were not found (match readPROBE behavior if you prefer)
if (not_found > 0)
{
std::cerr << "Error: Some probes were not found in the domain. Exiting.\n";
std::exit(EXIT_FAILURE);
}
}
// TODO!!!
/*
// - excitationFaces (flattened exc. faces count)
// - PortFacePidx_h (int[excitationFaces], -1 for non-port faces)
// - PortFaceCentroid_h (fp_t_ts[excitationFaces*3], centroid coords per face)
*/
// Uses TetID_excitation_h (owner tet id) to compute barycentrics of each
// port-face centroid inside its owning tetra. No octree/hydra traversal.
//
// Inputs assumed ready:
// - excitationFaces
// - PortFacePidx_h : int[excitationFaces], -1 if NOT a port face
// - PortProbes_h : fp_t_ts[3*excitationFaces] (cx,cy,cz per face)
// - TetID_excitation_h : int[excitationFaces] (owner tetra index 0..tetraCNT-1)
// - FaceID_excitation_h : int[excitationFaces] (optional, not strictly needed here)
//
// Output:
// - portFaceCentroid_bary[f].first = 1 on success, -1 if non-port or error
// - portFaceCentroid_bary[f].second = { { tetId, {l0,l1,l2,l3} } } (exactly one entry)
/*
void FemGrp::prepPortFacePROBE()
{
if (portCNT <= 0 || !PortFacePidx_h || !PortProbes_h || !TetID_excitation_h)
{
std::cerr << "[prepPortFacePROBE] Missing inputs or no ports.\n";
return;
}
auto det3 = [](const double x[3], const double y[3], const double z[3])
{
return x[0]*(y[1]*z[2]-y[2]*z[1])
- x[1]*(y[0]*z[2]-y[2]*z[0])
+ x[2]*(y[0]*z[1]-y[1]*z[0]);
};
std::cout << "Compute barycentric coords of port-face centroids (using TetID_excitation_h)\n";
portFaceCentroid_bary.clear();
portFaceCentroid_bary.resize(excitationFaces);
int done = 0, errors = 0;
for (int f = 0; f < excitationFaces; ++f)
{
// Skip non-port faces
if (PortFacePidx_h[f] < 0)
{
portFaceCentroid_bary[f].first = -1;
continue;
}
// Owner tetra index from your pre-filled array
const int tId = TetID_excitation_h[f];
if (tId < 0 || tId >= tetraCNT)
{
std::cerr << "[PortCentroid] Invalid owner tId=" << tId << " for excitation face f=" << f << "\n";
portFaceCentroid_bary[f].first = -1;
++errors;
continue;
}
const tetra& T = tetARRAY[tId];
// Tetra vertices
double v[4][3];
for (int i = 0; i < 4; ++i)
{
v[i][0] = T.nd[i]->getCoord().getx();
v[i][1] = T.nd[i]->getCoord().gety();
v[i][2] = T.nd[i]->getCoord().getz();
}
// Face centroid (cx,cy,cz)
const fp_t_ts* C = &PortFaceCentroid_h[3 * f];
const double P[3] = { (double)C[0], (double)C[1], (double)C[2] };
// Barycentric via Cramer's rule
double a[3] = { v[0][0]-v[3][0], v[0][1]-v[3][1], v[0][2]-v[3][2] };
double b[3] = { v[1][0]-v[3][0], v[1][1]-v[3][1], v[1][2]-v[3][2] };
double c[3] = { v[2][0]-v[3][0], v[2][1]-v[3][1], v[2][2]-v[3][2] };
double r[3] = { P[0]-v[3][0], P[1]-v[3][1], P[2]-v[3][2] };
const double D = det3(a,b,c);
if (std::abs(D) == 0.0)
{
std::cerr << "[PortCentroid] Degenerate tetra (D=0) at tId=" << tId << " for f=" << f << "\n";
portFaceCentroid_bary[f].first = -1;
++errors;
continue;
}
double l0 = det3(r,b,c) / D;
double l1 = det3(a,r,c) / D;
double l2 = det3(a,b,r) / D;
double l3 = 1.0 - (l0 + l1 + l2);
// Gentle renormalization (handles tiny FP drift)
double sumL = l0 + l1 + l2 + l3;
if (std::abs(sumL - 1.0) > 1e-10)
{
l3 = 1.0 - (l0 + l1 + l2);
}
// Store exactly one (tet, lambdas)
std::vector<std::pair<int, std::array<double,4>>> vec;
vec.emplace_back(tId, std::array<double,4>{l0,l1,l2,l3});
portFaceCentroid_bary[f].first = 1;
portFaceCentroid_bary[f].second = std::move(vec);
++done;
//cout << l0 << " " << l1 << " " << l2 << " " << l3 << "\n";
}
std::cout << "[prepPortFaceCentroidPROBE] Completed: " << done
<< " faces; errors=" << errors << ".\n";
if (errors > 0) {
std::cerr << "Error: Some port-face centroids could not be assigned.\n";
std::exit(EXIT_FAILURE);
}
}
*/
// Read "<fname>.regular" and assign every tetrahedron to a regular group.
//
// File layout as consumed here: one integer (number of regions) followed by
// one region id per tetrahedron, in tetARRAY order.
//
// Results written:
//   - regularCNT            : number of regions read from the file
//   - regularTetraCNT       : number of tetrahedra whose region id is non-zero
//   - regionARRAY[r]        : index of the first tetrahedron seen with region r
//                             (-1 if none)
//   - regularReferenceARRAY : for each tetrahedron, itself when its region is 0
//                             or when it is the first of its region, otherwise
//                             the first tetrahedron of the same region
//
// Terminates the program if the file cannot be opened, if a region id cannot
// be read, or if a region id is outside [0, regularCNT].
void FemGrp::readREGULAR(){
    // writeFieldGlobal(1);
    char tname[StrLenShort];
    // Bounded formatting: fname has unknown length, tname is a fixed buffer.
    snprintf(tname, sizeof(tname), "%s.regular", fname);
    ifstream regularAreaFile(tname, ios::in);
    if(!regularAreaFile){
        cout << "File " << tname << " does NOT exist " << endl;
        exit(1);
    }
    int numOfRegions = 0;
    int region = 0;
    regularAreaFile >> numOfRegions;
    regularTetraCNT = 0;
    // Only one domain exists
    regularCNT = numOfRegions;
    if(regularCNT < 1)
        return;

    regularReferenceARRAY = new int[tetraCNT];
    // regionARRAY is indexed directly by the region id. Id 0 is handled
    // separately below, so ids may run up to regularCNT inclusive; allocate one
    // extra slot so that the largest id stays inside the array.
    regionARRAY = new int[regularCNT + 1];
    for(int i = 0; i <= regularCNT; i++)
        regionARRAY[i] = -1;

    for(int i = 0; i < tetraCNT; i++){
        tetra* tet = &(tetARRAY[i]);
        if(!(regularAreaFile >> region)){
            cout << "File " << tname << " ended before a region was read for tetra " << i << endl;
            exit(1);
        }
        if(region < 0 || region > regularCNT){
            cout << "File " << tname << " has invalid region " << region << " for tetra " << i << endl;
            exit(1);
        }
        tet->setRegularGroup(region);
        if(region == 0){
            // Not part of any regular group: the tetrahedron references itself.
            regularReferenceARRAY[i] = i;
            continue;
        }
        regularTetraCNT++;
        if(regionARRAY[region] == -1){
            // First tetrahedron of this region becomes the region's reference.
            regionARRAY[region] = i;
            regularReferenceARRAY[i] = i;
        }else{
            regularReferenceARRAY[i] = regionARRAY[region];
        }
        // cout << "i = " << i << " reference = " << regularReferenceARRAY[i] << " region = " << region << endl;
    }
}
void FemGrp::initializeMaxMinPoints(){
maxPoint.setvtr(std::numeric_limits<fp_t>::min(), std::numeric_limits<fp_t>::min(), std::numeric_limits<fp_t>::min());
minPoint.setvtr(std::numeric_limits<fp_t>::max(), std::numeric_limits<fp_t>::max(), std::numeric_limits<fp_t>::max());
}
// Grow the tracked bounding box so that it contains the point (x, y, z).
// Each component of maxPoint / minPoint is replaced only when the new
// coordinate is strictly larger / strictly smaller than the stored one.
void FemGrp::setMaxMinPoints(fp_t x, fp_t y, fp_t z){
    // Upper corner
    auto upperX = maxPoint.getx();
    auto upperY = maxPoint.gety();
    auto upperZ = maxPoint.getz();
    if(x > upperX) upperX = x;
    if(y > upperY) upperY = y;
    if(z > upperZ) upperZ = z;
    maxPoint.setvtr(upperX, upperY, upperZ);

    // Lower corner
    auto lowerX = minPoint.getx();
    auto lowerY = minPoint.gety();
    auto lowerZ = minPoint.getz();
    if(x < lowerX) lowerX = x;
    if(y < lowerY) lowerY = y;
    if(z < lowerZ) lowerZ = z;
    minPoint.setvtr(lowerX, lowerY, lowerZ);
}
// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //
// 0000000000000000000000000000000000000 Post-processing 0000000000000000000000000000000000000000 //
// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //
// Modified by qi jian to write field at probes (CPU VERSION)
void FemGrp::writeFieldProbe(int timeStep)
{
int i, j;
fp_t vol;
fp_t zeta[4];
vtr lvtr[3];
vtr avtr[4];
int tetraMAP_aux[TetPolyOrderDim[getPolyFlag()]];
#if defined(DGTD_USE_CUDA)
fp_t_ts E_coeff[TetPolyOrderDim[getPolyFlag()]];
fp_t_ts H_coeff[TetPolyOrderDim[getPolyFlag()]];
#else
fp_t E_coeff[TetPolyOrderDim[getPolyFlag()]];
fp_t H_coeff[TetPolyOrderDim[getPolyFlag()]];
#endif
vtr eField;
vtr hField;
vtr eField_all;
vtr hField_all;
char csvFileName[StrOutput];
std::ofstream csvFile;
if(padeCNT == 0 || writeWhilePade)
{
sprintf(csvFileName, "Probes_%s_%04d.csv", fname, timeStep);
csvFile.open(csvFileName);
csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";
}
const int num_nodes = probeCNT;
// Calculate Total Fields at the points
for(i = 0; i < num_nodes; i++)
{
int number_of_associated_tets = probes_bary.at(i).first;
eField.reset();
hField.reset();
std::vector<std::pair<int, std::array<double, 4>>> found_tets = probes_bary.at(i).second;
eField_all.reset();
hField_all.reset();
for (int t = 0; t < number_of_associated_tets; t++)
{
int tet_id = found_tets.at(t).first;
array<double,4> tri_bary_coord = found_tets.at(t).second;
tetra& tet = tetARRAY[tet_id];
tet.geometry(lvtr, avtr, &vol);
avtr[3].reset();
avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);
eField.reset();
hField.reset();
zeta[0] = static_cast<fp_t>(tri_bary_coord[0]);
zeta[1] = static_cast<fp_t>(tri_bary_coord[1]);
zeta[2] = static_cast<fp_t>(tri_bary_coord[2]);
zeta[3] = static_cast<fp_t>(tri_bary_coord[3]);
// Calculate E field
tet.Local_DG_mapE(tetraMAP_aux, tet.LocalOffsetE);
for(j = 0 ; j < TetPolyOrderDim[getPolyFlag()] ; j++)
{
if(tetraMAP_aux[j] < 0)
E_coeff[j] = 0.0;
else
#if defined(DGTD_USE_CUDA)
E_coeff[j] = En1_h[tetraMAP_aux[j]];
#else
E_coeff[j] = en_1->getentry(tetraMAP_aux[j]);
#endif
}
// Calculate H field
tet.Local_DG_mapH(tetraMAP_aux, tet.LocalOffsetH);
for(j = 0 ; j < TetPolyOrderDim[getPolyFlag()] ; j++){
if(tetraMAP_aux[j] < 0)
H_coeff[j] = 0.0;
else
#if defined(DGTD_USE_CUDA)
H_coeff[j] = Hn32_h[tetraMAP_aux[j]];
#else
H_coeff[j] = hn_32->getentry(tetraMAP_aux[j]);
#endif
}
eField = CalcEfield(E_coeff, avtr, vol, zeta, PolyFlag);
hField = CalcEfield(H_coeff, avtr, vol, zeta, PolyFlag);
eField_all = eField_all + eField;
hField_all = hField_all + hField;
}
eField_all = eField_all / ((fp_t) number_of_associated_tets);
hField_all = hField_all / ((fp_t) number_of_associated_tets);
if(usePade){ // && i < padeCNT
int row = ((int)(timeStep / tsPerSampling)) * NumOfFieldComponents * probeCNT;
int column = i * NumOfFieldComponents;
fieldProbes[row + column + 0] = eField_all.getx();
fieldProbes[row + column + 1] = eField_all.gety();
fieldProbes[row + column + 2] = eField_all.getz();
fieldProbes[row + column + 3] = hField_all.getx();
fieldProbes[row + column + 4] = hField_all.gety();
fieldProbes[row + column + 5] = hField_all.getz();
}
if(padeCNT == 0 || writeWhilePade)
{
const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};
csvFile << std::setprecision(max_precision) << eField_all.getx() << "," << eField_all.gety() << "," << eField_all.getz() << "," << hField_all.getx() << "," << hField_all.gety() << "," << hField_all.getz() << "\n";
}
}
if(padeCNT == 0 || writeWhilePade)
{
usleep(100);
csvFile.close();
}
}
void FemGrp::writeFieldProbeAfterPade(int tsSize)
{
const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};
#pragma omp parallel for
for(int i = 0; i < (int)ceil((1.0 * NtimeSteps) / tsPerSampling); i++){
char csvFileName[StrOutput];
std::ofstream csvFile;
sprintf(csvFileName, "./PROBES/Probes_%s_%04d.csv", fname, i * tsSize);
csvFile.open(csvFileName);
csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";
for(int probe = 0; probe < probeCNT; probe++)
{
int column = probe * NumOfFieldComponents;
int row = i * NumOfFieldComponents * probeCNT;
for(int j = 0; j < NumOfFieldComponents; j++)
{
csvFile << std::setprecision(max_precision) << fieldProbes[row + column + j];
if(j == NumOfFieldComponents - 1)
csvFile << "\n";
else
csvFile << ",";
}
}
usleep(100);
csvFile.close();
}
}
void FemGrp::writeFieldGlobal(int timeStep){
int i, j;
fp_t vol;
fp_t zeta[4];
vtr lvtr[3];
vtr avtr[4];
vtr coord[4];
vtr eLocal[4];
vtr hLocal[4];
int* tetraMAP_aux;
int* MapE_Pe;
#if defined(DGTD_USE_CUDA)
fp_t_ts* E_coeff;
fp_t_ts* H_coeff;
#else
fp_t* E_coeff;
fp_t* H_coeff;
#endif
vtr* eField = new vtr[nodeCNT];
vtr* hField = new vtr[nodeCNT];
int* count = new int[nodeCNT];
memset(count, 0, nodeCNT * sizeof(int));
// only initialize the memory for the first solution
if(regE.TetraReg == 0)
regE.initial(tetraCNT);
if(regH.TetraReg == 0)
regH.initial(tetraCNT);
int* polyOrder = new int[tetraCNT];
for(i = 0; i < tetraCNT; i++){
tetra& tet = tetARRAY[i];
polyOrder[i] = tet.PolyOrderFlag;
for(j = 0; j < NumOfNodes; j++){
coord[j] = (tet.getNode(j))->getCoord();
}
tet.geometry(lvtr, avtr, &vol);
avtr[3].reset();
avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);
tetraMAP_aux = new int[TetPolyOrderDim[tet.PolyOrderFlag]];
MapE_Pe = new int[2 * TetPolyOrderDim[tet.PolyOrderFlag]];
#if defined(DGTD_USE_CUDA)
E_coeff = new fp_t_ts[TetPolyOrderDim[tet.PolyOrderFlag]];
H_coeff = new fp_t_ts[TetPolyOrderDim[tet.PolyOrderFlag]];
#else
E_coeff = new fp_t[TetPolyOrderDim[tet.PolyOrderFlag]];
H_coeff = new fp_t[TetPolyOrderDim[tet.PolyOrderFlag]];
#endif
// E field
tet.Local_DG_mapE(tetraMAP_aux, tet.LocalOffsetE);
for(j = 0 ; j < TetPolyOrderDim[tet.PolyOrderFlag] ; j++){
if(tetraMAP_aux[j] < 0)
E_coeff[j] = 0.0;
else
#if defined(DGTD_USE_CUDA)
E_coeff[j] = En1_h[tetraMAP_aux[j]];
#else
E_coeff[j] = en_1->getentry(tetraMAP_aux[j]);
#endif
}
// H field
tet.Local_DG_mapH(tetraMAP_aux, tet.LocalOffsetH);
for(j = 0 ; j < TetPolyOrderDim[tet.PolyOrderFlag] ; j++){
if(tetraMAP_aux[j] < 0)
H_coeff[j] = 0.0;
else
#if defined(DGTD_USE_CUDA)
H_coeff[j] = Hn32_h[tetraMAP_aux[j]];
#else
H_coeff[j] = hn_32->getentry(tetraMAP_aux[j]);
#endif
}
for(j = 0; j < 4; j++){
zeta[0] = BaryCoord[j][0];
zeta[1] = BaryCoord[j][1];
zeta[2] = BaryCoord[j][2];
zeta[3] = BaryCoord[j][3];
eLocal[j] = CalcEfield(E_coeff, avtr, vol, zeta, tet.PolyOrderFlag);
hLocal[j] = CalcEfield(H_coeff, avtr, vol, zeta, tet.PolyOrderFlag);
int index = tet.nd[j]->getid();
eField[index] = eField[index] + eLocal[j] /*- Einc*/;
hField[index] = hField[index] + hLocal[j] /*- Hinc*/;
count[index] += 1;
}
regE.setRegister(i, eLocal);
regH.setRegister(i, hLocal);
delete [] tetraMAP_aux;
delete [] MapE_Pe;
delete [] E_coeff;
delete [] H_coeff;
}
for(i = 0; i < nodeCNT; i++){
eField[i] = eField[i] / static_cast<fp_t>(count[i]);
hField[i] = hField[i] / static_cast<fp_t>(count[i]);
}
VtkWriter vtkWriter(1.0);
// VtkWriter vtkWriter(unit);
char vtkFilePrefix[128];
memset(vtkFilePrefix, 0, 128 * sizeof(char));
sprintf(vtkFilePrefix, "%s_%04d", fname, timeStep);
vtkWriter.writeField(vtkFilePrefix, nodeCNT, ndARRAY, tetraCNT, tetARRAY, eField, hField, polyOrder, 0, 0); //TODO: why here polyorder is not 1
delete [] eField;
delete [] hField;
delete [] polyOrder;
delete [] count;
}
// Modified by qi jian to compute the analytical incident field at the probes
void FemGrp::writeAnalyticalIncidentPWProbes(int timeStep){
int i;
vtr Einc;
vtr Hinc;
vtr r;
vtr Einc_field;
vtr Hinc_field;
fp_t zeta[4];
char csvFileName[StrOutput];
sprintf(csvFileName, "AnalyticalIncidentField_%s_%04d.csv", fname, timeStep);
std::ofstream csvFile(csvFileName);
csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";
for(i = 0; i < probeCNT; i++)
{
// Get the Incident Field at the probe
int number_of_associated_tets = probes_bary.at(i).first;
Einc.reset();
Hinc.reset();
std::vector<std::pair<int, std::array<double, 4>>> found_tets = probes_bary.at(i).second;
Einc_field.reset(); // Store for all valid candidate tets
Hinc_field.reset(); // Store for all valid candidate tets
for (int t = 0; t < number_of_associated_tets; t++)
{
int tet_id = found_tets.at(t).first;
array<double,4> tri_bary_coord = found_tets.at(t).second;
tetra& tet = tetARRAY[tet_id];
zeta[0] = static_cast<fp_t>(tri_bary_coord[0]);
zeta[1] = static_cast<fp_t>(tri_bary_coord[1]);
zeta[2] = static_cast<fp_t>(tri_bary_coord[2]);
zeta[3] = static_cast<fp_t>(tri_bary_coord[3]);
SimplexToCartesian(tet, r, zeta);
getAnalyticalPWField(tet, r, Einc, Hinc, timeStep, LocTimeSteps[N_class -1]);
Einc_field = Einc_field + Einc;
Hinc_field = Hinc_field + Hinc;
}
Einc_field = Einc_field / ((fp_t) number_of_associated_tets);
Hinc_field = Hinc_field / ((fp_t) number_of_associated_tets);
const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};
csvFile << std::setprecision(max_precision) << Einc_field.getx() << "," << Einc_field.gety() << "," << Einc_field.getz() << "," << Hinc_field.getx() << "," << Hinc_field.gety() << "," << Hinc_field.getz() << "\n";
}
usleep(100);
csvFile.close();
}
// Analytical incident plane-wave field at position r.
//
// Parameters:
//   tet      - tetrahedron containing r; its material gives the wave impedance
//   r        - evaluation point
//   Einc     - [out] incident E, sampled at t = dt * (timeStep + 1.0)
//   Hinc     - [out] incident H, sampled at t = dt * (timeStep + 1.5)
//   timeStep - current time-step index
//   dt       - time-step size
//
// Einc / Hinc are only written when a boundary condition of type
// planeWaveType or pmlType exists; if several exist, the last one wins.
// The waveform is selected by ExcitFlag:
//   0 time-harmonic cosine, 1 Gaussian pulse (optionally modulated),
//   2 Neumann pulse, 3 Hann-windowed cosine, 4 Hann-windowed linear chirp.
// Any other ExcitFlag yields a zero field.
//
// The spatial delay uses the constant Vo, not the velocity in the material of
// `tet`.
void FemGrp::getAnalyticalPWField(tetra& tet, vtr& r, vtr& Einc, vtr& Hinc, int timeStep, fp_t dt){
    fp_t eta = No * sqrt(tet.mat->mur.getEntry(0,0) / tet.mat->epsr.getEntry(0,0));
    fp_t omega = 2.0 * Pi * freq * MEGA;
    fp_t Exponent;
    fp_t SinModul;
    fp_t Neuman;

    // Sampling instants of E and H (H is half a step later)
    const fp_t tE = dt * (timeStep + 1.0);
    const fp_t tH = dt * (timeStep + 1.5);

    for(int i = 0; i < bcCNT; i++){
        bc bc_i = bcARRAY[i];
        if(bc_i.getbType() == planeWaveType || bc_i.getbType() == pmlType){
            fp_t Emagnitude = bc_i.getMagE();
            fp_t theta_in_rad = bc_i.getTheta() * Pi / 180.0;
            fp_t phi_in_rad = bc_i.getPhi() * Pi / 180.0;
            vtr Epol = bc_i.getField();
            vtr kvtr(sin(theta_in_rad) * cos(phi_in_rad), sin(theta_in_rad) * sin(phi_in_rad), cos(theta_in_rad));
            vtr Hpol = kvtr * Epol;
            vtr ro = bc_i.getPW_ro();
            fp_t Hmagnitude = Emagnitude / eta;
            Hpol.unitvtr();
            Epol.unitvtr();

            // Propagation delay from the reference point ro to r
            const auto delay = dotP(kvtr, r - ro) / Vo;

            // Zero by default, so an inactive window or an unknown ExcitFlag
            // never leaves these uninitialised.
            fp_t IncidExcit_E = 0.0;
            fp_t IncidExcit_H = 0.0;

            switch(ExcitFlag){
                case 0: //(not tested)
                    // Time-harmonic plane wave, switched on once the retarded
                    // time is non-negative. The retarded time is computed
                    // BEFORE it is tested.
                    Exponent = tE - To - delay;
                    if(Exponent >= 0.0){
                        SinModul = cos(omega * Exponent);
                        IncidExcit_E = Emagnitude * SinModul;
                    }
                    Exponent = tH - To - delay;
                    if(Exponent >= 0.0){
                        SinModul = cos(omega * Exponent);
                        IncidExcit_H = Hmagnitude * SinModul;
                    }
                    break;
                case 1:
                    // Gauss Pulse
                    Exponent = tE - To - delay;
                    SinModul = ModuleFlag ? cos(omega * Exponent) : 1.0;
                    IncidExcit_E = Emagnitude * SinModul * exp(-(Exponent * Exponent) / (Tau * Tau));
                    Exponent = tH - To - delay;
                    SinModul = ModuleFlag ? cos(omega * Exponent) : 1.0;
                    IncidExcit_H = Hmagnitude * SinModul * exp(-(Exponent * Exponent) / (Tau * Tau));
                    break;
                case 2: //(not tested)
                    // Neuman Pulse
                    Exponent = tE - To - delay;
                    Neuman = (2.0 * Exponent) / (Tau * Tau);
                    IncidExcit_E = (Emagnitude * Neuman) * exp(-(Exponent * Exponent) / (Tau * Tau));
                    Exponent = tH - To - delay;
                    Neuman = (2.0 * Exponent) / (Tau * Tau);
                    IncidExcit_H = Hmagnitude * Neuman * exp(-(Exponent * Exponent) / (Tau * Tau));
                    break;
                case 3:
                {
                    // DC-Free Hann-Modulated Cosine Pulse (with time delay)
                    fp_t tdelay = To; // To represents the delay time
                    Exponent = tE - tdelay - delay;
                    if (Exponent >= 0.0 && Exponent <= Tau) {
                        // Shift exponent relative to pulse center
                        fp_t t_rel = Exponent - Tau / 2.0;
                        fp_t window = 0.5 * (1.0 - cos(2.0 * Pi * Exponent / Tau)); // Hann window
                        SinModul = cos(omega * t_rel);
                        IncidExcit_E = Emagnitude * SinModul * window;
                    }
                    Exponent = tH - tdelay - delay;
                    if (Exponent >= 0.0 && Exponent <= Tau) {
                        fp_t t_rel = Exponent - Tau / 2.0;
                        fp_t window = 0.5 * (1.0 - cos(2.0 * Pi * Exponent / Tau)); // Hann window
                        SinModul = cos(omega * t_rel);
                        IncidExcit_H = Hmagnitude * SinModul * window;
                    }
                    break;
                }
                case 4: // Linear Chirp Excitation with sine start and Hann window
                {
                    fp_t f_end = freq * MEGA;
                    fp_t B = Tau * MEGA;      // Tau is reused as the bandwidth here
                    fp_t f0 = f_end - B;
                    fp_t f1 = f_end;
                    fp_t Tchirp = To;         // To is reused as the chirp duration here

                    // Incident Electric Field (E)
                    Exponent = tE - delay;
                    if (Exponent >= 0.0 && Exponent <= Tchirp)
                    {
                        fp_t chirpArg = 2.0 * Pi * f0 * Exponent + Pi * (f1 - f0) / Tchirp * Exponent * Exponent;
                        fp_t window = 0.5 * (1.0 - cos(2.0 * Pi * Exponent / Tchirp)); // Hann window
                        IncidExcit_E = Emagnitude * sin(chirpArg) * window;
                    }

                    // Incident Magnetic Field (H)
                    // Same retarded time as E apart from the half-step offset.
                    // To is the chirp duration in this case, not a delay, so
                    // it must not be subtracted (the previous code did, which
                    // shifted H by a full chirp length relative to E).
                    Exponent = tH - delay;
                    if (Exponent >= 0.0 && Exponent <= Tchirp)
                    {
                        fp_t chirpArg = 2.0 * Pi * f0 * Exponent + Pi * (f1 - f0) / Tchirp * Exponent * Exponent;
                        fp_t window = 0.5 * (1.0 - cos(2.0 * Pi * Exponent / Tchirp)); // Hann window
                        IncidExcit_H = Hmagnitude * sin(chirpArg) * window;
                    }
                    break;
                }
                default:
                    break;
            }
            Einc = Epol * IncidExcit_E;
            Hinc = Hpol * IncidExcit_H;
            // cout << "Einc at: (" << r.getx() << ", " << r.gety() << ", " << r.getz() << ") = (" << Einc.getx() << ", " << Einc.gety() << ", " << Einc.getz() << ")" << endl;
        }
    }
}
// Compute the surface quantities NormalVtr * E * (-1) (stored as M) and
// NormalVtr * H (stored as J) on every face of SurfMesh and write them out
// for time step `timeStep`:
//   - node-averaged values through VtkWriter::writeTriUg ("Currents_<fname>_<timeStep>"),
//   - un-averaged per-face values through printRegister ("Currents_<fname>_<timeStep>"
//     with a 5-digit step),
//   - the renumbered triangle mesh through printTriMesh, at timeStep == 0 only.
// NOTE(review): `vtr * vtr` is presumably the cross product (n x E, n x H) -- confirm
// against the vtr class.
void FemGrp::writeEquivalentSurfaceCurrents_(int timeStep){
int i, j;
int m;
int index;
// NOTE(review): FaceNum is only assigned when the surface face is found among the
// tetrahedron's faces below; if it is never found, it is read uninitialised.
int FaceNum;
fp_t vol;
fp_t zeta[4];
fp_t Area;
vtr NormalVtr;
vtr lvtr[3];
vtr avtr[4];
vtr coord[4];
vtr eLocal[4];
vtr hLocal[4];
vtr eLocalFace[3];
vtr hLocalFace[3];
tetra* tet;
// Per-tetrahedron coefficient buffers, sized with the global PolyFlag and filled by
// Get_Coefficients_ for each face's tetrahedron.
ArrayFP<fp_t>* origEn_1 = new ArrayFP<fp_t>(TetPolyOrderDim[PolyFlag]);
ArrayFP<fp_t>* origHn_32 = new ArrayFP<fp_t>(TetPolyOrderDim[PolyFlag]);
char Currents_vtkFile[StrOutput];
sprintf(Currents_vtkFile, "Currents_%s_%04d", fname, timeStep);
// fill the port field with averaged values
vtr* JField = new vtr[SurfMesh->nodeCNT];
vtr* MField = new vtr[SurfMesh->nodeCNT];
// count[n] = number of face corners that contributed to surface node n
int* count = new int[SurfMesh->nodeCNT];
memset(count, 0, SurfMesh->nodeCNT * sizeof(int));
// NOTE(review): regMface / regJface are not declared in this function and are
// allocated on every call without being released here -- check whether they are
// freed elsewhere, otherwise each call leaks the previous arrays.
regMface = new Register[SurfMesh->faceCNT];
regJface = new Register[SurfMesh->faceCNT];
for(i = 0; i < SurfMesh->faceCNT; i++){
SurfMesh->fcArray[i]->getAreaNormal(&Area, &NormalVtr);
// Fields are taken from the first tetrahedron attached to the face (hydra[0]).
tet = SurfMesh->fcArray[i]->hydra[0];
tet->geometry(lvtr, avtr, &vol);
// avtr[3] = -(avtr[0] + avtr[1] + avtr[2])
avtr[3].reset();
avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);
Get_Coefficients_(tet, origEn_1, origHn_32);
// Find which local face of the tetrahedron this surface face is.
// (zeta is cleared here but fully overwritten before it is used below.)
for(m = 0; m < NumOfFaces; m++){
zeta[m] = 0.0;
if(SurfMesh->fcArray[i] == tet->getFacePtr(m))
FaceNum = m;
}
// Evaluate E and H at the four BaryCoord points of the tetrahedron.
for(j = 0; j < 4; j++){
zeta[0] = BaryCoord[j][0];
zeta[1] = BaryCoord[j][1];
zeta[2] = BaryCoord[j][2];
zeta[3] = BaryCoord[j][3];
eLocal[j] = CalcEfield(origEn_1->getEntryPtr(), avtr, vol, zeta, tet->PolyOrderFlag);
hLocal[j] = CalcEfield(origHn_32->getEntryPtr(), avtr, vol, zeta, tet->PolyOrderFlag);
}
regMface[i].initial(3);
regJface[i].initial(3);
// Pick the three evaluation points that belong to this face (via faceMAP), then
// accumulate on the surface nodes and store the raw per-face values.
for(j = 0; j < 3; j++){
eLocalFace[j] = eLocal[faceMAP[FaceNum][j]];
hLocalFace[j] = hLocal[faceMAP[FaceNum][j]];
// Global node id -> surface-local node index
index = SurfMesh->globToLocMap_->find(SurfMesh->fcArray[i]->getNode(j)->getid())->second;
MField[index] = MField[index] + NormalVtr * eLocalFace[j] * (-1.0);
JField[index] = JField[index] + NormalVtr * hLocalFace[j] * (1.0);
// No averaging
regMface[i].setField(j, NormalVtr * eLocalFace[j] * (-1.0));
regJface[i].setField(j, NormalVtr * hLocalFace[j] * (1.0));
count[index] += 1;
}
}
// This is for visualization in the vtk format
// NOTE(review): a surface node with count == 0 would be divided by zero here.
for(i = 0; i < SurfMesh->nodeCNT; i++){
MField[i] = MField[i] / static_cast<fp_t>(count[i]);
JField[i] = JField[i] / static_cast<fp_t>(count[i]);
}
// Build a temporary copy of the surface nodes, renumbered with their
// surface-local indices.
node** locNodeArray = new node*[SurfMesh->nodeCNT];
for(i = 0; i < SurfMesh->nodeCNT; i++){
node& Node = *(SurfMesh->ndArray[i]);
int index = SurfMesh->globToLocMap_->find(Node.getid())->second;
locNodeArray[index] = new node(index, Node.getPType(), Node.getSingOrder(), Node.getCoord().getx(), Node.getCoord().gety(), Node.getCoord().getz());
}
// Copy every surface face and re-point it at the renumbered node copies.
face** locFaceArray = new face*[SurfMesh->faceCNT];
for(i = 0; i < SurfMesh->faceCNT; i++){
face& Face = *(SurfMesh->fcArray[i]);
locFaceArray[i] = new face(Face);
locFaceArray[i]->setFace(
locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(0)->getid())->second],
locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(1)->getid())->second],
locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(2)->getid())->second]);
}
// Vtk
VtkWriter vtkWriter(1.);
vtkWriter.writeTriUg(Currents_vtkFile, SurfMesh->nodeCNT, locNodeArray, SurfMesh->faceCNT, locFaceArray, MField, JField, 1);
// Register
char regFileName[StrOutput];
char regFileNameDebug[StrOutput];
memset(regFileName, 0, StrOutput * sizeof(char));
sprintf(regFileName, "Currents_%s_%05d", fname, timeStep);
// Only used by the commented-out debug dump below.
sprintf(regFileNameDebug, "Currents_%s_%05d_dbg", fname, timeStep);
printRegister(regMface, regJface, SurfMesh->faceCNT, regFileName,1);
// printRegisterDebug(regMface, regJface, SurfMesh->faceCNT, regFileNameDebug,2);
// The triangle mesh is written once, at the first time step.
if(timeStep == 0)
printTriMesh(SurfMesh->nodeCNT, locNodeArray, SurfMesh->faceCNT, locFaceArray, fname);
// Release the temporary node / face copies and the work buffers.
for(i = 0; i < SurfMesh->nodeCNT; i++)
delete locNodeArray[i];
delete [] locNodeArray;
for(i = 0; i < SurfMesh->faceCNT; i++)
delete locFaceArray[i];
delete [] locFaceArray;
delete [] MField;
delete [] JField;
delete [] count;
delete origEn_1;
delete origHn_32;
}
// Print face registers
// Dump the per-face registers to "<prjName>_BC.curM" and "<prjName>_BC.curJ".
//
// For every face the x, y, z components of fields 0..2 are written one value
// per line. With order == 2 these are followed by the averages of the field
// pairs listed in First2Second. Each face record ends with an empty line.
// Any other value of `order` writes nothing for the faces.
void FemGrp::printRegister(Register* regMface, Register* regJface, int FaceCnt, char *prjName, int order){
    char fnameJ3[180];
    char fnameM3[180];
    sprintf(fnameM3, "%s_BC.curM", prjName);
    sprintf(fnameJ3, "%s_BC.curJ", prjName);

    ofstream foutJ3, foutM3;
    foutM3.open(fnameM3, ios::out);
    foutJ3.open(fnameJ3, ios::out);

    // Components of the three stored fields, one value per line.
    auto writeStoredFields = [&](ofstream& out, Register& reg){
        for(int k = 0; k < 3; k++){
            out << reg.getField(k).getx() << endl;
            out << reg.getField(k).gety() << endl;
            out << reg.getField(k).getz() << endl;
        }
    };

    // Averages of the field pairs given by First2Second, one value per line.
    auto writePairAverages = [&](ofstream& out, Register& reg){
        for(int k = 0; k < 3; k++){
            int index0 = First2Second[k][0];
            int index1 = First2Second[k][1];
            out << 0.5 * (reg.getField(index0).getx() + reg.getField(index1).getx()) << endl;
            out << 0.5 * (reg.getField(index0).gety() + reg.getField(index1).gety()) << endl;
            out << 0.5 * (reg.getField(index0).getz() + reg.getField(index1).getz()) << endl;
        }
    };

    const bool firstOrder = (order == 1);
    const bool secondOrder = (order == 2);

    for(int faceIdx = 0; faceIdx < FaceCnt; faceIdx++){
        if(!firstOrder && !secondOrder)
            continue;

        writeStoredFields(foutM3, regMface[faceIdx]);
        if(secondOrder)
            writePairAverages(foutM3, regMface[faceIdx]);
        foutM3 << endl;

        writeStoredFields(foutJ3, regJface[faceIdx]);
        if(secondOrder)
            writePairAverages(foutJ3, regJface[faceIdx]);
        foutJ3 << endl;
    }

    foutJ3.close();
    foutM3.close();
}
// Print out Outer Surface node & triangle info on *.tri
void FemGrp::printTriMesh(int ndNum, node **ndArray, int fcNum, face **fcArray, char *prjName){
int i;
face* fcPtr;
FILE* fd;
char triName[360];
sprintf(triName, "%s.tri", prjName);
fd = fopen(triName, "wt");
fprintf(fd, "%f\n", unit);
fprintf(fd, "%d\n", ndNum);
for(i = 0; i < ndNum; i ++){
fprintf(fd, "%f %f %f\n",
(ndArray[i]->getCoord().getx()) / unit,
(ndArray[i]->getCoord().gety()) / unit,
(ndArray[i]->getCoord().getz()) / unit);
}
fprintf(fd,"%d\n", fcNum);
for(i = 0; i < fcNum; i ++){
fcPtr = fcArray[i];
node* n0Ptr;
node* n1Ptr;
node* n2Ptr;
n0Ptr = fcPtr->getNode(0);
n1Ptr = fcPtr->getNode(1);
n2Ptr = fcPtr->getNode(2);
fprintf(fd, "%d %d %d\n", n0Ptr->getid(), n1Ptr->getid(), n2Ptr->getid());
}
fclose(fd);
}
// Modified by qi jian to compute the L2 error at the probes
void FemGrp::CalculateL2ErrorProbes(int& timeStep, fp_t dt, int TimeDistFlag){
int i, j;
fp_t vol;
fp_t zeta[4];
vtr lvtr[3];
vtr avtr[4];
vtr eLocal;
vtr hLocal;
vtr eLocal_exa;
vtr hLocal_exa;
vtr eLocal_all;
vtr hLocal_all;
vtr eLocal_exa_all;
vtr hLocal_exa_all;
fp_t E_coeff[TetPolyOrderDim[getPolyFlag()]];
fp_t H_coeff[TetPolyOrderDim[getPolyFlag()]];
fp_t IntegrOmegaE = 0.0;
fp_t IntegrOmegaH = 0.0;
vtr r;
vtr Exa_NumE;
vtr Exa_NumH;
char Error_E_TimeLog[180];
char Error_H_TimeLog[180];
int outOfModelProbes = 0;
for(i = 0; i < probeCNT; i++)
{
int number_of_associated_tets = probes_bary.at(i).first;
eLocal.reset();
hLocal.reset();
std::vector<std::pair<int, std::array<double, 4>>> found_tets = probes_bary.at(i).second;
eLocal_exa.reset();
hLocal_exa.reset();
eLocal_all.reset();
hLocal_all.reset();
eLocal_exa_all.reset();
hLocal_exa_all.reset();
for (int t = 0; t < number_of_associated_tets; t++)
{
int tet_id = found_tets.at(t).first;
array<double,4> probe_bary_coord = found_tets.at(t).second;
tetra& tet = tetARRAY[tet_id];
int tetraMAP[TetPolyOrderDim[tet.PolyOrderFlag]];
tet.geometry(lvtr, avtr, &vol);
avtr[3].reset();
avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);
// Compute the Efield
tet.Local_DG_mapE(tetraMAP, tet.LocalOffsetE);
for(j = 0 ; j < TetPolyOrderDim[tet.PolyOrderFlag] ; j++){
if(tetraMAP[j] < 0)
E_coeff[j] = 0.0;
else
E_coeff[j] = en_1->getentry(tetraMAP[j]);
}
// Compute the Hfield
tet.Local_DG_mapH(tetraMAP, tet.LocalOffsetH);
for(j = 0 ; j < TetPolyOrderDim[tet.PolyOrderFlag] ; j++){
if(tetraMAP[j] < 0)
H_coeff[j] = 0.0;
else
H_coeff[j] = hn_32->getentry(tetraMAP[j]);
}
eLocal.reset();
hLocal.reset();
eLocal_exa.reset();
hLocal_exa.reset();
zeta[0] = static_cast<fp_t>(probe_bary_coord[0]);
zeta[1] = static_cast<fp_t>(probe_bary_coord[1]);
zeta[2] = static_cast<fp_t>(probe_bary_coord[2]);
zeta[3] = static_cast<fp_t>(probe_bary_coord[3]);
SimplexToCartesian(tet, r, zeta);
eLocal = CalcEfield(E_coeff, avtr, vol, zeta, tet.PolyOrderFlag);
hLocal = CalcEfield(H_coeff, avtr, vol, zeta, tet.PolyOrderFlag);
GetExactSolution(tet, r, eLocal_exa, hLocal_exa, timeStep, dt, TimeDistFlag);
// Add all the local fields from all relevant tets
eLocal_all = eLocal_all + eLocal;
hLocal_all = hLocal_all + hLocal;
eLocal_exa_all = eLocal_exa_all + eLocal_exa;
hLocal_exa_all = hLocal_exa_all + hLocal_exa;
}
eLocal_all = eLocal_all / ((fp_t) number_of_associated_tets);
hLocal_all = hLocal_all / ((fp_t) number_of_associated_tets);
eLocal_exa_all = eLocal_exa_all / ((fp_t) number_of_associated_tets);
hLocal_exa_all = hLocal_exa_all / ((fp_t) number_of_associated_tets);
Exa_NumE = eLocal_exa_all - eLocal_all;
Exa_NumH = hLocal_exa_all - hLocal_all;
IntegrOmegaE += Exa_NumE.magnitude() * Exa_NumE.magnitude();
IntegrOmegaH += Exa_NumH.magnitude() * Exa_NumH.magnitude();
sprintf(Error_E_TimeLog, "%s_Probe_%d.TDerrorE", fname, i);
sprintf(Error_H_TimeLog, "%s_Probe_%d.TDerrorH", fname, i);
ofstream Error_E(Error_E_TimeLog, ios_base::out | ios::app);
Error_E.setf(ios::scientific, ios::floatfield);
Error_E.precision(15);
if(!Error_E)
cout << "Error in opening file: " << Error_E_TimeLog << " for write " << endl;
Error_E << "[" << (timeStep + 1.0) * dt << ", " << Exa_NumE.magnitude() << "]; \n";
Error_E.close();
ofstream Error_H(Error_H_TimeLog, ios_base::out | ios::app);
Error_H.setf(ios::scientific, ios::floatfield);
Error_H.precision(15);
if(!Error_H)
cout << "Error in opening file: " << Error_H_TimeLog << " for write " << endl;
Error_H << "[" << (timeStep + 1.5) * dt << ", " << Exa_NumH.magnitude() << "]; \n";
Error_H.close();
}
// Write to file
if(outOfModelProbes < probeCNT)
{
sprintf(Error_E_TimeLog, "%s_Probes_Global.TDerrorE", fname);
sprintf(Error_H_TimeLog, "%s_Probes_Global.TDerrorH", fname);
ofstream Error_E(Error_E_TimeLog, ios_base::out | ios::app);
Error_E.setf(ios::scientific, ios::floatfield);
Error_E.precision(15);
if(!Error_E)
cout << "Error in opening file: " << Error_E_TimeLog << " for write " << endl;
Error_E << "[" << (timeStep + 1.0) * dt << ", " << sqrt(IntegrOmegaE / (probeCNT - outOfModelProbes)) << "]; \n";
Error_E.close();
ofstream Error_H(Error_H_TimeLog, ios_base::out | ios::app);
Error_H.setf(ios::scientific, ios::floatfield);
Error_H.precision(15);
if(!Error_H)
cout << "Error in opening file: " << Error_H_TimeLog << " for write " << endl;
Error_H << "[" << (timeStep + 1.5) * dt << ", " << sqrt(IntegrOmegaH / (probeCNT - outOfModelProbes)) << "]; \n";
Error_H.close();
}
}
void FemGrp::CalculateL2Error(int& timeStep, fp_t dt, int TimeDistFlag){
int i, j;
fp_t vol;
fp_t zeta[4];
vtr lvtr[3];
vtr avtr[4];
vtr coord[4];
vtr eLocal[4];
vtr hLocal[4];
vtr eLocal_exa[4];
vtr hLocal_exa[4];
int QuadOrder = 2; //TODO: Recheck with the order of the basis
int points = 4;
fp_t** ZetaMat = new fp_t*[points];
fp_t* weights = new fp_t[points];
for(int i = 0; i < points; i++)
ZetaMat[i] = new fp_t[4];
GetTetQuadRule(QuadOrder, points, ZetaMat, weights);
fp_t IntegrOmegaE = 0.0;
fp_t IntegrOmegaH = 0.0;
fp_t NormalizeOmegaE = 0.0;
fp_t NormalizeOmegaH = 0.0;
for(i = 0; i < tetraCNT; i++){
tetra& tet = tetARRAY[i];
int tetraMAP_E[TetPolyOrderDim[tet.PolyOrderFlag]];
int tetraMAP_H[TetPolyOrderDim[tet.PolyOrderFlag]];
auto origEn_1 = new ArrayFP<fp_t>(TetPolyOrderDim[tet.PolyOrderFlag]);
auto origHn_32 = new ArrayFP<fp_t>(TetPolyOrderDim[tet.PolyOrderFlag]);
for(j = 0; j < 4; j++){
coord[j] = (tet.getNode(j))->getCoord();
}
tet.geometry(lvtr, avtr, &vol);
avtr[3].reset();
avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);
tet.Local_DG_mapE(tetraMAP_E, tet.LocalOffsetE);
tet.Local_DG_mapH(tetraMAP_H, tet.LocalOffsetH);
origEn_1->reset();
origHn_32->reset();
for(j = 0 ; j < TetPolyOrderDim[tet.PolyOrderFlag]; j++){
if(tetraMAP_E[j] < 0)
origEn_1->setentry(j, 0.0);
else
origEn_1->setentry(j, en_1->getentry(tetraMAP_E[j]));
if(tetraMAP_H[j] < 0)
origHn_32->setentry(j, 0.0);
else
origHn_32->setentry(j, hn_32->getentry(tetraMAP_H[j]));
}
fp_t IntegrValueE = 0.0;
fp_t IntegrValueH = 0.0;
fp_t NormalizeValueE = 0.0;
fp_t NormalizeValueH = 0.0;
vtr r;
vtr Exa_NumE;
Exa_NumE.reset();
vtr Exa_NumH;
Exa_NumH.reset();
//Tetrahedron integration
for(j = 0; j < points; j++){
zeta[0] = ZetaMat[j][0];
zeta[1] = ZetaMat[j][1];
zeta[2] = ZetaMat[j][2];
zeta[3] = ZetaMat[j][3];
SimplexToCartesian(tet, r, zeta);
eLocal[j] = CalcEfield(origEn_1->getEntryPtr(), avtr, vol, zeta, tet.PolyOrderFlag);
hLocal[j] = CalcEfield(origHn_32->getEntryPtr(), avtr, vol, zeta, tet.PolyOrderFlag);
GetExactSolution(tet, r, eLocal_exa[j], hLocal_exa[j], timeStep, dt, TimeDistFlag);
Exa_NumE = eLocal_exa[j] - eLocal[j];
Exa_NumH = hLocal_exa[j] - hLocal[j];
IntegrValueE += weights[j] * vol * (Exa_NumE.magnitude() * Exa_NumE.magnitude());
IntegrValueH += weights[j] * vol * (Exa_NumH.magnitude() * Exa_NumH.magnitude());
NormalizeValueE += weights[j] * vol * (eLocal_exa[j].magnitude() * eLocal_exa[j].magnitude());
NormalizeValueH += weights[j] * vol * (hLocal_exa[j].magnitude() * hLocal_exa[j].magnitude());
}
IntegrOmegaE = IntegrOmegaE + IntegrValueE;
IntegrOmegaH = IntegrOmegaH + IntegrValueH;
NormalizeOmegaE = NormalizeOmegaE + NormalizeValueE;
NormalizeOmegaH = NormalizeOmegaH + NormalizeValueH;
}
// Write to file
char Error_E_TimeLog[180];
char Error_H_TimeLog[180];
sprintf(Error_E_TimeLog, "%s.TDerrorE", fname);
sprintf(Error_H_TimeLog, "%s.TDerrorH", fname);
ofstream Error_E(Error_E_TimeLog, ios_base::out | ios::app);
Error_E.setf(ios::scientific, ios::floatfield);
Error_E.precision(15);
if(!Error_E)
cout << "Error in opening file: " << Error_E_TimeLog << " for write " << endl;
Error_E << "[" << (timeStep + 1.0) * dt << ", " << sqrt(IntegrOmegaE) << "]; \n";
Error_E.close();
ofstream Error_H(Error_H_TimeLog, ios_base::out | ios::app);
Error_H.setf(ios::scientific, ios::floatfield);
Error_H.precision(15);
if(!Error_H)
cout << "Error in opening file: " << Error_H_TimeLog << " for write " << endl;
Error_H << "[" << (timeStep + 1.5) * dt << ", " << sqrt(IntegrOmegaH) << "]; \n";
Error_H.close();
delete[] weights;
for(i = 0; i < points; i++)
delete[] ZetaMat[i];
delete[] ZetaMat;
}
/**
 * Converts simplex (barycentric) coordinates into a Cartesian point.
 *
 * @param tet   tetrahedron whose four node coordinates are blended
 * @param r     output: sum over the four nodes of coord(node) * zeta[node]
 * @param zeta  the four simplex weights
 */
void FemGrp::SimplexToCartesian(tetra& tet, vtr& r, fp_t zeta[4]){
    fp_t cart[3] = {0., 0., 0.};
    int node = 0;
    while(node < 4){
        const fp_t weight = zeta[node];
        cart[0] += tet.getNode(node)->getCoord().getx() * weight;
        cart[1] += tet.getNode(node)->getCoord().gety() * weight;
        cart[2] += tet.getNode(node)->getCoord().getz() * weight;
        ++node;
    }
    r.setvtr(cart[0], cart[1], cart[2]);
}
/**
 * Evaluates the analytic plane-wave fields at point r for every boundary
 * condition of type planeWaveType or pmlType. Einc/Hinc are overwritten for
 * each matching boundary, so the last matching one wins.
 *
 * E is evaluated at (timeStep + 1.0) * dt and H at (timeStep + 1.5) * dt.
 *
 * @param tet       element whose material (entry (0,0) of epsr / mur) sets
 *                  the impedance and propagation speed
 * @param r         evaluation point
 * @param Einc      output electric field
 * @param Hinc      output magnetic field
 * @param timeStep  time-step index
 * @param dt        time-step size
 * @param Flag      0: time-harmonic cosine, 1: Gaussian pulse (optionally
 *                  cosine-modulated via ModuleFlag), 2: Gaussian scaled by
 *                  2 * t / tau^2; any other value leaves the outputs untouched
 */
void FemGrp::GetExactSolution(tetra& tet, vtr& r, vtr& Einc, vtr& Hinc, int timeStep, fp_t dt, int Flag){
    fp_t to = To;
    fp_t tau = Tau;
    fp_t eta = No * sqrt(tet.mat->mur.getEntry(0,0) / tet.mat->epsr.getEntry(0,0));
    fp_t V_light = Vo / sqrt(tet.mat->epsr.getEntry(0,0) * tet.mat->mur.getEntry(0,0));
    fp_t omega = 2.0 * Pi * freq * MEGA;
    fp_t Exponent;
    fp_t SinModul;
    fp_t Neuman;

    for(int b = 0; b < bcCNT; b++){
        bc boundary = bcARRAY[b];
        if(!(boundary.getbType() == planeWaveType || boundary.getbType() == pmlType))
            continue;

        fp_t Emagnitude = boundary.getMagE();
        fp_t theta_in_rad = boundary.getTheta() * Pi / 180.0;
        fp_t phi_in_rad = boundary.getPhi() * Pi / 180.0;
        vtr Epol = boundary.getField();

        // Unit propagation direction from the spherical angles.
        vtr kvtr(sin(theta_in_rad) * cos(phi_in_rad), sin(theta_in_rad) * sin(phi_in_rad), cos(theta_in_rad));
        kvtr.unitvtr();

        // H polarisation is built from k and the (not yet normalised) E polarisation.
        vtr Hpol = kvtr * Epol;
        vtr ro = boundary.getPW_ro();
        fp_t Hmagnitude = Emagnitude / eta;
        Hpol.unitvtr();
        Epol.unitvtr();

        if(Flag == 0){
            // Scale k to the wavenumber before taking the spatial phase.
            kvtr.Scale((omega / V_light));
            Hinc = Hpol * (Hmagnitude * cos(dotP(kvtr, r - ro) - omega * (timeStep + 1.5) * dt));
            Einc = Epol * (Emagnitude * cos(dotP(kvtr, r - ro) - omega * (timeStep + 1.0) * dt));
        }else if(Flag == 1){
            Exponent = (timeStep + 1.0) * dt - to - (dotP(kvtr, r - ro) / V_light);
            SinModul = ModuleFlag ? cos(omega * Exponent) : 1.0;
            Einc = Epol * SinModul * (Emagnitude * exp(- (Exponent * Exponent) / (tau * tau)));

            Exponent = (timeStep + 1.5) * dt - to - (dotP(kvtr, r - ro) / V_light);
            SinModul = ModuleFlag ? cos(omega * Exponent) : 1.0;
            Hinc = Hpol * SinModul * (Hmagnitude * exp(- (Exponent * Exponent) / (tau * tau)));
        }else if(Flag == 2){
            Exponent = (timeStep + 1.5) * dt - to - (dotP(kvtr, r - ro) / V_light);
            Neuman = (2.0 * Exponent) / (tau * tau);
            Hinc = Hpol * (Hmagnitude * Neuman * exp(- (Exponent * Exponent) / (tau * tau)));

            Exponent = (timeStep + 1.0) * dt - to - (dotP(kvtr, r - ro) / V_light);
            Neuman = (2.0 * Exponent) / (tau * tau);
            Einc = Epol * (Emagnitude * Neuman * exp(- (Exponent * Exponent) / (tau * tau)));
        }
        // Any other Flag: outputs are left unchanged.
    }
}
/* "Early Time Behavior in Reverberation Chambers and
Its Effect on the Relationships Between Coherence
Bandwidth, Chamber Decay Time, RMS Delay
Spread, and the Chamber Buildup Time", Christopher L. Holloway et al. */
bool FemGrp::calculatePade(int currentTimeStep){
int M = currentTimeStep / tsPerSampling;
int totalSamples = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
int N = (int)floor(M / 2.0);
int finish = 0;
timer_start("Process : ", 'm');
#pragma omp parallel for schedule(static) shared(finish)
for(int pade = 0; pade < padeCNT; pade++){
int auxFinish = 0;
fp_t convergence = 0.0;
fp_t maxProbe = 0.0;
for(int component = 0; component < NumOfFieldComponents; component++){
fp_t a_k[N] = {0};
fp_t b_k[N] = {0};
fp_t_ts maxValComponent = 0.0;
getPadeCoef(a_k, b_k, &fieldProbes[pade * totalSamples * NumOfFieldComponents], N, component, &maxValComponent);
maxProbe += maxValComponent;
convergence += maxValComponent * getFreqDomainPade(a_k, b_k, totalSamples, N, &tranferencePadeFunctionFD[pade * totalSamples * NumOfFieldComponents], component, pade, currentTimeStep / tsPerPade == 1);
cout << "Probe = " << pade << " Component = " << component << " Value = " << (convergence / maxProbe) << endl;
if((currentTimeStep / tsPerPade == 1 || (convergence / maxProbe) < PadeTolerance) && (component == NumOfUnitaryVectors - 1 || component == NumOfFieldComponents - 1)){
auxFinish++;
maxProbe = 0.0;
convergence = 0.0;
}
}
#pragma omp atomic update
finish += auxFinish;
}
timer_stop('m');
return finish == 0;
}
void FemGrp::calculatePadeEnd(int currentTimeStep){
int M = currentTimeStep / tsPerSampling;
int totalSamples = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
int N = (int)floor(M / 2.0);
int finish = 0;
timer_start("Process : ", 'm');
for(int pade = 0; pade < padeCNT; pade++){
Complex* FD = new Complex[totalSamples * NumOfFieldComponents];
// #pragma omp parallel for
for(int component = 0; component < NumOfFieldComponents; component++){
fp_t a_k[N] = {0};
fp_t b_k[N] = {0};
fp_t_ts maxValComponent = 0.0;
timer_start("Coef: " + std::to_string(component) + ": ",'m');
getPadeCoef(a_k, b_k, &fieldProbes[pade * totalSamples * NumOfFieldComponents], N, component, &maxValComponent);
timer_stop('m');
timer_start("Freq Dom " + std::to_string(component) + ": ",'m');
getFreqDomainPade(a_k, b_k, totalSamples, N, FD, component, pade, true);
timer_stop('m');
}
// getPadeIFFTEnd(pade, FD);
timer_start("IFFF " + std::to_string(pade) + ": ",'m');
getPadeIFFT(pade, FD);
timer_stop('m');
delete[] FD;
cout << "Pade point exported: " << pade << endl;
}
timer_stop('m');
return;
}
#if defined(DGTD_USE_CUDA)
void FemGrp::calculatePadeEndCUDA(int currentTimeStep){
int M = currentTimeStep / tsPerSampling;
int totalSamples = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
int N = (int)floor(M / 2.0);
int nFields = padeCNT * NumOfFieldComponents;
int finish = 0;
timer_start("Process : ", 'm');
cudaStream_t* streams = (cudaStream_t*)malloc(NumOfFieldComponents * sizeof(cudaStream_t));
CUDA_SAFE_CALL(cudaMalloc((void**)&padeFreqConstant_d, totalSamples * sizeof(int)));
CUDA_SAFE_CALL(cudaMemcpy(padeFreqConstant_d, padeFreqConstant, totalSamples * sizeof(int), cudaMemcpyHostToDevice));
for(int i = 0; i < NumOfFieldComponents; i++){
cudaStreamCreate(&streams[i]);
}
cuDoubleComplex* Hf;
CUDA_SAFE_CALL(cudaMallocHost((void**)&Hf, totalSamples * nFields * sizeof(cuDoubleComplex), cudaHostAllocMapped));
for(int pade = 0; pade < padeCNT; pade++){
fp_t* maxValComponent = new fp_t[NumOfFieldComponents];
for(int component = 0; component < NumOfFieldComponents; component++){
fp_t* a_k;
fp_t* b_k;
CUDA_SAFE_CALL(cudaMallocHost((void**)&a_k, N * sizeof(fp_t), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&b_k, N * sizeof(fp_t), cudaHostAllocMapped));
cuDoubleComplex* FD = &Hf[totalSamples * (pade * NumOfFieldComponents + component)];
getPadeCoefCUDA(a_k, b_k, &maxValComponent[component], pade * NumOfFieldComponents + component, streams[component], currentTimeStep);
getFreqDomainPadeCUDA(a_k, b_k, totalSamples, N, FD, streams[component]);
}
getPadeIFFT(pade, &Hf[pade * totalSamples * NumOfFieldComponents]);
}
for(int i = 0; i < NumOfFieldComponents; i++){
cudaStreamDestroy(streams[i]);
}
timer_stop('m');
CUDA_SAFE_CALL(cudaFree(padeFreqConstant_d));
CUDA_SAFE_CALL(cudaFreeHost(Hf));
return;
}
bool FemGrp::calculatePadeCUDA(int currentTimeStep, bool isFirst, bool isEnd){
if(isEnd){
FreeGPU();
}
int M = currentTimeStep / tsPerSampling;
int totalSamples = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
int N = (int)floor(M / 2.0);
int nFields = padeCNT * NumOfFieldComponents;
int finish = 0;
bool* exitArray = new bool[padeCNT];
timer_start("Process : ", 'm');
cudaStream_t* streams = (cudaStream_t*)malloc(NumOfFieldComponents * sizeof(cudaStream_t));
CUDA_SAFE_CALL(cudaMalloc((void**)&padeFreqConstant_d, totalSamples * sizeof(int)));
CUDA_SAFE_CALL(cudaMemcpy(padeFreqConstant_d, padeFreqConstant, totalSamples * sizeof(int), cudaMemcpyHostToDevice));
for(int i = 0; i < NumOfFieldComponents; i++){
cudaStreamCreate(&streams[i]);
}
int nPoints = isEnd ? probeCNT : padeCNT;
for(int pade = 0; pade < nPoints; pade++){
timer_start("Process : ", 'm');
fp_t* maxValComponent = new fp_t[NumOfFieldComponents];
cuDoubleComplex* Hf;
CUDA_SAFE_CALL(cudaMallocHost((void**)&Hf, totalSamples * NumOfFieldComponents * sizeof(cuDoubleComplex), cudaHostAllocMapped));
#pragma omp parallel for
for(int component = 0; component < NumOfFieldComponents; component++){
fp_t* a_k;
fp_t* b_k;
CUDA_SAFE_CALL(cudaMallocHost((void**)&a_k, N * sizeof(fp_t), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&b_k, N * sizeof(fp_t), cudaHostAllocMapped));
cuDoubleComplex* FD = &Hf[totalSamples * component];
getPadeCoefCUDA(a_k, b_k, &maxValComponent[component], pade * NumOfFieldComponents + component, streams[component], currentTimeStep);
getFreqDomainPadeCUDA(a_k, b_k, totalSamples, N, FD, streams[component]);
}
if(!isFirst && !isEnd){
exitArray[pade] = studyPadeConvergence(&tranferencePadeFunctionFD_h[pade * NumOfFieldComponents * totalSamples], Hf, maxValComponent, totalSamples, pade);
}
if(isEnd){
printFD(pade, Hf);
if(pade < padeCNT && writePadeTD){
getPadeIFFT(pade, Hf);
}
cout << "Final Pade Point " << pade << "completed" << endl;
}else{
CUDA_SAFE_CALL(cudaMemcpy(&tranferencePadeFunctionFD_h[pade * NumOfFieldComponents * totalSamples], Hf, totalSamples * NumOfFieldComponents * sizeof(cuDoubleComplex), cudaMemcpyHostToHost));
}
delete [] maxValComponent;
CUDA_SAFE_CALL(cudaFreeHost(Hf));
timer_stop('m');
}
for(int i = 0; i < NumOfFieldComponents; i++){
cudaStreamDestroy(streams[i]);
}
CUDA_SAFE_CALL(cudaFree(padeFreqConstant_d));
bool exitValue = false;
if(!isFirst && !isEnd){
for(int pade = 0; pade < padeCNT; pade++){
if(pade == 0){
exitValue = exitArray[0];
}else
exitValue = exitValue & exitArray[pade];
}
}
delete [] exitArray;
timer_stop('m');
return exitValue;
}
/**
 * Compares two successive frequency responses of one point and decides
 * whether the Pade approximation has converged.
 *
 * For each field type (printed as "E" for 0, "H" otherwise) the Pearson
 * correlation between |oldField| and |newField| over the M_global samples
 * is computed per component, weighted by maxFields[component] and averaged.
 * The point has converged when the weighted average of every field type is
 * at least PadeTolerance.
 *
 * @param oldField   previous response; component c occupies
 *                   [c * M_global, (c + 1) * M_global)
 * @param newField   current response, same layout
 * @param maxFields  per-component weights, indexed like the components
 * @param M_global   number of frequency samples per component
 * @param point      point index (used for logging only)
 * @return false as soon as one field type falls below PadeTolerance
 *
 * Fix: the component index now includes the field-type offset
 * (typeOfField * NumOfUnitaryVectors). Previously both passes read
 * components 0..NumOfUnitaryVectors-1, so the "H" check silently repeated
 * the "E" check. This assumes components are ordered E first, then H
 * (as in the Ex,Ey,Ez,Hx,Hy,Hz export headers) — confirm against the
 * probe layout.
 */
bool FemGrp::studyPadeConvergence(cuDoubleComplex* oldField, cuDoubleComplex* newField, fp_t* maxFields, int M_global, int point){
    for(int typeOfField = 0; typeOfField < TypeOfFields; typeOfField++){
        fp_t convergence = 0.0;
        fp_t maxProbe = 0.0;
        #pragma omp parallel for reduction(+:convergence, maxProbe)
        for(int component = 0; component < NumOfUnitaryVectors; component++){
            // Global component index of this unit vector within the current field type.
            const int fieldComponent = typeOfField * NumOfUnitaryVectors + component;
            fp_t sum_X = 0.0, sum_Y = 0.0, sum_XY = 0.0, sum_XX = 0.0, sum_YY = 0.0;
            for(int i = 0; i < M_global; i++){
                const int arrayMap = fieldComponent * M_global + i;
                const fp_t lastYf_abs = sqrt(pow(oldField[arrayMap].x,2)+pow(oldField[arrayMap].y,2));
                const fp_t currentYf_abs = sqrt(pow(newField[arrayMap].x,2)+pow(newField[arrayMap].y,2));
                sum_X = sum_X + currentYf_abs;
                sum_Y = sum_Y + lastYf_abs;
                sum_XY = sum_XY + currentYf_abs * lastYf_abs;
                sum_XX = sum_XX + currentYf_abs * currentYf_abs;
                sum_YY = sum_YY + lastYf_abs * lastYf_abs;
            }
            // Pearson correlation of the two magnitude spectra, weighted by the component peak.
            convergence += maxFields[fieldComponent] * (M_global * sum_XY - sum_X * sum_Y) / sqrt((M_global * sum_XX - sum_X * sum_X) * (M_global * sum_YY - sum_Y * sum_Y));
            maxProbe += maxFields[fieldComponent];
        }
        cout << "Convergence Point " << point << " Fields " << (typeOfField ? "H" : "E") << ": " << (convergence / maxProbe) << endl;
        if((convergence / maxProbe) < PadeTolerance){
            return false;
        }
    }
    return true;
}
// Fits the Pade coefficients of one probe/component time series on the GPU.
//
// Builds an (N-1) x (N-1) linear system from samples of fieldProbes, solves
// it with cuSOLVER's LU factorisation (getrf + getrs) on `stream`, and then
// derives the numerator coefficients on the host.
//
//   a_k, b_k        output coefficient arrays; entries 0..N-1 are written
//                   (b_k[0] = 1, a_k[0] = first sample)
//   maxField        output: largest |sample| seen among the first sample and
//                   the matrix entries
//   local_id        offset of the probe/component inside one sample record
//   stream          CUDA stream used for the copies and the solver
//   currentTimeStep used to derive N = floor((currentTimeStep / tsPerSampling) / 2)
//
// Successive samples of the same series are probeCNT * NumOfFieldComponents
// entries apart in fieldProbes.
//
// NOTE(review): the cusolverDn* return statuses and the device-side `info`
// value are never inspected, so a singular system goes unnoticed.
// NOTE(review): the solver entry points are the double-precision ones, so
// this presumably requires fp_t == double — confirm.
void FemGrp::getPadeCoefCUDA(fp_t* a_k, fp_t* b_k, fp_t* maxField, int local_id, cudaStream_t stream, int currentTimeStep){
int M = currentTimeStep / tsPerSampling;
// M_global and nFields are computed but not used below.
int M_global = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
int N = (int)floor(M / 2.0);
int nFields = padeCNT * NumOfFieldComponents;
a_k[0] = fieldProbes[local_id];
b_k[0] = 1;
*maxField = abs(fieldProbes[local_id]);
// A solver handle is created per call and bound to the caller's stream.
cusolverDnHandle_t handle;
cusolverDnCreate(&handle);
cusolverDnSetStream(handle, stream);
// Order of the linear system.
int n = N-1;
// int n = 2;
int nrhs = 1;
fp_t* G_h;
fp_t* d_h;
fp_t* G_d;
fp_t* d_d;
// Pinned host staging buffers for the matrix and the right-hand side.
CUDA_SAFE_CALL(cudaMallocHost((void**)&G_h, n * n * sizeof(fp_t), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&d_h, n * sizeof(fp_t), cudaHostAllocMapped));
for(int i = 0; i < n; i++){
for(int j = 0; j < n; j++){
// Entry (row i, column j) holds sample N - j + i, stored at j * n + i.
G_h[j * n + i] = fieldProbes[(N - j + i) * probeCNT * NumOfFieldComponents + local_id];
*maxField = max(abs(fieldProbes[(N - j + i) * probeCNT * NumOfFieldComponents + local_id]), *maxField);
}
// Right-hand side: negated samples N+1 .. N+n.
d_h[i] = -fieldProbes[(N + i + 1) * probeCNT * NumOfFieldComponents + local_id];
}
//Copy matrices
CUDA_SAFE_CALL(cudaMalloc((void**)&G_d, n * n * sizeof(fp_t)));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_d, n * sizeof(fp_t)));
CUDA_SAFE_CALL(cudaMemcpyAsync(G_d, G_h, n * n * sizeof(fp_t), cudaMemcpyHostToDevice, stream));
CUDA_SAFE_CALL(cudaMemcpyAsync(d_d, d_h, n * sizeof(fp_t), cudaMemcpyHostToDevice, stream));
//Calculate buffer
int bufferSize;
cusolverDnDgetrf_bufferSize(handle, n, n, G_d, n, &bufferSize);
//Initialize variables
int* info;
CUDA_SAFE_CALL(cudaMalloc((void**)&info, sizeof(int)));
fp_t* buffer; // workspace for gesv
CUDA_SAFE_CALL(cudaMalloc((void**)&buffer, bufferSize * sizeof(fp_t)));
int *ipiv = NULL; // pivoting sequence
CUDA_SAFE_CALL(cudaMalloc((void**)&ipiv, n * sizeof(int)));
//Solve problem
// LU-factorise G in place, then solve G * x = d; the solution overwrites d_d.
cusolverDnDgetrf(handle, n, n, G_d, n, buffer, ipiv, info);
cusolverDnDgetrs(handle, CUBLAS_OP_N, n, nrhs, G_d, n, ipiv, d_d, n, info);
//Copy data back to CPU
CUDA_SAFE_CALL(cudaMemcpyAsync(d_h, d_d, n * sizeof(fp_t), cudaMemcpyDeviceToHost, stream));
//Free GPU
// NOTE(review): the device buffers are released before the explicit stream
// synchronisation below; this presumably relies on cudaFree waiting for
// outstanding work — confirm against the CUDA runtime documentation.
CUDA_SAFE_CALL(cudaFree(G_d));
CUDA_SAFE_CALL(cudaFree(d_d));
CUDA_SAFE_CALL(cudaFree(buffer));
CUDA_SAFE_CALL(cudaFree(info));
CUDA_SAFE_CALL(cudaFree(ipiv));
CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
cusolverDnDestroy(handle);
CUDA_SAFE_CALL(cudaFreeHost(G_h));
// Denominator coefficients come straight from the solution; each numerator
// coefficient is the sum of b_k[j] times sample (i + 1 - j) for j = 0..i.
for(int i = 0; i < n; i++){
b_k[i + 1] = d_h[i];
a_k[i + 1] = 0.0;
for(int j = 0; j < i + 1; j++){
a_k[i + 1] += b_k[j] * fieldProbes[(i + 1 - j) * probeCNT * NumOfFieldComponents + local_id];
}
}
CUDA_SAFE_CALL(cudaFreeHost(d_h));
}
/**
 * Evaluates the Pade rational function on the GPU for all frequency samples.
 *
 * Copies the N numerator / denominator coefficients to the device, launches
 * CalculatePadeFreq on `stream` with 256-thread blocks covering M_global
 * samples, and copies the M_global complex results back into H_f.
 *
 * @param a_k       host numerator coefficients (N entries)
 * @param b_k       host denominator coefficients (N entries)
 * @param M_global  number of frequency samples
 * @param N         number of coefficients
 * @param H_f       host output buffer (M_global entries)
 * @param stream    stream used for copies and the kernel; it is synchronised
 *                  before this function returns, so a_k, b_k and H_f may be
 *                  reused or read by the caller immediately afterwards
 *
 * Changes: the kernel launch is now checked with cudaGetLastError(), and
 * the stream is synchronised before the device buffers are released so the
 * ordering no longer depends on cudaFree's implicit synchronisation.
 */
void FemGrp::getFreqDomainPadeCUDA(fp_t* a_k, fp_t* b_k, int M_global, int N, cuDoubleComplex* H_f, cudaStream_t stream){
    fp_t* a_k_d;
    fp_t* b_k_d;
    cuDoubleComplex* H_f_d;

    CUDA_SAFE_CALL(cudaMalloc((void**)&a_k_d, N * sizeof(fp_t)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&b_k_d, N * sizeof(fp_t)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&H_f_d, M_global * sizeof(cuDoubleComplex)));

    CUDA_SAFE_CALL(cudaMemcpyAsync(a_k_d, a_k, N * sizeof(fp_t), cudaMemcpyHostToDevice, stream));
    CUDA_SAFE_CALL(cudaMemcpyAsync(b_k_d, b_k, N * sizeof(fp_t), cudaMemcpyHostToDevice, stream));

    dim3 blockDim(256, 1, 1);
    dim3 gridDim(ceil_div(M_global, 256), 1, 1);
    CalculatePadeFreq<<<gridDim, blockDim, 0, stream>>>(a_k_d, b_k_d, M_global, N, padeFreqConstant_d, H_f_d);
    // Launch-configuration errors are only reported through the last-error state.
    CUDA_SAFE_CALL(cudaGetLastError());

    CUDA_SAFE_CALL(cudaMemcpyAsync(H_f, H_f_d, M_global * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost, stream));
    // Wait for the copy-back before releasing anything the stream still uses.
    CUDA_SAFE_CALL(cudaStreamSynchronize(stream));

    CUDA_SAFE_CALL(cudaFree(a_k_d));
    CUDA_SAFE_CALL(cudaFree(b_k_d));
    CUDA_SAFE_CALL(cudaFree(H_f_d));
}
/**
 * Converts one probe's Pade frequency response into a time series and
 * writes it to "./PROBES/TD_Pade_<fname>_Probe_<probe>.csv".
 *
 * For every field component the response is divided by the source spectrum
 * (bins whose source magnitude is below SourceTolerancePade are zeroed),
 * scaled by 1 / M_global, inverse-transformed with an FFTW c2r plan and
 * convolved with the first tsSource + 1 samples of the source signal.
 *
 * @param probe         probe index (used for the file name only)
 * @param fDomainField  per-probe response; component c occupies
 *                      [c * M_global, (c + 1) * M_global)
 *
 * Changes: the file name is formatted with snprintf (bounded), a failed
 * file open is reported instead of being silently ignored, and the division
 * by the source spectrum is skipped for bins that are zeroed anyway.
 *
 * NOTE(review): the response is routed through std::complex<float> before
 * the division, which presumably drops precision relative to the double
 * input — confirm this is intended.
 */
void FemGrp::getPadeIFFT(int probe, cuDoubleComplex* fDomainField){
    int M_global = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
    double* tDomainField = new double[M_global];
    double* tDomainFieldOutput = new double[M_global * NumOfFieldComponents];
    const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};

    for(int component = 0; component < NumOfFieldComponents; component++){
        fftw_complex* fft;
        fftw_plan ifft;
        fft = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * M_global);

        #pragma omp parallel for
        for(int k = 0; k < M_global; k++){
            if(abs(sourceFreqDomain[k]) < SourceTolerancePade){
                // Source has no usable energy in this bin: drop it.
                fft[k][0] = 0.0;
                fft[k][1] = 0.0;
            }else{
                cuDoubleComplex field = fDomainField[component * M_global + k];
                Complex aux = (std::complex<float>(field.x, field.y) / sourceFreqDomain[k]) / M_global;
                fft[k][0] = aux.real();
                fft[k][1] = aux.imag();
            }
        }

        ifft = fftw_plan_dft_c2r_1d(M_global, fft, tDomainField, FFTW_ESTIMATE);
        fftw_execute(ifft);
        fftw_destroy_plan(ifft);
        fftw_free(fft);

        #pragma omp parallel for
        for (int i = 0; i < M_global; i++) {
            tDomainFieldOutput[component * M_global + i] = 0.0;
            for (int j = 0; j <= min(i, tsSource); j++) {
                tDomainFieldOutput[component * M_global + i] += tDomainField[i - j] * sourceTimeDomain[j]; // Main convolution operation
            }
        }
    }

    char csvFileName[StrOutput];
    snprintf(csvFileName, sizeof(csvFileName), "./PROBES/TD_Pade_%s_Probe_%d.csv", fname, probe);
    std::ofstream csvFile(csvFileName);
    if(!csvFile){
        std::cerr << "Error in opening file: " << csvFileName << " for write " << std::endl;
    }else{
        csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";
        for(int n = 0; n < M_global; n++){
            for(int component = 0; component < NumOfFieldComponents; component++){
                if (component > 0){
                    csvFile << ",";
                }
                csvFile << std::setprecision(max_precision) << tDomainFieldOutput[component * M_global + n];
            }
            csvFile << "\n";
        }
        usleep(100);
        csvFile.close();
    }

    delete [] tDomainField;
    delete [] tDomainFieldOutput;
}
void FemGrp::printFD(int probe, cuDoubleComplex* fDomainField){
int M_global = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};
char csvFileName[StrOutput];
sprintf(csvFileName, "./PROBES/FD_Pade_%s_Probe_%d.csv", fname, probe);
std::ofstream csvFile(csvFileName);
csvFile << "ExRe" << "," << "ExIm" << "," << "EyRe" << "," << "EyIm" << "," << "EzRe" << "," << "EzIm" << "," << "HxRe" << "," << "HxIm" << "," << "HyRe" << "," << "HyIm" << "," << "HzRe" << "," << "HzIm" << "\n";
for(int n = 0; n < M_global; n++){
for(int component = 0; component < NumOfFieldComponents; component++){
if (component > 0){
csvFile << ",";
}
csvFile << std::setprecision(max_precision) << fDomainField[component * M_global + n].x << "," << fDomainField[component * M_global + n].y;
}
csvFile << "\n";
}
}
void FemGrp::testEnd(){
int ts = 0;
char tname[StrLenShort];
int totalSamples = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
try {
while(1){
sprintf(tname, "PROBES_aux/Probes_%s_%04i.csv", fname, ts * tsPerSampling);
cout << tname << endl;
rapidcsv::Document probe_doc(tname);
std::vector<double> Ex_col = probe_doc.GetColumn<double>("Ex");
std::vector<double> Ey_col = probe_doc.GetColumn<double>("Ey");
std::vector<double> Ez_col = probe_doc.GetColumn<double>("Ez");
std::vector<double> Hx_col = probe_doc.GetColumn<double>("Hx");
std::vector<double> Hy_col = probe_doc.GetColumn<double>("Hy");
std::vector<double> Hz_col = probe_doc.GetColumn<double>("Hz");
for(int i = 0; i < Ey_col.size(); i++){
// fieldProbes[i * totalSamples * NumOfFieldComponents + ts * NumOfFieldComponents + 0] = Ex_col[i];
// fieldProbes[i * totalSamples * NumOfUnitaryVectors * TypeO#pragma omp parallel forfFields + ts * NumOfFieldComponents + 5] = Hz_col[i];
// cout << ts * NumOfFieldComponents * Ey_col.size() + i * NumOfFieldComponents + 0 << endl;
fieldProbes[ts * NumOfFieldComponents * Ey_col.size() + i * NumOfFieldComponents + 0] = Ex_col[i];
fieldProbes[ts * NumOfFieldComponents * Ey_col.size() + i * NumOfFieldComponents + 1] = Ey_col[i];
fieldProbes[ts * NumOfFieldComponents * Ey_col.size() + i * NumOfFieldComponents + 2] = Ez_col[i];
fieldProbes[ts * NumOfFieldComponents * Ey_col.size() + i * NumOfFieldComponents + 3] = Hx_col[i];
fieldProbes[ts * NumOfFieldComponents * Ey_col.size() + i * NumOfFieldComponents + 4] = Hy_col[i];
fieldProbes[ts * NumOfFieldComponents * Ey_col.size() + i * NumOfFieldComponents + 5] = Hz_col[i];
}
ts++;
}
}catch(...){
calculatePadeCUDA(ts * tsPerSampling, false, true);
// for(int i = ts/8 ; i <= ts; i += ts/8){
// cout << i << " " << (ts+1)/8 << " " << ts << " " << (i == (ts/8)) << " " << (i == 8 * (ts/8)) << endl;
// cout << calculatePadeCUDA(i * tsPerSampling, i == ts/8, i == 8 * (ts/8)) << endl;
// }
return;
}
}
#endif
/**
 * Fits the Pade coefficients of one field component on the CPU.
 *
 * Builds an (N-1) x (N-1) system from the samples of `field`, solves it with
 * solveAx_B, stores the solution as b_k[1..N-1] (b_k[0] = 1) and computes
 * each a_k[k+1] as the sum of b_k[m] * sample(k + 1 - m) for m = 0..k.
 *
 * @param a_k        output numerator coefficients (entries 0..N-1 written)
 * @param b_k        output denominator coefficients (entries 0..N-1 written)
 * @param field      interleaved samples; sample s of this component is at
 *                   field[s * NumOfFieldComponents + component]
 * @param N          number of coefficients
 * @param component  field component index
 * @param maxField   output: largest |sample| among the first sample and the
 *                   matrix entries
 *
 * Changes (both bring this in line with getPadeCoefCUDA):
 *  - a_k[k + 1] is zeroed before accumulation instead of relying on the
 *    caller to pass a zero-filled array;
 *  - the running maximum starts from |field[component]| rather than the
 *    signed value.
 *
 * NOTE(review): G is allocated with `new` and only Clear()-ed, never
 * deleted; whether `delete G` is safe after Clear() depends on denseMat's
 * destructor, which is not visible here.
 */
void FemGrp::getPadeCoef(fp_t* a_k, fp_t* b_k, fp_t_ts* field, int N, int component, fp_t_ts* maxField){
    denseMat<fp_t>* G = new denseMat<fp_t>(N-1, N-1);
    ArrayFP<fp_t> d(N-1);
    a_k[0] = field[component];
    b_k[0] = 1;
    *maxField = abs(field[component]);

    timer_start("getPadeCoef " + std::to_string(1) + ": ",'m');
    for(int k = 0; k < N-1; k++){
        for(int m = 0; m < N-1; m++){
            G->setEntry(k,m, field[(N - m + k) * NumOfFieldComponents + component]); //it has to be in column form
            *maxField = max(abs(field[(N - m + k) * NumOfFieldComponents + component]), *maxField);
        }
        d[k] = -field[(N + k + 1) * NumOfFieldComponents + component];
    }
    timer_stop('m');

    timer_start("getPadeCoef " + std::to_string(2) + ": ",'m');
    G->SelfTranspose();
    timer_stop('m');

    timer_start("getPadeCoef " + std::to_string(3) + ": ",'m');
    solveAx_B(*G, d);
    timer_stop('m');

    timer_start("getPadeCoef " + std::to_string(4) + ": ",'m');
    for(int k = 0; k < N-1; k++){
        b_k[k + 1] = d[k];
        a_k[k + 1] = 0.0;
        for(int m = 0; m < k + 1; m++){
            a_k[k + 1] += b_k[m] * field[(k + 1 - m) * NumOfFieldComponents + component];
        }
    }
    timer_stop('m');

    G->Clear();
    for(int i = 0; i<N; i++){
        cout << a_k[i] << " " << b_k[i] << endl;
    }
}
/**
 * Prepares the source signal tables used by the Pade post-processing.
 *
 * Fills sourceTimeDomain with N samples of the excitation (one every
 * tsPerSampling time steps), records in tsSource the last sample index
 * whose magnitude exceeds SourceTolerancePade, computes the source spectrum
 * with an FFTW real-to-complex transform into sourceFreqDomain, and fills
 * padeFreqConstant with the signed frequency-bin index of every sample.
 *
 * @param N              number of samples
 * @param tsPerSampling  time steps between consecutive samples (shadows the
 *                       member of the same name)
 *
 * Fixes:
 *  - tsSource was assigned from inside the parallel loop by several threads
 *    at once, so its final value depended on thread timing; it is now
 *    obtained with a max reduction. It keeps its previous value when no
 *    sample exceeds the tolerance, as a serial run of the old loop would.
 *  - a real-to-complex FFTW transform only produces the first N/2 + 1 bins;
 *    the remaining entries of the output buffer are never written. The
 *    upper half of sourceFreqDomain is now filled from the Hermitian
 *    symmetry X[N - i] = conj(X[i]) instead of from uninitialised memory.
 *
 * NOTE(review): padeFreqConstant is only allocated when DGTD_USE_CUDA is
 * defined but is written unconditionally below — confirm how it is
 * allocated in non-CUDA builds.
 */
void FemGrp::getPadeFreq(int N, int tsPerSampling){
    #if defined(DGTD_USE_CUDA)
    CUDA_SAFE_CALL(cudaMallocHost((void**)&padeFreqConstant, N * sizeof(int), cudaHostAllocMapped));
    #endif
    sourceFreqDomain = new Complex[N];
    sourceTimeDomain = new fp_t[N];

    int lastActiveSample = -1;
    #pragma omp parallel for reduction(max:lastActiveSample)
    for(int i = 0; i < N; i++){
        getSourceTimeDomain(i * tsPerSampling, &sourceTimeDomain[i], ExcitFlag);
        if(abs(sourceTimeDomain[i]) > SourceTolerancePade && i > lastActiveSample){
            lastActiveSample = i;
        }
    }
    if(lastActiveSample >= 0){
        tsSource = lastActiveSample;
    }

    // Last non-negative frequency bin.
    int finish = N % 2 == 0 ? N / 2 - 1 : (N - 1) / 2;
    // Last bin actually produced by the real-to-complex transform.
    const int lastComputedBin = N / 2;

    fftw_complex* fftOut;
    fftOut = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
    fftw_plan fft;
    fft = fftw_plan_dft_r2c_1d(N, sourceTimeDomain, fftOut, FFTW_ESTIMATE);
    fftw_execute(fft);
    fftw_destroy_plan(fft);

    #pragma omp parallel for
    for (int i = 0; i < N; ++i) {
        if (i <= lastComputedBin) {
            sourceFreqDomain[i] = std::complex<fp_t>(fftOut[i][0], fftOut[i][1]);
        } else {
            // Mirror bin: complex conjugate of bin N - i.
            sourceFreqDomain[i] = std::complex<fp_t>(fftOut[N - i][0], -fftOut[N - i][1]);
        }
        if (i <= finish) {
            padeFreqConstant[i] = i;
        } else {
            padeFreqConstant[i] = -N + i;
        }
    }
    fftw_free(fftOut);
}
/**
 * Evaluates the excitation signal at time (timeStep + 1.0) * dt, where dt is
 * LocTimeSteps[N_class - 1].
 *
 * @param timeStep   time-step index
 * @param Einc       output sample; left untouched for unknown ExcitFlag
 * @param ExcitFlag  0: cos(omega * t);
 *                   1 or 2: Gaussian exp(-(t - To)^2 / Tau^2), multiplied by
 *                   cos(omega * (t - To)) when ModuleFlag is set
 */
void FemGrp::getSourceTimeDomain(int timeStep, fp_t* Einc, int ExcitFlag){
    const fp_t dt = LocTimeSteps[N_class - 1];
    const fp_t omega = 2.0 * Pi * freq * MEGA;
    const fp_t to = To;
    const fp_t tau = Tau;

    if(ExcitFlag == 0){
        *Einc = static_cast<fp_t>(cos(omega * (timeStep + 1.0) * dt));
        return;
    }
    if(ExcitFlag == 1 || ExcitFlag == 2){
        // Both pulse flags currently share the same expression.
        const fp_t Exponent = (timeStep + 1.0) * dt - to;
        const fp_t SinModul = ModuleFlag ? cos(omega * Exponent) : 1.0;
        *Einc = static_cast<fp_t>(SinModul * exp(- (Exponent * Exponent) / (tau * tau)));
    }
}
/**
 * Evaluates the Pade rational function sum(a_k * f^k) / sum(b_k * f^k) at
 * every entry of padeFreqs and stores it in H_f at
 * i * NumOfFieldComponents + component.
 *
 * The file "Pade_Freq_1_<N>_<probe>_<component>.csv" is opened on every
 * call. When firstValue is true the magnitude of each evaluated sample is
 * written to it and 0.0 is returned. Otherwise nothing is written to the
 * file and the Pearson correlation between the magnitudes previously held
 * in H_f and the newly evaluated ones is returned.
 *
 * @param a_k, b_k    numerator / denominator coefficients (N entries each)
 * @param M_global    number of frequency samples
 * @param N           number of coefficients
 * @param H_f         in/out frequency response buffer
 * @param component   field component index
 * @param probe       probe index (file name only)
 * @param firstValue  true to only evaluate and dump the response
 */
fp_t FemGrp::getFreqDomainPade(fp_t* a_k, fp_t* b_k, int M_global, int N, Complex* H_f, int component, int probe, bool firstValue){
    const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};
    char csvFileName[StrOutput];
    sprintf(csvFileName, "Pade_Freq_1_%d_%d_%d.csv", N, probe, component);
    std::ofstream csvFile(csvFileName);

    // Rational function value at frequency sample `sample`.
    auto evaluateAt = [&](int sample) -> Complex {
        Complex numerator = 0;
        Complex denominator = 0;
        for(int order = 0; order < N; order++){
            numerator += a_k[order] * pow(padeFreqs[sample], order);
            denominator += b_k[order] * pow(padeFreqs[sample], order);
        }
        return numerator / denominator;
    };

    if(firstValue){
        for(int i = 0; i < M_global; i++){
            Complex freqVal = evaluateAt(i);
            csvFile << std::setprecision(max_precision) << sqrt(pow(freqVal.real(),2)+pow(freqVal.imag(),2));
            csvFile << "\n";
            H_f[i * NumOfFieldComponents + component] = freqVal;
        }
        cout << "First/Final Pade Calculation" << endl;
        csvFile.close();
        return 0.0;
    }

    // Running sums for the correlation (X: new magnitudes, Y: stored magnitudes).
    fp_t sum_X = 0.0, sum_Y = 0.0, sum_XY = 0.0, sum_XX = 0.0, sum_YY = 0.0;
    for(int i = 0; i < M_global; i++){
        const int slot = i * NumOfFieldComponents + component;
        Complex freqVal = evaluateAt(i);
        // Magnitude of the stored response, taken before it is overwritten.
        fp_t lastYf_abs = sqrt(H_f[slot].real() * H_f[slot].real() + H_f[slot].imag() * H_f[slot].imag());
        H_f[slot] = freqVal;
        fp_t currentYf_abs = sqrt(pow(freqVal.real(),2)+pow(freqVal.imag(),2));
        sum_X = sum_X + currentYf_abs;
        sum_Y = sum_Y + lastYf_abs;
        sum_XY = sum_XY + currentYf_abs * lastYf_abs;
        sum_XX = sum_XX + currentYf_abs * currentYf_abs;
        sum_YY = sum_YY + lastYf_abs * lastYf_abs;
    }
    fp_t corr = (M_global * sum_XY - sum_X * sum_Y) / sqrt((M_global * sum_XX - sum_X * sum_X) * (M_global * sum_YY - sum_Y * sum_Y));
    return corr;
}
void FemGrp::getPadeIFFTEnd(int probe, Complex* fDomainField){
int M_global = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
Complex j = Complex (0.0, 1.0);
fp_t* tDomainField = new fp_t[M_global * NumOfFieldComponents];
const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};
for(int component = 0; component < NumOfFieldComponents; component++){
Complex* tDomainTransferFunction = new Complex[M_global];
#pragma omp parallel for
for(int n = 0; n < M_global; n++){
tDomainTransferFunction[n] = 0.0;
for(int k = 0; k < M_global; k++){
tDomainTransferFunction[n] += abs(sourceFreqDomain[k]) < SourceTolerancePade ? 0.0 : fDomainField[k * NumOfFieldComponents + component] / sourceFreqDomain[k] * exp(j * 2 * Pi * n * k / M_global);
}
tDomainTransferFunction[n] /= M_global;
}
#pragma omp parallel for
for(int n = 0; n < M_global; n++){
tDomainField[n * NumOfFieldComponents + component] = 0.0;
for(int k = 0; k <= n; k++){
tDomainField[n * NumOfFieldComponents + component] += tDomainTransferFunction[n-k].real() * sourceTimeDomain[k];
}
}
delete [] tDomainTransferFunction;
}
char csvFileName[StrOutput];
sprintf(csvFileName, "Pade_%s_Probe_%d.csv", fname, probe);
std::ofstream csvFile(csvFileName);
csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";
for(int n = 0; n < M_global; n++){
for(int component = 0; component < NumOfFieldComponents; component++){
if (component > 0){
csvFile << ",";
}
csvFile << std::setprecision(max_precision) << tDomainField[n * NumOfFieldComponents + component];
}
csvFile << "\n";
}
usleep(100);
csvFile.close();
delete [] tDomainField;
}
/**
 * Converts one probe's Pade frequency response into a time series with
 * FFTW and writes it to "Pade_<fname>_Probe_<probe>.csv".
 *
 * For every component the response is divided by the source spectrum (bins
 * with source magnitude below SourceTolerancePade are zeroed), scaled by
 * 1 / M_global, inverse-transformed with a c2r plan and convolved with the
 * first tsSource + 1 samples of sourceTimeDomain.
 *
 * @param probe         probe index; used for the file name AND as an offset
 *                      into fDomainField
 * @param fDomainField  base of a buffer holding all probes: sample k of
 *                      component c of this probe is read from
 *                      probe * M * NumOfFieldComponents + c * M + k
 *
 * Changes: removed the leftover "hello" debug print, bounded the file-name
 * formatting with snprintf, report a failed file open, and skip the
 * division for bins that are zeroed anyway.
 *
 * NOTE(review): calculatePadeEnd passes a buffer that holds a single probe
 * and is filled with a sample-major layout (k * NumOfFieldComponents + c),
 * which does not match the indexing used here — confirm the intended layout.
 */
void FemGrp::getPadeIFFT(int probe, Complex* fDomainField){
    int M_global = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
    double* tDomainField = new double[M_global * NumOfFieldComponents];
    double* tDomainFieldOutput = new double[M_global * NumOfFieldComponents];
    const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};

    for(int component = 0; component < NumOfFieldComponents; component++){
        fftw_complex* fft;
        fftw_plan ifft;
        fft = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * M_global);

        #pragma omp parallel for
        for(int k = 0; k < M_global; k++){
            if(abs(sourceFreqDomain[k]) < SourceTolerancePade){
                // Source has no usable energy in this bin: drop it.
                fft[k][0] = 0.0;
                fft[k][1] = 0.0;
            }else{
                Complex aux = (fDomainField[probe * M_global * NumOfFieldComponents + component * M_global + k] / sourceFreqDomain[k]) / M_global;
                fft[k][0] = aux.real();
                fft[k][1] = aux.imag();
            }
        }

        double* tDomainFieldVec = &tDomainField[M_global * component];
        ifft = fftw_plan_dft_c2r_1d(M_global, fft, tDomainFieldVec, FFTW_ESTIMATE);
        fftw_execute(ifft);
        fftw_destroy_plan(ifft);
        fftw_free(fft);

        #pragma omp parallel for
        for (int i = 0; i < M_global; i++) {
            tDomainFieldOutput[component * M_global + i] = 0.0;
            for (int j = 0; j <= min(i, tsSource); j++) {
                tDomainFieldOutput[component * M_global + i] += tDomainField[component * M_global + i - j] * sourceTimeDomain[j]; // Main convolution operation
            }
        }
    }

    char csvFileName[StrOutput];
    snprintf(csvFileName, sizeof(csvFileName), "Pade_%s_Probe_%d.csv", fname, probe);
    std::ofstream csvFile(csvFileName);
    if(!csvFile){
        std::cerr << "Error in opening file: " << csvFileName << " for write " << std::endl;
    }else{
        csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";
        for(int n = 0; n < M_global; n++){
            for(int component = 0; component < NumOfFieldComponents; component++){
                if (component > 0){
                    csvFile << ",";
                }
                csvFile << std::setprecision(max_precision) << tDomainFieldOutput[component * M_global + n];
            }
            csvFile << "\n";
        }
        usleep(100);
        csvFile.close();
    }

    delete [] tDomainField;
    delete [] tDomainFieldOutput;
}
/**
 * Fills the simplex coordinates and weights of a tetrahedral quadrature rule.
 *
 * @param PolyOrder  1: one point at the centroid (weight 1);
 *                   2: four symmetric points (weight 0.25 each);
 *                   3: centroid (weight -0.8) plus four symmetric points
 *                      (weight 0.45 each);
 *                   any other value leaves zeta and weights untouched
 * @param points     not read or modified here; the caller must have sized
 *                   zeta / weights for the rule (1, 4 or 5 rows)
 * @param zeta       output: zeta[p][0..3] are the simplex coordinates of
 *                   point p
 * @param weights    output: weight of each point
 *
 * Fix: the order-1 rule assigned zeta[0][0] three times and left
 * zeta[0][1..3] unset; all four centroid coordinates are now written.
 */
void FemGrp::GetTetQuadRule(int PolyOrder, int& points, fp_t** zeta, fp_t* weights){
    // Fills row `row` with `offDiag` everywhere except `diag` at position `peak`.
    auto fillSymmetricPoint = [zeta](int row, int peak, fp_t diag, fp_t offDiag){
        for(int c = 0; c < 4; c++)
            zeta[row][c] = (c == peak) ? diag : offDiag;
    };

    if(PolyOrder == 1){
        for(int c = 0; c < 4; c++)
            zeta[0][c] = 0.25;
        weights[0] = 1.0;
    }else if(PolyOrder == 2){
        for(int p = 0; p < 4; p++){
            fillSymmetricPoint(p, p, 0.585410196624969, 0.138196601125011);
            weights[p] = 0.250000000000000;
        }
    }else if(PolyOrder == 3){
        for(int c = 0; c < 4; c++)
            zeta[0][c] = 0.250000000000000;
        weights[0] = -0.800000000000000;
        for(int p = 1; p <= 4; p++){
            fillSymmetricPoint(p, p - 1, 0.500000000000000, 0.166666666666667);
            weights[p] = 0.450000000000000;
        }
    }
}
/**
 * Gather the local E and H expansion coefficients of one tetrahedron.
 *
 * The element's local-to-global maps are queried through Local_DG_mapE /
 * Local_DG_mapH, and the matching entries of en_1 / hn_32 are copied into the
 * caller-provided arrays. A negative map entry yields a coefficient of 0.
 *
 * @param tet        element whose coefficients are requested
 * @param origEn_1   output, reset and then filled with the local E coefficients
 * @param origHn_32  output, reset and then filled with the local H coefficients
 */
void FemGrp::Get_Coefficients_(tetra* tet, ArrayFP<fp_t>* origEn_1, ArrayFP<fp_t>* origHn_32){
    const int localDim = TetPolyOrderDim[tet->PolyOrderFlag];

    // Scratch maps owned by std::vector so they are released on every exit
    // path (the previous raw new[] buffers were never deleted).
    std::vector<int> tetraMAP_E(localDim);
    std::vector<int> tetraMAP_H(localDim);
    tet->Local_DG_mapE(tetraMAP_E.data(), tet->LocalOffsetE);
    tet->Local_DG_mapH(tetraMAP_H.data(), tet->LocalOffsetH);

    origEn_1->reset();
    origHn_32->reset();
    for(int i = 0; i < localDim; i++){
        origEn_1->setentry(i, tetraMAP_E[i] < 0 ? 0.0 : en_1->getentry(tetraMAP_E[i]));
        origHn_32->setentry(i, tetraMAP_H[i] < 0 ? 0.0 : hn_32->getentry(tetraMAP_H[i]));
    }
}
void FemGrp::numberDofs(){
tetra* tet = 0;
int LocalDim = TetPolyOrderDim[PolyFlag];
int *tetraEMap = 0;
int *tetraHMap = 0;
int EdofOffset = 0;//[E H] offset
int HdofOffset = DimE;
for(int i = 0; i < tetraCNT; i++){
tet = &(tetARRAY[i]);
tet->allocDofMap();
tetraEMap = tet->get_LocalEMap(); // obtained from SetupMatrixFree
tetraHMap = tet->get_LocalHMap();
for(int j = 0; j < LocalDim; j++){
//in case there is -1
tet->setEHGlobalMap(j,
(tetraEMap[j] != NOT_NUMBERED) ? (tetraEMap[j] + EdofOffset) : (tetraEMap[j]),
(tetraHMap[j] != NOT_NUMBERED) ? (tetraHMap[j] + HdofOffset) : (tetraHMap[j]));
}
}
size_t matrixDIM_com = dimE + dimH;
cout << " " << endl;
cout << "==============================================" << endl;
cout << " NUMBER OF DEGREES OF FREEDOM " << endl;
cout << "==============================================" << endl;
cout << " Global Number of dof is " << matrixDIM_com << endl;
cout << " Global Matrix dim is (w/o compress) " << tetraCNT * LocalDim * 2 << endl;
cout << "==============================================" << endl;
cout << " " << endl;
}
// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //
// 0000000000000000000000000000000000000 Port Meshes 00000000000000000000000000000000000000000000 //
// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //
/*
void FemGrp::makePortMeshes()
{
int i, j;
if(portCNT == 0)
return;
map<int,int> PortMap, PortMapRes;
set<int> PortSet;
set<int>::iterator it;
int DGface_bc;
for(int idx = 0; idx < tetraCNT; idx++){
for(j = 0 ; j < NumOfFaces ; j++){
DGface_bc = tetARRAY[idx].fc[j]->getbType();
if(DGface_bc >= portType && DGface_bc < pecType)
PortSet.insert(DGface_bc);
}
}
LocPortCnt = (int)PortSet.size();
cout << "PortSet.size = " << (int)PortSet.size() << endl;
cout << "portCNT = " << portCNT << endl;
cout << "LocPortCnt = " << LocPortCnt << endl;
portCNT = LocPortCnt;
cout << "portCNT = " << portCNT << endl;
cout << "LocPortCnt = " << LocPortCnt << endl;
if(LocPortCnt == 0)
return;
for(it = PortSet.begin(); it != PortSet.end(); it++)
cout << "Port_type:" << *it << endl;
int counter = 0;
for(it = PortSet.begin(); it != PortSet.end(); it++){
PortMap[*it] = counter;
PortMapRes[counter] = *it;
counter++;
}
pMeshARRAY = new portMesh[LocPortCnt];
// count the port faces (portFaceNums)
// get pointers to port faces (portFaceLists)
// keep set of unique global node ids for faces (portNodeIds)
int* portFaceNums = new int[LocPortCnt];
list<face*>* portFaceLists = new list<face*>[LocPortCnt];
set<int>* portNodeIds = new set<int>[LocPortCnt];
memset(portFaceNums, 0, portCNT * sizeof(int));
for(i = 0; i < faceCNT; i++){
int bType = faceARRAY[i]->getbType();
if((bType >= portType) && (bType != pecType)){
int portNum = PortMap.find(bType)->second;
(portFaceNums[portNum])++; // increment the face count
portFaceLists[portNum].push_back(faceARRAY[i]); // add face pointer
// add unique node ids
for(j = 0; j < NumOfNodesPerFace; j++)
portNodeIds[portNum].insert(faceARRAY[i]->getNode(j)->getid());
}
}
for(i = 0; i < LocPortCnt; i++){
portMesh& portmesh = pMeshARRAY[i];
// set port name, magnitude and impedance
for(j = 0; j < bcCNT; j++){
if(bcARRAY[j].getbType() == PortMapRes[i]){
portmesh.setName(bcARRAY[j].getName());
cout<<"This is " << portmesh.getName() << endl;
portmesh.setMagE(bcARRAY[j].getMagE());
portmesh.setImpZ(bcARRAY[j].getCval());
break;
}
}
// allocate and add face pointers to array
int faceNum = portFaceNums[i];
portmesh.setFaceCnt(faceNum);
if(faceNum > 0){
face** portFaceArray = portmesh.getFaceArray();
list<face*>::iterator faceListIter = portFaceLists[i].begin();
for(j = 0; j < faceNum; j++){
portFaceArray[j] = *faceListIter;
faceListIter++;
}
// allocate and add node pointers to array
// keep local mapping
int nodeNum = portNodeIds[i].size();
portmesh.setNodeCnt(nodeNum);
portmesh.allocGlobToLocMap();
node** portNodeArray = portmesh.getNodeArray();
map<int, int>& globToLocMap = portmesh.getGlobToLocMap();
set<int>::iterator portNodeIdIter;
int nodeCount = 0;
for(portNodeIdIter = portNodeIds[i].begin(); portNodeIdIter != portNodeIds[i].end(); portNodeIdIter++){
portNodeArray[nodeCount] = &(ndARRAY[*portNodeIdIter]);
globToLocMap[ndARRAY[*portNodeIdIter].getid()] = nodeCount++;
}
// setup the remaining port mesh stuff
scalingLength = 1.0;
portmesh.makeCoordSystem();
portmesh.makeObjMap();
portmesh.readVline(unit);
portmesh.writeMesh(objProp);
cout.setf(ios::scientific);
cout.precision(15);
#if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL)
vtr PortDirection_vtr = portmesh.getPortDirection();
excitationProp.PortDirection[0] = PortDirection_vtr.getx();
excitationProp.PortDirection[1] = PortDirection_vtr.gety();
excitationProp.PortDirection[2] = PortDirection_vtr.getz();
#endif
}
}
delete [] portFaceNums;
delete [] portFaceLists;
delete [] portNodeIds;
}
*/
void FemGrp::makePortMeshes()
{
int i, j;
if (portCNT == 0) return;
LocPortCnt = portCNT;
pMeshARRAY = new portMesh[LocPortCnt];
// Collectors per port
int* portFaceNums = new int[LocPortCnt];
std::list<face*>* portFaceLists = new std::list<face*>[LocPortCnt];
std::set<int>* portNodeIds = new std::set<int>[LocPortCnt];
std::memset(portFaceNums, 0, LocPortCnt * sizeof(int));
// Pass 1: walk faces and collect them by portNum (via bcNumToPnum)
for (i = 0; i < faceCNT; ++i)
{
int bType = faceARRAY[i]->getbType();
if (bType != portType) continue; // only port faces
// pick the valid owning tetra (check hydra pointers BEFORE deref)
tetra* tet = nullptr;
if (faceARRAY[i]->hydra[0] != nullptr)
{
tet = faceARRAY[i]->hydra[0];
}
else if (faceARRAY[i]->hydra[1] != nullptr)
{
tet = faceARRAY[i]->hydra[1];
}
else
{
continue; // no owner; defensive
}
// Find bc_number for THIS face inside its tetra (match same face by pointer)
int bc_number = -1;
for (int k = 0; k < NumOfFaces; ++k)
{
if (tet->fc[k] == faceARRAY[i])
{
bc_number = tet->getbc(k);
break;
}
}
if (bc_number < 0) continue;
int portNum = bcNumToPnum[bc_number]-1;
++portFaceNums[portNum];
portFaceLists[portNum].push_back(faceARRAY[i]);
for (j = 0; j < NumOfNodesPerFace; ++j)
{
portNodeIds[portNum].insert(faceARRAY[i]->getNode(j)->getid());
}
}
// Optional: sanity check
for (int p = 1; p < LocPortCnt+1; ++p)
{
std::cout << "Port " << p
<< " (BCNum=" << pnumToBcNum[p] << ") has "
<< portNodeIds[p-1].size() << " unique nodes and "
<< portFaceNums[p-1] << " faces.\n";
}
// Pass 2: finalize each port mesh
for (int p = 0; p < LocPortCnt; ++p)
{
portMesh& portmesh = pMeshARRAY[p];
// Initialize from bcARRAY using BCNum directly
int bc_number = pnumToBcNum[p+1];
if (bc_number >= 0 && bc_number < bcCNT)
{
auto& rec = bcARRAY[bc_number]; // <-- no bcRec type name
string name = rec.getName();
fp_t magnitudeE = rec.getMagE();
cout << "bc_number = " << bc_number << " name = " << name << " | magE = " << magnitudeE << endl;
portmesh.setName(rec.getName());
portmesh.setMagE(magnitudeE);
portmesh.setImpZ(rec.getCval());
}
// Faces
int faceNum = portFaceNums[p];
portmesh.setFaceCnt(faceNum);
if (faceNum > 0) {
face** portFaceArray = portmesh.getFaceArray();
auto itF = portFaceLists[p].begin();
for (j = 0; j < faceNum; ++j, ++itF) {
portFaceArray[j] = *itF;
}
// Nodes + local map
int nodeNum = static_cast<int>(portNodeIds[p].size());
portmesh.setNodeCnt(nodeNum);
portmesh.allocGlobToLocMap();
node** portNodeArray = portmesh.getNodeArray();
std::map<int,int>& globToLocMap = portmesh.getGlobToLocMap();
int nodeCount = 0;
for (int gid : portNodeIds[p]) {
// If ids aren't dense indices into ndARRAY, replace with your id->index lookup.
portNodeArray[nodeCount] = &(ndARRAY[gid]);
globToLocMap[ ndARRAY[gid].getid() ] = nodeCount++;
}
// Remaining setup
scalingLength = 1.0;
portmesh.makeCoordSystem();
portmesh.makeObjMap();
portmesh.readVline(unit);
portmesh.writeMesh(objProp);
}
}
delete [] portFaceNums;
delete [] portFaceLists;
delete [] portNodeIds;
}
/*
void FemGrp::solveWaveguidePorts()
{
char command[1000];
memset(command, 0, 1000 * sizeof(char));
sprintf(command, "anwg_h1 %s %e 1 \n",pMeshARRAY->portName, freq);
cout<<"=============Running Command:============"<<endl;
cout << command << endl;
system(command);
}
*/
/*
void FemGrp::WriteWaveguidePortFields()
{
// For each port
for(int i = 0; i < portCNT ; i++)
{
portMesh& portmesh = pMeshARRAY[i];
portmesh.writeVtk();
}
}
*/
// Using anwg to solve for the port excitation mode (1st mode)
/**
 * Run the external `anwg_h1` solver once per port to obtain the first port mode.
 *
 * The command line is `anwg_h1 "<port name>" <freq> 1`. The port name is the
 * one stored in pMeshARRAY[i]. A non-zero return code from std::system is
 * reported on stderr and processing continues with the next port.
 */
void FemGrp::solveWaveguidePorts()
{
    for (int i = 0; i < portCNT; ++i)
    {
        const std::string name = pMeshARRAY[i].getName(); // name set from bcARRAY

        char command[1024];
        // Quote the name in case it has spaces; print freq with full precision.
        const int written = std::snprintf(command, sizeof(command),
                                          "anwg_h1 \"%s\" %.16e 1", name.c_str(), freq);

        // A truncated command would reach the shell with a cut-off name and an
        // unbalanced quote, so refuse to run it.
        if (written < 0 || static_cast<size_t>(written) >= sizeof(command))
        {
            std::cerr << "anwg_h1 command for port " << i
                      << " could not be formatted (port name too long?); port skipped\n";
            continue;
        }

        std::cout << "============= Running Command (port " << i << "): =============\n";
        std::cout << command << std::endl;

        const int rc = std::system(command);
        if (rc != 0)
        {
            std::cerr << "anwg_h1 failed for port " << i << " (rc = " << rc << ")\n";
        }
    }
}
void FemGrp::WriteWaveguidePortFields()
{
// For each port
for (int i = 0; i < portCNT; ++i)
{
portMesh& portmesh = pMeshARRAY[i];
std::cout << "Writing VTK for port " << i << " (" << portmesh.getName() << ")\n";
portmesh.writeVtk();
}
}
void FemGrp::AssignPortFieldsInFaces()
{
for(int i = 0 ; i < portCNT ; i++)
{
pMeshARRAY[i].makeRHS_E();
pMeshARRAY[i].makeRHS_H();
}
}
void FemGrp::AssignPortFieldsInFaces_TEM()
{
for (int i = 0; i < portCNT; ++i)
{
const auto& ex = portExcitations[i];
portMesh& pm = pMeshARRAY[i];
pm.makeRHS_TEM(ex.freq_m * 1e6, ex.epr,
ex.vpath[0], ex.vpath[1], ex.vpath[2],
ex.PortDirection[0], ex.PortDirection[1], ex.PortDirection[2]);
}
}
//TODO: make dynamic
void FemGrp::EvaluateSparametersGlobal(int timeStep, fp_t dt, bool isCompact)
{
int i, j, k, m;
int FaceNum;
int Nsample = 102;
int GaussPnt = Nsample - 1;
int IsOnFace;
int tetraMAP_P2[30];
int tetraMAP_P1[12];
int tetraMAP_P0[6];
vtr lvtr[3];
vtr avtr[4];
fp_t vol;
fp_t zeta0, zeta1, zeta2;
fp_t zetaFace[3];
fp_t zeta[4];
fp_t wgt = 1.0;
fp_t EvalueTotal;
fp_t EvalueInc;
fp_t h;
fp_t* VoltEntryInc = new fp_t[portCNT];
fp_t* VoltEntryTotal = new fp_t[portCNT];
vtr Total_E_Local;
vtr Inc_E_Local;
vtr Point;
vtr PortDirection;
vtr Normal;
fp_t area = 0.0;
tetra* tet;
ArrayFP<fp_t>* origEn_1_P2 = new ArrayFP<fp_t>(30);
ArrayFP<fp_t>* origEn_1_P1 = new ArrayFP<fp_t>(12);
ArrayFP<fp_t>* origEn_1_P0 = new ArrayFP<fp_t>(6);
for(i = 0; i < portCNT; i++){
VoltEntryInc[i] = 0.0;
VoltEntryTotal[i] = 0.0;
}
for(i = 0; i < portCNT; i++)
{
vtr VoltLine = pMeshARRAY[i].vline.coord[1] - pMeshARRAY[i].vline.coord[0];
vtr VoltLineUnit = pMeshARRAY[i].vline.coord[1] - pMeshARRAY[i].vline.coord[0];
VoltLineUnit.unitvtr();
h = VoltLine.magnitude() / GaussPnt;
for(k = 0; k < GaussPnt; k++){
Point = pMeshARRAY[i].vline.coord[0] + VoltLineUnit * (k + 0.5) * h;
//cout << "k = " << k << " FCCNT = " << pMeshARRAY[i].faceCNT << endl;
for(j = 0; j < pMeshARRAY[i].faceCNT; j++){
IsOnFace = pMeshARRAY[i].fcArray[j]->PointInFace(Point, zeta0, zeta1, zeta2);
zetaFace[0] = zeta0;
zetaFace[1] = zeta1;
zetaFace[2] = zeta2;
if(IsOnFace == 1)
{
pMeshARRAY[i].fcArray[j]->getAreaNormal(&area, &Normal);
PortDirection = pMeshARRAY[i].fcArray[j]->bcPtr->get_PortDirection();
if(dotP(Normal, PortDirection) < 0.0)
tet = pMeshARRAY[i].fcArray[j]->hydra[0];
else
tet = pMeshARRAY[i].fcArray[j]->hydra[1];
tet->geometry(lvtr, avtr, &vol);
for(m = 0 ; m < 4; m++){
if(pMeshARRAY[i].fcArray[j] == tet->getFacePtr(m))
FaceNum = m;
}
avtr[3].reset();
avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);
// 0th order polynomial
if(tet->PolyOrderFlag == 0){
tet->Local_DG_mapE(tetraMAP_P0, tet->LocalOffsetE);
origEn_1_P0->reset();
for(int Cnt1 = 0 ; Cnt1 < 6 ; Cnt1++){
if(tetraMAP_P0[Cnt1] < 0)
origEn_1_P0->setentry(Cnt1, 0.0);
else
origEn_1_P0->setentry(Cnt1, en_1->getentry(tetraMAP_P0[Cnt1]));
}
}else if(tet->PolyOrderFlag == 1){ // 1st order polynomial
tet->Local_DG_mapE(tetraMAP_P1, tet->LocalOffsetE);
origEn_1_P1->reset();
for(int Cnt2 = 0 ; Cnt2 < 12 ; Cnt2++){
if(tetraMAP_P1[Cnt2] < 0)
origEn_1_P1->setentry(Cnt2, 0.0);
else
origEn_1_P1->setentry(Cnt2, en_1->getentry(tetraMAP_P1[Cnt2]));
}
}else if(tet->PolyOrderFlag == 2){ // 2nd order polynomial
tet->Local_DG_mapE(tetraMAP_P2, tet->LocalOffsetE);
origEn_1_P2->reset();
for(int Cnt2 = 0 ; Cnt2 < 30 ; Cnt2++){
if(tetraMAP_P2[Cnt2] < 0)
origEn_1_P2->setentry(Cnt2, 0.0);
else
origEn_1_P2->setentry(Cnt2, en_1->getentry(tetraMAP_P2[Cnt2]));
}
}
for(m = 0 ; m < 4 ; m++){
zeta[m] = 0.0;
}
zeta[faceMAP[FaceNum][0]] = zetaFace[0];
zeta[faceMAP[FaceNum][1]] = zetaFace[1];
zeta[faceMAP[FaceNum][2]] = zetaFace[2];
// 0th order polynomial
if(tet->PolyOrderFlag == 0){
Total_E_Local = CalcEfield(origEn_1_P0->getEntryPtr(), avtr, vol, zeta, tet->PolyOrderFlag);
pMeshARRAY[i].fcArray[j]->SparEinc((timeStep + 1.0) * dt, Point, Inc_E_Local, zetaFace);
}else if(tet->PolyOrderFlag == 1){// 1st order polynomial
Total_E_Local = CalcEfield(origEn_1_P1->getEntryPtr(), avtr, vol, zeta, tet->PolyOrderFlag);
pMeshARRAY[i].fcArray[j]->SparEinc((timeStep + 1.0) * dt, Point, Inc_E_Local, zetaFace);
}else if(tet->PolyOrderFlag == 2){// 2nd order polynomial
Total_E_Local = CalcEfield(origEn_1_P2->getEntryPtr(), avtr, vol, zeta, tet->PolyOrderFlag);
pMeshARRAY[i].fcArray[j]->SparEinc((timeStep + 1.0) * dt, Point, Inc_E_Local, zetaFace);
}
EvalueTotal = dotP(Total_E_Local, VoltLineUnit);
EvalueInc = dotP(Inc_E_Local, VoltLineUnit);
VoltEntryInc[i] += - 1.0 * h * wgt * EvalueInc;
VoltEntryTotal[i] += - 1.0 * h * wgt * EvalueTotal;
}
}
}
// Write a file with all the impendances of the ports
if(timeStep == 0){
char Impedance_Log[180];
sprintf(Impedance_Log, "%s.ImpZ", fname);
ofstream ImpedanceOutfile(Impedance_Log, ios_base::out);
if(!ImpedanceOutfile)
cout << "Error in opening file: " << Impedance_Log << " for write " << endl;
for(i = 0 ; i < portCNT ; i++)
ImpedanceOutfile << pMeshARRAY[i].impZ << " ";
ImpedanceOutfile.close();
}
// Write to file Vinc
if(timeStep == 0)
system("mkdir TimeDomainVoltages");
char IncVoltage_TimeLog[180];
ofstream IncVoltageOutfile;
if(isCompact){
sprintf(IncVoltage_TimeLog, "./TimeDomainVoltages/%s.TD_Vinc", fname);
IncVoltageOutfile.open(IncVoltage_TimeLog, ios_base::out | ios::app);
}else{
sprintf(IncVoltage_TimeLog, "./TimeDomainVoltages/%s_%05d.TD_Vinc", fname, timeStep);
IncVoltageOutfile.open(IncVoltage_TimeLog, ios_base::out);
}
IncVoltageOutfile.setf(ios::scientific, ios::floatfield);
IncVoltageOutfile.precision(15);
if(!IncVoltageOutfile)
cout << "Error in opening file: " << IncVoltage_TimeLog << " for write " << endl;
IncVoltageOutfile << (timeStep + 1.0) * dt << " ";
for(i = 0 ; i < portCNT ; i++)
IncVoltageOutfile << VoltEntryInc[i]<< " ";
IncVoltageOutfile<<endl;
IncVoltageOutfile.close();
// Write to file Vtotal
char TotVoltage_TimeLog[180];
ofstream TotVoltageOutfile;
if(isCompact){
sprintf(TotVoltage_TimeLog, "./TimeDomainVoltages/%s.TD_Vtot", fname);
TotVoltageOutfile.open(TotVoltage_TimeLog, ios_base::out | ios::app);
}else{
sprintf(TotVoltage_TimeLog, "./TimeDomainVoltages/%s_%05d.TD_Vtot", fname, timeStep);
TotVoltageOutfile.open(TotVoltage_TimeLog, ios_base::out);
}
TotVoltageOutfile.setf(ios::scientific, ios::floatfield);
TotVoltageOutfile.precision(15);
if(!TotVoltageOutfile)
cout << "Error in opening file: " << TotVoltage_TimeLog << "for write"<< endl;
TotVoltageOutfile << (timeStep + 1.0) * dt << " ";
for(i = 0 ; i < portCNT ; i++)
TotVoltageOutfile << VoltEntryTotal[i] << " ";
TotVoltageOutfile << endl;
TotVoltageOutfile.close();
// Write to file Vref
char ReflVoltage_TimeLog [180];
ofstream ReflVoltageOutfile;
if(isCompact){
sprintf(ReflVoltage_TimeLog, "./TimeDomainVoltages/%s.TD_Vref", fname);
ReflVoltageOutfile.open(ReflVoltage_TimeLog, ios_base::out | ios::app);
}else{
sprintf(ReflVoltage_TimeLog, "./TimeDomainVoltages/%s_%05d.TD_Vref", fname, timeStep);
ReflVoltageOutfile.open(ReflVoltage_TimeLog, ios_base::out);
}
ReflVoltageOutfile.setf(ios::scientific, ios::floatfield);
ReflVoltageOutfile.precision(15);
if(!ReflVoltageOutfile)
cout << "Error in opening file: " << ReflVoltage_TimeLog << "for write"<< endl;
ReflVoltageOutfile.close();
delete origEn_1_P2;
delete origEn_1_P1;
delete origEn_1_P0;
delete [] VoltEntryInc;
delete [] VoltEntryTotal;
}
}
// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //
// 000000000000000000000000000000000000 GPU ROUTINES 00000000000000000000000000000000000000000000 //
// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //
// DEVICE implementations
#if defined (DGTD_USE_CUDA)
#if defined (CUDA_NON_HEAVY)
////////////////////////////////////////////////////////////////////////////////////////////////////////
// OUTPUT Functions
////////////////////////////////////////////////////////////////////////////////////////////////////////
// Modified by Qi Jian to write field from the PROBES
/**
 * Evaluate E and H at every probe and store/write the result for one time step.
 *
 * For probe i, probes_bary.at(i) provides the number of associated
 * tetrahedra and, for each, the element id and the four barycentric
 * coordinates of the probe. The fields are evaluated in every associated
 * element from the host coefficient buffers En1_h / Hn32_h and averaged.
 *
 *  - usePade: the six averaged components are stored in fieldProbes at
 *    row (timeStep / tsPerSampling), column i.
 *  - padeCNT == 0 or writeWhilePade: one CSV row per probe is written to
 *    ./PROBES/Probes_<fname>_<timeStep>.csv
 *
 * NOTE(review): a probe with zero associated tetrahedra is still divided by
 * that count, exactly as before; the sibling port-probe writer mentions that
 * readPROBE() hard-fails in that situation - confirm it cannot occur here.
 *
 * @param timeStep current time-step index
 */
void FemGrp::writeFieldProbeCuBLAS(int timeStep)
{
    fp_t vol;
    fp_t zeta[4];
    vtr lvtr[3];
    vtr avtr[4];
    vtr eField;
    vtr hField;
    vtr eField_all;
    vtr hField_all;

    const bool writeCsv = (padeCNT == 0 || writeWhilePade);
    char csvFileName[StrOutput];
    std::ofstream csvFile;
    if(writeCsv)
    {
        // Bounded formatting: fname has unknown length.
        std::snprintf(csvFileName, sizeof(csvFileName), "./PROBES/Probes_%s_%04d.csv", fname, timeStep);
        csvFile.open(csvFileName);
        if(!csvFile.is_open()){
            std::cerr << "Error opening file: " << csvFileName << "\n";
        }
        csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";
        // setprecision is sticky, so it is set once for all rows.
        const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};
        csvFile << std::setprecision(max_precision);
    }

    const int num_nodes = probeCNT;
    const int localDim  = TetPolyOrderDim[PolyFlag];

    // Calculate total fields at the probe points
    for(int i = 0; i < num_nodes; i++)
    {
        // References only: the previous code deep-copied the whole list of
        // associated tetrahedra (and each barycentric array) per probe.
        const auto& probe = probes_bary.at(i);
        const int number_of_associated_tets = probe.first;
        const auto& found_tets = probe.second;

        eField_all.reset();
        hField_all.reset();
        for (int t = 0; t < number_of_associated_tets; t++)
        {
            const auto& assoc = found_tets.at(t);
            const int tet_id = assoc.first;
            const auto& tri_bary_coord = assoc.second;

            tetra& tet = tetARRAY[tet_id];
            tet.geometry(lvtr, avtr, &vol);
            avtr[3].reset();
            avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);

            zeta[0] = static_cast<fp_t>(tri_bary_coord[0]);
            zeta[1] = static_cast<fp_t>(tri_bary_coord[1]);
            zeta[2] = static_cast<fp_t>(tri_bary_coord[2]);
            zeta[3] = static_cast<fp_t>(tri_bary_coord[3]);

            // Start of this element's coefficients in the host buffers.
            const auto coeffStart = mapIdLoc[tet.getcnt()] * localDim;
            eField = CalcEfield(&En1_h[coeffStart], avtr, vol, zeta, PolyFlag);
            hField = CalcEfield(&Hn32_h[coeffStart], avtr, vol, zeta, PolyFlag);
            eField_all = eField_all + eField;
            hField_all = hField_all + hField;
        }
        eField_all = eField_all / ((fp_t) number_of_associated_tets);
        hField_all = hField_all / ((fp_t) number_of_associated_tets);

        if(usePade){ // && i < padeCNT
            int row = (int)(timeStep / tsPerSampling)* NumOfFieldComponents * probeCNT ;
            int column = i * NumOfFieldComponents;
            fieldProbes[row + column + 0] = eField_all.getx();
            fieldProbes[row + column + 1] = eField_all.gety();
            fieldProbes[row + column + 2] = eField_all.getz();
            fieldProbes[row + column + 3] = hField_all.getx();
            fieldProbes[row + column + 4] = hField_all.gety();
            fieldProbes[row + column + 5] = hField_all.getz();
        }
        if(writeCsv){
            csvFile << eField_all.getx() << "," << eField_all.gety() << "," << eField_all.getz() << ","
                    << hField_all.getx() << "," << hField_all.gety() << "," << hField_all.getz() << "\n";
        }
    }

    if(writeCsv)
    {
        usleep(100);
        csvFile.close();
    }
}
// ----------------------------------------------------------------------
// Port-face centroid probes: one CSV per port, per timestep, folders
// ----------------------------------------------------------------------
/**
 * Host-side evaluation of the incident tangential port fields at one point.
 *
 * The time factor is obtained from TimeModulationInc (it receives Emagnitude,
 * so `scale` already contains the amplitude). The spatial profile depends on
 * exci_prop.PortFlag:
 *   1     : uniform field along the normalized vpath
 *   2     : radial field scale / (rho * ln(b/a)) between the radii
 *           a = |r1_port - r0_port| and b = |r2_port - r0_port|, zero outside
 *   other : rectangular (m, n) pattern expressed in the normalized t1/t2 basis
 *           with origin uv0 and side lengths rect_a / rect_b
 * H_tan is PortDirection x E_tan divided by PortImpedance.
 *
 * @param t          evaluation time
 * @param exci_prop  excitation description of the port
 * @param r          evaluation point
 * @param E_tan      out: tangential E (3 components)
 * @param H_tan      out: tangential H (3 components); PortImpedance must be > 0
 */
inline void updateinc_PORT_host(fp_t_ts t,
                                const ExcitationProp& exci_prop,
                                const fp_t_ts r[3],
                                fp_t_ts E_tan[3],
                                fp_t_ts H_tan[3])
{
    const fp_t_ts pi = (fp_t_ts)3.14159265358979323846;

    // Time envelope multiplied by the field magnitude
    fp_t_ts scale;
    TimeModulationInc(exci_prop.TimeDistributionFlag, t, exci_prop.to, exci_prop.freq_m,
                      (fp_t_ts)exci_prop.Emagnitude, exci_prop.tau, exci_prop.CHIRP_BW_MHZ, scale);

    // Default: no field
    for (int c = 0; c < 3; ++c)
    {
        E_tan[c] = 0;
    }

    if (exci_prop.PortFlag == 1)
    {
        // ---- uniform field along the normalized voltage path ----
        fp_t_ts pathDir[3] = { exci_prop.vpath[0], exci_prop.vpath[1], exci_prop.vpath[2] };
        v3_normed(pathDir, pathDir);
        for (int c = 0; c < 3; ++c)
        {
            E_tan[c] = pathDir[c]*scale;
        }
    }
    else if (exci_prop.PortFlag == 2)
    {
        // ---- coaxial profile: E = (scale / (rho ln(b/a))) * rho_hat ----
        fp_t_ts center[3]   = {exci_prop.r0_port[0], exci_prop.r0_port[1], exci_prop.r0_port[2]};
        fp_t_ts innerPt[3]  = {exci_prop.r1_port[0], exci_prop.r1_port[1], exci_prop.r1_port[2]};
        fp_t_ts outerPt[3]  = {exci_prop.r2_port[0], exci_prop.r2_port[1], exci_prop.r2_port[2]};
        fp_t_ts toInner[3], toOuter[3];
        v3_sub(toInner, innerPt, center);
        v3_sub(toOuter, outerPt, center);
        fp_t_ts a = v3_norm(toInner);
        fp_t_ts b = v3_norm(toOuter);
        // Make sure b is the larger radius; nudge it if the two coincide.
        if (!(b > a))
        {
            std::swap(a,b);
            if (!(b > a))
            {
                b = a + (fp_t_ts)1e-6;
            }
        }

        fp_t_ts radial[3];
        v3_sub(radial, r, center);
        fp_t_ts rho = v3_norm(radial);
        if (rho >= a && rho <= b)
        {
            fp_t_ts rho_hat[3];
            v3_normed(rho_hat, radial);
            const fp_t_ts lnba = std::log(b/a);
            fp_t_ts amp = (fp_t_ts)0;
            if (lnba > (fp_t_ts)1e-12)
            {
                amp = scale / (rho * lnba);
            }
            for (int c = 0; c < 3; ++c)
            {
                E_tan[c] = rho_hat[c]*amp;
            }
        }
    }
    else
    {
        // ---- rectangular (m, n) pattern ----
        // Normalized in-plane basis
        fp_t_ts e1[3] = {exci_prop.t1[0],exci_prop.t1[1],exci_prop.t1[2]};
        fp_t_ts e2[3] = {exci_prop.t2[0],exci_prop.t2[1],exci_prop.t2[2]};
        v3_normed(e1,e1);
        v3_normed(e2, e2);

        // In-plane coordinates (u, v) measured from uv0
        fp_t_ts origin[3] = {exci_prop.uv0[0], exci_prop.uv0[1], exci_prop.uv0[2]};
        fp_t_ts offset[3];
        v3_sub(offset, r, origin);
        const fp_t_ts u = v3_dot(offset, e1);
        const fp_t_ts v = v3_dot(offset, e2);

        const int modeM = exci_prop.m;
        const int modeN = exci_prop.n;
        // Non-positive side lengths are replaced by a tiny positive value.
        const fp_t_ts sideA = (exci_prop.rect_a > (fp_t_ts)0 ? exci_prop.rect_a : (fp_t_ts)1e-12);
        const fp_t_ts sideB = (exci_prop.rect_b > (fp_t_ts)0 ? exci_prop.rect_b : (fp_t_ts)1e-12);

        const fp_t_ts cosU = std::cos((fp_t_ts)modeM * pi * u / sideA);
        const fp_t_ts sinU = std::sin((fp_t_ts)modeM * pi * u / sideA);
        const fp_t_ts cosV = std::cos((fp_t_ts)modeN * pi * v / sideB);
        const fp_t_ts sinV = std::sin((fp_t_ts)modeN * pi * v / sideB);

        const fp_t_ts kFromN = (modeN!=0) ? ((fp_t_ts)modeN*pi/sideB) : (fp_t_ts)0;
        const fp_t_ts kFromM = (modeM!=0) ? ((fp_t_ts)modeM*pi/sideA) : (fp_t_ts)0;

        fp_t_ts Eu =  ( kFromN ) * ( cosU * sinV ) * scale;
        fp_t_ts Ev = -( kFromM ) * ( sinU * cosV ) * scale;

        for (int c = 0; c < 3; ++c)
        {
            E_tan[c] = Eu*e1[c] + Ev*e2[c];
        }
    }

    // H_tan = (1/Zport) PortDirection x E_tan
    fp_t_ts Zport = (fp_t_ts)exci_prop.PortImpedance; // must be > 0
    v3_cross(H_tan, exci_prop.PortDirection, E_tan);
    for (int c = 0; c < 3; ++c)
    {
        H_tan[c] /= Zport;
    }
}
// ----------------------------------------------------------------------
// Port incident-field probes: one CSV per port, per timestep
// Writes: ./PortProbesInc/Port<id>/Port<id>_inc_<t>.csv
// Columns: x1,y1,z1,Eix,Eiy,Eiz,Hix,Hiy,Hiz
// ----------------------------------------------------------------------
void FemGrp::writePortIncidentProbeCuBLAS(int timeStep)
{
if (!(portCNT > 0 && PortFacePidx_h && PortProbes_h)) return;
// Ensure base dir exists
mkdir("./PortProbesInc", 0755);
fp_t_ts dt = LocTimeSteps[N_class -1];
int numProbes = 3;
for (int p = 0; p < portCNT; ++p)
{
// Make per-port dir
std::string portDirInc = "./PortProbesInc/Port" + std::to_string(p);
mkdir(portDirInc.c_str(), 0755);
// Open CSV
char pCsvInc[512];
std::snprintf(pCsvInc, sizeof(pCsvInc), "%s/Port%d_inc_%04d.csv", portDirInc.c_str(), p, timeStep);
std::ofstream pcsvInc(pCsvInc);
if (!pcsvInc.is_open()) { std::cerr << "Error opening file: " << pCsvInc << "\n"; continue; }
// Header + precision
const auto max_precision = std::numeric_limits<fp_t>::digits10 + 1;
pcsvInc << "x1,y1,z1,Eix,Eiy,Eiz,Hix,Hiy,Hiz\n";
pcsvInc << std::fixed << std::setprecision(max_precision);
// Probe range for this port
const int offset = PortProbeOffset_h[p];
const int count = PortProbeCount_h[p];
for (int q = 0; q < numProbes; ++q)
{
int index = p*3*numProbes + 3*q;
// Probe coordinate (index into the global probe buffer)
const fp_t_ts* C = &PortProbes_h[index];
const double cx = (double)C[0];
const double cy = (double)C[1];
const double cz = (double)C[2];
// Incident fields at this probe
fp_t_ts Einc[3], Hinc[3];
fp_t_ts temp[3];
fp_t_ts timeE = dt * (timeStep + 1.0);
updateinc_PORT_host(timeE, portExcitations[p], C, Einc, temp);
fp_t_ts timeH = dt * (timeStep + 1.5);
updateinc_PORT_host(timeH, portExcitations[p], C, temp, Hinc);
// Write row
pcsvInc << cx << "," << cy << "," << cz << ","
<< Einc[0] << "," << Einc[1] << "," << Einc[2] << ","
<< Hinc[0] << "," << Hinc[1] << "," << Hinc[2] << "\n";
}
pcsvInc.close();
}
}
/**
 * Write the computed E/H fields at the port probes for one time step.
 *
 * For every port p a file ./PortProbes/Port<p>/Port<p>_<timeStep>.csv is
 * created with one row per probe: x1,y1,z1,Ex,Ey,Ez,Hx,Hy,Hz. The fields are
 * evaluated from the host coefficient buffers En1_h / Hn32_h in every
 * tetrahedron associated with the probe (port_bary) and averaged. Probes
 * without associated tetrahedra are skipped (no row is written).
 *
 * Nothing is done when there are no ports, the probe buffers are missing or
 * port_bary is empty.
 *
 * NOTE(review): port_bary is indexed with the same value used as offset into
 * the flat coordinate buffer (p*3*numProbes + 3*q); verify against the code
 * that fills port_bary that this is the intended key.
 *
 * @param timeStep current time-step index
 */
void FemGrp::writePortFieldProbeCuBLAS(int timeStep)
{
    if (!(portCNT > 0 && PortFacePidx_h && PortProbes_h && !port_bary.empty()))
        return;

    fp_t vol;
    fp_t zeta[4];
    vtr lvtr[3];
    vtr avtr[4];
    vtr eField, hField;
    vtr eField_all, hField_all;

    // Base output directory (an already existing directory is fine)
    mkdir("./PortProbes", 0755);

    const int numProbes = 3; // probes per port
    const int localDim  = TetPolyOrderDim[PolyFlag];
    const auto max_precision = std::numeric_limits<fp_t>::digits10 + 1;

    for (int p = 0; p < portCNT; ++p)
    {
        std::string portDir = "./PortProbes/Port" + std::to_string(p);
        mkdir(portDir.c_str(), 0755);

        // Open CSV for this port + timestep
        char pCsv[512];
        std::snprintf(pCsv, sizeof(pCsv), "%s/Port%d_%04d.csv", portDir.c_str(), p, timeStep);
        std::ofstream pcsv(pCsv);
        if (!pcsv.is_open())
        {
            std::cerr << "Error opening file: " << pCsv << "\n";
            continue;
        }

        pcsv << "x1,y1,z1,Ex,Ey,Ez,Hx,Hy,Hz\n";
        pcsv << std::fixed << std::setprecision(max_precision);

        // The PortProbeOffset_h / PortProbeCount_h tables are deliberately not
        // touched: their values were never used and the guard above does not
        // check those pointers.
        for (int q = 0; q < numProbes; ++q)
        {
            const int index = p*3*numProbes + 3*q;

            // Probe position from the flat buffer
            const fp_t_ts* C = &PortProbes_h[index];
            const double cx = static_cast<double>(C[0]);
            const double cy = static_cast<double>(C[1]);
            const double cz = static_cast<double>(C[2]);

            // Barycentric search results for this probe
            const int nAssoc = (int)port_bary[index].first;
            if (nAssoc <= 0)
            {
                // Skip gracefully instead of hard-failing as readPROBE() does.
                continue;
            }
            const auto& found_tets = port_bary[index].second;

            // Average E/H over the owning tets (same pattern as node probes)
            eField_all.reset();
            hField_all.reset();
            for (int t = 0; t < nAssoc; ++t)
            {
                const int tet_id = found_tets[t].first;
                const std::array<double,4>& b = found_tets[t].second;

                tetra& tet = tetARRAY[tet_id];
                tet.geometry(lvtr, avtr, &vol);
                avtr[3].reset();
                avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);

                zeta[0] = (fp_t)b[0];
                zeta[1] = (fp_t)b[1];
                zeta[2] = (fp_t)b[2];
                zeta[3] = (fp_t)b[3];

                // Start of this element's coefficients in the host buffers.
                const auto coeffStart = mapIdLoc[tet.getcnt()] * localDim;
                eField = CalcEfield(&En1_h[coeffStart], avtr, vol, zeta, PolyFlag);
                hField = CalcEfield(&Hn32_h[coeffStart], avtr, vol, zeta, PolyFlag);
                eField_all = eField_all + eField;
                hField_all = hField_all + hField;
            }
            eField_all = eField_all / ((fp_t)nAssoc);
            hField_all = hField_all / ((fp_t)nAssoc);

            // One row: position + averaged fields
            pcsv << cx << "," << cy << "," << cz << ","
                 << eField_all.getx() << "," << eField_all.gety() << "," << eField_all.getz() << ","
                 << hField_all.getx() << "," << hField_all.gety() << "," << hField_all.getz() << "\n";
        }
        pcsv.close();
    }
}
void FemGrp::writeFieldGlobalCuBLAS(int timeStep){
fp_t vol;
fp_t zeta[4];
vtr lvtr[3];
vtr avtr[4];
vtr coord[4];
vtr eLocal[4];
vtr hLocal[4];
vtr* eField = new vtr[nodeCNT];
vtr* hField = new vtr[nodeCNT];
int* count = new int[nodeCNT];
memset(count, 0, nodeCNT * sizeof(int));
int* polyOrder = new int[tetraCNT];
for(int i = 0; i < tetraCNT; i++){
tetra& tet = tetARRAY[i];
polyOrder[i] = tet.PolyOrderFlag;
tet.geometry(lvtr, avtr, &vol);
avtr[3].reset();
avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);
for(int j = 0; j < 4; j++){
zeta[0] = BaryCoord[j][0];
zeta[1] = BaryCoord[j][1];
zeta[2] = BaryCoord[j][2];
zeta[3] = BaryCoord[j][3];
eLocal[j] = CalcEfield(&En1_h[mapIdLoc[tet.getcnt()] * TetPolyOrderDim[PolyFlag]], avtr, vol, zeta, tet.PolyOrderFlag);
hLocal[j] = CalcEfield(&Hn32_h[mapIdLoc[tet.getcnt()] * TetPolyOrderDim[PolyFlag]], avtr, vol, zeta, tet.PolyOrderFlag);
int index = tet.nd[j]->getid();
eField[index] = eField[index] + eLocal[j] /*- Einc*/;
hField[index] = hField[index] + hLocal[j] /*- Hinc*/;
count[index] += 1;
}
}
for(int i = 0; i < nodeCNT; i++){
eField[i] = eField[i] / static_cast<fp_t>(count[i]);
hField[i] = hField[i] / static_cast<fp_t>(count[i]);
}
VtkWriter vtkWriter(1.0);
// VtkWriter vtkWriter(unit);
char vtkFilePrefix[128];
memset(vtkFilePrefix, 0, 128 * sizeof(char));
sprintf(vtkFilePrefix, "./VTU_LTS/%s_%04d", fname, timeStep);
vtkWriter.writeField(vtkFilePrefix, nodeCNT, ndARRAY, tetraCNT, tetARRAY, eField, hField, polyOrder, 0, 0); //TODO: why here polyorder is not 1
delete [] eField;
delete [] hField;
delete [] count;
delete [] polyOrder;
}
/**
 * Normalize the accumulated field energy and test it against the running maximum.
 *
 * fieldEnergy is divided by numberOfEnergyPoints * NumOfSampleEnergyCheck,
 * maxFieldEnergy is updated with the new value, and the function reports
 * whether the normalized energy is below energyDecayFactor * maxFieldEnergy.
 *
 * @return true when the energy has dropped below the decay threshold
 */
bool FemGrp::checkEnergyDecay(){
    const auto sampleTotal = numberOfEnergyPoints * NumOfSampleEnergyCheck;
    fieldEnergy /= sampleTotal;

    maxFieldEnergy = max(maxFieldEnergy, fieldEnergy);

    const bool hasDecayed = (fieldEnergy < energyDecayFactor * maxFieldEnergy);
    return hasDecayed;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// Organize GPU Memory
////////////////////////////////////////////////////////////////////////////////////////////////////////
void FemGrp::PrepareGPUcuBLAS()
{
tetra* tet;
int cntAux;
////////////////////////////////////////////////////////////////////////////////////////////////////////
// Prepare Excitation Info
////////////////////////////////////////////////////////////////////////////////////////////////////////
int exciCNT = 0;
for(int i = 0; i < N_class; i ++)
{
exciCNT += ClassExcitationCount[i];
}
CUDA_SAFE_CALL(cudaMallocHost((void**)&ExcitationFacesCnt_h, exciCNT * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&ExcitationFacesOffset_h, exciCNT * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&ExcitationFacesNum_h, excitationFaces * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&nd_coords_tet_h, NumOfUnitaryVectors * NumOfNodes * exciCNT * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&nd_coords_face_h, NumOfUnitaryVectors * NumOfNodesPerFace * excitationFaces * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&mapE_h, exciCNT * TetPolyOrderDim[PolyFlag] * sizeof(int8_t), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&mapH_h, exciCNT * TetPolyOrderDim[PolyFlag] * sizeof(int8_t), cudaHostAllocMapped));
// for(int i = 0; i < exciCNT * TetPolyOrderDim[PolyFlag]; i++){
// mapE_h[i] = 1;
// mapH_h[i] = 1;
// }
// ===============================================
// Allocate storage for port fields
// ===============================================
const int Q = GAUSS_POINT_NUM_h[PolyFlag]; // same as GPU kernel uses
cout << "excitationFaces = " << excitationFaces << endl;
cout << "exciCNT = " << exciCNT << endl;
if (portCNT > 0)
{
CUDA_SAFE_CALL(cudaMallocHost((void**)&Etan_qp_h, excitationFaces * Q * 3 * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Htan_qp_h, excitationFaces * Q * 3 * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&PortFacePidx_h, excitationFaces * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Etan_center_h, excitationFaces * 3 * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Htan_center_h, excitationFaces * 3 * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&FaceID_excitation_h, excitationFaces * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&TetID_excitation_h, excitationFaces * sizeof(int), cudaHostAllocMapped));
// 3 port probes along the vpath
CUDA_SAFE_CALL(cudaMallocHost((void**)&PortProbes_h, 3 * 3 * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&PortProbeOffset_h, portCNT * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&PortProbeCount_h, portCNT * sizeof(int), cudaHostAllocMapped));
}
// ===============================================
// Allocated Impedance for Planewave
// ===============================================
cout << "PlaneWaveBCFlag = " << PlaneWaveBCFlag << endl;
cout << "Number of Ports = " << portCNT << endl;
if(PlaneWaveBCFlag)
{
CUDA_SAFE_CALL(cudaMallocHost((void**)&Z_face_pw_h, excitationFaces * sizeof(fp_t_ts), cudaHostAllocMapped));
}
excitationFaces = 0;
exciCNT = 0;
for (int i = 0; i < N_class; i ++)
{
cout << "\nN CLASS = " << i << endl;
for(int j = 0; j < ClassExcitationCount[i]; j ++)
{
tet = &(tetARRAY[ClassTetraIndex[i][j]]);
cout << ClassTetraIndex[i][j] << " ";
for(int k = 0; k < TetPolyOrderDim[PolyFlag]; k++)
{
mapE_h[exciCNT * TetPolyOrderDim[PolyFlag] + k] = (tet->LocMapE[k] < 0 ? 0 : 1);
mapH_h[exciCNT * TetPolyOrderDim[PolyFlag] + k] = (tet->LocMapH[k] < 0 ? 0 : 1);
}
ExcitationFacesOffset_h[exciCNT] = excitationFaces;
for(int k = 0; k < NumOfFaces; k++)
{
for(int node = 0; node < NumOfNodes; node++)
{
nd_coords_tet_h[NumOfUnitaryVectors * (NumOfNodes * exciCNT + node) + 0] = tet->nd[node]->getCoord().getx();
nd_coords_tet_h[NumOfUnitaryVectors * (NumOfNodes * exciCNT + node) + 1] = tet->nd[node]->getCoord().gety();
nd_coords_tet_h[NumOfUnitaryVectors * (NumOfNodes * exciCNT + node) + 2] = tet->nd[node]->getCoord().getz();
//cout << "TET ID = " << tet->getcnt() << " Face ID = " << tet->fc[k]->getcnt() << " BC = " << tet->fc[k]->bcPtr->getbType() << endl;
//cout << tet->nd[node]->getCoord().getx() << " " << tet->nd[node]->getCoord().gety() << " " << tet->nd[node]->getCoord().getz() << endl;
}
int DGface_bc = tet->fc[k]->bcPtr->getbType();
if(DGface_bc == planeWaveType || DGface_bc == portType || DGface_bc == pmlType)
{
ExcitationFacesNum_h[excitationFaces] = k;
for(int node = 0; node < NumOfNodesPerFace; node++)
{
nd_coords_face_h[NumOfUnitaryVectors * (NumOfNodesPerFace * excitationFaces + node) + 0] = tet->fc[k]->nd[node]->getCoord().getx();
nd_coords_face_h[NumOfUnitaryVectors * (NumOfNodesPerFace * excitationFaces + node) + 1] = tet->fc[k]->nd[node]->getCoord().gety();
nd_coords_face_h[NumOfUnitaryVectors * (NumOfNodesPerFace * excitationFaces + node) + 2] = tet->fc[k]->nd[node]->getCoord().getz();
//cout << tet->fc[k]->nd[node]->getCoord().getx() << " , "
// << tet->fc[k]->nd[node]->getCoord().gety() << " , "
// << tet->fc[k]->nd[node]->getCoord().getz() << endl;
}
cout << "\n";
if(PlaneWaveBCFlag)
{
Z_face_pw_h[excitationFaces] = No * sqrt(tet->mat->mur.getEntry(0,0) / tet->mat->epsr.getEntry(0,0));
}
// ====================== Ports ======================
if (DGface_bc == portType)
{
int bc_number = tet->getbc(k);
int pnum = bcNumToPnum[bc_number]-1;
int face_id = tet->fc[k]->getcnt();
int tet_id = tet->getcnt();
PortFacePidx_h[excitationFaces] = pnum;
FaceID_excitation_h[excitationFaces] = face_id;
TetID_excitation_h[excitationFaces] = tet_id;
}
excitationFaces++;
}
}
ExcitationFacesCnt_h[exciCNT] = excitationFaces - ExcitationFacesOffset_h[exciCNT];
exciCNT++;
}
}
cout << " exciCNT = " << exciCNT << endl;
if (portCNT > 0)
{
cout << "-------- Preparing Port Probes ----------\n";
prepPortPROBE();
cout << "-------- Completed ------------\n";
}
// To save the current time step through the execution
LocalExciIndexE = new int[N_class];
LocalExciIndexH = new int[N_class];
for(int i = 0; i < N_class; i ++)
{
LocalExciIndexE[i] = 0;
LocalExciIndexH[i] = 0;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// Create the fields at the HOST (only the ones that we will use to calculate the fields at the probes)
////////////////////////////////////////////////////////////////////////////////////////////////////////
int sizeField = TetPolyOrderDim[PolyFlag] * tetraCNT;
CUDA_SAFE_CALL(cudaMallocHost((void**)&En1_h, sizeField * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Hn32_h, sizeField * sizeof(fp_t_ts), cudaHostAllocMapped));
////////////////////////////////////////////////////////////////////////////////////////////////////////
// For Regular Tetrahedras
////////////////////////////////////////////////////////////////////////////////////////////////////////
flag1 = true;
// ---- Helpers ----
// Check for overflow
auto safe_add = [](int a, int b) -> int
{
if ((b > 0 && a > INT_MAX - b) || (b < 0 && a < INT_MIN - b))
{
fprintf(stderr, "Integer overflow in addition (%d + %d)\n", a, b);
abort();
}
return a + b;
};
// Check if index is within range
auto check_idx = [&](int idx, int lo, int hi, const char* what) {
if (idx < lo || idx > hi) {
fprintf(stderr, "Index out of range for %s: %d (expected [%d, %d])\n",
what, idx, lo, hi);
abort();
}
};
// Check for null pointer
auto check_ptr = [&](void* p, const char* what) {
if (!p) { fprintf(stderr, "Null pointer: %s\n", what); abort(); }
};
// ---- Allocations (pinned) ----
CUDA_SAFE_CALL(cudaMallocHost((void**)&classregNeighPML_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularTetraCnt_h, (size_t)N_class * regularCNT * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classIrregularTetraOffset_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classPMLTetraOffset_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighIrregular_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighIrregularOffset_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighPML_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighPMLOffset_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classTetraOffset_loc_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighOffset_loc_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classPMLTetraOffset_loc_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighPMLOffset_loc_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&nonRegularTetraCnt_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&nonRegularPMLTetraCnt_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
// Always allocate these “per-class meta” arrays irrespective of regularTetraCNT,
// so we can safely write zeros even if there are no regulars.
CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularGroupsCnt_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLGroupsCnt_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
// These hold per-class pointers allocated later per class; init to nullptr
CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularTetraOffset_h, (size_t)N_class * sizeof(int*), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularGroupsId_h, (size_t)N_class * sizeof(int*), cudaHostAllocMapped));
for (int i = 0; i < N_class; ++i)
{
classRegularTetraOffset_h[i] = nullptr;
classRegularGroupsId_h[i] = nullptr;
}
CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLTetraOffset_h, (size_t)N_class * sizeof(int*), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLTetraFaceOffset_h, (size_t)N_class * sizeof(int*), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLGroupsId_h, (size_t)N_class * sizeof(int*), cudaHostAllocMapped));
for (int i = 0; i < N_class; ++i)
{
classRegularPMLTetraOffset_h[i] = nullptr;
classRegularPMLGroupsId_h[i] = nullptr;
classRegularPMLTetraFaceOffset_h[i] = nullptr;
}
// Per group (global)
CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularGroupsNeighCnt_h, (size_t)regularCNT * sizeof(int), cudaHostAllocMapped));
// ---- Zero-init everything deterministically ----
memset(classRegularTetraCnt_h, 0, (size_t)N_class * regularCNT * sizeof(int));
memset(classIrregularTetraOffset_h, 0, (size_t)N_class * sizeof(int));
memset(classPMLTetraOffset_h, 0, (size_t)N_class * sizeof(int));
memset(classNeighIrregular_h, 0, (size_t)N_class * sizeof(int));
memset(classNeighIrregularOffset_h, 0, (size_t)N_class * sizeof(int));
memset(classNeighPML_h, 0, (size_t)N_class * sizeof(int));
memset(classregNeighPML_h, 0, (size_t)N_class * sizeof(int));
memset(classNeighPMLOffset_h, 0, (size_t)N_class * sizeof(int));
memset(classTetraOffset_loc_h, 0, (size_t)N_class * sizeof(int));
memset(classNeighOffset_loc_h, 0, (size_t)N_class * sizeof(int));
memset(classPMLTetraOffset_loc_h, 0, (size_t)N_class * sizeof(int));
memset(classNeighPMLOffset_loc_h, 0, (size_t)N_class * sizeof(int));
memset(nonRegularTetraCnt_h, 0, (size_t)N_class * sizeof(int));
memset(nonRegularPMLTetraCnt_h, 0, (size_t)N_class * sizeof(int));
memset(classRegularGroupsCnt_h, 0, (size_t)N_class * sizeof(int));
memset(classRegularPMLGroupsCnt_h, 0, (size_t)N_class * sizeof(int));
memset(classRegularGroupsNeighCnt_h, 0, (size_t)regularCNT * sizeof(int));
// ---- Locals ----
std::set<int> ID_aux, ID_aux_PML;
totalRegularNeighFaceCnt = 0;
totalRegularPMLNeighFaceCnt = 0;
numRegTetras = 0;
numRegPMLTetras = 0;
int irregularTetras = 0;
int irregularNeighbours= 0;
int PMLTetras = 0;
int PMLNeighbours = 0;
// ---- Main loop ----
for (int i = 0; i < N_class; ++i)
{
// Safe offsets (depend on previous class)
if (i == 0)
{
classIrregularTetraOffset_h[i] = 0;
classNeighIrregularOffset_h[i] = 0;
}
else
{
// read-only of previous indices is safe now
int prev = i - 1;
check_idx(prev, 0, N_class-1, "prev class index");
// Prevent overflow and guarantee non-negative
int pml_tetra_off = classPMLTetraOffset_h[prev];
int pml_tetra_cnt = ClassPMLTetraCnt[prev];
int pml_neigh_off = classNeighPMLOffset_h[prev];
int pml_neigh_cnt = classNeighPML_h[prev];
int reg_neigh_cnt = classregNeighPML_h[prev];
if (pml_tetra_off < 0 || pml_tetra_cnt < 0 || pml_neigh_off < 0 || pml_neigh_cnt < 0) {
fprintf(stderr, "Negative offsets/cnts detected for prev class %d\n", prev);
abort();
}
classIrregularTetraOffset_h[i] = pml_tetra_off + pml_tetra_cnt;
classNeighIrregularOffset_h[i] = pml_neigh_off + pml_neigh_cnt + reg_neigh_cnt;
}
classTetraOffset_loc_h[i] = irregularTetras;
classNeighOffset_loc_h[i] = irregularNeighbours;
int totalNeighbors = 0;
// ----- Non-PML tetras in class i -----
for (int j = 0; j < ClassTetraCnt[i]; ++j)
{
int tIdx = ClassTetraIndex[i][j];
tet = &(tetARRAY[tIdx]);
check_ptr(tet, "tet ptr");
int group_ID = tet->getRegularGroup();
// Count per class and group
classRegularTetraCnt_h[i * regularCNT + group_ID]++;
int neigh = tet->get_NeighNum();
if (group_ID == 0)
{
nonRegularTetraCnt_h[i]++;
irregularTetras++;
irregularNeighbours += neigh;
classNeighIrregular_h[i] += neigh;
totalNeighbors += neigh;
}
else
{
ID_aux.insert(group_ID);
classRegularGroupsNeighCnt_h[group_ID] = neigh;
totalRegularNeighFaceCnt += neigh;
numRegTetras++;
totalNeighbors += neigh;
}
}
// ----- Build per-class arrays for REGULAR groups -----
if (!ID_aux.empty())
{
int G = (int)ID_aux.size();
classRegularGroupsCnt_h[i] = G;
CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularGroupsId_h[i], (size_t)G * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularTetraOffset_h[i], (size_t)G * sizeof(int), cudaHostAllocMapped));
check_ptr(classRegularGroupsId_h[i], "classRegularGroupsId_h[i]");
check_ptr(classRegularTetraOffset_h[i], "classRegularTetraOffset_h[i]");
cout << "Regular Tet group = " << endl;
int cntAux = 0;
for (int ID : ID_aux)
{
classRegularGroupsId_h[i][cntAux] = ID;
cout << ID << endl;
if (cntAux == 0)
{
classRegularTetraOffset_h[i][0] = 0;
}
else
{
int prevID = classRegularGroupsId_h[i][cntAux - 1];
int prevCnt = classRegularTetraCnt_h[i * regularCNT + prevID];
classRegularTetraOffset_h[i][cntAux] = classRegularTetraOffset_h[i][cntAux - 1] + prevCnt;
}
cntAux++;
}
ID_aux.clear();
}
else
{
classRegularGroupsCnt_h[i] = 0;
}
// ----- PML part -----
if (PML_flag)
{
classPMLTetraOffset_h[i] = classIrregularTetraOffset_h[i] + ClassTetraCnt[i];
classNeighPML_h[i] = 0;
classNeighPMLOffset_h[i] = classNeighIrregularOffset_h[i] + totalNeighbors;
classPMLTetraOffset_loc_h[i] = PMLTetras;
classNeighPMLOffset_loc_h[i] = PMLNeighbours;
cout << "classNeighPMLOffset_loc_h[" << i << "] =" << classNeighPMLOffset_loc_h[i] << endl;
//cout << "classNeighPMLOffset_loc_h[" << i << "] =" << classNeighPMLOffset_loc_h[i] << endl;
cout << " classPMLTetraOffset_loc_h[ " << i << "] " << classPMLTetraOffset_loc_h[i] << endl;
int pml_cnt = ClassPMLTetraCnt[i];
check_idx(pml_cnt, 0, INT_MAX, "ClassPMLTetraCnt[i]");
for (int j = 0; j < pml_cnt; ++j)
{
int idx = safe_add(ClassTetraCnt[i], j);
int tIdx = ClassTetraIndex[i][idx];
tet = &(tetARRAY[tIdx]);
check_ptr(tet, "tet ptr (PML)");
int group_ID = tet->getRegularGroup();
classRegularTetraCnt_h[i * regularCNT + group_ID]++;
int neigh = tet->get_NeighNum();
if (group_ID == 0)
{
nonRegularPMLTetraCnt_h[i]++;
PMLTetras = safe_add(PMLTetras, 1);
PMLNeighbours = safe_add(PMLNeighbours, neigh);
classNeighPML_h[i] = safe_add(classNeighPML_h[i], neigh);
}
else
{
ID_aux_PML.insert(group_ID);
classRegularGroupsNeighCnt_h[group_ID] = neigh;
totalRegularPMLNeighFaceCnt = safe_add(totalRegularPMLNeighFaceCnt, neigh);
numRegPMLTetras = safe_add(numRegPMLTetras, 1);
classregNeighPML_h[i] += neigh;
}
}
cout << "PMLNeighbours = " << PMLNeighbours << endl;
}
// ----- Build per-class arrays for REGULAR PML groups -----
if (PML_flag)
{
if (!ID_aux_PML.empty())
{
int Gp = (int)ID_aux_PML.size();
classRegularPMLGroupsCnt_h[i] = Gp;
CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLGroupsId_h[i], (size_t)Gp * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLTetraOffset_h[i], (size_t)Gp * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLTetraFaceOffset_h[i], (size_t)Gp * sizeof(int), cudaHostAllocMapped));
check_ptr(classRegularPMLGroupsId_h[i], "classRegularPMLGroupsId_h[i]");
check_ptr(classRegularPMLTetraOffset_h[i], "classRegularPMLTetraOffset_h[i]");
check_ptr(classRegularPMLTetraFaceOffset_h[i], "classRegularPMLTetraFaceOffset_h[i]");
cout << "Regular PML Tet group = " << endl;
int cntAux = 0;
for (int ID : ID_aux_PML)
{
cout << ID << endl;
classRegularPMLGroupsId_h[i][cntAux] = ID;
if (cntAux == 0)
{
classRegularPMLTetraOffset_h[i][0] = 0;
classRegularPMLTetraFaceOffset_h[i][0] = 0;
}
else
{
int prevID = classRegularPMLGroupsId_h[i][cntAux - 1];
int prevCnt = classRegularTetraCnt_h[i * regularCNT + prevID];
classRegularPMLTetraOffset_h[i][cntAux] = classRegularPMLTetraOffset_h[i][cntAux - 1] + prevCnt;
int neigh = classRegularGroupsNeighCnt_h[prevID];
int num_element = classRegularTetraCnt_h[i * regularCNT + prevID];
int number_neigh = neigh * num_element;
classRegularPMLTetraFaceOffset_h[i][cntAux] = classRegularPMLTetraFaceOffset_h[i][cntAux-1] + number_neigh;
}
cntAux++;
}
ID_aux_PML.clear();
}
else
{
classRegularPMLGroupsCnt_h[i] = 0;
}
}
}
// ---- Final tallies ----
nonregularCNT_Normal = irregularTetras;
nonregularCNT_PML = PMLTetras;
num_elements_regular_PML = numRegPMLTetras;
cout << "nonregularCNT_Normal = " << nonregularCNT_Normal << endl;
cout << "nonregularCNT_PML = " << nonregularCNT_PML << endl;
cout << "num_elements_regular_PML = " << num_elements_regular_PML << endl;
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Create the matrices for the regular groups (4 sets per regular group):
// - Loc1E/Loc1H: local matrices that are multiplied by the same type of field (Field1 = Loc1 * Field0 + ...)
// - Loc2E/Loc2H: local matrices that are multiplied by the opposite type of field (E1 = Loc2E * H1_2 + ...)
// - Neigh1E/Neigh1H: matrices related to the neighbors' opposite field
// - Neigh2E/Neigh2H: matrices related to the neighbors' same field
//
// *** NOTE: each of these matrices is Column-Major Order
// *** NOTE: since they are regular, we assume that the elements are conformal and with 4 neighbours
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int localMatrixSize = TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag];
int neighMatrixSize = TetPolyOrderDim[PolyFlag] * FacePolyOrderDim[PolyFlag];
cout << "--------------------------------------------------------------------------------------------------" << endl;
cout << "regularCNT_Normal = " << regularCNT_Normal << endl;
cout << "totalRegularNeighFaceCnt = " << totalRegularNeighFaceCnt << endl;
if(regularRegionFlag && regularCNT_Normal > 0)
{
cout << "========== FILLING regular ===============" << endl;
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularLoc1E_h, regularCNT_Normal * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularLoc2E_h, regularCNT_Normal * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularLoc1H_h, regularCNT_Normal * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularLoc2H_h, regularCNT_Normal * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularNeigh1E_h, totalRegularNeighFaceCnt * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularNeigh2E_h, totalRegularNeighFaceCnt * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularNeigh1H_h, totalRegularNeighFaceCnt * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularNeigh2H_h, totalRegularNeighFaceCnt * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
int localPosition = 0;
int couplingPosition = 0;
for(int i = 1; i < regularCNT_Normal+1; i++)
{
cout << "Group " << i << endl;
tet = &(tetARRAY[regionARRAY[i]]);
tet->prepareCuBLAS(&regularLoc1E_h[localPosition], &regularLoc2E_h[localPosition], &regularNeigh1E_h[couplingPosition], &regularNeigh2E_h[couplingPosition], nullptr,
&regularLoc1H_h[localPosition], &regularLoc2H_h[localPosition], &regularNeigh1H_h[couplingPosition], &regularNeigh2H_h[couplingPosition], nullptr);
localPosition += localMatrixSize;
couplingPosition += classRegularGroupsNeighCnt_h[i] * neighMatrixSize;
}
}
cout << "Complete regular matrices preparation" << endl;
cout << "--------------------------------------------------------------------------------------------------" << endl;
cout << "regularCNT_PML = " << regularCNT_PML << endl;
cout << "totalRegularPMLNeighFaceCnt = " << totalRegularPMLNeighFaceCnt << endl;
if(regularRegionFlag && regularCNT_PML > 0)
{
cout << "========== FILLING regular PML ===============" << endl;
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLLoc1E_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLLoc2E_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLLoc1H_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLLoc2H_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLNeigh1E_h, totalRegularPMLNeighFaceCnt * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLNeigh2E_h, totalRegularPMLNeighFaceCnt * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLNeigh1H_h, totalRegularPMLNeighFaceCnt * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLNeigh2H_h, totalRegularPMLNeighFaceCnt * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLAuxE_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLAuxH_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLLoc1M_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLLoc2M_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLLoc1J_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLLoc2J_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
int localPosition = 0;
int couplingPosition = 0;
for(int i = regularCNT_Normal+1; i < regularCNT_Normal+regularCNT_PML+1; i++)
{
cout << "Group " << i << endl;
tet = &(tetARRAY[regionARRAY[i]]);
cout << "------------" << endl;
tet->prepareCuBLAS_PML(&regularPMLLoc1E_h[localPosition], &regularPMLLoc2E_h[localPosition],
&regularPMLNeigh1E_h[couplingPosition], &regularPMLNeigh2E_h[couplingPosition],
&regularPMLLoc1H_h[localPosition], &regularPMLLoc2H_h[localPosition],
&regularPMLNeigh1H_h[couplingPosition], &regularPMLNeigh2H_h[couplingPosition],
&regularPMLAuxE_h[localPosition], &regularPMLAuxH_h[localPosition],
&regularPMLLoc1M_h[localPosition], &regularPMLLoc2M_h[localPosition],
&regularPMLLoc1J_h[localPosition],&regularPMLLoc2J_h[localPosition]);
localPosition += localMatrixSize;
couplingPosition += classRegularGroupsNeighCnt_h[i] * neighMatrixSize;
}
}
cout << "Complete regular PML matrices preparation" << endl;
cout << "--------------------------------------------------------------------------------------------------" << endl;
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Calculate the neighbors (number per position + offset) so we know the number of matrices that we are going to need
// Also, we generate an array that is going to map the ID and the order
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
cout << "Neighbor matrices preparation" << endl;
cout << "tetraCNT = " << tetraCNT << endl;
int neighCNT = 0;
CUDA_SAFE_CALL(cudaMallocHost((void**)&mapIdLoc, tetraCNT * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Neighbours_h, tetraCNT * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&NeighboursOffset_h, tetraCNT * sizeof(int), cudaHostAllocMapped));
cntAux = 0;
for(int i = 0; i < N_class; i++)
{
for(int j = 0; j < ClassTetraCnt[i] + ClassPMLTetraCnt[i]; j++)
{
tet = &(tetARRAY[ClassTetraIndex[i][j]]);
mapIdLoc[ClassTetraIndex[i][j]] = cntAux;
Neighbours_h[cntAux] = tet->get_NeighNum();
NeighboursOffset_h[cntAux] = neighCNT;
neighCNT += tet->get_NeighNum();
cntAux++;
}
}
cout << "cntAux = " << cntAux << endl;
CUDA_SAFE_CALL(cudaMallocHost((void**)&NeighMap_h, neighCNT * FacePolyOrderDim[PolyFlag] * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&NeighClass_h, N_class * sizeof(int), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&NeighClassOffset_h, N_class * sizeof(int), cudaHostAllocMapped));
int maxNeighClass = 0;
neighCNT = 0;
cntAux = 0;
for(int i = 0; i < N_class; i++)
{
NeighClassOffset_h[i] = neighCNT;
//cout << "====== Class " << i << endl;
//cout << "Non-PML " << endl;
for(int j = 0; j < ClassTetraCnt[i]; j++)
{
tet = &(tetARRAY[ClassTetraIndex[i][j]]);
bool isPML = tet->get_PML_Flag();
//cout << "TET = " << ClassTetraIndex[i][j] << " | PML = " << isPML << endl;
for(int neigh = 0; neigh < tet->get_NeighNum(); neigh++)
{
tetra* neighbor = tet->get_NeighborTetra(neigh);
int neighFace = tet->getNeighFace(neighbor);
int offset = mapIdLoc[neighbor->getcnt()] * TetPolyOrderDim[PolyFlag];
int neighID = mapIdLoc[neighbor->getcnt()];
bool isPML2 = neighbor->get_PML_Flag();
//cout << "TET = " << neighID << " | PML = " << isPML2 << endl;
for(int k = 0; k < FacePolyOrderDim[PolyFlag]; k++)
{
NeighMap_h[cntAux++] = offset + fac2tet[neighFace][k];
}
}
neighCNT += tet->get_NeighNum();
}
for(int j = ClassTetraCnt[i]; j < ClassTetraCnt[i] + ClassPMLTetraCnt[i]; j++)
{
tet = &(tetARRAY[ClassTetraIndex[i][j]]);
bool isPML = tet->get_PML_Flag();
for(int neigh = 0; neigh < tet->get_NeighNum(); neigh++)
{
tetra* neighbor = tet->get_NeighborTetra(neigh);
int neighFace = tet->getNeighFace(neighbor);
int offset = mapIdLoc[neighbor->getcnt()] * TetPolyOrderDim[PolyFlag];
int neighID = mapIdLoc[neighbor->getcnt()];
bool isPML2 = neighbor->get_PML_Flag();
for(int k = 0; k < FacePolyOrderDim[PolyFlag]; k++)
{
NeighMap_h[cntAux++] = offset + fac2tet[neighFace][k];
}
}
neighCNT += tet->get_NeighNum();
}
NeighClass_h[i] = neighCNT - NeighClassOffset_h[i];
maxNeighClass = (int)std::max(maxNeighClass, NeighClass_h[i]);
}
cout << "Complete Neighbor matrices preparation" << endl;
cout << "neighCNT = " << neighCNT << endl;
cout << "--------------------------------------------------------------------------------------------------" << endl;
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Create the matrices (4 sets per field + inverse for excited elements):
// - Loc1E/Loc1H: local matrices that are multiplied by the same type of field (Field1 = Loc1 * Field0 + ...)
// - Loc2E/Loc2H: local matrices that are multiplied by the opposite type of field (E1 = Loc2E * H1_2 + ...)
// - Neigh1E/Neigh1H: matrices related to the neighbors' opposite field
// - Neigh2E/Neigh2H: matrices related to the neighbors' same field
// - InvE_h/InvH_h: inverse Mass matrices (only for excited terms)
//
// *** NOTE: each of these matrices is Column-Major Order ***
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
cout << "Excitation preparation" << endl;
cout << "exciCNT = " << exciCNT << endl;
if (nonregularCNT_Normal > 0)
{
cout << "========== FILLING Irregular ===============" << endl;
CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc1E_h, irregularTetras * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc2E_h, irregularTetras * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc1H_h, irregularTetras * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc2H_h, irregularTetras * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh1E_h, irregularNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh2E_h, irregularNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh1H_h, irregularNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh2H_h, irregularNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&InvE_h, exciCNT * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&InvH_h, exciCNT * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
std::cout << "Begin irregular CuBLAS preparation" << std::endl;
std::cout << "N_class = " << N_class << std::endl;
cout << "irregularTetras = " << irregularTetras << endl;
cout << "nonregularCNT_Normal = " << nonregularCNT_Normal << endl;
exciCNT = 0;
irregularTetras = 0;
irregularNeighbours = 0;
//NOTE: this only works because of the order of the tetras in ClassTetraIndex (Exci0 NonExci0 Exci1 ...) where the number is the class
//NOTE: classRegularTetraCnt_h[i * regularCNT + 0] means that we only take into consideration the group 0 (irregular mesh) since the others were already done in the regular section
for(int i = 0; i < N_class; i++)
{
for(int j = 0; j < nonRegularTetraCnt_h[i]; j++)
{
tet = &(tetARRAY[ClassTetraIndex[i][j]]);
int localPosition = irregularTetras * localMatrixSize;
int couplingPosition = irregularNeighbours * neighMatrixSize;
fp_t_ts* InvEptr = j < ClassExcitationCount[i] ? &InvE_h[(exciCNT + j) * localMatrixSize] : nullptr;
fp_t_ts* InvHptr = j < ClassExcitationCount[i] ? &InvH_h[(exciCNT + j) * localMatrixSize] : nullptr;
tet->prepareCuBLAS(&Loc1E_h[localPosition], &Loc2E_h[localPosition], &Neigh1E_h[couplingPosition], &Neigh2E_h[couplingPosition], InvEptr,
&Loc1H_h[localPosition], &Loc2H_h[localPosition], &Neigh1H_h[couplingPosition], &Neigh2H_h[couplingPosition], InvHptr);
irregularTetras++;
irregularNeighbours += tet->get_NeighNum();
}
exciCNT += ClassExcitationCount[i];
}
cout << "irregularTetras = " << irregularTetras << endl;
cout << "exciCNT = " << exciCNT << endl;
}
cout << "--------------------------------------------------------------------------------------------------" << endl;
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Create the matrices (4 sets per field + inverse for excited elements):
// - Loc1E_PML/Loc1H_PML: local matrices that are multiplied by the same type of field (Field1 = Loc1 * Field0 + ...)
// - Loc2E_PML/Loc2H_PML: local matrices that are multiplied by the opposite type of field (E1 = Loc2E * H1_2 + ...)
// - Neigh1E_PML/Neigh1H_PML: matrices related to the neighbors' opposite field
// - Neigh2E_PML/Neigh2H_PML: matrices related to the neighbors' same field
// - InvE_h/InvH_h: inverse Mass matrices (only for excited terms)
//
// *** NOTE: each of these matrices is Column-Major Order ***
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
cout << "nonregularCNT_PML = " << nonregularCNT_PML << endl;
if (nonregularCNT_PML > 0)
{
cout << "========== FILLING PML ===============" << endl;
CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc1E_PML_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc2E_PML_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc1H_PML_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc2H_PML_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh1E_PML_h, PMLNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh2E_PML_h, PMLNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh1H_PML_h, PMLNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh2H_PML_h, PMLNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxE_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxH_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxM1_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxJ1_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxM2_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxJ2_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
cout << "PMLTetras = " << PMLTetras << endl;
cout << "PMLNeighbours = " << PMLNeighbours << endl;
// Reset counters before starting matrix population
PMLTetras = 0;
PMLNeighbours = 0;
// Loop over all LTS classes
for (int i = 0; i < N_class; i++)
{
for (int j = ClassTetraCnt[i]; j < ClassTetraCnt[i] + nonRegularPMLTetraCnt_h[i]; j++)
{
// Get pointer to the j-th irregular tetrahedron in class i
tet = &(tetARRAY[ClassTetraIndex[i][j]]);
// PML irregular tetrahedron: compute memory positions for local and neighbor matrices
int localPos = PMLTetras * localMatrixSize;
int neighPos = PMLNeighbours * neighMatrixSize;
// Fill in the local, coupling and auxiliary matrices for this PML irregular tetrahedron
tet->prepareCuBLAS_PML(&Loc1E_PML_h[localPos], &Loc2E_PML_h[localPos],
&Neigh1E_PML_h[neighPos], &Neigh2E_PML_h[neighPos],
&Loc1H_PML_h[localPos], &Loc2H_PML_h[localPos],
&Neigh1H_PML_h[neighPos], &Neigh2H_PML_h[neighPos],
&AuxE_h[localPos], &AuxH_h[localPos],
&AuxM1_h[localPos], &AuxM2_h[localPos],
&AuxJ1_h[localPos],&AuxJ2_h[localPos]);
// Increment running totals for PML irregular tetrahedra and their neighbors
PMLTetras++;
PMLNeighbours += tet->get_NeighNum();
}
}
cout << "PMLTetras = " << PMLTetras << endl;
}
int sizePML = PMLTetras * TetPolyOrderDim[PolyFlag];
cout << "--------------------------------------------------------------------------------------------------" << endl;
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Check GPU Memory
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
struct MemItem { const char* label; size_t bytes; };
auto BYTES_ = [](size_t elems, size_t sizeofT){ return elems * sizeofT; };
auto GB = [](size_t bytes){ return double(bytes) / 1e9; };
auto sum_bytes = [](const std::vector<MemItem>& v)->size_t{
size_t s=0; for (auto& it: v) s += it.bytes; return s;
};
// ===== Memory accounting (exact, by allocation) ===================================
const int TPO = TetPolyOrderDim[PolyFlag];
const int FPO = FacePolyOrderDim[PolyFlag];
const size_t localElems = static_cast<size_t>(TPO) * TPO;
const size_t neighElems = static_cast<size_t>(TPO) * FPO;
const int exciCNT_total = exciCNT;
const int irregularTetras_total = irregularTetras;
const int irregularNeighbours_total = irregularNeighbours;
const int PMLTetras_total = PMLTetras;
const int PMLNeighbours_total = PMLNeighbours;
const int regNormGroups = regularCNT_Normal;
const int regPMLGroups = regularCNT_PML;
const int regNormFacesTotal = totalRegularNeighFaceCnt;
const int regPMLFacesTotal = totalRegularPMLNeighFaceCnt;
const size_t sizeFieldElems = sizeField; // already in elements
const size_t sizePMLElems = sizePML; // already in elements (if you keep a global PML state)
const size_t neighMapElems = static_cast<size_t>(neighCNT) * FPO;
const size_t neighboursElems = tetraCNT;
const size_t auxInElems = static_cast<size_t>(maxNeighClass) * FPO;
const size_t auxOutElems = static_cast<size_t>(maxNeighClass) * TPO;
const size_t mapElemsPerExci = TPO;
const size_t tetNdElems = static_cast<size_t>(NumOfUnitaryVectors) * NumOfNodes * exciCNT_total;
const size_t faceNdElems = static_cast<size_t>(NumOfUnitaryVectors) * NumOfNodesPerFace * excitationFaces;
// ============ Build accounting vectors matching your allocations ==================
std::vector<MemItem> excit, prop, state, neighs;
// ---- Excitation maps & counts ----
excit.push_back({"mapE (int8)", BYTES_(size_t(exciCNT_total) * mapElemsPerExci, sizeof(int8_t))});
excit.push_back({"mapH (int8)", BYTES_(size_t(exciCNT_total) * mapElemsPerExci, sizeof(int8_t))});
excit.push_back({"ExcitationFacesCnt (int)", BYTES_(exciCNT_total, sizeof(int))});
excit.push_back({"ExcitationFacesOffset (int)", BYTES_(exciCNT_total, sizeof(int))});
excit.push_back({"ExcitationFacesNum (int)", BYTES_(excitationFaces, sizeof(int))});
excit.push_back({"nd_coords_tet", BYTES_(tetNdElems, sizeof(fp_t_ts))});
excit.push_back({"nd_coords_face", BYTES_(faceNdElems, sizeof(fp_t_ts))});
if (PlaneWaveBCFlag && excitationFaces > 0) {
excit.push_back({"Z_face_pw", BYTES_(excitationFaces, sizeof(fp_t_ts))});
}
// Inverses only for excitations
excit.push_back({"InvE", BYTES_(size_t(exciCNT_total) * localElems, sizeof(fp_t_ts))});
excit.push_back({"InvH", BYTES_(size_t(exciCNT_total) * localElems, sizeof(fp_t_ts))});
// ---- Irregular (non-PML) ----
prop.push_back({"Loc1E (irreg)", BYTES_(size_t(irregularTetras_total) * localElems, sizeof(fp_t_ts))});
prop.push_back({"Loc2E (irreg)", BYTES_(size_t(irregularTetras_total) * localElems, sizeof(fp_t_ts))});
prop.push_back({"Loc1H (irreg)", BYTES_(size_t(irregularTetras_total) * localElems, sizeof(fp_t_ts))});
prop.push_back({"Loc2H (irreg)", BYTES_(size_t(irregularTetras_total) * localElems, sizeof(fp_t_ts))});
prop.push_back({"Neigh1E (irreg)", BYTES_(size_t(irregularNeighbours_total) * neighElems, sizeof(fp_t_ts))});
prop.push_back({"Neigh2E (irreg)", BYTES_(size_t(irregularNeighbours_total) * neighElems, sizeof(fp_t_ts))});
prop.push_back({"Neigh1H (irreg)", BYTES_(size_t(irregularNeighbours_total) * neighElems, sizeof(fp_t_ts))});
prop.push_back({"Neigh2H (irreg)", BYTES_(size_t(irregularNeighbours_total) * neighElems, sizeof(fp_t_ts))});
// ---- Regular (non-PML) ----
if (regNormGroups > 0) {
prop.push_back({"regularLoc1E", BYTES_(size_t(regNormGroups) * localElems, sizeof(fp_t_ts))});
prop.push_back({"regularLoc2E", BYTES_(size_t(regNormGroups) * localElems, sizeof(fp_t_ts))});
prop.push_back({"regularLoc1H", BYTES_(size_t(regNormGroups) * localElems, sizeof(fp_t_ts))});
prop.push_back({"regularLoc2H", BYTES_(size_t(regNormGroups) * localElems, sizeof(fp_t_ts))});
prop.push_back({"regularNeigh1E", BYTES_(size_t(regNormFacesTotal) * neighElems, sizeof(fp_t_ts))});
prop.push_back({"regularNeigh2E", BYTES_(size_t(regNormFacesTotal) * neighElems, sizeof(fp_t_ts))});
prop.push_back({"regularNeigh1H", BYTES_(size_t(regNormFacesTotal) * neighElems, sizeof(fp_t_ts))});
prop.push_back({"regularNeigh2H", BYTES_(size_t(regNormFacesTotal) * neighElems, sizeof(fp_t_ts))});
}
// ---- Regular PML ----
if (regPMLGroups > 0)
{
prop.push_back({"regularPMLLoc1E", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))});
prop.push_back({"regularPMLLoc2E", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))});
prop.push_back({"regularPMLLoc1H", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))});
prop.push_back({"regularPMLLoc2H", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))});
prop.push_back({"regularPMLNeigh1E", BYTES_(size_t(regPMLFacesTotal) * neighElems, sizeof(fp_t_ts))});
prop.push_back({"regularPMLNeigh2E", BYTES_(size_t(regPMLFacesTotal) * neighElems, sizeof(fp_t_ts))});
prop.push_back({"regularPMLNeigh1H", BYTES_(size_t(regPMLFacesTotal) * neighElems, sizeof(fp_t_ts))});
prop.push_back({"regularPMLNeigh2H", BYTES_(size_t(regPMLFacesTotal) * neighElems, sizeof(fp_t_ts))});
prop.push_back({"regularPMLAuxE", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))});
prop.push_back({"regularPMLAuxH", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))});
prop.push_back({"regularPMLLoc1M", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))});
prop.push_back({"regularPMLLoc2M", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))});
prop.push_back({"regularPMLLoc1J", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))});
prop.push_back({"regularPMLLoc2J", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))});
// per-element state for regular-PML region
state.push_back({"r_Mn", BYTES_(size_t(numRegPMLTetras) * localElems, sizeof(fp_t_ts))});
state.push_back({"r_Mn1", BYTES_(size_t(numRegPMLTetras) * localElems, sizeof(fp_t_ts))});
state.push_back({"r_Jn12", BYTES_(size_t(numRegPMLTetras) * localElems, sizeof(fp_t_ts))});
state.push_back({"r_Jn32", BYTES_(size_t(numRegPMLTetras) * localElems, sizeof(fp_t_ts))});
}
// ---- Irregular PML ----
if (PMLTetras_total > 0)
{
prop.push_back({"Loc1E_PML", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))});
prop.push_back({"Loc2E_PML", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))});
prop.push_back({"Loc1H_PML", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))});
prop.push_back({"Loc2H_PML", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))});
prop.push_back({"Neigh1E_PML", BYTES_(size_t(PMLNeighbours_total) * neighElems, sizeof(fp_t_ts))});
prop.push_back({"Neigh2E_PML", BYTES_(size_t(PMLNeighbours_total) * neighElems, sizeof(fp_t_ts))});
prop.push_back({"Neigh1H_PML", BYTES_(size_t(PMLNeighbours_total) * neighElems, sizeof(fp_t_ts))});
prop.push_back({"Neigh2H_PML", BYTES_(size_t(PMLNeighbours_total) * neighElems, sizeof(fp_t_ts))});
prop.push_back({"AuxE", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))});
prop.push_back({"AuxH", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))});
prop.push_back({"AuxM1", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))});
prop.push_back({"AuxJ1", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))});
prop.push_back({"AuxM2", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))});
prop.push_back({"AuxJ2", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))});
// per-element PML state arrays
state.push_back({"Mn", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))});
state.push_back({"Mn1", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))});
state.push_back({"Jn12", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))});
state.push_back({"Jn32", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))});
}
// ---- Global field buffers ----
state.push_back({"En", BYTES_(sizeFieldElems, sizeof(fp_t_ts))});
state.push_back({"En1", BYTES_(sizeFieldElems, sizeof(fp_t_ts))});
state.push_back({"Hn12", BYTES_(sizeFieldElems, sizeof(fp_t_ts))});
state.push_back({"Hn32", BYTES_(sizeFieldElems, sizeof(fp_t_ts))});
// ---- Neighbor maps/structs ----
neighs.push_back({"NeighMap (int)", BYTES_(neighMapElems, sizeof(int))});
neighs.push_back({"Neighbours (int)", BYTES_(neighboursElems, sizeof(int))});
neighs.push_back({"NeighboursOffset (int)", BYTES_(neighboursElems, sizeof(int))});
neighs.push_back({"auxFieldInput", BYTES_(auxInElems, sizeof(fp_t_ts))});
neighs.push_back({"auxFieldOutput", BYTES_(auxOutElems, sizeof(fp_t_ts))});
// ============================ Totals & printing ===================================
const size_t bytesExcit = sum_bytes(excit);
const size_t bytesProp = sum_bytes(prop);
const size_t bytesState = sum_bytes(state);
const size_t bytesNeigh = sum_bytes(neighs);
const double factor = usageSecurityThresholdFactor; // e.g., 1.05
const double gExcit = GB(bytesExcit) * factor;
const double gProp = GB(bytesProp ) * factor;
const double gState = GB(bytesState) * factor;
const double gNeigh = GB(bytesNeigh) * factor;
const double gTotal = gExcit + gProp + gState + gNeigh;
size_t free_cudamem=0, total_cudamem=0;
CUDA_SAFE_CALL(cudaMemGetInfo(&free_cudamem, &total_cudamem));
auto print_rows = [](const char* category, std::vector<MemItem> v, bool sort_by_size = true)
{
if (sort_by_size) {
std::sort(v.begin(), v.end(),
[](const MemItem& a, const MemItem& b){ return a.bytes > b.bytes; });
}
for (auto& it: v) if (it.bytes) {
std::cout << std::left << std::setw(16) << category
<< std::setw(36) << it.label
<< std::right << std::setw(12) << std::fixed << std::setprecision(6)
<< (double(it.bytes)/1e9) << '\n';
}
};
std::cout << "============================================================================================\n";
std::cout << std::left << std::setw(16) << "Category"
<< std::setw(36) << "Buffer"
<< std::right << std::setw(12) << "Size [GB]" << '\n';
std::cout << "--------------------------------------------------------------------------------------------\n";
print_rows("Excitation", excit);
print_rows("Propagation", prop);
print_rows("Fields/State",state);
print_rows("Neighbors", neighs);
std::cout << "--------------------------------------------------------------------------------------------\n";
std::cout << std::left << std::setw(16) << "TOTALS"
<< std::setw(36) << "Excitation"
<< std::right << std::setw(12) << std::fixed << std::setprecision(6) << gExcit << '\n';
std::cout << std::left << std::setw(16) << "TOTALS"
<< std::setw(36) << "Propagation"
<< std::right << std::setw(12) << std::fixed << std::setprecision(6) << gProp << '\n';
std::cout << std::left << std::setw(16) << "TOTALS"
<< std::setw(36) << "Fields/State"
<< std::right << std::setw(12) << std::fixed << std::setprecision(6) << gState << '\n';
std::cout << std::left << std::setw(16) << "TOTALS"
<< std::setw(36) << "Neighbors"
<< std::right << std::setw(12) << std::fixed << std::setprecision(6) << gNeigh << '\n';
std::cout << std::left << std::setw(16) << "TOTAL (est.)"
<< std::setw(36) << ""
<< std::right << std::setw(12) << std::fixed << std::setprecision(6) << gTotal << '\n';
std::cout << "--------------------------------------------------------------------------------------------\n";
std::cout << "GPU Memory Free / Total [GB]: "
<< std::fixed << std::setprecision(2)
<< double(free_cudamem)/1e9 << " / " << double(total_cudamem)/1e9 << '\n';
std::cout << "============================================================================================\n";
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copy to GPU Memory
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// ---- Excitation maps & counts -------------------------------------------------
CUDA_SAFE_MALLOC(mapE_d, BYTES(int8_t, exciCNT_total * mapElemsPerExci));
CUDA_SAFE_MALLOC(mapH_d, BYTES(int8_t, exciCNT_total * mapElemsPerExci));
CUDA_SAFE_MALLOC(ExcitationFacesCnt_d, BYTES(int, exciCNT_total));
CUDA_SAFE_MALLOC(ExcitationFacesOffset_d, BYTES(int, exciCNT_total));
CUDA_SAFE_MALLOC(ExcitationFacesNum_d, BYTES(int, excitationFaces));
CUDA_SAFE_MALLOC(nd_coords_tet_d, BYTES(fp_t_ts, tetNdElems));
CUDA_SAFE_MALLOC(nd_coords_face_d, BYTES(fp_t_ts, faceNdElems));
if (PlaneWaveBCFlag)
{
CUDA_SAFE_MALLOC(Z_face_pw_d, BYTES(fp_t_ts, excitationFaces));
}
// --- Allocate precomputed tangential fields (only port faces) ---
if (portCNT > 0)
{
CUDA_SAFE_CALL(cudaMalloc((void**)&Etan_qp_d, excitationFaces * Q * 3 * sizeof(fp_t_ts)));
CUDA_SAFE_CALL(cudaMalloc((void**)&Htan_qp_d, excitationFaces * Q * 3 * sizeof(fp_t_ts)));
CUDA_SAFE_CALL(cudaMalloc((void**)&PortFacePidx_d, excitationFaces * sizeof(int)));
const int nPorts = (int)portExcitations.size();
CUDA_SAFE_CALL(cudaMalloc((void**)&ExcitationProps_d, nPorts * sizeof(ExcitationProp)));
}
CUDA_SAFE_COPY(mapE_d, mapE_h, BYTES(int8_t, exciCNT_total * mapElemsPerExci));
CUDA_SAFE_COPY(mapH_d, mapH_h, BYTES(int8_t, exciCNT_total * mapElemsPerExci));
CUDA_SAFE_COPY(ExcitationFacesCnt_d, ExcitationFacesCnt_h, BYTES(int, exciCNT_total));
CUDA_SAFE_COPY(ExcitationFacesOffset_d, ExcitationFacesOffset_h, BYTES(int, exciCNT_total));
CUDA_SAFE_COPY(ExcitationFacesNum_d, ExcitationFacesNum_h, BYTES(int, excitationFaces));
CUDA_SAFE_COPY(nd_coords_tet_d, nd_coords_tet_h, BYTES(fp_t_ts, tetNdElems));
CUDA_SAFE_COPY(nd_coords_face_d, nd_coords_face_h, BYTES(fp_t_ts, faceNdElems));
if (PlaneWaveBCFlag)
{
CUDA_SAFE_COPY(Z_face_pw_d, Z_face_pw_h, BYTES(fp_t_ts, excitationFaces));
}
// --- copy precomputed tangential fields (only port faces) ---
if (portCNT > 0)
{
cout << "Export Etan and Htan" << endl;
CUDA_SAFE_CALL(cudaMemset(Etan_qp_d, 0.0, BYTES(fp_t_ts, excitationFaces * Q * 3)));
CUDA_SAFE_CALL(cudaMemset(Htan_qp_d, 0.0, BYTES(fp_t_ts, excitationFaces * Q * 3)));
CUDA_SAFE_COPY(Etan_qp_d, Etan_qp_h, BYTES(fp_t_ts, excitationFaces * Q * 3));
CUDA_SAFE_COPY(Htan_qp_d, Htan_qp_h, BYTES(fp_t_ts, excitationFaces * Q * 3));
CUDA_SAFE_COPY(PortFacePidx_d, PortFacePidx_h, BYTES(int, excitationFaces));
const int nPorts = (int)portExcitations.size();
CUDA_SAFE_COPY(ExcitationProps_d, portExcitations.data(), nPorts * sizeof(ExcitationProp));
}
// ---- Irregular (non-PML) -----------------------------------------------------
CUDA_SAFE_MALLOC(Loc1E_d, BYTES(fp_t_ts, irregularTetras_total * localElems));
CUDA_SAFE_MALLOC(Loc2E_d, BYTES(fp_t_ts, irregularTetras_total * localElems));
CUDA_SAFE_MALLOC(Loc1H_d, BYTES(fp_t_ts, irregularTetras_total * localElems));
CUDA_SAFE_MALLOC(Loc2H_d, BYTES(fp_t_ts, irregularTetras_total * localElems));
CUDA_SAFE_MALLOC(Neigh1E_d, BYTES(fp_t_ts, irregularNeighbours_total * neighElems));
CUDA_SAFE_MALLOC(Neigh2E_d, BYTES(fp_t_ts, irregularNeighbours_total * neighElems));
CUDA_SAFE_MALLOC(Neigh1H_d, BYTES(fp_t_ts, irregularNeighbours_total * neighElems));
CUDA_SAFE_MALLOC(Neigh2H_d, BYTES(fp_t_ts, irregularNeighbours_total * neighElems));
// Inverses only for excitations
CUDA_SAFE_MALLOC(InvE_d, BYTES(fp_t_ts, exciCNT_total * localElems));
CUDA_SAFE_MALLOC(InvH_d, BYTES(fp_t_ts, exciCNT_total * localElems));
// Irregular (non-PML)
CUDA_SAFE_COPY(Loc1E_d, Loc1E_h, BYTES(fp_t_ts, irregularTetras_total * localElems));
CUDA_SAFE_COPY(Loc2E_d, Loc2E_h, BYTES(fp_t_ts, irregularTetras_total * localElems));
CUDA_SAFE_COPY(Loc1H_d, Loc1H_h, BYTES(fp_t_ts, irregularTetras_total * localElems));
CUDA_SAFE_COPY(Loc2H_d, Loc2H_h, BYTES(fp_t_ts, irregularTetras_total * localElems));
CUDA_SAFE_COPY(Neigh1E_d, Neigh1E_h, BYTES(fp_t_ts, irregularNeighbours_total * neighElems));
CUDA_SAFE_COPY(Neigh2E_d, Neigh2E_h, BYTES(fp_t_ts, irregularNeighbours_total * neighElems));
CUDA_SAFE_COPY(Neigh1H_d, Neigh1H_h, BYTES(fp_t_ts, irregularNeighbours_total * neighElems));
CUDA_SAFE_COPY(Neigh2H_d, Neigh2H_h, BYTES(fp_t_ts, irregularNeighbours_total * neighElems));
CUDA_SAFE_COPY(InvE_d, InvE_h, BYTES(fp_t_ts, exciCNT_total * localElems));
CUDA_SAFE_COPY(InvH_d, InvH_h, BYTES(fp_t_ts, exciCNT_total * localElems));
// ---- Regular (prototype per group) -------------------------------------------
// Use exact counts — NOT (regularCNT - 1) or "*4"
if (regularRegionFlag)
{
if (regNormGroups > 0)
{
CUDA_SAFE_MALLOC(regularLoc1E_d, BYTES(fp_t_ts, static_cast<size_t>(regNormGroups) * localElems));
CUDA_SAFE_MALLOC(regularLoc2E_d, BYTES(fp_t_ts, static_cast<size_t>(regNormGroups) * localElems));
CUDA_SAFE_MALLOC(regularLoc1H_d, BYTES(fp_t_ts, static_cast<size_t>(regNormGroups) * localElems));
CUDA_SAFE_MALLOC(regularLoc2H_d, BYTES(fp_t_ts, static_cast<size_t>(regNormGroups) * localElems));
CUDA_SAFE_MALLOC(regularNeigh1E_d, BYTES(fp_t_ts, static_cast<size_t>(regNormFacesTotal) * neighElems));
CUDA_SAFE_MALLOC(regularNeigh2E_d, BYTES(fp_t_ts, static_cast<size_t>(regNormFacesTotal) * neighElems));
CUDA_SAFE_MALLOC(regularNeigh1H_d, BYTES(fp_t_ts, static_cast<size_t>(regNormFacesTotal) * neighElems));
CUDA_SAFE_MALLOC(regularNeigh2H_d, BYTES(fp_t_ts, static_cast<size_t>(regNormFacesTotal) * neighElems));
CUDA_SAFE_COPY(regularLoc1E_d, regularLoc1E_h, BYTES(fp_t_ts, static_cast<size_t>(regNormGroups) * localElems));
CUDA_SAFE_COPY(regularLoc2E_d, regularLoc2E_h, BYTES(fp_t_ts, static_cast<size_t>(regNormGroups) * localElems));
CUDA_SAFE_COPY(regularLoc1H_d, regularLoc1H_h, BYTES(fp_t_ts, static_cast<size_t>(regNormGroups) * localElems));
CUDA_SAFE_COPY(regularLoc2H_d, regularLoc2H_h, BYTES(fp_t_ts, static_cast<size_t>(regNormGroups) * localElems));
CUDA_SAFE_COPY(regularNeigh1E_d, regularNeigh1E_h, BYTES(fp_t_ts, static_cast<size_t>(regNormFacesTotal) * neighElems));
CUDA_SAFE_COPY(regularNeigh2E_d, regularNeigh2E_h, BYTES(fp_t_ts, static_cast<size_t>(regNormFacesTotal) * neighElems));
CUDA_SAFE_COPY(regularNeigh1H_d, regularNeigh1H_h, BYTES(fp_t_ts, static_cast<size_t>(regNormFacesTotal) * neighElems));
CUDA_SAFE_COPY(regularNeigh2H_d, regularNeigh2H_h, BYTES(fp_t_ts, static_cast<size_t>(regNormFacesTotal) * neighElems));
}
if (regPMLGroups > 0)
{
// PML-regular
CUDA_SAFE_MALLOC(regularPMLLoc1E_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_MALLOC(regularPMLLoc2E_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_MALLOC(regularPMLLoc1H_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_MALLOC(regularPMLLoc2H_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_MALLOC(regularPMLNeigh1E_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLFacesTotal) * neighElems));
CUDA_SAFE_MALLOC(regularPMLNeigh2E_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLFacesTotal) * neighElems));
CUDA_SAFE_MALLOC(regularPMLNeigh1H_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLFacesTotal) * neighElems));
CUDA_SAFE_MALLOC(regularPMLNeigh2H_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLFacesTotal) * neighElems));
// PML auxiliaries for regular-PML prototypes (if used)
CUDA_SAFE_MALLOC(regularPMLAuxE_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_MALLOC(regularPMLAuxH_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_MALLOC(regularPMLLoc1M_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_MALLOC(regularPMLLoc2M_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_MALLOC(regularPMLLoc1J_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_MALLOC(regularPMLLoc2J_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
// PML-regular
CUDA_SAFE_COPY(regularPMLLoc1E_d, regularPMLLoc1E_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_COPY(regularPMLLoc2E_d, regularPMLLoc2E_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_COPY(regularPMLLoc1H_d, regularPMLLoc1H_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_COPY(regularPMLLoc2H_d, regularPMLLoc2H_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_COPY(regularPMLNeigh1E_d, regularPMLNeigh1E_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLFacesTotal) * neighElems));
CUDA_SAFE_COPY(regularPMLNeigh2E_d, regularPMLNeigh2E_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLFacesTotal) * neighElems));
CUDA_SAFE_COPY(regularPMLNeigh1H_d, regularPMLNeigh1H_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLFacesTotal) * neighElems));
CUDA_SAFE_COPY(regularPMLNeigh2H_d, regularPMLNeigh2H_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLFacesTotal) * neighElems));
CUDA_SAFE_COPY(regularPMLAuxE_d, regularPMLAuxE_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_COPY(regularPMLAuxH_d, regularPMLAuxH_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_COPY(regularPMLLoc1M_d, regularPMLLoc1M_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_COPY(regularPMLLoc2M_d, regularPMLLoc2M_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_COPY(regularPMLLoc1J_d, regularPMLLoc1J_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_COPY(regularPMLLoc2J_d, regularPMLLoc2J_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
CUDA_SAFE_MALLOC(r_Mn_d, BYTES(fp_t_ts, static_cast<size_t>(numRegPMLTetras) * localElems));
CUDA_SAFE_MALLOC(r_Mn1_d, BYTES(fp_t_ts, static_cast<size_t>(numRegPMLTetras) * localElems));
CUDA_SAFE_MALLOC(r_Jn12_d, BYTES(fp_t_ts, static_cast<size_t>(numRegPMLTetras) * localElems));
CUDA_SAFE_MALLOC(r_Jn32_d, BYTES(fp_t_ts, static_cast<size_t>(numRegPMLTetras) * localElems));
CUDA_SAFE_ZERO(r_Mn_d, BYTES(fp_t_ts, static_cast<size_t>(numRegPMLTetras) * localElems));
CUDA_SAFE_ZERO(r_Mn1_d, BYTES(fp_t_ts, static_cast<size_t>(numRegPMLTetras) * localElems));
CUDA_SAFE_ZERO(r_Jn12_d, BYTES(fp_t_ts, static_cast<size_t>(numRegPMLTetras) * localElems));
CUDA_SAFE_ZERO(r_Jn32_d, BYTES(fp_t_ts, static_cast<size_t>(numRegPMLTetras) * localElems));
}
}
// ---- Irregular PML (per element) ---------------------------------------------
cout << "Non regular PMLTetras_total = " << PMLTetras_total << endl;
if (PMLTetras_total > 0)
{
CUDA_SAFE_MALLOC(Loc1E_PML_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_MALLOC(Loc2E_PML_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_MALLOC(Loc1H_PML_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_MALLOC(Loc2H_PML_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_MALLOC(Neigh1E_PML_d, BYTES(fp_t_ts, static_cast<size_t>(PMLNeighbours_total) * neighElems));
CUDA_SAFE_MALLOC(Neigh2E_PML_d, BYTES(fp_t_ts, static_cast<size_t>(PMLNeighbours_total) * neighElems));
CUDA_SAFE_MALLOC(Neigh1H_PML_d, BYTES(fp_t_ts, static_cast<size_t>(PMLNeighbours_total) * neighElems));
CUDA_SAFE_MALLOC(Neigh2H_PML_d, BYTES(fp_t_ts, static_cast<size_t>(PMLNeighbours_total) * neighElems));
CUDA_SAFE_MALLOC(AuxE_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_MALLOC(AuxH_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_MALLOC(AuxM1_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_MALLOC(AuxJ1_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_MALLOC(AuxM2_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_MALLOC(AuxJ2_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_COPY(Loc1E_PML_d, Loc1E_PML_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_COPY(Loc2E_PML_d, Loc2E_PML_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_COPY(Loc1H_PML_d, Loc1H_PML_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_COPY(Loc2H_PML_d, Loc2H_PML_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_COPY(Neigh1E_PML_d, Neigh1E_PML_h, BYTES(fp_t_ts, static_cast<size_t>(PMLNeighbours_total) * neighElems));
CUDA_SAFE_COPY(Neigh2E_PML_d, Neigh2E_PML_h, BYTES(fp_t_ts, static_cast<size_t>(PMLNeighbours_total) * neighElems));
CUDA_SAFE_COPY(Neigh1H_PML_d, Neigh1H_PML_h, BYTES(fp_t_ts, static_cast<size_t>(PMLNeighbours_total) * neighElems));
CUDA_SAFE_COPY(Neigh2H_PML_d, Neigh2H_PML_h, BYTES(fp_t_ts, static_cast<size_t>(PMLNeighbours_total) * neighElems));
CUDA_SAFE_COPY(AuxE_d, AuxE_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_COPY(AuxH_d, AuxH_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_COPY(AuxM1_d, AuxM1_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_COPY(AuxJ1_d, AuxJ1_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_COPY(AuxM2_d, AuxM2_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_COPY(AuxJ2_d, AuxJ2_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_MALLOC(Mn_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_MALLOC(Mn1_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_MALLOC(Jn12_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_MALLOC(Jn32_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_ZERO(Mn_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_ZERO(Mn1_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_ZERO(Jn12_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
CUDA_SAFE_ZERO(Jn32_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
}
// ---- Global field buffers -----------------------------------------------------
CUDA_SAFE_MALLOC(En_d, BYTES(fp_t_ts, sizeFieldElems));
CUDA_SAFE_MALLOC(En1_d, BYTES(fp_t_ts, sizeFieldElems));
CUDA_SAFE_MALLOC(Hn12_d, BYTES(fp_t_ts, sizeFieldElems));
CUDA_SAFE_MALLOC(Hn32_d, BYTES(fp_t_ts, sizeFieldElems));
// Fields zero init
CUDA_SAFE_ZERO(En_d, BYTES(fp_t_ts, sizeFieldElems));
CUDA_SAFE_ZERO(En1_d, BYTES(fp_t_ts, sizeFieldElems));
CUDA_SAFE_ZERO(Hn12_d, BYTES(fp_t_ts, sizeFieldElems));
CUDA_SAFE_ZERO(Hn32_d, BYTES(fp_t_ts, sizeFieldElems));
// ---- Neighbor maps ------------------------------------------------------------
CUDA_SAFE_MALLOC(NeighMap_d, BYTES(int, neighMapElems));
CUDA_SAFE_MALLOC(Neighbours_d, BYTES(int, neighboursElems));
CUDA_SAFE_MALLOC(NeighboursOffset_d, BYTES(int, neighboursElems));
CUDA_SAFE_MALLOC(auxFieldInput, BYTES(fp_t_ts, auxInElems));
CUDA_SAFE_MALLOC(auxFieldOutput, BYTES(fp_t_ts, auxOutElems));
// Neighbor structures
CUDA_SAFE_COPY(NeighMap_d, NeighMap_h, BYTES(int, neighMapElems));
CUDA_SAFE_COPY(Neighbours_d, Neighbours_h, BYTES(int, neighboursElems));
CUDA_SAFE_COPY(NeighboursOffset_d, NeighboursOffset_h, BYTES(int, neighboursElems));
}
void FemGrp::TimeSteppingCuBLAS()
{
fp_t InitTime = 0.0;
fp_t Frequency = freq;
fp_t dt_nyquist = 1.0 / (2.0 * Frequency * MEGA);
fp_t dt_sample = (1 / SamplingRate) * dt_nyquist;
tsPerSampling = (int)ceil(dt_sample / LocTimeSteps[N_class - 1]);
dt_sample = tsPerSampling * LocTimeSteps[N_class - 1];
if(FinalTime > 0)
NtimeSteps = (int)ceil((FinalTime - InitTime) / LocTimeSteps[N_class -1]); // number of time steps for the biggest time step size
else
NtimeSteps = 0;
if(usePade){
fp_t earlyTime = 10 * Length(maxPoint - minPoint) / Vo;
/* The factor 7.5 (10 is used here for safety) is empirical, because in
"Early Time Behavior in Reverberation Chambers and Its Effect on the
Relationships Between Coherence Bandwidth, Chamber Decay Time, RMS Delay
Spread, and the Chamber Buildup Time", Christopher L. Holloway et al.,
the value of 3/2 is suggested from equation 30. */
tsPerPade = (int)ceil(earlyTime / LocTimeSteps[N_class -1]);
tsPerPade = tsPerPade + tsPerSampling - tsPerPade % tsPerSampling;
fieldProbes = new fp_t_ts[probeCNT * (int)ceil((1.0 * NtimeSteps) / tsPerSampling) * NumOfFieldComponents];
CUDA_SAFE_CALL(cudaMallocHost((void**)&tranferencePadeFunctionFD_h, padeCNT * (int)ceil((1.0 * NtimeSteps) / tsPerSampling) * NumOfFieldComponents * sizeof(cuDoubleComplex), cudaHostAllocMapped));
getPadeFreq((int)ceil((1.0 * NtimeSteps) / tsPerSampling), tsPerSampling);
}
Write_TD_Data(tsPerSampling, NtimeSteps);
//Output precision set to 15 digits
cout.precision(15);
//Print out data used in the computation
cout << endl;
cout << "=============================================" << endl;
cout << "== Running CUDA Implementation (Non-Heavy) ==" << endl;
cout << "=============================================" << endl;
cout << endl;
cout << "==========================================" << endl;
cout << " PERFORMING INFORMATION " << endl;
cout << "==========================================" << endl;
if(FinalTime > 0)
cout << " Final Time(sec) = " << FinalTime << endl;
else
cout << " Final Time = " << "TBD" << endl;
cout << " Time Step, dt(sec) = " << LocTimeSteps[N_class -1] << endl;
cout << " Number of Tetrahedra = " << tetraCNT << endl;
cout << " Number of Classes = " << N_class << endl;
if(FinalTime > 0)
cout << " Number of Time Steps = " << NtimeSteps << endl;
for(int i = 0; i < N_class ; i++){
cout << " LocTimeSteps[" << i << "] = " << LocTimeSteps[i] << endl;
}
cout << endl;
cout << " dt_nyquist = " << dt_nyquist << endl;
cout << " dt_sample = " << dt_sample << endl;
cout << " tsPerSampling = " << tsPerSampling << endl;
if(FinalTime > 0)
cout << " Number of samplings = " << (int)ceil((1.0 * NtimeSteps) / tsPerSampling) << endl;
if(usePade){
cout << " Time Steps / Pade Calc = " << tsPerPade << endl;
}
cout << "==========================================" << endl;
cout << endl;
//Memory status
SYSTEM_MEM_USAGE();
cout << endl;
cout << " " << endl;
cout << "===================================================" << endl;
cout << " Local Time-Stepping Loop " << endl;
cout << "===================================================" << endl;
// Variables for time tracking
size_t total_time = 0;
fp_t current_time = 0;
bool exitBool = false;
current_time -= (double)dt_sample * 1e9;
if(FinalTime <= 0){
NtimeSteps = NumOfSampleEnergyCheck * tsPerSampling + 1;
fieldEnergy = 0;
maxFieldEnergy = 0;
if(numberOfEnergyPoints == 0){
numberOfEnergyPoints = probeCNT;
}
}
cublasHandle_t handle;
cublasCreate(&handle);
timer_start("Time Stepping", ' ');
timer_start("Start Time Stepping", 'm');
for(int n = 0; n < NtimeSteps; n++){
ComputeE_cuBLAS(handle, N_class - 1);
ComputeH_cuBLAS(handle, N_class - 1);
if(n % tsPerSampling == 0)
{
CUDA_SAFE_CALL(cudaMemcpy(En1_h, En1_d, tetraCNT * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(Hn32_h, Hn32_d, tetraCNT * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaDeviceSynchronize());
total_time += timer_stop('m');
if(write_probes && probeCNT > 0)
{
writeFieldProbeCuBLAS(n);
if(write_AnalyticalIncidentProbes){
writeAnalyticalIncidentPWProbes(n);
}
if(n != 0 && usePade && n % tsPerPade == 0)
{
if(padeTime < 0.0){
exitBool = calculatePadeCUDA(n, n / tsPerPade == 1, false);
}else if(n * LocTimeSteps[N_class - 1] > padeTime * 1e-9){
exitBool = true;
}
}
}
if(write_fields){
writeFieldGlobalCuBLAS(n);
}
// Modified by Qi Jian to write surface currents
if(WriteSurfFlag)
{
writeCurrentsOutputSurfMesh_CuBLAS(n);
}
// Writing the fields on the port surfaces
if (PortBCFlag)
{
writePortFieldProbeCuBLAS(n);
writePortIncidentProbeCuBLAS(n);
}
fp_t_ts magAux = 0;
for(int i = 0; i < tetraCNT * TetPolyOrderDim[PolyFlag]; i++){
magAux += En1_h[i] * En1_h[i];
}
cout << "E field norm^2 " << magAux << endl;
current_time += (double)dt_sample * 1e9;
DEBUG_INFO(" Current Time : " + to_string(current_time) + "ns");
DEBUG_INFO(" Average iteration time : "+ to_string(((double)total_time / (double)(n + 1.0))) + " msec");
if(exitBool){
calculatePadeCUDA(n, false, true);
break;
}
if(FinalTime < 0 && n == NtimeSteps-1){
if(!checkEnergyDecay()){
NtimeSteps += NumOfSampleEnergyCheck * tsPerSampling;
cout << "Max Energy: " << maxFieldEnergy << " - Current Energy: " << fieldEnergy << " - Relation: " << fieldEnergy * 100 / maxFieldEnergy << "%" << endl;
fieldEnergy = 0.0;
}else{
Write_TD_Data(tsPerSampling, NtimeSteps);
break;
}
}
cout << "---------------------------------------------------" << endl;
timer_start(to_string(tsPerSampling)+" steps", 'm');
}
}
if(!exitBool && padeCNT > 0 && !writeWhilePade){
writeFieldProbeAfterPade(tsPerSampling);
}
if(!exitBool && (NtimeSteps-1 % tsPerSampling != 0)){
timer_stop('m');
}
DEBUG_INFO(" Total iteration time: "+ to_string((double)total_time) + " msec");
timer_stop(' ');
}
//The recursivity in ComputeE and ComputeH is due to the LTS process
/**********************************************************************
Local Time-Stepping for CUDA Recursive
Explained in "Dissipative terms and local time-stepping improvements
in a spatial high order Discontinuous Galerkin scheme
for the time-domain Maxwell’s equations" by E. Montseny
**********************************************************************/
// Recursive E update for class `class_i`: the class itself is advanced with
// LE_CuBLAS, then (for class_i != 0) the next lower class performs an
// E - H - E sub-cycle.
void FemGrp::ComputeE_cuBLAS(cublasHandle_t handle, int class_i){
    LE_CuBLAS(handle, class_i);
    if(class_i == 0){
        return; // class 0 ends the recursion
    }
    const int lowerClass = class_i - 1;
    ComputeE_cuBLAS(handle, lowerClass);
    ComputeH_cuBLAS(handle, lowerClass);
    ComputeE_cuBLAS(handle, lowerClass);
}
// Recursive H update for class `class_i`: the class itself is advanced with
// LH_CuBLAS, then (for class_i != 0) the next lower class performs an
// H - E - H sub-cycle.
void FemGrp::ComputeH_cuBLAS(cublasHandle_t handle, int class_i){
    LH_CuBLAS(handle, class_i);
    if(class_i == 0){
        return; // class 0 ends the recursion
    }
    const int lowerClass = class_i - 1;
    ComputeH_cuBLAS(handle, lowerClass);
    ComputeE_cuBLAS(handle, lowerClass);
    ComputeH_cuBLAS(handle, lowerClass);
}
/**
 * One E-field update of class `class_i`.
 *
 * For the tetrahedra of the class, in this order:
 *   1. irregular (non-PML):  En1 = Loc1E * En + Loc2E * Hn12, optional excitation
 *      kernel, then neighbour coupling (Neigh1E on gathered Hn12, Neigh2E on
 *      gathered En) accumulated by addCouplingResults;
 *   2. regular groups (non-PML): same update with one shared matrix per group;
 *   3. irregular PML: auxiliary Jn32 update, En1 update including AuxE * Jn32,
 *      then coupling;
 *   4. regular PML groups: same as 3 with per-group matrices and r_Jn12 / r_Jn32.
 * Finally En1_d is copied over En_d for ALL tetrahedra and Jn32_d over Jn12_d.
 *
 * All cuBLAS calls use alpha = 1 and non-transposed operands. A cuBLAS status
 * other than CUBLAS_STATUS_SUCCESS is reported on stderr; execution continues
 * (the original code ignored the status entirely).
 *
 * @param handle  cuBLAS handle used for every GEMM.
 * @param class_i index of the class to update.
 */
void FemGrp::LE_CuBLAS(cublasHandle_t handle, int class_i)
{
    const int tetDim  = TetPolyOrderDim[PolyFlag];  // length of one per-tetrahedron field block
    const int faceDim = FacePolyOrderDim[PolyFlag]; // length of one gathered neighbour block
    const long long tetStride   = tetDim * tetDim;  // elements in one tetDim x tetDim matrix
    const long long neighStride = tetDim * faceDim; // elements in one tetDim x faceDim matrix

    const int blockSize = 256; //optimal number
    int numBlocks;
    // 2-D thread block shared by the coupling-accumulation kernels
    const int blockY = (blockSize + tetDim - 1) / tetDim;
    const dim3 couplingBlock(tetDim, blockY, 1);

    // Report (but do not abort on) a failed cuBLAS call.
    auto reportBlas = [class_i](cublasStatus_t status, const char* what){
        if(status != CUBLAS_STATUS_SUCCESS){
            fprintf(stderr, "[LE_CuBLAS] cuBLAS failure in %s (class %d): status %d\n", what, class_i, (int)status);
        }
    };
    // C_b = A_b * B_b + beta * C_b for b in [0, batch); A is rows x inner, B is inner x cols.
    auto gemmBatched = [&](int rows, int cols, int inner,
                           const float* A, long long strideA,
                           const float* B, long long strideB,
                           float beta,
                           float* C, long long strideC,
                           int batch, const char* what){
        const float alpha = 1.0f;
        reportBlas(cublasSgemmStridedBatched(handle,
                                             CUBLAS_OP_N,
                                             CUBLAS_OP_N,
                                             rows, cols, inner,
                                             &alpha,
                                             A, rows,
                                             strideA,
                                             B, inner,
                                             strideB,
                                             &beta,
                                             C, rows,
                                             strideC,
                                             batch), what);
    };
    // C = A * B + beta * C with a single matrix A (rows x inner) applied to `cols` columns.
    auto gemm = [&](int rows, int cols, int inner,
                    const float* A, const float* B, float beta, float* C, const char* what){
        const float alpha = 1.0f;
        reportBlas(cublasSgemm(handle,
                               CUBLAS_OP_N,
                               CUBLAS_OP_N,
                               rows, cols, inner,
                               &alpha,
                               A, rows,
                               B, inner,
                               &beta,
                               C, rows), what);
    };
    // y_b = A_b * x_b + beta * y_b: one (tetDim x inner) matrix per element, one column each.
    auto matVecPerElement = [&](const float* A, int inner, const float* x, float beta, float* y,
                                int count, const char* what){
        gemmBatched(tetDim, 1, inner, A, (long long)tetDim * inner, x, inner, beta, y, tetDim, count, what);
    };

    const int irregularTetras = nonRegularTetraCnt_h[class_i];
    int classOffset = ClassTetraOffset[class_i];
    int neighOffset = NeighClassOffset_h[class_i];

    // ------------------------------------------------------------------
    // Irregular (non-PML) tetrahedra
    if(irregularTetras > 0)
    {
        const int locOffset = classTetraOffset_loc_h[class_i];
        const long long fieldBase = (long long)classOffset * tetDim;

        // Local matrices
        matVecPerElement(&Loc1E_d[locOffset * tetStride], tetDim, &En_d[fieldBase], 0.0f, &En1_d[fieldBase], irregularTetras, "Loc1E * En");
        matVecPerElement(&Loc2E_d[locOffset * tetStride], tetDim, &Hn12_d[fieldBase], 1.0f, &En1_d[fieldBase], irregularTetras, "Loc2E * Hn12");

        if(ClassExcitationCount[class_i] > 0)
        {
            const int exciCount  = ClassExcitationCount[class_i];
            const int exciOffset = ClassExcitationOffset[class_i];
            numBlocks = (exciCount + blockSize - 1) / blockSize;
            // Excitation is evaluated half a local step after the current index
            fp_t_ts dt = LocTimeSteps[class_i];
            fp_t_ts t = (LocalExciIndexE[class_i] + 0.5) * dt;
            LocalExciIndexE[class_i]++;
            if (PWorPort == 0)
            {
                if (interior_excitation_flag)
                {
                    addExcitationE_PML<<<numBlocks, blockSize>>>(&ExcitationFacesCnt_d[exciOffset],
                                                                  &ExcitationFacesOffset_d[exciOffset],
                                                                  ExcitationFacesNum_d,
                                                                  exciCount,
                                                                  ClassExcitation_sc_CNT[class_i],
                                                                  &mapE_d[exciOffset * (long long)tetDim],
                                                                  excitationProp,
                                                                  PolyFlag,
                                                                  dt /Eo, t,
                                                                  &nd_coords_tet_d[exciOffset * NumOfNodes * NumOfUnitaryVectors],
                                                                  nd_coords_face_d,
                                                                  Z_face_pw_d,
                                                                  &InvE_d[exciOffset * tetStride],
                                                                  &En1_d[fieldBase]);
                }
                else
                {
                    addExcitationE<<<numBlocks, blockSize>>>(&ExcitationFacesCnt_d[exciOffset],
                                                              &ExcitationFacesOffset_d[exciOffset],
                                                              ExcitationFacesNum_d,
                                                              exciCount,
                                                              &mapE_d[exciOffset * (long long)tetDim],
                                                              excitationProp,
                                                              PolyFlag,
                                                              dt /Eo, t,
                                                              &nd_coords_tet_d[exciOffset * NumOfNodes * NumOfUnitaryVectors],
                                                              nd_coords_face_d,
                                                              Z_face_pw_d,
                                                              &InvE_d[exciOffset * tetStride],
                                                              &En1_d[fieldBase]);
                }
            }
            else
            {
                addExcitationE_port<<<numBlocks, blockSize>>>(&ExcitationFacesCnt_d[exciOffset],
                                                               &ExcitationFacesOffset_d[exciOffset],
                                                               ExcitationFacesNum_d,
                                                               exciCount,
                                                               &mapE_d[exciOffset * (long long)tetDim],
                                                               ExcitationProps_d,
                                                               PortFacePidx_d,
                                                               PolyFlag,
                                                               dt /Eo, t,
                                                               &nd_coords_tet_d[exciOffset * NumOfNodes * NumOfUnitaryVectors],
                                                               nd_coords_face_d,
                                                               &InvE_d[exciOffset * tetStride],
                                                               &En1_d[fieldBase]);
            }
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
        }

        // Coupling matrices
        const int neighCount     = classNeighIrregular_h[class_i];
        const int neighLocOffset = classNeighOffset_loc_h[class_i];
        numBlocks = (neighCount * faceDim + blockSize - 1) / blockSize;
        makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * faceDim], Hn12_d, auxFieldInput, neighCount * faceDim); //Implement 3D if tetras over blocksize * (2^(31) - 1)
        matVecPerElement(&Neigh1E_d[neighLocOffset * neighStride], faceDim, auxFieldInput, 0.0f, auxFieldOutput, neighCount, "Neigh1E * Hn12(neigh)");
        makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * faceDim], En_d, auxFieldInput, neighCount * faceDim); //Implement 3D if tetras over blocksize * (2^(31) - 1)
        matVecPerElement(&Neigh2E_d[neighLocOffset * neighStride], faceDim, auxFieldInput, 1.0f, auxFieldOutput, neighCount, "Neigh2E * En(neigh)");

        numBlocks = (irregularTetras + blockY - 1) / blockY;
        addCouplingResults<<<numBlocks, couplingBlock>>>(&En1_d[classOffset * tetDim], auxFieldOutput, &Neighbours_d[classOffset], &NeighboursOffset_d[classOffset], irregularTetras); //Implement 3D if tetras over blocksize * (2^(31) - 1)
    }
    // ------------------------------------------------------------------
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // Regular (non-PML) groups: one shared matrix set per group
    if(regularRegionFlag && classRegularGroupsCnt_h[class_i] > 0)
    {
        for(int i = 0; i < classRegularGroupsCnt_h[class_i]; i++)
        {
            int groupID = classRegularGroupsId_h[class_i][i];
            int groupElements = classRegularTetraCnt_h[class_i * regularCNT + groupID];
            int groupOffset = classOffset + nonRegularTetraCnt_h[class_i] + classRegularTetraOffset_h[class_i][i];
            CUDA_SAFE_CALL(cudaDeviceSynchronize());

            // Local matrices
            gemm(tetDim, groupElements, tetDim,
                 &regularLoc1E_d[(groupID - 1) * tetDim * tetDim],
                 &En_d[groupOffset * tetDim], 0.0f,
                 &En1_d[groupOffset * tetDim], "regularLoc1E * En");
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            gemm(tetDim, groupElements, tetDim,
                 &regularLoc2E_d[(groupID - 1) * tetDim * tetDim],
                 &Hn12_d[groupOffset * tetDim], 1.0f,
                 &En1_d[groupOffset * tetDim], "regularLoc2E * Hn12");
            CUDA_SAFE_CALL(cudaDeviceSynchronize());

            // Coupling matrices: one GEMM per face (batch = NumOfFaces)
            int regularNeighOffset = neighOffset + classNeighIrregular_h[class_i] + classRegularTetraOffset_h[class_i][i] * NumOfFaces;
            const long long faceInStride  = faceDim * groupElements;
            const long long faceOutStride = tetDim * groupElements;
            numBlocks = (groupElements * faceDim * NumOfFaces + blockSize - 1) / blockSize;
            makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * faceDim], Hn12_d, auxFieldInput, groupElements, PolyFlag);
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            gemmBatched(tetDim, groupElements, faceDim,
                        &regularNeigh1E_d[(groupID - 1) * NumOfFaces * neighStride], neighStride,
                        auxFieldInput, faceInStride,
                        0.0f,
                        auxFieldOutput, faceOutStride,
                        NumOfFaces, "regularNeigh1E * Hn12(neigh)");
            makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * faceDim], En_d, auxFieldInput, groupElements, PolyFlag);
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            gemmBatched(tetDim, groupElements, faceDim,
                        &regularNeigh2E_d[(groupID - 1) * NumOfFaces * neighStride], neighStride,
                        auxFieldInput, faceInStride,
                        1.0f,
                        auxFieldOutput, faceOutStride,
                        NumOfFaces, "regularNeigh2E * En(neigh)");

            numBlocks = (groupElements + blockY - 1) / blockY;
            addCouplingResultsRegular<<<numBlocks, couplingBlock>>>(&En1_d[groupOffset * tetDim], auxFieldOutput, groupElements);
        }
    }
    // ------------------------------------------------------------------
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // --------------------------------------------
    // PML Section
    const int PMLTetras = nonRegularPMLTetraCnt_h[class_i];
    classOffset = classPMLTetraOffset_h[class_i];
    neighOffset = classNeighPMLOffset_h[class_i];
    if(PMLTetras > 0)
    {
        const int pmlLocOffset = classPMLTetraOffset_loc_h[class_i];
        const long long fieldBase = (long long)classOffset * tetDim;  // into En / Hn12 / En1
        const long long auxBase   = (long long)pmlLocOffset * tetDim; // into Jn12 / Jn32
        CUDA_SAFE_CALL(cudaDeviceSynchronize());

        // Auxiliary J: Jn32 = AuxJ1 * Jn12 + AuxJ2 * En
        matVecPerElement(&AuxJ1_d[pmlLocOffset * tetStride], tetDim, &Jn12_d[auxBase], 0.0f, &Jn32_d[auxBase], PMLTetras, "AuxJ1 * Jn12");
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
        matVecPerElement(&AuxJ2_d[pmlLocOffset * tetStride], tetDim, &En_d[fieldBase], 1.0f, &Jn32_d[auxBase], PMLTetras, "AuxJ2 * En");
        CUDA_SAFE_CALL(cudaDeviceSynchronize());

        // En1 = Loc1E_PML * En + Loc2E_PML * Hn12 + AuxE * Jn32
        matVecPerElement(&Loc1E_PML_d[pmlLocOffset * tetStride], tetDim, &En_d[fieldBase], 0.0f, &En1_d[fieldBase], PMLTetras, "Loc1E_PML * En");
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
        matVecPerElement(&Loc2E_PML_d[pmlLocOffset * tetStride], tetDim, &Hn12_d[fieldBase], 1.0f, &En1_d[fieldBase], PMLTetras, "Loc2E_PML * Hn12");
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
        matVecPerElement(&AuxE_d[pmlLocOffset * tetStride], tetDim, &Jn32_d[auxBase], 1.0f, &En1_d[fieldBase], PMLTetras, "AuxE * Jn32");
        CUDA_SAFE_CALL(cudaDeviceSynchronize());

        // Coupling matrices
        const int neighCount     = classNeighPML_h[class_i];
        const int neighLocOffset = classNeighPMLOffset_loc_h[class_i];
        numBlocks = (neighCount * faceDim + blockSize - 1) / blockSize;
        makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * faceDim], Hn12_d, auxFieldInput, neighCount * faceDim); //Implement 3D if tetras over blocksize * (2^(31) - 1)
        matVecPerElement(&Neigh1E_PML_d[neighLocOffset * neighStride], faceDim, auxFieldInput, 0.0f, auxFieldOutput, neighCount, "Neigh1E_PML * Hn12(neigh)");
        makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * faceDim], En_d, auxFieldInput, neighCount * faceDim); //Implement 3D if tetras over blocksize * (2^(31) - 1)
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
        matVecPerElement(&Neigh2E_PML_d[neighLocOffset * neighStride], faceDim, auxFieldInput, 1.0f, auxFieldOutput, neighCount, "Neigh2E_PML * En(neigh)");
        CUDA_SAFE_CALL(cudaDeviceSynchronize());

        numBlocks = (PMLTetras + blockY - 1) / blockY;
        //Implement 3D if tetras over blocksize * (2^(31) - 1)
        addCouplingResults<<<numBlocks, couplingBlock>>>(&En1_d[classPMLTetraOffset_h[class_i] * tetDim], auxFieldOutput, &Neighbours_d[classOffset], &NeighboursOffset_d[classOffset], PMLTetras);
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
    }

    // Regular PML groups
    if(regularRegionFlag && classRegularPMLGroupsCnt_h[class_i] > 0)
    {
        for(int i = 0; i < classRegularPMLGroupsCnt_h[class_i]; i++)
        {
            int groupID = classRegularPMLGroupsId_h[class_i][i];
            int groupElements = classRegularTetraCnt_h[class_i * regularCNT + groupID];
            int groupOffset = classOffset + nonRegularPMLTetraCnt_h[class_i] + classRegularPMLTetraOffset_h[class_i][i];
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            int local_index = groupID - 1 - regularCNT_Normal;           // index into the regularPML* matrix arrays
            int aux_offset = classRegularPMLTetraOffset_h[class_i][i];   // offset into r_Jn12 / r_Jn32

            // Auxiliary J: r_Jn32 = regularPMLLoc1J * r_Jn12 + regularPMLLoc2J * En
            gemm(tetDim, groupElements, tetDim,
                 &regularPMLLoc1J_d[(local_index) * tetDim * tetDim],
                 &r_Jn12_d[aux_offset * tetDim], 0.0f,
                 &r_Jn32_d[aux_offset * tetDim], "regularPMLLoc1J * r_Jn12");
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            // NOTE(review): En_d is indexed with aux_offset here, whereas every other
            // En_d / Hn12_d access of this group uses groupOffset (and the irregular PML
            // path uses the class field offset). This looks like it should be
            // groupOffset — left unchanged pending confirmation.
            gemm(tetDim, groupElements, tetDim,
                 &regularPMLLoc2J_d[(local_index) * tetDim * tetDim],
                 &En_d[aux_offset * tetDim], 1.0f,
                 &r_Jn32_d[aux_offset * tetDim], "regularPMLLoc2J * En");

            // En1 = regularPMLLoc1E * En + regularPMLLoc2E * Hn12 + regularPMLAuxE * r_Jn32
            gemm(tetDim, groupElements, tetDim,
                 &regularPMLLoc1E_d[(local_index) * tetDim * tetDim],
                 &En_d[groupOffset * tetDim], 0.0f,
                 &En1_d[groupOffset * tetDim], "regularPMLLoc1E * En");
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            gemm(tetDim, groupElements, tetDim,
                 &regularPMLLoc2E_d[(local_index) * tetDim * tetDim],
                 &Hn12_d[groupOffset * tetDim], 1.0f,
                 &En1_d[groupOffset * tetDim], "regularPMLLoc2E * Hn12");
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            gemm(tetDim, groupElements, tetDim,
                 &regularPMLAuxE_d[(local_index) * tetDim * tetDim],
                 &r_Jn32_d[aux_offset * tetDim], 1.0f,
                 &En1_d[groupOffset * tetDim], "regularPMLAuxE * r_Jn32");
            CUDA_SAFE_CALL(cudaDeviceSynchronize());

            // Coupling matrices: one GEMM per face (batch = NumOfFaces)
            int regularNeighOffset = neighOffset + classNeighPML_h[class_i] + classRegularPMLTetraOffset_h[class_i][i] * NumOfFaces;
            const long long faceInStride  = faceDim * groupElements;
            const long long faceOutStride = tetDim * groupElements;
            numBlocks = (groupElements * faceDim * NumOfFaces + blockSize - 1) / blockSize;
            makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * faceDim], Hn12_d, auxFieldInput, groupElements, PolyFlag);
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            gemmBatched(tetDim, groupElements, faceDim,
                        &regularPMLNeigh1E_d[(local_index) * NumOfFaces * neighStride], neighStride,
                        auxFieldInput, faceInStride,
                        0.0f,
                        auxFieldOutput, faceOutStride,
                        NumOfFaces, "regularPMLNeigh1E * Hn12(neigh)");
            makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * faceDim], En_d, auxFieldInput, groupElements, PolyFlag);
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            gemmBatched(tetDim, groupElements, faceDim,
                        &regularPMLNeigh2E_d[(local_index) * NumOfFaces * neighStride], neighStride,
                        auxFieldInput, faceInStride,
                        1.0f,
                        auxFieldOutput, faceOutStride,
                        NumOfFaces, "regularPMLNeigh2E * En(neigh)");

            numBlocks = (groupElements + blockY - 1) / blockY;
            addCouplingResultsRegular<<<numBlocks, couplingBlock>>>(&En1_d[groupOffset * tetDim], auxFieldOutput, groupElements);
        }
    }
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // Previous, class-restricted variant of the copy below (kept for reference):
    /*
    int total_tets = ClassTetraCnt[class_i] + ClassPMLTetraCnt[class_i];
    int offset = ClassTetraOffset[class_i];
    CUDA_SAFE_CALL(cudaMemcpy(&En_d[offset * TetPolyOrderDim[PolyFlag]], &En1_d[offset * TetPolyOrderDim[PolyFlag]],
    total_tets * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
    */
    // Publish the new E field: the whole En1_d array replaces En_d
    CUDA_SAFE_CALL(cudaMemcpy(&En_d[0], &En1_d[0], tetraCNT * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // Advance the auxiliary J of the irregular PML tetrahedra of this class
    if(nonRegularPMLTetraCnt_h[class_i] > 0)
    {
        int num_PML_tets = nonRegularPMLTetraCnt_h[class_i];
        int matrixOffset = classPMLTetraOffset_loc_h[class_i];
        CUDA_SAFE_CALL(cudaMemcpy(&Jn12_d[matrixOffset * tetDim], &Jn32_d[matrixOffset * tetDim],
                                  num_PML_tets * tetDim * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
    }
    // NOTE(review): the regular-PML loop above writes r_Jn32_d, yet this copy moves
    // Jn32_d -> Jn12_d (numRegPMLTetras blocks from index 0); r_Jn12_d is not updated
    // anywhere in this function. Also, unlike the loop, this test is not guarded by
    // regularRegionFlag. Both left unchanged pending confirmation.
    if(classRegularPMLGroupsCnt_h[class_i] > 0)
    {
        int num_PML_tets = numRegPMLTetras;
        CUDA_SAFE_CALL(cudaMemcpy(&Jn12_d[0], &Jn32_d[0], num_PML_tets * tetDim * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
    }
}
void FemGrp::LH_CuBLAS(cublasHandle_t handle, int class_i){
const int Q = GAUSS_POINT_NUM_h[PolyFlag]; // same as GPU kernel uses
int irregularTetras = nonRegularTetraCnt_h[class_i];
int classOffset = ClassTetraOffset[class_i];
int neighOffset = NeighClassOffset_h[class_i];
int blockSize = 256; //optimal number
int numBlocks;
if(irregularTetras > 0)
{
// Local Mattrices
int nMatrices = irregularTetras;
int matrixOffset = classTetraOffset_loc_h[class_i];
int m = TetPolyOrderDim[PolyFlag]; //rows of A
int n = TetPolyOrderDim[PolyFlag]; //rows of B and cols of A
long long int strideA = m * n;
long long int strideB = n;
long long int strideC = m;
float alpha = 1.0;
float beta = 0.0;
cublasSgemmStridedBatched(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, 1, n,
&alpha,
&Loc1H_d[matrixOffset * strideA], m,
strideA,
&Hn12_d[classOffset * strideB], n,
strideB,
&beta,
&Hn32_d[classOffset * strideC], m,
strideC,
nMatrices);
beta = 1.0;
cublasSgemmStridedBatched(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, 1, n,
&alpha,
&Loc2H_d[matrixOffset * strideA], m,
strideA,
&En_d[classOffset * strideB], n,
strideB,
&beta,
&Hn32_d[classOffset * strideC], m,
strideC,
nMatrices);
if(ClassExcitationCount[class_i] > 0){
nMatrices = ClassExcitationCount[class_i];
matrixOffset = ClassExcitationOffset[class_i];
numBlocks = (nMatrices + blockSize - 1) / blockSize;
fp_t_ts dt = LocTimeSteps[class_i];
fp_t_ts t = (LocalExciIndexH[class_i] + 1.0) * dt;
LocalExciIndexH[class_i]++;
if (PWorPort == 0)
{
if (interior_excitation_flag)
{
addExcitationH_PML<<<numBlocks, blockSize>>>(&ExcitationFacesCnt_d[matrixOffset],
&ExcitationFacesOffset_d[matrixOffset],
ExcitationFacesNum_d,
nMatrices,
ClassExcitation_sc_CNT[class_i],
&mapH_d[matrixOffset * strideC],
excitationProp,
PolyFlag,
dt / Uo, t,
&nd_coords_tet_d[4 * 3 * matrixOffset],
nd_coords_face_d,
Z_face_pw_d,
&InvH_d[strideA * matrixOffset],
&Hn32_d[classOffset * strideC]);
}
else
{
addExcitationH<<<numBlocks, blockSize>>>(&ExcitationFacesCnt_d[matrixOffset],
&ExcitationFacesOffset_d[matrixOffset],
ExcitationFacesNum_d,
nMatrices,
&mapH_d[matrixOffset * strideC],
excitationProp,
PolyFlag,
dt / Uo, t,
&nd_coords_tet_d[4 * 3 * matrixOffset],
nd_coords_face_d,
Z_face_pw_d,
&InvH_d[strideA * matrixOffset],
&Hn32_d[classOffset * strideC]);
}
}
else
{
addExcitationH_port<<<numBlocks, blockSize>>>(&ExcitationFacesCnt_d[matrixOffset],
&ExcitationFacesOffset_d[matrixOffset],
ExcitationFacesNum_d,
nMatrices,
&mapH_d[matrixOffset * strideC],
ExcitationProps_d,
PortFacePidx_d,
PolyFlag,
dt / Uo, t,
&nd_coords_tet_d[4 * 3 * matrixOffset],
nd_coords_face_d,
&InvH_d[strideA * matrixOffset],
&Hn32_d[classOffset * strideC]);
}
}
// Coupling Matrices
nMatrices = classNeighIrregular_h[class_i];
matrixOffset = classNeighOffset_loc_h[class_i];
m = TetPolyOrderDim[PolyFlag]; //rows of A
n = FacePolyOrderDim[PolyFlag]; //rows of B and cols of A
strideA = m * n;
strideB = n;
strideC = m;
numBlocks = (nMatrices * n + blockSize - 1) / blockSize;
makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * n], En_d, auxFieldInput, nMatrices * n); //Implement 3D if tetras over blocksize * (2^(31) - 1)
beta = 0.0;
cublasSgemmStridedBatched(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, 1, n,
&alpha,
&Neigh1H_d[matrixOffset * strideA], m,
strideA,
auxFieldInput, n,
strideB,
&beta,
auxFieldOutput, m,
strideC,
nMatrices);
makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * n], Hn12_d, auxFieldInput, nMatrices * n); //Implement 3D if tetras over blocksize * (2^(31) - 1)
beta = 1.0;
cublasSgemmStridedBatched(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, 1, n,
&alpha,
&Neigh2H_d[matrixOffset * strideA], m,
strideA,
auxFieldInput, n,
strideB,
&beta,
auxFieldOutput, m,
strideC,
nMatrices);
int blockY = (blockSize + TetPolyOrderDim[PolyFlag] - 1) / TetPolyOrderDim[PolyFlag];
dim3 blockDim(TetPolyOrderDim[PolyFlag], blockY, 1);
numBlocks = (irregularTetras + blockY - 1) / blockY;
addCouplingResults<<<numBlocks, blockDim>>>(&Hn32_d[classOffset * TetPolyOrderDim[PolyFlag]], auxFieldOutput, &Neighbours_d[classOffset], &NeighboursOffset_d[classOffset], irregularTetras); //Implement 3D if tetras over blocksize * (2^(31) - 1)
}
// --------------------------------------------------------------------------------------------------
CUDA_SAFE_CALL(cudaDeviceSynchronize());
if(regularRegionFlag && classRegularGroupsCnt_h[class_i] > 0)
{
for(int i = 0; i < classRegularGroupsCnt_h[class_i]; i++)
{
int groupID = classRegularGroupsId_h[class_i][i];
int groupElements = classRegularTetraCnt_h[class_i * regularCNT + groupID];
int groupOffset = classOffset + nonRegularTetraCnt_h[class_i] + classRegularTetraOffset_h[class_i][i];
// Local Matrices
int m = TetPolyOrderDim[PolyFlag]; //rows of A
int n = TetPolyOrderDim[PolyFlag]; //rows of B and cols of A
float alpha = 1.0;
float beta = 0.0;
cublasSgemm(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, groupElements, n,
&alpha,
&regularLoc1H_d[(groupID - 1) * m * n], m,
&Hn12_d[groupOffset * n], n,
&beta,
&Hn32_d[groupOffset * m], m);
beta = 1.0;
cublasSgemm(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, groupElements, n,
&alpha,
&regularLoc2H_d[(groupID - 1) * m * n], m,
&En_d[groupOffset * n], n,
&beta,
&Hn32_d[groupOffset * m], m);
// Coupling Matrices
int regularNeighOffset = neighOffset + classNeighIrregular_h[class_i] + classRegularTetraOffset_h[class_i][i] * NumOfFaces;
m = TetPolyOrderDim[PolyFlag]; //rows of A
n = FacePolyOrderDim[PolyFlag]; //rows of B and cols of A
numBlocks = (groupElements * n * NumOfFaces + blockSize - 1) / blockSize;
makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * n], En_d, auxFieldInput, groupElements, PolyFlag);
long long int strideA = m * n;
long long int strideB = n * groupElements;
long long int strideC = m * groupElements;
beta = 0.0;
cublasSgemmStridedBatched(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, groupElements, n,
&alpha,
&regularNeigh1H_d[(groupID - 1) * NumOfFaces * strideA], m,
strideA,
auxFieldInput, n,
strideB,
&beta,
auxFieldOutput, m,
strideC,
NumOfFaces);
makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * n], Hn12_d, auxFieldInput, groupElements, PolyFlag);
beta = 1.0;
cublasSgemmStridedBatched(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, groupElements, n,
&alpha,
&regularNeigh2H_d[(groupID - 1) * NumOfFaces * strideA], m,
strideA,
auxFieldInput, n,
strideB,
&beta,
auxFieldOutput, m,
strideC,
NumOfFaces);
int blockY = (blockSize + TetPolyOrderDim[PolyFlag] - 1) / TetPolyOrderDim[PolyFlag];
dim3 blockDim(TetPolyOrderDim[PolyFlag], blockY, 1);
numBlocks = (groupElements + blockY - 1) / blockY;
addCouplingResultsRegular<<<numBlocks, blockDim>>>(&Hn32_d[groupOffset * TetPolyOrderDim[PolyFlag]], auxFieldOutput, groupElements);
CUDA_SAFE_CALL(cudaDeviceSynchronize()); // make sure prior kernels/GEMMs finished
}
}
// -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
// --------------------------------------------
// PML Section
int PMLTetras = nonRegularPMLTetraCnt_h[class_i];
classOffset = classPMLTetraOffset_h[class_i];
neighOffset = classNeighPMLOffset_h[class_i];
if(PMLTetras > 0)
{
// Local Mattrices
int nMatrices = PMLTetras;
int matrixOffset = classPMLTetraOffset_loc_h[class_i];
int m = TetPolyOrderDim[PolyFlag]; //rows of A
int n = TetPolyOrderDim[PolyFlag]; //rows of B and cols of A
long long int strideA = m * n;
long long int strideB = n;
long long int strideC = m;
float alpha = 1.0;
float beta = 0.0;
// --------------------------------------------------------
// Auxilliary M
cublasSgemmStridedBatched(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, 1, n,
&alpha,
&AuxM1_d[matrixOffset * strideA], m,
strideA,
&Mn_d[matrixOffset * strideB], n,
strideB,
&beta,
&Mn1_d[matrixOffset * strideC], m,
strideC,
nMatrices);
CUDA_SAFE_CALL(cudaDeviceSynchronize());
beta = 1.0;
cublasSgemmStridedBatched(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, 1, n,
&alpha,
&AuxM2_d[matrixOffset * strideA], m,
strideA,
&Hn12_d[classOffset * strideB], n,
strideB,
&beta,
&Mn1_d[matrixOffset * strideC], m,
strideC,
nMatrices);
// --------------------------------------------------------
alpha = 1.0;
beta = 0.0;
cublasSgemmStridedBatched(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, 1, n,
&alpha,
&Loc1H_PML_d[matrixOffset * strideA], m,
strideA,
&Hn12_d[classOffset * strideB], n,
strideB,
&beta,
&Hn32_d[classOffset * strideC], m,
strideC,
nMatrices);
beta = 1.0;
cublasSgemmStridedBatched(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, 1, n,
&alpha,
&Loc2H_PML_d[matrixOffset * strideA], m,
strideA,
&En_d[classOffset * strideB], n,
strideB,
&beta,
&Hn32_d[classOffset * strideC], m,
strideC,
nMatrices);
// Add Auxilliary Term M
beta = 1.0;
cublasSgemmStridedBatched(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, 1, n,
&alpha,
&AuxH_d[matrixOffset * strideA], m,
strideA,
&Mn1_d[matrixOffset * strideB], n,
strideB,
&beta,
&Hn32_d[classOffset * strideC], m,
strideC,
nMatrices);
// Coupling Matrices
nMatrices = classNeighPML_h[class_i];
matrixOffset = classNeighPMLOffset_loc_h[class_i];
// cout << start << " " << nMatrices << " " << start + nMatrices << endl;
m = TetPolyOrderDim[PolyFlag]; //rows of A
n = FacePolyOrderDim[PolyFlag]; //rows of B and cols of A
strideA = m * n;
strideB = n;
strideC = m;
numBlocks = (nMatrices * n + blockSize - 1) / blockSize;
makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * n], En_d, auxFieldInput, nMatrices * n); //Implement 3D if tetras over blocksize * (2^(31) - 1)
beta = 0.0;
cublasSgemmStridedBatched(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, 1, n,
&alpha,
&Neigh1H_PML_d[matrixOffset * strideA], m,
strideA,
auxFieldInput, n,
strideB,
&beta,
auxFieldOutput, m,
strideC,
nMatrices);
makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * n], Hn12_d, auxFieldInput, nMatrices * n); //Implement 3D if tetras over blocksize * (2^(31) - 1)
beta = 1.0;
cublasSgemmStridedBatched(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, 1, n,
&alpha,
&Neigh2H_PML_d[matrixOffset * strideA], m,
strideA,
auxFieldInput, n,
strideB,
&beta,
auxFieldOutput, m,
strideC,
nMatrices);
int blockY = (blockSize + TetPolyOrderDim[PolyFlag] - 1) / TetPolyOrderDim[PolyFlag];
dim3 blockDim(TetPolyOrderDim[PolyFlag], blockY, 1);
numBlocks = (PMLTetras + blockY - 1) / blockY;
//Implement 3D if tetras over blocksize * (2^(31) - 1)
addCouplingResults<<<numBlocks, blockDim>>>(&Hn32_d[classPMLTetraOffset_h[class_i] * TetPolyOrderDim[PolyFlag]], auxFieldOutput, &Neighbours_d[classOffset], &NeighboursOffset_d[classOffset], PMLTetras);
CUDA_SAFE_CALL(cudaDeviceSynchronize());
}
if(regularRegionFlag && classRegularPMLGroupsCnt_h[class_i] > 0)
{
for(int i = 0; i < classRegularPMLGroupsCnt_h[class_i]; i++)
{
int groupID = classRegularPMLGroupsId_h[class_i][i];
int groupElements = classRegularTetraCnt_h[class_i * regularCNT + groupID];
int groupOffset = classOffset + nonRegularPMLTetraCnt_h[class_i] + classRegularPMLTetraOffset_h[class_i][i];
CUDA_SAFE_CALL(cudaDeviceSynchronize());
int local_index = groupID - 1 - regularCNT_Normal;
int aux_offset = classRegularPMLTetraOffset_h[class_i][i];
// Local Matrices
int m = TetPolyOrderDim[PolyFlag]; //rows of A
int n = TetPolyOrderDim[PolyFlag]; //rows of B and cols of A
// --------------------------------------------------------
// Auxilliary M
float alpha = 1.0;
float beta = 0.0;
cublasSgemm(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, groupElements, n,
&alpha,
&regularPMLLoc1M_d[(local_index) * m * n], m,
&r_Mn_d[aux_offset * n], n,
&beta,
&r_Mn1_d[aux_offset * m], m);
CUDA_SAFE_CALL(cudaDeviceSynchronize());
beta = 1.0;
cublasSgemm(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, groupElements, n,
&alpha,
&regularPMLLoc2M_d[(local_index) * m * n], m,
&Hn12_d[aux_offset * n], n,
&beta,
&r_Mn1_d[aux_offset * m], m);
CUDA_SAFE_CALL(cudaDeviceSynchronize());
// --------------------------------------------------------
beta = 0.0;
cublasSgemm(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, groupElements, n,
&alpha,
&regularPMLLoc1H_d[(local_index) * m * n], m,
&Hn12_d[groupOffset * n], n,
&beta,
&Hn32_d[groupOffset * m], m);
CUDA_SAFE_CALL(cudaDeviceSynchronize());
beta = 1.0;
cublasSgemm(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, groupElements, n,
&alpha,
&regularPMLLoc2H_d[(local_index) * m * n], m,
&En_d[groupOffset * n], n,
&beta,
&Hn32_d[groupOffset * m], m);
CUDA_SAFE_CALL(cudaDeviceSynchronize());
beta = 1.0;
cublasSgemm(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, groupElements, n,
&alpha,
&regularPMLAuxH_d[(local_index) * m * n], m,
&r_Mn1_d[aux_offset * n], n,
&beta,
&Hn32_d[groupOffset * m], m);
CUDA_SAFE_CALL(cudaDeviceSynchronize());
// Coupling Matrices
int regularNeighOffset = neighOffset + classNeighPML_h[class_i] + classRegularPMLTetraOffset_h[class_i][i] * NumOfFaces;
m = TetPolyOrderDim[PolyFlag]; //rows of A
n = FacePolyOrderDim[PolyFlag]; //rows of B and cols of A
numBlocks = (groupElements * n * NumOfFaces + blockSize - 1) / blockSize;
makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * n], En_d, auxFieldInput, groupElements, PolyFlag);
long long int strideA = m * n;
long long int strideB = n * groupElements;
long long int strideC = m * groupElements;
CUDA_SAFE_CALL(cudaDeviceSynchronize());
beta = 0.0;
cublasSgemmStridedBatched(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, groupElements, n,
&alpha,
&regularPMLNeigh1H_d[(local_index) * NumOfFaces * strideA], m,
strideA,
auxFieldInput, n,
strideB,
&beta,
auxFieldOutput, m,
strideC,
NumOfFaces);
makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * n], Hn12_d, auxFieldInput, groupElements, PolyFlag);
CUDA_SAFE_CALL(cudaDeviceSynchronize());
beta = 1.0;
cublasSgemmStridedBatched(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, groupElements, n,
&alpha,
&regularPMLNeigh2H_d[(local_index) * NumOfFaces * strideA], m,
strideA,
auxFieldInput, n,
strideB,
&beta,
auxFieldOutput, m,
strideC,
NumOfFaces);
int blockY = (blockSize + TetPolyOrderDim[PolyFlag] - 1) / TetPolyOrderDim[PolyFlag];
dim3 blockDim(TetPolyOrderDim[PolyFlag], blockY, 1);
numBlocks = (groupElements + blockY - 1) / blockY;
addCouplingResultsRegular<<<numBlocks, blockDim>>>(&Hn32_d[groupOffset * TetPolyOrderDim[PolyFlag]], auxFieldOutput, groupElements);
}
}
CUDA_SAFE_CALL(cudaDeviceSynchronize());
/*
int total_tets = ClassTetraCnt[class_i] + ClassPMLTetraCnt[class_i];
int offset = ClassTetraOffset[class_i];
CUDA_SAFE_CALL(cudaMemcpy(&Hn12_d[ offset * TetPolyOrderDim[PolyFlag]], &Hn32_d[offset * TetPolyOrderDim[PolyFlag]],
total_tets * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
cudaDeviceSynchronize();
*/
CUDA_SAFE_CALL(cudaMemcpy(&Hn12_d[0], &Hn32_d[0], tetraCNT * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
cudaDeviceSynchronize();
if(nonRegularPMLTetraCnt_h[class_i] > 0)
{
int num_PML_tets = nonRegularPMLTetraCnt_h[class_i];
int matrixOffset = classPMLTetraOffset_loc_h[class_i];
CUDA_SAFE_CALL(cudaMemcpy(&Mn_d[matrixOffset * TetPolyOrderDim[PolyFlag]], &Mn1_d[matrixOffset * TetPolyOrderDim[PolyFlag]], num_PML_tets * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
CUDA_SAFE_CALL(cudaDeviceSynchronize());
}
if(classRegularPMLGroupsCnt_h[class_i] > 0)
{
int num_PML_tets = numRegPMLTetras;
CUDA_SAFE_CALL(cudaMemcpy(&r_Mn_d[0], &r_Mn1_d[0], num_PML_tets * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
CUDA_SAFE_CALL(cudaDeviceSynchronize());
}
}
// Releases the device buffers listed below with cudaFree and resets every
// released pointer to nullptr.
//
// Resetting matters because cudaFree on a null pointer performs no operation:
// a second FreeGPU() call (or a later guarded release of the same member) is
// then harmless instead of freeing the same allocation twice.
//
// The plane-wave impedance table is released only when PlaneWaveBCFlag is set,
// and the regular-region matrices only when regularRegionFlag is set, exactly
// as before.
void FemGrp::FreeGPU(){
// Function-local helper macro (undefined again at the end of the body). A macro
// rather than a helper function keeps the original pointer name and source line
// visible to whatever diagnostics CUDA_SAFE_CALL produces.
#define FEMGRP_FREE_AND_CLEAR(devPtr)            \
    do {                                         \
        CUDA_SAFE_CALL(cudaFree(devPtr));        \
        (devPtr) = nullptr;                      \
    } while (0)

    // Index maps and excitation bookkeeping
    FEMGRP_FREE_AND_CLEAR(mapE_d);
    FEMGRP_FREE_AND_CLEAR(mapH_d);
    FEMGRP_FREE_AND_CLEAR(ExcitationFacesCnt_d);
    FEMGRP_FREE_AND_CLEAR(ExcitationFacesOffset_d);
    FEMGRP_FREE_AND_CLEAR(ExcitationFacesNum_d);
    FEMGRP_FREE_AND_CLEAR(nd_coords_tet_d);
    FEMGRP_FREE_AND_CLEAR(nd_coords_face_d);
    if(PlaneWaveBCFlag){
        FEMGRP_FREE_AND_CLEAR(Z_face_pw_d);
    }

    // Per-element matrices
    FEMGRP_FREE_AND_CLEAR(InvE_d);
    FEMGRP_FREE_AND_CLEAR(InvH_d);
    FEMGRP_FREE_AND_CLEAR(Loc1E_d);
    FEMGRP_FREE_AND_CLEAR(Loc2E_d);
    FEMGRP_FREE_AND_CLEAR(Loc1H_d);
    FEMGRP_FREE_AND_CLEAR(Loc2H_d);
    FEMGRP_FREE_AND_CLEAR(Neigh1E_d);
    FEMGRP_FREE_AND_CLEAR(Neigh2E_d);
    FEMGRP_FREE_AND_CLEAR(Neigh1H_d);
    FEMGRP_FREE_AND_CLEAR(Neigh2H_d);

    // Matrices of the regular region, released only when that region is enabled
    if(regularRegionFlag){
        FEMGRP_FREE_AND_CLEAR(regularLoc1E_d);
        FEMGRP_FREE_AND_CLEAR(regularLoc2E_d);
        FEMGRP_FREE_AND_CLEAR(regularLoc1H_d);
        FEMGRP_FREE_AND_CLEAR(regularLoc2H_d);
        FEMGRP_FREE_AND_CLEAR(regularNeigh1E_d);
        FEMGRP_FREE_AND_CLEAR(regularNeigh2E_d);
        FEMGRP_FREE_AND_CLEAR(regularNeigh1H_d);
        FEMGRP_FREE_AND_CLEAR(regularNeigh2H_d);
    }

    // Field vectors, neighbour tables and coupling scratch buffers
    FEMGRP_FREE_AND_CLEAR(En_d);
    FEMGRP_FREE_AND_CLEAR(En1_d);
    FEMGRP_FREE_AND_CLEAR(Hn12_d);
    FEMGRP_FREE_AND_CLEAR(Hn32_d);
    FEMGRP_FREE_AND_CLEAR(NeighMap_d);
    FEMGRP_FREE_AND_CLEAR(Neighbours_d);
    FEMGRP_FREE_AND_CLEAR(auxFieldInput);
    FEMGRP_FREE_AND_CLEAR(auxFieldOutput);

#undef FEMGRP_FREE_AND_CLEAR
}
// Refactored by Qi Jian to build the octree of the tetrahedra
// Builds octree_object over all tetrahedra of the mesh.
//
// Steps:
//   1. store one enlarged axis-aligned bounding box per tetrahedron,
//   2. merge those boxes into a global bounding box,
//   3. derive the octree depth from the domain size relative to a box of
//      100 wavelengths (never below 1),
//   4. build the octree, optionally passing the non-conformal tetrahedra IDs,
//   5. hand tetARRAY / tetraCNT to the octree.
//
// Parameters:
//   prjName            - not used by this routine; kept so callers stay valid.
//   non_Conformal_flag - when true, the IDs in ncARRAY (nonConformalCNT entries)
//                        are forwarded to buildOctree_withNCFLAGS.
void FemGrp::initializeOctree(std::string prjName, bool non_Conformal_flag)
{
    (void)prjName; // intentionally unused

    cout << "========================================================== \n";
    // Initialize octree object
    octree_object = Octree();

    // Compute AABB for each tetrahedron.
    // (An earlier variant stored the tight, unbuffered boxes; it was replaced
    //  by the enlarged boxes computed below.)
    std::cout << "Compute AABB for tetrahedral (with buffer)" << std::endl;
    octree_object.tetra_boxes.resize(tetraCNT);

    // Multiplier applied to the half-extents of every box: 2.0 doubles each box
    // about its centre.
    const double buffer_factor = 2.0;

    #pragma omp parallel for
    for (int tet_id = 0; tet_id < tetraCNT; ++tet_id)
    {
        const tetra& tet = tetARRAY[tet_id];
        double x[4], y[4], z[4];
        for (int i = 0; i < 4; ++i)
        {
            x[i] = tet.nd[i]->getCoord().getx();
            y[i] = tet.nd[i]->getCoord().gety();
            z[i] = tet.nd[i]->getCoord().getz();
        }

        const double xmin = std::min({x[0], x[1], x[2], x[3]});
        const double xmax = std::max({x[0], x[1], x[2], x[3]});
        const double ymin = std::min({y[0], y[1], y[2], y[3]});
        const double ymax = std::max({y[0], y[1], y[2], y[3]});
        const double zmin = std::min({z[0], z[1], z[2], z[3]});
        const double zmax = std::max({z[0], z[1], z[2], z[3]});

        // Compute center and half-sizes
        const double cx = 0.5 * (xmin + xmax);
        const double cy = 0.5 * (ymin + ymax);
        const double cz = 0.5 * (zmin + zmax);
        double hx = 0.5 * (xmax - xmin);
        double hy = 0.5 * (ymax - ymin);
        double hz = 0.5 * (zmax - zmin);

        // Apply buffer multiplier
        hx *= buffer_factor;
        hy *= buffer_factor;
        hz *= buffer_factor;

        // Store expanded box
        AABB box;
        box.xmin = cx - hx; box.xmax = cx + hx;
        box.ymin = cy - hy; box.ymax = cy + hy;
        box.zmin = cz - hz; box.zmax = cz + hz;
        octree_object.tetra_boxes[tet_id] = box;
    }

    std::cout << "Compute global bounding box" << std::endl;

    // All the tetrahedra IDs
    std::vector<int> all_tet_ids(tetraCNT);
    std::iota(all_tet_ids.begin(), all_tet_ids.end(), 0);

    // All the non-conformal tetrahedra IDs. The list is only consumed by the
    // non-conformal build below, so it stays empty (and nonConformalCNT is not
    // read) when the flag is off.
    std::vector<int> all_NC_tet_ids;
    if (non_Conformal_flag)
    {
        std::cout << "Store non-conformal tetrahedra IDs" << std::endl;
        all_NC_tet_ids.assign(ncARRAY, ncARRAY + nonConformalCNT);
    }

    // Start from an inverted box so that the first tetra box overwrites every bound.
    AABB global_box {
        .xmin = std::numeric_limits<float>::max(),
        .xmax = -std::numeric_limits<float>::max(),
        .ymin = std::numeric_limits<float>::max(),
        .ymax = -std::numeric_limits<float>::max(),
        .zmin = std::numeric_limits<float>::max(),
        .zmax = -std::numeric_limits<float>::max()
    };
    for (const auto& box : octree_object.tetra_boxes)
    {
        global_box.xmin = std::min(global_box.xmin, box.xmin);
        global_box.xmax = std::max(global_box.xmax, box.xmax);
        global_box.ymin = std::min(global_box.ymin, box.ymin);
        global_box.ymax = std::max(global_box.ymax, box.ymax);
        global_box.zmin = std::min(global_box.zmin, box.zmin);
        global_box.zmax = std::max(global_box.zmax, box.zmax);
    }

    std::cout << "Global Bounding Box:" << std::endl;
    std::cout << " xmin = " << global_box.xmin << ", xmax = " << global_box.xmax << std::endl;
    std::cout << " ymin = " << global_box.ymin << ", ymax = " << global_box.ymax << std::endl;
    std::cout << " zmin = " << global_box.zmin << ", zmax = " << global_box.zmax << std::endl;

    fp_t x_range = (global_box.xmax - global_box.xmin);
    fp_t y_range = (global_box.ymax - global_box.ymin);
    fp_t z_range = (global_box.zmax - global_box.zmin);
    fp_t max_range = std::max({x_range, y_range, z_range});

    // freq is scaled by 1e6 here, i.e. presumably given in MHz -- TODO confirm.
    fp_t wavelength = 3e8 / (freq * 1e6);
    double box_size = 100.0 * wavelength; // or any desired multiple of λ
    int min_depth = 1; // or 2, etc.

    // depth = max(min_depth, ceil(log2(max_range / box_size))).
    // The estimate is not finite when the mesh is empty (inverted global box),
    // the domain has zero extent, or freq is zero; converting such a value to
    // int is undefined, so fall back to min_depth explicitly in that case.
    const double depth_estimate = std::ceil(std::log2(max_range / box_size));
    int octree_depth = min_depth;
    if (!std::isfinite(depth_estimate))
    {
        std::cerr << "Warning: octree depth estimate is not finite; using depth "
                  << min_depth << std::endl;
    }
    else if (depth_estimate > min_depth)
    {
        octree_depth = static_cast<int>(depth_estimate);
    }

    double buffer_distance = wavelength / 2.0;

    std::cout << "Max Range = " << max_range << " | Wavelength = " << wavelength << std::endl;
    std::cout << "Compute octree with octree depth = " << octree_depth << std::endl;

    if (non_Conformal_flag)
    {
        octree_object.buildOctree_withNCFLAGS(all_tet_ids, all_NC_tet_ids, global_box, buffer_distance, 0, octree_depth);
    }
    else
    {
        octree_object.buildOctree(all_tet_ids, global_box, buffer_distance, 0, octree_depth);
    }

    // Link tetrahedron memory
    octree_object.tet_ptr = tetARRAY;
    octree_object.tet_count = tetraCNT;

    std::cout << "Octree build completed" << std::endl;
    cout << "========================================================== \n";
}
// Find the Barycentric coordinates of the probes
// For every node of outputMesh, asks the octree for the tetrahedra containing
// that node and stores the result in tri_nodes_bary[node]:
//   .first  = number of matching tetrahedra, or -1 when none was found
//   .second = list of (tetra id, 4 barycentric coordinates) pairs
//
// A lookup that reports success but returns no tetrahedra is treated as
// "not found": the field routines in this file average over .first, so a
// stored count of 0 would become a division by zero there.
//
// If any node cannot be located, all offending nodes are listed on stderr
// and the program terminates with EXIT_FAILURE.
void FemGrp::computeBarycentricEmbedding()
{
    std::cout << "Compute the Barycentric center of the nodes" << std::endl;

    const int num_nodes = outputMesh.num_nodes;
    const double tol = 1e-8;

    // The loop below writes tri_nodes_bary[0 .. num_nodes-1]; make sure those
    // slots exist even if the caller did not size the table beforehand.
    if (num_nodes > 0 && tri_nodes_bary.size() < static_cast<std::size_t>(num_nodes))
    {
        tri_nodes_bary.resize(num_nodes);
    }

    //#pragma omp parallel for schedule(dynamic)
    for (int node_id = 0; node_id < num_nodes; ++node_id)
    {
        std::vector<float> node_xyz = outputMesh.getNode(node_id);
        double probe_xyz[3] = {node_xyz[0], node_xyz[1], node_xyz[2]};

        std::vector<std::pair<int, std::array<double, 4>>> found_tets;
        const bool success = octree_object.findTetraInOctree(probe_xyz, found_tets, tol);

        if (success && !found_tets.empty())
        {
            tri_nodes_bary[node_id].first = static_cast<int>(found_tets.size());
            tri_nodes_bary[node_id].second = found_tets;
        }
        else
        {
            tri_nodes_bary[node_id].first = -1;
        }
    }

    // Report and verify
    bool error_flag = false;
    for (int i = 0; i < num_nodes; ++i)
    {
        if (tri_nodes_bary[i].first >= 0)
        {
            continue;
        }

        std::cerr << "Node " << i << " not found in simulation domain" << std::endl;
        std::vector<float> node_xyz = outputMesh.getNode(i);
        double probe_xyz[3] = {node_xyz[0], node_xyz[1], node_xyz[2]};
        std::cerr << probe_xyz[0] << " " << probe_xyz[1] << " " << probe_xyz[2] << std::endl;
        error_flag = true;
    }

    if (error_flag)
    {
        std::cerr << "Error: Some nodes were not found in the simulation domain. Exiting." << std::endl;
        std::exit(EXIT_FAILURE);
    }
}
// Refactored by Qi Jian to initialize the output surface mesh
// Note that the octree has to be built before calling this function
// Loads the output surface mesh "./<prjName>_out.tri" into outputMesh, computes
// its triangle normals, prints a summary, sizes tri_nodes_bary to one entry per
// mesh node and fills it through computeBarycentricEmbedding().
//
// Parameters:
//   prjName - project name used to form the mesh file name.
//
// The file name is assembled in a std::string, so a long project name can no
// longer overflow a fixed-size stack buffer.
void FemGrp::makeOutputSurfMesh(std::string prjName)
{
    // Load surface mesh
    std::string triName = "./" + prjName + "_out.tri";

    std::cout << "--------------------" << std::endl;
    std::cout << "Reading Tri surface mesh " << triName << std::endl;
    // data() yields a mutable, NUL-terminated char* (C++17), matching what the
    // previous char[] argument decayed to.
    outputMesh.readFromFile(triName.data());

    std::cout << "--------------------" << std::endl;
    std::cout << "Compute Normals " << std::endl;
    outputMesh.computeTriangleNormals();

    std::cout << "--------------------" << std::endl;
    outputMesh.printSummary();
    std::cout << "--------------------" << std::endl;

    // One slot per mesh node, filled by the embedding step below.
    tri_nodes_bary.resize(outputMesh.num_nodes);

    // Fill barycentric coordinate map
    computeBarycentricEmbedding();

    std::cout << "Completed" << std::endl;
    std::cout << "--------------------" << std::endl;
}
// Added by Qi Jian
// Utility to write fields of probes
void FemGrp::writeProbeFieldsCSV(
const std::string& outputDir, // e.g. "./PROBES1"
const std::string& fname, // simulation/project name
int timeStep, // timestep number
const std::vector<int>& node_ids, // node IDs to write
const std::vector<vtr>& Efield, // electric field vectors
const std::vector<vtr>& Hfield // magnetic field vectors
)
{
char csvFileName[512];
sprintf(csvFileName, "%s/Probes_%s_%04d.csv", outputDir.c_str(), fname.c_str(), timeStep);
std::ofstream csvFile(csvFileName);
if (!csvFile.is_open()) {
std::cerr << "Error opening file: " << csvFileName << std::endl;
return;
}
// Write header
csvFile << "Ex,Ey,Ez,Hx,Hy,Hz\n";
// Lambda to write one node's fields
auto write_fields = [&](int node_id)
{
const vtr& E = Efield[node_id];
const vtr& H = Hfield[node_id];
csvFile << std::fixed << std::setprecision(6)
<< E.getx() << "," << E.gety() << "," << E.getz() << ","
<< H.getx() << "," << H.gety() << "," << H.getz() << "\n";
};
for (int i = 0; i < node_ids.size(); ++i)
{
int node_id = node_ids[i];
write_fields(node_id);
}
csvFile.close();
}
void FemGrp::writeCurrentsOutputSurfMesh_CuBLAS(int timeStep)
{
const int num_nodes = outputMesh.num_nodes;
const int num_tri = outputMesh.num_triangles;
// ----------------------------------------------
// Step 1: Compute fields at all nodes (scattered field)
// ----------------------------------------------
// Incident Field at points
std::vector<vtr> E_field(num_nodes);
std::vector<vtr> H_field(num_nodes);
std::vector<vtr> Einc_field(num_nodes);
std::vector<vtr> Hinc_field(num_nodes);
int i, j;
fp_t vol;
fp_t zeta[4];
vtr lvtr[3];
vtr avtr[4];
int tetraMAP_aux[TetPolyOrderDim[getPolyFlag()]];
fp_t_ts E_coeff[TetPolyOrderDim[getPolyFlag()]];
fp_t_ts H_coeff[TetPolyOrderDim[getPolyFlag()]];
vtr Einc;
vtr Hinc;
vtr r;
vtr eField;
vtr hField;
// DEBUG purpose: Store all the node ids as probes
vector<int> node_ids(num_nodes);
for(i = 0; i < num_nodes; i++)
{
node_ids[i] = i;
}
// Compute the Incident Fields
for(i = 0; i < num_nodes; i++)
{
int number_of_associated_tets = tri_nodes_bary.at(i).first;
Einc.reset();
Hinc.reset();
std::vector<std::pair<int, std::array<double, 4>>> found_tets = tri_nodes_bary.at(i).second;
Einc_field[i].reset();
Hinc_field[i].reset();
for (int t = 0; t < number_of_associated_tets; t++)
{
int tet_id = found_tets.at(t).first;
array<double,4> tri_bary_coord = found_tets.at(t).second;
tetra& tet = tetARRAY[tet_id];
zeta[0] = static_cast<fp_t>(tri_bary_coord[0]);
zeta[1] = static_cast<fp_t>(tri_bary_coord[1]);
zeta[2] = static_cast<fp_t>(tri_bary_coord[2]);
zeta[3] = static_cast<fp_t>(tri_bary_coord[3]);
SimplexToCartesian(tet, r, zeta);
getAnalyticalPWField(tet, r, Einc, Hinc, timeStep, LocTimeSteps[N_class -1]);
Einc_field[i] = Einc_field[i] + Einc;
Hinc_field[i] = Hinc_field[i] + Hinc;
}
Einc_field[i] = Einc_field[i] / ((fp_t) number_of_associated_tets);
Hinc_field[i] = Hinc_field[i] / ((fp_t) number_of_associated_tets);
}
//writeProbeFieldsCSV( "./PROBES_inc", fname, timeStep, node_ids, Einc_field, Hinc_field);
make_dir_if_not_exist("./CURRENT_INC");
char regFileName[StrOutput];
// Prepare output file name
regFileName[StrOutput] = {0};
sprintf(regFileName, "./CURRENT_INC/Einc_field_%s_%05d.dat", fname, timeStep);
// Open output file
FILE* fout = fopen(regFileName, "w");
if (!fout)
{
std::cerr << "❌ Failed to open output file: " << regFileName << std::endl;
return;
}
std::vector<int> tri_nodes = outputMesh.getTriangle(1);
int nodeIdx = tri_nodes[0]; // Pick only the first node
const vtr& E = Einc_field[nodeIdx]; // Get E-field vector at that node
// Write full vector (Ex, Ey, Ez) to file
fprintf(fout, "%.10e %.10e %.10e\n", E.getx(), E.gety(), E.getz());
fclose(fout); // Done!
// Calculate Total Fields at the points
for(i = 0; i < num_nodes; i++)
{
int number_of_associated_tets = tri_nodes_bary.at(i).first;
eField.reset();
hField.reset();
std::vector<std::pair<int, std::array<double, 4>>> found_tets = tri_nodes_bary.at(i).second;
E_field[i].reset();
H_field[i].reset();
for (int t = 0; t < number_of_associated_tets; t++)
{
int tet_id = found_tets.at(t).first;
array<double,4> tri_bary_coord = found_tets.at(t).second;
tetra& tet = tetARRAY[tet_id];
tet.geometry(lvtr, avtr, &vol);
avtr[3].reset();
avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);
eField.reset();
hField.reset();
zeta[0] = static_cast<fp_t>(tri_bary_coord[0]);
zeta[1] = static_cast<fp_t>(tri_bary_coord[1]);
zeta[2] = static_cast<fp_t>(tri_bary_coord[2]);
zeta[3] = static_cast<fp_t>(tri_bary_coord[3]);
eField = CalcEfield(&En1_h[mapIdLoc[tet.getcnt()] * TetPolyOrderDim[PolyFlag]], avtr, vol, zeta, PolyFlag);
hField = CalcEfield(&Hn32_h[mapIdLoc[tet.getcnt()] * TetPolyOrderDim[PolyFlag]], avtr, vol, zeta, PolyFlag);
E_field[i] = E_field[i] + eField;
H_field[i] = H_field[i] + hField;
}
E_field[i] = E_field[i] / ((fp_t) number_of_associated_tets);
H_field[i] = H_field[i] / ((fp_t) number_of_associated_tets);
}
//writeProbeFieldsCSV( "./PROBES_total", fname, timeStep, node_ids, E_field, H_field);
regMface = new Register[outputMesh.num_triangles];
regJface = new Register[outputMesh.num_triangles];
make_dir_if_not_exist("./CURRENT_Total");
for(int i = 0; i < outputMesh.num_triangles; i++)
{
std::vector<int> tri_nodes = outputMesh.getTriangle(i);
std::vector<float> normal_d = outputMesh.getNormal(i);
vtr NormalVtr(normal_d[0], normal_d[1], normal_d[2]);
regMface[i].initial(3);
regJface[i].initial(3);
for(j = 0; j < 3; j++)
{
int nodeIdx = tri_nodes[j];
vtr eLocalFace = E_field[nodeIdx];
vtr hLocalFace = H_field[nodeIdx];
// No averaging
regMface[i].setField(j, NormalVtr * eLocalFace * (-1.0));
regJface[i].setField(j, NormalVtr * hLocalFace * (1.0));
}
}
// Register
memset(regFileName, 0, StrOutput * sizeof(char));
sprintf(regFileName, "./CURRENT_Total/Currents_%s_%05d", fname, timeStep);
printRegister(regMface, regJface, outputMesh.num_triangles, regFileName,1);
delete[] regMface;
delete[] regJface;
// Calculate Scattered Fields at the points
for(int i = 0; i < num_nodes; i++)
{
E_field[i] = E_field[i] - Einc_field[i];
H_field[i] = H_field[i] - Hinc_field[i];
}
//writeProbeFieldsCSV( "./PROBES_sc", fname, timeStep, node_ids, E_field, H_field);
// ----------------------------------------------------------------------------------------------
// Write the Scattered Fields
regMface = new Register[outputMesh.num_triangles];
regJface = new Register[outputMesh.num_triangles];
make_dir_if_not_exist("./CURRENT_SC");
for(int i = 0; i < outputMesh.num_triangles; i++)
{
std::vector<int> tri_nodes = outputMesh.getTriangle(i);
std::vector<float> normal_d = outputMesh.getNormal(i);
vtr NormalVtr(normal_d[0], normal_d[1], normal_d[2]);
regMface[i].initial(3);
regJface[i].initial(3);
for(j = 0; j < 3; j++)
{
int nodeIdx = tri_nodes[j];
vtr eLocalFace = E_field[nodeIdx];
vtr hLocalFace = H_field[nodeIdx];
// No averaging
regMface[i].setField(j, NormalVtr * eLocalFace * (-1.0));
regJface[i].setField(j, NormalVtr * hLocalFace * (1.0));
}
}
// Register
memset(regFileName, 0, StrOutput * sizeof(char));
sprintf(regFileName, "./CURRENT_SC/Currents_%s_%05d", fname, timeStep);
printRegister(regMface, regJface, outputMesh.num_triangles, regFileName,1);
delete[] regMface;
delete[] regJface;
}
#endif
#endif