#include #include #include #include "femgrp.h" #include "matconv.h" #include "Constants.h" #include "vtkwriter.h" #ifdef _OPENMP #include #endif #include #include "MeshPartition_METIS5.h" #include #include "debug.hpp" #include "vtk-5.0/vtkTetra.h" #include "rapidcsv.h" #include #include #include #include #include // For strerror #include // For perror or printf #include // for std::max #include #include #include #include // at top of file auto check_dev_ptr = [](const void* p, const char* name) -> bool { if (!p) { fprintf(stderr, "[addExcitationE_port] ❌ NULL pointer: %s\n", name); return false; } cudaPointerAttributes attr; #if CUDART_VERSION >= 10000 cudaError_t perr = cudaPointerGetAttributes(&attr, p); if (perr != cudaSuccess) { fprintf(stderr, "[addExcitationE_port] ⚠️ cudaPointerGetAttributes failed for %s: %s\n", name, cudaGetErrorString(perr)); // Still allow launch; you can change to 'return false;' if you prefer. } else { // cudaMemoryTypeDevice == 2 in older runtimes; in newer, use attr.type == cudaMemoryTypeDevice #if CUDART_VERSION >= 11000 bool is_dev = (attr.type == cudaMemoryTypeDevice); #else bool is_dev = (attr.memoryType == cudaMemoryTypeDevice); #endif if (!is_dev) { fprintf(stderr, "[addExcitationE_port] ⚠️ %s is NOT a device pointer (type=%d)\n", name, #if CUDART_VERSION >= 11000 (int)attr.type #else (int)attr.memoryType #endif ); } } #endif return true; }; // ====================================== // Interpolation Quadrature Points (host tables) #define g6_a_h 0.816847572980459 #define g6_b_h (1.0 - g6_a_h) / 2.0 #define g6_c_h 0.108103018168070 #define g6_d_h (1.0 - g6_c_h) / 2.0 #define g6_W1_h 0.109951743655322 #define g6_W2_h 0.223381589678011 #define g9_a_h 0.437525248383384 #define g9_b_h (1.0 - 2.0 * g9_a_h) #define g9_c_h 0.797112651860071 #define g9_d_h 0.165409927389841 #define g9_e_h (1.0 - g9_c_h - g9_d_h) #define g9_W1_h 0.205950504760887 #define g9_W2_h 0.063691414286223 fp_t_ts g2d_6_h[6][4] = { {g6_a_h, g6_b_h, g6_b_h, 
g6_W1_h}, {g6_b_h, g6_a_h, g6_b_h, g6_W1_h}, {g6_b_h, g6_b_h, g6_a_h, g6_W1_h}, {g6_c_h, g6_d_h, g6_d_h, g6_W2_h}, {g6_d_h, g6_c_h, g6_d_h, g6_W2_h}, {g6_d_h, g6_d_h, g6_c_h, g6_W2_h} }; fp_t_ts g2d_9_h[9][4] = { {g9_b_h, g9_a_h, g9_a_h, g9_W1_h}, {g9_a_h, g9_b_h, g9_a_h, g9_W1_h}, {g9_a_h, g9_a_h, g9_b_h, g9_W1_h}, {g9_c_h, g9_d_h, g9_e_h, g9_W2_h}, {g9_c_h, g9_e_h, g9_d_h, g9_W2_h}, {g9_d_h, g9_c_h, g9_e_h, g9_W2_h}, {g9_d_h, g9_e_h, g9_c_h, g9_W2_h}, {g9_e_h, g9_c_h, g9_d_h, g9_W2_h}, {g9_e_h, g9_d_h, g9_c_h, g9_W2_h} }; const int GAUSS_POINT_NUM_h[4] = {6, 9, 9, 9}; // ---- Shapes for quadratic triangle (P2) at barycentric l=(l0,l1,l2) ---- static inline void triP2_shapes(const double l[3], double N[6]) { const double l0=l[0], l1=l[1], l2=l[2]; N[0] = l0*(2.0*l0-1.0); // vertex 0 N[1] = l1*(2.0*l1-1.0); // vertex 1 N[2] = l2*(2.0*l2-1.0); // vertex 2 N[3] = 4.0*l1*l2; // edge(1,2) N[4] = 4.0*l0*l2; // edge(0,2) N[5] = 4.0*l0*l1; // edge(0,1) } // ---- One normal + area from 3 points (xyz9 = x0,y0,z0, x1,y1,z1, x2,y2,z2) ---- static inline void face_geometry9_host(const fp_t_ts* xyz9, double n[3], double& area) { const double x0=xyz9[0], y0=xyz9[1], z0=xyz9[2]; const double x1=xyz9[3], y1=xyz9[4], z1=xyz9[5]; const double x2=xyz9[6], y2=xyz9[7], z2=xyz9[8]; double a[3] = {x1-x0, y1-y0, z1-z0}; double b[3] = {x2-x0, y2-y0, z2-z0}; // n ∝ a × b n[0] = a[1]*b[2] - a[2]*b[1]; n[1] = a[2]*b[0] - a[0]*b[2]; n[2] = a[0]*b[1] - a[1]*b[0]; double nn = sqrt(n[0]*n[0] + n[1]*n[1] + n[2]*n[2]); area = 0.5*nn; if (nn > 0) { n[0]/=nn; n[1]/=nn; n[2]/=nn; } } // ---- Project vector to tangential plane (in-place) ---- static inline void proj_tangent(double v[3], const double n[3]) { const double vn = v[0]*n[0] + v[1]*n[1] + v[2]*n[2]; v[0]-=vn*n[0]; v[1]-=vn*n[1]; v[2]-=vn*n[2]; } // ---- Host quadrature accessor using your *_h tables ---- static inline void tri_gauss_host(int Q, int q, fp_t& z0, fp_t& z1, fp_t& z2, fp_t& w) { if (Q == 6) { z0 = g2d_6_h[q][0]; z1 = 
g2d_6_h[q][1]; z2 = g2d_6_h[q][2]; w = g2d_6_h[q][3]; return; } if (Q == 9) { z0 = g2d_9_h[q][0]; z1 = g2d_9_h[q][1]; z2 = g2d_9_h[q][2]; w = g2d_9_h[q][3]; return; } // add more orders if you enable them z0=z1=z2=w=0; } // ---- Interpolate E/H to Q quadrature points and project tangential ---- static inline void interp_port_fields_to_quads( const fp_t_ts* xyz9, // x0 y0 z0 x1 y1 z1 x2 y2 z2 const vtr evtr[6], // P2 nodal vectors for E (face order: 0..5) const vtr hvtr[6], // P2 nodal vectors for H int PolyFlag, fp_t_ts* Etan_out, // [Q*3] fp_t_ts* Htan_out, // [Q*3] fp_t_ts port_excitation_magnitude) { const int Q = GAUSS_POINT_NUM_h[PolyFlag]; double n[3], area; face_geometry9_host(xyz9, n, area); for (int q=0; qnd[0..3] exist and have getCoord().getx/y/z()) ---- void make_dir_if_not_exist(const char* path) { struct stat st; if (stat(path, &st) != 0) { // Directory does not exist, try to create it if (mkdir(path, 0755) != 0) { perror("mkdir failed"); } } else if (!S_ISDIR(st.st_mode)) { fprintf(stderr, "%s exists but is not a directory\n", path); } } void exportNeighData( int* NeighMap_h, int neighMapSize, int* NeighClass_h, int N_class, int* NeighClassOffset_h) { // Export NeighMap_h { std::ofstream ofs("NeighMap.txt"); for (int i = 0; i < neighMapSize; i++) { ofs << NeighMap_h[i] << "\n"; } } // Export NeighClass_h { std::ofstream ofs("NeighClass.txt"); for (int i = 0; i < N_class; i++) { ofs << NeighClass_h[i] << "\n"; } } // Export NeighClassOffset_h { std::ofstream ofs("NeighClassOffset.txt"); for (int i = 0; i < N_class; i++) { ofs << NeighClassOffset_h[i] << "\n"; } } } // ---- Safe CUDA helpers ------------------------------------------------------- inline cudaError_t SafeCudaMalloc(void** p, size_t nbytes) { if (nbytes == 0) { *p = nullptr; return cudaSuccess; } return cudaMalloc(p, nbytes); } inline cudaError_t SafeCudaMemcpyH2D(void* dst, const void* src, size_t nbytes) { if (nbytes == 0 || !dst || !src) return cudaSuccess; return cudaMemcpy(dst, src, 
nbytes, cudaMemcpyHostToDevice); } inline cudaError_t SafeCudaMemset0(void* dst, size_t nbytes) { if (nbytes == 0 || !dst) return cudaSuccess; return cudaMemset(dst, 0, nbytes); // zero is always safe } #define BYTES(T, count) (static_cast(count) * sizeof(T)) #define CUDA_SAFE_MALLOC(ptr, bytes) CUDA_SAFE_CALL(SafeCudaMalloc((void**)&(ptr), (bytes))) #define CUDA_SAFE_COPY(dst, src, bytes) CUDA_SAFE_CALL(SafeCudaMemcpyH2D((dst), (src), (bytes))) #define CUDA_SAFE_ZERO(dst, bytes) CUDA_SAFE_CALL(SafeCudaMemset0((dst), (bytes))) // ---- Safe CUDA helpers ------------------------------------------------------- #if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL) #include "kernels.cuh" cudaStream_t stream_E, stream_H; cudaStream_t stream_Pade; ExcitationProp excitationProp; std::vector portExcitations; ExcitationProp* ExcitationProps_d; #endif using namespace ClipperLib; using namespace std; int TriNumBas = 6; bool ModuleFlag = true; static fp_t BaryCoord[SecondOrderNodes][4] = { {1.0, 0.0, 0.0, 0.0}, {0.0, 1.0, 0.0, 0.0}, {0.0, 0.0, 1.0, 0.0}, {0.0, 0.0, 0.0, 1.0}, {0.5, 0.5, 0.0, 0.0}, {0.5, 0.0, 0.5, 0.0}, {0.5, 0.0, 0.0, 0.5}, {0.0, 0.5, 0.5, 0.0}, {0.0, 0.5, 0.0, 0.5}, {0.0, 0.0, 0.5, 0.5} }; static int fac2tet[4][18] = { {5, 4, 3, 11, 10, 9, 12, 13, 25, 24, 23, 26, 30, 31, 32, 42, 43, 44}, {5, 2, 1, 11, 8, 7, 14, 15, 25, 22, 21, 27, 33, 34, 35, 42, 43, 44}, {4, 2, 0, 10, 8, 6, 16, 17, 24, 22, 20, 28, 36, 37, 38, 42, 43, 44}, {3, 1, 0, 9, 7, 6, 18, 19, 23, 21, 20, 29, 39, 40, 41, 42, 43, 44} }; int faceExcitationOrder[15] = { 1, 2, 4, 8, 3, 5, 6, 9, 10, 12, 3, 7, 11, 13, 14 }; int First2Second[3][2] = { {1, 2}, {0, 2}, {0, 1} }; template void writeDenseMatrixToCSV_rapidcsv(const std::string& filename, denseMat* mat, int dim) { std::vector> data(dim, std::vector(dim)); for (int i = 0; i < dim; ++i) for (int j = 0; j < dim; ++j) data[i][j] = mat->getEntry(i, j); // rapidcsv needs column-major data std::vector> cols(dim, std::vector(dim)); for (int j = 0; j < 
dim; ++j) for (int i = 0; i < dim; ++i) cols[j][i] = data[i][j]; rapidcsv::Document doc; for (int j = 0; j < dim; ++j) doc.SetColumn(j, cols[j]); doc.Save(filename); } template denseMat* wrapFlatMatrixConvert(const T_in* data, int dim) { auto* mat = new denseMat(dim, dim); for (int i = 0; i < dim; ++i) for (int j = 0; j < dim; ++j) mat->setEntry(i, j, static_cast(data[i * dim + j])); return mat; } FemGrp::FemGrp(){ nodeCNT = 0; edgeCNT = 0; faceCNT = 0; tetraCNT = 0; bcCNT = 0; regularCNT = 1; //at least there is a non regular group regularTetraCNT = 0; ndARRAY = nullptr; tetARRAY = nullptr; edgeARRAY = nullptr; faceARRAY = nullptr; regularReferenceARRAY = nullptr; objProp = nullptr; totalObjNum = 0; usePade = false; padeTime = -1; padeCNT = 0; tsSource = 0; nonConformalCase = false; nonConformalCNT = 0; neighCNT = 0; writeWhilePade = false; writePadeTD = false; Coord.setO(0.0, 0.0, 0.0); Coord.setx_axis(1.0, 0.0, 0.0); Coord.sety_axis(0.0, 1.0, 0.0); Coord.setz_axis(0.0, 0.0, 1.0); freq = 0.0; // Added for DGTD TimeStep_dt = 0.0; ClassMul = 0; dt_min = 0.0; dt_max = 0.0; dimE = 0; dimH = 0; N_class = 0; NtimeSteps = 0; LocTimeSteps = nullptr; LocalExciIndexE = nullptr; LocalExciIndexH = nullptr; ClassTetraCnt = nullptr; ClassTetraIndex = nullptr; ClassTetraOffset = nullptr; planeWaveMesh = nullptr; InterSurfMesh = nullptr; SurfMesh = nullptr; To = 0.0; Tau = 0.0; SamplingRate = 1.0; FinalTime = 0.0; TimeDistFlag = 0; // Port ExcitFlag = 0; // Scattering regularRegionFlag = false; PlaneWaveBCFlag = false; PortBCFlag = false; fieldEnergy = 0.0; maxFieldEnergy = 0.0; energyDecayFactor = 0.0; numberOfEnergyPoints = 0; UseQuadratureMatrices = true; #if defined(DGTD_USE_CUDA) cudaStreamCreate(&stream_E); cudaStreamCreate(&stream_H); cudaStreamCreate(&stream_Pade); En_d = nullptr; Hn12_d = nullptr; En1_d = nullptr; Hn32_d = nullptr; #endif } FemGrp::~FemGrp(){ } void FemGrp::readNODE(){ // Read only the nodes belonging to this subdomain and neighbors char 
nname[StrLenShort]; int pType; fp_t singORDER, Priority, x, y, z; sprintf(nname, "%s.node", fname); ifstream nodefile(nname, ios::in); if(!nodefile){ cout << "File " << nname << " does NOT exist " << endl; exit(1); } if(usePade){ initializeMaxMinPoints(); } int nodeTotal; nodefile >> unit; nodefile >> nodeTotal; nodeCNT = nodeTotal; // only one domain, global = local if(nodeCNT >= 1){ ndARRAY = new node[nodeCNT]; for(int k = 0; k < nodeTotal; k ++){ ndARRAY[k].set_globalId(k); nodefile >> pType >> Priority >> singORDER >> x >> y >> z; ndARRAY[k].set_n(k); ndARRAY[k].set_pType(pType); ndARRAY[k].setPType(static_cast(pType)); ndARRAY[k].set_singORDER(singORDER); ndARRAY[k].set_coord(x * unit, y * unit, z * unit); // ndARRAY[k].print(); if(usePade){ setMaxMinPoints(x * unit, y * unit, z * unit); } } cout << "MaxPoint = (" << maxPoint.getx() << ", " << maxPoint.gety() << ", " << maxPoint.getz() << ") " << endl; cout << "MinPoint = (" << minPoint.getx() << ", " << minPoint.gety() << ", " << minPoint.getz() << ") " << endl; } } void FemGrp::readTETRA(){ // Read only the tetras in this subdomain and neighbors int i, j, objNum, ndid[NumOfNodes], bcd[NumOfFaces], sNum[NumOfFaces]; node *nd[NumOfNodes]; char tname[StrLenShort]; readBcMap(); // read in surface-btype map sprintf(tname, "%s.tetra", fname); ifstream tetrafile(tname, ios::in); if(!tetrafile){ cout << "File " << tname << " does NOT exist " << endl; exit(1); } int tetraTotal; tetrafile >> tetraTotal; // Only one domain exists tetraCNT = tetraTotal; if(tetraCNT >= 1){ tetARRAY = new tetra[tetraCNT]; for(i = 0; i < tetraTotal; i ++){ tetrafile >> objNum; if(objNum > totalObjNum) totalObjNum = objNum; tetrafile >> ndid[0] >> ndid[1] >> ndid[2] >> ndid[3]; //get the ids of the nodes tetrafile >> sNum[0] >> sNum[1] >> sNum[2] >> sNum[3]; //get the bc number of the faces for(j = 0; j < 4; j++){ nd[j] = &(ndARRAY[ndid[j]]); bcd[j] = bcMap[sNum[j]]; } tetARRAY[i].set_objNum(objNum); tetARRAY[i].set_node(nd[0], nd[1], 
nd[2], nd[3]); tetARRAY[i].set_nbc(bcd[0], bcd[1], bcd[2], bcd[3]); tetARRAY[i].reArrange(); //set the nodes and bc from smallest to greatest id tetARRAY[i].setcnt(i); } } } void FemGrp::readBcMap(){ char name[StrLenShort]; int i, surfCNT, sNum, bNum; sprintf(name, "%s.bcmap", fname); ifstream foo(name, ios::in); if(!foo){ cout << "File " << name << " does NOT exist " << endl; exit(1); } foo >> surfCNT; if(surfCNT > 0){ bcMap = new int[surfCNT + 1]; bcMap[0] = 0; for(i = 0; i < surfCNT; i ++){ foo >> sNum >> bNum; bcMap[sNum] = bNum; } } } void FemGrp::readMaterial(){ char name[StrLenShort], matName[StrLenShort], dirName[StrLenShort], tmpName[StrLenShort], materialName[StrLenShort]; int i, j, k; fp_t real, imaginary, cval, temp; FILE *matFILE; totalObjNum ++; objProp = new Material[totalObjNum]; sprintf(name, "%s.prop", fname); ifstream foo(name, ios::in); if(!foo){ cout << "File " << name << " does NOT exist " << endl; exit(1); } foo >> dirName; //directory where the materials are storaged DEBUG_INFO("totalObjNum: " + to_string(totalObjNum)); //TODO: it only takes the real part for(i = 0; i < totalObjNum; i++) { foo >> materialName; sprintf(matName, "%s/%s.m", dirName, materialName); matFILE = fopen(matName, "r"); cout << "Reading material properties from file: " << materialName << endl; fscanf(matFILE, "%s", tmpName); // relative dielectric constant for(j = 0; j < NumOfUnitaryVectors; j ++){ for(k = 0; k < NumOfUnitaryVectors; k ++){ #ifdef DGTD_USE_DOUBLE fscanf(matFILE, "%le %le ", &real, &imaginary); #else fscanf(matFILE, "%e %e ", &real, &imaginary); #endif cval = real; objProp[i].epsr.setEntry(j, k, cval); } } // relative permeability for(j = 0; j < NumOfUnitaryVectors; j ++){ for(k = 0; k < NumOfUnitaryVectors; k ++){ #ifdef DGTD_USE_DOUBLE fscanf(matFILE, "%le %le ", &real, &imaginary); #else fscanf(matFILE, "%e %e ", &real, &imaginary); #endif cval = real; objProp[i].mur.setEntry(j, k, cval); } } // conductivity for(j = 0; j < NumOfUnitaryVectors; j ++){ 
for(k = 0; k < NumOfUnitaryVectors; k ++){ #ifdef DGTD_USE_DOUBLE fscanf(matFILE, "%le ", &real); #else fscanf(matFILE, "%e ", &real); #endif cval = real; objProp[i].sigma.setEntry(j, k, cval); } } objProp[i].rum = objProp[i].mur.inverse(); // Tag Scattering Region if (strncmp(materialName, "scattering", 10) == 0) { objProp[i].scattering_region = true; } else { objProp[i].scattering_region = false; } // PML if (strncmp(materialName, "pml", 3) == 0) { PML_flag = true; // Set Tetrahedron PML type true objProp[i].set_PML_Flag(1); cout << "PML Material Properties: " << endl; // PML Max Conductivity fp_t conductivity_PML = objProp[i].sigma.getEntry(0, 0); cout << "conductivity_PML = " << conductivity_PML << endl; // PML Order #ifdef DGTD_USE_DOUBLE fscanf(matFILE, "%le ", &temp); #else fscanf(matFILE, "%e ", &temp); #endif PML_conductivity_order = temp; objProp[i].set_PML_m_ord(PML_conductivity_order); cout << "PML_m_ord: " << objProp[i].get_PML_m_ord() << endl; // PML Thickness #ifdef DGTD_USE_DOUBLE fscanf(matFILE, "%le ", &temp); #else fscanf(matFILE, "%e ", &temp); #endif PML_thickness = temp; objProp[i].set_PML_thick(PML_thickness); cout << "PML_thickness: " << objProp[i].get_PML_thick() << endl; // PML Geometry #ifdef DGTD_USE_DOUBLE fscanf(matFILE, "%le ", &temp); #else fscanf(matFILE, "%e ", &temp); #endif Ellipse_Rx = temp; #ifdef DGTD_USE_DOUBLE fscanf(matFILE, "%le ", &temp); #else fscanf(matFILE, "%e ", &temp); #endif Ellipse_Ry = temp; #ifdef DGTD_USE_DOUBLE fscanf(matFILE, "%le ", &temp); #else fscanf(matFILE, "%e ", &temp); #endif Ellipse_Rz = temp; cout << "Ellipse_Rx: " << Ellipse_Rx << endl; cout << "Ellipse_Ry: " << Ellipse_Ry << endl; cout << "Ellipse_Rz: " << Ellipse_Rz << endl; #ifdef DGTD_USE_DOUBLE fscanf(matFILE, "%le ", &temp); #else fscanf(matFILE, "%e ", &temp); #endif planewave_xmin = temp; #ifdef DGTD_USE_DOUBLE fscanf(matFILE, "%le ", &temp); #else fscanf(matFILE, "%e ", &temp); #endif planewave_xmax = temp; #ifdef DGTD_USE_DOUBLE 
fscanf(matFILE, "%le ", &temp); #else fscanf(matFILE, "%e ", &temp); #endif planewave_ymin = temp; #ifdef DGTD_USE_DOUBLE fscanf(matFILE, "%le ", &temp); #else fscanf(matFILE, "%e ", &temp); #endif planewave_ymax = temp; #ifdef DGTD_USE_DOUBLE fscanf(matFILE, "%le ", &temp); #else fscanf(matFILE, "%e ", &temp); #endif planewave_zmin = temp; #ifdef DGTD_USE_DOUBLE fscanf(matFILE, "%le ", &temp); #else fscanf(matFILE, "%e ", &temp); #endif planewave_zmax = temp; cout << "PML Region:\n"; cout << " x: [" << planewave_xmin << ", " << planewave_xmax << "]\n"; cout << " y: [" << planewave_ymin << ", " << planewave_ymax << "]\n"; cout << " z: [" << planewave_zmin << ", " << planewave_zmax << "]\n"; } else { // Set Tetrahedron PML type false objProp[i].set_PML_Flag(0); } fclose(matFILE); } } void FemGrp::readBC() { char name[StrLenShort], bcName[StrLenShort], portName[StrLenShort]; int i, pNum, bNum, bType; fp_t impR, impI, magE; fp_t x, y, z; fp_t theta, phi; fp_t rox, roy, roz; fp_t r1x, r1y, r1z; int PortFlag; fp_t CHIRP_BW_MHZ; fp_t phaseE; fp_t port_dx, port_dy, port_dz; fp_t vpath_x, vpath_y, vpath_z; fp_t epr, mur; PEC_PMC_port_flag = 0; // For ports bcNumToPnum.clear(); pnumToBcNum.clear(); // For PML int pmlMode; // 0->radiation(port) , 1->scattering fp_t pol_x, pol_y, pol_z; PML_flag = false; readBcMap(); sprintf(name, "%s.bc", fname); ifstream foo(name, ios::in); if(!foo){ cout << "File " << name << " does NOT exist " << endl; exit(1); } foo >> bcCNT; bcARRAY = new bc[bcCNT]; portCNT = 0; nonConformalCNT = 0; for(i = 0; i < bcCNT; i ++) { foo >> bNum >> bcName; bcARRAY[i].set_bNum(bNum); // id in file bcARRAY[i].set_name(bcName); // name in file bType = bcTypeConvert(bcName); bcARRAY[i].set_bType(bType); switch (bType) { case 0: // none { break; } case pmcType: // pmc { break; } case fieldPlaneType: { break; // fieldPlane } case outputSurfType: { cout << "outputSurfType" << endl; break; } case abcType: { foo >> impR; //abc bcARRAY[i].set_rval(impR * No); break; } 
case constE: { foo >> x >> y >> z; // constE bcARRAY[i].SETFIELD(x, y, z); break; } case pecType: { break; // pec } case impType: { foo >> impR >> impI; //original bcARRAY[i].set_cval(impR, impI); break; } case portType: { // (1) TEM rectangular port // port 1 // (2) TEM coaxial port // port 2 // (3) TE rectangular port (a is along height and b is along width) // port 3 pNum = -1; PortFlag = 0; if (!(foo >> portName >> pNum >> PortFlag)) { std::cerr << "[PORT] Failed to read \n"; break; } cout << "pNum = " << pNum << endl; // Initialization of the variables impR=0.0, impI=0.0, magE=1.0; port_dx=0.0, port_dy=0.0, port_dz=1.0; CHIRP_BW_MHZ=0.0, epr=1.0, mur=1.0; if (!(foo >> impR >> impI >> magE >> port_dx >> port_dy >> port_dz >> CHIRP_BW_MHZ >> epr >> mur)) { std::cerr << "[PORT] Failed to read common fields for port " << portName << "\n"; break; } // Book-keeping bcARRAY[i].set_name(portName); bcARRAY[i].set_cval(impR, impI); bcARRAY[i].set_rval(impR); bcARRAY[i].set_pNum(pNum); bcARRAY[i].set_PortFlag(PortFlag); portCNT++; PWorPort = 1; PortBCFlag = true; // If user gives impR==0, let device compute eta const double MU0 = 1.2566370614359173e-6; // 4π·1e-7 const double EPS0 = 8.854187817e-12; const double PI = 3.14159265358979323846; if (epr <= 0.0) epr = 1.0; if (mur <= 0.0) mur = 1.0; const double mu = mur * MU0; const double eps = epr * EPS0; #if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL) excitationProp.ExcitationFlag = ExcitFlag; ExcitationProp portEx{}; portEx.portNum = pNum; portEx.BCNum = i; // Timing / envelope portEx.TimeDistributionFlag = getTimeDist(); portEx.to = To; portEx.tau = Tau; portEx.freq_m = (fp_t_ts)freq; // MHz portEx.CHIRP_BW_MHZ = (fp_t_ts)CHIRP_BW_MHZ; // Medium / amplitude portEx.epr = (fp_t_ts)((epr>0.0)? epr : 1.0); portEx.mur = (fp_t_ts)((mur>0.0)? 
mur : 1.0); portEx.Emagnitude = (fp_t_ts)magE; // Direction vector (diagnostic; geometry gives unit normal) portEx.PortDirection[0] = (fp_t_ts)port_dx; portEx.PortDirection[1] = (fp_t_ts)port_dy; portEx.PortDirection[2] = (fp_t_ts)port_dz; // E/H field impedance (if 0, device computes implicitly) portEx.PortImpedance = (fp_t_ts)impR; portEx.PortFlag = PortFlag; // Map BC <-> Port bcNumToPnum[portEx.BCNum] = portEx.portNum; pnumToBcNum[portEx.portNum] = portEx.BCNum; // ---- Branch by PortFlag for extra fields ---- switch (PortFlag) { case 1: // TEM rectangular: needs vpath { double vpx=0, vpy=0, vpz=0; if (!(foo >> vpx >> vpy >> vpz)) { std::cerr << "[PORT] TEM-rect missing for " << portName << "\n"; // Default vpath to PortDirection if absent vpx = port_dx; vpy = port_dy; vpz = port_dz; } portEx.vpath[0] = (fp_t_ts)vpx; portEx.vpath[1] = (fp_t_ts)vpy; portEx.vpath[2] = (fp_t_ts)vpz; if (impR == 0.0 && impI == 0.0) { double eta = std::sqrt(mu/eps); // η = sqrt(μ/ε) portEx.PortImpedance = (fp_t_ts)eta; bcARRAY[i].set_rval(eta); } else { portEx.PortImpedance = (fp_t_ts)impR; bcARRAY[i].set_rval(impR); } break; } case 2: // TEM coax: needs r0 (center), r1 (inner), r2 (outer) { double r0x, r0y, r0z, r1x, r1y, r1z, r2x, r2y, r2z; if (!(foo >> r0x >> r0y >> r0z >> r1x >> r1y >> r1z >> r2x >> r2y >> r2z)) { std::cerr << "[PORT] TEM-coax missing r0/r1/r2 for " << portName << "\n"; // Provide safe defaults (degenerate; will inject 0) r0x=r0y=r0z=0; r1x=1e-3; r1y=r1z=0; r2x=4e-3; r2y=r2z=0; } portEx.r0_port[0]=(fp_t_ts)r0x; portEx.r0_port[1]=(fp_t_ts)r0y; portEx.r0_port[2]=(fp_t_ts)r0z; portEx.r1_port[0]=(fp_t_ts)r1x; portEx.r1_port[1]=(fp_t_ts)r1y; portEx.r1_port[2]=(fp_t_ts)r1z; portEx.r2_port[0]=(fp_t_ts)r2x; portEx.r2_port[1]=(fp_t_ts)r2y; portEx.r2_port[2]=(fp_t_ts)r2z; std::array v10 = { r1x - r0x, r1y - r0y, r1z - r0z }; std::array v20 = { r2x - r0x, r2y - r0y, r2z - r0z }; const double a = std::sqrt(v10[0]*v10[0] + v10[1]*v10[1] + v10[2]*v10[2]); const double b = 
std::sqrt(v20[0]*v20[0] + v20[1]*v20[1] + v20[2]*v20[2]); if (impR == 0.0 && impI == 0.0) { double eta = std::sqrt(mu/eps); // η = sqrt(μ/ε) // Characteristic (V/I) line impedance of the coax double Z0_line = std::numeric_limits::quiet_NaN(); bool geom_ok = (a > 0.0) && (b > a); if (geom_ok) { Z0_line = (eta / (2.0*PI)) * std::log(b/a); } else { std::cerr << "[PORT] TEM-coax invalid radii (a=" << a << ", b=" << b << "). Using only field impedance eta for BC.\n"; } portEx.PortImpedance = (fp_t_ts)Z0_line; bcARRAY[i].set_rval(Z0_line); } else { portEx.PortImpedance = (fp_t_ts)impR; bcARRAY[i].set_rval(impR); } break; } case 3: // TE_mn rectangular: needs rect_a rect_b m n uv0x uv0y uv0z vpx vpy vpz { double rect_a, rect_b; int m, n; double uv0x, uv0y, uv0z; double vpx, vpy, vpz; if (!(foo >> rect_a >> rect_b >> m >> n >> uv0x >> uv0y >> uv0z >> vpx >> vpy >> vpz)) { std::cerr << "[PORT] TE_mn missing for " << portName << "\n"; // Safe defaults (device clamps tiny a/b) rect_a = 1.0; rect_b = 1.0; m = 1; n = 0; uv0x = uv0y = uv0z = 0.0; // use PortDirection as fallback vpath vpx = port_dx; vpy = port_dy; vpz = port_dz; } portEx.rect_a = (fp_t_ts)rect_a; portEx.rect_b = (fp_t_ts)rect_b; portEx.m = m; portEx.n = n; portEx.uv0[0]=(fp_t_ts)uv0x; portEx.uv0[1]=(fp_t_ts)uv0y; portEx.uv0[2]=(fp_t_ts)uv0z; // store the raw vpath too (optional, but handy for logging/diagnostics) portEx.vpath[0] = (fp_t_ts)vpx; portEx.vpath[1] = (fp_t_ts)vpy; portEx.vpath[2] = (fp_t_ts)vpz; // ---- Build t1, t2 from vpath and PortDirection (n) ---- // n = normalized PortDirection double nx = port_dx, ny = port_dy, nz = port_dz; double nrm = std::sqrt(nx*nx + ny*ny + nz*nz); if (nrm < 1e-14) { nx = 0.0; ny = 0.0; nz = 1.0; nrm = 1.0; } nx /= nrm; ny /= nrm; nz /= nrm; double t1x = vpx; double t1y = vpy; double t1z = vpz; // t2 = n × t1 double t2x = ny*t1z - nz*t1y; double t2y = nz*t1x - nx*t1z; double t2z = nx*t1y - ny*t1x; double t2n = std::sqrt(t2x*t2x + t2y*t2y + t2z*t2z); t2x /= t2n; t2y /= 
t2n; t2z /= t2n; // store in the excitation portEx.t1[0] = (fp_t_ts)t1x; portEx.t1[1] = (fp_t_ts)t1y; portEx.t1[2] = (fp_t_ts)t1z; portEx.t2[0] = (fp_t_ts)t2x; portEx.t2[1] = (fp_t_ts)t2y; portEx.t2[2] = (fp_t_ts)t2z; if (impR == 0.0 && impI == 0.0) { // Geometry (meters) & mode indices already read into rect_a, rect_b, m, n const double a = (rect_a > 0.0) ? rect_a : 1e-12; const double b = (rect_b > 0.0) ? rect_b : 1e-12; // Frequency (MHz in your code) const double omega = 2.0 * PI * freq * 1.0e6; const double kc2 = std::pow(m*PI/a, 2.0) + std::pow(n*PI/b, 2.0); // k_cutoff^2 const double k2 = omega*omega * mu * eps; // k^2 double Z_TE_real = std::numeric_limits::quiet_NaN(); double Z_TE_imag = 0.0; if (k2 <= kc2) { // Below cutoff: Z_TE = -j*(ωμ/α), purely reactive const double alpha = std::sqrt(kc2 - k2); Z_TE_imag = -(omega * mu) / alpha; Z_TE_real = 1e12; // large real placeholder for BC scalar std::cerr << "[PORT] TE_mn below cutoff (a=" << a << ", b=" << b << ", m=" << m << ", n=" << n << "). Using large real Z for BC, " << "Im{Z_TE}=" << Z_TE_imag << " Ohm.\n"; } else { // Above cutoff: Z_TE is real and positive const double beta = std::sqrt(k2 - kc2); Z_TE_real = (omega * mu) / beta; } // User asked us to determine impedance → store TE wave impedance portEx.PortImpedance = (fp_t_ts)Z_TE_real; bcARRAY[i].set_rval(Z_TE_real); bcARRAY[i].set_cval(Z_TE_real, Z_TE_imag); } else { // User-specified portEx.PortImpedance = (fp_t_ts)impR; bcARRAY[i].set_rval(impR); bcARRAY[i].set_cval(impR, impI); } break; } default: { std::cerr << "[PORT] Unknown PortFlag=" << PortFlag << " for " << portName << ". 
Defaulting to TEM-rect with vpath=PortDirection.\n"; portEx.PortFlag = 1; portEx.vpath[0] = (fp_t_ts)port_dx; portEx.vpath[1] = (fp_t_ts)port_dy; portEx.vpath[2] = (fp_t_ts)port_dz; if (impR == 0.0 && impI == 0.0) portEx.PortImpedance = (fp_t_ts)0.0; break; } } portExcitations.push_back(portEx); // Log summary std::cout << "\n=========================\n" << " PORT BOUNDARY CONDITION \n" << "=========================\n" << "PortName : " << portName << "\n" << "PortNum : " << (portEx.portNum - 1) << "\n" << "PortFlag : " << portEx.PortFlag << " (1=TEM-rect, 2=TEM-coax, 3=TE_mn)\n" << "E/H Zport : " << portEx.PortImpedance << " + j" << impI << " (0 => implicit)\n" << "magE : " << portEx.Emagnitude << "\n" << "PortDir : (" << port_dx << ", " << port_dy << ", " << port_dz << ")\n" << "epr, mur : " << epr << ", " << mur << "\n"; if (portEx.PortFlag == 1) { std::cout << "vpath : (" << portEx.vpath[0] << ", " << portEx.vpath[1] << ", " << portEx.vpath[2] << ")\n"; } else if (portEx.PortFlag == 2) { std::cout << "r0 : (" << portEx.r0_port[0] << ", " << portEx.r0_port[1] << ", " << portEx.r0_port[2] << ")\n" << "r1(inner) : (" << portEx.r1_port[0] << ", " << portEx.r1_port[1] << ", " << portEx.r1_port[2] << ")\n" << "r2(outer) : (" << portEx.r2_port[0] << ", " << portEx.r2_port[1] << ", " << portEx.r2_port[2] << ")\n"; } else if (portEx.PortFlag == 3) { std::cout << "rect(a,b) : " << portEx.rect_a << ", " << portEx.rect_b << "\n" << "m,n : " << portEx.m << ", " << portEx.n << "\n" << "uv0 : (" << portEx.uv0[0] << ", " << portEx.uv0[1] << ", " << portEx.uv0[2] << ")\n"; } std::cout << "=========================\n\n"; #endif break; } case planeWaveType: // planeWave (theta, phi, ex, ey, ez) { char typeName[StrLenShort]; foo >> typeName >> magE >> theta >> phi >> x >> y >> z >> rox >> roy >> roz; cout << " " << endl; cout << "====================================================================================================" << endl; cout << " PLANEWAVE BOUNDARY CONDITION " << 
endl; cout << "====================================================================================================" << endl; printf(" PlaneWaveType : %f %f %f %f %f %f %f %f %f\n", magE, theta, phi, x, y, z, rox, roy, roz); printf(" Unit : %f\n", unit); bcARRAY[i].set_name(typeName); bcARRAY[i].set_magE(magE); bcARRAY[i].setTheta(theta); bcARRAY[i].setPhi(phi); bcARRAY[i].set_cval(No, 0.0); bcARRAY[i].SETFIELD(x, y, z); bcARRAY[i].setPW_ro(rox * unit, roy * unit, roz * unit); cout << " Name : " << typeName << endl; cout << " magE : " << magE << endl; cout << " Theta : " << theta << endl; cout << " Phi : " << phi << endl; cout << " POL : " << "(" << x << ", " << y << ", " << z << ")" << endl; cout << " r0 : " << "(" << rox << ", " << roy << ", " << roz << ")" << endl; cout << "====================================================================================================" << endl; cout << " " << endl; PWorPort = 0; #if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL) // for cuda kernel excitationProp.ro[0] = rox * unit; excitationProp.ro[1] = roy * unit; excitationProp.ro[2] = roz * unit; excitationProp.Emagnitude = magE; excitationProp.Epol[0] = x; excitationProp.Epol[1] = y; excitationProp.Epol[2] = z; excitationProp.ExcitationFlag = ExcitFlag; excitationProp.freq_m = freq; excitationProp.to = To; excitationProp.tau = Tau; excitationProp.phi = phi; excitationProp.theta = theta; #endif interior_excitation_flag = false; planeWaveMesh = new PlaneWaveMesh; planeWaveMesh->setName(typeName); PlaneWaveBCFlag = true; break; } case nonConformal: { nonConformalCase = true; break; } // Excitation Mode (PlaneWave into PML region) case pmlType: { foo >> pmlMode >> portName >> magE >> theta >> phi >> pol_x >> pol_y >> pol_z >> rox >> roy >> roz; PWorPort = 0; std::cout << "\n"; std::cout << "====================================================================================================" << std::endl; std::cout << " PML EXCITATION BOUNDARY CONDITION " << 
std::endl; std::cout << "====================================================================================================" << std::endl; printf(" PML Mode : %d\n", pmlMode); printf(" Port Name : %s\n", portName); printf(" magE : %f\n", magE); printf(" Theta : %f\n", theta); printf(" Phi : %f\n", phi); printf(" POL : (%f, %f, %f)\n", pol_x, pol_y, pol_z); printf(" r0 : (%f, %f, %f)\n", rox, roy, roz); printf(" Unit : %f\n", unit); // Make sure `unit` is defined std::cout << "====================================================================================================" << std::endl; std::cout << "\n"; // Apply to BC object bcARRAY[i].set_name(portName); bcARRAY[i].set_magE(magE); bcARRAY[i].setTheta(theta); bcARRAY[i].setPhi(phi); bcARRAY[i].set_cval(No, 0.0); bcARRAY[i].SETFIELD(pol_x, pol_y, pol_z); // Assuming SETFIELD is for polarization bcARRAY[i].setPW_ro(rox * unit, roy * unit, roz * unit); #if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL) excitationProp.ro[0] = rox * unit; excitationProp.ro[1] = roy * unit; excitationProp.ro[2] = roz * unit; excitationProp.Emagnitude = magE; excitationProp.Epol[0] = pol_x; excitationProp.Epol[1] = pol_y; excitationProp.Epol[2] = pol_z; excitationProp.ExcitationFlag = ExcitFlag; // Must be defined excitationProp.freq_m = freq; // Must be defined excitationProp.to = To; // Must be defined excitationProp.tau = Tau; // Must be defined excitationProp.phi = phi; excitationProp.theta = theta; #endif if (pmlMode == 1) { interior_excitation_flag = true; planeWaveMesh = new PlaneWaveMesh; planeWaveMesh->setName(portName); PlaneWaveBCFlag = true; } break; } } } } /** Make the egde and face arrays */ int localEdgebType(int n, int nbc[]){ int bType, nb1, nb2; switch (n){ case 0: nb1 = nbc[2]; nb2 = nbc[3]; break; case 1: nb1 = nbc[1]; nb2 = nbc[3]; break; case 2: nb1 = nbc[1]; nb2 = nbc[2]; break; case 3: nb1 = nbc[0]; nb2 = nbc[3]; break; case 4: nb1 = nbc[0]; nb2 = nbc[2]; break; case 5: nb1 = nbc[0]; nb2 = nbc[1]; 
break; } bType = (nb1 > nb2) ? nb1 : nb2; //original return bType; } void FemGrp::makeEdgeArray(){ int i, j; // oversized array for edge BCs int* edgeBcs = new int[tetraCNT * 6]; // store global edge ids for set/array use int** edgeIds = new int*[tetraCNT]; for(i = 0; i < tetraCNT; i++) edgeIds[i] = new int[NumOfEdges]; int nbc[NumOfFaces]; list edgeList; list::iterator edgeListIter; edgeSetPtr = new set; set::iterator edgeSetIter; int index = 0; for(i = 0; i < tetraCNT; i++){ tetra* tet = &(tetARRAY[i]); for(j = 0; j < NumOfFaces; j++) nbc[j] = bcArrange(tet->getbc(j)); //return the bc (the number define for the material) of each face for(j = 0; j < NumOfEdges; j++){ int n0 = edgeMAP[j][0]; int n1 = edgeMAP[j][1]; int bType = localEdgebType(j, nbc); //return the most important bc of the edge checking both faces node* nd0 = tet->getNode(n0); node* nd1 = tet->getNode(n1); edge* eg = new edge; eg->setEdge(nd0, nd1); //add each edge just once edgeSetIter = edgeSetPtr->find(*eg); if(edgeSetIter == edgeSetPtr->end()){ // new edge eg->setGlobalCnt(index); edgeIds[i][j] = index; eg->setbType(bType); edgeBcs[index] = bType; edgeSetPtr->insert(*eg); edgeList.push_back(eg); index++; }else{ // set the boundary condicion of higher value if the edge was already set delete eg; edgeIds[i][j] = edgeSetIter->getGlobalCnt(); if(bType > edgeSetIter->getbType()){ edgeBcs[edgeIds[i][j]] = bType; (const_cast(*edgeSetIter)).setbType(bType); } } } } // convert the list into an array edgeCNT = edgeList.size(); cout << " edgeCNT == " << edgeCNT << endl; edgeARRAY = new edge*[edgeCNT]; index = 0; for(edgeListIter = edgeList.begin(); edgeListIter != edgeList.end(); edgeListIter++) edgeARRAY[index++] = *edgeListIter; // set the boundary conditions for(i = 0; i < edgeCNT; i++) edgeARRAY[i]->setbType(edgeBcs[i]); delete [] edgeBcs; // get tetra-edge linkage for(i = 0; i < tetraCNT; i++){ for(j = 0; j < NumOfEdges; j++) tetARRAY[i].setEdge(edgeARRAY[edgeIds[i][j]], j); } for(i = 0; i < tetraCNT; 
i++) delete [] edgeIds[i]; delete [] edgeIds; } void FemGrp::makeNonConformalArray(){ ncARRAY = new int[nonConformalCNT]; int index = 0; for(int i=0; i < tetraCNT; i++){ tetra* tet = &(tetARRAY[i]); if(tet->getIsNC()){ ncARRAY[index] = tet->cnt; index++; } } if(nonConformalCNT != index) cout << "ERROR in makeNonConformalArray" << endl; } void FemGrp::makeFaceArray() { int i, j; // oversized arrays for face BCs and a map from global IDs with PEC faces to IDs without PEC face int* faceBcs = new int[tetraCNT * NumOfFaces]; int* indexMap = new int[tetraCNT * NumOfFaces]; //TODO: review what's the use of this array memset(faceBcs, 0, tetraCNT * NumOfFaces * sizeof(int)); memset(indexMap, 0, tetraCNT * NumOfFaces * sizeof(int)); // store global face ids for set/array use int** faceIds = new int*[tetraCNT]; for(i = 0; i < tetraCNT; i++){ faceIds[i] = new int[NumOfFaces]; memset(faceIds[i], 0, NumOfFaces * sizeof(int)); } edge eg; list faceList; vector faceListVector; list::iterator faceListIter; faceSetPtr = new set; set::iterator faceSetIter; int index = 0; int indexNoPec = 0; //TODO: review what's the use of this variable for(i = 0; i < tetraCNT; i++){ tetra* tet = &(tetARRAY[i]); for(j = 0; j < NumOfFaces; j++){ int bcNum = tet->getbc(j); // marker int bType = bcArrange(bcNum); // bc type in the defines bc* bcPtr = getbcPtr(bcNum); // pointer to the bc if(bType == nonConformal && !(tet->isNonConformal)){ nonConformalCNT++; tet->setIsNC(true); } node* nd0 = tet->getNode(faceMAP[j][0]); node* nd1 = tet->getNode(faceMAP[j][1]); node* nd2 = tet->getNode(faceMAP[j][2]); face* fc = new face; fc->setFace(nd0, nd1, nd2); //set a face with the nodes ordered from smallest to greatest id faceSetIter = faceSetPtr->find(*fc); if(faceSetIter == faceSetPtr->end()){ // new face fc->setcnt(index); faceIds[i][j] = index; if(bType != pecType) indexMap[index] = indexNoPec++; faceBcs[index] = bType; fc->setbcPtr(bcPtr); // set up face-edge linkage eg.setEdge(nd1, nd2); 
fc->setEdge(edgeARRAY[edgeSetPtr->find(eg)->getGlobalCnt()], 0); eg.setEdge(nd0, nd2); fc->setEdge(edgeARRAY[edgeSetPtr->find(eg)->getGlobalCnt()], 1); eg.setEdge(nd1, nd0); fc->setEdge(edgeARRAY[edgeSetPtr->find(eg)->getGlobalCnt()], 2); index++; faceSetPtr->insert(*fc); faceList.push_back(fc); faceListVector.push_back(fc); }else{ delete fc; faceIds[i][j] = faceSetIter->getcnt(); // the j-th local face of tetra i is an old face if(bType > faceSetIter->getbType()){ // choose btype with a larger value faceBcs[faceIds[i][j]] = bType; (const_cast(*faceSetIter)).setbType(bType); face* f = faceListVector[faceIds[i][j]]; f->setbType(bType); f->setbcPtr(bcPtr); } } } } // convert the list into an array int totalFaceCount = faceList.size(); cout << " totalFaceCount == " << totalFaceCount << endl; face** totalFaceArray = new face*[totalFaceCount]; index = 0; for(faceListIter = faceList.begin(); faceListIter != faceList.end(); faceListIter++) totalFaceArray[index++] = *faceListIter; // set the boundary conditions for(i = 0; i < totalFaceCount; i++){ totalFaceArray[i]->setbType(faceBcs[i]); } // set tetra-face linkage for(i = 0; i < tetraCNT; i++){ tetra* tet = &(tetARRAY[i]); for(j = 0; j < 4; j++){ face* fc = totalFaceArray[faceIds[i][j]]; tet->setFace(fc, j); if(fc->hydra[0] == nullptr){ // newly found face linkage fc->hydra[0] = tet; } else { // already existed, half-linked fc->hydra[1] = tet; fc->tetraArrange(); //order hydra[0] < hydra[1] } } } for(i = 0; i < tetraCNT; i++) delete [] faceIds[i]; delete [] faceIds; delete [] totalFaceArray; delete [] faceBcs; delete [] indexMap; // convert the reduced list into an array faceCNT = faceList.size(); faceARRAY = new face*[faceCNT]; indexNoPec = 0; for(faceListIter = faceList.begin(); faceListIter != faceList.end(); faceListIter++) faceARRAY[indexNoPec++] = *faceListIter; while (faceSetIter != faceSetPtr->end()){ set::iterator tmpIter = faceSetIter; faceSetIter++; faceSetPtr->erase(tmpIter); } faceSetPtr->clear(); delete 
faceSetPtr; set::iterator edgeSetIter = edgeSetPtr->begin(); while(edgeSetIter != edgeSetPtr->end()){ set::iterator tmpIter = edgeSetIter; edgeSetIter++; edgeSetPtr->erase(tmpIter); } edgeSetPtr->clear(); delete edgeSetPtr; } int FemGrp::bcArrange(int bNum){ // from that indicated in file to type defined in bc.h (marker to bc type) for(int i = 0; i < bcCNT; i ++){ if(bcARRAY[i].getbNum() == bNum) return bcARRAY[i].getbType(); } return 0; } bc *FemGrp::getbcPtr(int bNum){ for(int i = 0; i < bcCNT; i ++) if(bcARRAY[i].getbNum() == bNum) return &(bcARRAY[i]); return nullptr; } void FemGrp::AssignExcitParamToFace(){ for(int i = 0; i < faceCNT; i++){ faceARRAY[i]->setTo(To); faceARRAY[i]->setTau(Tau); faceARRAY[i]->setTimeDist(TimeDistFlag); faceARRAY[i]->setExciFlag(ExcitFlag); faceARRAY[i]->setFrequency(freq); } } void FemGrp::AssignMaterialProperties(){ int i; tetra *tet; for(i = 0; i < tetraCNT; i++) { tet = &(tetARRAY[i]); tet->SetFacePEC(); tet->SetFacePMC(); tet->set_mat(&(objProp[tet->getobjNum()])); tet->set_ConductivityFlag(); // Additional routine for scattering region if (tet->getMat()->scattering_region) { tet->scattering_region = true; } // Additional routine for PML if (tet->getMat()->get_PML_Flag() == 1) { tet->set_PML_Flag(1); } else { tet->set_PML_Flag(0); } if (tet->get_PML_Flag() == -1) cout << "PML_Flag() not set " << endl; } } void FemGrp::AssignTetraFlags(){ int AbcCount = 0; int InterCount = 0; int PortCount = 0; tetra *tet; cout << " " << endl; cout << "======================================================" << endl; cout << " Total number of TetraHedra " << endl; cout << "======================================================" << endl; cout << " Total number of TetraHedra := " << tetraCNT << endl; // Parallelized by Qi Jian #pragma omp parallel for for(int i = 0; i < tetraCNT; i++) { tet = &(tetARRAY[i]); tet->set_TetrahedronFlag(); } double min_AABB_size = 3e8 / (freq * 1e6) / 10.0; // For every tetrahedron, set the neighbor tetrahedra #pragma 
omp parallel for for(int i = 0; i < tetraCNT; i++) { tet = &(tetARRAY[i]); tet->set_NeighborTetra(tetARRAY, ncARRAY, nonConformalCNT, &octree_object, min_AABB_size); } for(int i = 0; i < tetraCNT; i++) { tet = &(tetARRAY[i]); tet->set_PolyOrderFlagDebug(PolyFlag); // The following code is node thread safe. if (tet->TetrahedronFlag == 0) InterCount++; if (tet->TetrahedronFlag == 1) AbcCount++; if (tet->ExcitationFlag == 1) PortCount++; } cout << " Total number of P" << PolyFlag << " TetraHedra := " << tetraCNT << endl; cout << " Total number of Interior TetraHedra := " << InterCount << endl; cout << " Total number of AbcCount TetraHedra := " << AbcCount << endl; cout << " Total number of Port/PlaneWave TetraHedra := " << PortCount << endl; cout << "======================================================" << endl; cout << " " << endl; int min_poly = tetARRAY[0].get_PolyOrderFlag(); for(int i = 1; i < tetraCNT; i++){ if(tetARRAY[i].get_PolyOrderFlag() < min_poly) min_poly = tetARRAY[i].get_PolyOrderFlag(); } for(int i = 0; i < tetraCNT; i++) tetARRAY[i].set_MinimumPoly(min_poly); // Define Excitation tetrahedral TetExcitIndexArraySize = PortCount; TetExcitIndexArray = (int*)malloc(sizeof(int) * TetExcitIndexArraySize); int index = 0; for(int i = 0; i < tetraCNT; i ++){ tet = &(tetARRAY[i]); if(tet->ExcitationFlag == 1){ TetExcitIndexArray[index] = i; index++; } } } void FemGrp::makePlaneWaveMesh(){ int i, j; set meshNodeIds; // count the number of plane wave faces int pwFaceNum = 0; for(i = 0; i < faceCNT; i++){ if(faceARRAY[i]->getbType() == planeWaveType || faceARRAY[i]->getbType() == pmlType) pwFaceNum++; } // set planeWaveMesh_'s faceCnt_ and allocate its faceArray_ planeWaveMesh->setFaceCnt(pwFaceNum); cout << " pwFaceNum == " << pwFaceNum << endl; cout << " planeWaveMesh->faceCNT == " << planeWaveMesh->faceCNT << endl; // populate faceArray_ int index = 0; for(i = 0; i < faceCNT; i++){ if(faceARRAY[i]->getbType() == planeWaveType || faceARRAY[i]->getbType() == 
pmlType){ planeWaveMesh->setFace(faceARRAY[i], index); index++; // add unique node ids for(j = 0; j < NumOfNodesPerFace; j++) meshNodeIds.insert(faceARRAY[i]->getNode(j)->getid()); } } // allocate and add node pointers to array keep local mapping int nodeNum = meshNodeIds.size(); planeWaveMesh->setNodeCnt(nodeNum); cout << " nodeNum == " << nodeNum << endl; cout << " planeWaveMesh->nodeCNT == " << planeWaveMesh->nodeCNT << endl; planeWaveMesh->allocGlobToLocMap(); node** PlaneWaveNodeArray = planeWaveMesh->getNodeArray(); map& globToLocMap = planeWaveMesh->getGlobToLocMap(); set::iterator meshNodeIdIter; int nodeCount = 0; for(meshNodeIdIter = meshNodeIds.begin(); meshNodeIdIter != meshNodeIds.end(); meshNodeIdIter++){ PlaneWaveNodeArray[nodeCount] = &(ndARRAY[*meshNodeIdIter]); globToLocMap[ndARRAY[*meshNodeIdIter].getid()] = nodeCount++; } // Set the bounding box coordinates for the Planewave mesh // Useful for PML /* planeWaveMesh->computeBoundingBox(); planewave_xmin = planeWaveMesh->getXmin(); planewave_xmax = planeWaveMesh->getXmax(); planewave_ymin = planeWaveMesh->getYmin(); planewave_ymax = planeWaveMesh->getYmax(); planewave_zmin = planeWaveMesh->getZmin(); planewave_zmax = planeWaveMesh->getZmax(); cout << "Planewave bounding box coordinates: " << std::endl; cout << "xmin: " << planewave_xmin << ", xmax: " << planewave_xmax << std::endl; cout << "ymin: " << planewave_ymin << ", ymax: " << planewave_ymax << std::endl; cout << "zmin: " << planewave_zmin << ", zmax: " << planewave_zmax << std::endl; */ } // Single BC_ID void FemGrp::makeInterSurfMesh(int BC_id){ cout << " Generating InterSurf Mesh with " << BC_id << endl; InterSurfMesh = new PlaneWaveMesh; int i, j; set InterSurfNodeIds; // count the number of faces int InterFaceNum = 0; int* FaceMap = new int[faceCNT]; for(i = 0; i < faceCNT; i++) FaceMap[i] = -1; // Find the faces for(i = 0; i < faceCNT; i++){ if(faceARRAY[i]->getbcPtr()->getbType() == BC_id){ //change InterFaceNum++; FaceMap[i] = i; } } 
if(InterFaceNum == 0) return; // set InterSurfMesh_'s faceCnt_ and allocate its faceArray_ cout << " InterFaceNum == " << InterFaceNum << endl; InterSurfMesh->setFaceCnt(InterFaceNum); cout << " FaceNum == " << InterFaceNum << endl; cout << " ->faceCNT == " << InterSurfMesh->faceCNT << endl; // populate faceArray_ int index = 0; for(i = 0; i < faceCNT; i++){ if(FaceMap[i] > 0){ InterSurfMesh->setFace(faceARRAY[i], index); index++; // add unique node ids for(j = 0; j < NumOfNodesPerFace; j++) InterSurfNodeIds.insert(faceARRAY[i]->getNode(j)->getid()); } } // allocate and add node pointers to array // keep local mapping int nodeNum = InterSurfNodeIds.size(); InterSurfMesh->setNodeCnt(nodeNum); cout << " nodeNum == " << nodeNum << endl; cout << " ->nodeCNT == " << InterSurfMesh->nodeCNT << endl; InterSurfMesh->allocGlobToLocMap(); node** InterSurfNodeArray = InterSurfMesh->getNodeArray(); map& globToLocMap = InterSurfMesh->getGlobToLocMap(); set::iterator meshNodeIdIter; int nodeCount = 0; for(meshNodeIdIter = InterSurfNodeIds.begin(); meshNodeIdIter != InterSurfNodeIds.end(); meshNodeIdIter++){ InterSurfNodeArray[nodeCount] = &(ndARRAY[*meshNodeIdIter]); globToLocMap[ndARRAY[*meshNodeIdIter].getid()] = nodeCount++; } //write file char Currents_vtkFile[StrOutput]; sprintf(Currents_vtkFile, "SurfBC_%s_%d", fname, BC_id); node** locNodeArray = new node*[InterSurfMesh->nodeCNT]; for(i = 0; i < InterSurfMesh->nodeCNT; i++){ node& Node = *(InterSurfMesh->ndArray[i]); index = InterSurfMesh->globToLocMap_->find(Node.getid())->second; locNodeArray[index] = new node(index, Node.getPType(), Node.getSingOrder(), Node.getCoord().getx(), Node.getCoord().gety(), Node.getCoord().getz()); } face** locFaceArray = new face*[InterSurfMesh->faceCNT]; for(i = 0; i < InterSurfMesh->faceCNT; i++){ face& Face = *(InterSurfMesh->fcArray[i]); locFaceArray[i] = new face(Face); locFaceArray[i]->setFace(locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(0)->getid())->second], 
locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(1)->getid())->second], locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(2)->getid())->second]); } //TODO: check why unit is 1. instead of unit VtkWriter vtkWriter(1.); //TODO: check why order is 1. instead of order vtkWriter.writeTriUg(Currents_vtkFile, InterSurfMesh->nodeCNT, locNodeArray, InterSurfMesh->faceCNT, locFaceArray, 1); for(i = 0; i < InterSurfMesh->nodeCNT; i++) delete locNodeArray[i]; delete [] locNodeArray; for(i = 0; i < InterSurfMesh->faceCNT; i++) delete locFaceArray[i]; delete [] locFaceArray; } // Double BC_ID void FemGrp::makeInterSurfMesh(int BC_id1,int BC_id2){ InterSurfMesh = new PlaneWaveMesh; int i, j; set InterSurfNodeIds; // count the number of faces int InterFaceNum = 0; int* FaceMap = new int[faceCNT]; for(i = 0; i < faceCNT; i++) FaceMap[i] = -1; // Find the faces for(i = 0; i < faceCNT; i++){ if((faceARRAY[i]->getbcPtr()->getbType() == BC_id1) || (faceARRAY[i]->getbcPtr()->getbType() == BC_id2)){ InterFaceNum++; FaceMap[i] = i; } } if(InterFaceNum == 0) return; // set InterSurfMesh_'s faceCnt_ and allocate its faceArray_ cout << "== InterFaceNum == " << InterFaceNum << endl; InterSurfMesh->setFaceCnt(InterFaceNum); cout << "== FaceNum == " << InterFaceNum << endl; cout << "== ->faceCNT == " << InterSurfMesh->faceCNT << endl; // populate faceArray_ int index = 0; for(i = 0; i < faceCNT; i++){ if(FaceMap[i] > 0){ InterSurfMesh->setFace(faceARRAY[i], index); index++; // add unique node ids for(j = 0; j < NumOfNodesPerFace; j++) InterSurfNodeIds.insert(faceARRAY[i]->getNode(j)->getid()); } } // allocate and add node pointers to array // keep local mapping int nodeNum = InterSurfNodeIds.size(); InterSurfMesh->setNodeCnt(nodeNum); cout << "== nodeNum == " << nodeNum << endl; cout << "== ->nodeCNT == " << InterSurfMesh->nodeCNT << endl; InterSurfMesh->allocGlobToLocMap(); node** InterSurfNodeArray = InterSurfMesh->getNodeArray(); map& globToLocMap = 
InterSurfMesh->getGlobToLocMap(); set::iterator meshNodeIdIter; int nodeCount = 0; for(meshNodeIdIter = InterSurfNodeIds.begin(); meshNodeIdIter != InterSurfNodeIds.end(); meshNodeIdIter++){ InterSurfNodeArray[nodeCount] = &(ndARRAY[*meshNodeIdIter]); globToLocMap[ndARRAY[*meshNodeIdIter].getid()] = nodeCount++; } //write file char Currents_vtkFile[StrOutput]; sprintf(Currents_vtkFile, "SurfBC_%s", fname); node** locNodeArray = new node*[InterSurfMesh->nodeCNT]; for(i = 0; i < InterSurfMesh->nodeCNT; i++){ node& Node = *(InterSurfMesh->ndArray[i]); index = InterSurfMesh->globToLocMap_->find(Node.getid())->second; locNodeArray[index] = new node(index, Node.getPType(), Node.getSingOrder(), Node.getCoord().getx(), Node.getCoord().gety(), Node.getCoord().getz()); } face** locFaceArray = new face*[InterSurfMesh->faceCNT]; for(i = 0; i < InterSurfMesh->faceCNT; i++){ face& Face = *(InterSurfMesh->fcArray[i]); locFaceArray[i] = new face(Face); locFaceArray[i]->setFace( locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(0)->getid())->second], locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(1)->getid())->second], locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(2)->getid())->second]); } //TODO: check why unit is 1. instead of unit VtkWriter vtkWriter(1.); //TODO: check why order is 1. 
instead of order vtkWriter.writeTriUg(Currents_vtkFile, InterSurfMesh->nodeCNT, locNodeArray, InterSurfMesh->faceCNT, locFaceArray, 1); for(i = 0; i < InterSurfMesh->nodeCNT; i++) delete locNodeArray[i]; delete [] locNodeArray; for(i = 0; i < InterSurfMesh->faceCNT; i++) delete locFaceArray[i]; delete [] locFaceArray; } void FemGrp::makeSurfMesh(int BC_id){ cout << "Generating Surf Mesh with " << BC_id << endl; SurfMesh = new PlaneWaveMesh; int i, j; set InterSurfNodeIds; // count the number of faces int InterFaceNum = 0; int* FaceMap = new int[faceCNT]; for(i = 0; i < faceCNT; i++) FaceMap[i] = -1; // Find the faces for(i = 0; i < faceCNT; i++){ if(faceARRAY[i]->getbcPtr()->getbType() == BC_id){ //change InterFaceNum++; FaceMap[i] = i; } } if(InterFaceNum == 0) return; // set SurfMesh_'s faceCnt_ and allocate its faceArray_ cout << "== InterFaceNum == " << InterFaceNum << endl; SurfMesh->setFaceCnt(InterFaceNum); cout << "== FaceNum == " << InterFaceNum << endl; cout << "== ->faceCNT == " << SurfMesh->faceCNT << endl; // populate faceArray_ int index = 0; for(i = 0; i < faceCNT; i++){ if(FaceMap[i] > 0){ SurfMesh->setFace(faceARRAY[i], index); index++; // add unique node ids for(j = 0; j < NumOfNodesPerFace; j++) InterSurfNodeIds.insert(faceARRAY[i]->getNode(j)->getid()); } } // allocate and add node pointers to array // keep local mapping int nodeNum = InterSurfNodeIds.size(); SurfMesh->setNodeCnt(nodeNum); cout << "== nodeNum == " << nodeNum << endl; cout << "== ->nodeCNT == " << SurfMesh->nodeCNT << endl; SurfMesh->allocGlobToLocMap(); node** InterSurfNodeArray = SurfMesh->getNodeArray(); map& globToLocMap = SurfMesh->getGlobToLocMap(); set::iterator meshNodeIdIter; int nodeCount = 0; for(meshNodeIdIter = InterSurfNodeIds.begin(); meshNodeIdIter != InterSurfNodeIds.end(); meshNodeIdIter++){ InterSurfNodeArray[nodeCount] = &(ndARRAY[*meshNodeIdIter]); globToLocMap[ndARRAY[*meshNodeIdIter].getid()] = nodeCount++; } //write file char Currents_vtkFile[StrOutput]; 
sprintf(Currents_vtkFile, "SurfBC_%s_%d", fname, BC_id); node** locNodeArray = new node*[SurfMesh->nodeCNT]; for(i = 0; i < SurfMesh->nodeCNT; i++){ node& Node = *(SurfMesh->ndArray[i]); index = SurfMesh->globToLocMap_->find(Node.getid())->second; locNodeArray[index] = new node(index, Node.getPType(), Node.getSingOrder(), Node.getCoord().getx(), Node.getCoord().gety(), Node.getCoord().getz()); } face** locFaceArray = new face*[SurfMesh->faceCNT]; for(i = 0; i < SurfMesh->faceCNT; i++){ face& Face = *(SurfMesh->fcArray[i]); locFaceArray[i] = new face(Face); locFaceArray[i]->setFace(locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(0)->getid())->second], locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(1)->getid())->second], locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(2)->getid())->second]); } //TODO: check why unit is 1. instead of unit (it may be because the node coordinates are already scaled after readin. So they are true unit of the geometry) VtkWriter vtkWriter(1.); vtkWriter.writeTriUg(Currents_vtkFile, SurfMesh->nodeCNT, locNodeArray, SurfMesh->faceCNT, locFaceArray, 1); //The one is because we only work with first order geometry (modify if we want to work with higher order structures) for(i = 0; i < SurfMesh->nodeCNT; i++) delete locNodeArray[i]; delete [] locNodeArray; for(i = 0; i < SurfMesh->faceCNT; i++) delete locFaceArray[i]; delete [] locFaceArray; } // Double BC_ID void FemGrp::makeSurfMesh(int BC_id1,int BC_id2){ SurfMesh = new PlaneWaveMesh; int i, j; set InterSurfNodeIds; // count the number of faces int InterFaceNum = 0; int* FaceMap = new int[faceCNT]; for(i = 0; i < faceCNT; i++) FaceMap[i] = -1; // Find the faces for(i = 0; i < faceCNT; i++){ if((faceARRAY[i]->getbcPtr()->getbType() == BC_id1) || (faceARRAY[i]->getbcPtr()->getbType() == BC_id2)){ InterFaceNum++; FaceMap[i] = i; } } if(InterFaceNum == 0) return; // set SurfMesh_'s faceCnt_ and allocate its faceArray_ cout << "== InterFaceNum == " << InterFaceNum << 
endl; SurfMesh->setFaceCnt(InterFaceNum); cout << "== FaceNum == " << InterFaceNum << endl; cout << "== ->faceCNT == " << SurfMesh->faceCNT << endl; // populate faceArray_ int index = 0; for(i = 0; i < faceCNT; i++){ if(FaceMap[i] > 0){ SurfMesh->setFace(faceARRAY[i], index); index++; // add unique node ids for(j = 0; j < NumOfNodesPerFace; j++) InterSurfNodeIds.insert(faceARRAY[i]->getNode(j)->getid()); } } // allocate and add node pointers to array // keep local mapping int nodeNum = InterSurfNodeIds.size(); SurfMesh->setNodeCnt(nodeNum); cout << "== nodeNum == " << nodeNum << endl; cout << "== ->nodeCNT == " << SurfMesh->nodeCNT << endl; SurfMesh->allocGlobToLocMap(); node** InterSurfNodeArray = SurfMesh->getNodeArray(); map& globToLocMap = SurfMesh->getGlobToLocMap(); set::iterator meshNodeIdIter; int nodeCount = 0; for(meshNodeIdIter = InterSurfNodeIds.begin(); meshNodeIdIter != InterSurfNodeIds.end(); meshNodeIdIter++){ InterSurfNodeArray[nodeCount] = &(ndARRAY[*meshNodeIdIter]); globToLocMap[ndARRAY[*meshNodeIdIter].getid()] = nodeCount++; } //write file char Currents_vtkFile[StrOutput]; sprintf(Currents_vtkFile, "SurfBC_%s", fname); node** locNodeArray = new node*[SurfMesh->nodeCNT]; for(i = 0; i < SurfMesh->nodeCNT; i++){ node& Node = *(SurfMesh->ndArray[i]); index = SurfMesh->globToLocMap_->find(Node.getid())->second; locNodeArray[index] = new node(index, Node.getPType(), Node.getSingOrder(), Node.getCoord().getx(), Node.getCoord().gety(), Node.getCoord().getz()); } face** locFaceArray = new face*[SurfMesh->faceCNT]; for(i = 0; i < SurfMesh->faceCNT; i++){ face& Face = *(SurfMesh->fcArray[i]); locFaceArray[i] = new face(Face); locFaceArray[i]->setFace( locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(0)->getid())->second], locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(1)->getid())->second], locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(2)->getid())->second]); } //TODO: check why unit is 1. 
instead of unit VtkWriter vtkWriter(1.); //TODO: check why order is 1. instead of order vtkWriter.writeTriUg(Currents_vtkFile, SurfMesh->nodeCNT, locNodeArray, SurfMesh->faceCNT, locFaceArray, 1); for(i = 0; i < SurfMesh->nodeCNT; i++) delete locNodeArray[i]; delete [] locNodeArray; for(i = 0; i < SurfMesh->faceCNT; i++) delete locFaceArray[i]; delete [] locFaceArray; } // Set up the tet mass matrices and also the local inverses // If non-matrix free is used also precompute and store the update matrices void FemGrp::GetMatrices(){ int i; tetra *tet; timer_start("CPU Matrices Evaluation",'u'); // this gets the mass matrices for the local tets only cout << "tetraCNT = " << tetraCNT << endl; //std::vector vec_x1, vec_y1, vec_z1; //std::vector vec_A2x, vec_A2y, vec_A2z; //fp_t cutoff_freq = freq * 1e6; // Convert MHz to Hz #pragma omp parallel for schedule(dynamic) private(tet,i) for(i = 0; i < tetraCNT; i ++) { #if defined(DGTD_USE_CUDA) //cout << "regularRegionFlag = " << regularRegionFlag << endl; //cout << "regularReferenceARRAY[" << i << "] = " << regularReferenceARRAY[i] << endl; //It is important in this order to avoid the checking of a null pointer if(!regularRegionFlag || regularReferenceARRAY[i] == i) { tet = &(tetARRAY[i]); tet->set_flux_GAMMA(factor_Flux); bool isPML = tet->get_PML_Flag(); // ------------------------------------------------------------------------------- if (isPML) { tet->set_Conductivity_Profile_Planar(planewave_xmin, planewave_ymin, planewave_zmin, planewave_xmax, planewave_ymax, planewave_zmax); if (UseQuadratureMatrices) { tensor identity(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0); tet->Calculate_M_Matrix_E_Numeric(); tet->Calculate_M_Matrix_I_E_Numeric(); tet->Calculate_ABC_E_Numeric(); tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_epA_E, tet->matA, tet->mat->epsr, true, "A", planewave_xmin, planewave_ymin, planewave_zmin, planewave_xmax, planewave_ymax, planewave_zmax, Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // epA 
tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_epB_E, tet->matB, tet->mat->epsr, true, "B", planewave_xmin, planewave_ymin, planewave_zmin, planewave_xmax, planewave_ymax, planewave_zmax, Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // epB tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_epC_E, tet->matC, tet->mat->epsr, true, "C", planewave_xmin, planewave_ymin, planewave_zmin, planewave_xmax, planewave_ymax, planewave_zmax, Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // epC tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_D_E, tet->matD, identity, true, "D", planewave_xmin, planewave_ymin, planewave_zmin, planewave_xmax, planewave_ymax, planewave_zmax, Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // D tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_F_E, tet->matF, identity, true, "F", planewave_xmin, planewave_ymin, planewave_zmin, planewave_xmax, planewave_ymax, planewave_zmax, Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // F tet->Calculate_Bii_Matrix_E_Numeric(); tet->Calculate_Bij_Matrix_E_Numeric(); tet->Calculate_S_Matrix_E_Numeric(); tet->Calculate_Fii_Matrix_E_Numeric(); tet->Calculate_Fij_Matrix_E_Numeric(); tet->SetUp_LocalFaceToTetraMapE_NMF1_PML(tet->Class_dt); tet->Calculate_M_Matrix_H_Numeric(); tet->Calculate_M_Matrix_I_H_Numeric(); tet->Calculate_ABC_H_Numeric(); tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_muA_H, tet->matA, tet->mat->mur, false, "A", planewave_xmin, planewave_ymin, planewave_zmin, planewave_xmax, planewave_ymax, planewave_zmax, Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // muA tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_muB_H, tet->matB, tet->mat->mur, false, "B", planewave_xmin, planewave_ymin, planewave_zmin, planewave_xmax, planewave_ymax, planewave_zmax, Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // muB tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_muC_H, tet->matC, tet->mat->mur, false, "C", planewave_xmin, planewave_ymin, planewave_zmin, planewave_xmax, planewave_ymax, 
planewave_zmax, Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // muC tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_D_H, tet->matD, identity, false, "D", planewave_xmin, planewave_ymin, planewave_zmin, planewave_xmax, planewave_ymax, planewave_zmax, Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // D tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_F_H, tet->matF, identity, false, "F", planewave_xmin, planewave_ymin, planewave_zmin, planewave_xmax, planewave_ymax, planewave_zmax, Ellipse_Rx, Ellipse_Ry, Ellipse_Rz); // F tet->Calculate_Bii_Matrix_H_Numeric(); tet->Calculate_Bij_Matrix_H_Numeric(); tet->Calculate_S_Matrix_H_Numeric(); tet->Calculate_Fii_Matrix_H_Numeric(); tet->Calculate_Fij_Matrix_H_Numeric(); tet->SetUp_LocalFaceToTetraMapH_NMF1_PML(tet->Class_dt); } else { tensor identity(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0); tet->Calculate_M_Matrix_E(); tet->Calculate_M_Matrix_I_E(); tet->Calculate_ABC_E(); tet->Calculate_Mass_Material_Matrix( tet->Mass_epA_E, tet->matA, tet->mat->epsr, true); // epA tet->Calculate_Mass_Material_Matrix( tet->Mass_epB_E, tet->matB, tet->mat->epsr, true); // epB tet->Calculate_Mass_Material_Matrix( tet->Mass_epC_E, tet->matC, tet->mat->epsr, true); // epC tet->Calculate_Mass_Material_Matrix( tet->Mass_D_E, tet->matD, identity, true); // D tet->Calculate_Mass_Material_Matrix( tet->Mass_F_E, tet->matF, identity, true); // F tet->Calculate_Bii_Matrix_E(); tet->Calculate_Bij_Matrix_E(); tet->Calculate_S_Matrix_E(); tet->Calculate_Fii_Matrix_E(); tet->Calculate_Fij_Matrix_E(); tet->SetUp_LocalFaceToTetraMapE_NMF1_PML(tet->Class_dt); tet->Calculate_M_Matrix_H(); tet->Calculate_M_Matrix_I_H(); tet->Calculate_ABC_H(); tet->Calculate_Mass_Material_Matrix( tet->Mass_muA_H, tet->matA, tet->mat->mur, false); // muA tet->Calculate_Mass_Material_Matrix( tet->Mass_muB_H, tet->matB, tet->mat->mur, false); // muB tet->Calculate_Mass_Material_Matrix( tet->Mass_muC_H, tet->matC, tet->mat->mur, false); // muC 
tet->Calculate_Mass_Material_Matrix( tet->Mass_D_H, tet->matD, identity, false); // D tet->Calculate_Mass_Material_Matrix( tet->Mass_F_H, tet->matF, identity, false); // F tet->Calculate_Bii_Matrix_H(); tet->Calculate_Bij_Matrix_H(); tet->Calculate_S_Matrix_H(); tet->Calculate_Fii_Matrix_H(); tet->Calculate_Fij_Matrix_H(); tet->SetUp_LocalFaceToTetraMapH_NMF1_PML(tet->Class_dt); } } // PML // ------------------------------------------------------------------------------- else { if (UseQuadratureMatrices) { tet->Calculate_M_Matrix_E_Numeric(); tet->Calculate_M_Matrix_H_Numeric(); tet->Calculate_Bii_Matrix_E_Numeric(); tet->Calculate_Bij_Matrix_E_Numeric(); tet->Calculate_S_Matrix_E_Numeric(); tet->Calculate_Fii_Matrix_E_Numeric(); tet->Calculate_Fij_Matrix_E_Numeric(); tet->SetUp_LocalFaceToTetraMapE_NMF1_Numeric(tet->Class_dt); tet->Calculate_Bii_Matrix_H_Numeric(); tet->Calculate_Bij_Matrix_H_Numeric(); tet->Calculate_S_Matrix_H_Numeric(); tet->Calculate_Fii_Matrix_H_Numeric(); tet->Calculate_Fij_Matrix_H_Numeric(); tet->SetUp_LocalFaceToTetraMapH_NMF1_Numeric(tet->Class_dt); } else { tet->Calculate_M_Matrix_E(); tet->Calculate_M_Matrix_H(); tet->Calculate_Bii_Matrix_E(); tet->Calculate_Bij_Matrix_E(); tet->Calculate_S_Matrix_E(); tet->Calculate_Fii_Matrix_E(); tet->Calculate_Fij_Matrix_E(); tet->SetUp_LocalFaceToTetraMapE_NMF1(tet->Class_dt); tet->Calculate_Bii_Matrix_H(); tet->Calculate_Bij_Matrix_H(); tet->Calculate_S_Matrix_H(); tet->Calculate_Fii_Matrix_H(); tet->Calculate_Fij_Matrix_H(); tet->SetUp_LocalFaceToTetraMapH_NMF1(tet->Class_dt); } } } #else tet = &(tetARRAY[i]); tet->set_flux_GAMMA(factor_Flux); tet->Calculate_M_Matrix_E(); tet->Calculate_M_Matrix_H(); // this tet->Calculate_Bii_Matrix_E(); tet->Calculate_Bij_Matrix_E(); tet->Calculate_S_Matrix_E(); tet->Calculate_Fii_Matrix_E(); tet->Calculate_Fij_Matrix_E(); tet->SetUp_LocalFaceToTetraMapE_NMF1(tet->Class_dt); tet->Calculate_Bii_Matrix_H(); tet->Calculate_Bij_Matrix_H(); 
tet->Calculate_S_Matrix_H(); tet->Calculate_Fii_Matrix_H(); tet->Calculate_Fij_Matrix_H(); tet->SetUp_LocalFaceToTetraMapH_NMF1(tet->Class_dt); #endif } timer_stop('u'); } void FemGrp::SetUpMatrixVector(){ DimE = dimE; DimH = dimH; #if defined(DGTD_USE_CUDA) // MemSizeE = DimE * sizeof(fp_t_ts); // MemSizeH = DimH * sizeof(fp_t_ts); // CUDA_SAFE_CALL(cudaMallocHost((void**)&En1_h, MemSizeE, cudaHostAllocMapped)); // CUDA_SAFE_CALL(cudaMallocHost((void**)&Hn32_h, MemSizeH, cudaHostAllocMapped)); #else MemSizeE = DimE * sizeof(fp_t); MemSizeH = DimH * sizeof(fp_t); en = new ArrayFP(DimE); hn_12 = new ArrayFP(DimH); en_1 = new ArrayFP(DimE); hn_32 = new ArrayFP(DimH); #endif // pre-compute the facial matrices required for coupling #pragma omp parallel for schedule(static) for(int i = 0; i < faceCNT; i++) faceARRAY[i]->SetUpMatrixFree(); // #pragma omp parallel for schedule(dynamic) private(tet,i) #pragma omp parallel for schedule(dynamic) for(int i = 0; i < tetraCNT; i++){ tetARRAY[i].SetUpMatrixFree(); } } void FemGrp::DG_AssignOffsets(){ int i; int OffsetE = 0; int OffsetH = 0; tetra* tet; for(i = 0; i < tetraCNT; i ++){ tet = &(tetARRAY[i]); tet->CountDOF_E(); tet->CountDOF_H(); dimE = dimE + tet->LocalEDOF; dimH = dimH + tet->LocalHDOF; tet->set_LocalOffsetE(OffsetE); OffsetE = OffsetE + tet->LocalEDOF; tet->set_LocalOffsetH(OffsetH); OffsetH = OffsetH + tet->LocalHDOF; } cout << " " << endl; cout << "=================" << endl; cout << " Dimensions " << endl; cout << "=================" << endl; cout << " dimE = " << dimE << endl; cout << " dimH = " << dimH << endl; cout << "=================" << endl; cout << " " << endl; } void FemGrp::Get_dt_min_max(){ int printSc = tetraCNT / 10; fp_t V_P; fp_t LocaldtMin = 1.0 * 1e6; fp_t LocalDt; fp_t LocaldtMax = 0.0; // #pragma omp parallel for schedule(dynamic) shared(LocaldtMin) private(LocalDt, V_P) for(int i = 0; i < tetraCNT; i ++){ tetra* tet = &(tetARRAY[i]); tet->TimeStepEstimate(LocalDt, V_P); 
tet->set_Stability_dt(LocalDt); // May 5 2011 if(LocalDt < LocaldtMin){ #pragma omp atomic write LocaldtMin = LocalDt; } if(LocalDt > LocaldtMax){ #pragma omp atomic write LocaldtMax = LocalDt; } if(i % printSc == 0) DEBUG_INFO(" Finished: " + to_string(i / (fp_t)tetraCNT * 100.0) + " %"); } dt_min = LocaldtMin; dt_max = LocaldtMax; } void FemGrp::LocalTimeSteppingClassPartioning() { cout.setf(ios::scientific,ios_base::floatfield); cout.precision(20); cout << " " << endl; cout << "========================================================" << endl; cout << " LocalTimeSteppingClassPartioning " << endl << flush; cout << "========================================================" << endl; ////////////////////////////////////////////////////////////////////////////////////// // In this part we calculate the minimum and maximum time-step, with these // // values, we calculate the number of classes and the ttime-step of each class as: // // dt_k = (2.0 * m + 1)^k * dt_min // // - m = class factor // // - k = number of the class(starts in 0) // // - dt_k = timestep of class k // // - dt_min = minimun timestep // // we also assign to each tetra the class they belong to // ////////////////////////////////////////////////////////////////////////////////////// int ClassCnt = 0; int PMLClassCnt = 0; // For PML setClassMul(1);// this is actually the m not (2m+1) fp_t m = getClassMul(); cout << " Class Factor: (2m + 1), m = " << m << " " << endl << flush; cout << " " << endl; fp_t LocalDt; fp_t LocalDt_down; fp_t LocalDt_up; tetra *tet; cout << " Calculating Time steps " << endl; Get_dt_min_max(); cout << " " << endl; cout << " Get_dt_min = " << dt_min << endl; cout << " Get_dt_max = " << dt_max << endl; cout << " " << endl; cout.setf(ios::scientific,ios_base::floatfield); cout.precision(8); cout << " Starting class partitioning" << endl; N_class = (int)ceil(log((dt_max / dt_min)) / log(2.0 * m + 1.0)); if(scalbSty == 1 || N_class == 0) //only 1 if DGTD_USE_LTS is NOT defined 
N_class = 1; LocTimeSteps = new double[N_class]; ClassTetraCnt = new int[N_class]; ClassPMLTetraCnt = new int[N_class]; for(int i = 0 ; i < N_class; i++) { ClassTetraCnt[i] = 0; ClassPMLTetraCnt[i] = 0; } cout << " " << endl; cout << " N_class: " << N_class << endl; if(scalbSty) TimeStep_dt = dt_min; numberPML = 0; for(int i = 0 ; i < N_class; i++) { LocalDt_down = pow((2.0 * m + 1.0), i) * dt_min; LocalDt_up = pow((2.0 * m + 1.0), (i + 1)) * dt_min; LocTimeSteps[i] = 1.0 * LocalDt_down; #pragma omp parallel for schedule(dynamic) shared(ClassCnt,PMLClassCnt) private(tet, LocalDt) for(int j = 0; j < tetraCNT; j ++) { tet = &(tetARRAY[j]); if(scalbSty) { tet->set_LTS_Flag(i); tet->set_Class_dt(1.0 * LocalDt_down); bool isExcitation = tet->get_ExcitationFlag(); #pragma omp atomic ClassCnt++; if (tet->get_PML_Flag() && !isExcitation) { #pragma omp atomic PMLClassCnt++; } else { // Increment the count of tetrahedra in this class #pragma omp atomic ClassCnt++; } } else { LocalDt = tet->get_Stability_dt(); //LocalDt = 0.93 * LocalDt; if(LocalDt_down <= LocalDt && (LocalDt < LocalDt_up || i == N_class - 1)) { tet->set_LTS_Flag(i); tet->set_Class_dt(1.0 * LocalDt_down); bool isExcitation = tet->get_ExcitationFlag(); if (tet->get_PML_Flag() && !isExcitation) { #pragma omp atomic PMLClassCnt++; } else { // Increment the count of tetrahedra in this class #pragma omp atomic ClassCnt++; } } } } ClassTetraCnt[i] = ClassCnt; ClassPMLTetraCnt[i] = PMLClassCnt; numberPML += PMLClassCnt; cout << " Number of Tetra in class: " << i << " = " << ClassTetraCnt[i] << endl; cout << " Number of PML Tetra in class: " << i << " = " << ClassPMLTetraCnt[i] << std::endl; cout << "-------------------------------------------------------------" << endl; ClassCnt = 0; PMLClassCnt = 0; } cout << "Total Number of PML Tetras = " << numberPML << endl; //////////////////////////////////////////////////////////////////////////////////// // In this part we check if there is enough elements in one class to 
be efficient // // if not, those elements will be moved to the previous class // //////////////////////////////////////////////////////////////////////////////////// if(N_class > 1) { bool reduceN_class = false; bool balanced = false; for(int i = 0; i < N_class - 1; i++) { int classN = (N_class - 1) - i; fp_t number_of_tetra_in_classN = (fp_t)ClassTetraCnt[classN] + (fp_t)ClassPMLTetraCnt[classN]; fp_t relClassCnt = number_of_tetra_in_classN / tetraCNT; fp_t previousClassDt = pow((2.0 * m + 1.0), classN - 1) * dt_min; if (relClassCnt < ClassRelMinCNT && number_of_tetra_in_classN < ClassMinCNT) { if(i == 0) { reduceN_class = true; } balanced = true; ClassTetraCnt[classN - 1] += ClassTetraCnt[classN]; ClassTetraCnt[classN] = 0; ClassPMLTetraCnt[classN - 1] += ClassPMLTetraCnt[classN]; ClassPMLTetraCnt[classN] = 0; #pragma omp parallel for schedule(dynamic) private(tet) for(int j = 0; j < tetraCNT; j ++) { tet = &(tetARRAY[j]); if(tetARRAY[j].get_LTS_Flag() == classN) { tet->set_LTS_Flag(classN - 1); tet->set_Class_dt(1.0 * previousClassDt); } } } } if(reduceN_class) { N_class -= 1; } if(balanced) { cout << "=================================" << endl; cout << "Classes have been balanced\n"; for (int i = 0; i < N_class; i++) { cout << " Number of Tetra in class: " << i << " = " << ClassTetraCnt[i] << std::endl; cout << " Number of PML Tetra in class: " << i << " = " << ClassPMLTetraCnt[i] << std::endl << endl; } cout << "=================================" << endl; } } // Check that all the elements are associated with a class for(int j = 0; j < tetraCNT; j ++) { if(tetARRAY[j].get_LTS_Flag() < 0) cout << " tet " << tetARRAY[j].getcnt() << " has LTS_flag = " << tetARRAY[j].get_LTS_Flag() << " and LTS time step " << tetARRAY[j].get_Class_dt() << endl; } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // In this part we order the tetras in the most efficient way for the GPU // // - 1st: we order 
by class, from smaller time-step to larger // // - 2nd: each class is ordered by nonConformal tetras 1st and then conformal ones // // - 3rd: we order the nonconformal ones as: excitation (ordered by number of exciting faces 1-2-3) - nonExcitation // // - 4th: we order the conformal ones as: nonRegular - Reg1 - Reg2 - ... // // // // *** NOTE: in nonConformal we also include any tetra with a face without neighbor *** // /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // ----------------------------------------------------------------------------------------------------- // Determine cutoff between Normal-regular groups and Regular-PML groups. // Assumptions: // - regularGroup == 0 -> Irregular (both non-PML and PML) // - regularGroup > 0 -> Regular // - Groups are assigned so that all non-PML regular groups use smaller IDs // than any PML regular groups (i.e., there exists a clean cutoff). // // Outputs: // regularCNT_Normal : number of regular groups used by non-PML (g in [1 .. cutoff-1]) // regularCNT_PML : number of regular groups used by PML (g in [cutoff .. 
regularCNT-1]) // ----------------------------------------------------------------------------------------------------- cout << "-----------------------" << endl; if (regularCNT > 1) { regularCNT_Normal = 0; for(int j = 0; j < tetraCNT; j ++) { tet = &(tetARRAY[j]); int groupID = tet->getRegularGroup(); bool isPML = tet->get_PML_Flag(); if (!isPML) { if ((groupID > regularCNT_Normal)) { regularCNT_Normal = groupID; } } } regularCNT_PML = regularCNT - regularCNT_Normal - 1; } else { regularCNT_Normal = 0; regularCNT_PML = 0; } cout << "regularCNT = " << regularCNT << endl; std::cout << "regularCNT_Normal = " << regularCNT_Normal << "\n"; std::cout << "regularCNT_PML = " << regularCNT_PML << "\n"; int NumGroups = regularCNT + 4 + portCNT; cout << "NumGroups = " << NumGroups << endl; // ----------------------- // Populate the TetraIndex // ----------------------- // ----------------------------------------------------------------- // // Store the tetrahedra in the ClassTetraIndexAux array // // ----------------------------------------------------------------- // list* ClassTetraIndexAux = new list[NumGroups]; ClassTetraIndex = new int*[N_class]; ClassExcitationCount = new int[N_class]; ClassExcitationOffset = new int[N_class]; ClassExcitation_sc_CNT = new int[N_class]; list ClassExcitationPerFaceList[(int)pow(2, NumOfFaces) - 1]; if (portCNT > 0) { ClassPortCnt_h = new int[N_class * portCNT]; ClassPortOffset_h = new int[N_class * portCNT]; ClassPortNum_h = new int[N_class * portCNT]; } for(int i = 0 ; i < N_class; i++) { ClassTetraIndex[i] = new int[ClassTetraCnt[i] + ClassPMLTetraCnt[i]]; ClassExcitationCount[i] = 0; ClassExcitationOffset[i] = 0; ClassExcitation_sc_CNT[i] = 0; } int PML_Case = NumGroups - 1; int Scattering_Excited_Case = NumGroups - 2; int Total_Excited_Case = NumGroups - 3; int NC_Case = NumGroups - 4; int Port_Case = NumGroups - 4 - portCNT; // First port case int Conformal_Case = 0; int index; int DGface_bc; int auxCNT = 0; excitationFaces = 0; 
int ClassOffSet = 0; ClassTetraOffset = new int[N_class]; ClassPMLTetraOffset = new int[N_class]; for(int i = 0 ; i < N_class; i++) { for(int j = 0; j < tetraCNT; j ++) { tet = &(tetARRAY[j]); bool isExcite = tet->ExcitationFlag; bool isPML = tet->get_PML_Flag(); bool isNC = tet->getIsNC(); if(tet->LTS_Flag == i) { if(tet->getRegularGroup() > 0) ClassTetraIndexAux[tet->getRegularGroup()].push_back(tet->getcnt()); else if(!isNC && tet->get_NeighNum() == 4 && !isPML && !isExcite) ClassTetraIndexAux[Conformal_Case].push_back(tet->getcnt()); else if (isPML) ClassTetraIndexAux[PML_Case].push_back(tet->getcnt()); else { if(isExcite) { ClassExcitationCount[i]++; int face = 0; for(int k = 0; k < NumOfFaces; k++) { if (!tet->fc[k] || !tet->fc[k]->bcPtr) continue; // optional null guard DGface_bc = tet->fc[k]->bcPtr->getbType(); if(DGface_bc == planeWaveType || DGface_bc == portType || DGface_bc == pmlType) { face += (1 << k); excitationFaces++; } } if (face > 0) ClassExcitationPerFaceList[face - 1].push_back(tet->getcnt()); } else { ClassTetraIndexAux[NC_Case].push_back(tet->getcnt()); } } } } // ----------------------------------------------------------------- // // Excitation // // ----------------------------------------------------------------- // ClassExcitationOffset[i] = auxCNT; auxCNT += ClassExcitationCount[i]; for(int j = (1 << NumOfFaces) - 2; j >= 0; j--) { int listIndex = faceExcitationOrder[j] - 1; int auxSize = ClassExcitationPerFaceList[listIndex].size(); for(int k = 0; k < auxSize; k++) { int tet_id = ClassExcitationPerFaceList[listIndex].back(); tet = &(tetARRAY[tet_id]); if (PlaneWaveBCFlag) { if (tet->scattering_region) ClassTetraIndexAux[Scattering_Excited_Case].push_back(tet_id); else ClassTetraIndexAux[Total_Excited_Case].push_front(tet_id); } else { int port_id = -1; for (int k=0; kgetbc(k); if (tet->fc[k]->bcPtr->getbType() == portType) { int pnum = bcNumToPnum[bc_number]; ClassTetraIndexAux[Port_Case+pnum].push_front(tet_id); break; } } } 
ClassExcitationPerFaceList[listIndex].pop_back(); } } // ----------------------------------------------------------------- // // Store the tetrahedra in the ClassTetraIndex array // // ----------------------------------------------------------------- // index = 0; auto addGroupToIndex = [&](int group) { int size = ClassTetraIndexAux[group].size(); for (int l = 0; l < size; l++) { ClassTetraIndex[i][index++] = ClassTetraIndexAux[group].front(); ClassTetraIndexAux[group].pop_front(); } }; // ----------------------------------------------------------------------------------------------- // Order: Scattered Field Excited, Total Field Excited, NC, Conformal, Regular, PML, Regular PML // ----------------------------------------------------------------------------------------------- if (PlaneWaveBCFlag) { addGroupToIndex(Scattering_Excited_Case); ClassExcitation_sc_CNT[i] = index; addGroupToIndex(Total_Excited_Case); } else { for(int p = 0; p < portCNT; p++) { ClassPortOffset_h[i * portCNT + p] = index; addGroupToIndex(Port_Case + p); ClassPortCnt_h[i * portCNT + p] = index - ClassPortOffset_h[i * portCNT + p]; ClassPortNum_h[i * portCNT + p] = p; } } addGroupToIndex(NC_Case); addGroupToIndex(Conformal_Case); // Add Regular Tetrahedra // WE assume that there are only 6 regular tetrehedron that are non-PML if ( regularCNT > 1) { for (int k = 1; k <= regularCNT_Normal; k++) { addGroupToIndex(k); } } cout << "Class " << i << " | PML index = " << index << endl; addGroupToIndex(PML_Case); // Add PML Regular Tetrahedra if ( regularCNT > 6) { for (int k = regularCNT_Normal; k < regularCNT; k++) { addGroupToIndex(k); } } ClassTetraOffset[i] = ClassOffSet; ClassOffSet += ClassTetraCnt[i] + ClassPMLTetraCnt[i]; ClassPMLTetraOffset[i] = ClassOffSet - ClassPMLTetraCnt[i]; } for(int i = 0; i < N_class; i++) { std::cout << " ClassExcitationCount[" << i << "] = " << ClassExcitationCount[i] << std::endl; std::cout << " ClassTetraOffset[" << i << "] = " << ClassTetraOffset[i] << 
std::endl; std::cout << " ClassPMLTetraOffset[" << i << "] = " << ClassPMLTetraOffset[i] << std::endl; } std::cout << "excitationFaces = " << excitationFaces << std::endl; std::cout << "========================================================" << std::endl; } /** OpenMP Local Time-Stepping for matrix free Recursive Explained in "Dissipative terms and local time-stepping improvements in a spatial high order Discontinuous Galerkin scheme for the time-domain Maxwell’s equations" by E. Montseny */ void FemGrp::ComputeE_MatrixFree(int class_i, fp_t dt_i){ if(class_i == 0){ LeapFrogE(class_i, LocTimeSteps[class_i]); } else{ LeapFrogE(class_i, LocTimeSteps[class_i]); ComputeE_MatrixFree(class_i - 1, LocTimeSteps[class_i-1]); ComputeH_MatrixFree(class_i - 1, LocTimeSteps[class_i-1]); ComputeE_MatrixFree(class_i - 1, LocTimeSteps[class_i-1]); } } void FemGrp::ComputeH_MatrixFree(int class_i, fp_t dt_i){ if(class_i == 0){ LeapFrogH(class_i, LocTimeSteps[class_i]); } else{ LeapFrogH(class_i, LocTimeSteps[class_i]); ComputeH_MatrixFree(class_i - 1, LocTimeSteps[class_i - 1]); ComputeE_MatrixFree(class_i - 1, LocTimeSteps[class_i - 1]); ComputeH_MatrixFree(class_i - 1, LocTimeSteps[class_i - 1]); } } void FemGrp::LeapFrogE(int class_i, fp_t dt_i){ int i; int n; fp_t InitTime = 0.0; n = LocalExciIndexE[class_i]; #pragma omp parallel for schedule(dynamic) private(i) for(i = 0; i < ClassTetraCnt[class_i]; i++){ tetra* tet = &(tetARRAY[ClassTetraIndex[class_i][i]]); tet->LocalFaceToTetraMapE_NMF1(*en_1, *en, *hn_12, dt_i, InitTime + (n + 0.5) * dt_i); } #pragma omp parallel for schedule(dynamic) private(i) for(i = 0 ; i < DimE; i++){ en->setentry(i, en_1->getentry(i)); } LocalExciIndexE[class_i] = LocalExciIndexE[class_i] + 1; } void FemGrp::LeapFrogH(int class_i, fp_t dt_i){ int i; int n; fp_t InitTime = 0.0; n = LocalExciIndexH[class_i]; #pragma omp parallel for schedule(dynamic) private(i) for(i = 0; i < ClassTetraCnt[class_i]; i ++){ tetra* tet = 
&(tetARRAY[ClassTetraIndex[class_i][i]]); tet->LocalFaceToTetraMapH_NMF1(*hn_32, *en_1, *hn_12, dt_i, InitTime + (n + 1.0) * dt_i); } #pragma omp parallel for schedule(dynamic) private(i) for(i = 0 ; i < DimH ; i++){ hn_12->setentry(i, hn_32->getentry(i)); } LocalExciIndexH[class_i] = LocalExciIndexH[class_i] + 1; } /** Local Time-Stepping Update */ void FemGrp::LTS_TimeUpdateGlobal_MatrixFree(){ int i, n; fp_t InitTime = 0.0; LocalExciIndexE = new int[N_class]; LocalExciIndexH = new int[N_class]; for(i = 0; i < N_class; i++){ LocalExciIndexE[i] = 0; LocalExciIndexH[i] = 0; } NtimeSteps = (int)ceil((FinalTime - InitTime) / LocTimeSteps[N_class -1]); cout.setf(ios::scientific,ios_base::floatfield); cout.precision(15); cout << "Start Time Stepping " << endl; cout << "FinalTime = " << FinalTime << endl; cout << "TimeStep_dt = " << LocTimeSteps[N_class -1] << endl; cout << "tetraCNT = " << tetraCNT << endl; cout << "NtimeSteps = " << NtimeSteps << endl; timer_start("Time Stepping", ' '); fp_t Frequency = freq; fp_t dt_nyquist = 1.0 / (2.0 * Frequency * MEGA); // fp_t dt_nyquist = 2.0 / (Frequency * MEGA); //That's wrong fp_t dt_sample = (1 / SamplingRate) * dt_nyquist; int postProcIters = (int)ceil(dt_sample / LocTimeSteps[N_class - 1]); int printScreenIters = 2 * postProcIters; Write_TD_Data(postProcIters, NtimeSteps); cout << "dt_nyquist = " << dt_nyquist << endl; cout << "dt_sample = " << dt_sample << endl; cout << "printScreenIters = " << printScreenIters << endl; cout << "postProcIters = " << postProcIters << endl; cout << "N_class = " << N_class < 0) { CalculateL2Error(n, LocTimeSteps[N_class - 1], ExcitFlag); CalculateL2ErrorProbes(n, LocTimeSteps[N_class - 1], ExcitFlag); } writeAnalyticalIncidentPWProbes(n); } if(write_probes && probeCNT > 0) { writeFieldProbe(n); } if(write_fields) { writeFieldGlobal(n); } if(portCNT != 0) { EvaluateSparametersGlobal(n, LocTimeSteps[N_class -1], true); } cout << "E field norm " << en_1->magnitude() << endl; //cout << "H field 
norm " << hn_32->magnitude() << endl; total_time += timer_stop(' '); timer_start(to_string(postProcIters)+" steps ", ' '); DEBUG_INFO("Percentage Completed :" + to_string((double)n / (double)NtimeSteps * 100.0) + "%"); current_time += (double)dt_sample * 1e9; DEBUG_INFO("Current Time : " + to_string(current_time) + "ns"); DEBUG_INFO("Average iteration time : "+ to_string(((double)total_time / (double)(n + 1.0))) + " sec"); } } DEBUG_INFO("Total iteration time: "+ to_string(((double)total_time)) + " sec"); timer_stop(' '); } //***************** void FemGrp::Write_TD_Data(int tsPerSample, int nTimeSteps){ // fp_t to = 4.0 * pow(10.0, -9.0); // fp_t tau = 0.8 * pow(10.0, -9.0); char TD_data[180]; sprintf(TD_data, "./PROBES/%s.TD_Data", fname); ofstream TD_datafile(TD_data, ios_base::out); if(!TD_datafile){ cout << "Error in opening file: " << TD_data << "for write"<< endl; } TD_datafile << LocTimeSteps[N_class -1] << endl; TD_datafile << nTimeSteps << endl; TD_datafile << To << endl; TD_datafile << Tau << endl; TD_datafile << tsPerSample << endl; TD_datafile << probeCNT << endl; } // Modifed by qi jian to use octree to store the probes barycentric coordinates void FemGrp::readPROBE() { // Read only the nodes belonging to this subdomain and neighbors char nname[StrLenShort]; // Read the probe file sprintf(nname, "%s.probe", fname); rapidcsv::Document probe_doc(nname); std::vector x_col = probe_doc.GetColumn("X"); std::vector y_col = probe_doc.GetColumn("Y"); std::vector z_col = probe_doc.GetColumn("Z"); // Check that all the columns have the same size assert(x_col.size() == y_col.size()); assert(y_col.size() == z_col.size()); assert(z_col.size() == x_col.size()); probeCNT = x_col.size(); if(padeCNT > probeCNT) { padeCNT = probeCNT; cout << "Pade Number Of Elements REDUCED to " << probeCNT << endl; } probes_bary.resize(probeCNT); std::cout << "Compute the Barycentric coordinates of the Probes" << std::endl; const double tol = 1e-8; //#pragma omp parallel for 
schedule(dynamic) for (int node_id = 0; node_id < probeCNT; ++node_id) { double probe_xyz[3] = {x_col[node_id] * unit, y_col[node_id] * unit, z_col[node_id] * unit}; std::vector>> found_tets; bool success = octree_object.findTetraInOctree(probe_xyz, found_tets, tol); if (success) { probes_bary[node_id].first = static_cast(found_tets.size()); probes_bary[node_id].second = found_tets; } else { probes_bary[node_id].first = -1; } } // Report and verify bool error_flag = false; for (int i = 0; i < probeCNT; ++i) { if (probes_bary[i].first < 0) { std::cerr << "Node " << i << " not found in simulation domain" << std::endl; double probe_xyz[3] = {x_col[i] * unit, y_col[i] * unit, z_col[i] * unit}; std::cerr << probe_xyz[0] << " " << probe_xyz[1] << " " << probe_xyz[2] << std::endl; error_flag = true; } } if (error_flag) { std::cerr << "Error: Some nodes were not found in the simulation domain. Exiting." << std::endl; std::exit(EXIT_FAILURE); } } // TODO!!! /* // - excitationFaces (flattened exc. faces count) // - PortFacePidx_h (int[excitationFaces], -1 for non-port faces) // - PortFaceCentroid_h (fp_t_ts[excitationFaces*3], centroid coords per face) */ // Uses TetID_excitation_h (owner tet id) to compute barycentrics of each // port-face centroid inside its owning tetra. No octree/hydra traversal. 
// // Inputs assumed ready: // - excitationFaces // - PortFacePidx_h : int[excitationFaces], -1 if NOT a port face // - PortFaceCentroid_h : fp_t_ts[3*excitationFaces] (cx,cy,cz per face) // - TetID_excitation_h : int[excitationFaces] (owner tetra index 0..tetraCNT-1) // - FaceID_excitation_h : int[excitationFaces] (optional, not strictly needed here) // // Output: // - portFaceCentroid_bary[f].first = 1 on success, -1 if non-port or error // - portFaceCentroid_bary[f].second = { { tetId, {l0,l1,l2,l3} } } (exactly one entry) void FemGrp::prepPortFaceCentroidPROBE() { if (portCNT <= 0 || !PortFacePidx_h || !PortFaceCentroid_h || !TetID_excitation_h) { std::cerr << "[prepPortFaceCentroidPROBE] Missing inputs or no ports.\n"; return; } auto det3 = [](const double x[3], const double y[3], const double z[3]) { return x[0]*(y[1]*z[2]-y[2]*z[1]) - x[1]*(y[0]*z[2]-y[2]*z[0]) + x[2]*(y[0]*z[1]-y[1]*z[0]); }; std::cout << "Compute barycentric coords of port-face centroids (using TetID_excitation_h)\n"; portFaceCentroid_bary.clear(); portFaceCentroid_bary.resize(excitationFaces); int done = 0, errors = 0; for (int f = 0; f < excitationFaces; ++f) { // Skip non-port faces if (PortFacePidx_h[f] < 0) { portFaceCentroid_bary[f].first = -1; continue; } // Owner tetra index from your pre-filled array const int tId = TetID_excitation_h[f]; if (tId < 0 || tId >= tetraCNT) { std::cerr << "[PortCentroid] Invalid owner tId=" << tId << " for excitation face f=" << f << "\n"; portFaceCentroid_bary[f].first = -1; ++errors; continue; } const tetra& T = tetARRAY[tId]; // Tetra vertices double v[4][3]; for (int i = 0; i < 4; ++i) { v[i][0] = T.nd[i]->getCoord().getx(); v[i][1] = T.nd[i]->getCoord().gety(); v[i][2] = T.nd[i]->getCoord().getz(); } // Face centroid (cx,cy,cz) const fp_t_ts* C = &PortFaceCentroid_h[3 * f]; const double P[3] = { (double)C[0], (double)C[1], (double)C[2] }; // Barycentric via Cramer's rule double a[3] = { v[0][0]-v[3][0], v[0][1]-v[3][1], v[0][2]-v[3][2] }; double 
b[3] = { v[1][0]-v[3][0], v[1][1]-v[3][1], v[1][2]-v[3][2] }; double c[3] = { v[2][0]-v[3][0], v[2][1]-v[3][1], v[2][2]-v[3][2] }; double r[3] = { P[0]-v[3][0], P[1]-v[3][1], P[2]-v[3][2] }; const double D = det3(a,b,c); if (std::abs(D) == 0.0) { std::cerr << "[PortCentroid] Degenerate tetra (D=0) at tId=" << tId << " for f=" << f << "\n"; portFaceCentroid_bary[f].first = -1; ++errors; continue; } double l0 = det3(r,b,c) / D; double l1 = det3(a,r,c) / D; double l2 = det3(a,b,r) / D; double l3 = 1.0 - (l0 + l1 + l2); // Gentle renormalization (handles tiny FP drift) double sumL = l0 + l1 + l2 + l3; if (std::abs(sumL - 1.0) > 1e-10) { l3 = 1.0 - (l0 + l1 + l2); } // Store exactly one (tet, lambdas) std::vector>> vec; vec.emplace_back(tId, std::array{l0,l1,l2,l3}); portFaceCentroid_bary[f].first = 1; portFaceCentroid_bary[f].second = std::move(vec); ++done; //cout << l0 << " " << l1 << " " << l2 << " " << l3 << "\n"; } std::cout << "[prepPortFaceCentroidPROBE] Completed: " << done << " faces; errors=" << errors << ".\n"; if (errors > 0) { std::cerr << "Error: Some port-face centroids could not be assigned.\n"; std::exit(EXIT_FAILURE); } } /* void FemGrp::prepPortFaceCentroidPROBE() { // Requires: // - excitationFaces (flattened exc. 
faces count) // - PortFacePidx_h (int[excitationFaces], -1 for non-port faces) // - PortFaceCentroid_h (fp_t_ts[excitationFaces*3], centroid coords per face) // - octree_object.findTetraInOctree(double[3], out, tol) if (portCNT <= 0 || !PortFacePidx_h || !PortFaceCentroid_h) { std::cerr << "[readPortFaceCentroidPROBE] No ports or centroid buffers not ready.\n"; return; } const double tol = 1e-3; std::cout << "Compute the Barycentric coordinates of Probes on Ports" << std::endl; portFaceCentroid_bary.clear(); portFaceCentroid_bary.resize(excitationFaces); int not_found = 0; int done = 0; long long total_found_tets = 0; // sum of found_tets.size() over successes int success_faces = 0; // number of faces with success==true // #pragma omp parallel for schedule(dynamic) reduction(+:not_found,done) // (optional) for (int f = 0; f < excitationFaces; ++f) { // Only process port faces if (PortFacePidx_h[f] < 0) { portFaceCentroid_bary[f].first = -1; // mark as N/A (non-port) continue; } // Centroid coordinates of face f // NOTE: These come from node coords directly; do NOT rescale unless your mesh needs it. 
const fp_t_ts* C = &PortFaceCentroid_h[3 * f]; double xyz[3] = { (double)C[0], (double)C[1], (double)C[2] }; std::vector>> found_tets; bool success = octree_object.findTetraInOctree(xyz, found_tets, tol); if (success) { portFaceCentroid_bary[f].first = static_cast(found_tets.size()); portFaceCentroid_bary[f].second = std::move(found_tets); ++done; // [NEW] accumulate for average total_found_tets += portFaceCentroid_bary[f].first; ++success_faces; } else { portFaceCentroid_bary[f].first = -1; ++not_found; // Debug print (can be silenced) std::cerr << "[PortCentroid] face f=" << f << " (port " << PortFacePidx_h[f] << ") NOT found at " << xyz[0] << " " << xyz[1] << " " << xyz[2] << "\n"; } } std::cout << "[readPortFaceCentroidPROBE] Located " << done << " port-face centroids; " << not_found << " not found.\n"; if (not_found == 0 && success_faces > 0) { const double avg = static_cast(total_found_tets) / static_cast(success_faces); std::cout << "[PortCentroid] average owning tets per centroid = " << avg << " (over " << success_faces << " faces)\n"; } // Hard error if any were not found (match readPROBE behavior if you prefer) if (not_found > 0) { std::cerr << "Error: Some port-face centroids were not found in the domain. 
Exiting.\n"; std::exit(EXIT_FAILURE); } } */ void FemGrp::readREGULAR(){ // writeFieldGlobal(1); char tname[StrLenShort]; sprintf(tname, "%s.regular", fname); ifstream regularAreaFile(tname, ios::in); if(!regularAreaFile){ cout << "File " << tname << " does NOT exist " << endl; exit(1); } int numOfRegions; int region; regularAreaFile >> numOfRegions; regularTetraCNT = 0; // Only one domain exists regularCNT = numOfRegions; if(regularCNT >= 1){ regularReferenceARRAY = new int[tetraCNT]; regionARRAY = new int[regularCNT]; for(int i = 0; i < regularCNT; i++) regionARRAY[i] = -1; for(int i = 0; i < tetraCNT; i ++){ tetra* tet = &(tetARRAY[i]); regularAreaFile >> region; tet->setRegularGroup(region); if(region == 0){ regularReferenceARRAY[i] = i; } else { regularTetraCNT++; if(regionARRAY[region] == -1) { regionARRAY[region] = i; regularReferenceARRAY[i] = i; } else { regularReferenceARRAY[i] = regionARRAY[region]; } } // cout << "i = " << i << " reference = " << regularReferenceARRAY[i] << " region = " << region << endl; } } } void FemGrp::initializeMaxMinPoints(){ maxPoint.setvtr(std::numeric_limits::min(), std::numeric_limits::min(), std::numeric_limits::min()); minPoint.setvtr(std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::max()); } void FemGrp::setMaxMinPoints(fp_t x, fp_t y, fp_t z){ maxPoint.setvtr(x > maxPoint.getx() ? x : maxPoint.getx(), y > maxPoint.gety() ? y : maxPoint.gety(), z > maxPoint.getz() ? z : maxPoint.getz()); minPoint.setvtr(x < minPoint.getx() ? x : minPoint.getx(), y < minPoint.gety() ? y : minPoint.gety(), z < minPoint.getz() ? 
z : minPoint.getz()); } // 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 // // 0000000000000000000000000000000000000 Post-processing 0000000000000000000000000000000000000000 // // 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 // // Modified by qi jian to write field at probes (CPU VERSION) void FemGrp::writeFieldProbe(int timeStep) { int i, j; fp_t vol; fp_t zeta[4]; vtr lvtr[3]; vtr avtr[4]; int tetraMAP_aux[TetPolyOrderDim[getPolyFlag()]]; #if defined(DGTD_USE_CUDA) fp_t_ts E_coeff[TetPolyOrderDim[getPolyFlag()]]; fp_t_ts H_coeff[TetPolyOrderDim[getPolyFlag()]]; #else fp_t E_coeff[TetPolyOrderDim[getPolyFlag()]]; fp_t H_coeff[TetPolyOrderDim[getPolyFlag()]]; #endif vtr eField; vtr hField; vtr eField_all; vtr hField_all; char csvFileName[StrOutput]; std::ofstream csvFile; if(padeCNT == 0 || writeWhilePade) { sprintf(csvFileName, "Probes_%s_%04d.csv", fname, timeStep); csvFile.open(csvFileName); csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n"; } const int num_nodes = probeCNT; // Calculate Total Fields at the points for(i = 0; i < num_nodes; i++) { int number_of_associated_tets = probes_bary.at(i).first; eField.reset(); hField.reset(); std::vector>> found_tets = probes_bary.at(i).second; eField_all.reset(); hField_all.reset(); for (int t = 0; t < number_of_associated_tets; t++) { int tet_id = found_tets.at(t).first; array tri_bary_coord = found_tets.at(t).second; tetra& tet = tetARRAY[tet_id]; tet.geometry(lvtr, avtr, &vol); avtr[3].reset(); avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]); eField.reset(); hField.reset(); zeta[0] = static_cast(tri_bary_coord[0]); zeta[1] = static_cast(tri_bary_coord[1]); zeta[2] = static_cast(tri_bary_coord[2]); zeta[3] = static_cast(tri_bary_coord[3]); // Calculate E field tet.Local_DG_mapE(tetraMAP_aux, tet.LocalOffsetE); for(j = 0 ; j < TetPolyOrderDim[getPolyFlag()] ; j++) { 
if(tetraMAP_aux[j] < 0) E_coeff[j] = 0.0; else #if defined(DGTD_USE_CUDA) E_coeff[j] = En1_h[tetraMAP_aux[j]]; #else E_coeff[j] = en_1->getentry(tetraMAP_aux[j]); #endif } // Calculate H field tet.Local_DG_mapH(tetraMAP_aux, tet.LocalOffsetH); for(j = 0 ; j < TetPolyOrderDim[getPolyFlag()] ; j++){ if(tetraMAP_aux[j] < 0) H_coeff[j] = 0.0; else #if defined(DGTD_USE_CUDA) H_coeff[j] = Hn32_h[tetraMAP_aux[j]]; #else H_coeff[j] = hn_32->getentry(tetraMAP_aux[j]); #endif } eField = CalcEfield(E_coeff, avtr, vol, zeta, PolyFlag); hField = CalcEfield(H_coeff, avtr, vol, zeta, PolyFlag); eField_all = eField_all + eField; hField_all = hField_all + hField; } eField_all = eField_all / ((fp_t) number_of_associated_tets); hField_all = hField_all / ((fp_t) number_of_associated_tets); if(usePade){ // && i < padeCNT int row = ((int)(timeStep / tsPerSampling)) * NumOfFieldComponents * probeCNT; int column = i * NumOfFieldComponents; fieldProbes[row + column + 0] = eField_all.getx(); fieldProbes[row + column + 1] = eField_all.gety(); fieldProbes[row + column + 2] = eField_all.getz(); fieldProbes[row + column + 3] = hField_all.getx(); fieldProbes[row + column + 4] = hField_all.gety(); fieldProbes[row + column + 5] = hField_all.getz(); } if(padeCNT == 0 || writeWhilePade) { const auto max_precision {std::numeric_limits::digits10 + 1}; csvFile << std::setprecision(max_precision) << eField_all.getx() << "," << eField_all.gety() << "," << eField_all.getz() << "," << hField_all.getx() << "," << hField_all.gety() << "," << hField_all.getz() << "\n"; } } if(padeCNT == 0 || writeWhilePade) { usleep(100); csvFile.close(); } } void FemGrp::writeFieldProbeAfterPade(int tsSize) { const auto max_precision {std::numeric_limits::digits10 + 1}; #pragma omp parallel for for(int i = 0; i < (int)ceil((1.0 * NtimeSteps) / tsPerSampling); i++){ char csvFileName[StrOutput]; std::ofstream csvFile; sprintf(csvFileName, "./PROBES/Probes_%s_%04d.csv", fname, i * tsSize); csvFile.open(csvFileName); csvFile << 
"Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n"; for(int probe = 0; probe < probeCNT; probe++) { int column = probe * NumOfFieldComponents; int row = i * NumOfFieldComponents * probeCNT; for(int j = 0; j < NumOfFieldComponents; j++) { csvFile << std::setprecision(max_precision) << fieldProbes[row + column + j]; if(j == NumOfFieldComponents - 1) csvFile << "\n"; else csvFile << ","; } } usleep(100); csvFile.close(); } } void FemGrp::writeFieldGlobal(int timeStep){ int i, j; fp_t vol; fp_t zeta[4]; vtr lvtr[3]; vtr avtr[4]; vtr coord[4]; vtr eLocal[4]; vtr hLocal[4]; int* tetraMAP_aux; int* MapE_Pe; #if defined(DGTD_USE_CUDA) fp_t_ts* E_coeff; fp_t_ts* H_coeff; #else fp_t* E_coeff; fp_t* H_coeff; #endif vtr* eField = new vtr[nodeCNT]; vtr* hField = new vtr[nodeCNT]; int* count = new int[nodeCNT]; memset(count, 0, nodeCNT * sizeof(int)); // only initialize the memory for the first solution if(regE.TetraReg == 0) regE.initial(tetraCNT); if(regH.TetraReg == 0) regH.initial(tetraCNT); int* polyOrder = new int[tetraCNT]; for(i = 0; i < tetraCNT; i++){ tetra& tet = tetARRAY[i]; polyOrder[i] = tet.PolyOrderFlag; for(j = 0; j < NumOfNodes; j++){ coord[j] = (tet.getNode(j))->getCoord(); } tet.geometry(lvtr, avtr, &vol); avtr[3].reset(); avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]); tetraMAP_aux = new int[TetPolyOrderDim[tet.PolyOrderFlag]]; MapE_Pe = new int[2 * TetPolyOrderDim[tet.PolyOrderFlag]]; #if defined(DGTD_USE_CUDA) E_coeff = new fp_t_ts[TetPolyOrderDim[tet.PolyOrderFlag]]; H_coeff = new fp_t_ts[TetPolyOrderDim[tet.PolyOrderFlag]]; #else E_coeff = new fp_t[TetPolyOrderDim[tet.PolyOrderFlag]]; H_coeff = new fp_t[TetPolyOrderDim[tet.PolyOrderFlag]]; #endif // E field tet.Local_DG_mapE(tetraMAP_aux, tet.LocalOffsetE); for(j = 0 ; j < TetPolyOrderDim[tet.PolyOrderFlag] ; j++){ if(tetraMAP_aux[j] < 0) E_coeff[j] = 0.0; else #if defined(DGTD_USE_CUDA) E_coeff[j] = En1_h[tetraMAP_aux[j]]; #else E_coeff[j] = 
en_1->getentry(tetraMAP_aux[j]); #endif } // H field tet.Local_DG_mapH(tetraMAP_aux, tet.LocalOffsetH); for(j = 0 ; j < TetPolyOrderDim[tet.PolyOrderFlag] ; j++){ if(tetraMAP_aux[j] < 0) H_coeff[j] = 0.0; else #if defined(DGTD_USE_CUDA) H_coeff[j] = Hn32_h[tetraMAP_aux[j]]; #else H_coeff[j] = hn_32->getentry(tetraMAP_aux[j]); #endif } for(j = 0; j < 4; j++){ zeta[0] = BaryCoord[j][0]; zeta[1] = BaryCoord[j][1]; zeta[2] = BaryCoord[j][2]; zeta[3] = BaryCoord[j][3]; eLocal[j] = CalcEfield(E_coeff, avtr, vol, zeta, tet.PolyOrderFlag); hLocal[j] = CalcEfield(H_coeff, avtr, vol, zeta, tet.PolyOrderFlag); int index = tet.nd[j]->getid(); eField[index] = eField[index] + eLocal[j] /*- Einc*/; hField[index] = hField[index] + hLocal[j] /*- Hinc*/; count[index] += 1; } regE.setRegister(i, eLocal); regH.setRegister(i, hLocal); delete [] tetraMAP_aux; delete [] MapE_Pe; delete [] E_coeff; delete [] H_coeff; } for(i = 0; i < nodeCNT; i++){ eField[i] = eField[i] / static_cast(count[i]); hField[i] = hField[i] / static_cast(count[i]); } VtkWriter vtkWriter(1.0); // VtkWriter vtkWriter(unit); char vtkFilePrefix[128]; memset(vtkFilePrefix, 0, 128 * sizeof(char)); sprintf(vtkFilePrefix, "%s_%04d", fname, timeStep); vtkWriter.writeField(vtkFilePrefix, nodeCNT, ndARRAY, tetraCNT, tetARRAY, eField, hField, polyOrder, 0, 0); //TODO: why here polyorder is not 1 delete [] eField; delete [] hField; delete [] polyOrder; delete [] count; } // Modified by qi jian to compute the analytical incident field at the probes void FemGrp::writeAnalyticalIncidentPWProbes(int timeStep){ int i; vtr Einc; vtr Hinc; vtr r; vtr Einc_field; vtr Hinc_field; fp_t zeta[4]; char csvFileName[StrOutput]; sprintf(csvFileName, "AnalyticalIncidentField_%s_%04d.csv", fname, timeStep); std::ofstream csvFile(csvFileName); csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n"; for(i = 0; i < probeCNT; i++) { // Get the Incident Field at the probe int number_of_associated_tets = 
probes_bary.at(i).first; Einc.reset(); Hinc.reset(); std::vector>> found_tets = probes_bary.at(i).second; Einc_field.reset(); // Store for all valid candidate tets Hinc_field.reset(); // Store for all valid candidate tets for (int t = 0; t < number_of_associated_tets; t++) { int tet_id = found_tets.at(t).first; array tri_bary_coord = found_tets.at(t).second; tetra& tet = tetARRAY[tet_id]; zeta[0] = static_cast(tri_bary_coord[0]); zeta[1] = static_cast(tri_bary_coord[1]); zeta[2] = static_cast(tri_bary_coord[2]); zeta[3] = static_cast(tri_bary_coord[3]); SimplexToCartesian(tet, r, zeta); getAnalyticalPWField(tet, r, Einc, Hinc, timeStep, LocTimeSteps[N_class -1]); Einc_field = Einc_field + Einc; Hinc_field = Hinc_field + Hinc; } Einc_field = Einc_field / ((fp_t) number_of_associated_tets); Hinc_field = Hinc_field / ((fp_t) number_of_associated_tets); const auto max_precision {std::numeric_limits::digits10 + 1}; csvFile << std::setprecision(max_precision) << Einc_field.getx() << "," << Einc_field.gety() << "," << Einc_field.getz() << "," << Hinc_field.getx() << "," << Hinc_field.gety() << "," << Hinc_field.getz() << "\n"; } usleep(100); csvFile.close(); } void FemGrp::getAnalyticalPWField(tetra& tet, vtr& r, vtr& Einc, vtr& Hinc, int timeStep, fp_t dt){ fp_t eta = No * sqrt(tet.mat->mur.getEntry(0,0) / tet.mat->epsr.getEntry(0,0)); fp_t V_light = Vo / sqrt(tet.mat->epsr.getEntry(0,0) * tet.mat->mur.getEntry(0,0)); fp_t omega = 2.0 * Pi * freq * MEGA; fp_t Exponent; fp_t SinModul; fp_t Neuman; fp_t IncidExcit_E; fp_t IncidExcit_H; fp_t t; for(int i = 0; i < bcCNT; i++){ bc bc_i = bcARRAY[i]; if(bc_i.getbType() == planeWaveType || bc_i.getbType() == pmlType){ fp_t Emagnitude = bc_i.getMagE(); fp_t theta_in_rad = bc_i.getTheta() * Pi / 180.0; fp_t phi_in_rad = bc_i.getPhi() * Pi / 180.0; vtr Epol = bc_i.getField(); vtr kvtr(sin(theta_in_rad) * cos(phi_in_rad), sin(theta_in_rad) * sin(phi_in_rad), cos(theta_in_rad)); vtr Hpol = kvtr * Epol; vtr ro = bc_i.getPW_ro(); fp_t 
Hmagnitude = Emagnitude / eta; Hpol.unitvtr(); Epol.unitvtr(); switch(ExcitFlag){ case 0: //(not tested) if(Exponent >= 0.0){ // Plane wave E t = dt * (timeStep + 1.0); Exponent = t - To - dotP(kvtr, r - ro) / Vo; SinModul = cos(omega * Exponent); IncidExcit_E = Emagnitude * SinModul; t = dt * (timeStep + 1.5); Exponent = t - To - dotP(kvtr, r - ro) / Vo; SinModul = cos(omega * Exponent); IncidExcit_H = Hmagnitude * SinModul; }else{ IncidExcit_E = 0.0; IncidExcit_H = 0.0; } break; case 1: // Gauss Pulse t = dt * (timeStep + 1.0); Exponent = t - To - dotP(kvtr, r - ro) / Vo; SinModul = ModuleFlag ? cos(omega * Exponent) : 1.0; IncidExcit_E = Emagnitude * SinModul * exp(-(Exponent * Exponent) / (Tau * Tau)); t = dt * (timeStep + 1.5); Exponent = t - To - dotP(kvtr, r - ro) / Vo; SinModul = ModuleFlag ? cos(omega * Exponent) : 1.0; IncidExcit_H = Hmagnitude * SinModul * exp(-(Exponent * Exponent) / (Tau * Tau)); break; case 2: //(not tested) // Neuman Pulse E t = dt * (timeStep + 1.0); Exponent = t - To - dotP(kvtr, r - ro) / Vo; Neuman = (2.0 * Exponent) / (Tau * Tau); IncidExcit_E = (Emagnitude * Neuman) * exp(-(Exponent * Exponent) / (Tau * Tau)); t = dt * (timeStep + 1.5); Exponent = t - To - dotP(kvtr, r - ro) / Vo; Neuman = (2.0 * Exponent) / (Tau * Tau); IncidExcit_H = Hmagnitude * Neuman * exp(-(Exponent * Exponent) / (Tau * Tau)); break; case 3: { // DC-Free Hann-Modulated Cosine Pulse (with time delay) fp_t tdelay = To; // To represents the delay time t = dt * (timeStep + 1.0); Exponent = t - tdelay - dotP(kvtr, r - ro) / Vo; if (Exponent >= 0.0 && Exponent <= Tau) { // Shift exponent relative to pulse center fp_t t_rel = Exponent - Tau / 2.0; fp_t window = 0.5 * (1.0 - cos(2.0 * Pi * Exponent / Tau)); // Hann window SinModul = cos(omega * t_rel); IncidExcit_E = Emagnitude * SinModul * window; } else { IncidExcit_E = 0.0; } t = dt * (timeStep + 1.5); Exponent = t - tdelay - dotP(kvtr, r - ro) / Vo; if (Exponent >= 0.0 && Exponent <= Tau) { fp_t t_rel = 
Exponent - Tau / 2.0; fp_t window = 0.5 * (1.0 - cos(2.0 * Pi * Exponent / Tau)); // Hann window SinModul = cos(omega * t_rel); IncidExcit_H = Hmagnitude * SinModul * window; } else { IncidExcit_H = 0.0; } break; } case 4: // Linear Chirp Excitation with sine start and Hann window { fp_t f_end = freq * MEGA; fp_t B = Tau * MEGA; fp_t f0 = f_end - B; fp_t f1 = f_end; fp_t Tchirp = To; // Incident Electric Field (E) t = dt * (timeStep + 1.0); Exponent = t - dotP(kvtr, r - ro) / Vo; if (Exponent >= 0.0 && Exponent <= Tchirp) { fp_t chirpArg = 2.0 * Pi * f0 * Exponent + Pi * (f1 - f0) / Tchirp * Exponent * Exponent; fp_t window = 0.5 * (1.0 - cos(2.0 * Pi * Exponent / Tchirp)); // Hann window IncidExcit_E = Emagnitude * sin(chirpArg) * window; } else { IncidExcit_E = 0.0; } // Incident Magnetic Field (H) t = dt * (timeStep + 1.5); Exponent = t - To - dotP(kvtr, r - ro) / Vo; if (Exponent >= 0.0 && Exponent <= Tchirp) { fp_t chirpArg = 2.0 * Pi * f0 * Exponent + Pi * (f1 - f0) / Tchirp * Exponent * Exponent; fp_t window = 0.5 * (1.0 - cos(2.0 * Pi * Exponent / Tchirp)); // Hann window IncidExcit_H = Hmagnitude * sin(chirpArg) * window; } else { IncidExcit_H = 0.0; } break; } default: break; } Einc = Epol * IncidExcit_E; Hinc = Hpol * IncidExcit_H; // cout << "Einc at: (" << r.getx() << ", " << r.gety() << ", " << r.getz() << ") = (" << Einc.getx() << ", " << Einc.gety() << ", " << Einc.getz() << ")" << endl; } } } void FemGrp::writeEquivalentSurfaceCurrents_(int timeStep){ int i, j; int m; int index; int FaceNum; fp_t vol; fp_t zeta[4]; fp_t Area; vtr NormalVtr; vtr lvtr[3]; vtr avtr[4]; vtr coord[4]; vtr eLocal[4]; vtr hLocal[4]; vtr eLocalFace[3]; vtr hLocalFace[3]; tetra* tet; ArrayFP* origEn_1 = new ArrayFP(TetPolyOrderDim[PolyFlag]); ArrayFP* origHn_32 = new ArrayFP(TetPolyOrderDim[PolyFlag]); char Currents_vtkFile[StrOutput]; sprintf(Currents_vtkFile, "Currents_%s_%04d", fname, timeStep); // fill the port field with averaged values vtr* JField = new 
vtr[SurfMesh->nodeCNT]; vtr* MField = new vtr[SurfMesh->nodeCNT]; int* count = new int[SurfMesh->nodeCNT]; memset(count, 0, SurfMesh->nodeCNT * sizeof(int)); regMface = new Register[SurfMesh->faceCNT]; regJface = new Register[SurfMesh->faceCNT]; for(i = 0; i < SurfMesh->faceCNT; i++){ SurfMesh->fcArray[i]->getAreaNormal(&Area, &NormalVtr); tet = SurfMesh->fcArray[i]->hydra[0]; tet->geometry(lvtr, avtr, &vol); avtr[3].reset(); avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]); Get_Coefficients_(tet, origEn_1, origHn_32); for(m = 0; m < NumOfFaces; m++){ zeta[m] = 0.0; if(SurfMesh->fcArray[i] == tet->getFacePtr(m)) FaceNum = m; } for(j = 0; j < 4; j++){ zeta[0] = BaryCoord[j][0]; zeta[1] = BaryCoord[j][1]; zeta[2] = BaryCoord[j][2]; zeta[3] = BaryCoord[j][3]; eLocal[j] = CalcEfield(origEn_1->getEntryPtr(), avtr, vol, zeta, tet->PolyOrderFlag); hLocal[j] = CalcEfield(origHn_32->getEntryPtr(), avtr, vol, zeta, tet->PolyOrderFlag); } regMface[i].initial(3); regJface[i].initial(3); for(j = 0; j < 3; j++){ eLocalFace[j] = eLocal[faceMAP[FaceNum][j]]; hLocalFace[j] = hLocal[faceMAP[FaceNum][j]]; index = SurfMesh->globToLocMap_->find(SurfMesh->fcArray[i]->getNode(j)->getid())->second; MField[index] = MField[index] + NormalVtr * eLocalFace[j] * (-1.0); JField[index] = JField[index] + NormalVtr * hLocalFace[j] * (1.0); // No averaging regMface[i].setField(j, NormalVtr * eLocalFace[j] * (-1.0)); regJface[i].setField(j, NormalVtr * hLocalFace[j] * (1.0)); count[index] += 1; } } // This is for visualization in the vtk format for(i = 0; i < SurfMesh->nodeCNT; i++){ MField[i] = MField[i] / static_cast(count[i]); JField[i] = JField[i] / static_cast(count[i]); } node** locNodeArray = new node*[SurfMesh->nodeCNT]; for(i = 0; i < SurfMesh->nodeCNT; i++){ node& Node = *(SurfMesh->ndArray[i]); int index = SurfMesh->globToLocMap_->find(Node.getid())->second; locNodeArray[index] = new node(index, Node.getPType(), Node.getSingOrder(), Node.getCoord().getx(), Node.getCoord().gety(), 
Node.getCoord().getz()); } face** locFaceArray = new face*[SurfMesh->faceCNT]; for(i = 0; i < SurfMesh->faceCNT; i++){ face& Face = *(SurfMesh->fcArray[i]); locFaceArray[i] = new face(Face); locFaceArray[i]->setFace( locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(0)->getid())->second], locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(1)->getid())->second], locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(2)->getid())->second]); } // Vtk VtkWriter vtkWriter(1.); vtkWriter.writeTriUg(Currents_vtkFile, SurfMesh->nodeCNT, locNodeArray, SurfMesh->faceCNT, locFaceArray, MField, JField, 1); // Register char regFileName[StrOutput]; char regFileNameDebug[StrOutput]; memset(regFileName, 0, StrOutput * sizeof(char)); sprintf(regFileName, "Currents_%s_%05d", fname, timeStep); sprintf(regFileNameDebug, "Currents_%s_%05d_dbg", fname, timeStep); printRegister(regMface, regJface, SurfMesh->faceCNT, regFileName,1); // printRegisterDebug(regMface, regJface, SurfMesh->faceCNT, regFileNameDebug,2); if(timeStep == 0) printTriMesh(SurfMesh->nodeCNT, locNodeArray, SurfMesh->faceCNT, locFaceArray, fname); for(i = 0; i < SurfMesh->nodeCNT; i++) delete locNodeArray[i]; delete [] locNodeArray; for(i = 0; i < SurfMesh->faceCNT; i++) delete locFaceArray[i]; delete [] locFaceArray; delete [] MField; delete [] JField; delete [] count; delete origEn_1; delete origHn_32; } // Print face registers void FemGrp::printRegister(Register* regMface, Register* regJface, int FaceCnt, char *prjName, int order){ int i, j; char fnameJ3[180]; char fnameM3[180]; sprintf(fnameM3, "%s_BC.curM", prjName); sprintf(fnameJ3, "%s_BC.curJ", prjName); ofstream foutJ3, foutM3; foutM3.open(fnameM3, ios::out); foutJ3.open(fnameJ3, ios::out); for(i = 0; i < FaceCnt; i++){ if(order == 1){ for(j = 0; j < 3; j ++){ foutM3 << regMface[i].getField(j).getx() << endl; foutM3 << regMface[i].getField(j).gety() << endl; foutM3 << regMface[i].getField(j).getz() << endl; } foutM3 << endl; for(j = 0; j < 3; j ++){ 
foutJ3 << regJface[i].getField(j).getx() << endl; foutJ3 << regJface[i].getField(j).gety() << endl; foutJ3 << regJface[i].getField(j).getz() << endl; } foutJ3 << endl; }else if(order == 2){ for(j = 0; j < 3; j ++){ foutM3 << regMface[i].getField(j).getx() << endl; foutM3 << regMface[i].getField(j).gety() << endl; foutM3 << regMface[i].getField(j).getz() << endl; } for(j = 0 ; j < 3 ; j++){ int index0 = First2Second[j][0]; int index1 = First2Second[j][1]; foutM3 << 0.5 * (regMface[i].getField(index0).getx() + regMface[i].getField(index1).getx()) << endl; foutM3 << 0.5 * (regMface[i].getField(index0).gety() + regMface[i].getField(index1).gety()) << endl; foutM3 << 0.5 * (regMface[i].getField(index0).getz() + regMface[i].getField(index1).getz()) << endl; } foutM3 << endl; for(j = 0; j < 3; j ++){ foutJ3 << regJface[i].getField(j).getx() << endl; foutJ3 << regJface[i].getField(j).gety() << endl; foutJ3 << regJface[i].getField(j).getz() << endl; } for(j = 0 ; j < 3 ; j++){ int index0 = First2Second[j][0]; int index1 = First2Second[j][1]; foutJ3 << 0.5 * (regJface[i].getField(index0).getx() + regJface[i].getField(index1).getx()) << endl; foutJ3 << 0.5 * (regJface[i].getField(index0).gety() + regJface[i].getField(index1).gety()) << endl; foutJ3 << 0.5 * (regJface[i].getField(index0).getz() + regJface[i].getField(index1).getz()) << endl; } foutJ3 << endl; } } foutJ3.close(); foutM3.close(); } // Print out Outer Surface node & triangle info on *.tri void FemGrp::printTriMesh(int ndNum, node **ndArray, int fcNum, face **fcArray, char *prjName){ int i; face* fcPtr; FILE* fd; char triName[360]; sprintf(triName, "%s.tri", prjName); fd = fopen(triName, "wt"); fprintf(fd, "%f\n", unit); fprintf(fd, "%d\n", ndNum); for(i = 0; i < ndNum; i ++){ fprintf(fd, "%f %f %f\n", (ndArray[i]->getCoord().getx()) / unit, (ndArray[i]->getCoord().gety()) / unit, (ndArray[i]->getCoord().getz()) / unit); } fprintf(fd,"%d\n", fcNum); for(i = 0; i < fcNum; i ++){ fcPtr = fcArray[i]; node* n0Ptr; 
node* n1Ptr; node* n2Ptr; n0Ptr = fcPtr->getNode(0); n1Ptr = fcPtr->getNode(1); n2Ptr = fcPtr->getNode(2); fprintf(fd, "%d %d %d\n", n0Ptr->getid(), n1Ptr->getid(), n2Ptr->getid()); } fclose(fd); } // Modified by qi jian to compute the L2 error at the probes void FemGrp::CalculateL2ErrorProbes(int& timeStep, fp_t dt, int TimeDistFlag){ int i, j; fp_t vol; fp_t zeta[4]; vtr lvtr[3]; vtr avtr[4]; vtr eLocal; vtr hLocal; vtr eLocal_exa; vtr hLocal_exa; vtr eLocal_all; vtr hLocal_all; vtr eLocal_exa_all; vtr hLocal_exa_all; fp_t E_coeff[TetPolyOrderDim[getPolyFlag()]]; fp_t H_coeff[TetPolyOrderDim[getPolyFlag()]]; fp_t IntegrOmegaE = 0.0; fp_t IntegrOmegaH = 0.0; vtr r; vtr Exa_NumE; vtr Exa_NumH; char Error_E_TimeLog[180]; char Error_H_TimeLog[180]; int outOfModelProbes = 0; for(i = 0; i < probeCNT; i++) { int number_of_associated_tets = probes_bary.at(i).first; eLocal.reset(); hLocal.reset(); std::vector>> found_tets = probes_bary.at(i).second; eLocal_exa.reset(); hLocal_exa.reset(); eLocal_all.reset(); hLocal_all.reset(); eLocal_exa_all.reset(); hLocal_exa_all.reset(); for (int t = 0; t < number_of_associated_tets; t++) { int tet_id = found_tets.at(t).first; array probe_bary_coord = found_tets.at(t).second; tetra& tet = tetARRAY[tet_id]; int tetraMAP[TetPolyOrderDim[tet.PolyOrderFlag]]; tet.geometry(lvtr, avtr, &vol); avtr[3].reset(); avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]); // Compute the Efield tet.Local_DG_mapE(tetraMAP, tet.LocalOffsetE); for(j = 0 ; j < TetPolyOrderDim[tet.PolyOrderFlag] ; j++){ if(tetraMAP[j] < 0) E_coeff[j] = 0.0; else E_coeff[j] = en_1->getentry(tetraMAP[j]); } // Compute the Hfield tet.Local_DG_mapH(tetraMAP, tet.LocalOffsetH); for(j = 0 ; j < TetPolyOrderDim[tet.PolyOrderFlag] ; j++){ if(tetraMAP[j] < 0) H_coeff[j] = 0.0; else H_coeff[j] = hn_32->getentry(tetraMAP[j]); } eLocal.reset(); hLocal.reset(); eLocal_exa.reset(); hLocal_exa.reset(); zeta[0] = static_cast(probe_bary_coord[0]); zeta[1] = static_cast(probe_bary_coord[1]); 
zeta[2] = static_cast(probe_bary_coord[2]); zeta[3] = static_cast(probe_bary_coord[3]); SimplexToCartesian(tet, r, zeta); eLocal = CalcEfield(E_coeff, avtr, vol, zeta, tet.PolyOrderFlag); hLocal = CalcEfield(H_coeff, avtr, vol, zeta, tet.PolyOrderFlag); GetExactSolution(tet, r, eLocal_exa, hLocal_exa, timeStep, dt, TimeDistFlag); // Add all the local fields from all relevant tets eLocal_all = eLocal_all + eLocal; hLocal_all = hLocal_all + hLocal; eLocal_exa_all = eLocal_exa_all + eLocal_exa; hLocal_exa_all = hLocal_exa_all + hLocal_exa; } eLocal_all = eLocal_all / ((fp_t) number_of_associated_tets); hLocal_all = hLocal_all / ((fp_t) number_of_associated_tets); eLocal_exa_all = eLocal_exa_all / ((fp_t) number_of_associated_tets); hLocal_exa_all = hLocal_exa_all / ((fp_t) number_of_associated_tets); Exa_NumE = eLocal_exa_all - eLocal_all; Exa_NumH = hLocal_exa_all - hLocal_all; IntegrOmegaE += Exa_NumE.magnitude() * Exa_NumE.magnitude(); IntegrOmegaH += Exa_NumH.magnitude() * Exa_NumH.magnitude(); sprintf(Error_E_TimeLog, "%s_Probe_%d.TDerrorE", fname, i); sprintf(Error_H_TimeLog, "%s_Probe_%d.TDerrorH", fname, i); ofstream Error_E(Error_E_TimeLog, ios_base::out | ios::app); Error_E.setf(ios::scientific, ios::floatfield); Error_E.precision(15); if(!Error_E) cout << "Error in opening file: " << Error_E_TimeLog << " for write " << endl; Error_E << "[" << (timeStep + 1.0) * dt << ", " << Exa_NumE.magnitude() << "]; \n"; Error_E.close(); ofstream Error_H(Error_H_TimeLog, ios_base::out | ios::app); Error_H.setf(ios::scientific, ios::floatfield); Error_H.precision(15); if(!Error_H) cout << "Error in opening file: " << Error_H_TimeLog << " for write " << endl; Error_H << "[" << (timeStep + 1.5) * dt << ", " << Exa_NumH.magnitude() << "]; \n"; Error_H.close(); } // Write to file if(outOfModelProbes < probeCNT) { sprintf(Error_E_TimeLog, "%s_Probes_Global.TDerrorE", fname); sprintf(Error_H_TimeLog, "%s_Probes_Global.TDerrorH", fname); ofstream Error_E(Error_E_TimeLog, 
ios_base::out | ios::app); Error_E.setf(ios::scientific, ios::floatfield); Error_E.precision(15); if(!Error_E) cout << "Error in opening file: " << Error_E_TimeLog << " for write " << endl; Error_E << "[" << (timeStep + 1.0) * dt << ", " << sqrt(IntegrOmegaE / (probeCNT - outOfModelProbes)) << "]; \n"; Error_E.close(); ofstream Error_H(Error_H_TimeLog, ios_base::out | ios::app); Error_H.setf(ios::scientific, ios::floatfield); Error_H.precision(15); if(!Error_H) cout << "Error in opening file: " << Error_H_TimeLog << " for write " << endl; Error_H << "[" << (timeStep + 1.5) * dt << ", " << sqrt(IntegrOmegaH / (probeCNT - outOfModelProbes)) << "]; \n"; Error_H.close(); } } void FemGrp::CalculateL2Error(int& timeStep, fp_t dt, int TimeDistFlag){ int i, j; fp_t vol; fp_t zeta[4]; vtr lvtr[3]; vtr avtr[4]; vtr coord[4]; vtr eLocal[4]; vtr hLocal[4]; vtr eLocal_exa[4]; vtr hLocal_exa[4]; int QuadOrder = 2; //TODO: Recheck with the order of the basis int points = 4; fp_t** ZetaMat = new fp_t*[points]; fp_t* weights = new fp_t[points]; for(int i = 0; i < points; i++) ZetaMat[i] = new fp_t[4]; GetTetQuadRule(QuadOrder, points, ZetaMat, weights); fp_t IntegrOmegaE = 0.0; fp_t IntegrOmegaH = 0.0; fp_t NormalizeOmegaE = 0.0; fp_t NormalizeOmegaH = 0.0; for(i = 0; i < tetraCNT; i++){ tetra& tet = tetARRAY[i]; int tetraMAP_E[TetPolyOrderDim[tet.PolyOrderFlag]]; int tetraMAP_H[TetPolyOrderDim[tet.PolyOrderFlag]]; auto origEn_1 = new ArrayFP(TetPolyOrderDim[tet.PolyOrderFlag]); auto origHn_32 = new ArrayFP(TetPolyOrderDim[tet.PolyOrderFlag]); for(j = 0; j < 4; j++){ coord[j] = (tet.getNode(j))->getCoord(); } tet.geometry(lvtr, avtr, &vol); avtr[3].reset(); avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]); tet.Local_DG_mapE(tetraMAP_E, tet.LocalOffsetE); tet.Local_DG_mapH(tetraMAP_H, tet.LocalOffsetH); origEn_1->reset(); origHn_32->reset(); for(j = 0 ; j < TetPolyOrderDim[tet.PolyOrderFlag]; j++){ if(tetraMAP_E[j] < 0) origEn_1->setentry(j, 0.0); else origEn_1->setentry(j, 
en_1->getentry(tetraMAP_E[j])); if(tetraMAP_H[j] < 0) origHn_32->setentry(j, 0.0); else origHn_32->setentry(j, hn_32->getentry(tetraMAP_H[j])); } fp_t IntegrValueE = 0.0; fp_t IntegrValueH = 0.0; fp_t NormalizeValueE = 0.0; fp_t NormalizeValueH = 0.0; vtr r; vtr Exa_NumE; Exa_NumE.reset(); vtr Exa_NumH; Exa_NumH.reset(); //Tetrahedron integration for(j = 0; j < points; j++){ zeta[0] = ZetaMat[j][0]; zeta[1] = ZetaMat[j][1]; zeta[2] = ZetaMat[j][2]; zeta[3] = ZetaMat[j][3]; SimplexToCartesian(tet, r, zeta); eLocal[j] = CalcEfield(origEn_1->getEntryPtr(), avtr, vol, zeta, tet.PolyOrderFlag); hLocal[j] = CalcEfield(origHn_32->getEntryPtr(), avtr, vol, zeta, tet.PolyOrderFlag); GetExactSolution(tet, r, eLocal_exa[j], hLocal_exa[j], timeStep, dt, TimeDistFlag); Exa_NumE = eLocal_exa[j] - eLocal[j]; Exa_NumH = hLocal_exa[j] - hLocal[j]; IntegrValueE += weights[j] * vol * (Exa_NumE.magnitude() * Exa_NumE.magnitude()); IntegrValueH += weights[j] * vol * (Exa_NumH.magnitude() * Exa_NumH.magnitude()); NormalizeValueE += weights[j] * vol * (eLocal_exa[j].magnitude() * eLocal_exa[j].magnitude()); NormalizeValueH += weights[j] * vol * (hLocal_exa[j].magnitude() * hLocal_exa[j].magnitude()); } IntegrOmegaE = IntegrOmegaE + IntegrValueE; IntegrOmegaH = IntegrOmegaH + IntegrValueH; NormalizeOmegaE = NormalizeOmegaE + NormalizeValueE; NormalizeOmegaH = NormalizeOmegaH + NormalizeValueH; } // Write to file char Error_E_TimeLog[180]; char Error_H_TimeLog[180]; sprintf(Error_E_TimeLog, "%s.TDerrorE", fname); sprintf(Error_H_TimeLog, "%s.TDerrorH", fname); ofstream Error_E(Error_E_TimeLog, ios_base::out | ios::app); Error_E.setf(ios::scientific, ios::floatfield); Error_E.precision(15); if(!Error_E) cout << "Error in opening file: " << Error_E_TimeLog << " for write " << endl; Error_E << "[" << (timeStep + 1.0) * dt << ", " << sqrt(IntegrOmegaE) << "]; \n"; Error_E.close(); ofstream Error_H(Error_H_TimeLog, ios_base::out | ios::app); Error_H.setf(ios::scientific, ios::floatfield); 
Error_H.precision(15); if(!Error_H) cout << "Error in opening file: " << Error_H_TimeLog << " for write " << endl; Error_H << "[" << (timeStep + 1.5) * dt << ", " << sqrt(IntegrOmegaH) << "]; \n"; Error_H.close(); delete[] weights; for(i = 0; i < points; i++) delete[] ZetaMat[i]; delete[] ZetaMat; } void FemGrp::SimplexToCartesian(tetra& tet, vtr& r, fp_t zeta[4]){ fp_t x = 0.; fp_t y = 0.; fp_t z = 0.; for(int i = 0; i < 4 ; i++){ x += tet.getNode(i)->getCoord().getx() * zeta[i]; y += tet.getNode(i)->getCoord().gety() * zeta[i]; z += tet.getNode(i)->getCoord().getz() * zeta[i]; } r.setvtr(x, y, z); } void FemGrp::GetExactSolution(tetra& tet, vtr& r, vtr& Einc, vtr& Hinc, int timeStep, fp_t dt, int Flag){ fp_t to = To; fp_t tau = Tau; fp_t eta = No * sqrt(tet.mat->mur.getEntry(0,0) / tet.mat->epsr.getEntry(0,0)); fp_t V_light = Vo / sqrt(tet.mat->epsr.getEntry(0,0) * tet.mat->mur.getEntry(0,0)); fp_t Neuman; fp_t Frequency = freq; fp_t omega = 2.0 * Pi * Frequency * MEGA; fp_t Exponent; fp_t SinModul; for(int i = 0; i < bcCNT; i++){ bc bc_i = bcARRAY[i]; if(bc_i.getbType() == planeWaveType || bc_i.getbType() == pmlType){ fp_t Emagnitude = bc_i.getMagE(); fp_t theta_in_rad = bc_i.getTheta() * Pi / 180.0; fp_t phi_in_rad = bc_i.getPhi() * Pi / 180.0; vtr Epol = bc_i.getField(); vtr kvtr(sin(theta_in_rad) * cos(phi_in_rad), sin(theta_in_rad) * sin(phi_in_rad), cos(theta_in_rad)); kvtr.unitvtr(); vtr Hpol = kvtr * Epol; vtr ro = bc_i.getPW_ro(); fp_t Hmagnitude = Emagnitude / eta; Hpol.unitvtr(); Epol.unitvtr(); switch (Flag){ case 0: kvtr.Scale((omega / V_light)); Hinc = Hpol * (Hmagnitude * cos(dotP(kvtr, r - ro) - omega * (timeStep + 1.5) * dt)); Einc = Epol * (Emagnitude * cos(dotP(kvtr, r - ro) - omega * (timeStep + 1.0) * dt)); break; case 1: Exponent = (timeStep + 1.0) * dt - to - (dotP(kvtr, r - ro) / V_light); SinModul = ModuleFlag ? 
cos(omega * Exponent) : 1.0; Einc = Epol * SinModul * (Emagnitude * exp(- (Exponent * Exponent) / (tau * tau))); Exponent = (timeStep + 1.5) * dt - to - (dotP(kvtr, r - ro) / V_light); SinModul = ModuleFlag ? cos(omega * Exponent) : 1.0; Hinc = Hpol * SinModul * (Hmagnitude * exp(- (Exponent * Exponent) / (tau * tau))); break; case 2: Exponent = (timeStep + 1.5) * dt - to - (dotP(kvtr, r - ro) / V_light); Neuman = (2.0 * Exponent) / (tau * tau); Hinc = Hpol * (Hmagnitude * Neuman * exp(- (Exponent * Exponent) / (tau * tau))); Exponent = (timeStep + 1.0) * dt - to - (dotP(kvtr, r - ro) / V_light); Neuman = (2.0 * Exponent) / (tau * tau); Einc = Epol * (Emagnitude * Neuman * exp(- (Exponent * Exponent) / (tau * tau))); break; default: break; } } } } /* "Early Time Behavior in Reverberation Chambers and Its Effect on the Relationships Between Coherence Bandwidth, Chamber Decay Time, RMS Delay Spread, and the Chamber Buildup Time", Christopher L. Holloway et al. */ bool FemGrp::calculatePade(int currentTimeStep){ int M = currentTimeStep / tsPerSampling; int totalSamples = (int)ceil((1.0 * NtimeSteps) / tsPerSampling); int N = (int)floor(M / 2.0); int finish = 0; timer_start("Process : ", 'm'); #pragma omp parallel for schedule(static) shared(finish) for(int pade = 0; pade < padeCNT; pade++){ int auxFinish = 0; fp_t convergence = 0.0; fp_t maxProbe = 0.0; for(int component = 0; component < NumOfFieldComponents; component++){ fp_t a_k[N] = {0}; fp_t b_k[N] = {0}; fp_t_ts maxValComponent = 0.0; getPadeCoef(a_k, b_k, &fieldProbes[pade * totalSamples * NumOfFieldComponents], N, component, &maxValComponent); maxProbe += maxValComponent; convergence += maxValComponent * getFreqDomainPade(a_k, b_k, totalSamples, N, &tranferencePadeFunctionFD[pade * totalSamples * NumOfFieldComponents], component, pade, currentTimeStep / tsPerPade == 1); cout << "Probe = " << pade << " Component = " << component << " Value = " << (convergence / maxProbe) << endl; if((currentTimeStep / tsPerPade 
== 1 || (convergence / maxProbe) < PadeTolerance) && (component == NumOfUnitaryVectors - 1 || component == NumOfFieldComponents - 1)){ auxFinish++; maxProbe = 0.0; convergence = 0.0; } } #pragma omp atomic update finish += auxFinish; } timer_stop('m'); return finish == 0; } void FemGrp::calculatePadeEnd(int currentTimeStep){ int M = currentTimeStep / tsPerSampling; int totalSamples = (int)ceil((1.0 * NtimeSteps) / tsPerSampling); int N = (int)floor(M / 2.0); int finish = 0; timer_start("Process : ", 'm'); for(int pade = 0; pade < padeCNT; pade++){ Complex* FD = new Complex[totalSamples * NumOfFieldComponents]; // #pragma omp parallel for for(int component = 0; component < NumOfFieldComponents; component++){ fp_t a_k[N] = {0}; fp_t b_k[N] = {0}; fp_t_ts maxValComponent = 0.0; timer_start("Coef: " + std::to_string(component) + ": ",'m'); getPadeCoef(a_k, b_k, &fieldProbes[pade * totalSamples * NumOfFieldComponents], N, component, &maxValComponent); timer_stop('m'); timer_start("Freq Dom " + std::to_string(component) + ": ",'m'); getFreqDomainPade(a_k, b_k, totalSamples, N, FD, component, pade, true); timer_stop('m'); } // getPadeIFFTEnd(pade, FD); timer_start("IFFF " + std::to_string(pade) + ": ",'m'); getPadeIFFT(pade, FD); timer_stop('m'); delete[] FD; cout << "Pade point exported: " << pade << endl; } timer_stop('m'); return; } #if defined(DGTD_USE_CUDA) void FemGrp::calculatePadeEndCUDA(int currentTimeStep){ int M = currentTimeStep / tsPerSampling; int totalSamples = (int)ceil((1.0 * NtimeSteps) / tsPerSampling); int N = (int)floor(M / 2.0); int nFields = padeCNT * NumOfFieldComponents; int finish = 0; timer_start("Process : ", 'm'); cudaStream_t* streams = (cudaStream_t*)malloc(NumOfFieldComponents * sizeof(cudaStream_t)); CUDA_SAFE_CALL(cudaMalloc((void**)&padeFreqConstant_d, totalSamples * sizeof(int))); CUDA_SAFE_CALL(cudaMemcpy(padeFreqConstant_d, padeFreqConstant, totalSamples * sizeof(int), cudaMemcpyHostToDevice)); for(int i = 0; i < NumOfFieldComponents; 
i++){ cudaStreamCreate(&streams[i]); } cuDoubleComplex* Hf; CUDA_SAFE_CALL(cudaMallocHost((void**)&Hf, totalSamples * nFields * sizeof(cuDoubleComplex), cudaHostAllocMapped)); for(int pade = 0; pade < padeCNT; pade++){ fp_t* maxValComponent = new fp_t[NumOfFieldComponents]; for(int component = 0; component < NumOfFieldComponents; component++){ fp_t* a_k; fp_t* b_k; CUDA_SAFE_CALL(cudaMallocHost((void**)&a_k, N * sizeof(fp_t), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&b_k, N * sizeof(fp_t), cudaHostAllocMapped)); cuDoubleComplex* FD = &Hf[totalSamples * (pade * NumOfFieldComponents + component)]; getPadeCoefCUDA(a_k, b_k, &maxValComponent[component], pade * NumOfFieldComponents + component, streams[component], currentTimeStep); getFreqDomainPadeCUDA(a_k, b_k, totalSamples, N, FD, streams[component]); } getPadeIFFT(pade, &Hf[pade * totalSamples * NumOfFieldComponents]); } for(int i = 0; i < NumOfFieldComponents; i++){ cudaStreamDestroy(streams[i]); } timer_stop('m'); CUDA_SAFE_CALL(cudaFree(padeFreqConstant_d)); CUDA_SAFE_CALL(cudaFreeHost(Hf)); return; } bool FemGrp::calculatePadeCUDA(int currentTimeStep, bool isFirst, bool isEnd){ if(isEnd){ FreeGPU(); } int M = currentTimeStep / tsPerSampling; int totalSamples = (int)ceil((1.0 * NtimeSteps) / tsPerSampling); int N = (int)floor(M / 2.0); int nFields = padeCNT * NumOfFieldComponents; int finish = 0; bool* exitArray = new bool[padeCNT]; timer_start("Process : ", 'm'); cudaStream_t* streams = (cudaStream_t*)malloc(NumOfFieldComponents * sizeof(cudaStream_t)); CUDA_SAFE_CALL(cudaMalloc((void**)&padeFreqConstant_d, totalSamples * sizeof(int))); CUDA_SAFE_CALL(cudaMemcpy(padeFreqConstant_d, padeFreqConstant, totalSamples * sizeof(int), cudaMemcpyHostToDevice)); for(int i = 0; i < NumOfFieldComponents; i++){ cudaStreamCreate(&streams[i]); } int nPoints = isEnd ? 
probeCNT : padeCNT; for(int pade = 0; pade < nPoints; pade++){ timer_start("Process : ", 'm'); fp_t* maxValComponent = new fp_t[NumOfFieldComponents]; cuDoubleComplex* Hf; CUDA_SAFE_CALL(cudaMallocHost((void**)&Hf, totalSamples * NumOfFieldComponents * sizeof(cuDoubleComplex), cudaHostAllocMapped)); #pragma omp parallel for for(int component = 0; component < NumOfFieldComponents; component++){ fp_t* a_k; fp_t* b_k; CUDA_SAFE_CALL(cudaMallocHost((void**)&a_k, N * sizeof(fp_t), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&b_k, N * sizeof(fp_t), cudaHostAllocMapped)); cuDoubleComplex* FD = &Hf[totalSamples * component]; getPadeCoefCUDA(a_k, b_k, &maxValComponent[component], pade * NumOfFieldComponents + component, streams[component], currentTimeStep); getFreqDomainPadeCUDA(a_k, b_k, totalSamples, N, FD, streams[component]); } if(!isFirst && !isEnd){ exitArray[pade] = studyPadeConvergence(&tranferencePadeFunctionFD_h[pade * NumOfFieldComponents * totalSamples], Hf, maxValComponent, totalSamples, pade); } if(isEnd){ printFD(pade, Hf); if(pade < padeCNT && writePadeTD){ getPadeIFFT(pade, Hf); } cout << "Final Pade Point " << pade << "completed" << endl; }else{ CUDA_SAFE_CALL(cudaMemcpy(&tranferencePadeFunctionFD_h[pade * NumOfFieldComponents * totalSamples], Hf, totalSamples * NumOfFieldComponents * sizeof(cuDoubleComplex), cudaMemcpyHostToHost)); } delete [] maxValComponent; CUDA_SAFE_CALL(cudaFreeHost(Hf)); timer_stop('m'); } for(int i = 0; i < NumOfFieldComponents; i++){ cudaStreamDestroy(streams[i]); } CUDA_SAFE_CALL(cudaFree(padeFreqConstant_d)); bool exitValue = false; if(!isFirst && !isEnd){ for(int pade = 0; pade < padeCNT; pade++){ if(pade == 0){ exitValue = exitArray[0]; }else exitValue = exitValue & exitArray[pade]; } } delete [] exitArray; timer_stop('m'); return exitValue; } bool FemGrp::studyPadeConvergence(cuDoubleComplex* oldField, cuDoubleComplex* newField, fp_t* maxFields, int M_global, int point){ for(int typeOfField = 0; typeOfField < 
TypeOfFields; typeOfField++){ fp_t convergence = 0.0; fp_t maxProbe = 0.0; #pragma omp parallel for shared(convergence, maxProbe) for(int component = 0; component < NumOfUnitaryVectors; component++){ fp_t sum_X = 0.0, sum_Y = 0.0, sum_XY = 0.0, sum_XX = 0.0, sum_YY = 0.0; fp_t lastYf_abs = 0.0; fp_t currentYf_abs = 0.0; for(int i = 0; i < M_global; i++){ int arrayMap = component * M_global + i; lastYf_abs = sqrt(pow(oldField[arrayMap].x,2)+pow(oldField[arrayMap].y,2)); currentYf_abs = sqrt(pow(newField[arrayMap].x,2)+pow(newField[arrayMap].y,2)); sum_X = sum_X + currentYf_abs; sum_Y = sum_Y + lastYf_abs; sum_XY = sum_XY + currentYf_abs * lastYf_abs; sum_XX = sum_XX + currentYf_abs * currentYf_abs; sum_YY = sum_YY + lastYf_abs * lastYf_abs; } #pragma omp atomic update convergence += maxFields[component] * (M_global * sum_XY - sum_X * sum_Y) / sqrt((M_global * sum_XX - sum_X * sum_X) * (M_global * sum_YY - sum_Y * sum_Y)); #pragma omp atomic update maxProbe += maxFields[component]; } cout << "Convergence Point " << point << " Fields " << (typeOfField ? 
"H" : "E") << ": " << (convergence / maxProbe) << endl;
            if((convergence / maxProbe) < PadeTolerance){
                return false;
            }
        }
    return true;
}

// Reports a failed cuSOLVER call on stderr; returns true when the call succeeded.
static inline bool padeCusolverOk(cusolverStatus_t status, const char* what){
    if(status == CUSOLVER_STATUS_SUCCESS){
        return true;
    }
    fprintf(stderr, "[getPadeCoefCUDA] %s failed (cusolverStatus_t = %d)\n", what, (int)status);
    return false;
}

/**
 * Computes Pade numerator/denominator coefficients for one probe component
 * by solving a dense linear system on the GPU (cuSOLVER LU: getrf + getrs).
 *
 * With M = currentTimeStep / tsPerSampling, N = floor(M / 2) and n = N - 1,
 * an n x n matrix G and right-hand side d are filled from fieldProbes, the
 * system is solved on `stream`, and the solution becomes b_k[1..n].
 * a_k[1..n] is then accumulated from b_k and the first samples of fieldProbes.
 *
 * The double-precision cuSOLVER entry points are used, so this presumes
 * fp_t is double -- TODO confirm against the fp_t typedef.
 *
 * @param a_k             out: numerator coefficients, written at [0..n]
 * @param b_k             out: denominator coefficients, written at [0..n] (b_k[0] = 1)
 * @param maxField        out: largest |sample| among the entries that were read
 * @param local_id        offset of the probe component inside one sample row
 * @param stream          CUDA stream used for copies and the solve
 * @param currentTimeStep time step that determines how many samples exist
 */
void FemGrp::getPadeCoefCUDA(fp_t* a_k, fp_t* b_k, fp_t* maxField, int local_id, cudaStream_t stream, int currentTimeStep){
    const int M = currentTimeStep / tsPerSampling;
    const int N = (int)floor(M / 2.0);
    const int n = N - 1;                                    // order of the dense system
    const int nrhs = 1;
    const int rowStride = probeCNT * NumOfFieldComponents;  // distance between consecutive samples

    a_k[0] = fieldProbes[local_id];
    b_k[0] = 1;
    *maxField = abs(fieldProbes[local_id]);

    // Too few samples: there is no system to solve, and zero/negative sizes
    // must not reach the allocators or cuSOLVER.
    if(n <= 0){
        return;
    }

    // Sizes are computed in size_t so n * n cannot overflow int.
    const size_t matBytes = (size_t)n * (size_t)n * sizeof(fp_t);
    const size_t vecBytes = (size_t)n * sizeof(fp_t);

    fp_t* G_h = NULL;
    fp_t* d_h = NULL;
    fp_t* G_d = NULL;
    fp_t* d_d = NULL;

    CUDA_SAFE_CALL(cudaMallocHost((void**)&G_h, matBytes, cudaHostAllocMapped));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&d_h, vecBytes, cudaHostAllocMapped));

    // G is stored column-major (entry (i, j) at j * n + i), as cuSOLVER expects.
    for(int i = 0; i < n; i++){
        for(int j = 0; j < n; j++){
            G_h[j * n + i] = fieldProbes[(N - j + i) * rowStride + local_id];
            *maxField = max(abs(fieldProbes[(N - j + i) * rowStride + local_id]), *maxField);
        }
        d_h[i] = -fieldProbes[(N + i + 1) * rowStride + local_id];
    }

    // Copy matrices
    CUDA_SAFE_CALL(cudaMalloc((void**)&G_d, matBytes));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_d, vecBytes));
    CUDA_SAFE_CALL(cudaMemcpyAsync(G_d, G_h, matBytes, cudaMemcpyHostToDevice, stream));
    CUDA_SAFE_CALL(cudaMemcpyAsync(d_d, d_h, vecBytes, cudaMemcpyHostToDevice, stream));

    cusolverDnHandle_t handle = NULL;
    bool solverOk = padeCusolverOk(cusolverDnCreate(&handle), "cusolverDnCreate");
    solverOk = padeCusolverOk(cusolverDnSetStream(handle, stream), "cusolverDnSetStream") && solverOk;

    // Calculate workspace size
    int bufferSize = 0;
    solverOk = padeCusolverOk(cusolverDnDgetrf_bufferSize(handle, n, n, G_d, n, &bufferSize),
                              "cusolverDnDgetrf_bufferSize") && solverOk;

    // info[0] receives the getrf status, info[1] the getrs status.
    int* info = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&info, 2 * sizeof(int)));
    CUDA_SAFE_CALL(cudaMemsetAsync(info, 0, 2 * sizeof(int), stream));
    fp_t* buffer = NULL;   // LU workspace
    CUDA_SAFE_CALL(cudaMalloc((void**)&buffer, (size_t)bufferSize * sizeof(fp_t)));
    int* ipiv = NULL;      // pivoting sequence
    CUDA_SAFE_CALL(cudaMalloc((void**)&ipiv, (size_t)n * sizeof(int)));

    // Solve problem
    solverOk = padeCusolverOk(cusolverDnDgetrf(handle, n, n, G_d, n, buffer, ipiv, info),
                              "cusolverDnDgetrf") && solverOk;
    solverOk = padeCusolverOk(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, nrhs, G_d, n, ipiv, d_d, n, info + 1),
                              "cusolverDnDgetrs") && solverOk;

    // Copy data back to CPU and wait for everything queued on the stream.
    int info_h[2] = {0, 0};
    CUDA_SAFE_CALL(cudaMemcpyAsync(d_h, d_d, vecBytes, cudaMemcpyDeviceToHost, stream));
    CUDA_SAFE_CALL(cudaMemcpyAsync(info_h, info, 2 * sizeof(int), cudaMemcpyDeviceToHost, stream));
    CUDA_SAFE_CALL(cudaStreamSynchronize(stream));

    if(info_h[0] != 0 || info_h[1] != 0 || !solverOk){
        // The coefficients below are still produced (as before), but they are
        // not trustworthy when the factorization or the solve reported a problem.
        fprintf(stderr, "[getPadeCoefCUDA] LU solve reported a problem (getrf info = %d, getrs info = %d, n = %d)\n",
                info_h[0], info_h[1], n);
    }

    // Free GPU
    CUDA_SAFE_CALL(cudaFree(G_d));
    CUDA_SAFE_CALL(cudaFree(d_d));
    CUDA_SAFE_CALL(cudaFree(buffer));
    CUDA_SAFE_CALL(cudaFree(info));
    CUDA_SAFE_CALL(cudaFree(ipiv));
    if(handle != NULL){
        cusolverDnDestroy(handle);
    }
    CUDA_SAFE_CALL(cudaFreeHost(G_h));

    for(int i = 0; i < n; i++){
        b_k[i + 1] = d_h[i];
        a_k[i + 1] = 0.0;
        for(int j = 0; j < i + 1; j++){
            a_k[i + 1] += b_k[j] * fieldProbes[(i + 1 - j) * rowStride + local_id];
        }
    }

    CUDA_SAFE_CALL(cudaFreeHost(d_h));
}

/**
 * Evaluates the Pade approximant on the GPU for M_global frequency samples.
 *
 * Uploads a_k and b_k (N entries each), launches CalculatePadeFreq with one
 * thread per frequency sample (256 threads per block), and copies the
 * M_global complex results back into H_f. The call returns after `stream`
 * has been synchronized, so H_f is valid on return.
 *
 * @param a_k      host numerator coefficients (N entries)
 * @param b_k      host denominator coefficients (N entries)
 * @param M_global number of frequency samples
 * @param N        number of coefficients
 * @param H_f      out: host buffer with room for M_global values
 * @param stream   CUDA stream used for copies and the kernel
 */
void FemGrp::getFreqDomainPadeCUDA(fp_t* a_k, fp_t* b_k, int M_global, int N, cuDoubleComplex* H_f, cudaStream_t stream){
    // An empty problem would produce a zero-block launch, which is an invalid configuration.
    if(M_global <= 0 || N <= 0){
        return;
    }

    const size_t coefBytes = (size_t)N * sizeof(fp_t);
    const size_t outBytes  = (size_t)M_global * sizeof(cuDoubleComplex);

    fp_t* a_k_d = NULL;
    fp_t* b_k_d = NULL;
    cuDoubleComplex* H_f_d = NULL;

    CUDA_SAFE_CALL(cudaMalloc((void**)&a_k_d, coefBytes));
    CUDA_SAFE_CALL(cudaMalloc((void**)&b_k_d, coefBytes));
    CUDA_SAFE_CALL(cudaMemcpyAsync(a_k_d, a_k, coefBytes, cudaMemcpyHostToDevice, stream));
    CUDA_SAFE_CALL(cudaMemcpyAsync(b_k_d, b_k, coefBytes, cudaMemcpyHostToDevice, stream));
    CUDA_SAFE_CALL(cudaMalloc((void**)&H_f_d, outBytes));

    const dim3 block(256, 1, 1);
    const dim3 grid(ceil_div(M_global, 256), 1, 1);
    // NOTE(review): the launch configuration was not legible in the reviewed text;
    // <<<grid, block, 0, stream>>> is reconstructed from the variables set up here -- confirm.
    CalculatePadeFreq<<<grid, block, 0, stream>>>(a_k_d, b_k_d, M_global, N, padeFreqConstant_d, H_f_d);
    CUDA_SAFE_CALL(cudaGetLastError());   // a bad launch configuration is only reported here

    CUDA_SAFE_CALL(cudaMemcpyAsync(H_f, H_f_d, outBytes, cudaMemcpyDeviceToHost, stream));
    CUDA_SAFE_CALL(cudaStreamSynchronize(stream));

    CUDA_SAFE_CALL(cudaFree(a_k_d));
    CUDA_SAFE_CALL(cudaFree(b_k_d));
    CUDA_SAFE_CALL(cudaFree(H_f_d));
}

void FemGrp::getPadeIFFT(int probe, cuDoubleComplex* fDomainField){
    int M_global = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
    double* tDomainField = new double[M_global];
    double* tDomainFieldOutput = new double[M_global * NumOfFieldComponents];
    const auto max_precision {std::numeric_limits::digits10 + 1};
for(int component = 0; component < NumOfFieldComponents; component++){
        fftw_complex* fft;
        fftw_plan ifft;
        fft = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * M_global);
        #pragma omp parallel for
        for(int k = 0; k < M_global; k++){
            cuDoubleComplex field = fDomainField[component * M_global + k];
            Complex aux = (std::complex(field.x, field.y) / sourceFreqDomain[k]) / M_global;
            fft[k][0] = (abs(sourceFreqDomain[k]) < SourceTolerancePade) ? 0.0 : aux.real();
            fft[k][1] = (abs(sourceFreqDomain[k]) < SourceTolerancePade) ? 0.0 : aux.imag();
        }
        ifft = fftw_plan_dft_c2r_1d(M_global, fft, tDomainField, FFTW_ESTIMATE);
        fftw_execute(ifft);
        fftw_destroy_plan(ifft);
        fftw_free(fft);
        #pragma omp parallel for
        for (int i = 0; i < M_global; i++) {
            tDomainFieldOutput[component * M_global + i] = 0.0;
            for (int j = 0; j <= min(i, tsSource); j++) {
                tDomainFieldOutput[component * M_global + i] += tDomainField[i - j] * sourceTimeDomain[j]; // Main convolution operation
            }
        }
    }
    char csvFileName[StrOutput];
    sprintf(csvFileName, "./PROBES/TD_Pade_%s_Probe_%d.csv", fname, probe);
    std::ofstream csvFile(csvFileName);
    csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";
    for(int n = 0; n < M_global; n++){
        for(int component = 0; component < NumOfFieldComponents; component++){
            if (component > 0){
                csvFile << ",";
            }
            csvFile << std::setprecision(max_precision) << tDomainFieldOutput[component * M_global + n];
        }
        csvFile << "\n";
    }
    usleep(100);
    csvFile.close();
    delete [] tDomainField;
    delete [] tDomainFieldOutput;
}

/**
 * Writes the frequency-domain Pade result of one probe to
 * ./PROBES/FD_Pade_<fname>_Probe_<probe>.csv.
 *
 * Each row is one frequency sample; for every field component the real and
 * imaginary parts are written as two columns.
 *
 * @param probe        probe number, used only in the file name
 * @param fDomainField values laid out as [component * M_global + sample]
 */
void FemGrp::printFD(int probe, cuDoubleComplex* fDomainField){
    const int M_global = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
    // cuDoubleComplex stores doubles, so print every significant decimal digit of a double.
    const int max_precision = std::numeric_limits<double>::digits10 + 1;

    char csvFileName[StrOutput];
    // snprintf: fname has unknown length, never write past the buffer.
    snprintf(csvFileName, sizeof(csvFileName), "./PROBES/FD_Pade_%s_Probe_%d.csv", fname, probe);
    std::ofstream csvFile(csvFileName);
    if(!csvFile.is_open()){
        std::cerr << "Error opening file: " << csvFileName << "\n";
        return;
    }

    csvFile << "ExRe" << "," << "ExIm" << "," << "EyRe" << "," << "EyIm" << "," << "EzRe" << "," << "EzIm" << ","
            << "HxRe" << "," << "HxIm" << "," << "HyRe" << "," << "HyIm" << "," << "HzRe" << "," << "HzIm" << "\n";

    for(int n = 0; n < M_global; n++){
        for(int component = 0; component < NumOfFieldComponents; component++){
            if (component > 0){
                csvFile << ",";
            }
            const cuDoubleComplex& value = fDomainField[component * M_global + n];
            csvFile << std::setprecision(max_precision) << value.x << "," << value.y;
        }
        csvFile << "\n";
    }
}

/**
 * Replays previously written probe CSVs into fieldProbes and then runs the
 * final Pade computation.
 *
 * Files PROBES_aux/Probes_<fname>_<step>.csv are read for step = 0,
 * tsPerSampling, 2 * tsPerSampling, ... The replay stops at the first file
 * that cannot be read or parsed (rapidcsv throws), or once totalSamples
 * files have been loaded, so fieldProbes is never filled beyond the number
 * of sampling rows the simulation itself produces.
 */
void FemGrp::testEnd(){
    int ts = 0;
    char tname[StrLenShort];
    const int totalSamples = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);

    try {
        while(ts < totalSamples){
            snprintf(tname, sizeof(tname), "PROBES_aux/Probes_%s_%04i.csv", fname, ts * tsPerSampling);
            cout << tname << endl;
            rapidcsv::Document probe_doc(tname);
            // NOTE(review): the column element type was not legible in the reviewed text;
            // double is assumed -- confirm against the type of fieldProbes.
            const std::vector<double> Ex_col = probe_doc.GetColumn<double>("Ex");
            const std::vector<double> Ey_col = probe_doc.GetColumn<double>("Ey");
            const std::vector<double> Ez_col = probe_doc.GetColumn<double>("Ez");
            const std::vector<double> Hx_col = probe_doc.GetColumn<double>("Hx");
            const std::vector<double> Hy_col = probe_doc.GetColumn<double>("Hy");
            const std::vector<double> Hz_col = probe_doc.GetColumn<double>("Hz");

            // One row per probe; the Ey column defines the probe count, as before.
            const size_t nProbes = Ey_col.size();
            const size_t rowBase = (size_t)ts * NumOfFieldComponents * nProbes;
            for(size_t i = 0; i < nProbes; i++){
                const size_t base = rowBase + i * NumOfFieldComponents;
                fieldProbes[base + 0] = Ex_col[i];
                fieldProbes[base + 1] = Ey_col[i];
                fieldProbes[base + 2] = Ez_col[i];
                fieldProbes[base + 3] = Hx_col[i];
                fieldProbes[base + 4] = Hy_col[i];
                fieldProbes[base + 5] = Hz_col[i];
            }
            ts++;
        }
    }catch(...){
        // The first missing or unreadable file marks the end of the recorded samples.
    }

    calculatePadeCUDA(ts * tsPerSampling, false, true);
}
#endif

void FemGrp::getPadeCoef(fp_t* a_k, fp_t* b_k, fp_t_ts* field, int N, int component, fp_t_ts* maxField){
    denseMat* G = new denseMat(N-1, N-1);
    ArrayFP d(N-1);
    a_k[0] = field[component];
    b_k[0] = 1;
    *maxField = field[component];
    // timer_start("Fill : ", 'm');
    timer_start("getPadeCoef " + std::to_string(1) + ": ",'m');
    for(int k = 0; k < N-1; k++){
        for(int m = 0; m < N-1; m++){
            G->setEntry(k,m, field[(N - m + k) * NumOfFieldComponents + component]); //it has to be in column form
            *maxField = max(abs(field[(N - m + k) * NumOfFieldComponents + component]), *maxField);
        }
        d[k] = -field[(N + k + 1) * NumOfFieldComponents + component];
    }
    timer_stop('m');
    timer_start("getPadeCoef " + std::to_string(2) + ": ",'m');
    G->SelfTranspose();
    timer_stop('m');
    timer_start("getPadeCoef " + std::to_string(3) + ": ",'m');
    solveAx_B(*G, d);
    timer_stop('m');
    timer_start("getPadeCoef " + std::to_string(4) + ": ",'m');
    for(int k = 0; k < N-1; k++){
        b_k[k + 1] = d[k];
        for(int m = 0; m < k + 1; m++){
            a_k[k + 1] += b_k[m] * field[(k + 1 - m) * NumOfFieldComponents + component];
        }
    }
    timer_stop('m');
    G->Clear(); for(int i = 0; i SourceTolerancePade){ tsSource = i; } } int finish = N % 2 == 0 ?
N / 2 - 1 : (N - 1) / 2;
    fftw_complex* fftOut;
    fftOut = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
    fftw_plan fft;
    fft = fftw_plan_dft_r2c_1d(N, sourceTimeDomain, fftOut, FFTW_ESTIMATE);
    fftw_execute(fft);
    fftw_destroy_plan(fft);
    #pragma omp parallel for
    for (int i = 0; i < N; ++i) {
        sourceFreqDomain[i] = std::complex(fftOut[i][0], fftOut[i][1]);
        if (i <= finish) {
            padeFreqConstant[i] = i;
        } else {
            padeFreqConstant[i] = -N + i;
        }
    }
    fftw_free(fftOut);
}

/**
 * Evaluates the excitation waveform at the end of the given time step.
 *
 * The evaluation time is (timeStep + 1) * dt with dt = LocTimeSteps[N_class - 1].
 *   ExcitFlag 0    : cos(omega * t)
 *   ExcitFlag 1, 2 : exp(-((t - To) / Tau)^2), multiplied by cos(omega * (t - To))
 *                    when ModuleFlag is set (both flags use the same formula here)
 *   anything else  : *Einc is left untouched
 *
 * @param timeStep  zero-based time step index
 * @param Einc      out: waveform value
 * @param ExcitFlag waveform selector (see above)
 */
void FemGrp::getSourceTimeDomain(int timeStep, fp_t* Einc, int ExcitFlag){
    const fp_t dt = LocTimeSteps[N_class - 1];
    const fp_t omega = 2.0 * Pi * freq * MEGA;
    const fp_t to = To;
    const fp_t tau = Tau;

    switch (ExcitFlag){
        case 0:
            *Einc = static_cast<fp_t>(cos(omega * (timeStep + 1.0) * dt));
            break;
        case 1:
        case 2: {
            const fp_t Exponent = (timeStep + 1.0) * dt - to;
            const fp_t SinModul = ModuleFlag ? cos(omega * Exponent) : 1.0;
            *Einc = static_cast<fp_t>(SinModul * exp(- (Exponent * Exponent) / (tau * tau)));
            break;
        }
        default:
            break;
    }
}

/**
 * Evaluates the Pade approximant sum(a_k z^k) / sum(b_k z^k) at every entry
 * of padeFreqs and stores it in H_f[i * NumOfFieldComponents + component].
 *
 * firstValue == true : the magnitude of every sample is also written to
 *                      Pade_Freq_1_<N>_<probe>_<component>.csv; returns 0.
 * firstValue == false: nothing is written to disk (the CSV is no longer
 *                      created or truncated on this path); returns the
 *                      correlation coefficient between the new magnitudes
 *                      and the magnitudes previously stored in H_f.
 *
 * @param a_k, b_k   numerator / denominator coefficients (N entries each)
 * @param M_global   number of frequency samples
 * @param N          number of coefficients
 * @param H_f        in/out: previous and new frequency response
 * @param component  field component index
 * @param probe      probe number (file name only)
 * @param firstValue selects the behaviour described above
 */
fp_t FemGrp::getFreqDomainPade(fp_t* a_k, fp_t* b_k, int M_global, int N, Complex* H_f, int component, int probe, bool firstValue){
    // Rational function value at frequency sample i.
    auto evalPade = [&](int i) -> Complex {
        Complex sumA_k = 0;
        Complex sumB_k = 0;
        for(int k = 0; k < N; k++){
            sumA_k += a_k[k] * pow(padeFreqs[i], k);
            sumB_k += b_k[k] * pow(padeFreqs[i], k);
        }
        return sumA_k / sumB_k;
    };

    if(firstValue){
        const int max_precision = std::numeric_limits<double>::digits10 + 1;
        char csvFileName[StrOutput];
        snprintf(csvFileName, sizeof(csvFileName), "Pade_Freq_1_%d_%d_%d.csv", N, probe, component);
        std::ofstream csvFile(csvFileName);

        for(int i = 0; i < M_global; i++){
            const Complex freqVal = evalPade(i);
            csvFile << std::setprecision(max_precision) << sqrt(pow(freqVal.real(),2)+pow(freqVal.imag(),2));
            csvFile << "\n";
            H_f[i * NumOfFieldComponents + component] = freqVal;
        }
        cout << "First/Final Pade Calculation" << endl;
        csvFile.close();
        return 0.0;
    }

    fp_t sum_X = 0.0, sum_Y = 0.0, sum_XY = 0.0, sum_XX = 0.0, sum_YY = 0.0;
    for(int i = 0; i < M_global; i++){
        const Complex freqVal = evalPade(i);
        const Complex previous = H_f[i * NumOfFieldComponents + component];
        const fp_t lastYf_abs = sqrt(previous.real() * previous.real() + previous.imag() * previous.imag());
        H_f[i * NumOfFieldComponents + component] = freqVal;
        const fp_t currentYf_abs = sqrt(pow(freqVal.real(),2)+pow(freqVal.imag(),2));

        sum_X = sum_X + currentYf_abs;
        sum_Y = sum_Y + lastYf_abs;
        sum_XY = sum_XY + currentYf_abs * lastYf_abs;
        sum_XX = sum_XX + currentYf_abs * currentYf_abs;
        sum_YY = sum_YY + lastYf_abs * lastYf_abs;
    }

    // Correlation between current and previous magnitudes.
    // NOTE(review): the result is NaN when either series is constant (zero variance).
    const fp_t corr = (M_global * sum_XY - sum_X * sum_Y)
                    / sqrt((M_global * sum_XX - sum_X * sum_X) * (M_global * sum_YY - sum_Y * sum_Y));
    return corr;
}

/**
 * Converts a probe's frequency-domain Pade response back to the time domain
 * with a direct (O(M_global^2)) inverse DFT and writes the result to
 * Pade_<fname>_Probe_<probe>.csv.
 *
 * For every component the response is divided by sourceFreqDomain (bins whose
 * source magnitude is below SourceTolerancePade contribute zero),
 * inverse-transformed, and convolved with sourceTimeDomain.
 *
 * @param probe        probe number (file name only)
 * @param fDomainField values laid out as [sample * NumOfFieldComponents + component]
 */
void FemGrp::getPadeIFFTEnd(int probe, Complex* fDomainField){
    const int M_global = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
    Complex j = Complex (0.0, 1.0);
    fp_t* tDomainField = new fp_t[M_global * NumOfFieldComponents];
    const int max_precision = std::numeric_limits<double>::digits10 + 1;

    for(int component = 0; component < NumOfFieldComponents; component++){
        Complex* tDomainTransferFunction = new Complex[M_global];

        // Inverse DFT of the transfer function H(f) / S(f).
        #pragma omp parallel for
        for(int n = 0; n < M_global; n++){
            tDomainTransferFunction[n] = 0.0;
            for(int k = 0; k < M_global; k++){
                tDomainTransferFunction[n] += abs(sourceFreqDomain[k]) < SourceTolerancePade ? 0.0 : fDomainField[k * NumOfFieldComponents + component] / sourceFreqDomain[k] * exp(j * 2 * Pi * n * k / M_global);
            }
            tDomainTransferFunction[n] /= M_global;
        }

        // Causal convolution with the time-domain source.
        #pragma omp parallel for
        for(int n = 0; n < M_global; n++){
            tDomainField[n * NumOfFieldComponents + component] = 0.0;
            for(int k = 0; k <= n; k++){
                tDomainField[n * NumOfFieldComponents + component] += tDomainTransferFunction[n-k].real() * sourceTimeDomain[k];
            }
        }
        delete [] tDomainTransferFunction;
    }

    char csvFileName[StrOutput];
    snprintf(csvFileName, sizeof(csvFileName), "Pade_%s_Probe_%d.csv", fname, probe);
    std::ofstream csvFile(csvFileName);
    csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";
    for(int n = 0; n < M_global; n++){
        for(int component = 0; component < NumOfFieldComponents; component++){
            if (component > 0){
                csvFile << ",";
            }
            csvFile << std::setprecision(max_precision) << tDomainField[n * NumOfFieldComponents + component];
        }
        csvFile << "\n";
    }
    usleep(100);
    csvFile.close();
    delete [] tDomainField;
}

void FemGrp::getPadeIFFT(int probe, Complex* fDomainField){
    cout << "hello" << endl;
    int M_global = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
    double* tDomainField = new double[M_global * NumOfFieldComponents];
    double* tDomainFieldOutput = new double[M_global * NumOfFieldComponents];
    const auto max_precision {std::numeric_limits::digits10 + 1};
    for(int component = 0; component < NumOfFieldComponents; component++){
        fftw_complex* fft;
        fftw_plan ifft;
        fft = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * M_global);
        #pragma omp parallel for
        for(int k = 0; k < M_global; k++){
            // Complex aux = (fDomainField[k * NumOfFieldComponents + component] / sourceFreqDomain[k]) / M_global;
            Complex aux = (fDomainField[probe * M_global * NumOfFieldComponents + component * M_global + k] / sourceFreqDomain[k]) / M_global;
            fft[k][0] = (abs(sourceFreqDomain[k]) < SourceTolerancePade) ? 0.0 : aux.real();
            fft[k][1] = (abs(sourceFreqDomain[k]) < SourceTolerancePade) ?
0.0 : aux.imag();
            // fft[k] = (abs(sourceFreqDomain[k]) < SourceTolerancePade) ? (fftw_complex)0.0 : (fftw_complex)fDomainField[k * NumOfFieldComponents + component];
        }
        double* tDomainFieldVec = &tDomainField[M_global * component];
        ifft = fftw_plan_dft_c2r_1d(M_global, fft, tDomainFieldVec, FFTW_ESTIMATE);
        fftw_execute(ifft);
        fftw_destroy_plan(ifft);
        fftw_free(fft);
        #pragma omp parallel for
        for (int i = 0; i < M_global; i++) {
            tDomainFieldOutput[component * M_global + i] = 0.0;
            for (int j = 0; j <= min(i, tsSource); j++) {
                tDomainFieldOutput[component * M_global + i] += tDomainField[component * M_global + i - j] * sourceTimeDomain[j]; // Main convolution operation
            }
        }
    }
    char csvFileName[StrOutput];
    sprintf(csvFileName, "Pade_%s_Probe_%d.csv", fname, probe);
    std::ofstream csvFile(csvFileName);
    csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";
    for(int n = 0; n < M_global; n++){
        for(int component = 0; component < NumOfFieldComponents; component++){
            if (component > 0){
                csvFile << ",";
            }
            csvFile << std::setprecision(max_precision) << tDomainFieldOutput[component * M_global + n];
        }
        csvFile << "\n";
    }
    usleep(100);
    csvFile.close();
    delete [] tDomainField;
    delete [] tDomainFieldOutput;
}

/**
 * Fills a quadrature rule on the reference tetrahedron in barycentric
 * coordinates. Weights of every rule sum to 1.
 *
 *   PolyOrder 1 : 1 point  (centroid)
 *   PolyOrder 2 : 4 points
 *   PolyOrder 3 : 5 points (centroid with negative weight + 4 symmetric points)
 *
 * For any other PolyOrder nothing is written and `points` is left unchanged.
 *
 * @param PolyOrder rule selector (see above)
 * @param points    out: number of quadrature points that were written
 * @param zeta      out: zeta[q][0..3] are the barycentric coordinates of point q;
 *                  the caller must provide at least `points` rows of 4 entries
 * @param weights   out: weights[q] for every point q
 */
void FemGrp::GetTetQuadRule(int PolyOrder, int& points, fp_t** zeta, fp_t* weights){
    // Stores one quadrature point and its weight.
    auto setPoint = [&](int q, fp_t z0, fp_t z1, fp_t z2, fp_t z3, fp_t w){
        zeta[q][0] = z0;
        zeta[q][1] = z1;
        zeta[q][2] = z2;
        zeta[q][3] = z3;
        weights[q] = w;
    };

    if(PolyOrder == 1){
        // The centroid needs all four coordinates; previously only zeta[0][0]
        // was assigned (three times) and the others were left uninitialized.
        setPoint(0, 0.25, 0.25, 0.25, 0.25, 1.0);
        points = 1;
    }else if(PolyOrder == 2){
        const fp_t a = 0.585410196624969;
        const fp_t b = 0.138196601125011;
        const fp_t w = 0.250000000000000;
        setPoint(0, a, b, b, b, w);
        setPoint(1, b, a, b, b, w);
        setPoint(2, b, b, a, b, w);
        setPoint(3, b, b, b, a, w);
        points = 4;
    }else if(PolyOrder == 3){
        const fp_t c = 0.250000000000000;
        const fp_t a = 0.500000000000000;
        const fp_t b = 0.166666666666667;
        const fp_t w = 0.450000000000000;
        setPoint(0, c, c, c, c, -0.800000000000000);
        setPoint(1, a, b, b, b, w);
        setPoint(2, b, a, b, b, w);
        setPoint(3, b, b, a, b, w);
        setPoint(4, b, b, b, a, w);
        points = 5;
    }
}

/**
 * Gathers the local E and H coefficients of one tetrahedron from the global
 * solution vectors en_1 and hn_32.
 *
 * Entries whose local-to-global map value is negative are set to 0.
 *
 * NOTE(review): the element type of ArrayFP was not legible in the reviewed
 * text; ArrayFP<fp_t> is assumed -- confirm against the declaration.
 *
 * @param tet       tetrahedron whose coefficients are gathered
 * @param origEn_1  out: local E coefficients (TetPolyOrderDim[tet->PolyOrderFlag] entries)
 * @param origHn_32 out: local H coefficients (same length)
 */
void FemGrp::Get_Coefficients_(tetra* tet, ArrayFP<fp_t>* origEn_1, ArrayFP<fp_t>* origHn_32){
    const int localDim = TetPolyOrderDim[tet->PolyOrderFlag];
    int* tetraMAP_E = new int[localDim];
    int* tetraMAP_H = new int[localDim];

    tet->Local_DG_mapE(tetraMAP_E, tet->LocalOffsetE);
    tet->Local_DG_mapH(tetraMAP_H, tet->LocalOffsetH);

    origEn_1->reset();
    origHn_32->reset();

    for(int i = 0 ; i < localDim; i++){
        origEn_1->setentry(i, tetraMAP_E[i] < 0 ? 0.0 : en_1->getentry(tetraMAP_E[i]));
        origHn_32->setentry(i, tetraMAP_H[i] < 0 ? 0.0 : hn_32->getentry(tetraMAP_H[i]));
    }

    // The maps are scratch data; they were leaked on every call before.
    delete [] tetraMAP_E;
    delete [] tetraMAP_H;
}

/**
 * Assigns global E/H degree-of-freedom numbers to every tetrahedron and
 * prints a summary.
 *
 * Local E numbers are shifted by 0 and local H numbers by DimE; entries equal
 * to NOT_NUMBERED are passed through unchanged.
 */
void FemGrp::numberDofs(){
    const int LocalDim = TetPolyOrderDim[PolyFlag];
    const int EdofOffset = 0;     // [E H] offset
    const int HdofOffset = DimE;

    for(int i = 0; i < tetraCNT; i++){
        tetra* tet = &(tetARRAY[i]);
        tet->allocDofMap();
        int* tetraEMap = tet->get_LocalEMap(); // obtained from SetupMatrixFree
        int* tetraHMap = tet->get_LocalHMap();
        for(int j = 0; j < LocalDim; j++){
            // in case there is -1
            const int globalE = (tetraEMap[j] != NOT_NUMBERED) ? (tetraEMap[j] + EdofOffset) : (tetraEMap[j]);
            const int globalH = (tetraHMap[j] != NOT_NUMBERED) ? (tetraHMap[j] + HdofOffset) : (tetraHMap[j]);
            tet->setEHGlobalMap(j, globalE, globalH);
        }
    }

    size_t matrixDIM_com = dimE + dimH;
    cout << " " << endl;
    cout << "==============================================" << endl;
    cout << " NUMBER OF DEGREES OF FREEDOM " << endl;
    cout << "==============================================" << endl;
    cout << " Global Number of dof is " << matrixDIM_com << endl;
    cout << " Global Matrix dim is (w/o compress) " << tetraCNT * LocalDim * 2 << endl;
    cout << "==============================================" << endl;
    cout << " " << endl;
}

// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //
// 0000000000000000000000000000000000000 Port Meshes 00000000000000000000000000000000000000000000 //
// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //
/*
void FemGrp::makePortMeshes() { int i, j; if(portCNT == 0) return; map PortMap, PortMapRes; set PortSet; set::iterator it; int DGface_bc; for(int idx = 0; idx < tetraCNT; idx++){ for(j = 0 ; j < NumOfFaces ; j++){ DGface_bc = tetARRAY[idx].fc[j]->getbType(); if(DGface_bc >= portType && DGface_bc < pecType) PortSet.insert(DGface_bc); } } LocPortCnt = (int)PortSet.size(); cout << "PortSet.size = " << (int)PortSet.size() << endl; cout << "portCNT = " << portCNT << endl; cout << "LocPortCnt = " << LocPortCnt << endl; portCNT = LocPortCnt; cout << "portCNT = " << portCNT
<< endl; cout << "LocPortCnt = " << LocPortCnt << endl; if(LocPortCnt == 0) return; for(it = PortSet.begin(); it != PortSet.end(); it++) cout << "Port_type:" << *it << endl; int counter = 0; for(it = PortSet.begin(); it != PortSet.end(); it++){ PortMap[*it] = counter; PortMapRes[counter] = *it; counter++; } pMeshARRAY = new portMesh[LocPortCnt]; // count the port faces (portFaceNums) // get pointers to port faces (portFaceLists) // keep set of unique global node ids for faces (portNodeIds) int* portFaceNums = new int[LocPortCnt]; list* portFaceLists = new list[LocPortCnt]; set* portNodeIds = new set[LocPortCnt]; memset(portFaceNums, 0, portCNT * sizeof(int)); for(i = 0; i < faceCNT; i++){ int bType = faceARRAY[i]->getbType(); if((bType >= portType) && (bType != pecType)){ int portNum = PortMap.find(bType)->second; (portFaceNums[portNum])++; // increment the face count portFaceLists[portNum].push_back(faceARRAY[i]); // add face pointer // add unique node ids for(j = 0; j < NumOfNodesPerFace; j++) portNodeIds[portNum].insert(faceARRAY[i]->getNode(j)->getid()); } } for(i = 0; i < LocPortCnt; i++){ portMesh& portmesh = pMeshARRAY[i]; // set port name, magnitude and impedance for(j = 0; j < bcCNT; j++){ if(bcARRAY[j].getbType() == PortMapRes[i]){ portmesh.setName(bcARRAY[j].getName()); cout<<"This is " << portmesh.getName() << endl; portmesh.setMagE(bcARRAY[j].getMagE()); portmesh.setImpZ(bcARRAY[j].getCval()); break; } } // allocate and add face pointers to array int faceNum = portFaceNums[i]; portmesh.setFaceCnt(faceNum); if(faceNum > 0){ face** portFaceArray = portmesh.getFaceArray(); list::iterator faceListIter = portFaceLists[i].begin(); for(j = 0; j < faceNum; j++){ portFaceArray[j] = *faceListIter; faceListIter++; } // allocate and add node pointers to array // keep local mapping int nodeNum = portNodeIds[i].size(); portmesh.setNodeCnt(nodeNum); portmesh.allocGlobToLocMap(); node** portNodeArray = portmesh.getNodeArray(); map& globToLocMap = 
portmesh.getGlobToLocMap(); set::iterator portNodeIdIter; int nodeCount = 0; for(portNodeIdIter = portNodeIds[i].begin(); portNodeIdIter != portNodeIds[i].end(); portNodeIdIter++){ portNodeArray[nodeCount] = &(ndARRAY[*portNodeIdIter]); globToLocMap[ndARRAY[*portNodeIdIter].getid()] = nodeCount++; } // setup the remaining port mesh stuff scalingLength = 1.0; portmesh.makeCoordSystem(); portmesh.makeObjMap(); portmesh.readVline(unit); portmesh.writeMesh(objProp); cout.setf(ios::scientific); cout.precision(15); #if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL) vtr PortDirection_vtr = portmesh.getPortDirection(); excitationProp.PortDirection[0] = PortDirection_vtr.getx(); excitationProp.PortDirection[1] = PortDirection_vtr.gety(); excitationProp.PortDirection[2] = PortDirection_vtr.getz(); #endif } } delete [] portFaceNums; delete [] portFaceLists; delete [] portNodeIds; }
*/

/**
 * Builds one portMesh per port from the faces whose boundary type is portType.
 *
 * Pass 1 walks faceARRAY; for every port face it looks up the boundary
 * number through the owning tetrahedron, maps it to a port with bcNumToPnum
 * (1-based, hence the -1), and collects the face pointer and its node ids.
 * Pass 2 fills pMeshARRAY[p]: name / magnitude / impedance from bcARRAY,
 * the face array, the node array with its global-to-local map, and finally
 * the coordinate system, object map, voltage line and mesh file.
 *
 * Faces whose mapped port number falls outside [0, LocPortCnt) are reported
 * and skipped instead of being written past the per-port collectors.
 */
void FemGrp::makePortMeshes() {
    if (portCNT <= 0) return;

    LocPortCnt = portCNT;
    pMeshARRAY = new portMesh[LocPortCnt];

    // Collectors per port
    std::vector<int>                portFaceNums(LocPortCnt, 0);
    std::vector<std::list<face*> >  portFaceLists(LocPortCnt);
    std::vector<std::set<int> >     portNodeIds(LocPortCnt);

    // Pass 1: walk faces and collect them by portNum (via bcNumToPnum)
    for (int i = 0; i < faceCNT; ++i) {
        face* fc = faceARRAY[i];
        if (fc->getbType() != portType) continue; // only port faces

        // pick the valid owning tetra (check hydra pointers BEFORE deref)
        tetra* tet = nullptr;
        if (fc->hydra[0] != nullptr) {
            tet = fc->hydra[0];
        } else if (fc->hydra[1] != nullptr) {
            tet = fc->hydra[1];
        } else {
            continue; // no owner; defensive
        }

        // Find bc_number for THIS face inside its tetra (match same face by pointer)
        int bc_number = -1;
        for (int k = 0; k < NumOfFaces; ++k) {
            if (tet->fc[k] == fc) {
                bc_number = tet->getbc(k);
                break;
            }
        }
        if (bc_number < 0) continue;

        const int portNum = bcNumToPnum[bc_number] - 1;
        if (portNum < 0 || portNum >= LocPortCnt) {
            std::cerr << "makePortMeshes: face " << i << " (BCNum=" << bc_number
                      << ") maps to port index " << portNum << ", outside [0, " << LocPortCnt
                      << "); face skipped.\n";
            continue;
        }

        ++portFaceNums[portNum];
        portFaceLists[portNum].push_back(fc);
        for (int j = 0; j < NumOfNodesPerFace; ++j) {
            portNodeIds[portNum].insert(fc->getNode(j)->getid());
        }
    }

    // Optional: sanity check
    for (int p = 1; p < LocPortCnt+1; ++p) {
        std::cout << "Port " << p << " (BCNum=" << pnumToBcNum[p] << ") has " << portNodeIds[p-1].size() << " unique nodes and " << portFaceNums[p-1] << " faces.\n";
    }

    // Pass 2: finalize each port mesh
    for (int p = 0; p < LocPortCnt; ++p) {
        portMesh& portmesh = pMeshARRAY[p];

        // Initialize from bcARRAY using BCNum directly
        const int bc_number = pnumToBcNum[p+1];
        if (bc_number >= 0 && bc_number < bcCNT) {
            auto& rec = bcARRAY[bc_number];
            string name = rec.getName();
            fp_t magnitudeE = rec.getMagE();
            cout << "bc_number = " << bc_number << " name = " << name << " | magE = " << magnitudeE << endl;
            portmesh.setName(rec.getName());
            portmesh.setMagE(magnitudeE);
            portmesh.setImpZ(rec.getCval());
        }

        // Faces
        const int faceNum = portFaceNums[p];
        portmesh.setFaceCnt(faceNum);
        if (faceNum <= 0) continue;

        face** portFaceArray = portmesh.getFaceArray();
        int slot = 0;
        for (face* fc : portFaceLists[p]) {
            portFaceArray[slot++] = fc;
        }

        // Nodes + local map
        const int nodeNum = static_cast<int>(portNodeIds[p].size());
        portmesh.setNodeCnt(nodeNum);
        portmesh.allocGlobToLocMap();
        node** portNodeArray = portmesh.getNodeArray();
        auto& globToLocMap = portmesh.getGlobToLocMap();
        int nodeCount = 0;
        for (int gid : portNodeIds[p]) {
            // If ids aren't dense indices into ndARRAY, replace with your id->index lookup.
            portNodeArray[nodeCount] = &(ndARRAY[gid]);
            globToLocMap[ ndARRAY[gid].getid() ] = nodeCount++;
        }

        // Remaining setup
        scalingLength = 1.0;
        portmesh.makeCoordSystem();
        portmesh.makeObjMap();
        portmesh.readVline(unit);
        portmesh.writeMesh(objProp);
    }
}
/* void FemGrp::solveWaveguidePorts() { char command[1000]; memset(command, 0, 1000 * sizeof(char)); sprintf(command, "anwg_h1 %s %e 1 \n",pMeshARRAY->portName, freq); cout<<"=============Running Command:============"<* origEn_1_P2 = new ArrayFP(30); ArrayFP* origEn_1_P1 = new ArrayFP(12); ArrayFP* origEn_1_P0 = new ArrayFP(6); for(i = 0; i < portCNT; i++){ VoltEntryInc[i] = 0.0; VoltEntryTotal[i] = 0.0; } for(i = 0; i < portCNT; i++) { vtr VoltLine = pMeshARRAY[i].vline.coord[1] - pMeshARRAY[i].vline.coord[0]; vtr VoltLineUnit = pMeshARRAY[i].vline.coord[1] - pMeshARRAY[i].vline.coord[0]; VoltLineUnit.unitvtr(); h = VoltLine.magnitude() / GaussPnt; for(k = 0; k < GaussPnt; k++){ Point = pMeshARRAY[i].vline.coord[0] + VoltLineUnit * (k + 0.5) * h; //cout << "k = " << k << " FCCNT = " << pMeshARRAY[i].faceCNT << endl; for(j = 0; j < pMeshARRAY[i].faceCNT; j++){ IsOnFace = pMeshARRAY[i].fcArray[j]->PointInFace(Point, zeta0, zeta1, zeta2); zetaFace[0] = zeta0; zetaFace[1] = zeta1; zetaFace[2] = zeta2; if(IsOnFace == 1) { pMeshARRAY[i].fcArray[j]->getAreaNormal(&area, &Normal); PortDirection = pMeshARRAY[i].fcArray[j]->bcPtr->get_PortDirection(); if(dotP(Normal, PortDirection) < 0.0) tet = pMeshARRAY[i].fcArray[j]->hydra[0]; else tet = pMeshARRAY[i].fcArray[j]->hydra[1]; tet->geometry(lvtr, avtr, &vol); for(m = 0 ; m < 4; m++){ if(pMeshARRAY[i].fcArray[j] == tet->getFacePtr(m)) FaceNum = m; } avtr[3].reset(); avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]); // 0th order polynomial if(tet->PolyOrderFlag == 0){ tet->Local_DG_mapE(tetraMAP_P0, tet->LocalOffsetE); origEn_1_P0->reset(); for(int Cnt1 = 0 ; Cnt1 < 6 ; Cnt1++){ if(tetraMAP_P0[Cnt1] < 0)
origEn_1_P0->setentry(Cnt1, 0.0); else origEn_1_P0->setentry(Cnt1, en_1->getentry(tetraMAP_P0[Cnt1])); } }else if(tet->PolyOrderFlag == 1){ // 1st order polynomial tet->Local_DG_mapE(tetraMAP_P1, tet->LocalOffsetE); origEn_1_P1->reset(); for(int Cnt2 = 0 ; Cnt2 < 12 ; Cnt2++){ if(tetraMAP_P1[Cnt2] < 0) origEn_1_P1->setentry(Cnt2, 0.0); else origEn_1_P1->setentry(Cnt2, en_1->getentry(tetraMAP_P1[Cnt2])); } }else if(tet->PolyOrderFlag == 2){ // 2nd order polynomial tet->Local_DG_mapE(tetraMAP_P2, tet->LocalOffsetE); origEn_1_P2->reset(); for(int Cnt2 = 0 ; Cnt2 < 30 ; Cnt2++){ if(tetraMAP_P2[Cnt2] < 0) origEn_1_P2->setentry(Cnt2, 0.0); else origEn_1_P2->setentry(Cnt2, en_1->getentry(tetraMAP_P2[Cnt2])); } } for(m = 0 ; m < 4 ; m++){ zeta[m] = 0.0; } zeta[faceMAP[FaceNum][0]] = zetaFace[0]; zeta[faceMAP[FaceNum][1]] = zetaFace[1]; zeta[faceMAP[FaceNum][2]] = zetaFace[2]; // 0th order polynomial if(tet->PolyOrderFlag == 0){ Total_E_Local = CalcEfield(origEn_1_P0->getEntryPtr(), avtr, vol, zeta, tet->PolyOrderFlag); pMeshARRAY[i].fcArray[j]->SparEinc((timeStep + 1.0) * dt, Point, Inc_E_Local, zetaFace); }else if(tet->PolyOrderFlag == 1){// 1st order polynomial Total_E_Local = CalcEfield(origEn_1_P1->getEntryPtr(), avtr, vol, zeta, tet->PolyOrderFlag); pMeshARRAY[i].fcArray[j]->SparEinc((timeStep + 1.0) * dt, Point, Inc_E_Local, zetaFace); }else if(tet->PolyOrderFlag == 2){// 2nd order polynomial Total_E_Local = CalcEfield(origEn_1_P2->getEntryPtr(), avtr, vol, zeta, tet->PolyOrderFlag); pMeshARRAY[i].fcArray[j]->SparEinc((timeStep + 1.0) * dt, Point, Inc_E_Local, zetaFace); } EvalueTotal = dotP(Total_E_Local, VoltLineUnit); EvalueInc = dotP(Inc_E_Local, VoltLineUnit); VoltEntryInc[i] += - 1.0 * h * wgt * EvalueInc; VoltEntryTotal[i] += - 1.0 * h * wgt * EvalueTotal; } } } // Write a file with all the impendances of the ports if(timeStep == 0){ char Impedance_Log[180]; sprintf(Impedance_Log, "%s.ImpZ", fname); ofstream ImpedanceOutfile(Impedance_Log, ios_base::out); 
if(!ImpedanceOutfile) cout << "Error in opening file: " << Impedance_Log << " for write " << endl; for(i = 0 ; i < portCNT ; i++) ImpedanceOutfile << pMeshARRAY[i].impZ << " "; ImpedanceOutfile.close(); } // Write to file Vinc if(timeStep == 0) system("mkdir TimeDomainVoltages"); char IncVoltage_TimeLog[180]; ofstream IncVoltageOutfile; if(isCompact){ sprintf(IncVoltage_TimeLog, "./TimeDomainVoltages/%s.TD_Vinc", fname); IncVoltageOutfile.open(IncVoltage_TimeLog, ios_base::out | ios::app); }else{ sprintf(IncVoltage_TimeLog, "./TimeDomainVoltages/%s_%05d.TD_Vinc", fname, timeStep); IncVoltageOutfile.open(IncVoltage_TimeLog, ios_base::out); } IncVoltageOutfile.setf(ios::scientific, ios::floatfield); IncVoltageOutfile.precision(15); if(!IncVoltageOutfile) cout << "Error in opening file: " << IncVoltage_TimeLog << " for write " << endl; IncVoltageOutfile << (timeStep + 1.0) * dt << " "; for(i = 0 ; i < portCNT ; i++) IncVoltageOutfile << VoltEntryInc[i]<< " "; IncVoltageOutfile<>> found_tets = probes_bary.at(i).second; eField_all.reset(); hField_all.reset(); for (int t = 0; t < number_of_associated_tets; t++) { int tet_id = found_tets.at(t).first; array tri_bary_coord = found_tets.at(t).second; tetra& tet = tetARRAY[tet_id]; tet.geometry(lvtr, avtr, &vol); avtr[3].reset(); avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]); eField.reset(); hField.reset(); zeta[0] = static_cast(tri_bary_coord[0]); zeta[1] = static_cast(tri_bary_coord[1]); zeta[2] = static_cast(tri_bary_coord[2]); zeta[3] = static_cast(tri_bary_coord[3]); eField = CalcEfield(&En1_h[mapIdLoc[tet.getcnt()] * TetPolyOrderDim[PolyFlag]], avtr, vol, zeta, PolyFlag); hField = CalcEfield(&Hn32_h[mapIdLoc[tet.getcnt()] * TetPolyOrderDim[PolyFlag]], avtr, vol, zeta, PolyFlag); eField_all = eField_all + eField; hField_all = hField_all + hField; } eField_all = eField_all / ((fp_t) number_of_associated_tets); hField_all = hField_all / ((fp_t) number_of_associated_tets); if(usePade){ // && i < padeCNT int row = 
(int)(timeStep / tsPerSampling)* NumOfFieldComponents * probeCNT ;
            int column = i * NumOfFieldComponents;
            fieldProbes[row + column + 0] = eField_all.getx();
            fieldProbes[row + column + 1] = eField_all.gety();
            fieldProbes[row + column + 2] = eField_all.getz();
            fieldProbes[row + column + 3] = hField_all.getx();
            fieldProbes[row + column + 4] = hField_all.gety();
            fieldProbes[row + column + 5] = hField_all.getz();
        }
        if(padeCNT == 0 || writeWhilePade){
            const auto max_precision {std::numeric_limits::digits10 + 1};
            csvFile << std::setprecision(max_precision) << eField_all.getx() << "," << eField_all.gety() << "," << eField_all.getz() << "," << hField_all.getx() << "," << hField_all.gety() << "," << hField_all.getz() << "\n";
        }
    }
    if(padeCNT == 0 || writeWhilePade) {
        usleep(100);
        csvFile.close();
    }
}

// ----------------------------------------------------------------------
// Port-face centroid probes: one CSV per port, per timestep, folders
// ----------------------------------------------------------------------
/**
 * Writes, for every port, the E and H fields sampled at the centroid of each
 * excitation face to ./PortProbes/Port<p>/Port<p>_<timeStep>.csv.
 *
 * Each row holds the centroid position followed by the fields averaged over
 * the tetrahedra listed for that centroid in portFaceCentroid_bary. Faces
 * without any associated tetrahedron are skipped. Nothing is written when
 * there are no ports or the centroid buffers are missing.
 *
 * @param timeStep time step index, used in the file name
 */
void FemGrp::writePortFieldProbeCuBLAS(int timeStep) {
    if (portCNT <= 0 || !PortFacePidx_h || !PortFaceCentroid_h || portFaceCentroid_bary.empty()) {
        return;
    }

    fp_t vol;
    fp_t zeta[4];
    vtr lvtr[3];
    vtr avtr[4];
    vtr eField, hField;
    vtr eField_all, hField_all;

    const int localDim = TetPolyOrderDim[PolyFlag];
    const size_t baryCount = portFaceCentroid_bary.size();
    const int max_precision = std::numeric_limits<double>::digits10 + 1;

    // Base output directory and per-port subdirs (an already existing directory is fine)
    mkdir("./PortProbes", 0755);
    for (int pnum = 0; pnum < portCNT; ++pnum) {
        std::string portDir = "./PortProbes/Port" + std::to_string(pnum);
        mkdir(portDir.c_str(), 0755);

        // Open CSV for this port + timestep
        char pCsv[512];
        std::snprintf(pCsv, sizeof(pCsv), "%s/Port%d_%04d.csv", portDir.c_str(), pnum, timeStep);
        std::ofstream pcsv(pCsv);
        if (!pcsv.is_open()) {
            std::cerr << "Error opening file: " << pCsv << "\n";
            continue;
        }

        // Header: centroid only
        pcsv << "x1,y1,z1,Ex,Ey,Ez,Hx,Hy,Hz\n";
        pcsv << std::fixed << std::setprecision(max_precision);

        // Iterate all flattened excitation faces, pick those of this port
        for (int f = 0; f < excitationFaces; ++f) {
            if (PortFacePidx_h[f] != pnum) continue;
            // No barycentric search result stored for this face: nothing to sample.
            if ((size_t)f >= baryCount) continue;

            // Centroid position from buffer
            const fp_t_ts* C = &PortFaceCentroid_h[3*f];
            const double cx = static_cast<double>(C[0]);
            const double cy = static_cast<double>(C[1]);
            const double cz = static_cast<double>(C[2]);

            // Never read more entries than the list actually holds.
            const auto& found_tets = portFaceCentroid_bary[f].second;
            int nAssoc = (int)portFaceCentroid_bary[f].first;
            if (nAssoc > (int)found_tets.size()) {
                nAssoc = (int)found_tets.size();
            }
            if (nAssoc <= 0) {
                continue;
            }

            // Average E/H over owning tets (same pattern as node probes)
            eField_all.reset();
            hField_all.reset();
            for (int t = 0; t < nAssoc; ++t) {
                const int tet_id = found_tets[t].first;
                const auto& b = found_tets[t].second;   // barycentric coordinates inside tet_id
                tetra& tet = tetARRAY[tet_id];
                tet.geometry(lvtr, avtr, &vol);
                avtr[3].reset();
                avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);
                zeta[0] = (fp_t)b[0];
                zeta[1] = (fp_t)b[1];
                zeta[2] = (fp_t)b[2];
                zeta[3] = (fp_t)b[3];
                eField = CalcEfield(&En1_h[mapIdLoc[tet.getcnt()] * localDim], avtr, vol, zeta, PolyFlag);
                hField = CalcEfield(&Hn32_h[mapIdLoc[tet.getcnt()] * localDim], avtr, vol, zeta, PolyFlag);
                eField_all = eField_all + eField;
                hField_all = hField_all + hField;
            }
            eField_all = eField_all / ((fp_t)nAssoc);
            hField_all = hField_all / ((fp_t)nAssoc);

            // Write one row: centroid + averaged fields
            pcsv << cx << "," << cy << "," << cz << ","
                 << eField_all.getx() << "," << eField_all.gety() << "," << eField_all.getz() << ","
                 << hField_all.getx() << "," << hField_all.gety() << "," << hField_all.getz() << "\n";
        }
        pcsv.close();
    }
}

/**
 * Writes the nodal E and H fields of the whole mesh to
 * ./VTU_LTS/<fname>_<timeStep> through VtkWriter.
 *
 * The field is evaluated at the four vertices of every tetrahedron and
 * averaged per node over all tetrahedra sharing it. Nodes that no
 * tetrahedron references keep a zero field instead of being divided by zero.
 *
 * NOTE(review): the coefficient offset uses TetPolyOrderDim[PolyFlag] while
 * the evaluation uses tet.PolyOrderFlag; this is kept as found -- confirm it
 * is intended for meshes with mixed polynomial orders.
 *
 * @param timeStep time step index, used in the file name
 */
void FemGrp::writeFieldGlobalCuBLAS(int timeStep){
    fp_t vol;
    fp_t zeta[4];
    vtr lvtr[3];
    vtr avtr[4];
    vtr eLocal[4];
    vtr hLocal[4];

    vtr* eField = new vtr[nodeCNT];
    vtr* hField = new vtr[nodeCNT];
    int* count = new int[nodeCNT];
    int* polyOrder = new int[tetraCNT];

    // vtr's default constructor is not visible from here, so the accumulators
    // are zeroed explicitly before being summed into.
    for(int i = 0; i < nodeCNT; i++){
        eField[i].reset();
        hField[i].reset();
        count[i] = 0;
    }

    const int localDim = TetPolyOrderDim[PolyFlag];
    for(int i = 0; i < tetraCNT; i++){
        tetra& tet = tetARRAY[i];
        polyOrder[i] = tet.PolyOrderFlag;
        tet.geometry(lvtr, avtr, &vol);
        avtr[3].reset();
        avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);

        for(int j = 0; j < 4; j++){
            zeta[0] = BaryCoord[j][0];
            zeta[1] = BaryCoord[j][1];
            zeta[2] = BaryCoord[j][2];
            zeta[3] = BaryCoord[j][3];
            eLocal[j] = CalcEfield(&En1_h[mapIdLoc[tet.getcnt()] * localDim], avtr, vol, zeta, tet.PolyOrderFlag);
            hLocal[j] = CalcEfield(&Hn32_h[mapIdLoc[tet.getcnt()] * localDim], avtr, vol, zeta, tet.PolyOrderFlag);

            const int index = tet.nd[j]->getid();
            eField[index] = eField[index] + eLocal[j] /*- Einc*/;
            hField[index] = hField[index] + hLocal[j] /*- Hinc*/;
            count[index] += 1;
        }
    }

    for(int i = 0; i < nodeCNT; i++){
        if(count[i] == 0){
            continue;   // unreferenced node: keep the zero field
        }
        eField[i] = eField[i] / static_cast<fp_t>(count[i]);
        hField[i] = hField[i] / static_cast<fp_t>(count[i]);
    }

    VtkWriter vtkWriter(1.0);
    // VtkWriter vtkWriter(unit);
    char vtkFilePrefix[128];
    memset(vtkFilePrefix, 0, 128 * sizeof(char));
    // snprintf: fname has unknown length, never write past the buffer.
    snprintf(vtkFilePrefix, sizeof(vtkFilePrefix), "./VTU_LTS/%s_%04d", fname, timeStep);
    vtkWriter.writeField(vtkFilePrefix, nodeCNT, ndARRAY, tetraCNT, tetARRAY, eField, hField, polyOrder, 0, 0); //TODO: why here polyorder is not 1

    delete [] eField;
    delete [] hField;
    delete [] count;
    delete [] polyOrder;
}

/**
 * Normalizes the accumulated fieldEnergy by
 * numberOfEnergyPoints * NumOfSampleEnergyCheck, updates the running maximum
 * maxFieldEnergy, and reports whether the energy has dropped below
 * energyDecayFactor times that maximum.
 *
 * @return true when fieldEnergy < energyDecayFactor * maxFieldEnergy
 */
bool FemGrp::checkEnergyDecay(){
    fieldEnergy /= numberOfEnergyPoints * NumOfSampleEnergyCheck;
    maxFieldEnergy = max(maxFieldEnergy, fieldEnergy);
    return (fieldEnergy < energyDecayFactor * maxFieldEnergy);
}

////////////////////////////////////////////////////////////////////////////////////////////////////////
// Organize GPU Memory
////////////////////////////////////////////////////////////////////////////////////////////////////////
void FemGrp::PrepareGPUcuBLAS() {
    tetra* tet;
    int cntAux;
    ////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Prepare Excitation Info
//////////////////////////////////////////////////////////////////////////////////////////////////////// int exciCNT = 0; for(int i = 0; i < N_class; i ++) { exciCNT += ClassExcitationCount[i]; } CUDA_SAFE_CALL(cudaMallocHost((void**)&ExcitationFacesCnt_h, exciCNT * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&ExcitationFacesOffset_h, exciCNT * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&ExcitationFacesNum_h, excitationFaces * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&nd_coords_tet_h, NumOfUnitaryVectors * NumOfNodes * exciCNT * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&nd_coords_face_h, NumOfUnitaryVectors * NumOfNodesPerFace * excitationFaces * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&mapE_h, exciCNT * TetPolyOrderDim[PolyFlag] * sizeof(int8_t), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&mapH_h, exciCNT * TetPolyOrderDim[PolyFlag] * sizeof(int8_t), cudaHostAllocMapped)); // for(int i = 0; i < exciCNT * TetPolyOrderDim[PolyFlag]; i++){ // mapE_h[i] = 1; // mapH_h[i] = 1; // } // =============================================== // Allocate storage for port fields // =============================================== const int Q = GAUSS_POINT_NUM_h[PolyFlag]; // same as GPU kernel uses cout << "excitationFaces = " << excitationFaces << endl; cout << "exciCNT = " << exciCNT << endl; if (portCNT > 0) { CUDA_SAFE_CALL(cudaMallocHost((void**)&Etan_qp_h, excitationFaces * Q * 3 * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Htan_qp_h, excitationFaces * Q * 3 * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&PortFacePidx_h, excitationFaces * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&PortFaceCentroid_h, excitationFaces * 3 * sizeof(fp_t_ts), cudaHostAllocMapped)); 
CUDA_SAFE_CALL(cudaMallocHost((void**)&Etan_center_h, excitationFaces * 3 * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Htan_center_h, excitationFaces * 3 * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&FaceID_excitation_h, excitationFaces * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&TetID_excitation_h, excitationFaces * sizeof(int), cudaHostAllocMapped)); } // =============================================== // Allocated Impedance for Planewave // =============================================== cout << "PlaneWaveBCFlag = " << PlaneWaveBCFlag << endl; cout << "Number of Ports = " << portCNT << endl; if(PlaneWaveBCFlag) { CUDA_SAFE_CALL(cudaMallocHost((void**)&Z_face_pw_h, excitationFaces * sizeof(fp_t_ts), cudaHostAllocMapped)); } excitationFaces = 0; exciCNT = 0; for (int i = 0; i < N_class; i ++) { cout << "\nN CLASS = " << i << endl; for(int j = 0; j < ClassExcitationCount[i]; j ++) { tet = &(tetARRAY[ClassTetraIndex[i][j]]); cout << ClassTetraIndex[i][j] << " "; for(int k = 0; k < TetPolyOrderDim[PolyFlag]; k++) { mapE_h[exciCNT * TetPolyOrderDim[PolyFlag] + k] = (tet->LocMapE[k] < 0 ? 0 : 1); mapH_h[exciCNT * TetPolyOrderDim[PolyFlag] + k] = (tet->LocMapH[k] < 0 ? 
0 : 1); } ExcitationFacesOffset_h[exciCNT] = excitationFaces; for(int k = 0; k < NumOfFaces; k++) { for(int node = 0; node < NumOfNodes; node++) { nd_coords_tet_h[NumOfUnitaryVectors * (NumOfNodes * exciCNT + node) + 0] = tet->nd[node]->getCoord().getx(); nd_coords_tet_h[NumOfUnitaryVectors * (NumOfNodes * exciCNT + node) + 1] = tet->nd[node]->getCoord().gety(); nd_coords_tet_h[NumOfUnitaryVectors * (NumOfNodes * exciCNT + node) + 2] = tet->nd[node]->getCoord().getz(); //cout << "TET ID = " << tet->getcnt() << " Face ID = " << tet->fc[k]->getcnt() << " BC = " << tet->fc[k]->bcPtr->getbType() << endl; //cout << tet->nd[node]->getCoord().getx() << " " << tet->nd[node]->getCoord().gety() << " " << tet->nd[node]->getCoord().getz() << endl; } int DGface_bc = tet->fc[k]->bcPtr->getbType(); if(DGface_bc == planeWaveType || DGface_bc == portType || DGface_bc == pmlType) { ExcitationFacesNum_h[excitationFaces] = k; for(int node = 0; node < NumOfNodesPerFace; node++) { nd_coords_face_h[NumOfUnitaryVectors * (NumOfNodesPerFace * excitationFaces + node) + 0] = tet->fc[k]->nd[node]->getCoord().getx(); nd_coords_face_h[NumOfUnitaryVectors * (NumOfNodesPerFace * excitationFaces + node) + 1] = tet->fc[k]->nd[node]->getCoord().gety(); nd_coords_face_h[NumOfUnitaryVectors * (NumOfNodesPerFace * excitationFaces + node) + 2] = tet->fc[k]->nd[node]->getCoord().getz(); //cout << tet->fc[k]->nd[node]->getCoord().getx() << " , " // << tet->fc[k]->nd[node]->getCoord().gety() << " , " // << tet->fc[k]->nd[node]->getCoord().getz() << endl; } cout << "\n"; if(PlaneWaveBCFlag) { Z_face_pw_h[excitationFaces] = No * sqrt(tet->mat->mur.getEntry(0,0) / tet->mat->epsr.getEntry(0,0)); } excitationFaces++; } } ExcitationFacesCnt_h[exciCNT] = excitationFaces - ExcitationFacesOffset_h[exciCNT]; exciCNT++; } } cout << " exciCNT = " << exciCNT << endl; // To save the current time step through the execution LocalExciIndexE = new int[N_class]; LocalExciIndexH = new int[N_class]; for(int i = 0; i < N_class; 
i ++) { LocalExciIndexE[i] = 0; LocalExciIndexH[i] = 0; } //////////////////////////////////////////////////////////////////////////////////////////////////////// // Create the fields at the HOST (only the ones that we will use to calculate the fields at the probes) //////////////////////////////////////////////////////////////////////////////////////////////////////// int sizeField = TetPolyOrderDim[PolyFlag] * tetraCNT; CUDA_SAFE_CALL(cudaMallocHost((void**)&En1_h, sizeField * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Hn32_h, sizeField * sizeof(fp_t_ts), cudaHostAllocMapped)); //////////////////////////////////////////////////////////////////////////////////////////////////////// // For Regular Tetrahedras //////////////////////////////////////////////////////////////////////////////////////////////////////// flag1 = true; // ---- Helpers ---- // Check for overflow auto safe_add = [](int a, int b) -> int { if ((b > 0 && a > INT_MAX - b) || (b < 0 && a < INT_MIN - b)) { fprintf(stderr, "Integer overflow in addition (%d + %d)\n", a, b); abort(); } return a + b; }; // Check if index is within range auto check_idx = [&](int idx, int lo, int hi, const char* what) { if (idx < lo || idx > hi) { fprintf(stderr, "Index out of range for %s: %d (expected [%d, %d])\n", what, idx, lo, hi); abort(); } }; // Check for null pointer auto check_ptr = [&](void* p, const char* what) { if (!p) { fprintf(stderr, "Null pointer: %s\n", what); abort(); } }; // ---- Allocations (pinned) ---- CUDA_SAFE_CALL(cudaMallocHost((void**)&classregNeighPML_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularTetraCnt_h, (size_t)N_class * regularCNT * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classIrregularTetraOffset_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classPMLTetraOffset_h, (size_t)N_class * sizeof(int), 
cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighIrregular_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighIrregularOffset_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighPML_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighPMLOffset_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classTetraOffset_loc_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighOffset_loc_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classPMLTetraOffset_loc_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighPMLOffset_loc_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&nonRegularTetraCnt_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&nonRegularPMLTetraCnt_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped)); // Always allocate these “per-class meta” arrays irrespective of regularTetraCNT, // so we can safely write zeros even if there are no regulars. 
CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularGroupsCnt_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLGroupsCnt_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped)); // These hold per-class pointers allocated later per class; init to nullptr CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularTetraOffset_h, (size_t)N_class * sizeof(int*), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularGroupsId_h, (size_t)N_class * sizeof(int*), cudaHostAllocMapped)); for (int i = 0; i < N_class; ++i) { classRegularTetraOffset_h[i] = nullptr; classRegularGroupsId_h[i] = nullptr; } CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLTetraOffset_h, (size_t)N_class * sizeof(int*), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLTetraFaceOffset_h, (size_t)N_class * sizeof(int*), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLGroupsId_h, (size_t)N_class * sizeof(int*), cudaHostAllocMapped)); for (int i = 0; i < N_class; ++i) { classRegularPMLTetraOffset_h[i] = nullptr; classRegularPMLGroupsId_h[i] = nullptr; classRegularPMLTetraFaceOffset_h[i] = nullptr; } // Per group (global) CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularGroupsNeighCnt_h, (size_t)regularCNT * sizeof(int), cudaHostAllocMapped)); // ---- Zero-init everything deterministically ---- memset(classRegularTetraCnt_h, 0, (size_t)N_class * regularCNT * sizeof(int)); memset(classIrregularTetraOffset_h, 0, (size_t)N_class * sizeof(int)); memset(classPMLTetraOffset_h, 0, (size_t)N_class * sizeof(int)); memset(classNeighIrregular_h, 0, (size_t)N_class * sizeof(int)); memset(classNeighIrregularOffset_h, 0, (size_t)N_class * sizeof(int)); memset(classNeighPML_h, 0, (size_t)N_class * sizeof(int)); memset(classregNeighPML_h, 0, (size_t)N_class * sizeof(int)); memset(classNeighPMLOffset_h, 0, (size_t)N_class * sizeof(int)); memset(classTetraOffset_loc_h, 0, (size_t)N_class * 
sizeof(int)); memset(classNeighOffset_loc_h, 0, (size_t)N_class * sizeof(int)); memset(classPMLTetraOffset_loc_h, 0, (size_t)N_class * sizeof(int)); memset(classNeighPMLOffset_loc_h, 0, (size_t)N_class * sizeof(int)); memset(nonRegularTetraCnt_h, 0, (size_t)N_class * sizeof(int)); memset(nonRegularPMLTetraCnt_h, 0, (size_t)N_class * sizeof(int)); memset(classRegularGroupsCnt_h, 0, (size_t)N_class * sizeof(int)); memset(classRegularPMLGroupsCnt_h, 0, (size_t)N_class * sizeof(int)); memset(classRegularGroupsNeighCnt_h, 0, (size_t)regularCNT * sizeof(int)); // ---- Locals ---- std::set ID_aux, ID_aux_PML; totalRegularNeighFaceCnt = 0; totalRegularPMLNeighFaceCnt = 0; numRegTetras = 0; numRegPMLTetras = 0; int irregularTetras = 0; int irregularNeighbours= 0; int PMLTetras = 0; int PMLNeighbours = 0; // ---- Main loop ---- for (int i = 0; i < N_class; ++i) { // Safe offsets (depend on previous class) if (i == 0) { classIrregularTetraOffset_h[i] = 0; classNeighIrregularOffset_h[i] = 0; } else { // read-only of previous indices is safe now int prev = i - 1; check_idx(prev, 0, N_class-1, "prev class index"); // Prevent overflow and guarantee non-negative int pml_tetra_off = classPMLTetraOffset_h[prev]; int pml_tetra_cnt = ClassPMLTetraCnt[prev]; int pml_neigh_off = classNeighPMLOffset_h[prev]; int pml_neigh_cnt = classNeighPML_h[prev]; int reg_neigh_cnt = classregNeighPML_h[prev]; if (pml_tetra_off < 0 || pml_tetra_cnt < 0 || pml_neigh_off < 0 || pml_neigh_cnt < 0) { fprintf(stderr, "Negative offsets/cnts detected for prev class %d\n", prev); abort(); } classIrregularTetraOffset_h[i] = pml_tetra_off + pml_tetra_cnt; classNeighIrregularOffset_h[i] = pml_neigh_off + pml_neigh_cnt + reg_neigh_cnt; } classTetraOffset_loc_h[i] = irregularTetras; classNeighOffset_loc_h[i] = irregularNeighbours; int totalNeighbors = 0; // ----- Non-PML tetras in class i ----- for (int j = 0; j < ClassTetraCnt[i]; ++j) { int tIdx = ClassTetraIndex[i][j]; tet = &(tetARRAY[tIdx]); check_ptr(tet, 
"tet ptr"); int group_ID = tet->getRegularGroup(); // Count per class and group classRegularTetraCnt_h[i * regularCNT + group_ID]++; int neigh = tet->get_NeighNum(); if (group_ID == 0) { nonRegularTetraCnt_h[i]++; irregularTetras++; irregularNeighbours += neigh; classNeighIrregular_h[i] += neigh; totalNeighbors += neigh; } else { ID_aux.insert(group_ID); classRegularGroupsNeighCnt_h[group_ID] = neigh; totalRegularNeighFaceCnt += neigh; numRegTetras++; totalNeighbors += neigh; } } // ----- Build per-class arrays for REGULAR groups ----- if (!ID_aux.empty()) { int G = (int)ID_aux.size(); classRegularGroupsCnt_h[i] = G; CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularGroupsId_h[i], (size_t)G * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularTetraOffset_h[i], (size_t)G * sizeof(int), cudaHostAllocMapped)); check_ptr(classRegularGroupsId_h[i], "classRegularGroupsId_h[i]"); check_ptr(classRegularTetraOffset_h[i], "classRegularTetraOffset_h[i]"); cout << "Regular Tet group = " << endl; int cntAux = 0; for (int ID : ID_aux) { classRegularGroupsId_h[i][cntAux] = ID; cout << ID << endl; if (cntAux == 0) { classRegularTetraOffset_h[i][0] = 0; } else { int prevID = classRegularGroupsId_h[i][cntAux - 1]; int prevCnt = classRegularTetraCnt_h[i * regularCNT + prevID]; classRegularTetraOffset_h[i][cntAux] = classRegularTetraOffset_h[i][cntAux - 1] + prevCnt; } cntAux++; } ID_aux.clear(); } else { classRegularGroupsCnt_h[i] = 0; } // ----- PML part ----- if (PML_flag) { classPMLTetraOffset_h[i] = classIrregularTetraOffset_h[i] + ClassTetraCnt[i]; classNeighPML_h[i] = 0; classNeighPMLOffset_h[i] = classNeighIrregularOffset_h[i] + totalNeighbors; classPMLTetraOffset_loc_h[i] = PMLTetras; classNeighPMLOffset_loc_h[i] = PMLNeighbours; cout << "classNeighPMLOffset_loc_h[" << i << "] =" << classNeighPMLOffset_loc_h[i] << endl; //cout << "classNeighPMLOffset_loc_h[" << i << "] =" << classNeighPMLOffset_loc_h[i] << endl; cout << " 
classPMLTetraOffset_loc_h[ " << i << "] " << classPMLTetraOffset_loc_h[i] << endl; int pml_cnt = ClassPMLTetraCnt[i]; check_idx(pml_cnt, 0, INT_MAX, "ClassPMLTetraCnt[i]"); for (int j = 0; j < pml_cnt; ++j) { int idx = safe_add(ClassTetraCnt[i], j); int tIdx = ClassTetraIndex[i][idx]; tet = &(tetARRAY[tIdx]); check_ptr(tet, "tet ptr (PML)"); int group_ID = tet->getRegularGroup(); classRegularTetraCnt_h[i * regularCNT + group_ID]++; int neigh = tet->get_NeighNum(); if (group_ID == 0) { nonRegularPMLTetraCnt_h[i]++; PMLTetras = safe_add(PMLTetras, 1); PMLNeighbours = safe_add(PMLNeighbours, neigh); classNeighPML_h[i] = safe_add(classNeighPML_h[i], neigh); } else { ID_aux_PML.insert(group_ID); classRegularGroupsNeighCnt_h[group_ID] = neigh; totalRegularPMLNeighFaceCnt = safe_add(totalRegularPMLNeighFaceCnt, neigh); numRegPMLTetras = safe_add(numRegPMLTetras, 1); classregNeighPML_h[i] += neigh; } } cout << "PMLNeighbours = " << PMLNeighbours << endl; } // ----- Build per-class arrays for REGULAR PML groups ----- if (PML_flag) { if (!ID_aux_PML.empty()) { int Gp = (int)ID_aux_PML.size(); classRegularPMLGroupsCnt_h[i] = Gp; CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLGroupsId_h[i], (size_t)Gp * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLTetraOffset_h[i], (size_t)Gp * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLTetraFaceOffset_h[i], (size_t)Gp * sizeof(int), cudaHostAllocMapped)); check_ptr(classRegularPMLGroupsId_h[i], "classRegularPMLGroupsId_h[i]"); check_ptr(classRegularPMLTetraOffset_h[i], "classRegularPMLTetraOffset_h[i]"); check_ptr(classRegularPMLTetraFaceOffset_h[i], "classRegularPMLTetraFaceOffset_h[i]"); cout << "Regular PML Tet group = " << endl; int cntAux = 0; for (int ID : ID_aux_PML) { cout << ID << endl; classRegularPMLGroupsId_h[i][cntAux] = ID; if (cntAux == 0) { classRegularPMLTetraOffset_h[i][0] = 0; classRegularPMLTetraFaceOffset_h[i][0] = 0; } else 
{ int prevID = classRegularPMLGroupsId_h[i][cntAux - 1]; int prevCnt = classRegularTetraCnt_h[i * regularCNT + prevID]; classRegularPMLTetraOffset_h[i][cntAux] = classRegularPMLTetraOffset_h[i][cntAux - 1] + prevCnt; int neigh = classRegularGroupsNeighCnt_h[prevID]; int num_element = classRegularTetraCnt_h[i * regularCNT + prevID]; int number_neigh = neigh * num_element; classRegularPMLTetraFaceOffset_h[i][cntAux] = classRegularPMLTetraFaceOffset_h[i][cntAux-1] + number_neigh; } cntAux++; } ID_aux_PML.clear(); } else { classRegularPMLGroupsCnt_h[i] = 0; } } } // ---- Final tallies ---- nonregularCNT_Normal = irregularTetras; nonregularCNT_PML = PMLTetras; num_elements_regular_PML = numRegPMLTetras; cout << "nonregularCNT_Normal = " << nonregularCNT_Normal << endl; cout << "nonregularCNT_PML = " << nonregularCNT_PML << endl; cout << "num_elements_regular_PML = " << num_elements_regular_PML << endl; //////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Create the matrices for the regular groups (4 sets per regular group): // - Loc1E/Loc1H: local matrices that are multiplied by the same type of field (Field1 = Loc1 * Field0 + ...) // - Loc2E/Loc2H: local matrices that are multiplied by the opposite type of field (E1 = Loc2E * H1_2 + ...) 
// - Neigh1E/Neigh1H: matrices related to the neighbors opposite filed // - Neigh2E/Neigh2H: matrices related to the neighbors same filed // // *** NOTE: each of these matrices is Column-Major Order // *** NOTE: since they are regular, we assume that the elements are conformal and with 4 neighbours //////////////////////////////////////////////////////////////////////////////////////////////////////////////// int localMatrixSize = TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]; int neighMatrixSize = TetPolyOrderDim[PolyFlag] * FacePolyOrderDim[PolyFlag]; cout << "--------------------------------------------------------------------------------------------------" << endl; cout << "regularCNT_Normal = " << regularCNT_Normal << endl; cout << "totalRegularNeighFaceCnt = " << totalRegularNeighFaceCnt << endl; if(regularRegionFlag && regularCNT_Normal > 0) { cout << "========== FILLING regular ===============" << endl; CUDA_SAFE_CALL(cudaMallocHost((void**)®ularLoc1E_h, regularCNT_Normal * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularLoc2E_h, regularCNT_Normal * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularLoc1H_h, regularCNT_Normal * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularLoc2H_h, regularCNT_Normal * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularNeigh1E_h, totalRegularNeighFaceCnt * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularNeigh2E_h, totalRegularNeighFaceCnt * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularNeigh1H_h, totalRegularNeighFaceCnt * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularNeigh2H_h, totalRegularNeighFaceCnt * neighMatrixSize * sizeof(fp_t_ts), 
cudaHostAllocMapped)); int localPosition = 0; int couplingPosition = 0; for(int i = 1; i < regularCNT_Normal+1; i++) { cout << "Group " << i << endl; tet = &(tetARRAY[regionARRAY[i]]); tet->prepareCuBLAS(®ularLoc1E_h[localPosition], ®ularLoc2E_h[localPosition], ®ularNeigh1E_h[couplingPosition], ®ularNeigh2E_h[couplingPosition], nullptr, ®ularLoc1H_h[localPosition], ®ularLoc2H_h[localPosition], ®ularNeigh1H_h[couplingPosition], ®ularNeigh2H_h[couplingPosition], nullptr); localPosition += localMatrixSize; couplingPosition += classRegularGroupsNeighCnt_h[i] * neighMatrixSize; } } cout << "Complete regular matrices preparation" << endl; cout << "--------------------------------------------------------------------------------------------------" << endl; cout << "regularCNT_PML = " << regularCNT_PML << endl; cout << "totalRegularPMLNeighFaceCnt = " << totalRegularPMLNeighFaceCnt << endl; if(regularRegionFlag && regularCNT_PML > 0) { cout << "========== FILLING regular PML ===============" << endl; CUDA_SAFE_CALL(cudaMallocHost((void**)®ularPMLLoc1E_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularPMLLoc2E_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularPMLLoc1H_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularPMLLoc2H_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularPMLNeigh1E_h, totalRegularPMLNeighFaceCnt * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularPMLNeigh2E_h, totalRegularPMLNeighFaceCnt * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularPMLNeigh1H_h, totalRegularPMLNeighFaceCnt * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); 
CUDA_SAFE_CALL(cudaMallocHost((void**)®ularPMLNeigh2H_h, totalRegularPMLNeighFaceCnt * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularPMLAuxE_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularPMLAuxH_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularPMLLoc1M_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularPMLLoc2M_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularPMLLoc1J_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)®ularPMLLoc2J_h, regularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); int localPosition = 0; int couplingPosition = 0; for(int i = regularCNT_Normal+1; i < regularCNT_Normal+regularCNT_PML+1; i++) { cout << "Group " << i << endl; tet = &(tetARRAY[regionARRAY[i]]); cout << "------------" << endl; tet->prepareCuBLAS_PML(®ularPMLLoc1E_h[localPosition], ®ularPMLLoc2E_h[localPosition], ®ularPMLNeigh1E_h[couplingPosition], ®ularPMLNeigh2E_h[couplingPosition], ®ularPMLLoc1H_h[localPosition], ®ularPMLLoc2H_h[localPosition], ®ularPMLNeigh1H_h[couplingPosition], ®ularPMLNeigh2H_h[couplingPosition], ®ularPMLAuxE_h[localPosition], ®ularPMLAuxH_h[localPosition], ®ularPMLLoc1M_h[localPosition], ®ularPMLLoc2M_h[localPosition], ®ularPMLLoc1J_h[localPosition],®ularPMLLoc2J_h[localPosition]); localPosition += localMatrixSize; couplingPosition += classRegularGroupsNeighCnt_h[i] * neighMatrixSize; } } cout << "Complete regular PML matrices preparation" << endl; cout << "--------------------------------------------------------------------------------------------------" << endl; 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Calculate the neighbors (number per position + offset) so we know the number of matrices that we are going to need // Also, we generate an array that is going to map the ID and the order //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// cout << "Neighbor matrices preparation" << endl; cout << "tetraCNT = " << tetraCNT << endl; int neighCNT = 0; CUDA_SAFE_CALL(cudaMallocHost((void**)&mapIdLoc, tetraCNT * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Neighbours_h, tetraCNT * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&NeighboursOffset_h, tetraCNT * sizeof(int), cudaHostAllocMapped)); cntAux = 0; for(int i = 0; i < N_class; i++) { for(int j = 0; j < ClassTetraCnt[i] + ClassPMLTetraCnt[i]; j++) { tet = &(tetARRAY[ClassTetraIndex[i][j]]); mapIdLoc[ClassTetraIndex[i][j]] = cntAux; Neighbours_h[cntAux] = tet->get_NeighNum(); NeighboursOffset_h[cntAux] = neighCNT; neighCNT += tet->get_NeighNum(); cntAux++; } } cout << "cntAux = " << cntAux << endl; CUDA_SAFE_CALL(cudaMallocHost((void**)&NeighMap_h, neighCNT * FacePolyOrderDim[PolyFlag] * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&NeighClass_h, N_class * sizeof(int), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&NeighClassOffset_h, N_class * sizeof(int), cudaHostAllocMapped)); int maxNeighClass = 0; neighCNT = 0; cntAux = 0; for(int i = 0; i < N_class; i++) { NeighClassOffset_h[i] = neighCNT; //cout << "====== Class " << i << endl; //cout << "Non-PML " << endl; for(int j = 0; j < ClassTetraCnt[i]; j++) { tet = &(tetARRAY[ClassTetraIndex[i][j]]); bool isPML = tet->get_PML_Flag(); //cout << "TET = " << ClassTetraIndex[i][j] << " | PML = " << isPML << endl; for(int neigh = 0; neigh < tet->get_NeighNum(); neigh++) { tetra* 
neighbor = tet->get_NeighborTetra(neigh); int neighFace = tet->getNeighFace(neighbor); int offset = mapIdLoc[neighbor->getcnt()] * TetPolyOrderDim[PolyFlag]; int neighID = mapIdLoc[neighbor->getcnt()]; bool isPML2 = neighbor->get_PML_Flag(); //cout << "TET = " << neighID << " | PML = " << isPML2 << endl; for(int k = 0; k < FacePolyOrderDim[PolyFlag]; k++) { NeighMap_h[cntAux++] = offset + fac2tet[neighFace][k]; } } neighCNT += tet->get_NeighNum(); } for(int j = ClassTetraCnt[i]; j < ClassTetraCnt[i] + ClassPMLTetraCnt[i]; j++) { tet = &(tetARRAY[ClassTetraIndex[i][j]]); bool isPML = tet->get_PML_Flag(); for(int neigh = 0; neigh < tet->get_NeighNum(); neigh++) { tetra* neighbor = tet->get_NeighborTetra(neigh); int neighFace = tet->getNeighFace(neighbor); int offset = mapIdLoc[neighbor->getcnt()] * TetPolyOrderDim[PolyFlag]; int neighID = mapIdLoc[neighbor->getcnt()]; bool isPML2 = neighbor->get_PML_Flag(); for(int k = 0; k < FacePolyOrderDim[PolyFlag]; k++) { NeighMap_h[cntAux++] = offset + fac2tet[neighFace][k]; } } neighCNT += tet->get_NeighNum(); } NeighClass_h[i] = neighCNT - NeighClassOffset_h[i]; maxNeighClass = (int)std::max(maxNeighClass, NeighClass_h[i]); } cout << "Complete Neighbor matrices preparation" << endl; cout << "neighCNT = " << neighCNT << endl; cout << "--------------------------------------------------------------------------------------------------" << endl; //////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Create the matrices (4 sets per field + inverse for exited elements): // - Loc1E/Loc1H: local matrices that are multiplied by the same type of field (Field1 = Loc1 * Field0 + ...) // - Loc2E/Loc2H: local matrices that are multiplied by the opposite type of field (E1 = Loc2E * H1_2 + ...) 
// - Neigh1E/Neigh1H: matrices related to the neighbors opposite filed // - Neigh2E/Neigh2H: matrices related to the neighbors same filed // - InvE_h/InvH_h: inverse Mass matrices (only for excited terms) // // *** NOTE: each of these matrices is Column-Major Order *** //////////////////////////////////////////////////////////////////////////////////////////////////////////////// cout << "Excitation preparation" << endl; cout << "exciCNT = " << exciCNT << endl; if (nonregularCNT_Normal > 0) { cout << "========== FILLING Irregular ===============" << endl; CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc1E_h, irregularTetras * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc2E_h, irregularTetras * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc1H_h, irregularTetras * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc2H_h, irregularTetras * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh1E_h, irregularNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh2E_h, irregularNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh1H_h, irregularNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh2H_h, irregularNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&InvE_h, exciCNT * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&InvH_h, exciCNT * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); std::cout << "Begin irregular CuBLAS preparation" << std::endl; std::cout << "N_class = " << N_class << std::endl; cout << "irregularTetras = " << irregularTetras << endl; cout << 
"nonregularCNT_Normal = " << nonregularCNT_Normal << endl; exciCNT = 0; irregularTetras = 0; irregularNeighbours = 0; //NOTE: this only works because of the order of the tetras in ClassTetraIndex (Exci0 NonExci0 Exci1 ...) where the number is the class //NOTE: classRegularTetraCnt_h[i * regularCNT + 0] means that we only take into consideration the group 0 (irregular mesh) since the others were already done in the regular section for(int i = 0; i < N_class; i++) { for(int j = 0; j < nonRegularTetraCnt_h[i]; j++) { tet = &(tetARRAY[ClassTetraIndex[i][j]]); int localPosition = irregularTetras * localMatrixSize; int couplingPosition = irregularNeighbours * neighMatrixSize; fp_t_ts* InvEptr = j < ClassExcitationCount[i] ? &InvE_h[(exciCNT + j) * localMatrixSize] : nullptr; fp_t_ts* InvHptr = j < ClassExcitationCount[i] ? &InvH_h[(exciCNT + j) * localMatrixSize] : nullptr; tet->prepareCuBLAS(&Loc1E_h[localPosition], &Loc2E_h[localPosition], &Neigh1E_h[couplingPosition], &Neigh2E_h[couplingPosition], InvEptr, &Loc1H_h[localPosition], &Loc2H_h[localPosition], &Neigh1H_h[couplingPosition], &Neigh2H_h[couplingPosition], InvHptr); irregularTetras++; irregularNeighbours += tet->get_NeighNum(); } exciCNT += ClassExcitationCount[i]; } cout << "irregularTetras = " << irregularTetras << endl; cout << "exciCNT = " << exciCNT << endl; } cout << "--------------------------------------------------------------------------------------------------" << endl; //////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Create the matrices (4 sets per field + inverse for exited elements): // - Loc1E_PML/Loc1H_PML: local matrices that are multiplied by the same type of field (Field1 = Loc1 * Field0 + ...) // - Loc2E_PML/Loc2H_PML: local matrices that are multiplied by the opposite type of field (E1 = Loc2E * H1_2 + ...) 
// - Neigh1E_PML/Neigh1H_PML: matrices related to the neighbors opposite filed // - Neigh2E_PML/Neigh2H_PML: matrices related to the neighbors same filed // - InvE_h/InvH_h: inverse Mass matrices (only for excited terms) // // *** NOTE: each of these matrices is Column-Major Order *** //////////////////////////////////////////////////////////////////////////////////////////////////////////////// cout << "nonregularCNT_PML = " << nonregularCNT_PML << endl; if (nonregularCNT_PML > 0) { cout << "========== FILLING PML ===============" << endl; CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc1E_PML_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc2E_PML_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc1H_PML_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc2H_PML_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh1E_PML_h, PMLNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh2E_PML_h, PMLNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh1H_PML_h, PMLNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh2H_PML_h, PMLNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxE_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxH_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxM1_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); 
CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxJ1_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxM2_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxJ2_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped)); cout << "PMLTetras = " << PMLTetras << endl; cout << "PMLNeighbours = " << PMLNeighbours << endl; // Reset counters before starting matrix population PMLTetras = 0; PMLNeighbours = 0; // Loop over all LTS classes for (int i = 0; i < N_class; i++) { for (int j = ClassTetraCnt[i]; j < ClassTetraCnt[i] + nonRegularPMLTetraCnt_h[i]; j++) { // Get pointer to the j-th irregular tetrahedron in class i tet = &(tetARRAY[ClassTetraIndex[i][j]]); // Non-PML Irregular Tetrahedron: compute memory positions for local and neighbor matrices int localPos = PMLTetras * localMatrixSize; int neighPos = PMLNeighbours * neighMatrixSize; // Fill in the local and coupling matrices for non-PML irregular tetra tet->prepareCuBLAS_PML(&Loc1E_PML_h[localPos], &Loc2E_PML_h[localPos], &Neigh1E_PML_h[neighPos], &Neigh2E_PML_h[neighPos], &Loc1H_PML_h[localPos], &Loc2H_PML_h[localPos], &Neigh1H_PML_h[neighPos], &Neigh2H_PML_h[neighPos], &AuxE_h[localPos], &AuxH_h[localPos], &AuxM1_h[localPos], &AuxM2_h[localPos], &AuxJ1_h[localPos],&AuxJ2_h[localPos]); // Increment running totals for non-PML irregular tetrahedra and their neighbors PMLTetras++; PMLNeighbours += tet->get_NeighNum(); } } cout << "PMLTetras = " << PMLTetras << endl; } int sizePML = PMLTetras * TetPolyOrderDim[PolyFlag]; cout << "--------------------------------------------------------------------------------------------------" << endl; //////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Check GPU Memory 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////// struct MemItem { const char* label; size_t bytes; }; auto BYTES_ = [](size_t elems, size_t sizeofT){ return elems * sizeofT; }; auto GB = [](size_t bytes){ return double(bytes) / 1e9; }; auto sum_bytes = [](const std::vector& v)->size_t{ size_t s=0; for (auto& it: v) s += it.bytes; return s; }; // ===== Memory accounting (exact, by allocation) =================================== const int TPO = TetPolyOrderDim[PolyFlag]; const int FPO = FacePolyOrderDim[PolyFlag]; const size_t localElems = static_cast(TPO) * TPO; const size_t neighElems = static_cast(TPO) * FPO; const int exciCNT_total = exciCNT; const int irregularTetras_total = irregularTetras; const int irregularNeighbours_total = irregularNeighbours; const int PMLTetras_total = PMLTetras; const int PMLNeighbours_total = PMLNeighbours; const int regNormGroups = regularCNT_Normal; const int regPMLGroups = regularCNT_PML; const int regNormFacesTotal = totalRegularNeighFaceCnt; const int regPMLFacesTotal = totalRegularPMLNeighFaceCnt; const size_t sizeFieldElems = sizeField; // already in elements const size_t sizePMLElems = sizePML; // already in elements (if you keep a global PML state) const size_t neighMapElems = static_cast(neighCNT) * FPO; const size_t neighboursElems = tetraCNT; const size_t auxInElems = static_cast(maxNeighClass) * FPO; const size_t auxOutElems = static_cast(maxNeighClass) * TPO; const size_t mapElemsPerExci = TPO; const size_t tetNdElems = static_cast(NumOfUnitaryVectors) * NumOfNodes * exciCNT_total; const size_t faceNdElems = static_cast(NumOfUnitaryVectors) * NumOfNodesPerFace * excitationFaces; // ============ Build accounting vectors matching your allocations ================== std::vector excit, prop, state, neighs; // ---- Excitation maps & counts ---- excit.push_back({"mapE (int8)", BYTES_(size_t(exciCNT_total) * mapElemsPerExci, sizeof(int8_t))}); excit.push_back({"mapH 
(int8)", BYTES_(size_t(exciCNT_total) * mapElemsPerExci, sizeof(int8_t))}); excit.push_back({"ExcitationFacesCnt (int)", BYTES_(exciCNT_total, sizeof(int))}); excit.push_back({"ExcitationFacesOffset (int)", BYTES_(exciCNT_total, sizeof(int))}); excit.push_back({"ExcitationFacesNum (int)", BYTES_(excitationFaces, sizeof(int))}); excit.push_back({"nd_coords_tet", BYTES_(tetNdElems, sizeof(fp_t_ts))}); excit.push_back({"nd_coords_face", BYTES_(faceNdElems, sizeof(fp_t_ts))}); if (PlaneWaveBCFlag && excitationFaces > 0) { excit.push_back({"Z_face_pw", BYTES_(excitationFaces, sizeof(fp_t_ts))}); } // Inverses only for excitations excit.push_back({"InvE", BYTES_(size_t(exciCNT_total) * localElems, sizeof(fp_t_ts))}); excit.push_back({"InvH", BYTES_(size_t(exciCNT_total) * localElems, sizeof(fp_t_ts))}); // ---- Irregular (non-PML) ---- prop.push_back({"Loc1E (irreg)", BYTES_(size_t(irregularTetras_total) * localElems, sizeof(fp_t_ts))}); prop.push_back({"Loc2E (irreg)", BYTES_(size_t(irregularTetras_total) * localElems, sizeof(fp_t_ts))}); prop.push_back({"Loc1H (irreg)", BYTES_(size_t(irregularTetras_total) * localElems, sizeof(fp_t_ts))}); prop.push_back({"Loc2H (irreg)", BYTES_(size_t(irregularTetras_total) * localElems, sizeof(fp_t_ts))}); prop.push_back({"Neigh1E (irreg)", BYTES_(size_t(irregularNeighbours_total) * neighElems, sizeof(fp_t_ts))}); prop.push_back({"Neigh2E (irreg)", BYTES_(size_t(irregularNeighbours_total) * neighElems, sizeof(fp_t_ts))}); prop.push_back({"Neigh1H (irreg)", BYTES_(size_t(irregularNeighbours_total) * neighElems, sizeof(fp_t_ts))}); prop.push_back({"Neigh2H (irreg)", BYTES_(size_t(irregularNeighbours_total) * neighElems, sizeof(fp_t_ts))}); // ---- Regular (non-PML) ---- if (regNormGroups > 0) { prop.push_back({"regularLoc1E", BYTES_(size_t(regNormGroups) * localElems, sizeof(fp_t_ts))}); prop.push_back({"regularLoc2E", BYTES_(size_t(regNormGroups) * localElems, sizeof(fp_t_ts))}); prop.push_back({"regularLoc1H", 
BYTES_(size_t(regNormGroups) * localElems, sizeof(fp_t_ts))}); prop.push_back({"regularLoc2H", BYTES_(size_t(regNormGroups) * localElems, sizeof(fp_t_ts))}); prop.push_back({"regularNeigh1E", BYTES_(size_t(regNormFacesTotal) * neighElems, sizeof(fp_t_ts))}); prop.push_back({"regularNeigh2E", BYTES_(size_t(regNormFacesTotal) * neighElems, sizeof(fp_t_ts))}); prop.push_back({"regularNeigh1H", BYTES_(size_t(regNormFacesTotal) * neighElems, sizeof(fp_t_ts))}); prop.push_back({"regularNeigh2H", BYTES_(size_t(regNormFacesTotal) * neighElems, sizeof(fp_t_ts))}); } // ---- Regular PML ---- if (regPMLGroups > 0) { prop.push_back({"regularPMLLoc1E", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))}); prop.push_back({"regularPMLLoc2E", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))}); prop.push_back({"regularPMLLoc1H", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))}); prop.push_back({"regularPMLLoc2H", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))}); prop.push_back({"regularPMLNeigh1E", BYTES_(size_t(regPMLFacesTotal) * neighElems, sizeof(fp_t_ts))}); prop.push_back({"regularPMLNeigh2E", BYTES_(size_t(regPMLFacesTotal) * neighElems, sizeof(fp_t_ts))}); prop.push_back({"regularPMLNeigh1H", BYTES_(size_t(regPMLFacesTotal) * neighElems, sizeof(fp_t_ts))}); prop.push_back({"regularPMLNeigh2H", BYTES_(size_t(regPMLFacesTotal) * neighElems, sizeof(fp_t_ts))}); prop.push_back({"regularPMLAuxE", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))}); prop.push_back({"regularPMLAuxH", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))}); prop.push_back({"regularPMLLoc1M", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))}); prop.push_back({"regularPMLLoc2M", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))}); prop.push_back({"regularPMLLoc1J", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))}); prop.push_back({"regularPMLLoc2J", BYTES_(size_t(regPMLGroups) * localElems, sizeof(fp_t_ts))}); // 
per-element state for regular-PML region state.push_back({"r_Mn", BYTES_(size_t(numRegPMLTetras) * localElems, sizeof(fp_t_ts))}); state.push_back({"r_Mn1", BYTES_(size_t(numRegPMLTetras) * localElems, sizeof(fp_t_ts))}); state.push_back({"r_Jn12", BYTES_(size_t(numRegPMLTetras) * localElems, sizeof(fp_t_ts))}); state.push_back({"r_Jn32", BYTES_(size_t(numRegPMLTetras) * localElems, sizeof(fp_t_ts))}); } // ---- Irregular PML ---- if (PMLTetras_total > 0) { prop.push_back({"Loc1E_PML", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))}); prop.push_back({"Loc2E_PML", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))}); prop.push_back({"Loc1H_PML", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))}); prop.push_back({"Loc2H_PML", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))}); prop.push_back({"Neigh1E_PML", BYTES_(size_t(PMLNeighbours_total) * neighElems, sizeof(fp_t_ts))}); prop.push_back({"Neigh2E_PML", BYTES_(size_t(PMLNeighbours_total) * neighElems, sizeof(fp_t_ts))}); prop.push_back({"Neigh1H_PML", BYTES_(size_t(PMLNeighbours_total) * neighElems, sizeof(fp_t_ts))}); prop.push_back({"Neigh2H_PML", BYTES_(size_t(PMLNeighbours_total) * neighElems, sizeof(fp_t_ts))}); prop.push_back({"AuxE", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))}); prop.push_back({"AuxH", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))}); prop.push_back({"AuxM1", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))}); prop.push_back({"AuxJ1", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))}); prop.push_back({"AuxM2", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))}); prop.push_back({"AuxJ2", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))}); // per-element PML state arrays state.push_back({"Mn", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))}); state.push_back({"Mn1", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))}); 
state.push_back({"Jn12", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))}); state.push_back({"Jn32", BYTES_(size_t(PMLTetras_total) * localElems, sizeof(fp_t_ts))}); } // ---- Global field buffers ---- state.push_back({"En", BYTES_(sizeFieldElems, sizeof(fp_t_ts))}); state.push_back({"En1", BYTES_(sizeFieldElems, sizeof(fp_t_ts))}); state.push_back({"Hn12", BYTES_(sizeFieldElems, sizeof(fp_t_ts))}); state.push_back({"Hn32", BYTES_(sizeFieldElems, sizeof(fp_t_ts))}); // ---- Neighbor maps/structs ---- neighs.push_back({"NeighMap (int)", BYTES_(neighMapElems, sizeof(int))}); neighs.push_back({"Neighbours (int)", BYTES_(neighboursElems, sizeof(int))}); neighs.push_back({"NeighboursOffset (int)", BYTES_(neighboursElems, sizeof(int))}); neighs.push_back({"auxFieldInput", BYTES_(auxInElems, sizeof(fp_t_ts))}); neighs.push_back({"auxFieldOutput", BYTES_(auxOutElems, sizeof(fp_t_ts))}); // ============================ Totals & printing =================================== const size_t bytesExcit = sum_bytes(excit); const size_t bytesProp = sum_bytes(prop); const size_t bytesState = sum_bytes(state); const size_t bytesNeigh = sum_bytes(neighs); const double factor = usageSecurityThresholdFactor; // e.g., 1.05 const double gExcit = GB(bytesExcit) * factor; const double gProp = GB(bytesProp ) * factor; const double gState = GB(bytesState) * factor; const double gNeigh = GB(bytesNeigh) * factor; const double gTotal = gExcit + gProp + gState + gNeigh; size_t free_cudamem=0, total_cudamem=0; CUDA_SAFE_CALL(cudaMemGetInfo(&free_cudamem, &total_cudamem)); auto print_rows = [](const char* category, std::vector v, bool sort_by_size = true) { if (sort_by_size) { std::sort(v.begin(), v.end(), [](const MemItem& a, const MemItem& b){ return a.bytes > b.bytes; }); } for (auto& it: v) if (it.bytes) { std::cout << std::left << std::setw(16) << category << std::setw(36) << it.label << std::right << std::setw(12) << std::fixed << std::setprecision(6) << (double(it.bytes)/1e9) << 
'\n'; } }; std::cout << "============================================================================================\n"; std::cout << std::left << std::setw(16) << "Category" << std::setw(36) << "Buffer" << std::right << std::setw(12) << "Size [GB]" << '\n'; std::cout << "--------------------------------------------------------------------------------------------\n"; print_rows("Excitation", excit); print_rows("Propagation", prop); print_rows("Fields/State",state); print_rows("Neighbors", neighs); std::cout << "--------------------------------------------------------------------------------------------\n"; std::cout << std::left << std::setw(16) << "TOTALS" << std::setw(36) << "Excitation" << std::right << std::setw(12) << std::fixed << std::setprecision(6) << gExcit << '\n'; std::cout << std::left << std::setw(16) << "TOTALS" << std::setw(36) << "Propagation" << std::right << std::setw(12) << std::fixed << std::setprecision(6) << gProp << '\n'; std::cout << std::left << std::setw(16) << "TOTALS" << std::setw(36) << "Fields/State" << std::right << std::setw(12) << std::fixed << std::setprecision(6) << gState << '\n'; std::cout << std::left << std::setw(16) << "TOTALS" << std::setw(36) << "Neighbors" << std::right << std::setw(12) << std::fixed << std::setprecision(6) << gNeigh << '\n'; std::cout << std::left << std::setw(16) << "TOTAL (est.)" << std::setw(36) << "" << std::right << std::setw(12) << std::fixed << std::setprecision(6) << gTotal << '\n'; std::cout << "--------------------------------------------------------------------------------------------\n"; std::cout << "GPU Memory Free / Total [GB]: " << std::fixed << std::setprecision(2) << double(free_cudamem)/1e9 << " / " << double(total_cudamem)/1e9 << '\n'; std::cout << "============================================================================================\n"; //////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Copy to GPU Memory 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////// // ---- Excitation maps & counts ------------------------------------------------- CUDA_SAFE_MALLOC(mapE_d, BYTES(int8_t, exciCNT_total * mapElemsPerExci)); CUDA_SAFE_MALLOC(mapH_d, BYTES(int8_t, exciCNT_total * mapElemsPerExci)); CUDA_SAFE_MALLOC(ExcitationFacesCnt_d, BYTES(int, exciCNT_total)); CUDA_SAFE_MALLOC(ExcitationFacesOffset_d, BYTES(int, exciCNT_total)); CUDA_SAFE_MALLOC(ExcitationFacesNum_d, BYTES(int, excitationFaces)); CUDA_SAFE_MALLOC(nd_coords_tet_d, BYTES(fp_t_ts, tetNdElems)); CUDA_SAFE_MALLOC(nd_coords_face_d, BYTES(fp_t_ts, faceNdElems)); if (PlaneWaveBCFlag) { CUDA_SAFE_MALLOC(Z_face_pw_d, BYTES(fp_t_ts, excitationFaces)); } // --- Allocate precomputed tangential fields (only port faces) --- if (portCNT > 0) { CUDA_SAFE_CALL(cudaMalloc((void**)&Etan_qp_d, excitationFaces * Q * 3 * sizeof(fp_t_ts))); CUDA_SAFE_CALL(cudaMalloc((void**)&Htan_qp_d, excitationFaces * Q * 3 * sizeof(fp_t_ts))); CUDA_SAFE_CALL(cudaMalloc((void**)&PortFacePidx_d, excitationFaces * sizeof(int))); const int nPorts = (int)portExcitations.size(); CUDA_SAFE_CALL(cudaMalloc((void**)&ExcitationProps_d, nPorts * sizeof(ExcitationProp))); } CUDA_SAFE_COPY(mapE_d, mapE_h, BYTES(int8_t, exciCNT_total * mapElemsPerExci)); CUDA_SAFE_COPY(mapH_d, mapH_h, BYTES(int8_t, exciCNT_total * mapElemsPerExci)); CUDA_SAFE_COPY(ExcitationFacesCnt_d, ExcitationFacesCnt_h, BYTES(int, exciCNT_total)); CUDA_SAFE_COPY(ExcitationFacesOffset_d, ExcitationFacesOffset_h, BYTES(int, exciCNT_total)); CUDA_SAFE_COPY(ExcitationFacesNum_d, ExcitationFacesNum_h, BYTES(int, excitationFaces)); CUDA_SAFE_COPY(nd_coords_tet_d, nd_coords_tet_h, BYTES(fp_t_ts, tetNdElems)); CUDA_SAFE_COPY(nd_coords_face_d, nd_coords_face_h, BYTES(fp_t_ts, faceNdElems)); if (PlaneWaveBCFlag) { CUDA_SAFE_COPY(Z_face_pw_d, Z_face_pw_h, BYTES(fp_t_ts, excitationFaces)); } // --- copy precomputed tangential fields (only port 
faces) --- if (portCNT > 0) { cout << "Export Etan and Htan" << endl; CUDA_SAFE_CALL(cudaMemset(Etan_qp_d, 0.0, BYTES(fp_t_ts, excitationFaces * Q * 3))); CUDA_SAFE_CALL(cudaMemset(Htan_qp_d, 0.0, BYTES(fp_t_ts, excitationFaces * Q * 3))); CUDA_SAFE_COPY(Etan_qp_d, Etan_qp_h, BYTES(fp_t_ts, excitationFaces * Q * 3)); CUDA_SAFE_COPY(Htan_qp_d, Htan_qp_h, BYTES(fp_t_ts, excitationFaces * Q * 3)); CUDA_SAFE_COPY(PortFacePidx_d, PortFacePidx_h, BYTES(int, excitationFaces)); const int nPorts = (int)portExcitations.size(); CUDA_SAFE_COPY(ExcitationProps_d, portExcitations.data(), nPorts * sizeof(ExcitationProp)); } // ---- Irregular (non-PML) ----------------------------------------------------- CUDA_SAFE_MALLOC(Loc1E_d, BYTES(fp_t_ts, irregularTetras_total * localElems)); CUDA_SAFE_MALLOC(Loc2E_d, BYTES(fp_t_ts, irregularTetras_total * localElems)); CUDA_SAFE_MALLOC(Loc1H_d, BYTES(fp_t_ts, irregularTetras_total * localElems)); CUDA_SAFE_MALLOC(Loc2H_d, BYTES(fp_t_ts, irregularTetras_total * localElems)); CUDA_SAFE_MALLOC(Neigh1E_d, BYTES(fp_t_ts, irregularNeighbours_total * neighElems)); CUDA_SAFE_MALLOC(Neigh2E_d, BYTES(fp_t_ts, irregularNeighbours_total * neighElems)); CUDA_SAFE_MALLOC(Neigh1H_d, BYTES(fp_t_ts, irregularNeighbours_total * neighElems)); CUDA_SAFE_MALLOC(Neigh2H_d, BYTES(fp_t_ts, irregularNeighbours_total * neighElems)); // Inverses only for excitations CUDA_SAFE_MALLOC(InvE_d, BYTES(fp_t_ts, exciCNT_total * localElems)); CUDA_SAFE_MALLOC(InvH_d, BYTES(fp_t_ts, exciCNT_total * localElems)); // Irregular (non-PML) CUDA_SAFE_COPY(Loc1E_d, Loc1E_h, BYTES(fp_t_ts, irregularTetras_total * localElems)); CUDA_SAFE_COPY(Loc2E_d, Loc2E_h, BYTES(fp_t_ts, irregularTetras_total * localElems)); CUDA_SAFE_COPY(Loc1H_d, Loc1H_h, BYTES(fp_t_ts, irregularTetras_total * localElems)); CUDA_SAFE_COPY(Loc2H_d, Loc2H_h, BYTES(fp_t_ts, irregularTetras_total * localElems)); CUDA_SAFE_COPY(Neigh1E_d, Neigh1E_h, BYTES(fp_t_ts, irregularNeighbours_total * neighElems)); 
CUDA_SAFE_COPY(Neigh2E_d, Neigh2E_h, BYTES(fp_t_ts, irregularNeighbours_total * neighElems)); CUDA_SAFE_COPY(Neigh1H_d, Neigh1H_h, BYTES(fp_t_ts, irregularNeighbours_total * neighElems)); CUDA_SAFE_COPY(Neigh2H_d, Neigh2H_h, BYTES(fp_t_ts, irregularNeighbours_total * neighElems)); CUDA_SAFE_COPY(InvE_d, InvE_h, BYTES(fp_t_ts, exciCNT_total * localElems)); CUDA_SAFE_COPY(InvH_d, InvH_h, BYTES(fp_t_ts, exciCNT_total * localElems)); // ---- Regular (prototype per group) ------------------------------------------- // Use exact counts — NOT (regularCNT - 1) or "*4" if (regularRegionFlag) { if (regNormGroups > 0) { CUDA_SAFE_MALLOC(regularLoc1E_d, BYTES(fp_t_ts, static_cast(regNormGroups) * localElems)); CUDA_SAFE_MALLOC(regularLoc2E_d, BYTES(fp_t_ts, static_cast(regNormGroups) * localElems)); CUDA_SAFE_MALLOC(regularLoc1H_d, BYTES(fp_t_ts, static_cast(regNormGroups) * localElems)); CUDA_SAFE_MALLOC(regularLoc2H_d, BYTES(fp_t_ts, static_cast(regNormGroups) * localElems)); CUDA_SAFE_MALLOC(regularNeigh1E_d, BYTES(fp_t_ts, static_cast(regNormFacesTotal) * neighElems)); CUDA_SAFE_MALLOC(regularNeigh2E_d, BYTES(fp_t_ts, static_cast(regNormFacesTotal) * neighElems)); CUDA_SAFE_MALLOC(regularNeigh1H_d, BYTES(fp_t_ts, static_cast(regNormFacesTotal) * neighElems)); CUDA_SAFE_MALLOC(regularNeigh2H_d, BYTES(fp_t_ts, static_cast(regNormFacesTotal) * neighElems)); CUDA_SAFE_COPY(regularLoc1E_d, regularLoc1E_h, BYTES(fp_t_ts, static_cast(regNormGroups) * localElems)); CUDA_SAFE_COPY(regularLoc2E_d, regularLoc2E_h, BYTES(fp_t_ts, static_cast(regNormGroups) * localElems)); CUDA_SAFE_COPY(regularLoc1H_d, regularLoc1H_h, BYTES(fp_t_ts, static_cast(regNormGroups) * localElems)); CUDA_SAFE_COPY(regularLoc2H_d, regularLoc2H_h, BYTES(fp_t_ts, static_cast(regNormGroups) * localElems)); CUDA_SAFE_COPY(regularNeigh1E_d, regularNeigh1E_h, BYTES(fp_t_ts, static_cast(regNormFacesTotal) * neighElems)); CUDA_SAFE_COPY(regularNeigh2E_d, regularNeigh2E_h, BYTES(fp_t_ts, static_cast(regNormFacesTotal) 
* neighElems)); CUDA_SAFE_COPY(regularNeigh1H_d, regularNeigh1H_h, BYTES(fp_t_ts, static_cast(regNormFacesTotal) * neighElems)); CUDA_SAFE_COPY(regularNeigh2H_d, regularNeigh2H_h, BYTES(fp_t_ts, static_cast(regNormFacesTotal) * neighElems)); } if (regPMLGroups > 0) { // PML-regular CUDA_SAFE_MALLOC(regularPMLLoc1E_d, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_MALLOC(regularPMLLoc2E_d, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_MALLOC(regularPMLLoc1H_d, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_MALLOC(regularPMLLoc2H_d, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_MALLOC(regularPMLNeigh1E_d, BYTES(fp_t_ts, static_cast(regPMLFacesTotal) * neighElems)); CUDA_SAFE_MALLOC(regularPMLNeigh2E_d, BYTES(fp_t_ts, static_cast(regPMLFacesTotal) * neighElems)); CUDA_SAFE_MALLOC(regularPMLNeigh1H_d, BYTES(fp_t_ts, static_cast(regPMLFacesTotal) * neighElems)); CUDA_SAFE_MALLOC(regularPMLNeigh2H_d, BYTES(fp_t_ts, static_cast(regPMLFacesTotal) * neighElems)); // PML auxiliaries for regular-PML prototypes (if used) CUDA_SAFE_MALLOC(regularPMLAuxE_d, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_MALLOC(regularPMLAuxH_d, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_MALLOC(regularPMLLoc1M_d, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_MALLOC(regularPMLLoc2M_d, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_MALLOC(regularPMLLoc1J_d, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_MALLOC(regularPMLLoc2J_d, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); // PML-regular CUDA_SAFE_COPY(regularPMLLoc1E_d, regularPMLLoc1E_h, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_COPY(regularPMLLoc2E_d, regularPMLLoc2E_h, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_COPY(regularPMLLoc1H_d, regularPMLLoc1H_h, BYTES(fp_t_ts, static_cast(regPMLGroups) * 
localElems)); CUDA_SAFE_COPY(regularPMLLoc2H_d, regularPMLLoc2H_h, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_COPY(regularPMLNeigh1E_d, regularPMLNeigh1E_h, BYTES(fp_t_ts, static_cast(regPMLFacesTotal) * neighElems)); CUDA_SAFE_COPY(regularPMLNeigh2E_d, regularPMLNeigh2E_h, BYTES(fp_t_ts, static_cast(regPMLFacesTotal) * neighElems)); CUDA_SAFE_COPY(regularPMLNeigh1H_d, regularPMLNeigh1H_h, BYTES(fp_t_ts, static_cast(regPMLFacesTotal) * neighElems)); CUDA_SAFE_COPY(regularPMLNeigh2H_d, regularPMLNeigh2H_h, BYTES(fp_t_ts, static_cast(regPMLFacesTotal) * neighElems)); CUDA_SAFE_COPY(regularPMLAuxE_d, regularPMLAuxE_h, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_COPY(regularPMLAuxH_d, regularPMLAuxH_h, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_COPY(regularPMLLoc1M_d, regularPMLLoc1M_h, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_COPY(regularPMLLoc2M_d, regularPMLLoc2M_h, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_COPY(regularPMLLoc1J_d, regularPMLLoc1J_h, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_COPY(regularPMLLoc2J_d, regularPMLLoc2J_h, BYTES(fp_t_ts, static_cast(regPMLGroups) * localElems)); CUDA_SAFE_MALLOC(r_Mn_d, BYTES(fp_t_ts, static_cast(numRegPMLTetras) * localElems)); CUDA_SAFE_MALLOC(r_Mn1_d, BYTES(fp_t_ts, static_cast(numRegPMLTetras) * localElems)); CUDA_SAFE_MALLOC(r_Jn12_d, BYTES(fp_t_ts, static_cast(numRegPMLTetras) * localElems)); CUDA_SAFE_MALLOC(r_Jn32_d, BYTES(fp_t_ts, static_cast(numRegPMLTetras) * localElems)); CUDA_SAFE_ZERO(r_Mn_d, BYTES(fp_t_ts, static_cast(numRegPMLTetras) * localElems)); CUDA_SAFE_ZERO(r_Mn1_d, BYTES(fp_t_ts, static_cast(numRegPMLTetras) * localElems)); CUDA_SAFE_ZERO(r_Jn12_d, BYTES(fp_t_ts, static_cast(numRegPMLTetras) * localElems)); CUDA_SAFE_ZERO(r_Jn32_d, BYTES(fp_t_ts, static_cast(numRegPMLTetras) * localElems)); } } // ---- Irregular PML (per element) 
--------------------------------------------- cout << "Non regular PMLTetras_total = " << PMLTetras_total << endl; if (PMLTetras_total > 0) { CUDA_SAFE_MALLOC(Loc1E_PML_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_MALLOC(Loc2E_PML_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_MALLOC(Loc1H_PML_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_MALLOC(Loc2H_PML_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_MALLOC(Neigh1E_PML_d, BYTES(fp_t_ts, static_cast(PMLNeighbours_total) * neighElems)); CUDA_SAFE_MALLOC(Neigh2E_PML_d, BYTES(fp_t_ts, static_cast(PMLNeighbours_total) * neighElems)); CUDA_SAFE_MALLOC(Neigh1H_PML_d, BYTES(fp_t_ts, static_cast(PMLNeighbours_total) * neighElems)); CUDA_SAFE_MALLOC(Neigh2H_PML_d, BYTES(fp_t_ts, static_cast(PMLNeighbours_total) * neighElems)); CUDA_SAFE_MALLOC(AuxE_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_MALLOC(AuxH_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_MALLOC(AuxM1_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_MALLOC(AuxJ1_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_MALLOC(AuxM2_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_MALLOC(AuxJ2_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_COPY(Loc1E_PML_d, Loc1E_PML_h, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_COPY(Loc2E_PML_d, Loc2E_PML_h, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_COPY(Loc1H_PML_d, Loc1H_PML_h, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_COPY(Loc2H_PML_d, Loc2H_PML_h, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_COPY(Neigh1E_PML_d, Neigh1E_PML_h, BYTES(fp_t_ts, static_cast(PMLNeighbours_total) * neighElems)); CUDA_SAFE_COPY(Neigh2E_PML_d, Neigh2E_PML_h, BYTES(fp_t_ts, 
static_cast(PMLNeighbours_total) * neighElems)); CUDA_SAFE_COPY(Neigh1H_PML_d, Neigh1H_PML_h, BYTES(fp_t_ts, static_cast(PMLNeighbours_total) * neighElems)); CUDA_SAFE_COPY(Neigh2H_PML_d, Neigh2H_PML_h, BYTES(fp_t_ts, static_cast(PMLNeighbours_total) * neighElems)); CUDA_SAFE_COPY(AuxE_d, AuxE_h, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_COPY(AuxH_d, AuxH_h, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_COPY(AuxM1_d, AuxM1_h, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_COPY(AuxJ1_d, AuxJ1_h, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_COPY(AuxM2_d, AuxM2_h, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_COPY(AuxJ2_d, AuxJ2_h, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_MALLOC(Mn_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_MALLOC(Mn1_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_MALLOC(Jn12_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_MALLOC(Jn32_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_ZERO(Mn_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_ZERO(Mn1_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_ZERO(Jn12_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); CUDA_SAFE_ZERO(Jn32_d, BYTES(fp_t_ts, static_cast(PMLTetras_total) * localElems)); } // ---- Global field buffers ----------------------------------------------------- CUDA_SAFE_MALLOC(En_d, BYTES(fp_t_ts, sizeFieldElems)); CUDA_SAFE_MALLOC(En1_d, BYTES(fp_t_ts, sizeFieldElems)); CUDA_SAFE_MALLOC(Hn12_d, BYTES(fp_t_ts, sizeFieldElems)); CUDA_SAFE_MALLOC(Hn32_d, BYTES(fp_t_ts, sizeFieldElems)); // Fields zero init CUDA_SAFE_ZERO(En_d, BYTES(fp_t_ts, sizeFieldElems)); CUDA_SAFE_ZERO(En1_d, BYTES(fp_t_ts, sizeFieldElems)); CUDA_SAFE_ZERO(Hn12_d, BYTES(fp_t_ts, sizeFieldElems)); 
CUDA_SAFE_ZERO(Hn32_d, BYTES(fp_t_ts, sizeFieldElems));

    // ---- Neighbor maps ------------------------------------------------------------
    CUDA_SAFE_MALLOC(NeighMap_d, BYTES(int, neighMapElems));
    CUDA_SAFE_MALLOC(Neighbours_d, BYTES(int, neighboursElems));
    CUDA_SAFE_MALLOC(NeighboursOffset_d, BYTES(int, neighboursElems));
    CUDA_SAFE_MALLOC(auxFieldInput, BYTES(fp_t_ts, auxInElems));
    CUDA_SAFE_MALLOC(auxFieldOutput, BYTES(fp_t_ts, auxOutElems));

    // Neighbor structures
    CUDA_SAFE_COPY(NeighMap_d, NeighMap_h, BYTES(int, neighMapElems));
    CUDA_SAFE_COPY(Neighbours_d, Neighbours_h, BYTES(int, neighboursElems));
    CUDA_SAFE_COPY(NeighboursOffset_d, NeighboursOffset_h, BYTES(int, neighboursElems));
}

/**
 * Drives the cuBLAS local-time-stepping loop.
 *
 * Derives the sampling interval from the Nyquist step and the largest local
 * time step (LocTimeSteps[N_class - 1]), optionally prepares the Pade
 * buffers, prints a run summary, and then advances the E and H fields
 * NtimeSteps times. Every tsPerSampling steps the fields are copied back to
 * the host and the enabled writers (probes, global fields, surface currents,
 * port probes) are invoked.
 *
 * When FinalTime <= 0 the run is open-ended: it proceeds in batches of
 * NumOfSampleEnergyCheck samples and stops once checkEnergyDecay() reports
 * true. A Pade-based early exit is honoured through exitBool.
 */
void FemGrp::TimeSteppingCuBLAS()
{
    fp_t InitTime = 0.0;
    fp_t Frequency = freq;
    fp_t dt_nyquist = 1.0 / (2.0 * Frequency * MEGA);
    // NOTE(review): if SamplingRate is an integer type, (1 / SamplingRate)
    // truncates to 0 for any rate > 1 - confirm its declaration.
    fp_t dt_sample = (1 / SamplingRate) * dt_nyquist;

    tsPerSampling = (int)ceil(dt_sample / LocTimeSteps[N_class - 1]);
    // tsPerSampling is used as a modulus and a divisor below; never let it be 0.
    if(tsPerSampling < 1){
        tsPerSampling = 1;
    }
    dt_sample = tsPerSampling * LocTimeSteps[N_class - 1];

    if(FinalTime > 0)
        NtimeSteps = (int)ceil((FinalTime - InitTime) / LocTimeSteps[N_class -1]); // number of time steps for the biggest time step size
    else
        NtimeSteps = 0;

    if(usePade){
        /* 7.5 (for safety use 10) is empirical because in "Early Time Behavior in Reverberation Chambers and Its Effect on the Relationships Between Coherence Bandwidth, Chamber Decay Time, RMS Delay Spread, and the Chamber Buildup Time", Christopher L. Holloway et al.
           the value of 3/2 is suggested from equation 30 */
        fp_t earlyTime = 10 * Length(maxPoint - minPoint) / Vo;
        tsPerPade = (int)ceil(earlyTime / LocTimeSteps[N_class -1]);
        // Round tsPerPade up to the next multiple of tsPerSampling.
        tsPerPade = tsPerPade + tsPerSampling - tsPerPade % tsPerSampling;

        // NOTE(review): in open-ended mode (FinalTime <= 0) NtimeSteps is still 0
        // here, so both buffers below are sized for zero samples - confirm that
        // Pade is never combined with an open-ended run.
        const int numSamples = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
        fieldProbes = new fp_t_ts[probeCNT * numSamples * NumOfFieldComponents];
        CUDA_SAFE_CALL(cudaMallocHost((void**)&tranferencePadeFunctionFD_h, padeCNT * numSamples * NumOfFieldComponents * sizeof(cuDoubleComplex), cudaHostAllocMapped));
        getPadeFreq(numSamples, tsPerSampling);
    }

    Write_TD_Data(tsPerSampling, NtimeSteps);

    //Output precision set to 15 digits
    cout.precision(15);

    //Print out data used in the computation
    cout << endl;
    cout << "=============================================" << endl;
    cout << "== Running CUDA Implementation (Non-Heavy) ==" << endl;
    cout << "=============================================" << endl;
    cout << endl;
    cout << "==========================================" << endl;
    cout << " PERFORMING INFORMATION " << endl;
    cout << "==========================================" << endl;
    if(FinalTime > 0)
        cout << " Final Time(sec) = " << FinalTime << endl;
    else
        cout << " Final Time = " << "TBD" << endl;
    cout << " Time Step, dt(sec) = " << LocTimeSteps[N_class -1] << endl;
    cout << " Number of Tetrahedra = " << tetraCNT << endl;
    cout << " Number of Classes = " << N_class << endl;
    if(FinalTime > 0)
        cout << " Number of Time Steps = " << NtimeSteps << endl;
    for(int i = 0; i < N_class ; i++){
        cout << " LocTimeSteps[" << i << "] = " << LocTimeSteps[i] << endl;
    }
    cout << endl;
    cout << " dt_nyquist = " << dt_nyquist << endl;
    cout << " dt_sample = " << dt_sample << endl;
    cout << " tsPerSampling = " << tsPerSampling << endl;
    if(FinalTime > 0)
        cout << " Number of samplings = " << (int)ceil((1.0 * NtimeSteps) / tsPerSampling) << endl;
    if(usePade){
        cout << " Time Steps / Pade Calc = " << tsPerPade << endl;
    }
    cout << "==========================================" << endl;
    cout << endl;

    //Memory status
    SYSTEM_MEM_USAGE();
    cout << endl;
    cout << " " << endl;
    cout << "===================================================" << endl;
    cout << " Local Time-Stepping Loop " << endl;
    cout << "===================================================" << endl;

    // Variables for time tracking
    size_t total_time = 0;
    fp_t current_time = 0;
    bool exitBool = false;
    // Start one sample interval early so the first sample is reported at t = 0.
    current_time -= (double)dt_sample * 1e9;

    // Open-ended run: advance in batches and let the energy check decide when to stop.
    const bool openEndedRun = (FinalTime <= 0);
    if(openEndedRun){
        NtimeSteps = NumOfSampleEnergyCheck * tsPerSampling + 1;
        fieldEnergy = 0;
        maxFieldEnergy = 0;
        if(numberOfEnergyPoints == 0){
            numberOfEnergyPoints = probeCNT;
        }
    }

    cublasHandle_t handle;
    const cublasStatus_t createStatus = cublasCreate(&handle);
    if(createStatus != CUBLAS_STATUS_SUCCESS){
        // Without a valid handle every GEMM below would fail; bail out before
        // any timer is started.
        fprintf(stderr, "[TimeSteppingCuBLAS] cublasCreate failed (status %d)\n", (int)createStatus);
        return;
    }

    timer_start("Time Stepping", ' ');
    timer_start("Start Time Stepping", 'm');
    // Tracks whether an 'm' interval timer is currently open so that it is
    // stopped exactly once after the loop (assumes timer_start/timer_stop
    // calls must be paired).
    bool intervalTimerRunning = true;

    for(int n = 0; n < NtimeSteps; n++){
        ComputeE_cuBLAS(handle, N_class - 1);
        ComputeH_cuBLAS(handle, N_class - 1);

        if(n % tsPerSampling == 0) {
            CUDA_SAFE_CALL(cudaMemcpy(En1_h, En1_d, tetraCNT * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToHost));
            CUDA_SAFE_CALL(cudaMemcpy(Hn32_h, Hn32_d, tetraCNT * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToHost));
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            total_time += timer_stop('m');
            intervalTimerRunning = false;

            if(write_probes && probeCNT > 0) {
                writeFieldProbeCuBLAS(n);
                if(write_AnalyticalIncidentProbes){
                    writeAnalyticalIncidentPWProbes(n);
                }
                if(n != 0 && usePade && n % tsPerPade == 0) {
                    if(padeTime < 0.0){
                        exitBool = calculatePadeCUDA(n, n / tsPerPade == 1, false);
                    }else if(n * LocTimeSteps[N_class - 1] > padeTime * 1e-9){
                        exitBool = true;
                    }
                }
            }
            if(write_fields){
                writeFieldGlobalCuBLAS(n);
            }
            // Modified by Qi Jian to write surface currents
            if(WriteSurfFlag) {
                writeCurrentsOutputSurfMesh_CuBLAS(n);
            }
            // Writing the fields on the port surfaces
            if (PortBCFlag) {
                writePortFieldProbeCuBLAS(n);
            }

            fp_t_ts magAux = 0;
            for(int i = 0; i < tetraCNT * TetPolyOrderDim[PolyFlag]; i++){
                magAux += En1_h[i] * En1_h[i];
            }
            cout << "E field norm^2 " << magAux << endl;

            current_time += (double)dt_sample * 1e9;
            DEBUG_INFO(" Current Time : " + to_string(current_time) + "ns");
            DEBUG_INFO(" Average iteration time : "+ to_string(((double)total_time / (double)(n + 1.0))) + " msec");

            if(exitBool){
                calculatePadeCUDA(n, false, true);
                break;
            }

            // Same predicate as the open-ended setup above (FinalTime <= 0), so a
            // run configured with FinalTime == 0 is extended as well.
            if(openEndedRun && n == NtimeSteps-1){
                if(!checkEnergyDecay()){
                    NtimeSteps += NumOfSampleEnergyCheck * tsPerSampling;
                    cout << "Max Energy: " << maxFieldEnergy << " - Current Energy: " << fieldEnergy << " - Relation: " << fieldEnergy * 100 / maxFieldEnergy << "%" << endl;
                    fieldEnergy = 0.0;
                }else{
                    Write_TD_Data(tsPerSampling, NtimeSteps);
                    break;
                }
            }

            cout << "---------------------------------------------------" << endl;
            timer_start(to_string(tsPerSampling)+" steps", 'm');
            intervalTimerRunning = true;
        }
    }

    if(!exitBool && padeCNT > 0 && !writeWhilePade){
        writeFieldProbeAfterPade(tsPerSampling);
    }

    // The previous test, NtimeSteps-1 % tsPerSampling, parsed as
    // NtimeSteps - (1 % tsPerSampling) and did not reflect whether a timer was
    // actually open; close the interval timer only if one is still running.
    if(intervalTimerRunning){
        timer_stop('m');
    }

    DEBUG_INFO(" Total iteration time: "+ to_string((double)total_time) + " msec");
    timer_stop(' ');

    // Release the cuBLAS context created above.
    cublasDestroy(handle);
}

//The recursivity in ComputeE and ComputeH is due to the LTS process
/**********************************************************************
Local Time-Stepping for CUDA Recursive
Explained in "Dissipative terms and local time-stepping improvements in a spatial high order Discontinuous Galerkin scheme for the time-domain Maxwell’s equations" by E.
Montseny
**********************************************************************/

/**
 * Recursive local-time-stepping driver for the E update.
 *
 * Updates class `class_i` once and, for class_i > 0, recurses into the
 * next lower class in the order E, H, E.
 */
void FemGrp::ComputeE_cuBLAS(cublasHandle_t handle, int class_i){
    LE_CuBLAS(handle, class_i);
    if(class_i == 0){
        return;
    }
    ComputeE_cuBLAS(handle, class_i - 1);
    ComputeH_cuBLAS(handle, class_i - 1);
    ComputeE_cuBLAS(handle, class_i - 1);
}

/**
 * Recursive local-time-stepping driver for the H update.
 *
 * Updates class `class_i` once and, for class_i > 0, recurses into the
 * next lower class in the order H, E, H.
 */
void FemGrp::ComputeH_cuBLAS(cublasHandle_t handle, int class_i){
    LH_CuBLAS(handle, class_i);
    if(class_i == 0){
        return;
    }
    ComputeH_cuBLAS(handle, class_i - 1);
    ComputeE_cuBLAS(handle, class_i - 1);
    ComputeH_cuBLAS(handle, class_i - 1);
}

/**
 * E-field update of one LTS class using cuBLAS.
 *
 * For the tetrahedra of class `class_i` this computes
 *   En1 = Loc1E * En + Loc2E * Hn12 (+ excitation) + coupling(Hn12, En)
 * in four passes: irregular elements (one matrix per element, strided-batched
 * GEMM), regular groups (one shared matrix per group, plain GEMM), irregular
 * PML elements and regular PML groups. The PML passes additionally advance
 * the auxiliary J variable (Jn12 -> Jn32) and add its contribution to En1.
 * Finally En1 is copied into En and the auxiliary J state is rolled forward.
 *
 * @param handle   cuBLAS handle used for every GEMM.
 * @param class_i  index of the LTS class to update.
 */
void FemGrp::LE_CuBLAS(cublasHandle_t handle, int class_i)
{
    int irregularTetras = nonRegularTetraCnt_h[class_i];
    int classOffset = ClassTetraOffset[class_i];
    int neighOffset = NeighClassOffset_h[class_i];
    int blockSize = 256; //optimal number
    int numBlocks;

    // ------------------------------------------------------------------
    // Irregular (non-PML) elements: one set of matrices per element
    // ------------------------------------------------------------------
    if(irregularTetras > 0) {
        // Local Matrices
        int nMatrices = irregularTetras;
        int matrixOffset = classTetraOffset_loc_h[class_i];
        int m = TetPolyOrderDim[PolyFlag]; //rows of A
        int n = TetPolyOrderDim[PolyFlag]; //rows of B and cols of A
        long long int strideA = m * n;
        long long int strideB = n;
        long long int strideC = m;
        float alpha = 1.0f;
        float beta = 0.0f;

        // En1 = Loc1E * En
        cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha,
                                  &Loc1E_d[matrixOffset * strideA], m, strideA,
                                  &En_d[classOffset * strideB], n, strideB, &beta,
                                  &En1_d[classOffset * strideC], m, strideC, nMatrices);
        // En1 += Loc2E * Hn12
        beta = 1.0f;
        cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha,
                                  &Loc2E_d[matrixOffset * strideA], m, strideA,
                                  &Hn12_d[classOffset * strideB], n, strideB, &beta,
                                  &En1_d[classOffset * strideC], m, strideC, nMatrices);

        if(ClassExcitationCount[class_i] > 0) {
            nMatrices = ClassExcitationCount[class_i];
            matrixOffset = ClassExcitationOffset[class_i];
            numBlocks = (nMatrices + blockSize - 1) / blockSize;

            // The E excitation is evaluated at the half step of this class.
            fp_t_ts dt = LocTimeSteps[class_i];
            fp_t_ts t = (LocalExciIndexE[class_i] + 0.5) * dt;
            LocalExciIndexE[class_i]++;

            if (PWorPort == 0) {
                if (interior_excitation_flag) {
                    addExcitationE_PML<<<numBlocks, blockSize>>>(&ExcitationFacesCnt_d[matrixOffset],
                        &ExcitationFacesOffset_d[matrixOffset], ExcitationFacesNum_d, nMatrices,
                        ClassExcitation_sc_CNT[class_i], &mapE_d[matrixOffset * strideC], excitationProp,
                        PolyFlag, dt /Eo, t,
                        &nd_coords_tet_d[matrixOffset * NumOfNodes * NumOfUnitaryVectors], nd_coords_face_d,
                        Z_face_pw_d, &InvE_d[matrixOffset * strideA], &En1_d[classOffset * strideC]);
                } else {
                    addExcitationE<<<numBlocks, blockSize>>>(&ExcitationFacesCnt_d[matrixOffset],
                        &ExcitationFacesOffset_d[matrixOffset], ExcitationFacesNum_d, nMatrices,
                        &mapE_d[matrixOffset * strideC], excitationProp, PolyFlag, dt /Eo, t,
                        &nd_coords_tet_d[matrixOffset * NumOfNodes * NumOfUnitaryVectors], nd_coords_face_d,
                        Z_face_pw_d, &InvE_d[matrixOffset * strideA], &En1_d[classOffset * strideC]);
                }
            } else {
                addExcitationE_port<<<numBlocks, blockSize>>>(&ExcitationFacesCnt_d[matrixOffset],
                    &ExcitationFacesOffset_d[matrixOffset], ExcitationFacesNum_d, nMatrices,
                    &mapE_d[matrixOffset * strideC], ExcitationProps_d, PortFacePidx_d, PolyFlag, dt /Eo, t,
                    &nd_coords_tet_d[matrixOffset * NumOfNodes * NumOfUnitaryVectors], nd_coords_face_d,
                    &InvE_d[matrixOffset * strideA], &En1_d[classOffset * strideC]);
            }
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
        }

        // Coupling Matrices
        nMatrices = classNeighIrregular_h[class_i];
        matrixOffset = classNeighOffset_loc_h[class_i];
        m = TetPolyOrderDim[PolyFlag]; //rows of A
        n = FacePolyOrderDim[PolyFlag]; //rows of B and cols of A
        strideA = m * n;
        strideB = n;
        strideC = m;
        numBlocks = (nMatrices * n + blockSize - 1) / blockSize;

        //Implement 3D if tetras over blocksize * (2^(31) - 1)
        makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * n], Hn12_d, auxFieldInput, nMatrices * n);
        beta = 0.0f;
        cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha,
                                  &Neigh1E_d[matrixOffset * strideA], m, strideA,
                                  auxFieldInput, n, strideB, &beta,
                                  auxFieldOutput, m, strideC, nMatrices);

        //Implement 3D if tetras over blocksize * (2^(31) - 1)
        makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * n], En_d, auxFieldInput, nMatrices * n);
        beta = 1.0f;
        cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha,
                                  &Neigh2E_d[matrixOffset * strideA], m, strideA,
                                  auxFieldInput, n, strideB, &beta,
                                  auxFieldOutput, m, strideC, nMatrices);

        int blockY = (blockSize + TetPolyOrderDim[PolyFlag] - 1) / TetPolyOrderDim[PolyFlag];
        dim3 blockDim(TetPolyOrderDim[PolyFlag], blockY, 1);
        numBlocks = (irregularTetras + blockY - 1) / blockY;
        //Implement 3D if tetras over blocksize * (2^(31) - 1)
        addCouplingResults<<<numBlocks, blockDim>>>(&En1_d[classOffset * TetPolyOrderDim[PolyFlag]], auxFieldOutput,
                                                    &Neighbours_d[classOffset], &NeighboursOffset_d[classOffset],
                                                    irregularTetras);
    }
    // --------------------------------------------------------------------------------------------------
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // ------------------------------------------------------------------
    // Regular (non-PML) groups: one shared set of matrices per group
    // ------------------------------------------------------------------
    if(regularRegionFlag && classRegularGroupsCnt_h[class_i] > 0) {
        for(int i = 0; i < classRegularGroupsCnt_h[class_i]; i++) {
            int groupID = classRegularGroupsId_h[class_i][i];
            int groupElements = classRegularTetraCnt_h[class_i * regularCNT + groupID];
            int groupOffset = classOffset + nonRegularTetraCnt_h[class_i] + classRegularTetraOffset_h[class_i][i];
            CUDA_SAFE_CALL(cudaDeviceSynchronize());

            // Local Matrices
            int m = TetPolyOrderDim[PolyFlag]; //rows of A
            int n = TetPolyOrderDim[PolyFlag]; //rows of B and cols of A
            float alpha = 1.0f;
            float beta = 0.0f;
            cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha,
                        &regularLoc1E_d[(groupID - 1) * m * n], m,
                        &En_d[groupOffset * n], n, &beta,
                        &En1_d[groupOffset * m], m);
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            beta = 1.0f;
            cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha,
                        &regularLoc2E_d[(groupID - 1) * m * n], m,
                        &Hn12_d[groupOffset * n], n, &beta,
                        &En1_d[groupOffset * m], m);
            CUDA_SAFE_CALL(cudaDeviceSynchronize());

            // Coupling Matrices
            int regularNeighOffset = neighOffset + classNeighIrregular_h[class_i] + classRegularTetraOffset_h[class_i][i] * NumOfFaces;
            m = TetPolyOrderDim[PolyFlag]; //rows of A
            n = FacePolyOrderDim[PolyFlag]; //rows of B and cols of A
            numBlocks = (groupElements * n * NumOfFaces + blockSize - 1) / blockSize;
            makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * n], Hn12_d, auxFieldInput, groupElements, PolyFlag);

            // One batch entry per face; each entry covers every element of the group.
            long long int strideA = m * n;
            long long int strideB = n * groupElements;
            long long int strideC = m * groupElements;
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            beta = 0.0f;
            cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha,
                                      &regularNeigh1E_d[(groupID - 1) * NumOfFaces * strideA], m, strideA,
                                      auxFieldInput, n, strideB, &beta,
                                      auxFieldOutput, m, strideC, NumOfFaces);

            makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * n], En_d, auxFieldInput, groupElements, PolyFlag);
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            beta = 1.0f;
            cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha,
                                      &regularNeigh2E_d[(groupID - 1) * NumOfFaces * strideA], m, strideA,
                                      auxFieldInput, n, strideB, &beta,
                                      auxFieldOutput, m, strideC, NumOfFaces);

            int blockY = (blockSize + TetPolyOrderDim[PolyFlag] - 1) / TetPolyOrderDim[PolyFlag];
            dim3 blockDim(TetPolyOrderDim[PolyFlag], blockY, 1);
            numBlocks = (groupElements + blockY - 1) / blockY;
            addCouplingResultsRegular<<<numBlocks, blockDim>>>(&En1_d[groupOffset * TetPolyOrderDim[PolyFlag]], auxFieldOutput, groupElements);
        }
    }
    // --------------------------------------------------------------------------------------------------
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // --------------------------------------------
    // PML Section
    // From here on classOffset / neighOffset refer to the PML part of the class.
    int PMLTetras = nonRegularPMLTetraCnt_h[class_i];
    classOffset = classPMLTetraOffset_h[class_i];
    neighOffset = classNeighPMLOffset_h[class_i];

    if(PMLTetras > 0) {
        // Local Matrices
        int nMatrices = PMLTetras;
        int matrixOffset = classPMLTetraOffset_loc_h[class_i];
        int m = TetPolyOrderDim[PolyFlag]; //rows of A
        int n = TetPolyOrderDim[PolyFlag]; //rows of B and cols of A
        long long int strideA = m * n;
        long long int strideB = n;
        long long int strideC = m;
        float alpha = 1.0f;
        float beta = 0.0f;
        CUDA_SAFE_CALL(cudaDeviceSynchronize());

        // --------------------------------------------------------
        // Auxiliary J: Jn32 = AuxJ1 * Jn12 + AuxJ2 * En
        cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha,
                                  &AuxJ1_d[matrixOffset * strideA], m, strideA,
                                  &Jn12_d[matrixOffset * strideB], n, strideB, &beta,
                                  &Jn32_d[matrixOffset * strideC], m, strideC, nMatrices);
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
        beta = 1.0f;
        cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha,
                                  &AuxJ2_d[matrixOffset * strideA], m, strideA,
                                  &En_d[classOffset * strideB], n, strideB, &beta,
                                  &Jn32_d[matrixOffset * strideC], m, strideC, nMatrices);
        CUDA_SAFE_CALL(cudaDeviceSynchronize());

        // --------------------------------------------------------
        // En1 = Loc1E_PML * En + Loc2E_PML * Hn12
        alpha = 1.0f;
        beta = 0.0f;
        cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha,
                                  &Loc1E_PML_d[matrixOffset * strideA], m, strideA,
                                  &En_d[classOffset * strideB], n, strideB, &beta,
                                  &En1_d[classOffset * strideC], m, strideC, nMatrices);
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
        beta = 1.0f;
        cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha,
                                  &Loc2E_PML_d[matrixOffset * strideA], m, strideA,
                                  &Hn12_d[classOffset * strideB], n, strideB, &beta,
                                  &En1_d[classOffset * strideC], m, strideC, nMatrices);
        CUDA_SAFE_CALL(cudaDeviceSynchronize());

        // Add Auxiliary J term: En1 += AuxE * Jn32
        beta = 1.0f;
        cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha,
                                  &AuxE_d[matrixOffset * strideA], m, strideA,
                                  &Jn32_d[matrixOffset * strideB], n, strideB, &beta,
                                  &En1_d[classOffset * strideC], m, strideC, nMatrices);
        CUDA_SAFE_CALL(cudaDeviceSynchronize());

        // Coupling Matrices
        nMatrices = classNeighPML_h[class_i];
        matrixOffset = classNeighPMLOffset_loc_h[class_i];
        m = TetPolyOrderDim[PolyFlag]; //rows of A
        n = FacePolyOrderDim[PolyFlag]; //rows of B and cols of A
        strideA = m * n;
        strideB = n;
        strideC = m;
        numBlocks = (nMatrices * n + blockSize - 1) / blockSize;

        //Implement 3D if tetras over blocksize * (2^(31) - 1)
        makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * n], Hn12_d, auxFieldInput, nMatrices * n);
        beta = 0.0f;
        cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha,
                                  &Neigh1E_PML_d[matrixOffset * strideA], m, strideA,
                                  auxFieldInput, n, strideB, &beta,
                                  auxFieldOutput, m, strideC, nMatrices);

        //Implement 3D if tetras over blocksize * (2^(31) - 1)
        makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * n], En_d, auxFieldInput, nMatrices * n);
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
        beta = 1.0f;
        cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha,
                                  &Neigh2E_PML_d[matrixOffset * strideA], m, strideA,
                                  auxFieldInput, n, strideB, &beta,
                                  auxFieldOutput, m, strideC, nMatrices);
        CUDA_SAFE_CALL(cudaDeviceSynchronize());

        int blockY = (blockSize + TetPolyOrderDim[PolyFlag] - 1) / TetPolyOrderDim[PolyFlag];
        dim3 blockDim(TetPolyOrderDim[PolyFlag], blockY, 1);
        numBlocks = (PMLTetras + blockY - 1) / blockY;
        //Implement 3D if tetras over blocksize * (2^(31) - 1)
        addCouplingResults<<<numBlocks, blockDim>>>(&En1_d[classPMLTetraOffset_h[class_i] * TetPolyOrderDim[PolyFlag]], auxFieldOutput,
                                                    &Neighbours_d[classOffset], &NeighboursOffset_d[classOffset],
                                                    PMLTetras);
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
    }

    // ------------------------------------------------------------------
    // Regular PML groups
    // ------------------------------------------------------------------
    if(regularRegionFlag && classRegularPMLGroupsCnt_h[class_i] > 0) {
        for(int i = 0; i < classRegularPMLGroupsCnt_h[class_i]; i++) {
            int groupID = classRegularPMLGroupsId_h[class_i][i];
            int groupElements = classRegularTetraCnt_h[class_i * regularCNT + groupID];
            // groupOffset indexes the global field arrays (En/Hn12/En1);
            // aux_offset indexes the regular-PML auxiliary arrays (r_Jn12/r_Jn32).
            int groupOffset = classOffset + nonRegularPMLTetraCnt_h[class_i] + classRegularPMLTetraOffset_h[class_i][i];
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            int local_index = groupID - 1 - regularCNT_Normal;
            int aux_offset = classRegularPMLTetraOffset_h[class_i][i];

            // Local Matrices
            int m = TetPolyOrderDim[PolyFlag]; //rows of A
            int n = TetPolyOrderDim[PolyFlag]; //rows of B and cols of A

            // --------------------------------------------------------
            // Auxiliary J: r_Jn32 = Loc1J * r_Jn12 + Loc2J * En
            float alpha = 1.0f;
            float beta = 0.0f;
            cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha,
                        &regularPMLLoc1J_d[(local_index) * m * n], m,
                        &r_Jn12_d[aux_offset * n], n, &beta,
                        &r_Jn32_d[aux_offset * m], m);
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            beta = 1.0f;
            // En_d is a global field array, so it is addressed with groupOffset like
            // every other En_d/Hn12_d access for this group (it was previously read
            // at the auxiliary-array offset aux_offset).
            cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha,
                        &regularPMLLoc2J_d[(local_index) * m * n], m,
                        &En_d[groupOffset * n], n, &beta,
                        &r_Jn32_d[aux_offset * m], m);

            // --------------------------------------------------------
            // En1 = Loc1E * En + Loc2E * Hn12 + AuxE * r_Jn32
            beta = 0.0f;
            cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha,
                        &regularPMLLoc1E_d[(local_index) * m * n], m,
                        &En_d[groupOffset * n], n, &beta,
                        &En1_d[groupOffset * m], m);
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            beta = 1.0f;
            cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha,
                        &regularPMLLoc2E_d[(local_index) * m * n], m,
                        &Hn12_d[groupOffset * n], n, &beta,
                        &En1_d[groupOffset * m], m);
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            beta = 1.0f;
            cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha,
                        &regularPMLAuxE_d[(local_index) * m * n], m,
                        &r_Jn32_d[aux_offset * n], n, &beta,
                        &En1_d[groupOffset * m], m);
            CUDA_SAFE_CALL(cudaDeviceSynchronize());

            // Coupling Matrices
            int regularNeighOffset = neighOffset + classNeighPML_h[class_i] + classRegularPMLTetraOffset_h[class_i][i] * NumOfFaces;
            m = TetPolyOrderDim[PolyFlag]; //rows of A
            n = FacePolyOrderDim[PolyFlag]; //rows of B and cols of A
            numBlocks = (groupElements * n * NumOfFaces + blockSize - 1) / blockSize;
            makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * n], Hn12_d, auxFieldInput, groupElements, PolyFlag);

            long long int strideA = m * n;
            long long int strideB = n * groupElements;
            long long int strideC = m * groupElements;
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            beta = 0.0f;
            cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha,
                                      &regularPMLNeigh1E_d[(local_index) * NumOfFaces * strideA], m, strideA,
                                      auxFieldInput, n, strideB, &beta,
                                      auxFieldOutput, m, strideC, NumOfFaces);

            makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * n], En_d, auxFieldInput, groupElements, PolyFlag);
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            beta = 1.0f;
            cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha,
                                      &regularPMLNeigh2E_d[(local_index) * NumOfFaces * strideA], m, strideA,
                                      auxFieldInput, n, strideB, &beta,
                                      auxFieldOutput, m, strideC, NumOfFaces);

            int blockY = (blockSize + TetPolyOrderDim[PolyFlag] - 1) / TetPolyOrderDim[PolyFlag];
            dim3 blockDim(TetPolyOrderDim[PolyFlag], blockY, 1);
            numBlocks = (groupElements + blockY - 1) / blockY;
            addCouplingResultsRegular<<<numBlocks, blockDim>>>(&En1_d[groupOffset * TetPolyOrderDim[PolyFlag]], auxFieldOutput, groupElements);
        }
    }
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    /* Per-class alternative to the full-array copy below (kept for reference):
    int total_tets = ClassTetraCnt[class_i] + ClassPMLTetraCnt[class_i];
    int offset = ClassTetraOffset[class_i];
    CUDA_SAFE_CALL(cudaMemcpy(&En_d[offset * TetPolyOrderDim[PolyFlag]], &En1_d[offset * TetPolyOrderDim[PolyFlag]], total_tets * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
    */

    // Roll the field forward: En <- En1 for the whole mesh.
    CUDA_SAFE_CALL(cudaMemcpy(&En_d[0], &En1_d[0], tetraCNT * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // Roll the irregular-PML auxiliary variable forward: Jn12 <- Jn32 for this class.
    if(nonRegularPMLTetraCnt_h[class_i] > 0) {
        int num_PML_tets = nonRegularPMLTetraCnt_h[class_i];
        int matrixOffset = classPMLTetraOffset_loc_h[class_i];
        CUDA_SAFE_CALL(cudaMemcpy(&Jn12_d[matrixOffset * TetPolyOrderDim[PolyFlag]], &Jn32_d[matrixOffset * TetPolyOrderDim[PolyFlag]], num_PML_tets * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
    }

    // Roll the regular-PML auxiliary variable forward. The regular groups above
    // read r_Jn12_d and write r_Jn32_d, so those are the buffers to roll (the
    // copy previously targeted the irregular Jn12_d/Jn32_d buffers).
    // NOTE(review): assumes r_Jn12_d/r_Jn32_d hold numRegPMLTetras elements - confirm at the allocation site.
    if(classRegularPMLGroupsCnt_h[class_i] > 0) {
        int num_PML_tets = numRegPMLTetras;
        CUDA_SAFE_CALL(cudaMemcpy(&r_Jn12_d[0], &r_Jn32_d[0], num_PML_tets * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
    }
}

void FemGrp::LH_CuBLAS(cublasHandle_t handle, int class_i){
    const int Q = GAUSS_POINT_NUM_h[PolyFlag]; // same as GPU kernel uses
    int irregularTetras = nonRegularTetraCnt_h[class_i];
    int classOffset = ClassTetraOffset[class_i];
    int neighOffset = NeighClassOffset_h[class_i];
    int blockSize = 256; //optimal number
    int numBlocks;
    if(irregularTetras > 0) {
        // Local Mattrices
        int nMatrices = irregularTetras;
        int matrixOffset = classTetraOffset_loc_h[class_i];
        int m = TetPolyOrderDim[PolyFlag]; //rows of A
        int n = TetPolyOrderDim[PolyFlag]; //rows of B and cols of A
        long long int strideA = m * n;
        long long int strideB = n;
        long long int strideC = m;
        float alpha = 1.0;
        float beta = 0.0;
        cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha, &Loc1H_d[matrixOffset * strideA], m, strideA, &Hn12_d[classOffset * strideB], n, strideB, &beta, &Hn32_d[classOffset * strideC], m, strideC, nMatrices);
        beta = 1.0;
        cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha, &Loc2H_d[matrixOffset * strideA], m, strideA, &En_d[classOffset * strideB], n, strideB, &beta, &Hn32_d[classOffset * strideC], m, strideC, nMatrices);
        if(ClassExcitationCount[class_i] > 0){
            nMatrices = ClassExcitationCount[class_i];
            matrixOffset = ClassExcitationOffset[class_i];
            numBlocks = (nMatrices + blockSize - 1) / blockSize;
            fp_t_ts dt = LocTimeSteps[class_i];
            fp_t_ts t = (LocalExciIndexH[class_i] + 1.0) *
dt; LocalExciIndexH[class_i]++; if (PWorPort == 0) { if (interior_excitation_flag) { addExcitationH_PML<<>>(&ExcitationFacesCnt_d[matrixOffset], &ExcitationFacesOffset_d[matrixOffset], ExcitationFacesNum_d, nMatrices, ClassExcitation_sc_CNT[class_i], &mapH_d[matrixOffset * strideC], excitationProp, PolyFlag, dt / Uo, t, &nd_coords_tet_d[4 * 3 * matrixOffset], nd_coords_face_d, Z_face_pw_d, &InvH_d[strideA * matrixOffset], &Hn32_d[classOffset * strideC]); } else { addExcitationH<<>>(&ExcitationFacesCnt_d[matrixOffset], &ExcitationFacesOffset_d[matrixOffset], ExcitationFacesNum_d, nMatrices, &mapH_d[matrixOffset * strideC], excitationProp, PolyFlag, dt / Uo, t, &nd_coords_tet_d[4 * 3 * matrixOffset], nd_coords_face_d, Z_face_pw_d, &InvH_d[strideA * matrixOffset], &Hn32_d[classOffset * strideC]); } } else { addExcitationH_port<<>>(&ExcitationFacesCnt_d[matrixOffset], &ExcitationFacesOffset_d[matrixOffset], ExcitationFacesNum_d, nMatrices, &mapH_d[matrixOffset * strideC], ExcitationProps_d, PortFacePidx_d, PolyFlag, dt / Uo, t, &nd_coords_tet_d[4 * 3 * matrixOffset], nd_coords_face_d, &InvH_d[strideA * matrixOffset], &Hn32_d[classOffset * strideC]); } } // Coupling Matrices nMatrices = classNeighIrregular_h[class_i]; matrixOffset = classNeighOffset_loc_h[class_i]; m = TetPolyOrderDim[PolyFlag]; //rows of A n = FacePolyOrderDim[PolyFlag]; //rows of B and cols of A strideA = m * n; strideB = n; strideC = m; numBlocks = (nMatrices * n + blockSize - 1) / blockSize; makeNeighField<<>>(&NeighMap_d[neighOffset * n], En_d, auxFieldInput, nMatrices * n); //Implement 3D if tetras over blocksize * (2^(31) - 1) beta = 0.0; cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha, &Neigh1H_d[matrixOffset * strideA], m, strideA, auxFieldInput, n, strideB, &beta, auxFieldOutput, m, strideC, nMatrices); makeNeighField<<>>(&NeighMap_d[neighOffset * n], Hn12_d, auxFieldInput, nMatrices * n); //Implement 3D if tetras over blocksize * (2^(31) - 1) beta = 1.0; 
cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha, &Neigh2H_d[matrixOffset * strideA], m, strideA, auxFieldInput, n, strideB, &beta, auxFieldOutput, m, strideC, nMatrices); int blockY = (blockSize + TetPolyOrderDim[PolyFlag] - 1) / TetPolyOrderDim[PolyFlag]; dim3 blockDim(TetPolyOrderDim[PolyFlag], blockY, 1); numBlocks = (irregularTetras + blockY - 1) / blockY; addCouplingResults<<>>(&Hn32_d[classOffset * TetPolyOrderDim[PolyFlag]], auxFieldOutput, &Neighbours_d[classOffset], &NeighboursOffset_d[classOffset], irregularTetras); //Implement 3D if tetras over blocksize * (2^(31) - 1) } // -------------------------------------------------------------------------------------------------- CUDA_SAFE_CALL(cudaDeviceSynchronize()); if(regularRegionFlag && classRegularGroupsCnt_h[class_i] > 0) { for(int i = 0; i < classRegularGroupsCnt_h[class_i]; i++) { int groupID = classRegularGroupsId_h[class_i][i]; int groupElements = classRegularTetraCnt_h[class_i * regularCNT + groupID]; int groupOffset = classOffset + nonRegularTetraCnt_h[class_i] + classRegularTetraOffset_h[class_i][i]; // Local Matrices int m = TetPolyOrderDim[PolyFlag]; //rows of A int n = TetPolyOrderDim[PolyFlag]; //rows of B and cols of A float alpha = 1.0; float beta = 0.0; cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha, ®ularLoc1H_d[(groupID - 1) * m * n], m, &Hn12_d[groupOffset * n], n, &beta, &Hn32_d[groupOffset * m], m); beta = 1.0; cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha, ®ularLoc2H_d[(groupID - 1) * m * n], m, &En_d[groupOffset * n], n, &beta, &Hn32_d[groupOffset * m], m); // Coupling Matrices int regularNeighOffset = neighOffset + classNeighIrregular_h[class_i] + classRegularTetraOffset_h[class_i][i] * NumOfFaces; m = TetPolyOrderDim[PolyFlag]; //rows of A n = FacePolyOrderDim[PolyFlag]; //rows of B and cols of A numBlocks = (groupElements * n * NumOfFaces + blockSize - 1) / blockSize; 
makeNeighFieldRegular<<>>(&NeighMap_d[regularNeighOffset * n], En_d, auxFieldInput, groupElements, PolyFlag); long long int strideA = m * n; long long int strideB = n * groupElements; long long int strideC = m * groupElements; beta = 0.0; cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha, ®ularNeigh1H_d[(groupID - 1) * NumOfFaces * strideA], m, strideA, auxFieldInput, n, strideB, &beta, auxFieldOutput, m, strideC, NumOfFaces); makeNeighFieldRegular<<>>(&NeighMap_d[regularNeighOffset * n], Hn12_d, auxFieldInput, groupElements, PolyFlag); beta = 1.0; cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha, ®ularNeigh2H_d[(groupID - 1) * NumOfFaces * strideA], m, strideA, auxFieldInput, n, strideB, &beta, auxFieldOutput, m, strideC, NumOfFaces); int blockY = (blockSize + TetPolyOrderDim[PolyFlag] - 1) / TetPolyOrderDim[PolyFlag]; dim3 blockDim(TetPolyOrderDim[PolyFlag], blockY, 1); numBlocks = (groupElements + blockY - 1) / blockY; addCouplingResultsRegular<<>>(&Hn32_d[groupOffset * TetPolyOrderDim[PolyFlag]], auxFieldOutput, groupElements); CUDA_SAFE_CALL(cudaDeviceSynchronize()); // make sure prior kernels/GEMMs finished } } // ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- // -------------------------------------------- // PML Section int PMLTetras = nonRegularPMLTetraCnt_h[class_i]; classOffset = classPMLTetraOffset_h[class_i]; neighOffset = classNeighPMLOffset_h[class_i]; if(PMLTetras > 0) { // Local Mattrices int nMatrices = PMLTetras; int matrixOffset = classPMLTetraOffset_loc_h[class_i]; int m = TetPolyOrderDim[PolyFlag]; //rows of A int n = TetPolyOrderDim[PolyFlag]; //rows of B and cols of A long long int strideA = m * n; long long int strideB = n; long long int strideC = m; float alpha = 1.0; float beta = 0.0; // 
-------------------------------------------------------- // Auxilliary M cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha, &AuxM1_d[matrixOffset * strideA], m, strideA, &Mn_d[matrixOffset * strideB], n, strideB, &beta, &Mn1_d[matrixOffset * strideC], m, strideC, nMatrices); CUDA_SAFE_CALL(cudaDeviceSynchronize()); beta = 1.0; cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha, &AuxM2_d[matrixOffset * strideA], m, strideA, &Hn12_d[classOffset * strideB], n, strideB, &beta, &Mn1_d[matrixOffset * strideC], m, strideC, nMatrices); // -------------------------------------------------------- alpha = 1.0; beta = 0.0; cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha, &Loc1H_PML_d[matrixOffset * strideA], m, strideA, &Hn12_d[classOffset * strideB], n, strideB, &beta, &Hn32_d[classOffset * strideC], m, strideC, nMatrices); beta = 1.0; cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha, &Loc2H_PML_d[matrixOffset * strideA], m, strideA, &En_d[classOffset * strideB], n, strideB, &beta, &Hn32_d[classOffset * strideC], m, strideC, nMatrices); // Add Auxilliary Term M beta = 1.0; cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha, &AuxH_d[matrixOffset * strideA], m, strideA, &Mn1_d[matrixOffset * strideB], n, strideB, &beta, &Hn32_d[classOffset * strideC], m, strideC, nMatrices); // Coupling Matrices nMatrices = classNeighPML_h[class_i]; matrixOffset = classNeighPMLOffset_loc_h[class_i]; // cout << start << " " << nMatrices << " " << start + nMatrices << endl; m = TetPolyOrderDim[PolyFlag]; //rows of A n = FacePolyOrderDim[PolyFlag]; //rows of B and cols of A strideA = m * n; strideB = n; strideC = m; numBlocks = (nMatrices * n + blockSize - 1) / blockSize; makeNeighField<<>>(&NeighMap_d[neighOffset * n], En_d, auxFieldInput, nMatrices * n); //Implement 3D if tetras over blocksize * (2^(31) - 1) beta = 0.0; cublasSgemmStridedBatched(handle, CUBLAS_OP_N, 
CUBLAS_OP_N, m, 1, n, &alpha, &Neigh1H_PML_d[matrixOffset * strideA], m, strideA, auxFieldInput, n, strideB, &beta, auxFieldOutput, m, strideC, nMatrices); makeNeighField<<>>(&NeighMap_d[neighOffset * n], Hn12_d, auxFieldInput, nMatrices * n); //Implement 3D if tetras over blocksize * (2^(31) - 1) beta = 1.0; cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, 1, n, &alpha, &Neigh2H_PML_d[matrixOffset * strideA], m, strideA, auxFieldInput, n, strideB, &beta, auxFieldOutput, m, strideC, nMatrices); int blockY = (blockSize + TetPolyOrderDim[PolyFlag] - 1) / TetPolyOrderDim[PolyFlag]; dim3 blockDim(TetPolyOrderDim[PolyFlag], blockY, 1); numBlocks = (PMLTetras + blockY - 1) / blockY; //Implement 3D if tetras over blocksize * (2^(31) - 1) addCouplingResults<<>>(&Hn32_d[classPMLTetraOffset_h[class_i] * TetPolyOrderDim[PolyFlag]], auxFieldOutput, &Neighbours_d[classOffset], &NeighboursOffset_d[classOffset], PMLTetras); CUDA_SAFE_CALL(cudaDeviceSynchronize()); } if(regularRegionFlag && classRegularPMLGroupsCnt_h[class_i] > 0) { for(int i = 0; i < classRegularPMLGroupsCnt_h[class_i]; i++) { int groupID = classRegularPMLGroupsId_h[class_i][i]; int groupElements = classRegularTetraCnt_h[class_i * regularCNT + groupID]; int groupOffset = classOffset + nonRegularPMLTetraCnt_h[class_i] + classRegularPMLTetraOffset_h[class_i][i]; CUDA_SAFE_CALL(cudaDeviceSynchronize()); int local_index = groupID - 1 - regularCNT_Normal; int aux_offset = classRegularPMLTetraOffset_h[class_i][i]; // Local Matrices int m = TetPolyOrderDim[PolyFlag]; //rows of A int n = TetPolyOrderDim[PolyFlag]; //rows of B and cols of A // -------------------------------------------------------- // Auxilliary M float alpha = 1.0; float beta = 0.0; cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha, ®ularPMLLoc1M_d[(local_index) * m * n], m, &r_Mn_d[aux_offset * n], n, &beta, &r_Mn1_d[aux_offset * m], m); CUDA_SAFE_CALL(cudaDeviceSynchronize()); beta = 1.0; cublasSgemm(handle, 
CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha, ®ularPMLLoc2M_d[(local_index) * m * n], m, &Hn12_d[aux_offset * n], n, &beta, &r_Mn1_d[aux_offset * m], m); CUDA_SAFE_CALL(cudaDeviceSynchronize()); // -------------------------------------------------------- beta = 0.0; cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha, ®ularPMLLoc1H_d[(local_index) * m * n], m, &Hn12_d[groupOffset * n], n, &beta, &Hn32_d[groupOffset * m], m); CUDA_SAFE_CALL(cudaDeviceSynchronize()); beta = 1.0; cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha, ®ularPMLLoc2H_d[(local_index) * m * n], m, &En_d[groupOffset * n], n, &beta, &Hn32_d[groupOffset * m], m); CUDA_SAFE_CALL(cudaDeviceSynchronize()); beta = 1.0; cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha, ®ularPMLAuxH_d[(local_index) * m * n], m, &r_Mn1_d[aux_offset * n], n, &beta, &Hn32_d[groupOffset * m], m); CUDA_SAFE_CALL(cudaDeviceSynchronize()); // Coupling Matrices int regularNeighOffset = neighOffset + classNeighPML_h[class_i] + classRegularPMLTetraOffset_h[class_i][i] * NumOfFaces; m = TetPolyOrderDim[PolyFlag]; //rows of A n = FacePolyOrderDim[PolyFlag]; //rows of B and cols of A numBlocks = (groupElements * n * NumOfFaces + blockSize - 1) / blockSize; makeNeighFieldRegular<<>>(&NeighMap_d[regularNeighOffset * n], En_d, auxFieldInput, groupElements, PolyFlag); long long int strideA = m * n; long long int strideB = n * groupElements; long long int strideC = m * groupElements; CUDA_SAFE_CALL(cudaDeviceSynchronize()); beta = 0.0; cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha, ®ularPMLNeigh1H_d[(local_index) * NumOfFaces * strideA], m, strideA, auxFieldInput, n, strideB, &beta, auxFieldOutput, m, strideC, NumOfFaces); makeNeighFieldRegular<<>>(&NeighMap_d[regularNeighOffset * n], Hn12_d, auxFieldInput, groupElements, PolyFlag); CUDA_SAFE_CALL(cudaDeviceSynchronize()); beta = 1.0; 
// NOTE(review): everything down to the first closing brace at column 0 is the tail of a routine
// whose beginning lies above this chunk. It is carried over token-for-token and only re-wrapped.
// Two tokens look damaged in this text and should be checked against the original file:
// `®ularPMLNeigh2H_d` (presumably `&regularPMLNeigh2H_d`) and the empty `<<>>` launch
// configuration of addCouplingResultsRegular.
            cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, groupElements, n, &alpha, ®ularPMLNeigh2H_d[(local_index) * NumOfFaces * strideA], m, strideA, auxFieldInput, n, strideB, &beta, auxFieldOutput, m, strideC, NumOfFaces);
            int blockY = (blockSize + TetPolyOrderDim[PolyFlag] - 1) / TetPolyOrderDim[PolyFlag];
            dim3 blockDim(TetPolyOrderDim[PolyFlag], blockY, 1);
            numBlocks = (groupElements + blockY - 1) / blockY;
            addCouplingResultsRegular<<>>(&Hn32_d[groupOffset * TetPolyOrderDim[PolyFlag]], auxFieldOutput, groupElements);
        }
    }
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
    /*
    int total_tets = ClassTetraCnt[class_i] + ClassPMLTetraCnt[class_i];
    int offset = ClassTetraOffset[class_i];
    CUDA_SAFE_CALL(cudaMemcpy(&Hn12_d[ offset * TetPolyOrderDim[PolyFlag]], &Hn32_d[offset * TetPolyOrderDim[PolyFlag]], total_tets * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
    cudaDeviceSynchronize();
    */
    CUDA_SAFE_CALL(cudaMemcpy(&Hn12_d[0], &Hn32_d[0], tetraCNT * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
    cudaDeviceSynchronize();
    if(nonRegularPMLTetraCnt_h[class_i] > 0)
    {
        int num_PML_tets = nonRegularPMLTetraCnt_h[class_i];
        int matrixOffset = classPMLTetraOffset_loc_h[class_i];
        CUDA_SAFE_CALL(cudaMemcpy(&Mn_d[matrixOffset * TetPolyOrderDim[PolyFlag]], &Mn1_d[matrixOffset * TetPolyOrderDim[PolyFlag]], num_PML_tets * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
    }
    if(classRegularPMLGroupsCnt_h[class_i] > 0)
    {
        int num_PML_tets = numRegPMLTetras;
        CUDA_SAFE_CALL(cudaMemcpy(&r_Mn_d[0], &r_Mn1_d[0], num_PML_tets * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
    }
}

namespace {
// Frees one device allocation and nulls the owning pointer, so that a repeated release
// degenerates to cudaFree(nullptr) instead of handing a stale address to the runtime.
template <typename T>
inline void femgrpReleaseDevicePtr(T*& devPtr)
{
    CUDA_SAFE_CALL(cudaFree(devPtr));
    devPtr = nullptr;
}
} // namespace

// Releases the device buffers listed below and resets each pointer to nullptr.
// Buffers guarded by PlaneWaveBCFlag / regularRegionFlag are only released when the
// corresponding flag is set, mirroring the conditions used here originally.
// NOTE(review): NeighboursOffset_d and the PML buffers used by the time-stepping code
// (Mn_d, Mn1_d, r_Mn_d, ...) are not released here - confirm they are freed elsewhere.
void FemGrp::FreeGPU(){
    femgrpReleaseDevicePtr(mapE_d);
    femgrpReleaseDevicePtr(mapH_d);

    femgrpReleaseDevicePtr(ExcitationFacesCnt_d);
    femgrpReleaseDevicePtr(ExcitationFacesOffset_d);
    femgrpReleaseDevicePtr(ExcitationFacesNum_d);

    femgrpReleaseDevicePtr(nd_coords_tet_d);
    femgrpReleaseDevicePtr(nd_coords_face_d);
    if(PlaneWaveBCFlag){
        femgrpReleaseDevicePtr(Z_face_pw_d);
    }

    femgrpReleaseDevicePtr(InvE_d);
    femgrpReleaseDevicePtr(InvH_d);
    femgrpReleaseDevicePtr(Loc1E_d);
    femgrpReleaseDevicePtr(Loc2E_d);
    femgrpReleaseDevicePtr(Loc1H_d);
    femgrpReleaseDevicePtr(Loc2H_d);
    femgrpReleaseDevicePtr(Neigh1E_d);
    femgrpReleaseDevicePtr(Neigh2E_d);
    femgrpReleaseDevicePtr(Neigh1H_d);
    femgrpReleaseDevicePtr(Neigh2H_d);

    if(regularRegionFlag){
        femgrpReleaseDevicePtr(regularLoc1E_d);
        femgrpReleaseDevicePtr(regularLoc2E_d);
        femgrpReleaseDevicePtr(regularLoc1H_d);
        femgrpReleaseDevicePtr(regularLoc2H_d);
        femgrpReleaseDevicePtr(regularNeigh1E_d);
        femgrpReleaseDevicePtr(regularNeigh2E_d);
        femgrpReleaseDevicePtr(regularNeigh1H_d);
        femgrpReleaseDevicePtr(regularNeigh2H_d);
    }

    femgrpReleaseDevicePtr(En_d);
    femgrpReleaseDevicePtr(En1_d);
    femgrpReleaseDevicePtr(Hn12_d);
    femgrpReleaseDevicePtr(Hn32_d);

    femgrpReleaseDevicePtr(NeighMap_d);
    femgrpReleaseDevicePtr(Neighbours_d);
    femgrpReleaseDevicePtr(auxFieldInput);
    femgrpReleaseDevicePtr(auxFieldOutput);
}

// Refactored by Qi Jian to build Octree of the tetrahedrals
//
// Builds octree_object over all tetrahedra in tetARRAY:
//   1. one axis-aligned bounding box per tetrahedron, with its half-extents scaled by buffer_factor;
//   2. the global box enclosing all of those boxes;
//   3. an octree whose depth follows from the largest global extent relative to 100 wavelengths
//      (never less than 1).
// prjName is not used by this routine. When non_Conformal_flag is set, the ids in ncARRAY are
// handed to the builder as well.
void FemGrp::initializeOctree(std::string prjName, bool non_Conformal_flag)
{
    cout << "========================================================== \n";

    // Initialize octree object
    octree_object = Octree();

    // Compute AABB for each tetrahedron
    std::cout << "Compute AABB for tetrahedral (with buffer)" << std::endl;
    octree_object.tetra_boxes.resize(tetraCNT);

    // Multiplier applied to the half-extents of every box: 2.0 doubles each box about its centre.
    const double buffer_factor = 2.0;

    #pragma omp parallel for
    for (int tet_id = 0; tet_id < tetraCNT; ++tet_id)
    {
        const tetra& tet = tetARRAY[tet_id];

        double x[4], y[4], z[4];
        for (int i = 0; i < 4; ++i)
        {
            x[i] = tet.nd[i]->getCoord().getx();
            y[i] = tet.nd[i]->getCoord().gety();
            z[i] = tet.nd[i]->getCoord().getz();
        }

        const double xmin = std::min({x[0], x[1], x[2], x[3]});
        const double xmax = std::max({x[0], x[1], x[2], x[3]});
        const double ymin = std::min({y[0], y[1], y[2], y[3]});
        const double ymax = std::max({y[0], y[1], y[2], y[3]});
        const double zmin = std::min({z[0], z[1], z[2], z[3]});
        const double zmax = std::max({z[0], z[1], z[2], z[3]});

        // Centre and buffered half-sizes
        const double cx = 0.5 * (xmin + xmax);
        const double cy = 0.5 * (ymin + ymax);
        const double cz = 0.5 * (zmin + zmax);
        const double hx = 0.5 * (xmax - xmin) * buffer_factor;
        const double hy = 0.5 * (ymax - ymin) * buffer_factor;
        const double hz = 0.5 * (zmax - zmin) * buffer_factor;

        // Store expanded box
        AABB box;
        box.xmin = cx - hx;  box.xmax = cx + hx;
        box.ymin = cy - hy;  box.ymax = cy + hy;
        box.zmin = cz - hz;  box.zmax = cz + hz;
        octree_object.tetra_boxes[tet_id] = box;
    }

    std::cout << "Compute global bounding box" << std::endl;

    // All the tetrahedra IDs
    std::vector<int> all_tet_ids(tetraCNT);
    std::iota(all_tet_ids.begin(), all_tet_ids.end(), 0);

    // All the non-conformal tetrahedra IDs (only filled when they are going to be used)
    std::vector<int> all_NC_tet_ids;
    if (non_Conformal_flag)
    {
        std::cout << "Store non-conformal tetrahedra IDs" << std::endl;
        all_NC_tet_ids.assign(ncARRAY, ncARRAY + nonConformalCNT);
    }

    // Start from an inverted box so the first min/max below always replaces the sentinels.
    // The limits are taken in the AABB's own coordinate type to avoid any narrowing.
    using box_coord_t = decltype(AABB::xmin);
    AABB global_box {
        .xmin = std::numeric_limits<box_coord_t>::max(),
        .xmax = std::numeric_limits<box_coord_t>::lowest(),
        .ymin = std::numeric_limits<box_coord_t>::max(),
        .ymax = std::numeric_limits<box_coord_t>::lowest(),
        .zmin = std::numeric_limits<box_coord_t>::max(),
        .zmax = std::numeric_limits<box_coord_t>::lowest()
    };

    for (const auto& box : octree_object.tetra_boxes)
    {
        global_box.xmin = std::min(global_box.xmin, box.xmin);
        global_box.xmax = std::max(global_box.xmax, box.xmax);
        global_box.ymin = std::min(global_box.ymin, box.ymin);
        global_box.ymax = std::max(global_box.ymax, box.ymax);
        global_box.zmin = std::min(global_box.zmin, box.zmin);
        global_box.zmax = std::max(global_box.zmax, box.zmax);
    }

    std::cout << "Global Bounding Box:" << std::endl;
    std::cout << " xmin = " << global_box.xmin << ", xmax = " << global_box.xmax << std::endl;
    std::cout << " ymin = " << global_box.ymin << ", ymax = " << global_box.ymax << std::endl;
    std::cout << " zmin = " << global_box.zmin << ", zmax = " << global_box.zmax << std::endl;

    const fp_t x_range = (global_box.xmax - global_box.xmin);
    const fp_t y_range = (global_box.ymax - global_box.ymin);
    const fp_t z_range = (global_box.zmax - global_box.zmin);
    const fp_t max_range = std::max({x_range, y_range, z_range});

    const fp_t wavelength = 3e8 / (freq * 1e6);
    const double box_size = 100.0 * wavelength; // or any desired multiple of the wavelength
    const int min_depth = 1;                    // or 2, etc.

    // depth = max(min_depth, ceil(log2(max_range / box_size))).
    // The logarithm is only evaluated for a finite ratio above 1: for smaller ratios the result
    // would be clamped to min_depth anyway, and for a zero or non-finite ratio (e.g. freq == 0,
    // empty mesh) converting log2's result to int would be undefined behaviour.
    const double range_in_boxes = max_range / box_size;
    int octree_depth = min_depth;
    if (std::isfinite(range_in_boxes) && range_in_boxes > 1.0)
    {
        octree_depth = std::max(min_depth, static_cast<int>(std::ceil(std::log2(range_in_boxes))));
    }

    const double buffer_distance = wavelength / 2.0;

    std::cout << "Max Range = " << max_range << " | Wavelength = " << wavelength << std::endl;
    std::cout << "Compute octree with octree depth = " << octree_depth << std::endl;

    if (non_Conformal_flag)
    {
        octree_object.buildOctree_withNCFLAGS(all_tet_ids, all_NC_tet_ids, global_box, buffer_distance, 0, octree_depth);
    }
    else
    {
        octree_object.buildOctree(all_tet_ids, global_box, buffer_distance, 0, octree_depth);
    }

    // Link tetrahedron memory
    octree_object.tet_ptr = tetARRAY;
    octree_object.tet_count = tetraCNT;

    std::cout << "Octree build completed" << std::endl;
    cout << "========================================================== \n";
}

// Find the Barycentric coordinates of the probes
//
// For every node of outputMesh, asks the octree which tetrahedra contain it and stores the
// result in tri_nodes_bary[node]: .first is the number of hits (or -1 when nothing usable was
// found), .second the hits themselves. Every node without a hit is reported on stderr together
// with its coordinates, and the program then terminates with EXIT_FAILURE.
void FemGrp::computeBarycentricEmbedding()
{
    std::cout << "Compute the Barycentric center of the nodes" << std::endl;

    const int num_nodes = outputMesh.num_nodes;
    const double tol = 1e-8;

    // The table is indexed by node id below; make sure it is large enough even when the
    // caller has not sized it beforehand.
    if (num_nodes > 0 && tri_nodes_bary.size() < static_cast<size_t>(num_nodes))
    {
        tri_nodes_bary.resize(num_nodes);
    }

    //#pragma omp parallel for schedule(dynamic)
    for (int node_id = 0; node_id < num_nodes; ++node_id)
    {
        const auto node_xyz = outputMesh.getNode(node_id);
        double probe_xyz[3] = {node_xyz[0], node_xyz[1], node_xyz[2]};

        // Same type as the table's hit list, so it can be handed over without a copy.
        decltype(tri_nodes_bary[node_id].second) found_tets;
        const bool success = octree_object.findTetraInOctree(probe_xyz, found_tets, tol);

        // A "successful" search with no hits is treated as a miss: consumers average over
        // the hit count and would otherwise divide by zero.
        if (success && !found_tets.empty())
        {
            tri_nodes_bary[node_id].first = static_cast<int>(found_tets.size());
            tri_nodes_bary[node_id].second.swap(found_tets);
        }
        else
        {
            tri_nodes_bary[node_id].first = -1;
        }
    }

    // Report and verify
    bool error_flag = false;
    for (int i = 0; i < num_nodes; ++i)
    {
        if (tri_nodes_bary[i].first >= 0)
        {
            continue;
        }
        std::cerr << "Node " << i << " not found in simulation domain" << std::endl;
        const auto node_xyz = outputMesh.getNode(i);
        std::cerr << node_xyz[0] << " " << node_xyz[1] << " " << node_xyz[2] << std::endl;
        error_flag = true;
    }

    if (error_flag)
    {
        std::cerr << "Error: Some nodes were not found in the simulation domain. Exiting." << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Refactored by Qi Jian to initialize the output surface mesh
// Note that the octree have to be built before calling this function
//
// Reads "./<prjName>_out.tri" into outputMesh, computes its triangle normals, prints a summary,
// sizes tri_nodes_bary to the node count and fills it via computeBarycentricEmbedding().
// Terminates the program if the file name does not fit the local buffer.
void FemGrp::makeOutputSurfMesh(std::string prjName)
{
    // Load surface mesh
    char triName[256];
    const int nameLen = snprintf(triName, sizeof(triName), "./%s_out.tri", prjName.c_str());
    if (nameLen < 0 || nameLen >= static_cast<int>(sizeof(triName)))
    {
        // The previous unbounded sprintf would have written past the buffer here.
        std::cerr << "Error: surface mesh file name for project '" << prjName << "' is too long" << std::endl;
        std::exit(EXIT_FAILURE);
    }

    std::cout << "--------------------" << std::endl;
    std::cout << "Reading Tri surface mesh " << triName << std::endl;
    outputMesh.readFromFile(triName);

    std::cout << "--------------------" << std::endl;
    std::cout << "Compute Normals " << std::endl;
    outputMesh.computeTriangleNormals();

    std::cout << "--------------------" << std::endl;
    outputMesh.printSummary();
    std::cout << "--------------------" << std::endl;

    tri_nodes_bary.resize(outputMesh.num_nodes);

    // Fill barycentric coordinate map
    computeBarycentricEmbedding();

    std::cout << "Completed" << std::endl;
    std::cout << "--------------------" << std::endl;
}

// Added by Qi Jian
// Utility to write fields of probes
//
// Writes "<outputDir>/Probes_<fname>_<timeStep, zero-padded to 4 digits>.csv" with the header
// "Ex,Ey,Ez,Hx,Hy,Hz" followed by one row (fixed notation, 6 decimals) per entry of node_ids,
// in the order given. An id that does not index both field arrays is reported on stderr and
// skipped. If the file cannot be opened an error is printed and nothing is written.
void FemGrp::writeProbeFieldsCSV(
    const std::string& outputDir,      // e.g. "./PROBES1"
    const std::string& fname,          // simulation/project name
    int timeStep,                      // timestep number
    const std::vector<int>& node_ids,  // node IDs to write
    const std::vector<vtr>& Efield,    // electric field vectors
    const std::vector<vtr>& Hfield     // magnetic field vectors
)
{
    // Only the number goes through a fixed buffer (an int never needs more than 11 characters);
    // the path itself is assembled in a std::string so long names cannot overflow anything.
    char stepTag[16];
    snprintf(stepTag, sizeof(stepTag), "%04d", timeStep);
    const std::string csvFileName = outputDir + "/Probes_" + fname + "_" + stepTag + ".csv";

    std::ofstream csvFile(csvFileName.c_str());
    if (!csvFile.is_open())
    {
        std::cerr << "Error opening file: " << csvFileName << std::endl;
        return;
    }

    // Write header
    csvFile << "Ex,Ey,Ez,Hx,Hy,Hz\n";
    csvFile << std::fixed << std::setprecision(6);

    for (const int node_id : node_ids)
    {
        const bool in_range = node_id >= 0
                           && static_cast<size_t>(node_id) < Efield.size()
                           && static_cast<size_t>(node_id) < Hfield.size();
        if (!in_range)
        {
            std::cerr << "Skipping probe node " << node_id << ": index outside the field arrays" << std::endl;
            continue;
        }

        const vtr& E = Efield[node_id];
        const vtr& H = Hfield[node_id];
        csvFile << E.getx() << "," << E.gety() << "," << E.getz() << ","
                << H.getx() << "," << H.gety() << "," << H.getz() << "\n";
    }

    csvFile.close();
}

// Evaluates the fields at the nodes of outputMesh for one time step and writes the equivalent
// surface currents.
//
//   1. Incident field per node: getAnalyticalPWField evaluated in every tetrahedron recorded
//      for the node in tri_nodes_bary, averaged over those tetrahedra.
//   2. "./CURRENT_INC/Einc_field_<fname>_<step>.dat": incident E at the first node of
//      triangle index 1.
//   3. Total field per node: CalcEfield on En1_h / Hn32_h, averaged the same way; currents
//      written under "./CURRENT_Total".
//   4. Scattered field = total - incident; currents written under "./CURRENT_SC".
//
// If the incident-field file cannot be opened the routine returns before steps 3 and 4.
void FemGrp::writeCurrentsOutputSurfMesh_CuBLAS(int timeStep)
{
    const int num_nodes = outputMesh.num_nodes;
    const int num_tri = outputMesh.num_triangles;

    // ----------------------------------------------
    // Step 1: Compute fields at all nodes
    // ----------------------------------------------
    std::vector<vtr> E_field(num_nodes);
    std::vector<vtr> H_field(num_nodes);
    std::vector<vtr> Einc_field(num_nodes);
    std::vector<vtr> Hinc_field(num_nodes);

    fp_t vol;
    fp_t zeta[4];
    vtr lvtr[3];
    vtr avtr[4];

    vtr Einc;
    vtr Hinc;
    vtr r;
    vtr eField;
    vtr hField;

    // Compute the Incident Fields
    for (int i = 0; i < num_nodes; i++)
    {
        const int number_of_associated_tets = tri_nodes_bary.at(i).first;
        const auto& found_tets = tri_nodes_bary.at(i).second; // reference: no per-node copy

        Einc.reset();
        Hinc.reset();
        Einc_field[i].reset();
        Hinc_field[i].reset();

        for (int t = 0; t < number_of_associated_tets; t++)
        {
            const auto& hit = found_tets.at(t);
            tetra& tet = tetARRAY[hit.first];
            for (int k = 0; k < 4; k++)
            {
                zeta[k] = static_cast<fp_t>(hit.second[k]);
            }

            SimplexToCartesian(tet, r, zeta);
            getAnalyticalPWField(tet, r, Einc, Hinc, timeStep, LocTimeSteps[N_class -1]);

            Einc_field[i] = Einc_field[i] + Einc;
            Hinc_field[i] = Hinc_field[i] + Hinc;
        }

        // Average over the contributing tetrahedra; a node without any keeps a zero field
        // instead of being divided by zero (or by a negative "not found" marker).
        if (number_of_associated_tets > 0)
        {
            Einc_field[i] = Einc_field[i] / ((fp_t) number_of_associated_tets);
            Hinc_field[i] = Hinc_field[i] / ((fp_t) number_of_associated_tets);
        }
    }

    // Debug hook (needs a list of the node ids 0..num_nodes-1 as node_ids):
    //writeProbeFieldsCSV( "./PROBES_inc", fname, timeStep, node_ids, Einc_field, Hinc_field);

    make_dir_if_not_exist("./CURRENT_INC");

    // Prepare output file name. snprintf always terminates the string and never writes past
    // the buffer; the earlier code stored a zero one element beyond the end of this array.
    char regFileName[StrOutput];
    const int incNameLen = snprintf(regFileName, StrOutput, "./CURRENT_INC/Einc_field_%s_%05d.dat", fname, timeStep);
    if (incNameLen < 0 || incNameLen >= static_cast<int>(StrOutput))
    {
        std::cerr << "Output file name too long for time step " << timeStep << std::endl;
        return;
    }

    // Open output file
    FILE* fout = fopen(regFileName, "w");
    if (!fout)
    {
        std::cerr << "❌ Failed to open output file: " << regFileName << std::endl;
        return;
    }

    // NOTE(review): the sample is taken from triangle index 1 (not 0), as before - confirm
    // that this is the intended probe. Meshes with fewer than two triangles get an empty file.
    if (num_tri > 1)
    {
        const auto tri_nodes = outputMesh.getTriangle(1);
        const int nodeIdx = tri_nodes[0]; // Pick only the first node
        if (nodeIdx >= 0 && nodeIdx < num_nodes)
        {
            const vtr& E = Einc_field[nodeIdx]; // Get E-field vector at that node
            // Write full vector (Ex, Ey, Ez) to file
            fprintf(fout, "%.10e %.10e %.10e\n", E.getx(), E.gety(), E.getz());
        }
    }
    else
    {
        std::cerr << "Output surface mesh has fewer than two triangles; no incident sample written" << std::endl;
    }
    fclose(fout);

    // Calculate Total Fields at the points
    for (int i = 0; i < num_nodes; i++)
    {
        const int number_of_associated_tets = tri_nodes_bary.at(i).first;
        const auto& found_tets = tri_nodes_bary.at(i).second;

        E_field[i].reset();
        H_field[i].reset();

        for (int t = 0; t < number_of_associated_tets; t++)
        {
            const auto& hit = found_tets.at(t);
            tetra& tet = tetARRAY[hit.first];

            tet.geometry(lvtr, avtr, &vol);
            // Fourth vector is minus the sum of the first three.
            avtr[3].reset();
            avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);

            eField.reset();
            hField.reset();
            for (int k = 0; k < 4; k++)
            {
                zeta[k] = static_cast<fp_t>(hit.second[k]);
            }

            eField = CalcEfield(&En1_h[mapIdLoc[tet.getcnt()] * TetPolyOrderDim[PolyFlag]], avtr, vol, zeta, PolyFlag);
            hField = CalcEfield(&Hn32_h[mapIdLoc[tet.getcnt()] * TetPolyOrderDim[PolyFlag]], avtr, vol, zeta, PolyFlag);

            E_field[i] = E_field[i] + eField;
            H_field[i] = H_field[i] + hField;
        }

        if (number_of_associated_tets > 0)
        {
            E_field[i] = E_field[i] / ((fp_t) number_of_associated_tets);
            H_field[i] = H_field[i] / ((fp_t) number_of_associated_tets);
        }
    }

    //writeProbeFieldsCSV( "./PROBES_total", fname, timeStep, node_ids, E_field, H_field);

    // Builds the per-triangle registers from the current contents of E_field / H_field
    // (M = -(n x E), J = n x H at each of the three triangle nodes, no averaging) and prints
    // them to "<dirName>/Currents_<fname>_<step>". The registers only live for this call.
    auto writeSurfaceCurrents = [&](const char* dirName)
    {
        char currentsName[StrOutput];
        const int len = snprintf(currentsName, StrOutput, "%s/Currents_%s_%05d", dirName, fname, timeStep);
        if (len < 0 || len >= static_cast<int>(StrOutput))
        {
            std::cerr << "Output file name too long for time step " << timeStep << std::endl;
            return;
        }

        regMface = new Register[num_tri];
        regJface = new Register[num_tri];

        for (int tri = 0; tri < num_tri; tri++)
        {
            const auto tri_nodes = outputMesh.getTriangle(tri);
            const auto normal_d = outputMesh.getNormal(tri);
            vtr NormalVtr(normal_d[0], normal_d[1], normal_d[2]);

            regMface[tri].initial(3);
            regJface[tri].initial(3);

            for (int k = 0; k < 3; k++)
            {
                const int nodeIdx = tri_nodes[k];
                vtr eLocalFace = E_field[nodeIdx];
                vtr hLocalFace = H_field[nodeIdx];

                // No averaging
                regMface[tri].setField(k, NormalVtr * eLocalFace * (-1.0));
                regJface[tri].setField(k, NormalVtr * hLocalFace * (1.0));
            }
        }

        printRegister(regMface, regJface, num_tri, currentsName, 1);

        delete[] regMface;
        delete[] regJface;
        regMface = nullptr; // do not leave the members pointing at freed storage
        regJface = nullptr;
    };

    // Write the Total Fields
    make_dir_if_not_exist("./CURRENT_Total");
    writeSurfaceCurrents("./CURRENT_Total");

    // Calculate Scattered Fields at the points
    for (int i = 0; i < num_nodes; i++)
    {
        E_field[i] = E_field[i] - Einc_field[i];
        H_field[i] = H_field[i] - Hinc_field[i];
    }

    //writeProbeFieldsCSV( "./PROBES_sc", fname, timeStep, node_ids, E_field, H_field);

    // ----------------------------------------------------------------------------------------------
    // Write the Scattered Fields
    make_dir_if_not_exist("./CURRENT_SC");
    writeSurfaceCurrents("./CURRENT_SC");
}

#endif
#endif