Maxwell-TD-Scattering/MaxwellTD_GPU_Port4/fem/femgrp.cc

#include <cstdlib>
#include <cmath>
#include <iostream>

#include "femgrp.h"
#include "matconv.h"
#include "Constants.h"
#include "vtkwriter.h"
#ifdef _OPENMP
#include <omp.h>
#endif
#include <map>
#include "MeshPartition_METIS5.h"
#include <vector>
#include "debug.hpp"
#include "vtk-5.0/vtkTetra.h"
#include "rapidcsv.h"
#include <string>
#include <sys/stat.h>
#include <sys/types.h>
#include <errno.h>
#include <cstring>  // For strerror
#include <cstdio>   // For perror or printf

#include <algorithm> // for std::max
#include <fstream>
#include <string>
#include <iomanip>
#include <filesystem>  // at top of file


// Diagnostic sanity check for a pointer that is about to be handed to a kernel.
//
//   p    : pointer to inspect.
//   name : label used in the diagnostics (a placeholder is printed when NULL).
//
// Returns false only when `p` is NULL. Every other finding (attribute query
// failure, pointer not classified as device memory) is reported on stderr as a
// warning and the function still returns true, so the caller may launch anyway.
auto check_dev_ptr = [](const void* p, const char* name) -> bool {
  const char* label = name ? name : "(unnamed)";
  if (!p) {
    fprintf(stderr, "[addExcitationE_port] ❌ NULL pointer: %s\n", label);
    return false;
  }
#if CUDART_VERSION >= 10000
  cudaPointerAttributes attr;
  cudaError_t perr = cudaPointerGetAttributes(&attr, p);
  if (perr != cudaSuccess) {
    fprintf(stderr, "[addExcitationE_port] ⚠️  cudaPointerGetAttributes failed for %s: %s\n",
            label, cudaGetErrorString(perr));
    // The failed query is recorded as the runtime's "last error". Clear it so a
    // later cudaGetLastError() after a kernel launch does not report this
    // purely diagnostic failure as if the launch itself had failed.
    (void)cudaGetLastError();
    // Still allow launch; you can change to 'return false;' if you prefer.
  } else {
    // The field holding the memory type is `type` from CUDA 11 on and
    // `memoryType` in the older runtimes handled here.
#if CUDART_VERSION >= 11000
    const int mem_type = (int)attr.type;
#else
    const int mem_type = (int)attr.memoryType;
#endif
    if (mem_type != (int)cudaMemoryTypeDevice) {
      fprintf(stderr, "[addExcitationE_port] ⚠️  %s is NOT a device pointer (type=%d)\n",
              label, mem_type);
    }
  }
#endif
  return true;
};


// ======================================
// Interpolation Quadrature Points (host tables)
//
// Constants for the 6-point (g6_*) and 9-point (g9_*) triangle rules used by
// the tables below. Every derived constant is fully parenthesised so that it
// expands safely inside any surrounding expression (e.g. `1.0 / g6_b_h`).
#define g6_a_h  0.816847572980459
#define g6_b_h  ((1.0 - g6_a_h) / 2.0)
#define g6_c_h  0.108103018168070
#define g6_d_h  ((1.0 - g6_c_h) / 2.0)
#define g6_W1_h 0.109951743655322
#define g6_W2_h 0.223381589678011

#define g9_a_h  0.437525248383384
#define g9_b_h  (1.0 - 2.0 * g9_a_h)
#define g9_c_h  0.797112651860071
#define g9_d_h  0.165409927389841
#define g9_e_h  (1.0 - g9_c_h - g9_d_h)
#define g9_W1_h 0.205950504760887
#define g9_W2_h 0.063691414286223

// 6-point triangle rule. Each row is {z0, z1, z2, weight}; tri_gauss_host()
// reads columns 0..2 as the three coordinates and column 3 as the weight.
// By construction of the g6_* macros each coordinate triple sums to 1, and the
// six weights (3*W1 + 3*W2) sum to 1.
fp_t_ts g2d_6_h[6][4] = {
  {g6_a_h, g6_b_h, g6_b_h, g6_W1_h},
  {g6_b_h, g6_a_h, g6_b_h, g6_W1_h},
  {g6_b_h, g6_b_h, g6_a_h, g6_W1_h},
  {g6_c_h, g6_d_h, g6_d_h, g6_W2_h},
  {g6_d_h, g6_c_h, g6_d_h, g6_W2_h},
  {g6_d_h, g6_d_h, g6_c_h, g6_W2_h}
};

// 9-point triangle rule, same row layout {z0, z1, z2, weight}.
// The first three rows are the permutations of (b, a, a) with weight W1; the
// remaining six rows are all permutations of (c, d, e) with weight W2.
fp_t_ts g2d_9_h[9][4] = {
  {g9_b_h, g9_a_h, g9_a_h, g9_W1_h},
  {g9_a_h, g9_b_h, g9_a_h, g9_W1_h},
  {g9_a_h, g9_a_h, g9_b_h, g9_W1_h},

  {g9_c_h, g9_d_h, g9_e_h, g9_W2_h},
  {g9_c_h, g9_e_h, g9_d_h, g9_W2_h},
  {g9_d_h, g9_c_h, g9_e_h, g9_W2_h},
  {g9_d_h, g9_e_h, g9_c_h, g9_W2_h},
  {g9_e_h, g9_c_h, g9_d_h, g9_W2_h},
  {g9_e_h, g9_d_h, g9_c_h, g9_W2_h}
};

// Number of quadrature points, indexed by PolyFlag (valid indices 0..3).
// Only the values 6 and 9 occur, matching the two tables above.
const int GAUSS_POINT_NUM_h[4]  = {6, 9, 9, 9};


// ---- Quadratic (P2) triangle shape functions ----
// Evaluates the six P2 shape functions at the barycentric point l = (l0,l1,l2).
//   N[0..2] : vertex functions  l_i * (2*l_i - 1)
//   N[3..5] : edge functions    4 * l_a * l_b for the vertex pairs
//             (1,2), (0,2), (0,1) respectively
static inline void triP2_shapes(const double l[3], double N[6]) {
  // Snapshot the inputs before any output is written.
  const double lam[3] = { l[0], l[1], l[2] };

  // Vertex functions.
  for (int v = 0; v < 3; ++v) {
    N[v] = lam[v]*(2.0*lam[v]-1.0);
  }

  // Edge functions: output slot 3+k uses the two vertices other than k.
  static const int edgeVerts[3][2] = { {1, 2}, {0, 2}, {0, 1} };
  for (int k = 0; k < 3; ++k) {
    N[3 + k] = 4.0*lam[edgeVerts[k][0]]*lam[edgeVerts[k][1]];
  }
}

// ---- Unit normal and area of a flat triangle ----
//   xyz9 : the three vertices packed as x0,y0,z0, x1,y1,z1, x2,y2,z2
//   n    : receives (P1-P0) x (P2-P0), normalised to unit length when the
//          cross product is non-zero (left un-normalised otherwise)
//   area : receives half the length of that cross product
static inline void face_geometry9_host(const fp_t_ts* xyz9, double n[3], double& area) {
  // Edge vectors leaving vertex 0, built component by component.
  double e01[3], e02[3];
  for (int c = 0; c < 3; ++c) {
    const double p0 = xyz9[c];
    const double p1 = xyz9[3 + c];
    const double p2 = xyz9[6 + c];
    e01[c] = p1 - p0;
    e02[c] = p2 - p0;
  }

  // n ∝ e01 × e02
  n[0] = e01[1]*e02[2] - e01[2]*e02[1];
  n[1] = e01[2]*e02[0] - e01[0]*e02[2];
  n[2] = e01[0]*e02[1] - e01[1]*e02[0];

  const double len = sqrt(n[0]*n[0] + n[1]*n[1] + n[2]*n[2]);
  area = 0.5*len;

  // A degenerate triangle has no direction to normalise.
  if (!(len > 0)) return;
  for (int c = 0; c < 3; ++c) {
    n[c] /= len;
  }
}

// ---- Remove the normal component of a vector (in place) ----
// Replaces v by v - (v·n) n. The result lies in the plane orthogonal to n
// when n has unit length; n is not normalised here.
static inline void proj_tangent(double v[3], const double n[3])
{
  const double normalPart = v[0]*n[0] + v[1]*n[1] + v[2]*n[2];
  for (int c = 0; c < 3; ++c) {
    v[c] -= normalPart*n[c];
  }
}

// ---- Host quadrature accessor using your *_h tables ----
static inline void tri_gauss_host(int Q, int q, fp_t& z0, fp_t& z1, fp_t& z2, fp_t& w) {
  if (Q == 6)  { z0 = g2d_6_h[q][0]; z1 = g2d_6_h[q][1]; z2 = g2d_6_h[q][2]; w = g2d_6_h[q][3]; return; }
  if (Q == 9)  { z0 = g2d_9_h[q][0]; z1 = g2d_9_h[q][1]; z2 = g2d_9_h[q][2]; w = g2d_9_h[q][3]; return; }
  // add more orders if you enable them
  z0=z1=z2=w=0;
}

// ---- Interpolate E/H to Q quadrature points and project tangential ----
static inline void interp_port_fields_to_quads(
  const fp_t_ts* xyz9,           // x0 y0 z0 x1 y1 z1 x2 y2 z2
  const vtr evtr[6],             // P2 nodal vectors for E (face order: 0..5)
  const vtr hvtr[6],             // P2 nodal vectors for H
  int PolyFlag,
  fp_t_ts* Etan_out,             // [Q*3]
  fp_t_ts* Htan_out,           // [Q*3]
  fp_t_ts port_excitation_magnitude)
{
  const int Q = GAUSS_POINT_NUM_h[PolyFlag];
  double n[3], area;
  face_geometry9_host(xyz9, n, area);

  for (int q=0; q<Q; ++q)
  {
    fp_t z0,z1,z2,w; tri_gauss_host(Q,q,z0,z1,z2,w);
    double l[3] = { (double)z0, (double)z1, (double)z2 };
    double N[6]; triP2_shapes(l,N);

    double E[3]={0,0,0}, H[3]={0,0,0};
    for (int m=0; m<6; ++m)
    {
      const double a = N[m];
      E[0]+=a*evtr[m].getx();
      E[1]+=a*evtr[m].gety();
      E[2]+=a*evtr[m].getz();
      H[0]+=a*hvtr[m].getx();
      H[1]+=a*hvtr[m].gety();
      H[2]+=a*hvtr[m].getz();
    }

    proj_tangent(E,n);
    proj_tangent(H,n);

    Etan_out[q*3+0] = (fp_t_ts)E[0] * port_excitation_magnitude;
    Etan_out[q*3+1] = (fp_t_ts)E[1] * port_excitation_magnitude;
    Etan_out[q*3+2] = (fp_t_ts)E[2] * port_excitation_magnitude;
    Htan_out[q*3+0] = (fp_t_ts)H[0] * port_excitation_magnitude;
    Htan_out[q*3+1] = (fp_t_ts)H[1] * port_excitation_magnitude;
    Htan_out[q*3+2] = (fp_t_ts)H[2] * port_excitation_magnitude;

  }
}


// Write port quadrature fields to CSV
// Columns: face_idx,global_face_id,tet_id,port_idx,q,z0,z1,z2,w,x,y,z,Et_x,Et_y,Et_z,Ht_x,Ht_y,Ht_z
//
// One row is written per quadrature point of every face whose entry in
// PortFacePidx_h is >= 0; faces with a negative entry are skipped.
//
//   out_path            : destination file (truncated/created)
//   PolyFlag            : index into GAUSS_POINT_NUM_h selecting the rule size Q
//   excitationFaces     : number of faces described by the arrays below
//   PortFacePidx_h      : length = excitationFaces; -1 for non-port faces
//   FaceID_excitation_h : length = excitationFaces (optional; can pass nullptr)
//   TetID_excitation_h  : length = excitationFaces (optional; can pass nullptr)
//   nd_coords_face_h    : length = excitationFaces * 9
//   Etan_qp_h           : length = excitationFaces * Q * 3
//   Htan_qp_h           : length = excitationFaces * Q * 3
//
// Returns true only if every argument was valid, the file could be opened and
// all data reached the stream without error; otherwise prints a diagnostic to
// stderr and returns false.
bool write_port_quadrature_csv(
  const char* out_path,
  int PolyFlag,
  int excitationFaces,
  const int* PortFacePidx_h,       // length = excitationFaces; -1 for non-port faces
  const int* FaceID_excitation_h,  // length = excitationFaces (optional; can pass nullptr)
  const int* TetID_excitation_h,   // length = excitationFaces (optional; can pass nullptr)
  const fp_t_ts* nd_coords_face_h, // length = excitationFaces * 9
  const fp_t_ts* Etan_qp_h,        // length = excitationFaces * Q * 3
  const fp_t_ts* Htan_qp_h         // length = excitationFaces * Q * 3
) {
  if (!out_path || !nd_coords_face_h || !Etan_qp_h || !Htan_qp_h || !PortFacePidx_h) {
      fprintf(stderr, "write_port_quadrature_csv: null pointer argument.\n");
      return false;
  }

  // PolyFlag indexes a fixed-size table; reject anything outside it instead of
  // reading past the end of the array.
  const int numPolyFlags =
      (int)(sizeof(GAUSS_POINT_NUM_h) / sizeof(GAUSS_POINT_NUM_h[0]));
  if (PolyFlag < 0 || PolyFlag >= numPolyFlags) {
      fprintf(stderr, "write_port_quadrature_csv: invalid PolyFlag %d.\n", PolyFlag);
      return false;
  }

  const int Q = GAUSS_POINT_NUM_h[PolyFlag];

  std::ofstream ofs(out_path);
  if (!ofs) {
      fprintf(stderr, "write_port_quadrature_csv: failed to open %s\n", out_path);
      return false;
  }

  ofs.setf(std::ios::scientific);
  ofs << std::setprecision(9);

  // Header
  ofs << "face_idx,global_face_id,tet_id,port_idx,q,"
         "z0,z1,z2,w,x,y,z,Et_x,Et_y,Et_z,Ht_x,Ht_y,Ht_z\n";

  for (int f = 0; f < excitationFaces; ++f) {
      int pidx = PortFacePidx_h[f];
      if (pidx < 0) continue; // skip non-port faces

      int global_face_id = FaceID_excitation_h ? FaceID_excitation_h[f] : -1;
      int tet_id         = TetID_excitation_h  ? TetID_excitation_h[f]  : -1;

      // Triangle vertices (offsets computed in size_t so large face counts
      // cannot overflow int arithmetic)
      const fp_t_ts* xyz9 = &nd_coords_face_h[(size_t)f * 9];
      double Ax = (double)xyz9[0], Ay = (double)xyz9[1], Az = (double)xyz9[2];
      double Bx = (double)xyz9[3], By = (double)xyz9[4], Bz = (double)xyz9[5];
      double Cx = (double)xyz9[6], Cy = (double)xyz9[7], Cz = (double)xyz9[8];

      // Fields
      const fp_t_ts* Eface = &Etan_qp_h[(size_t)f * Q * 3];
      const fp_t_ts* Hface = &Htan_qp_h[(size_t)f * Q * 3];

      for (int q = 0; q < Q; ++q) {
          fp_t z0, z1, z2, w;
          tri_gauss_host(Q, q, z0, z1, z2, w);

          // Quadrature point physical coords (barycentric blend of the vertices)
          double x = z0 * Ax + z1 * Bx + z2 * Cx;
          double y = z0 * Ay + z1 * By + z2 * Cy;
          double z = z0 * Az + z1 * Bz + z2 * Cz;

          ofs << f << ','
              << global_face_id << ','
              << tet_id << ','
              << pidx << ','
              << q << ','
              << (double)z0 << ','
              << (double)z1 << ','
              << (double)z2 << ','
              << (double)w  << ','
              << x << ',' << y << ',' << z << ','
              << (double)Eface[q*3+0] << ','
              << (double)Eface[q*3+1] << ','
              << (double)Eface[q*3+2] << ','
              << (double)Hface[q*3+0] << ','
              << (double)Hface[q*3+1] << ','
              << (double)Hface[q*3+2] << '\n';
      }
  }

  // close() flushes; any failure while writing or flushing leaves the stream
  // in a failed state, which must not be reported as success.
  ofs.close();
  if (!ofs) {
      fprintf(stderr, "write_port_quadrature_csv: write error on %s\n", out_path);
      return false;
  }
  return true;
}


// Evaluate at centroid
static inline void interp_port_fields_to_centroid(
  const fp_t_ts* xyz9,     // x0 y0 z0 x1 y1 z1 x2 y2 z2
  const vtr evtr[6],       // P2 nodal vectors for E
  const vtr hvtr[6],       // P2 nodal vectors for H
  fp_t_ts Etan_out[3],     // centroid E_t
  fp_t_ts Htan_out[3])     // centroid H_t
{
  // Face normal (for tangential projection)
  double n[3], area;
  face_geometry9_host(xyz9, n, area);

  // Centroid barycentrics
  const double l[3] = { 1.0/3.0, 1.0/3.0, 1.0/3.0 };

  // Quadratic triangle shape functions at centroid
  double N[6];
  triP2_shapes(l, N);

  // Interpolate P2 field
  double E[3] = {0.0, 0.0, 0.0};
  double H[3] = {0.0, 0.0, 0.0};
  for (int m = 0; m < 6; ++m) {
    const double a = N[m];
    E[0] += a * evtr[m].getx();  E[1] += a * evtr[m].gety();  E[2] += a * evtr[m].getz();
    H[0] += a * hvtr[m].getx();  H[1] += a * hvtr[m].gety();  H[2] += a * hvtr[m].getz();
  }

  // Project onto the tangential plane
  proj_tangent(E, n);
  proj_tangent(H, n);

  // Output single centroid values
  Etan_out[0] = (fp_t_ts)E[0];
  Etan_out[1] = (fp_t_ts)E[1];
  Etan_out[2] = (fp_t_ts)E[2];

  Htan_out[0] = (fp_t_ts)H[0];
  Htan_out[1] = (fp_t_ts)H[1];
  Htan_out[2] = (fp_t_ts)H[2];
}

// ======================================


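// ---- Filesystem / debug-export helpers ----
// NOTE(review): this spot was labelled "centroid helper (assumes tet->nd[0..3] exist and
// have getCoord().getx/y/z())", but no centroid helper follows; the next definitions are
// a directory utility and a neighbour-data export routine.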

// Ensure that `path` exists as a directory, creating it (mode 0755) if it is
// missing. Only the last path component is created; parent directories must
// already exist. Problems are reported on stderr; the function never aborts
// and returns nothing, so callers cannot observe failure.
void make_dir_if_not_exist(const char* path) {
    if (path == nullptr || *path == '\0') {
        fprintf(stderr, "make_dir_if_not_exist: empty path\n");
        return;
    }

    struct stat st;
    if (stat(path, &st) == 0) {
        // Something already lives at this path; it must be a directory.
        if (!S_ISDIR(st.st_mode)) {
            fprintf(stderr, "%s exists but is not a directory\n", path);
        }
        return;
    }

    // stat() can fail for reasons other than "does not exist" (permissions,
    // a non-directory path component, ...); creating the directory cannot
    // help in those cases, so report the real cause.
    if (errno != ENOENT) {
        fprintf(stderr, "stat failed for %s: %s\n", path, strerror(errno));
        return;
    }

    if (mkdir(path, 0755) != 0) {
        if (errno == EEXIST) {
            // Another process/thread created the entry between stat() and
            // mkdir(); that is fine as long as it is a directory.
            if (stat(path, &st) == 0 && S_ISDIR(st.st_mode)) {
                return;
            }
            fprintf(stderr, "%s exists but is not a directory\n", path);
            return;
        }
        fprintf(stderr, "mkdir failed for %s: %s\n", path, strerror(errno));
    }
}


// Dump three integer arrays to text files in the current working directory,
// one value per line:
//   NeighMap.txt         <- NeighMap_h[0 .. neighMapSize)
//   NeighClass.txt       <- NeighClass_h[0 .. N_class)
//   NeighClassOffset.txt <- NeighClassOffset_h[0 .. N_class)
// Files are created/truncated; open failures are not reported.
// NOTE(review): exactly N_class offsets are written — confirm the offset array
// does not carry an extra trailing entry that should be exported too.
void exportNeighData(
  int* NeighMap_h, int neighMapSize,
  int* NeighClass_h, int N_class,
  int* NeighClassOffset_h)
{
  // Writes `count` integers from `values` to `fileName`, one per line.
  auto dumpColumn = [](const char* fileName, const int* values, int count) {
      std::ofstream out(fileName);
      for (int k = 0; k < count; ++k) {
          out << values[k] << "\n";
      }
  };

  dumpColumn("NeighMap.txt", NeighMap_h, neighMapSize);
  dumpColumn("NeighClass.txt", NeighClass_h, N_class);
  dumpColumn("NeighClassOffset.txt", NeighClassOffset_h, N_class);
}

// ---- Safe CUDA helpers -------------------------------------------------------
// cudaMalloc wrapper that tolerates a zero-byte request: in that case *p is
// set to nullptr and cudaSuccess is returned without calling the runtime.
// Otherwise the result of cudaMalloc is returned unchanged. `p` itself must
// not be null.
inline cudaError_t SafeCudaMalloc(void** p, size_t nbytes)
{
  if (nbytes != 0) {
    return cudaMalloc(p, nbytes);
  }
  *p = nullptr;
  return cudaSuccess;
}

// Host-to-device cudaMemcpy wrapper. A zero-byte copy, a null destination or a
// null source is treated as "nothing to do" and reported as cudaSuccess;
// otherwise the result of cudaMemcpy(..., cudaMemcpyHostToDevice) is returned.
inline cudaError_t SafeCudaMemcpyH2D(void* dst, const void* src, size_t nbytes)
{
  const bool nothingToCopy = (nbytes == 0) || (dst == nullptr) || (src == nullptr);
  if (nothingToCopy) {
    return cudaSuccess;
  }
  return cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
}

// Zero-fill wrapper around cudaMemset. A zero-byte request or a null
// destination is a no-op reported as cudaSuccess; otherwise the buffer is set
// to byte value 0 and the result of cudaMemset is returned.
inline cudaError_t SafeCudaMemset0(void* dst, size_t nbytes)
{
  const bool nothingToClear = (nbytes == 0) || (dst == nullptr);
  if (nothingToClear) {
    return cudaSuccess;
  }
  return cudaMemset(dst, 0, nbytes); // byte-wise fill; only correct because the value is zero
}

// BYTES(T, count): size in bytes of `count` elements of type T; `count` is
// widened to size_t before the multiplication.
#define BYTES(T, count) (static_cast<size_t>(count) * sizeof(T))
// Convenience wrappers that route the Safe* helpers through CUDA_SAFE_CALL
// (CUDA_SAFE_CALL is not defined in this part of the file).
// CUDA_SAFE_MALLOC takes the pointer variable itself, not its address.
#define CUDA_SAFE_MALLOC(ptr, bytes) CUDA_SAFE_CALL(SafeCudaMalloc((void**)&(ptr), (bytes)))
// Host-to-device copy of `bytes` bytes.
#define CUDA_SAFE_COPY(dst, src, bytes) CUDA_SAFE_CALL(SafeCudaMemcpyH2D((dst), (src), (bytes)))
// Zero-fill of `bytes` bytes.
#define CUDA_SAFE_ZERO(dst, bytes) CUDA_SAFE_CALL(SafeCudaMemset0((dst), (bytes)))
// ---- End of safe CUDA helpers -----------------------------------------------


// File-scope state that exists only in CUDA-enabled builds.
#if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL)
#include "kernels.cuh"
// Streams created by the FemGrp constructor (under DGTD_USE_CUDA).
// NOTE(review): names suggest separate streams for E updates, H updates and
// Pade work — confirm against the launch sites.
cudaStream_t stream_E, stream_H;
cudaStream_t stream_Pade;
// Global excitation descriptor; readBC() stores ExcitFlag in its ExcitationFlag field.
ExcitationProp excitationProp;
// Collection of excitation descriptors — presumably one per port; verify where it is filled.
std::vector<ExcitationProp> portExcitations;
// Presumably the device-side copy of the port descriptors (suffix _d) — TODO confirm.
// Not initialised here.
ExcitationProp* ExcitationProps_d;
#endif

using namespace ClipperLib;
using namespace std;


// NOTE(review): presumably the number of basis functions per triangular face — confirm.
int TriNumBas = 6;
// General-purpose flag; its meaning is not visible in this part of the file.
bool ModuleFlag = true;

// Barycentric coordinates (4 per row) of the tetrahedron nodes: rows 0..3 have
// a single 1.0 (the four vertices), rows 4..9 have two 0.5 entries (the
// midpoints of the six vertex pairs 01, 02, 03, 12, 13, 23).
// The initializer provides 10 rows; SecondOrderNodes is defined elsewhere.
static fp_t BaryCoord[SecondOrderNodes][4] = {
  {1.0, 0.0, 0.0, 0.0},
  {0.0, 1.0, 0.0, 0.0},
  {0.0, 0.0, 1.0, 0.0},
  {0.0, 0.0, 0.0, 1.0},
  {0.5, 0.5, 0.0, 0.0},
  {0.5, 0.0, 0.5, 0.0},
  {0.5, 0.0, 0.0, 0.5},
  {0.0, 0.5, 0.5, 0.0},
  {0.0, 0.5, 0.0, 0.5},
  {0.0, 0.0, 0.5, 0.5}
};

// Four rows of 18 indices, all in the range 0..44.
// NOTE(review): presumably maps the 18 local face unknowns of each of the four
// tetrahedron faces to tetrahedron-level unknown indices — confirm against the
// basis-function numbering. The last three entries (42, 43, 44) are identical
// in every row.
static int fac2tet[4][18] = {
  {5, 4, 3, 11, 10, 9, 12, 13, 25, 24, 23, 26, 30, 31, 32, 42, 43, 44},
  {5, 2, 1, 11,  8, 7, 14, 15, 25, 22, 21, 27, 33, 34, 35, 42, 43, 44},
  {4, 2, 0, 10,  8, 6, 16, 17, 24, 22, 20, 28, 36, 37, 38, 42, 43, 44},
  {3, 1, 0,  9,  7, 6, 18, 19, 23, 21, 20, 29, 39, 40, 41, 42, 43, 44}
};

// Ordering table with 15 entries drawn from 1..14.
// NOTE(review): the values look like 4-bit masks grouped by number of set bits
// (1,2,4,8 | 3,5,6,9,10,12 | 7,11,13,14), but the value 3 appears twice
// (positions 4 and 10) and 15 never appears — verify whether the second 3 is
// intentional.
int faceExcitationOrder[15] = {
  1, 2, 4, 8, 3, 5, 6, 9, 10, 12, 3, 7, 11, 13, 14
};

// For each index i in 0..2, the two other indices of {0,1,2}.
int First2Second[3][2] = {
  {1, 2},
  {0, 2},
  {0, 1}
};


// Save the leading dim x dim part of `mat` to `filename` through rapidcsv.
// Entry (i, j) of the matrix is stored as element i of document column j.
//   filename : destination path handed to rapidcsv::Document::Save
//   mat      : source matrix, read through getEntry(i, j)
//   dim      : number of rows and columns to export
template<typename T>
void writeDenseMatrixToCSV_rapidcsv(const std::string& filename, denseMat<T>* mat, int dim)
{
    // rapidcsv is fed one column at a time, so gather the entries directly
    // into per-column vectors while walking the matrix row by row.
    std::vector<std::vector<T>> columns(dim, std::vector<T>(dim));
    for (int row = 0; row < dim; ++row) {
        for (int col = 0; col < dim; ++col) {
            columns[col][row] = mat->getEntry(row, col);
        }
    }

    rapidcsv::Document doc;
    for (int col = 0; col < dim; ++col) {
        doc.SetColumn<T>(col, columns[col]);
    }

    doc.Save(filename);
}

// Build a new dim x dim denseMat<T_out> from a flat row-major buffer of T_in,
// converting each element with static_cast. Element (i, j) is taken from
// data[i * dim + j].
// The matrix is allocated with `new`; the caller owns the returned pointer.
template<typename T_in, typename T_out>
denseMat<T_out>* wrapFlatMatrixConvert(const T_in* data, int dim) {
    denseMat<T_out>* converted = new denseMat<T_out>(dim, dim);
    for (int row = 0; row < dim; ++row) {
        for (int col = 0; col < dim; ++col) {
            const T_in& source = data[row * dim + col];
            converted->setEntry(row, col, static_cast<T_out>(source));
        }
    }
    return converted;
}


// Default constructor: puts every counter, pointer, flag and time-stepping
// parameter listed below into a known empty state and sets the local
// coordinate system to the canonical origin/axes. In DGTD_USE_CUDA builds it
// also creates the three file-scope CUDA streams and clears the device field
// pointers.
// NOTE(review): the streams are file-scope globals but are created once per
// FemGrp instance and never destroyed in the visible code — confirm only one
// FemGrp is constructed per process.
FemGrp::FemGrp(){
  // Mesh entity counters
  nodeCNT = 0;
  edgeCNT = 0;
  faceCNT = 0;
  tetraCNT = 0;
  bcCNT = 0;

  regularCNT = 1; //at least there is a non regular group
  regularTetraCNT = 0;

  // Mesh storage and material table (allocated later by the read* methods)
  ndARRAY = nullptr;
  tetARRAY = nullptr;
  edgeARRAY = nullptr;
  faceARRAY = nullptr;
  regularReferenceARRAY = nullptr;
  objProp = nullptr;
  totalObjNum = 0;

  // Pade / non-conformal bookkeeping
  usePade = false;
  padeTime = -1;
  padeCNT = 0;
  tsSource = 0;
  nonConformalCase = false;
  nonConformalCNT = 0;
  neighCNT = 0;

  writeWhilePade = false;
  writePadeTD = false;

  // Local coordinate system: origin at 0 with the canonical x/y/z axes
  Coord.setO(0.0, 0.0, 0.0);
  Coord.setx_axis(1.0, 0.0, 0.0);
  Coord.sety_axis(0.0, 1.0, 0.0);
  Coord.setz_axis(0.0, 0.0, 1.0);
  freq = 0.0;

  // Added for DGTD
  TimeStep_dt = 0.0;
  ClassMul = 0;
  dt_min = 0.0;
  dt_max = 0.0;
  dimE = 0;
  dimH = 0;
  N_class = 0;
  NtimeSteps = 0;
  LocTimeSteps = nullptr;
  LocalExciIndexE = nullptr;
  LocalExciIndexH = nullptr;
  ClassTetraCnt = nullptr;
  ClassTetraIndex = nullptr;
  ClassTetraOffset = nullptr;
  planeWaveMesh = nullptr;
  InterSurfMesh = nullptr;
  SurfMesh = nullptr;
  To = 0.0;
  Tau = 0.0;
  SamplingRate = 1.0;
  FinalTime = 0.0;
  TimeDistFlag = 0; // Port
  ExcitFlag = 0; // Scattering

  regularRegionFlag = false;
  PlaneWaveBCFlag = false;
  PortBCFlag = false;

  // Energy monitoring
  fieldEnergy = 0.0;
  maxFieldEnergy = 0.0;
  energyDecayFactor = 0.0;
  numberOfEnergyPoints = 0;

  UseQuadratureMatrices = true;

  #if defined(DGTD_USE_CUDA)
    // Create the streams and report any failure instead of silently ignoring
    // the return code: a stream that was never created would otherwise only
    // surface much later as an unrelated-looking launch or copy error.
    {
      cudaStream_t* const streams[3] = { &stream_E, &stream_H, &stream_Pade };
      const char* const streamNames[3] = { "stream_E", "stream_H", "stream_Pade" };
      for (int s = 0; s < 3; ++s) {
        const cudaError_t err = cudaStreamCreate(streams[s]);
        if (err != cudaSuccess) {
          fprintf(stderr, "FemGrp: cudaStreamCreate(%s) failed: %s\n",
                  streamNames[s], cudaGetErrorString(err));
        }
      }
    }

    // Device field buffers are allocated later
    En_d = nullptr;
    Hn12_d = nullptr;
    En1_d = nullptr;
    Hn32_d = nullptr;
  #endif
}

// Destructor. The body is intentionally or accidentally empty: none of the
// arrays allocated with new[] by the read* methods (ndARRAY, tetARRAY, bcMap,
// objProp, ...) nor the CUDA streams created in the constructor are released
// here.
// NOTE(review): confirm whether cleanup is performed elsewhere before adding
// delete[] / cudaStreamDestroy calls — ownership is not visible from here.
FemGrp::~FemGrp(){

}

// Read "<fname>.node" and build ndARRAY.
//
// File layout as consumed here:
//   <unit>
//   <nodeTotal>
//   nodeTotal records of: <pType> <Priority> <singORDER> <x> <y> <z>
// Coordinates are multiplied by <unit> before being stored. When usePade is
// set, the bounding-box helpers are reset first and updated with every node.
//
// Terminates the program with exit(1) if the file cannot be opened, the file
// name does not fit the buffer, or the header / a node record cannot be parsed.
void FemGrp::readNODE(){

  // Read only the nodes belonging to this subdomain and neighbors
  char nname[StrLenShort];

  // Bounded formatting: a long base name must not overrun the buffer.
  const int nameLen = snprintf(nname, sizeof(nname), "%s.node", fname);
  if(nameLen < 0 || nameLen >= (int)sizeof(nname)){
    cout << "File name " << fname << ".node is too long " << endl;
    exit(1);
  }
  ifstream nodefile(nname, ios::in);

  if(!nodefile){
    cout << "File " << nname << " does NOT exist " << endl;
    exit(1);
  }

  if(usePade){
    initializeMaxMinPoints();
  }

  int nodeTotal = 0;
  if(!(nodefile >> unit >> nodeTotal)){
    cout << "File " << nname << " has an unreadable header " << endl;
    exit(1);
  }
  nodeCNT = nodeTotal; // only one domain, global = local
  if(nodeCNT >= 1){
    ndARRAY = new node[nodeCNT];
    for(int k = 0; k < nodeTotal; k ++){
      int pType;
      fp_t singORDER, Priority, x, y, z;
      // Stop on a short or malformed file instead of storing garbage values.
      if(!(nodefile >> pType >> Priority >> singORDER >> x >> y >> z)){
        cout << "File " << nname << " is truncated or malformed at node " << k << endl;
        exit(1);
      }
      ndARRAY[k].set_globalId(k);
      ndARRAY[k].set_n(k);
      ndARRAY[k].set_pType(pType);
      ndARRAY[k].setPType(static_cast<PointType>(pType));
      ndARRAY[k].set_singORDER(singORDER);
      ndARRAY[k].set_coord(x * unit, y * unit, z * unit);
      // ndARRAY[k].print();
      if(usePade){
        setMaxMinPoints(x * unit, y * unit, z * unit);
      }
    }
    // NOTE(review): printed even when usePade is false, in which case the
    // bounding box is not updated by this function.
    cout << "MaxPoint = (" << maxPoint.getx() << ", " << maxPoint.gety() << ", " << maxPoint.getz() << ") " << endl;
    cout << "MinPoint = (" << minPoint.getx() << ", " << minPoint.gety() << ", " << minPoint.getz() << ") " << endl;
  }
}

void FemGrp::readTETRA(){
  // Read only the tetras in this subdomain and neighbors
  int  i, j, objNum, ndid[NumOfNodes], bcd[NumOfFaces], sNum[NumOfFaces];
  node *nd[NumOfNodes];
  char tname[StrLenShort];

  readBcMap(); // read in surface-btype map

  sprintf(tname, "%s.tetra", fname);
  ifstream tetrafile(tname, ios::in);

  if(!tetrafile){
    cout << "File " << tname << " does NOT exist " << endl;
    exit(1);
  }

  int tetraTotal;
  tetrafile >> tetraTotal;

  // Only one domain exists
  tetraCNT = tetraTotal;

  if(tetraCNT >= 1){
    tetARRAY = new tetra[tetraCNT];

    for(i = 0; i < tetraTotal; i ++){
      tetrafile >> objNum;
      if(objNum > totalObjNum)
        totalObjNum = objNum;
      tetrafile >> ndid[0] >> ndid[1] >> ndid[2] >> ndid[3]; //get the ids of the nodes
      tetrafile >> sNum[0] >> sNum[1] >> sNum[2] >> sNum[3]; //get the bc number of the faces

      for(j = 0; j < 4; j++){
        nd[j] = &(ndARRAY[ndid[j]]);
        bcd[j] = bcMap[sNum[j]];
      }

      tetARRAY[i].set_objNum(objNum);
      tetARRAY[i].set_node(nd[0], nd[1], nd[2], nd[3]);
      tetARRAY[i].set_nbc(bcd[0], bcd[1], bcd[2], bcd[3]);
      tetARRAY[i].reArrange(); //set the nodes and bc from smallest to greatest id
      tetARRAY[i].setcnt(i);
    }
  }
}

// Read "<fname>.bcmap" into bcMap, the table that translates a surface number
// into a boundary-condition number.
//
// File layout as consumed here:
//   <surfCNT>
//   surfCNT records of: <sNum> <bNum>
// bcMap gets surfCNT + 1 entries so that surface numbers 0..surfCNT are valid
// indices; entry 0 and every surface not listed in the file map to 0.
// Nothing is allocated when surfCNT is not positive.
//
// Terminates the program with exit(1) if the file cannot be opened, the file
// name does not fit the buffer, or a record names a surface outside
// [0, surfCNT]. A file that ends early produces a warning and leaves the
// remaining entries at 0.
// NOTE(review): a previously allocated bcMap is not released here; this
// function is called from both readTETRA() and readBC().
void FemGrp::readBcMap(){
  char name[StrLenShort];

  // Bounded formatting: a long base name must not overrun the buffer.
  const int nameLen = snprintf(name, sizeof(name), "%s.bcmap", fname);
  if(nameLen < 0 || nameLen >= (int)sizeof(name)){
    cout << "File name " << fname << ".bcmap is too long " << endl;
    exit(1);
  }
  ifstream foo(name, ios::in);

  if(!foo){
    cout << "File " << name << " does NOT exist " << endl;
    exit(1);
  }

  int surfCNT = 0;
  foo >> surfCNT;
  if(surfCNT > 0){
    // Value-initialised so that surfaces missing from the file map to 0
    // instead of to indeterminate memory.
    bcMap = new int[surfCNT + 1]();
    bcMap[0] = 0;
    for(int i = 0; i < surfCNT; i ++){
      int sNum, bNum;
      if(!(foo >> sNum >> bNum)){
        cout << "Warning: file " << name << " ended after " << i
             << " of " << surfCNT << " entries " << endl;
        break;
      }
      // sNum is used as an index; an out-of-range value would corrupt memory.
      if(sNum < 0 || sNum > surfCNT){
        cout << "File " << name << ": surface number " << sNum
             << " is outside [0, " << surfCNT << "] " << endl;
        exit(1);
      }
      bcMap[sNum] = bNum;
    }
  }
}

void FemGrp::readMaterial(){
  char name[StrLenShort], matName[StrLenShort], dirName[StrLenShort], tmpName[StrLenShort], materialName[StrLenShort];
  int i, j, k;
  fp_t real, imaginary, cval, temp;
  FILE *matFILE;

  totalObjNum ++;
  objProp = new Material[totalObjNum];

  sprintf(name, "%s.prop", fname);
  ifstream foo(name, ios::in);
  if(!foo){
    cout << "File " << name << " does NOT exist " << endl;
    exit(1);
  }

  foo >> dirName; //directory where the materials are storaged
  DEBUG_INFO("totalObjNum: " + to_string(totalObjNum));
  //TODO: it only takes the real part
  for(i = 0; i < totalObjNum; i++)
  {
    foo >> materialName;
    sprintf(matName, "%s/%s.m", dirName, materialName);
    matFILE = fopen(matName, "r");
    cout << "Reading material properties from file: " << materialName << endl;
    fscanf(matFILE, "%s", tmpName);

    // relative dielectric constant
    for(j = 0; j < NumOfUnitaryVectors; j ++){
      for(k = 0; k < NumOfUnitaryVectors; k ++){
        #ifdef DGTD_USE_DOUBLE
          fscanf(matFILE, "%le %le ", &real, &imaginary);
        #else
          fscanf(matFILE, "%e %e ", &real, &imaginary);
        #endif
        cval = real;
        objProp[i].epsr.setEntry(j, k, cval);
      }
    }

    // relative permeability
    for(j = 0; j < NumOfUnitaryVectors; j ++){
      for(k = 0; k < NumOfUnitaryVectors; k ++){
        #ifdef DGTD_USE_DOUBLE
          fscanf(matFILE, "%le %le ", &real, &imaginary);
        #else
          fscanf(matFILE, "%e %e ", &real, &imaginary);
        #endif
        cval = real;
        objProp[i].mur.setEntry(j, k, cval);
      }
    }

    // conductivity
    for(j = 0; j < NumOfUnitaryVectors; j ++){
      for(k = 0; k < NumOfUnitaryVectors; k ++){
        #ifdef DGTD_USE_DOUBLE
          fscanf(matFILE, "%le ", &real);
        #else
          fscanf(matFILE, "%e ", &real);
        #endif
        cval = real;
        objProp[i].sigma.setEntry(j, k, cval);
      }
    }

    objProp[i].rum = objProp[i].mur.inverse();

    // Tag Scattering Region
    if (strncmp(materialName, "scattering", 10) == 0)
    {
      objProp[i].scattering_region = true;
    }
    else
    {
      objProp[i].scattering_region = false;
    }


    // PML
    if (strncmp(materialName, "pml", 3) == 0)
    {

        PML_flag = true;

        // Set Tetrahedron PML type true
        objProp[i].set_PML_Flag(1);
        cout << "PML Material Properties: " << endl;

        // PML Max Conductivity
        fp_t conductivity_PML = objProp[i].sigma.getEntry(0, 0);
        cout << "conductivity_PML = " << conductivity_PML <<  endl;

        // PML Order
        #ifdef DGTD_USE_DOUBLE
        fscanf(matFILE, "%le ", &temp);
        #else
        fscanf(matFILE, "%e ", &temp);
        #endif
        PML_conductivity_order = temp;
        objProp[i].set_PML_m_ord(PML_conductivity_order);
        cout << "PML_m_ord: " << objProp[i].get_PML_m_ord() << endl;

        // PML Thickness
        #ifdef DGTD_USE_DOUBLE
        fscanf(matFILE, "%le ", &temp);
        #else
        fscanf(matFILE, "%e ", &temp);
        #endif
        PML_thickness = temp;
        objProp[i].set_PML_thick(PML_thickness);
        cout << "PML_thickness: " << objProp[i].get_PML_thick() << endl;

        // PML Geometry
        #ifdef DGTD_USE_DOUBLE
        fscanf(matFILE, "%le ", &temp);
        #else
        fscanf(matFILE, "%e ", &temp);
        #endif
        Ellipse_Rx = temp;

        #ifdef DGTD_USE_DOUBLE
        fscanf(matFILE, "%le ", &temp);
        #else
        fscanf(matFILE, "%e ", &temp);
        #endif
        Ellipse_Ry = temp;

        #ifdef DGTD_USE_DOUBLE
        fscanf(matFILE, "%le ", &temp);
        #else
        fscanf(matFILE, "%e ", &temp);
        #endif
        Ellipse_Rz = temp;

        cout << "Ellipse_Rx: " << Ellipse_Rx << endl;
        cout << "Ellipse_Ry: " << Ellipse_Ry << endl;
        cout << "Ellipse_Rz: " << Ellipse_Rz << endl;

        #ifdef DGTD_USE_DOUBLE
        fscanf(matFILE, "%le ", &temp);
        #else
        fscanf(matFILE, "%e ", &temp);
        #endif
        planewave_xmin = temp;

        #ifdef DGTD_USE_DOUBLE
        fscanf(matFILE, "%le ", &temp);
        #else
        fscanf(matFILE, "%e ", &temp);
        #endif
        planewave_xmax = temp;

        #ifdef DGTD_USE_DOUBLE
        fscanf(matFILE, "%le ", &temp);
        #else
        fscanf(matFILE, "%e ", &temp);
        #endif
        planewave_ymin = temp;

        #ifdef DGTD_USE_DOUBLE
        fscanf(matFILE, "%le ", &temp);
        #else
        fscanf(matFILE, "%e ", &temp);
        #endif
        planewave_ymax = temp;

        #ifdef DGTD_USE_DOUBLE
        fscanf(matFILE, "%le ", &temp);
        #else
        fscanf(matFILE, "%e ", &temp);
        #endif
        planewave_zmin = temp;

        #ifdef DGTD_USE_DOUBLE
        fscanf(matFILE, "%le ", &temp);
        #else
        fscanf(matFILE, "%e ", &temp);
        #endif
        planewave_zmax = temp;

        cout << "PML Region:\n";
        cout << "  x: [" << planewave_xmin << ", " << planewave_xmax << "]\n";
        cout << "  y: [" << planewave_ymin << ", " << planewave_ymax << "]\n";
        cout << "  z: [" << planewave_zmin << ", " << planewave_zmax << "]\n";
    }
    else
    {
      // Set Tetrahedron PML type false
      objProp[i].set_PML_Flag(0);
    }


    fclose(matFILE);


  }
}

void FemGrp::readBC()
{
  char name[StrLenShort], bcName[StrLenShort], portName[StrLenShort];
  int i, pNum, bNum, bType;
  fp_t impR, impI, magE;
  fp_t x, y, z;
  fp_t theta, phi;
  fp_t rox, roy, roz;
  fp_t r1x, r1y, r1z;
  int PortFlag;
  fp_t CHIRP_BW_MHZ;
  fp_t phaseE;
  fp_t port_dx, port_dy, port_dz;
  fp_t vpath_x, vpath_y, vpath_z;
  fp_t epr, mur;

  PEC_PMC_port_flag = 0;


  // For ports
  bcNumToPnum.clear();
  pnumToBcNum.clear();

  // For PML
  int pmlMode;  // 0->radiation(port) , 1->scattering
  fp_t pol_x, pol_y, pol_z;
  PML_flag = false;

  readBcMap();

  sprintf(name, "%s.bc", fname);
  ifstream foo(name, ios::in);

  if(!foo){
    cout << "File " << name << " does NOT exist " << endl;
    exit(1);
  }


  foo >> bcCNT;

  bcARRAY = new bc[bcCNT];
  portCNT = 0;
  nonConformalCNT = 0;
  for(i = 0; i < bcCNT; i ++)
  {
    foo >> bNum >> bcName;
    bcARRAY[i].set_bNum(bNum); // id in file
    bcARRAY[i].set_name(bcName); // name in file
    bType = bcTypeConvert(bcName);
    bcARRAY[i].set_bType(bType);

    switch (bType)
    {
      case 0: // none
      {
        break;
      }

      case pmcType:  // pmc
      {
        break;
      }

      case fieldPlaneType:
      {
        break; // fieldPlane
      }

      case outputSurfType:
      {
        cout << "outputSurfType" << endl;
        break;
      }

      case abcType:
      {
        foo >> impR; //abc
        bcARRAY[i].set_rval(impR * No);
        break;
      }

      case constE:
      {
        foo >> x >> y >> z; // constE
        bcARRAY[i].SETFIELD(x, y, z);
        break;
      }

      case pecType:
      {
        break; // pec
      }

      case impType:
      {
        foo >> impR >> impI;   //original
        bcARRAY[i].set_cval(impR, impI);
        break;
      }

      case portType:
      {

        //  (1) TEM rectangular port
        //  port <name> <pNum> 1 <impR> <impI> <magE>  <dx> <dy> <dz>   <BW> <epr> <mur>   <vpath_x> <vpath_y> <vpath_z>
        //  (2) TEM coaxial port
        //  port <name> <pNum> 2 <impR> <impI> <magE>  <dx> <dy> <dz>   <BW> <epr> <mur>  <r0x> <r0y> <r0z>   <r1x> <r1y> <r1z>   <r2x> <r2y> <r2z>
        //  (3) TE rectangular port (a is along height and b is along width)
        //  port <name> <pNum> 3 <impR> <impI> <magE>  <dx> <dy> <dz>   <BW> <epr> <mur>  <a> <b> <m> <n>   <uv0x> <uv0y> <uv0z>   <vpx> <vpy> <vpz>

        pNum = -1;
        PortFlag = 0;

        if (!(foo >> portName >> pNum >> PortFlag))
        {
          std::cerr << "[PORT] Failed to read <name pNum PortFlag>\n";
          break;
        }
        cout << "pNum = " << pNum << endl;

        // Initialization of the variables
        impR=0.0, impI=0.0, magE=1.0;
        port_dx=0.0, port_dy=0.0, port_dz=1.0;
        CHIRP_BW_MHZ=0.0, epr=1.0, mur=1.0;

        if (!(foo >> impR >> impI >> magE >> port_dx >> port_dy >> port_dz >> CHIRP_BW_MHZ >> epr >> mur))
        {
          std::cerr << "[PORT] Failed to read common fields for port " << portName << "\n";
          break;
        }

        // Book-keeping
        bcARRAY[i].set_name(portName);
        bcARRAY[i].set_cval(impR, impI);
        bcARRAY[i].set_rval(impR);
        bcARRAY[i].set_pNum(pNum);
        bcARRAY[i].set_PortFlag(PortFlag);

        portCNT++;
        PWorPort   = 1;
        PortBCFlag = true;

        // If user gives impR==0, let device compute eta
        const double MU0  = 1.2566370614359173e-6; // 4π·1e-7
        const double EPS0 = 8.854187817e-12;
        const double PI   = 3.14159265358979323846;
        if (epr <= 0.0)  epr = 1.0;
        if (mur <= 0.0)  mur = 1.0;
        const double mu  = mur * MU0;
        const double eps = epr * EPS0;

        #if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL)


          excitationProp.ExcitationFlag = ExcitFlag;

          ExcitationProp portEx{};
          portEx.portNum = pNum;
          portEx.BCNum   = i;

          // Timing / envelope
          portEx.TimeDistributionFlag = getTimeDist();
          portEx.to     = To;
          portEx.tau    = Tau;
          portEx.freq_m = (fp_t_ts)freq;            // MHz
          portEx.CHIRP_BW_MHZ = (fp_t_ts)CHIRP_BW_MHZ;

          // Medium / amplitude
          portEx.epr = (fp_t_ts)((epr>0.0)? epr : 1.0);
          portEx.mur = (fp_t_ts)((mur>0.0)? mur : 1.0);
          portEx.Emagnitude = (fp_t_ts)magE;

          // Direction vector (diagnostic; geometry gives unit normal)
          portEx.PortDirection[0] = (fp_t_ts)port_dx;
          portEx.PortDirection[1] = (fp_t_ts)port_dy;
          portEx.PortDirection[2] = (fp_t_ts)port_dz;

          // E/H field impedance (if 0, device computes implicitly)
          portEx.PortImpedance = (fp_t_ts)impR;
          portEx.PortFlag = PortFlag;

          // Map BC <-> Port
          bcNumToPnum[portEx.BCNum]   = portEx.portNum;
          pnumToBcNum[portEx.portNum] = portEx.BCNum;


          // ---- Branch by PortFlag for extra fields ----
          switch (PortFlag)
          {
            case 1: // TEM rectangular: needs vpath
            {
              double vpx=0, vpy=0, vpz=0;
              if (!(foo >> vpx >> vpy >> vpz))
              {
                std::cerr << "[PORT] TEM-rect missing <vpath_x vpath_y vpath_z> for " << portName << "\n";
                // Default vpath to PortDirection if absent
                vpx = port_dx; vpy = port_dy; vpz = port_dz;
              }
              portEx.vpath[0] = (fp_t_ts)vpx;
              portEx.vpath[1] = (fp_t_ts)vpy;
              portEx.vpath[2] = (fp_t_ts)vpz;


              if (impR == 0.0 && impI == 0.0)
              {
                double eta =  std::sqrt(mu/eps); // η = sqrt(μ/ε)
                portEx.PortImpedance = (fp_t_ts)eta;
                bcARRAY[i].set_rval(eta);
              }
              else
              {
                portEx.PortImpedance = (fp_t_ts)impR;
                bcARRAY[i].set_rval(impR);
              }

              break;
            }

            case 2: // TEM coax: needs r0 (center), r1 (inner), r2 (outer)
            {
              double r0x, r0y, r0z, r1x, r1y, r1z, r2x, r2y, r2z;
              if (!(foo >> r0x >> r0y >> r0z
                        >> r1x >> r1y >> r1z
                        >> r2x >> r2y >> r2z)) {
                std::cerr << "[PORT] TEM-coax missing r0/r1/r2 for " << portName << "\n";
                // Provide safe defaults (degenerate; will inject 0)
                r0x=r0y=r0z=0; r1x=1e-3; r1y=r1z=0; r2x=4e-3; r2y=r2z=0;
              }
              portEx.r0_port[0]=(fp_t_ts)r0x; portEx.r0_port[1]=(fp_t_ts)r0y; portEx.r0_port[2]=(fp_t_ts)r0z;
              portEx.r1_port[0]=(fp_t_ts)r1x; portEx.r1_port[1]=(fp_t_ts)r1y; portEx.r1_port[2]=(fp_t_ts)r1z;
              portEx.r2_port[0]=(fp_t_ts)r2x; portEx.r2_port[1]=(fp_t_ts)r2y; portEx.r2_port[2]=(fp_t_ts)r2z;

              std::array<double,3> v10 = { r1x - r0x, r1y - r0y, r1z - r0z };
              std::array<double,3> v20 = { r2x - r0x, r2y - r0y, r2z - r0z };
              const double a = std::sqrt(v10[0]*v10[0] + v10[1]*v10[1] + v10[2]*v10[2]);
              const double b = std::sqrt(v20[0]*v20[0] + v20[1]*v20[1] + v20[2]*v20[2]);

              if (impR == 0.0 && impI == 0.0)
              {
                double eta =  std::sqrt(mu/eps); // η = sqrt(μ/ε)

                // Characteristic (V/I) line impedance of the coax
                double Z0_line = std::numeric_limits<double>::quiet_NaN();
                bool geom_ok = (a > 0.0) && (b > a);
                if (geom_ok)
                {
                  Z0_line = (eta / (2.0*PI)) * std::log(b/a);
                }
                else
                {
                  std::cerr << "[PORT] TEM-coax invalid radii (a=" << a << ", b=" << b
                            << "). Using only field impedance eta for BC.\n";
                }


                portEx.PortImpedance = (fp_t_ts)Z0_line;
                bcARRAY[i].set_rval(Z0_line);
              }
              else
              {
                portEx.PortImpedance = (fp_t_ts)impR;
                bcARRAY[i].set_rval(impR);
              }

              break;
            }


            case 3: // TE_mn rectangular: needs rect_a rect_b m n uv0x uv0y uv0z vpx vpy vpz
            {
              double rect_a, rect_b;
              int m, n;
              double uv0x, uv0y, uv0z;
              double vpx, vpy, vpz;

              if (!(foo >> rect_a >> rect_b >> m >> n >> uv0x >> uv0y >> uv0z >> vpx  >> vpy  >> vpz))
              {
                std::cerr << "[PORT] TE_mn missing <a b m n uv0x uv0y uv0z vpx vpy vpz> for " << portName << "\n";
                // Safe defaults (device clamps tiny a/b)
                rect_a = 1.0; rect_b = 1.0; m = 1; n = 0;
                uv0x = uv0y = uv0z = 0.0;
                // use PortDirection as fallback vpath
                vpx  = port_dx; vpy  = port_dy; vpz  = port_dz;
              }

              portEx.rect_a = (fp_t_ts)rect_a;
              portEx.rect_b = (fp_t_ts)rect_b;
              portEx.m = m;
              portEx.n = n;
              portEx.uv0[0]=(fp_t_ts)uv0x;
              portEx.uv0[1]=(fp_t_ts)uv0y;
              portEx.uv0[2]=(fp_t_ts)uv0z;

              // store the raw vpath too (optional, but handy for logging/diagnostics)
              portEx.vpath[0] = (fp_t_ts)vpx;
              portEx.vpath[1] = (fp_t_ts)vpy;
              portEx.vpath[2] = (fp_t_ts)vpz;

              // ---- Build t1, t2 from vpath and PortDirection (n) ----
              // n = normalized PortDirection
              double nx = port_dx, ny = port_dy, nz = port_dz;
              double nrm = std::sqrt(nx*nx + ny*ny + nz*nz);
              if (nrm < 1e-14) { nx = 0.0; ny = 0.0; nz = 1.0; nrm = 1.0; }
              nx /= nrm; ny /= nrm; nz /= nrm;

              double t1x = vpx;
              double t1y = vpy;
              double t1z = vpz;

              // t2 = n × t1
              double t2x = ny*t1z - nz*t1y;
              double t2y = nz*t1x - nx*t1z;
              double t2z = nx*t1y - ny*t1x;
              double t2n = std::sqrt(t2x*t2x + t2y*t2y + t2z*t2z);
              t2x /= t2n; t2y /= t2n; t2z /= t2n;

              // store in the excitation
              portEx.t1[0] = (fp_t_ts)t1x; portEx.t1[1] = (fp_t_ts)t1y; portEx.t1[2] = (fp_t_ts)t1z;
              portEx.t2[0] = (fp_t_ts)t2x; portEx.t2[1] = (fp_t_ts)t2y; portEx.t2[2] = (fp_t_ts)t2z;


              if (impR == 0.0 && impI == 0.0)
              {
                // Geometry (meters) & mode indices already read into rect_a, rect_b, m, n
                const double a = (rect_a > 0.0) ? rect_a : 1e-12;
                const double b = (rect_b > 0.0) ? rect_b : 1e-12;

                // Frequency (MHz in your code)
                const double omega = 2.0 * PI * freq * 1.0e6;

                const double kc2 = std::pow(m*PI/a, 2.0) + std::pow(n*PI/b, 2.0); // k_cutoff^2
                const double k2  = omega*omega * mu * eps;                        // k^2

                double Z_TE_real = std::numeric_limits<double>::quiet_NaN();
                double Z_TE_imag = 0.0;

                if (k2 <= kc2)
                {
                  // Below cutoff: Z_TE = -j*(ωμ/α), purely reactive
                  const double alpha = std::sqrt(kc2 - k2);
                  Z_TE_imag = -(omega * mu) / alpha;
                  Z_TE_real = 1e12;  // large real placeholder for BC scalar
                  std::cerr << "[PORT] TE_mn below cutoff (a=" << a << ", b=" << b
                            << ", m=" << m << ", n=" << n << "). Using large real Z for BC, "
                            << "Im{Z_TE}=" << Z_TE_imag << " Ohm.\n";
                }
                else
                {
                  // Above cutoff: Z_TE is real and positive
                  const double beta = std::sqrt(k2 - kc2);
                  Z_TE_real = (omega * mu) / beta;
                }

                // User asked us to determine impedance → store TE wave impedance
                portEx.PortImpedance = (fp_t_ts)Z_TE_real;
                bcARRAY[i].set_rval(Z_TE_real);
                bcARRAY[i].set_cval(Z_TE_real, Z_TE_imag);
              }
              else
              {
                // User-specified
                portEx.PortImpedance = (fp_t_ts)impR;
                bcARRAY[i].set_rval(impR);
                bcARRAY[i].set_cval(impR, impI);
              }


              break;
            }

            default:
            {
              std::cerr << "[PORT] Unknown PortFlag=" << PortFlag << " for " << portName
                        << ". Defaulting to TEM-rect with vpath=PortDirection.\n";
              portEx.PortFlag = 1;
              portEx.vpath[0] = (fp_t_ts)port_dx;
              portEx.vpath[1] = (fp_t_ts)port_dy;
              portEx.vpath[2] = (fp_t_ts)port_dz;
              if (impR == 0.0 && impI == 0.0) portEx.PortImpedance = (fp_t_ts)0.0;
              break;
            }
          }

          portExcitations.push_back(portEx);

          // Log summary
          std::cout << "\n=========================\n"
                    << " PORT BOUNDARY CONDITION \n"
                    << "=========================\n"
                    << "PortName   : " << portName << "\n"
                    << "PortNum    : " << (portEx.portNum - 1) << "\n"
                    << "PortFlag   : " << portEx.PortFlag << " (1=TEM-rect, 2=TEM-coax, 3=TE_mn)\n"
                    << "E/H Zport  : " << portEx.PortImpedance << " + j" << impI << " (0 => implicit)\n"
                    << "magE       : " << portEx.Emagnitude << "\n"
                    << "PortDir    : (" << port_dx << ", " << port_dy << ", " << port_dz << ")\n"
                    << "epr, mur   : " << epr << ", " << mur << "\n";

          if (portEx.PortFlag == 1)
          {
            std::cout << "vpath      : (" << portEx.vpath[0] << ", " << portEx.vpath[1] << ", " << portEx.vpath[2] << ")\n";
          }
          else if (portEx.PortFlag == 2)
          {
            std::cout << "r0         : (" << portEx.r0_port[0] << ", " << portEx.r0_port[1] << ", " << portEx.r0_port[2] << ")\n"
                      << "r1(inner)  : (" << portEx.r1_port[0] << ", " << portEx.r1_port[1] << ", " << portEx.r1_port[2] << ")\n"
                      << "r2(outer)  : (" << portEx.r2_port[0] << ", " << portEx.r2_port[1] << ", " << portEx.r2_port[2] << ")\n";
          }
          else if (portEx.PortFlag == 3)
          {
            std::cout << "rect(a,b)  : " << portEx.rect_a << ", " << portEx.rect_b << "\n"
                      << "m,n        : " << portEx.m << ", " << portEx.n << "\n"
                      << "uv0        : (" << portEx.uv0[0] << ", " << portEx.uv0[1] << ", " << portEx.uv0[2] << ")\n";
          }
          std::cout << "=========================\n\n";
          #endif


          break;
      }


      case planeWaveType: // planeWave (theta,  phi, ex, ey, ez)
      {
        char typeName[StrLenShort];
        foo >> typeName >> magE >> theta >> phi >> x >> y >> z >> rox >> roy >> roz;

        cout << " " << endl;
        cout << "====================================================================================================" << endl;
        cout << "                                  PLANEWAVE BOUNDARY CONDITION                                      " << endl;
        cout << "====================================================================================================" << endl;
        printf(" PlaneWaveType : %f %f %f %f %f %f %f %f %f\n", magE, theta, phi, x, y, z, rox, roy, roz);
        printf(" Unit          : %f\n", unit);
        bcARRAY[i].set_name(typeName);
        bcARRAY[i].set_magE(magE);
        bcARRAY[i].setTheta(theta);
        bcARRAY[i].setPhi(phi);
        bcARRAY[i].set_cval(No, 0.0);
        bcARRAY[i].SETFIELD(x, y, z);
        bcARRAY[i].setPW_ro(rox * unit, roy * unit, roz * unit);

        cout << " Name          : " << typeName << endl;
        cout << " magE          : " << magE << endl;
        cout << " Theta         : " << theta << endl;
        cout << " Phi           : " << phi << endl;
        cout << " POL           : " << "(" << x << ", " << y << ", " << z << ")" << endl;
        cout << " r0            : " << "(" << rox << ", " << roy << ", " << roz << ")" << endl;
        cout << "====================================================================================================" << endl;
        cout << " " << endl;

        PWorPort = 0;

        #if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL)
          // for cuda kernel
          excitationProp.ro[0] = rox * unit;
          excitationProp.ro[1] = roy * unit;
          excitationProp.ro[2] = roz * unit;

          excitationProp.Emagnitude = magE;

          excitationProp.Epol[0] = x;
          excitationProp.Epol[1] = y;
          excitationProp.Epol[2] = z;

          excitationProp.ExcitationFlag = ExcitFlag;
          excitationProp.freq_m = freq;

          excitationProp.to = To;
          excitationProp.tau = Tau;

          excitationProp.phi = phi;
          excitationProp.theta = theta;
        #endif

        interior_excitation_flag = false;
        planeWaveMesh = new PlaneWaveMesh;
        planeWaveMesh->setName(typeName);
        PlaneWaveBCFlag = true;

        break;
      }

      case nonConformal:
      {
        nonConformalCase = true;
        break;
      }


      // Excitation Mode (PlaneWave into PML region)
      case pmlType:
      {
        foo >> pmlMode >> portName >> magE >> theta >> phi >> pol_x >> pol_y >> pol_z >> rox >> roy >> roz;

        PWorPort = 0;

        std::cout << "\n";
        std::cout << "====================================================================================================" << std::endl;
        std::cout << "                                  PML EXCITATION BOUNDARY CONDITION                                 " << std::endl;
        std::cout << "====================================================================================================" << std::endl;
        printf(" PML Mode      : %d\n", pmlMode);
        printf(" Port Name     : %s\n", portName);
        printf(" magE          : %f\n", magE);
        printf(" Theta         : %f\n", theta);
        printf(" Phi           : %f\n", phi);
        printf(" POL           : (%f, %f, %f)\n", pol_x, pol_y, pol_z);
        printf(" r0            : (%f, %f, %f)\n", rox, roy, roz);
        printf(" Unit          : %f\n", unit);  // Make sure `unit` is defined
        std::cout << "====================================================================================================" << std::endl;
        std::cout << "\n";

        // Apply to BC object
        bcARRAY[i].set_name(portName);
        bcARRAY[i].set_magE(magE);
        bcARRAY[i].setTheta(theta);
        bcARRAY[i].setPhi(phi);
        bcARRAY[i].set_cval(No, 0.0);
        bcARRAY[i].SETFIELD(pol_x, pol_y, pol_z);  // Assuming SETFIELD is for polarization
        bcARRAY[i].setPW_ro(rox * unit, roy * unit, roz * unit);


        #if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL)

          excitationProp.ro[0] = rox * unit;
          excitationProp.ro[1] = roy * unit;
          excitationProp.ro[2] = roz * unit;

          excitationProp.Emagnitude = magE;
          excitationProp.Epol[0] = pol_x;
          excitationProp.Epol[1] = pol_y;
          excitationProp.Epol[2] = pol_z;

          excitationProp.ExcitationFlag = ExcitFlag;  // Must be defined
          excitationProp.freq_m = freq;               // Must be defined

          excitationProp.to = To;     // Must be defined
          excitationProp.tau = Tau;   // Must be defined

          excitationProp.phi = phi;
          excitationProp.theta = theta;

        #endif

        if (pmlMode == 1)
        {
          interior_excitation_flag = true;

          planeWaveMesh = new PlaneWaveMesh;
          planeWaveMesh->setName(portName);
          PlaneWaveBCFlag = true;
        }
        break;
      }
    }


  }
}

/**
   Make the edge and face arrays
*/

/**
 * Return the dominant boundary type of one local edge of a tetrahedron.
 *
 * Every local edge is associated with two local faces; the result is the
 * larger of the two boundary types stored for those faces.
 *
 * @param n    local edge index, expected in 0..5
 * @param nbc  boundary type of each local face (entries 0..3 are read)
 * @return     the larger of the two face boundary types, or 0 when n is
 *             outside 0..5 (0 is also what FemGrp::bcArrange reports for an
 *             unknown marker)
 */
int localEdgebType(int n, int nbc[]){
  // Pair of local face indices consulted for each local edge.
  static const int edgeFaces[6][2] = {
    {2, 3}, {1, 3}, {1, 2}, {0, 3}, {0, 2}, {0, 1}
  };

  // An out-of-range edge index used to read two uninitialized locals;
  // give it a defined result instead.
  if(n < 0 || n >= 6)
    return 0;

  const int nb1 = nbc[edgeFaces[n][0]];
  const int nb2 = nbc[edgeFaces[n][1]];

  return (nb1 > nb2) ? nb1 : nb2;
}

void FemGrp::makeEdgeArray(){
  int i, j;

  // oversized array for edge BCs
  int* edgeBcs = new int[tetraCNT * 6];
  // store global edge ids for set/array use
  int** edgeIds = new int*[tetraCNT];
  for(i = 0; i < tetraCNT; i++)
    edgeIds[i] = new int[NumOfEdges];

  int nbc[NumOfFaces];
  list<edge*> edgeList;
  list<edge*>::iterator edgeListIter;
  edgeSetPtr = new set<edge>;
  set<edge>::iterator edgeSetIter;
  int index = 0;
  for(i = 0; i < tetraCNT; i++){
    tetra* tet = &(tetARRAY[i]);

    for(j = 0; j < NumOfFaces; j++)
      nbc[j] = bcArrange(tet->getbc(j)); //return the bc (the number define for the material) of each face

    for(j = 0; j < NumOfEdges; j++){
      int n0 = edgeMAP[j][0];
      int n1 = edgeMAP[j][1];
      int bType = localEdgebType(j, nbc); //return the most important bc of the edge checking both faces

      node* nd0 = tet->getNode(n0);
      node* nd1 = tet->getNode(n1);
      edge* eg = new edge;
      eg->setEdge(nd0, nd1);

      //add each edge just once
      edgeSetIter = edgeSetPtr->find(*eg);
      if(edgeSetIter == edgeSetPtr->end()){
        // new edge
        eg->setGlobalCnt(index);
        edgeIds[i][j] = index;
        eg->setbType(bType);
        edgeBcs[index] = bType;
        edgeSetPtr->insert(*eg);
        edgeList.push_back(eg);
        index++;
      }else{
        // set the boundary condicion of higher value if the edge was already set
        delete eg;
        edgeIds[i][j] = edgeSetIter->getGlobalCnt();
        if(bType > edgeSetIter->getbType()){
          edgeBcs[edgeIds[i][j]] = bType;
          (const_cast<edge&>(*edgeSetIter)).setbType(bType);
        }
      }
    }
  }
  // convert the list into an array
  edgeCNT = edgeList.size();
  cout << " edgeCNT        == " << edgeCNT << endl;
  edgeARRAY = new edge*[edgeCNT];
  index = 0;
  for(edgeListIter = edgeList.begin(); edgeListIter != edgeList.end(); edgeListIter++)
    edgeARRAY[index++] = *edgeListIter;

  // set the boundary conditions
  for(i = 0; i < edgeCNT; i++)
    edgeARRAY[i]->setbType(edgeBcs[i]);
  delete [] edgeBcs;

  // get tetra-edge linkage
  for(i = 0; i < tetraCNT; i++){
    for(j = 0; j < NumOfEdges; j++)
      tetARRAY[i].setEdge(edgeARRAY[edgeIds[i][j]], j);
  }
  for(i = 0; i < tetraCNT; i++)
    delete [] edgeIds[i];

  delete [] edgeIds;
}

/**
 * Collect the ids (tetra::cnt) of all tetrahedra flagged as non-conformal
 * into ncARRAY, which is allocated here with nonConformalCNT entries.
 *
 * If the number of flagged tetrahedra differs from nonConformalCNT an error
 * line is printed; in that case at most nonConformalCNT ids are stored.
 */
void FemGrp::makeNonConformalArray(){
  ncARRAY = new int[nonConformalCNT];
  int found = 0;
  for(int i = 0; i < tetraCNT; i++){
    tetra* tet = &(tetARRAY[i]);
    if(!tet->getIsNC())
      continue;

    // Never write past the allocation, even if the per-tetra flags and the
    // counter disagree; the mismatch is still reported below.
    if(found < nonConformalCNT)
      ncARRAY[found] = tet->cnt;
    found++;
  }
  if(nonConformalCNT != found)
    cout << "ERROR in makeNonConformalArray" << endl;
}


void FemGrp::makeFaceArray()
{
  int i, j;

  // oversized arrays for face BCs and a map from global IDs with PEC faces to IDs without PEC face
  int* faceBcs  = new int[tetraCNT * NumOfFaces];
  int* indexMap = new int[tetraCNT * NumOfFaces]; //TODO: review what's the use of this array
  memset(faceBcs, 0, tetraCNT * NumOfFaces * sizeof(int));
  memset(indexMap, 0, tetraCNT * NumOfFaces * sizeof(int));

  // store global face ids for set/array use
  int** faceIds = new int*[tetraCNT];
  for(i = 0; i < tetraCNT; i++){
    faceIds[i] = new int[NumOfFaces];
    memset(faceIds[i], 0, NumOfFaces * sizeof(int));
  }

  edge eg;
  list<face*> faceList;
  vector<face*> faceListVector;
  list<face*>::iterator faceListIter;
  faceSetPtr = new set<face>;
  set<face>::iterator faceSetIter;
  int index = 0;
  int indexNoPec = 0; //TODO: review what's the use of this variable
  for(i = 0; i < tetraCNT; i++){
    tetra* tet = &(tetARRAY[i]);

    for(j = 0; j < NumOfFaces; j++){
      int bcNum = tet->getbc(j); // marker
      int bType = bcArrange(bcNum); // bc type in the defines
      bc* bcPtr = getbcPtr(bcNum); // pointer to the bc
      if(bType == nonConformal && !(tet->isNonConformal)){
        nonConformalCNT++;
        tet->setIsNC(true);
      }
      node* nd0 = tet->getNode(faceMAP[j][0]);
      node* nd1 = tet->getNode(faceMAP[j][1]);
      node* nd2 = tet->getNode(faceMAP[j][2]);
      face* fc = new face;
      fc->setFace(nd0, nd1, nd2); //set a face with the nodes ordered from smallest to greatest id
      faceSetIter = faceSetPtr->find(*fc);
      if(faceSetIter == faceSetPtr->end()){
        // new face
        fc->setcnt(index);
        faceIds[i][j] = index;

        if(bType != pecType)
          indexMap[index] = indexNoPec++;

        faceBcs[index] = bType;
        fc->setbcPtr(bcPtr);

        // set up face-edge linkage
        eg.setEdge(nd1, nd2);
        fc->setEdge(edgeARRAY[edgeSetPtr->find(eg)->getGlobalCnt()], 0);
        eg.setEdge(nd0, nd2);
        fc->setEdge(edgeARRAY[edgeSetPtr->find(eg)->getGlobalCnt()], 1);
        eg.setEdge(nd1, nd0);
        fc->setEdge(edgeARRAY[edgeSetPtr->find(eg)->getGlobalCnt()], 2);

        index++;
        faceSetPtr->insert(*fc);
        faceList.push_back(fc);
        faceListVector.push_back(fc);
      }else{
        delete fc;

        faceIds[i][j] = faceSetIter->getcnt(); // the j-th local face of tetra i is an old face
        if(bType > faceSetIter->getbType()){ // choose btype with a larger value
          faceBcs[faceIds[i][j]] = bType;
          (const_cast<face&>(*faceSetIter)).setbType(bType);

          face* f = faceListVector[faceIds[i][j]];
          f->setbType(bType);
          f->setbcPtr(bcPtr);
        }
      }
    }
  }

  // convert the list into an array
  int totalFaceCount = faceList.size();
  cout << " totalFaceCount == " << totalFaceCount << endl;
  face** totalFaceArray = new face*[totalFaceCount];
  index = 0;
  for(faceListIter = faceList.begin(); faceListIter != faceList.end(); faceListIter++)
    totalFaceArray[index++] = *faceListIter;

  // set the boundary conditions
  for(i = 0; i < totalFaceCount; i++){
    totalFaceArray[i]->setbType(faceBcs[i]);
  }

  // set tetra-face linkage
  for(i = 0; i < tetraCNT; i++){
    tetra* tet = &(tetARRAY[i]);
    for(j = 0; j < 4; j++){
      face* fc = totalFaceArray[faceIds[i][j]];
      tet->setFace(fc, j);
      if(fc->hydra[0] == nullptr){ // newly found face linkage
        fc->hydra[0] = tet;
      } else { // already existed, half-linked
        fc->hydra[1] = tet;
        fc->tetraArrange(); //order hydra[0] < hydra[1]
      }
    }
  }

  for(i = 0; i < tetraCNT; i++)
    delete [] faceIds[i];
  delete [] faceIds;
  delete [] totalFaceArray;
  delete [] faceBcs;
  delete [] indexMap;

  // convert the reduced list into an array
  faceCNT = faceList.size();
  faceARRAY = new face*[faceCNT];

  indexNoPec = 0;
  for(faceListIter = faceList.begin(); faceListIter != faceList.end(); faceListIter++)
    faceARRAY[indexNoPec++] = *faceListIter;

  while (faceSetIter != faceSetPtr->end()){
    set<face>::iterator tmpIter = faceSetIter;
    faceSetIter++;
    faceSetPtr->erase(tmpIter);
  }
  faceSetPtr->clear();
  delete faceSetPtr;

  set<edge>::iterator edgeSetIter = edgeSetPtr->begin();
  while(edgeSetIter != edgeSetPtr->end()){
    set<edge>::iterator tmpIter = edgeSetIter;
    edgeSetIter++;
    edgeSetPtr->erase(tmpIter);
  }
  edgeSetPtr->clear();
  delete edgeSetPtr;
}

int FemGrp::bcArrange(int bNum){
    // from that indicated in file to type defined in bc.h (marker to bc type)
  for(int i = 0; i < bcCNT; i ++){
    if(bcARRAY[i].getbNum() == bNum)
      return bcARRAY[i].getbType();
  }
  return 0;
}

/**
 * Find the first bcARRAY entry whose marker equals bNum.
 *
 * @param bNum  boundary marker
 * @return      pointer to the matching entry inside bcARRAY, or nullptr when
 *              none of the bcCNT entries matches
 */
bc *FemGrp::getbcPtr(int bNum){
  int pos = 0;
  while(pos < bcCNT){
    if(bcARRAY[pos].getbNum() == bNum)
      return &(bcARRAY[pos]);
    ++pos;
  }
  return nullptr;
}

void FemGrp::AssignExcitParamToFace(){
  for(int i = 0; i < faceCNT; i++){
    faceARRAY[i]->setTo(To);
    faceARRAY[i]->setTau(Tau);
    faceARRAY[i]->setTimeDist(TimeDistFlag);
    faceARRAY[i]->setExciFlag(ExcitFlag);
    faceARRAY[i]->setFrequency(freq);
  }
}

void FemGrp::AssignMaterialProperties(){
  int i;
  tetra *tet;

  for(i = 0; i < tetraCNT; i++)
  {
    tet = &(tetARRAY[i]);
    tet->SetFacePEC();
    tet->SetFacePMC();
    tet->set_mat(&(objProp[tet->getobjNum()]));
    tet->set_ConductivityFlag();

    // Additional routine for scattering region
    if (tet->getMat()->scattering_region)
    {
      tet->scattering_region = true;
    }

    // Additional routine for PML
    if (tet->getMat()->get_PML_Flag() == 1)
    {
      tet->set_PML_Flag(1);
    }
    else
    {
      tet->set_PML_Flag(0);
    }
    if (tet->get_PML_Flag() == -1) cout << "PML_Flag() not set " << endl;


  }
}

/**
 * Classify the tetrahedra and build the excitation index list.
 *
 * Steps, in order:
 *   1. set_TetrahedronFlag() on every tetrahedron (OpenMP parallel loop);
 *   2. set_NeighborTetra() on every tetrahedron (OpenMP parallel loop);
 *   3. set the polynomial order from PolyFlag and count tetrahedra with
 *      TetrahedronFlag 0 / 1 and ExcitationFlag 1 (serial, prints a summary);
 *   4. store the minimum polynomial order over all tetrahedra on each of them;
 *   5. allocate TetExcitIndexArray with malloc and fill it with the indices of
 *      the tetrahedra whose ExcitationFlag is 1 (TetExcitIndexArraySize entries).
 */
void FemGrp::AssignTetraFlags(){
  int AbcCount = 0;
  int InterCount = 0;
  int PortCount = 0;

  cout << " " << endl;
  cout << "======================================================" << endl;
  cout << "             Total number of TetraHedra              " << endl;
  cout << "======================================================" << endl;
  cout << " Total number of TetraHedra                := " << tetraCNT << endl;


  // Parallelized by Qi Jian
  // The per-iteration pointer is declared inside the loop body so that each
  // thread owns its copy; a pointer declared at function scope is shared
  // between threads in a parallel-for and races.
  #pragma omp parallel for
  for(int i = 0; i < tetraCNT; i++)
  {
    tetra* tet = &(tetARRAY[i]);
    tet->set_TetrahedronFlag();
  }


  // NOTE(review): looks like one tenth of the free-space wavelength with freq
  // given in MHz -- confirm the unit of freq.
  double min_AABB_size = 3e8 / (freq * 1e6) / 10.0;
  // For every tetrahedron, set the neighbor tetrahedra
  #pragma omp parallel for
  for(int i = 0; i < tetraCNT; i++)
  {
    tetra* tet = &(tetARRAY[i]);
    tet->set_NeighborTetra(tetARRAY, ncARRAY, nonConformalCNT, &octree_object, min_AABB_size);
  }


  // The counters below are plain ints, so this loop is not thread-safe and
  // stays serial.
  for(int i = 0; i < tetraCNT; i++)
  {
    tetra* tet = &(tetARRAY[i]);
    tet->set_PolyOrderFlagDebug(PolyFlag);

    if (tet->TetrahedronFlag == 0) InterCount++;
    if (tet->TetrahedronFlag == 1) AbcCount++;
    if (tet->ExcitationFlag  == 1) PortCount++;
  }


  cout << " Total number of P" << PolyFlag << " TetraHedra             := " << tetraCNT << endl;

  cout << " Total number of Interior TetraHedra       := " << InterCount << endl;
  cout << " Total number of AbcCount TetraHedra       := " << AbcCount << endl;
  cout << " Total number of Port/PlaneWave TetraHedra := " << PortCount << endl;
  cout << "======================================================" << endl;
  cout << " " << endl;

  // Minimum polynomial order over the mesh; skipped for an empty mesh so that
  // tetARRAY[0] is never read when there is no tetrahedron.
  if(tetraCNT > 0){
    int min_poly = tetARRAY[0].get_PolyOrderFlag();
    for(int i = 1; i < tetraCNT; i++){
      if(tetARRAY[i].get_PolyOrderFlag() < min_poly)
        min_poly = tetARRAY[i].get_PolyOrderFlag();
    }

    for(int i = 0; i < tetraCNT; i++)
      tetARRAY[i].set_MinimumPoly(min_poly);
  }

  // Define Excitation tetrahedral
  TetExcitIndexArraySize = PortCount;
  TetExcitIndexArray = (int*)malloc(sizeof(int) * TetExcitIndexArraySize);
  if(TetExcitIndexArray == nullptr && TetExcitIndexArraySize > 0){
    // Allocation failed: leave an empty list rather than writing through null.
    std::cerr << " ERROR: could not allocate TetExcitIndexArray" << std::endl;
    TetExcitIndexArraySize = 0;
    return;
  }
  int index = 0;
  for(int i = 0; i < tetraCNT; i ++){
    if(tetARRAY[i].ExcitationFlag == 1){
      TetExcitIndexArray[index] = i;
      index++;
    }
  }
}

void FemGrp::makePlaneWaveMesh(){
  int i, j;
  set<int> meshNodeIds;

  // count the number of plane wave faces
  int pwFaceNum = 0;
  for(i = 0; i < faceCNT; i++){
    if(faceARRAY[i]->getbType() == planeWaveType || faceARRAY[i]->getbType() == pmlType)
      pwFaceNum++;
  }

  // set planeWaveMesh_'s faceCnt_ and allocate its faceArray_
  planeWaveMesh->setFaceCnt(pwFaceNum);
  cout << "    pwFaceNum              == " << pwFaceNum << endl;
  cout << "    planeWaveMesh->faceCNT == " << planeWaveMesh->faceCNT << endl;

  // populate faceArray_
  int index = 0;
  for(i = 0; i < faceCNT; i++){
    if(faceARRAY[i]->getbType() == planeWaveType || faceARRAY[i]->getbType() == pmlType){
      planeWaveMesh->setFace(faceARRAY[i], index);
      index++;
      // add unique node ids
      for(j = 0; j < NumOfNodesPerFace; j++)
        meshNodeIds.insert(faceARRAY[i]->getNode(j)->getid());
    }
  }

  // allocate and add node pointers to array keep local mapping
  int nodeNum = meshNodeIds.size();
  planeWaveMesh->setNodeCnt(nodeNum);
  cout << "    nodeNum                == " << nodeNum << endl;
  cout << "    planeWaveMesh->nodeCNT == " << planeWaveMesh->nodeCNT << endl;

  planeWaveMesh->allocGlobToLocMap();
  node** PlaneWaveNodeArray = planeWaveMesh->getNodeArray();
  map<int, int>& globToLocMap = planeWaveMesh->getGlobToLocMap();
  set<int>::iterator meshNodeIdIter;
  int nodeCount = 0;
  for(meshNodeIdIter = meshNodeIds.begin(); meshNodeIdIter != meshNodeIds.end(); meshNodeIdIter++){
    PlaneWaveNodeArray[nodeCount] = &(ndARRAY[*meshNodeIdIter]);
    globToLocMap[ndARRAY[*meshNodeIdIter].getid()] = nodeCount++;
  }


  // Set the bounding box coordinates for the Planewave mesh
  // Useful for PML
  /*
  planeWaveMesh->computeBoundingBox();
  planewave_xmin = planeWaveMesh->getXmin();
  planewave_xmax = planeWaveMesh->getXmax();
  planewave_ymin = planeWaveMesh->getYmin();
  planewave_ymax = planeWaveMesh->getYmax();
  planewave_zmin = planeWaveMesh->getZmin();
  planewave_zmax = planeWaveMesh->getZmax();

  cout << "Planewave bounding box coordinates: " << std::endl;
  cout << "xmin: " << planewave_xmin << ", xmax: " << planewave_xmax << std::endl;
  cout << "ymin: " << planewave_ymin << ", ymax: " << planewave_ymax << std::endl;
  cout << "zmin: " << planewave_zmin << ", zmax: " << planewave_zmax << std::endl;
  */

}

// Single BC_ID
void FemGrp::makeInterSurfMesh(int BC_id){
  cout << " Generating InterSurf Mesh with " << BC_id << endl;
  InterSurfMesh = new PlaneWaveMesh;
  int i, j;
  set<int> InterSurfNodeIds;

  // count the number of faces
  int InterFaceNum = 0;

  int* FaceMap = new int[faceCNT];
  for(i = 0; i < faceCNT; i++)
    FaceMap[i] = -1;

  // Find the faces
  for(i = 0; i < faceCNT; i++){
    if(faceARRAY[i]->getbcPtr()->getbType() == BC_id){ //change
      InterFaceNum++;
      FaceMap[i] = i;
    }
  }

  if(InterFaceNum == 0)
    return;

  // set InterSurfMesh_'s faceCnt_ and allocate its faceArray_
  cout << "    InterFaceNum           == " << InterFaceNum << endl;
  InterSurfMesh->setFaceCnt(InterFaceNum);
  cout << "    FaceNum                == " << InterFaceNum << endl;
  cout << "     ->faceCNT             == " << InterSurfMesh->faceCNT << endl;

  // populate faceArray_
  int index = 0;
  for(i = 0; i < faceCNT; i++){
    if(FaceMap[i] > 0){
      InterSurfMesh->setFace(faceARRAY[i], index);
      index++;
      // add unique node ids
      for(j = 0; j < NumOfNodesPerFace; j++)
        InterSurfNodeIds.insert(faceARRAY[i]->getNode(j)->getid());
    }
  }

  // allocate and add node pointers to array
  // keep local mapping
  int nodeNum = InterSurfNodeIds.size();
  InterSurfMesh->setNodeCnt(nodeNum);
  cout << "    nodeNum                == " << nodeNum << endl;
  cout << "     ->nodeCNT             == " << InterSurfMesh->nodeCNT << endl;
  InterSurfMesh->allocGlobToLocMap();
  node** InterSurfNodeArray = InterSurfMesh->getNodeArray();
  map<int, int>& globToLocMap = InterSurfMesh->getGlobToLocMap();
  set<int>::iterator meshNodeIdIter;

  int nodeCount = 0;
  for(meshNodeIdIter = InterSurfNodeIds.begin(); meshNodeIdIter != InterSurfNodeIds.end(); meshNodeIdIter++){
    InterSurfNodeArray[nodeCount] = &(ndARRAY[*meshNodeIdIter]);
    globToLocMap[ndARRAY[*meshNodeIdIter].getid()] = nodeCount++;
  }

  //write file
  char Currents_vtkFile[StrOutput];
  sprintf(Currents_vtkFile, "SurfBC_%s_%d", fname, BC_id);
  node** locNodeArray = new node*[InterSurfMesh->nodeCNT];
  for(i = 0; i < InterSurfMesh->nodeCNT; i++){
    node& Node = *(InterSurfMesh->ndArray[i]);
    index = InterSurfMesh->globToLocMap_->find(Node.getid())->second;
    locNodeArray[index] = new node(index,
                                   Node.getPType(),
                                   Node.getSingOrder(),
                                   Node.getCoord().getx(),
                                   Node.getCoord().gety(),
                                   Node.getCoord().getz());
  }

  face** locFaceArray = new face*[InterSurfMesh->faceCNT];
  for(i = 0; i < InterSurfMesh->faceCNT; i++){
    face& Face = *(InterSurfMesh->fcArray[i]);
    locFaceArray[i] = new face(Face);
    locFaceArray[i]->setFace(locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(0)->getid())->second],
                             locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(1)->getid())->second],
                             locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(2)->getid())->second]);
  }
  //TODO: check why unit is 1. instead of unit
  VtkWriter vtkWriter(1.);
  //TODO: check why order is 1. instead of order
  vtkWriter.writeTriUg(Currents_vtkFile, InterSurfMesh->nodeCNT, locNodeArray, InterSurfMesh->faceCNT, locFaceArray, 1);

  for(i = 0; i < InterSurfMesh->nodeCNT; i++)
    delete locNodeArray[i];

  delete [] locNodeArray;

  for(i = 0; i < InterSurfMesh->faceCNT; i++)
    delete locFaceArray[i];

  delete [] locFaceArray;
}

// Double BC_ID
void FemGrp::makeInterSurfMesh(int BC_id1,int BC_id2){

  InterSurfMesh = new PlaneWaveMesh;
  int i, j;
  set<int>  InterSurfNodeIds;
  // count the number of faces
  int  InterFaceNum = 0;
  int* FaceMap = new int[faceCNT];
  for(i = 0; i < faceCNT; i++) FaceMap[i] = -1;

  // Find the faces
   for(i = 0; i < faceCNT; i++){
     if((faceARRAY[i]->getbcPtr()->getbType() == BC_id1) || (faceARRAY[i]->getbcPtr()->getbType() == BC_id2)){
        InterFaceNum++;
        FaceMap[i] = i;
      }
   }

  if(InterFaceNum == 0) return;
  // set InterSurfMesh_'s faceCnt_ and allocate its faceArray_
  cout << "==   InterFaceNum   == " << InterFaceNum    << endl;
  InterSurfMesh->setFaceCnt(InterFaceNum);
  cout << "==   FaceNum        == " << InterFaceNum         << endl;
  cout << "== ->faceCNT        == " << InterSurfMesh->faceCNT << endl;
  // populate faceArray_
  int index = 0;
  for(i = 0; i < faceCNT; i++){
    if(FaceMap[i] > 0){
      InterSurfMesh->setFace(faceARRAY[i], index);
      index++;
      // add unique node ids
      for(j = 0; j < NumOfNodesPerFace; j++)
        InterSurfNodeIds.insert(faceARRAY[i]->getNode(j)->getid());
    }
  }
  // allocate and add node pointers to array
  // keep local mapping
  int nodeNum = InterSurfNodeIds.size();
  InterSurfMesh->setNodeCnt(nodeNum);
  cout << "== nodeNum          == " << nodeNum << endl;
  cout << "== ->nodeCNT        == " << InterSurfMesh->nodeCNT << endl;
  InterSurfMesh->allocGlobToLocMap();
  node** InterSurfNodeArray   = InterSurfMesh->getNodeArray();
  map<int, int>& globToLocMap = InterSurfMesh->getGlobToLocMap();
  set<int>::iterator meshNodeIdIter;
  int nodeCount = 0;
  for(meshNodeIdIter  = InterSurfNodeIds.begin(); meshNodeIdIter != InterSurfNodeIds.end(); meshNodeIdIter++){
    InterSurfNodeArray[nodeCount] = &(ndARRAY[*meshNodeIdIter]);
    globToLocMap[ndARRAY[*meshNodeIdIter].getid()] = nodeCount++;
  }
  //write file
  char Currents_vtkFile[StrOutput];
  sprintf(Currents_vtkFile, "SurfBC_%s", fname);
  node** locNodeArray = new node*[InterSurfMesh->nodeCNT];
  for(i = 0; i < InterSurfMesh->nodeCNT; i++){
    node& Node = *(InterSurfMesh->ndArray[i]);
    index = InterSurfMesh->globToLocMap_->find(Node.getid())->second;
    locNodeArray[index] = new node(index, Node.getPType(), Node.getSingOrder(), Node.getCoord().getx(), Node.getCoord().gety(), Node.getCoord().getz());
  }

  face** locFaceArray = new face*[InterSurfMesh->faceCNT];
  for(i = 0; i < InterSurfMesh->faceCNT; i++){
    face& Face = *(InterSurfMesh->fcArray[i]);
    locFaceArray[i] = new face(Face);
    locFaceArray[i]->setFace(
      locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(0)->getid())->second],
      locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(1)->getid())->second],
      locNodeArray[InterSurfMesh->globToLocMap_->find(Face.getNode(2)->getid())->second]);
  }

  //TODO: check why unit is 1. instead of unit
  VtkWriter vtkWriter(1.);
  //TODO: check why order is 1. instead of order
  vtkWriter.writeTriUg(Currents_vtkFile, InterSurfMesh->nodeCNT, locNodeArray, InterSurfMesh->faceCNT, locFaceArray, 1);

  for(i = 0; i < InterSurfMesh->nodeCNT; i++)
    delete locNodeArray[i];
  delete [] locNodeArray;
  for(i = 0; i < InterSurfMesh->faceCNT; i++)
    delete locFaceArray[i];
  delete [] locFaceArray;
}

// Build SurfMesh from every face whose boundary-condition type equals BC_id
// and write the resulting triangle mesh to the VTK file
// "SurfBC_<fname>_<BC_id>".
//
// BC_id : boundary type compared against face->getbcPtr()->getbType().
//
// SurfMesh is replaced by a freshly allocated PlaneWaveMesh on every call.
// When no face matches, the function returns right after that allocation and
// writes no file.
void FemGrp::makeSurfMesh(int BC_id){
  cout << "Generating Surf Mesh with " << BC_id << endl;
  SurfMesh = new PlaneWaveMesh;
  int i, j;
  set<int> InterSurfNodeIds;

  // Collect the global indices of the matching faces. Storing the indices in
  // a vector (instead of a raw "FaceMap[i] = i" array tested with "> 0")
  // keeps face 0 selectable and releases the scratch storage on every exit
  // path, including the early return below.
  std::vector<int> selectedFaces;
  for(i = 0; i < faceCNT; i++){
    if(faceARRAY[i]->getbcPtr()->getbType() == BC_id) //change
      selectedFaces.push_back(i);
  }
  const int InterFaceNum = static_cast<int>(selectedFaces.size());

  if(InterFaceNum == 0)
    return;

  // set SurfMesh_'s faceCnt_ and allocate its faceArray_
  cout << "== InterFaceNum   == " << InterFaceNum << endl;
  SurfMesh->setFaceCnt(InterFaceNum);
  cout << "== FaceNum   == " << InterFaceNum << endl;
  cout << "== ->faceCNT == " << SurfMesh->faceCNT << endl;

  // populate faceArray_ (slot k receives the k-th matching face, in
  // increasing global face index) and gather the unique node ids
  for(int slot = 0; slot < InterFaceNum; slot++){
    const int faceIdx = selectedFaces[slot];
    SurfMesh->setFace(faceARRAY[faceIdx], slot);
    for(j = 0; j < NumOfNodesPerFace; j++)
      InterSurfNodeIds.insert(faceARRAY[faceIdx]->getNode(j)->getid());
  }

  // allocate and add node pointers to array
  // keep local mapping
  int nodeNum = static_cast<int>(InterSurfNodeIds.size());
  SurfMesh->setNodeCnt(nodeNum);
  cout << "== nodeNum   == " << nodeNum << endl;
  cout << "== ->nodeCNT == " << SurfMesh->nodeCNT << endl;
  SurfMesh->allocGlobToLocMap();
  node** InterSurfNodeArray = SurfMesh->getNodeArray();
  map<int, int>& globToLocMap = SurfMesh->getGlobToLocMap();
  set<int>::iterator meshNodeIdIter;
  int nodeCount = 0;
  for(meshNodeIdIter = InterSurfNodeIds.begin(); meshNodeIdIter != InterSurfNodeIds.end(); meshNodeIdIter++){
    InterSurfNodeArray[nodeCount] = &(ndARRAY[*meshNodeIdIter]);
    globToLocMap[ndARRAY[*meshNodeIdIter].getid()] = nodeCount++;
  }

  //write file
  char Currents_vtkFile[StrOutput];
  // snprintf bounds the write to the buffer even when fname is long
  snprintf(Currents_vtkFile, sizeof(Currents_vtkFile), "SurfBC_%s_%d", fname, BC_id);

  // Temporary copies of the nodes, renumbered with their local ids, so the
  // VTK writer sees a self-contained mesh.
  node** locNodeArray = new node*[SurfMesh->nodeCNT];
  for(i = 0; i < SurfMesh->nodeCNT; i++){
    node& Node = *(SurfMesh->ndArray[i]);
    const int localId = SurfMesh->globToLocMap_->find(Node.getid())->second;
    locNodeArray[localId] = new node(localId, Node.getPType(), Node.getSingOrder(), Node.getCoord().getx(), Node.getCoord().gety(), Node.getCoord().getz());
  }

  // Temporary copies of the faces, re-pointed at the renumbered node copies.
  face** locFaceArray = new face*[SurfMesh->faceCNT];
  for(i = 0; i < SurfMesh->faceCNT; i++){
    face& Face = *(SurfMesh->fcArray[i]);
    locFaceArray[i] = new face(Face);
    locFaceArray[i]->setFace(locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(0)->getid())->second],
                             locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(1)->getid())->second],
                             locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(2)->getid())->second]);
  }

  //TODO: check why unit is 1. instead of unit (it may be because the node coordinates are already scaled after readin. So they are true unit of the geometry)
  VtkWriter vtkWriter(1.);
  vtkWriter.writeTriUg(Currents_vtkFile, SurfMesh->nodeCNT, locNodeArray, SurfMesh->faceCNT, locFaceArray, 1); //The one is because we only work with first order geometry (modify if we want to work with higher order structures)

  // release the temporary copies
  for(i = 0; i < SurfMesh->nodeCNT; i++)
    delete locNodeArray[i];
  delete [] locNodeArray;
  for(i = 0; i < SurfMesh->faceCNT; i++)
    delete locFaceArray[i];
  delete [] locFaceArray;
}

// Double BC_ID
// Build SurfMesh from every face whose boundary-condition type equals either
// BC_id1 or BC_id2 and write the resulting triangle mesh to the VTK file
// "SurfBC_<fname>".
//
// BC_id1, BC_id2 : boundary types compared against
//                  face->getbcPtr()->getbType().
//
// SurfMesh is replaced by a freshly allocated PlaneWaveMesh on every call.
// When no face matches, the function returns right after that allocation and
// writes no file.
void FemGrp::makeSurfMesh(int BC_id1,int BC_id2){

  SurfMesh = new PlaneWaveMesh;
  int i, j;
  set<int>  InterSurfNodeIds;

  // Collect the global indices of the matching faces. Storing the indices in
  // a vector (instead of a raw "FaceMap[i] = i" array tested with "> 0")
  // keeps face 0 selectable and releases the scratch storage on every exit
  // path, including the early return below.
  std::vector<int> selectedFaces;
  for(i = 0; i < faceCNT; i++){
    const int faceType = faceARRAY[i]->getbcPtr()->getbType();
    if((faceType == BC_id1) || (faceType == BC_id2))
      selectedFaces.push_back(i);
  }
  const int InterFaceNum = static_cast<int>(selectedFaces.size());

  if(InterFaceNum == 0)
    return;

  // set SurfMesh_'s faceCnt_ and allocate its faceArray_
  cout << "==   InterFaceNum   == " << InterFaceNum << endl;
  SurfMesh->setFaceCnt(InterFaceNum);
  cout << "==   FaceNum   == " << InterFaceNum << endl;
  cout << "== ->faceCNT == " << SurfMesh->faceCNT << endl;

  // populate faceArray_ (slot k receives the k-th matching face, in
  // increasing global face index) and gather the unique node ids
  for(int slot = 0; slot < InterFaceNum; slot++){
    const int faceIdx = selectedFaces[slot];
    SurfMesh->setFace(faceARRAY[faceIdx], slot);
    for(j = 0; j < NumOfNodesPerFace; j++)
      InterSurfNodeIds.insert(faceARRAY[faceIdx]->getNode(j)->getid());
  }

  // allocate and add node pointers to array
  // keep local mapping
  int nodeNum = static_cast<int>(InterSurfNodeIds.size());
  SurfMesh->setNodeCnt(nodeNum);
  cout << "== nodeNum   == " << nodeNum << endl;
  cout << "== ->nodeCNT == " << SurfMesh->nodeCNT << endl;
  SurfMesh->allocGlobToLocMap();
  node** InterSurfNodeArray = SurfMesh->getNodeArray();
  map<int, int>& globToLocMap = SurfMesh->getGlobToLocMap();
  set<int>::iterator meshNodeIdIter;
  int nodeCount = 0;
  for(meshNodeIdIter = InterSurfNodeIds.begin(); meshNodeIdIter != InterSurfNodeIds.end(); meshNodeIdIter++){
    InterSurfNodeArray[nodeCount] = &(ndARRAY[*meshNodeIdIter]);
    globToLocMap[ndARRAY[*meshNodeIdIter].getid()] = nodeCount++;
  }

  //write file
  char Currents_vtkFile[StrOutput];
  // snprintf bounds the write to the buffer even when fname is long
  snprintf(Currents_vtkFile, sizeof(Currents_vtkFile), "SurfBC_%s", fname);

  // Temporary copies of the nodes, renumbered with their local ids, so the
  // VTK writer sees a self-contained mesh.
  node** locNodeArray = new node*[SurfMesh->nodeCNT];
  for(i = 0; i < SurfMesh->nodeCNT; i++){
    node& Node = *(SurfMesh->ndArray[i]);
    const int localId = SurfMesh->globToLocMap_->find(Node.getid())->second;
    locNodeArray[localId] = new node(localId, Node.getPType(), Node.getSingOrder(), Node.getCoord().getx(), Node.getCoord().gety(), Node.getCoord().getz());
  }

  // Temporary copies of the faces, re-pointed at the renumbered node copies.
  face** locFaceArray = new face*[SurfMesh->faceCNT];
  for(i = 0; i < SurfMesh->faceCNT; i++){
    face& Face = *(SurfMesh->fcArray[i]);
    locFaceArray[i] = new face(Face);
    locFaceArray[i]->setFace(
        locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(0)->getid())->second],
        locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(1)->getid())->second],
        locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(2)->getid())->second]);
  }

  //TODO: check why unit is 1. instead of unit
  VtkWriter vtkWriter(1.);
  //TODO: check why order is 1. instead of order
  vtkWriter.writeTriUg(Currents_vtkFile, SurfMesh->nodeCNT, locNodeArray, SurfMesh->faceCNT, locFaceArray, 1);

  // release the temporary copies
  for(i = 0; i < SurfMesh->nodeCNT; i++)
    delete locNodeArray[i];
  delete [] locNodeArray;
  for(i = 0; i < SurfMesh->faceCNT; i++)
    delete locFaceArray[i];
  delete [] locFaceArray;

}

// Set up the tet mass matrices and also the local inverses
// If non-matrix free is used also precompute and store the update matrices
//
// Runs one OpenMP-parallel pass over tetARRAY[0 .. tetraCNT-1] and, for each
// processed element, calls its Calculate_* assembly routines followed by the
// SetUp_LocalFaceToTetraMap{E,H}_* routines (passing the element's Class_dt).
// The whole pass is wrapped in the "CPU Matrices Evaluation" timer.
//
// With DGTD_USE_CUDA defined, the set of routines is selected per element by
// its PML flag and by UseQuadratureMatrices (the *_Numeric variants), and an
// element is skipped when regularRegionFlag is set and it is not its own
// entry in regularReferenceARRAY. Without DGTD_USE_CUDA every element runs
// the plain (non-PML, non-Numeric) sequence.
void FemGrp::GetMatrices(){
  int i;
  tetra *tet;
  timer_start("CPU Matrices Evaluation",'u');
  // this gets the  mass matrices for the local tets only

  cout << "tetraCNT = " << tetraCNT << endl;
  //std::vector<fp_t> vec_x1, vec_y1, vec_z1;
  //std::vector<fp_t> vec_A2x, vec_A2y, vec_A2z;

  //fp_t cutoff_freq = freq * 1e6; // Convert MHz to Hz


  // tet is declared outside the loop, so it is listed as private to give
  // each thread its own copy.
  #pragma omp parallel for schedule(dynamic) private(tet,i)
  for(i = 0; i < tetraCNT; i ++)
  {

    #if defined(DGTD_USE_CUDA)


       //cout << "regularRegionFlag = " << regularRegionFlag << endl;
       //cout << "regularReferenceARRAY[" << i << "] = " << regularReferenceARRAY[i] << endl;

      //It is important in this order to avoid the checking of a null pointer
      // (short-circuit: regularReferenceARRAY is only indexed when
      // regularRegionFlag is set).
      // NOTE(review): presumably elements whose reference is another element
      // reuse that element's matrices, which is why they are skipped here -
      // confirm against the code that consumes regularReferenceARRAY.
      if(!regularRegionFlag || regularReferenceARRAY[i] == i)
      {

        tet = &(tetARRAY[i]);
        tet->set_flux_GAMMA(factor_Flux);
        bool isPML = tet->get_PML_Flag();


        // -------------------------------------------------------------------------------
        // PML element
        if (isPML)
        {

          // The conductivity profile is set from the planewave_* bounding
          // box before any PML matrix is assembled.
          tet->set_Conductivity_Profile_Planar(planewave_xmin, planewave_ymin, planewave_zmin,
          planewave_xmax, planewave_ymax, planewave_zmax);

          // PML element, quadrature-based (*_Numeric) routines
          if (UseQuadratureMatrices)
          {
            // Identity tensor passed in place of a material tensor for the
            // D and F matrices.
            tensor identity(1.0, 0.0, 0.0,
                            0.0, 1.0, 0.0,
                            0.0, 0.0, 1.0);

            // ---- E-field side ----
            tet->Calculate_M_Matrix_E_Numeric();
            tet->Calculate_M_Matrix_I_E_Numeric();
            tet->Calculate_ABC_E_Numeric();

            // Material matrices A, B, C use the element's epsr; D and F use
            // the identity tensor. The boolean argument is true for all
            // E-side calls.
            tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_epA_E, tet->matA, tet->mat->epsr, true,
                                                              "A", planewave_xmin, planewave_ymin, planewave_zmin,
                                                              planewave_xmax, planewave_ymax, planewave_zmax,
                                                              Ellipse_Rx, Ellipse_Ry, Ellipse_Rz);  // epA
            tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_epB_E, tet->matB, tet->mat->epsr, true,
                                                              "B", planewave_xmin, planewave_ymin, planewave_zmin,
                                                              planewave_xmax, planewave_ymax, planewave_zmax,
                                                              Ellipse_Rx, Ellipse_Ry, Ellipse_Rz);  // epB
            tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_epC_E, tet->matC, tet->mat->epsr, true,
                                                              "C", planewave_xmin, planewave_ymin, planewave_zmin,
                                                              planewave_xmax, planewave_ymax, planewave_zmax,
                                                              Ellipse_Rx, Ellipse_Ry, Ellipse_Rz);  // epC
            tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_D_E, tet->matD, identity, true,
                                                              "D", planewave_xmin, planewave_ymin, planewave_zmin,
                                                              planewave_xmax, planewave_ymax, planewave_zmax,
                                                              Ellipse_Rx, Ellipse_Ry, Ellipse_Rz);  // D
            tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_F_E, tet->matF, identity, true,
                                                              "F", planewave_xmin, planewave_ymin, planewave_zmin,
                                                              planewave_xmax, planewave_ymax, planewave_zmax,
                                                              Ellipse_Rx, Ellipse_Ry, Ellipse_Rz);  // F

            tet->Calculate_Bii_Matrix_E_Numeric();
            tet->Calculate_Bij_Matrix_E_Numeric();
            tet->Calculate_S_Matrix_E_Numeric();
            tet->Calculate_Fii_Matrix_E_Numeric();
            tet->Calculate_Fij_Matrix_E_Numeric();

            tet->SetUp_LocalFaceToTetraMapE_NMF1_PML(tet->Class_dt);

            // ---- H-field side ----
            tet->Calculate_M_Matrix_H_Numeric();
            tet->Calculate_M_Matrix_I_H_Numeric();
            tet->Calculate_ABC_H_Numeric();

            // Same pattern as the E side, with mur and the boolean argument
            // set to false.
            tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_muA_H, tet->matA, tet->mat->mur, false,
                                                              "A", planewave_xmin, planewave_ymin, planewave_zmin,
                                                              planewave_xmax, planewave_ymax, planewave_zmax,
                                                              Ellipse_Rx, Ellipse_Ry, Ellipse_Rz);  // muA
            tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_muB_H, tet->matB, tet->mat->mur, false,
                                                              "B", planewave_xmin, planewave_ymin, planewave_zmin,
                                                              planewave_xmax, planewave_ymax, planewave_zmax,
                                                              Ellipse_Rx, Ellipse_Ry, Ellipse_Rz);  // muB
            tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_muC_H, tet->matC, tet->mat->mur, false,
                                                              "C", planewave_xmin, planewave_ymin, planewave_zmin,
                                                              planewave_xmax, planewave_ymax, planewave_zmax,
                                                              Ellipse_Rx, Ellipse_Ry, Ellipse_Rz);  // muC
            tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_D_H, tet->matD, identity, false,
                                                              "D", planewave_xmin, planewave_ymin, planewave_zmin,
                                                              planewave_xmax, planewave_ymax, planewave_zmax,
                                                              Ellipse_Rx, Ellipse_Ry, Ellipse_Rz);  // D
            tet->Calculate_Mass_Material_Matrix_Vary_Numeric( tet->Mass_F_H, tet->matF, identity, false,
                                                              "F", planewave_xmin, planewave_ymin, planewave_zmin,
                                                              planewave_xmax, planewave_ymax, planewave_zmax,
                                                              Ellipse_Rx, Ellipse_Ry, Ellipse_Rz);  // F


            tet->Calculate_Bii_Matrix_H_Numeric();
            tet->Calculate_Bij_Matrix_H_Numeric();
            tet->Calculate_S_Matrix_H_Numeric();
            tet->Calculate_Fii_Matrix_H_Numeric();
            tet->Calculate_Fij_Matrix_H_Numeric();
            tet->SetUp_LocalFaceToTetraMapH_NMF1_PML(tet->Class_dt);

          }

          // PML element, non-Numeric routines
          else
          {

            // Identity tensor passed in place of a material tensor for the
            // D and F matrices.
            tensor identity(1.0, 0.0, 0.0,
                            0.0, 1.0, 0.0,
                            0.0, 0.0, 1.0);

            // ---- E-field side ----
            tet->Calculate_M_Matrix_E();

            tet->Calculate_M_Matrix_I_E();
            tet->Calculate_ABC_E();

            tet->Calculate_Mass_Material_Matrix( tet->Mass_epA_E, tet->matA, tet->mat->epsr, true);  // epA
            tet->Calculate_Mass_Material_Matrix( tet->Mass_epB_E, tet->matB, tet->mat->epsr, true);  // epB
            tet->Calculate_Mass_Material_Matrix( tet->Mass_epC_E, tet->matC, tet->mat->epsr, true);  // epC
            tet->Calculate_Mass_Material_Matrix( tet->Mass_D_E, tet->matD, identity, true);  // D
            tet->Calculate_Mass_Material_Matrix( tet->Mass_F_E, tet->matF, identity, true);  // F

            tet->Calculate_Bii_Matrix_E();
            tet->Calculate_Bij_Matrix_E();
            tet->Calculate_S_Matrix_E();
            tet->Calculate_Fii_Matrix_E();
            tet->Calculate_Fij_Matrix_E();


            tet->SetUp_LocalFaceToTetraMapE_NMF1_PML(tet->Class_dt);

            // ---- H-field side ----
            tet->Calculate_M_Matrix_H();

            tet->Calculate_M_Matrix_I_H();
            tet->Calculate_ABC_H();

            tet->Calculate_Mass_Material_Matrix( tet->Mass_muA_H, tet->matA, tet->mat->mur, false);  // muA
            tet->Calculate_Mass_Material_Matrix( tet->Mass_muB_H, tet->matB, tet->mat->mur, false);  // muB
            tet->Calculate_Mass_Material_Matrix( tet->Mass_muC_H, tet->matC, tet->mat->mur, false);  // muC
            tet->Calculate_Mass_Material_Matrix( tet->Mass_D_H, tet->matD, identity, false);  // D
            tet->Calculate_Mass_Material_Matrix( tet->Mass_F_H, tet->matF, identity, false);  // F

            tet->Calculate_Bii_Matrix_H();
            tet->Calculate_Bij_Matrix_H();
            tet->Calculate_S_Matrix_H();
            tet->Calculate_Fii_Matrix_H();
            tet->Calculate_Fij_Matrix_H();

            tet->SetUp_LocalFaceToTetraMapH_NMF1_PML(tet->Class_dt);
          }
        }

        // End of the PML branch; the else below handles non-PML elements
        // -------------------------------------------------------------------------------


        else
        {
          // Non-PML element, quadrature-based (*_Numeric) routines
          if (UseQuadratureMatrices)
          {

            tet->Calculate_M_Matrix_E_Numeric();
            tet->Calculate_M_Matrix_H_Numeric();
            tet->Calculate_Bii_Matrix_E_Numeric();
            tet->Calculate_Bij_Matrix_E_Numeric();
            tet->Calculate_S_Matrix_E_Numeric();
            tet->Calculate_Fii_Matrix_E_Numeric();
            tet->Calculate_Fij_Matrix_E_Numeric();
            tet->SetUp_LocalFaceToTetraMapE_NMF1_Numeric(tet->Class_dt);

            tet->Calculate_Bii_Matrix_H_Numeric();
            tet->Calculate_Bij_Matrix_H_Numeric();
            tet->Calculate_S_Matrix_H_Numeric();
            tet->Calculate_Fii_Matrix_H_Numeric();
            tet->Calculate_Fij_Matrix_H_Numeric();

            tet->SetUp_LocalFaceToTetraMapH_NMF1_Numeric(tet->Class_dt);
          }

          // Non-PML element, non-Numeric routines (same sequence as the
          // non-CUDA build below)
          else
          {
            tet->Calculate_M_Matrix_E();
            tet->Calculate_M_Matrix_H();

            tet->Calculate_Bii_Matrix_E();
            tet->Calculate_Bij_Matrix_E();
            tet->Calculate_S_Matrix_E();
            tet->Calculate_Fii_Matrix_E();
            tet->Calculate_Fij_Matrix_E();
            tet->SetUp_LocalFaceToTetraMapE_NMF1(tet->Class_dt);

            tet->Calculate_Bii_Matrix_H();
            tet->Calculate_Bij_Matrix_H();
            tet->Calculate_S_Matrix_H();
            tet->Calculate_Fii_Matrix_H();
            tet->Calculate_Fij_Matrix_H();
            tet->SetUp_LocalFaceToTetraMapH_NMF1(tet->Class_dt);
          }
        }

      }

    #else
      // Non-CUDA build: every element runs the plain sequence, with no PML
      // or quadrature variants and no regular-region skipping.
      tet = &(tetARRAY[i]);
      tet->set_flux_GAMMA(factor_Flux);
      tet->Calculate_M_Matrix_E();
      tet->Calculate_M_Matrix_H();

      // E-field side
      tet->Calculate_Bii_Matrix_E();
      tet->Calculate_Bij_Matrix_E();
      tet->Calculate_S_Matrix_E();
      tet->Calculate_Fii_Matrix_E();
      tet->Calculate_Fij_Matrix_E();
      tet->SetUp_LocalFaceToTetraMapE_NMF1(tet->Class_dt);

      // H-field side
      tet->Calculate_Bii_Matrix_H();
      tet->Calculate_Bij_Matrix_H();
      tet->Calculate_S_Matrix_H();
      tet->Calculate_Fii_Matrix_H();
      tet->Calculate_Fij_Matrix_H();
      tet->SetUp_LocalFaceToTetraMapH_NMF1(tet->Class_dt);
    #endif
  }
  timer_stop('u');


}

void FemGrp::SetUpMatrixVector(){
  DimE = dimE;
  DimH = dimH;

  #if defined(DGTD_USE_CUDA)
    // MemSizeE = DimE * sizeof(fp_t_ts);
    // MemSizeH = DimH * sizeof(fp_t_ts);

    // CUDA_SAFE_CALL(cudaMallocHost((void**)&En1_h, MemSizeE, cudaHostAllocMapped));
    // CUDA_SAFE_CALL(cudaMallocHost((void**)&Hn32_h, MemSizeH, cudaHostAllocMapped));
  #else
    MemSizeE = DimE * sizeof(fp_t);
    MemSizeH = DimH * sizeof(fp_t);

    en = new ArrayFP<fp_t>(DimE);
    hn_12 = new ArrayFP<fp_t>(DimH);
    en_1 = new ArrayFP<fp_t>(DimE);
    hn_32 = new ArrayFP<fp_t>(DimH);
  #endif

  // pre-compute the facial matrices required for coupling
  #pragma omp parallel for schedule(static)
  for(int i = 0; i < faceCNT; i++)
    faceARRAY[i]->SetUpMatrixFree();

  // #pragma omp parallel for schedule(dynamic) private(tet,i)
  #pragma omp parallel for schedule(dynamic)
  for(int i = 0; i < tetraCNT; i++){
    tetARRAY[i].SetUpMatrixFree();
  }

}

void FemGrp::DG_AssignOffsets(){
  int i;
  int OffsetE = 0;
  int OffsetH = 0;
  tetra* tet;

  for(i = 0; i < tetraCNT; i ++){
    tet = &(tetARRAY[i]);
    tet->CountDOF_E();
    tet->CountDOF_H();

    dimE = dimE + tet->LocalEDOF;
    dimH = dimH + tet->LocalHDOF;

    tet->set_LocalOffsetE(OffsetE);
    OffsetE = OffsetE + tet->LocalEDOF;

    tet->set_LocalOffsetH(OffsetH);
    OffsetH = OffsetH + tet->LocalHDOF;
  }
  cout << " " << endl;
  cout << "=================" << endl;
  cout << "    Dimensions   " << endl;
  cout << "=================" << endl;
  cout << " dimE = " << dimE << endl;
  cout << " dimH = " << dimH << endl;
  cout << "=================" << endl;
  cout << " " << endl;
}

// Estimate the stable time step of every tetrahedron, store it on the
// element, and record the smallest and largest estimates in dt_min / dt_max.
//
// Progress is logged through DEBUG_INFO about every 10% of the elements.
// With no elements, dt_min and dt_max keep the initial values 1e6 and 0.
void FemGrp::Get_dt_min_max(){
  // Clamp the reporting stride to at least 1: for meshes with fewer than 10
  // elements tetraCNT / 10 is 0 and "i % 0" would be a division by zero.
  const int printSc = std::max(1, tetraCNT / 10);
  fp_t V_P;
  fp_t LocaldtMin = 1.0 * 1e6;
  fp_t LocalDt;
  fp_t LocaldtMax = 0.0;

  // The loop is serial. A parallel version would need min/max reductions on
  // LocaldtMin / LocaldtMax; an atomic store alone does not make the
  // compare-then-store sequence safe.
  for(int i = 0; i < tetraCNT; i ++){
    tetra* tet = &(tetARRAY[i]);

    tet->TimeStepEstimate(LocalDt, V_P);
    tet->set_Stability_dt(LocalDt); // May 5 2011

    if(LocalDt < LocaldtMin)
      LocaldtMin = LocalDt;

    if(LocalDt > LocaldtMax)
      LocaldtMax = LocalDt;

    if(i % printSc == 0)
      DEBUG_INFO(" Finished: " + to_string(i / (fp_t)tetraCNT * 100.0) + " %");
  }
  dt_min = LocaldtMin;
  dt_max = LocaldtMax;
}


void FemGrp::LocalTimeSteppingClassPartioning()
{
  cout.setf(ios::scientific,ios_base::floatfield);
  cout.precision(20);

  cout << " " << endl;
  cout << "========================================================" << endl;
  cout << "          LocalTimeSteppingClassPartioning              " << endl << flush;
  cout << "========================================================" << endl;

  //////////////////////////////////////////////////////////////////////////////////////
  // In this part we calculate the minimum and maximum time-step, with these          //
  // values, we calculate the number of classes and the time-step of each class as:   //
  //    dt_k = (2.0 * m + 1)^k * dt_min                                               //
  //      -  m       = class factor                                                   //
  //      -  k       = number of the class(starts in 0)                               //
  //      -  dt_k    = timestep of class k                                            //
  //      -  dt_min  = minimum timestep                                               //
  // we also assign to each tetra the class they belong to                            //
  //////////////////////////////////////////////////////////////////////////////////////
  int ClassCnt  = 0;
  int PMLClassCnt = 0; // For PML

  setClassMul(1);// this is actually the m not (2m+1)
  fp_t m = getClassMul();
  cout << " Class Factor: (2m + 1), m = " << m << " " << endl << flush;
  cout << " " << endl;
  fp_t LocalDt;
  fp_t LocalDt_down;
  fp_t LocalDt_up;
  tetra *tet;
  cout << " Calculating Time steps " << endl;
  Get_dt_min_max();
  cout << " " << endl;
  cout << " Get_dt_min = " << dt_min << endl;
  cout << " Get_dt_max = " << dt_max << endl;
  cout << " " << endl;

  cout.setf(ios::scientific,ios_base::floatfield);
  cout.precision(8);

  cout << " Starting class partitioning" << endl;
  N_class = (int)ceil(log((dt_max / dt_min)) / log(2.0 * m + 1.0));

  if(scalbSty == 1 || N_class == 0) //only 1 if DGTD_USE_LTS is NOT defined
    N_class = 1;

  LocTimeSteps = new double[N_class];
  ClassTetraCnt = new int[N_class];
  ClassPMLTetraCnt = new int[N_class];

  for(int i = 0 ; i < N_class; i++)
  {
    ClassTetraCnt[i]  = 0;
    ClassPMLTetraCnt[i] = 0;
  }

  cout << " " << endl;
  cout << " N_class: " << N_class << endl;

  if(scalbSty)
    TimeStep_dt = dt_min;

  numberPML = 0;


  for(int i = 0 ; i < N_class; i++)
  {
    LocalDt_down = pow((2.0 * m + 1.0), i) * dt_min;
    LocalDt_up = pow((2.0 * m + 1.0), (i + 1)) * dt_min;
    LocTimeSteps[i] = 1.0 * LocalDt_down;

    #pragma omp parallel for schedule(dynamic) shared(ClassCnt,PMLClassCnt) private(tet, LocalDt)
    for(int j = 0; j < tetraCNT; j ++)
    {
      tet = &(tetARRAY[j]);
      if(scalbSty)
      {
        tet->set_LTS_Flag(i);
        tet->set_Class_dt(1.0 * LocalDt_down);
        bool isExcitation = tet->get_ExcitationFlag();

        #pragma omp atomic
        ClassCnt++;

        if (tet->get_PML_Flag() && !isExcitation)
        {
          #pragma omp atomic
          PMLClassCnt++;
        }
        else
        {
          // Increment the count of tetrahedra in this class
          #pragma omp atomic
          ClassCnt++;
        }

      }
      else
      {
        LocalDt = tet->get_Stability_dt();
        //LocalDt = 0.93 * LocalDt;
        if(LocalDt_down <= LocalDt && (LocalDt < LocalDt_up || i == N_class - 1))
        {
          tet->set_LTS_Flag(i);
          tet->set_Class_dt(1.0 * LocalDt_down);
          bool isExcitation = tet->get_ExcitationFlag();

          if (tet->get_PML_Flag() && !isExcitation)
          {
            #pragma omp atomic
            PMLClassCnt++;
          }
          else
          {
            // Increment the count of tetrahedra in this class
            #pragma omp atomic
            ClassCnt++;
          }

        }
      }
    }

    ClassTetraCnt[i] = ClassCnt;
    ClassPMLTetraCnt[i] = PMLClassCnt;

    numberPML += PMLClassCnt;

    cout << " Number of Tetra in class: " << i << " = " << ClassTetraCnt[i] << endl;
    cout << " Number of PML Tetra in class: " << i << " = " << ClassPMLTetraCnt[i] << std::endl;
    cout << "-------------------------------------------------------------" << endl;
    ClassCnt = 0;
    PMLClassCnt = 0;
  }
  cout << "Total Number of PML Tetras = " << numberPML << endl;


  ////////////////////////////////////////////////////////////////////////////////////
  // In this part we check if there is enough elements in one class to be efficient //
  // if not, those elements will be moved to the previous class                     //
  ////////////////////////////////////////////////////////////////////////////////////

  if(N_class > 1)
  {
    bool reduceN_class = false;
    bool balanced = false;
    for(int i = 0; i < N_class - 1; i++)
    {
      int classN = (N_class - 1) - i;

      fp_t number_of_tetra_in_classN = (fp_t)ClassTetraCnt[classN] + (fp_t)ClassPMLTetraCnt[classN];
      fp_t relClassCnt = number_of_tetra_in_classN / tetraCNT;
      fp_t previousClassDt = pow((2.0 * m + 1.0), classN - 1) * dt_min;
      if (relClassCnt < ClassRelMinCNT && number_of_tetra_in_classN < ClassMinCNT)
      {
        if(i == 0)
        {
          reduceN_class = true;
        }
        balanced = true;
        ClassTetraCnt[classN - 1] += ClassTetraCnt[classN];
        ClassTetraCnt[classN] = 0;
        ClassPMLTetraCnt[classN - 1] += ClassPMLTetraCnt[classN];
        ClassPMLTetraCnt[classN] = 0;
        #pragma omp parallel for schedule(dynamic) private(tet)
        for(int j = 0; j < tetraCNT; j ++)
        {
          tet = &(tetARRAY[j]);
          if(tetARRAY[j].get_LTS_Flag() == classN)
          {
            tet->set_LTS_Flag(classN - 1);
            tet->set_Class_dt(1.0 * previousClassDt);
          }
        }
      }
    }

    if(reduceN_class)
    {
      N_class -= 1;
    }

    if(balanced)
    {
      cout << "=================================" << endl;

      cout << "Classes have been balanced\n";
      for (int i = 0; i < N_class; i++)
      {
        cout << " Number of Tetra in class: " << i << " = " << ClassTetraCnt[i] << std::endl;
        cout << " Number of PML Tetra in class: " << i << " = " << ClassPMLTetraCnt[i] << std::endl << endl;
      }
      cout << "=================================" << endl;

    }
  }

  // Check that all the elements are associated with a class
  for(int j = 0; j < tetraCNT; j ++)
  {
    if(tetARRAY[j].get_LTS_Flag() < 0)
      cout << " tet " << tetARRAY[j].getcnt() << " has LTS_flag = " << tetARRAY[j].get_LTS_Flag() << " and LTS time step " << tetARRAY[j].get_Class_dt()  << endl;
  }

  ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // In this part we order the tetras in the most efficient way for the GPU                                                //
  //    - 1st: we order by class, from smaller time-step to larger                                                         //
  //    - 2nd: each class is ordered by nonConformal tetras 1st and then conformal ones                                    //
  //    - 3rd: we order the nonconformal ones as: excitation (ordered by number of exciting faces 1-2-3) - nonExcitation   //
  //    - 4th: we order the conformal ones as: nonRegular - Reg1 - Reg2 - ...                                              //
  //                                                                                                                       //
  //          *** NOTE: in nonConformal we also include any tetra with a face without neighbor ***                         //
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////


  // -----------------------------------------------------------------------------------------------------
  // Determine cutoff between Normal-regular groups and Regular-PML groups.
  // Assumptions:
  // - regularGroup == 0  -> Irregular (both non-PML and PML)
  // - regularGroup > 0   -> Regular
  // - Groups are assigned so that all non-PML regular groups use smaller IDs
  //   than any PML regular groups (i.e., there exists a clean cutoff).
  //
  // Outputs:
  //   regularCNT_Normal : number of regular groups used by non-PML (g in [1 .. cutoff-1])
  //   regularCNT_PML    : number of regular groups used by PML     (g in [cutoff .. regularCNT-1])
  // -----------------------------------------------------------------------------------------------------

  cout << "-----------------------" << endl;


    if (regularCNT > 1)
    {
      regularCNT_Normal = 0;
      for(int j = 0; j < tetraCNT; j ++)
      {
        tet = &(tetARRAY[j]);
        int groupID = tet->getRegularGroup();
        bool isPML    = tet->get_PML_Flag();

        if (!isPML)
        {
          if ((groupID > regularCNT_Normal))
          {
            regularCNT_Normal = groupID;
          }
        }
      }
      regularCNT_PML = regularCNT - regularCNT_Normal - 1;
    }
    else
    {
      regularCNT_Normal = 0;
      regularCNT_PML = 0;
    }

    cout << "regularCNT = " << regularCNT << endl;
    std::cout << "regularCNT_Normal = " << regularCNT_Normal << "\n";
    std::cout << "regularCNT_PML = " << regularCNT_PML << "\n";


  int NumGroups = regularCNT + 4 + portCNT;
  cout << "NumGroups = " << NumGroups << endl;


  // -----------------------
  // Populate the TetraIndex
  // -----------------------
  // ----------------------------------------------------------------- //
  // Store the tetrahedra in the ClassTetraIndexAux array              //
  // ----------------------------------------------------------------- //

  list<int>* ClassTetraIndexAux = new list<int>[NumGroups];
  ClassTetraIndex = new int*[N_class];
  ClassExcitationCount = new int[N_class];
  ClassExcitationOffset = new int[N_class];
  ClassExcitation_sc_CNT = new int[N_class];
  list<int> ClassExcitationPerFaceList[(int)pow(2, NumOfFaces) - 1];

  if (portCNT > 0)
  {
    ClassPortCnt_h = new int[N_class * portCNT];
    ClassPortOffset_h = new int[N_class * portCNT];
    ClassPortNum_h = new int[N_class * portCNT];
  }

  for(int i = 0 ; i < N_class; i++)
  {
    ClassTetraIndex[i] = new int[ClassTetraCnt[i] + ClassPMLTetraCnt[i]];
    ClassExcitationCount[i] = 0;
    ClassExcitationOffset[i] = 0;
    ClassExcitation_sc_CNT[i] = 0;
  }


  int PML_Case                  = NumGroups - 1;
  int Scattering_Excited_Case   = NumGroups - 2;
  int Total_Excited_Case        = NumGroups - 3;
  int NC_Case                   = NumGroups - 4;
  int Port_Case                 = NumGroups - 4 - portCNT; // First port case
  int Conformal_Case            = 0;

  int index;
  int DGface_bc;
  int auxCNT = 0;
  excitationFaces = 0;

  int ClassOffSet = 0;
  ClassTetraOffset = new int[N_class];
  ClassPMLTetraOffset = new int[N_class];

  for(int i = 0 ; i < N_class; i++)
  {
    for(int j = 0; j < tetraCNT; j ++)
    {
      tet = &(tetARRAY[j]);

      bool isExcite = tet->ExcitationFlag;
      bool isPML    = tet->get_PML_Flag();
      bool isNC     = tet->getIsNC();

      if(tet->LTS_Flag == i)
      {
        if(tet->getRegularGroup() > 0)
          ClassTetraIndexAux[tet->getRegularGroup()].push_back(tet->getcnt());
        else if(!isNC && tet->get_NeighNum() == 4 && !isPML && !isExcite)
          ClassTetraIndexAux[Conformal_Case].push_back(tet->getcnt());
        else if (isPML)
          ClassTetraIndexAux[PML_Case].push_back(tet->getcnt());
        else
        {
          if(isExcite)
          {
            ClassExcitationCount[i]++;
            int face = 0;
            for(int k = 0; k < NumOfFaces; k++)
            {
              if (!tet->fc[k] || !tet->fc[k]->bcPtr) continue; // optional null guard
              DGface_bc = tet->fc[k]->bcPtr->getbType();
              if(DGface_bc == planeWaveType || DGface_bc == portType || DGface_bc == pmlType)
              {
                face += (1 << k);
                excitationFaces++;
              }
            }
            if (face > 0)
              ClassExcitationPerFaceList[face - 1].push_back(tet->getcnt());
          }
          else
          {
              ClassTetraIndexAux[NC_Case].push_back(tet->getcnt());
          }
        }
      }
    }


    // ----------------------------------------------------------------- //
    // Excitation                                                        //
    // ----------------------------------------------------------------- //


    ClassExcitationOffset[i] = auxCNT;
    auxCNT += ClassExcitationCount[i];

    for(int j = (1 << NumOfFaces) - 2; j >= 0; j--)
    {
      int listIndex = faceExcitationOrder[j] - 1;
      int auxSize = ClassExcitationPerFaceList[listIndex].size();

      for(int k = 0; k < auxSize; k++)
      {
        int tet_id = ClassExcitationPerFaceList[listIndex].back();
        tet = &(tetARRAY[tet_id]);

        if (PlaneWaveBCFlag)
        {
          if (tet->scattering_region)
            ClassTetraIndexAux[Scattering_Excited_Case].push_back(tet_id);
          else
            ClassTetraIndexAux[Total_Excited_Case].push_front(tet_id);
        }
        else
        {
          int port_id = -1;
          for (int k=0; k<NumOfFaces; k++)
          {
            int bc_number = tet->getbc(k);
            if (tet->fc[k]->bcPtr->getbType() == portType)
            {
              int pnum = bcNumToPnum[bc_number];
              ClassTetraIndexAux[Port_Case+pnum].push_front(tet_id);
              break;
            }
          }
        }

        ClassExcitationPerFaceList[listIndex].pop_back();
      }
    }


    // ----------------------------------------------------------------- //
    // Store the tetrahedra in the ClassTetraIndex array                 //
    // ----------------------------------------------------------------- //

    index = 0;

    auto addGroupToIndex = [&](int group) {
      int size = ClassTetraIndexAux[group].size();
      for (int l = 0; l < size; l++)
      {
        ClassTetraIndex[i][index++] = ClassTetraIndexAux[group].front();
        ClassTetraIndexAux[group].pop_front();
      }
    };


    // -----------------------------------------------------------------------------------------------
    // Order: Scattered Field Excited, Total Field Excited, NC, Conformal, Regular, PML, Regular PML
    // -----------------------------------------------------------------------------------------------

    if (PlaneWaveBCFlag)
    {
      addGroupToIndex(Scattering_Excited_Case);
      ClassExcitation_sc_CNT[i] = index;
      addGroupToIndex(Total_Excited_Case);
    }
    else
    {
      for(int p = 0; p < portCNT; p++)
      {
        ClassPortOffset_h[i * portCNT + p] = index;
        addGroupToIndex(Port_Case + p);
        ClassPortCnt_h[i * portCNT + p] = index - ClassPortOffset_h[i * portCNT + p];
        ClassPortNum_h[i * portCNT + p] = p;
      }
    }


    addGroupToIndex(NC_Case);
    addGroupToIndex(Conformal_Case);


    // Add Regular Tetrahedra
    // WE assume that there are only 6 regular tetrehedron that are non-PML
    if ( regularCNT > 1)
    {
      for (int k = 1; k <= regularCNT_Normal; k++)
      {
        addGroupToIndex(k);
      }
    }

    cout << "Class " << i << " | PML index = " << index << endl;

    addGroupToIndex(PML_Case);

    // Add PML Regular Tetrahedra
    if ( regularCNT > 6)
    {
      for (int k = regularCNT_Normal; k < regularCNT; k++)
      {
        addGroupToIndex(k);
      }
    }


    ClassTetraOffset[i] = ClassOffSet;
    ClassOffSet += ClassTetraCnt[i] + ClassPMLTetraCnt[i];
    ClassPMLTetraOffset[i] = ClassOffSet - ClassPMLTetraCnt[i];

  }


  for(int i = 0; i < N_class; i++)
  {
    std::cout << " ClassExcitationCount[" << i << "]     = " << ClassExcitationCount[i] << std::endl;
    std::cout << " ClassTetraOffset[" << i << "]         = " << ClassTetraOffset[i] << std::endl;
    std::cout << " ClassPMLTetraOffset[" << i << "]      = " << ClassPMLTetraOffset[i] << std::endl;
  }

  std::cout << "excitationFaces = " << excitationFaces << std::endl;
  std::cout << "========================================================" << std::endl;

}


/**
  OpenMP Local Time-Stepping for matrix free Recursive

  Explained in "Dissipative terms and local time-stepping improvements
      in a spatial high order Discontinuous Galerkin scheme
      for the time-domain Maxwell’s equations" by E. Montseny
*/

// Recursive E update for local time-stepping class `class_i`.
// The class itself takes one LeapFrogE step; unless it is class 0, the class
// below (class_i - 1) is then advanced through an E, H, E sequence.
// Note: dt_i is not read here; every step size comes from LocTimeSteps.
void FemGrp::ComputeE_MatrixFree(int class_i, fp_t dt_i){
  LeapFrogE(class_i, LocTimeSteps[class_i]);

  // Class 0 is the bottom of the recursion.
  if(class_i == 0)
    return;

  const int lower = class_i - 1;
  ComputeE_MatrixFree(lower, LocTimeSteps[lower]);
  ComputeH_MatrixFree(lower, LocTimeSteps[lower]);
  ComputeE_MatrixFree(lower, LocTimeSteps[lower]);
}

// Recursive H update for local time-stepping class `class_i`.
// The class itself takes one LeapFrogH step; unless it is class 0, the class
// below (class_i - 1) is then advanced through an H, E, H sequence.
// Note: dt_i is not read here; every step size comes from LocTimeSteps.
void FemGrp::ComputeH_MatrixFree(int class_i, fp_t dt_i){
  LeapFrogH(class_i, LocTimeSteps[class_i]);

  // Class 0 is the bottom of the recursion.
  if(class_i == 0)
    return;

  const int lower = class_i - 1;
  ComputeH_MatrixFree(lower, LocTimeSteps[lower]);
  ComputeE_MatrixFree(lower, LocTimeSteps[lower]);
  ComputeH_MatrixFree(lower, LocTimeSteps[lower]);
}

void FemGrp::LeapFrogE(int class_i, fp_t dt_i){
  int i;
  int n;
  fp_t InitTime  = 0.0;
  n = LocalExciIndexE[class_i];

  #pragma omp parallel for schedule(dynamic) private(i)
  for(i = 0; i < ClassTetraCnt[class_i]; i++){
    tetra* tet  = &(tetARRAY[ClassTetraIndex[class_i][i]]);
    tet->LocalFaceToTetraMapE_NMF1(*en_1, *en, *hn_12, dt_i, InitTime + (n + 0.5) * dt_i);
  }

  #pragma omp parallel for schedule(dynamic) private(i)
  for(i = 0 ; i < DimE; i++){
    en->setentry(i, en_1->getentry(i));
  }
  LocalExciIndexE[class_i] = LocalExciIndexE[class_i] + 1;
}

void FemGrp::LeapFrogH(int class_i, fp_t dt_i){
  int i;
  int n;
  fp_t InitTime  = 0.0;
  n = LocalExciIndexH[class_i];

  #pragma omp parallel for schedule(dynamic) private(i)
  for(i = 0; i < ClassTetraCnt[class_i]; i ++){
    tetra* tet = &(tetARRAY[ClassTetraIndex[class_i][i]]);
    tet->LocalFaceToTetraMapH_NMF1(*hn_32, *en_1, *hn_12, dt_i, InitTime + (n + 1.0) * dt_i);
  }

  #pragma omp parallel for schedule(dynamic) private(i)
  for(i = 0 ; i < DimH ; i++){
    hn_12->setentry(i, hn_32->getentry(i));
  }
  LocalExciIndexH[class_i] = LocalExciIndexH[class_i] + 1;
}

/**
  Local Time-Stepping Update (matrix-free driver).

  - Allocates and zeroes the per-class step counters LocalExciIndexE/H.
  - Sets NtimeSteps from FinalTime and the step of class N_class - 1.
  - Writes the time-domain metadata file (Write_TD_Data).
  - For every step runs the recursive E and H updates starting at class
    N_class - 1; every postProcIters steps the enabled outputs (L2 errors,
    analytical incident probes, field probes, global fields, S-parameters) are
    produced and progress/timing information is logged.
*/
void FemGrp::LTS_TimeUpdateGlobal_MatrixFree(){
  int i, n;
  fp_t InitTime  = 0.0;

  // Per-class step counters consumed by LeapFrogE / LeapFrogH.
  LocalExciIndexE = new int[N_class];
  LocalExciIndexH = new int[N_class];

  for(i = 0; i < N_class; i++){
    LocalExciIndexE[i] = 0;
    LocalExciIndexH[i] = 0;
  }

  NtimeSteps = (int)ceil((FinalTime - InitTime) / LocTimeSteps[N_class -1]);

  cout.setf(ios::scientific,ios_base::floatfield);
  cout.precision(15);

  cout << "Start Time Stepping " << endl;
  cout << "FinalTime        = " << FinalTime << endl;
  cout << "TimeStep_dt      = " << LocTimeSteps[N_class -1] << endl;
  cout << "tetraCNT         = " << tetraCNT << endl;
  cout << "NtimeSteps       = " << NtimeSteps << endl;

  timer_start("Time Stepping", ' ');
  fp_t Frequency = freq;
  fp_t dt_nyquist = 1.0 / (2.0 * Frequency * MEGA);
  // fp_t dt_nyquist = 2.0 / (Frequency * MEGA); //That's wrong
  // Force a floating-point division: with an integral SamplingRate the
  // expression 1 / SamplingRate would truncate to 0.
  fp_t dt_sample = (static_cast<fp_t>(1) / SamplingRate) * dt_nyquist;
  // Clamp to at least one step: postProcIters is used as a modulus below, so a
  // value of 0 would be a division by zero.
  int postProcIters = std::max(1, (int)ceil(dt_sample / LocTimeSteps[N_class - 1]));
  int printScreenIters = 2 * postProcIters;

  Write_TD_Data(postProcIters, NtimeSteps);

  cout << "dt_nyquist       = " << dt_nyquist << endl;
  cout << "dt_sample        = " << dt_sample << endl;
  cout << "printScreenIters = " << printScreenIters << endl;
  cout << "postProcIters    = " << postProcIters << endl;
  cout << "N_class          = " << N_class <<endl;

  size_t total_time = 0;
  fp_t current_time = 0;
  // Start one sample interval below zero so that the first report (n == 0),
  // which adds one interval back, prints 0 ns.
  current_time -= (double)dt_sample * 1e9;

  SYSTEM_MEM_USAGE();
  timer_start("Start Time Stepping", ' ');

  for(n = 0 ; n < NtimeSteps ; n++)
  {
    ComputeE_MatrixFree(N_class - 1 , LocTimeSteps[N_class - 1]);
    ComputeH_MatrixFree(N_class - 1 , LocTimeSteps[N_class - 1]);

    if(n % postProcIters == 0)
    {
      if(write_AnalyticalIncidentProbes)
      {
        if(probeCNT > 0)
        {
          CalculateL2Error(n, LocTimeSteps[N_class - 1], ExcitFlag);
          CalculateL2ErrorProbes(n, LocTimeSteps[N_class - 1], ExcitFlag);
        }
        writeAnalyticalIncidentPWProbes(n);
      }

      if(write_probes && probeCNT > 0)
      {
        writeFieldProbe(n);
      }

      if(write_fields)
      {
        writeFieldGlobal(n);
      }


      if(portCNT != 0)
      {
        EvaluateSparametersGlobal(n, LocTimeSteps[N_class -1], true);
      }

      cout << "E field norm " <<  en_1->magnitude() << endl;
      //cout << "H field norm " <<  hn_32->magnitude() << endl;
      // Close the timer of the batch that just finished and open the next one.
      total_time += timer_stop(' ');
      timer_start(to_string(postProcIters)+" steps ", ' ');
      DEBUG_INFO("Percentage Completed :" + to_string((double)n / (double)NtimeSteps * 100.0) + "%");
      current_time += (double)dt_sample * 1e9;
      DEBUG_INFO("Current Time : " + to_string(current_time) + "ns");
      DEBUG_INFO("Average iteration time : "+ to_string(((double)total_time / (double)(n + 1.0))) + " sec");
    }
  }

  DEBUG_INFO("Total iteration time: "+ to_string(((double)total_time)) + " sec");
  timer_stop(' ');
}

//*****************

void FemGrp::Write_TD_Data(int tsPerSample, int nTimeSteps){
  // fp_t  to  = 4.0  * pow(10.0, -9.0);
  // fp_t  tau = 0.8  * pow(10.0, -9.0);
  char TD_data[180];

  sprintf(TD_data, "./PROBES/%s.TD_Data", fname);
  ofstream TD_datafile(TD_data, ios_base::out);
  if(!TD_datafile){
    cout << "Error in opening file: " << TD_data << "for write"<< endl;
  }

  TD_datafile << LocTimeSteps[N_class -1] << endl;
  TD_datafile << nTimeSteps << endl;
  TD_datafile << To << endl;
  TD_datafile << Tau << endl;
  TD_datafile << tsPerSample << endl;
  TD_datafile << probeCNT << endl;
}


// Modified by qi jian to use octree to store the probes barycentric coordinates
// Loads the probe positions from "<fname>.probe" (CSV columns X, Y, Z), scales
// them by `unit`, and locates each probe in the mesh through the octree.
// For probe i, probes_bary[i].first is the number of tetrahedra returned by the
// octree lookup and probes_bary[i].second holds the (tet id, barycentric
// coordinates) pairs. padeCNT is reduced to probeCNT if it was larger.
// The program exits if the columns differ in length or if any probe cannot be
// located.
void FemGrp::readPROBE()
{
  char nname[StrLenShort];

  // Read the probe file (bounded formatting: fname must not overrun nname)
  snprintf(nname, sizeof(nname), "%s.probe", fname);
  rapidcsv::Document probe_doc(nname);
  const std::vector<double> x_col = probe_doc.GetColumn<double>("X");
  const std::vector<double> y_col = probe_doc.GetColumn<double>("Y");
  const std::vector<double> z_col = probe_doc.GetColumn<double>("Z");

  // All columns must have the same size. This is a runtime check rather than
  // an assert so that it is still enforced when NDEBUG is defined.
  if (x_col.size() != y_col.size() || y_col.size() != z_col.size())
  {
      std::cerr << "Error: probe file " << nname
                << " has X/Y/Z columns of different lengths. Exiting." << std::endl;
      std::exit(EXIT_FAILURE);
  }

  probeCNT = static_cast<int>(x_col.size());
  if(padeCNT > probeCNT)
  {
    padeCNT = probeCNT;
    cout << "Pade Number Of Elements REDUCED to " << probeCNT << endl;
  }

  probes_bary.resize(probeCNT);
  std::cout << "Compute the Barycentric coordinates of the Probes" << std::endl;
  const double tol = 1e-8;

  //#pragma omp parallel for schedule(dynamic)
  for (int node_id = 0; node_id < probeCNT; ++node_id)
  {
      double probe_xyz[3] = {x_col[node_id] * unit, y_col[node_id] * unit, z_col[node_id] * unit};

      std::vector<std::pair<int, std::array<double, 4>>> found_tets;
      const bool success = octree_object.findTetraInOctree(probe_xyz, found_tets, tol);

      // An empty result is treated as "not found": a count of 0 would later be
      // used as a divisor when the probe fields are averaged.
      if (success && !found_tets.empty())
      {
          probes_bary[node_id].first = static_cast<int>(found_tets.size());
          probes_bary[node_id].second = std::move(found_tets);
      }
      else
      {
        probes_bary[node_id].first = -1;
      }
  }

  // Report and verify
  bool error_flag = false;
  for (int i = 0; i < probeCNT; ++i)
  {
      if (probes_bary[i].first < 0)
      {
          std::cerr << "Node " << i << " not found in simulation domain" << std::endl;
          double probe_xyz[3] = {x_col[i] * unit, y_col[i] * unit, z_col[i] * unit};
          std::cerr << probe_xyz[0] << " " << probe_xyz[1] << " " << probe_xyz[2] << std::endl;
          error_flag = true;
      }
  }

  if (error_flag)
  {
      std::cerr << "Error: Some nodes were not found in the simulation domain. Exiting." << std::endl;
      std::exit(EXIT_FAILURE);
  }

}


// TODO!!!
/*
  //  - excitationFaces              (flattened exc. faces count)
  //  - PortFacePidx_h               (int[excitationFaces], -1 for non-port faces)
  //  - PortFaceCentroid_h           (fp_t_ts[excitationFaces*3], centroid coords per face)
*/
// Uses TetID_excitation_h (owner tet id) to compute barycentrics of each
// port-face centroid inside its owning tetra. No octree/hydra traversal.
//
// Inputs assumed ready:
//  - excitationFaces
//  - PortFacePidx_h              : int[excitationFaces], -1 if NOT a port face
//  - PortFaceCentroid_h          : fp_t_ts[3*excitationFaces] (cx,cy,cz per face)
//  - TetID_excitation_h          : int[excitationFaces] (owner tetra index 0..tetraCNT-1)
//  - FaceID_excitation_h         : int[excitationFaces] (optional, not strictly needed here)
//
// Output:
//  - portFaceCentroid_bary[f].first  = 1 on success, -1 if non-port or error
//  - portFaceCentroid_bary[f].second = { { tetId, {l0,l1,l2,l3} } }  (exactly one entry)
void FemGrp::prepPortFaceCentroidPROBE()
{
  // For every excitation face that belongs to a port (PortFacePidx_h[f] >= 0),
  // compute the barycentric coordinates of the face centroid inside the owning
  // tetrahedron TetID_excitation_h[f] and store them in portFaceCentroid_bary.
  // Non-port faces are marked with first = -1. Any invalid owner index or
  // degenerate tetrahedron terminates the program after the loop.
  if (portCNT <= 0 || !PortFacePidx_h || !PortFaceCentroid_h || !TetID_excitation_h)
  {
    std::cerr << "[prepPortFaceCentroidPROBE] Missing inputs or no ports.\n";
    return;
  }

  // Determinant of the 3x3 matrix whose rows are x, y, z.
  auto det3 = [](const double x[3], const double y[3], const double z[3])
  {
    return x[0]*(y[1]*z[2]-y[2]*z[1])
         - x[1]*(y[0]*z[2]-y[2]*z[0])
         + x[2]*(y[0]*z[1]-y[1]*z[0]);
  };

  std::cout << "Compute barycentric coords of port-face centroids (using TetID_excitation_h)\n";

  portFaceCentroid_bary.clear();
  portFaceCentroid_bary.resize(excitationFaces);

  int done = 0, errors = 0;

  for (int f = 0; f < excitationFaces; ++f)
  {
    // Skip non-port faces
    if (PortFacePidx_h[f] < 0)
    {
      portFaceCentroid_bary[f].first = -1;
      continue;
    }

    // Owner tetra index from the pre-filled array
    const int tId = TetID_excitation_h[f];
    if (tId < 0 || tId >= tetraCNT)
    {
      std::cerr << "[PortCentroid] Invalid owner tId=" << tId << " for excitation face f=" << f << "\n";
      portFaceCentroid_bary[f].first = -1;
      ++errors;
      continue;
    }

    const tetra& T = tetARRAY[tId];

    // Tetra vertices
    double v[4][3];
    for (int i = 0; i < 4; ++i)
    {
      v[i][0] = T.nd[i]->getCoord().getx();
      v[i][1] = T.nd[i]->getCoord().gety();
      v[i][2] = T.nd[i]->getCoord().getz();
    }

    // Face centroid (cx,cy,cz)
    const fp_t_ts* C = &PortFaceCentroid_h[3 * f];
    const double   P[3] = { (double)C[0], (double)C[1], (double)C[2] };

    // Barycentric via Cramer's rule, with vertex 3 as the origin
    const double a[3] = { v[0][0]-v[3][0], v[0][1]-v[3][1], v[0][2]-v[3][2] };
    const double b[3] = { v[1][0]-v[3][0], v[1][1]-v[3][1], v[1][2]-v[3][2] };
    const double c[3] = { v[2][0]-v[3][0], v[2][1]-v[3][1], v[2][2]-v[3][2] };
    const double r[3] = { P[0]-v[3][0],    P[1]-v[3][1],    P[2]-v[3][2]    };

    const double D  = det3(a,b,c);
    // Reject a zero determinant and also NaN/inf, which an equality test
    // against 0.0 alone would let through.
    if (!std::isfinite(D) || D == 0.0)
    {
      std::cerr << "[PortCentroid] Degenerate tetra (D=" << D << ") at tId=" << tId << " for f=" << f << "\n";
      portFaceCentroid_bary[f].first = -1;
      ++errors;
      continue;
    }

    const double l0 = det3(r,b,c) / D;
    const double l1 = det3(a,r,c) / D;
    const double l2 = det3(a,b,r) / D;
    // l3 is defined by the partition of unity, so the four coordinates sum to
    // one by construction and no further renormalization is needed.
    const double l3 = 1.0 - (l0 + l1 + l2);

    // Store exactly one (tet, lambdas)
    std::vector<std::pair<int, std::array<double,4>>> vec;
    vec.emplace_back(tId, std::array<double,4>{l0,l1,l2,l3});

    portFaceCentroid_bary[f].first  = 1;
    portFaceCentroid_bary[f].second = std::move(vec);
    ++done;
    //cout << l0 << " " << l1 << " " << l2 << " " << l3 << "\n";
  }

  std::cout << "[prepPortFaceCentroidPROBE] Completed: " << done
            << " faces; errors=" << errors << ".\n";

  if (errors > 0) {
    std::cerr << "Error: Some port-face centroids could not be assigned.\n";
    std::exit(EXIT_FAILURE);
  }
}


/*
void FemGrp::prepPortFaceCentroidPROBE()
{
  // Requires:
  //  - excitationFaces              (flattened exc. faces count)
  //  - PortFacePidx_h               (int[excitationFaces], -1 for non-port faces)
  //  - PortFaceCentroid_h           (fp_t_ts[excitationFaces*3], centroid coords per face)
  //  - octree_object.findTetraInOctree(double[3], out, tol)

  if (portCNT <= 0 || !PortFacePidx_h || !PortFaceCentroid_h)
  {
    std::cerr << "[readPortFaceCentroidPROBE] No ports or centroid buffers not ready.\n";
    return;
  }

  const double tol = 1e-3;
  std::cout << "Compute the Barycentric coordinates of Probes on Ports" << std::endl;

  portFaceCentroid_bary.clear();
  portFaceCentroid_bary.resize(excitationFaces);

  int not_found = 0;
  int done = 0;

  long long total_found_tets = 0;   // sum of found_tets.size() over successes
  int       success_faces     = 0;  // number of faces with success==true


  // #pragma omp parallel for schedule(dynamic) reduction(+:not_found,done)  // (optional)
  for (int f = 0; f < excitationFaces; ++f)
  {
    // Only process port faces
    if (PortFacePidx_h[f] < 0)
    {
      portFaceCentroid_bary[f].first = -1;   // mark as N/A (non-port)
      continue;
    }

    // Centroid coordinates of face f
    // NOTE: These come from node coords directly; do NOT rescale unless your mesh needs it.
    const fp_t_ts* C = &PortFaceCentroid_h[3 * f];
    double xyz[3] = { (double)C[0], (double)C[1], (double)C[2] };
    std::vector<std::pair<int, std::array<double,4>>> found_tets;
    bool success = octree_object.findTetraInOctree(xyz, found_tets, tol);

    if (success)
    {
      portFaceCentroid_bary[f].first  = static_cast<int>(found_tets.size());
      portFaceCentroid_bary[f].second = std::move(found_tets);
      ++done;

      // [NEW] accumulate for average
      total_found_tets += portFaceCentroid_bary[f].first;
      ++success_faces;
    }
    else
    {
      portFaceCentroid_bary[f].first = -1;
      ++not_found;

      // Debug print (can be silenced)
      std::cerr << "[PortCentroid] face f=" << f
                << " (port " << PortFacePidx_h[f] << ") NOT found at "
                << xyz[0] << " " << xyz[1] << " " << xyz[2] << "\n";
    }
  }

  std::cout << "[readPortFaceCentroidPROBE] Located " << done
            << " port-face centroids; " << not_found << " not found.\n";


  if (not_found == 0 && success_faces > 0) {
    const double avg = static_cast<double>(total_found_tets) / static_cast<double>(success_faces);
    std::cout << "[PortCentroid] average owning tets per centroid = " << avg
              << " (over " << success_faces << " faces)\n";
  }

  // Hard error if any were not found (match readPROBE behavior if you prefer)
  if (not_found > 0)
  {
    std::cerr << "Error: Some port-face centroids were not found in the domain. Exiting.\n";
    std::exit(EXIT_FAILURE);
  }
}
*/


// Reads "<fname>.regular": first the number of regular regions, then one region
// id per tetrahedron. Region 0 marks a tetrahedron that is its own reference;
// for any other region the first tetrahedron seen with that id becomes the
// reference of all later ones (regularReferenceARRAY). regularTetraCNT counts
// the tetrahedra with a non-zero region.
// Exits if the file is missing, ends early, or contains a region id outside
// [0, regularCNT].
void FemGrp::readREGULAR(){
  // writeFieldGlobal(1);
  char tname[StrLenShort];

  // Bounded formatting: a long fname must not overrun tname.
  snprintf(tname, sizeof(tname), "%s.regular", fname);
  ifstream regularAreaFile(tname, ios::in);

  if(!regularAreaFile){
    cout << "File " << tname << " does NOT exist " << endl;
    exit(1);
  }

  int numOfRegions = 0;
  int region = 0;

  if(!(regularAreaFile >> numOfRegions)){
    // Keep going with zero regions instead of using an indeterminate count.
    cout << "File " << tname << " has no region count; assuming 0 regions" << endl;
    numOfRegions = 0;
  }
  regularTetraCNT = 0;
  // Only one domain exists
  regularCNT = numOfRegions;
  if(regularCNT >= 1){
    regularReferenceARRAY = new int[tetraCNT];
    // regionARRAY is indexed directly by the region id. One extra slot is
    // allocated so that ids numbered 1..regularCNT stay inside the array.
    regionARRAY = new int[regularCNT + 1];
    for(int i = 0; i <= regularCNT; i++)
      regionARRAY[i] = -1;

    for(int i = 0; i < tetraCNT; i ++){
      tetra* tet = &(tetARRAY[i]);
      if(!(regularAreaFile >> region)){
        cout << "File " << tname << " ended before tetrahedron " << i << endl;
        exit(1);
      }
      if(region < 0 || region > regularCNT){
        cout << "File " << tname << " has invalid region " << region
             << " for tetrahedron " << i << endl;
        exit(1);
      }
      tet->setRegularGroup(region);
      if(region == 0){
        regularReferenceARRAY[i] = i;
      }
      else
      {
        regularTetraCNT++;
        if(regionARRAY[region] == -1)
        {
          // First tetrahedron of this region: it becomes the reference.
          regionARRAY[region] = i;
          regularReferenceARRAY[i] = i;
        }
        else
        {
          regularReferenceARRAY[i] = regionARRAY[region];
        }
      }
      // cout << "i = " << i << " reference = " << regularReferenceARRAY[i] << " region = " << region << endl;
    }
  }
}

void FemGrp::initializeMaxMinPoints(){
  maxPoint.setvtr(std::numeric_limits<fp_t>::min(), std::numeric_limits<fp_t>::min(), std::numeric_limits<fp_t>::min());
  minPoint.setvtr(std::numeric_limits<fp_t>::max(), std::numeric_limits<fp_t>::max(), std::numeric_limits<fp_t>::max());
}

// Grows the tracked bounding box so that it contains the point (x, y, z):
// each component of maxPoint is raised, and each component of minPoint is
// lowered, when the new coordinate lies beyond the stored one.
void FemGrp::setMaxMinPoints(fp_t x, fp_t y, fp_t z){
  // Upper corner
  const auto upperX = (maxPoint.getx() < x) ? x : maxPoint.getx();
  const auto upperY = (maxPoint.gety() < y) ? y : maxPoint.gety();
  const auto upperZ = (maxPoint.getz() < z) ? z : maxPoint.getz();
  maxPoint.setvtr(upperX, upperY, upperZ);

  // Lower corner
  const auto lowerX = (minPoint.getx() > x) ? x : minPoint.getx();
  const auto lowerY = (minPoint.gety() > y) ? y : minPoint.gety();
  const auto lowerZ = (minPoint.getz() > z) ? z : minPoint.getz();
  minPoint.setvtr(lowerX, lowerY, lowerZ);
}

// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //
// 0000000000000000000000000000000000000 Post-processing 0000000000000000000000000000000000000000 //
// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //


// Modified by qi jian to write field at probes (CPU VERSION)
void FemGrp::writeFieldProbe(int timeStep)
{
  int i, j;
  fp_t vol;
  fp_t zeta[4];
  vtr lvtr[3];
  vtr avtr[4];

  int tetraMAP_aux[TetPolyOrderDim[getPolyFlag()]];
  #if defined(DGTD_USE_CUDA)
    fp_t_ts E_coeff[TetPolyOrderDim[getPolyFlag()]];
    fp_t_ts H_coeff[TetPolyOrderDim[getPolyFlag()]];
  #else
    fp_t E_coeff[TetPolyOrderDim[getPolyFlag()]];
    fp_t H_coeff[TetPolyOrderDim[getPolyFlag()]];
  #endif

  vtr eField;
  vtr hField;
  vtr eField_all;
  vtr hField_all;

  char csvFileName[StrOutput];
  std::ofstream csvFile;

  if(padeCNT == 0 || writeWhilePade)
  {
    sprintf(csvFileName, "Probes_%s_%04d.csv", fname, timeStep);
    csvFile.open(csvFileName);
    csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";
  }


  const int num_nodes = probeCNT;

  // Calculate Total Fields at the points
  for(i = 0; i < num_nodes; i++)
  {
    int number_of_associated_tets = probes_bary.at(i).first;

    eField.reset();
    hField.reset();

    std::vector<std::pair<int, std::array<double, 4>>> found_tets = probes_bary.at(i).second;
    eField_all.reset();
    hField_all.reset();

    for (int t = 0; t < number_of_associated_tets; t++)
    {

      int tet_id = found_tets.at(t).first;
      array<double,4> tri_bary_coord = found_tets.at(t).second;
      tetra& tet = tetARRAY[tet_id];

      tet.geometry(lvtr, avtr, &vol);
      avtr[3].reset();
      avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);

      eField.reset();
      hField.reset();
      zeta[0] = static_cast<fp_t>(tri_bary_coord[0]);
      zeta[1] = static_cast<fp_t>(tri_bary_coord[1]);
      zeta[2] = static_cast<fp_t>(tri_bary_coord[2]);
      zeta[3] = static_cast<fp_t>(tri_bary_coord[3]);


      // Calculate E field
      tet.Local_DG_mapE(tetraMAP_aux, tet.LocalOffsetE);

      for(j = 0 ; j < TetPolyOrderDim[getPolyFlag()] ; j++)
      {
        if(tetraMAP_aux[j] < 0)
          E_coeff[j] = 0.0;
        else
          #if defined(DGTD_USE_CUDA)
            E_coeff[j] = En1_h[tetraMAP_aux[j]];
          #else
            E_coeff[j] = en_1->getentry(tetraMAP_aux[j]);
          #endif
      }
      // Calculate H field
      tet.Local_DG_mapH(tetraMAP_aux, tet.LocalOffsetH);

      for(j = 0 ; j < TetPolyOrderDim[getPolyFlag()] ; j++){
        if(tetraMAP_aux[j] < 0)
          H_coeff[j] = 0.0;
        else
          #if defined(DGTD_USE_CUDA)
            H_coeff[j] = Hn32_h[tetraMAP_aux[j]];
          #else
            H_coeff[j] = hn_32->getentry(tetraMAP_aux[j]);
          #endif
      }

      eField = CalcEfield(E_coeff, avtr, vol, zeta, PolyFlag);
      hField = CalcEfield(H_coeff, avtr, vol, zeta, PolyFlag);


      eField_all = eField_all + eField;
      hField_all = hField_all + hField;

    }

    eField_all = eField_all / ((fp_t) number_of_associated_tets);
    hField_all = hField_all / ((fp_t) number_of_associated_tets);

    if(usePade){ // && i < padeCNT
      int row = ((int)(timeStep / tsPerSampling)) * NumOfFieldComponents * probeCNT;
      int column = i * NumOfFieldComponents;
      fieldProbes[row + column + 0] = eField_all.getx();
      fieldProbes[row + column + 1] = eField_all.gety();
      fieldProbes[row + column + 2] = eField_all.getz();
      fieldProbes[row + column + 3] = hField_all.getx();
      fieldProbes[row + column + 4] = hField_all.gety();
      fieldProbes[row + column + 5] = hField_all.getz();
    }

    if(padeCNT == 0 || writeWhilePade)
    {
      const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};
      csvFile << std::setprecision(max_precision) << eField_all.getx() << "," << eField_all.gety() << "," << eField_all.getz() << "," << hField_all.getx() << "," << hField_all.gety() << "," << hField_all.getz() << "\n";
    }
  }


  if(padeCNT == 0 || writeWhilePade)
  {
    usleep(100);
    csvFile.close();
  }


}


void FemGrp::writeFieldProbeAfterPade(int tsSize)
{
  const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};

  #pragma omp parallel for
  for(int i = 0; i < (int)ceil((1.0 * NtimeSteps) / tsPerSampling); i++){
    char csvFileName[StrOutput];
    std::ofstream csvFile;
    sprintf(csvFileName, "./PROBES/Probes_%s_%04d.csv", fname, i * tsSize);
    csvFile.open(csvFileName);
    csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";

    for(int probe = 0; probe < probeCNT; probe++)
    {
      int column = probe * NumOfFieldComponents;
      int row = i * NumOfFieldComponents * probeCNT;
      for(int j = 0; j < NumOfFieldComponents; j++)
      {
        csvFile << std::setprecision(max_precision) << fieldProbes[row + column + j];

        if(j == NumOfFieldComponents - 1)
          csvFile << "\n";
        else
          csvFile << ",";
      }
    }
    usleep(100);
    csvFile.close();
  }
}


void FemGrp::writeFieldGlobal(int timeStep){
  int i, j;
  fp_t vol;
  fp_t zeta[4];
  vtr lvtr[3];
  vtr avtr[4];
  vtr coord[4];
  vtr eLocal[4];
  vtr hLocal[4];

  int* tetraMAP_aux;
  int* MapE_Pe;

  #if defined(DGTD_USE_CUDA)
    fp_t_ts* E_coeff;
    fp_t_ts* H_coeff;
  #else
    fp_t* E_coeff;
    fp_t* H_coeff;
  #endif


  vtr* eField = new vtr[nodeCNT];
  vtr* hField = new vtr[nodeCNT];
  int* count = new int[nodeCNT];
  memset(count, 0, nodeCNT * sizeof(int));

  // only initialize the memory for the first solution
  if(regE.TetraReg == 0)
    regE.initial(tetraCNT);
  if(regH.TetraReg == 0)
    regH.initial(tetraCNT);

  int* polyOrder = new int[tetraCNT];
  for(i = 0; i < tetraCNT; i++){
    tetra& tet = tetARRAY[i];
    polyOrder[i] = tet.PolyOrderFlag;
    for(j = 0; j < NumOfNodes; j++){
      coord[j] = (tet.getNode(j))->getCoord();
    }
    tet.geometry(lvtr, avtr, &vol);
    avtr[3].reset();
    avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);

    tetraMAP_aux = new int[TetPolyOrderDim[tet.PolyOrderFlag]];
    MapE_Pe = new int[2 * TetPolyOrderDim[tet.PolyOrderFlag]];
      #if defined(DGTD_USE_CUDA)
        E_coeff = new fp_t_ts[TetPolyOrderDim[tet.PolyOrderFlag]];
        H_coeff = new fp_t_ts[TetPolyOrderDim[tet.PolyOrderFlag]];
      #else
        E_coeff = new fp_t[TetPolyOrderDim[tet.PolyOrderFlag]];
        H_coeff = new fp_t[TetPolyOrderDim[tet.PolyOrderFlag]];
      #endif


    // E field
    tet.Local_DG_mapE(tetraMAP_aux, tet.LocalOffsetE);

    for(j = 0 ; j < TetPolyOrderDim[tet.PolyOrderFlag] ; j++){
      if(tetraMAP_aux[j] < 0)
        E_coeff[j] = 0.0;
      else
        #if defined(DGTD_USE_CUDA)
          E_coeff[j] = En1_h[tetraMAP_aux[j]];
        #else
          E_coeff[j] = en_1->getentry(tetraMAP_aux[j]);
        #endif
    }
    // H field
    tet.Local_DG_mapH(tetraMAP_aux, tet.LocalOffsetH);

    for(j = 0 ; j < TetPolyOrderDim[tet.PolyOrderFlag] ; j++){
      if(tetraMAP_aux[j] < 0)
        H_coeff[j] = 0.0;
      else
        #if defined(DGTD_USE_CUDA)
          H_coeff[j] = Hn32_h[tetraMAP_aux[j]];
        #else
          H_coeff[j] = hn_32->getentry(tetraMAP_aux[j]);
        #endif

    }

    for(j = 0; j < 4; j++){
      zeta[0] = BaryCoord[j][0];
      zeta[1] = BaryCoord[j][1];
      zeta[2] = BaryCoord[j][2];
      zeta[3] = BaryCoord[j][3];

      eLocal[j] = CalcEfield(E_coeff, avtr, vol, zeta, tet.PolyOrderFlag);
      hLocal[j] = CalcEfield(H_coeff, avtr, vol, zeta, tet.PolyOrderFlag);

      int index = tet.nd[j]->getid();
      eField[index] = eField[index] + eLocal[j] /*- Einc*/;
      hField[index] = hField[index] + hLocal[j] /*- Hinc*/;
      count[index] += 1;
    }
    regE.setRegister(i, eLocal);
    regH.setRegister(i, hLocal);
    delete [] tetraMAP_aux;
    delete [] MapE_Pe;
    delete [] E_coeff;
    delete [] H_coeff;
  }

  for(i = 0; i < nodeCNT; i++){
    eField[i] = eField[i] / static_cast<fp_t>(count[i]);
    hField[i] = hField[i] / static_cast<fp_t>(count[i]);
  }

  VtkWriter vtkWriter(1.0);
  //   VtkWriter vtkWriter(unit);
  char vtkFilePrefix[128];
  memset(vtkFilePrefix, 0, 128 * sizeof(char));

  sprintf(vtkFilePrefix, "%s_%04d", fname, timeStep);

  vtkWriter.writeField(vtkFilePrefix, nodeCNT, ndARRAY, tetraCNT, tetARRAY, eField, hField, polyOrder, 0, 0); //TODO: why here polyorder is not 1

  delete [] eField;
  delete [] hField;
  delete [] polyOrder;
  delete [] count;
}


// Modified by qi jian to compute the analytical incident field at the probes
// Writes the analytical incident plane-wave field at every probe for step
// `timeStep` to "AnalyticalIncidentField_<fname>_<timeStep>.csv".
// For each probe the field returned by getAnalyticalPWField is evaluated in
// every tetrahedron stored for it in probes_bary and averaged over those
// tetrahedra. A probe with no associated tetrahedra yields zero fields.
void FemGrp::writeAnalyticalIncidentPWProbes(int timeStep){
  int i;
  vtr Einc;
  vtr Hinc;
  vtr r;
  vtr Einc_field;
  vtr Hinc_field;

  fp_t zeta[4];
  char csvFileName[StrOutput];
  // Bounded formatting: a long fname must not overrun csvFileName.
  snprintf(csvFileName, sizeof(csvFileName), "AnalyticalIncidentField_%s_%04d.csv", fname, timeStep);

  std::ofstream csvFile(csvFileName);
  csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";

  const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};

  for(i = 0; i < probeCNT; i++)
  {

    // Get the Incident Field at the probe
    const int number_of_associated_tets = probes_bary.at(i).first;

    Einc.reset();
    Hinc.reset();
    // Reference instead of a per-probe deep copy of the tet list.
    const std::vector<std::pair<int, std::array<double, 4>>>& found_tets = probes_bary.at(i).second;
    Einc_field.reset(); // Sum over all valid candidate tets
    Hinc_field.reset(); // Sum over all valid candidate tets

    for (int t = 0; t < number_of_associated_tets; t++)
    {
      const int tet_id = found_tets.at(t).first;
      const std::array<double, 4>& tri_bary_coord = found_tets.at(t).second;
      tetra& tet = tetARRAY[tet_id];

      zeta[0] = static_cast<fp_t>(tri_bary_coord[0]);
      zeta[1] = static_cast<fp_t>(tri_bary_coord[1]);
      zeta[2] = static_cast<fp_t>(tri_bary_coord[2]);
      zeta[3] = static_cast<fp_t>(tri_bary_coord[3]);

      SimplexToCartesian(tet, r, zeta);
      getAnalyticalPWField(tet, r, Einc, Hinc, timeStep, LocTimeSteps[N_class -1]);

      Einc_field = Einc_field + Einc;
      Hinc_field = Hinc_field + Hinc;

    }

    // Average over the contributing tetrahedra; skipped for an empty list so
    // that no division by zero takes place.
    if(number_of_associated_tets > 0)
    {
      Einc_field = Einc_field / ((fp_t) number_of_associated_tets);
      Hinc_field = Hinc_field / ((fp_t) number_of_associated_tets);
    }

    csvFile << std::setprecision(max_precision) << Einc_field.getx() << "," << Einc_field.gety() << "," << Einc_field.getz() << "," << Hinc_field.getx() << "," << Hinc_field.gety() << "," << Hinc_field.getz() << "\n";

  }
  usleep(100);
  csvFile.close();
}


// Evaluates the analytical incident plane-wave field at the point r.
//
// For every boundary condition of type planeWaveType or pmlType the incident
// E field is sampled at dt * (timeStep + 1.0) and the H field at
// dt * (timeStep + 1.5); the time profile is selected by ExcitFlag.
//
//   tet      : tetrahedron containing r; its material sets the wave impedance.
//   r        : evaluation point.
//   Einc/Hinc: outputs; left untouched when no matching boundary condition exists.
//   timeStep : time-step index.
//   dt       : time-step size.
//
// If several boundary conditions match, the last one wins (outputs are overwritten).
void FemGrp::getAnalyticalPWField(tetra& tet, vtr& r, vtr& Einc, vtr& Hinc, int timeStep, fp_t dt){
  fp_t eta = No * sqrt(tet.mat->mur.getEntry(0,0) / tet.mat->epsr.getEntry(0,0));
  fp_t omega = 2.0  * Pi * freq * MEGA;
  fp_t Exponent;
  fp_t SinModul;
  fp_t Neuman;

  // Sampling instants of the E and H fields.
  const fp_t tE = dt * (timeStep + 1.0);
  const fp_t tH = dt * (timeStep + 1.5);

  for(int i = 0; i < bcCNT; i++){
    bc bc_i = bcARRAY[i];
    if(bc_i.getbType() != planeWaveType && bc_i.getbType() != pmlType)
      continue;

    fp_t Emagnitude = bc_i.getMagE();
    fp_t theta_in_rad = bc_i.getTheta() * Pi / 180.0;
    fp_t phi_in_rad = bc_i.getPhi() * Pi / 180.0;
    vtr Epol = bc_i.getField();
    vtr kvtr(sin(theta_in_rad) * cos(phi_in_rad), sin(theta_in_rad) * sin(phi_in_rad), cos(theta_in_rad));
    vtr Hpol = kvtr * Epol;
    vtr ro = bc_i.getPW_ro();
    fp_t Hmagnitude = Emagnitude / eta;

    Hpol.unitvtr();
    Epol.unitvtr();

    // Propagation delay from the reference point ro to r (identical for E and H).
    const auto delay = dotP(kvtr, r - ro) / Vo;

    // Zero by default so that an unknown ExcitFlag yields a zero field instead of
    // reading uninitialised values.
    fp_t IncidExcit_E = 0.0;
    fp_t IncidExcit_H = 0.0;

    switch(ExcitFlag){
      case 0: //(not tested)
        // Plane wave, switched on once the retarded E time is non-negative.
        // The retarded time is computed BEFORE it is tested (the previous code
        // tested an uninitialised variable).
        Exponent = tE - To - delay;
        if(Exponent >= 0.0){
          SinModul = cos(omega * Exponent);
          IncidExcit_E = Emagnitude * SinModul;
          Exponent = tH - To - delay;
          SinModul = cos(omega * Exponent);
          IncidExcit_H = Hmagnitude * SinModul;
        }
        break;

      case 1:
        // Gauss Pulse
        Exponent = tE - To - delay;
        SinModul = ModuleFlag ? cos(omega * Exponent) : 1.0;
        IncidExcit_E = Emagnitude * SinModul * exp(-(Exponent * Exponent) / (Tau * Tau));
        Exponent = tH - To - delay;
        SinModul = ModuleFlag ? cos(omega * Exponent) : 1.0;
        IncidExcit_H = Hmagnitude * SinModul * exp(-(Exponent * Exponent) / (Tau * Tau));
        break;

      case 2: //(not tested)
        // Neuman Pulse
        Exponent = tE - To - delay;
        Neuman = (2.0 * Exponent) / (Tau * Tau);
        IncidExcit_E = (Emagnitude * Neuman) * exp(-(Exponent * Exponent) / (Tau * Tau));
        Exponent = tH - To - delay;
        Neuman = (2.0 * Exponent) / (Tau * Tau);
        IncidExcit_H = Hmagnitude * Neuman * exp(-(Exponent * Exponent) / (Tau * Tau));
        break;

      case 3:
      {
        // DC-Free Hann-Modulated Cosine Pulse (with time delay)
        const fp_t tdelay = To; // To represents the delay time

        // Windowed cosine evaluated at retarded time tRet; zero outside [0, Tau].
        auto hannCosine = [&](fp_t magnitude, fp_t tRet) -> fp_t {
          if(!(tRet >= 0.0 && tRet <= Tau))
            return 0.0;
          fp_t t_rel = tRet - Tau / 2.0;                             // relative to pulse centre
          fp_t window = 0.5 * (1.0 - cos(2.0 * Pi * tRet / Tau));    // Hann window
          fp_t carrier = cos(omega * t_rel);
          return magnitude * carrier * window;
        };

        IncidExcit_E = hannCosine(Emagnitude, tE - tdelay - delay);
        IncidExcit_H = hannCosine(Hmagnitude, tH - tdelay - delay);
        break;
      }

      case 4: // Linear Chirp Excitation with sine start and Hann window
      {
        fp_t f_end = freq * MEGA;
        fp_t B = Tau * MEGA;
        fp_t f0 = f_end - B;
        fp_t f1 = f_end;
        fp_t Tchirp = To; // in this mode To is used as the chirp duration

        // Windowed chirp evaluated at retarded time tRet; zero outside [0, Tchirp].
        auto hannChirp = [&](fp_t magnitude, fp_t tRet) -> fp_t {
          if(!(tRet >= 0.0 && tRet <= Tchirp))
            return 0.0;
          fp_t chirpArg = 2.0 * Pi * f0 * tRet + Pi * (f1 - f0) / Tchirp * tRet * tRet;
          fp_t window = 0.5 * (1.0 - cos(2.0 * Pi * tRet / Tchirp)); // Hann window
          return magnitude * sin(chirpArg) * window;
        };

        // E and H use the same retarded-time definition. The previous code
        // additionally subtracted To for H only, which shifted H by one chirp
        // duration relative to E.
        // NOTE(review): confirm against the excitation used by the solver.
        IncidExcit_E = hannChirp(Emagnitude, tE - delay);
        IncidExcit_H = hannChirp(Hmagnitude, tH - delay);
        break;
      }

      default:
        break;
    }

    Einc = Epol * IncidExcit_E;
    Hinc = Hpol * IncidExcit_H;
    // cout << "Einc at: (" << r.getx() << ", " << r.gety() << ", " << r.getz() << ") = (" << Einc.getx() << ", " << Einc.gety() << ", " << Einc.getz() << ")" << endl;
  }
}

void FemGrp::writeEquivalentSurfaceCurrents_(int timeStep){
  int i, j;
  int m;
  int index;
  int FaceNum;
  fp_t vol;
  fp_t zeta[4];
  fp_t Area;
  vtr NormalVtr;
  vtr lvtr[3];
  vtr avtr[4];
  vtr coord[4];
  vtr eLocal[4];
  vtr hLocal[4];
  vtr eLocalFace[3];
  vtr hLocalFace[3];
  tetra* tet;

  ArrayFP<fp_t>* origEn_1 = new ArrayFP<fp_t>(TetPolyOrderDim[PolyFlag]);
  ArrayFP<fp_t>* origHn_32 = new ArrayFP<fp_t>(TetPolyOrderDim[PolyFlag]);

  char Currents_vtkFile[StrOutput];
  sprintf(Currents_vtkFile, "Currents_%s_%04d", fname, timeStep);

  // fill the port field with averaged values
  vtr* JField = new vtr[SurfMesh->nodeCNT];
  vtr* MField = new vtr[SurfMesh->nodeCNT];
  int* count = new int[SurfMesh->nodeCNT];
  memset(count, 0, SurfMesh->nodeCNT * sizeof(int));

  regMface = new Register[SurfMesh->faceCNT];
  regJface = new Register[SurfMesh->faceCNT];

  for(i = 0; i < SurfMesh->faceCNT; i++){
    SurfMesh->fcArray[i]->getAreaNormal(&Area, &NormalVtr);
    tet = SurfMesh->fcArray[i]->hydra[0];
    tet->geometry(lvtr, avtr, &vol);
    avtr[3].reset();
    avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);

    Get_Coefficients_(tet, origEn_1, origHn_32);

    for(m = 0; m < NumOfFaces; m++){
      zeta[m] = 0.0;
      if(SurfMesh->fcArray[i] == tet->getFacePtr(m))
        FaceNum = m;
    }

    for(j = 0; j < 4; j++){
      zeta[0] = BaryCoord[j][0];
      zeta[1] = BaryCoord[j][1];
      zeta[2] = BaryCoord[j][2];
      zeta[3] = BaryCoord[j][3];
      eLocal[j] = CalcEfield(origEn_1->getEntryPtr(), avtr, vol, zeta, tet->PolyOrderFlag);
      hLocal[j] = CalcEfield(origHn_32->getEntryPtr(), avtr, vol, zeta, tet->PolyOrderFlag);
    }

    regMface[i].initial(3);
    regJface[i].initial(3);
    for(j = 0; j < 3; j++){
      eLocalFace[j] = eLocal[faceMAP[FaceNum][j]];
      hLocalFace[j] = hLocal[faceMAP[FaceNum][j]];
      index = SurfMesh->globToLocMap_->find(SurfMesh->fcArray[i]->getNode(j)->getid())->second;
      MField[index] = MField[index] + NormalVtr * eLocalFace[j] * (-1.0);
      JField[index] = JField[index] + NormalVtr * hLocalFace[j] * (1.0);
      // No averaging
      regMface[i].setField(j, NormalVtr * eLocalFace[j] * (-1.0));
      regJface[i].setField(j, NormalVtr * hLocalFace[j] * (1.0));
      count[index] += 1;
    }
  }

  // This is for visualization in the vtk format
  for(i = 0; i < SurfMesh->nodeCNT; i++){
    MField[i] = MField[i] / static_cast<fp_t>(count[i]);
    JField[i] = JField[i] / static_cast<fp_t>(count[i]);
  }

  node** locNodeArray = new node*[SurfMesh->nodeCNT];
  for(i = 0; i < SurfMesh->nodeCNT; i++){
    node& Node = *(SurfMesh->ndArray[i]);
    int index  = SurfMesh->globToLocMap_->find(Node.getid())->second;
    locNodeArray[index] = new node(index, Node.getPType(), Node.getSingOrder(), Node.getCoord().getx(), Node.getCoord().gety(), Node.getCoord().getz());
  }

  face** locFaceArray = new face*[SurfMesh->faceCNT];
  for(i = 0; i < SurfMesh->faceCNT; i++){
    face& Face = *(SurfMesh->fcArray[i]);
    locFaceArray[i] = new face(Face);
    locFaceArray[i]->setFace(
        locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(0)->getid())->second],
        locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(1)->getid())->second],
        locNodeArray[SurfMesh->globToLocMap_->find(Face.getNode(2)->getid())->second]);
  }

  // Vtk
  VtkWriter vtkWriter(1.);
  vtkWriter.writeTriUg(Currents_vtkFile, SurfMesh->nodeCNT, locNodeArray, SurfMesh->faceCNT, locFaceArray, MField, JField, 1);

  // Register
  char regFileName[StrOutput];
  char regFileNameDebug[StrOutput];
  memset(regFileName, 0, StrOutput * sizeof(char));
  sprintf(regFileName, "Currents_%s_%05d", fname, timeStep);
  sprintf(regFileNameDebug, "Currents_%s_%05d_dbg", fname, timeStep);

  printRegister(regMface, regJface, SurfMesh->faceCNT, regFileName,1);
  // printRegisterDebug(regMface, regJface, SurfMesh->faceCNT, regFileNameDebug,2);

  if(timeStep == 0)
    printTriMesh(SurfMesh->nodeCNT, locNodeArray, SurfMesh->faceCNT, locFaceArray, fname);

  for(i = 0; i < SurfMesh->nodeCNT; i++)
    delete locNodeArray[i];
  delete [] locNodeArray;
  for(i = 0; i < SurfMesh->faceCNT; i++)
    delete locFaceArray[i];
  delete [] locFaceArray;
  delete [] MField;
  delete [] JField;
  delete [] count;
  delete origEn_1;
  delete origHn_32;
}


// Print face registers
//
// Writes the per-face M and J registers to "<prjName>_BC.curM" and
// "<prjName>_BC.curJ", one scalar per line, with an empty line after each face.
//
//   order == 1 : the three stored fields of each face (x, y, z each).
//   order == 2 : the same three fields, followed by three averages of the field
//                pairs selected by First2Second[j].
//   any other order: nothing is written for the faces.
void FemGrp::printRegister(Register* regMface, Register* regJface, int FaceCnt, char *prjName, int order){
  char fnameJ3[180];
  char fnameM3[180];

  // snprintf: prjName comes from the caller, never write past the buffers.
  snprintf(fnameM3, sizeof(fnameM3), "%s_BC.curM", prjName);
  snprintf(fnameJ3, sizeof(fnameJ3), "%s_BC.curJ", prjName);

  std::ofstream foutM3(fnameM3, std::ios::out);
  std::ofstream foutJ3(fnameJ3, std::ios::out);
  // Best effort as before: report the failure, writes to a failed stream are no-ops.
  if(!foutM3)
    std::cout << "Error in opening file: " << fnameM3 << " for write " << std::endl;
  if(!foutJ3)
    std::cout << "Error in opening file: " << fnameJ3 << " for write " << std::endl;

  // '\n' instead of std::endl: same bytes, but no flush after every scalar.

  // Writes x, y, z of the three stored fields of one register.
  auto writeStoredFields = [&](std::ostream& out, Register& reg){
    for(int j = 0; j < 3; j++){
      out << reg.getField(j).getx() << '\n';
      out << reg.getField(j).gety() << '\n';
      out << reg.getField(j).getz() << '\n';
    }
  };

  // Writes x, y, z of the average of the two fields selected by First2Second[j].
  auto writePairAverages = [&](std::ostream& out, Register& reg){
    for(int j = 0; j < 3; j++){
      int index0 = First2Second[j][0];
      int index1 = First2Second[j][1];
      out << 0.5 * (reg.getField(index0).getx() + reg.getField(index1).getx()) << '\n';
      out << 0.5 * (reg.getField(index0).gety() + reg.getField(index1).gety()) << '\n';
      out << 0.5 * (reg.getField(index0).getz() + reg.getField(index1).getz()) << '\n';
    }
  };

  if(order == 1 || order == 2){
    for(int i = 0; i < FaceCnt; i++){
      writeStoredFields(foutM3, regMface[i]);
      if(order == 2)
        writePairAverages(foutM3, regMface[i]);
      foutM3 << '\n';

      writeStoredFields(foutJ3, regJface[i]);
      if(order == 2)
        writePairAverages(foutJ3, regJface[i]);
      foutJ3 << '\n';
    }
  }

  foutJ3.close();
  foutM3.close();
}

// Print out Outer Surface node & triangle info on *.tri
//
// File layout of "<prjName>.tri":
//   unit
//   ndNum
//   ndNum lines  "x y z"        (node coordinates divided by unit)
//   fcNum
//   fcNum lines  "id0 id1 id2"  (ids of the three nodes of each face)
//
// If the file cannot be opened an error is reported and nothing is written.
void FemGrp::printTriMesh(int ndNum, node **ndArray, int fcNum, face **fcArray, char *prjName){
  char triName[360];
  snprintf(triName, sizeof(triName), "%s.tri", prjName);

  FILE* fd = fopen(triName, "wt");
  if(fd == NULL){
    // Previously a failed fopen led to fprintf on a null FILE*.
    fprintf(stderr, "Error in opening file: %s for write (%s)\n", triName, strerror(errno));
    return;
  }

  fprintf(fd, "%f\n", unit);
  fprintf(fd, "%d\n", ndNum);

  for(int i = 0; i < ndNum; i ++){
    fprintf(fd, "%f %f %f\n",
            (ndArray[i]->getCoord().getx()) / unit,
            (ndArray[i]->getCoord().gety()) / unit,
            (ndArray[i]->getCoord().getz()) / unit);
  }

  fprintf(fd,"%d\n", fcNum);
  for(int i = 0; i < fcNum; i ++){
    face* fcPtr = fcArray[i];
    node* n0Ptr = fcPtr->getNode(0);
    node* n1Ptr = fcPtr->getNode(1);
    node* n2Ptr = fcPtr->getNode(2);
    fprintf(fd, "%d %d %d\n", n0Ptr->getid(), n1Ptr->getid(), n2Ptr->getid());
  }
  fclose(fd);
}


// Modified by qi jian to compute the L2 error at the probes
void FemGrp::CalculateL2ErrorProbes(int& timeStep, fp_t dt, int TimeDistFlag){
  int i, j;

  fp_t vol;
  fp_t zeta[4];
  vtr lvtr[3];
  vtr avtr[4];
  vtr eLocal;
  vtr hLocal;
  vtr eLocal_exa;
  vtr hLocal_exa;

  vtr eLocal_all;
  vtr hLocal_all;
  vtr eLocal_exa_all;
  vtr hLocal_exa_all;

  fp_t E_coeff[TetPolyOrderDim[getPolyFlag()]];
  fp_t H_coeff[TetPolyOrderDim[getPolyFlag()]];
  fp_t IntegrOmegaE = 0.0;
  fp_t IntegrOmegaH = 0.0;
  vtr r;
  vtr Exa_NumE;
  vtr Exa_NumH;
  char Error_E_TimeLog[180];
  char Error_H_TimeLog[180];

  int outOfModelProbes = 0;

  for(i = 0; i < probeCNT; i++)
  {

    int number_of_associated_tets = probes_bary.at(i).first;

    eLocal.reset();
    hLocal.reset();

    std::vector<std::pair<int, std::array<double, 4>>> found_tets = probes_bary.at(i).second;
    eLocal_exa.reset();
    hLocal_exa.reset();

    eLocal_all.reset();
    hLocal_all.reset();
    eLocal_exa_all.reset();
    hLocal_exa_all.reset();

    for (int t = 0; t < number_of_associated_tets; t++)
    {

      int tet_id = found_tets.at(t).first;
      array<double,4> probe_bary_coord = found_tets.at(t).second;
      tetra& tet = tetARRAY[tet_id];

      int tetraMAP[TetPolyOrderDim[tet.PolyOrderFlag]];

      tet.geometry(lvtr, avtr, &vol);
      avtr[3].reset();
      avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);

      // Compute the Efield
      tet.Local_DG_mapE(tetraMAP, tet.LocalOffsetE);
      for(j = 0 ; j < TetPolyOrderDim[tet.PolyOrderFlag] ; j++){
        if(tetraMAP[j] < 0)
          E_coeff[j] = 0.0;
        else
          E_coeff[j] = en_1->getentry(tetraMAP[j]);
      }

      // Compute the Hfield
      tet.Local_DG_mapH(tetraMAP, tet.LocalOffsetH);
      for(j = 0 ; j < TetPolyOrderDim[tet.PolyOrderFlag] ; j++){
        if(tetraMAP[j] < 0)
          H_coeff[j] = 0.0;
        else
          H_coeff[j] = hn_32->getentry(tetraMAP[j]);
      }

      eLocal.reset();
      hLocal.reset();
      eLocal_exa.reset();
      hLocal_exa.reset();


      zeta[0] = static_cast<fp_t>(probe_bary_coord[0]);
      zeta[1] = static_cast<fp_t>(probe_bary_coord[1]);
      zeta[2] = static_cast<fp_t>(probe_bary_coord[2]);
      zeta[3] = static_cast<fp_t>(probe_bary_coord[3]);
      SimplexToCartesian(tet, r, zeta);

      eLocal = CalcEfield(E_coeff, avtr, vol, zeta, tet.PolyOrderFlag);
      hLocal = CalcEfield(H_coeff, avtr, vol, zeta, tet.PolyOrderFlag);
      GetExactSolution(tet, r, eLocal_exa, hLocal_exa, timeStep, dt, TimeDistFlag);


      // Add all the local fields from all relevant tets
      eLocal_all = eLocal_all + eLocal;
      hLocal_all = hLocal_all + hLocal;
      eLocal_exa_all = eLocal_exa_all + eLocal_exa;
      hLocal_exa_all = hLocal_exa_all + hLocal_exa;

    }

    eLocal_all = eLocal_all / ((fp_t) number_of_associated_tets);
    hLocal_all = hLocal_all / ((fp_t) number_of_associated_tets);
    eLocal_exa_all = eLocal_exa_all / ((fp_t) number_of_associated_tets);
    hLocal_exa_all = hLocal_exa_all / ((fp_t) number_of_associated_tets);


    Exa_NumE = eLocal_exa_all - eLocal_all;
    Exa_NumH = hLocal_exa_all - hLocal_all;

    IntegrOmegaE += Exa_NumE.magnitude() * Exa_NumE.magnitude();
    IntegrOmegaH += Exa_NumH.magnitude() * Exa_NumH.magnitude();

    sprintf(Error_E_TimeLog, "%s_Probe_%d.TDerrorE", fname, i);
    sprintf(Error_H_TimeLog, "%s_Probe_%d.TDerrorH", fname, i);

    ofstream Error_E(Error_E_TimeLog, ios_base::out | ios::app);
    Error_E.setf(ios::scientific, ios::floatfield);
    Error_E.precision(15);

    if(!Error_E)
      cout << "Error in opening file: " << Error_E_TimeLog << " for write " << endl;

    Error_E << "[" << (timeStep + 1.0) * dt << ", " << Exa_NumE.magnitude() << "]; \n";
    Error_E.close();

    ofstream Error_H(Error_H_TimeLog, ios_base::out | ios::app);
    Error_H.setf(ios::scientific, ios::floatfield);
    Error_H.precision(15);

    if(!Error_H)
      cout << "Error in opening file: " << Error_H_TimeLog << " for write " << endl;

    Error_H << "[" << (timeStep + 1.5) * dt << ", " << Exa_NumH.magnitude() << "]; \n";
    Error_H.close();

  }


  // Write to file
  if(outOfModelProbes < probeCNT)
  {
    sprintf(Error_E_TimeLog, "%s_Probes_Global.TDerrorE", fname);
    sprintf(Error_H_TimeLog, "%s_Probes_Global.TDerrorH", fname);

    ofstream Error_E(Error_E_TimeLog, ios_base::out | ios::app);
    Error_E.setf(ios::scientific, ios::floatfield);
    Error_E.precision(15);

    if(!Error_E)
      cout << "Error in opening file: " << Error_E_TimeLog << " for write " << endl;

    Error_E << "[" << (timeStep + 1.0) * dt << ", " << sqrt(IntegrOmegaE / (probeCNT - outOfModelProbes)) << "]; \n";
    Error_E.close();

    ofstream Error_H(Error_H_TimeLog, ios_base::out | ios::app);
    Error_H.setf(ios::scientific, ios::floatfield);
    Error_H.precision(15);

    if(!Error_H)
      cout << "Error in opening file: " << Error_H_TimeLog << " for write " << endl;

    Error_H << "[" << (timeStep + 1.5) * dt << ", " << sqrt(IntegrOmegaH / (probeCNT - outOfModelProbes)) << "]; \n";
    Error_H.close();
  }

}


void FemGrp::CalculateL2Error(int& timeStep, fp_t dt, int TimeDistFlag){
  int i, j;
  fp_t vol;
  fp_t zeta[4];
  vtr lvtr[3];
  vtr avtr[4];
  vtr coord[4];
  vtr eLocal[4];
  vtr hLocal[4];
  vtr eLocal_exa[4];
  vtr hLocal_exa[4];
  int QuadOrder = 2; //TODO: Recheck with the order of the basis
  int points = 4;
  fp_t** ZetaMat = new fp_t*[points];
  fp_t* weights = new fp_t[points];
  for(int i = 0; i < points; i++)
    ZetaMat[i] = new fp_t[4];
  GetTetQuadRule(QuadOrder, points, ZetaMat, weights);

  fp_t IntegrOmegaE = 0.0;
  fp_t IntegrOmegaH = 0.0;
  fp_t NormalizeOmegaE = 0.0;
  fp_t NormalizeOmegaH = 0.0;

  for(i = 0; i < tetraCNT; i++){
    tetra& tet = tetARRAY[i];
    int tetraMAP_E[TetPolyOrderDim[tet.PolyOrderFlag]];
    int tetraMAP_H[TetPolyOrderDim[tet.PolyOrderFlag]];
    auto origEn_1 = new ArrayFP<fp_t>(TetPolyOrderDim[tet.PolyOrderFlag]);
    auto origHn_32 = new ArrayFP<fp_t>(TetPolyOrderDim[tet.PolyOrderFlag]);
    for(j = 0; j < 4; j++){
      coord[j] = (tet.getNode(j))->getCoord();
    }
    tet.geometry(lvtr, avtr, &vol);
    avtr[3].reset();
    avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);


    tet.Local_DG_mapE(tetraMAP_E, tet.LocalOffsetE);
    tet.Local_DG_mapH(tetraMAP_H, tet.LocalOffsetH);
    origEn_1->reset();
    origHn_32->reset();
    for(j = 0 ; j < TetPolyOrderDim[tet.PolyOrderFlag]; j++){
      if(tetraMAP_E[j] < 0)
        origEn_1->setentry(j, 0.0);
      else
        origEn_1->setentry(j, en_1->getentry(tetraMAP_E[j]));

      if(tetraMAP_H[j] < 0)
        origHn_32->setentry(j, 0.0);
      else
        origHn_32->setentry(j, hn_32->getentry(tetraMAP_H[j]));
    }

    fp_t IntegrValueE = 0.0;
    fp_t IntegrValueH = 0.0;
    fp_t NormalizeValueE = 0.0;
    fp_t NormalizeValueH = 0.0;

    vtr r;
    vtr Exa_NumE;
    Exa_NumE.reset();
    vtr Exa_NumH;
    Exa_NumH.reset();

    //Tetrahedron integration
    for(j = 0; j < points; j++){
      zeta[0] = ZetaMat[j][0];
      zeta[1] = ZetaMat[j][1];
      zeta[2] = ZetaMat[j][2];
      zeta[3] = ZetaMat[j][3];
      SimplexToCartesian(tet, r, zeta);
      eLocal[j] = CalcEfield(origEn_1->getEntryPtr(), avtr, vol, zeta, tet.PolyOrderFlag);
      hLocal[j] = CalcEfield(origHn_32->getEntryPtr(), avtr, vol, zeta, tet.PolyOrderFlag);
      GetExactSolution(tet, r, eLocal_exa[j], hLocal_exa[j], timeStep, dt, TimeDistFlag);

      Exa_NumE = eLocal_exa[j] - eLocal[j];
      Exa_NumH = hLocal_exa[j] - hLocal[j];
      IntegrValueE += weights[j] * vol * (Exa_NumE.magnitude() * Exa_NumE.magnitude());
      IntegrValueH += weights[j] * vol * (Exa_NumH.magnitude() * Exa_NumH.magnitude());
      NormalizeValueE += weights[j] * vol * (eLocal_exa[j].magnitude() * eLocal_exa[j].magnitude());
      NormalizeValueH += weights[j] * vol * (hLocal_exa[j].magnitude() * hLocal_exa[j].magnitude());
    }

    IntegrOmegaE = IntegrOmegaE + IntegrValueE;
    IntegrOmegaH = IntegrOmegaH + IntegrValueH;
    NormalizeOmegaE = NormalizeOmegaE + NormalizeValueE;
    NormalizeOmegaH = NormalizeOmegaH + NormalizeValueH;
  }
    // Write to file
  char Error_E_TimeLog[180];
  char Error_H_TimeLog[180];
  sprintf(Error_E_TimeLog, "%s.TDerrorE", fname);
  sprintf(Error_H_TimeLog, "%s.TDerrorH", fname);

  ofstream Error_E(Error_E_TimeLog, ios_base::out | ios::app);
  Error_E.setf(ios::scientific, ios::floatfield);
  Error_E.precision(15);
  if(!Error_E)
    cout << "Error in opening file: " << Error_E_TimeLog << " for write " << endl;

  Error_E << "[" << (timeStep + 1.0) * dt << ", " << sqrt(IntegrOmegaE) << "]; \n";
  Error_E.close();

  ofstream Error_H(Error_H_TimeLog, ios_base::out | ios::app);
  Error_H.setf(ios::scientific, ios::floatfield);
  Error_H.precision(15);
  if(!Error_H)
    cout << "Error in opening file: " << Error_H_TimeLog << " for write " << endl;

  Error_H << "[" << (timeStep + 1.5) * dt << ", " << sqrt(IntegrOmegaH) << "]; \n";
  Error_H.close();

  delete[] weights;
  for(i = 0; i < points; i++)
    delete[] ZetaMat[i];
  delete[] ZetaMat;
}

// Converts the simplex coordinates zeta[0..3] of a point inside tet into
// Cartesian coordinates: r = sum over the four nodes of zeta[k] * node_k.
// The result is stored in r.
void FemGrp::SimplexToCartesian(tetra& tet, vtr& r, fp_t zeta[4]){
  fp_t px = 0.;
  fp_t py = 0.;
  fp_t pz = 0.;

  int k = 0;
  while(k < 4){
    // Weighted contribution of node k to each Cartesian component.
    px += tet.getNode(k)->getCoord().getx() * zeta[k];
    py += tet.getNode(k)->getCoord().gety() * zeta[k];
    pz += tet.getNode(k)->getCoord().getz() * zeta[k];
    ++k;
  }

  r.setvtr(px, py, pz);
}

// Reference plane-wave solution at point r.
//
// For each boundary condition of type planeWaveType or pmlType, Einc is
// evaluated at (timeStep + 1.0) * dt and Hinc at (timeStep + 1.5) * dt. Flag
// selects the time profile:
//   0 : cosine of (k . (r - ro) - omega * t), with k scaled by omega / V_light
//   1 : Gaussian in the retarded time, multiplied by a cosine when ModuleFlag is set
//   2 : Gaussian multiplied by 2 * t_ret / tau^2
// Any other Flag, or the absence of a matching boundary condition, leaves
// Einc / Hinc unchanged. The wave speed and impedance come from the material of tet.
void FemGrp::GetExactSolution(tetra& tet, vtr& r, vtr& Einc, vtr& Hinc, int timeStep, fp_t dt, int Flag){
  const fp_t pulseDelay = To;
  const fp_t pulseWidth = Tau;
  const fp_t impedance = No * sqrt(tet.mat->mur.getEntry(0,0) / tet.mat->epsr.getEntry(0,0));
  const fp_t waveSpeed = Vo / sqrt(tet.mat->epsr.getEntry(0,0) * tet.mat->mur.getEntry(0,0));
  const fp_t carrierFreq = freq;
  const fp_t omega = 2.0  * Pi * carrierFreq * MEGA;

  fp_t tRet;      // retarded time
  fp_t carrier;   // cosine modulation factor (1 when ModuleFlag is off)
  fp_t slope;     // 2 * tRet / tau^2 factor of profile 2

  for(int b = 0; b < bcCNT; ++b){
    bc source = bcARRAY[b];
    if(!(source.getbType() == planeWaveType || source.getbType() == pmlType))
      continue;

    const fp_t Eamp = source.getMagE();
    const fp_t thetaRad = source.getTheta() * Pi / 180.0;
    const fp_t phiRad = source.getPhi() * Pi / 180.0;
    vtr Edir = source.getField();
    vtr kdir(sin(thetaRad) * cos(phiRad), sin(thetaRad) * sin(phiRad), cos(thetaRad));
    kdir.unitvtr();
    vtr Hdir = kdir * Edir;
    vtr origin = source.getPW_ro();
    const fp_t Hamp = Eamp / impedance;

    Hdir.unitvtr();
    Edir.unitvtr();

    if(Flag == 0){
      // Time-harmonic wave; kdir becomes the wave vector.
      kdir.Scale((omega / waveSpeed));
      Hinc = Hdir * (Hamp * cos(dotP(kdir, r - origin) - omega * (timeStep + 1.5) * dt));
      Einc = Edir * (Eamp * cos(dotP(kdir, r - origin) - omega * (timeStep + 1.0) * dt));
    }else if(Flag == 1){
      // Gaussian pulse, E first and then H half a step later.
      tRet = (timeStep + 1.0) * dt - pulseDelay - (dotP(kdir, r - origin) / waveSpeed);
      carrier = ModuleFlag ? cos(omega * tRet) : 1.0;
      Einc = Edir * carrier * (Eamp * exp(- (tRet * tRet) / (pulseWidth * pulseWidth)));

      tRet = (timeStep + 1.5) * dt - pulseDelay - (dotP(kdir, r - origin) / waveSpeed);
      carrier = ModuleFlag ? cos(omega * tRet) : 1.0;
      Hinc = Hdir * carrier * (Hamp * exp(- (tRet * tRet) / (pulseWidth * pulseWidth)));
    }else if(Flag == 2){
      // Gaussian multiplied by the linear factor 2 * tRet / tau^2.
      tRet = (timeStep + 1.5) * dt - pulseDelay - (dotP(kdir, r - origin) / waveSpeed);
      slope = (2.0 * tRet) / (pulseWidth * pulseWidth);
      Hinc = Hdir * (Hamp * slope * exp(- (tRet * tRet) / (pulseWidth * pulseWidth)));

      tRet = (timeStep + 1.0) * dt - pulseDelay - (dotP(kdir, r - origin) / waveSpeed);
      slope = (2.0 * tRet) / (pulseWidth * pulseWidth);
      Einc = Edir * (Eamp * slope * exp(- (tRet * tRet) / (pulseWidth * pulseWidth)));
    }
  }
}

/* "Early Time Behavior in Reverberation Chambers and
      Its Effect on the Relationships Between Coherence
      Bandwidth, Chamber Decay Time, RMS Delay
      Spread, and the Chamber Buildup Time", Christopher L. Holloway et al. */
bool FemGrp::calculatePade(int currentTimeStep){
  int M = currentTimeStep / tsPerSampling;
  int totalSamples = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
  int N = (int)floor(M / 2.0);

  int finish = 0;
  timer_start("Process : ", 'm');
  #pragma omp parallel for schedule(static)  shared(finish)
  for(int pade = 0; pade < padeCNT; pade++){
    int auxFinish = 0;
    fp_t convergence = 0.0;
    fp_t maxProbe = 0.0;

    for(int component = 0; component < NumOfFieldComponents; component++){
      fp_t a_k[N] = {0};
      fp_t b_k[N] = {0};
      fp_t_ts maxValComponent = 0.0;
      getPadeCoef(a_k, b_k, &fieldProbes[pade * totalSamples * NumOfFieldComponents], N, component, &maxValComponent);
      maxProbe += maxValComponent;
      convergence += maxValComponent * getFreqDomainPade(a_k, b_k, totalSamples, N, &tranferencePadeFunctionFD[pade * totalSamples * NumOfFieldComponents], component, pade, currentTimeStep / tsPerPade == 1);

      cout << "Probe = " << pade << " Component = " << component << " Value = " << (convergence / maxProbe) << endl;
      if((currentTimeStep / tsPerPade == 1 || (convergence / maxProbe) < PadeTolerance) && (component == NumOfUnitaryVectors - 1 || component == NumOfFieldComponents - 1)){
        auxFinish++;
        maxProbe = 0.0;
        convergence = 0.0;
      }
    }
    #pragma omp atomic update
    finish += auxFinish;
  }
  timer_stop('m');
  return finish == 0;
}

void FemGrp::calculatePadeEnd(int currentTimeStep){
  int M = currentTimeStep / tsPerSampling;
  int totalSamples = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
  int N = (int)floor(M / 2.0);

  int finish = 0;
  timer_start("Process : ", 'm');
  for(int pade = 0; pade < padeCNT; pade++){
    Complex* FD = new Complex[totalSamples * NumOfFieldComponents];

    // #pragma omp parallel for
    for(int component = 0; component < NumOfFieldComponents; component++){
      fp_t a_k[N] = {0};
      fp_t b_k[N] = {0};

      fp_t_ts maxValComponent = 0.0;
      timer_start("Coef: " + std::to_string(component) + ": ",'m');
      getPadeCoef(a_k, b_k, &fieldProbes[pade * totalSamples * NumOfFieldComponents], N, component, &maxValComponent);
      timer_stop('m');
      timer_start("Freq Dom " + std::to_string(component) + ": ",'m');
      getFreqDomainPade(a_k, b_k, totalSamples, N, FD, component, pade, true);
      timer_stop('m');
    }
    // getPadeIFFTEnd(pade, FD);
    timer_start("IFFF " + std::to_string(pade) + ": ",'m');
    getPadeIFFT(pade, FD);
    timer_stop('m');
    delete[] FD;

    cout << "Pade point exported: " << pade << endl;
  }
  timer_stop('m');
  return;
}

#if defined(DGTD_USE_CUDA)
  void FemGrp::calculatePadeEndCUDA(int currentTimeStep){
    int M = currentTimeStep / tsPerSampling;
    int totalSamples = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
    int N = (int)floor(M / 2.0);
    int nFields = padeCNT * NumOfFieldComponents;
    int finish = 0;
    timer_start("Process : ", 'm');
    cudaStream_t* streams = (cudaStream_t*)malloc(NumOfFieldComponents * sizeof(cudaStream_t));

    CUDA_SAFE_CALL(cudaMalloc((void**)&padeFreqConstant_d, totalSamples * sizeof(int)));
    CUDA_SAFE_CALL(cudaMemcpy(padeFreqConstant_d, padeFreqConstant, totalSamples * sizeof(int), cudaMemcpyHostToDevice));

    for(int i = 0; i < NumOfFieldComponents; i++){
      cudaStreamCreate(&streams[i]);
    }

    cuDoubleComplex* Hf;
    CUDA_SAFE_CALL(cudaMallocHost((void**)&Hf, totalSamples * nFields * sizeof(cuDoubleComplex), cudaHostAllocMapped));

    for(int pade = 0; pade < padeCNT; pade++){
      fp_t* maxValComponent = new fp_t[NumOfFieldComponents];
      for(int component = 0; component < NumOfFieldComponents; component++){
        fp_t* a_k;
        fp_t* b_k;
        CUDA_SAFE_CALL(cudaMallocHost((void**)&a_k, N * sizeof(fp_t), cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&b_k, N * sizeof(fp_t), cudaHostAllocMapped));

        cuDoubleComplex* FD = &Hf[totalSamples * (pade * NumOfFieldComponents + component)];
        getPadeCoefCUDA(a_k, b_k, &maxValComponent[component], pade * NumOfFieldComponents + component, streams[component], currentTimeStep);
        getFreqDomainPadeCUDA(a_k, b_k, totalSamples, N, FD, streams[component]);
      }
      getPadeIFFT(pade, &Hf[pade * totalSamples * NumOfFieldComponents]);
    }

    for(int i = 0; i < NumOfFieldComponents; i++){
      cudaStreamDestroy(streams[i]);
    }

    timer_stop('m');

    CUDA_SAFE_CALL(cudaFree(padeFreqConstant_d));
    CUDA_SAFE_CALL(cudaFreeHost(Hf));

    return;
  }

  bool FemGrp::calculatePadeCUDA(int currentTimeStep, bool isFirst, bool isEnd){
    if(isEnd){
      FreeGPU();
    }

    int M = currentTimeStep / tsPerSampling;
    int totalSamples = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
    int N = (int)floor(M / 2.0);
    int nFields = padeCNT * NumOfFieldComponents;
    int finish = 0;
    bool* exitArray = new bool[padeCNT];

    timer_start("Process : ", 'm');
    cudaStream_t* streams = (cudaStream_t*)malloc(NumOfFieldComponents * sizeof(cudaStream_t));

    CUDA_SAFE_CALL(cudaMalloc((void**)&padeFreqConstant_d, totalSamples * sizeof(int)));
    CUDA_SAFE_CALL(cudaMemcpy(padeFreqConstant_d, padeFreqConstant, totalSamples * sizeof(int), cudaMemcpyHostToDevice));

    for(int i = 0; i < NumOfFieldComponents; i++){
      cudaStreamCreate(&streams[i]);
    }

    int nPoints = isEnd ? probeCNT : padeCNT;

    for(int pade = 0; pade < nPoints; pade++){
      timer_start("Process : ", 'm');
      fp_t* maxValComponent = new fp_t[NumOfFieldComponents];
      cuDoubleComplex* Hf;
      CUDA_SAFE_CALL(cudaMallocHost((void**)&Hf, totalSamples * NumOfFieldComponents * sizeof(cuDoubleComplex), cudaHostAllocMapped));
      #pragma omp parallel for
      for(int component = 0; component < NumOfFieldComponents; component++){
        fp_t* a_k;
        fp_t* b_k;
        CUDA_SAFE_CALL(cudaMallocHost((void**)&a_k, N * sizeof(fp_t), cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&b_k, N * sizeof(fp_t), cudaHostAllocMapped));

        cuDoubleComplex* FD = &Hf[totalSamples * component];
        getPadeCoefCUDA(a_k, b_k, &maxValComponent[component], pade * NumOfFieldComponents + component, streams[component], currentTimeStep);
        getFreqDomainPadeCUDA(a_k, b_k, totalSamples, N, FD, streams[component]);
      }

      if(!isFirst && !isEnd){
        exitArray[pade] = studyPadeConvergence(&tranferencePadeFunctionFD_h[pade * NumOfFieldComponents * totalSamples], Hf, maxValComponent, totalSamples, pade);
      }

      if(isEnd){
        printFD(pade, Hf);
        if(pade < padeCNT && writePadeTD){
          getPadeIFFT(pade, Hf);
        }
        cout << "Final Pade Point " << pade << "completed" << endl;
      }else{
        CUDA_SAFE_CALL(cudaMemcpy(&tranferencePadeFunctionFD_h[pade * NumOfFieldComponents * totalSamples], Hf, totalSamples * NumOfFieldComponents * sizeof(cuDoubleComplex), cudaMemcpyHostToHost));
      }
      delete [] maxValComponent;
      CUDA_SAFE_CALL(cudaFreeHost(Hf));
      timer_stop('m');
    }

    for(int i = 0; i < NumOfFieldComponents; i++){
      cudaStreamDestroy(streams[i]);
    }

    CUDA_SAFE_CALL(cudaFree(padeFreqConstant_d));
    bool exitValue = false;

    if(!isFirst && !isEnd){
      for(int pade = 0; pade < padeCNT; pade++){
        if(pade == 0){
          exitValue = exitArray[0];
        }else
          exitValue = exitValue & exitArray[pade];
      }
    }

    delete [] exitArray;
    timer_stop('m');
    return exitValue;
  }

  // Compares two successive Pade spectra of one probe.
  //
  // oldField / newField: per-probe spectra in which each field component owns
  //                      M_global contiguous entries.
  // maxFields:           per-component weight (one entry per field component).
  // M_global:            number of frequency samples per component.
  // point:               probe index, used only for logging.
  //
  // For each field type the Pearson correlation between |old| and |new| is
  // computed per component and averaged with weights maxFields. Returns false
  // as soon as one field type's weighted correlation is below PadeTolerance,
  // true otherwise.
  //
  // The component offset now depends on typeOfField. Previously both passes
  // indexed components 0..NumOfUnitaryVectors-1, so the pass labelled "H"
  // re-evaluated the E components. This assumes the components are ordered
  // E first, then H, as in the CSV headers (Ex,Ey,Ez,Hx,Hy,Hz).
  bool FemGrp::studyPadeConvergence(cuDoubleComplex* oldField, cuDoubleComplex* newField, fp_t* maxFields, int M_global, int point){
    for(int typeOfField = 0; typeOfField < TypeOfFields; typeOfField++){
      fp_t convergence = 0.0;
      fp_t maxProbe = 0.0;
      #pragma omp parallel for reduction(+:convergence, maxProbe)
      for(int component = 0; component < NumOfUnitaryVectors; component++){
        const int fieldIdx = typeOfField * NumOfUnitaryVectors + component;
        fp_t sum_X = 0.0, sum_Y = 0.0, sum_XY = 0.0, sum_XX = 0.0, sum_YY = 0.0;

        for(int i = 0; i < M_global; i++){
          const int arrayMap = fieldIdx * M_global + i;
          const fp_t lastYf_abs = sqrt(pow(oldField[arrayMap].x,2)+pow(oldField[arrayMap].y,2));
          const fp_t currentYf_abs = sqrt(pow(newField[arrayMap].x,2)+pow(newField[arrayMap].y,2));
          sum_X = sum_X + currentYf_abs;
          sum_Y = sum_Y + lastYf_abs;
          sum_XY = sum_XY + currentYf_abs * lastYf_abs;
          sum_XX = sum_XX + currentYf_abs * currentYf_abs;
          sum_YY = sum_YY + lastYf_abs * lastYf_abs;
        }

        // Pearson correlation of the two magnitude spectra, weighted by the
        // component's amplitude.
        convergence += maxFields[fieldIdx] * (M_global * sum_XY - sum_X * sum_Y) / sqrt((M_global * sum_XX - sum_X * sum_X) * (M_global * sum_YY - sum_Y * sum_Y));
        maxProbe += maxFields[fieldIdx];
      }
      cout << "Convergence Point " << point << " Fields " << (typeOfField ? "H" : "E") << ": " << (convergence / maxProbe) << endl;
      if((convergence / maxProbe) < PadeTolerance){
        return false;
      }
    }
    return true;
  }

  // Computes the Pade coefficients of one probe/component time series on the GPU.
  //
  // a_k, b_k:        host outputs with room for N = floor((currentTimeStep / tsPerSampling) / 2)
  //                  entries each; b_k[0] is fixed to 1.
  // maxField:        receives the largest |sample| seen (first sample and
  //                  every sample used to build the system matrix).
  // local_id:        offset of the series inside one sample row of fieldProbes
  //                  (rows are probeCNT * NumOfFieldComponents apart).
  // stream:          stream used for the copies and the cuSOLVER calls.
  //
  // Builds the (N-1)x(N-1) system G * b = d in column-major order, solves it
  // with an LU factorisation (getrf + getrs) and back-substitutes to get a_k.
  // The stream is synchronised before returning, so the outputs are final.
  //
  // cuSOLVER statuses and the factorisation info are reported on stderr; the
  // function still returns normally so callers keep their previous control flow.
  void FemGrp::getPadeCoefCUDA(fp_t* a_k, fp_t* b_k, fp_t* maxField, int local_id, cudaStream_t stream, int currentTimeStep){
    const int M = currentTimeStep / tsPerSampling;
    const int N = (int)floor(M / 2.0);
    const int rowStride = probeCNT * NumOfFieldComponents;

    *maxField = abs(fieldProbes[local_id]);
    if(N < 1){
      return; // the coefficient buffers are empty, nothing to write
    }
    a_k[0] = fieldProbes[local_id];
    b_k[0] = 1;

    const int n = N - 1;
    if(n == 0){
      return; // no linear system to solve
    }
    const int nrhs = 1;
    const size_t matBytes = (size_t)n * n * sizeof(fp_t);
    const size_t vecBytes = (size_t)n * sizeof(fp_t);

    auto warnSolver = [](cusolverStatus_t status, const char* what){
      if(status != CUSOLVER_STATUS_SUCCESS){
        fprintf(stderr, "[getPadeCoefCUDA] %s failed (cusolver status %d)\n", what, (int)status);
      }
    };

    cusolverDnHandle_t handle = nullptr;
    warnSolver(cusolverDnCreate(&handle), "cusolverDnCreate");
    warnSolver(cusolverDnSetStream(handle, stream), "cusolverDnSetStream");

    fp_t* G_h = nullptr;
    fp_t* d_h = nullptr;
    fp_t* G_d = nullptr;
    fp_t* d_d = nullptr;

    CUDA_SAFE_CALL(cudaMallocHost((void**)&G_h, matBytes, cudaHostAllocMapped));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&d_h, vecBytes, cudaHostAllocMapped));

    // Column-major fill: G(i, j) = f[N - j + i], d(i) = -f[N + i + 1].
    for(int i = 0; i < n; i++){
      for(int j = 0; j < n; j++){
        G_h[(size_t)j * n + i] = fieldProbes[(N - j + i) * rowStride + local_id];
        *maxField = max(abs(fieldProbes[(N - j + i) * rowStride + local_id]), *maxField);
      }
      d_h[i] = -fieldProbes[(N + i + 1) * rowStride + local_id];
    }

    CUDA_SAFE_CALL(cudaMalloc((void**)&G_d, matBytes));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_d, vecBytes));

    CUDA_SAFE_CALL(cudaMemcpyAsync(G_d, G_h, matBytes, cudaMemcpyHostToDevice, stream));
    CUDA_SAFE_CALL(cudaMemcpyAsync(d_d, d_h, vecBytes, cudaMemcpyHostToDevice, stream));

    // Workspace query and solver scratch.
    int bufferSize = 0;
    warnSolver(cusolverDnDgetrf_bufferSize(handle, n, n, G_d, n, &bufferSize), "cusolverDnDgetrf_bufferSize");

    int* info = nullptr;
    CUDA_SAFE_CALL(cudaMalloc((void**)&info, sizeof(int)));

    fp_t* buffer = nullptr;
    CUDA_SAFE_CALL(cudaMalloc((void**)&buffer, bufferSize * sizeof(fp_t)));

    int* ipiv = nullptr; // pivoting sequence
    CUDA_SAFE_CALL(cudaMalloc((void**)&ipiv, n * sizeof(int)));

    // LU factorisation; its status is fetched before getrs reuses `info`.
    int factorInfo = 0;
    warnSolver(cusolverDnDgetrf(handle, n, n, G_d, n, buffer, ipiv, info), "cusolverDnDgetrf");
    CUDA_SAFE_CALL(cudaMemcpyAsync(&factorInfo, info, sizeof(int), cudaMemcpyDeviceToHost, stream));
    warnSolver(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, nrhs, G_d, n, ipiv, d_d, n, info), "cusolverDnDgetrs");

    CUDA_SAFE_CALL(cudaMemcpyAsync(d_h, d_d, vecBytes, cudaMemcpyDeviceToHost, stream));

    // Wait for the solve and the copies before releasing anything.
    CUDA_SAFE_CALL(cudaStreamSynchronize(stream));

    if(factorInfo != 0){
      fprintf(stderr, "[getPadeCoefCUDA] LU factorisation reported info = %d for series %d; Pade coefficients may be invalid\n", factorInfo, local_id);
    }

    CUDA_SAFE_CALL(cudaFree(G_d));
    CUDA_SAFE_CALL(cudaFree(d_d));
    CUDA_SAFE_CALL(cudaFree(buffer));
    CUDA_SAFE_CALL(cudaFree(info));
    CUDA_SAFE_CALL(cudaFree(ipiv));

    warnSolver(cusolverDnDestroy(handle), "cusolverDnDestroy");
    CUDA_SAFE_CALL(cudaFreeHost(G_h));

    // Denominator from the solve, numerator by convolution with the samples.
    for(int i = 0; i < n; i++){
      b_k[i + 1] = d_h[i];
      a_k[i + 1] = 0.0;
      for(int j = 0; j < i + 1; j++){
        a_k[i + 1] += b_k[j] * fieldProbes[(i + 1 - j) * rowStride + local_id];
      }
    }

    CUDA_SAFE_CALL(cudaFreeHost(d_h));
  }

  // Evaluates one Pade approximant on the frequency grid on the device.
  //
  // a_k, b_k: host coefficient arrays with N entries each.
  // M_global: number of frequency samples.
  // H_f:      host output with room for M_global values.
  // stream:   stream used for the copies and the kernel.
  //
  // Launches CalculatePadeFreq with 256-thread blocks and ceil_div(M_global, 256)
  // blocks, reading padeFreqConstant_d. The stream is synchronised before
  // returning, so H_f is final on exit.
  void FemGrp::getFreqDomainPadeCUDA(fp_t* a_k, fp_t* b_k, int M_global, int N, cuDoubleComplex* H_f, cudaStream_t stream){
    if(M_global <= 0){
      return; // nothing to evaluate; a zero-sized grid is an invalid launch
    }

    fp_t* a_k_d = nullptr;
    fp_t* b_k_d = nullptr;
    cuDoubleComplex* H_f_d = nullptr;
    CUDA_SAFE_CALL(cudaMalloc((void**)&a_k_d, N * sizeof(fp_t)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&b_k_d, N * sizeof(fp_t)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&H_f_d, M_global * sizeof(cuDoubleComplex)));

    CUDA_SAFE_CALL(cudaMemcpyAsync(a_k_d, a_k, N * sizeof(fp_t), cudaMemcpyHostToDevice, stream));
    CUDA_SAFE_CALL(cudaMemcpyAsync(b_k_d, b_k, N * sizeof(fp_t), cudaMemcpyHostToDevice, stream));

    dim3 blockDim(256, 1, 1);
    dim3 gridDim(ceil_div(M_global, 256), 1, 1);

    CalculatePadeFreq<<<gridDim, blockDim, 0, stream>>>(a_k_d, b_k_d, M_global, N, padeFreqConstant_d, H_f_d);
    CUDA_SAFE_CALL(cudaGetLastError()); // surface launch-configuration errors

    CUDA_SAFE_CALL(cudaMemcpyAsync(H_f, H_f_d, M_global * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost, stream));

    // Make sure the kernel and the copy-back finished before freeing.
    CUDA_SAFE_CALL(cudaStreamSynchronize(stream));

    CUDA_SAFE_CALL(cudaFree(a_k_d));
    CUDA_SAFE_CALL(cudaFree(b_k_d));
    CUDA_SAFE_CALL(cudaFree(H_f_d));
  }

  // Converts one probe's Pade spectrum to the time domain and writes it to
  // ./PROBES/TD_Pade_<fname>_Probe_<probe>.csv.
  //
  // fDomainField: per-probe spectrum; component c occupies entries
  //               [c * M, (c + 1) * M) with M = ceil(NtimeSteps / tsPerSampling).
  //
  // Per component: the spectrum is divided by sourceFreqDomain (bins where
  // |source| < SourceTolerancePade are zeroed) and by M, transformed with an
  // FFTW c2r plan, then convolved with sourceTimeDomain over the first
  // tsSource + 1 source samples.
  //
  // NOTE(review): the spectrum is narrowed to std::complex<float> before the
  // division, which discards double precision -- confirm this is intended.
  void FemGrp::getPadeIFFT(int probe, cuDoubleComplex* fDomainField){
    const int M_global = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
    const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};

    std::vector<double> tDomainField(M_global);
    std::vector<double> tDomainFieldOutput((size_t)M_global * NumOfFieldComponents);

    // One FFTW input buffer and one plan are shared by all components; an
    // FFTW_ESTIMATE plan does not touch the arrays while planning.
    fftw_complex* fft = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * M_global);
    fftw_plan ifft = fftw_plan_dft_c2r_1d(M_global, fft, tDomainField.data(), FFTW_ESTIMATE);

    for(int component = 0; component < NumOfFieldComponents; component++){
      #pragma omp parallel for
      for(int k = 0; k < M_global; k++){
        cuDoubleComplex field = fDomainField[component * M_global + k];
        Complex aux = (std::complex<float>(field.x, field.y) / sourceFreqDomain[k]) / M_global;
        const bool negligibleSource = abs(sourceFreqDomain[k]) < SourceTolerancePade;
        fft[k][0] = negligibleSource ? 0.0 : aux.real();
        fft[k][1] = negligibleSource ? 0.0 : aux.imag();
      }

      fftw_execute(ifft);

      #pragma omp parallel for
      for (int i = 0; i < M_global; i++) {
        double acc = 0.0;
        for (int j = 0; j <= min(i, tsSource); j++) {
          acc += tDomainField[i - j] * sourceTimeDomain[j]; // Main convolution operation
        }
        tDomainFieldOutput[component * M_global + i] = acc;
      }
    }

    fftw_destroy_plan(ifft);
    fftw_free(fft);

    char csvFileName[StrOutput];
    snprintf(csvFileName, sizeof(csvFileName), "./PROBES/TD_Pade_%s_Probe_%d.csv", fname, probe);
    std::ofstream csvFile(csvFileName);
    if(!csvFile.is_open()){
      std::cerr << "getPadeIFFT: cannot open " << csvFileName << " for writing" << std::endl;
      return;
    }
    csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";

    for(int n = 0; n < M_global; n++){
      for(int component = 0; component < NumOfFieldComponents; component++){
        if (component > 0){
          csvFile << ",";
        }
        csvFile << std::setprecision(max_precision) << tDomainFieldOutput[component * M_global + n];
      }
      csvFile << "\n";
    }
    csvFile.close();
  }

  void FemGrp::printFD(int probe, cuDoubleComplex* fDomainField){
    int M_global = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
    const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};
    char csvFileName[StrOutput];
    sprintf(csvFileName, "./PROBES/FD_Pade_%s_Probe_%d.csv", fname, probe);
    std::ofstream csvFile(csvFileName);
    csvFile << "ExRe" << "," << "ExIm" << "," << "EyRe" << "," << "EyIm" << "," << "EzRe" << "," << "EzIm" << "," << "HxRe" << "," << "HxIm" << "," << "HyRe" << "," << "HyIm" << "," << "HzRe" << "," << "HzIm" << "\n";

    for(int n = 0; n < M_global; n++){
      for(int component = 0; component < NumOfFieldComponents; component++){
        if (component > 0){
          csvFile << ",";
        }
        csvFile << std::setprecision(max_precision) << fDomainField[component * M_global + n].x << "," << fDomainField[component * M_global + n].y;
      }
      csvFile << "\n";
    }
  }

  // Test driver: reloads probe samples from PROBES_aux/Probes_<fname>_<step>.csv
  // files (step = 0, tsPerSampling, 2 * tsPerSampling, ...) into fieldProbes and
  // then runs the final Pade evaluation.
  //
  // Files are read until constructing or querying a rapidcsv document throws;
  // that exception is the loop's only exit and is treated as "no more files".
  // Sample ts, row i, component c is stored at
  // fieldProbes[ts * NumOfFieldComponents * rows + i * NumOfFieldComponents + c].
  //
  // NOTE(review): no check is made that the number of files or rows fits the
  // allocation of fieldProbes, nor that all six columns have the same length
  // -- confirm against the allocation site.
  void FemGrp::testEnd(){
    int ts = 0;
    char tname[StrLenShort];
    try {
      while(1){
        // Bounded formatting: fname has no length guarantee visible here.
        snprintf(tname, sizeof(tname), "PROBES_aux/Probes_%s_%04i.csv", fname, ts * tsPerSampling);
        cout << tname << endl;
        rapidcsv::Document probe_doc(tname);
        const std::vector<double> Ex_col = probe_doc.GetColumn<double>("Ex");
        const std::vector<double> Ey_col = probe_doc.GetColumn<double>("Ey");
        const std::vector<double> Ez_col = probe_doc.GetColumn<double>("Ez");
        const std::vector<double> Hx_col = probe_doc.GetColumn<double>("Hx");
        const std::vector<double> Hy_col = probe_doc.GetColumn<double>("Hy");
        const std::vector<double> Hz_col = probe_doc.GetColumn<double>("Hz");

        const size_t nRows = Ey_col.size();
        const size_t sampleBase = (size_t)(ts * NumOfFieldComponents) * nRows;

        for(size_t i = 0; i < nRows; i++){
          const size_t rowBase = sampleBase + i * NumOfFieldComponents;
          fieldProbes[rowBase + 0] = Ex_col[i];
          fieldProbes[rowBase + 1] = Ey_col[i];
          fieldProbes[rowBase + 2] = Ez_col[i];
          fieldProbes[rowBase + 3] = Hx_col[i];
          fieldProbes[rowBase + 4] = Hy_col[i];
          fieldProbes[rowBase + 5] = Hz_col[i];
        }
        ts++;
      }
    }catch(...){
      // Reached when the next probe file cannot be read: all samples are loaded.
    }

    calculatePadeCUDA(ts * tsPerSampling, false, true);
  }
#endif

// Computes the Pade coefficients of one field component on the CPU.
//
// a_k, b_k:  outputs with room for N entries each; b_k[0] is fixed to 1.
// field:     samples of one probe, NumOfFieldComponents values per sample.
// N:         number of coefficients.
// component: component offset inside each sample.
// maxField:  receives the largest |sample| seen (first sample and every
//            sample used to build the system matrix).
//
// Builds the (N-1)x(N-1) system G * b = d, transposes G, solves it with
// solveAx_B (the solution is read back from d) and back-substitutes for a_k.
// The coefficients are printed to stdout at the end.
//
// Changes versus the previous version: *maxField is seeded with the absolute
// value of the first sample (as getPadeCoefCUDA does), a_k[k + 1] is zeroed
// before accumulation instead of relying on the caller, and N < 1 returns
// before any output array is touched.
//
// NOTE(review): G is allocated with new and only Clear()ed; the object itself
// is never deleted. Whether `delete G` is safe after Clear() depends on
// denseMat's destructor, which is not visible here -- confirm and release it.
void FemGrp::getPadeCoef(fp_t* a_k, fp_t* b_k, fp_t_ts* field, int N, int component, fp_t_ts* maxField){
  *maxField = abs(field[component]);
  if(N < 1){
    return;
  }

  denseMat<fp_t>* G = new denseMat<fp_t>(N-1, N-1);
  ArrayFP<fp_t> d(N-1);
  a_k[0] = field[component];
  b_k[0] = 1;

  timer_start("getPadeCoef " + std::to_string(1) + ": ",'m');
  for(int k = 0; k < N-1; k++){
    for(int m = 0; m < N-1; m++){
      G->setEntry(k,m, field[(N - m + k) * NumOfFieldComponents + component]); //it has to be in column form
      *maxField = max(abs(field[(N - m + k) * NumOfFieldComponents + component]), *maxField);
    }
    d[k] = -field[(N + k + 1) * NumOfFieldComponents + component];
  }
  timer_stop('m');

  timer_start("getPadeCoef " + std::to_string(2) + ": ",'m');
  G->SelfTranspose();
  timer_stop('m');

  timer_start("getPadeCoef " + std::to_string(3) + ": ",'m');
  solveAx_B(*G, d);
  timer_stop('m');

  timer_start("getPadeCoef " + std::to_string(4) + ": ",'m');
  for(int k = 0; k < N-1; k++){
    b_k[k + 1] = d[k];
    a_k[k + 1] = 0.0;
    for(int m = 0; m < k + 1; m++){
      a_k[k + 1] += b_k[m] * field[(k + 1 - m) * NumOfFieldComponents + component];
    }
  }
  timer_stop('m');

  G->Clear();
  for(int i = 0; i<N; i++){
    cout << a_k[i] << " " << b_k[i] << endl;
  }
}

// Prepares the source signal and the frequency index table used by the Pade
// post-processing.
//
// N:             number of samples.
// tsPerSampling: time steps between two samples (shadows the member of the
//                same name).
//
// Fills sourceTimeDomain with the excitation sampled every tsPerSampling
// steps, sets tsSource to the last sample index whose magnitude exceeds
// SourceTolerancePade (left unchanged if there is none), stores the DFT of
// the source in sourceFreqDomain and fills padeFreqConstant with the signed
// frequency index of every bin (0..finish, then negative indices).
//
// Changes versus the previous version:
//  * tsSource was written concurrently by all threads of the parallel loop, so
//    its final value depended on scheduling; it is now a max reduction.
//  * A real-to-complex FFTW transform only defines the first N/2 + 1 output
//    bins. The remaining bins were read uninitialised; they are now filled
//    from the Hermitian symmetry X[N - i] = conj(X[i]) of a real signal.
//
// NOTE(review): padeFreqConstant is allocated here only when DGTD_USE_CUDA is
// defined but is written unconditionally -- confirm it is allocated elsewhere
// for other builds.
void FemGrp::getPadeFreq(int N, int tsPerSampling){
  #if defined(DGTD_USE_CUDA)
    CUDA_SAFE_CALL(cudaMallocHost((void**)&padeFreqConstant, N * sizeof(int), cudaHostAllocMapped));
  #endif
  sourceFreqDomain = new Complex[N];
  sourceTimeDomain = new fp_t[N];

  int lastActiveSample = -1;
  #pragma omp parallel for reduction(max:lastActiveSample)
  for(int i = 0; i < N; i++){
    getSourceTimeDomain(i * tsPerSampling, &sourceTimeDomain[i], ExcitFlag);
    if(abs(sourceTimeDomain[i]) > SourceTolerancePade){
      if(i > lastActiveSample){
        lastActiveSample = i;
      }
    }
  }
  if(lastActiveSample >= 0){
    tsSource = lastActiveSample;
  }

  // Last non-negative frequency index.
  const int finish = N % 2 == 0 ? N / 2 - 1 : (N - 1) / 2;
  // Number of bins the r2c transform actually writes.
  const int nUnique = N / 2 + 1;

  fftw_complex* fftOut = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * N);
  fftw_plan fft = fftw_plan_dft_r2c_1d(N, sourceTimeDomain, fftOut, FFTW_ESTIMATE);
  fftw_execute(fft);
  fftw_destroy_plan(fft);

  #pragma omp parallel for
  for (int i = 0; i < N; ++i) {
    if (i < nUnique) {
      sourceFreqDomain[i] = std::complex<fp_t>(fftOut[i][0], fftOut[i][1]);
    } else {
      // Mirror bin: conjugate of bin N - i, which lies in [1, N/2].
      sourceFreqDomain[i] = std::complex<fp_t>(fftOut[N - i][0], -fftOut[N - i][1]);
    }
    if (i <= finish) {
      padeFreqConstant[i] = i;
    } else {
      padeFreqConstant[i] = -N + i;
    }
  }

  fftw_free(fftOut);
}

void FemGrp::getSourceTimeDomain(int timeStep, fp_t* Einc, int ExcitFlag){
  fp_t dt = LocTimeSteps[N_class - 1];
  fp_t omega = 2.0  * Pi * freq * MEGA;
  fp_t to = To;
  fp_t tau = Tau;
  fp_t Exponent, SinModul;

  switch (ExcitFlag){
    case 0:
      *Einc = static_cast<fp_t>(cos(omega * (timeStep + 1.0) * dt));
      break;

    case 1:
      Exponent = (timeStep + 1.0) * dt - to;
      SinModul = ModuleFlag ? cos(omega * Exponent) : 1.0;
      *Einc = static_cast<fp_t>(SinModul * exp(- (Exponent * Exponent) / (tau * tau)));
      break;

    case 2:
      Exponent = (timeStep + 1.0) * dt - to;
      SinModul = ModuleFlag ? cos(omega * Exponent) : 1.0;
      *Einc = static_cast<fp_t>(SinModul * exp(- (Exponent * Exponent) / (tau * tau)));

      break;

    default:
      break;
  }
}

// Evaluates a Pade approximant on the frequency grid (CPU version).
//
// a_k, b_k:   numerator / denominator coefficients, N entries each.
// M_global:   number of frequency samples.
// H_f:        per-probe spectrum stored as H_f[i * NumOfFieldComponents + component];
//             overwritten with the new values.
// component:  field component being evaluated.
// probe:      probe index (used only in the CSV file name).
// firstValue: true  -> store the spectrum, dump its magnitude to
//                      Pade_Freq_1_<N>_<probe>_<component>.csv and return 0.
//             false -> store the spectrum and return the Pearson correlation
//                      between the previous and the new magnitude spectra.
//
// The CSV is now opened only when it is written. Previously every call
// created (or truncated) the file, leaving empty files behind on the
// convergence path.
fp_t FemGrp::getFreqDomainPade(fp_t* a_k, fp_t* b_k, int M_global, int N, Complex* H_f, int component, int probe, bool firstValue){
  // Ratio of the two coefficient polynomials at grid point i.
  auto evalPade = [&](int i) -> Complex {
    Complex sumA_k = 0;
    Complex sumB_k = 0;
    for(int k = 0; k < N; k++){
      sumA_k += a_k[k] * pow(padeFreqs[i], k);
      sumB_k += b_k[k] * pow(padeFreqs[i], k);
    }
    return sumA_k / sumB_k;
  };

  if(firstValue){
    const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};
    char csvFileName[StrOutput];
    snprintf(csvFileName, sizeof(csvFileName), "Pade_Freq_1_%d_%d_%d.csv", N, probe, component);
    std::ofstream csvFile(csvFileName);

    for(int i = 0; i < M_global; i++){
      Complex freqVal = evalPade(i);
      csvFile << std::setprecision(max_precision) << sqrt(pow(freqVal.real(),2)+pow(freqVal.imag(),2));
      csvFile << "\n";
      H_f[i * NumOfFieldComponents + component] = freqVal;
    }
    cout << "First/Final Pade Calculation" << endl;
    csvFile.close();
    return 0.0;
  }

  fp_t sum_X = 0.0, sum_Y = 0.0, sum_XY = 0.0, sum_XX = 0.0, sum_YY = 0.0;
  for(int i = 0; i < M_global; i++){
    Complex freqVal = evalPade(i);
    const int slot = i * NumOfFieldComponents + component;

    // Magnitude of the previous estimate, read before it is overwritten.
    const fp_t lastYf_abs = sqrt(H_f[slot].real() * H_f[slot].real() + H_f[slot].imag() * H_f[slot].imag());
    H_f[slot] = freqVal;
    const fp_t currentYf_abs = sqrt(pow(freqVal.real(),2)+pow(freqVal.imag(),2));

    sum_X = sum_X + currentYf_abs;
    sum_Y = sum_Y + lastYf_abs;
    sum_XY = sum_XY + currentYf_abs * lastYf_abs;
    sum_XX = sum_XX + currentYf_abs * currentYf_abs;
    sum_YY = sum_YY + lastYf_abs * lastYf_abs;
  }

  // Pearson correlation between the previous and the current magnitudes.
  fp_t corr = (M_global * sum_XY - sum_X * sum_Y) / sqrt((M_global * sum_XX - sum_X * sum_X) * (M_global * sum_YY - sum_Y * sum_Y));
  return corr;
}

void FemGrp::getPadeIFFTEnd(int probe, Complex* fDomainField){
  int M_global = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
  Complex j = Complex (0.0, 1.0);

  fp_t* tDomainField = new fp_t[M_global * NumOfFieldComponents];

  const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};

  for(int component = 0; component < NumOfFieldComponents; component++){
    Complex* tDomainTransferFunction = new Complex[M_global];
    #pragma omp parallel for
    for(int n = 0; n < M_global; n++){
      tDomainTransferFunction[n] = 0.0;
      for(int k = 0; k < M_global; k++){
        tDomainTransferFunction[n] += abs(sourceFreqDomain[k]) < SourceTolerancePade ? 0.0 : fDomainField[k * NumOfFieldComponents + component] / sourceFreqDomain[k] * exp(j * 2 * Pi * n * k / M_global);
      }
      tDomainTransferFunction[n] /= M_global;
    }
    #pragma omp parallel for
    for(int n = 0; n < M_global; n++){
      tDomainField[n * NumOfFieldComponents + component] = 0.0;
      for(int k = 0; k <= n; k++){
        tDomainField[n * NumOfFieldComponents + component] += tDomainTransferFunction[n-k].real() * sourceTimeDomain[k];
      }
    }
    delete [] tDomainTransferFunction;
  }

  char csvFileName[StrOutput];
  sprintf(csvFileName, "Pade_%s_Probe_%d.csv", fname, probe);
  std::ofstream csvFile(csvFileName);
  csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";

  for(int n = 0; n < M_global; n++){
    for(int component = 0; component < NumOfFieldComponents; component++){
      if (component > 0){
        csvFile << ",";
      }
      csvFile << std::setprecision(max_precision) << tDomainField[n * NumOfFieldComponents + component];
    }
    csvFile << "\n";
  }
  usleep(100);
  csvFile.close();
  delete [] tDomainField;
}

// FFTW-based Pade time-domain export (CPU spectrum version); writes
// Pade_<fname>_Probe_<probe>.csv.
//
// fDomainField is indexed as
//   fDomainField[probe * M * NumOfFieldComponents + component * M + k],
// i.e. it is treated as the base of an array holding ALL probes, with each
// component owning M contiguous entries (M = ceil(NtimeSteps / tsPerSampling)).
//
// NOTE(review): calculatePadeEnd passes a per-probe buffer that
// getFreqDomainPade fills as H_f[k * NumOfFieldComponents + component]; that
// does not match the indexing above (see the commented-out alternative
// below) -- confirm which convention callers are expected to follow.
//
// Per component: the spectrum is divided by sourceFreqDomain (bins where
// |source| < SourceTolerancePade are zeroed) and by M, transformed with an
// FFTW c2r plan, then convolved with sourceTimeDomain over the first
// tsSource + 1 source samples.
void FemGrp::getPadeIFFT(int probe, Complex* fDomainField){
  const int M_global = (int)ceil((1.0 * NtimeSteps) / tsPerSampling);
  const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};

  std::vector<double> tDomainField((size_t)M_global * NumOfFieldComponents);
  std::vector<double> tDomainFieldOutput((size_t)M_global * NumOfFieldComponents);

  // One FFTW input buffer shared by all components.
  fftw_complex* fft = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * M_global);

  for(int component = 0; component < NumOfFieldComponents; component++){
    #pragma omp parallel for
    for(int k = 0; k < M_global; k++){
      // Complex aux = (fDomainField[k * NumOfFieldComponents + component] / sourceFreqDomain[k]) / M_global;
      Complex aux = (fDomainField[probe * M_global * NumOfFieldComponents + component * M_global + k] / sourceFreqDomain[k]) / M_global;
      const bool negligibleSource = abs(sourceFreqDomain[k]) < SourceTolerancePade;
      fft[k][0] = negligibleSource ? 0.0 : aux.real();
      fft[k][1] = negligibleSource ? 0.0 : aux.imag();
    }

    // Each component transforms into its own slice of tDomainField.
    double* tDomainFieldVec = &tDomainField[(size_t)M_global * component];

    fftw_plan ifft = fftw_plan_dft_c2r_1d(M_global, fft, tDomainFieldVec, FFTW_ESTIMATE);
    fftw_execute(ifft);
    fftw_destroy_plan(ifft);

    #pragma omp parallel for
    for (int i = 0; i < M_global; i++) {
      double acc = 0.0;
      for (int j = 0; j <= min(i, tsSource); j++) {
        acc += tDomainFieldVec[i - j] * sourceTimeDomain[j]; // Main convolution operation
      }
      tDomainFieldOutput[component * M_global + i] = acc;
    }
  }

  fftw_free(fft);

  char csvFileName[StrOutput];
  // Bounded formatting: fname has no length guarantee visible here.
  snprintf(csvFileName, sizeof(csvFileName), "Pade_%s_Probe_%d.csv", fname, probe);
  std::ofstream csvFile(csvFileName);
  if(!csvFile.is_open()){
    std::cerr << "getPadeIFFT: cannot open " << csvFileName << " for writing" << std::endl;
    return;
  }
  csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";

  for(int n = 0; n < M_global; n++){
    for(int component = 0; component < NumOfFieldComponents; component++){
      if (component > 0){
        csvFile << ",";
      }
      csvFile << std::setprecision(max_precision) << tDomainFieldOutput[component * M_global + n];
    }
    csvFile << "\n";
  }
  csvFile.close();
}

void FemGrp::GetTetQuadRule(int PolyOrder, int& points, fp_t** zeta, fp_t* weights){
  if(PolyOrder == 1){
    zeta[0][0] = 0.25;
    zeta[0][0] = 0.25;
    zeta[0][0] = 0.25;
    weights[0] = 1.0;
  }else if(PolyOrder == 2){
    zeta[0][0] = 0.585410196624969;
    zeta[0][1] = 0.138196601125011;
    zeta[0][2] = 0.138196601125011;
    zeta[0][3] = 0.138196601125011;
    //
    zeta[1][0] = 0.138196601125011;
    zeta[1][1] = 0.585410196624969;
    zeta[1][2] = 0.138196601125011;
    zeta[1][3] = 0.138196601125011;
    //
    zeta[2][0] = 0.138196601125011;
    zeta[2][1] = 0.138196601125011;
    zeta[2][2] = 0.585410196624969;
    zeta[2][3] = 0.138196601125011;
    //
    zeta[3][0] = 0.138196601125011;
    zeta[3][1] = 0.138196601125011;
    zeta[3][2] = 0.138196601125011;
    zeta[3][3] = 0.585410196624969;
    //
    weights[0] = 0.250000000000000;
    weights[1] = 0.250000000000000;
    weights[2] = 0.250000000000000;
    weights[3] = 0.250000000000000;
  }else if(PolyOrder == 3){
    zeta[0][0] = 0.250000000000000;
    zeta[0][1] = 0.250000000000000;
    zeta[0][2] = 0.250000000000000;
    zeta[0][3] = 0.250000000000000;
      //
    zeta[1][0] = 0.500000000000000;
    zeta[1][1] = 0.166666666666667;
    zeta[1][2] = 0.166666666666667;
    zeta[1][3] = 0.166666666666667;
    //
    zeta[2][0] = 0.166666666666667;
    zeta[2][1] = 0.500000000000000;
    zeta[2][2] = 0.166666666666667;
    zeta[2][3] = 0.166666666666667;
     //
    zeta[3][0] = 0.166666666666667;
    zeta[3][1] = 0.166666666666667;
    zeta[3][2] = 0.500000000000000;
    zeta[3][3] = 0.166666666666667;
    //
    //
    zeta[4][0] = 0.166666666666667;
    zeta[4][1] = 0.166666666666667;
    zeta[4][2] = 0.166666666666667;
    zeta[4][3] = 0.500000000000000;
    //
    weights[0] = -0.800000000000000;
    weights[1] = 0.450000000000000;
    weights[2] = 0.450000000000000;
    weights[3] = 0.450000000000000;
    weights[4] = 0.450000000000000;
  }
}

// Gathers the local E and H coefficients of one tetrahedron from the global
// solution vectors en_1 and hn_32.
//
// tet:       element whose coefficients are extracted.
// origEn_1:  reset, then filled with the element's E coefficients.
// origHn_32: reset, then filled with the element's H coefficients.
//
// Entries whose map index is negative are set to 0.
// The two index maps were allocated with new[] and never released; they are
// now owned by std::vector.
void FemGrp::Get_Coefficients_(tetra* tet, ArrayFP<fp_t>* origEn_1, ArrayFP<fp_t>* origHn_32){
  const int localDim = TetPolyOrderDim[tet->PolyOrderFlag];
  std::vector<int> tetraMAP_E(localDim);
  std::vector<int> tetraMAP_H(localDim);

  tet->Local_DG_mapE(tetraMAP_E.data(), tet->LocalOffsetE);
  tet->Local_DG_mapH(tetraMAP_H.data(), tet->LocalOffsetH);

  origEn_1->reset();
  origHn_32->reset();

  for(int i = 0 ; i < localDim; i++){
    origEn_1->setentry(i, tetraMAP_E[i] < 0 ? 0.0 : en_1->getentry(tetraMAP_E[i]));
    origHn_32->setentry(i, tetraMAP_H[i] < 0 ? 0.0 : hn_32->getentry(tetraMAP_H[i]));
  }
}

void FemGrp::numberDofs(){
  tetra* tet = 0;
  int LocalDim = TetPolyOrderDim[PolyFlag];
  int *tetraEMap = 0;
  int *tetraHMap = 0;

  int EdofOffset = 0;//[E H] offset
  int HdofOffset = DimE;

  for(int i = 0; i < tetraCNT; i++){
    tet = &(tetARRAY[i]);
    tet->allocDofMap();

    tetraEMap = tet->get_LocalEMap(); // obtained from SetupMatrixFree
    tetraHMap = tet->get_LocalHMap();

    for(int j = 0; j < LocalDim; j++){
      //in case there is -1
      tet->setEHGlobalMap(j,
                         (tetraEMap[j] != NOT_NUMBERED) ? (tetraEMap[j] + EdofOffset) : (tetraEMap[j]),
                         (tetraHMap[j] != NOT_NUMBERED) ? (tetraHMap[j] + HdofOffset) : (tetraHMap[j]));
    }
  }
  size_t matrixDIM_com = dimE + dimH;
  cout << " " << endl;
  cout << "==============================================" << endl;
  cout << "         NUMBER OF DEGREES OF FREEDOM         " << endl;
  cout << "==============================================" << endl;
  cout << " Global Number of dof is " << matrixDIM_com << endl;
  cout << " Global Matrix dim is (w/o compress) " << tetraCNT * LocalDim * 2 << endl;
  cout << "==============================================" << endl;
  cout << " " << endl;
}

// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //
// 0000000000000000000000000000000000000 Port Meshes 00000000000000000000000000000000000000000000 //
// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //


/*
void FemGrp::makePortMeshes()
{
  int i, j;
  if(portCNT == 0)
    return;
  map<int,int> PortMap, PortMapRes;
  set<int> PortSet;
  set<int>::iterator it;
  int DGface_bc;
  for(int idx = 0; idx < tetraCNT; idx++){
    for(j = 0 ; j < NumOfFaces ; j++){
      DGface_bc = tetARRAY[idx].fc[j]->getbType();
      if(DGface_bc >= portType && DGface_bc < pecType)
        PortSet.insert(DGface_bc);
    }
  }

  LocPortCnt = (int)PortSet.size();

  cout << "PortSet.size = " << (int)PortSet.size() << endl;
  cout << "portCNT = " << portCNT << endl;
  cout << "LocPortCnt = " << LocPortCnt << endl;

  portCNT = LocPortCnt;

  cout << "portCNT = " << portCNT << endl;
  cout << "LocPortCnt = " << LocPortCnt << endl;

  if(LocPortCnt == 0)
    return;

  for(it = PortSet.begin(); it != PortSet.end(); it++)
    cout << "Port_type:" << *it << endl;

  int counter = 0;
  for(it = PortSet.begin(); it != PortSet.end(); it++){
    PortMap[*it] = counter;
    PortMapRes[counter] = *it;
    counter++;
  }

  pMeshARRAY = new portMesh[LocPortCnt];

  // count the port faces (portFaceNums)
  // get pointers to port faces (portFaceLists)
  // keep set of unique global node ids for faces (portNodeIds)
  int* portFaceNums = new int[LocPortCnt];
  list<face*>* portFaceLists = new list<face*>[LocPortCnt];
  set<int>* portNodeIds = new set<int>[LocPortCnt];
  memset(portFaceNums, 0, portCNT * sizeof(int));

  for(i = 0; i < faceCNT; i++){

    int bType = faceARRAY[i]->getbType();

    if((bType >= portType) && (bType != pecType)){
      int portNum = PortMap.find(bType)->second;
      (portFaceNums[portNum])++; // increment the face count
      portFaceLists[portNum].push_back(faceARRAY[i]); // add face pointer
      // add unique node ids
      for(j = 0; j < NumOfNodesPerFace; j++)
        portNodeIds[portNum].insert(faceARRAY[i]->getNode(j)->getid());
    }
  }

  for(i = 0; i < LocPortCnt; i++){
    portMesh& portmesh = pMeshARRAY[i];

    // set port name, magnitude and impedance
    for(j = 0; j < bcCNT; j++){
      if(bcARRAY[j].getbType() == PortMapRes[i]){
        portmesh.setName(bcARRAY[j].getName());
	      cout<<"This is " << portmesh.getName() << endl;
        portmesh.setMagE(bcARRAY[j].getMagE());
        portmesh.setImpZ(bcARRAY[j].getCval());
        break;
      }
    }

    // allocate and add face pointers to array
    int faceNum = portFaceNums[i];
    portmesh.setFaceCnt(faceNum);
    if(faceNum > 0){
      face** portFaceArray = portmesh.getFaceArray();
      list<face*>::iterator faceListIter = portFaceLists[i].begin();
      for(j = 0; j < faceNum; j++){
        portFaceArray[j] = *faceListIter;
        faceListIter++;
      }

      // allocate and add node pointers to array
      // keep local mapping
      int nodeNum = portNodeIds[i].size();
      portmesh.setNodeCnt(nodeNum);
      portmesh.allocGlobToLocMap();
      node** portNodeArray = portmesh.getNodeArray();
      map<int, int>& globToLocMap = portmesh.getGlobToLocMap();
      set<int>::iterator portNodeIdIter;
      int nodeCount = 0;
      for(portNodeIdIter  = portNodeIds[i].begin(); portNodeIdIter != portNodeIds[i].end(); portNodeIdIter++){
        portNodeArray[nodeCount] = &(ndARRAY[*portNodeIdIter]);
        globToLocMap[ndARRAY[*portNodeIdIter].getid()] = nodeCount++;
      }

      // setup the remaining port mesh stuff
      scalingLength = 1.0;
      portmesh.makeCoordSystem();
      portmesh.makeObjMap();
      portmesh.readVline(unit);
      portmesh.writeMesh(objProp);

      cout.setf(ios::scientific);
      cout.precision(15);

      #if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL)
        vtr PortDirection_vtr = portmesh.getPortDirection();
        excitationProp.PortDirection[0] = PortDirection_vtr.getx();
        excitationProp.PortDirection[1] = PortDirection_vtr.gety();
        excitationProp.PortDirection[2] = PortDirection_vtr.getz();
      #endif
    }
  }

  delete [] portFaceNums;
  delete [] portFaceLists;
  delete [] portNodeIds;
}
*/


void FemGrp::makePortMeshes()
{
  int i, j;
  if (portCNT == 0) return;

  LocPortCnt = portCNT;
  pMeshARRAY = new portMesh[LocPortCnt];

  // Collectors per port
  int*              portFaceNums  = new int[LocPortCnt];
  std::list<face*>* portFaceLists = new std::list<face*>[LocPortCnt];
  std::set<int>*    portNodeIds   = new std::set<int>[LocPortCnt];
  std::memset(portFaceNums, 0, LocPortCnt * sizeof(int));

  // Pass 1: walk faces and collect them by portNum (via bcNumToPnum)
  for (i = 0; i < faceCNT; ++i)
  {
    int bType = faceARRAY[i]->getbType();
    if (bType != portType) continue;   // only port faces


    // pick the valid owning tetra (check hydra pointers BEFORE deref)
    tetra* tet = nullptr;
    if (faceARRAY[i]->hydra[0] != nullptr)
    {
      tet = faceARRAY[i]->hydra[0];
    }
    else if (faceARRAY[i]->hydra[1] != nullptr)
    {
      tet = faceARRAY[i]->hydra[1];
    }
    else
    {
      continue; // no owner; defensive
    }


    // Find bc_number for THIS face inside its tetra (match same face by pointer)
    int bc_number = -1;
    for (int k = 0; k < NumOfFaces; ++k)
    {
      if (tet->fc[k] == faceARRAY[i])
      {
        bc_number = tet->getbc(k);
        break;
      }
    }
    if (bc_number < 0) continue;

    int portNum = bcNumToPnum[bc_number]-1;

    ++portFaceNums[portNum];
    portFaceLists[portNum].push_back(faceARRAY[i]);

    for (j = 0; j < NumOfNodesPerFace; ++j)
    {
      portNodeIds[portNum].insert(faceARRAY[i]->getNode(j)->getid());
    }
  }

  // Optional: sanity check
  for (int p = 1; p < LocPortCnt+1; ++p)
  {
    std::cout << "Port " << p
              << " (BCNum=" << pnumToBcNum[p] << ") has "
              << portNodeIds[p-1].size() << " unique nodes and "
              << portFaceNums[p-1] << " faces.\n";
  }

  // Pass 2: finalize each port mesh
  for (int p = 0; p < LocPortCnt; ++p)
  {
    portMesh& portmesh = pMeshARRAY[p];

    // Initialize from bcARRAY using BCNum directly
    int bc_number = pnumToBcNum[p+1];
    if (bc_number >= 0 && bc_number < bcCNT)
    {
      auto& rec = bcARRAY[bc_number];  // <-- no bcRec type name
      string name = rec.getName();
      fp_t   magnitudeE = rec.getMagE();
      cout << "bc_number = " << bc_number << " name = " << name << " | magE = " << magnitudeE << endl;
      portmesh.setName(rec.getName());
      portmesh.setMagE(magnitudeE);
      portmesh.setImpZ(rec.getCval());
    }

    // Faces
    int faceNum = portFaceNums[p];
    portmesh.setFaceCnt(faceNum);
    if (faceNum > 0) {
      face** portFaceArray = portmesh.getFaceArray();
      auto itF = portFaceLists[p].begin();
      for (j = 0; j < faceNum; ++j, ++itF) {
        portFaceArray[j] = *itF;
      }

      // Nodes + local map
      int nodeNum = static_cast<int>(portNodeIds[p].size());
      portmesh.setNodeCnt(nodeNum);
      portmesh.allocGlobToLocMap();

      node** portNodeArray = portmesh.getNodeArray();
      std::map<int,int>& globToLocMap = portmesh.getGlobToLocMap();

      int nodeCount = 0;
      for (int gid : portNodeIds[p]) {
        // If ids aren't dense indices into ndARRAY, replace with your id->index lookup.
        portNodeArray[nodeCount] = &(ndARRAY[gid]);
        globToLocMap[ ndARRAY[gid].getid() ] = nodeCount++;
      }

      // Remaining setup
      scalingLength = 1.0;
      portmesh.makeCoordSystem();
      portmesh.makeObjMap();
      portmesh.readVline(unit);
      portmesh.writeMesh(objProp);
    }
  }

  delete [] portFaceNums;
  delete [] portFaceLists;
  delete [] portNodeIds;
}


/*
void FemGrp::solveWaveguidePorts()
{
  char command[1000];
  memset(command, 0, 1000 * sizeof(char));
  sprintf(command, "anwg_h1 %s %e 1 \n",pMeshARRAY->portName, freq);
  cout<<"=============Running Command:============"<<endl;
  cout << command << endl;
  system(command);
}
*/
/*
void FemGrp::WriteWaveguidePortFields()
{
  // For each port
  for(int i = 0; i < portCNT ; i++)
  {
    portMesh& portmesh = pMeshARRAY[i];
    portmesh.writeVtk();
  }
}
*/


// Using anwg to solve for the port excitation mode (1st mode)
// Runs the external solver `anwg_h1 "<port name>" <freq> 1` once per port and
// reports a non-zero exit status on stderr.
//
// The command buffer is sized from a measuring snprintf call, so a long port
// name can no longer produce a truncated command line (one missing its closing
// quote and trailing arguments) that is then handed to the shell.
void FemGrp::solveWaveguidePorts()
{
  // Name is quoted in case it contains spaces; freq is printed with full precision.
  static const char kCommandFormat[] = "anwg_h1 \"%s\" %.16e 1";

  // run for each detected port
  for (int i = 0; i < portCNT; ++i)
  {
    const std::string name = pMeshARRAY[i].getName();   // the name set from bcARRAY

    // First call only measures the required length (excluding the terminator).
    const int needed = std::snprintf(nullptr, 0, kCommandFormat, name.c_str(), freq);
    if (needed < 0)
    {
      std::cerr << "anwg_h1 command could not be formatted for port " << i << "\n";
      continue;
    }

    std::vector<char> command(static_cast<size_t>(needed) + 1);
    std::snprintf(command.data(), command.size(), kCommandFormat, name.c_str(), freq);

    std::cout << "============= Running Command (port " << i << "): =============\n";
    std::cout << command.data() << std::endl;

    const int rc = std::system(command.data());
    if (rc != 0)
    {
      std::cerr << "anwg_h1 failed for port " << i << " (rc = " << rc << ")\n";
    }
  }
}

void FemGrp::WriteWaveguidePortFields()
{
  // For each port
  for (int i = 0; i < portCNT; ++i)
  {
    portMesh& portmesh = pMeshARRAY[i];
    std::cout << "Writing VTK for port " << i << " (" << portmesh.getName() << ")\n";
    portmesh.writeVtk();
  }
}


void FemGrp::AssignPortFieldsInFaces()
{
  for(int i = 0 ; i < portCNT ; i++)
  {
    pMeshARRAY[i].makeRHS_E();
    pMeshARRAY[i].makeRHS_H();
  }
}


void FemGrp::AssignPortFieldsInFaces_TEM()
{
  for (int i = 0; i < portCNT; ++i)
  {
    const auto& ex  = portExcitations[i];
    portMesh&   pm  = pMeshARRAY[i];

    pm.makeRHS_TEM(ex.freq_m * 1e6, ex.epr,
                  ex.vpath[0], ex.vpath[1], ex.vpath[2],
                  ex.PortDirection[0], ex.PortDirection[1], ex.PortDirection[2]);
  }
}

//TODO: make dynamic
void FemGrp::EvaluateSparametersGlobal(int timeStep, fp_t dt, bool isCompact)
{
  int i, j, k, m;
  int FaceNum;
  int Nsample = 102;
  int GaussPnt = Nsample - 1;
  int IsOnFace;
  int tetraMAP_P2[30];
  int tetraMAP_P1[12];
  int tetraMAP_P0[6];
  vtr lvtr[3];
  vtr avtr[4];
  fp_t vol;
  fp_t zeta0, zeta1, zeta2;
  fp_t zetaFace[3];
  fp_t zeta[4];
  fp_t wgt = 1.0;
  fp_t EvalueTotal;
  fp_t EvalueInc;
  fp_t h;
  fp_t* VoltEntryInc = new fp_t[portCNT];
  fp_t* VoltEntryTotal = new fp_t[portCNT];
  vtr Total_E_Local;
  vtr Inc_E_Local;
  vtr Point;
  vtr PortDirection;
  vtr Normal;
  fp_t area = 0.0;
  tetra* tet;

  ArrayFP<fp_t>* origEn_1_P2 = new ArrayFP<fp_t>(30);
  ArrayFP<fp_t>* origEn_1_P1 = new ArrayFP<fp_t>(12);
  ArrayFP<fp_t>* origEn_1_P0 = new ArrayFP<fp_t>(6);

  for(i = 0; i < portCNT; i++){
    VoltEntryInc[i] = 0.0;
    VoltEntryTotal[i] = 0.0;
  }

  for(i = 0; i < portCNT; i++)
  {
    vtr VoltLine = pMeshARRAY[i].vline.coord[1] - pMeshARRAY[i].vline.coord[0];
    vtr VoltLineUnit = pMeshARRAY[i].vline.coord[1] - pMeshARRAY[i].vline.coord[0];
    VoltLineUnit.unitvtr();

    h = VoltLine.magnitude() / GaussPnt;
    for(k = 0; k < GaussPnt; k++){
      Point = pMeshARRAY[i].vline.coord[0] + VoltLineUnit * (k + 0.5) * h;
      //cout << "k = " << k << "  FCCNT = " << pMeshARRAY[i].faceCNT  << endl;

      for(j = 0; j < pMeshARRAY[i].faceCNT; j++){
        IsOnFace = pMeshARRAY[i].fcArray[j]->PointInFace(Point, zeta0, zeta1, zeta2);
        zetaFace[0] = zeta0;
        zetaFace[1] = zeta1;
        zetaFace[2] = zeta2;

        if(IsOnFace == 1)
        {
          pMeshARRAY[i].fcArray[j]->getAreaNormal(&area, &Normal);
          PortDirection = pMeshARRAY[i].fcArray[j]->bcPtr->get_PortDirection();

          if(dotP(Normal, PortDirection) < 0.0)
            tet = pMeshARRAY[i].fcArray[j]->hydra[0];
          else
            tet = pMeshARRAY[i].fcArray[j]->hydra[1];

          tet->geometry(lvtr, avtr, &vol);
          for(m = 0 ; m < 4; m++){
            if(pMeshARRAY[i].fcArray[j] == tet->getFacePtr(m))
              FaceNum = m;
          }
          avtr[3].reset();
          avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);

          // 0th order polynomial
          if(tet->PolyOrderFlag == 0){
            tet->Local_DG_mapE(tetraMAP_P0, tet->LocalOffsetE);
            origEn_1_P0->reset();
            for(int Cnt1 = 0 ; Cnt1 < 6 ; Cnt1++){
              if(tetraMAP_P0[Cnt1] < 0)
                origEn_1_P0->setentry(Cnt1, 0.0);
              else
                origEn_1_P0->setentry(Cnt1, en_1->getentry(tetraMAP_P0[Cnt1]));
            }
          }else if(tet->PolyOrderFlag == 1){ // 1st order polynomial
            tet->Local_DG_mapE(tetraMAP_P1, tet->LocalOffsetE);
            origEn_1_P1->reset();
            for(int Cnt2 = 0 ; Cnt2 < 12 ; Cnt2++){
              if(tetraMAP_P1[Cnt2] < 0)
                origEn_1_P1->setentry(Cnt2, 0.0);
              else
                origEn_1_P1->setentry(Cnt2, en_1->getentry(tetraMAP_P1[Cnt2]));
            }
          }else if(tet->PolyOrderFlag == 2){ // 2nd order polynomial
            tet->Local_DG_mapE(tetraMAP_P2, tet->LocalOffsetE);
            origEn_1_P2->reset();
            for(int Cnt2 = 0 ; Cnt2 < 30 ; Cnt2++){
              if(tetraMAP_P2[Cnt2] < 0)
                origEn_1_P2->setentry(Cnt2, 0.0);
              else
                origEn_1_P2->setentry(Cnt2, en_1->getentry(tetraMAP_P2[Cnt2]));
            }
          }

          for(m = 0 ; m < 4 ; m++){
            zeta[m] = 0.0;
          }

          zeta[faceMAP[FaceNum][0]] = zetaFace[0];
          zeta[faceMAP[FaceNum][1]] = zetaFace[1];
          zeta[faceMAP[FaceNum][2]] = zetaFace[2];


          // 0th order polynomial
          if(tet->PolyOrderFlag == 0){

            Total_E_Local = CalcEfield(origEn_1_P0->getEntryPtr(), avtr, vol, zeta, tet->PolyOrderFlag);
            pMeshARRAY[i].fcArray[j]->SparEinc((timeStep + 1.0) * dt, Point, Inc_E_Local, zetaFace);

          }else if(tet->PolyOrderFlag == 1){// 1st order polynomial

            Total_E_Local = CalcEfield(origEn_1_P1->getEntryPtr(), avtr, vol, zeta, tet->PolyOrderFlag);
            pMeshARRAY[i].fcArray[j]->SparEinc((timeStep + 1.0) * dt, Point, Inc_E_Local, zetaFace);

          }else if(tet->PolyOrderFlag == 2){// 2nd order polynomial

            Total_E_Local = CalcEfield(origEn_1_P2->getEntryPtr(), avtr, vol, zeta, tet->PolyOrderFlag);
            pMeshARRAY[i].fcArray[j]->SparEinc((timeStep + 1.0) * dt, Point, Inc_E_Local, zetaFace);

          }

          EvalueTotal = dotP(Total_E_Local, VoltLineUnit);
          EvalueInc = dotP(Inc_E_Local, VoltLineUnit);
          VoltEntryInc[i] += - 1.0 * h * wgt * EvalueInc;
          VoltEntryTotal[i] += - 1.0 * h * wgt * EvalueTotal;
        }
      }
    }

    // Write a file with all the impendances of the ports
    if(timeStep == 0){
      char Impedance_Log[180];
      sprintf(Impedance_Log, "%s.ImpZ", fname);
      ofstream ImpedanceOutfile(Impedance_Log, ios_base::out);
      if(!ImpedanceOutfile)
        cout << "Error in opening file: " << Impedance_Log << " for write " << endl;

      for(i = 0 ; i < portCNT ; i++)
        ImpedanceOutfile << pMeshARRAY[i].impZ << " ";

      ImpedanceOutfile.close();
    }

    // Write to file Vinc
    if(timeStep == 0)
      system("mkdir TimeDomainVoltages");
    char IncVoltage_TimeLog[180];
    ofstream IncVoltageOutfile;
    if(isCompact){
      sprintf(IncVoltage_TimeLog, "./TimeDomainVoltages/%s.TD_Vinc", fname);
      IncVoltageOutfile.open(IncVoltage_TimeLog, ios_base::out | ios::app);
    }else{
      sprintf(IncVoltage_TimeLog, "./TimeDomainVoltages/%s_%05d.TD_Vinc", fname, timeStep);
      IncVoltageOutfile.open(IncVoltage_TimeLog, ios_base::out);
    }

    IncVoltageOutfile.setf(ios::scientific, ios::floatfield);
    IncVoltageOutfile.precision(15);
    if(!IncVoltageOutfile)
      cout << "Error in opening file: " << IncVoltage_TimeLog << " for write " << endl;

    IncVoltageOutfile << (timeStep + 1.0) * dt << " ";
    for(i = 0 ; i < portCNT ; i++)
      IncVoltageOutfile << VoltEntryInc[i]<< " ";

    IncVoltageOutfile<<endl;
    IncVoltageOutfile.close();

    // Write to file Vtotal
    char TotVoltage_TimeLog[180];
    ofstream TotVoltageOutfile;
    if(isCompact){
      sprintf(TotVoltage_TimeLog, "./TimeDomainVoltages/%s.TD_Vtot", fname);
      TotVoltageOutfile.open(TotVoltage_TimeLog, ios_base::out | ios::app);
    }else{
      sprintf(TotVoltage_TimeLog, "./TimeDomainVoltages/%s_%05d.TD_Vtot", fname, timeStep);
      TotVoltageOutfile.open(TotVoltage_TimeLog, ios_base::out);
    }
    TotVoltageOutfile.setf(ios::scientific, ios::floatfield);
    TotVoltageOutfile.precision(15);
    if(!TotVoltageOutfile)
      cout << "Error in opening file: " << TotVoltage_TimeLog << "for write"<< endl;

    TotVoltageOutfile << (timeStep + 1.0) * dt << " ";
    for(i = 0 ; i < portCNT ; i++)
      TotVoltageOutfile << VoltEntryTotal[i] << " ";

    TotVoltageOutfile << endl;
    TotVoltageOutfile.close();

    // Write to file Vref
    char ReflVoltage_TimeLog [180];
    ofstream ReflVoltageOutfile;
    if(isCompact){
      sprintf(ReflVoltage_TimeLog, "./TimeDomainVoltages/%s.TD_Vref", fname);
      ReflVoltageOutfile.open(ReflVoltage_TimeLog, ios_base::out | ios::app);
    }else{
      sprintf(ReflVoltage_TimeLog, "./TimeDomainVoltages/%s_%05d.TD_Vref", fname, timeStep);
      ReflVoltageOutfile.open(ReflVoltage_TimeLog, ios_base::out);
    }

    ReflVoltageOutfile.setf(ios::scientific, ios::floatfield);
    ReflVoltageOutfile.precision(15);
    if(!ReflVoltageOutfile)
      cout << "Error in  opening file: " << ReflVoltage_TimeLog << "for write"<< endl;
    ReflVoltageOutfile.close();

    delete origEn_1_P2;
    delete origEn_1_P1;
    delete origEn_1_P0;

    delete [] VoltEntryInc;
    delete [] VoltEntryTotal;
  }
}

// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //
// 000000000000000000000000000000000000 GPU ROUTINES 00000000000000000000000000000000000000000000 //
// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //

// DEVICE implementations
#if defined (DGTD_USE_CUDA)
  #if defined (CUDA_NON_HEAVY)

    ////////////////////////////////////////////////////////////////////////////////////////////////////////
    // OUTPUT Functions
    ////////////////////////////////////////////////////////////////////////////////////////////////////////

    // Modified by Qi Jian to write field from the PROBES
    // Evaluates E and H at every probe for the given time step.
    //
    // For each probe the fields are evaluated from En1_h / Hn32_h in every
    // tetrahedron listed for that probe in probes_bary (tet id + four
    // barycentric coordinates) and averaged over those tetrahedra. The result
    // is stored in fieldProbes when usePade is set and, when padeCNT == 0 or
    // writeWhilePade is set, written as one CSV row to
    // ./PROBES/Probes_<fname>_<timeStep>.csv.
    void FemGrp::writeFieldProbeCuBLAS(int timeStep)
    {
      fp_t vol;
      fp_t zeta[4];
      vtr lvtr[3];
      vtr avtr[4];
      vtr eField;
      vtr hField;

      vtr eField_all;
      vtr hField_all;

      const bool writeCsv = (padeCNT == 0 || writeWhilePade);

      char csvFileName[StrOutput];
      std::ofstream csvFile;

      if(writeCsv)
      {
        // Bounded formatting: fname may be longer than the buffer allows.
        snprintf(csvFileName, sizeof(csvFileName), "./PROBES/Probes_%s_%04d.csv", fname, timeStep);
        csvFile.open(csvFileName);
        if(!csvFile.is_open())
          std::cerr << "Error opening file: " << csvFileName << "\n";

        csvFile << "Ex" << "," << "Ey" << "," << "Ez" << "," << "Hx" << "," << "Hy" << "," << "Hz" << "\n";

        // The precision setting is sticky, so it is applied once for all rows.
        const auto max_precision {std::numeric_limits<fp_t>::digits10 + 1};
        csvFile << std::setprecision(max_precision);
      }

      const int num_nodes = probeCNT;

      // Calculate Total Fields at the points
      for(int i = 0; i < num_nodes; i++)
      {
        // References instead of copies: the tet list is only read here.
        const auto& probe = probes_bary.at(i);
        const int number_of_associated_tets = probe.first;
        const auto& found_tets = probe.second;

        eField_all.reset();
        hField_all.reset();

        for (int t = 0; t < number_of_associated_tets; t++)
        {
          const int tet_id = found_tets.at(t).first;
          const auto& tri_bary_coord = found_tets.at(t).second;
          tetra& tet = tetARRAY[tet_id];

          tet.geometry(lvtr, avtr, &vol);
          avtr[3].reset();
          avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);

          zeta[0] = static_cast<fp_t>(tri_bary_coord[0]);
          zeta[1] = static_cast<fp_t>(tri_bary_coord[1]);
          zeta[2] = static_cast<fp_t>(tri_bary_coord[2]);
          zeta[3] = static_cast<fp_t>(tri_bary_coord[3]);

          eField = CalcEfield(&En1_h[mapIdLoc[tet.getcnt()] * TetPolyOrderDim[PolyFlag]], avtr, vol, zeta, PolyFlag);
          hField = CalcEfield(&Hn32_h[mapIdLoc[tet.getcnt()] * TetPolyOrderDim[PolyFlag]], avtr, vol, zeta, PolyFlag);

          eField_all = eField_all + eField;
          hField_all = hField_all + hField;
        }

        // Average over the contributing tetrahedra; a probe without any
        // tetrahedron keeps the reset value instead of dividing by zero.
        if(number_of_associated_tets > 0)
        {
          eField_all = eField_all / ((fp_t) number_of_associated_tets);
          hField_all = hField_all / ((fp_t) number_of_associated_tets);
        }

        if(usePade){ // && i < padeCNT
          int row = (int)(timeStep / tsPerSampling)* NumOfFieldComponents * probeCNT ;
          int column = i * NumOfFieldComponents;
          fieldProbes[row + column + 0] = eField_all.getx();
          fieldProbes[row + column + 1] = eField_all.gety();
          fieldProbes[row + column + 2] = eField_all.getz();
          fieldProbes[row + column + 3] = hField_all.getx();
          fieldProbes[row + column + 4] = hField_all.gety();
          fieldProbes[row + column + 5] = hField_all.getz();
        }

        if(writeCsv){
          csvFile << eField_all.getx() << "," << eField_all.gety() << "," << eField_all.getz() << "," << hField_all.getx() << "," << hField_all.gety() << "," << hField_all.getz() << "\n";
        }
      }

      if(writeCsv)
      {
        usleep(100);
        csvFile.close();
      }
    }


    // ----------------------------------------------------------------------
    //  Port-face centroid probes: one CSV per port, per timestep, folders
    // ----------------------------------------------------------------------
    void FemGrp::writePortFieldProbeCuBLAS(int timeStep)
    {
      fp_t vol;
      fp_t zeta[4];
      vtr lvtr[3];
      vtr avtr[4];
      vtr eField, hField;
      vtr eField_all, hField_all;

      char csvFileName[StrOutput];
      std::ofstream csvFile;

      if (portCNT > 0 && PortFacePidx_h && PortFaceCentroid_h && !portFaceCentroid_bary.empty())
      {
        // Base output directory and per-port subdirs
        mkdir("./PortProbes", 0755);
        for (int pnum = 0; pnum < portCNT; ++pnum)
        {
          std::string portDir = "./PortProbes/Port" + std::to_string(pnum);
          mkdir(portDir.c_str(), 0755);

          // Open CSV for this port + timestep
          char pCsv[512];
          std::snprintf(pCsv, sizeof(pCsv), "%s/Port%d_%04d.csv", portDir.c_str(), pnum, timeStep);
          std::ofstream pcsv(pCsv);
          if (!pcsv.is_open())
          {
            std::cerr << "Error opening file: " << pCsv << "\n";
            continue;
          }

          // Header: centroid only
          pcsv << "x1,y1,z1,Ex,Ey,Ez,Hx,Hy,Hz\n";
          const auto max_precision = std::numeric_limits<fp_t>::digits10 + 1;
          pcsv << std::fixed << std::setprecision(max_precision);

          // Iterate all flattened excitation faces, pick those of this port
          for (int f = 0; f < excitationFaces; ++f)
          {
            if (PortFacePidx_h[f] != pnum) continue;

            // Centroid position from buffer
            const fp_t_ts* C = &PortFaceCentroid_h[3*f];
            const double cx = static_cast<double>(C[0]);
            const double cy = static_cast<double>(C[1]);
            const double cz = static_cast<double>(C[2]);

            // Bary search results for this centroid (should be present)
            int nAssoc = (int)portFaceCentroid_bary[f].first;
            if (nAssoc <= 0)
            {
              // If you prefer hard-fail, you can exit as in readPROBE()
              // Here we just skip gracefully.
              continue;
            }
            const auto& found_tets = portFaceCentroid_bary[f].second;

            // Average E/H over owning tets (same pattern as node probes)
            eField_all.reset();
            hField_all.reset();

            for (int t = 0; t < nAssoc; ++t)
            {
              int tet_id = found_tets[t].first;
              const std::array<double,4>& b = found_tets[t].second;

              tetra& tet = tetARRAY[tet_id];
              tet.geometry(lvtr, avtr, &vol);
              avtr[3].reset();
              avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);

              zeta[0] = (fp_t)b[0];
              zeta[1] = (fp_t)b[1];
              zeta[2] = (fp_t)b[2];
              zeta[3] = (fp_t)b[3];

              eField = CalcEfield(&En1_h[mapIdLoc[tet.getcnt()] * TetPolyOrderDim[PolyFlag]], avtr, vol, zeta, PolyFlag);
              hField = CalcEfield(&Hn32_h[mapIdLoc[tet.getcnt()] * TetPolyOrderDim[PolyFlag]], avtr, vol, zeta, PolyFlag);

              eField_all = eField_all + eField;
              hField_all = hField_all + hField;
            }

            eField_all = eField_all / ((fp_t)nAssoc);
            hField_all = hField_all / ((fp_t)nAssoc);

            // Write one row: centroid + averaged fields
            pcsv << cx << "," << cy << "," << cz << ","
                 << eField_all.getx() << "," << eField_all.gety() << "," << eField_all.getz() << ","
                 << hField_all.getx() << "," << hField_all.gety() << "," << hField_all.getz() << "\n";
          }

          pcsv.close();
        }
      }
    }


    // Writes the nodal E/H fields of the whole mesh for one time step through
    // VtkWriter::writeField, using the prefix ./VTU_LTS/<fname>_<timeStep>.
    //
    // Each tetrahedron evaluates E (from En1_h) and H (from Hn32_h) at its four
    // BaryCoord points and adds the values to the node given by nd[j]->getid();
    // every node value is then divided by the number of contributions.
    // Nodes that received no contribution are left as default-constructed
    // vectors instead of being divided by zero.
    void FemGrp::writeFieldGlobalCuBLAS(int timeStep){
      fp_t vol;
      fp_t zeta[4];
      vtr lvtr[3];
      vtr avtr[4];
      vtr eLocal[4];
      vtr hLocal[4];

      // NOTE(review): the accumulation below assumes a default-constructed vtr
      // is the zero vector - confirm against the vtr class.
      vtr* eField = new vtr[nodeCNT];
      vtr* hField = new vtr[nodeCNT];

      // Number of tetrahedra contributing to each node.
      int* count = new int[nodeCNT];
      memset(count, 0, nodeCNT * sizeof(int));

      int* polyOrder = new int[tetraCNT];

      for(int i = 0; i < tetraCNT; i++){
        tetra& tet = tetARRAY[i];
        polyOrder[i] = tet.PolyOrderFlag;

        tet.geometry(lvtr, avtr, &vol);
        avtr[3].reset();
        avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);

        for(int j = 0; j < 4; j++){
          zeta[0] = BaryCoord[j][0];
          zeta[1] = BaryCoord[j][1];
          zeta[2] = BaryCoord[j][2];
          zeta[3] = BaryCoord[j][3];

          // NOTE(review): the coefficient offset uses the global PolyFlag while
          // the evaluation order is the per-tetra PolyOrderFlag - confirm both
          // always agree.
          eLocal[j] = CalcEfield(&En1_h[mapIdLoc[tet.getcnt()] * TetPolyOrderDim[PolyFlag]], avtr, vol, zeta, tet.PolyOrderFlag);
          hLocal[j] = CalcEfield(&Hn32_h[mapIdLoc[tet.getcnt()] * TetPolyOrderDim[PolyFlag]], avtr, vol, zeta, tet.PolyOrderFlag);

          // Node ids are used directly as indices into the nodeCNT-sized arrays.
          int index = tet.nd[j]->getid();
          eField[index] = eField[index] + eLocal[j] /*- Einc*/;
          hField[index] = hField[index] + hLocal[j] /*- Hinc*/;
          count[index] += 1;
        }
      }

      // Average the accumulated values; skip nodes no tetrahedron touched.
      for(int i = 0; i < nodeCNT; i++){
        if(count[i] > 0){
          eField[i] = eField[i] / static_cast<fp_t>(count[i]);
          hField[i] = hField[i] / static_cast<fp_t>(count[i]);
        }
      }

      VtkWriter vtkWriter(1.0);
      //   VtkWriter vtkWriter(unit);
      char vtkFilePrefix[128];
      memset(vtkFilePrefix, 0, 128 * sizeof(char));

      // Bounded formatting: fname may be longer than the buffer allows.
      snprintf(vtkFilePrefix, sizeof(vtkFilePrefix), "./VTU_LTS/%s_%04d", fname, timeStep);

      vtkWriter.writeField(vtkFilePrefix, nodeCNT, ndARRAY, tetraCNT, tetARRAY, eField, hField, polyOrder, 0, 0); //TODO: why here polyorder is not 1

      delete [] eField;
      delete [] hField;
      delete [] count;
      delete [] polyOrder;
    }


    // Divides fieldEnergy by (numberOfEnergyPoints * NumOfSampleEnergyCheck),
    // raises maxFieldEnergy to that value if it is larger, and returns true
    // when fieldEnergy is below energyDecayFactor * maxFieldEnergy.
    bool FemGrp::checkEnergyDecay(){
      fieldEnergy = fieldEnergy / (numberOfEnergyPoints * NumOfSampleEnergyCheck);

      // Keep track of the largest value seen so far.
      if (maxFieldEnergy < fieldEnergy) {
        maxFieldEnergy = fieldEnergy;
      }

      const bool hasDecayed = fieldEnergy < energyDecayFactor * maxFieldEnergy;
      return hasDecayed;
    }


    ////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Organize GPU Memory
    ////////////////////////////////////////////////////////////////////////////////////////////////////////

    void FemGrp::PrepareGPUcuBLAS()
    {
      tetra* tet;
      int cntAux;

      ////////////////////////////////////////////////////////////////////////////////////////////////////////
      // Prepare Excitation Info
      ////////////////////////////////////////////////////////////////////////////////////////////////////////

      int exciCNT = 0;
      for(int i = 0; i < N_class; i ++)
      {
        exciCNT += ClassExcitationCount[i];
      }

      CUDA_SAFE_CALL(cudaMallocHost((void**)&ExcitationFacesCnt_h, exciCNT * sizeof(int), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&ExcitationFacesOffset_h, exciCNT * sizeof(int), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&ExcitationFacesNum_h, excitationFaces * sizeof(int), cudaHostAllocMapped));

      CUDA_SAFE_CALL(cudaMallocHost((void**)&nd_coords_tet_h, NumOfUnitaryVectors * NumOfNodes * exciCNT * sizeof(fp_t_ts), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&nd_coords_face_h, NumOfUnitaryVectors * NumOfNodesPerFace * excitationFaces * sizeof(fp_t_ts), cudaHostAllocMapped));

      CUDA_SAFE_CALL(cudaMallocHost((void**)&mapE_h, exciCNT * TetPolyOrderDim[PolyFlag] * sizeof(int8_t), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&mapH_h, exciCNT * TetPolyOrderDim[PolyFlag] * sizeof(int8_t), cudaHostAllocMapped));

      // for(int i = 0; i < exciCNT * TetPolyOrderDim[PolyFlag]; i++){
      //   mapE_h[i] = 1;
      //   mapH_h[i] = 1;
      // }

      // ===============================================
      // Allocate storage for port fields
      // ===============================================
      const int Q = GAUSS_POINT_NUM_h[PolyFlag];  // same as GPU kernel uses

      cout << "excitationFaces = " << excitationFaces << endl;
      cout << "exciCNT = " << exciCNT << endl;

      if (portCNT > 0)
      {
        CUDA_SAFE_CALL(cudaMallocHost((void**)&Etan_qp_h, excitationFaces * Q * 3 * sizeof(fp_t_ts), cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&Htan_qp_h, excitationFaces * Q * 3 * sizeof(fp_t_ts), cudaHostAllocMapped));

        CUDA_SAFE_CALL(cudaMallocHost((void**)&PortFacePidx_h,     excitationFaces * sizeof(int),         cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&PortFaceCentroid_h, excitationFaces * 3 * sizeof(fp_t_ts), cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&Etan_center_h,      excitationFaces * 3 * sizeof(fp_t_ts), cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&Htan_center_h,      excitationFaces * 3 * sizeof(fp_t_ts), cudaHostAllocMapped));

        CUDA_SAFE_CALL(cudaMallocHost((void**)&FaceID_excitation_h,     excitationFaces * sizeof(int),    cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&TetID_excitation_h,      excitationFaces * sizeof(int),    cudaHostAllocMapped));

      }

      // ===============================================
      // Allocated Impedance for Planewave
      // ===============================================
      cout << "PlaneWaveBCFlag = " << PlaneWaveBCFlag << endl;
      cout << "Number of Ports = " << portCNT << endl;
      if(PlaneWaveBCFlag)
      {
        CUDA_SAFE_CALL(cudaMallocHost((void**)&Z_face_pw_h, excitationFaces * sizeof(fp_t_ts), cudaHostAllocMapped));
      }


      excitationFaces = 0;
      exciCNT = 0;

      for (int i = 0; i < N_class; i ++)
      {

        cout << "\nN CLASS = " << i << endl;

        for(int j = 0; j < ClassExcitationCount[i]; j ++)
        {
          tet = &(tetARRAY[ClassTetraIndex[i][j]]);

          cout << ClassTetraIndex[i][j] << " ";

          for(int k = 0; k < TetPolyOrderDim[PolyFlag]; k++)
          {
            mapE_h[exciCNT * TetPolyOrderDim[PolyFlag] + k] = (tet->LocMapE[k] < 0 ? 0 : 1);
            mapH_h[exciCNT * TetPolyOrderDim[PolyFlag] + k] = (tet->LocMapH[k] < 0 ? 0 : 1);
          }

          ExcitationFacesOffset_h[exciCNT] = excitationFaces;
          for(int k = 0; k < NumOfFaces; k++)
          {
            for(int node = 0; node < NumOfNodes; node++)
            {
              nd_coords_tet_h[NumOfUnitaryVectors * (NumOfNodes * exciCNT + node) + 0] = tet->nd[node]->getCoord().getx();
              nd_coords_tet_h[NumOfUnitaryVectors * (NumOfNodes * exciCNT + node) + 1] = tet->nd[node]->getCoord().gety();
              nd_coords_tet_h[NumOfUnitaryVectors * (NumOfNodes * exciCNT + node) + 2] = tet->nd[node]->getCoord().getz();

              //cout << "TET ID = " << tet->getcnt() << "  Face ID = " << tet->fc[k]->getcnt() << "  BC = " << tet->fc[k]->bcPtr->getbType() << endl;
              //cout << tet->nd[node]->getCoord().getx() << "  " << tet->nd[node]->getCoord().gety() << "  " << tet->nd[node]->getCoord().getz() << endl;

            }


            int DGface_bc = tet->fc[k]->bcPtr->getbType();
            if(DGface_bc == planeWaveType || DGface_bc == portType || DGface_bc == pmlType)
            {
              ExcitationFacesNum_h[excitationFaces] = k;

              for(int node = 0; node < NumOfNodesPerFace; node++)
              {
                nd_coords_face_h[NumOfUnitaryVectors * (NumOfNodesPerFace * excitationFaces + node) + 0] = tet->fc[k]->nd[node]->getCoord().getx();
                nd_coords_face_h[NumOfUnitaryVectors * (NumOfNodesPerFace * excitationFaces + node) + 1] = tet->fc[k]->nd[node]->getCoord().gety();
                nd_coords_face_h[NumOfUnitaryVectors * (NumOfNodesPerFace * excitationFaces + node) + 2] = tet->fc[k]->nd[node]->getCoord().getz();

                //cout << tet->fc[k]->nd[node]->getCoord().getx() << " , "
                //     << tet->fc[k]->nd[node]->getCoord().gety() << " , "
                //     << tet->fc[k]->nd[node]->getCoord().getz() << endl;
              }
              cout << "\n";

              if(PlaneWaveBCFlag)
              {
                Z_face_pw_h[excitationFaces] = No * sqrt(tet->mat->mur.getEntry(0,0) / tet->mat->epsr.getEntry(0,0));
              }

              excitationFaces++;
            }
          }
          ExcitationFacesCnt_h[exciCNT] = excitationFaces - ExcitationFacesOffset_h[exciCNT];
          exciCNT++;
        }
      }

      cout << " exciCNT = " << exciCNT << endl;

      // To save the current time step through the execution
      LocalExciIndexE = new int[N_class];
      LocalExciIndexH = new int[N_class];
      for(int i = 0; i < N_class; i ++)
      {
        LocalExciIndexE[i] = 0;
        LocalExciIndexH[i] = 0;
      }

      ////////////////////////////////////////////////////////////////////////////////////////////////////////
      // Create the fields at the HOST (only the ones that we will use to calculate the fields at the probes)
      ////////////////////////////////////////////////////////////////////////////////////////////////////////
      int sizeField = TetPolyOrderDim[PolyFlag] * tetraCNT;

      CUDA_SAFE_CALL(cudaMallocHost((void**)&En1_h, sizeField * sizeof(fp_t_ts), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&Hn32_h, sizeField * sizeof(fp_t_ts), cudaHostAllocMapped));


      ////////////////////////////////////////////////////////////////////////////////////////////////////////
      // For Regular Tetrahedras
      ////////////////////////////////////////////////////////////////////////////////////////////////////////

      flag1 = true;

      // ---- Helpers ----

      // Check for overflow
      auto safe_add = [](int a, int b) -> int
      {
        if ((b > 0 && a > INT_MAX - b) || (b < 0 && a < INT_MIN - b))
        {
          fprintf(stderr, "Integer overflow in addition (%d + %d)\n", a, b);
          abort();
        }
        return a + b;
      };

      // Check if index is within range
      auto check_idx = [&](int idx, int lo, int hi, const char* what) {
        if (idx < lo || idx > hi) {
          fprintf(stderr, "Index out of range for %s: %d (expected [%d, %d])\n",
                  what, idx, lo, hi);
          abort();
        }
      };

      // Check for null pointer
      auto check_ptr = [&](void* p, const char* what) {
        if (!p) { fprintf(stderr, "Null pointer: %s\n", what); abort(); }
      };

      // ---- Allocations (pinned) ----
      // Every array below is page-locked host memory requested with cudaHostAllocMapped.
      // Unless noted otherwise, each holds one int per class (N_class entries).


      CUDA_SAFE_CALL(cudaMallocHost((void**)&classregNeighPML_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));

      // Flattened [class][group] table: entry (i, g) lives at i * regularCNT + g.
      CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularTetraCnt_h, (size_t)N_class * regularCNT * sizeof(int), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&classIrregularTetraOffset_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&classPMLTetraOffset_h,       (size_t)N_class * sizeof(int), cudaHostAllocMapped));

      CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighIrregular_h,       (size_t)N_class * sizeof(int), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighIrregularOffset_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));

      CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighPML_h,       (size_t)N_class * sizeof(int), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighPMLOffset_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));

      CUDA_SAFE_CALL(cudaMallocHost((void**)&classTetraOffset_loc_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighOffset_loc_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));

      CUDA_SAFE_CALL(cudaMallocHost((void**)&classPMLTetraOffset_loc_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&classNeighPMLOffset_loc_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));

      CUDA_SAFE_CALL(cudaMallocHost((void**)&nonRegularTetraCnt_h,     (size_t)N_class * sizeof(int), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&nonRegularPMLTetraCnt_h,  (size_t)N_class * sizeof(int), cudaHostAllocMapped));

      // Always allocate these "per-class meta" arrays irrespective of regularTetraCNT,
      // so we can safely write zeros even if there are no regulars.
      CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularGroupsCnt_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLGroupsCnt_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));

      // These hold per-class pointers allocated later per class; init to nullptr
      // so that classes without regular groups are left with a null entry.
      CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularTetraOffset_h, (size_t)N_class * sizeof(int*), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularGroupsId_h,    (size_t)N_class * sizeof(int*), cudaHostAllocMapped));

      for (int i = 0; i < N_class; ++i)
      {
        classRegularTetraOffset_h[i] = nullptr;
        classRegularGroupsId_h[i]    = nullptr;
      }


      // Same pointer-per-class layout for the regular PML groups.
      CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLTetraOffset_h, (size_t)N_class * sizeof(int*), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLTetraFaceOffset_h, (size_t)N_class * sizeof(int*), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLGroupsId_h,    (size_t)N_class * sizeof(int*), cudaHostAllocMapped));
      for (int i = 0; i < N_class; ++i)
      {
        classRegularPMLTetraOffset_h[i] = nullptr;
        classRegularPMLGroupsId_h[i]    = nullptr;
        classRegularPMLTetraFaceOffset_h[i] = nullptr;
      }

      // Per group (global): one entry per regular-group ID (regularCNT entries).
      CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularGroupsNeighCnt_h, (size_t)regularCNT * sizeof(int), cudaHostAllocMapped));

      // ---- Zero-init everything deterministically ----
      // The byte counts below mirror the allocation sizes above one-for-one.
      memset(classRegularTetraCnt_h,         0, (size_t)N_class * regularCNT * sizeof(int));
      memset(classIrregularTetraOffset_h,    0, (size_t)N_class * sizeof(int));
      memset(classPMLTetraOffset_h,          0, (size_t)N_class * sizeof(int));
      memset(classNeighIrregular_h,          0, (size_t)N_class * sizeof(int));
      memset(classNeighIrregularOffset_h,    0, (size_t)N_class * sizeof(int));
      memset(classNeighPML_h,                0, (size_t)N_class * sizeof(int));
      memset(classregNeighPML_h,             0, (size_t)N_class * sizeof(int));
      memset(classNeighPMLOffset_h,          0, (size_t)N_class * sizeof(int));
      memset(classTetraOffset_loc_h,         0, (size_t)N_class * sizeof(int));
      memset(classNeighOffset_loc_h,         0, (size_t)N_class * sizeof(int));
      memset(classPMLTetraOffset_loc_h,      0, (size_t)N_class * sizeof(int));
      memset(classNeighPMLOffset_loc_h,      0, (size_t)N_class * sizeof(int));
      memset(nonRegularTetraCnt_h,           0, (size_t)N_class * sizeof(int));
      memset(nonRegularPMLTetraCnt_h,        0, (size_t)N_class * sizeof(int));
      memset(classRegularGroupsCnt_h,        0, (size_t)N_class * sizeof(int));
      memset(classRegularPMLGroupsCnt_h,     0, (size_t)N_class * sizeof(int));
      memset(classRegularGroupsNeighCnt_h,   0, (size_t)regularCNT * sizeof(int));

      // ---- Locals ----
      // Regular-group IDs seen in the class currently being processed (non-PML and PML lists).
      // Each set is cleared as soon as its per-class arrays have been built.
      std::set<int> ID_aux, ID_aux_PML;

      totalRegularNeighFaceCnt     = 0;
      totalRegularPMLNeighFaceCnt  = 0;
      numRegTetras                 = 0;
      numRegPMLTetras              = 0;

      // Running totals over all classes; only group-0 (irregular) tetrahedra are counted here.
      int irregularTetras    = 0;
      int irregularNeighbours= 0;
      int PMLTetras          = 0;
      int PMLNeighbours      = 0;

      // ---- Main loop ----
      // For every class: count tetrahedra per regular group, accumulate neighbour totals,
      // derive the class offsets from the previous class, and build the per-class lists of
      // regular group IDs with their offsets.
      for (int i = 0; i < N_class; ++i)
      {
        // Offsets of class i are derived from the values stored for class i-1.
        if (i == 0)
        {
          classIrregularTetraOffset_h[i] = 0;
          classNeighIrregularOffset_h[i] = 0;
        }
        else
        {
          const int prev = i - 1;

          int pml_tetra_off   = classPMLTetraOffset_h[prev];
          int pml_tetra_cnt   = ClassPMLTetraCnt[prev];
          int pml_neigh_off   = classNeighPMLOffset_h[prev];
          int pml_neigh_cnt   = classNeighPML_h[prev];
          int reg_neigh_cnt   = classregNeighPML_h[prev];

          // Every term that feeds the offsets must be non-negative.
          if (pml_tetra_off < 0 || pml_tetra_cnt < 0 || pml_neigh_off < 0 || pml_neigh_cnt < 0 || reg_neigh_cnt < 0) {
            fprintf(stderr, "Negative offsets/cnts detected for prev class %d\n", prev);
            abort();
          }

          // Overflow-checked sums (safe_add aborts instead of wrapping).
          classIrregularTetraOffset_h[i] = safe_add(pml_tetra_off, pml_tetra_cnt);
          classNeighIrregularOffset_h[i] = safe_add(safe_add(pml_neigh_off, pml_neigh_cnt), reg_neigh_cnt);
        }

        classTetraOffset_loc_h[i] = irregularTetras;
        classNeighOffset_loc_h[i] = irregularNeighbours;

        // Neighbour count of all non-PML tetrahedra of this class (regular and irregular).
        int totalNeighbors = 0;


        // ----- Non-PML tetras in class i -----
        for (int j = 0; j < ClassTetraCnt[i]; ++j)
        {
          int tIdx = ClassTetraIndex[i][j];
          check_idx(tIdx, 0, tetraCNT - 1, "ClassTetraIndex[i][j]");
          tet = &(tetARRAY[tIdx]);

          // group_ID indexes both the [class][group] table and the per-group array,
          // so it has to lie inside [0, regularCNT).
          int group_ID = tet->getRegularGroup();
          check_idx(group_ID, 0, regularCNT - 1, "regular group ID");

          // Count per class and group
          classRegularTetraCnt_h[i * regularCNT + group_ID]++;

          int neigh = tet->get_NeighNum();

          if (group_ID == 0)
          {
            nonRegularTetraCnt_h[i]++;
            irregularTetras++;
            irregularNeighbours += neigh;
            classNeighIrregular_h[i] += neigh;
            totalNeighbors += neigh;
          }
          else
          {
            ID_aux.insert(group_ID);
            classRegularGroupsNeighCnt_h[group_ID] = neigh;
            totalRegularNeighFaceCnt += neigh;
            numRegTetras++;
            totalNeighbors += neigh;
          }
        }


        // ----- Build per-class arrays for REGULAR groups -----
        if (!ID_aux.empty())
        {
          int G = (int)ID_aux.size();
          classRegularGroupsCnt_h[i] = G;

          CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularGroupsId_h[i],    (size_t)G * sizeof(int), cudaHostAllocMapped));
          CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularTetraOffset_h[i], (size_t)G * sizeof(int), cudaHostAllocMapped));
          check_ptr(classRegularGroupsId_h[i],    "classRegularGroupsId_h[i]");
          check_ptr(classRegularTetraOffset_h[i], "classRegularTetraOffset_h[i]");

          cout << "Regular Tet group = " << endl;
          int cntAux = 0;
          int tetraOffset = 0;   // exclusive prefix sum of the group sizes, in ascending ID order
          for (int ID : ID_aux)
          {
            classRegularGroupsId_h[i][cntAux]    = ID;
            classRegularTetraOffset_h[i][cntAux] = tetraOffset;

            cout << ID << endl;

            tetraOffset += classRegularTetraCnt_h[i * regularCNT + ID];
            cntAux++;
          }
          ID_aux.clear();
        }
        else
        {
          classRegularGroupsCnt_h[i] = 0;
        }


        // ----- PML part -----
        if (PML_flag)
        {
          classPMLTetraOffset_h[i] = classIrregularTetraOffset_h[i] + ClassTetraCnt[i];
          classNeighPML_h[i]       = 0;
          classNeighPMLOffset_h[i] = classNeighIrregularOffset_h[i] + totalNeighbors;

          classPMLTetraOffset_loc_h[i] = PMLTetras;
          classNeighPMLOffset_loc_h[i] = PMLNeighbours;

          cout << "classNeighPMLOffset_loc_h[" << i << "] =" << classNeighPMLOffset_loc_h[i] << endl;
          cout << " classPMLTetraOffset_loc_h[ " << i << "] " <<  classPMLTetraOffset_loc_h[i] << endl;

          int pml_cnt = ClassPMLTetraCnt[i];
          check_idx(pml_cnt, 0, INT_MAX, "ClassPMLTetraCnt[i]");

          // The PML tetrahedra of a class are stored right after its ClassTetraCnt[i] non-PML ones.
          for (int j = 0; j < pml_cnt; ++j)
          {
            int idx = safe_add(ClassTetraCnt[i], j);
            int tIdx = ClassTetraIndex[i][idx];
            check_idx(tIdx, 0, tetraCNT - 1, "ClassTetraIndex[i][idx] (PML)");
            tet = &(tetARRAY[tIdx]);

            int group_ID = tet->getRegularGroup();
            check_idx(group_ID, 0, regularCNT - 1, "regular group ID (PML)");

            classRegularTetraCnt_h[i * regularCNT + group_ID]++;

            int neigh = tet->get_NeighNum();

            if (group_ID == 0)
            {
              nonRegularPMLTetraCnt_h[i]++;
              PMLTetras     = safe_add(PMLTetras, 1);
              PMLNeighbours = safe_add(PMLNeighbours, neigh);
              classNeighPML_h[i] = safe_add(classNeighPML_h[i], neigh);
            }
            else
            {
              ID_aux_PML.insert(group_ID);
              classRegularGroupsNeighCnt_h[group_ID] = neigh;
              totalRegularPMLNeighFaceCnt = safe_add(totalRegularPMLNeighFaceCnt, neigh);
              numRegPMLTetras = safe_add(numRegPMLTetras, 1);
              classregNeighPML_h[i] += neigh;
            }
          }

          cout << "PMLNeighbours = " << PMLNeighbours << endl;


          // ----- Build per-class arrays for REGULAR PML groups -----
          if (!ID_aux_PML.empty())
          {
            int Gp = (int)ID_aux_PML.size();
            classRegularPMLGroupsCnt_h[i] = Gp;

            CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLGroupsId_h[i],    (size_t)Gp * sizeof(int), cudaHostAllocMapped));
            CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLTetraOffset_h[i], (size_t)Gp * sizeof(int), cudaHostAllocMapped));
            CUDA_SAFE_CALL(cudaMallocHost((void**)&classRegularPMLTetraFaceOffset_h[i], (size_t)Gp * sizeof(int), cudaHostAllocMapped));

            check_ptr(classRegularPMLGroupsId_h[i],    "classRegularPMLGroupsId_h[i]");
            check_ptr(classRegularPMLTetraOffset_h[i], "classRegularPMLTetraOffset_h[i]");
            check_ptr(classRegularPMLTetraFaceOffset_h[i], "classRegularPMLTetraFaceOffset_h[i]");


            cout << "Regular PML Tet group = " << endl;
            int cntAux = 0;
            int tetraOffset = 0;   // exclusive prefix sum of group sizes
            int faceOffset  = 0;   // exclusive prefix sum of (neighbours per tetra) * (group size)
            for (int ID : ID_aux_PML)
            {
              cout << ID << endl;

              classRegularPMLGroupsId_h[i][cntAux]        = ID;
              classRegularPMLTetraOffset_h[i][cntAux]     = tetraOffset;
              classRegularPMLTetraFaceOffset_h[i][cntAux] = faceOffset;

              const int groupCnt = classRegularTetraCnt_h[i * regularCNT + ID];
              tetraOffset += groupCnt;
              faceOffset  += classRegularGroupsNeighCnt_h[ID] * groupCnt;
              cntAux++;
            }
            ID_aux_PML.clear();
          }
          else
          {
            classRegularPMLGroupsCnt_h[i] = 0;
          }
        }
      }

      // ---- Final tallies ----
      nonregularCNT_Normal = irregularTetras;
      nonregularCNT_PML    = PMLTetras;
      num_elements_regular_PML = numRegPMLTetras;
      cout << "nonregularCNT_Normal = " << nonregularCNT_Normal << endl;
      cout << "nonregularCNT_PML = " << nonregularCNT_PML << endl;
      cout << "num_elements_regular_PML = " << num_elements_regular_PML << endl;


      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      // Create the matrices for the regular groups (4 sets per regular group):
      //      - Loc1E/Loc1H: local matrices that are multiplied by the same type of field (Field1 = Loc1 * Field0 + ...)
      //      - Loc2E/Loc2H: local matrices that are multiplied by the opposite type of field (E1 = Loc2E * H1_2 + ...)
      //      - Neigh1E/Neigh1H: matrices related to the neighbors' opposite field
      //      - Neigh2E/Neigh2H: matrices related to the neighbors' same field
      //
      // *** NOTE: each of these matrices is Column-Major Order
      // *** NOTE: since they are regular, we assume that the elements are conformal and with 4 neighbours
      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////

      int localMatrixSize = TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag];
      int neighMatrixSize = TetPolyOrderDim[PolyFlag] * FacePolyOrderDim[PolyFlag];

      cout << "--------------------------------------------------------------------------------------------------" << endl;


      cout << "regularCNT_Normal = " << regularCNT_Normal << endl;
      cout << "totalRegularNeighFaceCnt = " << totalRegularNeighFaceCnt << endl;

      // Regular (non-PML) groups: one set of local matrices per group and one block of
      // neighbour matrices per group, filled from the representative tetrahedron regionARRAY[i].
      if(regularRegionFlag && regularCNT_Normal > 0)
      {
        cout << "========== FILLING regular ===============" << endl;

        // All sizes and positions are computed in size_t: count * matrixSize evaluated
        // in int would wrap before being widened for the allocation.
        const size_t locElems   = (size_t)localMatrixSize;
        const size_t neighElems = (size_t)neighMatrixSize;
        const size_t locBytes   = (size_t)regularCNT_Normal * locElems * sizeof(fp_t_ts);
        const size_t neighBytes = (size_t)totalRegularNeighFaceCnt * neighElems * sizeof(fp_t_ts);

        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularLoc1E_h, locBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularLoc2E_h, locBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularLoc1H_h, locBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularLoc2H_h, locBytes, cudaHostAllocMapped));

        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularNeigh1E_h, neighBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularNeigh2E_h, neighBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularNeigh1H_h, neighBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularNeigh2H_h, neighBytes, cudaHostAllocMapped));

        size_t localPosition = 0;
        size_t couplingPosition = 0;

        // Regular non-PML group IDs run from 1 to regularCNT_Normal (group 0 is the irregular mesh).
        for(int i = 1; i < regularCNT_Normal+1; i++)
        {
          cout << "Group " << i << endl;
          tet = &(tetARRAY[regionARRAY[i]]);

          // No inverse matrices are requested for regular groups (nullptr arguments).
          tet->prepareCuBLAS(&regularLoc1E_h[localPosition], &regularLoc2E_h[localPosition], &regularNeigh1E_h[couplingPosition], &regularNeigh2E_h[couplingPosition], nullptr,
                             &regularLoc1H_h[localPosition], &regularLoc2H_h[localPosition], &regularNeigh1H_h[couplingPosition], &regularNeigh2H_h[couplingPosition], nullptr);

          localPosition += locElems;
          couplingPosition += (size_t)classRegularGroupsNeighCnt_h[i] * neighElems;
        }
      }
      cout << "Complete regular matrices preparation" << endl;

      cout << "--------------------------------------------------------------------------------------------------" << endl;


      cout << "regularCNT_PML = " << regularCNT_PML << endl;
      cout << "totalRegularPMLNeighFaceCnt = " << totalRegularPMLNeighFaceCnt << endl;

      // Regular PML groups: same layout as the regular non-PML groups plus the six
      // additional per-group matrices that prepareCuBLAS_PML fills (Aux E/H, Loc1/2 M, Loc1/2 J).
      if(regularRegionFlag && regularCNT_PML > 0)
      {

        cout << "========== FILLING regular PML ===============" << endl;

        // Sizes and positions are kept in size_t so the products cannot wrap in int arithmetic.
        const size_t locElems   = (size_t)localMatrixSize;
        const size_t neighElems = (size_t)neighMatrixSize;
        const size_t locBytes   = (size_t)regularCNT_PML * locElems * sizeof(fp_t_ts);
        const size_t neighBytes = (size_t)totalRegularPMLNeighFaceCnt * neighElems * sizeof(fp_t_ts);

        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLLoc1E_h, locBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLLoc2E_h, locBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLLoc1H_h, locBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLLoc2H_h, locBytes, cudaHostAllocMapped));

        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLNeigh1E_h, neighBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLNeigh2E_h, neighBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLNeigh1H_h, neighBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLNeigh2H_h, neighBytes, cudaHostAllocMapped));

        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLAuxE_h, locBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLAuxH_h, locBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLLoc1M_h, locBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLLoc2M_h, locBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLLoc1J_h, locBytes, cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&regularPMLLoc2J_h, locBytes, cudaHostAllocMapped));

        size_t localPosition = 0;
        size_t couplingPosition = 0;

        // Regular PML group IDs follow the regular non-PML ones:
        // regularCNT_Normal+1 ... regularCNT_Normal+regularCNT_PML.
        for(int i = regularCNT_Normal+1; i < regularCNT_Normal+regularCNT_PML+1; i++)
        {
          cout << "Group " << i << endl;
          tet = &(tetARRAY[regionARRAY[i]]);
          cout << "------------" << endl;

          tet->prepareCuBLAS_PML(&regularPMLLoc1E_h[localPosition], &regularPMLLoc2E_h[localPosition],
                                &regularPMLNeigh1E_h[couplingPosition], &regularPMLNeigh2E_h[couplingPosition],
                                &regularPMLLoc1H_h[localPosition], &regularPMLLoc2H_h[localPosition],
                                &regularPMLNeigh1H_h[couplingPosition], &regularPMLNeigh2H_h[couplingPosition],
                                &regularPMLAuxE_h[localPosition], &regularPMLAuxH_h[localPosition],
                                &regularPMLLoc1M_h[localPosition], &regularPMLLoc2M_h[localPosition],
                                &regularPMLLoc1J_h[localPosition],&regularPMLLoc2J_h[localPosition]);


          localPosition += locElems;
          couplingPosition += (size_t)classRegularGroupsNeighCnt_h[i] * neighElems;
        }
      }
      cout << "Complete regular PML matrices preparation" << endl;

      cout << "--------------------------------------------------------------------------------------------------" << endl;


      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      // Calculate the neighbors (number per position + offset) so we know the number of matrices that we are going to need
      // Also, we generate an array that is going to map the ID and the order
      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

      cout << "Neighbor matrices preparation" << endl;

      cout << "tetraCNT = " << tetraCNT << endl;

      int neighCNT = 0;

      CUDA_SAFE_CALL(cudaMallocHost((void**)&mapIdLoc, (size_t)tetraCNT * sizeof(int), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&Neighbours_h, (size_t)tetraCNT * sizeof(int), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&NeighboursOffset_h, (size_t)tetraCNT * sizeof(int), cudaHostAllocMapped));

      // Pass 1: walk the tetrahedra in class order (non-PML then PML inside each class) and record,
      // for each one, its position in that order (mapIdLoc), its neighbour count and the running
      // neighbour offset.
      cntAux = 0;
      for(int i = 0; i < N_class; i++)
      {
        const int classTets = ClassTetraCnt[i] + ClassPMLTetraCnt[i];
        for(int j = 0; j < classTets; j++)
        {
          const int tetId = ClassTetraIndex[i][j];
          tet = &(tetARRAY[tetId]);
          const int nNeigh = tet->get_NeighNum();

          mapIdLoc[tetId] = cntAux;
          Neighbours_h[cntAux] = nNeigh;
          NeighboursOffset_h[cntAux] = neighCNT;

          neighCNT += nNeigh;
          cntAux++;
        }
      }


      cout << "cntAux = " << cntAux << endl;


      // One entry per (neighbour face, face basis function); the size is computed in size_t so
      // that neighCNT * FacePolyOrderDim cannot wrap in int arithmetic.
      CUDA_SAFE_CALL(cudaMallocHost((void**)&NeighMap_h, (size_t)neighCNT * (size_t)FacePolyOrderDim[PolyFlag] * sizeof(int), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&NeighClass_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));
      CUDA_SAFE_CALL(cudaMallocHost((void**)&NeighClassOffset_h, (size_t)N_class * sizeof(int), cudaHostAllocMapped));

      // Pass 2: for every neighbour of every tetrahedron, store the indices of the neighbour's
      // coefficients that live on the shared face (neighbour offset + fac2tet entry).
      int maxNeighClass = 0;
      neighCNT = 0;
      cntAux = 0;
      for(int i = 0; i < N_class; i++)
      {
        NeighClassOffset_h[i] = neighCNT;

        // Non-PML and PML tetrahedra are contiguous in ClassTetraIndex[i] and are handled identically.
        const int classTets = ClassTetraCnt[i] + ClassPMLTetraCnt[i];
        for(int j = 0; j < classTets; j++)
        {
          tet = &(tetARRAY[ClassTetraIndex[i][j]]);
          const int nNeigh = tet->get_NeighNum();

          for(int neigh = 0; neigh < nNeigh; neigh++)
          {
            tetra* neighbor = tet->get_NeighborTetra(neigh);
            int neighFace = tet->getNeighFace(neighbor);
            int offset = mapIdLoc[neighbor->getcnt()] * TetPolyOrderDim[PolyFlag];

            for(int k = 0; k < FacePolyOrderDim[PolyFlag]; k++)
            {
              NeighMap_h[cntAux++] = offset + fac2tet[neighFace][k];
            }
          }

          neighCNT += nNeigh;
        }

        NeighClass_h[i] = neighCNT - NeighClassOffset_h[i];
        maxNeighClass = (int)std::max(maxNeighClass, NeighClass_h[i]);
      }
      cout << "Complete Neighbor matrices preparation" << endl;
      cout << "neighCNT = " << neighCNT << endl;
      cout << "--------------------------------------------------------------------------------------------------" << endl;


      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      // Create the matrices (4 sets per field + inverse for excited elements):
      //      - Loc1E/Loc1H: local matrices that are multiplied by the same type of field (Field1 = Loc1 * Field0 + ...)
      //      - Loc2E/Loc2H: local matrices that are multiplied by the opposite type of field (E1 = Loc2E * H1_2 + ...)
      //      - Neigh1E/Neigh1H: matrices related to the neighbors' opposite field
      //      - Neigh2E/Neigh2H: matrices related to the neighbors' same field
      //      - InvE_h/InvH_h: inverse Mass matrices (only for excited terms)
      //
      // *** NOTE: each of these matrices is Column-Major Order ***
      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////

      cout << "Excitation preparation" << endl;
      cout << "exciCNT = " << exciCNT << endl;


      // Irregular (group 0) non-PML tetrahedra: every tetrahedron gets its own local and
      // neighbour matrices; excited tetrahedra additionally get inverse matrices.
      if (nonregularCNT_Normal > 0)
      {
          cout << "========== FILLING Irregular ===============" << endl;

          // Sizes and positions are computed in size_t: with many tetrahedra,
          // count * matrixSize does not fit in an int.
          const size_t locElems   = (size_t)localMatrixSize;
          const size_t neighElems = (size_t)neighMatrixSize;
          const size_t locBytes   = (size_t)irregularTetras * locElems * sizeof(fp_t_ts);
          const size_t neighBytes = (size_t)irregularNeighbours * neighElems * sizeof(fp_t_ts);
          const size_t invBytes   = (size_t)exciCNT * locElems * sizeof(fp_t_ts);

          CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc1E_h, locBytes, cudaHostAllocMapped));
          CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc2E_h, locBytes, cudaHostAllocMapped));
          CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc1H_h, locBytes, cudaHostAllocMapped));
          CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc2H_h, locBytes, cudaHostAllocMapped));

          CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh1E_h, neighBytes, cudaHostAllocMapped));
          CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh2E_h, neighBytes, cudaHostAllocMapped));
          CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh1H_h, neighBytes, cudaHostAllocMapped));
          CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh2H_h, neighBytes, cudaHostAllocMapped));

          CUDA_SAFE_CALL(cudaMallocHost((void**)&InvE_h, invBytes, cudaHostAllocMapped));
          CUDA_SAFE_CALL(cudaMallocHost((void**)&InvH_h, invBytes, cudaHostAllocMapped));


          std::cout << "Begin irregular CuBLAS preparation" << std::endl;
          std::cout << "N_class = " << N_class << std::endl;

          cout << "irregularTetras = " << irregularTetras << endl;
          cout << "nonregularCNT_Normal = " << nonregularCNT_Normal << endl;

          // The totals were only needed for sizing; they are now re-accumulated as write cursors.
          exciCNT = 0;
          irregularTetras = 0;
          irregularNeighbours = 0;

          //NOTE: this only works because of the order of the tetras in ClassTetraIndex (Exci0 NonExci0 Exci1 ...) where the number is the class
          //NOTE: classRegularTetraCnt_h[i * regularCNT + 0] means that we only take into consideration the group 0 (irregular mesh) since the others were already done in the regular section
          for(int i = 0; i < N_class; i++)
          {

            for(int j = 0; j < nonRegularTetraCnt_h[i]; j++)
            {
              tet = &(tetARRAY[ClassTetraIndex[i][j]]);
              const size_t localPosition = (size_t)irregularTetras * locElems;
              const size_t couplingPosition = (size_t)irregularNeighbours * neighElems;

              // Only the first ClassExcitationCount[i] tetrahedra of the class receive inverse matrices.
              const bool excited = j < ClassExcitationCount[i];
              const size_t invPosition = ((size_t)exciCNT + (size_t)j) * locElems;
              fp_t_ts* InvEptr = excited ? &InvE_h[invPosition] : nullptr;
              fp_t_ts* InvHptr = excited ? &InvH_h[invPosition] : nullptr;

              tet->prepareCuBLAS(&Loc1E_h[localPosition], &Loc2E_h[localPosition], &Neigh1E_h[couplingPosition], &Neigh2E_h[couplingPosition], InvEptr,
                                &Loc1H_h[localPosition], &Loc2H_h[localPosition], &Neigh1H_h[couplingPosition], &Neigh2H_h[couplingPosition], InvHptr);

              irregularTetras++;
              irregularNeighbours += tet->get_NeighNum();

            }
            exciCNT += ClassExcitationCount[i];
          }
          cout << "irregularTetras = " << irregularTetras << endl;
          cout << "exciCNT = " << exciCNT << endl;

      }

      cout << "--------------------------------------------------------------------------------------------------" << endl;


      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      // Create the matrices (4 sets per field + inverse for excited elements):
      //      - Loc1E_PML/Loc1H_PML: local matrices that are multiplied by the same type of field (Field1 = Loc1 * Field0 + ...)
      //      - Loc2E_PML/Loc2H_PML: local matrices that are multiplied by the opposite type of field (E1 = Loc2E * H1_2 + ...)
      //      - Neigh1E_PML/Neigh1H_PML: matrices related to the neighbors' opposite field
      //      - Neigh2E_PML/Neigh2H_PML: matrices related to the neighbors' same field
      //      - InvE_h/InvH_h: inverse Mass matrices (only for excited terms)
      //
      // *** NOTE: each of these matrices is Column-Major Order ***
      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      cout << "nonregularCNT_PML = " << nonregularCNT_PML << endl;

      if (nonregularCNT_PML > 0)
      {
        cout << "========== FILLING PML ===============" << endl;

        CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc1E_PML_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc2E_PML_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc1H_PML_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&Loc2H_PML_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));

        CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh1E_PML_h, PMLNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh2E_PML_h, PMLNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh1H_PML_h, PMLNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&Neigh2H_PML_h, PMLNeighbours * neighMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));

        CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxE_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxH_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxM1_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxJ1_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxM2_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));
        CUDA_SAFE_CALL(cudaMallocHost((void**)&AuxJ2_h, nonregularCNT_PML * localMatrixSize * sizeof(fp_t_ts), cudaHostAllocMapped));

        cout << "PMLTetras = " << PMLTetras << endl;
        cout << "PMLNeighbours = " << PMLNeighbours << endl;

        // Reset counters before starting matrix population
        PMLTetras = 0;
        PMLNeighbours = 0;

        // Loop over all LTS classes
        for (int i = 0; i < N_class; i++)
        {

            for (int j =  ClassTetraCnt[i]; j < ClassTetraCnt[i] + nonRegularPMLTetraCnt_h[i]; j++)
            {
                // Get pointer to the j-th irregular tetrahedron in class i
                tet = &(tetARRAY[ClassTetraIndex[i][j]]);

                // Irregular PML tetrahedron: compute memory positions for local and neighbor matrices
                int localPos = PMLTetras * localMatrixSize;
                int neighPos = PMLNeighbours * neighMatrixSize;

                // Fill in the local, coupling, and auxiliary matrices for this irregular PML tetra
                tet->prepareCuBLAS_PML(&Loc1E_PML_h[localPos], &Loc2E_PML_h[localPos],
                                      &Neigh1E_PML_h[neighPos], &Neigh2E_PML_h[neighPos],
                                      &Loc1H_PML_h[localPos], &Loc2H_PML_h[localPos],
                                      &Neigh1H_PML_h[neighPos], &Neigh2H_PML_h[neighPos],
                                      &AuxE_h[localPos], &AuxH_h[localPos],
                                      &AuxM1_h[localPos], &AuxM2_h[localPos],
                                      &AuxJ1_h[localPos],&AuxJ2_h[localPos]);

                // Increment running totals for irregular PML tetrahedra and their neighbors
                PMLTetras++;
                PMLNeighbours += tet->get_NeighNum();

            }
        }
        cout << "PMLTetras = " << PMLTetras << endl;
      }

      int sizePML = PMLTetras * TetPolyOrderDim[PolyFlag];

      cout << "--------------------------------------------------------------------------------------------------" << endl;


      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      // Check GPU Memory
      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////

      // One row of the memory report: a buffer label and that buffer's size in bytes.
      struct MemItem
      {
          const char* label;
          size_t      bytes;
      };

      // Size in bytes of `elems` elements that each occupy `sizeofT` bytes.
      auto BYTES_ = [](size_t elems, size_t sizeofT) -> size_t
      {
          return elems * sizeofT;
      };

      // Convert a byte count to decimal gigabytes (1 GB = 1e9 bytes).
      auto GB = [](size_t bytes) -> double
      {
          return static_cast<double>(bytes) / 1e9;
      };

      // Total number of bytes over every entry of `v`; an empty vector sums to 0.
      auto sum_bytes = [](const std::vector<MemItem>& v) -> size_t
      {
          size_t total = 0;
          for (size_t k = 0; k < v.size(); ++k)
          {
              total += v[k].bytes;
          }
          return total;
      };

      // ===== Memory accounting (exact, by allocation) ===================================
      const int    TPO = TetPolyOrderDim[PolyFlag];
      const int    FPO = FacePolyOrderDim[PolyFlag];
      const size_t localElems   = static_cast<size_t>(TPO) * TPO;
      const size_t neighElems   = static_cast<size_t>(TPO) * FPO;

      const int    exciCNT_total             = exciCNT;
      const int    irregularTetras_total     = irregularTetras;
      const int    irregularNeighbours_total = irregularNeighbours;
      const int    PMLTetras_total           = PMLTetras;
      const int    PMLNeighbours_total       = PMLNeighbours;

      const int    regNormGroups     = regularCNT_Normal;
      const int    regPMLGroups      = regularCNT_PML;
      const int    regNormFacesTotal = totalRegularNeighFaceCnt;
      const int    regPMLFacesTotal  = totalRegularPMLNeighFaceCnt;

      const size_t sizeFieldElems = sizeField; // already in elements
      const size_t sizePMLElems   = sizePML;   // already in elements (if you keep a global PML state)

      const size_t neighMapElems   = static_cast<size_t>(neighCNT) * FPO;
      const size_t neighboursElems = tetraCNT;
      const size_t auxInElems      = static_cast<size_t>(maxNeighClass) * FPO;
      const size_t auxOutElems     = static_cast<size_t>(maxNeighClass) * TPO;

      const size_t mapElemsPerExci = TPO;
      const size_t tetNdElems      = static_cast<size_t>(NumOfUnitaryVectors) * NumOfNodes        * exciCNT_total;
      const size_t faceNdElems     = static_cast<size_t>(NumOfUnitaryVectors) * NumOfNodesPerFace * excitationFaces;

      // ============ Build accounting vectors matching your allocations ==================
      std::vector<MemItem> excit, prop, state, neighs;

      // ---- Excitation maps & counts ----
      excit.push_back({"mapE (int8)",                 BYTES_(size_t(exciCNT_total) * mapElemsPerExci, sizeof(int8_t))});
      excit.push_back({"mapH (int8)",                 BYTES_(size_t(exciCNT_total) * mapElemsPerExci, sizeof(int8_t))});
      excit.push_back({"ExcitationFacesCnt (int)",    BYTES_(exciCNT_total, sizeof(int))});
      excit.push_back({"ExcitationFacesOffset (int)", BYTES_(exciCNT_total, sizeof(int))});
      excit.push_back({"ExcitationFacesNum (int)",    BYTES_(excitationFaces, sizeof(int))});
      excit.push_back({"nd_coords_tet",               BYTES_(tetNdElems,  sizeof(fp_t_ts))});
      excit.push_back({"nd_coords_face",              BYTES_(faceNdElems, sizeof(fp_t_ts))});
      if (PlaneWaveBCFlag && excitationFaces > 0) {
          excit.push_back({"Z_face_pw",               BYTES_(excitationFaces, sizeof(fp_t_ts))});
      }
      // Inverses only for excitations
      excit.push_back({"InvE",                        BYTES_(size_t(exciCNT_total) * localElems, sizeof(fp_t_ts))});
      excit.push_back({"InvH",                        BYTES_(size_t(exciCNT_total) * localElems, sizeof(fp_t_ts))});

      // ---- Irregular (non-PML) ----
      prop.push_back({"Loc1E (irreg)",   BYTES_(size_t(irregularTetras_total)     * localElems, sizeof(fp_t_ts))});
      prop.push_back({"Loc2E (irreg)",   BYTES_(size_t(irregularTetras_total)     * localElems, sizeof(fp_t_ts))});
      prop.push_back({"Loc1H (irreg)",   BYTES_(size_t(irregularTetras_total)     * localElems, sizeof(fp_t_ts))});
      prop.push_back({"Loc2H (irreg)",   BYTES_(size_t(irregularTetras_total)     * localElems, sizeof(fp_t_ts))});
      prop.push_back({"Neigh1E (irreg)", BYTES_(size_t(irregularNeighbours_total) * neighElems, sizeof(fp_t_ts))});
      prop.push_back({"Neigh2E (irreg)", BYTES_(size_t(irregularNeighbours_total) * neighElems, sizeof(fp_t_ts))});
      prop.push_back({"Neigh1H (irreg)", BYTES_(size_t(irregularNeighbours_total) * neighElems, sizeof(fp_t_ts))});
      prop.push_back({"Neigh2H (irreg)", BYTES_(size_t(irregularNeighbours_total) * neighElems, sizeof(fp_t_ts))});

      // ---- Regular (non-PML) ----
      if (regNormGroups > 0) {
          prop.push_back({"regularLoc1E",    BYTES_(size_t(regNormGroups)      * localElems, sizeof(fp_t_ts))});
          prop.push_back({"regularLoc2E",    BYTES_(size_t(regNormGroups)      * localElems, sizeof(fp_t_ts))});
          prop.push_back({"regularLoc1H",    BYTES_(size_t(regNormGroups)      * localElems, sizeof(fp_t_ts))});
          prop.push_back({"regularLoc2H",    BYTES_(size_t(regNormGroups)      * localElems, sizeof(fp_t_ts))});
          prop.push_back({"regularNeigh1E",  BYTES_(size_t(regNormFacesTotal)  * neighElems, sizeof(fp_t_ts))});
          prop.push_back({"regularNeigh2E",  BYTES_(size_t(regNormFacesTotal)  * neighElems, sizeof(fp_t_ts))});
          prop.push_back({"regularNeigh1H",  BYTES_(size_t(regNormFacesTotal)  * neighElems, sizeof(fp_t_ts))});
          prop.push_back({"regularNeigh2H",  BYTES_(size_t(regNormFacesTotal)  * neighElems, sizeof(fp_t_ts))});
      }

      // ---- Regular PML ----
      if (regPMLGroups > 0)
      {
          prop.push_back({"regularPMLLoc1E",   BYTES_(size_t(regPMLGroups)      * localElems, sizeof(fp_t_ts))});
          prop.push_back({"regularPMLLoc2E",   BYTES_(size_t(regPMLGroups)      * localElems, sizeof(fp_t_ts))});
          prop.push_back({"regularPMLLoc1H",   BYTES_(size_t(regPMLGroups)      * localElems, sizeof(fp_t_ts))});
          prop.push_back({"regularPMLLoc2H",   BYTES_(size_t(regPMLGroups)      * localElems, sizeof(fp_t_ts))});
          prop.push_back({"regularPMLNeigh1E", BYTES_(size_t(regPMLFacesTotal)  * neighElems, sizeof(fp_t_ts))});
          prop.push_back({"regularPMLNeigh2E", BYTES_(size_t(regPMLFacesTotal)  * neighElems, sizeof(fp_t_ts))});
          prop.push_back({"regularPMLNeigh1H", BYTES_(size_t(regPMLFacesTotal)  * neighElems, sizeof(fp_t_ts))});
          prop.push_back({"regularPMLNeigh2H", BYTES_(size_t(regPMLFacesTotal)  * neighElems, sizeof(fp_t_ts))});

          prop.push_back({"regularPMLAuxE",    BYTES_(size_t(regPMLGroups)      * localElems, sizeof(fp_t_ts))});
          prop.push_back({"regularPMLAuxH",    BYTES_(size_t(regPMLGroups)      * localElems, sizeof(fp_t_ts))});
          prop.push_back({"regularPMLLoc1M",   BYTES_(size_t(regPMLGroups)      * localElems, sizeof(fp_t_ts))});
          prop.push_back({"regularPMLLoc2M",   BYTES_(size_t(regPMLGroups)      * localElems, sizeof(fp_t_ts))});
          prop.push_back({"regularPMLLoc1J",   BYTES_(size_t(regPMLGroups)      * localElems, sizeof(fp_t_ts))});
          prop.push_back({"regularPMLLoc2J",   BYTES_(size_t(regPMLGroups)      * localElems, sizeof(fp_t_ts))});

          // per-element state for regular-PML region
          state.push_back({"r_Mn",             BYTES_(size_t(numRegPMLTetras)   * localElems, sizeof(fp_t_ts))});
          state.push_back({"r_Mn1",            BYTES_(size_t(numRegPMLTetras)   * localElems, sizeof(fp_t_ts))});
          state.push_back({"r_Jn12",           BYTES_(size_t(numRegPMLTetras)   * localElems, sizeof(fp_t_ts))});
          state.push_back({"r_Jn32",           BYTES_(size_t(numRegPMLTetras)   * localElems, sizeof(fp_t_ts))});
      }

      // ---- Irregular PML ----
      if (PMLTetras_total > 0)
      {
          prop.push_back({"Loc1E_PML",   BYTES_(size_t(PMLTetras_total)       * localElems, sizeof(fp_t_ts))});
          prop.push_back({"Loc2E_PML",   BYTES_(size_t(PMLTetras_total)       * localElems, sizeof(fp_t_ts))});
          prop.push_back({"Loc1H_PML",   BYTES_(size_t(PMLTetras_total)       * localElems, sizeof(fp_t_ts))});
          prop.push_back({"Loc2H_PML",   BYTES_(size_t(PMLTetras_total)       * localElems, sizeof(fp_t_ts))});

          prop.push_back({"Neigh1E_PML", BYTES_(size_t(PMLNeighbours_total)   * neighElems, sizeof(fp_t_ts))});
          prop.push_back({"Neigh2E_PML", BYTES_(size_t(PMLNeighbours_total)   * neighElems, sizeof(fp_t_ts))});
          prop.push_back({"Neigh1H_PML", BYTES_(size_t(PMLNeighbours_total)   * neighElems, sizeof(fp_t_ts))});
          prop.push_back({"Neigh2H_PML", BYTES_(size_t(PMLNeighbours_total)   * neighElems, sizeof(fp_t_ts))});

          prop.push_back({"AuxE",        BYTES_(size_t(PMLTetras_total)       * localElems, sizeof(fp_t_ts))});
          prop.push_back({"AuxH",        BYTES_(size_t(PMLTetras_total)       * localElems, sizeof(fp_t_ts))});
          prop.push_back({"AuxM1",       BYTES_(size_t(PMLTetras_total)       * localElems, sizeof(fp_t_ts))});
          prop.push_back({"AuxJ1",       BYTES_(size_t(PMLTetras_total)       * localElems, sizeof(fp_t_ts))});
          prop.push_back({"AuxM2",       BYTES_(size_t(PMLTetras_total)       * localElems, sizeof(fp_t_ts))});
          prop.push_back({"AuxJ2",       BYTES_(size_t(PMLTetras_total)       * localElems, sizeof(fp_t_ts))});

          // per-element PML state arrays
          state.push_back({"Mn",          BYTES_(size_t(PMLTetras_total)       * localElems, sizeof(fp_t_ts))});
          state.push_back({"Mn1",         BYTES_(size_t(PMLTetras_total)       * localElems, sizeof(fp_t_ts))});
          state.push_back({"Jn12",        BYTES_(size_t(PMLTetras_total)       * localElems, sizeof(fp_t_ts))});
          state.push_back({"Jn32",        BYTES_(size_t(PMLTetras_total)       * localElems, sizeof(fp_t_ts))});
      }

      // ---- Global field buffers ----
      state.push_back({"En",    BYTES_(sizeFieldElems, sizeof(fp_t_ts))});
      state.push_back({"En1",   BYTES_(sizeFieldElems, sizeof(fp_t_ts))});
      state.push_back({"Hn12",  BYTES_(sizeFieldElems, sizeof(fp_t_ts))});
      state.push_back({"Hn32",  BYTES_(sizeFieldElems, sizeof(fp_t_ts))});

      // ---- Neighbor maps/structs ----
      neighs.push_back({"NeighMap (int)",         BYTES_(neighMapElems,   sizeof(int))});
      neighs.push_back({"Neighbours (int)",       BYTES_(neighboursElems, sizeof(int))});
      neighs.push_back({"NeighboursOffset (int)", BYTES_(neighboursElems, sizeof(int))});
      neighs.push_back({"auxFieldInput",          BYTES_(auxInElems,  sizeof(fp_t_ts))});
      neighs.push_back({"auxFieldOutput",         BYTES_(auxOutElems, sizeof(fp_t_ts))});

      // ============================ Totals & printing ===================================
      const size_t bytesExcit = sum_bytes(excit);
      const size_t bytesProp  = sum_bytes(prop);
      const size_t bytesState = sum_bytes(state);
      const size_t bytesNeigh = sum_bytes(neighs);

      const double factor = usageSecurityThresholdFactor; // e.g., 1.05
      const double gExcit = GB(bytesExcit) * factor;
      const double gProp  = GB(bytesProp ) * factor;
      const double gState = GB(bytesState) * factor;
      const double gNeigh = GB(bytesNeigh) * factor;
      const double gTotal = gExcit + gProp + gState + gNeigh;

      size_t free_cudamem=0, total_cudamem=0;
      CUDA_SAFE_CALL(cudaMemGetInfo(&free_cudamem, &total_cudamem));

      // Print one table row (category, buffer label, size in decimal GB) to std::cout
      // for every entry of `v` whose byte count is non-zero.
      //   category     : text written in the first (16-wide) column of every row
      //   v            : rows to print; taken by value so the optional sort below
      //                  reorders only this local copy
      //   sort_by_size : when true, rows are printed largest-first
      auto print_rows = [](const char* category, std::vector<MemItem> v, bool sort_by_size = true)
      {
          if (sort_by_size)
          {
              auto larger_first = [](const MemItem& lhs, const MemItem& rhs)
              {
                  return lhs.bytes > rhs.bytes;
              };
              std::sort(v.begin(), v.end(), larger_first);
          }

          for (auto& row : v)
          {
              // Zero-sized buffers are left out of the table.
              if (!row.bytes)
              {
                  continue;
              }

              const double sizeInGB = double(row.bytes) / 1e9;
              std::cout << std::left  << std::setw(16) << category
                        << std::setw(36) << row.label
                        << std::right << std::setw(12) << std::fixed << std::setprecision(6)
                        << sizeInGB << '\n';
          }
      };

      std::cout << "============================================================================================\n";
      std::cout << std::left << std::setw(16) << "Category"
                << std::setw(36) << "Buffer"
                << std::right << std::setw(12) << "Size [GB]" << '\n';
      std::cout << "--------------------------------------------------------------------------------------------\n";

      print_rows("Excitation",  excit);
      print_rows("Propagation", prop);
      print_rows("Fields/State",state);
      print_rows("Neighbors",   neighs);

      std::cout << "--------------------------------------------------------------------------------------------\n";
      std::cout << std::left << std::setw(16) << "TOTALS"
                << std::setw(36) << "Excitation"
                << std::right << std::setw(12) << std::fixed << std::setprecision(6) << gExcit << '\n';
      std::cout << std::left << std::setw(16) << "TOTALS"
                << std::setw(36) << "Propagation"
                << std::right << std::setw(12) << std::fixed << std::setprecision(6) << gProp  << '\n';
      std::cout << std::left << std::setw(16) << "TOTALS"
                << std::setw(36) << "Fields/State"
                << std::right << std::setw(12) << std::fixed << std::setprecision(6) << gState << '\n';
      std::cout << std::left << std::setw(16) << "TOTALS"
                << std::setw(36) << "Neighbors"
                << std::right << std::setw(12) << std::fixed << std::setprecision(6) << gNeigh << '\n';
      std::cout << std::left << std::setw(16) << "TOTAL (est.)"
                << std::setw(36) << ""
                << std::right << std::setw(12) << std::fixed << std::setprecision(6) << gTotal << '\n';

      std::cout << "--------------------------------------------------------------------------------------------\n";
      std::cout << "GPU Memory Free / Total [GB]: "
                << std::fixed << std::setprecision(2)
                << double(free_cudamem)/1e9 << " / " << double(total_cudamem)/1e9 << '\n';
      std::cout << "============================================================================================\n";


      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      // Copy to GPU Memory
      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////

      // ---- Excitation maps & counts -------------------------------------------------
      CUDA_SAFE_MALLOC(mapE_d,                  BYTES(int8_t, exciCNT_total * mapElemsPerExci));
      CUDA_SAFE_MALLOC(mapH_d,                  BYTES(int8_t, exciCNT_total * mapElemsPerExci));
      CUDA_SAFE_MALLOC(ExcitationFacesCnt_d,    BYTES(int,    exciCNT_total));
      CUDA_SAFE_MALLOC(ExcitationFacesOffset_d, BYTES(int,    exciCNT_total));
      CUDA_SAFE_MALLOC(ExcitationFacesNum_d,    BYTES(int,    excitationFaces));
      CUDA_SAFE_MALLOC(nd_coords_tet_d,         BYTES(fp_t_ts, tetNdElems));
      CUDA_SAFE_MALLOC(nd_coords_face_d,        BYTES(fp_t_ts, faceNdElems));
      if (PlaneWaveBCFlag)
      {
          CUDA_SAFE_MALLOC(Z_face_pw_d,         BYTES(fp_t_ts, excitationFaces));
      }

      // --- Allocate precomputed tangential fields (only port faces) ---
      if (portCNT > 0)
      {
        CUDA_SAFE_CALL(cudaMalloc((void**)&Etan_qp_d, excitationFaces * Q * 3 * sizeof(fp_t_ts)));
        CUDA_SAFE_CALL(cudaMalloc((void**)&Htan_qp_d, excitationFaces * Q * 3 * sizeof(fp_t_ts)));

        CUDA_SAFE_CALL(cudaMalloc((void**)&PortFacePidx_d, excitationFaces * sizeof(int)));
        const int nPorts = (int)portExcitations.size();
        CUDA_SAFE_CALL(cudaMalloc((void**)&ExcitationProps_d, nPorts * sizeof(ExcitationProp)));
      }


      CUDA_SAFE_COPY(mapE_d,  mapE_h,  BYTES(int8_t, exciCNT_total * mapElemsPerExci));
      CUDA_SAFE_COPY(mapH_d,  mapH_h,  BYTES(int8_t, exciCNT_total * mapElemsPerExci));
      CUDA_SAFE_COPY(ExcitationFacesCnt_d,    ExcitationFacesCnt_h,    BYTES(int, exciCNT_total));
      CUDA_SAFE_COPY(ExcitationFacesOffset_d, ExcitationFacesOffset_h, BYTES(int, exciCNT_total));
      CUDA_SAFE_COPY(ExcitationFacesNum_d,    ExcitationFacesNum_h,    BYTES(int, excitationFaces));
      CUDA_SAFE_COPY(nd_coords_tet_d,         nd_coords_tet_h,         BYTES(fp_t_ts, tetNdElems));
      CUDA_SAFE_COPY(nd_coords_face_d,        nd_coords_face_h,        BYTES(fp_t_ts, faceNdElems));

      if (PlaneWaveBCFlag)
      {
          CUDA_SAFE_COPY(Z_face_pw_d, Z_face_pw_h, BYTES(fp_t_ts, excitationFaces));
      }

      // --- copy precomputed tangential fields (only port faces) ---
      if (portCNT > 0)
      {
        cout << "Export Etan and Htan" << endl;
        CUDA_SAFE_CALL(cudaMemset(Etan_qp_d, 0.0, BYTES(fp_t_ts, excitationFaces * Q * 3)));
        CUDA_SAFE_CALL(cudaMemset(Htan_qp_d, 0.0, BYTES(fp_t_ts, excitationFaces * Q * 3)));
        CUDA_SAFE_COPY(Etan_qp_d, Etan_qp_h, BYTES(fp_t_ts, excitationFaces * Q * 3));
        CUDA_SAFE_COPY(Htan_qp_d, Htan_qp_h, BYTES(fp_t_ts, excitationFaces * Q * 3));
        CUDA_SAFE_COPY(PortFacePidx_d,    PortFacePidx_h,    BYTES(int, excitationFaces));
        const int nPorts = (int)portExcitations.size();
        CUDA_SAFE_COPY(ExcitationProps_d,    portExcitations.data(),   nPorts * sizeof(ExcitationProp));

      }


      // ---- Irregular (non-PML) -----------------------------------------------------
      CUDA_SAFE_MALLOC(Loc1E_d,   BYTES(fp_t_ts, irregularTetras_total   * localElems));
      CUDA_SAFE_MALLOC(Loc2E_d,   BYTES(fp_t_ts, irregularTetras_total   * localElems));
      CUDA_SAFE_MALLOC(Loc1H_d,   BYTES(fp_t_ts, irregularTetras_total   * localElems));
      CUDA_SAFE_MALLOC(Loc2H_d,   BYTES(fp_t_ts, irregularTetras_total   * localElems));
      CUDA_SAFE_MALLOC(Neigh1E_d, BYTES(fp_t_ts, irregularNeighbours_total * neighElems));
      CUDA_SAFE_MALLOC(Neigh2E_d, BYTES(fp_t_ts, irregularNeighbours_total * neighElems));
      CUDA_SAFE_MALLOC(Neigh1H_d, BYTES(fp_t_ts, irregularNeighbours_total * neighElems));
      CUDA_SAFE_MALLOC(Neigh2H_d, BYTES(fp_t_ts, irregularNeighbours_total * neighElems));

      // Inverses only for excitations
      CUDA_SAFE_MALLOC(InvE_d, BYTES(fp_t_ts, exciCNT_total * localElems));
      CUDA_SAFE_MALLOC(InvH_d, BYTES(fp_t_ts, exciCNT_total * localElems));


      // Irregular (non-PML)
      CUDA_SAFE_COPY(Loc1E_d, Loc1E_h, BYTES(fp_t_ts, irregularTetras_total    * localElems));
      CUDA_SAFE_COPY(Loc2E_d, Loc2E_h, BYTES(fp_t_ts, irregularTetras_total    * localElems));
      CUDA_SAFE_COPY(Loc1H_d, Loc1H_h, BYTES(fp_t_ts, irregularTetras_total    * localElems));
      CUDA_SAFE_COPY(Loc2H_d, Loc2H_h, BYTES(fp_t_ts, irregularTetras_total    * localElems));
      CUDA_SAFE_COPY(Neigh1E_d, Neigh1E_h, BYTES(fp_t_ts, irregularNeighbours_total * neighElems));
      CUDA_SAFE_COPY(Neigh2E_d, Neigh2E_h, BYTES(fp_t_ts, irregularNeighbours_total * neighElems));
      CUDA_SAFE_COPY(Neigh1H_d, Neigh1H_h, BYTES(fp_t_ts, irregularNeighbours_total * neighElems));
      CUDA_SAFE_COPY(Neigh2H_d, Neigh2H_h, BYTES(fp_t_ts, irregularNeighbours_total * neighElems));
      CUDA_SAFE_COPY(InvE_d, InvE_h, BYTES(fp_t_ts, exciCNT_total * localElems));
      CUDA_SAFE_COPY(InvH_d, InvH_h, BYTES(fp_t_ts, exciCNT_total * localElems));


      // ---- Regular (prototype per group) -------------------------------------------
      // Use exact counts — NOT (regularCNT - 1) or "*4"
      if (regularRegionFlag)
      {

          if (regNormGroups > 0)
          {
            CUDA_SAFE_MALLOC(regularLoc1E_d, BYTES(fp_t_ts, static_cast<size_t>(regNormGroups) * localElems));
            CUDA_SAFE_MALLOC(regularLoc2E_d, BYTES(fp_t_ts, static_cast<size_t>(regNormGroups) * localElems));
            CUDA_SAFE_MALLOC(regularLoc1H_d, BYTES(fp_t_ts, static_cast<size_t>(regNormGroups) * localElems));
            CUDA_SAFE_MALLOC(regularLoc2H_d, BYTES(fp_t_ts, static_cast<size_t>(regNormGroups) * localElems));

            CUDA_SAFE_MALLOC(regularNeigh1E_d, BYTES(fp_t_ts, static_cast<size_t>(regNormFacesTotal) * neighElems));
            CUDA_SAFE_MALLOC(regularNeigh2E_d, BYTES(fp_t_ts, static_cast<size_t>(regNormFacesTotal) * neighElems));
            CUDA_SAFE_MALLOC(regularNeigh1H_d, BYTES(fp_t_ts, static_cast<size_t>(regNormFacesTotal) * neighElems));
            CUDA_SAFE_MALLOC(regularNeigh2H_d, BYTES(fp_t_ts, static_cast<size_t>(regNormFacesTotal) * neighElems));

            CUDA_SAFE_COPY(regularLoc1E_d, regularLoc1E_h, BYTES(fp_t_ts, static_cast<size_t>(regNormGroups) * localElems));
            CUDA_SAFE_COPY(regularLoc2E_d, regularLoc2E_h, BYTES(fp_t_ts, static_cast<size_t>(regNormGroups) * localElems));
            CUDA_SAFE_COPY(regularLoc1H_d, regularLoc1H_h, BYTES(fp_t_ts, static_cast<size_t>(regNormGroups) * localElems));
            CUDA_SAFE_COPY(regularLoc2H_d, regularLoc2H_h, BYTES(fp_t_ts, static_cast<size_t>(regNormGroups) * localElems));
            CUDA_SAFE_COPY(regularNeigh1E_d, regularNeigh1E_h, BYTES(fp_t_ts, static_cast<size_t>(regNormFacesTotal) * neighElems));
            CUDA_SAFE_COPY(regularNeigh2E_d, regularNeigh2E_h, BYTES(fp_t_ts, static_cast<size_t>(regNormFacesTotal) * neighElems));
            CUDA_SAFE_COPY(regularNeigh1H_d, regularNeigh1H_h, BYTES(fp_t_ts, static_cast<size_t>(regNormFacesTotal) * neighElems));
            CUDA_SAFE_COPY(regularNeigh2H_d, regularNeigh2H_h, BYTES(fp_t_ts, static_cast<size_t>(regNormFacesTotal) * neighElems));

          }

          if (regPMLGroups > 0)
          {
            // PML-regular
            CUDA_SAFE_MALLOC(regularPMLLoc1E_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_MALLOC(regularPMLLoc2E_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_MALLOC(regularPMLLoc1H_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_MALLOC(regularPMLLoc2H_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));

            CUDA_SAFE_MALLOC(regularPMLNeigh1E_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLFacesTotal) * neighElems));
            CUDA_SAFE_MALLOC(regularPMLNeigh2E_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLFacesTotal) * neighElems));
            CUDA_SAFE_MALLOC(regularPMLNeigh1H_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLFacesTotal) * neighElems));
            CUDA_SAFE_MALLOC(regularPMLNeigh2H_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLFacesTotal) * neighElems));

            // PML auxiliaries for regular-PML prototypes (if used)
            CUDA_SAFE_MALLOC(regularPMLAuxE_d,  BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_MALLOC(regularPMLAuxH_d,  BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_MALLOC(regularPMLLoc1M_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_MALLOC(regularPMLLoc2M_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_MALLOC(regularPMLLoc1J_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_MALLOC(regularPMLLoc2J_d, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));

            // PML-regular
            CUDA_SAFE_COPY(regularPMLLoc1E_d, regularPMLLoc1E_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_COPY(regularPMLLoc2E_d, regularPMLLoc2E_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_COPY(regularPMLLoc1H_d, regularPMLLoc1H_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_COPY(regularPMLLoc2H_d, regularPMLLoc2H_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_COPY(regularPMLNeigh1E_d, regularPMLNeigh1E_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLFacesTotal) * neighElems));
            CUDA_SAFE_COPY(regularPMLNeigh2E_d, regularPMLNeigh2E_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLFacesTotal) * neighElems));
            CUDA_SAFE_COPY(regularPMLNeigh1H_d, regularPMLNeigh1H_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLFacesTotal) * neighElems));
            CUDA_SAFE_COPY(regularPMLNeigh2H_d, regularPMLNeigh2H_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLFacesTotal) * neighElems));

            CUDA_SAFE_COPY(regularPMLAuxE_d,  regularPMLAuxE_h,  BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_COPY(regularPMLAuxH_d,  regularPMLAuxH_h,  BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_COPY(regularPMLLoc1M_d, regularPMLLoc1M_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_COPY(regularPMLLoc2M_d, regularPMLLoc2M_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_COPY(regularPMLLoc1J_d, regularPMLLoc1J_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));
            CUDA_SAFE_COPY(regularPMLLoc2J_d, regularPMLLoc2J_h, BYTES(fp_t_ts, static_cast<size_t>(regPMLGroups) * localElems));

            CUDA_SAFE_MALLOC(r_Mn_d,   BYTES(fp_t_ts, static_cast<size_t>(numRegPMLTetras) * localElems));
            CUDA_SAFE_MALLOC(r_Mn1_d,  BYTES(fp_t_ts, static_cast<size_t>(numRegPMLTetras) * localElems));
            CUDA_SAFE_MALLOC(r_Jn12_d, BYTES(fp_t_ts, static_cast<size_t>(numRegPMLTetras) * localElems));
            CUDA_SAFE_MALLOC(r_Jn32_d, BYTES(fp_t_ts, static_cast<size_t>(numRegPMLTetras) * localElems));

            CUDA_SAFE_ZERO(r_Mn_d,   BYTES(fp_t_ts, static_cast<size_t>(numRegPMLTetras) * localElems));
            CUDA_SAFE_ZERO(r_Mn1_d,  BYTES(fp_t_ts, static_cast<size_t>(numRegPMLTetras) * localElems));
            CUDA_SAFE_ZERO(r_Jn12_d, BYTES(fp_t_ts, static_cast<size_t>(numRegPMLTetras) * localElems));
            CUDA_SAFE_ZERO(r_Jn32_d, BYTES(fp_t_ts, static_cast<size_t>(numRegPMLTetras) * localElems));

          }
      }

      // ---- Irregular PML (per element) ---------------------------------------------
      cout << "Non regular PMLTetras_total = " << PMLTetras_total << endl;
      if (PMLTetras_total > 0)
      {
          CUDA_SAFE_MALLOC(Loc1E_PML_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total)      * localElems));
          CUDA_SAFE_MALLOC(Loc2E_PML_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total)      * localElems));
          CUDA_SAFE_MALLOC(Loc1H_PML_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total)      * localElems));
          CUDA_SAFE_MALLOC(Loc2H_PML_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total)      * localElems));

          CUDA_SAFE_MALLOC(Neigh1E_PML_d, BYTES(fp_t_ts, static_cast<size_t>(PMLNeighbours_total) * neighElems));
          CUDA_SAFE_MALLOC(Neigh2E_PML_d, BYTES(fp_t_ts, static_cast<size_t>(PMLNeighbours_total) * neighElems));
          CUDA_SAFE_MALLOC(Neigh1H_PML_d, BYTES(fp_t_ts, static_cast<size_t>(PMLNeighbours_total) * neighElems));
          CUDA_SAFE_MALLOC(Neigh2H_PML_d, BYTES(fp_t_ts, static_cast<size_t>(PMLNeighbours_total) * neighElems));

          CUDA_SAFE_MALLOC(AuxE_d,  BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_MALLOC(AuxH_d,  BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_MALLOC(AuxM1_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_MALLOC(AuxJ1_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_MALLOC(AuxM2_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_MALLOC(AuxJ2_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));


          CUDA_SAFE_COPY(Loc1E_PML_d, Loc1E_PML_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_COPY(Loc2E_PML_d, Loc2E_PML_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_COPY(Loc1H_PML_d, Loc1H_PML_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_COPY(Loc2H_PML_d, Loc2H_PML_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));

          CUDA_SAFE_COPY(Neigh1E_PML_d, Neigh1E_PML_h, BYTES(fp_t_ts, static_cast<size_t>(PMLNeighbours_total) * neighElems));
          CUDA_SAFE_COPY(Neigh2E_PML_d, Neigh2E_PML_h, BYTES(fp_t_ts, static_cast<size_t>(PMLNeighbours_total) * neighElems));
          CUDA_SAFE_COPY(Neigh1H_PML_d, Neigh1H_PML_h, BYTES(fp_t_ts, static_cast<size_t>(PMLNeighbours_total) * neighElems));
          CUDA_SAFE_COPY(Neigh2H_PML_d, Neigh2H_PML_h, BYTES(fp_t_ts, static_cast<size_t>(PMLNeighbours_total) * neighElems));

          CUDA_SAFE_COPY(AuxE_d,  AuxE_h,  BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_COPY(AuxH_d,  AuxH_h,  BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_COPY(AuxM1_d, AuxM1_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_COPY(AuxJ1_d, AuxJ1_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_COPY(AuxM2_d, AuxM2_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_COPY(AuxJ2_d, AuxJ2_h, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));

          CUDA_SAFE_MALLOC(Mn_d,   BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_MALLOC(Mn1_d,  BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_MALLOC(Jn12_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_MALLOC(Jn32_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));

          CUDA_SAFE_ZERO(Mn_d,   BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_ZERO(Mn1_d,  BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_ZERO(Jn12_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));
          CUDA_SAFE_ZERO(Jn32_d, BYTES(fp_t_ts, static_cast<size_t>(PMLTetras_total) * localElems));

      }


      // ---- Global field buffers -----------------------------------------------------
      CUDA_SAFE_MALLOC(En_d,   BYTES(fp_t_ts, sizeFieldElems));
      CUDA_SAFE_MALLOC(En1_d,  BYTES(fp_t_ts, sizeFieldElems));
      CUDA_SAFE_MALLOC(Hn12_d, BYTES(fp_t_ts, sizeFieldElems));
      CUDA_SAFE_MALLOC(Hn32_d, BYTES(fp_t_ts, sizeFieldElems));

      // Fields zero init
      CUDA_SAFE_ZERO(En_d,   BYTES(fp_t_ts, sizeFieldElems));
      CUDA_SAFE_ZERO(En1_d,  BYTES(fp_t_ts, sizeFieldElems));
      CUDA_SAFE_ZERO(Hn12_d, BYTES(fp_t_ts, sizeFieldElems));
      CUDA_SAFE_ZERO(Hn32_d, BYTES(fp_t_ts, sizeFieldElems));


      // ---- Neighbor maps ------------------------------------------------------------
      CUDA_SAFE_MALLOC(NeighMap_d,         BYTES(int,    neighMapElems));
      CUDA_SAFE_MALLOC(Neighbours_d,       BYTES(int,    neighboursElems));
      CUDA_SAFE_MALLOC(NeighboursOffset_d, BYTES(int,    neighboursElems));
      CUDA_SAFE_MALLOC(auxFieldInput,      BYTES(fp_t_ts, auxInElems));
      CUDA_SAFE_MALLOC(auxFieldOutput,     BYTES(fp_t_ts, auxOutElems));

      // Neighbor structures
      CUDA_SAFE_COPY(NeighMap_d,         NeighMap_h,         BYTES(int,    neighMapElems));
      CUDA_SAFE_COPY(Neighbours_d,       Neighbours_h,       BYTES(int,    neighboursElems));
      CUDA_SAFE_COPY(NeighboursOffset_d, NeighboursOffset_h, BYTES(int,    neighboursElems));
    }


    void FemGrp::TimeSteppingCuBLAS()
    {
      fp_t InitTime = 0.0;

      fp_t Frequency = freq;
      fp_t dt_nyquist = 1.0 / (2.0 * Frequency * MEGA);
      fp_t dt_sample = (1 / SamplingRate) * dt_nyquist;
      tsPerSampling = (int)ceil(dt_sample / LocTimeSteps[N_class - 1]);
      dt_sample = tsPerSampling * LocTimeSteps[N_class - 1];

      if(FinalTime > 0)
        NtimeSteps = (int)ceil((FinalTime - InitTime) / LocTimeSteps[N_class -1]); // number of time steps for the biggest time step size
      else
        NtimeSteps = 0;

      if(usePade){
        fp_t earlyTime = 10 * Length(maxPoint - minPoint) / Vo;
        /*7.5 (for saftey use 10) is empirical because in "Early Time Behavior in Reverberation Chambers and
                                        Its Effect on the Relationships Between Coherence
                                        Bandwidth, Chamber Decay Time, RMS Delay
                                        Spread, and the Chamber Buildup Time", Christopher L. Holloway et al.
                                        the value of 3/2 is suggested from equation 30  */
        tsPerPade = (int)ceil(earlyTime / LocTimeSteps[N_class -1]);
        tsPerPade = tsPerPade + tsPerSampling - tsPerPade % tsPerSampling;

        fieldProbes = new fp_t_ts[probeCNT * (int)ceil((1.0 * NtimeSteps) / tsPerSampling) * NumOfFieldComponents];

        CUDA_SAFE_CALL(cudaMallocHost((void**)&tranferencePadeFunctionFD_h, padeCNT * (int)ceil((1.0 * NtimeSteps) / tsPerSampling) * NumOfFieldComponents * sizeof(cuDoubleComplex), cudaHostAllocMapped));

        getPadeFreq((int)ceil((1.0 * NtimeSteps) / tsPerSampling), tsPerSampling);
      }

      Write_TD_Data(tsPerSampling, NtimeSteps);

      //Output precision set to 15 digits
      cout.precision(15);

      //Print out data used in the computation
      cout << endl;
      cout << "=============================================" << endl;
      cout << "== Running CUDA Implementation (Non-Heavy) ==" << endl;
      cout << "=============================================" << endl;
      cout << endl;
      cout << "==========================================" << endl;
      cout << "          PERFORMING INFORMATION          " << endl;
      cout << "==========================================" << endl;

      if(FinalTime > 0)
        cout << " Final Time(sec)         = " << FinalTime << endl;
      else
        cout << " Final Time              = " << "TBD" << endl;

      cout << " Time Step, dt(sec)      = " << LocTimeSteps[N_class -1] << endl;
      cout << " Number of Tetrahedra    = " << tetraCNT << endl;
      cout << " Number of Classes       = " << N_class << endl;

      if(FinalTime > 0)
        cout << " Number of Time Steps    = " << NtimeSteps << endl;

      for(int i = 0; i < N_class ; i++){
        cout << " LocTimeSteps[" << i << "]         = " << LocTimeSteps[i] << endl;
      }

      cout << endl;
      cout << " dt_nyquist              = " << dt_nyquist << endl;
      cout << " dt_sample               = " << dt_sample << endl;
      cout << " tsPerSampling           = " << tsPerSampling << endl;

      if(FinalTime > 0)
        cout << " Number of samplings     = " << (int)ceil((1.0 * NtimeSteps) / tsPerSampling) << endl;

      if(usePade){
        cout << " Time Steps / Pade Calc  = " << tsPerPade << endl;
      }
      cout << "==========================================" << endl;
      cout << endl;

      //Memory status
      SYSTEM_MEM_USAGE();
      cout << endl;

      cout << " " << endl;
      cout << "===================================================" << endl;
      cout << "               Local Time-Stepping Loop            " << endl;
      cout << "===================================================" << endl;

      // Variables for time tracking
      size_t total_time = 0;
      fp_t current_time = 0;
      bool exitBool = false;
      current_time -= (double)dt_sample * 1e9;

      if(FinalTime <= 0){
        NtimeSteps = NumOfSampleEnergyCheck * tsPerSampling + 1;
        fieldEnergy = 0;
        maxFieldEnergy = 0;
        if(numberOfEnergyPoints == 0){
          numberOfEnergyPoints = probeCNT;
        }
      }

      cublasHandle_t handle;
      cublasCreate(&handle);

      timer_start("Time Stepping", ' ');
      timer_start("Start Time Stepping", 'm');
      for(int n = 0; n < NtimeSteps; n++){
        ComputeE_cuBLAS(handle, N_class - 1);
        ComputeH_cuBLAS(handle, N_class - 1);


        if(n % tsPerSampling == 0)
        {

          CUDA_SAFE_CALL(cudaMemcpy(En1_h, En1_d, tetraCNT * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToHost));
          CUDA_SAFE_CALL(cudaMemcpy(Hn32_h, Hn32_d, tetraCNT * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToHost));

          CUDA_SAFE_CALL(cudaDeviceSynchronize());

          total_time += timer_stop('m');
          if(write_probes && probeCNT > 0)
          {
            writeFieldProbeCuBLAS(n);
            if(write_AnalyticalIncidentProbes){
              writeAnalyticalIncidentPWProbes(n);
            }

            if(n != 0 && usePade && n % tsPerPade == 0)
            {
              if(padeTime < 0.0){
                exitBool = calculatePadeCUDA(n, n / tsPerPade == 1, false);
              }else if(n * LocTimeSteps[N_class - 1] > padeTime * 1e-9){
                exitBool = true;
              }
            }
          }

          if(write_fields){
            writeFieldGlobalCuBLAS(n);
          }

          // Modified by Qi Jian to write surface currents
          if(WriteSurfFlag)
          {
            writeCurrentsOutputSurfMesh_CuBLAS(n);
          }

          // Writing the fields on the port surfaces
          if (PortBCFlag)
          {
            writePortFieldProbeCuBLAS(n);
          }


          fp_t_ts magAux = 0;

          for(int i = 0; i < tetraCNT * TetPolyOrderDim[PolyFlag]; i++){
            magAux += En1_h[i] * En1_h[i];
          }

          cout << "E field norm^2 " << magAux << endl;

          current_time += (double)dt_sample * 1e9;
          DEBUG_INFO(" Current Time : " + to_string(current_time) + "ns");
          DEBUG_INFO(" Average iteration time : "+ to_string(((double)total_time / (double)(n + 1.0))) + " msec");

          if(exitBool){
            calculatePadeCUDA(n, false, true);
            break;
          }

          if(FinalTime < 0 && n == NtimeSteps-1){
            if(!checkEnergyDecay()){
              NtimeSteps += NumOfSampleEnergyCheck * tsPerSampling;
              cout << "Max Energy: " << maxFieldEnergy << " - Current Energy: " << fieldEnergy << " - Relation: " << fieldEnergy * 100 / maxFieldEnergy << "%" << endl;
              fieldEnergy = 0.0;
            }else{
              Write_TD_Data(tsPerSampling, NtimeSteps);
              break;
            }
          }
          cout << "---------------------------------------------------" << endl;

          timer_start(to_string(tsPerSampling)+" steps", 'm');
        }
      }
      if(!exitBool && padeCNT > 0 && !writeWhilePade){
        writeFieldProbeAfterPade(tsPerSampling);
      }

      if(!exitBool && (NtimeSteps-1 % tsPerSampling != 0)){
        timer_stop('m');
      }

      DEBUG_INFO(" Total iteration time: "+ to_string((double)total_time) + " msec");
      timer_stop(' ');
    }

    // The recursion in ComputeE and ComputeH is due to the LTS (local time-stepping) process
    /**********************************************************************
      Local Time-Stepping for CUDA Recursive

      Explained in "Dissipative terms and local time-stepping improvements
          in a spatial high order Discontinuous Galerkin scheme
          for the time-domain Maxwell’s equations" by E. Montseny
    **********************************************************************/

    /**
     * Recursive E update for one time-stepping class.
     *
     * Always runs LE_CuBLAS for class_i first. For class_i != 0 it then
     * advances class_i - 1 with the sequence E, H, E.
     *
     * @param handle   cuBLAS handle forwarded to every nested call.
     * @param class_i  class index; 0 ends the recursion.
     */
    void FemGrp::ComputeE_cuBLAS(cublasHandle_t handle, int class_i){
      // Every level starts by updating E for its own class.
      LE_CuBLAS(handle, class_i);

      // Class 0 has no lower class to sub-step.
      if(class_i == 0){
        return;
      }

      const int lowerClass = class_i - 1;
      ComputeE_cuBLAS(handle, lowerClass);
      ComputeH_cuBLAS(handle, lowerClass);
      ComputeE_cuBLAS(handle, lowerClass);
    }

    /**
     * Recursive H update for one time-stepping class.
     *
     * Always runs LH_CuBLAS for class_i first. For class_i != 0 it then
     * advances class_i - 1 with the sequence H, E, H.
     *
     * @param handle   cuBLAS handle forwarded to every nested call.
     * @param class_i  class index; 0 ends the recursion.
     */
    void FemGrp::ComputeH_cuBLAS(cublasHandle_t handle, int class_i){
      // Every level starts by updating H for its own class.
      LH_CuBLAS(handle, class_i);

      // Class 0 has no lower class to sub-step.
      if(class_i == 0){
        return;
      }

      const int lowerClass = class_i - 1;
      ComputeH_cuBLAS(handle, lowerClass);
      ComputeE_cuBLAS(handle, lowerClass);
      ComputeH_cuBLAS(handle, lowerClass);
    }


    void FemGrp::LE_CuBLAS(cublasHandle_t handle, int class_i)
    {
        const int Q = GAUSS_POINT_NUM_h[PolyFlag];  // same as GPU kernel uses

        int irregularTetras = nonRegularTetraCnt_h[class_i];

        int classOffset = ClassTetraOffset[class_i];
        int neighOffset = NeighClassOffset_h[class_i];

        int blockSize = 256; //optimal number
        int numBlocks;

        if(irregularTetras > 0)
        {
          // Local Mattrices
          int nMatrices = irregularTetras;
          int matrixOffset = classTetraOffset_loc_h[class_i];


          int m = TetPolyOrderDim[PolyFlag]; //rows of A
          int n = TetPolyOrderDim[PolyFlag]; //rows of B and cols of A

          long long int strideA = m * n;
          long long int strideB = n;
          long long int strideC = m;

          float alpha = 1.0;
          float beta = 0.0;

          cublasSgemmStridedBatched(handle,
                                    CUBLAS_OP_N,
                                    CUBLAS_OP_N,
                                    m, 1, n,
                                    &alpha,
                                    &Loc1E_d[matrixOffset * strideA], m,
                                    strideA,
                                    &En_d[classOffset * strideB], n,
                                    strideB,
                                    &beta,
                                    &En1_d[classOffset * strideC], m,
                                    strideC,
                                    nMatrices);

          beta = 1.0;
          cublasSgemmStridedBatched(handle,
                                    CUBLAS_OP_N,
                                    CUBLAS_OP_N,
                                    m, 1, n,
                                    &alpha,
                                    &Loc2E_d[matrixOffset * strideA], m,
                                    strideA,
                                    &Hn12_d[classOffset * strideB], n,
                                    strideB,
                                    &beta,
                                    &En1_d[classOffset * strideC], m,
                                    strideC,
                                    nMatrices);


          if(ClassExcitationCount[class_i] > 0)
          {
            nMatrices = ClassExcitationCount[class_i];
            matrixOffset = ClassExcitationOffset[class_i];

            //cout << "ClassExcitationCount[" << class_i << "] = " << ClassExcitationCount[class_i] << endl;
            //cout << "ClassExcitationOffset[" << class_i << "] = " << ClassExcitationOffset[class_i] << endl;
            //cout << "classOffset * strideC " << classOffset * strideC << endl;

            numBlocks = (nMatrices + blockSize - 1) / blockSize;

            fp_t_ts dt = LocTimeSteps[class_i];
            fp_t_ts t = (LocalExciIndexE[class_i] + 0.5) * dt;
            LocalExciIndexE[class_i]++;


            if (PWorPort == 0)
            {
              if (interior_excitation_flag)
              {
                addExcitationE_PML<<<numBlocks, blockSize>>>(&ExcitationFacesCnt_d[matrixOffset],
                  &ExcitationFacesOffset_d[matrixOffset],
                  ExcitationFacesNum_d,
                  nMatrices,
                  ClassExcitation_sc_CNT[class_i],
                  &mapE_d[matrixOffset * strideC],
                  excitationProp,
                  PolyFlag,
                  dt /Eo, t,
                  &nd_coords_tet_d[matrixOffset * NumOfNodes * NumOfUnitaryVectors],
                  nd_coords_face_d,
                  Z_face_pw_d,
                  &InvE_d[matrixOffset * strideA],
                  &En1_d[classOffset * strideC]);

              }
              else
              {

                addExcitationE<<<numBlocks, blockSize>>>(&ExcitationFacesCnt_d[matrixOffset],
                  &ExcitationFacesOffset_d[matrixOffset],
                  ExcitationFacesNum_d,
                  nMatrices,
                  &mapE_d[matrixOffset * strideC],
                  excitationProp,
                  PolyFlag,
                  dt /Eo, t,
                  &nd_coords_tet_d[matrixOffset * NumOfNodes * NumOfUnitaryVectors],
                  nd_coords_face_d,
                  Z_face_pw_d,
                  &InvE_d[matrixOffset * strideA],
                  &En1_d[classOffset * strideC]);
              }
            }
            else
            {
              addExcitationE_port<<<numBlocks, blockSize>>>( &ExcitationFacesCnt_d[matrixOffset],
                  &ExcitationFacesOffset_d[matrixOffset],
                  ExcitationFacesNum_d,
                  nMatrices,
                  &mapE_d[matrixOffset * strideC],
                  ExcitationProps_d,
                  PortFacePidx_d,
                  PolyFlag,
                  dt /Eo, t,
                  &nd_coords_tet_d[matrixOffset * NumOfNodes * NumOfUnitaryVectors],
                  nd_coords_face_d,
                  &InvE_d[matrixOffset * strideA],
                  &En1_d[classOffset * strideC]);

                //cout << "\n\n\n\n\n";
            }

            CUDA_SAFE_CALL(cudaDeviceSynchronize());


          }

          // Coupling Matrices
          nMatrices = classNeighIrregular_h[class_i];
          matrixOffset = classNeighOffset_loc_h[class_i];

          m = TetPolyOrderDim[PolyFlag]; //rows of A
          n = FacePolyOrderDim[PolyFlag]; //rows of B and cols of A

          strideA = m * n;
          strideB = n;
          strideC = m;

          numBlocks = (nMatrices * n + blockSize - 1) / blockSize;
          makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * n], Hn12_d, auxFieldInput, nMatrices * n); //Implement 3D if tetras over blocksize * (2^(31) - 1)

          beta = 0.0;
          cublasSgemmStridedBatched(handle,
                                    CUBLAS_OP_N,
                                    CUBLAS_OP_N,
                                    m, 1, n,
                                    &alpha,
                                    &Neigh1E_d[matrixOffset * strideA], m,
                                    strideA,
                                    auxFieldInput, n,
                                    strideB,
                                    &beta,
                                    auxFieldOutput, m,
                                    strideC,
                                    nMatrices);

          makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * n], En_d, auxFieldInput, nMatrices * n); //Implement 3D if tetras over blocksize * (2^(31) - 1)

          beta = 1.0;
          cublasSgemmStridedBatched(handle,
                                    CUBLAS_OP_N,
                                    CUBLAS_OP_N,
                                    m, 1, n,
                                    &alpha,
                                    &Neigh2E_d[matrixOffset * strideA], m,
                                    strideA,
                                    auxFieldInput, n,
                                    strideB,
                                    &beta,
                                    auxFieldOutput, m,
                                    strideC,
                                    nMatrices);


          int blockY = (blockSize + TetPolyOrderDim[PolyFlag] - 1) / TetPolyOrderDim[PolyFlag];

          dim3 blockDim(TetPolyOrderDim[PolyFlag], blockY, 1);
          numBlocks = (irregularTetras + blockY - 1) / blockY;

          addCouplingResults<<<numBlocks, blockDim>>>(&En1_d[classOffset * TetPolyOrderDim[PolyFlag]], auxFieldOutput, &Neighbours_d[classOffset], &NeighboursOffset_d[classOffset], irregularTetras); //Implement 3D if tetras over blocksize * (2^(31) - 1)
        }


        // --------------------------------------------------------------------------------------------------
        CUDA_SAFE_CALL(cudaDeviceSynchronize());


        if(regularRegionFlag && classRegularGroupsCnt_h[class_i] > 0)
        {
          for(int i = 0; i < classRegularGroupsCnt_h[class_i]; i++)
          {
            int groupID = classRegularGroupsId_h[class_i][i];
            int groupElements = classRegularTetraCnt_h[class_i * regularCNT + groupID];
            int groupOffset = classOffset + nonRegularTetraCnt_h[class_i] + classRegularTetraOffset_h[class_i][i];

            CUDA_SAFE_CALL(cudaDeviceSynchronize());

            // Local Matrices
            int m = TetPolyOrderDim[PolyFlag]; //rows of A
            int n = TetPolyOrderDim[PolyFlag]; //rows of B and cols of A

            float alpha = 1.0;
            float beta = 0.0;

            cublasSgemm(handle,
                        CUBLAS_OP_N,
                        CUBLAS_OP_N,
                        m, groupElements, n,
                        &alpha,
                        &regularLoc1E_d[(groupID - 1) * m * n], m,
                        &En_d[groupOffset * n], n,
                        &beta,
                        &En1_d[groupOffset * m], m);

                CUDA_SAFE_CALL(cudaDeviceSynchronize());


            beta = 1.0;
            cublasSgemm(handle,
                        CUBLAS_OP_N,
                        CUBLAS_OP_N,
                        m, groupElements, n,
                        &alpha,
                        &regularLoc2E_d[(groupID - 1) * m * n], m,
                        &Hn12_d[groupOffset * n], n,
                        &beta,
                        &En1_d[groupOffset * m], m);


                CUDA_SAFE_CALL(cudaDeviceSynchronize());

            // Coupling Matrices
            int regularNeighOffset = neighOffset + classNeighIrregular_h[class_i] + classRegularTetraOffset_h[class_i][i] * NumOfFaces;

            m = TetPolyOrderDim[PolyFlag]; //rows of A
            n = FacePolyOrderDim[PolyFlag]; //rows of B and cols of A

            numBlocks = (groupElements * n * NumOfFaces + blockSize - 1) / blockSize;

            makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * n], Hn12_d, auxFieldInput, groupElements, PolyFlag);

            long long int strideA = m * n;
            long long int strideB = n * groupElements;
            long long int strideC = m * groupElements;


            CUDA_SAFE_CALL(cudaDeviceSynchronize());


            beta = 0.0;
            cublasSgemmStridedBatched(handle,
                                      CUBLAS_OP_N,
                                      CUBLAS_OP_N,
                                      m, groupElements, n,
                                      &alpha,
                                      &regularNeigh1E_d[(groupID - 1) * NumOfFaces * strideA], m,
                                      strideA,
                                      auxFieldInput, n,
                                      strideB,
                                      &beta,
                                      auxFieldOutput, m,
                                      strideC,
                                      NumOfFaces);

            makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * n], En_d, auxFieldInput, groupElements, PolyFlag);


            CUDA_SAFE_CALL(cudaDeviceSynchronize());


            beta = 1.0;
            cublasSgemmStridedBatched(handle,
                                      CUBLAS_OP_N,
                                      CUBLAS_OP_N,
                                      m, groupElements, n,
                                      &alpha,
                                      &regularNeigh2E_d[(groupID - 1) * NumOfFaces * strideA], m,
                                      strideA,
                                      auxFieldInput, n,
                                      strideB,
                                      &beta,
                                      auxFieldOutput, m,
                                      strideC,
                                      NumOfFaces);

            int blockY = (blockSize + TetPolyOrderDim[PolyFlag] - 1) / TetPolyOrderDim[PolyFlag];

            dim3 blockDim(TetPolyOrderDim[PolyFlag], blockY, 1);
            numBlocks = (groupElements + blockY - 1) / blockY;

            addCouplingResultsRegular<<<numBlocks, blockDim>>>(&En1_d[groupOffset * TetPolyOrderDim[PolyFlag]], auxFieldOutput, groupElements);
          }
        }


        // -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------


        CUDA_SAFE_CALL(cudaDeviceSynchronize());


        // --------------------------------------------
        // PML Section
        int PMLTetras = nonRegularPMLTetraCnt_h[class_i];

        classOffset = classPMLTetraOffset_h[class_i];
        neighOffset = classNeighPMLOffset_h[class_i];

        if(PMLTetras > 0)
        {
          // Local Mattrices
          int nMatrices = PMLTetras;
          int matrixOffset = classPMLTetraOffset_loc_h[class_i];

//          cout << "classPMLTetraOffset_loc_h[class_i] : " << classPMLTetraOffset_loc_h[class_i] << endl;

          int m = TetPolyOrderDim[PolyFlag]; //rows of A
          int n = TetPolyOrderDim[PolyFlag]; //rows of B and cols of A

          long long int strideA = m * n;
          long long int strideB = n;
          long long int strideC = m;

          float alpha = 1.0;
          float beta = 0.0;

          CUDA_SAFE_CALL(cudaDeviceSynchronize());
          // --------------------------------------------------------
          // Auxilliary J


          cublasSgemmStridedBatched(handle,
            CUBLAS_OP_N,
            CUBLAS_OP_N,
            m, 1, n,
            &alpha,
            &AuxJ1_d[matrixOffset * strideA], m,
            strideA,
            &Jn12_d[matrixOffset * strideB], n,
            strideB,
            &beta,
            &Jn32_d[matrixOffset * strideC], m,
            strideC,
            nMatrices);

          CUDA_SAFE_CALL(cudaDeviceSynchronize());


          beta = 1.0;
          cublasSgemmStridedBatched(handle,
                      CUBLAS_OP_N,
                      CUBLAS_OP_N,
                      m, 1, n,
                      &alpha,
                      &AuxJ2_d[matrixOffset * strideA], m,
                      strideA,
                      &En_d[classOffset * strideB], n,
                      strideB,
                      &beta,
                      &Jn32_d[matrixOffset * strideC], m,
                      strideC,
                      nMatrices);

          CUDA_SAFE_CALL(cudaDeviceSynchronize());
          // --------------------------------------------------------

          alpha = 1.0;
          beta = 0.0;

          cublasSgemmStridedBatched(handle,
                                    CUBLAS_OP_N,
                                    CUBLAS_OP_N,
                                    m, 1, n,
                                    &alpha,
                                    &Loc1E_PML_d[matrixOffset * strideA], m,
                                    strideA,
                                    &En_d[classOffset * strideB], n,
                                    strideB,
                                    &beta,
                                    &En1_d[classOffset * strideC], m,
                                    strideC,
                                    nMatrices);

          CUDA_SAFE_CALL(cudaDeviceSynchronize());


          beta = 1.0;
          cublasSgemmStridedBatched(handle,
                                    CUBLAS_OP_N,
                                    CUBLAS_OP_N,
                                    m, 1, n,
                                    &alpha,
                                    &Loc2E_PML_d[matrixOffset * strideA], m,
                                    strideA,
                                    &Hn12_d[classOffset * strideB], n,
                                    strideB,
                                    &beta,
                                    &En1_d[classOffset * strideC], m,
                                    strideC,
                                    nMatrices);


          CUDA_SAFE_CALL(cudaDeviceSynchronize());


          // Add Auxilliary J term
          beta = 1.0;
          cublasSgemmStridedBatched(handle,
                      CUBLAS_OP_N,
                      CUBLAS_OP_N,
                      m, 1, n,
                      &alpha,
                      &AuxE_d[matrixOffset * strideA], m,
                      strideA,
                      &Jn32_d[matrixOffset * strideB], n,
                      strideB,
                      &beta,
                      &En1_d[classOffset * strideC], m,
                      strideC,
                      nMatrices);


          CUDA_SAFE_CALL(cudaDeviceSynchronize());


          // Coupling Matrices
          nMatrices = classNeighPML_h[class_i];
          matrixOffset = classNeighPMLOffset_loc_h[class_i];

          m = TetPolyOrderDim[PolyFlag]; //rows of A
          n = FacePolyOrderDim[PolyFlag]; //rows of B and cols of A

          strideA = m * n;
          strideB = n;
          strideC = m;

          numBlocks = (nMatrices * n + blockSize - 1) / blockSize;
          makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * n], Hn12_d, auxFieldInput, nMatrices * n); //Implement 3D if tetras over blocksize * (2^(31) - 1)

          beta = 0.0;
          cublasSgemmStridedBatched(handle,
                                    CUBLAS_OP_N,
                                    CUBLAS_OP_N,
                                    m, 1, n,
                                    &alpha,
                                    &Neigh1E_PML_d[matrixOffset * strideA], m,
                                    strideA,
                                    auxFieldInput, n,
                                    strideB,
                                    &beta,
                                    auxFieldOutput, m,
                                    strideC,
                                    nMatrices);

          makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * n], En_d, auxFieldInput, nMatrices * n); //Implement 3D if tetras over blocksize * (2^(31) - 1)

          CUDA_SAFE_CALL(cudaDeviceSynchronize());

          beta = 1.0;
          cublasSgemmStridedBatched(handle,
                                    CUBLAS_OP_N,
                                    CUBLAS_OP_N,
                                    m, 1, n,
                                    &alpha,
                                    &Neigh2E_PML_d[matrixOffset * strideA], m,
                                    strideA,
                                    auxFieldInput, n,
                                    strideB,
                                    &beta,
                                    auxFieldOutput, m,
                                    strideC,
                                    nMatrices);

          CUDA_SAFE_CALL(cudaDeviceSynchronize());

          int blockY = (blockSize + TetPolyOrderDim[PolyFlag] - 1) / TetPolyOrderDim[PolyFlag];

          dim3 blockDim(TetPolyOrderDim[PolyFlag], blockY, 1);
          numBlocks = (PMLTetras + blockY - 1) / blockY;

          //Implement 3D if tetras over blocksize * (2^(31) - 1)
          addCouplingResults<<<numBlocks, blockDim>>>(&En1_d[classPMLTetraOffset_h[class_i] * TetPolyOrderDim[PolyFlag]], auxFieldOutput, &Neighbours_d[classOffset], &NeighboursOffset_d[classOffset], PMLTetras);

          CUDA_SAFE_CALL(cudaDeviceSynchronize());

        }


        if(regularRegionFlag && classRegularPMLGroupsCnt_h[class_i] > 0)
        {
          for(int i = 0; i < classRegularPMLGroupsCnt_h[class_i]; i++)
          {
            int groupID = classRegularPMLGroupsId_h[class_i][i];
            int groupElements = classRegularTetraCnt_h[class_i * regularCNT + groupID];
            int groupOffset = classOffset + nonRegularPMLTetraCnt_h[class_i] + classRegularPMLTetraOffset_h[class_i][i];


            CUDA_SAFE_CALL(cudaDeviceSynchronize());

            int local_index = groupID - 1 - regularCNT_Normal;
            int aux_offset = classRegularPMLTetraOffset_h[class_i][i];

            // Local Matrices
            int m = TetPolyOrderDim[PolyFlag]; //rows of A
            int n = TetPolyOrderDim[PolyFlag]; //rows of B and cols of A


            // --------------------------------------------------------
            // Auxilliary J
            float alpha = 1.0;
            float beta = 0.0;
            cublasSgemm(handle,
              CUBLAS_OP_N,
              CUBLAS_OP_N,
              m, groupElements, n,
              &alpha,
              &regularPMLLoc1J_d[(local_index) * m * n], m,
              &r_Jn12_d[aux_offset * n], n,
              &beta,
              &r_Jn32_d[aux_offset * m], m);

            CUDA_SAFE_CALL(cudaDeviceSynchronize());


            beta = 1.0;
            cublasSgemm(handle,
                        CUBLAS_OP_N,
                        CUBLAS_OP_N,
                        m, groupElements, n,
                        &alpha,
                        &regularPMLLoc2J_d[(local_index) * m * n], m,
                        &En_d[aux_offset * n], n,
                        &beta,
                        &r_Jn32_d[aux_offset * m], m);

            // --------------------------------------------------------


            beta = 0.0;
            cublasSgemm(handle,
                        CUBLAS_OP_N,
                        CUBLAS_OP_N,
                        m, groupElements, n,
                        &alpha,
                        &regularPMLLoc1E_d[(local_index) * m * n], m,
                        &En_d[groupOffset * n], n,
                        &beta,
                        &En1_d[groupOffset * m], m);

                CUDA_SAFE_CALL(cudaDeviceSynchronize());


            beta = 1.0;
            cublasSgemm(handle,
                        CUBLAS_OP_N,
                        CUBLAS_OP_N,
                        m, groupElements, n,
                        &alpha,
                        &regularPMLLoc2E_d[(local_index) * m * n], m,
                        &Hn12_d[groupOffset * n], n,
                        &beta,
                        &En1_d[groupOffset * m], m);


            CUDA_SAFE_CALL(cudaDeviceSynchronize());

            beta = 1.0;
            cublasSgemm(handle,
                        CUBLAS_OP_N,
                        CUBLAS_OP_N,
                        m, groupElements, n,
                        &alpha,
                        &regularPMLAuxE_d[(local_index) * m * n], m,
                        &r_Jn32_d[aux_offset * n], n,
                        &beta,
                        &En1_d[groupOffset * m], m);

            CUDA_SAFE_CALL(cudaDeviceSynchronize());


            // Coupling Matrices
            int regularNeighOffset = neighOffset + classNeighPML_h[class_i] + classRegularPMLTetraOffset_h[class_i][i] * NumOfFaces;

            m = TetPolyOrderDim[PolyFlag]; //rows of A
            n = FacePolyOrderDim[PolyFlag]; //rows of B and cols of A

            numBlocks = (groupElements * n * NumOfFaces + blockSize - 1) / blockSize;

            makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * n], Hn12_d, auxFieldInput, groupElements, PolyFlag);

            long long int strideA = m * n;
            long long int strideB = n * groupElements;
            long long int strideC = m * groupElements;


            CUDA_SAFE_CALL(cudaDeviceSynchronize());


            beta = 0.0;
            cublasSgemmStridedBatched(handle,
                                      CUBLAS_OP_N,
                                      CUBLAS_OP_N,
                                      m, groupElements, n,
                                      &alpha,
                                      &regularPMLNeigh1E_d[(local_index) * NumOfFaces * strideA], m,
                                      strideA,
                                      auxFieldInput, n,
                                      strideB,
                                      &beta,
                                      auxFieldOutput, m,
                                      strideC,
                                      NumOfFaces);

            makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * n], En_d, auxFieldInput, groupElements, PolyFlag);


            CUDA_SAFE_CALL(cudaDeviceSynchronize());


            beta = 1.0;
            cublasSgemmStridedBatched(handle,
                                      CUBLAS_OP_N,
                                      CUBLAS_OP_N,
                                      m, groupElements, n,
                                      &alpha,
                                      &regularPMLNeigh2E_d[(local_index) * NumOfFaces * strideA], m,
                                      strideA,
                                      auxFieldInput, n,
                                      strideB,
                                      &beta,
                                      auxFieldOutput, m,
                                      strideC,
                                      NumOfFaces);

            int blockY = (blockSize + TetPolyOrderDim[PolyFlag] - 1) / TetPolyOrderDim[PolyFlag];

            dim3 blockDim(TetPolyOrderDim[PolyFlag], blockY, 1);
            numBlocks = (groupElements + blockY - 1) / blockY;

            addCouplingResultsRegular<<<numBlocks, blockDim>>>(&En1_d[groupOffset * TetPolyOrderDim[PolyFlag]], auxFieldOutput, groupElements);
          }
        }


        CUDA_SAFE_CALL(cudaDeviceSynchronize());

        /*
        int total_tets = ClassTetraCnt[class_i] + ClassPMLTetraCnt[class_i];
        int offset = ClassTetraOffset[class_i];
        CUDA_SAFE_CALL(cudaMemcpy(&En_d[offset * TetPolyOrderDim[PolyFlag]], &En1_d[offset * TetPolyOrderDim[PolyFlag]],
                                  total_tets * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
        CUDA_SAFE_CALL(cudaDeviceSynchronize());
        */

        CUDA_SAFE_CALL(cudaMemcpy(&En_d[0], &En1_d[0], tetraCNT * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
        cudaDeviceSynchronize();


        if(nonRegularPMLTetraCnt_h[class_i] > 0)
        {
          int num_PML_tets = nonRegularPMLTetraCnt_h[class_i];
          int matrixOffset = classPMLTetraOffset_loc_h[class_i];
          CUDA_SAFE_CALL(cudaMemcpy(&Jn12_d[matrixOffset * TetPolyOrderDim[PolyFlag]], &Jn32_d[matrixOffset * TetPolyOrderDim[PolyFlag]],
                                    num_PML_tets * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
          CUDA_SAFE_CALL(cudaDeviceSynchronize());
        }

        if(classRegularPMLGroupsCnt_h[class_i] > 0)
        {
          int num_PML_tets = numRegPMLTetras;
          CUDA_SAFE_CALL(cudaMemcpy(&Jn12_d[0], &Jn32_d[0], num_PML_tets * TetPolyOrderDim[PolyFlag] * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
          CUDA_SAFE_CALL(cudaDeviceSynchronize());
        }


    }

    /**
     * Computes Hn32_d for the tetrahedra of class `class_i` from Hn12_d and
     * En_d using cuBLAS GEMMs plus the neighbour-gather / coupling kernels,
     * then copies Hn32_d back into Hn12_d.
     *
     * Four element groups are processed in this order:
     *   1. irregular tetrahedra  - one matrix set per element (batched GEMM),
     *      including the excitation kernels when the class has excitation faces;
     *   2. regular groups        - one matrix set per group (one GEMM per group);
     *   3. irregular PML tetrahedra - additionally computes Mn1_d from Mn_d;
     *   4. regular PML groups    - additionally computes r_Mn1_d from r_Mn_d.
     * At the end Mn1_d / r_Mn1_d are copied back into Mn_d / r_Mn_d.
     *
     * auxFieldInput / auxFieldOutput are used as scratch buffers for the
     * coupling terms. LocalExciIndexH[class_i] is incremented when the class
     * has excitation faces.
     *
     * cuBLAS failures are reported on stderr but do not abort (the statuses
     * were previously ignored, so the control flow is preserved). CUDA runtime
     * calls keep going through CUDA_SAFE_CALL. The device-wide synchronisations
     * are kept exactly where they were.
     *
     * @param handle  cuBLAS handle used for every GEMM.
     * @param class_i index of the class to process.
     */
    void FemGrp::LH_CuBLAS(cublasHandle_t handle, int class_i){

        const int tetDim  = TetPolyOrderDim[PolyFlag];   // rows of every matrix; cols of the local matrices
        const int faceDim = FacePolyOrderDim[PolyFlag];  // cols of the coupling matrices

        const int blockSize = 256; //optimal number
        int numBlocks;

        // Reports a failed cuBLAS call. Deliberately non-fatal.
        auto checkCublas = [](cublasStatus_t status, const char* what) {
          if (status != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "[LH_CuBLAS] cuBLAS call failed (%s): status %d\n", what, (int)status);
          }
        };

        // `count` independent products C_k = A_k * B_k + beta * C_k with
        // A_k (rows x cols), B_k (cols x 1), C_k (rows x 1), all densely packed.
        auto batchedMatVec = [&](const float* A, const float* B, float* C,
                                 int rows, int cols, float beta, int count, const char* what) {
          const float alpha = 1.0f;
          const long long int strideA = rows * cols;
          checkCublas(cublasSgemmStridedBatched(handle,
                                                CUBLAS_OP_N,
                                                CUBLAS_OP_N,
                                                rows, 1, cols,
                                                &alpha,
                                                A, rows,
                                                strideA,
                                                B, cols,
                                                cols,
                                                &beta,
                                                C, rows,
                                                rows,
                                                count), what);
        };

        // One product C = A * B + beta * C with A (rows x cols) shared by all
        // `nElements` columns of B and C.
        auto groupGemm = [&](const float* A, const float* B, float* C,
                             int rows, int cols, int nElements, float beta, const char* what) {
          const float alpha = 1.0f;
          checkCublas(cublasSgemm(handle,
                                  CUBLAS_OP_N,
                                  CUBLAS_OP_N,
                                  rows, nElements, cols,
                                  &alpha,
                                  A, rows,
                                  B, cols,
                                  &beta,
                                  C, rows), what);
        };

        // NumOfFaces products auxFieldOutput_f = A_f * auxFieldInput_f + beta * auxFieldOutput_f,
        // one per face, each covering all `nElements` tetrahedra of a regular group.
        auto facesGemm = [&](const float* A, int rows, int cols, int nElements, float beta, const char* what) {
          const float alpha = 1.0f;
          const long long int strideA = rows * cols;
          const long long int strideB = cols * nElements;
          const long long int strideC = rows * nElements;
          checkCublas(cublasSgemmStridedBatched(handle,
                                                CUBLAS_OP_N,
                                                CUBLAS_OP_N,
                                                rows, nElements, cols,
                                                &alpha,
                                                A, rows,
                                                strideA,
                                                auxFieldInput, cols,
                                                strideB,
                                                &beta,
                                                auxFieldOutput, rows,
                                                strideC,
                                                NumOfFaces), what);
        };

        // --------------------------------------------
        // Irregular (non-PML) tetrahedra
        const int irregularTetras = nonRegularTetraCnt_h[class_i];

        int classOffset = ClassTetraOffset[class_i];
        int neighOffset = NeighClassOffset_h[class_i];

        if(irregularTetras > 0)
        {
          // Local matrices
          int nMatrices = irregularTetras;
          int matrixOffset = classTetraOffset_loc_h[class_i];

          long long int strideA = tetDim * tetDim;
          long long int strideB = tetDim;
          long long int strideC = tetDim;

          // Hn32 = Loc1H * Hn12 + Loc2H * En
          batchedMatVec(&Loc1H_d[matrixOffset * strideA],
                        &Hn12_d[classOffset * strideB],
                        &Hn32_d[classOffset * strideC],
                        tetDim, tetDim, 0.0f, nMatrices, "Loc1H");

          batchedMatVec(&Loc2H_d[matrixOffset * strideA],
                        &En_d[classOffset * strideB],
                        &Hn32_d[classOffset * strideC],
                        tetDim, tetDim, 1.0f, nMatrices, "Loc2H");

          if(ClassExcitationCount[class_i] > 0){
            nMatrices = ClassExcitationCount[class_i];
            matrixOffset = ClassExcitationOffset[class_i];

            numBlocks = (nMatrices + blockSize - 1) / blockSize;

            // Time handed to the excitation kernels; the per-class step counter advances here.
            fp_t_ts dt =  LocTimeSteps[class_i];
            fp_t_ts t = (LocalExciIndexH[class_i] + 1.0) * dt;
            LocalExciIndexH[class_i]++;

            if (PWorPort == 0)
            {
              if (interior_excitation_flag)
              {
                addExcitationH_PML<<<numBlocks, blockSize>>>(&ExcitationFacesCnt_d[matrixOffset],
                  &ExcitationFacesOffset_d[matrixOffset],
                  ExcitationFacesNum_d,
                  nMatrices,
                  ClassExcitation_sc_CNT[class_i],
                  &mapH_d[matrixOffset * strideC],
                  excitationProp,
                  PolyFlag,
                  dt / Uo, t,
                  &nd_coords_tet_d[4 * 3 * matrixOffset],
                  nd_coords_face_d,
                  Z_face_pw_d,
                  &InvH_d[strideA * matrixOffset],
                  &Hn32_d[classOffset * strideC]);
              }
              else
              {
                addExcitationH<<<numBlocks, blockSize>>>(&ExcitationFacesCnt_d[matrixOffset],
                  &ExcitationFacesOffset_d[matrixOffset],
                  ExcitationFacesNum_d,
                  nMatrices,
                  &mapH_d[matrixOffset * strideC],
                  excitationProp,
                  PolyFlag,
                  dt / Uo, t,
                  &nd_coords_tet_d[4 * 3 * matrixOffset],
                  nd_coords_face_d,
                  Z_face_pw_d,
                  &InvH_d[strideA * matrixOffset],
                  &Hn32_d[classOffset * strideC]);
              }
            }
            else
            {
              addExcitationH_port<<<numBlocks, blockSize>>>(&ExcitationFacesCnt_d[matrixOffset],
                &ExcitationFacesOffset_d[matrixOffset],
                ExcitationFacesNum_d,
                nMatrices,
                &mapH_d[matrixOffset * strideC],
                ExcitationProps_d,
                PortFacePidx_d,
                PolyFlag,
                dt / Uo, t,
                &nd_coords_tet_d[4 * 3 * matrixOffset],
                nd_coords_face_d,
                &InvH_d[strideA * matrixOffset],
                &Hn32_d[classOffset * strideC]);
            }
          }

          // Coupling matrices: auxFieldOutput = Neigh1H * gather(En) + Neigh2H * gather(Hn12)
          nMatrices = classNeighIrregular_h[class_i];
          matrixOffset = classNeighOffset_loc_h[class_i];

          strideA = tetDim * faceDim;

          numBlocks = (nMatrices * faceDim + blockSize - 1) / blockSize;
          makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * faceDim], En_d, auxFieldInput, nMatrices * faceDim); //Implement 3D if tetras over blocksize * (2^(31) - 1)

          batchedMatVec(&Neigh1H_d[matrixOffset * strideA],
                        auxFieldInput,
                        auxFieldOutput,
                        tetDim, faceDim, 0.0f, nMatrices, "Neigh1H");

          makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * faceDim], Hn12_d, auxFieldInput, nMatrices * faceDim); //Implement 3D if tetras over blocksize * (2^(31) - 1)

          batchedMatVec(&Neigh2H_d[matrixOffset * strideA],
                        auxFieldInput,
                        auxFieldOutput,
                        tetDim, faceDim, 1.0f, nMatrices, "Neigh2H");

          // Add the per-face results to Hn32.
          int blockY = (blockSize + tetDim - 1) / tetDim;

          dim3 couplingBlock(tetDim, blockY, 1);
          numBlocks = (irregularTetras + blockY - 1) / blockY;

          addCouplingResults<<<numBlocks, couplingBlock>>>(&Hn32_d[classOffset * tetDim], auxFieldOutput, &Neighbours_d[classOffset], &NeighboursOffset_d[classOffset], irregularTetras); //Implement 3D if tetras over blocksize * (2^(31) - 1)
        }


        // --------------------------------------------------------------------------------------------------
        CUDA_SAFE_CALL(cudaDeviceSynchronize());


        // --------------------------------------------
        // Regular (non-PML) groups: every tetrahedron of a group shares one matrix set
        if(regularRegionFlag && classRegularGroupsCnt_h[class_i] > 0)
        {
          for(int i = 0; i < classRegularGroupsCnt_h[class_i]; i++)
          {
            int groupID = classRegularGroupsId_h[class_i][i];
            int groupElements = classRegularTetraCnt_h[class_i * regularCNT + groupID];
            int groupOffset = classOffset + nonRegularTetraCnt_h[class_i] + classRegularTetraOffset_h[class_i][i];

            // Local matrices: Hn32 = regularLoc1H * Hn12 + regularLoc2H * En
            groupGemm(&regularLoc1H_d[(groupID - 1) * tetDim * tetDim],
                      &Hn12_d[groupOffset * tetDim],
                      &Hn32_d[groupOffset * tetDim],
                      tetDim, tetDim, groupElements, 0.0f, "regularLoc1H");

            groupGemm(&regularLoc2H_d[(groupID - 1) * tetDim * tetDim],
                      &En_d[groupOffset * tetDim],
                      &Hn32_d[groupOffset * tetDim],
                      tetDim, tetDim, groupElements, 1.0f, "regularLoc2H");

            // Coupling matrices
            int regularNeighOffset = neighOffset + classNeighIrregular_h[class_i] + classRegularTetraOffset_h[class_i][i] * NumOfFaces;

            long long int strideA = tetDim * faceDim;

            numBlocks = (groupElements * faceDim * NumOfFaces + blockSize - 1) / blockSize;

            makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * faceDim], En_d, auxFieldInput, groupElements, PolyFlag);

            facesGemm(&regularNeigh1H_d[(groupID - 1) * NumOfFaces * strideA],
                      tetDim, faceDim, groupElements, 0.0f, "regularNeigh1H");

            makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * faceDim], Hn12_d, auxFieldInput, groupElements, PolyFlag);

            facesGemm(&regularNeigh2H_d[(groupID - 1) * NumOfFaces * strideA],
                      tetDim, faceDim, groupElements, 1.0f, "regularNeigh2H");

            int blockY = (blockSize + tetDim - 1) / tetDim;

            dim3 couplingBlock(tetDim, blockY, 1);
            numBlocks = (groupElements + blockY - 1) / blockY;

            addCouplingResultsRegular<<<numBlocks, couplingBlock>>>(&Hn32_d[groupOffset * tetDim], auxFieldOutput, groupElements);

            CUDA_SAFE_CALL(cudaDeviceSynchronize());  // make sure prior kernels/GEMMs finished
          }
        }


        // --------------------------------------------
        // PML Section
        const int PMLTetras = nonRegularPMLTetraCnt_h[class_i];

        classOffset = classPMLTetraOffset_h[class_i];
        neighOffset = classNeighPMLOffset_h[class_i];

        if(PMLTetras > 0)
        {
          // Local matrices
          int nMatrices = PMLTetras;
          int matrixOffset = classPMLTetraOffset_loc_h[class_i];

          long long int strideA = tetDim * tetDim;
          long long int strideB = tetDim;
          long long int strideC = tetDim;

          // --------------------------------------------------------
          // Auxiliary M: Mn1 = AuxM1 * Mn + AuxM2 * Hn12
          batchedMatVec(&AuxM1_d[matrixOffset * strideA],
                        &Mn_d[matrixOffset * strideB],
                        &Mn1_d[matrixOffset * strideC],
                        tetDim, tetDim, 0.0f, nMatrices, "AuxM1");

          CUDA_SAFE_CALL(cudaDeviceSynchronize());

          batchedMatVec(&AuxM2_d[matrixOffset * strideA],
                        &Hn12_d[classOffset * strideB],
                        &Mn1_d[matrixOffset * strideC],
                        tetDim, tetDim, 1.0f, nMatrices, "AuxM2");

          // --------------------------------------------------------
          // Hn32 = Loc1H_PML * Hn12 + Loc2H_PML * En + AuxH * Mn1
          batchedMatVec(&Loc1H_PML_d[matrixOffset * strideA],
                        &Hn12_d[classOffset * strideB],
                        &Hn32_d[classOffset * strideC],
                        tetDim, tetDim, 0.0f, nMatrices, "Loc1H_PML");

          batchedMatVec(&Loc2H_PML_d[matrixOffset * strideA],
                        &En_d[classOffset * strideB],
                        &Hn32_d[classOffset * strideC],
                        tetDim, tetDim, 1.0f, nMatrices, "Loc2H_PML");

          // Add auxiliary term M
          batchedMatVec(&AuxH_d[matrixOffset * strideA],
                        &Mn1_d[matrixOffset * strideB],
                        &Hn32_d[classOffset * strideC],
                        tetDim, tetDim, 1.0f, nMatrices, "AuxH");

          // Coupling matrices
          nMatrices = classNeighPML_h[class_i];
          matrixOffset = classNeighPMLOffset_loc_h[class_i];

          strideA = tetDim * faceDim;

          numBlocks = (nMatrices * faceDim + blockSize - 1) / blockSize;
          makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * faceDim], En_d, auxFieldInput, nMatrices * faceDim); //Implement 3D if tetras over blocksize * (2^(31) - 1)

          batchedMatVec(&Neigh1H_PML_d[matrixOffset * strideA],
                        auxFieldInput,
                        auxFieldOutput,
                        tetDim, faceDim, 0.0f, nMatrices, "Neigh1H_PML");

          makeNeighField<<<numBlocks, blockSize>>>(&NeighMap_d[neighOffset * faceDim], Hn12_d, auxFieldInput, nMatrices * faceDim); //Implement 3D if tetras over blocksize * (2^(31) - 1)

          batchedMatVec(&Neigh2H_PML_d[matrixOffset * strideA],
                        auxFieldInput,
                        auxFieldOutput,
                        tetDim, faceDim, 1.0f, nMatrices, "Neigh2H_PML");

          int blockY = (blockSize + tetDim - 1) / tetDim;

          dim3 couplingBlock(tetDim, blockY, 1);
          numBlocks = (PMLTetras + blockY - 1) / blockY;

          //Implement 3D if tetras over blocksize * (2^(31) - 1)
          addCouplingResults<<<numBlocks, couplingBlock>>>(&Hn32_d[classPMLTetraOffset_h[class_i]  * tetDim], auxFieldOutput, &Neighbours_d[classOffset], &NeighboursOffset_d[classOffset], PMLTetras);

          CUDA_SAFE_CALL(cudaDeviceSynchronize());
        }


        // --------------------------------------------
        // Regular PML groups
        if(regularRegionFlag && classRegularPMLGroupsCnt_h[class_i] > 0)
        {
          for(int i = 0; i < classRegularPMLGroupsCnt_h[class_i]; i++)
          {
            int groupID = classRegularPMLGroupsId_h[class_i][i];
            int groupElements = classRegularTetraCnt_h[class_i * regularCNT + groupID];
            int groupOffset = classOffset + nonRegularPMLTetraCnt_h[class_i] + classRegularPMLTetraOffset_h[class_i][i];

            CUDA_SAFE_CALL(cudaDeviceSynchronize());

            // PML matrix sets are indexed after the regularCNT_Normal non-PML ones.
            int local_index = groupID - 1 - regularCNT_Normal;
            // Offset of this group inside the r_Mn_d / r_Mn1_d buffers.
            int aux_offset = classRegularPMLTetraOffset_h[class_i][i];

            const int localMat = local_index * tetDim * tetDim;

            // --------------------------------------------------------
            // Auxiliary M: r_Mn1 = regularPMLLoc1M * r_Mn + regularPMLLoc2M * Hn12
            groupGemm(&regularPMLLoc1M_d[localMat],
                      &r_Mn_d[aux_offset * tetDim],
                      &r_Mn1_d[aux_offset * tetDim],
                      tetDim, tetDim, groupElements, 0.0f, "regularPMLLoc1M");

            CUDA_SAFE_CALL(cudaDeviceSynchronize());

            // NOTE(review): Hn12_d is indexed with aux_offset here, whereas the
            // non-regular PML path reads Hn12_d at the class offset and every
            // other Hn12_d access in this loop uses groupOffset. This looks like
            // it should be groupOffset - confirm before changing (the E-field
            // routine uses the same pattern with En_d).
            groupGemm(&regularPMLLoc2M_d[localMat],
                      &Hn12_d[aux_offset * tetDim],
                      &r_Mn1_d[aux_offset * tetDim],
                      tetDim, tetDim, groupElements, 1.0f, "regularPMLLoc2M");

            CUDA_SAFE_CALL(cudaDeviceSynchronize());
            // --------------------------------------------------------

            // Hn32 = regularPMLLoc1H * Hn12 + regularPMLLoc2H * En + regularPMLAuxH * r_Mn1
            groupGemm(&regularPMLLoc1H_d[localMat],
                      &Hn12_d[groupOffset * tetDim],
                      &Hn32_d[groupOffset * tetDim],
                      tetDim, tetDim, groupElements, 0.0f, "regularPMLLoc1H");

            CUDA_SAFE_CALL(cudaDeviceSynchronize());

            groupGemm(&regularPMLLoc2H_d[localMat],
                      &En_d[groupOffset * tetDim],
                      &Hn32_d[groupOffset * tetDim],
                      tetDim, tetDim, groupElements, 1.0f, "regularPMLLoc2H");

            CUDA_SAFE_CALL(cudaDeviceSynchronize());

            groupGemm(&regularPMLAuxH_d[localMat],
                      &r_Mn1_d[aux_offset * tetDim],
                      &Hn32_d[groupOffset * tetDim],
                      tetDim, tetDim, groupElements, 1.0f, "regularPMLAuxH");

            CUDA_SAFE_CALL(cudaDeviceSynchronize());

            // Coupling matrices
            int regularNeighOffset = neighOffset + classNeighPML_h[class_i] + classRegularPMLTetraOffset_h[class_i][i] * NumOfFaces;

            long long int strideA = tetDim * faceDim;

            numBlocks = (groupElements * faceDim * NumOfFaces + blockSize - 1) / blockSize;

            makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * faceDim], En_d, auxFieldInput, groupElements, PolyFlag);

            CUDA_SAFE_CALL(cudaDeviceSynchronize());

            facesGemm(&regularPMLNeigh1H_d[(local_index) * NumOfFaces * strideA],
                      tetDim, faceDim, groupElements, 0.0f, "regularPMLNeigh1H");

            makeNeighFieldRegular<<<numBlocks, blockSize>>>(&NeighMap_d[regularNeighOffset * faceDim], Hn12_d, auxFieldInput, groupElements, PolyFlag);

            CUDA_SAFE_CALL(cudaDeviceSynchronize());

            facesGemm(&regularPMLNeigh2H_d[(local_index) * NumOfFaces * strideA],
                      tetDim, faceDim, groupElements, 1.0f, "regularPMLNeigh2H");

            int blockY = (blockSize + tetDim - 1) / tetDim;

            dim3 couplingBlock(tetDim, blockY, 1);
            numBlocks = (groupElements + blockY - 1) / blockY;

            addCouplingResultsRegular<<<numBlocks, couplingBlock>>>(&Hn32_d[groupOffset * tetDim], auxFieldOutput, groupElements);
          }
        }


        CUDA_SAFE_CALL(cudaDeviceSynchronize());

        // Copy the result back into Hn12_d. The whole array (tetraCNT
        // tetrahedra) is copied, not only the range belonging to class_i.
        CUDA_SAFE_CALL(cudaMemcpy(&Hn12_d[0], &Hn32_d[0], tetraCNT * tetDim * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
        CUDA_SAFE_CALL(cudaDeviceSynchronize());

        // Copy the auxiliary PML variables back as well.
        if(nonRegularPMLTetraCnt_h[class_i] > 0)
        {
          int num_PML_tets = nonRegularPMLTetraCnt_h[class_i];
          int matrixOffset = classPMLTetraOffset_loc_h[class_i];
          CUDA_SAFE_CALL(cudaMemcpy(&Mn_d[matrixOffset * tetDim], &Mn1_d[matrixOffset * tetDim], num_PML_tets * tetDim * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
          CUDA_SAFE_CALL(cudaDeviceSynchronize());
        }

        if(classRegularPMLGroupsCnt_h[class_i] > 0)
        {
          int num_PML_tets = numRegPMLTetras;
          CUDA_SAFE_CALL(cudaMemcpy(&r_Mn_d[0], &r_Mn1_d[0], num_PML_tets * tetDim * sizeof(fp_t_ts), cudaMemcpyDeviceToDevice));
          CUDA_SAFE_CALL(cudaDeviceSynchronize());
        }

    }

    /**
     * Releases the device buffers owned by this object.
     *
     * Every pointer is reset to nullptr right after it is freed so that a
     * repeated call (or a later accidental use) does not hand a stale pointer
     * to cudaFree; cudaFree(nullptr) is a no-op.
     *
     * Z_face_pw_d is only released when PlaneWaveBCFlag is set, and the
     * regular* matrices only when regularRegionFlag is set.
     *
     * NOTE(review): NeighboursOffset_d, ExcitationProps_d, PortFacePidx_d and
     * the PML buffers used by the cuBLAS update routines are not released
     * here - confirm they are freed elsewhere.
     */
    void FemGrp::FreeGPU(){
      // Excitation maps and face bookkeeping
      CUDA_SAFE_CALL(cudaFree(mapE_d));                  mapE_d = nullptr;
      CUDA_SAFE_CALL(cudaFree(mapH_d));                  mapH_d = nullptr;

      CUDA_SAFE_CALL(cudaFree(ExcitationFacesCnt_d));    ExcitationFacesCnt_d = nullptr;
      CUDA_SAFE_CALL(cudaFree(ExcitationFacesOffset_d)); ExcitationFacesOffset_d = nullptr;
      CUDA_SAFE_CALL(cudaFree(ExcitationFacesNum_d));    ExcitationFacesNum_d = nullptr;

      // Node coordinates
      CUDA_SAFE_CALL(cudaFree(nd_coords_tet_d));         nd_coords_tet_d = nullptr;
      CUDA_SAFE_CALL(cudaFree(nd_coords_face_d));        nd_coords_face_d = nullptr;

      if(PlaneWaveBCFlag){
        CUDA_SAFE_CALL(cudaFree(Z_face_pw_d));           Z_face_pw_d = nullptr;
      }

      CUDA_SAFE_CALL(cudaFree(InvE_d));                  InvE_d = nullptr;
      CUDA_SAFE_CALL(cudaFree(InvH_d));                  InvH_d = nullptr;

      // Per-element local and coupling matrices
      CUDA_SAFE_CALL(cudaFree(Loc1E_d));                 Loc1E_d = nullptr;
      CUDA_SAFE_CALL(cudaFree(Loc2E_d));                 Loc2E_d = nullptr;
      CUDA_SAFE_CALL(cudaFree(Loc1H_d));                 Loc1H_d = nullptr;
      CUDA_SAFE_CALL(cudaFree(Loc2H_d));                 Loc2H_d = nullptr;

      CUDA_SAFE_CALL(cudaFree(Neigh1E_d));               Neigh1E_d = nullptr;
      CUDA_SAFE_CALL(cudaFree(Neigh2E_d));               Neigh2E_d = nullptr;
      CUDA_SAFE_CALL(cudaFree(Neigh1H_d));               Neigh1H_d = nullptr;
      CUDA_SAFE_CALL(cudaFree(Neigh2H_d));               Neigh2H_d = nullptr;

      // Per-group matrices of the regular region
      if(regularRegionFlag){
        CUDA_SAFE_CALL(cudaFree(regularLoc1E_d));        regularLoc1E_d = nullptr;
        CUDA_SAFE_CALL(cudaFree(regularLoc2E_d));        regularLoc2E_d = nullptr;
        CUDA_SAFE_CALL(cudaFree(regularLoc1H_d));        regularLoc1H_d = nullptr;
        CUDA_SAFE_CALL(cudaFree(regularLoc2H_d));        regularLoc2H_d = nullptr;

        CUDA_SAFE_CALL(cudaFree(regularNeigh1E_d));      regularNeigh1E_d = nullptr;
        CUDA_SAFE_CALL(cudaFree(regularNeigh2E_d));      regularNeigh2E_d = nullptr;
        CUDA_SAFE_CALL(cudaFree(regularNeigh1H_d));      regularNeigh1H_d = nullptr;
        CUDA_SAFE_CALL(cudaFree(regularNeigh2H_d));      regularNeigh2H_d = nullptr;
      }

      // Field arrays
      CUDA_SAFE_CALL(cudaFree(En_d));                    En_d = nullptr;
      CUDA_SAFE_CALL(cudaFree(En1_d));                   En1_d = nullptr;
      CUDA_SAFE_CALL(cudaFree(Hn12_d));                  Hn12_d = nullptr;
      CUDA_SAFE_CALL(cudaFree(Hn32_d));                  Hn32_d = nullptr;

      // Neighbour maps and coupling scratch buffers
      CUDA_SAFE_CALL(cudaFree(NeighMap_d));              NeighMap_d = nullptr;
      CUDA_SAFE_CALL(cudaFree(Neighbours_d));            Neighbours_d = nullptr;

      CUDA_SAFE_CALL(cudaFree(auxFieldInput));           auxFieldInput = nullptr;
      CUDA_SAFE_CALL(cudaFree(auxFieldOutput));          auxFieldOutput = nullptr;
    }


    // Refactored by Qi Jian to build Octree of the tetrahedrals
    void FemGrp::initializeOctree(std::string prjName, bool non_Conformal_flag)
    {

        cout << "========================================================== \n";

        // Initialize octree object
        octree_object = Octree();

        // Compute AABB for each tetrahedron
        /*
        std::cout << "Compute AABB for tetrahedral" << std::endl;
        octree_object.tetra_boxes.resize(tetraCNT);
        #pragma omp parallel for
        for (int tet_id = 0; tet_id < tetraCNT; ++tet_id)
        {
            const tetra& tet = tetARRAY[tet_id];

            double x[4], y[4], z[4];
            for (int i = 0; i < 4; ++i)
            {
                x[i] = tet.nd[i]->getCoord().getx();
                y[i] = tet.nd[i]->getCoord().gety();
                z[i] = tet.nd[i]->getCoord().getz();
            }

            AABB box;
            box.xmin = std::min({x[0], x[1], x[2], x[3]});
            box.xmax = std::max({x[0], x[1], x[2], x[3]});
            box.ymin = std::min({y[0], y[1], y[2], y[3]});
            box.ymax = std::max({y[0], y[1], y[2], y[3]});
            box.zmin = std::min({z[0], z[1], z[2], z[3]});
            box.zmax = std::max({z[0], z[1], z[2], z[3]});

            octree_object.tetra_boxes[tet_id] = box;
        }
        */

        // Compute AABB for each tetrahedron
        std::cout << "Compute AABB for tetrahedral (with buffer)" << std::endl;
        octree_object.tetra_boxes.resize(tetraCNT);

        // Buffer multiplier (e.g., 5% enlargement)
        const double buffer_factor = 2.0;

        #pragma omp parallel for
        for (int tet_id = 0; tet_id < tetraCNT; ++tet_id)
        {
            const tetra& tet = tetARRAY[tet_id];

            double x[4], y[4], z[4];
            for (int i = 0; i < 4; ++i)
            {
                x[i] = tet.nd[i]->getCoord().getx();
                y[i] = tet.nd[i]->getCoord().gety();
                z[i] = tet.nd[i]->getCoord().getz();
            }

            AABB box;
            double xmin = std::min({x[0], x[1], x[2], x[3]});
            double xmax = std::max({x[0], x[1], x[2], x[3]});
            double ymin = std::min({y[0], y[1], y[2], y[3]});
            double ymax = std::max({y[0], y[1], y[2], y[3]});
            double zmin = std::min({z[0], z[1], z[2], z[3]});
            double zmax = std::max({z[0], z[1], z[2], z[3]});

            // Compute center and half-sizes
            double cx = 0.5 * (xmin + xmax);
            double cy = 0.5 * (ymin + ymax);
            double cz = 0.5 * (zmin + zmax);
            double hx = 0.5 * (xmax - xmin);
            double hy = 0.5 * (ymax - ymin);
            double hz = 0.5 * (zmax - zmin);

            // Apply buffer multiplier
            hx *= buffer_factor;
            hy *= buffer_factor;
            hz *= buffer_factor;

            // Store expanded box
            box.xmin = cx - hx;  box.xmax = cx + hx;
            box.ymin = cy - hy;  box.ymax = cy + hy;
            box.zmin = cz - hz;  box.zmax = cz + hz;

            octree_object.tetra_boxes[tet_id] = box;
        }


        std::cout << "Compute global bounding box" << std::endl;

        // All the tetrahedra IDs
        std::vector<int> all_tet_ids(tetraCNT);
        std::iota(all_tet_ids.begin(), all_tet_ids.end(), 0);

        // All the non-conformal tetrahedra IDs
        std::vector<int> all_NC_tet_ids(nonConformalCNT);
        if (non_Conformal_flag)
        {
          std::cout << "Store non-conformal tetrahedra IDs" << std::endl;
          all_NC_tet_ids.assign(ncARRAY, ncARRAY + nonConformalCNT);
        }


        AABB global_box {
            .xmin =  std::numeric_limits<float>::max(),
            .xmax = -std::numeric_limits<float>::max(),
            .ymin =  std::numeric_limits<float>::max(),
            .ymax = -std::numeric_limits<float>::max(),
            .zmin =  std::numeric_limits<float>::max(),
            .zmax = -std::numeric_limits<float>::max()
        };

        for (const auto& box : octree_object.tetra_boxes)
        {
            global_box.xmin = std::min(global_box.xmin, box.xmin);
            global_box.xmax = std::max(global_box.xmax, box.xmax);
            global_box.ymin = std::min(global_box.ymin, box.ymin);
            global_box.ymax = std::max(global_box.ymax, box.ymax);
            global_box.zmin = std::min(global_box.zmin, box.zmin);
            global_box.zmax = std::max(global_box.zmax, box.zmax);
        }

        std::cout << "Global Bounding Box:" << std::endl;
        std::cout << "  xmin = " << global_box.xmin << ", xmax = " << global_box.xmax << std::endl;
        std::cout << "  ymin = " << global_box.ymin << ", ymax = " << global_box.ymax << std::endl;
        std::cout << "  zmin = " << global_box.zmin << ", zmax = " << global_box.zmax << std::endl;

        fp_t x_range = (global_box.xmax - global_box.xmin);
        fp_t y_range = (global_box.ymax - global_box.ymin);
        fp_t z_range = (global_box.zmax - global_box.zmin);
        fp_t max_range = std::max({x_range, y_range, z_range});
        fp_t wavelength = 3e8 / (freq * 1e6);

        double box_size = 100.0 * wavelength; // or any desired multiple of λ
        int min_depth = 1; // or 2, etc.
        int octree_depth = std::max(min_depth, static_cast<int>(std::ceil(std::log2(max_range / box_size))));

        double buffer_distance = wavelength / 2.0;

        //int octree_depth = static_cast<int>(std::ceil(std::log2((4.0 * max_range) / wavelength))) - 1;

        std::cout << "Max Range = " << max_range << " | Wavelength = " << wavelength << std::endl;
        std::cout << "Compute octree with octree depth = " << octree_depth << std::endl;


        if (non_Conformal_flag)
        {
          octree_object.buildOctree_withNCFLAGS(all_tet_ids, all_NC_tet_ids, global_box, buffer_distance, 0, octree_depth);
        }
        else
        {
          octree_object.buildOctree(all_tet_ids, global_box, buffer_distance, 0, octree_depth);
        }

        // Link tetrahedron memory
        octree_object.tet_ptr = tetARRAY;
        octree_object.tet_count = tetraCNT;
        std::cout << "Octree build completed" << std::endl;

        cout << "========================================================== \n";


    }


    // Find the Barycentric coordinates of the probes
    /**
     * Locate every node of outputMesh inside the volume mesh.
     *
     * For each node the octree is queried for the tetrahedra that contain it
     * (tolerance 1e-8). The result is stored in tri_nodes_bary[node]:
     *   .first  = number of tetrahedra found, or -1 when none was found
     *   .second = list of (tetrahedron id, four barycentric coordinates)
     *
     * A query that reports success but returns no tetrahedron is treated as
     * "not found": a count of zero would otherwise be used as a divisor when
     * the per-node fields are averaged over the containing tetrahedra.
     *
     * If any node cannot be located, all such nodes are listed on stderr and
     * the program terminates with EXIT_FAILURE.
     *
     * The octree must have been built before this is called.
     */
    void FemGrp::computeBarycentricEmbedding()
    {
        std::cout << "Compute the Barycentric center of the nodes" << std::endl;
        const int num_nodes = outputMesh.num_nodes;
        const double tol = 1e-8;

        // Make sure there is one slot per node even if the caller did not
        // size the table beforehand.
        if (num_nodes > 0 && tri_nodes_bary.size() < static_cast<std::size_t>(num_nodes))
        {
            tri_nodes_bary.resize(num_nodes);
        }

        //#pragma omp parallel for schedule(dynamic)
        for (int node_id = 0; node_id < num_nodes; ++node_id)
        {
            const std::vector<float> node_xyz = outputMesh.getNode(node_id);
            double probe_xyz[3] = {node_xyz[0], node_xyz[1], node_xyz[2]};

            std::vector<std::pair<int, std::array<double, 4>>> found_tets;
            const bool success = octree_object.findTetraInOctree(probe_xyz, found_tets, tol);

            if (success && !found_tets.empty())
            {
                tri_nodes_bary[node_id].first = static_cast<int>(found_tets.size());
                tri_nodes_bary[node_id].second = std::move(found_tets);
            }
            else
            {
                tri_nodes_bary[node_id].first = -1;
                tri_nodes_bary[node_id].second.clear(); // drop hits from any earlier run
            }
        }

        // Report every node that could not be placed, then abort if there was one
        bool error_flag = false;
        for (int i = 0; i < num_nodes; ++i)
        {
            if (tri_nodes_bary[i].first < 0)
            {
                std::cerr << "Node " << i << " not found in simulation domain" << std::endl;
                const std::vector<float> node_xyz = outputMesh.getNode(i);
                double probe_xyz[3] = {node_xyz[0], node_xyz[1], node_xyz[2]};
                std::cerr << probe_xyz[0] << " " << probe_xyz[1] << " " << probe_xyz[2] << std::endl;
                error_flag = true;
            }
        }

        if (error_flag)
        {
            std::cerr << "Error: Some nodes were not found in the simulation domain. Exiting." << std::endl;
            std::exit(EXIT_FAILURE);
        }
    }


    // Refactored by Qi Jian to initialize the output surface mesh
    // Note that the octree have to be built before calling this function
    /**
     * Load the output surface mesh "./<prjName>_out.tri", compute its triangle
     * normals, and locate each of its nodes in the volume mesh.
     *
     * The octree must already be built, because computeBarycentricEmbedding()
     * queries it.
     *
     * @param prjName  Project name used to form the .tri file name.
     */
    void FemGrp::makeOutputSurfMesh(std::string prjName)
    {
        // Build the file name in a std::string so that a long project name
        // cannot overrun a fixed-size buffer.
        std::string triPath = "./" + prjName + "_out.tri";

        std::cout << "--------------------" << std::endl;
        std::cout << "Reading Tri surface mesh " << triPath << std::endl;
        // &triPath[0] is a NUL-terminated char*, the same argument type the
        // reader has always been called with.
        outputMesh.readFromFile(&triPath[0]);

        std::cout << "--------------------" << std::endl;
        std::cout << "Compute Normals " << std::endl;
        outputMesh.computeTriangleNormals();

        std::cout << "--------------------" << std::endl;
        outputMesh.printSummary();
        std::cout << "--------------------" << std::endl;

        // One embedding slot per surface node
        tri_nodes_bary.resize(outputMesh.num_nodes);

        // Fill the barycentric coordinate map
        computeBarycentricEmbedding();

        std::cout << "Completed" << std::endl;
        std::cout << "--------------------" << std::endl;
    }


    // Added by Qi Jian
    // Utility to write fields of probes
    /**
     * Write the E and H field vectors of selected nodes to a CSV file.
     *
     * The file is "<outputDir>/Probes_<fname>_<timeStep>.csv" with the time
     * step zero-padded to four digits. It holds the header line
     * "Ex,Ey,Ez,Hx,Hy,Hz" followed by one row per entry of node_ids, in the
     * order given, printed in fixed notation with six decimals.
     *
     * If the file cannot be opened an error is printed to stderr and nothing
     * is written.
     *
     * @param outputDir  Directory to write into, e.g. "./PROBES1".
     * @param fname      Simulation/project name used in the file name.
     * @param timeStep   Time step number used in the file name.
     * @param node_ids   Indices into Efield/Hfield to write; each must be a
     *                   valid index of both vectors (not checked here).
     * @param Efield     Electric field vectors, indexed by node id.
     * @param Hfield     Magnetic field vectors, indexed by node id.
     */
    void FemGrp::writeProbeFieldsCSV(
      const std::string& outputDir,
      const std::string& fname,
      int timeStep,
      const std::vector<int>& node_ids,
      const std::vector<vtr>& Efield,
      const std::vector<vtr>& Hfield
    )
    {
      // Only the number is formatted in a fixed buffer (an int never needs
      // more than 11 characters); the path itself is assembled in a
      // std::string so long directory or project names cannot overflow.
      char stepTag[16];
      snprintf(stepTag, sizeof(stepTag), "%04d", timeStep);
      const std::string csvFileName = outputDir + "/Probes_" + fname + "_" + stepTag + ".csv";

      std::ofstream csvFile(csvFileName);
      if (!csvFile.is_open()) {
        std::cerr << "Error opening file: " << csvFileName << std::endl;
        return;
      }

      // Header
      csvFile << "Ex,Ey,Ez,Hx,Hy,Hz\n";

      // Number formatting is sticky on the stream, so set it once
      csvFile << std::fixed << std::setprecision(6);

      for (const int node_id : node_ids)
      {
        const vtr& E = Efield[node_id];
        const vtr& H = Hfield[node_id];
        csvFile << E.getx() << "," << E.gety() << "," << E.getz() << ","
                << H.getx() << "," << H.gety() << "," << H.getz() << "\n";
      }

      csvFile.close();
    }


  void FemGrp::writeCurrentsOutputSurfMesh_CuBLAS(int timeStep)
  {

    const int num_nodes = outputMesh.num_nodes;
    const int num_tri = outputMesh.num_triangles;

    // ----------------------------------------------
    // Step 1: Compute fields at all nodes (scattered field)
    // ----------------------------------------------

    // Incident Field at points
    std::vector<vtr> E_field(num_nodes);
    std::vector<vtr> H_field(num_nodes);
    std::vector<vtr> Einc_field(num_nodes);
    std::vector<vtr> Hinc_field(num_nodes);


    int i, j;
    fp_t vol;
    fp_t zeta[4];
    vtr lvtr[3];
    vtr avtr[4];

    int tetraMAP_aux[TetPolyOrderDim[getPolyFlag()]];
    fp_t_ts E_coeff[TetPolyOrderDim[getPolyFlag()]];
    fp_t_ts H_coeff[TetPolyOrderDim[getPolyFlag()]];

    vtr Einc;
    vtr Hinc;
    vtr r;
    vtr eField;
    vtr hField;


    // DEBUG purpose: Store all the node ids as probes
    vector<int> node_ids(num_nodes);
    for(i = 0; i < num_nodes; i++)
    {
      node_ids[i] = i;
    }


    // Compute the Incident Fields
    for(i = 0; i < num_nodes; i++)
    {
      int number_of_associated_tets = tri_nodes_bary.at(i).first;

      Einc.reset();
      Hinc.reset();

      std::vector<std::pair<int, std::array<double, 4>>> found_tets = tri_nodes_bary.at(i).second;
      Einc_field[i].reset();
      Hinc_field[i].reset();

      for (int t = 0; t < number_of_associated_tets; t++)
      {
        int tet_id = found_tets.at(t).first;
        array<double,4> tri_bary_coord = found_tets.at(t).second;
        tetra& tet = tetARRAY[tet_id];

        zeta[0] = static_cast<fp_t>(tri_bary_coord[0]);
        zeta[1] = static_cast<fp_t>(tri_bary_coord[1]);
        zeta[2] = static_cast<fp_t>(tri_bary_coord[2]);
        zeta[3] = static_cast<fp_t>(tri_bary_coord[3]);

        SimplexToCartesian(tet, r, zeta);
        getAnalyticalPWField(tet, r, Einc, Hinc, timeStep, LocTimeSteps[N_class -1]);

        Einc_field[i] = Einc_field[i] + Einc;
        Hinc_field[i] = Hinc_field[i] + Hinc;

      }

      Einc_field[i] = Einc_field[i] / ((fp_t) number_of_associated_tets);
      Hinc_field[i] = Hinc_field[i] / ((fp_t) number_of_associated_tets);

    }

    //writeProbeFieldsCSV( "./PROBES_inc", fname, timeStep, node_ids, Einc_field, Hinc_field);

    make_dir_if_not_exist("./CURRENT_INC");
    char regFileName[StrOutput];
    // Prepare output file name
    regFileName[StrOutput] = {0};
    sprintf(regFileName, "./CURRENT_INC/Einc_field_%s_%05d.dat", fname, timeStep);

    // Open output file
    FILE* fout = fopen(regFileName, "w");
    if (!fout)
    {
        std::cerr << "❌ Failed to open output file: " << regFileName << std::endl;
        return;
    }

    std::vector<int> tri_nodes = outputMesh.getTriangle(1);
    int nodeIdx = tri_nodes[0];                // Pick only the first node
    const vtr& E = Einc_field[nodeIdx];        // Get E-field vector at that node

    // Write full vector (Ex, Ey, Ez) to file
    fprintf(fout, "%.10e %.10e %.10e\n", E.getx(), E.gety(), E.getz());
    fclose(fout);  // Done!


    // Calculate Total Fields at the points
    for(i = 0; i < num_nodes; i++)
    {
      int number_of_associated_tets = tri_nodes_bary.at(i).first;

      eField.reset();
      hField.reset();

      std::vector<std::pair<int, std::array<double, 4>>> found_tets = tri_nodes_bary.at(i).second;
      E_field[i].reset();
      H_field[i].reset();

      for (int t = 0; t < number_of_associated_tets; t++)
      {

        int tet_id = found_tets.at(t).first;
        array<double,4> tri_bary_coord = found_tets.at(t).second;
        tetra& tet = tetARRAY[tet_id];

        tet.geometry(lvtr, avtr, &vol);
        avtr[3].reset();
        avtr[3] = avtr[3] - (avtr[0] + avtr[1] + avtr[2]);

        eField.reset();
        hField.reset();
        zeta[0] = static_cast<fp_t>(tri_bary_coord[0]);
        zeta[1] = static_cast<fp_t>(tri_bary_coord[1]);
        zeta[2] = static_cast<fp_t>(tri_bary_coord[2]);
        zeta[3] = static_cast<fp_t>(tri_bary_coord[3]);

        eField = CalcEfield(&En1_h[mapIdLoc[tet.getcnt()] * TetPolyOrderDim[PolyFlag]], avtr, vol, zeta, PolyFlag);
        hField = CalcEfield(&Hn32_h[mapIdLoc[tet.getcnt()] * TetPolyOrderDim[PolyFlag]], avtr, vol, zeta, PolyFlag);

        E_field[i] = E_field[i] + eField;
        H_field[i] = H_field[i] + hField;

      }

      E_field[i] = E_field[i] / ((fp_t) number_of_associated_tets);
      H_field[i] = H_field[i] / ((fp_t) number_of_associated_tets);

    }

    //writeProbeFieldsCSV( "./PROBES_total", fname, timeStep, node_ids, E_field, H_field);


    regMface = new Register[outputMesh.num_triangles];
    regJface = new Register[outputMesh.num_triangles];

    make_dir_if_not_exist("./CURRENT_Total");


    for(int i = 0; i < outputMesh.num_triangles; i++)
    {

        std::vector<int> tri_nodes = outputMesh.getTriangle(i);

        std::vector<float> normal_d = outputMesh.getNormal(i);
        vtr NormalVtr(normal_d[0], normal_d[1], normal_d[2]);
        regMface[i].initial(3);
        regJface[i].initial(3);

        for(j = 0; j < 3; j++)
        {
          int nodeIdx = tri_nodes[j];
          vtr eLocalFace = E_field[nodeIdx];
          vtr hLocalFace = H_field[nodeIdx];

          // No averaging
          regMface[i].setField(j, NormalVtr * eLocalFace * (-1.0));
          regJface[i].setField(j, NormalVtr * hLocalFace * (1.0));
        }
    }

      // Register
      memset(regFileName, 0, StrOutput * sizeof(char));
      sprintf(regFileName, "./CURRENT_Total/Currents_%s_%05d", fname, timeStep);
      printRegister(regMface, regJface, outputMesh.num_triangles, regFileName,1);

      delete[] regMface;
      delete[] regJface;


    // Calculate Scattered Fields at the points
    for(int i = 0; i < num_nodes; i++)
    {
      E_field[i] = E_field[i] - Einc_field[i];
      H_field[i] = H_field[i] - Hinc_field[i];
    }

    //writeProbeFieldsCSV( "./PROBES_sc", fname, timeStep, node_ids, E_field, H_field);


    // ----------------------------------------------------------------------------------------------
    // Write the Scattered Fields

    regMface = new Register[outputMesh.num_triangles];
    regJface = new Register[outputMesh.num_triangles];

    make_dir_if_not_exist("./CURRENT_SC");


    for(int i = 0; i < outputMesh.num_triangles; i++)
    {

        std::vector<int> tri_nodes = outputMesh.getTriangle(i);

        std::vector<float> normal_d = outputMesh.getNormal(i);
        vtr NormalVtr(normal_d[0], normal_d[1], normal_d[2]);
        regMface[i].initial(3);
        regJface[i].initial(3);

        for(j = 0; j < 3; j++)
        {
          int nodeIdx = tri_nodes[j];
          vtr eLocalFace = E_field[nodeIdx];
          vtr hLocalFace = H_field[nodeIdx];

          // No averaging
          regMface[i].setField(j, NormalVtr * eLocalFace * (-1.0));
          regJface[i].setField(j, NormalVtr * hLocalFace * (1.0));
        }
    }

      // Register
      memset(regFileName, 0, StrOutput * sizeof(char));
      sprintf(regFileName, "./CURRENT_SC/Currents_%s_%05d", fname, timeStep);
      printRegister(regMface, regJface, outputMesh.num_triangles, regFileName,1);

      delete[] regMface;
      delete[] regJface;


  }


  #endif
#endif