//
// Created by jundafeng on 3/23/22.
//

#ifndef DGTD_KERNELS_CU
#define DGTD_KERNELS_CU
#include "kernels.cuh"
/* Basic kernel: one tetra per thread
 * Opt1: one block per tetra, one entry per thread, registers can be shuffled.
 * New kernels can be parallelized in a smarter way:
 * 1. volume integral can be parallelized in (12,12)
 * 2. surface integral can  be parallelized in (6,6)
 * 3. coupling integral can be parallelized in (6,6)
 * 4. excitation can be parallelized in (12,1)
 */

//-----------------------CONSTANTS--------------------------------------------------------------------------------------------

__device__ constexpr int GAUSS_POINT_NUM[4]  = {6, 9, 9, 9};
__device__ constexpr int TetPolyOrderDim[4]  = {6, 12, 30, 45};
__device__ constexpr int FacePolyOrderDim[4] = {3,  6, 12, 18};

#define g6_a 0.816847572980459
#define g6_b (1.0 - g6_a) / 2.0
#define g6_c 0.108103018168070
#define g6_d (1.0 - g6_c) / 2.0
#define g6_W1 0.109951743655322
#define g6_W2 0.223381589678011

#define g9_a 0.437525248383384
#define g9_b 1.0 - 2.0 * g9_a
#define g9_c 0.797112651860071
#define g9_d 0.165409927389841
#define g9_e 1.0 - g9_c - g9_d
#define g9_W1 0.205950504760887
#define g9_W2 0.063691414286223

#if defined(DGTD_USE_CUDA)

#include <cuda.h>
#include <cuda_runtime.h>
#include <cooperative_groups.h>
#include "cuda_utils.h"

__device__ fp_t_ts g2d_6[6][4] = {
    g6_a, g6_b, g6_b, g6_W1,
    g6_b, g6_a, g6_b, g6_W1,
    g6_b, g6_b, g6_a, g6_W1,
    g6_c, g6_d, g6_d, g6_W2,
    g6_d, g6_c, g6_d, g6_W2,
    g6_d, g6_d, g6_c, g6_W2};

// added by  Seung-Cheol Lee & Marinos N. Vouvakis 08/12/02
// Order = 5
__device__ fp_t_ts g2d_9[9][4] = {
    g9_b, g9_a, g9_a, g9_W1,
    g9_a, g9_b, g9_a, g9_W1,
    g9_a, g9_a, g9_b, g9_W1,

    g9_c, g9_d, g9_e, g9_W2,
    g9_c, g9_e, g9_d, g9_W2,
    g9_d, g9_c, g9_e, g9_W2,
    g9_d, g9_e, g9_c, g9_W2,
    g9_e, g9_c, g9_d, g9_W2,
    g9_e, g9_d, g9_c, g9_W2};

// __device__ const int FACE_BASIS_IJK[6][2] = {
//   {1, 2},
//   {0, 2},
//   {0, 1},
//   {1, 2},
//   {0, 2},
//   {0, 1},
// };

__device__ const int FACE_BASIS_IJK[18][3] = {
    {1, 2, -1},
    {0, 2, -1},
    {0, 1, -1},
    {1, 2, -1},
    {0, 2, -1},
    {0, 1, -1},
    {0, 1, 2},
    {0, 1, 2},
    {1, 2, -1},
    {0, 2, -1},
    {0, 1, -1},
    {0, 1, 2},
    {0, 1, 2},
    {0, 1, 2},
    {0, 1, 2},
    {0, 1, 2},
    {0, 1, 2},
    {0, 1, 2}
};

__device__ const int FAC2TET[4][18] = {
    {5, 4, 3, 11, 10, 9, 12, 13, 25, 24, 23, 26, 30, 31, 32, 42, 43, 44},
    {5, 2, 1, 11,  8, 7, 14, 15, 25, 22, 21, 27, 33, 34, 35, 42, 43, 44},
    {4, 2, 0, 10,  8, 6, 16, 17, 24, 22, 20, 28, 36, 37, 38, 42, 43, 44},
    {3, 1, 0,  9,  7, 6, 18, 19, 23, 21, 20, 29, 39, 40, 41, 42, 43, 44}
};


#endif

//----------------------------------------------------------------------------------------------------------------------------

#if defined(DGTD_USE_CUDA) || defined(DGTD_USE_CUDA_OPENCL)

__global__ void makeNeighField(int* MappingArray, fp_t_ts* Field, fp_t_ts* auxiliar, int elements)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if(idx < elements){
    auxiliar[idx] = Field[MappingArray[idx]];
    // printf("%d\n", MappingArray[idx]);
  }
}

__global__ void makeNeighFieldRegular(int* MappingArray, fp_t_ts* Field, fp_t_ts* auxiliar, int elements, int polyOrder){
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if(idx < NumOfFaces * FacePolyOrderDim[polyOrder] * elements){

    int blockId = idx / FacePolyOrderDim[polyOrder];

    int basisId = idx % FacePolyOrderDim[polyOrder];
    int tetId = blockId / NumOfFaces;
    int faceId = blockId % NumOfFaces;

    auxiliar[(faceId * elements + tetId) * FacePolyOrderDim[polyOrder] + basisId] = Field[MappingArray[idx]];

    // for(int i = 0; i < NumOfFaces; i++){
    //   auxiliar[idx + i * elements] = Field[MappingArray[idx + i * FacePolyOrderDim[polyOrder]]];
    // }
  }
}


__global__ void makeNeighFieldRegular_custom(int* MappingArray, fp_t_ts* Field, fp_t_ts* auxiliar, int elements, int polyOrder, int numFaces){
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if(idx < numFaces * FacePolyOrderDim[polyOrder] * elements){

    int blockId = idx / FacePolyOrderDim[polyOrder];

    int basisId = idx % FacePolyOrderDim[polyOrder];
    int tetId = blockId / numFaces;
    int faceId = blockId % numFaces;

    auxiliar[(faceId * elements + tetId) * FacePolyOrderDim[polyOrder] + basisId] = Field[MappingArray[idx]];

  }
}


__global__ void addCouplingResults(fp_t_ts* Field, fp_t_ts* auxiliar, int* neighs, int* neighsOffset, int elements){
  int tet = blockIdx.x * blockDim.y + threadIdx.y;

  if(tet < elements){
    int basis = blockDim.x;

    fp_t_ts fieldAux = 0.0;
    int offset = (neighsOffset[tet] - neighsOffset[0]) * basis;

    for(int i = 0; i < neighs[tet]; i++){
      fieldAux += auxiliar[offset + basis * i + threadIdx.x];
    }

    Field[tet * basis + threadIdx.x] += fieldAux;
  }
}

__global__ void addCouplingResultsRegular(fp_t_ts* Field, fp_t_ts* auxiliar, int elements){
  int tet = blockIdx.x * blockDim.y + threadIdx.y;

  if(tet < elements){
    int basis = blockDim.x;

    fp_t_ts fieldAux = 0.0;

    for(int i = 0; i < NumOfFaces; i++){
      fieldAux += auxiliar[tet * basis + elements * basis * i + threadIdx.x];
    }

    Field[tet * basis + threadIdx.x] += fieldAux;
  }
}


__global__ void addCouplingResultsRegular_custom(fp_t_ts* Field, fp_t_ts* auxiliar, int elements, int numFaces){
  
  int tet = blockIdx.x * blockDim.y + threadIdx.y;

  if(tet < elements)
  {
    int basis = blockDim.x;

    fp_t_ts fieldAux = 0.0;

    for(int i = 0; i < numFaces; i++)
    {
      fieldAux += auxiliar[tet * basis + elements * basis * i + threadIdx.x];
    }

    Field[tet * basis + threadIdx.x] += fieldAux;
  }
}


  __global__ void fill_ones_cols(fp_t_ts* __restrict__ M, int rows, int cols, int ld)
  {
      int r = blockIdx.x * blockDim.x + threadIdx.x;
      int c = blockIdx.y * blockDim.y + threadIdx.y;
      if (r < rows && c < cols) M[(size_t)c * ld + r] = 1.0f;
  }


// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //
// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //
// 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 //

__device__ fp_t_ts dotP3131(const fp_t_ts *v1, const fp_t_ts *v2){
  fp_t_ts result;
  result = v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
  return result;
}

__device__ void crossP3(const fp_t_ts *v1, const fp_t_ts *v2, fp_t_ts *result){
  result[0] = v1[1] * v2[2] - v1[2] * v2[1];
  result[1] = -(v1[0] * v2[2] - v1[2] * v2[0]);
  result[2] = v1[0] * v2[1] - v1[1] * v2[0];
}

__device__ void scale3(fp_t_ts *v1, const fp_t_ts factor){
  v1[0] = v1[0] * factor;
  v1[1] = v1[1] * factor;
  v1[2] = v1[2] * factor;
}

__device__ void scale3(const fp_t_ts *v1, const fp_t_ts factor, fp_t_ts *v2){
  v2[0] = v1[0] * factor;
  v2[1] = v1[1] * factor;
  v2[2] = v1[2] * factor;
}

__device__ void unit3(fp_t_ts *v1){
  fp_t_ts work[3];
  fp_t_ts mag = sqrt(v1[0] * v1[0] + v1[1] * v1[1] + v1[2] * v1[2]);

  work[0] = v1[0] / mag;
  work[1] = v1[1] / mag;
  work[2] = v1[2] / mag;

  v1[0] = work[0];
  v1[1] = work[1];
  v1[2] = work[2];
}

__device__ void plus3(fp_t_ts *v1, fp_t_ts *v2){
  v1[0] = v1[0] + v2[0];
  v1[1] = v1[1] + v2[1];
  v1[2] = v1[2] + v2[2];
}

__device__ void set3(const fp_t_ts *v1, fp_t_ts *v2){
  v2[0] = v1[0];
  v2[1] = v1[1];
  v2[2] = v1[2];
}

__device__ void plus3(const fp_t_ts *v1, const fp_t_ts *v2, fp_t_ts *result){
  result[0] = v1[0] + v2[0];
  result[1] = v1[1] + v2[1];
  result[2] = v1[2] + v2[2];
}

__device__ void plus33(const fp_t_ts *v1, const fp_t_ts *v2, const fp_t_ts *v3, fp_t_ts *result){
  result[0] = v1[0] + v2[0] + v3[0];
  result[1] = v1[1] + v2[1] + v3[1];
  result[2] = v1[2] + v2[2] + v3[2];
}

__device__ void minus3(const fp_t_ts *v1, const fp_t_ts *v2, fp_t_ts *result){
  result[0] = v1[0] - v2[0];
  result[1] = v1[1] - v2[1];
  result[2] = v1[2] - v2[2];
}

__device__ void reset3(fp_t_ts *result){
  result[0] = 0.0;
  result[1] = 0.0;
  result[2] = 0.0;
}

__device__ void geometry3(const fp_t_ts *nd_coord, fp_t_ts lvtr[][3], fp_t_ts avtr[][3], fp_t_ts *vol){
  int i, i1, i2, i3;
  fp_t_ts vv[3], tt[3];

  for (i = 0; i < 3; i++){
    minus3(&nd_coord[(i + 1) * 3], &nd_coord[0], lvtr[i]);
  }

  for (i = 0; i < 3; i++){
    i1 = (i + 1) % 4;
    i2 = (i + 2) % 4;
    i3 = (i + 3) % 4;

    minus3(&nd_coord[i2 * 3], &nd_coord[i1 * 3], tt);
    minus3(&nd_coord[i3 * 3], &nd_coord[i1 * 3], vv);
    crossP3(tt, vv, avtr[i]);
    scale3(avtr[i], 0.5);

    minus3(&nd_coord[i * 3], &nd_coord[i1 * 3], tt);
    if (dotP3131(avtr[i], tt) < 0.0)
      scale3(avtr[i], -1);
    ;
  }

  reset3(avtr[3]);
  plus33(avtr[0], avtr[1], avtr[2], avtr[3]);
  scale3(avtr[3], -1);

  (*vol) = dotP3131(avtr[2], lvtr[1]) / 3.0;
}

__device__ fp_t_ts magnitude3(const fp_t_ts *v1){
  return sqrt(v1[0] * v1[0] + v1[1] * v1[1] + v1[2] * v1[2]);
}

__device__ void tet_center(const fp_t_ts *tet_nd_coord, fp_t_ts *tet_center){
  tet_center[0] = (tet_nd_coord[0 * 3 + 0] + tet_nd_coord[1 * 3 + 0] + tet_nd_coord[2 * 3 + 0] + tet_nd_coord[3 * 3 + 0]) * 0.25;
  tet_center[1] = (tet_nd_coord[0 * 3 + 1] + tet_nd_coord[1 * 3 + 1] + tet_nd_coord[2 * 3 + 1] + tet_nd_coord[3 * 3 + 1]) * 0.25;
  tet_center[2] = (tet_nd_coord[0 * 3 + 2] + tet_nd_coord[1 * 3 + 2] + tet_nd_coord[2 * 3 + 2] + tet_nd_coord[3 * 3 + 2]) * 0.25;
}

// have to use 2d arrays in the device functions, using 1d array with offsets will result in erroneous values
__device__ void geometry2(const fp_t_ts *nd_coord, const fp_t_ts *tet0_nd_coord, fp_t_ts *normal, fp_t_ts nvtr[][3], fp_t_ts *area){
  fp_t_ts t0[3], t1[3], tv[3], tet0center[3];

  minus3(&nd_coord[2 * 3], &nd_coord[1 * 3], t0);
  minus3(&nd_coord[0 * 3], &nd_coord[2 * 3], t1);

  crossP3(t0, t1, normal);

  (*area) = 0.5 * magnitude3(normal);
  scale3(normal, 1 / magnitude3(normal));

  crossP3(normal, t0, nvtr[0]);
  crossP3(normal, t1, nvtr[1]);

  if (dotP3131(nvtr[0], t1) < 0.0)
    scale3(nvtr[0], -1);
  if (dotP3131(nvtr[1], t0) > 0.0)
    scale3(nvtr[1], -1);

  tet_center(tet0_nd_coord, tet0center);
  minus3(tet0center, &nd_coord[0 * 3], tv);

  if (dotP3131(tv, normal) > 0.0)
    scale3(normal, -1);
}

__device__ void geometry2(const fp_t_ts *nd_coord, fp_t_ts nvtr[][3], fp_t_ts *area){
  fp_t_ts t0[3], t1[3], normal[3];
  minus3(&nd_coord[2 * 3], &nd_coord[1 * 3], t0);
  minus3(&nd_coord[0 * 3], &nd_coord[2 * 3], t1);

  crossP3(t0, t1, normal);

  (*area) = 0.5 * magnitude3(normal);
  scale3(normal, 1 / magnitude3(normal));

  crossP3(normal, t0, nvtr[0]);
  crossP3(normal, t1, nvtr[1]);

  if (dotP3131(nvtr[0], t1) < 0.0)
    scale3(nvtr[0], -1);
  if (dotP3131(nvtr[1], t0) > 0.0)
    scale3(nvtr[1], -1);
}

__device__ void dotP3331(const fp_t_ts *tensor, const fp_t_ts *v1, fp_t_ts *result){
  result[0] = v1[0] * tensor[0] + v1[1] * tensor[1] + v1[2] * tensor[2];
  result[1] = v1[0] * tensor[3] + v1[1] * tensor[4] + v1[2] * tensor[5];
  result[2] = v1[0] * tensor[6] + v1[1] * tensor[7] + v1[2] * tensor[8];
}

__device__ void makeCoeff33(const fp_t_ts avtr[][3], const fp_t_ts *epsr, fp_t_ts coeffA[][3]){
  int i, j;
  fp_t_ts temp[3];
  for (i = 0; i < 3; i++){
    for (j = 0; j < 3; j++){
      dotP3331(epsr, avtr[j], temp);
      coeffA[i][j] = dotP3131(temp, avtr[i]);
    }
  }
}

__device__ void quadObtention(int np, int id, fp_t_ts &zeta0, fp_t_ts &zeta1, fp_t_ts &zeta2, fp_t_ts &weight){
  if (id < 0 || id >= np){
    printf("Error in quadObtention");
    return;
  }

  switch (np){
  case 6:
    zeta0 = g2d_6[id][0];
    zeta1 = g2d_6[id][1];
    zeta2 = g2d_6[id][2];
    weight = g2d_6[id][3];
    break;
  case 9:
    zeta0 = g2d_9[id][0];
    zeta1 = g2d_9[id][1];
    zeta2 = g2d_9[id][2];
    weight = g2d_9[id][3];
    break;
  default:
    break;
  }
}

__device__ void faceBasisW(const fp_t_ts *node_coords, const fp_t_ts *zeta, const int p, fp_t_ts *W){
  fp_t_ts gradZeta[3][3];
  fp_t_ts area;
  geometry2(node_coords, gradZeta, &area);

  scale3(gradZeta[0], 0.5 / area);
  scale3(gradZeta[1], 0.5 / area);
  gradZeta[2][0] = 0;
  gradZeta[2][1] = 0;
  gradZeta[2][2] = 0;
  minus3(gradZeta[2], gradZeta[0], gradZeta[2]);
  minus3(gradZeta[2], gradZeta[1], gradZeta[2]);

  int i = FACE_BASIS_IJK[p][0];
  int j = FACE_BASIS_IJK[p][1];
  int k = FACE_BASIS_IJK[p][2];

  switch (p){
    case 0:
      scale3(gradZeta[j], zeta[i]);
      scale3(gradZeta[i], zeta[j]);
      minus3(gradZeta[j], gradZeta[i], W);
      break;
    case 1:
      scale3(gradZeta[j], zeta[i]);
      scale3(gradZeta[i], zeta[j]);
      minus3(gradZeta[j], gradZeta[i], W);
      break;
    case 2:
      scale3(gradZeta[j], zeta[i]);
      scale3(gradZeta[i], zeta[j]);
      minus3(gradZeta[j], gradZeta[i], W);
      break;
    case 3:
      scale3(gradZeta[j], zeta[i]);
      scale3(gradZeta[i], zeta[j]);
      plus3(gradZeta[j], gradZeta[i], W);
      scale3(W, 4.);
      break;
    case 4:
      scale3(gradZeta[j], zeta[i]);
      scale3(gradZeta[i], zeta[j]);
      plus3(gradZeta[j], gradZeta[i], W);
      scale3(W, 4.);
      break;
    case 5:
      scale3(gradZeta[j], zeta[i]);
      scale3(gradZeta[i], zeta[j]);
      plus3(gradZeta[j], gradZeta[i], W);
      scale3(W, 4.);
      break;
    case 6:
      scale3(gradZeta[k], zeta[i]);
      scale3(gradZeta[i], zeta[k]);
      minus3(gradZeta[k], gradZeta[i], W);
      scale3(W, 4 * zeta[j]);
      break;
    case 7:
      scale3(gradZeta[j], zeta[i]);
      scale3(gradZeta[i], zeta[j]);
      minus3(gradZeta[j], gradZeta[i], W);
      scale3(W, 4. * zeta[k]);
      break;
    case 8:
      scale3(gradZeta[j], zeta[i] * zeta[i] - 2. * zeta[i] * zeta[j]);
      scale3(gradZeta[i], zeta[j] * zeta[j] - 2. * zeta[i] * zeta[j]);
      minus3(gradZeta[j], gradZeta[i], W);
      scale3(W, 4.);
      break;
    case 9:
      scale3(gradZeta[j], zeta[i] * zeta[i] - 2. * zeta[i] * zeta[j]);
      scale3(gradZeta[i], zeta[j] * zeta[j] - 2. * zeta[i] * zeta[j]);
      minus3(gradZeta[j], gradZeta[i], W);
      scale3(W, 4.);
      break;
    case 10:
      scale3(gradZeta[j], zeta[i] * zeta[i] - 2. * zeta[i] * zeta[j]);
      scale3(gradZeta[i], zeta[j] * zeta[j] - 2. * zeta[i] * zeta[j]);
      minus3(gradZeta[j], gradZeta[i], W);
      scale3(W, 4.);
      break;
    case 11:
      scale3(gradZeta[k], zeta[i] * zeta[j]);
      scale3(gradZeta[j], zeta[i] * zeta[k]);
      scale3(gradZeta[i], zeta[k] * zeta[j]);
      plus33(gradZeta[k], gradZeta[j], gradZeta[i], W);
      scale3(W, 8.);
      break;
    case 12:
      scale3(gradZeta[k], zeta[j]);
      scale3(gradZeta[j], zeta[k]);
      minus3(gradZeta[k], gradZeta[j], W);
      scale3(W, 4. * zeta[i] * zeta[i]);
      break;
    case 13:
      scale3(gradZeta[k], zeta[i]);
      scale3(gradZeta[i], zeta[k]);
      minus3(gradZeta[k], gradZeta[i], W);
      scale3(W, 4. * zeta[j] * zeta[j]);
      break;
    case 14:
      scale3(gradZeta[j], zeta[i]);
      scale3(gradZeta[i], zeta[j]);
      minus3(gradZeta[j], gradZeta[i], W);
      scale3(W, 4. * zeta[k] * zeta[k]);
      break;
    case 15:
      W[0] = W[1] = W[2] = 0;
      break;
    case 16:
      W[0] = W[1] = W[2] = 0;
      break;
    case 17:
      W[0] = W[1] = W[2] = 0;
      break;
    default:
      W[0] = W[1] = W[2] = 0;
      printf("Check faceBasisW: you are asking for somthing out of scope");
      break;
  }
}

__device__ void getCartesianCoord(const fp_t_ts *node_coords, const fp_t_ts *zeta, fp_t_ts *pnt){
  pnt[0] = node_coords[0 * 3 + 0] * zeta[0] + node_coords[1 * 3 + 0] * zeta[1] + node_coords[2 * 3 + 0] * zeta[2];
  pnt[1] = node_coords[0 * 3 + 1] * zeta[0] + node_coords[1 * 3 + 1] * zeta[1] + node_coords[2 * 3 + 1] * zeta[2];
  pnt[2] = node_coords[0 * 3 + 2] * zeta[0] + node_coords[1 * 3 + 2] * zeta[1] + node_coords[2 * 3 + 2] * zeta[2];
}

__device__ void PlaneWaveFields(int ExcitationFlag, fp_t_ts t, fp_t_ts to, fp_t_ts tau, const fp_t_ts *r, const fp_t_ts *ro, fp_t_ts freq_m, fp_t_ts *kvtr, fp_t_ts eta, fp_t_ts Emagnitude, fp_t_ts &IncidExcit_E, fp_t_ts &IncidExcit_H)
{

  fp_t_ts omega = 2.0 * Pi * freq_m * MEGA;
  fp_t_ts Exponent;
  fp_t_ts SinModul;
  fp_t_ts Neuman;
  fp_t_ts work[3];
  // printf("%d", ExcitationFlag);
  switch (ExcitationFlag){
    case 0:
      minus3(r, ro, work);
      Exponent = t - to - dotP3131(kvtr, work) / Vo;
      // Time Harmonic
      if (Exponent >= 0.0){
        SinModul = cos(omega * Exponent);
        IncidExcit_E = Emagnitude * SinModul;
        IncidExcit_H = (Emagnitude / eta) * SinModul;
      }else{
        IncidExcit_E = 0.0;
        IncidExcit_H = 0.0;
      }
      break;

    case 1:
      // Gauss Pulse
      minus3(r, ro, work);
      Exponent = t - to - dotP3131(kvtr, work) / Vo;
      SinModul = cos(omega * Exponent);
      IncidExcit_E = Emagnitude * SinModul * exp(-(Exponent * Exponent) / (tau * tau));
      IncidExcit_H = (Emagnitude / eta) * SinModul * exp(-(Exponent * Exponent) / (tau * tau));
      break;

    case 2:
      // Neuman Pulse
      minus3(r, ro, work);
      Exponent = t - to - dotP3131(kvtr, work) / Vo;
      Neuman = (2.0 * Exponent) / (tau * tau);
      IncidExcit_E = (Emagnitude * Neuman) * exp(-(Exponent * Exponent) / (tau * tau));
      IncidExcit_H = ((Emagnitude / eta) * Neuman) * exp(-(Exponent * Exponent) / (tau * tau));
      break;

    case 3:
      {
          fp_t_ts work[3];
          minus3(r, ro, work);  // work = r - ro
    
          Exponent = t - dotP3131(kvtr, work) / Vo;
    
          if (Exponent >= 0.0 && Exponent <= to) 
          {
            // Chirp parameters
            fp_t_ts f_end = freq_m * MEGA;         // Center frequency in Hz
            fp_t_ts B = tau * MEGA;                   // Bandwidth in Hz
            fp_t_ts f0 = f_end - B;           // Start frequency
            fp_t_ts f1 = f_end;           // End frequency
            fp_t_ts Tchirp = to;                       // Chirp duration
        
            // Chirp phase
            fp_t_ts chirpArg = 2.0 * Pi * f0 * Exponent 
                            + Pi * (f1 - f0) / Tchirp * Exponent * Exponent;
                
            // Final tapered chirp signal
            fp_t_ts chirpSignal = sin(chirpArg);
        
            IncidExcit_E = Emagnitude * chirpSignal;
            IncidExcit_H = (Emagnitude / eta) * chirpSignal;
          } 
          else 
          {
            IncidExcit_E = 0.0;
            IncidExcit_H = 0.0;
          }
          break;
      }
      

    default:
      break;
  }
}


__global__ void CalculatePadeFreq(fp_t* a_k, fp_t* b_k, int M_global, int N, int* constantM, cuDoubleComplex* Hf){

  int m = blockIdx.x * blockDim.x + threadIdx.x;

  if(m < M_global){
    fp_t sum_real_a = 0.0;
    fp_t sum_imag_a = 0.0;
    fp_t sum_real_b = 0.0;
    fp_t sum_imag_b = 0.0;

    for (int n = 0; n < N; n++) {
      fp_t angle = -2.0f * Pi * constantM[m] * n / M_global;
      fp_t cos_val, sin_val;

      sincos(angle, &sin_val, &cos_val);

      sum_real_a += a_k[n] * cos_val;
      sum_imag_a += a_k[n] * sin_val;

      sum_real_b += b_k[n] * cos_val;
      sum_imag_b += b_k[n] * sin_val;
    }

    Hf[m] = cuCdiv(make_cuDoubleComplex(sum_real_a, sum_imag_a), make_cuDoubleComplex(sum_real_b, sum_imag_b));

  }

  return;
}


// === Utility Functions for Double Precision ===
__device__ __forceinline__ fp_t_ts cleanSmall(fp_t_ts val, fp_t_ts eps = 1e-12) {
  return fabs(val) < eps ? 0.0 : val;
}

__device__ __forceinline__ void cleanVec3(fp_t_ts* v, fp_t_ts eps = 1e-12) {
  for (int i = 0; i < 3; ++i)
      v[i] = cleanSmall(v[i], eps);
}


// --------------------------------------------------------------------------------------
// Plane Wave Exciation Update
// --------------------------------------------------------------------------------------


__device__ void E_updateinc_PW(const fp_t_ts t, const fp_t_ts to, const fp_t_ts tau, const fp_t_ts freq_m, const fp_t_ts eta, const fp_t_ts Emagnitude, const fp_t_ts *ro, const fp_t_ts theta, const fp_t_ts phi, const fp_t_ts *Epol, const int ExcitationFlag, const fp_t_ts *r, const fp_t_ts *normalvtr, fp_t_ts *nxHinc, fp_t_ts *nxnxEinc)
{
  fp_t_ts IncidExcit_E, IncidExcit_H;

  fp_t_ts theta_in_rad = theta * (fp_t_ts)Pi / (fp_t_ts)180.0;
  fp_t_ts phi_in_rad = phi * (fp_t_ts)Pi / (fp_t_ts)180.0;
  fp_t_ts kvtr[3], Hpol[3], work[3];
  kvtr[0] = sin(theta_in_rad) * cos(phi_in_rad);
  kvtr[1] = sin(theta_in_rad) * sin(phi_in_rad);
  kvtr[2] = cos(theta_in_rad);

  // Fields in Time Domain
  PlaneWaveFields(ExcitationFlag, t, to, tau, r, ro, freq_m, kvtr, eta, Emagnitude, IncidExcit_E, IncidExcit_H);

  // EField
  crossP3(normalvtr, Epol, work);
  crossP3(normalvtr, work, nxnxEinc);
  scale3(nxnxEinc, IncidExcit_E);

  // HField
  crossP3(kvtr, Epol, work);
  crossP3(normalvtr, work, Hpol);
  scale3(Hpol, IncidExcit_H, nxHinc);


  /*
  if (r[2] == 0.0 && r[1] < 1e-4 && r[1] > 0.0 && r[0] < 5e-4 && r[0] > 0.0)
  {
    printf(
      "r=(%e,%e,%e) | nxnxE =(%e,%e,%e) | nxH =(%e,%e,%e) \n",
      (double)r[0], (double)r[1], (double)r[2], 
      (double)nxnxEinc[0], (double)nxnxEinc[1], (double)nxnxEinc[2],      
      (double)nxHinc[0], (double)nxHinc[1], (double)nxHinc[2]
    );  
  } 
  */
  

  for (int i = 0; i < 3; ++i) 
  {
    if (fabs(nxHinc[i]) < (fp_t_ts)1e-12) nxHinc[i] = 0.0;
    if (fabs(nxnxEinc[i]) < (fp_t_ts)1e-12) nxnxEinc[i] = 0.0;
  }
}


__device__ void H_updateinc_PW(const fp_t_ts t, const fp_t_ts to, const fp_t_ts tau, const fp_t_ts freq_m, const fp_t_ts eta, const fp_t_ts Emagnitude, const fp_t_ts *ro, const fp_t_ts theta, const fp_t_ts phi, const fp_t_ts *Epol, const int ExcitationFlag, const fp_t_ts *r, const fp_t_ts *normalvtr, fp_t_ts *nxEinc, fp_t_ts *nxnxHinc)
{
  fp_t_ts IncidExcit_E, IncidExcit_H;
  fp_t_ts theta_in_rad = theta * (fp_t_ts)Pi / (fp_t_ts)180.0;
  fp_t_ts phi_in_rad = phi * (fp_t_ts)Pi / (fp_t_ts)180.0;
  fp_t_ts kvtr[3], Hpol[3], work[3];
  kvtr[0] = sin(theta_in_rad) * cos(phi_in_rad);
  kvtr[1] = sin(theta_in_rad) * sin(phi_in_rad);
  kvtr[2] = cos(theta_in_rad);

  PlaneWaveFields(ExcitationFlag, t, to, tau, r, ro, freq_m, kvtr, eta, Emagnitude, IncidExcit_E, IncidExcit_H);

  // EField
  crossP3(normalvtr, Epol, work);
  scale3(work, IncidExcit_E, nxEinc);

  // HField
  crossP3(kvtr, Epol, Hpol);
  crossP3(normalvtr, Hpol, work);
  crossP3(normalvtr, work, nxnxHinc);
  scale3(nxnxHinc, IncidExcit_H);

  for (int i = 0; i < 3; ++i) 
  {
    if (fabs(nxEinc[i]) < (fp_t_ts)1e-12) nxEinc[i] = 0.0;
    if (fabs(nxnxHinc[i]) < (fp_t_ts)1e-12) nxnxHinc[i] = 0.0;
  }
}

__global__ void addExcitationH_PML( int* ExcitationFacesCnt,
                          int* ExcitationFacesOffset,
                          int* ExcitationFacesNum,
                          int excitationCNT,
                          int excitationPMLCNT,
                          int8_t* map,
                          ExcitationProp exci_prop,
                          int PolyFlag,
                          fp_t_ts dt_muo, fp_t_ts t,
                          fp_t_ts* nd_coords_tet,
                          fp_t_ts* nd_coords_face,
                          fp_t_ts* Z_face,
                          fp_t_ts* InvMat,
                          fp_t_ts* Field)
{

  int tet_id = blockIdx.x * blockDim.x + threadIdx.x;

  if (tet_id < excitationCNT)
  {
    //-----TODO: Make this part dynamic----------------------------
    const int TetPolyOrderDim2 = TetPolyOrderDim[3];
    const int FacePolyOrderDim2 = FacePolyOrderDim[3];

    fp_t_ts nxe_inc[TetPolyOrderDim2] = {0};
    fp_t_ts nxnxh_inc[TetPolyOrderDim2] = {0};
    //-------------------------------------------------------------
    fp_t_ts Z = 0.0;
    for (int i = 0; i < ExcitationFacesCnt[tet_id]; i++){
      int face_id = ExcitationFacesOffset[tet_id] + i;
      Z = Z_face[face_id];

      fp_t_ts nvtr[2][3];
      fp_t_ts normvtr[3] = {0};
      fp_t_ts area = 0;
      fp_t_ts nxentry;
      fp_t_ts nxnxentry;
      fp_t_ts z[3], w[3], r[3];
      fp_t_ts wgt;

      //-----------------TODO: Make this part dynamic--------------------------------------------
      fp_t_ts nxelem[FacePolyOrderDim2] = {0};
      fp_t_ts nxnxelem[FacePolyOrderDim2] = {0};
      //-----------------------------------------------------------------------------------------

      fp_t_ts nxE_inc[3];
      fp_t_ts nxnxH_inc[3];

      geometry2(&nd_coords_face[3 * 3 * face_id], &nd_coords_tet[4 * 3 * tet_id], normvtr, nvtr, &area);

      for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){
        nxentry = 0;
        nxnxentry = 0;
        for (int k = 0; k < GAUSS_POINT_NUM[PolyFlag]; k++){
          quadObtention(GAUSS_POINT_NUM[PolyFlag], k, z[0], z[1], z[2], wgt);
          faceBasisW(&nd_coords_face[3 * 3 * face_id], z, j, w);
          getCartesianCoord(&nd_coords_face[3 * 3 * face_id], z, r);

          H_updateinc_PW(t, exci_prop.to, exci_prop.tau, exci_prop.freq_m, Z, exci_prop.Emagnitude, exci_prop.ro, exci_prop.theta, exci_prop.phi, exci_prop.Epol, exci_prop.ExcitationFlag, r, normvtr, nxE_inc, nxnxH_inc);

          if (!(isnan(nxE_inc[0]))){
            nxentry += area * wgt * dotP3131(w, nxE_inc);
            nxnxentry += area * wgt * dotP3131(w, nxnxH_inc);
          }
        }
        //nxelem[j] += nxentry;
        //nxnxelem[j] += nxnxentry;
        nxelem[j] += cleanSmall(nxentry);
        nxnxelem[j] += cleanSmall(nxnxentry);
      }
      for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){
        // printf("%d\n", ExcitationFacesNum[face_id]);
        nxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxelem[j];
        nxnxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxnxelem[j];
      }
    }

    for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
      nxe_inc[i] *= -dt_muo;
      //nxe_inc[i] += (-Z * dt_muo) * nxnxh_inc[i];
      nxe_inc[i] *= map[tet_id * TetPolyOrderDim[PolyFlag] + i];
    }
    fp_t_ts sum;
    for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
      sum = 0.0;
      for (int j = 0; j < TetPolyOrderDim[PolyFlag]; j++){
        sum += InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i] * nxe_inc[j] * map[tet_id * TetPolyOrderDim[PolyFlag] + i];
      }
      sum = cleanSmall(sum);

      if (tet_id < excitationPMLCNT)
      {
        Field[tet_id * TetPolyOrderDim[PolyFlag] + i] -= 0.5 * sum;  // Scattered Region
      }
      else
      {
        Field[tet_id * TetPolyOrderDim[PolyFlag] + i] += 0.5 * sum;  // Total Region
      }
    }
  }
}

__global__ void addExcitationE_PML( int* ExcitationFacesCnt,
                          int* ExcitationFacesOffset,
                          int* ExcitationFacesNum,
                          int excitationCNT,
                          int excitationPMLCNT,
                          int8_t* map,
                          ExcitationProp exci_prop,
                          int PolyFlag,
                          fp_t_ts dt_epsilon, fp_t_ts t,
                          fp_t_ts* nd_coords_tet,
                          fp_t_ts* nd_coords_face,
                          fp_t_ts* Z_face,
                          fp_t_ts* InvMat,
                          fp_t_ts* Field){

  int tet_id = blockIdx.x * blockDim.x + threadIdx.x;

  if (tet_id < excitationCNT){
    //-----TODO: Make this part dynamic----------------------------
    const int TetPolyOrderDim2 = TetPolyOrderDim[3];
    const int FacePolyOrderDim2 = FacePolyOrderDim[3];

    fp_t_ts nxh_inc[TetPolyOrderDim2] = {0};
    fp_t_ts nxnxe_inc[TetPolyOrderDim2] = {0};
    //-------------------------------------------------------------
    fp_t_ts Y = 0.0;
    for (int i = 0; i < ExcitationFacesCnt[tet_id]; i++){
      int face_id = ExcitationFacesOffset[tet_id] + i;
      Y = 1 / Z_face[face_id];

      fp_t_ts nvtr[2][3];
      fp_t_ts normvtr[3] = {0};
      fp_t_ts area = 0;
      fp_t_ts nxentry;
      fp_t_ts nxnxentry;
      fp_t_ts z[3], w[3], r[3];
      fp_t_ts wgt;

      //-----------------TODO: Make this part dynamic--------------------------------------------
      fp_t_ts nxelem[FacePolyOrderDim2] = {0};
      fp_t_ts nxnxelem[FacePolyOrderDim2] = {0};
      //-----------------------------------------------------------------------------------------

      fp_t_ts nxH_inc[3];
      fp_t_ts nxnxE_inc[3];

      geometry2(&nd_coords_face[3 * 3 * face_id], &nd_coords_tet[4 * 3 * tet_id], normvtr, nvtr, &area);

      for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){
        nxentry = 0;
        nxnxentry = 0;
        for (int k = 0; k < GAUSS_POINT_NUM[PolyFlag]; k++){
          quadObtention(GAUSS_POINT_NUM[PolyFlag], k, z[0], z[1], z[2], wgt);
          faceBasisW(&nd_coords_face[3 * 3 * face_id], z, j, w);
          getCartesianCoord(&nd_coords_face[3 * 3 * face_id], z, r);

          E_updateinc_PW(t, exci_prop.to, exci_prop.tau, exci_prop.freq_m,  Z_face[face_id], exci_prop.Emagnitude, exci_prop.ro, 
                        exci_prop.theta, exci_prop.phi, exci_prop.Epol, exci_prop.ExcitationFlag, r, normvtr, nxH_inc, nxnxE_inc);


          if (!(isnan(nxH_inc[0]))){
            nxentry += area * wgt * dotP3131(w, nxH_inc);
            nxnxentry += area * wgt * dotP3131(w, nxnxE_inc);
          }
        }
        
        //nxelem[j] += nxentry;
        //nxnxelem[j] += nxnxentry;
        nxelem[j] += cleanSmall(nxentry);
        nxnxelem[j] += cleanSmall(nxnxentry);
      }
      for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){
        // printf("%d\n", ExcitationFacesNum[face_id]);
        nxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxelem[j];
        nxnxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxnxelem[j];
      }
      
    }

    for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
      nxh_inc[i] *= dt_epsilon;
      //nxh_inc[i] += (-Y * dt_epsilon) * nxnxe_inc[i];
      nxh_inc[i] *= map[tet_id * TetPolyOrderDim[PolyFlag] + i];
    }
    fp_t_ts sum;
    for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
      sum = 0.0;
      for (int j = 0; j < TetPolyOrderDim[PolyFlag]; j++)
      {
        sum += InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i] * nxh_inc[j] * map[tet_id * TetPolyOrderDim[PolyFlag] + i];  
      }

      sum = cleanSmall(sum);

      if (tet_id < excitationPMLCNT)
      {
        Field[tet_id * TetPolyOrderDim[PolyFlag] + i] -= 0.5 * sum;  // Scattered Region
      }
      else
      {
        Field[tet_id * TetPolyOrderDim[PolyFlag] + i] += 0.5 * sum;  // Total Region
      }

    }
    
  }
}


__global__ void addExcitationE( int* ExcitationFacesCnt,
                          int* ExcitationFacesOffset,
                          int* ExcitationFacesNum,
                          int excitationCNT,
                          int8_t* map,
                          ExcitationProp exci_prop,
                          int PolyFlag,
                          fp_t_ts dt_epsilon, fp_t_ts t,
                          fp_t_ts* nd_coords_tet,
                          fp_t_ts* nd_coords_face,
                          fp_t_ts* Z_face,
                          fp_t_ts* InvMat,
                          fp_t_ts* Field)
{

  int tet_id = blockIdx.x * blockDim.x + threadIdx.x;

  if (tet_id < excitationCNT)
  {
    //-----TODO: Make this part dynamic----------------------------
    const int TetPolyOrderDim2 = TetPolyOrderDim[3];
    const int FacePolyOrderDim2 = FacePolyOrderDim[3];

    fp_t_ts nxh_inc[TetPolyOrderDim2] = {0};
    fp_t_ts nxnxe_inc[TetPolyOrderDim2] = {0};
    //-------------------------------------------------------------
    fp_t_ts Y = 0.0;
    for (int i = 0; i < ExcitationFacesCnt[tet_id]; i++)
    {
      int face_id = ExcitationFacesOffset[tet_id] + i;
      Y = 1 / Z_face[face_id];

      fp_t_ts nvtr[2][3];
      fp_t_ts normvtr[3] = {0};
      fp_t_ts area = 0;
      fp_t_ts nxentry;
      fp_t_ts nxnxentry;
      fp_t_ts z[3], w[3], r[3];
      fp_t_ts wgt;

      //-----------------TODO: Make this part dynamic--------------------------------------------
      fp_t_ts nxelem[FacePolyOrderDim2] = {0};
      fp_t_ts nxnxelem[FacePolyOrderDim2] = {0};
      //-----------------------------------------------------------------------------------------

      fp_t_ts nxH_inc[3];
      fp_t_ts nxnxE_inc[3];

      geometry2(&nd_coords_face[3 * 3 * face_id], &nd_coords_tet[4 * 3 * tet_id], normvtr, nvtr, &area);

      for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++)
      {
        nxentry = 0;
        nxnxentry = 0;
        for (int k = 0; k < GAUSS_POINT_NUM[PolyFlag]; k++)
        {
          quadObtention(GAUSS_POINT_NUM[PolyFlag], k, z[0], z[1], z[2], wgt);
          faceBasisW(&nd_coords_face[3 * 3 * face_id], z, j, w);
          getCartesianCoord(&nd_coords_face[3 * 3 * face_id], z, r);

            E_updateinc_PW(t, exci_prop.to, exci_prop.tau, exci_prop.freq_m, Z_face[face_id], exci_prop.Emagnitude, 
                           exci_prop.ro, exci_prop.theta, exci_prop.phi,exci_prop.Epol,
                           exci_prop.ExcitationFlag,r,normvtr,nxH_inc,nxnxE_inc);
          

          if (!(isnan(nxH_inc[0])))
          {
            nxentry += area * wgt * dotP3131(w, nxH_inc);
            nxnxentry += area * wgt * dotP3131(w, nxnxE_inc);
          }


          /*
          if (r[2] == 0.0)
          {
            printf("r = (%.6e, %.6e, %.6e) | face_id=%d | q=%d | nxH_inc=(%.6e, %.6e, %.6e) | nxnxE_inc=(%.6e, %.6e, %.6e) | nxentry=%.6e | nxnxentry=%.6e\n",
                   (double)r[0], (double)r[1], (double)r[2],
                   face_id, k,
                   (double)nxH_inc[0], (double)nxH_inc[1], (double)nxH_inc[2],
                   (double)nxnxE_inc[0], (double)nxnxE_inc[1], (double)nxnxE_inc[2],
                   (double)nxentry, (double)nxnxentry
                  );
          }
          */
          
          
        }
        
        //printf("face_id=%d j=%d nxentry=%.6e nxnxentry=%.6e\n", face_id, j, (double)nxentry, (double)nxnxentry);
        
        nxelem[j] += nxentry;
        nxnxelem[j] += nxnxentry;

      }
      for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){
        //printf("%d\n", ExcitationFacesNum[face_id]);
        nxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxelem[j];
        nxnxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxnxelem[j];

        //printf("tet_id=%d face_id=%d j=%d TetNode=%d nxh_inc=%.6e nxnxe_inc=%.6e\n", tet_id, face_id, j, FAC2TET[ExcitationFacesNum[face_id]][j], (double)nxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]], (double)nxnxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] );
      }
      
    }

    for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
      nxh_inc[i] *= dt_epsilon;
      //nxh_inc[i] += (-Y * dt_epsilon) * nxnxe_inc[i];
      nxh_inc[i] += (-Y * dt_epsilon) * nxnxe_inc[i];
      nxh_inc[i] *= map[tet_id * TetPolyOrderDim[PolyFlag] + i];

      //printf("tet_id=%d i=%d nxh_inc=%.6e nxnxe_inc=%6e Y=%6e dt_epsilon=%6e \n", tet_id, i, (double)nxh_inc[i], (double)nxnxe_inc[i], (double)Y, (double)dt_epsilon );

    }


    fp_t_ts sum;
    for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
      sum = 0.0;
      for (int j = 0; j < TetPolyOrderDim[PolyFlag]; j++){
        sum += InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i] * nxh_inc[j] * map[tet_id * TetPolyOrderDim[PolyFlag] + i];
        //printf("InvMat[%d][%d]=%.6e | nxh_inc[%d]=%.6e | map=%d\n", j, i, (double)InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i], j, (double)nxh_inc[j], map[tet_id * TetPolyOrderDim[PolyFlag] + i]);
     
      }

      //printf("E-field update: tet_id=%d i=%d sum=%.6e\n", tet_id, i, (double)sum);
   
        Field[tet_id * TetPolyOrderDim[PolyFlag] + i] += sum;  


    }
    
  }
}


__global__ void addExcitationH( int* ExcitationFacesCnt,
                          int* ExcitationFacesOffset,
                          int* ExcitationFacesNum,
                          int excitationCNT,
                          int8_t* map,
                          ExcitationProp exci_prop,
                          int PolyFlag,
                          fp_t_ts dt_muo, fp_t_ts t,
                          fp_t_ts* nd_coords_tet,
                          fp_t_ts* nd_coords_face,
                          fp_t_ts* Z_face,
                          fp_t_ts* InvMat,
                          fp_t_ts* Field)
{

  int tet_id = blockIdx.x * blockDim.x + threadIdx.x;

  if (tet_id < excitationCNT)
  {
    //-----TODO: Make this part dynamic----------------------------
    const int TetPolyOrderDim2 = TetPolyOrderDim[3];
    const int FacePolyOrderDim2 = FacePolyOrderDim[3];

    fp_t_ts nxe_inc[TetPolyOrderDim2] = {0};
    fp_t_ts nxnxh_inc[TetPolyOrderDim2] = {0};
    //-------------------------------------------------------------
    fp_t_ts Z = 0.0;
    for (int i = 0; i < ExcitationFacesCnt[tet_id]; i++){
      int face_id = ExcitationFacesOffset[tet_id] + i;
      Z = Z_face[face_id];

      fp_t_ts nvtr[2][3];
      fp_t_ts normvtr[3] = {0};
      fp_t_ts area = 0;
      fp_t_ts nxentry;
      fp_t_ts nxnxentry;
      fp_t_ts z[3], w[3], r[3];
      fp_t_ts wgt;

      //-----------------TODO: Make this part dynamic--------------------------------------------
      fp_t_ts nxelem[FacePolyOrderDim2] = {0};
      fp_t_ts nxnxelem[FacePolyOrderDim2] = {0};
      //-----------------------------------------------------------------------------------------

      fp_t_ts nxE_inc[3];
      fp_t_ts nxnxH_inc[3];

      geometry2(&nd_coords_face[3 * 3 * face_id], &nd_coords_tet[4 * 3 * tet_id], normvtr, nvtr, &area);

      for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++)
      {
        nxentry = 0;
        nxnxentry = 0;
        for (int k = 0; k < GAUSS_POINT_NUM[PolyFlag]; k++){
          quadObtention(GAUSS_POINT_NUM[PolyFlag], k, z[0], z[1], z[2], wgt);
          faceBasisW(&nd_coords_face[3 * 3 * face_id], z, j, w);
          getCartesianCoord(&nd_coords_face[3 * 3 * face_id], z, r);


            H_updateinc_PW(t, exci_prop.to, exci_prop.tau, exci_prop.freq_m, Z, exci_prop.Emagnitude, 
                           exci_prop.ro, exci_prop.theta, exci_prop.phi,exci_prop.Epol,
                           exci_prop.ExcitationFlag,r,normvtr,nxE_inc,nxnxH_inc);
          
          
          if (!(isnan(nxE_inc[0]))){
            nxentry += area * wgt * dotP3131(w, nxE_inc);
            nxnxentry += area * wgt * dotP3131(w, nxnxH_inc);
          }
        }
        nxelem[j] += nxentry;
        nxnxelem[j] += nxnxentry;
      }
      for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++){
        // printf("%d\n", ExcitationFacesNum[face_id]);
        nxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxelem[j];
        nxnxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxnxelem[j];
      }
    }

    for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
      nxe_inc[i] *= -dt_muo;
      nxe_inc[i] += (-Z * dt_muo) * nxnxh_inc[i];
      nxe_inc[i] *= map[tet_id * TetPolyOrderDim[PolyFlag] + i];
    }
    fp_t_ts sum;
    for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
      sum = 0.0;
      for (int j = 0; j < TetPolyOrderDim[PolyFlag]; j++){
        sum += InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i] * nxe_inc[j] * map[tet_id * TetPolyOrderDim[PolyFlag] + i];
      }

        Field[tet_id * TetPolyOrderDim[PolyFlag] + i] += sum; 

    }
  }
}


__global__ void checker(fp_t_ts* a, fp_t_ts* b, int length){
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if(idx < length){
    if(a[idx] != b[idx]){
      printf("ERROR %.10f\n", a[idx] - b[idx]);
    }
  }
}


// ---- math helpers for fp_t_ts ----
__device__ inline fp_t_ts my_abs(fp_t_ts x) {
  #if defined(__CUDACC__) || defined(__CUDA_ARCH__)
      // CUDA device intrinsics
      return fabsf(x);   // if fp_t_ts=float
      // return fabs(x); // if fp_t_ts=double
  #else
      return std::fabs(x);
  #endif
  }
  
  // max |a[i]-b[i]|; optionally return index of worst component
  __device__ inline fp_t_ts max_abs_diff3(const fp_t_ts a[3],
                                          const fp_t_ts b[3],
                                          int* idx_out) {
      fp_t_ts d0 = my_abs(a[0] - b[0]);
      fp_t_ts d1 = my_abs(a[1] - b[1]);
      fp_t_ts d2 = my_abs(a[2] - b[2]);
      fp_t_ts md = d0; int mi = 0;
      if (d1 > md) { md = d1; mi = 1; }
      if (d2 > md) { md = d2; mi = 2; }
      if (idx_out) *idx_out = mi;
      return md;
  }
  
  // Print both vectors side by side (cast to double for printf)
  __device__ inline void print_vec3_same_line(const char* label,
                                              const fp_t_ts comp[3],
                                              const fp_t_ts ref[3]) {
      printf("%s: comp=(% .6e,% .6e,% .6e) | "
                   "ref=(% .6e,% .6e,% .6e) | "
                   "diff=(% .3e,% .3e,% .3e)\n",
             label,
             (double)comp[0], (double)comp[1], (double)comp[2],
             (double)ref[0],  (double)ref[1],  (double)ref[2],
             (double)(comp[0]-ref[0]),
             (double)(comp[1]-ref[1]),
             (double)(comp[2]-ref[2]));
  }
  

// --------------------------------------------------------------------------------------
// Antenna Port Update
// Implement the port excitation using analytical solution
// TEM Port
// --------------------------------------------------------------------------------------


// Port-only version of H-field excitation
__global__ void addExcitationH_port( int* ExcitationFacesCnt,
                          int* ExcitationFacesOffset,
                          int* ExcitationFacesNum,
                          int excitationCNT,
                          int8_t* map,
                          ExcitationProp* exci_prop_vec,
                          int* PortFacePidx_d,
                          int PolyFlag,
                          fp_t_ts dt_muo, fp_t_ts t,
                          fp_t_ts* nd_coords_tet,
                          fp_t_ts* nd_coords_face,
                          fp_t_ts* Z_face,
                          fp_t_ts* InvMat,
                          fp_t_ts* Field,
                          fp_t_ts* Etan_qp_d, fp_t_ts* Htan_qp_d)
{

  int tet_id = blockIdx.x * blockDim.x + threadIdx.x;


  if (tet_id < excitationCNT)
  {
    //-----TODO: Make this part dynamic----------------------------
    const int TetPolyOrderDim2 = TetPolyOrderDim[3];
    const int FacePolyOrderDim2 = FacePolyOrderDim[3];

    fp_t_ts nxe_inc[TetPolyOrderDim2] = {0};
    fp_t_ts nxnxh_inc[TetPolyOrderDim2] = {0};
    //-------------------------------------------------------------
    fp_t_ts Z = 0.0;
    for (int i = 0; i < ExcitationFacesCnt[tet_id]; i++)
    {
      int face_id = ExcitationFacesOffset[tet_id] + i;
      //printf("face_id = %d\n",face_id);
      Z = Z_face[face_id];

      int port_id = PortFacePidx_d[face_id];
      ExcitationProp exci_prop = exci_prop_vec[port_id];

      fp_t_ts nvtr[2][3];
      fp_t_ts normvtr[3] = {0};
      fp_t_ts area = 0;
      fp_t_ts nxentry;
      fp_t_ts nxnxentry;
      fp_t_ts z[3], w[3], r[3];
      fp_t_ts wgt;

      //-----------------TODO: Make this part dynamic--------------------------------------------
      fp_t_ts nxelem[FacePolyOrderDim2] = {0};
      fp_t_ts nxnxelem[FacePolyOrderDim2] = {0};
      //-----------------------------------------------------------------------------------------
      fp_t_ts nxE_inc_[3];
      fp_t_ts nxnxH_inc_[3];

      fp_t_ts  Hvec[3];
      fp_t_ts  Evec[3];

      geometry2(&nd_coords_face[3 * 3 * face_id], &nd_coords_tet[4 * 3 * tet_id], normvtr, nvtr, &area);
      int Q = GAUSS_POINT_NUM[PolyFlag];

      for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++)
      {
        nxentry = 0;
        nxnxentry = 0;
        for (int k = 0; k < Q; k++)
        {
          quadObtention(Q, k, z[0], z[1], z[2], wgt);
          faceBasisW(&nd_coords_face[3 * 3 * face_id], z, j, w);
          getCartesianCoord(&nd_coords_face[3 * 3 * face_id], z, r);

          int qbase = (face_id * Q + k) * 3;
          Hvec[0] = Htan_qp_d[qbase + 0];
          Hvec[1] = Htan_qp_d[qbase + 1];
          Hvec[2] = Htan_qp_d[qbase + 2];

          Evec[0] = Etan_qp_d[qbase + 0];
          Evec[1] = Etan_qp_d[qbase + 1];
          Evec[2] = Etan_qp_d[qbase + 2];
                       
          H_updateinc_PORT(
            t, exci_prop.to, exci_prop.tau, exci_prop.freq_m,
            exci_prop.TimeDistributionFlag,
            exci_prop.PortDirection, 
            Evec, Hvec, exci_prop.CHIRP_BW_MHZ,
            nxE_inc_, nxnxH_inc_);          
            
          nxentry += area * wgt * dotP3131(w, nxE_inc_);
          nxnxentry += area * wgt * dotP3131(w, nxnxH_inc_);
          
        }
        nxelem[j] += nxentry;
        nxnxelem[j] += nxnxentry;
      }
      for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++)
      {
        // printf("%d\n", ExcitationFacesNum[face_id]);
        nxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxelem[j];
        nxnxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxnxelem[j];
      }
    }

    for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++){
      nxe_inc[i] *= -dt_muo;
      nxe_inc[i] += (-Z * dt_muo) * nxnxh_inc[i];
      nxe_inc[i] *= map[tet_id * TetPolyOrderDim[PolyFlag] + i];
    }
    fp_t_ts sum;
    for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++)
    {
      sum = 0.0;
      for (int j = 0; j < TetPolyOrderDim[PolyFlag]; j++){
        sum += InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i] * nxe_inc[j] * map[tet_id * TetPolyOrderDim[PolyFlag] + i];
      }
      Field[tet_id * TetPolyOrderDim[PolyFlag] + i] += sum; 
    }
  }
}


__device__ void H_updateinc_PORT(
    fp_t_ts t, fp_t_ts to, fp_t_ts tau, fp_t_ts freq_m,
    int TimeDistributionFlag,
    fp_t_ts* PortDirection,   // unit propagation dir
    fp_t_ts* Et_pre,          // precomputed tangential E at quad point
    fp_t_ts* Ht_pre,          // precomputed tangential H at quad point
    fp_t_ts CHIRP_BW_MHZ,     // bandwidth for chirp
    fp_t_ts* nxEinc,          // out: n × E_inc
    fp_t_ts* nxnxHinc)        // out: n × (n × H_inc)
{
  // Time envelopes
  fp_t_ts IncidExcit = 1.0f;
  fp_t_ts Emag = 1.0f;

  TimeModulationInc(TimeDistributionFlag, t, to, freq_m, 
                    Emag, tau, CHIRP_BW_MHZ, IncidExcit);

  // === Your required sequence ===
  fp_t_ts work[3];
  fp_t_ts field[3];

  field[0] = Et_pre[0];
  field[1] = Et_pre[1];
  field[2] = Et_pre[2];

  // EField
  crossP3(PortDirection, field, work);
  scale3(work, IncidExcit, nxEinc);


  field[0] = Ht_pre[0];
  field[1] = Ht_pre[1];
  field[2] = Ht_pre[2];

  // HField
  crossP3(PortDirection, field, work);
  crossP3(PortDirection, work, nxnxHinc);
  scale3(nxnxHinc, IncidExcit);

  for (int i = 0; i < 3; ++i) 
  {
    if (fabs(nxEinc[i]) < (fp_t_ts)1e-12) nxEinc[i] = 0.0;
    if (fabs(nxnxHinc[i]) < (fp_t_ts)1e-12) nxnxHinc[i] = 0.0;
  }

}


__device__ void TimeModulationInc(int TimeDistributionFlag, fp_t_ts t, fp_t_ts to, fp_t_ts freq_m, 
                                 fp_t_ts Emag, fp_t_ts tau, fp_t_ts CHIRP_BW_MHZ, fp_t_ts &IncidExcit)
{
  fp_t_ts Exponent;
  fp_t_ts omega = 2.0 * Pi * freq_m * MEGA;
  fp_t_ts Neuman;
  fp_t_ts SinModul;
  fp_t_ts kr;

  fp_t_ts work[3];

  switch (TimeDistributionFlag){
    // Time Harmonic
    case 0:
      IncidExcit = Emag * sin(- omega * (t));
      break;

    // Gauss Pulse
    case 1:
      Exponent = t - to;
      SinModul = cos(omega * (t - to));
      IncidExcit = Emag * SinModul * exp(-(Exponent * Exponent) / (tau * tau));
      break;

    // Neuman Pulse
    case 2:
      Exponent = t - to;
      Neuman = (2.0 * Exponent) / (tau * tau);
      IncidExcit = (Emag * Neuman) * exp(-(Exponent * Exponent) / (tau * tau));
      break;

    // 3) Linear FM chirp starting at 'to', duration 'tau', bandwidth CHIRP_BW_MHZ,
    //    Hann-tapered on [0, tau] to reduce spectral sidelobes.
    case 3:
    {
        // Time since chirp start (seconds)
        fp_t_ts tt = t - to;

        // Gate to [0, tau]
        if (tt >= (fp_t_ts)0.0 && tt <= tau)
        {
            // Sweep parameters
            const fp_t_ts B   = (fp_t_ts)(CHIRP_BW_MHZ * MEGA);          // Hz
            const fp_t_ts fc  = (fp_t_ts)(freq_m * MEGA);                // Hz
            const fp_t_ts f0  = fc - (fp_t_ts)0.5 * B;                   // start freq [Hz]
            const fp_t_ts k   = (tau != (fp_t_ts)0 ? B / tau : (fp_t_ts)0); // Hz/s

            // Phase: 2π( f0*tt + 0.5*k*tt^2 )
            const fp_t_ts phase = (fp_t_ts)(2.0 * Pi) * (f0 * tt + (fp_t_ts)0.5 * k * tt * tt);

            // Hann window on [0, tau]: w = 0.5*(1 - cos(2π*tt/tau))
            const fp_t_ts w = (fp_t_ts)0.5 * ((fp_t_ts)1.0 - cos((fp_t_ts)(2.0 * Pi) * (tt / tau)));

            const fp_t_ts s = sin(phase) * w;

            IncidExcit = Emag * s;
        }
        else
        {
            // Outside the chirp window
            IncidExcit = (fp_t_ts)0.0;
        }
    }
    break;

    // 4) Linear FM chirp starting at 'to', duration 'tau', bandwidth CHIRP_BW_MHZ,
    case 4:
    {
        // Time since chirp start (seconds)
        fp_t_ts tt = t - to;

        // Gate to [0, tau]
        if (tt >= (fp_t_ts)0.0 && tt <= tau)
        {
            // Sweep parameters
            const fp_t_ts B   = (fp_t_ts)(CHIRP_BW_MHZ * MEGA);          // Hz
            const fp_t_ts fc  = (fp_t_ts)(freq_m * MEGA);                // Hz
            const fp_t_ts f0  = fc - (fp_t_ts)0.5 * B;                   // start freq [Hz]
            const fp_t_ts k   = (tau != (fp_t_ts)0 ? B / tau : (fp_t_ts)0); // Hz/s

            // Phase: 2π( f0*tt + 0.5*k*tt^2 )
            const fp_t_ts phase = (fp_t_ts)(2.0 * Pi) * (f0 * tt + (fp_t_ts)0.5 * k * tt * tt);
            const fp_t_ts s = sin(phase);

            IncidExcit = Emag * s;
        }
        else
        {
            // Outside the chirp window
            IncidExcit = (fp_t_ts)0.0;
        }
    }
    break;

    default:
      break;
  }
}


// ============================================================
// Device helper: build n×H_inc and n×(n×E_inc) from precomputed Et/Ht
// ============================================================
__device__ void E_updateinc_PORT(
    fp_t_ts t, fp_t_ts to, fp_t_ts tau, fp_t_ts freq_m,
    int TimeDistributionFlag,
    fp_t_ts* PortDirection,   // unit propagation dir
    fp_t_ts* Et_pre,          // precomputed tangential E at quad point
    fp_t_ts* Ht_pre,          // precomputed tangential H at quad point
    fp_t_ts CHIRP_BW_MHZ,     // bandwidth for chirp
    fp_t_ts* nxHinc,          // out: n × H_inc
    fp_t_ts* nxnxEinc)        // out: n × (n × E_inc)
{
  // Time envelopes
  fp_t_ts IncidExcit = 1.0f;
  fp_t_ts Emag = 1.0f;


  TimeModulationInc(TimeDistributionFlag, t, to, freq_m, 
                    Emag, tau, CHIRP_BW_MHZ, IncidExcit);

                
  // Build the required quantities
  fp_t_ts work[3];


  // n × H_inc
  crossP3(PortDirection, Ht_pre, work);
  scale3(work, IncidExcit, nxHinc);

  // n × (n × E_inc)
  crossP3(PortDirection, Et_pre, work);
  crossP3(PortDirection, work, nxnxEinc);
  scale3(nxnxEinc, IncidExcit);

  for (int i = 0; i < 3; ++i) 
  {
    if (fabs(nxHinc[i]) < (fp_t_ts)1e-12) nxHinc[i] = 0.0;
    if (fabs(nxnxEinc[i]) < (fp_t_ts)1e-12) nxnxEinc[i] = 0.0;
  }

}

// ============================================================
// Kernel: Port-only version of E-field excitation
// Uses precomputed Etan_qp_d / Htan_qp_d
// ============================================================


// Tiny helper to print a 3-vector
__device__ inline void dprint3(const char* tag, fp_t_ts v0, fp_t_ts v1, fp_t_ts v2)
{
  printf("%s(%.6e, %.6e, %.6e)\n", tag, (double)v0, (double)v1, (double)v2);
}


__global__ void addExcitationE_port( int* ExcitationFacesCnt,
                                      int* ExcitationFacesOffset,
                                      int* ExcitationFacesNum,
                                      int excitationCNT,
                                      int8_t* map,
                                      ExcitationProp* exci_prop_vec,
                                      int* PortFacePidx_d,
                                      int PolyFlag,
                                      fp_t_ts dt_epsilon, fp_t_ts t,
                                      fp_t_ts* nd_coords_tet,
                                      fp_t_ts* nd_coords_face,
                                      fp_t_ts* Z_face,
                                      fp_t_ts* InvMat,
                                      fp_t_ts* Field,
                                      fp_t_ts* Etan_qp_d,
                                      fp_t_ts* Htan_qp_d)
{

  int tet_id = blockIdx.x * blockDim.x + threadIdx.x;

  if (tet_id < excitationCNT)
  {
    //-----TODO: Make this part dynamic----------------------------
    const int TetPolyOrderDim2 = TetPolyOrderDim[3];
    const int FacePolyOrderDim2 = FacePolyOrderDim[3];

    fp_t_ts nxh_inc[TetPolyOrderDim2] = {0};
    fp_t_ts nxnxe_inc[TetPolyOrderDim2] = {0};
    //-------------------------------------------------------------
    fp_t_ts Y = 0.0;
    for (int i = 0; i < ExcitationFacesCnt[tet_id]; i++)
    {
      int face_id = ExcitationFacesOffset[tet_id] + i;
      Y = 1 / Z_face[face_id];

      int port_id = PortFacePidx_d[face_id];
      ExcitationProp exci_prop = exci_prop_vec[port_id];


      fp_t_ts nvtr[2][3];
      fp_t_ts normvtr[3] = {0};
      fp_t_ts area = 0;
      fp_t_ts nxentry;
      fp_t_ts nxnxentry;
      fp_t_ts z[3], w[3], r[3];
      fp_t_ts wgt;


      //-----------------TODO: Make this part dynamic--------------------------------------------
      fp_t_ts nxelem[FacePolyOrderDim2] = {0};
      fp_t_ts nxnxelem[FacePolyOrderDim2] = {0};
      //-----------------------------------------------------------------------------------------
      fp_t_ts nxH_inc_[3];
      fp_t_ts nxnxE_inc_[3];

      fp_t_ts  Hvec[3];
      fp_t_ts  Evec[3];


      geometry2(&nd_coords_face[3 * 3 * face_id], &nd_coords_tet[4 * 3 * tet_id], normvtr, nvtr, &area);
      int Q = GAUSS_POINT_NUM[PolyFlag];

      for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++)
      {
        nxentry = 0;
        nxnxentry = 0;

        for (int k = 0; k < Q; k++)
        {
          quadObtention(Q, k, z[0], z[1], z[2], wgt);
          faceBasisW(&nd_coords_face[3 * 3 * face_id], z, j, w);
          getCartesianCoord(&nd_coords_face[3 * 3 * face_id], z, r);

          int qbase = (face_id * Q + k) * 3;
          Hvec[0] = Htan_qp_d[qbase + 0];
          Hvec[1] = Htan_qp_d[qbase + 1];
          Hvec[2] = Htan_qp_d[qbase + 2];
          Evec[0] = Etan_qp_d[qbase + 0];
          Evec[1] = Etan_qp_d[qbase + 1];
          Evec[2] = Etan_qp_d[qbase + 2];

          E_updateinc_PORT(
            t, exci_prop.to, exci_prop.tau, exci_prop.freq_m,
            exci_prop.TimeDistributionFlag, exci_prop.PortDirection, 
            Evec, Hvec, exci_prop.CHIRP_BW_MHZ,
            nxH_inc_, nxnxE_inc_);          


          nxentry += area * wgt * dotP3131(w, nxH_inc_);
          nxnxentry += area * wgt * dotP3131(w, nxnxE_inc_);
    
        }
        
        nxelem[j] += nxentry;
        nxnxelem[j] += nxnxentry;

      }
      for (int j = 0; j < FacePolyOrderDim[PolyFlag]; j++)
      {
        nxh_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxelem[j];
        nxnxe_inc[FAC2TET[ExcitationFacesNum[face_id]][j]] += nxnxelem[j];
      }
      
    }

    for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++)
    {
      nxh_inc[i] *= dt_epsilon;
      nxh_inc[i] += (-Y * dt_epsilon) * nxnxe_inc[i];
      nxh_inc[i] *= map[tet_id * TetPolyOrderDim[PolyFlag] + i];
    }


    fp_t_ts sum;
    for (int i = 0; i < TetPolyOrderDim[PolyFlag]; i++)
    {
      sum = 0.0;
      for (int j = 0; j < TetPolyOrderDim[PolyFlag]; j++)
      {
        sum += InvMat[tet_id * (TetPolyOrderDim[PolyFlag] * TetPolyOrderDim[PolyFlag]) + j * TetPolyOrderDim[PolyFlag] + i] * nxh_inc[j] * map[tet_id * TetPolyOrderDim[PolyFlag] + i];
      }
      Field[tet_id * TetPolyOrderDim[PolyFlag] + i] += sum;  
    }
  }
}


__global__ void fillOnes(fp_t_ts* arr, size_t N) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < N) arr[i] = (fp_t_ts)1.0;
}


#endif
#endif // DGTD_KERNELS_CU