You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
78 lines
2.1 KiB
78 lines
2.1 KiB
#pragma once
|
|
#include <mutex>
|
|
#include <cassert>
|
|
#include <atomic>
|
|
#include <algorithm>
|
|
#include <chrono>
|
|
#include <cmath>
|
|
#include <thread>
|
|
#include <vector>
|
|
#include <string>
|
|
#include <iostream>
|
|
#include <numeric>
|
|
#include <tuple>
|
|
#include "cuda.h"
|
|
|
|
|
|
/* Forwards the result of a CUDA call to gpuAssert together with the file and
 * line of the call site. No abort argument is passed, so gpuAssert's own
 * default applies.
 *
 * The expansion is wrapped in do { } while (0) so that it behaves as a single
 * statement: `if (c) cuda_err_chk(x); else ...` now compiles, whereas a bare
 * `{ ... };` expansion would orphan the else. Always invoke it with a trailing
 * semicolon, e.g. cuda_err_chk(cudaMalloc(&ptr, bytes)); */
#define cuda_err_chk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
|
|
/* Reports a failed CUDA runtime call on stderr.
 *
 *   code  - status value returned by a CUDA runtime call
 *   file  - source file name printed in the message
 *   line  - source line number printed in the message
 *   abort - when true the process exits with status 1 after reporting;
 *           defaults to false, in which case the error is only logged
 *
 * Does nothing when code is cudaSuccess. */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=false) {
    // Fast path: nothing to report.
    if (code == cudaSuccess) {
        return;
    }

    const char *description = cudaGetErrorString(code);
    fprintf(stderr,"GPUassert: %s %s %d\n", description, file, line);

    if (abort) {
        exit(1);
    }
}
|
|
|
|
|
|
/* Prints the current line, the current file, the value of errno and its
 * strerror() text to stderr, then terminates the process with exit status 1.
 * Wrapped in do { } while(0) so it can be used as a single statement. */
#define PRINT_ERROR \
    do { \
        fprintf(stderr, \
                "Error at line %d, file %s (%d) [%s]\n", \
                __LINE__, \
                __FILE__, \
                errno, \
                strerror(errno)); \
        exit(1); \
    } while(0)
|
|
|
|
|
|
/* Returns the current time point of std::chrono::high_resolution_clock. */
static std::chrono::time_point<std::chrono::high_resolution_clock> now() {
    using hr_clock = std::chrono::high_resolution_clock;

    const std::chrono::time_point<hr_clock> stamp = hr_clock::now();
    return stamp;
}
|
|
|
|
/* Device function that reads the PTX special register %nsmid, i.e. how many SM
 * identifiers exist for the device/arch. This can be larger than the number of
 * SMs that are actually usable, so treat it as an upper bound. */
__device__ __forceinline__ unsigned int getnsmid()
{
    unsigned int sm_id_limit;
    asm("mov.u32 %0, %%nsmid;" : "=r"(sm_id_limit));
    return sm_id_limit;
}
|
|
|
|
/* Device function that reads the PTX special register %lanemask_lt. Per the
 * PTX ISA this is a 32-bit mask with a bit set for every lane whose id is
 * lower than the calling thread's lane id. */
__device__ __forceinline__ unsigned int my_lanemask32_lt() {
    unsigned int lower_lanes;
    asm volatile("mov.u32 %0, %%lanemask_lt;" : "=r"(lower_lanes));
    return lower_lanes;
}
|
|
|
|
/* Device function that returns the id of the SM the calling thread is running
 * on, read from the PTX special register %smid at the moment of the call. */
__device__ __forceinline__ unsigned int getsmid(){
    unsigned int r;
    // volatile: the PTX ISA documents %smid as a volatile value (it can change
    // during execution, e.g. after preemption), so the compiler must not
    // merge, hoist or drop repeated reads of it.
    asm volatile("mov.u32 %0, %%smid;" : "=r"(r));
    return r;
}
|
|
|
|
/* Device function that returns the hardware warp id of the calling thread,
 * read from the PTX special register %warpid at the moment of the call. */
__device__ __forceinline__ unsigned int getwarpid(){
    unsigned int r;
    // volatile: the PTX ISA documents %warpid as a volatile value (it can
    // change during execution, e.g. when threads are rescheduled after
    // preemption), so the compiler must not merge, hoist or drop repeated
    // reads of it.
    asm volatile("mov.u32 %0, %%warpid;" : "=r"(r));
    return r;
}
|
|
|
|
/* Device function that returns the value of the PTX constant WARP_SZ. */
__device__ __forceinline__ unsigned int getwarpsz() {
    // Local is deliberately not called warpSize, which is a CUDA built-in.
    unsigned int lanes_per_warp;
    asm volatile("mov.u32 %0, WARP_SZ;" : "=r"(lanes_per_warp));
    return lanes_per_warp;
}
|
|
|
|
/* Device function that returns the lane id of the calling thread within its
 * warp, read from the PTX special register %laneid. */
__device__ __forceinline__ unsigned int getlaneid()
{
    unsigned int lane;
    asm("mov.u32 %0, %%laneid;" : "=r"(lane));
    return lane;
}
|
|
|