Commit 92f2ab2d authored by Johannes Spazier's avatar Johannes Spazier

Time series for POIs are now stored on GPU side and only copied back at the...

Time series for POIs are now stored on GPU side and only copied back at the end of the simulation. This saves a lot of runtime especially for a large number of POIs.
parent 62e2ab25
......@@ -113,12 +113,6 @@ int main( int argc, char **argv )
for( Par.time=0,loop=1,lastProgress=Par.outProgress,lastPropagation=Par.outPropagation,lastDump=0;
Par.time<=Par.timeMax; loop++,Par.time+=Par.dt,lastProgress+=Par.dt,lastPropagation+=Par.dt ) {
/* FIXME: check if Par.poiDt can be used for those purposes */
if( Par.filePOIs && Par.poiDt && ((Par.time/Par.poiDt)*Par.poiDt == Par.time) ) {
Node.copyPOIs();
ewSavePOIs();
}
Node.run();
elapsed = ((int)clock())/CLOCKS_PER_SEC;
......@@ -134,6 +128,7 @@ int main( int argc, char **argv )
if( Par.outDump && (elapsed-lastDump) >= Par.outDump || Par.time >= Par.timeMax ) {
Node.copyIntermediate();
Node.copyFromGPU();
Node.copyPOIs();
ewDumpPOIs();
ewDump2D();
lastDump = elapsed;
......
......@@ -218,3 +218,11 @@ __global__ void runGridExtendKernel( KernelData data ) {
}
}
/* One thread per POI: append the current h-field sample at each point of
 * interest into the device-side time-series buffer. Sample `it` occupies the
 * contiguous row pois[it*num_pois .. it*num_pois + num_pois - 1], so later
 * host-side readback can slice by time step. */
__global__ void storePOIs( KernelData data ) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
/* Grid may be padded past num_pois; excess threads bail out early. */
if( tid >= data.num_pois )
return;
data.pois[data.it * data.num_pois + tid] = data.h[data.idxs[tid]];
}
......@@ -38,5 +38,6 @@ __global__ void runWaveBoundaryKernel( KernelData data );
__global__ void runFluxUpdateKernel( KernelData data );
__global__ void runFluxBoundaryKernel( KernelData data );
__global__ void runGridExtendKernel( KernelData data );
__global__ void storePOIs( KernelData data );
#endif /* EW_KERNELS_H */
......@@ -33,6 +33,14 @@
#include "ewGpuNode.cuh"
#include "ewCudaKernels.cuh"
/* We need some global variables from other files to use them in combination with this class. */
extern int NPOIs;
extern int NtPOI;
extern long *idxPOI;
extern int *flagRunupPOI;
extern int *timePOI;
extern float **sshPOI;
CGpuNode::CGpuNode() {
pitch = 0;
......@@ -248,6 +256,26 @@ int CGpuNode::copyToGPU() {
data.params.jMin = dp.jMin;
data.params.jMax = dp.jMax;
/* POI handling. */
int *relIdxs = (int*) malloc( NPOIs * sizeof(int) );
data.host_idxs = (long*) malloc( NPOIs * sizeof(long) );
data.num_pois = 0;
for(int n = 0; n < NPOIs; n++) {
int i = idxPOI[n] / data.params.nJ + 1;
int j = idxPOI[n] % data.params.nJ + 1;
if( vgpu.hasLine( i ) ) {
relIdxs[data.num_pois] = vgpu.data.idx( vgpu.getRel(i), j );
data.host_idxs[data.num_pois] = n;
data.num_pois++;
}
}
CUDA_CALL( cudaMalloc( &(data.pois), NtPOI * data.num_pois * sizeof(float) ) );
CUDA_CALL( cudaMalloc( &(data.idxs), data.num_pois * sizeof(int) ) );
CUDA_CALL( cudaMemcpy( data.idxs, relIdxs, data.num_pois * sizeof(int), cudaMemcpyHostToDevice ) );
free(relIdxs);
/*****************/
}
return 0;
......@@ -304,36 +332,38 @@ int CGpuNode::copyIntermediate() {
/* Copy the accumulated POI time series from every virtual GPU back to the
 * host-side arrays (sshPOI/timePOI), applying the runup amplification factor
 * where flagged.
 *
 * NOTE(review): this span in the scraped diff was a corrupted interleave of
 * the pre- and post-commit versions of the function; the body below is the
 * reconstructed post-commit implementation — verify against the repository.
 * Returns 0 on success (CUDA_CALL is assumed to handle/report failures). */
int CGpuNode::copyPOIs() {

	/* Nothing to do if the data was already fetched from the device. */
	if( copied )
		return 0;

	for( int id = 0; id < num_virtual_gpus; id++ ) {
		VGpu& vgpu = vgpus[id];
		KernelData& data = vgpu.data;

		/* Host staging buffer: NtPOI rows of data.num_pois floats; only the
		 * first data.it rows (time steps recorded so far) contain samples. */
		float *pois = (float*) malloc( data.num_pois * NtPOI * sizeof(float) );
		double ampFactor;

		CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
		CUDA_CALL( cudaMemcpy( pois, data.pois, data.num_pois * NtPOI * sizeof(float), cudaMemcpyDeviceToHost ) );

		for( int n = 0; n < data.num_pois; n++ ) {
			/* Map the vGPU-local POI slot back to the global POI index. */
			int host_n = data.host_idxs[n];
			/* Amplification is per-POI, loop-invariant over time steps:
			 * hoisted out of the inner loop. d^0.25 only for runup POIs. */
			if( flagRunupPOI[host_n] )
				ampFactor = pow( d[idxPOI[host_n]], 0.25 );
			else
				ampFactor = 1.;
			for( int i = 0; i < data.it; i++ )
				sshPOI[host_n][i] = ampFactor * pois[i*data.num_pois + n];
		}

		/* Reconstruct the sample times from the fixed POI output interval. */
		for( int i = 0; i < data.it; i++ )
			timePOI[i] = i * Par.poiDt;

		free(pois);
	}

	return 0;
}
int CGpuNode::freeMem() {
for( int i = 0; i < num_virtual_gpus; i++ ) {
......@@ -362,6 +392,10 @@ int CGpuNode::freeMem() {
CUDA_CALL( cudaFree( data.cB4 ) );
CUDA_CALL( cudaFree( data.extend ) );
CUDA_CALL( cudaFree( data.pois ) );
CUDA_CALL( cudaFree( data.idxs ) );
free( data.host_idxs );
}
cudaFreeHost( extend );
......@@ -394,6 +428,18 @@ int CGpuNode::run() {
dp.mTime = Par.time;
if( Par.filePOIs && Par.poiDt && ((Par.time/Par.poiDt)*Par.poiDt == Par.time) ) {
for( int i = 0; i < num_virtual_gpus; i++ ) {
VGpu& vgpu = vgpus[i];
KernelData& data = vgpu.data;
int num_blocks = data.num_pois / nThreads + 1;
cudaSetDevice( vgpu.dev->id );
storePOIs<<<num_blocks, nThreads>>>( data );
data.it++;
}
}
for( int i = 0; i < num_virtual_gpus; i++ ) {
VGpu& vgpu = vgpus[i];
......
......@@ -82,6 +82,13 @@ public:
float *cB3;
float *cB4;
/* POIs */
float *pois;
int *idxs;
long *host_idxs;
int num_pois;
int it;
Params params;
int4 *extend;
......
......@@ -38,14 +38,14 @@
//#define SSHMAX_TO_SINGLE_FILE 0
static int MaxPOIs;
int MaxPOIs;
int NPOIs;
static char **idPOI;
long *idxPOI;
static int *flagRunupPOI;
static int NtPOI;
static int *timePOI;
static float **sshPOI;
int *flagRunupPOI;
int NtPOI;
int *timePOI;
float **sshPOI;
int ewLoadPOIs()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment