#include "ewGpuNode.cuh" #include "ewCudaKernels.cuh" CGpuNode::CGpuNode() { pitch = 0; copied = true; for( int i = 0; i < 5; i++ ) { cudaEventCreate( &(evtStart[i]) ); cudaEventCreate( &(evtEnd[i]) ); dur[i] = 0.0; } } int CGpuNode::mallocMem() { CArrayNode::mallocMem(); Params& dp = data.params; /* fill in some fields here */ dp.nI = NLon; dp.nJ = NLat; dp.sshArrivalThreshold = Par.sshArrivalThreshold; dp.sshClipThreshold = Par.sshClipThreshold; dp.sshZeroThreshold = Par.sshZeroThreshold; dp.lpad = 0; size_t nJ_aligned = dp.nJ + dp.lpad; /* 2-dim */ CUDA_CALL( cudaMallocPitch( &(data.d), &pitch, nJ_aligned * sizeof(float), dp.nI ) ); CUDA_CALL( cudaMallocPitch( &(data.h), &pitch, nJ_aligned * sizeof(float), dp.nI ) ); CUDA_CALL( cudaMallocPitch( &(data.hMax), &pitch, nJ_aligned * sizeof(float), dp.nI ) ); CUDA_CALL( cudaMallocPitch( &(data.fM), &pitch, nJ_aligned * sizeof(float), dp.nI ) ); CUDA_CALL( cudaMallocPitch( &(data.fN), &pitch, nJ_aligned * sizeof(float), dp.nI ) ); CUDA_CALL( cudaMallocPitch( &(data.cR1), &pitch, nJ_aligned * sizeof(float), dp.nI ) ); CUDA_CALL( cudaMallocPitch( &(data.cR2), &pitch, nJ_aligned * sizeof(float), dp.nI ) ); CUDA_CALL( cudaMallocPitch( &(data.cR4), &pitch, nJ_aligned * sizeof(float), dp.nI ) ); CUDA_CALL( cudaMallocPitch( &(data.tArr), &pitch, nJ_aligned * sizeof(float), dp.nI ) ); /* TODO: cR3, cR5 for coriolis */ /* 1-dim */ CUDA_CALL( cudaMalloc( &(data.cR6), dp.nJ * sizeof(float) ) ); CUDA_CALL( cudaMalloc( &(data.cB1), dp.nI * sizeof(float) ) ); CUDA_CALL( cudaMalloc( &(data.cB2), dp.nJ * sizeof(float) ) ); CUDA_CALL( cudaMalloc( &(data.cB3), dp.nI * sizeof(float) ) ); CUDA_CALL( cudaMalloc( &(data.cB4), dp.nJ * sizeof(float) ) ); /* TODO: make sure that pitch is a multiple of 4 and the same for each cudaMallocPitch() call */ dp.pI = pitch / sizeof(float); return 0; } int CGpuNode::copyToGPU() { Params& dp = data.params; /* fill in further fields here */ dp.iMin = Imin; dp.iMax = Imax; dp.jMin = Jmin; dp.jMax = Jmax; /* FIXME: should not work correctly */ /* add offset to data.d to guarantee alignment: data.d + LPAD */ /* 2-dim */ CUDA_CALL( cudaMemcpy2D( data.d + dp.lpad, pitch, d, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) ); CUDA_CALL( cudaMemcpy2D( data.h + dp.lpad, pitch, h, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) ); CUDA_CALL( cudaMemcpy2D( data.hMax + dp.lpad, pitch, hMax, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) ); CUDA_CALL( cudaMemcpy2D( data.fM + dp.lpad, pitch, fM, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) ); CUDA_CALL( cudaMemcpy2D( data.fN + dp.lpad, pitch, fN, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) ); CUDA_CALL( cudaMemcpy2D( data.cR1 + dp.lpad, pitch, cR1, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) ); CUDA_CALL( cudaMemcpy2D( data.cR2 + dp.lpad, pitch, cR2, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) ); CUDA_CALL( cudaMemcpy2D( data.cR4 + dp.lpad, pitch, cR4, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) ); CUDA_CALL( cudaMemcpy2D( data.tArr + dp.lpad, pitch, tArr, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) ); /* FIXME: move global variables into data structure */ /* 1-dim */ CUDA_CALL( cudaMemcpy( data.cR6, R6, dp.nJ * sizeof(float), cudaMemcpyHostToDevice ) ); CUDA_CALL( cudaMemcpy( data.cB1, C1, dp.nI * 
int CGpuNode::copyFromGPU() {

    Params& dp = data.params;

    CUDA_CALL( cudaMemcpy2D( hMax, dp.nJ * sizeof(float), data.hMax + dp.lpad, pitch, dp.nJ * sizeof(float), dp.nI, cudaMemcpyDeviceToHost ) );
    CUDA_CALL( cudaMemcpy2D( tArr, dp.nJ * sizeof(float), data.tArr + dp.lpad, pitch, dp.nJ * sizeof(float), dp.nI, cudaMemcpyDeviceToHost ) );

    return 0;
}

int CGpuNode::copyIntermediate() {

    /* ignore copy requests if data already present on CPU side */
    if( copied )
        return 0;

    Params& dp = data.params;

    CUDA_CALL( cudaMemcpy2D( h, dp.nJ * sizeof(float), data.h + dp.lpad, pitch, dp.nJ * sizeof(float), dp.nI, cudaMemcpyDeviceToHost ) );

    /* copy finished */
    copied = true;

    return 0;
}

int CGpuNode::freeMem() {

    /* 2-dim */
    CUDA_CALL( cudaFree( data.d ) );
    CUDA_CALL( cudaFree( data.h ) );
    CUDA_CALL( cudaFree( data.hMax ) );
    CUDA_CALL( cudaFree( data.fM ) );
    CUDA_CALL( cudaFree( data.fN ) );
    CUDA_CALL( cudaFree( data.cR1 ) );
    CUDA_CALL( cudaFree( data.cR2 ) );
    CUDA_CALL( cudaFree( data.cR4 ) );
    CUDA_CALL( cudaFree( data.tArr ) );

    /* 1-dim */
    CUDA_CALL( cudaFree( data.cR6 ) );
    CUDA_CALL( cudaFree( data.cB1 ) );
    CUDA_CALL( cudaFree( data.cB2 ) );
    CUDA_CALL( cudaFree( data.cB3 ) );
    CUDA_CALL( cudaFree( data.cB4 ) );

    float total_dur = 0.f;
    for( int j = 0; j < 5; j++ ) {
        printf_v("Duration %u: %.3f\n", j, dur[j]);
        total_dur += dur[j];
    }
    printf_v("Duration total: %.3f\n", total_dur);

    CArrayNode::freeMem();

    return 0;
}

int CGpuNode::run() {

    Params& dp = data.params;

    int nThreads = 256;
    int xThreads = 32;
    int yThreads = nThreads / xThreads;
    int xBlocks = ceil( (float)dp.nJ / (float)xThreads );
    int yBlocks = ceil( (float)dp.nI / (float)yThreads );

    dim3 threads( xThreads, yThreads );
    dim3 blocks( xBlocks, yBlocks );

    int nBlocks = ceil( (float)max(dp.nI,dp.nJ) / (float)nThreads );

    dp.mTime = Par.time;

    CUDA_CALL( cudaEventRecord( evtStart[0], 0 ) );
    runWaveUpdateKernel<<<blocks,threads>>>( data );
    CUDA_CALL( cudaEventRecord( evtEnd[0], 0 ) );

    CUDA_CALL( cudaEventRecord( evtStart[1], 0 ) );
    runWaveBoundaryKernel<<<nBlocks,nThreads>>>( data );
    CUDA_CALL( cudaEventRecord( evtEnd[1], 0 ) );

    CUDA_CALL( cudaEventRecord( evtStart[2], 0 ) );
    runFluxUpdateKernel<<<blocks,threads>>>( data );
    CUDA_CALL( cudaEventRecord( evtEnd[2], 0 ) );

    CUDA_CALL( cudaEventRecord( evtStart[3], 0 ) );
    runFluxBoundaryKernel<<<nBlocks,nThreads>>>( data );
    CUDA_CALL( cudaEventRecord( evtEnd[3], 0 ) );

    CUDA_CALL( cudaEventRecord( evtStart[4], 0 ) );
    runGridExtendKernel1<<<nBlocks,nThreads>>>( data );
    runGridExtendKernel2<<<1,1>>>( data );
    CUDA_CALL( cudaEventRecord( evtEnd[4], 0 ) );

    void *g_mm;
    CUDA_CALL( cudaGetSymbolAddress( &g_mm, g_MinMax ) );

    int4 MinMax;
    CUDA_CALL( cudaMemcpy( &MinMax, g_mm, sizeof(int4), cudaMemcpyDeviceToHost ) );
    cudaDeviceSynchronize();

    Imin = dp.iMin = MinMax.x;
    Imax = dp.iMax = MinMax.y;
    Jmin = dp.jMin = MinMax.z;
    Jmax = dp.jMax = MinMax.w;

    float _dur;
    for( int j = 0; j < 5; j++ ) {
        cudaEventElapsedTime( &_dur, evtStart[j], evtEnd[j] );
        dur[j] += _dur;
    }

    /* data has changed now -> copy becomes necessary */
    copied = false;

    return 0;
}
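/*
 * Hypothetical driver sketch, for illustration only. It assumes the surrounding easyWave
 * code has already read the grid (NLon, NLat), the model parameters (Par) and the host
 * arrays before CGpuNode is used, and that Par.time carries the current model time.
 * The names timeToFinish() and writeOutput() are placeholders, not real easyWave API.
 *
 *   CGpuNode node;
 *
 *   node.mallocMem();           // host buffers via CArrayNode, pitched device buffers here
 *   node.copyToGPU();           // upload bathymetry, coefficients and initial wave state
 *
 *   while( !timeToFinish() ) {
 *       node.run();              // one step: wave, boundary, flux and grid-extend kernels
 *       node.copyIntermediate(); // lazy download of h, skipped while `copied` is still true
 *       writeOutput();
 *   }
 *
 *   node.copyFromGPU();         // final hMax and arrival times
 *   node.freeMem();             // release device buffers, print per-kernel timings
 */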