From 9e4d7de82f873ae2f2edb35dccc2d7de0767fc12 Mon Sep 17 00:00:00 2001 From: Johannes Spazier Date: Sat, 15 Feb 2014 16:32:02 +0000 Subject: [PATCH] - Update of GridExtend-Kernel. - The memory alignment was adjusted. - The progress output is now flushed. --- code/trunk/src/EasyWave.cpp | 4 +- code/trunk/src/ewCudaKernels.cu | 112 +++++++++++-------------------- code/trunk/src/ewCudaKernels.cuh | 3 +- code/trunk/src/ewGpuNode.cu | 38 +++++++---- 4 files changed, 69 insertions(+), 88 deletions(-) diff --git a/code/trunk/src/EasyWave.cpp b/code/trunk/src/EasyWave.cpp index 27cb14c..828fb32 100644 --- a/code/trunk/src/EasyWave.cpp +++ b/code/trunk/src/EasyWave.cpp @@ -129,7 +129,9 @@ int main( int argc, char **argv ) lastProgress = 0; } } - + + fflush(stdout); + if( Par.outPropagation ) { if( lastPropagation >= Par.outPropagation ) { Node.copyIntermediate(); diff --git a/code/trunk/src/ewCudaKernels.cu b/code/trunk/src/ewCudaKernels.cu index bc5dc66..5a87d02 100644 --- a/code/trunk/src/ewCudaKernels.cu +++ b/code/trunk/src/ewCudaKernels.cu @@ -35,7 +35,7 @@ __global__ void runWaveUpdateKernel( KernelData data ) { float absH; /* maybe unnecessary if controlled from outside */ - if( i <= dp.iMax && j <= dp.jMax && data.d[ij] != 0 ) { + if( i <= dp.iMax && j <= dp.jMax && data.d[ij] != 0 ) { float hh = data.h[ij] - data.cR1[ij] * ( data.fM[ij] - data.fM[data.le(ij)] + data.fN[ij] * data.cR6[j] - data.fN[data.dn(ij)]*data.cR6[j-1] ); @@ -49,10 +49,9 @@ __global__ void runWaveUpdateKernel( KernelData data ) { } if( dp.sshArrivalThreshold && data.tArr[ij] < 0 && absH > dp.sshArrivalThreshold ) - data.tArr[ij] = dp.mTime; + data.tArr[ij] = dp.mTime; data.h[ij] = hh; - } } @@ -65,7 +64,7 @@ __global__ void runFluxUpdateKernel( KernelData data ) { int j = blockIdx.x * blockDim.x + threadIdx.x + dp.jMin; int ij = data.idx(i,j); - if( i <= dp.iMax && j <= dp.jMax && data.d[ij] != 0 ) { + if( i <= dp.iMax && j <= dp.jMax && data.d[ij] != 0 ) { float hh = data.h[ij]; @@ -171,106 +170,75 @@ __global__ void runFluxBoundaryKernel( KernelData data ) { } -__device__ int4 c_MinMax = {0,0,0,0}; - -__global__ void runGridExtendKernel1( KernelData data ) { +__global__ void runGridExtendKernel( KernelData data ) { -#if (__CUDA_ARCH__ >= 130) Params& dp = data.params; int id = blockIdx.x * blockDim.x + threadIdx.x + 1; +#if (__CUDA_ARCH__ >= 130) + if( id >= dp.jMin && id <= dp.jMax ) { if( fabsf(data.h[data.idx(dp.iMin+2,id)]) > dp.sshClipThreshold ) - atomicAdd( &c_MinMax.x, 1 ); + atomicAdd( &(data.g_MinMax->x), 1 ); if( fabsf(data.h[data.idx(dp.iMax-2,id)]) > dp.sshClipThreshold ) - atomicAdd( &c_MinMax.y, 1 ); + atomicAdd( &(data.g_MinMax->y), 1 ); } if( id >= dp.iMin && id <= dp.iMax ) { if( fabsf(data.h[data.idx(id,dp.jMin+2)]) > dp.sshClipThreshold ) - atomicAdd( &c_MinMax.z, 1 ); + atomicAdd( &(data.g_MinMax->z), 1 ); if( fabsf(data.h[data.idx(id,dp.jMax-2)]) > dp.sshClipThreshold ) - atomicAdd( &c_MinMax.w, 1 ); + atomicAdd( &(data.g_MinMax->w), 1 ); } -#endif - -} - -__global__ void runGridExtendKernel2( KernelData data ) { - - Params& dp = data.params; - - int id = blockIdx.x * blockDim.x + threadIdx.x; - - if( id == 0 ) { -#if (__CUDA_ARCH__ >= 130) +#else - if( c_MinMax.x ) - dp.iMin = max( dp.iMin-1, 2 ); + if( id == 1 ) { - if( c_MinMax.y ) - dp.iMax = min( dp.iMax+1, dp.nI-1 ); + for( int j = dp.jMin; j <= dp.jMax; j++ ) { + + if( fabsf(data.h[data.idx(dp.iMin+2,j)]) > dp.sshClipThreshold ) { + data.g_MinMax->x = 1; + break; + } - if( c_MinMax.z ) - dp.jMin = max( dp.jMin-1, 2 ); + } - if( c_MinMax.w ) - dp.jMax = min( dp.jMax+1, dp.nJ-1 ); + for( int j = dp.jMin; j <= dp.jMax; j++ ) { - c_MinMax.x = 0; - c_MinMax.y = 0; - c_MinMax.z = 0; - c_MinMax.w = 0; + if( fabsf(data.h[data.idx(dp.iMax-2,j)]) > dp.sshClipThreshold ) { + data.g_MinMax->y = 1; + break; + } -#else + } - for( int j = dp.jMin; j <= dp.jMax; j++ ) { + for( int i = dp.iMin; i <= dp.iMax; i++ ) { + + if( fabsf(data.h[data.idx(i,dp.jMin+2)]) > dp.sshClipThreshold ) { + data.g_MinMax->z = 1; + break; + } - if( fabsf(data.h[data.idx(dp.iMin+2,j)]) > dp.sshClipThreshold ) { - dp.iMin = max( dp.iMin-1, 2 ); - break; - } - } + } - for( int j = dp.jMin; j <= dp.jMax; j++ ) { + for( int i = dp.iMin; i <= dp.iMax; i++ ) { - if( fabsf(data.h[data.idx(dp.iMax-2,j)]) > dp.sshClipThreshold ) { - dp.iMax = min( dp.iMax+1, dp.nI-1 ); - break; - } - } - - for( int i = dp.iMin; i <= dp.iMax; i++ ) { + if( fabsf(data.h[data.idx(i,dp.jMax-2)]) > dp.sshClipThreshold ) { + data.g_MinMax->w = 1; + break; + } - if( fabsf(data.h[data.idx(i,dp.jMin+2)]) > dp.sshClipThreshold ) { - dp.jMin = max( dp.jMin-1, 2 ); - break; - } - } + } - for( int i = dp.iMin; i <= dp.iMax; i++ ) { - - if( fabsf(data.h[data.idx(i,dp.jMax-2)]) > dp.sshClipThreshold ) { - dp.jMax = min( dp.jMax+1, dp.nJ-1 ); - break; - } - } + } #endif - int4 tmp; - tmp.x = dp.iMin; - tmp.y = dp.iMax; - tmp.z = dp.jMin; - tmp.w = dp.jMax; - - *(data.g_MinMax) = tmp; - } - } + diff --git a/code/trunk/src/ewCudaKernels.cuh b/code/trunk/src/ewCudaKernels.cuh index 983dc02..743ea55 100644 --- a/code/trunk/src/ewCudaKernels.cuh +++ b/code/trunk/src/ewCudaKernels.cuh @@ -29,7 +29,6 @@ __global__ void runWaveUpdateKernel( KernelData data ); __global__ void runWaveBoundaryKernel( KernelData data ); __global__ void runFluxUpdateKernel( KernelData data ); __global__ void runFluxBoundaryKernel( KernelData data ); -__global__ void runGridExtendKernel1( KernelData data ); -__global__ void runGridExtendKernel2( KernelData data ); +__global__ void runGridExtendKernel( KernelData data ); #endif /* EW_KERNELS_H */ diff --git a/code/trunk/src/ewGpuNode.cu b/code/trunk/src/ewGpuNode.cu index 074ef50..382dd9a 100644 --- a/code/trunk/src/ewGpuNode.cu +++ b/code/trunk/src/ewGpuNode.cu @@ -50,7 +50,7 @@ int CGpuNode::mallocMem() { dp.sshArrivalThreshold = Par.sshArrivalThreshold; dp.sshClipThreshold = Par.sshClipThreshold; dp.sshZeroThreshold = Par.sshZeroThreshold; - dp.lpad = 0; + dp.lpad = 31; size_t nJ_aligned = dp.nJ + dp.lpad; @@ -85,13 +85,15 @@ int CGpuNode::copyToGPU() { Params& dp = data.params; - /* fill in further fields here */ - dp.iMin = Imin; + /* align left grid boundary to a multiple of 32 with an offset 1 */ + Jmin -= (Jmin-2) % 32; + + /* fill in further fields here */ + dp.iMin = Imin; dp.iMax = Imax; - dp.jMin = Jmin; + dp.jMin = Jmin; dp.jMax = Jmax; - /* FIXME: should not work correctly */ /* add offset to data.d to guarantee alignment: data.d + LPAD */ /* 2-dim */ CUDA_CALL( cudaMemcpy2D( data.d + dp.lpad, pitch, d, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) ); @@ -202,8 +204,10 @@ int CGpuNode::run() { int xThreads = 32; int yThreads = nThreads / xThreads; - int xBlocks = ceil( (float)dp.nJ / (float)xThreads ); - int yBlocks = ceil( (float)dp.nI / (float)yThreads ); + int NJ = dp.jMax - dp.jMin + 1; + int NI = dp.iMax - dp.iMin + 1; + int xBlocks = ceil( (float)NJ / (float)xThreads ); + int yBlocks = ceil( (float)NI / (float)yThreads ); dim3 threads( xThreads, yThreads ); dim3 blocks( xBlocks, yBlocks ); @@ -225,17 +229,25 @@ int CGpuNode::run() { runFluxBoundaryKernel<<>>( data ); CUDA_CALL( cudaEventRecord( evtEnd[3], 0 ) ); CUDA_CALL( cudaEventRecord( evtStart[4], 0 ) ); - runGridExtendKernel1<<>>( data ); - runGridExtendKernel2<<<1,1>>>( data ); + CUDA_CALL( cudaMemset( data.g_MinMax, 0, sizeof(int4) ) ); + runGridExtendKernel<<>>( data ); CUDA_CALL( cudaEventRecord( evtEnd[4], 0 ) ); int4 MinMax; CUDA_CALL( cudaMemcpy( &MinMax, data.g_MinMax, sizeof(int4), cudaMemcpyDeviceToHost ) ); cudaDeviceSynchronize(); - Imin = dp.iMin = MinMax.x; - Imax = dp.iMax = MinMax.y; - Jmin = dp.jMin = MinMax.z; - Jmax = dp.jMax = MinMax.w; + + if( MinMax.x ) + Imin = dp.iMin = max( dp.iMin-1, 2 ); + + if( MinMax.y ) + Imax = dp.iMax = min( dp.iMax+1, dp.nI-1 ); + + if( MinMax.z ) + Jmin = dp.jMin = max( dp.jMin-32, 2 ); + + if( MinMax.w ) + Jmax = dp.jMax = min( dp.jMax+1, dp.nJ-1 ); float _dur; for( int j = 0; j < 5; j++ ) { -- GitLab