Commit 9e4d7de8 authored by Johannes Spazier

- Update of the GridExtend kernel.
- The memory alignment was adjusted.
- The progress output is now flushed.
parent 9f6d2882
@@ -129,7 +129,9 @@ int main( int argc, char **argv )
 				lastProgress = 0;
 			}
 		}
 
+		fflush(stdout);
+
 		if( Par.outPropagation ) {
 			if( lastPropagation >= Par.outPropagation ) {
 				Node.copyIntermediate();
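A side note on the new fflush call: stdout is block-buffered when redirected to a file or pipe, so progress lines can otherwise sit in the stdio buffer long after they were printed. A minimal sketch in plain C (not from this repository):

#include <stdio.h>

int main( void ) {
	for( int step = 0; step <= 100; step += 10 ) {
		printf( "Progress: %d%%\r", step );
		fflush( stdout );   /* force the line out of the stdio buffer immediately */
	}
	printf( "\n" );
	return 0;
}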
@@ -35,7 +35,7 @@ __global__ void runWaveUpdateKernel( KernelData data ) {
 	float absH;
 
 	/* maybe unnecessary if controlled from outside */
 	if( i <= dp.iMax && j <= dp.jMax && data.d[ij] != 0 ) {
 
 		float hh = data.h[ij] - data.cR1[ij] * ( data.fM[ij] - data.fM[data.le(ij)] + data.fN[ij] * data.cR6[j] - data.fN[data.dn(ij)]*data.cR6[j-1] );
@@ -49,10 +49,9 @@ __global__ void runWaveUpdateKernel( KernelData data ) {
 		}
 
 		if( dp.sshArrivalThreshold && data.tArr[ij] < 0 && absH > dp.sshArrivalThreshold )
 			data.tArr[ij] = dp.mTime;
-			data.tArr[ij] = dp.mTime;
 
 		data.h[ij] = hh;
 	}
 }
@@ -65,7 +64,7 @@ __global__ void runFluxUpdateKernel( KernelData data ) {
 	int j = blockIdx.x * blockDim.x + threadIdx.x + dp.jMin;
 	int ij = data.idx(i,j);
 
 	if( i <= dp.iMax && j <= dp.jMax && data.d[ij] != 0 ) {
 
 		float hh = data.h[ij];
@@ -171,106 +170,75 @@ __global__ void runFluxBoundaryKernel( KernelData data ) {
 }
 
-__device__ int4 c_MinMax = {0,0,0,0};
-
-__global__ void runGridExtendKernel1( KernelData data ) {
-#if (__CUDA_ARCH__ >= 130)
+__global__ void runGridExtendKernel( KernelData data ) {
 	Params& dp = data.params;
 	int id = blockIdx.x * blockDim.x + threadIdx.x + 1;
 
+#if (__CUDA_ARCH__ >= 130)
 	if( id >= dp.jMin && id <= dp.jMax ) {
 		if( fabsf(data.h[data.idx(dp.iMin+2,id)]) > dp.sshClipThreshold )
-			atomicAdd( &c_MinMax.x, 1 );
+			atomicAdd( &(data.g_MinMax->x), 1 );
 		if( fabsf(data.h[data.idx(dp.iMax-2,id)]) > dp.sshClipThreshold )
-			atomicAdd( &c_MinMax.y, 1 );
+			atomicAdd( &(data.g_MinMax->y), 1 );
 	}
 
 	if( id >= dp.iMin && id <= dp.iMax ) {
 		if( fabsf(data.h[data.idx(id,dp.jMin+2)]) > dp.sshClipThreshold )
-			atomicAdd( &c_MinMax.z, 1 );
+			atomicAdd( &(data.g_MinMax->z), 1 );
		if( fabsf(data.h[data.idx(id,dp.jMax-2)]) > dp.sshClipThreshold )
-			atomicAdd( &c_MinMax.w, 1 );
+			atomicAdd( &(data.g_MinMax->w), 1 );
 	}
-#endif
-}
-
-__global__ void runGridExtendKernel2( KernelData data ) {
-	Params& dp = data.params;
-	int id = blockIdx.x * blockDim.x + threadIdx.x;
-
-	if( id == 0 ) {
-#if (__CUDA_ARCH__ >= 130)
-		if( c_MinMax.x )
-			dp.iMin = max( dp.iMin-1, 2 );
-		if( c_MinMax.y )
-			dp.iMax = min( dp.iMax+1, dp.nI-1 );
-		if( c_MinMax.z )
-			dp.jMin = max( dp.jMin-1, 2 );
-		if( c_MinMax.w )
-			dp.jMax = min( dp.jMax+1, dp.nJ-1 );
-		c_MinMax.x = 0;
-		c_MinMax.y = 0;
-		c_MinMax.z = 0;
-		c_MinMax.w = 0;
 #else
+	if( id == 1 ) {
 		for( int j = dp.jMin; j <= dp.jMax; j++ ) {
 			if( fabsf(data.h[data.idx(dp.iMin+2,j)]) > dp.sshClipThreshold ) {
-				dp.iMin = max( dp.iMin-1, 2 );
+				data.g_MinMax->x = 1;
 				break;
 			}
 		}
 		for( int j = dp.jMin; j <= dp.jMax; j++ ) {
 			if( fabsf(data.h[data.idx(dp.iMax-2,j)]) > dp.sshClipThreshold ) {
-				dp.iMax = min( dp.iMax+1, dp.nI-1 );
+				data.g_MinMax->y = 1;
 				break;
 			}
 		}
 		for( int i = dp.iMin; i <= dp.iMax; i++ ) {
 			if( fabsf(data.h[data.idx(i,dp.jMin+2)]) > dp.sshClipThreshold ) {
-				dp.jMin = max( dp.jMin-1, 2 );
+				data.g_MinMax->z = 1;
 				break;
 			}
 		}
 		for( int i = dp.iMin; i <= dp.iMax; i++ ) {
 			if( fabsf(data.h[data.idx(i,dp.jMax-2)]) > dp.sshClipThreshold ) {
-				dp.jMax = min( dp.jMax+1, dp.nJ-1 );
+				data.g_MinMax->w = 1;
 				break;
 			}
 		}
-#endif
-
-		int4 tmp;
-		tmp.x = dp.iMin;
-		tmp.y = dp.iMax;
-		tmp.z = dp.jMin;
-		tmp.w = dp.jMax;
-		*(data.g_MinMax) = tmp;
 	}
-}
+#endif
+}
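The rewritten kernel replaces the former two-kernel scheme (count into the device-global c_MinMax, then adjust the window in a single-thread second kernel) with one pass that only records, per grid edge, whether any boundary cell exceeds sshClipThreshold; the actual window adjustment moves to the host (see the run() hunk below). A hypothetical, stripped-down sketch of this flag-and-extend pattern — markActiveEdge, flags, and threshold are illustrative names, not from the repo:

/* one thread per boundary cell; any hit makes flags->x non-zero */
__global__ void markActiveEdge( const float *h, int n, float threshold, int4 *flags ) {
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if( i < n && fabsf( h[i] ) > threshold )
		atomicAdd( &(flags->x), 1 );
}

The host zeroes flags with cudaMemset before the launch and treats each non-zero field as "extend this edge", which is exactly how the updated run() consumes g_MinMax.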
@@ -29,7 +29,6 @@ __global__ void runWaveUpdateKernel( KernelData data );
 __global__ void runWaveBoundaryKernel( KernelData data );
 __global__ void runFluxUpdateKernel( KernelData data );
 __global__ void runFluxBoundaryKernel( KernelData data );
-__global__ void runGridExtendKernel1( KernelData data );
-__global__ void runGridExtendKernel2( KernelData data );
+__global__ void runGridExtendKernel( KernelData data );
 
 #endif /* EW_KERNELS_H */
@@ -50,7 +50,7 @@ int CGpuNode::mallocMem() {
 	dp.sshArrivalThreshold = Par.sshArrivalThreshold;
 	dp.sshClipThreshold = Par.sshClipThreshold;
 	dp.sshZeroThreshold = Par.sshZeroThreshold;
-	dp.lpad = 0;
+	dp.lpad = 31;
 
 	size_t nJ_aligned = dp.nJ + dp.lpad;
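Why lpad = 31: with each row padded by 31 floats, any row segment can be shifted to start on a 32-float (128-byte) boundary, the granularity a warp needs for coalesced access. A sketch of the corresponding allocation, assuming the repo's CUDA_CALL macro and cudaMallocPitch, with illustrative sizes:

int    nI = 1000, nJ = 1500;               /* illustrative grid dimensions */
int    lpad = 31;
size_t nJ_aligned = nJ + lpad;             /* padded row length in floats  */
float *d_grid = NULL;
size_t pitch  = 0;
CUDA_CALL( cudaMallocPitch( (void**)&d_grid, &pitch, nJ_aligned * sizeof(float), nI ) );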
@@ -85,13 +85,15 @@ int CGpuNode::copyToGPU() {
 	Params& dp = data.params;
 
+	/* align left grid boundary to a multiple of 32 with an offset 1 */
+	Jmin -= (Jmin-2) % 32;
+
 	/* fill in further fields here */
 	dp.iMin = Imin;
 	dp.iMax = Imax;
 	dp.jMin = Jmin;
 	dp.jMax = Jmax;
 
 	/* FIXME: should not work correctly */
 	/* add offset to data.d to guarantee alignment: data.d + LPAD */
 	/* 2-dim */
 	CUDA_CALL( cudaMemcpy2D( data.d + dp.lpad, pitch, d, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
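Taken together with lpad = 31, the alignment works out as follows (a reading of the code, assuming data.idx uses 1-based column offsets): the first computed column is j = 2, so its element offset within a padded row is lpad + (j-1) = 32, and forcing (Jmin-2) to a multiple of 32 keeps every active row segment starting on a 32-float boundary. A worked example with illustrative values:

int Jmin = 71;
Jmin -= (Jmin - 2) % 32;   /* (71-2) % 32 = 5, so Jmin becomes 66     */
                           /* (66-2) = 64 is a multiple of 32, and    */
                           /* lpad + (Jmin-1) = 96 is 32-float aligned */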
@@ -202,8 +204,10 @@ int CGpuNode::run() {
 	int xThreads = 32;
 	int yThreads = nThreads / xThreads;
-	int xBlocks = ceil( (float)dp.nJ / (float)xThreads );
-	int yBlocks = ceil( (float)dp.nI / (float)yThreads );
+	int NJ = dp.jMax - dp.jMin + 1;
+	int NI = dp.iMax - dp.iMin + 1;
+	int xBlocks = ceil( (float)NJ / (float)xThreads );
+	int yBlocks = ceil( (float)NI / (float)yThreads );
 
 	dim3 threads( xThreads, yThreads );
 	dim3 blocks( xBlocks, yBlocks );
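With the block counts now derived from the active window rather than the full grid, the launch shrinks while the window is small. A worked example (integer ceiling shown here instead of the float cast used in the source):

int nThreads = 128;
int xThreads = 32, yThreads = nThreads / xThreads;   /* 32 x 4 threads per block */
int NJ = 100, NI = 60;                               /* active window extents    */
int xBlocks = (NJ + xThreads - 1) / xThreads;        /* ceil(100/32) = 4         */
int yBlocks = (NI + yThreads - 1) / yThreads;        /* ceil(60/4)   = 15        */
/* 4 x 15 = 60 blocks, independent of the full nI x nJ grid size */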
@@ -225,17 +229,25 @@ int CGpuNode::run() {
 	runFluxBoundaryKernel<<<nBlocks,nThreads>>>( data );
 	CUDA_CALL( cudaEventRecord( evtEnd[3], 0 ) );
 
 	CUDA_CALL( cudaEventRecord( evtStart[4], 0 ) );
-	runGridExtendKernel1<<<nBlocks,nThreads>>>( data );
-	runGridExtendKernel2<<<1,1>>>( data );
+	CUDA_CALL( cudaMemset( data.g_MinMax, 0, sizeof(int4) ) );
+	runGridExtendKernel<<<nBlocks,nThreads>>>( data );
 	CUDA_CALL( cudaEventRecord( evtEnd[4], 0 ) );
 
 	int4 MinMax;
 	CUDA_CALL( cudaMemcpy( &MinMax, data.g_MinMax, sizeof(int4), cudaMemcpyDeviceToHost ) );
 	cudaDeviceSynchronize();
 
-	Imin = dp.iMin = MinMax.x;
-	Imax = dp.iMax = MinMax.y;
-	Jmin = dp.jMin = MinMax.z;
-	Jmax = dp.jMax = MinMax.w;
+	if( MinMax.x )
+		Imin = dp.iMin = max( dp.iMin-1, 2 );
+	if( MinMax.y )
+		Imax = dp.iMax = min( dp.iMax+1, dp.nI-1 );
+	if( MinMax.z )
+		Jmin = dp.jMin = max( dp.jMin-32, 2 );
+	if( MinMax.w )
+		Jmax = dp.jMax = min( dp.jMax+1, dp.nJ-1 );
 
 	float _dur;
 	for( int j = 0; j < 5; j++ ) {
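Note that a set z-flag moves jMin left by a full 32 columns rather than by one, presumably so that the left edge of the active window stays on the 32-column alignment established in copyToGPU.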