Commit ddec049d authored by Johannes Spazier's avatar Johannes Spazier

- Fixed problems with timing of kernel functions when using multiple streams per physical GPU.

- Fixed some indexing issues in kernel WaveBoundary (memory accesses of 1D arrays were not 0-based).
- Closed small memory leaks caused by CUDA calls (unreleased host allocations and events).
parent b1b96525
......@@ -67,17 +67,17 @@ __global__ void runWaveBoundaryKernel( KernelData data ) {
if( id <= dp.nJ-1 ) {
ij = dt.idx(1,id);
dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB2[id];
dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB2[id-1];
if( dt.fM[ij] > 0 ) dt.h[ij] = -dt.h[ij];
}
if( id == 2 ) {
ij = dt.idx(1,1);
dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[1];
dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[0];
if( dt.fN[ij] > 0 ) dt.h[ij] = -dt.h[ij];
ij = dt.idx(1,dp.nJ);
dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[1];
dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[0];
if( dt.fN[dt.dn(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
}
......@@ -85,17 +85,17 @@ __global__ void runWaveBoundaryKernel( KernelData data ) {
if( id <= dp.nJ-1 ) {
ij = dt.idx(dp.nI,id);
dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB4[id];
dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB4[id-1];
if( dt.fM[dt.le(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
}
if( id == 2 ) {
ij = dt.idx(dp.nI,1);
dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[dp.nI];
dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[dp.nI-1];
if( dt.fN[ij] > 0 ) dt.h[ij] = -dt.h[ij];
ij = dt.idx(dp.nI,dp.nJ);
dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[dp.nI];
dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[dp.nI-1];
if( dt.fN[dt.dn(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
}
......@@ -103,13 +103,13 @@ __global__ void runWaveBoundaryKernel( KernelData data ) {
if( id <= dp.nI - 1 ) {
ij = dt.idx(id,1);
dt.h[ij] = sqrtf( powf(dt.fN[ij],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.le(ij)]),2.0f) )*dt.cB1[id];
dt.h[ij] = sqrtf( powf(dt.fN[ij],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.le(ij)]),2.0f) )*dt.cB1[id-1];
if( dt.fN[ij] > 0 ) dt.h[ij] = -dt.h[ij];
}
if( id <= dp.nI - 1 ) {
ij = dt.idx(id,dp.nJ);
dt.h[ij] = sqrtf( powf(dt.fN[dt.dn(ij)],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.dn(ij)]),2.0f) )*dt.cB3[id];
dt.h[ij] = sqrtf( powf(dt.fN[dt.dn(ij)],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.dn(ij)]),2.0f) )*dt.cB3[id-1];
if( dt.fN[dt.dn(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
}
......
......@@ -11,11 +11,22 @@ CGpuNode::CGpuNode() {
num_real_gpus = 2;
vgpus = new VGpu[num_virtual_gpus];
gpus = new Gpu[num_real_gpus];
cudaMallocHost( &extend, num_virtual_gpus * sizeof(int4) );
for( int i = 0; i < VGpu::NEVENTS; i++ ) {
dur[i] = 0.0;
for( int j = 0; j < num_real_gpus; j++ ) {
cudaSetDevice( j );
gpus[j].id = j;
for( int i = 0; i < gpus[j].NEVENTS; i++ ) {
cudaEventCreate( &(gpus[j].evtStart[i]) );
cudaEventCreate( &(gpus[j].evtEnd[i]) );
gpus[j].dur[i] = 0.0f;
}
}
for( int j = 0; j < num_virtual_gpus; j++ ) {
......@@ -25,20 +36,17 @@ CGpuNode::CGpuNode() {
vgpu.data.devID = j;
vgpu.data.devNum = num_virtual_gpus;
vgpu.dev = j % num_real_gpus;
vgpu.dev = &(gpus[j % num_real_gpus]);
vgpu.dev->maxId = j / num_real_gpus;
vgpu.relId = j / num_real_gpus;
cudaSetDevice( vgpu.dev );
cudaSetDevice( vgpu.dev->id );
for( int i = 0; i < vgpu.NSTREAMS; i++ ) {
cudaStreamCreate( &(vgpu.stream[i]) );
}
for( int i = 0; i < vgpu.NEVENTS; i++ ) {
cudaEventCreate( &(vgpu.evtStart[i]) );
cudaEventCreate( &(vgpu.evtEnd[i]) );
vgpu.dur[i] = 0.0;
}
cudaEventCreate( &vgpu.evtSync );
}
......@@ -72,18 +80,28 @@ CGpuNode::~CGpuNode() {
VGpu& vgpu = vgpus[j];
cudaSetDevice( vgpu.dev );
cudaSetDevice( vgpu.dev->id );
for( int i = 0; i < vgpu.NSTREAMS; i++ ) {
cudaStreamDestroy( vgpu.stream[i] );
}
for( int i = 0; i < 5; i++ ) {
cudaEventDestroy( vgpu.evtStart[i] );
cudaEventDestroy( vgpu.evtEnd[i] );
cudaEventDestroy( vgpu.evtSync );
}
for( int j = 0; j < num_real_gpus; j++ ) {
cudaSetDevice( j );
for( int i = 0; i < gpus[j].NEVENTS; i++ ) {
cudaEventDestroy( gpus[j].evtStart[i] );
cudaEventDestroy( gpus[j].evtEnd[i] );
}
cudaDeviceReset();
}
}
int CGpuNode::mallocMem() {
......@@ -111,7 +129,7 @@ int CGpuNode::mallocMem() {
int ghost = vgpu.gb + vgpu.gt;
cudaSetDevice( vgpu.dev );
cudaSetDevice( vgpu.dev->id );
/* arrays that need ghost zones must add 2 to vgpu.size */
/* 2-dim */
......@@ -171,7 +189,7 @@ int CGpuNode::copyToGPU() {
int off = (vgpu.off - vgpu.gt - 1);
int ghost = vgpu.gb + vgpu.gt;
cudaSetDevice( vgpu.dev );
cudaSetDevice( vgpu.dev->id );
/* FIXME: should not work correctly */
/* add offset to data.d to guarantee alignment: data.d + LPAD */
......@@ -211,7 +229,7 @@ int CGpuNode::copyFromGPU() {
int off = (vgpu.off - 1) * dp.nJ;
cudaSetDevice( vgpu.dev );
cudaSetDevice( vgpu.dev->id );
CUDA_CALL( cudaMemcpy2D( hMax + off, dp.nJ * sizeof(float), data.hMax + (vgpu.gt)*data.params.pI + dp.lpad, pitch, dp.nJ * sizeof(float), vgpu.size, cudaMemcpyDeviceToHost ) );
CUDA_CALL( cudaMemcpy2D( tArr + off, dp.nJ * sizeof(float), data.tArr + (vgpu.gt)*data.params.pI + dp.lpad, pitch, dp.nJ * sizeof(float), vgpu.size, cudaMemcpyDeviceToHost ) );
......@@ -236,7 +254,7 @@ int CGpuNode::copyIntermediate() {
int off = (vgpu.off - 1) * dp.nJ;
cudaSetDevice( vgpu.dev );
cudaSetDevice( vgpu.dev->id );
CUDA_CALL( cudaMemcpy2D( h + off, dp.nJ * sizeof(float), data.h + (vgpu.gt) * data.params.pI + dp.lpad, pitch, dp.nJ * sizeof(float), vgpu.size, cudaMemcpyDeviceToHost ) );
......@@ -273,7 +291,7 @@ int CGpuNode::copyPOIs() {
int id = vgpu->data.idx( vgpu->getRel(i), j );
CUDA_CALL( cudaSetDevice( vgpu->dev ) )
CUDA_CALL( cudaSetDevice( vgpu->dev->id ) )
CUDA_CALL( cudaMemcpy( h + idxPOI[n], vgpu->data.h + dp.lpad + id, sizeof(float), cudaMemcpyDeviceToHost ) );
}
......@@ -282,14 +300,12 @@ int CGpuNode::copyPOIs() {
int CGpuNode::freeMem() {
float max_dur = 0.0;
for( int i = 0; i < num_virtual_gpus; i++ ) {
VGpu& vgpu = vgpus[i];
KernelData& data = vgpu.data;
cudaSetDevice( vgpu.dev );
cudaSetDevice( vgpu.dev->id );
/* 2-dim */
CUDA_CALL( cudaFree( data.d ) );
......@@ -309,24 +325,21 @@ int CGpuNode::freeMem() {
CUDA_CALL( cudaFree( data.cB3 ) );
CUDA_CALL( cudaFree( data.cB4 ) );
float total_dur = 0.0;
for( int j = 0; j < 6; j++ ) {
total_dur += vgpus[i].dur[j];
dur[j] = max( dur[j], vgpus[i].dur[j] );
printf_v("GPU #%u, duration %u: %.3f\n", i, j, vgpus[i].dur[j]);
}
CUDA_CALL( cudaFree( data.extend ) );
}
max_dur = max( max_dur, total_dur );
cudaFreeHost( extend );
printf_v("GPU #%u, duration total: %.3f\n", i, total_dur);
printf_v("###############\n");
for( int i = 0; i < num_real_gpus; i++ ) {
}
float dur = 0.0f;
for( int j = 0; j < 7; j++ ) {
for( int j = 0; j < 5; j++ ) {
printf_v("Duration %u: %.3f\n", j, dur[j]);
printf_v("GPU #%u, duration %u: %.3f\n", i, j, gpus[i].dur[j]);
dur += gpus[i].dur[j];
}
printf_v("GPU #%u, duration total: %.3f\n", i, dur);
}
printf_v("Duration total: %.3f\n", max_dur);
CArrayNode::freeMem();
......@@ -356,14 +369,18 @@ int CGpuNode::run() {
vgpu.threads = dim3( xThreads, yThreads );
vgpu.blocks = dim3( ceil( (float)dp.nJ / (float)xThreads ), ceil( (float)data.params.nI / (float)yThreads ) );
if( ! isActive( vgpu ) )
continue;
CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
if( Par.verbose && vgpu.relId == 0 )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[0] ) );
if( isActive( vgpu ) ) {
CUDA_CALL( cudaSetDevice( vgpu.dev ) );
runWaveUpdateKernel<<<vgpu.blocks,vgpu.threads,0,vgpu.stream[0]>>>( data );
}
CUDA_CALL( cudaEventRecord( vgpu.evtStart[0], vgpu.stream[0] ) );
runWaveUpdateKernel<<<vgpu.blocks,vgpu.threads,0,vgpu.stream[0]>>>( data );
CUDA_CALL( cudaEventRecord( vgpu.evtEnd[0], vgpu.stream[0] ) );
if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[0] ) );
}
for( int i = 0; i < num_virtual_gpus; i++ ) {
......@@ -371,36 +388,45 @@ int CGpuNode::run() {
VGpu& vgpu = vgpus[i];
KernelData& data = vgpu.data;
if( ! isActive( vgpu ) )
continue;
CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
CUDA_CALL( cudaSetDevice( vgpu.dev ) );
if( Par.verbose && vgpu.relId == 0 )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[1] ) );
CUDA_CALL( cudaEventRecord( vgpu.evtStart[1], vgpu.stream[0] ) );
runWaveBoundaryKernel<<<vgpu.nBlocks,nThreads,0,vgpu.stream[0]>>>( data );
CUDA_CALL( cudaEventRecord( vgpu.evtEnd[1], vgpu.stream[0] ) );
if( isActive( vgpu ) ) {
runWaveBoundaryKernel<<<vgpu.nBlocks,nThreads,0,vgpu.stream[0]>>>( data );
}
if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[1] ) );
}
for( int i = 0; i < num_virtual_gpus; i++ ) {
VGpu& vgpu = vgpus[i];
if( ! isActive( vgpu ) )
continue;
CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
CUDA_CALL( cudaSetDevice( vgpu.dev ) );
if( Par.verbose && vgpu.relId == 0 )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[5] ) );
CUDA_CALL( cudaEventRecord( vgpu.evtStart[5], vgpu.stream[0] ) );
if( i < num_virtual_gpus - 1 ) {
int off = ( vgpu.getRel(vgpu.end) - 1 ) * vgpu.data.params.pI;
CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.h, vgpus[i+1].dev, vgpu.data.h + off, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
}
if( isActive( vgpu ) ) {
if( i < num_virtual_gpus - 1 ) {
int off = ( vgpu.getRel(vgpu.end) - 1 ) * vgpu.data.params.pI;
CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.h, vgpus[i+1].dev->id, vgpu.data.h + off, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
}
if( i > 0 ) {
int off = ( vgpus[i-1].getRel(vgpus[i-1].end) ) * vgpus[i-1].data.params.pI;
CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.h + off, vgpus[i-1].dev->id, vgpu.data.h + vgpu.data.params.pI, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
}
if( i > 0 ) {
int off = ( vgpus[i-1].getRel(vgpus[i-1].end) ) * vgpus[i-1].data.params.pI;
CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.h + off, vgpus[i-1].dev, vgpu.data.h + vgpu.data.params.pI, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
}
CUDA_CALL( cudaEventRecord( vgpu.evtEnd[5], vgpu.stream[0] ) );
if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[5] ) );
cudaEventRecord( vgpu.evtSync, vgpu.stream[0] );
}
......@@ -412,7 +438,7 @@ int CGpuNode::run() {
if( ! isActive(vgpu) )
continue;
CUDA_CALL( cudaSetDevice( vgpu.dev ) );
CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
if( i < num_virtual_gpus - 1 )
cudaStreamWaitEvent( vgpu.stream[0], vgpus[i+1].evtSync, 0 );
......@@ -426,14 +452,18 @@ int CGpuNode::run() {
VGpu& vgpu = vgpus[i];
KernelData& data = vgpu.data;
if( ! isActive( vgpu ) )
continue;
CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
if( Par.verbose && vgpu.relId == 0 )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[2] ) );
CUDA_CALL( cudaSetDevice( vgpu.dev ) );
if( isActive( vgpu ) ) {
CUDA_CALL( cudaEventRecord( vgpu.evtStart[2], vgpu.stream[0] ) );
runFluxUpdateKernel<<<vgpu.blocks,vgpu.threads,0,vgpu.stream[0]>>>( data );
CUDA_CALL( cudaEventRecord( vgpu.evtEnd[2], vgpu.stream[0] ) );
runFluxUpdateKernel<<<vgpu.blocks,vgpu.threads,0,vgpu.stream[0]>>>( data );
}
if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[2] ) );
}
for( int i = 0; i < num_virtual_gpus; i++ ) {
......@@ -441,37 +471,47 @@ int CGpuNode::run() {
VGpu& vgpu = vgpus[i];
KernelData& data = vgpu.data;
if( ! isActive( vgpu ) )
continue;
CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
if( Par.verbose && vgpu.relId == 0 )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[3] ) );
CUDA_CALL( cudaSetDevice( vgpu.dev ) );
if( isActive( vgpu ) ) {
CUDA_CALL( cudaEventRecord( vgpu.evtStart[3], vgpu.stream[0] ) );
runFluxBoundaryKernel<<<vgpu.nBlocks,nThreads,0,vgpu.stream[0]>>>( data );
CUDA_CALL( cudaEventRecord( vgpu.evtEnd[3], vgpu.stream[0] ) );
runFluxBoundaryKernel<<<vgpu.nBlocks,nThreads,0,vgpu.stream[0]>>>( data );
}
if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[3] ) );
}
for( int i = 0; i < num_virtual_gpus; i++ ) {
VGpu& vgpu = vgpus[i];
if( ! isActive( vgpu ) )
continue;
CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
CUDA_CALL( cudaSetDevice( vgpu.dev ) );
if( Par.verbose && vgpu.relId == 0 )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[6] ) );
if( i < num_virtual_gpus - 1 ) {
int off = ( vgpu.getRel(vgpu.end) - 1 ) * vgpu.data.params.pI;
CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.fN, vgpus[i+1].dev, vgpu.data.fN + off, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.fM, vgpus[i+1].dev, vgpu.data.fM + off, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
}
if( isActive( vgpu ) ) {
if( i < num_virtual_gpus - 1 ) {
int off = ( vgpu.getRel(vgpu.end) - 1 ) * vgpu.data.params.pI;
CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.fN, vgpus[i+1].dev->id, vgpu.data.fN + off, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.fM, vgpus[i+1].dev->id, vgpu.data.fM + off, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
}
if( i > 0 ) {
int off = ( vgpus[i-1].getRel(vgpus[i-1].end) ) * vgpus[i-1].data.params.pI;
CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.fN + off, vgpus[i-1].dev->id, vgpu.data.fN + vgpu.data.params.pI, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.fM + off, vgpus[i-1].dev->id, vgpu.data.fM + vgpu.data.params.pI, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
}
if( i > 0 ) {
int off = ( vgpus[i-1].getRel(vgpus[i-1].end) ) * vgpus[i-1].data.params.pI;
CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.fN + off, vgpus[i-1].dev, vgpu.data.fN + vgpu.data.params.pI, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.fM + off, vgpus[i-1].dev, vgpu.data.fM + vgpu.data.params.pI, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
}
if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[6] ) );
}
for( int i = 0; i < num_virtual_gpus; i++ ) {
......@@ -479,15 +519,20 @@ int CGpuNode::run() {
VGpu& vgpu = vgpus[i];
KernelData& data = vgpu.data;
if( ! isActive( vgpu ) )
continue;
CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
if( Par.verbose && vgpu.relId == 0 )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[4] ) );
CUDA_CALL( cudaSetDevice( vgpu.dev ) );
if( isActive( vgpu ) ) {
runGridExtendKernel<<<vgpu.nBlocks,nThreads,0,vgpu.stream[0]>>>( data );
CUDA_CALL( cudaMemcpyAsync( &(extend[i]), data.extend, sizeof(int4), cudaMemcpyDeviceToHost, vgpu.stream[0]) );
}
if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[4] ) );
CUDA_CALL( cudaEventRecord( vgpu.evtStart[4], vgpu.stream[0] ) );
runGridExtendKernel<<<vgpu.nBlocks,nThreads,0,vgpu.stream[0]>>>( data );
CUDA_CALL( cudaMemcpyAsync( &(extend[i]), data.extend, sizeof(int4), cudaMemcpyDeviceToHost, vgpu.stream[0]) );
CUDA_CALL( cudaEventRecord( vgpu.evtEnd[4], vgpu.stream[0] ) );
}
for( int i = 0; i < num_virtual_gpus; i++ ) {
......@@ -495,7 +540,7 @@ int CGpuNode::run() {
VGpu& vgpu = vgpus[i];
KernelData& data = vgpu.data;
cudaSetDevice( vgpu.dev );
cudaSetDevice( vgpu.dev->id );
cudaDeviceSynchronize();
CUDA_CALL( cudaMemset( data.extend, 0, sizeof(int4) ) );
......@@ -507,11 +552,18 @@ int CGpuNode::run() {
glb_MinMax.z += extend[i].z;
glb_MinMax.w += extend[i].w;
}
float dur;
for( int j = 0; j < 6; j++ ) {
cudaEventElapsedTime( &dur, vgpus[i].evtStart[j], vgpus[i].evtEnd[j]);
vgpus[i].dur[j] += dur;
if( Par.verbose ) {
for( int i = 0; i < num_real_gpus; i++ ) {
cudaSetDevice( i );
float dur;
for( int j = 0; j < 7; j++ ) {
if( cudaEventElapsedTime( &dur, gpus[i].evtStart[j], gpus[i].evtEnd[j]) == cudaSuccess )
gpus[i].dur[j] += dur;
}
}
}
......
......@@ -63,6 +63,20 @@ public:
__host__ __device__ int idx( int i, int j ) { return (j-1) + (i-1) * params.pI + params.lpad; }
};
class Gpu {
public:
int id;
int maxId;
static const short NEVENTS = 7;
cudaEvent_t evtStart[NEVENTS];
cudaEvent_t evtEnd[NEVENTS];
float dur[NEVENTS];
};
class VGpu {
public:
......@@ -76,14 +90,10 @@ public:
static const short NSTREAMS = 2;
cudaStream_t stream[NSTREAMS];
static const short NEVENTS = 6;
cudaEvent_t evtStart[NEVENTS];
cudaEvent_t evtEnd[NEVENTS];
float dur[NEVENTS];
cudaEvent_t evtSync;
int dev;
Gpu *dev;
int relId;
int nBlocks;
dim3 threads;
......@@ -100,6 +110,8 @@ protected:
VGpu *vgpus;
Params params;
Gpu *gpus;
int4 *extend;
/* line size in bytes */
......@@ -112,8 +124,6 @@ protected:
int num_virtual_gpus;
int num_real_gpus;
float dur[VGpu::NEVENTS];
public:
CGpuNode();
~CGpuNode();
......
......@@ -68,43 +68,43 @@ __global__ void runWaveBoundaryKernel( KernelData data ) {
if( id <= dp.nI-1 ) {
ij = dt.idx(id,1);
dt.h[ij] = sqrtf( powf(dt.fN[ij],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.le(ij)]),2.0f) )*dt.cB1[id];
dt.h[ij] = sqrtf( powf(dt.fN[ij],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.le(ij)]),2.0f) )*dt.cB1[id-1];
if( dt.fN[ij] > 0 ) dt.h[ij] = -dt.h[ij];
}
if( id <= dp.nI-1 ) {
ij = dt.idx(id,dp.nJ);
dt.h[ij] = sqrtf( powf(dt.fN[dt.dn(ij)],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.dn(ij)]),2.0f) )*dt.cB3[id];
dt.h[ij] = sqrtf( powf(dt.fN[dt.dn(ij)],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.dn(ij)]),2.0f) )*dt.cB3[id-1];
if( dt.fN[dt.dn(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
}
if( id <= dp.nJ-1 ) {
ij = dt.idx(1,id);
dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB2[id];
dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB2[id-1];
if( dt.fM[ij] > 0 ) dt.h[ij] = -dt.h[ij];
}
if( id <= dp.nJ-1 ) {
ij = dt.idx(dp.nI,id);
dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB4[id];
dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB4[id-1];
if( dt.fM[dt.le(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
}
if( id == 2 ) {
ij = dt.idx(1,1);
dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[1];
dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[0];
if( dt.fN[ij] > 0 ) dt.h[ij] = -dt.h[ij];
ij = dt.idx(dp.nI,1);
dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[dp.nI];
dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[dp.nI-1];
if( dt.fN[ij] > 0 ) dt.h[ij] = -dt.h[ij];
ij = dt.idx(1,dp.nJ);
dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[1];
dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[0];
if( dt.fN[dt.dn(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
ij = dt.idx(dp.nI,dp.nJ);
dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[dp.nI];
dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[dp.nI-1];
if( dt.fN[dt.dn(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
}
}