Commit ddec049d authored by Johannes Spazier

- Fixed problems with the timing of kernel functions when using multiple streams per physical GPU (see the timing sketch below).
- Fixed indexing issues in the WaveBoundary kernel (memory accesses to 1D arrays were not 0-based).
- Closed small memory leaks caused by CUDA calls.
parent b1b96525
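
The timing fix moves the start/end events from the per-virtual-GPU VGpu struct (where they were recorded into individual streams) into the per-physical-GPU Gpu struct, recorded on the device's default stream by the first (relId == 0) and last (relId == maxId) virtual GPU mapped to that device. The following minimal sketch is separate from the commit and illustrates why one default-stream event pair times all streams of a device at once; the kernel and all names are invented for the example.

/* --- illustrative sketch, not part of the commit --- */
#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummyKernel( float *buf, int n ) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if( i < n ) buf[i] += 1.0f;
}

int main() {
    const int n = 1 << 20, nStreams = 4, chunk = n / nStreams;
    float *buf;
    cudaMalloc( &buf, n * sizeof(float) );

    cudaStream_t stream[nStreams];
    for( int i = 0; i < nStreams; i++ )
        cudaStreamCreate( &stream[i] );

    cudaEvent_t evtStart, evtEnd;
    cudaEventCreate( &evtStart );
    cudaEventCreate( &evtEnd );

    /* The legacy default stream synchronizes with all blocking streams, so
       one event pair recorded on it brackets the kernels of every stream.
       Per-stream event pairs (the old VGpu::evtStart/evtEnd) overlap in
       time, which made the old per-virtual-GPU numbers misleading. */
    cudaEventRecord( evtStart );
    for( int i = 0; i < nStreams; i++ )
        dummyKernel<<< chunk / 256, 256, 0, stream[i] >>>( buf + i * chunk, chunk );
    cudaEventRecord( evtEnd );
    cudaEventSynchronize( evtEnd );

    float ms;
    cudaEventElapsedTime( &ms, evtStart, evtEnd );
    printf( "all %d streams: %.3f ms\n", nStreams, ms );

    cudaEventDestroy( evtStart );
    cudaEventDestroy( evtEnd );
    for( int i = 0; i < nStreams; i++ )
        cudaStreamDestroy( stream[i] );
    cudaFree( buf );
    return 0;
}

In the diff below, this pattern appears as cudaEventRecord( vgpu.dev->evtStart[k] ) guarded by vgpu.relId == 0 and cudaEventRecord( vgpu.dev->evtEnd[k] ) guarded by vgpu.relId == vgpu.dev->maxId.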
@@ -67,17 +67,17 @@ __global__ void runWaveBoundaryKernel( KernelData data ) {
     if( id <= dp.nJ-1 ) {
         ij = dt.idx(1,id);
-        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB2[id];
+        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB2[id-1];
         if( dt.fM[ij] > 0 ) dt.h[ij] = -dt.h[ij];
     }
     if( id == 2 ) {
         ij = dt.idx(1,1);
-        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[1];
+        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[0];
         if( dt.fN[ij] > 0 ) dt.h[ij] = -dt.h[ij];
         ij = dt.idx(1,dp.nJ);
-        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[1];
+        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[0];
         if( dt.fN[dt.dn(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
     }
@@ -85,17 +85,17 @@ __global__ void runWaveBoundaryKernel( KernelData data ) {
     if( id <= dp.nJ-1 ) {
         ij = dt.idx(dp.nI,id);
-        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB4[id];
+        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB4[id-1];
         if( dt.fM[dt.le(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
     }
     if( id == 2 ) {
         ij = dt.idx(dp.nI,1);
-        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[dp.nI];
+        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[dp.nI-1];
         if( dt.fN[ij] > 0 ) dt.h[ij] = -dt.h[ij];
         ij = dt.idx(dp.nI,dp.nJ);
-        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[dp.nI];
+        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[dp.nI-1];
         if( dt.fN[dt.dn(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
     }
@@ -103,13 +103,13 @@ __global__ void runWaveBoundaryKernel( KernelData data ) {
     if( id <= dp.nI - 1 ) {
         ij = dt.idx(id,1);
-        dt.h[ij] = sqrtf( powf(dt.fN[ij],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.le(ij)]),2.0f) )*dt.cB1[id];
+        dt.h[ij] = sqrtf( powf(dt.fN[ij],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.le(ij)]),2.0f) )*dt.cB1[id-1];
         if( dt.fN[ij] > 0 ) dt.h[ij] = -dt.h[ij];
     }
     if( id <= dp.nI - 1 ) {
         ij = dt.idx(id,dp.nJ);
-        dt.h[ij] = sqrtf( powf(dt.fN[dt.dn(ij)],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.dn(ij)]),2.0f) )*dt.cB3[id];
+        dt.h[ij] = sqrtf( powf(dt.fN[dt.dn(ij)],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.dn(ij)]),2.0f) )*dt.cB3[id-1];
         if( dt.fN[dt.dn(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
     }
...
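A note on the kernel changes above: the boundary coefficient arrays cB1..cB4 are plain 0-based 1D arrays, while the grid index id runs from 1. Indexing with the raw 1-based id therefore reads an element shifted by one and never touches element 0. A tiny standalone sketch of the convention, with an invented array length:

/* --- illustrative sketch, not part of the commit --- */
#include <cassert>
#include <vector>

int main() {
    const int nJ = 8;                        /* invented size */
    std::vector<float> cB2( nJ );            /* 0-based storage: indices 0 .. nJ-1 */

    for( int id = 1; id <= nJ - 1; id++ ) {  /* 1-based grid loop, as in the kernel */
        /* cB2[id] would skip cB2[0] and read one element too far to the right;
           cB2[id-1] is the matching 0-based access -- the "[id-1]" in the diff. */
        assert( id - 1 >= 0 && id - 1 < nJ );
        cB2[id - 1] = 1.0f;
    }
    return 0;
}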
@@ -11,11 +11,22 @@ CGpuNode::CGpuNode() {
     num_real_gpus = 2;
     vgpus = new VGpu[num_virtual_gpus];
+    gpus = new Gpu[num_real_gpus];
     cudaMallocHost( &extend, num_virtual_gpus * sizeof(int4) );
-    for( int i = 0; i < VGpu::NEVENTS; i++ ) {
-        dur[i] = 0.0;
+    for( int j = 0; j < num_real_gpus; j++ ) {
+        cudaSetDevice( j );
+        gpus[j].id = j;
+        for( int i = 0; i < gpus[j].NEVENTS; i++ ) {
+            cudaEventCreate( &(gpus[j].evtStart[i]) );
+            cudaEventCreate( &(gpus[j].evtEnd[i]) );
+            gpus[j].dur[i] = 0.0f;
+        }
     }
     for( int j = 0; j < num_virtual_gpus; j++ ) {
@@ -25,20 +36,17 @@ CGpuNode::CGpuNode() {
         vgpu.data.devID = j;
         vgpu.data.devNum = num_virtual_gpus;
-        vgpu.dev = j % num_real_gpus;
+        vgpu.dev = &(gpus[j % num_real_gpus]);
+        vgpu.dev->maxId = j / num_real_gpus;
+        vgpu.relId = j / num_real_gpus;
-        cudaSetDevice( vgpu.dev );
+        cudaSetDevice( vgpu.dev->id );
         for( int i = 0; i < vgpu.NSTREAMS; i++ ) {
             cudaStreamCreate( &(vgpu.stream[i]) );
         }
-        for( int i = 0; i < vgpu.NEVENTS; i++ ) {
-            cudaEventCreate( &(vgpu.evtStart[i]) );
-            cudaEventCreate( &(vgpu.evtEnd[i]) );
-            vgpu.dur[i] = 0.0;
-        }
         cudaEventCreate( &vgpu.evtSync );
     }
@@ -72,18 +80,28 @@ CGpuNode::~CGpuNode() {
         VGpu& vgpu = vgpus[j];
-        cudaSetDevice( vgpu.dev );
+        cudaSetDevice( vgpu.dev->id );
         for( int i = 0; i < vgpu.NSTREAMS; i++ ) {
             cudaStreamDestroy( vgpu.stream[i] );
         }
-        for( int i = 0; i < 5; i++ ) {
-            cudaEventDestroy( vgpu.evtStart[i] );
-            cudaEventDestroy( vgpu.evtEnd[i] );
-        }
+        cudaEventDestroy( vgpu.evtSync );
     }
+    for( int j = 0; j < num_real_gpus; j++ ) {
+        cudaSetDevice( j );
+        for( int i = 0; i < gpus[j].NEVENTS; i++ ) {
+            cudaEventDestroy( gpus[j].evtStart[i] );
+            cudaEventDestroy( gpus[j].evtEnd[i] );
+        }
+        cudaDeviceReset();
+    }
 }

 int CGpuNode::mallocMem() {
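The destructor now mirrors the constructor exactly: per-virtual-GPU streams and the previously leaked evtSync event are destroyed on their device, the per-physical-GPU event pairs are destroyed in a second loop, and each device is reset. A condensed sketch of that create/destroy symmetry; the struct is a stand-in modeled on the diff's Gpu, not the project's actual type:

/* --- illustrative sketch, not part of the commit --- */
#include <cuda_runtime.h>

struct TimedGpu {
    static const int NEVENTS = 7;
    int id;
    cudaEvent_t evtStart[NEVENTS], evtEnd[NEVENTS];
    float dur[NEVENTS];

    void create( int dev ) {
        id = dev;
        cudaSetDevice( id );              /* events belong to the current device */
        for( int i = 0; i < NEVENTS; i++ ) {
            cudaEventCreate( &evtStart[i] );
            cudaEventCreate( &evtEnd[i] );
            dur[i] = 0.0f;
        }
    }
    void destroy() {
        cudaSetDevice( id );              /* destroy on the owning device */
        for( int i = 0; i < NEVENTS; i++ ) {
            cudaEventDestroy( evtStart[i] );
            cudaEventDestroy( evtEnd[i] );
        }
        cudaDeviceReset();                /* releases any remaining device state */
    }
};

int main() {
    TimedGpu g;
    g.create( 0 );
    g.destroy();                          /* without this pair, every event leaks */
    return 0;
}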
@@ -111,7 +129,7 @@ int CGpuNode::mallocMem() {
     int ghost = vgpu.gb + vgpu.gt;
-    cudaSetDevice( vgpu.dev );
+    cudaSetDevice( vgpu.dev->id );
     /* arrays that need ghost zones must add 2 to vgpu.size */
     /* 2-dim */
@@ -171,7 +189,7 @@ int CGpuNode::copyToGPU() {
     int off = (vgpu.off - vgpu.gt - 1);
     int ghost = vgpu.gb + vgpu.gt;
-    cudaSetDevice( vgpu.dev );
+    cudaSetDevice( vgpu.dev->id );
     /* FIXME: should not work correctly */
     /* add offset to data.d to guarantee alignment: data.d + LPAD */
@@ -211,7 +229,7 @@ int CGpuNode::copyFromGPU() {
     int off = (vgpu.off - 1) * dp.nJ;
-    cudaSetDevice( vgpu.dev );
+    cudaSetDevice( vgpu.dev->id );
     CUDA_CALL( cudaMemcpy2D( hMax + off, dp.nJ * sizeof(float), data.hMax + (vgpu.gt)*data.params.pI + dp.lpad, pitch, dp.nJ * sizeof(float), vgpu.size, cudaMemcpyDeviceToHost ) );
     CUDA_CALL( cudaMemcpy2D( tArr + off, dp.nJ * sizeof(float), data.tArr + (vgpu.gt)*data.params.pI + dp.lpad, pitch, dp.nJ * sizeof(float), vgpu.size, cudaMemcpyDeviceToHost ) );
@@ -236,7 +254,7 @@ int CGpuNode::copyIntermediate() {
     int off = (vgpu.off - 1) * dp.nJ;
-    cudaSetDevice( vgpu.dev );
+    cudaSetDevice( vgpu.dev->id );
     CUDA_CALL( cudaMemcpy2D( h + off, dp.nJ * sizeof(float), data.h + (vgpu.gt) * data.params.pI + dp.lpad, pitch, dp.nJ * sizeof(float), vgpu.size, cudaMemcpyDeviceToHost ) );
@@ -273,7 +291,7 @@ int CGpuNode::copyPOIs() {
     int id = vgpu->data.idx( vgpu->getRel(i), j );
-    CUDA_CALL( cudaSetDevice( vgpu->dev ) )
+    CUDA_CALL( cudaSetDevice( vgpu->dev->id ) )
     CUDA_CALL( cudaMemcpy( h + idxPOI[n], vgpu->data.h + dp.lpad + id, sizeof(float), cudaMemcpyDeviceToHost ) );
 }
@@ -282,14 +300,12 @@ int CGpuNode::copyPOIs() {
 int CGpuNode::freeMem() {
-    float max_dur = 0.0;
     for( int i = 0; i < num_virtual_gpus; i++ ) {
         VGpu& vgpu = vgpus[i];
         KernelData& data = vgpu.data;
-        cudaSetDevice( vgpu.dev );
+        cudaSetDevice( vgpu.dev->id );
         /* 2-dim */
         CUDA_CALL( cudaFree( data.d ) );
@@ -309,24 +325,21 @@ int CGpuNode::freeMem() {
         CUDA_CALL( cudaFree( data.cB3 ) );
         CUDA_CALL( cudaFree( data.cB4 ) );
-        float total_dur = 0.0;
-        for( int j = 0; j < 6; j++ ) {
-            total_dur += vgpus[i].dur[j];
-            dur[j] = max( dur[j], vgpus[i].dur[j] );
-            printf_v("GPU #%u, duration %u: %.3f\n", i, j, vgpus[i].dur[j]);
-        }
-        max_dur = max( max_dur, total_dur );
-        printf_v("GPU #%u, duration total: %.3f\n", i, total_dur);
-        printf_v("###############\n");
+        CUDA_CALL( cudaFree( data.extend ) );
     }
-    for( int j = 0; j < 5; j++ ) {
-        printf_v("Duration %u: %.3f\n", j, dur[j]);
-    }
-    printf_v("Duration total: %.3f\n", max_dur);
+    cudaFreeHost( extend );
+    for( int i = 0; i < num_real_gpus; i++ ) {
+        float dur = 0.0f;
+        for( int j = 0; j < 7; j++ ) {
+            printf_v("GPU #%u, duration %u: %.3f\n", i, j, gpus[i].dur[j]);
+            dur += gpus[i].dur[j];
+        }
+        printf_v("GPU #%u, duration total: %.3f\n", i, dur);
+    }
     CArrayNode::freeMem();
@@ -356,14 +369,18 @@ int CGpuNode::run() {
     vgpu.threads = dim3( xThreads, yThreads );
     vgpu.blocks = dim3( ceil( (float)dp.nJ / (float)xThreads ), ceil( (float)data.params.nI / (float)yThreads ) );
-    if( ! isActive( vgpu ) )
-        continue;
-    CUDA_CALL( cudaSetDevice( vgpu.dev ) );
-    CUDA_CALL( cudaEventRecord( vgpu.evtStart[0], vgpu.stream[0] ) );
-    runWaveUpdateKernel<<<vgpu.blocks,vgpu.threads,0,vgpu.stream[0]>>>( data );
-    CUDA_CALL( cudaEventRecord( vgpu.evtEnd[0], vgpu.stream[0] ) );
+    CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
+    if( Par.verbose && vgpu.relId == 0 )
+        CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[0] ) );
+    if( isActive( vgpu ) ) {
+        runWaveUpdateKernel<<<vgpu.blocks,vgpu.threads,0,vgpu.stream[0]>>>( data );
+    }
+    if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
+        CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[0] ) );
 }
 for( int i = 0; i < num_virtual_gpus; i++ ) {
@@ -371,36 +388,45 @@ int CGpuNode::run() {
     VGpu& vgpu = vgpus[i];
     KernelData& data = vgpu.data;
-    if( ! isActive( vgpu ) )
-        continue;
-    CUDA_CALL( cudaSetDevice( vgpu.dev ) );
-    CUDA_CALL( cudaEventRecord( vgpu.evtStart[1], vgpu.stream[0] ) );
-    runWaveBoundaryKernel<<<vgpu.nBlocks,nThreads,0,vgpu.stream[0]>>>( data );
-    CUDA_CALL( cudaEventRecord( vgpu.evtEnd[1], vgpu.stream[0] ) );
+    CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
+    if( Par.verbose && vgpu.relId == 0 )
+        CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[1] ) );
+    if( isActive( vgpu ) ) {
+        runWaveBoundaryKernel<<<vgpu.nBlocks,nThreads,0,vgpu.stream[0]>>>( data );
+    }
+    if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
+        CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[1] ) );
 }

 for( int i = 0; i < num_virtual_gpus; i++ ) {
     VGpu& vgpu = vgpus[i];
-    if( ! isActive( vgpu ) )
-        continue;
-    CUDA_CALL( cudaSetDevice( vgpu.dev ) );
-    CUDA_CALL( cudaEventRecord( vgpu.evtStart[5], vgpu.stream[0] ) );
-    if( i < num_virtual_gpus - 1 ) {
-        int off = ( vgpu.getRel(vgpu.end) - 1 ) * vgpu.data.params.pI;
-        CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.h, vgpus[i+1].dev, vgpu.data.h + off, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
-    }
-    if( i > 0 ) {
-        int off = ( vgpus[i-1].getRel(vgpus[i-1].end) ) * vgpus[i-1].data.params.pI;
-        CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.h + off, vgpus[i-1].dev, vgpu.data.h + vgpu.data.params.pI, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
-    }
-    CUDA_CALL( cudaEventRecord( vgpu.evtEnd[5], vgpu.stream[0] ) );
+    CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
+    if( Par.verbose && vgpu.relId == 0 )
+        CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[5] ) );
+    if( isActive( vgpu ) ) {
+        if( i < num_virtual_gpus - 1 ) {
+            int off = ( vgpu.getRel(vgpu.end) - 1 ) * vgpu.data.params.pI;
+            CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.h, vgpus[i+1].dev->id, vgpu.data.h + off, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
+        }
+        if( i > 0 ) {
+            int off = ( vgpus[i-1].getRel(vgpus[i-1].end) ) * vgpus[i-1].data.params.pI;
+            CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.h + off, vgpus[i-1].dev->id, vgpu.data.h + vgpu.data.params.pI, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
+        }
+    }
+    if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
+        CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[5] ) );
     cudaEventRecord( vgpu.evtSync, vgpu.stream[0] );
 }
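The loop above hands the top/bottom halo row of h (params.pI floats) to the neighbouring virtual GPU with cudaMemcpyPeerAsync, which now receives the physical device ids via dev->id instead of the old plain integer field. Below is a stripped-down two-device sketch of such a halo copy; buffer layout and sizes are invented for the example:

/* --- illustrative sketch, not part of the commit --- */
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int ndev = 0;
    cudaGetDeviceCount( &ndev );
    if( ndev < 2 ) { printf( "needs two GPUs\n" ); return 0; }

    const int rowLen = 1024;                    /* floats per padded row (pI) */
    float *d0, *d1;

    cudaSetDevice( 0 );
    cudaMalloc( &d0, 4 * rowLen * sizeof(float) );
    cudaSetDevice( 1 );
    cudaMalloc( &d1, 4 * rowLen * sizeof(float) );

    cudaSetDevice( 0 );
    cudaStream_t s0;
    cudaStreamCreate( &s0 );

    /* Device 0's last owned row becomes device 1's top ghost row. The call
       uses a direct peer-to-peer transfer where available and otherwise
       stages the copy through host memory. */
    cudaMemcpyPeerAsync( d1, 1, d0 + 2 * rowLen, 0, rowLen * sizeof(float), s0 );
    cudaStreamSynchronize( s0 );

    cudaStreamDestroy( s0 );
    cudaFree( d0 );
    cudaSetDevice( 1 );
    cudaFree( d1 );
    return 0;
}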
@@ -412,7 +438,7 @@ int CGpuNode::run() {
     if( ! isActive(vgpu) )
         continue;
-    CUDA_CALL( cudaSetDevice( vgpu.dev ) );
+    CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
     if( i < num_virtual_gpus - 1 )
         cudaStreamWaitEvent( vgpu.stream[0], vgpus[i+1].evtSync, 0 );
@@ -426,14 +452,18 @@ int CGpuNode::run() {
     VGpu& vgpu = vgpus[i];
     KernelData& data = vgpu.data;
-    if( ! isActive( vgpu ) )
-        continue;
-    CUDA_CALL( cudaSetDevice( vgpu.dev ) );
-    CUDA_CALL( cudaEventRecord( vgpu.evtStart[2], vgpu.stream[0] ) );
-    runFluxUpdateKernel<<<vgpu.blocks,vgpu.threads,0,vgpu.stream[0]>>>( data );
-    CUDA_CALL( cudaEventRecord( vgpu.evtEnd[2], vgpu.stream[0] ) );
+    CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
+    if( Par.verbose && vgpu.relId == 0 )
+        CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[2] ) );
+    if( isActive( vgpu ) ) {
+        runFluxUpdateKernel<<<vgpu.blocks,vgpu.threads,0,vgpu.stream[0]>>>( data );
+    }
+    if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
+        CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[2] ) );
 }
 for( int i = 0; i < num_virtual_gpus; i++ ) {
@@ -441,37 +471,47 @@ int CGpuNode::run() {
     VGpu& vgpu = vgpus[i];
     KernelData& data = vgpu.data;
-    if( ! isActive( vgpu ) )
-        continue;
-    CUDA_CALL( cudaSetDevice( vgpu.dev ) );
-    CUDA_CALL( cudaEventRecord( vgpu.evtStart[3], vgpu.stream[0] ) );
-    runFluxBoundaryKernel<<<vgpu.nBlocks,nThreads,0,vgpu.stream[0]>>>( data );
-    CUDA_CALL( cudaEventRecord( vgpu.evtEnd[3], vgpu.stream[0] ) );
+    CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
+    if( Par.verbose && vgpu.relId == 0 )
+        CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[3] ) );
+    if( isActive( vgpu ) ) {
+        runFluxBoundaryKernel<<<vgpu.nBlocks,nThreads,0,vgpu.stream[0]>>>( data );
+    }
+    if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
+        CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[3] ) );
 }

 for( int i = 0; i < num_virtual_gpus; i++ ) {
     VGpu& vgpu = vgpus[i];
-    if( ! isActive( vgpu ) )
-        continue;
-    CUDA_CALL( cudaSetDevice( vgpu.dev ) );
-    if( i < num_virtual_gpus - 1 ) {
-        int off = ( vgpu.getRel(vgpu.end) - 1 ) * vgpu.data.params.pI;
-        CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.fN, vgpus[i+1].dev, vgpu.data.fN + off, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
-        CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.fM, vgpus[i+1].dev, vgpu.data.fM + off, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
-    }
-    if( i > 0 ) {
-        int off = ( vgpus[i-1].getRel(vgpus[i-1].end) ) * vgpus[i-1].data.params.pI;
-        CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.fN + off, vgpus[i-1].dev, vgpu.data.fN + vgpu.data.params.pI, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
-        CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.fM + off, vgpus[i-1].dev, vgpu.data.fM + vgpu.data.params.pI, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
-    }
+    CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
+    if( Par.verbose && vgpu.relId == 0 )
+        CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[6] ) );
+    if( isActive( vgpu ) ) {
+        if( i < num_virtual_gpus - 1 ) {
+            int off = ( vgpu.getRel(vgpu.end) - 1 ) * vgpu.data.params.pI;
+            CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.fN, vgpus[i+1].dev->id, vgpu.data.fN + off, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
+            CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.fM, vgpus[i+1].dev->id, vgpu.data.fM + off, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
+        }
+        if( i > 0 ) {
+            int off = ( vgpus[i-1].getRel(vgpus[i-1].end) ) * vgpus[i-1].data.params.pI;
+            CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.fN + off, vgpus[i-1].dev->id, vgpu.data.fN + vgpu.data.params.pI, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
            CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.fM + off, vgpus[i-1].dev->id, vgpu.data.fM + vgpu.data.params.pI, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
+        }
+    }
+    if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
+        CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[6] ) );
 }

 for( int i = 0; i < num_virtual_gpus; i++ ) {
@@ -479,15 +519,20 @@ int CGpuNode::run() {
     VGpu& vgpu = vgpus[i];
     KernelData& data = vgpu.data;
-    if( ! isActive( vgpu ) )