diff --git a/cpu/branches/multi-gpu/ewCudaKernels.cu b/cpu/branches/multi-gpu/ewCudaKernels.cu
index 933ed346c3affd1a26760b094a9fd15668762174..cce42701f62935fa054319795c3eafc0646700a2 100644
--- a/cpu/branches/multi-gpu/ewCudaKernels.cu
+++ b/cpu/branches/multi-gpu/ewCudaKernels.cu
@@ -67,17 +67,17 @@ __global__ void runWaveBoundaryKernel( KernelData data ) {
 
     if( id <= dp.nJ-1 ) {
         ij = dt.idx(1,id);
-        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB2[id];
+        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB2[id-1];
         if( dt.fM[ij] > 0 ) dt.h[ij] = -dt.h[ij];
     }
 
     if( id == 2 ) {
         ij = dt.idx(1,1);
-        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[1];
+        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[0];
         if( dt.fN[ij] > 0 ) dt.h[ij] = -dt.h[ij];
 
         ij = dt.idx(1,dp.nJ);
-        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[1];
+        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[0];
         if( dt.fN[dt.dn(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
     }
 
@@ -85,17 +85,17 @@ __global__ void runWaveBoundaryKernel( KernelData data ) {
 
     if( id <= dp.nJ-1 ) {
         ij = dt.idx(dp.nI,id);
-        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB4[id];
+        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB4[id-1];
         if( dt.fM[dt.le(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
     }
 
     if( id == 2 ) {
         ij = dt.idx(dp.nI,1);
-        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[dp.nI];
+        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[dp.nI-1];
         if( dt.fN[ij] > 0 ) dt.h[ij] = -dt.h[ij];
 
         ij = dt.idx(dp.nI,dp.nJ);
-        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[dp.nI];
+        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[dp.nI-1];
         if( dt.fN[dt.dn(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
     }
 
@@ -103,13 +103,13 @@ __global__ void runWaveBoundaryKernel( KernelData data ) {
 
     if( id <= dp.nI - 1 ) {
         ij = dt.idx(id,1);
-        dt.h[ij] = sqrtf( powf(dt.fN[ij],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.le(ij)]),2.0f) )*dt.cB1[id];
+        dt.h[ij] = sqrtf( powf(dt.fN[ij],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.le(ij)]),2.0f) )*dt.cB1[id-1];
         if( dt.fN[ij] > 0 ) dt.h[ij] = -dt.h[ij];
     }
 
     if( id <= dp.nI - 1 ) {
         ij = dt.idx(id,dp.nJ);
-        dt.h[ij] = sqrtf( powf(dt.fN[dt.dn(ij)],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.dn(ij)]),2.0f) )*dt.cB3[id];
+        dt.h[ij] = sqrtf( powf(dt.fN[dt.dn(ij)],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.dn(ij)]),2.0f) )*dt.cB3[id-1];
         if( dt.fN[dt.dn(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
     }
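Note on the hunks above: the boundary kernel works with a 1-based row/column index id (it appears to start at 2, matching the 1-based idx() addressing), while the cB1..cB4 coefficient vectors for the four open boundaries are ordinary 0-based device arrays. Indexing them with the raw id therefore picked up the neighbouring column's coefficient and, at the corner accesses cB1[dp.nI] and cB3[dp.nI], likely read one element past the allocation; shifting every access by one aligns the two conventions. A minimal sketch of the convention — edgeSketch, edgeCoef and nJ are illustrative stand-ins, not the project API:

    /* 1-based boundary index vs. 0-based coefficient storage */
    __global__ void edgeSketch( float *h, const float *edgeCoef, int nJ )
    {
        int col = blockIdx.x * blockDim.x + threadIdx.x + 2;  /* col = 2 .. nJ-1 */

        if( col <= nJ-1 )
            h[col] = edgeCoef[col-1];  /* edgeCoef[col] would be off by one */
    }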
diff --git a/cpu/branches/multi-gpu/ewGpuNode.cu b/cpu/branches/multi-gpu/ewGpuNode.cu
index 1ec22c88d51a87d117914a61bc4fe0274ca8d2be..5e6a5eefb001b590b71a4a025d9e7a605ac09365 100644
--- a/cpu/branches/multi-gpu/ewGpuNode.cu
+++ b/cpu/branches/multi-gpu/ewGpuNode.cu
@@ -11,11 +11,22 @@ CGpuNode::CGpuNode() {
 
     num_real_gpus = 2;
 
     vgpus = new VGpu[num_virtual_gpus];
+    gpus = new Gpu[num_real_gpus];
 
     cudaMallocHost( &extend, num_virtual_gpus * sizeof(int4) );
 
-    for( int i = 0; i < VGpu::NEVENTS; i++ ) {
-        dur[i] = 0.0;
+    for( int j = 0; j < num_real_gpus; j++ ) {
+
+        cudaSetDevice( j );
+
+        gpus[j].id = j;
+
+        for( int i = 0; i < gpus[j].NEVENTS; i++ ) {
+            cudaEventCreate( &(gpus[j].evtStart[i]) );
+            cudaEventCreate( &(gpus[j].evtEnd[i]) );
+            gpus[j].dur[i] = 0.0f;
+        }
     }
 
     for( int j = 0; j < num_virtual_gpus; j++ ) {
 
@@ -25,20 +36,17 @@ CGpuNode::CGpuNode() {
         vgpu.data.devID = j;
         vgpu.data.devNum = num_virtual_gpus;
 
-        vgpu.dev = j % num_real_gpus;
+        vgpu.dev = &(gpus[j % num_real_gpus]);
+
+        vgpu.dev->maxId = j / num_real_gpus;
+        vgpu.relId = j / num_real_gpus;
 
-        cudaSetDevice( vgpu.dev );
+        cudaSetDevice( vgpu.dev->id );
 
         for( int i = 0; i < vgpu.NSTREAMS; i++ ) {
             cudaStreamCreate( &(vgpu.stream[i]) );
         }
 
-        for( int i = 0; i < vgpu.NEVENTS; i++ ) {
-            cudaEventCreate( &(vgpu.evtStart[i]) );
-            cudaEventCreate( &(vgpu.evtEnd[i]) );
-            vgpu.dur[i] = 0.0;
-        }
-
         cudaEventCreate( &vgpu.evtSync );
 
     }
 
@@ -72,18 +80,28 @@ CGpuNode::~CGpuNode() {
 
         VGpu& vgpu = vgpus[j];
 
-        cudaSetDevice( vgpu.dev );
+        cudaSetDevice( vgpu.dev->id );
 
         for( int i = 0; i < vgpu.NSTREAMS; i++ ) {
             cudaStreamDestroy( vgpu.stream[i] );
         }
 
-        for( int i = 0; i < 5; i++ ) {
-            cudaEventDestroy( vgpu.evtStart[i] );
-            cudaEventDestroy( vgpu.evtEnd[i] );
+        cudaEventDestroy( vgpu.evtSync );
+
+    }
+
+    for( int j = 0; j < num_real_gpus; j++ ) {
+
+        cudaSetDevice( j );
+
+        for( int i = 0; i < gpus[j].NEVENTS; i++ ) {
+            cudaEventDestroy( gpus[j].evtStart[i] );
+            cudaEventDestroy( gpus[j].evtEnd[i] );
         }
 
+        cudaDeviceReset();
     }
 
+}
 
 int CGpuNode::mallocMem() {
 
@@ -111,7 +129,7 @@ int CGpuNode::mallocMem() {
 
         int ghost = vgpu.gb + vgpu.gt;
 
-        cudaSetDevice( vgpu.dev );
+        cudaSetDevice( vgpu.dev->id );
 
         /* arrays that need ghost zones must add 2 to vgpu.size */
         /* 2-dim */
 
@@ -171,7 +189,7 @@ int CGpuNode::copyToGPU() {
 
         int off = (vgpu.off - vgpu.gt - 1);
         int ghost = vgpu.gb + vgpu.gt;
 
-        cudaSetDevice( vgpu.dev );
+        cudaSetDevice( vgpu.dev->id );
 
         /* FIXME: should not work correctly */
         /* add offset to data.d to guarantee alignment: data.d + LPAD */
 
@@ -211,7 +229,7 @@ int CGpuNode::copyFromGPU() {
 
         int off = (vgpu.off - 1) * dp.nJ;
 
-        cudaSetDevice( vgpu.dev );
+        cudaSetDevice( vgpu.dev->id );
 
         CUDA_CALL( cudaMemcpy2D( hMax + off, dp.nJ * sizeof(float), data.hMax + (vgpu.gt)*data.params.pI + dp.lpad, pitch, dp.nJ * sizeof(float), vgpu.size, cudaMemcpyDeviceToHost ) );
         CUDA_CALL( cudaMemcpy2D( tArr + off, dp.nJ * sizeof(float), data.tArr + (vgpu.gt)*data.params.pI + dp.lpad, pitch, dp.nJ * sizeof(float), vgpu.size, cudaMemcpyDeviceToHost ) );
 
@@ -236,7 +254,7 @@ int CGpuNode::copyIntermediate() {
 
         int off = (vgpu.off - 1) * dp.nJ;
 
-        cudaSetDevice( vgpu.dev );
+        cudaSetDevice( vgpu.dev->id );
 
         CUDA_CALL( cudaMemcpy2D( h + off, dp.nJ * sizeof(float), data.h + (vgpu.gt) * data.params.pI + dp.lpad, pitch, dp.nJ * sizeof(float), vgpu.size, cudaMemcpyDeviceToHost ) );
 
@@ -273,7 +291,7 @@ int CGpuNode::copyPOIs() {
 
         int id = vgpu->data.idx( vgpu->getRel(i), j );
 
-        CUDA_CALL( cudaSetDevice( vgpu->dev ) )
+        CUDA_CALL( cudaSetDevice( vgpu->dev->id ) )
         CUDA_CALL( cudaMemcpy( h + idxPOI[n], vgpu->data.h + dp.lpad + id, sizeof(float), cudaMemcpyDeviceToHost ) );
     }
 
@@ -282,14 +300,12 @@
 
 int CGpuNode::freeMem() {
 
-    float max_dur = 0.0;
-
     for( int i = 0; i < num_virtual_gpus; i++ ) {
 
         VGpu& vgpu = vgpus[i];
         KernelData& data = vgpu.data;
 
-        cudaSetDevice( vgpu.dev );
+        cudaSetDevice( vgpu.dev->id );
 
         /* 2-dim */
         CUDA_CALL( cudaFree( data.d ) );
 
@@ -309,24 +325,21 @@ int CGpuNode::freeMem() {
 
         CUDA_CALL( cudaFree( data.cB3 ) );
         CUDA_CALL( cudaFree( data.cB4 ) );
 
-        float total_dur = 0.0;
-        for( int j = 0; j < 6; j++ ) {
-            total_dur += vgpus[i].dur[j];
-            dur[j] = max( dur[j], vgpus[i].dur[j] );
-            printf_v("GPU #%u, duration %u: %.3f\n", i, j, vgpus[i].dur[j]);
-        }
+        CUDA_CALL( cudaFree( data.extend ) );
+    }
 
-        max_dur = max( max_dur, total_dur );
+    cudaFreeHost( extend );
 
-        printf_v("GPU #%u, duration total: %.3f\n", i, total_dur);
-        printf_v("###############\n");
+    for( int i = 0; i < num_real_gpus; i++ ) {
 
-    }
+        float dur = 0.0f;
+        for( int j = 0; j < 7; j++ ) {
 
-    for( int j = 0; j < 5; j++ ) {
-        printf_v("Duration %u: %.3f\n", j, dur[j]);
+            printf_v("GPU #%u, duration %u: %.3f\n", i, j, gpus[i].dur[j]);
+            dur += gpus[i].dur[j];
+        }
+
+        printf_v("GPU #%u, duration total: %.3f\n", i, dur);
     }
 
-    printf_v("Duration total: %.3f\n", max_dur);
 
     CArrayNode::freeMem();
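Note on the ewGpuNode.cu hunks above: timing bookkeeping moves from the per-virtual-GPU VGpu objects into the new per-device Gpu records, created and destroyed once per physical device, and NEVENTS grows from 6 to 7 so that the second halo exchange (fN/fM) gets its own slot; freeMem() accordingly reports one duration block per real GPU instead of per virtual GPU. The guard on cudaEventElapsedTime() in run() matters because the call returns an error status, not cudaSuccess, when either event of the pair was never recorded — the normal case whenever Par.verbose is off. A reduced sketch of one event pair's lifecycle on a device dev (beg, end and total_ms are illustrative names):

    cudaSetDevice( dev );
    cudaEvent_t beg, end;
    cudaEventCreate( &beg );
    cudaEventCreate( &end );
    float total_ms = 0.0f;

    cudaEventRecord( beg );   /* first virtual GPU on dev enters the phase */
    /* ... work of every virtual GPU mapped to dev ... */
    cudaEventRecord( end );   /* last virtual GPU on dev leaves the phase */

    cudaDeviceSynchronize();
    float ms;
    if( cudaEventElapsedTime( &ms, beg, end ) == cudaSuccess )
        total_ms += ms;       /* silently skipped if nothing was recorded */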
@@ -356,14 +369,18 @@ int CGpuNode::run() {
 
         vgpu.threads = dim3( xThreads, yThreads );
         vgpu.blocks = dim3( ceil( (float)dp.nJ / (float)xThreads ), ceil( (float)data.params.nI / (float)yThreads ) );
 
-        if( ! isActive( vgpu ) )
-            continue;
+        CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
+
+        if( Par.verbose && vgpu.relId == 0 )
+            CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[0] ) );
+
+        if( isActive( vgpu ) ) {
 
-        CUDA_CALL( cudaSetDevice( vgpu.dev ) );
+            runWaveUpdateKernel<<<vgpu.blocks, vgpu.threads, 0, vgpu.stream[0]>>>( data );
+        }
 
-        CUDA_CALL( cudaEventRecord( vgpu.evtStart[0], vgpu.stream[0] ) );
-        runWaveUpdateKernel<<<vgpu.blocks, vgpu.threads, 0, vgpu.stream[0]>>>( data );
-        CUDA_CALL( cudaEventRecord( vgpu.evtEnd[0], vgpu.stream[0] ) );
+        if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
+            CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[0] ) );
     }
 
     for( int i = 0; i < num_virtual_gpus; i++ ) {
 
@@ -371,36 +388,45 @@ int CGpuNode::run() {
 
         VGpu& vgpu = vgpus[i];
         KernelData& data = vgpu.data;
 
-        if( ! isActive( vgpu ) )
-            continue;
+        CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
 
-        CUDA_CALL( cudaSetDevice( vgpu.dev ) );
+        if( Par.verbose && vgpu.relId == 0 )
+            CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[1] ) );
 
-        CUDA_CALL( cudaEventRecord( vgpu.evtStart[1], vgpu.stream[0] ) );
-        runWaveBoundaryKernel<<<vgpu.blocks, vgpu.threads, 0, vgpu.stream[0]>>>( data );
-        CUDA_CALL( cudaEventRecord( vgpu.evtEnd[1], vgpu.stream[0] ) );
+        if( isActive( vgpu ) ) {
+
+            runWaveBoundaryKernel<<<vgpu.blocks, vgpu.threads, 0, vgpu.stream[0]>>>( data );
+        }
+
+        if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
+            CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[1] ) );
     }
 
     for( int i = 0; i < num_virtual_gpus; i++ ) {
 
         VGpu& vgpu = vgpus[i];
 
-        if( ! isActive( vgpu ) )
-            continue;
+        CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
 
-        CUDA_CALL( cudaSetDevice( vgpu.dev ) );
+        if( Par.verbose && vgpu.relId == 0 )
+            CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[5] ) );
 
-        CUDA_CALL( cudaEventRecord( vgpu.evtStart[5], vgpu.stream[0] ) );
-        if( i < num_virtual_gpus - 1 ) {
-            int off = ( vgpu.getRel(vgpu.end) - 1 ) * vgpu.data.params.pI;
-            CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.h, vgpus[i+1].dev, vgpu.data.h + off, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
-        }
+        if( isActive( vgpu ) ) {
+
+            if( i < num_virtual_gpus - 1 ) {
+                int off = ( vgpu.getRel(vgpu.end) - 1 ) * vgpu.data.params.pI;
+                CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.h, vgpus[i+1].dev->id, vgpu.data.h + off, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
+            }
+
+            if( i > 0 ) {
+                int off = ( vgpus[i-1].getRel(vgpus[i-1].end) ) * vgpus[i-1].data.params.pI;
+                CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.h + off, vgpus[i-1].dev->id, vgpu.data.h + vgpu.data.params.pI, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
+            }
 
-        if( i > 0 ) {
-            int off = ( vgpus[i-1].getRel(vgpus[i-1].end) ) * vgpus[i-1].data.params.pI;
-            CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.h + off, vgpus[i-1].dev, vgpu.data.h + vgpu.data.params.pI, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
         }
-        CUDA_CALL( cudaEventRecord( vgpu.evtEnd[5], vgpu.stream[0] ) );
+
+        if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
+            CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[5] ) );
 
         cudaEventRecord( vgpu.evtSync, vgpu.stream[0] );
     }
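Note on the run() hunks above: each phase now selects the device unconditionally and, in verbose runs, brackets the work with the per-device events — the first virtual GPU on a device (relId == 0) records the start event, the last one (relId == dev->maxId) records the end, so one pair spans all virtual GPUs sharing that device, and the isActive() test only guards the work itself. The launch configuration <<<vgpu.blocks, vgpu.threads, 0, vgpu.stream[0]>>> shown here follows from the dim3 members computed at the top of run() and from the stream the old event records were tied to; treat the exact triple as assumed rather than verified. The peer copies push ghost rows between neighbouring slabs: the last owned row of slab i becomes the top ghost row of slab i+1, and its first owned row becomes the bottom ghost row of slab i-1. One direction in isolation (row is an illustrative local):

    size_t row = vgpu.data.params.pI * sizeof(float);  /* one padded grid row */
    int off = ( vgpu.getRel(vgpu.end) - 1 ) * vgpu.data.params.pI;

    /* my last owned row -> ghost row 0 of the next slab, possibly on another device */
    cudaMemcpyPeerAsync( vgpus[i+1].data.h, vgpus[i+1].dev->id,
                         vgpu.data.h + off, vgpu.dev->id,
                         row, vgpu.stream[0] );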
@@ -412,7 +438,7 @@ int CGpuNode::run() {
 
         if( ! isActive(vgpu) )
             continue;
 
-        CUDA_CALL( cudaSetDevice( vgpu.dev ) );
+        CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
 
         if( i < num_virtual_gpus - 1 )
             cudaStreamWaitEvent( vgpu.stream[0], vgpus[i+1].evtSync, 0 );
 
@@ -426,14 +452,18 @@ int CGpuNode::run() {
 
         VGpu& vgpu = vgpus[i];
         KernelData& data = vgpu.data;
 
-        if( ! isActive( vgpu ) )
-            continue;
+        CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
+
+        if( Par.verbose && vgpu.relId == 0 )
+            CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[2] ) );
 
-        CUDA_CALL( cudaSetDevice( vgpu.dev ) );
+        if( isActive( vgpu ) ) {
 
-        CUDA_CALL( cudaEventRecord( vgpu.evtStart[2], vgpu.stream[0] ) );
-        runFluxUpdateKernel<<<vgpu.blocks, vgpu.threads, 0, vgpu.stream[0]>>>( data );
-        CUDA_CALL( cudaEventRecord( vgpu.evtEnd[2], vgpu.stream[0] ) );
+            runFluxUpdateKernel<<<vgpu.blocks, vgpu.threads, 0, vgpu.stream[0]>>>( data );
+        }
+
+        if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
+            CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[2] ) );
     }
 
     for( int i = 0; i < num_virtual_gpus; i++ ) {
 
@@ -441,37 +471,47 @@ int CGpuNode::run() {
 
         VGpu& vgpu = vgpus[i];
         KernelData& data = vgpu.data;
 
-        if( ! isActive( vgpu ) )
-            continue;
+        CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
+
+        if( Par.verbose && vgpu.relId == 0 )
+            CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[3] ) );
 
-        CUDA_CALL( cudaSetDevice( vgpu.dev ) );
+        if( isActive( vgpu ) ) {
 
-        CUDA_CALL( cudaEventRecord( vgpu.evtStart[3], vgpu.stream[0] ) );
-        runFluxBoundaryKernel<<<vgpu.blocks, vgpu.threads, 0, vgpu.stream[0]>>>( data );
-        CUDA_CALL( cudaEventRecord( vgpu.evtEnd[3], vgpu.stream[0] ) );
+            runFluxBoundaryKernel<<<vgpu.blocks, vgpu.threads, 0, vgpu.stream[0]>>>( data );
+        }
+
+        if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
+            CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[3] ) );
     }
 
     for( int i = 0; i < num_virtual_gpus; i++ ) {
 
         VGpu& vgpu = vgpus[i];
 
-        if( ! isActive( vgpu ) )
-            continue;
+        CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
 
-        CUDA_CALL( cudaSetDevice( vgpu.dev ) );
+        if( Par.verbose && vgpu.relId == 0 )
+            CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[6] ) );
 
-        if( i < num_virtual_gpus - 1 ) {
-            int off = ( vgpu.getRel(vgpu.end) - 1 ) * vgpu.data.params.pI;
-            CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.fN, vgpus[i+1].dev, vgpu.data.fN + off, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
-            CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.fM, vgpus[i+1].dev, vgpu.data.fM + off, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
-        }
+        if( isActive( vgpu ) ) {
+
+            if( i < num_virtual_gpus - 1 ) {
+                int off = ( vgpu.getRel(vgpu.end) - 1 ) * vgpu.data.params.pI;
+                CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.fN, vgpus[i+1].dev->id, vgpu.data.fN + off, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
+                CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.fM, vgpus[i+1].dev->id, vgpu.data.fM + off, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0]) );
+            }
+
+            if( i > 0 ) {
+                int off = ( vgpus[i-1].getRel(vgpus[i-1].end) ) * vgpus[i-1].data.params.pI;
+                CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.fN + off, vgpus[i-1].dev->id, vgpu.data.fN + vgpu.data.params.pI, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
                CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.fM + off, vgpus[i-1].dev->id, vgpu.data.fM + vgpu.data.params.pI, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
+            }
 
-        if( i > 0 ) {
-            int off = ( vgpus[i-1].getRel(vgpus[i-1].end) ) * vgpus[i-1].data.params.pI;
-            CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.fN + off, vgpus[i-1].dev, vgpu.data.fN + vgpu.data.params.pI, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
-            CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.fM + off, vgpus[i-1].dev, vgpu.data.fM + vgpu.data.params.pI, vgpu.dev, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
         }
 
+        if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
+            CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[6] ) );
    }
 
     for( int i = 0; i < num_virtual_gpus; i++ ) {
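Note on the synchronisation and flux hunks above: the evtSync event recorded at the end of each slab's exchange is what orders the devices — before the flux update, slab i's stream waits on the neighbours' evtSync, so the ghost rows are guaranteed to have arrived without a full device synchronisation. This wait loop still skips inactive virtual GPUs with continue, whereas the compute phases moved the isActive() test inside the body so that device selection and the verbose event records run for every virtual GPU. The producer/consumer pair in isolation:

    /* producer: slab i+1 queues its peer copies, then marks them done */
    cudaEventRecord( vgpus[i+1].evtSync, vgpus[i+1].stream[0] );

    /* consumer: slab i's stream stalls until the neighbour's copies finished */
    cudaStreamWaitEvent( vgpu.stream[0], vgpus[i+1].evtSync, 0 );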
@@ -479,15 +519,20 @@ int CGpuNode::run() {
 
         VGpu& vgpu = vgpus[i];
         KernelData& data = vgpu.data;
 
-        if( ! isActive( vgpu ) )
-            continue;
+        CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
+
+        if( Par.verbose && vgpu.relId == 0 )
+            CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[4] ) );
 
-        CUDA_CALL( cudaSetDevice( vgpu.dev ) );
+        if( isActive( vgpu ) ) {
+
+            runGridExtendKernel<<<vgpu.blocks, vgpu.threads, 0, vgpu.stream[0]>>>( data );
+            CUDA_CALL( cudaMemcpyAsync( &(extend[i]), data.extend, sizeof(int4), cudaMemcpyDeviceToHost, vgpu.stream[0]) );
+        }
+
+        if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
+            CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[4] ) );
 
-        CUDA_CALL( cudaEventRecord( vgpu.evtStart[4], vgpu.stream[0] ) );
-        runGridExtendKernel<<<vgpu.blocks, vgpu.threads, 0, vgpu.stream[0]>>>( data );
-        CUDA_CALL( cudaMemcpyAsync( &(extend[i]), data.extend, sizeof(int4), cudaMemcpyDeviceToHost, vgpu.stream[0]) );
-        CUDA_CALL( cudaEventRecord( vgpu.evtEnd[4], vgpu.stream[0] ) );
     }
 
     for( int i = 0; i < num_virtual_gpus; i++ ) {
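Note on the grid-extend hunk above: runGridExtendKernel leaves per-slab extension flags in data.extend on the device, and each slab copies its int4 back asynchronously into extend[i] on the host, where the flags are summed into glb_MinMax after the per-device synchronisation. The host array was allocated with cudaMallocHost() in the constructor; that page-locking is what makes the cudaMemcpyAsync() into it a genuinely asynchronous device-to-host transfer. In isolation:

    int4 *extend;
    cudaMallocHost( &extend, num_virtual_gpus * sizeof(int4) );  /* page-locked */

    /* per slab: small async D2H copy rides on the same stream as the kernel */
    cudaMemcpyAsync( &extend[i], data.extend, sizeof(int4),
                     cudaMemcpyDeviceToHost, vgpu.stream[0] );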
@@ -495,7 +540,7 @@ int CGpuNode::run() {
 
         VGpu& vgpu = vgpus[i];
         KernelData& data = vgpu.data;
 
-        cudaSetDevice( vgpu.dev );
+        cudaSetDevice( vgpu.dev->id );
         cudaDeviceSynchronize();
 
         CUDA_CALL( cudaMemset( data.extend, 0, sizeof(int4) ) );
 
@@ -507,11 +552,18 @@ int CGpuNode::run() {
 
         glb_MinMax.z += extend[i].z;
         glb_MinMax.w += extend[i].w;
 
+    }
 
-        float dur;
-        for( int j = 0; j < 6; j++ ) {
-            cudaEventElapsedTime( &dur, vgpus[i].evtStart[j], vgpus[i].evtEnd[j]);
-            vgpus[i].dur[j] += dur;
+    if( Par.verbose ) {
+        for( int i = 0; i < num_real_gpus; i++ ) {
+
+            cudaSetDevice( i );
+
+            float dur;
+            for( int j = 0; j < 7; j++ ) {
+                if( cudaEventElapsedTime( &dur, gpus[i].evtStart[j], gpus[i].evtEnd[j]) == cudaSuccess )
+                    gpus[i].dur[j] += dur;
+            }
         }
     }
 
diff --git a/cpu/branches/multi-gpu/ewGpuNode.cuh b/cpu/branches/multi-gpu/ewGpuNode.cuh
index 21e29442e5eb1b66cae6c96cbe3ad27b740c4e1c..93837c2fbc41db47e625241bd6b09891d5616afb 100644
--- a/cpu/branches/multi-gpu/ewGpuNode.cuh
+++ b/cpu/branches/multi-gpu/ewGpuNode.cuh
@@ -63,6 +63,20 @@ public:
     __host__ __device__ int idx( int i, int j ) { return (j-1) + (i-1) * params.pI + params.lpad; }
 };
 
+class Gpu {
+
+public:
+    int id;
+
+    int maxId;
+
+    static const short NEVENTS = 7;
+    cudaEvent_t evtStart[NEVENTS];
+    cudaEvent_t evtEnd[NEVENTS];
+    float dur[NEVENTS];
+
+};
+
 class VGpu {
 
 public:
 
@@ -76,14 +90,10 @@ public:
 
     static const short NSTREAMS = 2;
     cudaStream_t stream[NSTREAMS];
 
-    static const short NEVENTS = 6;
-    cudaEvent_t evtStart[NEVENTS];
-    cudaEvent_t evtEnd[NEVENTS];
-    float dur[NEVENTS];
-
     cudaEvent_t evtSync;
 
-    int dev;
+    Gpu *dev;
+    int relId;
 
     int nBlocks;
     dim3 threads;
 
@@ -100,6 +110,8 @@ protected:
 
     VGpu *vgpus;
     Params params;
 
+    Gpu *gpus;
+
     int4 *extend;
 
     /* line size in bytes */
 
@@ -112,8 +124,6 @@ protected:
 
     int num_virtual_gpus;
     int num_real_gpus;
 
-    float dur[VGpu::NEVENTS];
-
 public:
     CGpuNode();
     ~CGpuNode();
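Note on the header hunks above: VGpu::dev changes from a bare device ordinal to a pointer into the new gpus array, relId is the rank of a virtual GPU among those mapped to the same physical device, and Gpu::maxId ends up holding the highest such rank because the constructor rewrites it for every virtual GPU in mapping order (last write wins). Together with the round-robin mapping this is exactly what the relId == 0 / relId == dev->maxId tests in run() rely on:

    vgpu.dev        = &(gpus[j % num_real_gpus]);  /* physical device record */
    vgpu.relId      = j / num_real_gpus;           /* rank of vGPU j on that device */
    vgpu.dev->maxId = j / num_real_gpus;           /* highest rank survives the loop */

    /* relId == 0: first vGPU per device; relId == dev->maxId: the last one */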
diff --git a/cpu/trunk/ewCudaKernels.cu b/cpu/trunk/ewCudaKernels.cu
index 0c2c280db2e52a4565605214a88bad9ae389db84..c6d4c3e412aea0138b3b4ee63724359c1c11c6b8 100644
--- a/cpu/trunk/ewCudaKernels.cu
+++ b/cpu/trunk/ewCudaKernels.cu
@@ -68,43 +68,43 @@ __global__ void runWaveBoundaryKernel( KernelData data ) {
 
     if( id <= dp.nI-1 ) {
         ij = dt.idx(id,1);
-        dt.h[ij] = sqrtf( powf(dt.fN[ij],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.le(ij)]),2.0f) )*dt.cB1[id];
+        dt.h[ij] = sqrtf( powf(dt.fN[ij],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.le(ij)]),2.0f) )*dt.cB1[id-1];
         if( dt.fN[ij] > 0 ) dt.h[ij] = -dt.h[ij];
     }
 
     if( id <= dp.nI-1 ) {
         ij = dt.idx(id,dp.nJ);
-        dt.h[ij] = sqrtf( powf(dt.fN[dt.dn(ij)],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.dn(ij)]),2.0f) )*dt.cB3[id];
+        dt.h[ij] = sqrtf( powf(dt.fN[dt.dn(ij)],2.0f) + 0.25f*powf((dt.fM[ij] + dt.fM[dt.dn(ij)]),2.0f) )*dt.cB3[id-1];
         if( dt.fN[dt.dn(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
     }
 
     if( id <= dp.nJ-1 ) {
         ij = dt.idx(1,id);
-        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB2[id];
+        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB2[id-1];
         if( dt.fM[ij] > 0 ) dt.h[ij] = -dt.h[ij];
     }
 
     if( id <= dp.nJ-1 ) {
         ij = dt.idx(dp.nI,id);
-        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB4[id];
+        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + 0.25f*powf((dt.fN[ij] + dt.fN[dt.dn(ij)]),2.0f) )*dt.cB4[id-1];
         if( dt.fM[dt.le(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
     }
 
     if( id == 2 ) {
 
         ij = dt.idx(1,1);
-        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[1];
+        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[0];
         if( dt.fN[ij] > 0 ) dt.h[ij] = -dt.h[ij];
 
         ij = dt.idx(dp.nI,1);
-        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[dp.nI];
+        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[ij],2.0f) )*dt.cB1[dp.nI-1];
         if( dt.fN[ij] > 0 ) dt.h[ij] = -dt.h[ij];
 
         ij = dt.idx(1,dp.nJ);
-        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[1];
+        dt.h[ij] = sqrtf( powf(dt.fM[ij],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[0];
         if( dt.fN[dt.dn(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
 
         ij = dt.idx(dp.nI,dp.nJ);
-        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[dp.nI];
+        dt.h[ij] = sqrtf( powf(dt.fM[dt.le(ij)],2.0f) + powf(dt.fN[dt.dn(ij)],2.0f) )*dt.cB3[dp.nI-1];
         if( dt.fN[dt.dn(ij)] < 0 ) dt.h[ij] = -dt.h[ij];
     }
 }
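Note on the trunk hunk above: the single-GPU trunk receives the same 0-based shift for the cB* accesses as the multi-GPU branch. To harden against this class of off-by-one in the future, a bounds-asserting accessor is one option — cbAt below is a hypothetical helper, not part of the patch (device-side assert() requires sm_20 or newer):

    #include <assert.h>

    __device__ inline float cbAt( const float *cB, int i1, int n )
    {
        assert( i1 >= 1 && i1 <= n );  /* i1 is the kernel's 1-based index */
        return cB[i1-1];               /* 0-based coefficient slot */
    }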