/*
 * EasyWave - A realtime tsunami simulation program with GPU support.
 * Copyright (C) 2014 Andrey Babeyko, Johannes Spazier
 * GFZ German Research Centre for Geosciences (http://www.gfz-potsdam.de)
 *
 * Parts of this program (especially the GPU extension) were developed
 * within the context of the following publicly funded project:
 * - TRIDEC, EU 7th Framework Programme, Grant Agreement 258723
 *   (http://www.tridec-online.eu)
 *
 * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
 * the European Commission - subsequent versions of the EUPL (the "Licence"),
 * complemented with the following provision: For the scientific transparency
 * and verification of results obtained and communicated to the public after
 * using a modified version of the work, You (as the recipient of the source
 * code and author of this modified version, used to produce the published
 * results in scientific communications) commit to make this modified source
 * code available in a repository that is easily and freely accessible for a
 * duration of five years after the communication of the obtained results.
 *
 * You may not use this work except in compliance with the Licence.
 *
 * You may obtain a copy of the Licence at:
 * https://joinup.ec.europa.eu/software/page/eupl
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the Licence is distributed on an "AS IS" basis,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Licence for the specific language governing permissions and
 * limitations under the Licence.
 */

#include "ewGpuNode.cuh"
#include "ewCudaKernels.cuh"

CGpuNode::CGpuNode() {

	pitch = 0;
	copied = true;

	/* TODO: make dynamic */
	num_virtual_gpus = 4;
	num_real_gpus = 2;

	vgpus = new VGpu[num_virtual_gpus];
	gpus = new Gpu[num_real_gpus];

	cudaMallocHost( &extend, num_virtual_gpus * sizeof(int4) );

	for( int j = 0; j < num_real_gpus; j++ ) {

		cudaSetDevice( j );
		cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 );

		gpus[j].id = j;

		for( int i = 0; i < gpus[j].NEVENTS; i++ ) {
			cudaEventCreate( &(gpus[j].evtStart[i]) );
			cudaEventCreate( &(gpus[j].evtEnd[i]) );
			gpus[j].dur[i] = 0.0f;
		}
	}

	for( int j = 0; j < num_virtual_gpus; j++ ) {

		VGpu& vgpu = vgpus[j];

		vgpu.data.devID = j;
		vgpu.data.devNum = num_virtual_gpus;

		vgpu.dev = &(gpus[j % num_real_gpus]);
		vgpu.dev->maxId = j / num_real_gpus;
		vgpu.relId = j / num_real_gpus;

		cudaSetDevice( vgpu.dev->id );

		for( int i = 0; i < vgpu.NSTREAMS; i++ ) {
			cudaStreamCreate( &(vgpu.stream[i]) );
		}

		cudaEventCreate( &vgpu.evtSync );
	}

	for( int j = 0; j < num_real_gpus - 1; j++ ) {

		int peerAccess = 0;

		cudaDeviceCanAccessPeer( &peerAccess, j, j + 1 );
		printf_v("GPU #%u can access GPU #%u: %u\n", j, j + 1, peerAccess);
		if( peerAccess ) {
			cudaSetDevice( j );
			cudaDeviceEnablePeerAccess( j + 1, 0 );
		}

		cudaDeviceCanAccessPeer( &peerAccess, j + 1, j );
		printf_v("GPU #%u can access GPU #%u: %u\n", j + 1, j, peerAccess);
		if( peerAccess ) {
			cudaSetDevice( j + 1 );
			cudaDeviceEnablePeerAccess( j, 0 );
		}
	}

	memset( extend, 0, num_virtual_gpus * sizeof(int4) );
}

CGpuNode::~CGpuNode() {

	for( int j = 0; j < num_virtual_gpus; j++ ) {

		VGpu& vgpu = vgpus[j];

		cudaSetDevice( vgpu.dev->id );

		for( int i = 0; i < vgpu.NSTREAMS; i++ ) {
			cudaStreamDestroy( vgpu.stream[i] );
		}

		cudaEventDestroy( vgpu.evtSync );
	}

	for( int j = 0; j < num_real_gpus; j++ ) {

		cudaSetDevice( j );

		for( int i = 0; i < gpus[j].NEVENTS; i++ ) {
			cudaEventDestroy( gpus[j].evtStart[i] );
			cudaEventDestroy( gpus[j].evtEnd[i] );
		}

		cudaDeviceReset();
	}
}
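/*
 * mallocMem() allocates the device-side buffers of every virtual GPU.
 * Each 2D array (d, h, hMax, fM, fN, cR1/cR2/cR4, tArr) is allocated with
 * cudaMallocPitch() and holds vgpu.size rows plus gt + gb ghost rows; the
 * resulting pitch, in floats, is stored in data.params.pI. The 1D arrays
 * cB1/cB3 are sized per subdomain (rows), cR6/cB2/cB4 per full column
 * range (nJ).
 */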
int CGpuNode::mallocMem() {

	CArrayNode::mallocMem();

	Params& dp = params;

	/* fill in some fields here */
	dp.nI = NLon;
	dp.nJ = NLat;
	dp.sshArrivalThreshold = Par.sshArrivalThreshold;
	dp.sshClipThreshold = Par.sshClipThreshold;
	dp.sshZeroThreshold = Par.sshZeroThreshold;
	dp.lpad = 0;

	size_t nJ_aligned = dp.nJ + dp.lpad;

	init_vgpus();

	for( int i = 0; i < num_virtual_gpus; i++ ) {

		VGpu& vgpu = vgpus[i];
		KernelData& data = vgpu.data;

		int ghost = vgpu.gb + vgpu.gt;

		cudaSetDevice( vgpu.dev->id );

		/* arrays that need ghost zones must add 2 to vgpu.size */
		/* 2-dim */
		CUDA_CALL( cudaMallocPitch( &(data.d), &pitch, nJ_aligned * sizeof(float), vgpu.size + ghost ) );
		CUDA_CALL( cudaMallocPitch( &(data.fM), &pitch, nJ_aligned * sizeof(float), vgpu.size + ghost ) );
		CUDA_CALL( cudaMallocPitch( &(data.fN), &pitch, nJ_aligned * sizeof(float), vgpu.size + ghost ) );
		CUDA_CALL( cudaMallocPitch( &(data.cR1), &pitch, nJ_aligned * sizeof(float), vgpu.size + ghost ) );
		CUDA_CALL( cudaMallocPitch( &(data.cR2), &pitch, nJ_aligned * sizeof(float), vgpu.size + ghost ) );
		CUDA_CALL( cudaMallocPitch( &(data.cR4), &pitch, nJ_aligned * sizeof(float), vgpu.size + ghost ) );
		CUDA_CALL( cudaMallocPitch( &(data.tArr), &pitch, nJ_aligned * sizeof(float), vgpu.size + ghost ) );
		/* TODO: cR3, cR5 for coriolis */
		CUDA_CALL( cudaMallocPitch( &(data.h), &pitch, nJ_aligned * sizeof(float), vgpu.size + ghost ) );
		CUDA_CALL( cudaMallocPitch( &(data.hMax), &pitch, nJ_aligned * sizeof(float), vgpu.size + ghost ) );

		/* 1-dim */
		CUDA_CALL( cudaMalloc( &(data.cR6), dp.nJ * sizeof(float) ) );
		CUDA_CALL( cudaMalloc( &(data.cB1), (vgpu.size + ghost) * sizeof(float) ) );
		CUDA_CALL( cudaMalloc( &(data.cB2), dp.nJ * sizeof(float) ) );
		CUDA_CALL( cudaMalloc( &(data.cB3), (vgpu.size + ghost) * sizeof(float) ) );
		CUDA_CALL( cudaMalloc( &(data.cB4), dp.nJ * sizeof(float) ) );

		/* data field to store return values like calculated iMin and iMax values */
		CUDA_CALL( cudaMalloc( &(data.extend), sizeof(int4) ) );

		/* assign some parameters */
		data.params.nI = vgpu.getRel( vgpu.end );
		data.params.nJ = dp.nJ;
		data.params.sshArrivalThreshold = dp.sshArrivalThreshold;
		data.params.sshClipThreshold = dp.sshClipThreshold;
		data.params.sshZeroThreshold = dp.sshZeroThreshold;
		data.params.lpad = dp.lpad;

		/* TODO: make sure that pitch is a multiple of 4 and the same for each cudaMallocPitch() call */
		data.params.pI = pitch / sizeof(float);
	}

	return 0;
}
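/*
 * copyToGPU() transfers each virtual GPU's subdomain from the host grid to
 * device memory. The source row offset starts one ghost row above the
 * subdomain (vgpu.off - vgpu.gt - 1) and vgpu.size + ghost rows are copied,
 * so the strips handed to neighbouring virtual GPUs overlap by the ghost
 * rows they share.
 */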
int CGpuNode::copyToGPU() {

	Params& dp = params;

	/* fill in further fields here */
	dp.iMin = Imin;
	dp.iMax = Imax;
	dp.jMin = Jmin;
	dp.jMax = Jmax;

	for( int i = 0; i < num_virtual_gpus; i++ ) {

		VGpu& vgpu = vgpus[i];
		KernelData& data = vgpu.data;

		/* special treatment because of needed ghost zones */
		int off = (vgpu.off - vgpu.gt - 1);
		int ghost = vgpu.gb + vgpu.gt;

		cudaSetDevice( vgpu.dev->id );

		/* FIXME: should not work correctly */
		/* add offset to data.d to guarantee alignment: data.d + LPAD */
		/* 2-dim */
		CUDA_CALL( cudaMemcpy2D( data.d + dp.lpad, pitch, d + off * dp.nJ, dp.nJ * sizeof(float), dp.nJ * sizeof(float), vgpu.size + ghost, cudaMemcpyHostToDevice ) );
		CUDA_CALL( cudaMemcpy2D( data.fM + dp.lpad, pitch, fM + off * dp.nJ, dp.nJ * sizeof(float), dp.nJ * sizeof(float), vgpu.size + ghost, cudaMemcpyHostToDevice ) );
		CUDA_CALL( cudaMemcpy2D( data.fN + dp.lpad, pitch, fN + off * dp.nJ, dp.nJ * sizeof(float), dp.nJ * sizeof(float), vgpu.size + ghost, cudaMemcpyHostToDevice ) );
		CUDA_CALL( cudaMemcpy2D( data.cR1 + dp.lpad, pitch, cR1 + off * dp.nJ, dp.nJ * sizeof(float), dp.nJ * sizeof(float), vgpu.size + ghost, cudaMemcpyHostToDevice ) );
		CUDA_CALL( cudaMemcpy2D( data.cR2 + dp.lpad, pitch, cR2 + off * dp.nJ, dp.nJ * sizeof(float), dp.nJ * sizeof(float), vgpu.size + ghost, cudaMemcpyHostToDevice ) );
		CUDA_CALL( cudaMemcpy2D( data.cR4 + dp.lpad, pitch, cR4 + off * dp.nJ, dp.nJ * sizeof(float), dp.nJ * sizeof(float), vgpu.size + ghost, cudaMemcpyHostToDevice ) );
		CUDA_CALL( cudaMemcpy2D( data.tArr + dp.lpad, pitch, tArr + off * dp.nJ, dp.nJ * sizeof(float), dp.nJ * sizeof(float), vgpu.size + ghost, cudaMemcpyHostToDevice ) );
		CUDA_CALL( cudaMemcpy2D( data.h + dp.lpad, pitch, h + off * dp.nJ, dp.nJ * sizeof(float), dp.nJ * sizeof(float), vgpu.size + ghost, cudaMemcpyHostToDevice ) );
		CUDA_CALL( cudaMemcpy2D( data.hMax + dp.lpad, pitch, hMax + off * dp.nJ, dp.nJ * sizeof(float), dp.nJ * sizeof(float), vgpu.size + ghost, cudaMemcpyHostToDevice ) );

		/* FIXME: move global variables into data structure */
		/* 1-dim */
		CUDA_CALL( cudaMemcpy( data.cR6, R6, dp.nJ * sizeof(float), cudaMemcpyHostToDevice ) );
		CUDA_CALL( cudaMemcpy( data.cB1, C1 + off, (vgpu.size + ghost) * sizeof(float), cudaMemcpyHostToDevice ) );
		CUDA_CALL( cudaMemcpy( data.cB2, C2, dp.nJ * sizeof(float), cudaMemcpyHostToDevice ) );
		CUDA_CALL( cudaMemcpy( data.cB3, C3 + off, (vgpu.size + ghost) * sizeof(float), cudaMemcpyHostToDevice ) );
		CUDA_CALL( cudaMemcpy( data.cB4, C4, dp.nJ * sizeof(float), cudaMemcpyHostToDevice ) );

		data.params.jMin = dp.jMin;
		data.params.jMax = dp.jMax;
	}

	return 0;
}

int CGpuNode::copyFromGPU() {

	Params& dp = params;

	for( int i = 0; i < num_virtual_gpus; i++ ) {

		VGpu& vgpu = vgpus[i];
		KernelData& data = vgpu.data;

		int off = (vgpu.off - 1) * dp.nJ;

		cudaSetDevice( vgpu.dev->id );

		CUDA_CALL( cudaMemcpy2D( hMax + off, dp.nJ * sizeof(float), data.hMax + vgpu.gt * data.params.pI + dp.lpad, pitch, dp.nJ * sizeof(float), vgpu.size, cudaMemcpyDeviceToHost ) );
		CUDA_CALL( cudaMemcpy2D( tArr + off, dp.nJ * sizeof(float), data.tArr + vgpu.gt * data.params.pI + dp.lpad, pitch, dp.nJ * sizeof(float), vgpu.size, cudaMemcpyDeviceToHost ) );
	}

	return 0;
}

int CGpuNode::copyIntermediate() {

	/* ignore copy requests if data is already present on the CPU side */
	if( copied )
		return 0;

	Params& dp = params;

	for( int i = 0; i < num_virtual_gpus; i++ ) {

		VGpu& vgpu = vgpus[i];
		KernelData& data = vgpu.data;

		int off = (vgpu.off - 1) * dp.nJ;

		cudaSetDevice( vgpu.dev->id );

		CUDA_CALL( cudaMemcpy2D( h + off, dp.nJ * sizeof(float), data.h + vgpu.gt * data.params.pI + dp.lpad, pitch, dp.nJ * sizeof(float), vgpu.size, cudaMemcpyDeviceToHost ) );
	}

	/* copy finished */
	copied = true;

	return 0;
}

int CGpuNode::copyPOIs() {

	Params& dp = params;

	if( copied )
		return 0;

	VGpu *vgpu = NULL;

	for( int n = 0; n < NPOIs; n++ ) {

		int i = idxPOI[n] / dp.nJ + 1;
		int j = idxPOI[n] % dp.nJ + 1;

		/* find the virtual GPU that holds row i */
		for( int id = 0; id < num_virtual_gpus; id++ ) {
			if( vgpus[id].hasLine( i ) ) {
				vgpu = &(vgpus[id]);
				break;
			}
		}

		int id = vgpu->data.idx( vgpu->getRel(i), j );

		CUDA_CALL( cudaSetDevice( vgpu->dev->id ) );
		CUDA_CALL( cudaMemcpy( h + idxPOI[n], vgpu->data.h + dp.lpad + id, sizeof(float), cudaMemcpyDeviceToHost ) );
	}

	return 0;
}
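/*
 * freeMem() releases all per-VGpu device buffers and the pinned host buffer
 * and prints the kernel timings that were accumulated per real GPU in run().
 */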
int CGpuNode::freeMem() {

	for( int i = 0; i < num_virtual_gpus; i++ ) {

		VGpu& vgpu = vgpus[i];
		KernelData& data = vgpu.data;

		cudaSetDevice( vgpu.dev->id );

		/* 2-dim */
		CUDA_CALL( cudaFree( data.d ) );
		CUDA_CALL( cudaFree( data.h ) );
		CUDA_CALL( cudaFree( data.hMax ) );
		CUDA_CALL( cudaFree( data.fM ) );
		CUDA_CALL( cudaFree( data.fN ) );
		CUDA_CALL( cudaFree( data.cR1 ) );
		CUDA_CALL( cudaFree( data.cR2 ) );
		CUDA_CALL( cudaFree( data.cR4 ) );
		CUDA_CALL( cudaFree( data.tArr ) );

		/* 1-dim */
		CUDA_CALL( cudaFree( data.cR6 ) );
		CUDA_CALL( cudaFree( data.cB1 ) );
		CUDA_CALL( cudaFree( data.cB2 ) );
		CUDA_CALL( cudaFree( data.cB3 ) );
		CUDA_CALL( cudaFree( data.cB4 ) );

		CUDA_CALL( cudaFree( data.extend ) );
	}

	cudaFreeHost( extend );

	for( int i = 0; i < num_real_gpus; i++ ) {

		float dur = 0.0f;

		for( int j = 0; j < 7; j++ ) {
			printf_v("GPU #%u, duration %u: %.3f\n", i, j, gpus[i].dur[j]);
			dur += gpus[i].dur[j];
		}

		printf_v("GPU #%u, duration total: %.3f\n", i, dur);
	}

	CArrayNode::freeMem();

	return 0;
}
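/*
 * run() advances the simulation by one time step across all virtual GPUs:
 *   1. wave update + boundary kernels on each active subdomain,
 *   2. asynchronous peer-to-peer exchange of the h ghost rows,
 *   3. cross-stream synchronisation via events,
 *   4. flux update + boundary kernels,
 *   5. peer-to-peer exchange of the fM/fN ghost rows,
 *   6. grid-extension detection, whose per-VGpu int4 results are reduced
 *      into glb_MinMax to enlarge the active region (iMin/iMax/jMin/jMax).
 */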
int CGpuNode::run() {

	Params& dp = params;

	int nThreads = 256;
	int xThreads = 32;
	int yThreads = nThreads / xThreads;

	int4 glb_MinMax = { 0, 0, 0, 0 };

	dp.mTime = Par.time;

	/* wave update and boundary handling */
	for( int i = 0; i < num_virtual_gpus; i++ ) {

		VGpu& vgpu = vgpus[i];
		KernelData& data = vgpu.data;

		updateParams( vgpu );

		vgpu.nBlocks = ceil( (float)max(data.params.nI, dp.nJ) / (float)nThreads );
		vgpu.threads = dim3( xThreads, yThreads );
		vgpu.blocks = dim3( ceil( (float)dp.nJ / (float)xThreads ), ceil( (float)data.params.nI / (float)yThreads ) );

		CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );

		if( Par.verbose && vgpu.relId == 0 )
			CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[0] ) );

		if( isActive( vgpu ) ) {
			/* 2D grid over the local subdomain for the interior update, 1D grid for the boundaries */
			runWaveUpdateKernel<<<vgpu.blocks, vgpu.threads, 0, vgpu.stream[0]>>>( data );
			runWaveBoundaryKernel<<<vgpu.nBlocks, nThreads, 0, vgpu.stream[0]>>>( data );
		}

		if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
			CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[0] ) );
	}

	/* exchange ghost rows of h between neighbouring virtual GPUs */
	for( int i = 0; i < num_virtual_gpus; i++ ) {

		VGpu& vgpu = vgpus[i];

		CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );

		if( Par.verbose && vgpu.relId == 0 )
			CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[5] ) );

		if( isActive( vgpu ) ) {

			if( i < num_virtual_gpus - 1 ) {
				int off = ( vgpu.getRel(vgpu.end) - 1 ) * vgpu.data.params.pI;
				CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.h, vgpus[i+1].dev->id, vgpu.data.h + off, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
			}

			if( i > 0 ) {
				int off = ( vgpus[i-1].getRel(vgpus[i-1].end) ) * vgpus[i-1].data.params.pI;
				CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.h + off, vgpus[i-1].dev->id, vgpu.data.h + vgpu.data.params.pI, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
			}
		}

		if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
			CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[5] ) );

		cudaEventRecord( vgpu.evtSync, vgpu.stream[0] );
	}

	/* wait until the ghost-row exchanges of the neighbours have completed */
	for( int i = 0; i < num_virtual_gpus; i++ ) {

		VGpu& vgpu = vgpus[i];

		if( ! isActive(vgpu) )
			continue;

		CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );

		if( i < num_virtual_gpus - 1 )
			cudaStreamWaitEvent( vgpu.stream[0], vgpus[i+1].evtSync, 0 );

		if( i > 0 )
			cudaStreamWaitEvent( vgpu.stream[0], vgpus[i-1].evtSync, 0 );
	}

	/* flux update and boundary handling */
	for( int i = 0; i < num_virtual_gpus; i++ ) {

		VGpu& vgpu = vgpus[i];
		KernelData& data = vgpu.data;

		CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );

		if( Par.verbose && vgpu.relId == 0 )
			CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[2] ) );

		if( isActive( vgpu ) ) {
			runFluxUpdateKernel<<<vgpu.blocks, vgpu.threads, 0, vgpu.stream[0]>>>( data );
			runFluxBoundaryKernel<<<vgpu.nBlocks, nThreads, 0, vgpu.stream[0]>>>( data );
		}

		if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
			CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[2] ) );
	}

	/* exchange ghost rows of fM and fN between neighbouring virtual GPUs */
	for( int i = 0; i < num_virtual_gpus; i++ ) {

		VGpu& vgpu = vgpus[i];

		CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );

		if( Par.verbose && vgpu.relId == 0 )
			CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[6] ) );

		if( isActive( vgpu ) ) {

			if( i < num_virtual_gpus - 1 ) {
				int off = ( vgpu.getRel(vgpu.end) - 1 ) * vgpu.data.params.pI;
				CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.fN, vgpus[i+1].dev->id, vgpu.data.fN + off, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
				CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i+1].data.fM, vgpus[i+1].dev->id, vgpu.data.fM + off, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
			}

			if( i > 0 ) {
				int off = ( vgpus[i-1].getRel(vgpus[i-1].end) ) * vgpus[i-1].data.params.pI;
				CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.fN + off, vgpus[i-1].dev->id, vgpu.data.fN + vgpu.data.params.pI, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
				CUDA_CALL( cudaMemcpyPeerAsync( vgpus[i-1].data.fM + off, vgpus[i-1].dev->id, vgpu.data.fM + vgpu.data.params.pI, vgpu.dev->id, vgpu.data.params.pI * sizeof(float), vgpu.stream[0] ) );
			}
		}

		if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
			CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[6] ) );
	}

	/* determine how far the active region has to be extended */
	for( int i = 0; i < num_virtual_gpus; i++ ) {

		VGpu& vgpu = vgpus[i];
		KernelData& data = vgpu.data;

		CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );

		if( Par.verbose && vgpu.relId == 0 )
			CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[4] ) );

		if( isActive( vgpu ) ) {
			runGridExtendKernel<<<vgpu.nBlocks, nThreads, 0, vgpu.stream[0]>>>( data );
			CUDA_CALL( cudaMemcpyAsync( &(extend[i]), data.extend, sizeof(int4), cudaMemcpyDeviceToHost, vgpu.stream[0] ) );
		}

		if( Par.verbose && vgpu.relId == vgpu.dev->maxId )
			CUDA_CALL( cudaEventRecord( vgpu.dev->evtEnd[4] ) );
	}

	/* combine the per-VGpu extension flags into a global result */
	for( int i = 0; i < num_virtual_gpus; i++ ) {

		VGpu& vgpu = vgpus[i];
		KernelData& data = vgpu.data;

		cudaSetDevice( vgpu.dev->id );
		cudaDeviceSynchronize();

		CUDA_CALL( cudaMemset( data.extend, 0, sizeof(int4) ) );

		if( vgpu.hasLine( dp.iMin + 2 ) )
			glb_MinMax.x += extend[i].x;

		if( vgpu.hasLine( dp.iMax - 2 ) )
			glb_MinMax.y += extend[i].y;

		glb_MinMax.z += extend[i].z;
		glb_MinMax.w += extend[i].w;
	}

	if( Par.verbose ) {
		for( int i = 0; i < num_real_gpus; i++ ) {

			cudaSetDevice( i );

			float dur;
			for( int j = 0; j < 7; j++ ) {
				if( cudaEventElapsedTime( &dur, gpus[i].evtStart[j], gpus[i].evtEnd[j] ) == cudaSuccess )
					gpus[i].dur[j] += dur;
			}
		}
	}

	memset( extend, 0, num_virtual_gpus * sizeof(int4) );

	if( glb_MinMax.x > 0 ) Imin = dp.iMin = max( dp.iMin-1, 2 );
	if( glb_MinMax.y > 0 ) Imax = dp.iMax = min( dp.iMax+1, dp.nI-1 );
	if( glb_MinMax.z > 0 ) Jmin = dp.jMin = max( dp.jMin-1, 2 );
	if( glb_MinMax.w > 0 ) Jmax = dp.jMax = min( dp.jMax+1, dp.nJ-1 );

	/* data has changed now -> copy becomes necessary */
	copied = false;

	return 0;
}
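/*
 * init_vgpus() splits the nI grid rows across the virtual GPUs (1-based
 * offsets; the first nI_mod subdomains get one extra row) and marks which
 * of them need a top (gt) and/or bottom (gb) ghost row.
 * Example: nI = 10 and 4 virtual GPUs give sizes 3,3,2,2 with
 * off = 1,4,7,9 and end = 3,6,8,10.
 */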
int CGpuNode::init_vgpus() {

	Params& dp = params;

	int nI_partial = dp.nI / num_virtual_gpus;
	int nI_mod = dp.nI % num_virtual_gpus;

	int off = 1;

	for( int i = 0; i < num_virtual_gpus; i++ ) {

		vgpus[i].size = nI_partial + (int)( i < nI_mod );
		vgpus[i].off = off;
		vgpus[i].end = vgpus[i].off + vgpus[i].size - 1;
		off += vgpus[i].size;

		vgpus[i].gt = 1 - (int)( i == 0 );
		vgpus[i].gb = 1 - (int)( i == num_virtual_gpus - 1 );

		printf_v("VGPU #%u: off=%u end=%u size=%u gt=%u gb=%u\n", i, vgpus[i].off, vgpus[i].end, vgpus[i].size, vgpus[i].gt, vgpus[i].gb);
	}

	return 0;
}

int CGpuNode::updateParams( VGpu& vgpu ) {

	Params& dp = vgpu.data.params;

	dp.mTime = params.mTime;
	dp.jMin = params.jMin;
	dp.jMax = params.jMax;

	if( params.iMin <= vgpu.end && params.iMax >= vgpu.off ) {
		/* range of virtual GPU must be handled */
		/* calculate relative values for iMin and iMax depending on current range */
		dp.iMin = max( 2, vgpu.getRel(params.iMin) );
		dp.iMax = min( vgpu.getRel( vgpu.end ), vgpu.getRel(params.iMax) );
		return 0;
	}

	return 1;
}

bool CGpuNode::isActive( VGpu& vgpu ) {
	return ( params.iMin <= vgpu.end && params.iMax >= vgpu.off );
}