Commit 92c192e0 authored by Johannes Spazier's avatar Johannes Spazier
Browse files

Keep kernel data small by separating POI related data in a dedicated class....

Keep kernel data small by separating POI-related data into a dedicated class. This was necessary to make the code run on the older Tesla C1060.
parent 92f2ab2d
......@@ -219,10 +219,10 @@ __global__ void runGridExtendKernel( KernelData data ) {
}
/* Records the current sea-surface height at every point of interest (POI)
 * into the per-timestep POI buffer. POI bookkeeping lives in the small
 * PoiData struct (kept separate from KernelData so the kernel argument
 * block stays small); the height grid h is passed explicitly.
 * Launch: 1-D grid, one thread per POI; surplus threads exit at the guard. */
__global__ void storePOIs( PoiData data, float *h ) {

	int id = blockIdx.x * blockDim.x + threadIdx.x;

	/* Guard against the padded tail of the last block. */
	if( id >= data.num_pois )
		return;

	/* Snapshot layout is [iteration][poi]: row data.it, column id. */
	data.pois[data.it * data.num_pois + id] = h[data.idxs[id]];
}
......@@ -38,6 +38,6 @@ __global__ void runWaveBoundaryKernel( KernelData data );
__global__ void runFluxUpdateKernel( KernelData data );
__global__ void runFluxBoundaryKernel( KernelData data );
__global__ void runGridExtendKernel( KernelData data );
__global__ void storePOIs( KernelData data );
__global__ void storePOIs( PoiData data, float *h );
#endif /* EW_KERNELS_H */
......@@ -57,7 +57,7 @@ CGpuNode::CGpuNode() {
for( int j = 0; j < num_real_gpus; j++ ) {
cudaSetDevice( j );
setDevice( j );
cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 );
......@@ -83,7 +83,7 @@ CGpuNode::CGpuNode() {
vgpu.dev->maxId = j / num_real_gpus;
vgpu.relId = j / num_real_gpus;
cudaSetDevice( vgpu.dev->id );
setDevice( vgpu.dev->id );
for( int i = 0; i < vgpu.NSTREAMS; i++ ) {
cudaStreamCreate( &(vgpu.stream[i]) );
......@@ -99,7 +99,7 @@ CGpuNode::CGpuNode() {
printf_v("GPU #%u can access GPU #%u: %u\n", j, j + 1, peerAccess);
if( peerAccess ) {
cudaSetDevice( j );
setDevice( j );
cudaDeviceEnablePeerAccess( j + 1, 0 );
}
......@@ -107,7 +107,7 @@ CGpuNode::CGpuNode() {
printf_v("GPU #%u can access GPU #%u: %u\n", j + 1, j, peerAccess);
if( peerAccess ) {
cudaSetDevice( j + 1 );
setDevice( j + 1 );
cudaDeviceEnablePeerAccess( j, 0 );
}
......@@ -122,7 +122,7 @@ CGpuNode::~CGpuNode() {
VGpu& vgpu = vgpus[j];
cudaSetDevice( vgpu.dev->id );
setDevice( vgpu.dev->id );
for( int i = 0; i < vgpu.NSTREAMS; i++ ) {
cudaStreamDestroy( vgpu.stream[i] );
......@@ -134,7 +134,7 @@ CGpuNode::~CGpuNode() {
for( int j = 0; j < num_real_gpus; j++ ) {
cudaSetDevice( j );
setDevice( j );
for( int i = 0; i < gpus[j].NEVENTS; i++ ) {
cudaEventDestroy( gpus[j].evtStart[i] );
......@@ -171,7 +171,7 @@ int CGpuNode::mallocMem() {
int ghost = vgpu.gb + vgpu.gt;
cudaSetDevice( vgpu.dev->id );
setDevice( vgpu.dev->id );
/* arrays that need ghost zones must add 2 to vgpu.size */
/* 2-dim */
......@@ -226,12 +226,13 @@ int CGpuNode::copyToGPU() {
VGpu& vgpu = vgpus[i];
KernelData& data = vgpu.data;
PoiData& poi_data = vgpu.poi_data;
/* special treatment because of needed ghost zones */
int off = (vgpu.off - vgpu.gt - 1);
int ghost = vgpu.gb + vgpu.gt;
cudaSetDevice( vgpu.dev->id );
setDevice( vgpu.dev->id );
/* FIXME: should not work correctly */
/* add offset to data.d to guarantee alignment: data.d + LPAD */
......@@ -260,20 +261,21 @@ int CGpuNode::copyToGPU() {
/* POI handling. */
int *relIdxs = (int*) malloc( NPOIs * sizeof(int) );
data.host_idxs = (long*) malloc( NPOIs * sizeof(long) );
data.num_pois = 0;
host_idxs = (long*) malloc( NPOIs * sizeof(long) );
poi_data.num_pois = 0;
poi_data.it = 0;
for(int n = 0; n < NPOIs; n++) {
int i = idxPOI[n] / data.params.nJ + 1;
int j = idxPOI[n] % data.params.nJ + 1;
if( vgpu.hasLine( i ) ) {
relIdxs[data.num_pois] = vgpu.data.idx( vgpu.getRel(i), j );
data.host_idxs[data.num_pois] = n;
data.num_pois++;
relIdxs[poi_data.num_pois] = vgpu.data.idx( vgpu.getRel(i), j );
host_idxs[poi_data.num_pois] = n;
poi_data.num_pois++;
}
}
CUDA_CALL( cudaMalloc( &(data.pois), NtPOI * data.num_pois * sizeof(float) ) );
CUDA_CALL( cudaMalloc( &(data.idxs), data.num_pois * sizeof(int) ) );
CUDA_CALL( cudaMemcpy( data.idxs, relIdxs, data.num_pois * sizeof(int), cudaMemcpyHostToDevice ) );
CUDA_CALL( cudaMalloc( &(poi_data.pois), NtPOI * poi_data.num_pois * sizeof(float) ) );
CUDA_CALL( cudaMalloc( &(poi_data.idxs), poi_data.num_pois * sizeof(int) ) );
CUDA_CALL( cudaMemcpy( poi_data.idxs, relIdxs, poi_data.num_pois * sizeof(int), cudaMemcpyHostToDevice ) );
free(relIdxs);
/*****************/
}
......@@ -291,7 +293,7 @@ int CGpuNode::copyFromGPU() {
int off = (vgpu.off - 1) * dp.nJ;
cudaSetDevice( vgpu.dev->id );
setDevice( vgpu.dev->id );
CUDA_CALL( cudaMemcpy2D( hMax + off, dp.nJ * sizeof(float), data.hMax + (vgpu.gt)*data.params.pI + dp.lpad, pitch, dp.nJ * sizeof(float), vgpu.size, cudaMemcpyDeviceToHost ) );
CUDA_CALL( cudaMemcpy2D( tArr + off, dp.nJ * sizeof(float), data.tArr + (vgpu.gt)*data.params.pI + dp.lpad, pitch, dp.nJ * sizeof(float), vgpu.size, cudaMemcpyDeviceToHost ) );
......@@ -316,7 +318,7 @@ int CGpuNode::copyIntermediate() {
int off = (vgpu.off - 1) * dp.nJ;
cudaSetDevice( vgpu.dev->id );
setDevice( vgpu.dev->id );
CUDA_CALL( cudaMemcpy2D( h + off, dp.nJ * sizeof(float), data.h + (vgpu.gt) * data.params.pI + dp.lpad, pitch, dp.nJ * sizeof(float), vgpu.size, cudaMemcpyDeviceToHost ) );
......@@ -335,26 +337,26 @@ int CGpuNode::copyPOIs() {
for( int id = 0; id < num_virtual_gpus; id++ ) {
VGpu& vgpu = vgpus[id];
KernelData& data = vgpu.data;
float *pois = (float*) malloc(data.num_pois * NtPOI * sizeof(float));
PoiData& poi_data = vgpu.poi_data;
float *pois = (float*) malloc(poi_data.num_pois * NtPOI * sizeof(float));
double ampFactor;
CUDA_CALL( cudaSetDevice( vgpu.dev->id ) )
CUDA_CALL( cudaMemcpy( pois, data.pois, data.num_pois * NtPOI * sizeof(float), cudaMemcpyDeviceToHost ) );
CUDA_CALL( setDevice( vgpu.dev->id ) )
CUDA_CALL( cudaMemcpy( pois, poi_data.pois, poi_data.num_pois * NtPOI * sizeof(float), cudaMemcpyDeviceToHost ) );
for(int n = 0; n < data.num_pois; n++) {
for(int i = 0; i < data.it; i++) {
int host_n = data.host_idxs[n];
for(int n = 0; n < poi_data.num_pois; n++) {
for(int i = 0; i < poi_data.it; i++) {
int host_n = host_idxs[n];
if( flagRunupPOI[host_n] )
ampFactor = pow( d[idxPOI[host_n]], 0.25 );
else
ampFactor = 1.;
sshPOI[ host_n ][i] = ampFactor * pois[i*data.num_pois + n];
sshPOI[ host_n ][i] = ampFactor * pois[i*poi_data.num_pois + n];
}
}
for(int i = 0; i < data.it; i++)
for(int i = 0; i < poi_data.it; i++)
timePOI[i] = i * Par.poiDt;
free(pois);
......@@ -370,8 +372,9 @@ int CGpuNode::freeMem() {
VGpu& vgpu = vgpus[i];
KernelData& data = vgpu.data;
PoiData& poi_data = vgpu.poi_data;
cudaSetDevice( vgpu.dev->id );
setDevice( vgpu.dev->id );
/* 2-dim */
CUDA_CALL( cudaFree( data.d ) );
......@@ -393,9 +396,9 @@ int CGpuNode::freeMem() {
CUDA_CALL( cudaFree( data.extend ) );
CUDA_CALL( cudaFree( data.pois ) );
CUDA_CALL( cudaFree( data.idxs ) );
free( data.host_idxs );
CUDA_CALL( cudaFree( poi_data.pois ) );
CUDA_CALL( cudaFree( poi_data.idxs ) );
free( host_idxs );
}
cudaFreeHost( extend );
......@@ -432,11 +435,12 @@ int CGpuNode::run() {
for( int i = 0; i < num_virtual_gpus; i++ ) {
VGpu& vgpu = vgpus[i];
KernelData& data = vgpu.data;
int num_blocks = data.num_pois / nThreads + 1;
PoiData& poi_data = vgpu.poi_data;
int num_blocks = poi_data.num_pois / nThreads + 1;
cudaSetDevice( vgpu.dev->id );
storePOIs<<<num_blocks, nThreads>>>( data );
data.it++;
setDevice( vgpu.dev->id );
storePOIs<<<num_blocks, nThreads>>>(poi_data, data.h);
poi_data.it++;
}
}
......@@ -451,7 +455,7 @@ int CGpuNode::run() {
vgpu.threads = dim3( xThreads, yThreads );
vgpu.blocks = dim3( ceil( (float)dp.nJ / (float)xThreads ), ceil( (float)data.params.nI / (float)yThreads ) );
CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
CUDA_CALL( setDevice( vgpu.dev->id ) );
if( Par.verbose && vgpu.relId == 0 )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[0] ) );
......@@ -470,7 +474,7 @@ int CGpuNode::run() {
VGpu& vgpu = vgpus[i];
CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
CUDA_CALL( setDevice( vgpu.dev->id ) );
if( Par.verbose && vgpu.relId == 0 )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[5] ) );
......@@ -502,7 +506,7 @@ int CGpuNode::run() {
if( ! isActive(vgpu) )
continue;
CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
CUDA_CALL( setDevice( vgpu.dev->id ) );
if( i < num_virtual_gpus - 1 )
cudaStreamWaitEvent( vgpu.stream[0], vgpus[i+1].evtSync, 0 );
......@@ -516,7 +520,7 @@ int CGpuNode::run() {
VGpu& vgpu = vgpus[i];
KernelData& data = vgpu.data;
CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
CUDA_CALL( setDevice( vgpu.dev->id ) );
if( Par.verbose && vgpu.relId == 0 )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[2] ) );
......@@ -535,7 +539,7 @@ int CGpuNode::run() {
VGpu& vgpu = vgpus[i];
CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
CUDA_CALL( setDevice( vgpu.dev->id ) );
if( Par.verbose && vgpu.relId == 0 )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[6] ) );
......@@ -565,7 +569,7 @@ int CGpuNode::run() {
VGpu& vgpu = vgpus[i];
KernelData& data = vgpu.data;
CUDA_CALL( cudaSetDevice( vgpu.dev->id ) );
CUDA_CALL( setDevice( vgpu.dev->id ) );
if( Par.verbose && vgpu.relId == 0 )
CUDA_CALL( cudaEventRecord( vgpu.dev->evtStart[4] ) );
......@@ -586,7 +590,7 @@ int CGpuNode::run() {
VGpu& vgpu = vgpus[i];
KernelData& data = vgpu.data;
cudaSetDevice( vgpu.dev->id );
setDevice( vgpu.dev->id );
cudaDeviceSynchronize();
CUDA_CALL( cudaMemset( data.extend, 0, sizeof(int4) ) );
......@@ -603,7 +607,7 @@ int CGpuNode::run() {
if( Par.verbose ) {
for( int i = 0; i < num_real_gpus; i++ ) {
cudaSetDevice( i );
setDevice( i );
float dur;
for( int j = 0; j < 7; j++ ) {
......@@ -682,3 +686,12 @@ bool CGpuNode::isActive( VGpu& vgpu ) {
return ( params.iMin <= vgpu.end && params.iMax >= vgpu.off );
}
/* Thin wrapper around cudaSetDevice(): when only one real GPU is present
 * the call is skipped and cudaSuccess is returned directly (the single
 * device is already current). Returns the cudaSetDevice() status otherwise. */
cudaError_t CGpuNode::setDevice( int device ) {
	return ( num_real_gpus == 1 ) ? cudaSuccess : cudaSetDevice( device );
}
......@@ -82,13 +82,6 @@ public:
float *cB3;
float *cB4;
/* POIs */
float *pois;
int *idxs;
long *host_idxs;
int num_pois;
int it;
Params params;
int4 *extend;
......@@ -102,6 +95,14 @@ public:
__host__ __device__ int idx( int i, int j ) { return (j-1) + (i-1) * params.pI + params.lpad; }
};
/* Per-virtual-GPU bookkeeping for points of interest (POIs).
 * Kept separate from KernelData so the kernel argument block stays small
 * (per the commit message, required for the older Tesla C1060). */
class PoiData {

public:

	/* Device buffer of recorded heights, laid out as [it][num_pois]
	 * (storePOIs writes pois[it * num_pois + id]). */
	float *pois;
	/* Device array: for each POI, its flat index into the height grid h. */
	int *idxs;
	/* Number of POIs assigned to this virtual GPU (set in copyToGPU). */
	int num_pois;
	/* Number of POI snapshots recorded so far; incremented after each
	 * storePOIs launch in run(). */
	int it;
};
class Gpu {
public:
......@@ -123,6 +124,7 @@ public:
int end;
int size;
KernelData data;
PoiData poi_data;
int gt, gb;
......@@ -152,6 +154,7 @@ protected:
Gpu *gpus;
int4 *extend;
long *host_idxs;
/* line size in bytes */
size_t pitch;
......@@ -178,6 +181,7 @@ private:
int init_vgpus();
int updateParams( VGpu& vgpu );
bool isActive( VGpu& vgpu );
cudaError_t setDevice( int device );
};
#endif /* EW_GPUNODE_H */
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment