#include "ewGpuNode.cuh"
#include "ewCudaKernels.cuh"

CGpuNode::CGpuNode() {

	pitch = 0;
	/* true while the host copy of the wave data is up to date (see copyIntermediate()) */
	copied = true;

	/* one start/end event pair and an accumulated duration per kernel phase timed in run() */
	for( int i = 0; i < 5; i++ ) {
		cudaEventCreate( &(evtStart[i]) );
		cudaEventCreate( &(evtEnd[i]) );
		dur[i] = 0.0;
	}

}

int CGpuNode::mallocMem() {

	CArrayNode::mallocMem();

	Params& dp = data.params;

	/* fill in some fields here */
	dp.nI = NLon;
	dp.nJ = NLat;
	dp.sshArrivalThreshold = Par.sshArrivalThreshold;
	dp.sshClipThreshold = Par.sshClipThreshold;
	dp.sshZeroThreshold = Par.sshZeroThreshold;
	dp.lpad = 0;

	size_t nJ_aligned = dp.nJ + dp.lpad;

	/* 2-dim */
	CUDA_CALL( cudaMallocPitch( &(data.d), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	CUDA_CALL( cudaMallocPitch( &(data.h), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	CUDA_CALL( cudaMallocPitch( &(data.hMax), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	CUDA_CALL( cudaMallocPitch( &(data.fM), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	CUDA_CALL( cudaMallocPitch( &(data.fN), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	CUDA_CALL( cudaMallocPitch( &(data.cR1), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	CUDA_CALL( cudaMallocPitch( &(data.cR2), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	CUDA_CALL( cudaMallocPitch( &(data.cR4), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	CUDA_CALL( cudaMallocPitch( &(data.tArr), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	/* TODO: cR3, cR5 for coriolis */

	/* 1-dim */
	CUDA_CALL( cudaMalloc( &(data.cR6), dp.nJ * sizeof(float) ) );
	CUDA_CALL( cudaMalloc( &(data.cB1), dp.nI * sizeof(float) ) );
	CUDA_CALL( cudaMalloc( &(data.cB2), dp.nJ * sizeof(float) ) );
	CUDA_CALL( cudaMalloc( &(data.cB3), dp.nI * sizeof(float) ) );
	CUDA_CALL( cudaMalloc( &(data.cB4), dp.nJ * sizeof(float) ) );

	/* single int4 in device memory; run() copies it back to update the Imin/Imax/Jmin/Jmax window */
	CUDA_CALL( cudaMalloc( &(data.g_MinMax), sizeof(int4) ) );

	/* TODO: make sure that pitch is a multiple of 4 and the same for each cudaMallocPitch() call */
	dp.pI = pitch / sizeof(float);
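	/* Minimal sketch of the check suggested by the TODO above (not in the original code):
	   pI is only meaningful if the returned pitch is an integral number of floats; a
	   stricter check would also verify that every cudaMallocPitch() call above returned
	   the same pitch. */
	if( pitch % sizeof(float) != 0 )
		return 1;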

	return 0;
}

int CGpuNode::copyToGPU() {

	Params& dp = data.params;

	/* fill in further fields here */
	dp.iMin = Imin;
	dp.iMax = Imax;
	dp.jMin = Jmin;
	dp.jMax = Jmax;

	/* FIXME: this probably does not work correctly */
	/* add offset to data.d to guarantee alignment: data.d + LPAD */
	/* 2-dim */
	CUDA_CALL( cudaMemcpy2D( data.d + dp.lpad, pitch, d, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy2D( data.h + dp.lpad, pitch, h, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy2D( data.hMax + dp.lpad, pitch, hMax, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy2D( data.fM + dp.lpad, pitch, fM, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy2D( data.fN + dp.lpad, pitch, fN, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy2D( data.cR1 + dp.lpad, pitch, cR1, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy2D( data.cR2 + dp.lpad, pitch, cR2, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy2D( data.cR4 + dp.lpad, pitch, cR4, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy2D( data.tArr + dp.lpad, pitch, tArr, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
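	/* Note on the pitched layout (illustration only): row i of each 2-D device array
	   starts at byte offset i * pitch, i.e. i * dp.pI floats, and the first host column
	   is stored at device column dp.lpad, hence the "+ dp.lpad" destination offsets above. */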

	/* FIXME: move global variables into data structure */
	/* 1-dim */
	CUDA_CALL( cudaMemcpy( data.cR6, R6, dp.nJ * sizeof(float), cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy( data.cB1, C1, dp.nI * sizeof(float), cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy( data.cB2, C2, dp.nJ * sizeof(float), cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy( data.cB3, C3, dp.nI * sizeof(float), cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy( data.cB4, C4, dp.nJ * sizeof(float), cudaMemcpyHostToDevice ) );

	return 0;
}

int CGpuNode::copyFromGPU() {

	Params& dp = data.params;

	CUDA_CALL( cudaMemcpy2D( hMax, dp.nJ * sizeof(float), data.hMax + dp.lpad, pitch, dp.nJ * sizeof(float), dp.nI, cudaMemcpyDeviceToHost ) );
	CUDA_CALL( cudaMemcpy2D( tArr, dp.nJ * sizeof(float), data.tArr + dp.lpad, pitch, dp.nJ * sizeof(float), dp.nI, cudaMemcpyDeviceToHost ) );

	return 0;
}

int CGpuNode::copyIntermediate() {

	/* ignore copy requests if data already present on CPU side */
	if( copied )
		return 0;

	Params& dp = data.params;

	CUDA_CALL( cudaMemcpy2D( h, dp.nJ * sizeof(float), data.h + dp.lpad, pitch, dp.nJ * sizeof(float), dp.nI, cudaMemcpyDeviceToHost ) );

	/* copy finished */
	copied = true;

	return 0;
}

int CGpuNode::copyPOIs() {

	Params& dp = data.params;

	if( copied )
		return 0;

	for( int n = 0; n < NPOIs; n++ ) {

		int i = idxPOI[n] / dp.nJ + 1;
		int j = idxPOI[n] % dp.nJ + 1;
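		/* For illustration: idxPOI[n] is a linear (0-based) index into the nJ-wide host
		   grid, so with e.g. nJ = 100 and idxPOI[n] = 205 this gives the 1-based grid
		   position i = 3, j = 6 passed to data.idx() below. */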

		int id = data.idx( i, j );

		CUDA_CALL( cudaMemcpy( h + idxPOI[n], data.h + dp.lpad + id, sizeof(float), cudaMemcpyDeviceToHost ) );
	}

	return 0;
}

int CGpuNode::freeMem() {

	/* 2-dim */
	CUDA_CALL( cudaFree( data.d ) );
	CUDA_CALL( cudaFree( data.h ) );
	CUDA_CALL( cudaFree( data.hMax ) );
	CUDA_CALL( cudaFree( data.fM ) );
	CUDA_CALL( cudaFree( data.fN ) );
	CUDA_CALL( cudaFree( data.cR1 ) );
	CUDA_CALL( cudaFree( data.cR2 ) );
	CUDA_CALL( cudaFree( data.cR4 ) );
	CUDA_CALL( cudaFree( data.tArr ) );

	/* 1-dim */
	CUDA_CALL( cudaFree( data.cR6 ) );
	CUDA_CALL( cudaFree( data.cB1 ) );
	CUDA_CALL( cudaFree( data.cB2 ) );
	CUDA_CALL( cudaFree( data.cB3 ) );
	CUDA_CALL( cudaFree( data.cB4 ) );

	CUDA_CALL( cudaFree( data.g_MinMax ) );

	float total_dur = 0.f;
	for( int j = 0; j < 5; j++ ) {
		printf_v("Duration %d: %.3f ms\n", j, dur[j]);
		total_dur += dur[j];
	}
	printf_v("Duration total: %.3f ms\n", total_dur);

	CArrayNode::freeMem();

	return 0;
}

int CGpuNode::run() {

	Params& dp = data.params;

	int nThreads = 256;
	int xThreads = 32;
	int yThreads = nThreads / xThreads;

	int xBlocks = ceil( (float)dp.nJ / (float)xThreads );
	int yBlocks = ceil( (float)dp.nI / (float)yThreads );

	dim3 threads( xThreads, yThreads );
	dim3 blocks( xBlocks, yBlocks );

	int nBlocks = ceil( (float)max(dp.nI,dp.nJ) / (float)nThreads );
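	/* For illustration (example sizes, not from the code): with nI = 1000 and nJ = 2000
	   this gives 63 x 125 blocks of 32 x 8 threads for the 2-D kernels and
	   nBlocks = ceil(2000/256) = 8 for the 1-D boundary kernels. */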

	dp.mTime = Par.time;

	CUDA_CALL( cudaEventRecord( evtStart[0], 0 ) );
	runWaveUpdateKernel<<<blocks,threads>>>( data );
	CUDA_CALL( cudaEventRecord( evtEnd[0], 0 ) );
	CUDA_CALL( cudaEventRecord( evtStart[1], 0 ) );
	runWaveBoundaryKernel<<<nBlocks,nThreads>>>( data );
	CUDA_CALL( cudaEventRecord( evtEnd[1], 0 ) );
	CUDA_CALL( cudaEventRecord( evtStart[2], 0 ) );
	runFluxUpdateKernel<<<blocks,threads>>>( data );
	CUDA_CALL( cudaEventRecord( evtEnd[2], 0 ) );
	CUDA_CALL( cudaEventRecord( evtStart[3], 0 ) );
	runFluxBoundaryKernel<<<nBlocks,nThreads>>>( data );
	CUDA_CALL( cudaEventRecord( evtEnd[3], 0 ) );
	CUDA_CALL( cudaEventRecord( evtStart[4], 0 ) );
	runGridExtendKernel1<<<nBlocks,nThreads>>>( data );
	runGridExtendKernel2<<<1,1>>>( data );
	CUDA_CALL( cudaEventRecord( evtEnd[4], 0 ) );

	int4 MinMax;
	CUDA_CALL( cudaMemcpy( &MinMax, data.g_MinMax, sizeof(int4), cudaMemcpyDeviceToHost ) );
	cudaDeviceSynchronize();
	Imin = dp.iMin = MinMax.x;
	Imax = dp.iMax = MinMax.y;
	Jmin = dp.jMin = MinMax.z;
	Jmax = dp.jMax = MinMax.w;

	/* accumulate per-kernel elapsed times; cudaEventElapsedTime() reports milliseconds */
	float _dur;
	for( int j = 0; j < 5; j++ ) {
		cudaEventElapsedTime( &_dur, evtStart[j], evtEnd[j]);
		dur[j] += _dur;
	}

	/* data has changed now -> copy becomes necessary */
	copied = false;

	return 0;
}