#include "ewGpuNode.cuh"
#include "ewCudaKernels.cuh"

CGpuNode::CGpuNode() {

	pitch = 0;
	copied = true;

	/* one start/end CUDA event pair and an accumulated duration per timed
	   phase; the five phases are recorded in run() */
	for( int i = 0; i < 5; i++ ) {
		cudaEventCreate( &(evtStart[i]) );
		cudaEventCreate( &(evtEnd[i]) );
		dur[i] = 0.0;
	}

}

int CGpuNode::mallocMem() {

	CArrayNode::mallocMem();

	Params& dp = data.params;

	/* fill in some fields here */
	dp.nI = NLon;
	dp.nJ = NLat;
	dp.sshArrivalThreshold = Par.sshArrivalThreshold;
	dp.sshClipThreshold = Par.sshClipThreshold;
	dp.sshZeroThreshold = Par.sshZeroThreshold;
	dp.lpad = 0;
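	/* lpad reserves left padding (in elements) per row for alignment;
	   it is 0 for now, see the FIXME in copyToGPU() */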

	size_t nJ_aligned = dp.nJ + dp.lpad;

	/* 2-dim */
	CUDA_CALL( cudaMallocPitch( &(data.d), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	CUDA_CALL( cudaMallocPitch( &(data.h), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	CUDA_CALL( cudaMallocPitch( &(data.hMax), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	CUDA_CALL( cudaMallocPitch( &(data.fM), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	CUDA_CALL( cudaMallocPitch( &(data.fN), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	CUDA_CALL( cudaMallocPitch( &(data.cR1), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	CUDA_CALL( cudaMallocPitch( &(data.cR2), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	CUDA_CALL( cudaMallocPitch( &(data.cR4), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	CUDA_CALL( cudaMallocPitch( &(data.tArr), &pitch, nJ_aligned * sizeof(float), dp.nI ) );
	/* TODO: cR3, cR5 for coriolis */

	/* 1-dim */
	CUDA_CALL( cudaMalloc( &(data.cR6), dp.nJ * sizeof(float) ) );
	CUDA_CALL( cudaMalloc( &(data.cB1), dp.nI * sizeof(float) ) );
	CUDA_CALL( cudaMalloc( &(data.cB2), dp.nJ * sizeof(float) ) );
	CUDA_CALL( cudaMalloc( &(data.cB3), dp.nI * sizeof(float) ) );
	CUDA_CALL( cudaMalloc( &(data.cB4), dp.nJ * sizeof(float) ) );

	/* TODO: make sure that pitch is a multiple of 4 and the same for each cudaMallocPitch() call */
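	/* sketch of the check requested above: every cudaMallocPitch() call uses
	   the same width and height, so the last returned pitch is representative;
	   it must be a whole number of floats for pI below (printf assumed to be
	   available through the existing includes) */
	if( pitch % sizeof(float) != 0 ) {
		printf( "unexpected pitch: %zu bytes\n", pitch );
		return 1;
	}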
	dp.pI = pitch / sizeof(float);

	return 0;
}

int CGpuNode::copyToGPU() {

	Params& dp = data.params;

	/* fill in further fields here */
	dp.iMin = Imin;
	dp.iMax = Imax;
	dp.jMin = Jmin;
	dp.jMax = Jmax;

	/* FIXME: this is probably not working correctly yet:
	   an offset is added to data.d to guarantee alignment (data.d + dp.lpad) */
	/* 2-dim */
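	/* cudaMemcpy2D( dst, dpitch, src, spitch, widthInBytes, height, kind ):
	   host rows are tightly packed (spitch = nJ * sizeof(float)), device rows
	   are padded to `pitch` bytes */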
	CUDA_CALL( cudaMemcpy2D( data.d + dp.lpad, pitch, d, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy2D( data.h + dp.lpad, pitch, h, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy2D( data.hMax + dp.lpad, pitch, hMax, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy2D( data.fM + dp.lpad, pitch, fM, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy2D( data.fN + dp.lpad, pitch, fN, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy2D( data.cR1 + dp.lpad, pitch, cR1, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy2D( data.cR2 + dp.lpad, pitch, cR2, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy2D( data.cR4 + dp.lpad, pitch, cR4, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy2D( data.tArr + dp.lpad, pitch, tArr, dp.nJ * sizeof(float), dp.nJ * sizeof(float), dp.nI, cudaMemcpyHostToDevice ) );

	/* FIXME: move global variables into data structure */
	/* 1-dim */
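	/* note the mixed extents: cB1/cB3 hold one value per row (nI),
	   cR6/cB2/cB4 one per column (nJ) */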
	CUDA_CALL( cudaMemcpy( data.cR6, R6, dp.nJ * sizeof(float), cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy( data.cB1, C1, dp.nI * sizeof(float), cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy( data.cB2, C2, dp.nJ * sizeof(float), cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy( data.cB3, C3, dp.nI * sizeof(float), cudaMemcpyHostToDevice ) );
	CUDA_CALL( cudaMemcpy( data.cB4, C4, dp.nJ * sizeof(float), cudaMemcpyHostToDevice ) );

	return 0;
}

int CGpuNode::copyFromGPU() {
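	/* only the final products (maximum wave height and arrival times) are
	   copied back; the other fields stay on the device */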

	Params& dp = data.params;

	CUDA_CALL( cudaMemcpy2D( hMax, dp.nJ * sizeof(float), data.hMax + dp.lpad, pitch, dp.nJ * sizeof(float), dp.nI, cudaMemcpyDeviceToHost ) );
	CUDA_CALL( cudaMemcpy2D( tArr, dp.nJ * sizeof(float), data.tArr + dp.lpad, pitch, dp.nJ * sizeof(float), dp.nI, cudaMemcpyDeviceToHost ) );

	return 0;
}

int CGpuNode::copyIntermediate() {

	/* ignore copy requests if data already present on CPU side */
	if( copied )
		return 0;

	Params& dp = data.params;

	CUDA_CALL( cudaMemcpy2D( h, dp.nJ * sizeof(float), data.h + dp.lpad, pitch, dp.nJ * sizeof(float), dp.nI, cudaMemcpyDeviceToHost ) );

	/* copy finished */
	copied = true;

	return 0;
}

int CGpuNode::freeMem() {

	/* 2-dim */
	CUDA_CALL( cudaFree( data.d ) );
	CUDA_CALL( cudaFree( data.h ) );
	CUDA_CALL( cudaFree( data.hMax ) );
	CUDA_CALL( cudaFree( data.fM ) );
	CUDA_CALL( cudaFree( data.fN ) );
	CUDA_CALL( cudaFree( data.cR1 ) );
	CUDA_CALL( cudaFree( data.cR2 ) );
	CUDA_CALL( cudaFree( data.cR4 ) );
	CUDA_CALL( cudaFree( data.tArr ) );

	/* 1-dim */
	CUDA_CALL( cudaFree( data.cR6 ) );
	CUDA_CALL( cudaFree( data.cB1 ) );
	CUDA_CALL( cudaFree( data.cB2 ) );
	CUDA_CALL( cudaFree( data.cB3 ) );
	CUDA_CALL( cudaFree( data.cB4 ) );

	/* dur[] holds per-phase kernel times in milliseconds, accumulated from
	   CUDA events in run() */
	float total_dur = 0.f;
	for( int j = 0; j < 5; j++ ) {
		printf_v("Duration %d: %.3f\n", j, dur[j]);
		total_dur += dur[j];
	}
	printf_v("Duration total: %.3f\n", total_dur);

	CArrayNode::freeMem();

	return 0;
}

int CGpuNode::run() {

	Params& dp = data.params;

	int nThreads = 256;
	int xThreads = 32;
	int yThreads = nThreads / xThreads;

	int xBlocks = ceil( (float)dp.nJ / (float)xThreads );
	int yBlocks = ceil( (float)dp.nI / (float)yThreads );

	dim3 threads( xThreads, yThreads );
	dim3 blocks( xBlocks, yBlocks );

	int nBlocks = ceil( (float)max(dp.nI,dp.nJ) / (float)nThreads );
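	/* the 2-D launches cover the interior grid; the 1-D launches handle
	   boundary rows/columns, so nBlocks spans max(nI, nJ) */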

	dp.mTime = Par.time;

	/* each kernel phase below is bracketed by an evtStart/evtEnd pair */
	CUDA_CALL( cudaEventRecord( evtStart[0], 0 ) );
	runWaveUpdateKernel<<<blocks,threads>>>( data );
	CUDA_CALL( cudaEventRecord( evtEnd[0], 0 ) );
	CUDA_CALL( cudaEventRecord( evtStart[1], 0 ) );
	runWaveBoundaryKernel<<<nBlocks,nThreads>>>( data );
	CUDA_CALL( cudaEventRecord( evtEnd[1], 0 ) );
	CUDA_CALL( cudaEventRecord( evtStart[2], 0 ) );
	runFluxUpdateKernel<<<blocks,threads>>>( data );
	CUDA_CALL( cudaEventRecord( evtEnd[2], 0 ) );
	CUDA_CALL( cudaEventRecord( evtStart[3], 0 ) );
	runFluxBoundaryKernel<<<nBlocks,nThreads>>>( data );
	CUDA_CALL( cudaEventRecord( evtEnd[3], 0 ) );
	CUDA_CALL( cudaEventRecord( evtStart[4], 0 ) );
	runGridExtendKernel1<<<nBlocks,nThreads>>>( data );
	runGridExtendKernel2<<<1,1>>>( data );
	CUDA_CALL( cudaEventRecord( evtEnd[4], 0 ) );

	/* the grid-extend kernels store the updated active region in the device
	   symbol g_MinMax; the blocking default-stream copy below also waits for
	   the kernels to finish */
	void *g_mm;
	CUDA_CALL( cudaGetSymbolAddress( &g_mm, g_MinMax ) );

	int4 MinMax;
	CUDA_CALL( cudaMemcpy( &MinMax, g_mm, sizeof(int4), cudaMemcpyDeviceToHost ) );
	CUDA_CALL( cudaDeviceSynchronize() );
	/* propagate the new window to both the host-side fields and the device
	   parameter block */
	Imin = dp.iMin = MinMax.x;
	Imax = dp.iMax = MinMax.y;
	Jmin = dp.jMin = MinMax.z;
	Jmax = dp.jMax = MinMax.w;

	/* accumulate per-phase elapsed times (cudaEventElapsedTime reports
	   milliseconds) */
	float _dur;
	for( int j = 0; j < 5; j++ ) {
		CUDA_CALL( cudaEventElapsedTime( &_dur, evtStart[j], evtEnd[j] ) );
		dur[j] += _dur;
	}

	/* data has changed now -> copy becomes necessary */
	copied = false;

	return 0;
}