diff --git a/cpu/trunk/EasyWave.cu b/cpu/trunk/src/EasyWave.cu similarity index 100% rename from cpu/trunk/EasyWave.cu rename to cpu/trunk/src/EasyWave.cu diff --git a/cpu/trunk/Makefile b/cpu/trunk/src/Makefile similarity index 77% rename from cpu/trunk/Makefile rename to cpu/trunk/src/Makefile index f27a6b5600f09dd9ee92f35f54fda7104b2e7175..900b1b3ccac3158a1f15a85d4cd9b43df1eedbe5 100644 --- a/cpu/trunk/Makefile +++ b/cpu/trunk/src/Makefile @@ -17,17 +17,14 @@ OBJECTS=$(patsubst %.cpp, %.o, $(CPPS) ) $(patsubst %.cu,%.o,$(CUS) ) all: EasyWave -EasyWave: $(OBJECTS) link.o +EasyWave: $(OBJECTS) $(NVCC) -o $@ $^ %.o: %.cpp *.h $(CC) -c $(CFLAGS) -o $@ $< %.o: %.cu *.cuh *.h - $(NVCC) -dc $(NVFLAGS) -x cu -o $@ $< - -link.o: $(CU_OBJS) - $(NVCC) -dlink $(NVFLAGS) -o $@ $^ + $(NVCC) $(NVFLAGS) -c -x cu -o $@ $< clean: rm -f EasyWave *.o diff --git a/cpu/trunk/cOgrd.cpp b/cpu/trunk/src/cOgrd.cpp similarity index 100% rename from cpu/trunk/cOgrd.cpp rename to cpu/trunk/src/cOgrd.cpp diff --git a/cpu/trunk/cOgrd.h b/cpu/trunk/src/cOgrd.h similarity index 100% rename from cpu/trunk/cOgrd.h rename to cpu/trunk/src/cOgrd.h diff --git a/cpu/trunk/cOkadaEarthquake.cpp b/cpu/trunk/src/cOkadaEarthquake.cpp similarity index 100% rename from cpu/trunk/cOkadaEarthquake.cpp rename to cpu/trunk/src/cOkadaEarthquake.cpp diff --git a/cpu/trunk/cOkadaEarthquake.h b/cpu/trunk/src/cOkadaEarthquake.h similarity index 100% rename from cpu/trunk/cOkadaEarthquake.h rename to cpu/trunk/src/cOkadaEarthquake.h diff --git a/cpu/trunk/cOkadaFault.cpp b/cpu/trunk/src/cOkadaFault.cpp similarity index 100% rename from cpu/trunk/cOkadaFault.cpp rename to cpu/trunk/src/cOkadaFault.cpp diff --git a/cpu/trunk/cOkadaFault.h b/cpu/trunk/src/cOkadaFault.h similarity index 100% rename from cpu/trunk/cOkadaFault.h rename to cpu/trunk/src/cOkadaFault.h diff --git a/cpu/trunk/cSphere.cpp b/cpu/trunk/src/cSphere.cpp similarity index 100% rename from cpu/trunk/cSphere.cpp rename to cpu/trunk/src/cSphere.cpp diff --git a/cpu/trunk/cSphere.h b/cpu/trunk/src/cSphere.h similarity index 100% rename from cpu/trunk/cSphere.h rename to cpu/trunk/src/cSphere.h diff --git a/cpu/trunk/easywave.h b/cpu/trunk/src/easywave.h similarity index 100% rename from cpu/trunk/easywave.h rename to cpu/trunk/src/easywave.h diff --git a/cpu/trunk/ewCudaKernels.cu b/cpu/trunk/src/ewCudaKernels.cu similarity index 98% rename from cpu/trunk/ewCudaKernels.cu rename to cpu/trunk/src/ewCudaKernels.cu index c6d4c3e412aea0138b3b4ee63724359c1c11c6b8..c0a28580dfde5366e8f4b16606f4d83f5b11b019 100644 --- a/cpu/trunk/ewCudaKernels.cu +++ b/cpu/trunk/src/ewCudaKernels.cu @@ -1,8 +1,6 @@ #include "ewGpuNode.cuh" #include "ewCudaKernels.cuh" -__device__ int4 g_MinMax = {0,0,0,0}; - __global__ void runWaveUpdateKernel( KernelData data ) { Params& dp = data.params; @@ -209,5 +207,5 @@ __global__ void runGridExtendKernel2( KernelData data ) { tmp.z = dp.jMin; tmp.w = dp.jMax; - g_MinMax = tmp; + *(data.g_MinMax) = tmp; } diff --git a/cpu/trunk/ewCudaKernels.cuh b/cpu/trunk/src/ewCudaKernels.cuh similarity index 92% rename from cpu/trunk/ewCudaKernels.cuh rename to cpu/trunk/src/ewCudaKernels.cuh index 5f7dfe33a873a605a79795d40183d6f3d5431d15..b67b1ed6159ec0fea4477b78b75055ee244a6cdb 100644 --- a/cpu/trunk/ewCudaKernels.cuh +++ b/cpu/trunk/src/ewCudaKernels.cuh @@ -1,8 +1,6 @@ #ifndef EW_KERNELS_H #define EW_KERNELS_H -extern __device__ int4 g_MinMax; - __global__ void runWaveUpdateKernel( KernelData data ); __global__ void runWaveBoundaryKernel( KernelData data ); __global__ void runFluxUpdateKernel( KernelData data ); diff --git a/cpu/trunk/ewGpuNode.cu b/cpu/trunk/src/ewGpuNode.cu similarity index 97% rename from cpu/trunk/ewGpuNode.cu rename to cpu/trunk/src/ewGpuNode.cu index ce61ad2464b731f802883011cc2e2e2acf39366f..cbc04f81b2865803046f43bcea1f457ac00b3483 100644 --- a/cpu/trunk/ewGpuNode.cu +++ b/cpu/trunk/src/ewGpuNode.cu @@ -49,6 +49,8 @@ int CGpuNode::mallocMem() { CUDA_CALL( cudaMalloc( &(data.cB3), dp.nI * sizeof(float) ) ); CUDA_CALL( cudaMalloc( &(data.cB4), dp.nJ * sizeof(float) ) ); + CUDA_CALL( cudaMalloc( &(data.g_MinMax), sizeof(int4) ) ); + /* TODO: make sure that pitch is a multiple of 4 and the same for each cudaMallocPitch() call */ dp.pI = pitch / sizeof(float); @@ -154,6 +156,8 @@ int CGpuNode::freeMem() { CUDA_CALL( cudaFree( data.cB3 ) ); CUDA_CALL( cudaFree( data.cB4 ) ); + CUDA_CALL( cudaFree( data.g_MinMax ) ); + float total_dur = 0.f; for( int j = 0; j < 5; j++ ) { printf_v("Duration %u: %.3f\n", j, dur[j]); @@ -201,11 +205,8 @@ int CGpuNode::run() { runGridExtendKernel2<<<1,1>>>( data ); CUDA_CALL( cudaEventRecord( evtEnd[4], 0 ) ); - void *g_mm; - CUDA_CALL( cudaGetSymbolAddress( &g_mm, g_MinMax ) ); - int4 MinMax; - CUDA_CALL( cudaMemcpy( &MinMax, g_mm, sizeof(int4), cudaMemcpyDeviceToHost ) ); + CUDA_CALL( cudaMemcpy( &MinMax, data.g_MinMax, sizeof(int4), cudaMemcpyDeviceToHost ) ); cudaDeviceSynchronize(); Imin = dp.iMin = MinMax.x; Imax = dp.iMax = MinMax.y; diff --git a/cpu/trunk/ewGpuNode.cuh b/cpu/trunk/src/ewGpuNode.cuh similarity index 98% rename from cpu/trunk/ewGpuNode.cuh rename to cpu/trunk/src/ewGpuNode.cuh index 11373ca96a0893fcce9a31470116cd94defcd253..20f6f8c61ce6d9d98e17dddc381fc4ae17b5a887 100644 --- a/cpu/trunk/ewGpuNode.cuh +++ b/cpu/trunk/src/ewGpuNode.cuh @@ -52,6 +52,8 @@ public: Params params; + int4 *g_MinMax; + __device__ int le( int ij ) { return ij - params.pI; } __device__ int ri( int ij ) { return ij + params.pI; } __device__ int up( int ij ) { return ij + 1; } diff --git a/cpu/trunk/ewGrid.cpp b/cpu/trunk/src/ewGrid.cpp similarity index 79% rename from cpu/trunk/ewGrid.cpp rename to cpu/trunk/src/ewGrid.cpp index 2f9536fbaeba6b0cb27faf5faf25ed880124234e..bbd9b85863ce48900602318074c283ed595ec488 100644 --- a/cpu/trunk/ewGrid.cpp +++ b/cpu/trunk/src/ewGrid.cpp @@ -74,36 +74,59 @@ int ewLoadBathymetry() Dx = Re * g2r( DLon ); // in m along the equator Dy = Re * g2r( DLat ); - /* NOTE: optimal would be reading everything in one step, but that does not work because rows and columns are transposed - * (only possible with binary data at all) - use temporary buffer for now (consumes additional memory!) */ - float *buf = new float[ NLat*NLon ]; - ierr = fread( buf, sizeof(float), NLat*NLon, fp ); + if( isBin ) { - for( i=1; i<=NLon; i++ ) { - for( j=1; j<=NLat; j++ ) { + /* NOTE: optimal would be reading everything in one step, but that does not work because rows and columns are transposed + * (only possible with binary data at all) - use temporary buffer for now (consumes additional memory!) */ + float *buf = new float[ NLat*NLon ]; + ierr = fread( buf, sizeof(float), NLat*NLon, fp ); - m = idx(j,i); + for( i=1; i<=NLon; i++ ) { + for( j=1; j<=NLat; j++ ) { + + m = idx(j,i); - if( isBin ) - fval = buf[ (j-1) * NLon + (i-1) ]; - //ierr = fread( &fval, sizeof(float), 1, fp ); - else - ierr = fscanf( fp, " %f ", &fval ); + if( isBin ) + fval = buf[ (j-1) * NLon + (i-1) ]; + //ierr = fread( &fval, sizeof(float), 1, fp ); - Node(m, iTopo) = fval; - Node(m, iTime) = -1; - Node(m, iD) = -fval; + Node(m, iTopo) = fval; + Node(m, iTime) = -1; + Node(m, iD) = -fval; - if( Node(m, iD) < 0 ) { - Node(m, iD) = 0.0f; - } else if( Node(m, iD) < Par.dmin ) { - Node(m, iD) = Par.dmin; + if( Node(m, iD) < 0 ) { + Node(m, iD) = 0.0f; + } else if( Node(m, iD) < Par.dmin ) { + Node(m, iD) = Par.dmin; + } + + } } - } - } + delete[] buf; + + } else { - delete[] buf; + for( j=1; j<=NLat; j++ ) { + for( i=1; i<=NLon; i++ ) { + + m = idx(j,i); + ierr = fscanf( fp, " %f ", &fval ); + + Node(m, iTopo) = fval; + Node(m, iTime) = -1; + Node(m, iD) = -fval; + + if( Node(m, iD) < 0 ) { + Node(m, iD) = 0.0f; + } else if( Node(m, iD) < Par.dmin ) { + Node(m, iD) = Par.dmin; + } + + } + + } + } for( k=1; k