diff --git a/cpu/trunk/EasyWave.cu b/cpu/trunk/EasyWave.cu
index e7445510a15625adcf40d858625daabff82e2c8c..f042662b87732c79668af050eba5c4c7834418db 100644
--- a/cpu/trunk/EasyWave.cu
+++ b/cpu/trunk/EasyWave.cu
@@ -11,6 +11,20 @@
 
 CNode *gNode;
 
+/* Returns the time elapsed from start to end in seconds (tv_sec plus tv_nsec / 1e9). */
+double diff(timespec start, timespec end) {
+  timespec temp;
+  if ((end.tv_nsec-start.tv_nsec)<0) {
+    temp.tv_sec = end.tv_sec-start.tv_sec-1; /* borrow one second so tv_nsec stays non-negative */
+    temp.tv_nsec = 1000000000+end.tv_nsec-start.tv_nsec;
+  } else {
+    temp.tv_sec = end.tv_sec-start.tv_sec;
+    temp.tv_nsec = end.tv_nsec-start.tv_nsec;
+  }
+
+  return (double)((double)temp.tv_nsec / 1000000000.0 + (double)temp.tv_sec);
+}
+
 int commandLineHelp( void );
 
 int main( int argc, char **argv )
@@ -64,6 +78,9 @@ int main( int argc, char **argv )
 
   // Main loop
   Log.print("Starting main loop...");
+  timespec start, end; /* monotonic timestamps bracketing the main loop */
+  clock_gettime(CLOCK_MONOTONIC, &start);
+
   for( Par.time=0,loop=1,lastProgress=Par.outProgress,lastPropagation=Par.outPropagation,lastDump=0;
     Par.time<=Par.timeMax; loop++,Par.time+=Par.dt,lastProgress+=Par.dt,lastPropagation+=Par.dt ) {
 
@@ -103,6 +120,7 @@ int main( int argc, char **argv )
     }
 
   } // main loop
+  clock_gettime(CLOCK_MONOTONIC, &end);
   Log.print("Finishing main loop");
 
   /* TODO: check if theses calls can be combined */
@@ -116,6 +134,8 @@ int main( int argc, char **argv )
 
   Node.freeMem();
 
+  printf_v("Runtime: %.3lf\n", diff(start, end) * 1000.0); /* main-loop wall-clock time in milliseconds */
+
   delete gNode;
 
   return 0;
@@ -148,6 +168,8 @@ int commandLineHelp( void )
   printf( "-ssh_arrival ... threshold for arrival times in [m], default- 0.001\n" );
   printf( " negative value considered as relative threshold\n" );
   printf( "-gpu start GPU version of EasyWave (requires a CUDA capable device)\n" );
+  printf( "-adjust_ztop automatic depth correction to fault top @ 10 km\n" );
+  printf( "-verbose generate verbose output on stdout\n" );
   printf( "\nExample:\n" );
   printf( "\t easyWave -grid gebcoIndonesia.grd -source fault.inp -time 120\n\n" );
 
diff --git a/cpu/trunk/easywave.h b/cpu/trunk/easywave.h
index fde0240e38ff4c7e98972b0f7601eadb1a493f94..1b5664ac2ecfaa5680390f0669f9d163b4dcf8a7 100644
--- a/cpu/trunk/easywave.h
+++ b/cpu/trunk/easywave.h
@@ -47,6 +47,8 @@ struct EWPARAMS {
   float sshTransparencyThreshold;
   float sshArrivalThreshold;
   bool gpu;
+  bool adjustZtop;
+  bool verbose;
 };
 
 extern struct EWPARAMS Par;
@@ -84,11 +86,11 @@ int ewSavePOIs();
 int ewDumpPOIs();
 int ewDumpPOIsCompact( int istage );
 
+/* verbose printf: only executed if -verbose was set; do/while(0) makes it a single statement */
+#define printf_v( Args, ... ) do { if( Par.verbose ) printf( Args, ##__VA_ARGS__ ); } while( 0 )
+
 #include "ewNode.h"
 
 extern CNode *gNode;
 
-/* unset to remove ZTOP_PATCH */
-#define ZTOP_PATCH
-
 #endif /* EASYWAVE_H */
diff --git a/cpu/trunk/ewGpuNode.cu b/cpu/trunk/ewGpuNode.cu
index e1a419fd0091ac39f8c84c4ee34fae8cbbb10bdb..87103fae605cd94198df74853d617cc16c1330f4 100644
--- a/cpu/trunk/ewGpuNode.cu
+++ b/cpu/trunk/ewGpuNode.cu
@@ -5,6 +5,13 @@ CGpuNode::CGpuNode() {
 
   pitch = 0;
   copied = true;
+  /* one start/end event pair and one accumulated duration per timed stage of run() */
+  for( int i = 0; i < 5; i++ ) {
+    cudaEventCreate( &(evtStart[i]) );
+    cudaEventCreate( &(evtEnd[i]) );
+    dur[i] = 0.0f;
+  }
+  /* NOTE(review): cudaEventCreate results are unchecked and the events are never destroyed in the visible code */
 }
 
 int CGpuNode::mallocMem() {
@@ -127,6 +134,13 @@ int CGpuNode::freeMem() {
   CUDA_CALL( cudaFree( data.cB3 ) );
   CUDA_CALL( cudaFree( data.cB4 ) );
 
+  float total_dur = 0.f; /* sum of the per-stage times accumulated in run() */
+  for( int j = 0; j < 5; j++ ) {
+    printf_v("Duration %d: %.3f\n", j, dur[j]);
+    total_dur += dur[j];
+  }
+  printf_v("Duration total: %.3f\n",total_dur);
+
   CArrayNode::freeMem();
 
   return 0;
@@ -150,23 +164,40 @@ int CGpuNode::run() {
 
   dp.mTime = Par.time;
 
+  CUDA_CALL( cudaEventRecord( evtStart[0], 0 ) ); /* each stage is bracketed by its own start/end event */
   runWaveUpdateKernel<<>>( data );
+  CUDA_CALL( cudaEventRecord( evtEnd[0], 0 ) );
+  CUDA_CALL( cudaEventRecord( evtStart[1], 0 ) );
   runWaveBoundaryKernel<<>>( data );
+  CUDA_CALL( cudaEventRecord( evtEnd[1], 0 ) );
+  CUDA_CALL( cudaEventRecord( evtStart[2], 0 ) );
   runFluxUpdateKernel<<>>( data );
+  CUDA_CALL( cudaEventRecord( evtEnd[2], 0 ) );
+  CUDA_CALL( cudaEventRecord( evtStart[3], 0 ) );
   runFluxBoundaryKernel<<>>( data );
+  CUDA_CALL( cudaEventRecord( evtEnd[3], 0 ) );
+  CUDA_CALL( cudaEventRecord( evtStart[4], 0 ) ); /* stage 4 covers both grid-extend kernels */
   runGridExtendKernel1<<>>( data );
   runGridExtendKernel2<<<1,1>>>( data );
+  CUDA_CALL( cudaEventRecord( evtEnd[4], 0 ) );
 
   void *g_mm;
   CUDA_CALL( cudaGetSymbolAddress( &g_mm, g_MinMax ) );
 
   int4 MinMax;
   CUDA_CALL( cudaMemcpy( &MinMax, g_mm, sizeof(int4), cudaMemcpyDeviceToHost ) );
+  CUDA_CALL( cudaDeviceSynchronize() ); /* all recorded events must have completed before they are queried */
   Imin = dp.iMin = MinMax.x;
   Imax = dp.iMax = MinMax.y;
   Jmin = dp.jMin = MinMax.z;
   Jmax = dp.jMax = MinMax.w;
 
+  float _dur; /* elapsed time of a single stage for this call */
+  for( int j = 0; j < 5; j++ ) {
+    CUDA_CALL( cudaEventElapsedTime( &_dur, evtStart[j], evtEnd[j] ) );
+    dur[j] += _dur;
+  }
+
   /* data has changed now -> copy becomes necessary */
   copied = false;
 
diff --git a/cpu/trunk/ewGpuNode.cuh b/cpu/trunk/ewGpuNode.cuh
index 1af6666131fc7d66db98d76abd88d68ae436a52f..3b1ac68cb293c68f4d73b50f681c2b6101c50fd6 100644
--- a/cpu/trunk/ewGpuNode.cuh
+++ b/cpu/trunk/ewGpuNode.cuh
@@ -71,6 +71,10 @@ protected:
   /* specifies if data was already copied in the current calculation step */
   bool copied;
 
+  cudaEvent_t evtStart[5]; /* start event of each timed stage in run() */
+  cudaEvent_t evtEnd[5]; /* end event of each timed stage in run() */
+  float dur[5]; /* per-stage elapsed time accumulated over all run() calls */
+
 public:
   CGpuNode();
   int mallocMem();
diff --git a/cpu/trunk/ewParam.cpp b/cpu/trunk/ewParam.cpp
index bfec5f479fc896a223371c49a6859ce063bde901..bc0420bcec2bbb2fa2b3119d8070df04a281def9 100644
--- a/cpu/trunk/ewParam.cpp
+++ b/cpu/trunk/ewParam.cpp
@@ -150,6 +150,16 @@ int ewParam( int argc, char **argv )
   else
     Par.gpu = false;
 
+  if( ( argn = utlCheckCommandLineOption( argc, argv, "adjust_ztop", 11 ) ) != 0 )
+    Par.adjustZtop = true;
+  else
+    Par.adjustZtop = false;
+
+  if( ( argn = utlCheckCommandLineOption( argc, argv, "verbose", 7 ) ) != 0 )
+    Par.verbose = true;
+  else
+    Par.verbose = false;
+
   return 0;
 }
 
diff --git a/cpu/trunk/ewSource.cpp b/cpu/trunk/ewSource.cpp
index 90b7d68fed8574cff2a7fc9e0df8e7c78a78466c..8a65a1bf2f5771838f50ee9afb4eb1e521f77fbc 100644
--- a/cpu/trunk/ewSource.cpp
+++ b/cpu/trunk/ewSource.cpp
@@ -49,51 +49,55 @@ int ewSource()
 
   ierr = eq.read( Par.fileSource ); if(ierr) return ierr;
 
-#ifdef ZTOP_PATCH
-  // check fault parameters
-  Err.disable();
-  ierr = eq.finalizeInput();
-  while( ierr ) {
-    i = ierr/10;
-    ierr = ierr - 10*i;
-    if( ierr == FLT_ERR_STRIKE ) {
-      Log.print( "No strike on input: Employing effective symmetric source model" );
-      if( eq.nfault > 1 ) { Err.enable(); return Err.post("Symmetric source assumes only 1 fault"); }
-      eq.fault[0].strike = 0.;
-      effSymSource = 1;
-    }
-    else if( ierr == FLT_ERR_ZTOP ) {
-      Log.print( "Automatic depth correction to fault top @ 10 km" );
-      eq.fault[i].depth = eq.fault[i].width/2 * sindeg(eq.fault[i].dip) + 10.e3;
-    }
-    else {
+  if( Par.adjustZtop ) {
+    /* -adjust_ztop: patch missing strike / fault-top depth and re-validate until finalizeInput() succeeds */
+    // check fault parameters
+    Err.disable();
+    ierr = eq.finalizeInput();
+    while( ierr ) {
+      i = ierr/10;
+      ierr = ierr - 10*i;
+      if( ierr == FLT_ERR_STRIKE ) {
+        Log.print( "No strike on input: Employing effective symmetric source model" );
+        if( eq.nfault > 1 ) { Err.enable(); return Err.post("Symmetric source assumes only 1 fault"); }
+        eq.fault[0].strike = 0.;
+        effSymSource = 1;
+      }
+      else if( ierr == FLT_ERR_ZTOP ) {
+        Log.print( "Automatic depth correction to fault top @ 10 km" );
+        eq.fault[i].depth = eq.fault[i].width/2 * sindeg(eq.fault[i].dip) + 10.e3;
+      }
+      else {
+        Err.enable();
+        return ierr;
+      }
+      ierr = eq.finalizeInput();
+    }
     Err.enable();
-    return ierr;
-  }
-  ierr = eq.finalizeInput();
-  }
-  Err.enable();
-#else
-  // check fault parameters
-  Err.disable();
-  ierr = eq.finalizeInput();
-  if( ierr ) {
-    i = ierr/10;
-    ierr = ierr - 10*i;
-    if( ierr != FLT_ERR_STRIKE ) {
-      Err.enable();
-      ierr = eq.finalizeInput();
-      return ierr;
-    }
-    Log.print( "No strike on input: Employing effective symmetric source model" );
-    Err.enable();
-    if( eq.nfault > 1 ) return Err.post("symmetric source assumes only 1 fault");
-    eq.fault[0].strike = 0.;
-    effSymSource = 1;
-    ierr = eq.finalizeInput(); if(ierr) return ierr;
+
+  } else {
+    /* without -adjust_ztop only a missing strike is patched; any other error is returned */
+    // check fault parameters
+    Err.disable();
+    ierr = eq.finalizeInput();
+    if( ierr ) {
+      i = ierr/10;
+      ierr = ierr - 10*i;
+      if( ierr != FLT_ERR_STRIKE ) {
+        Err.enable();
+        ierr = eq.finalizeInput();
+        return ierr;
+      }
+      Log.print( "No strike on input: Employing effective symmetric source model" );
+      Err.enable();
+      if( eq.nfault > 1 ) return Err.post("symmetric source assumes only 1 fault");
+      eq.fault[0].strike = 0.;
+      effSymSource = 1;
+      ierr = eq.finalizeInput(); if(ierr) return ierr;
+    }
+    Err.enable();
+
   }
-  Err.enable();
-#endif
 
   // calculate uplift on a rectangular grid
   // set grid resolution, grid dimensions will be set automatically