diff --git a/autotest/test_par_gwf01.py b/autotest/test_par_gwf01.py index 3a47a73fc22..6e0d58e055b 100644 --- a/autotest/test_par_gwf01.py +++ b/autotest/test_par_gwf01.py @@ -71,6 +71,7 @@ def get_model(idx, dir): version="mf6", exe_name="mf6", sim_ws=dir, + profile_option="detail", ) tdis = flopy.mf6.ModflowTdis(sim, time_units="DAYS", nper=nper, perioddata=tdis_rc) diff --git a/doc/mf6io/mf6ivar/dfn/sim-nam.dfn b/doc/mf6io/mf6ivar/dfn/sim-nam.dfn index c22dbc7f1c9..3e0955f068f 100644 --- a/doc/mf6io/mf6ivar/dfn/sim-nam.dfn +++ b/doc/mf6io/mf6ivar/dfn/sim-nam.dfn @@ -25,6 +25,15 @@ longname memory print option description is a flag that controls printing of detailed memory manager usage to the end of the simulation list file. NONE means do not print detailed information. SUMMARY means print only the total memory for each simulation component. ALL means print information for each variable stored in the memory manager. NONE is default if MEMORY\_PRINT\_OPTION is not specified. mf6internal prmem +block options +name profile_option +type string +reader urword +optional true +longname profiling option +description is a flag that controls performance profiling and reporting. NONE disables profiling. SUMMARY means to measure and print a coarse performance profile. DETAIL means collect and print information with the highest resolution available. NONE is default if PROFILE\_OPTION is not specified. +mf6internal prprof + block options name maxerrors type integer diff --git a/make/makefile b/make/makefile index 27d99da2582..2fd29d40927 100644 --- a/make/makefile +++ b/make/makefile @@ -1,51 +1,52 @@ -# makefile created by pymake (version 1.2.10.dev0) for the 'mf6' executable. +# makefile created by pymake (version 1.4.0.dev0) for the 'mf6' executable. include ./makedefaults # Define the source file directories SOURCEDIR1=../src -SOURCEDIR2=../src/Distributed -SOURCEDIR3=../src/Exchange -SOURCEDIR4=../src/Idm -SOURCEDIR5=../src/Idm/selector +SOURCEDIR2=../src/Solution +SOURCEDIR3=../src/Solution/LinearMethods +SOURCEDIR4=../src/Solution/PETSc +SOURCEDIR5=../src/Solution/ParticleTracker SOURCEDIR6=../src/Model -SOURCEDIR7=../src/Model/ChannelFlow -SOURCEDIR8=../src/Model/Connection -SOURCEDIR9=../src/Model/Discretization -SOURCEDIR10=../src/Model/Geometry -SOURCEDIR11=../src/Model/GroundWaterEnergy -SOURCEDIR12=../src/Model/GroundWaterFlow -SOURCEDIR13=../src/Model/GroundWaterFlow/submodules -SOURCEDIR14=../src/Model/GroundWaterTransport -SOURCEDIR15=../src/Model/ModelUtilities -SOURCEDIR16=../src/Model/OverlandFlow -SOURCEDIR17=../src/Model/ParticleTracking -SOURCEDIR18=../src/Model/SurfaceWaterFlow -SOURCEDIR19=../src/Model/TransportModel -SOURCEDIR20=../src/Solution -SOURCEDIR21=../src/Solution/LinearMethods -SOURCEDIR22=../src/Solution/ParticleTracker -SOURCEDIR23=../src/Solution/PETSc -SOURCEDIR24=../src/Timing -SOURCEDIR25=../src/Utilities -SOURCEDIR26=../src/Utilities/ArrayRead -SOURCEDIR27=../src/Utilities/Export -SOURCEDIR28=../src/Utilities/Idm -SOURCEDIR29=../src/Utilities/Idm/mf6blockfile -SOURCEDIR30=../src/Utilities/Idm/netcdf -SOURCEDIR31=../src/Utilities/Libraries -SOURCEDIR32=../src/Utilities/Libraries/blas -SOURCEDIR33=../src/Utilities/Libraries/daglib -SOURCEDIR34=../src/Utilities/Libraries/rcm -SOURCEDIR35=../src/Utilities/Libraries/sparsekit -SOURCEDIR36=../src/Utilities/Libraries/sparskit2 -SOURCEDIR37=../src/Utilities/Matrix -SOURCEDIR38=../src/Utilities/Memory -SOURCEDIR39=../src/Utilities/Observation -SOURCEDIR40=../src/Utilities/OutputControl -SOURCEDIR41=../src/Utilities/TimeSeries -SOURCEDIR42=../src/Utilities/Vector +SOURCEDIR7=../src/Model/GroundWaterFlow +SOURCEDIR8=../src/Model/GroundWaterFlow/submodules +SOURCEDIR9=../src/Model/GroundWaterEnergy +SOURCEDIR10=../src/Model/ChannelFlow +SOURCEDIR11=../src/Model/Geometry +SOURCEDIR12=../src/Model/TransportModel +SOURCEDIR13=../src/Model/Connection +SOURCEDIR14=../src/Model/Discretization +SOURCEDIR15=../src/Model/ParticleTracking +SOURCEDIR16=../src/Model/SurfaceWaterFlow +SOURCEDIR17=../src/Model/OverlandFlow +SOURCEDIR18=../src/Model/GroundWaterTransport +SOURCEDIR19=../src/Model/ModelUtilities +SOURCEDIR20=../src/Utilities +SOURCEDIR21=../src/Utilities/TimeSeries +SOURCEDIR22=../src/Utilities/Vector +SOURCEDIR23=../src/Utilities/Export +SOURCEDIR24=../src/Utilities/Matrix +SOURCEDIR25=../src/Utilities/Observation +SOURCEDIR26=../src/Utilities/Memory +SOURCEDIR27=../src/Utilities/Performance +SOURCEDIR28=../src/Utilities/OutputControl +SOURCEDIR29=../src/Utilities/Idm +SOURCEDIR30=../src/Utilities/Idm/mf6blockfile +SOURCEDIR31=../src/Utilities/Idm/netcdf +SOURCEDIR32=../src/Utilities/Libraries +SOURCEDIR33=../src/Utilities/Libraries/sparskit2 +SOURCEDIR34=../src/Utilities/Libraries/daglib +SOURCEDIR35=../src/Utilities/Libraries/rcm +SOURCEDIR36=../src/Utilities/Libraries/sparsekit +SOURCEDIR37=../src/Utilities/Libraries/blas +SOURCEDIR38=../src/Utilities/ArrayRead +SOURCEDIR39=../src/Idm +SOURCEDIR40=../src/Idm/selector +SOURCEDIR41=../src/Exchange +SOURCEDIR42=../src/Timing +SOURCEDIR43=../src/Distributed VPATH = \ ${SOURCEDIR1} \ @@ -89,7 +90,8 @@ ${SOURCEDIR38} \ ${SOURCEDIR39} \ ${SOURCEDIR40} \ ${SOURCEDIR41} \ -${SOURCEDIR42} +${SOURCEDIR42} \ +${SOURCEDIR43} .SUFFIXES: .f90 .F90 .o @@ -123,18 +125,6 @@ $(OBJDIR)/PtrHashTable.o \ $(OBJDIR)/MemoryContainerIterator.o \ $(OBJDIR)/utl-ncfidm.o \ $(OBJDIR)/utl-hpcidm.o \ -$(OBJDIR)/swf-zdgidm.o \ -$(OBJDIR)/swf-stoidm.o \ -$(OBJDIR)/swf-namidm.o \ -$(OBJDIR)/swf-icidm.o \ -$(OBJDIR)/swf-flwidm.o \ -$(OBJDIR)/swf-disv2didm.o \ -$(OBJDIR)/swf-disv1didm.o \ -$(OBJDIR)/swf-dis2didm.o \ -$(OBJDIR)/swf-dfwidm.o \ -$(OBJDIR)/swf-cxsidm.o \ -$(OBJDIR)/swf-chdidm.o \ -$(OBJDIR)/swf-cdbidm.o \ $(OBJDIR)/sim-tdisidm.o \ $(OBJDIR)/sim-namidm.o \ $(OBJDIR)/prt-namidm.o \ @@ -147,10 +137,8 @@ $(OBJDIR)/olf-namidm.o \ $(OBJDIR)/olf-icidm.o \ $(OBJDIR)/olf-flwidm.o \ $(OBJDIR)/olf-disv2didm.o \ -$(OBJDIR)/olf-disv1didm.o \ $(OBJDIR)/olf-dis2didm.o \ $(OBJDIR)/olf-dfwidm.o \ -$(OBJDIR)/olf-cxsidm.o \ $(OBJDIR)/olf-chdidm.o \ $(OBJDIR)/olf-cdbidm.o \ $(OBJDIR)/gwt-namidm.o \ @@ -196,9 +184,7 @@ $(OBJDIR)/chf-stoidm.o \ $(OBJDIR)/chf-namidm.o \ $(OBJDIR)/chf-icidm.o \ $(OBJDIR)/chf-flwidm.o \ -$(OBJDIR)/chf-disv2didm.o \ $(OBJDIR)/chf-disv1didm.o \ -$(OBJDIR)/chf-dis2didm.o \ $(OBJDIR)/chf-dfwidm.o \ $(OBJDIR)/chf-cxsidm.o \ $(OBJDIR)/chf-chdidm.o \ @@ -208,7 +194,6 @@ $(OBJDIR)/LongLineReader.o \ $(OBJDIR)/DevFeature.o \ $(OBJDIR)/MemoryStore.o \ $(OBJDIR)/IdmUtlDfnSelector.o \ -$(OBJDIR)/IdmSwfDfnSelector.o \ $(OBJDIR)/IdmSimDfnSelector.o \ $(OBJDIR)/IdmPrtDfnSelector.o \ $(OBJDIR)/IdmOlfDfnSelector.o \ @@ -256,17 +241,17 @@ $(OBJDIR)/TimeArraySeriesManager.o \ $(OBJDIR)/PackageMover.o \ $(OBJDIR)/Obs.o \ $(OBJDIR)/NumericalPackage.o \ +$(OBJDIR)/Particle.o \ $(OBJDIR)/PackageBudget.o \ $(OBJDIR)/HeadFileReader.o \ $(OBJDIR)/BudgetObject.o \ $(OBJDIR)/BoundaryPackage.o \ $(OBJDIR)/CellDefn.o \ -$(OBJDIR)/Particle.o \ +$(OBJDIR)/TrackFile.o \ $(OBJDIR)/sort.o \ $(OBJDIR)/FlowModelInterface.o \ $(OBJDIR)/Cell.o \ $(OBJDIR)/Subcell.o \ -$(OBJDIR)/TrackFile.o \ $(OBJDIR)/TrackControl.o \ $(OBJDIR)/TimeSelect.o \ $(OBJDIR)/prt-fmi.o \ @@ -333,7 +318,9 @@ $(OBJDIR)/MethodCellPollockQuad.o \ $(OBJDIR)/MethodCellPollock.o \ $(OBJDIR)/MethodCellPassToBot.o \ $(OBJDIR)/SwfCxsUtils.o \ +$(OBJDIR)/swf-cxsidm.o \ $(OBJDIR)/Disv1dGeom.o \ +$(OBJDIR)/swf-icidm.o \ $(OBJDIR)/CellWithNbrs.o \ $(OBJDIR)/NumericalExchange.o \ $(OBJDIR)/tsp-ssm.o \ @@ -360,20 +347,25 @@ $(OBJDIR)/VirtualSolution.o \ $(OBJDIR)/SparseMatrix.o \ $(OBJDIR)/LinearSolverBase.o \ $(OBJDIR)/ImsReordering.o \ -$(OBJDIR)/StructVector.o \ $(OBJDIR)/ModflowInput.o \ +$(OBJDIR)/DefinitionSelect.o \ +$(OBJDIR)/StructVector.o \ $(OBJDIR)/IdmLogger.o \ $(OBJDIR)/NCFileVars.o \ $(OBJDIR)/Integer1dReader.o \ $(OBJDIR)/Double2dReader.o \ $(OBJDIR)/Double1dReader.o \ -$(OBJDIR)/DefinitionSelect.o \ $(OBJDIR)/MethodCellPool.o \ $(OBJDIR)/CellUtil.o \ +$(OBJDIR)/swf-zdgidm.o \ $(OBJDIR)/swf-cxs.o \ $(OBJDIR)/Disv1d.o \ +$(OBJDIR)/swf-stoidm.o \ $(OBJDIR)/swf-ic.o \ +$(OBJDIR)/swf-flwidm.o \ $(OBJDIR)/VectorInterpolation.o \ +$(OBJDIR)/swf-dfwidm.o \ +$(OBJDIR)/swf-cdbidm.o \ $(OBJDIR)/VirtualExchange.o \ $(OBJDIR)/GridSorting.o \ $(OBJDIR)/DisConnExchange.o \ @@ -409,12 +401,13 @@ $(OBJDIR)/gwe-esl.o \ $(OBJDIR)/gwe-ctp.o \ $(OBJDIR)/gwe-cnd.o \ $(OBJDIR)/RouterBase.o \ +$(OBJDIR)/STLStackInt.o \ $(OBJDIR)/ImsLinearSolver.o \ $(OBJDIR)/ImsLinearBase.o \ +$(OBJDIR)/DynamicPackageParams.o \ $(OBJDIR)/StructArray.o \ $(OBJDIR)/LoadNCInput.o \ $(OBJDIR)/LayeredArrayReader.o \ -$(OBJDIR)/DynamicPackageParams.o \ $(OBJDIR)/InputLoadType.o \ $(OBJDIR)/ReleaseSchedule.o \ $(OBJDIR)/MethodDisv.o \ @@ -437,12 +430,13 @@ $(OBJDIR)/GwfExchangeMover.o \ $(OBJDIR)/gwe.o \ $(OBJDIR)/SerialRouter.o \ $(OBJDIR)/Timer.o \ +$(OBJDIR)/Profiler.o \ $(OBJDIR)/LinearSolverFactory.o \ $(OBJDIR)/ImsLinear.o \ $(OBJDIR)/BaseSolution.o \ +$(OBJDIR)/BoundInputContext.o \ $(OBJDIR)/LoadMf6File.o \ $(OBJDIR)/AsciiInputLoadType.o \ -$(OBJDIR)/BoundInputContext.o \ $(OBJDIR)/prt-prp.o \ $(OBJDIR)/prt-oc.o \ $(OBJDIR)/prt-mip.o \ @@ -498,6 +492,11 @@ $(OBJDIR)/ConnectionBuilder.o \ $(OBJDIR)/comarg.o \ $(OBJDIR)/mf6core.o \ $(OBJDIR)/BaseGeometry.o \ +$(OBJDIR)/swf-namidm.o \ +$(OBJDIR)/swf-disv2didm.o \ +$(OBJDIR)/swf-disv1didm.o \ +$(OBJDIR)/swf-dis2didm.o \ +$(OBJDIR)/swf-chdidm.o \ $(OBJDIR)/mf6.o \ $(OBJDIR)/StringList.o \ $(OBJDIR)/MemorySetHandler.o \ @@ -513,7 +512,12 @@ $(OBJDIR)/gwf-sfr-constant.o \ $(OBJDIR)/RectangularGeometry.o \ $(OBJDIR)/CircularGeometry.o \ $(OBJDIR)/ExplicitModel.o \ -$(OBJDIR)/exg-swfgwfidm.o +$(OBJDIR)/IdmSwfDfnSelector.o \ +$(OBJDIR)/olf-disv1didm.o \ +$(OBJDIR)/olf-cxsidm.o \ +$(OBJDIR)/exg-swfgwfidm.o \ +$(OBJDIR)/chf-disv2didm.o \ +$(OBJDIR)/chf-dis2didm.o # Define the objects that make up the program $(PROGRAM) : $(OBJECTS) diff --git a/src/Distributed/MpiRouter.f90 b/src/Distributed/MpiRouter.f90 index cec227e558d..aa0969489d4 100644 --- a/src/Distributed/MpiRouter.f90 +++ b/src/Distributed/MpiRouter.f90 @@ -1,7 +1,7 @@ module MpiRouterModule use RouterBaseModule use KindModule, only: I4B, LGP - use CpuTimerModule, only: g_timer + use ProfilerModule, only: g_prof use STLVecIntModule use SimVariablesModule, only: proc_id, nr_procs use SimStagesModule, only: STG_TO_STR, NR_SIM_STAGES @@ -33,7 +33,7 @@ module MpiRouterModule type(MpiWorldType), pointer :: mpi_world => null() integer(I4B) :: imon !< the output file unit for the mpi monitor logical(LGP) :: enable_monitor !< when true, log diagnostics - integer(I4B), dimension(:,:), allocatable :: tmr_mpi_wait !< array with timer handles for MPI_Wait calls + integer(I4B), dimension(:, :), allocatable :: tmr_mpi_wait !< array with timer handles for MPI_Wait calls contains procedure :: initialize => mr_initialize procedure :: route_all => mr_route_all @@ -328,10 +328,10 @@ subroutine route_active(this, unit, stage) end do ! wait for exchange of all messages - call g_timer%start("MPI_WaitAll ("//trim(STG_TO_STR(stage))//")", & - this%tmr_mpi_wait(stage, unit + 1)) + call g_prof%start("MPI_WaitAll ("//trim(STG_TO_STR(stage))//")", & + this%tmr_mpi_wait(stage, unit + 1)) call MPI_WaitAll(this%senders%size, rcv_req, rcv_stat, ierr) - call g_timer%stop(this%tmr_mpi_wait(stage, unit + 1)) + call g_prof%stop(this%tmr_mpi_wait(stage, unit + 1)) call CHECK_MPI(ierr) deallocate (rcv_req, snd_req, rcv_stat) @@ -447,10 +447,10 @@ subroutine compose_messages(this, unit, stage, body_snd_t, body_rcv_t) end do ! wait for exchange of all headers - call g_timer%start("MPI_WaitAll ("//trim(STG_TO_STR(stage))//")", & - this%tmr_mpi_wait(stage, unit + 1)) + call g_prof%start("MPI_WaitAll ("//trim(STG_TO_STR(stage))//")", & + this%tmr_mpi_wait(stage, unit + 1)) call MPI_WaitAll(this%receivers%size, rcv_req, rcv_stat, ierr) - call g_timer%stop(this%tmr_mpi_wait(stage, unit + 1)) + call g_prof%stop(this%tmr_mpi_wait(stage, unit + 1)) call CHECK_MPI(ierr) ! reinit handles @@ -520,10 +520,10 @@ subroutine compose_messages(this, unit, stage, body_snd_t, body_rcv_t) end do ! wait on receiving maps - call g_timer%start("MPI_WaitAll ("//trim(STG_TO_STR(stage))//")", & - this%tmr_mpi_wait(stage, unit + 1)) + call g_prof%start("MPI_WaitAll ("//trim(STG_TO_STR(stage))//")", & + this%tmr_mpi_wait(stage, unit + 1)) call MPI_WaitAll(this%receivers%size, rcv_req, rcv_stat, ierr) - call g_timer%stop(this%tmr_mpi_wait(stage, unit + 1)) + call g_prof%stop(this%tmr_mpi_wait(stage, unit + 1)) call CHECK_MPI(ierr) ! print maps diff --git a/src/Distributed/MpiRunControl.F90 b/src/Distributed/MpiRunControl.F90 index 2c3a6613e7d..3c291a95370 100644 --- a/src/Distributed/MpiRunControl.F90 +++ b/src/Distributed/MpiRunControl.F90 @@ -9,7 +9,7 @@ module MpiRunControlModule use MpiWorldModule use SimVariablesModule, only: proc_id, nr_procs use SimStagesModule - use CpuTimerModule + use ProfilerModule use KindModule, only: I4B, LGP, DP use STLVecIntModule use NumericalSolutionModule @@ -57,11 +57,11 @@ subroutine mpi_ctrl_start(this) ! add timed section for parallel stuff and start timers tmr_func => mpi_walltime call set_timer_func(tmr_func) - call g_timer%initialize() - call g_timer%start("Run", SECTION_RUN) - call g_timer%start("Initialize", SECTION_INIT) + call g_prof%initialize() + call g_prof%start("Run", SECTION_RUN) + call g_prof%start("Initialize", SECTION_INIT) tmr_init_par = -1 - call g_timer%start("Initialize parallel", tmr_init_par) + call g_prof%start("Initialize parallel", tmr_init_par) ! set mpi abort function pstop_alternative => mpi_stop @@ -115,7 +115,7 @@ subroutine mpi_ctrl_start(this) if (wait_dbg) call this%wait_for_debugger() ! done with parallel pre-work - call g_timer%stop(tmr_init_par) + call g_prof%stop(tmr_init_par) ! start everything else by calling parent call this%RunControlType%start() @@ -148,7 +148,7 @@ subroutine mpi_ctrl_finish(this) ! timer tmr_final_par = -1 - call g_timer%start("Finalize parallel", tmr_final_par) + call g_prof%start("Finalize parallel", tmr_final_par) ! finish mpi #if defined(__WITH_PETSC__) @@ -163,7 +163,7 @@ subroutine mpi_ctrl_finish(this) pstop_alternative => null() - call g_timer%stop(tmr_final_par) + call g_prof%stop(tmr_final_par) ! finish everything else by calling parent call this%RunControlType%finish() diff --git a/src/Idm/sim-namidm.f90 b/src/Idm/sim-namidm.f90 index 21d91522715..8ba7f74b3a8 100644 --- a/src/Idm/sim-namidm.f90 +++ b/src/Idm/sim-namidm.f90 @@ -15,6 +15,7 @@ module SimNamInputModule logical :: continue = .false. logical :: nocheck = .false. logical :: prmem = .false. + logical :: prprof = .false. logical :: maxerrors = .false. logical :: print_input = .false. logical :: hpc_filerecord = .false. @@ -97,6 +98,24 @@ module SimNamInputModule .false. & ! timeseries ) + type(InputParamDefinitionType), parameter :: & + simnam_prprof = InputParamDefinitionType & + ( & + 'SIM', & ! component + 'NAM', & ! subcomponent + 'OPTIONS', & ! block + 'PROFILE_OPTION', & ! tag name + 'PRPROF', & ! fortran variable + 'STRING', & ! type + '', & ! shape + 'profiling option', & ! longname + .false., & ! required + .false., & ! multi-record + .false., & ! preserve case + .false., & ! layered + .false. & ! timeseries + ) + type(InputParamDefinitionType), parameter :: & simnam_maxerrors = InputParamDefinitionType & ( & @@ -427,6 +446,7 @@ module SimNamInputModule simnam_continue, & simnam_nocheck, & simnam_prmem, & + simnam_prprof, & simnam_maxerrors, & simnam_print_input, & simnam_hpc_filerecord, & diff --git a/src/Idm/utl-ncfidm.f90 b/src/Idm/utl-ncfidm.f90 index 3df61da5b00..4b01f8388ac 100644 --- a/src/Idm/utl-ncfidm.f90 +++ b/src/Idm/utl-ncfidm.f90 @@ -99,8 +99,8 @@ module UtlNcfInputModule 'INTEGER', & ! type '', & ! shape 'chunking parameter for the time dimension', & ! longname - .true., & ! required - .true., & ! multi-record + .false., & ! required + .false., & ! multi-record .false., & ! preserve case .false., & ! layered .false. & ! timeseries @@ -118,7 +118,7 @@ module UtlNcfInputModule '', & ! shape 'chunking parameter for the mesh face dimension', & ! longname .false., & ! required - .true., & ! multi-record + .false., & ! multi-record .false., & ! preserve case .false., & ! layered .false. & ! timeseries @@ -136,7 +136,7 @@ module UtlNcfInputModule '', & ! shape 'chunking parameter for structured z', & ! longname .false., & ! required - .true., & ! multi-record + .false., & ! multi-record .false., & ! preserve case .false., & ! layered .false. & ! timeseries @@ -154,7 +154,7 @@ module UtlNcfInputModule '', & ! shape 'chunking parameter for structured y', & ! longname .false., & ! required - .true., & ! multi-record + .false., & ! multi-record .false., & ! preserve case .false., & ! layered .false. & ! timeseries @@ -172,7 +172,7 @@ module UtlNcfInputModule '', & ! shape 'chunking parameter for structured x', & ! longname .false., & ! required - .true., & ! multi-record + .false., & ! multi-record .false., & ! preserve case .false., & ! layered .false. & ! timeseries diff --git a/src/RunControl.f90 b/src/RunControl.f90 index 5232a783111..b11e919a7a3 100644 --- a/src/RunControl.f90 +++ b/src/RunControl.f90 @@ -5,7 +5,7 @@ module RunControlModule use MapperModule use ListsModule, only: basesolutionlist use NumericalSolutionModule, only: NumericalSolutionType - use CpuTimerModule + use ProfilerModule implicit none private @@ -41,10 +41,10 @@ subroutine ctrl_start(this) class(RunControlType) :: this ! initialize and start timers, if not done so in the derived class - if (.not. g_timer%is_initialized()) then - call g_timer%initialize() - call g_timer%start("Run", SECTION_RUN) - call g_timer%start("Initialize", SECTION_INIT) + if (.not. g_prof%is_initialized()) then + call g_prof%initialize() + call g_prof%start("Run", SECTION_RUN) + call g_prof%start("Initialize", SECTION_INIT) end if allocate (this%virtual_data_mgr) @@ -64,12 +64,12 @@ subroutine ctrl_finish(this) ! -- Write memory usage, elapsed time and terminate call mem_write_usage(iout) call mem_da() - call elapsed_time(iout, 1) ! stop and print timings - call g_timer%stop(SECTION_RUN) - call g_timer%print_timings() + call g_prof%stop(SECTION_RUN) + call g_prof%print(iout) + call elapsed_time(iout, 1) call final_message() end subroutine ctrl_finish diff --git a/src/SimulationCreate.f90 b/src/SimulationCreate.f90 index fd23108b6d0..2a02bc0ca25 100644 --- a/src/SimulationCreate.f90 +++ b/src/SimulationCreate.f90 @@ -98,12 +98,13 @@ subroutine options_create() use MemoryManagerModule, only: mem_setptr use SimVariablesModule, only: idm_context use MemoryManagerModule, only: mem_set_print_option + use ProfilerModule, only: g_prof use SimVariablesModule, only: isimcontinue, isimcheck ! -- dummy ! -- locals character(len=LENMEMPATH) :: input_mempath integer(I4B), pointer :: simcontinue, nocheck, maxerror - character(len=:), pointer :: prmem + character(len=:), pointer :: prmem, prprof character(len=LINELENGTH) :: errmsg ! ! -- set input memory path @@ -113,6 +114,7 @@ subroutine options_create() call mem_setptr(simcontinue, 'CONTINUE', input_mempath) call mem_setptr(nocheck, 'NOCHECK', input_mempath) call mem_setptr(prmem, 'PRMEM', input_mempath) + call mem_setptr(prprof, 'PRPROF', input_mempath) call mem_setptr(maxerror, 'MAXERRORS', input_mempath) ! ! -- update sim options @@ -127,6 +129,10 @@ subroutine options_create() call store_error(errmsg, .true.) end if end if + + ! set profiler print option + call g_prof%set_print_option(prprof) + ! ! -- log values to list file if (iout > 0) then diff --git a/src/Solution/NumericalSolution.f90 b/src/Solution/NumericalSolution.f90 index 75888ab50b5..04a5ec38aa0 100644 --- a/src/Solution/NumericalSolution.f90 +++ b/src/Solution/NumericalSolution.f90 @@ -4,7 +4,7 @@ module NumericalSolutionModule use KindModule, only: DP, I4B, LGP use ErrorUtilModule, only: pstop use TimerModule, only: code_timer - use CpuTimerModule + use ProfilerModule use ConstantsModule, only: LINELENGTH, LENSOLUTIONNAME, LENPAKLOC, & DPREC, DZERO, DEM20, DEM15, DEM6, & DEM4, DEM3, DEM2, DEM1, DHALF, DONETHIRD, & @@ -129,12 +129,14 @@ module NumericalSolutionModule real(DP), pointer :: ptcdel0 => null() !< initial PTC delta value real(DP), pointer :: ptcexp => null() !< PTC exponent ! - ! -- timer - integer(I4B) :: tmr_prep_solve !< timer - prepare solve - integer(I4B) :: tmr_solve !< timer - solve - integer(I4B) :: tmr_final_solve !< timer - finalize solve - integer(I4B) :: tmr_formulate !< timer - formulate - integer(I4B) :: tmr_linsolve !< timer - linear solve + ! -- timer handles + integer(I4B) :: tmr_prep_solve = -1 !< timer - prepare solve + integer(I4B) :: tmr_solve = -1 !< timer - solve + integer(I4B) :: tmr_final_solve = -1 !< timer - finalize solve + integer(I4B) :: tmr_formulate = -1 !< timer - formulate + integer(I4B) :: tmr_linsolve = -1 !< timer - linear solve + integer(I4B) :: tmr_flows = -1 !< timer - calculate flows + integer(I4B) :: tmr_budgets = -1 !< timer - calculate budgets character(len=24) :: id_postfix !< solution id based postfix for timer titles ! ! -- adaptive time step @@ -504,7 +506,7 @@ subroutine sln_df(this) call this%sln_connect() ! add timers - write(this%id_postfix,'(a,i0,a)') " (", this%id, ")" + write (this%id_postfix, '(a,i0,a)') " (", this%id, ")" this%tmr_prep_solve = -1 this%tmr_solve = -1 this%tmr_final_solve = -1 @@ -1440,7 +1442,7 @@ subroutine prepareSolve(this) class(NumericalModelType), pointer :: mp => null() ! start timer - call g_timer%start("Prepare solve"//this%id_postfix, this%tmr_prep_solve) + call g_prof%start("Prepare solve"//this%id_postfix, this%tmr_prep_solve) ! synchronize for AD call this%synchronize(STG_BFR_EXG_AD, this%synchronize_ctx) @@ -1461,7 +1463,7 @@ subroutine prepareSolve(this) call this%sln_ad() ! stop timer - call g_timer%stop(this%tmr_prep_solve) + call g_prof%stop(this%tmr_prep_solve) end subroutine prepareSolve @@ -1519,7 +1521,7 @@ subroutine solve(this, kiter) real(DP) :: outer_hncg ! start timer - call g_timer%start("Solve"//this%id_postfix, this%tmr_solve) + call g_prof%start("Solve"//this%id_postfix, this%tmr_solve) ! ! -- initialize local variables @@ -1574,7 +1576,7 @@ subroutine solve(this, kiter) end if ! call code_timer(0, ttform, this%ttform) - call g_timer%start("Formulate", this%tmr_formulate) + call g_prof%start("Formulate", this%tmr_formulate) ! ! -- (re)build the solution matrix call this%sln_buildsystem(kiter, inewton=1) @@ -1588,13 +1590,13 @@ subroutine solve(this, kiter) call mp%model_nr(kiter, this%system_matrix, 1) end do call code_timer(1, ttform, this%ttform) - call g_timer%stop(this%tmr_formulate) + call g_prof%stop(this%tmr_formulate) ! ! -- linear solve call code_timer(0, ttsoln, this%ttsoln) - call g_timer%start("Linear solve", this%tmr_linsolve) + call g_prof%start("Linear solve", this%tmr_linsolve) call this%sln_ls(kiter, kstp, kper, iter, iptc, ptcf) - call g_timer%stop(this%tmr_linsolve) + call g_prof%stop(this%tmr_linsolve) call code_timer(1, ttsoln, this%ttsoln) ! ! -- increment counters storing the total number of linear and @@ -1828,7 +1830,7 @@ subroutine solve(this, kiter) end if ! stop timer - call g_timer%stop(this%tmr_solve) + call g_prof%stop(this%tmr_solve) end subroutine solve @@ -1856,9 +1858,9 @@ subroutine finalizeSolve(this, kiter, isgcnvg, isuppress_output) character(len=*), parameter :: fmtcnvg = & "(1X, I0, ' CALLS TO NUMERICAL SOLUTION ', 'IN TIME STEP ', I0, & &' STRESS PERIOD ',I0,/1X,I0,' TOTAL ITERATIONS')" - + ! start timer - call g_timer%start("Finalize solve"//this%id_postfix, this%tmr_final_solve) + call g_prof%start("Finalize solve"//this%id_postfix, this%tmr_final_solve) ! ! -- finalize the outer iteration table @@ -1895,6 +1897,9 @@ subroutine finalizeSolve(this, kiter, isgcnvg, isuppress_output) ! ! -- set solution group convergence flag if (this%icnvg == 0) isgcnvg = 0 + + call g_prof%start("Calculate flows", this%tmr_flows) + ! ! -- Calculate flow for each model do im = 1, this%modellist%Count() @@ -1907,6 +1912,10 @@ subroutine finalizeSolve(this, kiter, isgcnvg, isuppress_output) cp => GetNumericalExchangeFromList(this%exchangelist, ic) call cp%exg_cq(isgcnvg, isuppress_output, this%id) end do + + call g_prof%stop(this%tmr_flows) + call g_prof%start("Calculate budgets", this%tmr_budgets) + ! ! -- Budget terms for each model do im = 1, this%modellist%Count() @@ -1921,7 +1930,8 @@ subroutine finalizeSolve(this, kiter, isgcnvg, isuppress_output) end do ! stop timer - call g_timer%stop(this%tmr_final_solve) + call g_prof%stop(this%tmr_budgets) + call g_prof%stop(this%tmr_final_solve) end subroutine finalizeSolve diff --git a/src/Solution/ParallelSolution.f90 b/src/Solution/ParallelSolution.f90 index 22d5353d522..6d7de0834ac 100644 --- a/src/Solution/ParallelSolution.f90 +++ b/src/Solution/ParallelSolution.f90 @@ -1,6 +1,7 @@ module ParallelSolutionModule use KindModule, only: DP, LGP, I4B use ConstantsModule, only: LENPAKLOC, DONE, DZERO + use ProfilerModule use NumericalSolutionModule, only: NumericalSolutionType use mpi use MpiWorldModule @@ -10,6 +11,13 @@ module ParallelSolutionModule public :: ParallelSolutionType type, extends(NumericalSolutionType) :: ParallelSolutionType + integer(I4B) :: tmr_convergence = -1 !< timer for convergence check + integer(I4B) :: tmr_pkg_cnvg = -1 !< timer for package convergence check + integer(I4B) :: tmr_sync_nur = -1 !< timer for NUR synchronization + integer(I4B) :: tmr_nur_cnvg = -1 !< timer for NUR convergence check + integer(I4B) :: tmr_calcptc = -1 !< timer for PTC calculation + integer(I4B) :: tmr_underrelax = -1 !< timer for underrelaxation + integer(I4B) :: tmr_backtracking = -1 !< timer for backtracking contains ! override procedure :: sln_has_converged => par_has_converged @@ -37,6 +45,8 @@ function par_has_converged(this, max_dvc) result(has_converged) integer :: ierr type(MpiWorldType), pointer :: mpi_world + call g_prof%start("Parallel Solution (cnvg check)", this%tmr_convergence) + mpi_world => get_mpi_world() has_converged = .false. @@ -48,6 +58,8 @@ function par_has_converged(this, max_dvc) result(has_converged) has_converged = .true. end if + call g_prof%stop(this%tmr_convergence) + end function par_has_converged function par_package_convergence(this, dpak, cpakout, iend) & @@ -62,6 +74,8 @@ function par_package_convergence(this, dpak, cpakout, iend) & integer :: ierr type(MpiWorldType), pointer :: mpi_world + call g_prof%start("Parallel Solution (package cnvg)", this%tmr_pkg_cnvg) + mpi_world => get_mpi_world() icnvg_local = & @@ -71,6 +85,8 @@ function par_package_convergence(this, dpak, cpakout, iend) & MPI_MIN, mpi_world%comm, ierr) call CHECK_MPI(ierr) + call g_prof%stop(this%tmr_pkg_cnvg) + end function par_package_convergence function par_sync_newtonur_flag(this, inewtonur) result(ivalue) @@ -81,11 +97,15 @@ function par_sync_newtonur_flag(this, inewtonur) result(ivalue) integer :: ierr type(MpiWorldType), pointer :: mpi_world + call g_prof%start("Parallel Solution (NUR)", this%tmr_sync_nur) + mpi_world => get_mpi_world() call MPI_Allreduce(inewtonur, ivalue, 1, MPI_INTEGER, & MPI_MAX, mpi_world%comm, ierr) call CHECK_MPI(ierr) + call g_prof%stop(this%tmr_sync_nur) + end function par_sync_newtonur_flag function par_nur_has_converged(this, dxold_max, hncg) & @@ -100,6 +120,8 @@ function par_nur_has_converged(this, dxold_max, hncg) & integer :: ierr type(MpiWorldType), pointer :: mpi_world + call g_prof%start("Parallel Solution (NUR cnvg)", this%tmr_nur_cnvg) + mpi_world => get_mpi_world() has_converged = .false. @@ -114,6 +136,8 @@ function par_nur_has_converged(this, dxold_max, hncg) & call CHECK_MPI(ierr) if (icnvg_global == 1) has_converged = .true. + call g_prof%stop(this%tmr_nur_cnvg) + end function par_nur_has_converged !> @brief Calculate pseudo-transient continuation factor @@ -128,6 +152,8 @@ subroutine par_calc_ptc(this, iptc, ptcf) integer :: ierr type(MpiWorldType), pointer :: mpi_world + call g_prof%start("Parallel Solution (PTC calc)", this%tmr_calcptc) + mpi_world => get_mpi_world() call this%NumericalSolutionType%sln_calc_ptc(iptc_loc, ptcf_loc) if (iptc_loc == 0) ptcf_loc = DZERO @@ -144,6 +170,8 @@ subroutine par_calc_ptc(this, iptc, ptcf) ptcf = ptcf_glo_max end if + call g_prof%stop(this%tmr_calcptc) + end subroutine par_calc_ptc !> @brief apply under-relaxation in sync over all processes @@ -161,6 +189,8 @@ subroutine par_underrelax(this, kiter, bigch, neq, active, x, xtemp) integer :: ierr type(MpiWorldType), pointer :: mpi_world + call g_prof%start("Parallel Solution (underrelax)", this%tmr_underrelax) + mpi_world => get_mpi_world() ! first reduce largest change over all processes @@ -179,6 +209,8 @@ subroutine par_underrelax(this, kiter, bigch, neq, active, x, xtemp) call this%NumericalSolutionType%sln_underrelax(kiter, dvc_global_max, & neq, active, x, xtemp) + call g_prof%stop(this%tmr_underrelax) + end subroutine par_underrelax !> @brief synchronize backtracking flag over processes @@ -192,6 +224,8 @@ subroutine par_backtracking_xupdate(this, bt_flag) type(MpiWorldType), pointer :: mpi_world integer :: ierr + call g_prof%start("Parallel Solution (backtrack)", this%tmr_backtracking) + mpi_world => get_mpi_world() ! get local bt flag @@ -207,6 +241,8 @@ subroutine par_backtracking_xupdate(this, bt_flag) call this%NumericalSolutionType%apply_backtracking() end if + call g_prof%stop(this%tmr_backtracking) + end subroutine par_backtracking_xupdate end module ParallelSolutionModule diff --git a/src/Utilities/Timer/CpuTimer.f90 b/src/Utilities/Performance/Profiler.f90 similarity index 52% rename from src/Utilities/Timer/CpuTimer.f90 rename to src/Utilities/Performance/Profiler.f90 index 05b4407b76f..ab5ab4fc968 100644 --- a/src/Utilities/Timer/CpuTimer.f90 +++ b/src/Utilities/Performance/Profiler.f90 @@ -1,6 +1,7 @@ -module CpuTimerModule +module ProfilerModule use KindModule, only: I4B, DP, LGP - use ConstantsModule, only: DNODATA, DZERO + use ConstantsModule, only: DNODATA, DZERO, LENMEMPATH, LINELENGTH + use STLStackIntModule use STLVecIntModule implicit none private @@ -20,42 +21,52 @@ module CpuTimerModule integer(I4B), public :: SECTION_FINAL_TSTP = -1 ! constants for memory allocation - integer(I4B), parameter :: MAX_NR_TIMED_SECTIONS = 50 + integer(I4B), parameter :: MAX_NR_TIMED_SECTIONS = 75 integer(I4B), public, parameter :: LEN_SECTION_TITLE = 128 ! data structure to store measurements for a section - type, private :: TimedSectionType + type, private :: MeasuredSectionType character(len=LEN_SECTION_TITLE) :: title !< title to identify timed section in log real(DP) :: walltime !< walltime spent in section integer(I4B) :: count !< number of times section was entered integer(I4B) :: status !< =1 means section timer started, =0 otherwise integer(I4B) :: parent_id !< id of parent, or 0 when root type(STLVecInt) :: children !< ids of children - end type TimedSectionType + end type MeasuredSectionType - ! this is the timer object - type, public :: CpuTimerType + !> @brief A public type for profiling performance in the application. + !! The ProfilerType is used to measure and record the performance of various + !! parts of the application. It provides mechanisms to start, stop, and + !< report on the performance metrics collected during execution. + type, public :: ProfilerType private - integer(I4B) :: nr_sections - integer(I4B) :: root_id !< - type(TimedSectionType), dimension(:), pointer :: all_sections => null() !< all timed sections, dynamic up to MAX_NR_TIMED_SECTIONS - type(STLVecInt) :: callstack !< call stack of section ids + integer(I4B) :: iout !< output unit number, typically simulation listing file + integer(I4B) :: pr_option !< 0 = NONE, 1 = SUMMARY, 2 = DETAIL + integer(I4B) :: nr_sections !< number of sections + integer(I4B), dimension(3) :: top_three !< top three leaf sections based on walltime + integer(I4B) :: max_title_len !< maximum title length + integer(I4B) :: root_id + type(MeasuredSectionType), dimension(:), pointer :: all_sections => null() !< all timed sections (up to MAX_NR_TIMED_SECTIONS) + type(STLStackInt) :: callstack !< call stack of section ids contains procedure :: initialize procedure :: add_section procedure :: start procedure :: stop - procedure :: print_timings + procedure :: print procedure :: destroy procedure :: is_initialized + procedure :: set_print_option ! private procedure, private :: print_section procedure, private :: print_total procedure, private :: aggregate_walltime procedure, private :: aggregate_counts - end type CpuTimerType + procedure, private :: largest_title_length + procedure, private :: sort_by_walltime + end type ProfilerType - type(CpuTimerType), public :: g_timer !< the global timer object (to reduce trivial lines of code) + type(ProfilerType), public :: g_prof !< the global timer object (to reduce trivial lines of code) public :: set_timer_func, timer_func_iface procedure(timer_func_iface), private, pointer :: g_timer_func => serial_timer ! typically set to use MPI_Wtime for parallel, using the method below @@ -77,17 +88,16 @@ subroutine set_timer_func(timer_func) end subroutine set_timer_func - !< @brief Initialize the CPU timer object !< subroutine initialize(this) - class(CpuTimerType) :: this + class(ProfilerType) :: this ! local integer(I4B) :: i call this%callstack%init() - allocate(this%all_sections(MAX_NR_TIMED_SECTIONS)) + allocate (this%all_sections(MAX_NR_TIMED_SECTIONS)) do i = 1, MAX_NR_TIMED_SECTIONS this%all_sections(i)%title = "undefined" this%all_sections(i)%status = 0 @@ -96,9 +106,10 @@ subroutine initialize(this) this%all_sections(i)%parent_id = 0 call this%all_sections(i)%children%init() end do - + this%nr_sections = 0 this%root_id = 0 + this%top_three = [0, 0, 0] end subroutine initialize @@ -106,7 +117,8 @@ end subroutine initialize !! passing the parent id will add it as a child !< in the tree function add_section(this, title, parent_id) result(section_id) - class(CpuTimerType) :: this + use SimModule, only: ustop + class(ProfilerType) :: this character(len=*) :: title integer(I4B) :: parent_id integer(I4B) :: section_id @@ -114,12 +126,17 @@ function add_section(this, title, parent_id) result(section_id) ! increment to new section id this%nr_sections = this%nr_sections + 1 section_id = this%nr_sections + if (section_id > size(this%all_sections)) then + write (*, *) "Internal error: Too many profiled sections, "& + &"increase MAX_NR_TIMED_SECTIONS" + call ustop() + end if ! initialize new section this%all_sections(section_id)%title = title this%all_sections(section_id)%walltime = DZERO this%all_sections(section_id)%status = 0 - + ! if parent, otherwise root section if (parent_id > 0) then ! add child to parent @@ -136,24 +153,25 @@ end function add_section !> @brief Start section timing, add when not exist yet (i.e. when id < 1) !< subroutine start(this, title, section_id) - class(CpuTimerType) :: this + class(ProfilerType) :: this character(len=*) :: title - integer(I4B) :: section_id, parent_id + integer(I4B) :: section_id ! local + integer(I4B) :: parent_id real(DP) :: start_time - type(TimedSectionType), pointer :: section + type(MeasuredSectionType), pointer :: section call cpu_time(start_time) - + if (section_id < 1) then ! add section if not exist parent_id = 0 ! root - if (this%callstack%size > 0) then - parent_id = this%callstack%at(this%callstack%size) + if (this%callstack%size() > 0) then + parent_id = this%callstack%top() end if section_id = this%add_section(title, parent_id) end if - call this%callstack%push_back(section_id) + call this%callstack%push(section_id) section => this%all_sections(section_id) section%count = section%count + 1 @@ -163,11 +181,11 @@ subroutine start(this, title, section_id) end subroutine start subroutine stop(this, section_id) - class(CpuTimerType) :: this + class(ProfilerType) :: this integer(I4B) :: section_id ! local real(DP) :: end_time - type(TimedSectionType), pointer :: section + type(MeasuredSectionType), pointer :: section call cpu_time(end_time) @@ -181,33 +199,66 @@ subroutine stop(this, section_id) end subroutine stop - subroutine print_timings(this) - class(CpuTimerType) :: this + subroutine print(this, output_unit) + class(ProfilerType) :: this + integer(I4B), intent(in) :: output_unit ! local - integer(I4B) :: level + integer(I4B) :: level, i, top_idx + integer(I4B), dimension(:), allocatable :: sorted_idxs - ! print timing call stack - level = 0 - write(*,'(a/)') "-------------------- Timing: Call Stack --------------------" - call this%print_section(this%root_id, level) + this%iout = output_unit + if (this%pr_option == 0) return + + ! get top three leaf sections based on walltime + top_idx = 1 + sorted_idxs = (/(i, i=1, this%nr_sections)/) + call this%sort_by_walltime(sorted_idxs) + do i = 1, this%nr_sections + if (this%all_sections(sorted_idxs(i))%children%size == 0) then ! leaf node + if (top_idx > 3) exit + this%top_three(top_idx) = sorted_idxs(i) + top_idx = top_idx + 1 + end if + end do + + this%max_title_len = this%largest_title_length() + + if (this%pr_option > 1) then + ! print timing call stack + level = 0 + write (this%iout, '(/1x,a/)') & + repeat('-', 18)//" Profiler: Call Stack "//repeat('-', 18) + call this%print_section(this%root_id, level) + end if ! print walltime per category from substring (if exist) - write(*,'(a/)') "-------------------- Timing: Cumulative --------------------" + ! note: the sections containing the substring should not be nested, + ! otherwise the walltime will be counted multiple times + write (this%iout, '(1x,a/)') & + repeat('-', 20)//" Profiler: Totals "//repeat('-', 20) call this%print_total("Formulate") call this%print_total("Linear solve") + call this%print_total("Calculate flows") + call this%print_total("Calculate budgets") + call this%print_total("Write output") + call this%print_total("Parallel Solution") call this%print_total("MPI_WaitAll") - write(*,'(/a/)') "------------------------------------------------------------" + write (this%iout, '(/1x,a/)') & + repeat('-', 22)//" End Profiler "//repeat('-', 22) - end subroutine print_timings + end subroutine print recursive subroutine print_section(this, section_id, level) - class(CpuTimerType) :: this + use ArrayHandlersModule, only: ifind + class(ProfilerType) :: this integer(I4B) :: section_id integer(I4B) :: level ! local - integer(I4B) :: i, new_level + integer(I4B) :: i, new_level, nr_padding, idx_top real(DP) :: percent - type(TimedSectionType), pointer :: section + type(MeasuredSectionType), pointer :: section + character(len=:), allocatable :: title_padded + character(len=LINELENGTH) :: top_marker section => this%all_sections(section_id) @@ -218,37 +269,50 @@ recursive subroutine print_section(this, section_id, level) end if percent = percent * 100.0_DP + ! determine if section should be marked as top three + top_marker = "" + idx_top = ifind(this%top_three, section_id) + if (idx_top > 0) then + nr_padding = max(0, 32 - level * 4) + write (top_marker, '(a,i0)') repeat(" ", nr_padding)//"<== #", idx_top + end if + ! print section timing - write(*,'(3a,f0.2,3a,f14.6,2a,i0,a)') & - " ", repeat('....', level), "[", percent, "%] ", & - trim(section%title), ": ", section%walltime, "s", " (", & - section%count, "x)" - + nr_padding = this%max_title_len - len_trim(section%title) + 2 + title_padded = trim(section%title)//":"//repeat(' ', nr_padding) + write (this%iout, '(3a,f6.2,2a,f14.6,2a,i0,a,a)') " ", & + repeat('....', level), "[", percent, "%] ", title_padded, & + section%walltime, "s", " (", section%count, "x)", trim(top_marker) + ! print children new_level = level + 1 do i = 1, section%children%size call this%print_section(section%children%at(i), new_level) end do - if (level == 0) write(*,*) + if (level == 0) write (this%iout, *) end subroutine print_section subroutine print_total(this, subtitle) - class(CpuTimerType) :: this + class(ProfilerType) :: this character(len=*) :: subtitle ! local integer(I4B) :: count real(DP) :: walltime, percent - + integer(I4B) :: nr_padding + character(len=:), allocatable :: title_padded + + ! get maximum length of title + nr_padding = this%max_title_len - len_trim(subtitle) + title_padded = trim(subtitle)//repeat(' ', nr_padding) + count = this%aggregate_counts(subtitle) if (count > 0) then walltime = aggregate_walltime(this, subtitle) percent = (walltime / this%all_sections(this%root_id)%walltime) * 100.0_DP - write(*,'(2a,f0.2,3a,f14.6,2a,i0,a)') & - " ", "[", percent, "%] ", & - trim(subtitle), ": ", walltime, "s", " (", & - count, "x)" + write (this%iout, '(2a,f6.2,3a,f14.6,2a,i0,a)') " ", "[", percent, & + "%] ", title_padded, ": ", walltime, "s", " (", count, "x)" end if end subroutine print_total @@ -256,7 +320,7 @@ end subroutine print_total !> @brief Aggregate walltime over sections with a certain title !< function aggregate_walltime(this, title) result(walltime) - class(CpuTimerType) :: this + class(ProfilerType) :: this character(len=*) :: title real(DP) :: walltime ! local @@ -274,7 +338,7 @@ end function aggregate_walltime !> @brief Aggregate counts over sections with a certain title !< function aggregate_counts(this, title) result(counts) - class(CpuTimerType) :: this + class(ProfilerType) :: this character(len=*) :: title integer(I4B) :: counts ! local @@ -289,13 +353,32 @@ function aggregate_counts(this, title) result(counts) end function aggregate_counts + !> @brief Set the profile option from the user input + !< + subroutine set_print_option(this, profile_option) + class(ProfilerType) :: this + character(len=*), intent(in) :: profile_option + + select case (trim(profile_option)) + case ("NONE") + this%pr_option = 0 + case ("SUMMARY") + this%pr_option = 1 + case ("DETAIL") + this%pr_option = 2 + case default + this%pr_option = 0 + end select + + end subroutine set_print_option + !> @brief Clean up the CPU timer object !< subroutine destroy(this) - class(CpuTimerType) :: this + class(ProfilerType) :: this ! local integer(I4B) :: i - + call this%callstack%destroy() do i = 1, this%nr_sections @@ -307,7 +390,7 @@ subroutine destroy(this) end subroutine destroy function is_initialized(this) result(initialized) - class(CpuTimerType) :: this + class(ProfilerType) :: this logical(LGP) :: initialized initialized = associated(this%all_sections) @@ -319,4 +402,39 @@ subroutine serial_timer(walltime) call cpu_time(walltime) end subroutine serial_timer -end module CpuTimerModule \ No newline at end of file + !> @brief Calculate the largest title length + !< + function largest_title_length(this) result(max_length) + class(ProfilerType) :: this + integer(I4B) :: max_length + integer(I4B) :: i + + max_length = 0 + do i = 1, this%nr_sections + max_length = max(max_length, len_trim(this%all_sections(i)%title)) + end do + + end function largest_title_length + + !> @brief Sort section indexes based on walltime + !< + subroutine sort_by_walltime(this, idxs) + class(ProfilerType) :: this + integer(I4B), dimension(:), allocatable :: idxs !< array with unsorted section idxs + integer(I4B) :: i, j, temp + + ! Simple bubble sort for demonstration purposes + do i = 1, size(idxs) - 1 + do j = 1, size(idxs) - i + if (this%all_sections(idxs(j))%walltime < & + this%all_sections(idxs(j + 1))%walltime) then + temp = idxs(j) + idxs(j) = idxs(j + 1) + idxs(j + 1) = temp + end if + end do + end do + + end subroutine sort_by_walltime + +end module ProfilerModule diff --git a/src/Utilities/STLStackInt.f90 b/src/Utilities/STLStackInt.f90 new file mode 100644 index 00000000000..9cb1a9fe768 --- /dev/null +++ b/src/Utilities/STLStackInt.f90 @@ -0,0 +1,86 @@ +module STLStackIntModule + use KindModule, only: I4B, LGP + use STLVecIntModule + use SimModule, only: ustop + implicit none + private + public :: STLStackInt + + !> @brief A derived type representing a stack of integers. + !! + !! This type provides a stack data structure specifically for integers. + !! It includes methods for typical stack operations such as push, pop, + !< and checking if the stack is empty. + type :: STLStackInt + type(STLVecInt), private :: stack !< the internal stack + contains + procedure, pass(this) :: init !< allocate memory, init size and capacity + procedure, pass(this) :: destroy !< deletes the memory + procedure, pass(this) :: push !< adds an element at the end of the vector + procedure, pass(this) :: pop !< removes the last element + procedure, pass(this) :: top !< returns the last element (without removing it) + procedure, pass(this) :: size !< returns the size of the stack + end type STLStackInt + +contains ! module routines + + subroutine init(this, capacity) + class(STLStackInt), intent(inout) :: this + integer(I4B), intent(in), optional :: capacity ! the initial capacity, when given + + ! init the vector + if (present(capacity)) then + call this%stack%init(capacity) + else + call this%stack%init() + end if + + end subroutine init + + subroutine push(this, newValue) + class(STLStackInt), intent(inout) :: this + integer(I4B) :: newValue + + call this%stack%push_back(newValue) + + end subroutine push + + subroutine pop(this) + class(STLStackInt), intent(inout) :: this + + if (this%stack%size == 0) then + write (*, *) 'STLStackInt exception: cannot pop an empty stack' + call ustop() + end if + this%stack%size = this%stack%size - 1 + + end subroutine pop + + function top(this) result(top_value) + class(STLStackInt), intent(in) :: this + integer(I4B) :: top_value + + if (this%stack%size == 0) then + write (*, *) 'STLStackInt exception: cannot get top of an empty stack' + call ustop() + end if + top_value = this%stack%at(this%stack%size) + + end function top + + function size(this) result(size_value) + class(STLStackInt), intent(in) :: this + integer(I4B) :: size_value + + size_value = this%stack%size + + end function size + + subroutine destroy(this) + class(STLStackInt), intent(inout) :: this + + call this%stack%destroy() + + end subroutine destroy + +end module STLStackIntModule diff --git a/src/Utilities/STLVecInt.f90 b/src/Utilities/STLVecInt.f90 index 5bc7e62c65e..322df0a0e2b 100644 --- a/src/Utilities/STLVecInt.f90 +++ b/src/Utilities/STLVecInt.f90 @@ -11,8 +11,8 @@ module STLVecIntModule ! This is a dynamic vector type for integers type :: STLVecInt integer(I4B), private, allocatable :: values(:) !< the internal array for storage - integer(I4B) :: size !< the number of elements (technically this stuff should be unsigned) - integer(I4B) :: capacity !< the reserved storage + integer(I4B) :: size !< the number of elements + integer(I4B), private :: capacity !< the reserved storage contains procedure, pass(this) :: init !< allocate memory, init size and capacity procedure, pass(this) :: push_back !< adds an element at the end of the vector @@ -75,7 +75,7 @@ end subroutine push_back_unique subroutine pop(this) class(STLVecInt), intent(inout) :: this - + if (this%size > 0) then this%size = this%size - 1 else diff --git a/src/meson.build b/src/meson.build index 5e24e70c7c5..fa396802615 100644 --- a/src/meson.build +++ b/src/meson.build @@ -356,7 +356,7 @@ modflow_sources = files( 'Utilities' / 'OutputControl' / 'OutputControl.f90', 'Utilities' / 'OutputControl' / 'OutputControlData.f90', 'Utilities' / 'OutputControl' / 'PrintSaveManager.f90', - 'Utilities' / 'Timer' / 'CpuTimer.f90', + 'Utilities' / 'Performance' / 'Profiler.f90', 'Utilities' / 'TimeSeries' / 'TimeArray.f90', 'Utilities' / 'TimeSeries' / 'TimeArraySeries.f90', 'Utilities' / 'TimeSeries' / 'TimeArraySeriesLink.f90', @@ -411,6 +411,7 @@ modflow_sources = files( 'Utilities' / 'SmoothingFunctions.f90', 'Utilities' / 'sort.f90', 'Utilities' / 'Sparse.f90', + 'Utilities' / 'STLStackInt.f90', 'Utilities' / 'STLVecInt.f90', 'Utilities' / 'StringList.f90', 'Utilities' / 'Table.f90', diff --git a/src/mf6core.f90 b/src/mf6core.f90 index f501b2b94b1..9c874aea850 100644 --- a/src/mf6core.f90 +++ b/src/mf6core.f90 @@ -18,7 +18,7 @@ module Mf6CoreModule use SolutionGroupModule, only: SolutionGroupType, GetSolutionGroupFromList use RunControlModule, only: RunControlType use SimStagesModule - use CpuTimerModule + use ProfilerModule implicit none class(RunControlType), pointer :: run_ctrl => null() !< the run controller for this simulation @@ -99,7 +99,7 @@ subroutine Mf6Initialize() call export_cr() ! -- stop the timer - call g_timer%stop(SECTION_INIT) + call g_prof%stop(SECTION_INIT) end subroutine Mf6Initialize @@ -113,7 +113,7 @@ end subroutine Mf6Initialize function Mf6Update() result(hasConverged) logical(LGP) :: hasConverged ! start timer - call g_timer%start("Update", SECTION_UPDATE) + call g_prof%start("Update", SECTION_UPDATE) ! ! -- prepare timestep call Mf6PrepareTimestep() @@ -125,7 +125,7 @@ function Mf6Update() result(hasConverged) hasConverged = Mf6FinalizeTimestep() ! stop timer - call g_timer%stop(SECTION_UPDATE) + call g_prof%stop(SECTION_UPDATE) end function Mf6Update @@ -158,7 +158,7 @@ subroutine Mf6Finalize() integer(I4B) :: tmr_dealloc ! start timer - call g_timer%start("Finalize", SECTION_FINALIZE) + call g_prof%start("Finalize", SECTION_FINALIZE) ! ! -- FINAL PROCESSING (FP) @@ -182,8 +182,8 @@ subroutine Mf6Finalize() ! start timer for deallocation tmr_dealloc = -1 - call g_timer%start("Deallocate", tmr_dealloc) - + call g_prof%start("Deallocate", tmr_dealloc) + ! ! -- DEALLOCATE (DA) ! -- Deallocate tdis @@ -228,10 +228,10 @@ subroutine Mf6Finalize() call export_da() call simulation_da() call lists_da() - + ! stop timers - call g_timer%stop(tmr_dealloc) - call g_timer%stop(SECTION_FINALIZE) + call g_prof%stop(tmr_dealloc) + call g_prof%stop(SECTION_FINALIZE) ! finish gently (No calls after this) call run_ctrl%finish() @@ -517,7 +517,7 @@ subroutine Mf6PrepareTimestep() integer(I4B) :: is ! start timer - call g_timer%start("Initialize time step", SECTION_PREP_TSTP) + call g_prof%start("Initialize time step", SECTION_PREP_TSTP) ! ! -- initialize fmt @@ -600,7 +600,7 @@ subroutine Mf6PrepareTimestep() call tdis_set_timestep() ! stop timer - call g_timer%stop(SECTION_PREP_TSTP) + call g_prof%stop(SECTION_PREP_TSTP) end subroutine Mf6PrepareTimestep @@ -625,7 +625,7 @@ subroutine Mf6DoTimestep() logical :: finishedTrying ! start timer - call g_timer%start("Do time step", SECTION_DO_TSTP) + call g_prof%start("Do time step", SECTION_DO_TSTP) ! -- By default, the solution groups will be solved once, and ! may fail. But if adaptive stepping is active, then @@ -650,7 +650,7 @@ subroutine Mf6DoTimestep() end do retryloop ! stop timer - call g_timer%stop(SECTION_DO_TSTP) + call g_prof%stop(SECTION_DO_TSTP) end subroutine Mf6DoTimestep @@ -724,13 +724,15 @@ function Mf6FinalizeTimestep() result(hasConverged) integer(I4B) :: ix integer(I4B) :: ic integer(I4B) :: is + integer(I4B), save :: tmr_output = -1 !< timer for output + integer(I4B), save :: tmr_nc_export = -1 !< timer for netcdf output ! ! -- initialize format and line fmt = "(/,a,/)" line = 'end timestep' ! start timer - call g_timer%start("Finalize time step", SECTION_FINAL_TSTP) + call g_prof%start("Finalize time step", SECTION_FINAL_TSTP) ! ! -- evaluate simulation mode @@ -743,6 +745,8 @@ function Mf6FinalizeTimestep() result(hasConverged) call mp%model_message(line, fmt=fmt) end do case (MNORMAL) + + call g_prof%start("Write output", tmr_output) ! ! -- Write output and final message for timestep for each model do im = 1, basemodellist%Count() @@ -770,15 +774,19 @@ function Mf6FinalizeTimestep() result(hasConverged) end do ! ! -- update exports + call g_prof%start("NetCDF export", tmr_nc_export) call export_post_step() + call g_prof%stop(tmr_nc_export) + + call g_prof%stop(tmr_output) end select ! ! -- Check if we're done call converge_check(hasConverged) ! stop timer - call g_timer%stop(SECTION_FINAL_TSTP) - + call g_prof%stop(SECTION_FINAL_TSTP) + end function Mf6FinalizeTimestep end module Mf6CoreModule diff --git a/utils/zonebudget/make/makefile b/utils/zonebudget/make/makefile index 4cb973e9759..7e89c0ae11f 100644 --- a/utils/zonebudget/make/makefile +++ b/utils/zonebudget/make/makefile @@ -1,4 +1,4 @@ -# makefile created by pymake (version 1.2.10.dev0) for the 'zbud6' executable. +# makefile created by pymake (version 1.4.0.dev0) for the 'zbud6' executable. include ./makedefaults