From 553eda5938a06631af60cb6e714e0661337923d9 Mon Sep 17 00:00:00 2001 From: mjr-deltares Date: Thu, 23 Jan 2025 11:57:55 +0100 Subject: [PATCH] - first working version --- src/Distributed/MpiRouter.f90 | 1 + src/Distributed/MpiRunControl.F90 | 35 +++- src/RunControl.f90 | 13 ++ src/Solution/NumericalSolution.f90 | 45 +++++ src/Utilities/Timer/CpuTimer.f90 | 264 +++++++++++++++++++++++++++++ src/meson.build | 1 + src/mf6core.f90 | 47 ++++- 7 files changed, 399 insertions(+), 7 deletions(-) create mode 100644 src/Utilities/Timer/CpuTimer.f90 diff --git a/src/Distributed/MpiRouter.f90 b/src/Distributed/MpiRouter.f90 index 3d5eed1bed6..aa581dba662 100644 --- a/src/Distributed/MpiRouter.f90 +++ b/src/Distributed/MpiRouter.f90 @@ -1,6 +1,7 @@ module MpiRouterModule use RouterBaseModule use KindModule, only: I4B, LGP + use CpuTimerModule, only: g_timer use STLVecIntModule use SimVariablesModule, only: proc_id, nr_procs use SimStagesModule, only: STG_TO_STR diff --git a/src/Distributed/MpiRunControl.F90 b/src/Distributed/MpiRunControl.F90 index 3c95253d8f9..254b67b7e1b 100644 --- a/src/Distributed/MpiRunControl.F90 +++ b/src/Distributed/MpiRunControl.F90 @@ -9,7 +9,8 @@ module MpiRunControlModule use MpiWorldModule use SimVariablesModule, only: proc_id, nr_procs use SimStagesModule - use KindModule, only: I4B, LGP + use CpuTimerModule + use KindModule, only: I4B, LGP, DP use STLVecIntModule use NumericalSolutionModule use RunControlModule, only: RunControlType @@ -42,6 +43,8 @@ end function create_mpi_run_control subroutine mpi_ctrl_start(this) use ErrorUtilModule, only: pstop_alternative + ! local + integer(I4B) :: tmr_init_par class(MpiRunControlType) :: this ! local @@ -49,6 +52,16 @@ subroutine mpi_ctrl_start(this) character(len=*), parameter :: petsc_db_file = '.petscrc' logical(LGP) :: petsc_db_exists, wait_dbg, is_parallel_mode type(MpiWorldType), pointer :: mpi_world + procedure(timer_func_iface), pointer :: tmr_func => null() + + ! add timed section for parallel stuff and start timers + tmr_func => mpi_walltime + call set_timer_func(tmr_func) + call g_timer%initialize() + tmr_init_par = g_timer%add_section("Init parallel", SECTION_INIT) + call g_timer%start(SECTION_RUN) + call g_timer%start(SECTION_INIT) + call g_timer%start(tmr_init_par) ! set mpi abort function pstop_alternative => mpi_stop @@ -101,6 +114,9 @@ subroutine mpi_ctrl_start(this) ! possibly wait to attach debugger here if (wait_dbg) call this%wait_for_debugger() + ! done with parallel pre-work + call g_timer%stop(tmr_init_par) + ! start everything else by calling parent call this%RunControlType%start() @@ -128,6 +144,11 @@ subroutine mpi_ctrl_finish(this) class(MpiRunControlType) :: this ! local integer :: ierr + integer(I4B) :: tmr_final_par + + ! timer + tmr_final_par = g_timer%add_section("Finalize parallel", SECTION_FINALIZE) + call g_timer%start(tmr_final_par) ! finish mpi #if defined(__WITH_PETSC__) @@ -142,6 +163,8 @@ subroutine mpi_ctrl_finish(this) pstop_alternative => null() + call g_timer%stop(tmr_final_par) + ! finish everything else by calling parent call this%RunControlType%finish() @@ -271,4 +294,14 @@ subroutine mpi_ctrl_after_con_cr(this) end subroutine mpi_ctrl_after_con_cr + subroutine mpi_walltime(walltime) + real(DP), intent(inout) :: walltime + ! local + integer :: ierr + + walltime = MPI_Wtime() + call CHECK_MPI(ierr) + + end subroutine mpi_walltime + end module MpiRunControlModule diff --git a/src/RunControl.f90 b/src/RunControl.f90 index e17b1b92360..76303f253a5 100644 --- a/src/RunControl.f90 +++ b/src/RunControl.f90 @@ -5,6 +5,7 @@ module RunControlModule use MapperModule use ListsModule, only: basesolutionlist use NumericalSolutionModule, only: NumericalSolutionType + use CpuTimerModule implicit none private @@ -38,6 +39,13 @@ end function create_seq_run_control subroutine ctrl_start(this) class(RunControlType) :: this + ! initialize and start timers, if not done so in the derived class + if (.not. g_timer%is_initialized()) then + call g_timer%initialize() + call g_timer%start(SECTION_RUN) + call g_timer%start(SECTION_INIT) + end if + allocate (this%virtual_data_mgr) end subroutine ctrl_start @@ -56,6 +64,11 @@ subroutine ctrl_finish(this) call mem_write_usage(iout) call mem_da() call elapsed_time(iout, 1) + + ! stop and print timings + call g_timer%stop(SECTION_RUN) + call g_timer%print_timings() + call final_message() end subroutine ctrl_finish diff --git a/src/Solution/NumericalSolution.f90 b/src/Solution/NumericalSolution.f90 index ddb728916ba..8997fadb925 100644 --- a/src/Solution/NumericalSolution.f90 +++ b/src/Solution/NumericalSolution.f90 @@ -4,6 +4,7 @@ module NumericalSolutionModule use KindModule, only: DP, I4B, LGP use ErrorUtilModule, only: pstop use TimerModule, only: code_timer + use CpuTimerModule use ConstantsModule, only: LINELENGTH, LENSOLUTIONNAME, LENPAKLOC, & DPREC, DZERO, DEM20, DEM15, DEM6, & DEM4, DEM3, DEM2, DEM1, DHALF, DONETHIRD, & @@ -128,6 +129,13 @@ module NumericalSolutionModule real(DP), pointer :: ptcdel0 => null() !< initial PTC delta value real(DP), pointer :: ptcexp => null() !< PTC exponent ! + ! -- timer + integer(I4B) :: tmr_prep_solve !< timer - prepare solve + integer(I4B) :: tmr_solve !< timer - solve + integer(I4B) :: tmr_final_solve !< timer - finalize solve + integer(I4B) :: tmr_formulate !< timer - formulate + integer(I4B) :: tmr_linsolve !< timer - linear solve + ! ! -- adaptive time step real(DP), pointer :: atsfrac => null() !< adaptive time step faction ! @@ -426,6 +434,7 @@ subroutine sln_df(this) integer(I4B), allocatable, dimension(:) :: rowmaxnnz integer(I4B) :: ncol, irow_start, irow_end integer(I4B) :: mod_offset + character(len=LEN_SECTION_TITLE) :: sec_title ! ! -- set sol id and determine nr. of equation in this solution do i = 1, this%modellist%Count() @@ -493,6 +502,18 @@ subroutine sln_df(this) ! ! -- Assign connections, fill ia/ja, map connections call this%sln_connect() + + ! add timers + write(sec_title,'(a,i0,a)') "Prepare solve (", this%id, ")" + this%tmr_prep_solve = g_timer%add_section(sec_title, SECTION_DO_TSTP) + write(sec_title,'(a,i0,a)') "Solve (", this%id, ")" + this%tmr_solve = g_timer%add_section(sec_title, SECTION_DO_TSTP) + write(sec_title,'(a,i0,a)') "Finalize solve (", this%id, ")" + this%tmr_final_solve = g_timer%add_section(sec_title, SECTION_DO_TSTP) + + this%tmr_formulate = g_timer%add_section("Formulate", this%tmr_solve) + this%tmr_linsolve = g_timer%add_section("Linear solve", this%tmr_solve) + end subroutine sln_df !> @ brief Allocate and read data @@ -1421,6 +1442,9 @@ subroutine prepareSolve(this) class(NumericalExchangeType), pointer :: cp => null() class(NumericalModelType), pointer :: mp => null() + ! start timer + call g_timer%start(this%tmr_prep_solve) + ! synchronize for AD call this%synchronize(STG_BFR_EXG_AD, this%synchronize_ctx) @@ -1439,6 +1463,9 @@ subroutine prepareSolve(this) ! advance solution call this%sln_ad() + ! stop timer + call g_timer%stop(this%tmr_prep_solve) + end subroutine prepareSolve !> @ brief Build and solve the simulation @@ -1493,6 +1520,10 @@ subroutine solve(this, kiter) real(DP) :: ttsoln real(DP) :: dpak real(DP) :: outer_hncg + + ! start timer + call g_timer%start(this%tmr_solve) + ! ! -- initialize local variables icsv0 = max(1, this%itertot_sim + 1) @@ -1546,6 +1577,7 @@ subroutine solve(this, kiter) end if ! call code_timer(0, ttform, this%ttform) + call g_timer%start(this%tmr_formulate) ! ! -- (re)build the solution matrix call this%sln_buildsystem(kiter, inewton=1) @@ -1559,10 +1591,13 @@ subroutine solve(this, kiter) call mp%model_nr(kiter, this%system_matrix, 1) end do call code_timer(1, ttform, this%ttform) + call g_timer%stop(this%tmr_formulate) ! ! -- linear solve call code_timer(0, ttsoln, this%ttsoln) + call g_timer%start(this%tmr_linsolve) call this%sln_ls(kiter, kstp, kper, iter, iptc, ptcf) + call g_timer%stop(this%tmr_linsolve) call code_timer(1, ttsoln, this%ttsoln) ! ! -- increment counters storing the total number of linear and @@ -1795,6 +1830,9 @@ subroutine solve(this, kiter) kiter, iter, icsv0, kcsv0) end if + ! stop timer + call g_timer%stop(this%tmr_solve) + end subroutine solve !> @ brief finalize a solution @@ -1821,6 +1859,10 @@ subroutine finalizeSolve(this, kiter, isgcnvg, isuppress_output) character(len=*), parameter :: fmtcnvg = & "(1X, I0, ' CALLS TO NUMERICAL SOLUTION ', 'IN TIME STEP ', I0, & &' STRESS PERIOD ',I0,/1X,I0,' TOTAL ITERATIONS')" + + ! start timer + call g_timer%start(this%tmr_final_solve) + ! ! -- finalize the outer iteration table if (this%iprims > 0) then @@ -1881,6 +1923,9 @@ subroutine finalizeSolve(this, kiter, isgcnvg, isuppress_output) call cp%exg_bd(isgcnvg, isuppress_output, this%id) end do + ! stop timer + call g_timer%stop(this%tmr_final_solve) + end subroutine finalizeSolve ! helper routine to calculate coefficients and setup the solution matrix diff --git a/src/Utilities/Timer/CpuTimer.f90 b/src/Utilities/Timer/CpuTimer.f90 new file mode 100644 index 00000000000..c721104a904 --- /dev/null +++ b/src/Utilities/Timer/CpuTimer.f90 @@ -0,0 +1,264 @@ +module CpuTimerModule + use KindModule, only: I4B, DP, LGP + use ConstantsModule, only: DNODATA, DZERO + use STLVecIntModule + implicit none + private + + ! predefined sections as integers + ! root: + integer(I4B), public :: SECTION_RUN + + ! level 1 (BMI): + integer(I4B), public :: SECTION_INIT + integer(I4B), public :: SECTION_UPDATE + integer(I4B), public :: SECTION_FINALIZE + + ! level 2 (XMI): + integer(I4B), public :: SECTION_PREP_TSTP + integer(I4B), public :: SECTION_DO_TSTP + integer(I4B), public :: SECTION_FINAL_TSTP + + ! constants for memory allocation + integer(I4B), parameter :: MAX_NR_TIMED_SECTIONS = 20 + integer(I4B), public, parameter :: LEN_SECTION_TITLE = 128 + + type, private :: TimedSectionType + character(len=LEN_SECTION_TITLE) :: title !< title to identify timed section in log + real(DP) :: walltime !< walltime spent in section + integer(I4B) :: count !< number of times section was entered + integer(I4B) :: status !< =1 means section timer started, =0 otherwise + integer(I4B) :: parent_id !< id of parent, or 0 when root + type(STLVecInt) :: children !< ids of children + end type TimedSectionType + + type, public :: CpuTimerType + private + integer(I4B) :: nr_sections + integer(I4B) :: root_id + type(TimedSectionType), dimension(:), pointer :: all_sections => null() + contains + procedure :: initialize + procedure :: add_section + procedure :: start + procedure :: stop + procedure :: print_timings + procedure :: destroy + procedure :: is_initialized + ! private + procedure, private :: get_section_id + procedure, private :: print_section + end type CpuTimerType + + type(CpuTimerType), public :: g_timer !< the global timer object (to reduce trivial lines of code) + public :: set_timer_func, timer_func_iface + procedure(timer_func_iface), private, pointer :: g_timer_func => serial_timer ! typically set to use MPI_Wtime for parallel, using the method below + + abstract interface + subroutine timer_func_iface(walltime) + import DP + real(DP), intent(inout) :: walltime + end subroutine + end interface + +contains + + !> @brief Set the timer function to be used, e.g. based on MPI_Wtime + !< + subroutine set_timer_func(timer_func) + procedure(timer_func_iface), pointer :: timer_func + + g_timer_func => timer_func + + end subroutine set_timer_func + + + !< @brief Initialize the CPU timer object + !< + subroutine initialize(this) + class(CpuTimerType) :: this + ! local + integer(I4B) :: i + + allocate(this%all_sections(MAX_NR_TIMED_SECTIONS)) + do i = 1, MAX_NR_TIMED_SECTIONS + this%all_sections(i)%title = "undefined" + this%all_sections(i)%status = 0 + this%all_sections(i)%walltime = DZERO + this%all_sections(i)%count = 0 + this%all_sections(i)%parent_id = 0 + call this%all_sections(i)%children%init() + end do + + this%nr_sections = 0 + this%root_id = 0 + + SECTION_RUN = g_timer%add_section("Run") + + SECTION_INIT = g_timer%add_section("Init", SECTION_RUN) + SECTION_UPDATE = g_timer%add_section("Update", SECTION_RUN) + SECTION_FINALIZE = g_timer%add_section("Finalize", SECTION_RUN) + + SECTION_PREP_TSTP = g_timer%add_section("Prepare timestep", SECTION_UPDATE) + SECTION_DO_TSTP = g_timer%add_section("Do timestep", SECTION_UPDATE) + SECTION_FINAL_TSTP = g_timer%add_section("Finalize timestep", SECTION_UPDATE) + + end subroutine initialize + + !> @brief Add a new timed section to the tree, + !! passing the parent id will add it as a child + !< in the tree + function add_section(this, title, parent_id) result(section_id) + class(CpuTimerType) :: this + character(len=*) :: title + integer(I4B), optional :: parent_id + integer(I4B) :: section_id + + ! increment to new section id + this%nr_sections = this%nr_sections + 1 + section_id = this%nr_sections + + ! initialize new section + this%all_sections(section_id)%title = title + this%all_sections(section_id)%walltime = DZERO + this%all_sections(section_id)%status = 0 + + ! if parent, otherwise root section + if (present(parent_id)) then + ! add child to parent + this%all_sections(section_id)%parent_id = parent_id + call this%all_sections(parent_id)%children%push_back(section_id) + else + ! this is the root, assume there's only one! + this%all_sections(section_id)%parent_id = 0 + this%root_id = section_id + end if + + end function add_section + + !> @brief Return section id for title, 0 when not found + !! @note Currently not used, but could be useful later on, else remove + !< + function get_section_id(this, title) result(section_id) + class(CpuTimerType) :: this + character(len=*) :: title + integer(I4B) :: section_id + ! local + integer(I4B) :: i + + section_id = 0 + do i = 1, this%nr_sections + if (this%all_sections(i)%title == title) then + section_id = i + exit + end if + end do + + end function get_section_id + + subroutine start(this, section_id) + class(CpuTimerType) :: this + integer(I4B) :: section_id + ! local + real(DP) :: start_time + type(TimedSectionType), pointer :: section + + call cpu_time(start_time) + + section => this%all_sections(section_id) + section%count = section%count + 1 + section%status = 1 + section%walltime = section%walltime - start_time + + end subroutine start + + subroutine stop(this, section_id) + class(CpuTimerType) :: this + integer(I4B) :: section_id + ! local + real(DP) :: end_time + type(TimedSectionType), pointer :: section + + call cpu_time(end_time) + + ! nett result (c.f. start(...)) is adding (dt = end_time - start_time) + section => this%all_sections(section_id) + section%status = 0 + section%walltime = section%walltime + end_time + + end subroutine stop + + subroutine print_timings(this) + class(CpuTimerType) :: this + ! local + integer(I4B) :: level + + ! print timing call stack + level = 0 + call this%print_section(this%root_id, level) + + ! print walltime per category + + + end subroutine print_timings + + recursive subroutine print_section(this, section_id, level) + class(CpuTimerType) :: this + integer(I4B) :: section_id + integer(I4B) :: level + ! local + integer(I4B) :: i, new_level + real(DP) :: percent + type(TimedSectionType), pointer :: section + + section => this%all_sections(section_id) + + ! calculate percentage + percent = 100.0_DP + if (section%parent_id /= 0) then + percent = section%walltime / this%all_sections(this%root_id)%walltime * 100.0_DP + end if + + ! print section timing + write(*,'(3a,f0.2,3a,f14.6,2a,i0,a)') & + " ", repeat('....', level), "[", percent, "%] ", & + trim(section%title), ": ", section%walltime, "s", " (", & + section%count, "x)" + + ! print children + new_level = level + 1 + do i = 1, section%children%size + call this%print_section(section%children%at(i), new_level) + end do + + if (level == 0) write(*,*) + + end subroutine print_section + + subroutine destroy(this) + class(CpuTimerType) :: this + ! local + integer(I4B) :: i + + do i = 1, this%nr_sections + call this%all_sections(i)%children%destroy() + end do + deallocate (this%all_sections) + nullify (this%all_sections) + + end subroutine destroy + + function is_initialized(this) result(initialized) + class(CpuTimerType) :: this + logical(LGP) :: initialized + + initialized = associated(this%all_sections) + + end function is_initialized + + subroutine serial_timer(walltime) + real(DP), intent(inout) :: walltime + call cpu_time(walltime) + end subroutine serial_timer + +end module CpuTimerModule \ No newline at end of file diff --git a/src/meson.build b/src/meson.build index 0cd60f953c5..5e24e70c7c5 100644 --- a/src/meson.build +++ b/src/meson.build @@ -356,6 +356,7 @@ modflow_sources = files( 'Utilities' / 'OutputControl' / 'OutputControl.f90', 'Utilities' / 'OutputControl' / 'OutputControlData.f90', 'Utilities' / 'OutputControl' / 'PrintSaveManager.f90', + 'Utilities' / 'Timer' / 'CpuTimer.f90', 'Utilities' / 'TimeSeries' / 'TimeArray.f90', 'Utilities' / 'TimeSeries' / 'TimeArraySeries.f90', 'Utilities' / 'TimeSeries' / 'TimeArraySeriesLink.f90', diff --git a/src/mf6core.f90 b/src/mf6core.f90 index 1c3cedbf8e7..0e05bf6ae37 100644 --- a/src/mf6core.f90 +++ b/src/mf6core.f90 @@ -18,6 +18,7 @@ module Mf6CoreModule use SolutionGroupModule, only: SolutionGroupType, GetSolutionGroupFromList use RunControlModule, only: RunControlType use SimStagesModule + use CpuTimerModule implicit none class(RunControlType), pointer :: run_ctrl => null() !< the run controller for this simulation @@ -97,6 +98,9 @@ subroutine Mf6Initialize() ! -- create model exports call export_cr() + ! -- stop the timer + call g_timer%stop(SECTION_INIT) + end subroutine Mf6Initialize !> @brief Run a time step @@ -107,8 +111,9 @@ end subroutine Mf6Initialize !! !< function Mf6Update() result(hasConverged) - ! -- return variable logical(LGP) :: hasConverged + ! start timer + call g_timer%start(SECTION_UPDATE) ! ! -- prepare timestep call Mf6PrepareTimestep() @@ -118,7 +123,10 @@ function Mf6Update() result(hasConverged) ! ! -- after timestep hasConverged = Mf6FinalizeTimestep() - ! + + ! stop timer + call g_timer%stop(SECTION_UPDATE) + end function Mf6Update !> @brief Finalize the simulation @@ -147,7 +155,10 @@ subroutine Mf6Finalize() class(BaseModelType), pointer :: mp => null() class(BaseExchangeType), pointer :: ep => null() class(SpatialModelConnectionType), pointer :: mc => null() - ! + + ! start timer + call g_timer%start(SECTION_FINALIZE) + ! ! -- FINAL PROCESSING (FP) ! -- Final processing for each model @@ -211,10 +222,13 @@ subroutine Mf6Finalize() call export_da() call simulation_da() call lists_da() - ! - ! -- finish gently (No calls after this) + + ! stop timer + call g_timer%stop(SECTION_FINALIZE) + + ! finish gently (No calls after this) call run_ctrl%finish() - ! + end subroutine Mf6Finalize !> @brief print initial message @@ -494,6 +508,10 @@ subroutine Mf6PrepareTimestep() integer(I4B) :: ie integer(I4B) :: ic integer(I4B) :: is + + ! start timer + call g_timer%start(SECTION_PREP_TSTP) + ! ! -- initialize fmt fmt = "(/,a,/)" @@ -574,6 +592,9 @@ subroutine Mf6PrepareTimestep() ! -- set time step call tdis_set_timestep() + ! stop timer + call g_timer%stop(SECTION_PREP_TSTP) + end subroutine Mf6PrepareTimestep !> @brief Run time step @@ -596,6 +617,9 @@ subroutine Mf6DoTimestep() integer(I4B) :: isg logical :: finishedTrying + ! start timer + call g_timer%start(SECTION_DO_TSTP) + ! -- By default, the solution groups will be solved once, and ! may fail. But if adaptive stepping is active, then ! the solution groups may be solved over and over with @@ -618,6 +642,9 @@ subroutine Mf6DoTimestep() end do retryloop + ! stop timer + call g_timer%stop(SECTION_DO_TSTP) + end subroutine Mf6DoTimestep !> @brief Rerun time step @@ -694,6 +721,10 @@ function Mf6FinalizeTimestep() result(hasConverged) ! -- initialize format and line fmt = "(/,a,/)" line = 'end timestep' + + ! start timer + call g_timer%start(SECTION_FINAL_TSTP) + ! ! -- evaluate simulation mode select case (isim_mode) @@ -737,6 +768,10 @@ function Mf6FinalizeTimestep() result(hasConverged) ! ! -- Check if we're done call converge_check(hasConverged) + + ! stop timer + call g_timer%stop(SECTION_FINAL_TSTP) + end function Mf6FinalizeTimestep end module Mf6CoreModule