-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctions_auxiliary.R
1000 lines (875 loc) · 32.2 KB
/
functions_auxiliary.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# -------------------------------------
# Script: auxiliary_functions.R
# Author: Michal Kvasnička
# Purpose: This script defines various auxiliary functions.
# Inputs: none
# Outputs: function definitions
#
# Copyright(c) Michal Kvasnička
# -------------------------------------
# necessary packages
library(dplyr, verbose = FALSE, warn.conflicts = FALSE)
library(readr, verbose = FALSE, warn.conflicts = FALSE)
library(stringr, verbose = FALSE, warn.conflicts = FALSE)
library(memuse, verbose = FALSE, warn.conflicts = FALSE)
library(logging, verbose = FALSE, warn.conflicts = FALSE)
# get district weights function
source(file.path(RSCRIPTDIR, "district_weights.R"))
# Paths to folders ----

# folder holding the raw (source) data
raw_data_dir <- function() file.path(DIR_ORIGIN, "rawdata")

# folder with the districts shape files (CUZK source)
path_to_raw_districts <- function() {
  # an older run used the ArcCR database instead:
  # file.path(raw_data_dir(), "arccr", "arccr_4_1.gdb")
  file.path(raw_data_dir(), "cuzk")
}

# raw OpenStreetMap extract of the Czech Republic
path_to_raw_roads_osm <- function()
  file.path(raw_data_dir(), "osm", "czech-republic-latest.osm.pbf")

# folder with the original accident files
path_to_raw_accidents_dir <- function() file.path(raw_data_dir(), "accidents")

# file with all processed accidents
# NOTE(review): despite the "raw" in its name, this points into the processed
# accidents folder (path_to_accidents_dir()), not into
# path_to_raw_accidents_dir() -- confirm this is intended
path_to_raw_accidents <- function()
  file.path(path_to_accidents_dir(), "all_accidents.rds")

# folder where all intermediate (created) data are stored
data_dir <- function() file.path(DIR_ORIGIN, "tmp")
# file with the districts table
path_to_districts <- function()
  file.path(data_dir(), "districts", "districts.rds")

# file with the polygon of the whole state
path_to_state_polygon <- function()
  file.path(data_dir(), "districts", "state_polygon.rds")

# folder with the filtered/converted maps
path_to_map_dir <- function() file.path(data_dir(), "maps")

# folder with the lixelated maps
path_to_lixels_maps_dir <- function() file.path(data_dir(), "lixels")

# folder with the accident data split by district
path_to_accidents_dir <- function() file.path(data_dir(), "accidents")

# folder with the NKDE densities by district
path_to_densities_dir <- function() file.path(data_dir(), "densities")

# root of all outputs
output_dir <- function() file.path(DIR_ORIGIN, "output")

# districts table prepared for the Shiny app
path_to_shiny_districts <- function()
  file.path(output_dir(), "districts", "districts.rds")

# ORPs table prepared for the Shiny app
path_to_shiny_orps <- function()
  file.path(output_dir(), "districts", "orps.rds")

# folder with the per-district accidents for the Shiny app
path_to_shiny_accidents_dir <- function() file.path(output_dir(), "accidents")

# folder with the final product used by the Shiny app
shiny_dir <- function() file.path(output_dir(), "clusters")

# folder with the final product for GIS (shape file)
gis_dir <- function() file.path(output_dir(), "gis")

# folder with the log files
log_dir <- function() file.path(DIR_ORIGIN, "log")

# folder with the source profiles and config
path_to_source_configs <- function() file.path(DIR_ORIGIN, "config")

# file where the processed profiles are stored
path_to_configs <- function()
  file.path(path_to_source_configs(), "profiles.rds")
# Paths to per-district files ----

# basic_file_name(folder, districts, txt, ...) builds paths to per-district
# files from a glue template.
#
# inputs:
# - folder ... (character scalar or NULL) folder where the files are stored;
#   when NULL, bare file names are returned
# - districts ... (sf tibble) the districts table
# - txt ... (character scalar) file-name template in glue format; it may use
#   {id} (district ids), {name} (district names), and any named value passed
#   via ...
# - ... ... further named values interpolated into the template
#
# value:
#   character vector of paths to the files; when folder is NULL, only the
#   file names are returned
basic_file_name <- function(folder, districts, txt, ...) {
  env <- list(...)
  env$id <- districts$district_id
  env$name <- districts$district_name
  fnames <- as.character(glue::glue(txt, .envir = env))
  if (is.null(folder)) fnames else file.path(folder, fnames)
}
# The following helpers return paths to the per-district files of the various
# intermediate products.
#
# inputs:
# - districts ... (sf tibble) the districts table
# - folder ... (character scalar) folder where the files are stored; when
#   NULL (the default), only the file names are returned
#
# value:
#   character vector of paths to the files; see folder above

# GeoJSON files describing individual districts
geojson_file_name <- function(districts, folder = NULL, ...)
  basic_file_name(folder, districts, "district_{id}.geojson")

# OSM files with the selected roads in particular districts
osm_file_name <- function(districts, folder = NULL, ...)
  basic_file_name(folder, districts, "district_{id}.osm")

# sfnetwork files with the selected roads in particular districts
sf_file_name <- function(districts, folder = NULL, ...)
  basic_file_name(folder, districts, "district_{id}.rds")

# lixelated selected roads in particular districts
lixel_file_name <- function(districts, folder = NULL, ...)
  basic_file_name(folder, districts, "lixel_{id}.rds")

# mid-points of the lixelated selected roads in particular districts
lixel_sample_file_name <- function(districts, folder = NULL, ...)
  basic_file_name(folder, districts, "lixel_sample_{id}.rds")

# neighbor lists of the lixelated selected roads in particular districts
lixel_nb_file_name <- function(districts, folder = NULL, ...)
  basic_file_name(folder, districts, "lixel_nb_{id}.rds")

# accidents cropped to particular districts and snapped to the selected
# roads there
accidents_file_name <- function(districts, folder = NULL, ...)
  basic_file_name(folder, districts, "accidents_{id}.rds")
# densities_file_name() and shiny_file_name() build per-district file names
# that embed a time window and, optionally, a profile name:
#   "<prefix>_{id}[_{profile_name}]_{from_date}_{to_date}.rds"
#
# inputs:
# - districts ... (sf tibble) the districts table
# - folder ... (character scalar or NULL) target folder; when NULL, only the
#   file names are returned
# - ... ... must contain from_date and to_date; may contain profile_name
#
# value:
#   character vector of paths to the files; see folder above

# shared worker: builds the template for the given prefix and delegates to
# basic_file_name(); kept separate so the template logic is not duplicated
timed_file_name <- function(prefix, districts, folder, ...) {
  pars <- list(...)
  stopifnot(all(c("from_date", "to_date") %in% names(pars)))
  # the profile name is optional; include it only when it is supplied
  profile_part <-
    if ("profile_name" %in% names(pars) && !is.null(pars$profile_name))
      "{profile_name}_"
    else
      ""
  txt <- stringr::str_c(
    prefix, "_{id}_", profile_part,
    "{as.character(from_date)}_{as.character(to_date)}.rds")
  basic_file_name(folder, districts, txt, ...)
}

# paths to files with the lixelated selected roads with added NKDE densities
densities_file_name <- function(districts, folder = NULL, ...) {
  timed_file_name("densities", districts, folder, ...)
}

# paths to files with the final product used in Shiny
shiny_file_name <- function(districts, folder = NULL, ...) {
  timed_file_name("clusters", districts, folder, ...)
}
# Accident-file regexes ----
# Regexes matching the yearly CSV files of the accident database; each file
# name starts with a four-digit year.

# main accident-data file
accidents_file_name_pattern <- function() "\\d{4}_databaze_nehody.csv"

# GPS file of the accident data
accidents_gps_name_pattern <- function() "\\d{4}_databaze_GPS.csv"

# outcomes file of the accident data
accidents_outcomes_name_pattern <- function() "\\d{4}_databaze_nasledky.csv"

# pedestrians file of the accident data
accidents_pedestrians_name_pattern <- function() "\\d{4}_databaze_chodci.csv"

# vehicles file of the accident data
accidents_vehicles_name_pattern <- function() "\\d{4}_databaze_vozidla.csv"

# number of leading rows skipped when reading each accident CSV file
ACCIDENTS_FILES_SKIP <- 6L
# Reading/writing files ----

# create_dir_for_file(file) recursively creates the folder(s) needed to save
# file.
#
# inputs:
# - file ... (character scalar) path to the file
#
# value:
#   none; called for the side effect of creating the folders when needed
create_dir_for_file <- function(file) {
  dir.create(dirname(file), showWarnings = FALSE, recursive = TRUE)
}
# write_dir_rds(object, file) writes one object to a path as .rds; it creates
# all necessary (sub)folders in the path first
#
# inputs:
# - object ... any kind of object
# - file ... (character scalar) a path to the object
# - compress ... (logical scalar) if TRUE (default), the .rds file is
#   gz-compressed; if FALSE, it is written uncompressed
#
# value:
#   none; it writes an .rds file to disk
#
# notes:
# - the compress argument used to be ignored (files were always compressed);
#   it is honored again, with the default changed to TRUE so that callers not
#   setting it keep getting compressed files
#
# usage:
#   write_dir_rds(districts, "/tmp/boo.rds")
#   districts <- readr::read_rds("/tmp/boo.rds")
write_dir_rds <- function(object, file, compress = TRUE) {
  compress <- if (compress) "gz" else "none"
  create_dir_for_file(file)
  readr::write_rds(object, file = file, compress = compress)
}
# write_dir_rdata(..., file) writes all objects in ... to a path as .RData;
# it doesn't compress the file; it creates all necessary (sub)folders in the
# path
#
# inputs:
# - ... ... any kind of objects
# - file ... (character scalar) a path to the object
#
# value:
#   none; it writes an .RData file to disk
#
# WARNING: this function may not work when used deep in functions -- base
# save() resolves the ... objects by name in its calling frame, so objects
# forwarded through several function layers may not be found there
write_dir_rdata <- function(..., file) {
  create_dir_for_file(file)
  save(..., file = file, compress = FALSE)
}
# Updating files ----

# is_behind(target, source) returns TRUE when any target file is missing or
# older than any source file, and FALSE otherwise.
#
# inputs:
# - target ... (character vector) paths to files depending on source
# - source ... (character vector) paths to files used to create target
#
# value:
#   logical scalar, see above; stops when any source file does not exist
#
# usage:
#   if (is_behind(PATH_TO_DISTRICTS, PATH_TO_RAW_DISTRICTS)) {...}
is_behind <- function(target, source) {
  target_times <- file.mtime(target)
  source_times <- file.mtime(source)
  missing_sources <- is.na(source_times)
  if (any(missing_sources)) {
    missing <- str_c(source[missing_sources], collapse = ", ")
    logging::logerror("Some sources don't exist: %s.", missing)
    stop("Some sources don't exist: ", missing)
  }
  # a missing target is always behind
  if (any(is.na(target_times)))
    return(TRUE)
  max(source_times) > min(target_times)
}
# districts_behind() returns the subset of districts whose targets must be
# (re)created because they either don't exist or are older than their
# sources.
#
# inputs:
# - districts ... (sf tibble) tibble of districts
# - target_fun ... (function) file-name function of the target, such as
#   lixel_file_name
# - source_fun ... (function, or list of functions) file-name function(s) of
#   the source(s), such as sf_file_name
# - target_folder ... (character scalar) folder where target files should be
#   stored
# - source_folder ... (character scalar, or list parallel to source_fun)
#   folder(s) where source files are stored
# - other_files ... (optional character vector) paths to other files that
#   condition target creation
# - ... ... passed on to the file-name functions
#
# value:
#   row subset of the districts table, see above; stops when any source or
#   other file does not exist
districts_behind <- function(districts, target_fun, source_fun,
                             target_folder, source_folder,
                             other_files = NULL,
                             ...) {
  target_times <- file.mtime(target_fun(districts, target_folder, ...))
  if (is.list(source_fun)) {
    # several source kinds: take, per district, the newest source time
    if (!is.list(source_folder) ||
        length(source_fun) != length(source_folder))
      stop("When source_fun is a list, source_folder must ",
           "be a list of the same length.")
    source_times <- -Inf
    for (k in seq_along(source_fun)) {
      source_files <- source_fun[[k]](districts, source_folder[[k]], ...)
      source_times <- pmax(source_times, file.mtime(source_files))
    }
  } else {
    source_files <- source_fun(districts, source_folder, ...)
    source_times <- file.mtime(source_files)
  }
  if (is.null(other_files)) {
    other_time <- -Inf
  } else {
    other_time <- file.mtime(other_files)
    if (any(is.na(other_time)))
      stop("Some sources don't exist: ",
           str_c(other_files[is.na(other_time)], collapse = ", "))
    other_time <- max(other_time)
  }
  if (any(is.na(source_times)))
    stop("Some sources don't exist: ",
         str_c(source_files[is.na(source_times)], collapse = ", "))
  behind <- is.na(target_times) |
    (target_times < source_times) |
    (target_times < other_time)
  districts[behind, ]
}
# Parallel processing ----

# docker_mem_limit() returns the amount of memory available to the docker
# container in which it's running.
#
# inputs:
#   none
#
# value:
#   available memory in GBs, or Inf when unconstrained (or when not running
#   in a container)
#
# notes:
# - it reads both the cgroup v1 and cgroup v2 files; a new Docker version may
#   require adding another file here
docker_mem_limit <- function() {
  read_limit <- function(file) {
    if (!file.exists(file))
      return(NA)
    as.numeric(base::readLines(file)) / 1024^3
  }
  # warnings suppressed: cgroup v2 reports "max" when unconstrained, which
  # as.numeric() turns into NA with a warning
  suppressWarnings({
    limits <- c(
      read_limit("/sys/fs/cgroup/memory/memory.limit_in_bytes"),  # cgroup v1
      read_limit("/sys/fs/cgroup/memory.max")                     # cgroup v2
    )
    min(limits, na.rm = TRUE)
  })
}
# available_memory() returns the memory currently available to the process
# in GBs.
#
# inputs:
#   none
#
# value:
#   available memory in GBs
#
# notes:
# - inside a docker container this is the minimum of the free memory at the
#   moment and the container's memory limit; outside a container it is just
#   the free memory at the moment
available_memory <- function() {
  free_ram <- as.numeric(memuse::Sys.meminfo()$freeram) / 1024^3
  min(docker_mem_limit(), free_ram)
}
# docker_cpu_limit() returns the number of cpus available to the docker
# container in which it's running
#
# it returns the constraint set individually for the container; it ignores
# the constraint set globally in Docker desktop
#
# inputs:
#   none
#
# value:
#   either the number of cpus or Inf if unconstrained
#
# notes:
# - it reads both the cgroup v1 files (cpu.cfs_quota_us / cpu.cfs_period_us)
#   and the cgroup v2 file (cpu.max); a new Docker version may require adding
#   another file here
# - an unconstrained container reports quota -1 (cgroup v1) or "max"
#   (cgroup v2, parsed to NA); both are mapped to NA here and hence to Inf in
#   the final min()
docker_cpu_limit <- function() {
  cpu_limit <- function(quota_file, period_file = "") {
    quota <- NA
    period <- 1e5
    if (file.exists(quota_file)) {
      if (file.exists(period_file)) {
        # cgroup v1: quota and period live in two separate files
        quota <- as.numeric(base::readLines(quota_file))[1]
        period <- as.numeric(base::readLines(period_file))[1]
      } else {
        # cgroup v2: one file holding "quota period" (quota may be "max")
        cl <- base::readLines(quota_file) |>
          base::strsplit(split = "\\s+")
        quota <- as.numeric(cl[[1]][1])
        period <- as.numeric(cl[[1]][2])
      }
    }
    # BUGFIX: a quota of -1 (cgroup v1) means "no limit"; the previous code
    # computed max(-1 %/% period, 1) == 1 and wrongly capped such containers
    # to a single cpu -- treat non-positive/NA quota as unconstrained instead
    if (is.na(quota) || quota <= 0 || is.na(period) || period <= 0)
      return(NA)
    max(quota %/% period, 1)
  }
  suppressWarnings(
    c(
      cpu_limit("/sys/fs/cgroup/cpu/cpu.cfs_quota_us",
                "/sys/fs/cgroup/cpu/cpu.cfs_period_us"),
      cpu_limit("/sys/fs/cgroup/cpu.max")
    ) |>
      min(na.rm = TRUE) |>
      floor()
  )
}
# docker_global_cpu_limit() returns the number of cpus available to the
# docker container in which it's running
#
# it returns the constraint set globally in Docker desktop; it ignores the
# constraint set individually for the container
#
# inputs:
# - logical ... (logical scalar) if TRUE, the number of logical cores
#   (threads) is returned; if FALSE, the number of physical cores
#
# value:
#   either the number of cpus or Inf if unconstrained
#
# notes:
# - this function may not be needed; it seems that
#   parallelly::availableCores() returns the same value
#
# WARNING:
# - the distinction between logical cores (threads) and physical cores may
#   not work in a Docker container
docker_global_cpu_limit <- function(logical = TRUE) {
  cpuinfo_file <- "/proc/cpuinfo"
  if (!file.exists(cpuinfo_file))
    return(Inf)
  cpuinfo <- base::readLines(cpuinfo_file)
  if (logical)
    return(length(grep("processor", cpuinfo)))
  # physical cores: the distinct "cpu cores" entries, digits extracted
  cores <- base::unique(cpuinfo[base::grep("cpu cores", cpuinfo)])
  min(as.integer(base::gsub("\\D", "", cores)))
}
# available_cores() returns the number of cores available at the moment
#
# inputs:
# - logical ... (logical scalar) if TRUE, the number of logical cores
#   (threads) is returned; if FALSE, the number of physical cores
#
# value:
#   (integer scalar) number of available cpu cores
#
# notes:
# - inside a docker container this is the minimum of the cores dedicated to
#   the container and the cores available at the machine; outside a container
#   it is just the number of cores available at the machine
#
# WARNING:
# - the distinction between logical cores (threads) and physical cores may
#   not work in a Docker container
available_cores <- function(logical = TRUE) {
  cores <- min(
    docker_cpu_limit(),
    docker_global_cpu_limit(logical = logical),
    parallelly::availableCores(logical = logical)
  )
  as.integer(cores)
}
# get_number_of_workers() returns the number of workers/cores that should be
# used in parallel
#
# inputs:
# - workers ... (integer scalar or "auto")
# - ram_needed ... (numeric scalar) RAM in GB needed per one core; used only
#   when workers == "auto"
#
# value:
#   integer scalar (at least 1 when estimated automatically)
#
# notes:
# - if workers == "auto", the optimal number of workers is estimated from the
#   available cores, the RAM needed per core, and the amount of memory
#   available when get_number_of_workers() is run
# - it is supposed that this system is the only software running at the
#   machine
#
# WARNINGS:
# - if any other process allocates memory after get_number_of_workers() did
#   its magic, memory may be insufficient for this system
get_number_of_workers <- function(workers, ram_needed) {
  # identical() avoids the implicit numeric-to-string coercion of
  # `workers == "auto"`
  if (identical(workers, "auto")) {
    stopifnot(is.numeric(ram_needed), length(ram_needed) == 1,
              ram_needed > 0)
    ram <- available_memory()
    no_of_cores <- available_cores()
    # at least one worker must always run, even on low-memory machines where
    # floor(ram / ram_needed) would be 0
    workers <- max(1, min(no_of_cores, floor(ram / ram_needed)))
  }
  workers
}
# balance_load() balances the load for PWALK(); it sorts the rows of tab in
# such a way that each combination of profile and time window is processed
# together (if possible) and, within this, the districts are processed in
# such a way that the biggest one is paired with the smallest one, the second
# biggest one with the second smallest one, and so on
#
# inputs:
# - tab ... (tibble) a tibble with column district_id and possibly also
#   profile_name, from_date, and to_date
# - workers ... (integer scalar) number of cores to be used
#
# value: the same tibble with reshuffled rows
balance_load <- function(tab, workers) {
  # myx() returns the order based on sizes to balance the load; the balancing
  # algorithm is based on zigzag_sort() from
  # https://www.r-bloggers.com/2020/12/going-parallel-understanding-load-balancing-in-r/
  # it performs the better the more cores are used
  myx <- function(weight, workers){
    # sort the row ids by decreasing weight (note: `weight` inside arrange()
    # resolves to the function argument, not to a column of tab)
    tab <- tibble::tibble(id = seq_along(weight), x = weight) |>
      dplyr::arrange(desc(weight))
    # deal the sorted rows out to workers in a zig-zag (1..workers,
    # workers..1) pattern
    sortvec <- rep(c(seq(1, workers), seq(workers, 1)),
                   length = length(weight)) |>
      order()
    # return, for every original row, its position in the balanced order
    tab[sortvec, ] |>
      dplyr::mutate(order = seq_along(weight)) |>
      dplyr::arrange(id) |>
      dplyr::pull(order)
  }
  # which optional grouping columns are present in tab?
  present <- function(name) name %in% names(tab)
  pn <- present("profile_name")
  fd <- present("from_date")
  td <- present("to_date")
  # district_sizes() comes from district_weights.R (sourced at the top of
  # this file) -- presumably it maps district ids to sizes; confirm there
  tab |>
    dplyr::mutate(.order = myx(district_sizes(district_id),
                               workers = workers)) |>
    dplyr::arrange(
      if (pn) profile_name else NULL,
      if (fd) from_date else NULL,
      if (td) to_date else NULL,
      .order
    ) |>
    dplyr::select(-.order)
}
# silently(.f) wraps function .f so that the wrapped version returns TRUE
# when .f succeeds and the simpleError condition when it fails; it should be
# used with functions that return no value and are run for their side
# effects; it is used in PWALK()
#
# inputs:
# - .f ... (closure) a function that returns nothing and is run for its side
#   effects
#
# value:
#   a function; when called, it returns a simpleError when .f fails and TRUE
#   otherwise
silently <- function(.f) {
  .f <- purrr::as_mapper(.f)
  function(...) {
    tryCatch(
      {
        .f(...)
        TRUE
      },
      error = function(e) e
    )
  }
}
# PWALK() is the same as purrr::pwalk() with these differences:
# - if workers > 1, furrr::future_pmap() is run in a multisession plan
# - function .f is protected, i.e., it doesn't fail, and all the paralleled
#   computations are carried out; PWALK() fails only afterwards
# - .l must be a tibble (it cannot be a list) and is expected to have an
#   output_file column used in the error reports
# - when run in parallel, it uses "dynamic" scheduling to balance the load;
#   this assumes that the data .l sent to individual processes are small
#   while the computation itself takes a lot of time; see
#   https://furrr.futureverse.org/articles/chunking.html
PWALK <- function(.l, .f, workers = 1, ram_needed = NULL, ...) {
  if (nrow(.l) > 0) {
    workers <- get_number_of_workers(workers, ram_needed)
    # no point in running more workers than there are rows
    workers <- min(workers, nrow(.l))
    logging::loginfo("PWALK: using %s worker%s", workers,
                     if (workers > 1) "s in parallel" else " only")
    # protect .f so that one failing row does not kill the other computations
    .f <- silently(.f)
    if (workers == 1) {
      success <- purrr::pmap(.l, .f, ...)
    } else {
      # restore the caller's future plan when done
      oplan <- future::plan()
      on.exit(future::plan(oplan))
      future::plan("multisession", workers = workers)
      # balance the load
      .l <- balance_load(.l, workers)
      success <- furrr::future_pmap(
        .l, .f, ...,
        .options = furrr::furrr_options(seed = TRUE,
                                        scheduling = Inf)
      )
    }
    # success holds TRUE or a simpleError per row; the indices refer to the
    # possibly re-balanced .l, so .l$output_file lines up with them
    failed <- seq_len(nrow(.l))[!purrr::map_lgl(success, isTRUE)]
    if (length(failed) > 0) {
      purrr::walk(failed,
                  ~logging::logerror("output file %s throws error %s",
                                     .l$output_file[.],
                                     as.character(success[[.]])))
      stop("production failed in the following output files: ",
           str_c(.l$output_file[failed], collapse = ", "),
           "; see the log")
    }
  }
}
# Logging ----

# create_log_file(log_folder) creates a new log file in log_folder; it
# creates the folder, too, if necessary
#
# inputs:
# - log_folder ... (character scalar) path to a folder where logs should be
#   stored
#
# value:
#   none; it only creates the file (and the folder, if necessary)
#
# notes:
# - the reason for this approach is the following: individual steps in the
#   data preparation each run in a fresh vanilla R session but all logging
#   should go into one file; therefore, each step starts logging into the
#   newest existing file, and there must be a way to create a new log file
create_log_file <- function(log_folder) {
  tryCatch({
    dir.create(log_folder, showWarnings = FALSE, recursive = TRUE)
    time <- as.character(Sys.time())
    log_file <- file.path(
      log_folder,
      stringr::str_c(stringr::str_replace_all(time, "[\\s:]", "-"), ".log")
    )
    logging::addHandler(writeToFile, file = log_file)
    logging::loginfo("log file created")
  },
  error = function(e) {
    # logging is not set up when this fails, so include the original error
    # in the message instead of pointing to a (nonexistent) log file
    stop("creating new log file failed: ", conditionMessage(e),
         call. = NA)
  })
}
# start_logging(log_folder) starts logging into the newest .log file in
# log_folder
#
# inputs:
# - log_folder ... (character scalar) path to a folder where logs are stored
#
# value:
#   none; it starts logging into the newest .log file in log_folder
#
# notes:
# - for the reason why it is done this way, see the notes to
#   create_log_file()
# - NOTE(review): when log_folder holds no .log file, the newest-file lookup
#   yields an empty path -- confirm create_log_file() is always called first
start_logging <- function(log_folder) {
  logging::basicConfig()
  newest_log <- list.files(log_folder, pattern = "\\.log",
                           full.names = TRUE) |>
    file.info() |>
    dplyr::arrange(dplyr::desc(ctime)) |>
    dplyr::slice(1) |>
    rownames()
  logging::addHandler(writeToFile, file = newest_log)
}
# Automatic time windows ----

# auto_time_window(length, number) produces the automatic time windows for
# one period length and one period number
#
# inputs:
# - length ... (integer scalar) length of one time window in years
# - number ... (integer scalar) how many windows of that length should be
#   created
#
# value:
#   tibble with two columns:
#   - from_date ... (character in format YYYY-MM-DD) beginning of the period
#   - to_date ... (character in format YYYY-MM-DD) end of the period
#   the first period ends on December 31 of the last finished year and starts
#   on January 1 length years before that; older periods follow back to back
#
# usage:
#   auto_time_window(3, 2)  # two periods of 3 years
auto_time_window <- function(length, number) {
  last_complete_year <- lubridate::year(lubridate::today()) - 1
  one_window <- function(end_year, length, shift) {
    from_year <- end_year - (shift + 1) * length + 1
    to_year <- end_year - shift * length
    tibble::tibble(
      from_date = stringr::str_c(from_year, "-01-01"),
      to_date = stringr::str_c(to_year, "-12-31")
    )
  }
  purrr::map_dfr(seq_len(number) - 1,
                 ~one_window(last_complete_year, length, .))
}
# auto_time_windows(lengths, numbers) produces automatic time windows for
# many period lengths and possibly many period numbers
#
# inputs:
# - lengths ... (integer vector) lengths of the time windows in years
# - numbers ... (integer scalar or integer vector of the same length as
#   lengths) how many windows of each length should be created
#
# value:
#   tibble with columns from_date and to_date (characters in format
#   YYYY-MM-DD); see auto_time_window()
#
# usage:
#   auto_time_windows(3, 2)         # two periods of 3 years
#   auto_time_windows(c(1, 3), 2)   # two periods of 1 year and two of 3 years
#   auto_time_windows(c(1, 3), 1:2) # one 1-year-long period and two
#                                   # 3-years-long periods
auto_time_windows <- function(lengths, numbers) {
  lengths <- unlist(lengths)
  numbers <- unlist(numbers)
  # recycle a scalar number to all lengths
  if (length(numbers) == 1)
    numbers <- rep(numbers, length.out = length(lengths))
  purrr::pmap_dfr(list(lengths, numbers), auto_time_window)
}
# compact_time_window(profile) expands the automatic time windows of one
# profile and joins them with its manual time windows; this function does
# the inner job of compact_all_time_windows()
#
# inputs:
# - profile ... (tibble) one-row tibble, a row subset from profiles
#
# value:
#   one-row tibble with the TIME_WINDOW list-column set and the
#   TIME_WINDOW_* helper columns removed
compact_time_window <- function(profile) {
  no_dates <- character(0)
  no_windows <- tibble::tibble(from_date = no_dates, to_date = no_dates)
  auto_windows <- no_windows
  manual_windows <- no_windows
  if (profile$TIME_WINDOW_AUTO)
    auto_windows <- auto_time_windows(profile$TIME_WINDOW_LENGTH,
                                      profile$TIME_WINDOW_NUMBER)
  if ("TIME_WINDOW" %in% names(profile))
    manual_windows <- profile$TIME_WINDOW
  windows <- dplyr::distinct(dplyr::bind_rows(auto_windows, manual_windows))
  if (nrow(windows) == 0) {
    logging::logerror("config prep: profile %s has empty time windows",
                      profile$PROFILE_NAME)
    stop("config prep: profile has empty time windows")
  }
  profile$TIME_WINDOW <- list(windows)
  profile |>
    dplyr::select(-c(TIME_WINDOW_AUTO, TIME_WINDOW_LENGTH,
                     TIME_WINDOW_NUMBER))
}
# compact_all_time_windows(profiles) expands the automatic time windows of
# all profiles and joins them with the manual time windows
#
# inputs:
# - profiles ... (tibble) profiles tibble created by the prepare_profiles
#   script
#
# value:
#   the same profiles tibble with two differences:
#   - the automatic time slots (TIME_WINDOW_*) are removed
#   - the TIME_WINDOW slot includes the expanded automatic time windows
#
# notes:
# - automatic time windows are ... automatic; each run of this function may
#   give different values because the windows depend on the year when it is
#   run; example: run on December 31 it gives older dates than when run on
#   January 1 of the next year
compact_all_time_windows <- function(profiles) {
  rows <- dplyr::group_split(dplyr::rowwise(profiles))
  compacted <- purrr::map(rows, compact_time_window)
  dplyr::distinct(dplyr::bind_rows(compacted))
}
# Reading various files ----

# read_profiles() returns the profiles tibble (created by the
# prepare_profiles script) with its time windows compacted
#
# notes:
# - it expands the automatic time windows, i.e., it may give different values
#   depending on when it is called; see compact_all_time_windows()
read_profiles <- function() {
  profiles <- readr::read_rds(path_to_configs())
  compact_all_time_windows(profiles)
}

# read_districts() returns the districts table
read_districts <- function() readr::read_rds(path_to_districts())
# Damage cost ----

# add_damage_cost(accidents) takes a tibble of accidents and adds a new
# column, accident_cost (in mil. CZK)
#
# inputs:
# - accidents ... (tibble) accidents table
# - unit_cost_dead ... (double scalar) cost of each deceased person
# - unit_cost_serious_injury ... (double scalar) cost of each seriously
#   injured person
# - unit_cost_light_injury ... (double scalar) cost of each lightly injured
#   person
# - unit_cost_material ... (double scalar) multiplier of the material cost
# - unit_cost_const ... (double scalar) a fixed cost added to each accident
# - const_cost_dead ... (double scalar) a fixed cost added to accidents where
#   someone lost her life
# - const_cost_serious_injury ... (double scalar) a fixed cost added to
#   accidents where the worst damage was a serious injury
# - const_cost_light_injury ... (double scalar) a fixed cost added to
#   accidents where the worst damage was a light injury
# - const_cost_material ... (double scalar) a fixed cost added to accidents
#   with material cost only
# - na_zero ... (logical scalar) if TRUE (default), NAs in costs are replaced
#   with 0s; if FALSE, all rows in accidents that have any NA cost are
#   removed
#
# value:
#   the same tibble as accidents with the new accident_cost column added
add_damage_cost <- function(accidents,
                            unit_cost_dead,
                            unit_cost_serious_injury,
                            unit_cost_light_injury,
                            unit_cost_material,
                            unit_cost_const,
                            const_cost_dead,
                            const_cost_serious_injury,
                            const_cost_light_injury,
                            const_cost_material,
                            na_zero = TRUE) {
  # replace NAs with zeros when na_zero is on; keep NAs otherwise so that
  # the affected rows can be dropped below
  zero_na <- function(x, na_zero)
    ifelse(is.na(x) & na_zero, 0, x)
  result <- accidents |>
    dplyr::mutate(
      accident_cost =
        # variable part: per-person and material costs
        zero_na(accident_dead, na_zero) * unit_cost_dead +
        zero_na(accident_serious_injury, na_zero) *
          unit_cost_serious_injury +
        zero_na(accident_light_injury, na_zero) * unit_cost_light_injury +
        zero_na(accident_material_cost, na_zero) * unit_cost_material +
        # fixed part: per-accident cost plus the cost of the worst outcome
        unit_cost_const +
        dplyr::case_when(
          zero_na(accident_dead, na_zero) > 0 ~ const_cost_dead,
          zero_na(accident_serious_injury, na_zero) > 0 ~
            const_cost_serious_injury,
          zero_na(accident_light_injury, na_zero) > 0 ~
            const_cost_light_injury,
          TRUE ~ const_cost_material
        )
    )
  if (!na_zero)
    result <- dplyr::filter(result, !is.na(accident_cost))
  result
}