From bf8399f31533e1fc11df3842a6ec369df5c311ef Mon Sep 17 00:00:00 2001
From: "Evan L. Ray" <elray@umass.edu>
Date: Tue, 7 Jan 2025 22:19:28 -0500
Subject: [PATCH 1/5] support length 1 arrays and relative metrics

---
 R/config.R                                    |  21 ++
 R/generate_eval_data.R                        |  45 ++-
 inst/schema/config_schema.json                |  25 +-
 tests/testthat/_snaps/config.md               | 267 ++++++++++++++++++
 .../helper-check_exp_scores_for_window.R      |  85 ++++++
 tests/testthat/test-config.R                  |  60 +++-
 tests/testthat/test-generate_eval_data.R      | 104 +++----
 ...onfig_invalid_rel_metrics_no_baseline.yaml |  90 ++++++
 ...config_invalid_rel_metrics_non_metric.yaml |  90 ++++++
 .../config_valid_length_one_arrays.yaml       |   9 +
 ...config_valid_mean_median_quantile_rel.yaml |  86 ++++++
 .../config_valid_rel_metrics.yaml             |  91 ++++++
 12 files changed, 888 insertions(+), 85 deletions(-)
 create mode 100644 tests/testthat/helper-check_exp_scores_for_window.R
 create mode 100644 tests/testthat/testdata/test_configs/config_invalid_rel_metrics_no_baseline.yaml
 create mode 100644 tests/testthat/testdata/test_configs/config_invalid_rel_metrics_non_metric.yaml
 create mode 100644 tests/testthat/testdata/test_configs/config_valid_length_one_arrays.yaml
 create mode 100644 tests/testthat/testdata/test_configs/config_valid_mean_median_quantile_rel.yaml
 create mode 100644 tests/testthat/testdata/test_configs/config_valid_rel_metrics.yaml

diff --git a/R/config.R b/R/config.R
index dd3596f..f4d3a7c 100644
--- a/R/config.R
+++ b/R/config.R
@@ -149,6 +149,27 @@ validate_config_targets <- function(webevals_config, task_groups, task_id_names)
       )
     }
 
+    # check that relative_metrics is a subset of metrics
+    extra_relative_metrics <- setdiff(
+      target$relative_metrics,
+      target$metrics
+    )
+    if (length(extra_relative_metrics) > 0) {
+      raise_config_error(
+        c(
+          cli::format_inline(
+            "Requested relative metrics for metrics that were not requested ",
+            "for {.arg target_id} {.val {target_id}}."
+          ),
+          "i" = cli::format_inline("Requested metric{?s}: {.val {target$metrics}}."),
+          "x" = cli::format_inline(
+            "Relative metric{?s} not found in the requested metrics: ",
+            "{.val {extra_relative_metrics}}."
+          )
+        )
+      )
+    }
+
     # check that disaggregate_by are task id variable names
     extra_disaggregate_by <- setdiff(
       target$disaggregate_by,
diff --git a/R/generate_eval_data.R b/R/generate_eval_data.R
index 524f9d0..63af6fc 100644
--- a/R/generate_eval_data.R
+++ b/R/generate_eval_data.R
@@ -34,6 +34,9 @@ generate_target_eval_data <- function(hub_path,
                                       target) {
   target_id <- target$target_id
   metrics <- target$metrics
+  # if relative_metrics and baseline are not provided, they are NULL
+  relative_metrics <- target$relative_metrics
+  baseline <- target$baseline
   # adding `NULL` at the beginning will calculate overall scores
   disaggregate_by <- c(list(NULL), as.list(target$disaggregate_by))
   eval_windows <- config$eval_windows
@@ -50,6 +53,8 @@ generate_target_eval_data <- function(hub_path,
         model_out_tbl = model_out_tbl,
         oracle_output = oracle_output,
         metric_name_to_output_type = metric_name_to_output_type,
+        relative_metrics = relative_metrics,
+        baseline = baseline,
         target_id = target_id,
         window_name = eval_window$window_name,
         by = by,
@@ -70,18 +75,22 @@ generate_target_eval_data <- function(hub_path,
 #' out_path/target_id/window_name/by/scores.csv
 #' @noRd
 get_and_save_scores <- function(model_out_tbl, oracle_output, metric_name_to_output_type,
+                                relative_metrics, baseline,
                                 target_id, window_name, by,
                                 out_path) {
   # Iterate over the output types and calculate scores for each
   scores <- purrr::map(
     unique(metric_name_to_output_type$output_type),
-    ~ hubEvals::score_model_out(
-      model_out_tbl = model_out_tbl |> dplyr::filter(output_type == !!.x),
+    ~ get_scores_for_output_type(
+      model_out_tbl = model_out_tbl,
       oracle_output = oracle_output,
-      metrics = metric_name_to_output_type$metric[
-        metric_name_to_output_type$output_type == .x
-      ],
-      by = c("model_id", by)
+      metric_name_to_output_type = metric_name_to_output_type,
+      relative_metrics = relative_metrics,
+      baseline = baseline,
+      target_id = target_id,
+      window_name = window_name,
+      by = by,
+      output_type = .x
     )
   ) |>
     purrr::reduce(dplyr::left_join, by = c("model_id", by))
@@ -97,3 +106,27 @@ get_and_save_scores <- function(model_out_tbl, oracle_output, metric_name_to_out
                    file = file.path(target_window_by_out_path, "scores.csv"),
                    row.names = FALSE)
 }
+
+
+#' Get scores for a target in a given evaluation window for a specific output type.
+#'
+#' Only the metrics mapped to `output_type` are computed, and `relative_metrics` is limited
+#' to those metrics. `target_id` and `window_name` are accepted but not used in the body.
+#' @noRd
+get_scores_for_output_type <- function(model_out_tbl, oracle_output, metric_name_to_output_type,
+                                       relative_metrics, baseline, target_id, window_name, by,
+                                       output_type) {
+  is_type_metric <- metric_name_to_output_type$output_type == output_type
+  metrics <- metric_name_to_output_type$metric[is_type_metric]
+  if (!is.null(relative_metrics)) {
+    relative_metrics <- relative_metrics[relative_metrics %in% metrics]
+  }
+  hubEvals::score_model_out(
+    model_out_tbl = model_out_tbl |> dplyr::filter(.data[["output_type"]] == !!output_type),
+    oracle_output = oracle_output,
+    metrics = metrics,
+    relative_metrics = relative_metrics,
+    baseline = baseline,
+    by = c("model_id", by)
+  )
+}
diff --git a/inst/schema/config_schema.json b/inst/schema/config_schema.json
index 76b8e1e..d5309f1 100644
--- a/inst/schema/config_schema.json
+++ b/inst/schema/config_schema.json
@@ -22,24 +22,37 @@
                     },
                     "metrics": {
                         "description": "Names of metrics to compute for this target.  These should be names of metrics supported by hubEvals::score_model_out.",
-                        "type": "array",
+                        "type": ["string", "array"],
                         "items": {
                             "type": "string"
                         },
                         "minItems": 1
                     },
+                    "relative_metrics": {
+                        "description": "Optional names of metrics for which to compute pairwise relative skill for this target.  These should be a subset of the metrics for the target.",
+                        "type": ["string", "array"],
+                        "items": {
+                            "type": "string"
+                        },
+                        "minItems": 0
+                    },
+                    "baseline": {
+                        "description": "Name of the model to use as a baseline for relative skill metrics for this target. Required if relative_metrics is provided.",
+                        "type": "string",
+                        "minLength": 1
+                    },
                     "disaggregate_by": {
                         "description": "Optional list of task id columns to disaggregate by. Aggregated scores for each model will always be computed.",
-                        "type": "array",
+                        "type": ["string", "array"],
                         "items": {
                             "type": "string"
                         }
                     }
                 },
-                "required": [
-                    "target_id",
-                    "metrics"
-                ]
+                "required": ["target_id", "metrics"],
+                "dependentRequired": {
+                    "relative_metrics": ["baseline"]
+                }
             }
         },
         "eval_windows": {
diff --git a/tests/testthat/_snaps/config.md b/tests/testthat/_snaps/config.md
index a1b21a7..553c243 100644
--- a/tests/testthat/_snaps/config.md
+++ b/tests/testthat/_snaps/config.md
@@ -228,6 +228,273 @@
       
       
 
+# read_webevals_config succeeds, valid yaml file with length 1 arrays
+
+    Code
+      read_config(hub_path, test_path("testdata", "test_configs",
+        "config_valid_length_one_arrays.yaml"))
+    Output
+      $targets
+      $targets[[1]]
+      $targets[[1]]$target_id
+      [1] "wk inc flu hosp"
+      
+      $targets[[1]]$metrics
+      [1] "wis"
+      
+      $targets[[1]]$disaggregate_by
+      [1] "location"
+      
+      
+      
+      $eval_windows
+      $eval_windows[[1]]
+      $eval_windows[[1]]$window_name
+      [1] "Full season"
+      
+      $eval_windows[[1]]$min_round_id
+      [1] "2023-01-21"
+      
+      
+      
+
+# read_webevals_config succeeds, valid yaml file with relative metrics
+
+    Code
+      read_config(hub_path, test_path("testdata", "test_configs",
+        "config_valid_rel_metrics.yaml"))
+    Output
+      $targets
+      $targets[[1]]
+      $targets[[1]]$target_id
+      [1] "wk inc flu hosp"
+      
+      $targets[[1]]$metrics
+      [1] "wis"                  "ae_median"            "interval_coverage_50"
+      [4] "interval_coverage_95"
+      
+      $targets[[1]]$relative_metrics
+      [1] "wis"       "ae_median"
+      
+      $targets[[1]]$baseline
+      [1] "FS-base"
+      
+      $targets[[1]]$disaggregate_by
+      [1] "location"        "reference_date"  "horizon"         "target_end_date"
+      
+      
+      $targets[[2]]
+      $targets[[2]]$target_id
+      [1] "wk flu hosp rate category"
+      
+      $targets[[2]]$metrics
+      [1] "log_score" "rps"      
+      
+      $targets[[2]]$disaggregate_by
+      [1] "location"        "reference_date"  "horizon"         "target_end_date"
+      
+      
+      
+      $eval_windows
+      $eval_windows[[1]]
+      $eval_windows[[1]]$window_name
+      [1] "Full season"
+      
+      $eval_windows[[1]]$min_round_id
+      [1] "2023-01-21"
+      
+      
+      $eval_windows[[2]]
+      $eval_windows[[2]]$window_name
+      [1] "Last 4 weeks"
+      
+      $eval_windows[[2]]$min_round_id
+      [1] "2023-01-21"
+      
+      $eval_windows[[2]]$n_last_round_ids
+      [1] 4
+      
+      
+      
+      $task_id_text
+      $task_id_text$location
+      $task_id_text$location$US
+      [1] "United States"
+      
+      $task_id_text$location$`01`
+      [1] "Alabama"
+      
+      $task_id_text$location$`02`
+      [1] "Alaska"
+      
+      $task_id_text$location$`04`
+      [1] "Arizona"
+      
+      $task_id_text$location$`05`
+      [1] "Arkansas"
+      
+      $task_id_text$location$`06`
+      [1] "California"
+      
+      $task_id_text$location$`08`
+      [1] "Colorado"
+      
+      $task_id_text$location$`09`
+      [1] "Connecticut"
+      
+      $task_id_text$location$`10`
+      [1] "Delaware"
+      
+      $task_id_text$location$`11`
+      [1] "District of Columbia"
+      
+      $task_id_text$location$`12`
+      [1] "Florida"
+      
+      $task_id_text$location$`13`
+      [1] "Georgia"
+      
+      $task_id_text$location$`15`
+      [1] "Hawaii"
+      
+      $task_id_text$location$`16`
+      [1] "Idaho"
+      
+      $task_id_text$location$`17`
+      [1] "Illinois"
+      
+      $task_id_text$location$`18`
+      [1] "Indiana"
+      
+      $task_id_text$location$`19`
+      [1] "Iowa"
+      
+      $task_id_text$location$`20`
+      [1] "Kansas"
+      
+      $task_id_text$location$`21`
+      [1] "Kentucky"
+      
+      $task_id_text$location$`22`
+      [1] "Louisiana"
+      
+      $task_id_text$location$`23`
+      [1] "Maine"
+      
+      $task_id_text$location$`24`
+      [1] "Maryland"
+      
+      $task_id_text$location$`25`
+      [1] "Massachusetts"
+      
+      $task_id_text$location$`26`
+      [1] "Michigan"
+      
+      $task_id_text$location$`27`
+      [1] "Minnesota"
+      
+      $task_id_text$location$`28`
+      [1] "Mississippi"
+      
+      $task_id_text$location$`29`
+      [1] "Missouri"
+      
+      $task_id_text$location$`30`
+      [1] "Montana"
+      
+      $task_id_text$location$`31`
+      [1] "Nebraska"
+      
+      $task_id_text$location$`32`
+      [1] "Nevada"
+      
+      $task_id_text$location$`33`
+      [1] "New Hampshire"
+      
+      $task_id_text$location$`34`
+      [1] "New Jersey"
+      
+      $task_id_text$location$`35`
+      [1] "New Mexico"
+      
+      $task_id_text$location$`36`
+      [1] "New York"
+      
+      $task_id_text$location$`37`
+      [1] "North Carolina"
+      
+      $task_id_text$location$`38`
+      [1] "North Dakota"
+      
+      $task_id_text$location$`39`
+      [1] "Ohio"
+      
+      $task_id_text$location$`40`
+      [1] "Oklahoma"
+      
+      $task_id_text$location$`41`
+      [1] "Oregon"
+      
+      $task_id_text$location$`42`
+      [1] "Pennsylvania"
+      
+      $task_id_text$location$`44`
+      [1] "Rhode Island"
+      
+      $task_id_text$location$`45`
+      [1] "South Carolina"
+      
+      $task_id_text$location$`46`
+      [1] "South Dakota"
+      
+      $task_id_text$location$`47`
+      [1] "Tennessee"
+      
+      $task_id_text$location$`48`
+      [1] "Texas"
+      
+      $task_id_text$location$`49`
+      [1] "Utah"
+      
+      $task_id_text$location$`50`
+      [1] "Vermont"
+      
+      $task_id_text$location$`51`
+      [1] "Virginia"
+      
+      $task_id_text$location$`53`
+      [1] "Washington"
+      
+      $task_id_text$location$`54`
+      [1] "West Virginia"
+      
+      $task_id_text$location$`55`
+      [1] "Wisconsin"
+      
+      $task_id_text$location$`56`
+      [1] "Wyoming"
+      
+      $task_id_text$location$`60`
+      [1] "American Samoa"
+      
+      $task_id_text$location$`66`
+      [1] "Guam"
+      
+      $task_id_text$location$`69`
+      [1] "Northern Mariana Islands"
+      
+      $task_id_text$location$`72`
+      [1] "Puerto Rico"
+      
+      $task_id_text$location$`74`
+      [1] "U.S. Minor Outlying Islands"
+      
+      $task_id_text$location$`78`
+      [1] "Virgin Islands"
+      
+      
+      
+
 # read_webevals_config succeeds, valid yaml file with no min_round_id
 
     Code
diff --git a/tests/testthat/helper-check_exp_scores_for_window.R b/tests/testthat/helper-check_exp_scores_for_window.R
new file mode 100644
index 0000000..4a63eb5
--- /dev/null
+++ b/tests/testthat/helper-check_exp_scores_for_window.R
@@ -0,0 +1,85 @@
+#' Helper function to check that the score files written for one evaluation window exist
+#' and match scores recomputed directly with `hubEvals::score_model_out()`.
+#'
+#' The overall `scores.csv` is checked first, then the `scores.csv` for each of the
+#' disaggregation variables `location`, `reference_date`, `horizon` and `target_end_date`.
+#' All files are looked up under the "wk inc flu hosp" target directory of `out_path`.
+#'
+#' @param out_path The path to the output directory where scores were saved.
+#' @param window_name The name of the evaluation window.
+#' @param model_out_tbl The model output table, filtered to data for the evaluation window.
+#' @param oracle_output The oracle output.
+#' @param include_rel Whether the expected scores include relative metrics, which are
+#'   computed with "FS-base" as the baseline model.
+#'
+#' @return `NULL`, invisibly. Called for its testthat expectations.
+check_exp_scores_for_window <- function(out_path, window_name, model_out_tbl, oracle_output,
+                                        include_rel = FALSE) {
+  # metrics scored for each output type
+  metrics_by_type <- list(
+    mean = "se_point",
+    median = "ae_point",
+    quantile = c(
+      "wis",
+      "ae_median",
+      "interval_coverage_50",
+      "interval_coverage_95"
+    )
+  )
+  # subset of those metrics that also get relative skill scores when `include_rel` is TRUE
+  rel_metrics_by_type <- list(
+    mean = "se_point",
+    median = "ae_point",
+    quantile = c(
+      "wis",
+      "ae_median"
+    )
+  )
+
+  # expected scores for a single output type, summarized by model and `by`
+  expected_for_type <- function(type, by) {
+    type_tbl <- model_out_tbl |>
+      dplyr::filter(.data[["output_type"]] == !!type)
+    hubEvals::score_model_out(
+      model_out_tbl = type_tbl,
+      oracle_output = oracle_output,
+      metrics = metrics_by_type[[type]],
+      relative_metrics = if (include_rel) rel_metrics_by_type[[type]] else NULL,
+      baseline = "FS-base",
+      by = c("model_id", by)
+    )
+  }
+
+  # compare the saved scores for one disaggregation level with the recomputed scores;
+  # `by = NULL` is the overall level, whose file sits directly in the window directory
+  check_one_level <- function(by = NULL) {
+    # c() drops a NULL `by`, so the overall path has no disaggregation component
+    path_parts <- c(out_path, "wk inc flu hosp", window_name, by, "scores.csv")
+    scores_path <- do.call(file.path, as.list(path_parts))
+    testthat::expect_true(file.exists(scores_path))
+
+    actual_scores <- read.csv(scores_path)
+    # convert date columns read from the csv to Date before comparing
+    for (date_col in intersect(by, c("reference_date", "target_end_date"))) {
+      actual_scores[[date_col]] <- as.Date(actual_scores[[date_col]])
+    }
+
+    # recompute the expected scores per output type and join them by model and `by`
+    join_cols <- c("model_id", by)
+    expected_mean_scores <- expected_for_type("mean", by)
+    expected_median_scores <- expected_for_type("median", by)
+    expected_quantile_scores <- expected_for_type("quantile", by)
+    expected_scores <- expected_mean_scores |>
+      dplyr::left_join(expected_median_scores, by = join_cols) |>
+      dplyr::left_join(expected_quantile_scores, by = join_cols)
+    expect_df_equal_up_to_order(actual_scores, expected_scores, ignore_attr = TRUE) # nolint: object_usage_linter
+  }
+
+  # overall scores, no disaggregation
+  check_one_level()
+
+  # scores disaggregated by one variable at a time
+  for (by in c("location", "reference_date", "horizon", "target_end_date")) {
+    check_one_level(by)
+  }
+}
diff --git a/tests/testthat/test-config.R b/tests/testthat/test-config.R
index 0bb11ed..e2406ce 100644
--- a/tests/testthat/test-config.R
+++ b/tests/testthat/test-config.R
@@ -12,6 +12,34 @@ test_that(
   }
 )
 
+test_that(
+  "read_webevals_config succeeds, valid yaml file with length 1 arrays",
+  { # the config lists a single entry under both `metrics` and `disaggregate_by`
+    hub_path <- test_path("testdata", "ecfh")
+    expect_snapshot(
+      read_config(
+        hub_path,
+        test_path("testdata", "test_configs",
+                  "config_valid_length_one_arrays.yaml")
+      )
+    )
+  }
+)
+
+test_that(
+  "read_webevals_config succeeds, valid yaml file with relative metrics",
+  { # the first target sets `relative_metrics` and `baseline`; the second target sets neither
+    hub_path <- test_path("testdata", "ecfh")
+    expect_snapshot(
+      read_config(
+        hub_path,
+        test_path("testdata", "test_configs",
+                  "config_valid_rel_metrics.yaml")
+      )
+    )
+  }
+)
+
 test_that(
   "read_webevals_config succeeds, valid yaml file with no min_round_id",
   {
@@ -85,7 +113,7 @@ test_that(
 )
 
 test_that(
-  "read_webevals_config succeeds, round_id_from_variable false",
+  "read_webevals_config fails, round_id_from_variable false",
   {
     hub_path <- test_path("testdata", "test_hub_invalid_rifv_F")
     expect_error(
@@ -203,3 +231,33 @@ test_that(
     )
   }
 )
+
+test_that(
+  "read_webevals_config fails, invalid relative metrics, not a subset of metrics",
+  {
+    # `log_score` is requested as a relative metric for a target whose metrics
+    # do not include it, so config validation must fail
+    config_path <- test_path(
+      "testdata", "test_configs", "config_invalid_rel_metrics_non_metric.yaml"
+    )
+    expect_error(
+      read_config(test_path("testdata", "ecfh"), config_path),
+      regexp = 'Relative metric not found in the requested metrics: "log_score".'
+    )
+  }
+)
+
+test_that(
+  "read_webevals_config fails, invalid relative metrics, no baseline",
+  {
+    # the config requests relative metrics without naming a baseline model,
+    # which the schema's `dependentRequired` rule rejects
+    config_path <- test_path(
+      "testdata", "test_configs", "config_invalid_rel_metrics_no_baseline.yaml"
+    )
+    expect_error(
+      read_config(test_path("testdata", "ecfh"), config_path),
+      regexp = "must have property baseline when property relative_metrics is present"
+    )
+  }
+)
diff --git a/tests/testthat/test-generate_eval_data.R b/tests/testthat/test-generate_eval_data.R
index 44f64a5..c80a0e3 100644
--- a/tests/testthat/test-generate_eval_data.R
+++ b/tests/testthat/test-generate_eval_data.R
@@ -1,77 +1,35 @@
-#' Helper function to check that the output files were created and have the expected contents
-#' for one evaluation window.
-#' @param out_path The path to the output directory where scores were saved.
-#' @param window_name The name of the evaluation window.
-#' @param model_out_tbl The model output table, filtered to data for the evaluation window.
-#' @param oracle_output The oracle output.
-check_exp_scores_for_window <- function(out_path, window_name, model_out_tbl, oracle_output) {
-  # check that the output files were created and have the expected contents
-  # no disaggregation
-  scores_path <- file.path(out_path, "wk inc flu hosp", window_name, "scores.csv")
-  testthat::expect_true(file.exists(scores_path))
-
-  actual_scores <- read.csv(scores_path)
-  expected_mean_scores <- hubEvals::score_model_out(
-    model_out_tbl = model_out_tbl |> dplyr::filter(.data[["output_type"]] == "mean"),
-    oracle_output = oracle_output,
-    metrics = "se_point",
-    by = "model_id"
-  )
-  expected_median_scores <- hubEvals::score_model_out(
-    model_out_tbl = model_out_tbl |> dplyr::filter(.data[["output_type"]] == "median"),
-    oracle_output = oracle_output,
-    metrics = "ae_point",
-    by = "model_id"
-  )
-  expected_quantile_scores <- hubEvals::score_model_out(
-    model_out_tbl = model_out_tbl |> dplyr::filter(.data[["output_type"]] == "quantile"),
-    oracle_output = oracle_output,
-    metrics = c("wis", "ae_median", "interval_coverage_50", "interval_coverage_95"),
-    by = "model_id"
-  )
-  expected_scores <- expected_mean_scores |>
-    dplyr::left_join(expected_median_scores, by = "model_id") |>
-    dplyr::left_join(expected_quantile_scores, by = "model_id")
-  expect_df_equal_up_to_order(actual_scores, expected_scores, ignore_attr = TRUE) # nolint: object_usage_linter
-
-  for (by in c("location", "reference_date", "horizon", "target_end_date")) {
-    # check that the output files were created and have the expected contents
-    # disaggregated by `by`
-    scores_path <- file.path(out_path, "wk inc flu hosp", window_name, by, "scores.csv")
-    testthat::expect_true(file.exists(scores_path))
-
-    actual_scores <- read.csv(scores_path)
-    if (by %in% c("reference_date", "target_end_date")) {
-      actual_scores[[by]] <- as.Date(actual_scores[[by]])
-    }
-
-    expected_mean_scores <- hubEvals::score_model_out(
-      model_out_tbl = model_out_tbl |> dplyr::filter(.data[["output_type"]] == "mean"),
-      oracle_output = oracle_output,
-      metrics = "se_point",
-      by = c("model_id", by)
-    )
-    expected_median_scores <- hubEvals::score_model_out(
-      model_out_tbl = model_out_tbl |> dplyr::filter(.data[["output_type"]] == "median"),
-      oracle_output = oracle_output,
-      metrics = "ae_point",
-      by = c("model_id", by)
+test_that(
+  "generate_eval_data works, integration test, no relative metrics",
+  {
+    out_path <- withr::local_tempdir()
+    hub_path <- test_path("testdata", "ecfh")
+    model_out_tbl <- hubData::connect_hub(hub_path) |>
+      dplyr::collect()
+    oracle_output <- read.csv(
+      test_path("testdata", "ecfh", "target-data", "oracle-output.csv")
     )
-    expected_quantile_scores <- hubEvals::score_model_out(
-      model_out_tbl = model_out_tbl |> dplyr::filter(.data[["output_type"]] == "quantile"),
-      oracle_output = oracle_output,
-      metrics = c("wis", "ae_median", "interval_coverage_50", "interval_coverage_95"),
-      by = c("model_id", by)
+    oracle_output[["target_end_date"]] <- as.Date(oracle_output[["target_end_date"]])
+
+    generate_eval_data(
+      hub_path = hub_path,
+      config_path = test_path("testdata", "test_configs", "config_valid_mean_median_quantile.yaml"),
+      out_path = out_path,
+      oracle_output = oracle_output
     )
-    expected_scores <- expected_mean_scores |>
-      dplyr::left_join(expected_median_scores, by = c("model_id", by)) |>
-      dplyr::left_join(expected_quantile_scores, by = c("model_id", by))
-    expect_df_equal_up_to_order(actual_scores, expected_scores, ignore_attr = TRUE) # nolint: object_usage_linter
+
+    check_exp_scores_for_window(out_path,
+                                "Full season",
+                                model_out_tbl,
+                                oracle_output)
+    check_exp_scores_for_window(out_path,
+                                "Last 5 weeks",
+                                model_out_tbl |> dplyr::filter(reference_date >= "2022-12-17"),
+                                oracle_output)
   }
-}
+)
 
 test_that(
-  "generate_eval_data works, integration test",
+  "generate_eval_data works, integration test, with relative metrics",
   {
     out_path <- withr::local_tempdir()
     hub_path <- test_path("testdata", "ecfh")
@@ -84,7 +42,7 @@ test_that(
 
     generate_eval_data(
       hub_path = hub_path,
-      config_path = test_path("testdata", "test_configs", "config_valid_mean_median_quantile.yaml"),
+      config_path = test_path("testdata", "test_configs", "config_valid_mean_median_quantile_rel.yaml"),
       out_path = out_path,
       oracle_output = oracle_output
     )
@@ -92,10 +50,12 @@ test_that(
     check_exp_scores_for_window(out_path,
                                 "Full season",
                                 model_out_tbl,
-                                oracle_output)
+                                oracle_output,
+                                include_rel = TRUE)
     check_exp_scores_for_window(out_path,
                                 "Last 5 weeks",
                                 model_out_tbl |> dplyr::filter(reference_date >= "2022-12-17"),
-                                oracle_output)
+                                oracle_output,
+                                include_rel = TRUE)
   }
 )
diff --git a/tests/testthat/testdata/test_configs/config_invalid_rel_metrics_no_baseline.yaml b/tests/testthat/testdata/test_configs/config_invalid_rel_metrics_no_baseline.yaml
new file mode 100644
index 0000000..395efcb
--- /dev/null
+++ b/tests/testthat/testdata/test_configs/config_invalid_rel_metrics_no_baseline.yaml
@@ -0,0 +1,90 @@
+targets:
+- target_id: wk inc flu hosp
+  metrics:
+  - wis
+  - ae_median
+  - interval_coverage_50
+  - interval_coverage_95
+  relative_metrics:
+  - wis
+  - ae_median
+  disaggregate_by:
+  - location
+  - reference_date
+  - horizon
+  - target_end_date
+- target_id: wk flu hosp rate category
+  metrics:
+  - log_score
+  - rps
+  disaggregate_by:
+  - location
+  - reference_date
+  - horizon
+  - target_end_date
+eval_windows:
+- window_name: Full season
+  min_round_id: '2023-01-21'
+- window_name: Last 4 weeks
+  min_round_id: '2023-01-21'
+  n_last_round_ids: 4
+task_id_text:
+  location:
+    US: United States
+    '01': Alabama
+    '02': Alaska
+    '04': Arizona
+    '05': Arkansas
+    '06': California
+    '08': Colorado
+    '09': Connecticut
+    '10': Delaware
+    '11': District of Columbia
+    '12': Florida
+    '13': Georgia
+    '15': Hawaii
+    '16': Idaho
+    '17': Illinois
+    '18': Indiana
+    '19': Iowa
+    '20': Kansas
+    '21': Kentucky
+    '22': Louisiana
+    '23': Maine
+    '24': Maryland
+    '25': Massachusetts
+    '26': Michigan
+    '27': Minnesota
+    '28': Mississippi
+    '29': Missouri
+    '30': Montana
+    '31': Nebraska
+    '32': Nevada
+    '33': New Hampshire
+    '34': New Jersey
+    '35': New Mexico
+    '36': New York
+    '37': North Carolina
+    '38': North Dakota
+    '39': Ohio
+    '40': Oklahoma
+    '41': Oregon
+    '42': Pennsylvania
+    '44': Rhode Island
+    '45': South Carolina
+    '46': South Dakota
+    '47': Tennessee
+    '48': Texas
+    '49': Utah
+    '50': Vermont
+    '51': Virginia
+    '53': Washington
+    '54': West Virginia
+    '55': Wisconsin
+    '56': Wyoming
+    '60': American Samoa
+    '66': Guam
+    '69': Northern Mariana Islands
+    '72': Puerto Rico
+    '74': U.S. Minor Outlying Islands
+    '78': Virgin Islands
diff --git a/tests/testthat/testdata/test_configs/config_invalid_rel_metrics_non_metric.yaml b/tests/testthat/testdata/test_configs/config_invalid_rel_metrics_non_metric.yaml
new file mode 100644
index 0000000..85f1a58
--- /dev/null
+++ b/tests/testthat/testdata/test_configs/config_invalid_rel_metrics_non_metric.yaml
@@ -0,0 +1,90 @@
+targets:
+- target_id: wk inc flu hosp
+  metrics:
+  - wis
+  - ae_median
+  - interval_coverage_50
+  - interval_coverage_95
+  relative_metrics:
+  - log_score
+  baseline: FS-base
+  disaggregate_by:
+  - location
+  - reference_date
+  - horizon
+  - target_end_date
+- target_id: wk flu hosp rate category
+  metrics:
+  - log_score
+  - rps
+  disaggregate_by:
+  - location
+  - reference_date
+  - horizon
+  - target_end_date
+eval_windows:
+- window_name: Full season
+  min_round_id: '2023-01-21'
+- window_name: Last 4 weeks
+  min_round_id: '2023-01-21'
+  n_last_round_ids: 4
+task_id_text:
+  location:
+    US: United States
+    '01': Alabama
+    '02': Alaska
+    '04': Arizona
+    '05': Arkansas
+    '06': California
+    '08': Colorado
+    '09': Connecticut
+    '10': Delaware
+    '11': District of Columbia
+    '12': Florida
+    '13': Georgia
+    '15': Hawaii
+    '16': Idaho
+    '17': Illinois
+    '18': Indiana
+    '19': Iowa
+    '20': Kansas
+    '21': Kentucky
+    '22': Louisiana
+    '23': Maine
+    '24': Maryland
+    '25': Massachusetts
+    '26': Michigan
+    '27': Minnesota
+    '28': Mississippi
+    '29': Missouri
+    '30': Montana
+    '31': Nebraska
+    '32': Nevada
+    '33': New Hampshire
+    '34': New Jersey
+    '35': New Mexico
+    '36': New York
+    '37': North Carolina
+    '38': North Dakota
+    '39': Ohio
+    '40': Oklahoma
+    '41': Oregon
+    '42': Pennsylvania
+    '44': Rhode Island
+    '45': South Carolina
+    '46': South Dakota
+    '47': Tennessee
+    '48': Texas
+    '49': Utah
+    '50': Vermont
+    '51': Virginia
+    '53': Washington
+    '54': West Virginia
+    '55': Wisconsin
+    '56': Wyoming
+    '60': American Samoa
+    '66': Guam
+    '69': Northern Mariana Islands
+    '72': Puerto Rico
+    '74': U.S. Minor Outlying Islands
+    '78': Virgin Islands
diff --git a/tests/testthat/testdata/test_configs/config_valid_length_one_arrays.yaml b/tests/testthat/testdata/test_configs/config_valid_length_one_arrays.yaml
new file mode 100644
index 0000000..277ba80
--- /dev/null
+++ b/tests/testthat/testdata/test_configs/config_valid_length_one_arrays.yaml
@@ -0,0 +1,9 @@
+targets:
+- target_id: wk inc flu hosp
+  metrics:
+  - wis
+  disaggregate_by:
+  - location
+eval_windows:
+- window_name: Full season
+  min_round_id: '2023-01-21'
diff --git a/tests/testthat/testdata/test_configs/config_valid_mean_median_quantile_rel.yaml b/tests/testthat/testdata/test_configs/config_valid_mean_median_quantile_rel.yaml
new file mode 100644
index 0000000..6c99e31
--- /dev/null
+++ b/tests/testthat/testdata/test_configs/config_valid_mean_median_quantile_rel.yaml
@@ -0,0 +1,86 @@
+targets:
+- target_id: wk inc flu hosp
+  metrics:
+  - se_point
+  - ae_point
+  - wis
+  - ae_median
+  - interval_coverage_50
+  - interval_coverage_95
+  relative_metrics:
+  - se_point
+  - ae_point
+  - wis
+  - ae_median
+  baseline: FS-base
+  disaggregate_by:
+  - location
+  - reference_date
+  - horizon
+  - target_end_date
+eval_windows:
+- window_name: Full season
+  min_round_id: '2022-10-22'
+- window_name: Last 5 weeks
+  min_round_id: '2022-10-22'
+  n_last_round_ids: 5
+task_id_text:
+  location:
+    US: United States
+    '01': Alabama
+    '02': Alaska
+    '04': Arizona
+    '05': Arkansas
+    '06': California
+    '08': Colorado
+    '09': Connecticut
+    '10': Delaware
+    '11': District of Columbia
+    '12': Florida
+    '13': Georgia
+    '15': Hawaii
+    '16': Idaho
+    '17': Illinois
+    '18': Indiana
+    '19': Iowa
+    '20': Kansas
+    '21': Kentucky
+    '22': Louisiana
+    '23': Maine
+    '24': Maryland
+    '25': Massachusetts
+    '26': Michigan
+    '27': Minnesota
+    '28': Mississippi
+    '29': Missouri
+    '30': Montana
+    '31': Nebraska
+    '32': Nevada
+    '33': New Hampshire
+    '34': New Jersey
+    '35': New Mexico
+    '36': New York
+    '37': North Carolina
+    '38': North Dakota
+    '39': Ohio
+    '40': Oklahoma
+    '41': Oregon
+    '42': Pennsylvania
+    '44': Rhode Island
+    '45': South Carolina
+    '46': South Dakota
+    '47': Tennessee
+    '48': Texas
+    '49': Utah
+    '50': Vermont
+    '51': Virginia
+    '53': Washington
+    '54': West Virginia
+    '55': Wisconsin
+    '56': Wyoming
+    '60': American Samoa
+    '66': Guam
+    '69': Northern Mariana Islands
+    '72': Puerto Rico
+    '74': U.S. Minor Outlying Islands
+    '78': Virgin Islands
diff --git a/tests/testthat/testdata/test_configs/config_valid_rel_metrics.yaml b/tests/testthat/testdata/test_configs/config_valid_rel_metrics.yaml
new file mode 100644
index 0000000..a95d8e8
--- /dev/null
+++ b/tests/testthat/testdata/test_configs/config_valid_rel_metrics.yaml
@@ -0,0 +1,91 @@
+targets:
+- target_id: wk inc flu hosp
+  metrics:
+  - wis
+  - ae_median
+  - interval_coverage_50
+  - interval_coverage_95
+  relative_metrics:
+  - wis
+  - ae_median
+  baseline: FS-base
+  disaggregate_by:
+  - location
+  - reference_date
+  - horizon
+  - target_end_date
+- target_id: wk flu hosp rate category
+  metrics:
+  - log_score
+  - rps
+  disaggregate_by:
+  - location
+  - reference_date
+  - horizon
+  - target_end_date
+eval_windows:
+- window_name: Full season
+  min_round_id: '2023-01-21'
+- window_name: Last 4 weeks
+  min_round_id: '2023-01-21'
+  n_last_round_ids: 4
+task_id_text:
+  location:
+    US: United States
+    '01': Alabama
+    '02': Alaska
+    '04': Arizona
+    '05': Arkansas
+    '06': California
+    '08': Colorado
+    '09': Connecticut
+    '10': Delaware
+    '11': District of Columbia
+    '12': Florida
+    '13': Georgia
+    '15': Hawaii
+    '16': Idaho
+    '17': Illinois
+    '18': Indiana
+    '19': Iowa
+    '20': Kansas
+    '21': Kentucky
+    '22': Louisiana
+    '23': Maine
+    '24': Maryland
+    '25': Massachusetts
+    '26': Michigan
+    '27': Minnesota
+    '28': Mississippi
+    '29': Missouri
+    '30': Montana
+    '31': Nebraska
+    '32': Nevada
+    '33': New Hampshire
+    '34': New Jersey
+    '35': New Mexico
+    '36': New York
+    '37': North Carolina
+    '38': North Dakota
+    '39': Ohio
+    '40': Oklahoma
+    '41': Oregon
+    '42': Pennsylvania
+    '44': Rhode Island
+    '45': South Carolina
+    '46': South Dakota
+    '47': Tennessee
+    '48': Texas
+    '49': Utah
+    '50': Vermont
+    '51': Virginia
+    '53': Washington
+    '54': West Virginia
+    '55': Wisconsin
+    '56': Wyoming
+    '60': American Samoa
+    '66': Guam
+    '69': Northern Mariana Islands
+    '72': Puerto Rico
+    '74': U.S. Minor Outlying Islands
+    '78': Virgin Islands

From 6495df9171a35abe7c1afe6fb96b311be79c1181 Mon Sep 17 00:00:00 2001
From: "Evan L. Ray" <elray@umass.edu>
Date: Wed, 8 Jan 2025 16:54:08 -0500
Subject: [PATCH 2/5] refactor expected scores to fixtures

---
 .../helper-check_exp_scores_for_window.R      | 83 ++++---------------
 .../testdata/create_exp_score_fixtures.R      | 55 ++++++++++++
 .../expected-scores/scores_Full season.csv    |  4 +
 .../scores_Full season_by_horizon.csv         | 13 +++
 .../scores_Full season_by_location.csv        |  7 ++
 .../scores_Full season_by_reference_date.csv  | 13 +++
 .../scores_Full season_by_target_end_date.csv | 49 +++++++++++
 .../expected-scores/scores_Last 5 weeks.csv   |  4 +
 .../scores_Last 5 weeks_by_horizon.csv        | 13 +++
 .../scores_Last 5 weeks_by_location.csv       |  7 ++
 .../scores_Last 5 weeks_by_reference_date.csv |  7 ++
 ...scores_Last 5 weeks_by_target_end_date.csv | 25 ++++++
 12 files changed, 213 insertions(+), 67 deletions(-)
 create mode 100644 tests/testthat/testdata/create_exp_score_fixtures.R
 create mode 100644 tests/testthat/testdata/expected-scores/scores_Full season.csv
 create mode 100644 tests/testthat/testdata/expected-scores/scores_Full season_by_horizon.csv
 create mode 100644 tests/testthat/testdata/expected-scores/scores_Full season_by_location.csv
 create mode 100644 tests/testthat/testdata/expected-scores/scores_Full season_by_reference_date.csv
 create mode 100644 tests/testthat/testdata/expected-scores/scores_Full season_by_target_end_date.csv
 create mode 100644 tests/testthat/testdata/expected-scores/scores_Last 5 weeks.csv
 create mode 100644 tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_horizon.csv
 create mode 100644 tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_location.csv
 create mode 100644 tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_reference_date.csv
 create mode 100644 tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_target_end_date.csv

diff --git a/tests/testthat/helper-check_exp_scores_for_window.R b/tests/testthat/helper-check_exp_scores_for_window.R
index 4a63eb5..57d6667 100644
--- a/tests/testthat/helper-check_exp_scores_for_window.R
+++ b/tests/testthat/helper-check_exp_scores_for_window.R
@@ -8,78 +8,27 @@
 check_exp_scores_for_window <- function(out_path, window_name, model_out_tbl, oracle_output,
                                         include_rel = FALSE) {
   # check that the output files were created and have the expected contents
-  # no disaggregation
-  scores_path <- file.path(out_path, "wk inc flu hosp", window_name, "scores.csv")
-  testthat::expect_true(file.exists(scores_path))
-
-  actual_scores <- read.csv(scores_path)
-  expected_mean_scores <- hubEvals::score_model_out(
-    model_out_tbl = model_out_tbl |> dplyr::filter(.data[["output_type"]] == "mean"),
-    oracle_output = oracle_output,
-    metrics = "se_point",
-    relative_metrics = if (include_rel) "se_point" else NULL,
-    baseline = "FS-base",
-    by = "model_id"
-  )
-  expected_median_scores <- hubEvals::score_model_out(
-    model_out_tbl = model_out_tbl |> dplyr::filter(.data[["output_type"]] == "median"),
-    oracle_output = oracle_output,
-    metrics = "ae_point",
-    relative_metrics = if (include_rel) "ae_point" else NULL,
-    baseline = "FS-base",
-    by = "model_id"
-  )
-  expected_quantile_scores <- hubEvals::score_model_out(
-    model_out_tbl = model_out_tbl |> dplyr::filter(.data[["output_type"]] == "quantile"),
-    oracle_output = oracle_output,
-    metrics = c("wis", "ae_median", "interval_coverage_50", "interval_coverage_95"),
-    relative_metrics = if (include_rel) c("wis", "ae_median") else NULL,
-    baseline = "FS-base",
-    by = "model_id"
-  )
-  expected_scores <- expected_mean_scores |>
-    dplyr::left_join(expected_median_scores, by = "model_id") |>
-    dplyr::left_join(expected_quantile_scores, by = "model_id")
-  expect_df_equal_up_to_order(actual_scores, expected_scores, ignore_attr = TRUE) # nolint: object_usage_linter
-
-  for (by in c("location", "reference_date", "horizon", "target_end_date")) {
-    # check that the output files were created and have the expected contents
-    # disaggregated by `by`
-    scores_path <- file.path(out_path, "wk inc flu hosp", window_name, by, "scores.csv")
+  # one pass per score file: disaggregated by `by` if non-NULL, otherwise no disaggregation
+  for (by in list(NULL, "location", "reference_date", "horizon", "target_end_date")) {
+    if (is.null(by)) {
+      scores_path <- file.path(out_path, "wk inc flu hosp", window_name, "scores.csv")
+    } else {
+      scores_path <- file.path(out_path, "wk inc flu hosp", window_name, by, "scores.csv")
+    }
     testthat::expect_true(file.exists(scores_path))
 
     actual_scores <- read.csv(scores_path)
-    if (by %in% c("reference_date", "target_end_date")) {
-      actual_scores[[by]] <- as.Date(actual_scores[[by]])
-    }
 
-    expected_mean_scores <- hubEvals::score_model_out(
-      model_out_tbl = model_out_tbl |> dplyr::filter(.data[["output_type"]] == "mean"),
-      oracle_output = oracle_output,
-      metrics = "se_point",
-      relative_metrics = if (include_rel) "se_point" else NULL,
-      baseline = "FS-base",
-      by = c("model_id", by)
+    expected_scores_path <- testthat::test_path(
+      "testdata", "expected-scores",
+      paste0("scores_", window_name, if (is.null(by)) "" else paste0("_by_", by), ".csv")
     )
-    expected_median_scores <- hubEvals::score_model_out(
-      model_out_tbl = model_out_tbl |> dplyr::filter(.data[["output_type"]] == "median"),
-      oracle_output = oracle_output,
-      metrics = "ae_point",
-      relative_metrics = if (include_rel) "ae_point" else NULL,
-      baseline = "FS-base",
-      by = c("model_id", by)
-    )
-    expected_quantile_scores <- hubEvals::score_model_out(
-      model_out_tbl = model_out_tbl |> dplyr::filter(.data[["output_type"]] == "quantile"),
-      oracle_output = oracle_output,
-      metrics = c("wis", "ae_median", "interval_coverage_50", "interval_coverage_95"),
-      relative_metrics = if (include_rel) c("wis", "ae_median") else NULL,
-      baseline = "FS-base",
-      by = c("model_id", by)
-    )
-    expected_scores <- expected_mean_scores |>
-      dplyr::left_join(expected_median_scores, by = c("model_id", by)) |>
-      dplyr::left_join(expected_quantile_scores, by = c("model_id", by))
+    expected_scores <- read.csv(expected_scores_path)
+    if (!include_rel) { # fixture files include relative skill columns; drop them when not requested
+      expected_scores <- expected_scores |>
+        dplyr::select(-dplyr::contains("relative"))
+    }
+
     expect_df_equal_up_to_order(actual_scores, expected_scores, ignore_attr = TRUE) # nolint: object_usage_linter
   }
 }
diff --git a/tests/testthat/testdata/create_exp_score_fixtures.R b/tests/testthat/testdata/create_exp_score_fixtures.R
new file mode 100644
index 0000000..78599b0
--- /dev/null
+++ b/tests/testthat/testdata/create_exp_score_fixtures.R
@@ -0,0 +1,55 @@
+library(testthat)
+library(hubData)
+library(hubEvals)
+library(dplyr)
+
+# Example hub used by the tests; model outputs are collected into a local table.
+hub_path <- testthat::test_path("testdata", "ecfh")
+hub_connection <- hubData::connect_hub(hub_path)
+model_out_tbl <- dplyr::collect(hub_connection)
+oracle_output_path <- file.path(hub_path, "target-data", "oracle-output.csv")
+oracle_output <- read.csv(oracle_output_path)
+oracle_output[["target_end_date"]] <- as.Date(oracle_output[["target_end_date"]])
+
+# Write the expected-score CSV fixtures for one evaluation window.
+# `window_name` is only used to build file names; `model_out_tbl` is scored as
+# given, so callers filter it to the window first. `oracle_output` is read from
+# the script's top-level environment.
+make_score_fixtures_one_window <- function(window_name, model_out_tbl) {
+  # metrics per output type; `rel` lists those also scored relative to FS-base
+  metric_specs <- list(
+    mean = list(metrics = "se_point", rel = "se_point"),
+    median = list(metrics = "ae_point", rel = "ae_point"),
+    quantile = list(
+      metrics = c("wis", "ae_median", "interval_coverage_50", "interval_coverage_95"),
+      rel = c("wis", "ae_median")
+    )
+  )
+
+  # the output directory does not depend on `by`, so create it once up front
+  save_path <- testthat::test_path("testdata", "expected-scores")
+  if (!dir.exists(save_path)) {
+    dir.create(save_path, recursive = TRUE)
+  }
+
+  for (by in list(NULL, "location", "reference_date", "horizon", "target_end_date")) {
+    score_by <- c("model_id", by)
+    # `spec_type` must not share a name with a data column, or filter() would mask it
+    scores_by_type <- lapply(names(metric_specs), function(spec_type) {
+      hubEvals::score_model_out(
+        model_out_tbl = model_out_tbl |> dplyr::filter(.data[["output_type"]] == spec_type),
+        oracle_output = oracle_output,
+        metrics = metric_specs[[spec_type]][["metrics"]],
+        relative_metrics = metric_specs[[spec_type]][["rel"]],
+        baseline = "FS-base",
+        by = score_by
+      )
+    })
+    expected_scores <- Reduce(function(x, y) dplyr::left_join(x, y, by = score_by), scores_by_type)
+    file_name <- paste0("scores_", window_name, if (is.null(by)) "" else paste0("_by_", by), ".csv")
+    write.csv(expected_scores, file = file.path(save_path, file_name), row.names = FALSE)
+  }
+}
+
+make_score_fixtures_one_window(window_name = "Full season", model_out_tbl = model_out_tbl)
+make_score_fixtures_one_window("Last 5 weeks", dplyr::filter(model_out_tbl, reference_date >= "2022-12-17"))
diff --git a/tests/testthat/testdata/expected-scores/scores_Full season.csv b/tests/testthat/testdata/expected-scores/scores_Full season.csv
new file mode 100644
index 0000000..8113645
--- /dev/null
+++ b/tests/testthat/testdata/expected-scores/scores_Full season.csv	
@@ -0,0 +1,4 @@
+"model_id","se_point","se_point_relative_skill","se_point_scaled_relative_skill","ae_point","ae_point_relative_skill","ae_point_scaled_relative_skill","wis","ae_median","interval_coverage_50","interval_coverage_95","wis_relative_skill","wis_scaled_relative_skill","ae_median_relative_skill","ae_median_scaled_relative_skill"
+"FS-base",39601755.4396545,1.91088968575342,1,3837.4375,1.52457821016562,1,3132.195625,3837.4375,0,0.40625,1.71310584625945,1,1.52457821016562,1
+"MOBS-GLEAM",14746519.7514426,0.71155867160308,0.372370355498847,1915.1875,0.760886172317223,0.49907978957312,1251.3253125,1915.1875,0.4375,0.71875,0.684392983409579,0.399504201625338,0.760886172317223,0.49907978957312
+"PSI-DICE",15241667.712703,0.735450873390352,0.384873537637198,2169.8125,0.862046315450087,0.565432661769736,1559.4625,2169.8125,0.28125,0.6875,0.852923841809011,0.497881577878776,0.862046315450087,0.565432661769736
diff --git a/tests/testthat/testdata/expected-scores/scores_Full season_by_horizon.csv b/tests/testthat/testdata/expected-scores/scores_Full season_by_horizon.csv
new file mode 100644
index 0000000..b1ec707
--- /dev/null
+++ b/tests/testthat/testdata/expected-scores/scores_Full season_by_horizon.csv	
@@ -0,0 +1,13 @@
+"model_id","horizon","se_point","se_point_relative_skill","se_point_scaled_relative_skill","ae_point","ae_point_relative_skill","ae_point_scaled_relative_skill","wis","ae_median","interval_coverage_50","interval_coverage_95","wis_relative_skill","wis_scaled_relative_skill","ae_median_relative_skill","ae_median_scaled_relative_skill"
+"FS-base",0,6970257.38545523,1.27454923973301,1,1573.75,1.25894719773911,1,1177.7875,1573.75,0,0.25,1.62373337952405,1,1.25894719773911,1
+"FS-base",1,29097350.747173,1.71928082758726,1,3469,1.43574766705554,1,2767.495,3469,0,0.5,1.62238807312521,1,1.43574766705554,1
+"FS-base",2,57604120.7825812,1.73531847157679,1,4730.75,1.43991498437613,1,3968.39875,4730.75,0,0.5,1.55262559465219,1,1.43991498437613,1
+"FS-base",3,64735292.8434087,2.49329936252022,1,5576.25,1.8377267323066,1,4615.10125,5576.25,0,0.375,2.03279570421319,1,1.8377267323066,1
+"MOBS-GLEAM",0,7264860.05568886,1.32841893042092,1.04226568029588,1290.25,1.03215670969524,0.819857029388404,672.825,1290.25,0.5,0.75,0.927576843087799,0.571261793829532,1.03215670969524,0.819857029388404
+"MOBS-GLEAM",1,13523503.2122718,0.799065866741583,0.46476750855353,2051.625,0.849125340854086,0.591416834822715,1265.94125,2051.625,0.5,0.75,0.742132500791228,0.457432172415849,0.849125340854086,0.591416834822715
+"MOBS-GLEAM",2,23511851.0974193,0.708292201252593,0.408162658816745,2339.5,0.712081827606183,0.494530465570998,1799.845,2339.5,0.375,0.625,0.704184632002208,0.453544392432842,0.712081827606183,0.494530465570998
+"MOBS-GLEAM",3,14685864.6403903,0.565630513706274,0.226860248796814,1979.375,0.652329137100986,0.354965254427258,1266.69,1979.375,0.375,0.75,0.557934019447527,0.274466351090347,0.652329137100986,0.354965254427258
+"PSI-DICE",0,3229985.12986495,0.590620240252353,0.463395388612898,962,0.76956772309771,0.611278792692613,481.60125,962,0.125,1,0.663950012413537,0.408903346316717,0.76956772309771,0.611278792692613
+"PSI-DICE",1,12319050.3206928,0.727898124282247,0.42337360633733,1981.875,0.82025725213194,0.571310175843182,1416.75875,1981.875,0.375,0.625,0.830546215438793,0.51192820583235,0.82025725213194,0.571310175843182
+"PSI-DICE",2,27007396.0423284,0.813595148832029,0.468844861711614,3204.25,0.975288820733965,0.677323891560535,2337.735,3204.25,0.25,0.5,0.914632682644162,0.589087727134275,0.975288820733965,0.677323891560535
+"PSI-DICE",3,18410239.3579258,0.709075931207976,0.284392617215144,2531.125,0.834165626596645,0.453911678995741,2001.755,2531.125,0.375,0.625,0.881705242087001,0.433740213175171,0.834165626596645,0.453911678995741
diff --git a/tests/testthat/testdata/expected-scores/scores_Full season_by_location.csv b/tests/testthat/testdata/expected-scores/scores_Full season_by_location.csv
new file mode 100644
index 0000000..7226881
--- /dev/null
+++ b/tests/testthat/testdata/expected-scores/scores_Full season_by_location.csv	
@@ -0,0 +1,7 @@
+"model_id","location","se_point","se_point_relative_skill","se_point_scaled_relative_skill","ae_point","ae_point_relative_skill","ae_point_scaled_relative_skill","wis","ae_median","interval_coverage_50","interval_coverage_95","wis_relative_skill","wis_scaled_relative_skill","ae_median_relative_skill","ae_median_scaled_relative_skill"
+"FS-base","01",14365.9170528647,0.55883962141545,1,102.9375,0.96400016707104,1,73.93875,102.9375,0,0.6875,1.01550810548672,1,0.96400016707104,1
+"FS-base","US",79189144.9622562,1.91728375259368,1,7571.9375,1.54933892775112,1,6190.4525,7571.9375,0,0.125,1.74202276281089,1,1.54933892775112,1
+"MOBS-GLEAM","01",300919.487866858,11.7058821972324,20.9467649548242,236.5625,2.21538593343284,2.29811778992107,152.659375,236.5625,0.375,0.625,2.09669263668964,2.06467346283241,2.21538593343284,2.29811778992107
+"MOBS-GLEAM","US",29192120.0150183,0.706783454162007,0.368637898905615,3593.8125,0.735351236759228,0.474622578435176,2349.99125,3593.8125,0.5,0.8125,0.661298709570329,0.379615423912872,0.735351236759228,0.474622578435176
+"PSI-DICE","01",3929.65863864692,0.152865211307584,0.273540395937571,50,0.468245375626492,0.485731633272617,34.195625,50,0.3125,0.75,0.469658120534688,0.462485841321364,0.468245375626492,0.485731633272617
+"PSI-DICE","US",30479405.7667673,0.737950504367565,0.384893734883672,4289.625,0.877725548837983,0.566516165776593,3084.729375,4289.625,0.25,0.625,0.868057510878472,0.498304344472395,0.877725548837983,0.566516165776593
diff --git a/tests/testthat/testdata/expected-scores/scores_Full season_by_reference_date.csv b/tests/testthat/testdata/expected-scores/scores_Full season_by_reference_date.csv
new file mode 100644
index 0000000..77887db
--- /dev/null
+++ b/tests/testthat/testdata/expected-scores/scores_Full season_by_reference_date.csv	
@@ -0,0 +1,13 @@
+"model_id","reference_date","se_point","se_point_relative_skill","se_point_scaled_relative_skill","ae_point","ae_point_relative_skill","ae_point_scaled_relative_skill","wis","ae_median","interval_coverage_50","interval_coverage_95","wis_relative_skill","wis_scaled_relative_skill","ae_median_relative_skill","ae_median_scaled_relative_skill"
+"FS-base",2022-10-22,10208266.5669188,1.94998063334083,1,1994.75,1.6213568310999,1,1780.68125,1994.75,0,0,1.68730459407208,1,1.6213568310999,1
+"FS-base",2022-11-19,83701352.6732892,1.63940181445449,1,5850.25,1.28116609986496,1,5404.70125,5850.25,0,0.375,1.48340297594654,1,1.28116609986496,1
+"FS-base",2022-12-17,21306295.3973807,3.10891393913009,1,2894.375,1.82701375914094,1,1937.05,2894.375,0,0.75,1.73261142871645,1,1.82701375914094,1
+"FS-base",2023-01-14,43191107.1210294,3.1778410567911,1,4610.375,1.98229121430346,1,3406.35,4610.375,0,0.5,2.56485081622142,1,1.98229121430346,1
+"MOBS-GLEAM",2022-10-22,2877650.25924563,0.549688062931421,0.281894113989051,693,0.56327874869143,0.347411956385512,603.08375,693,0.5,0.5,0.571458806558006,0.338681473733718,0.56327874869143,0.347411956385512
+"MOBS-GLEAM",2022-11-19,41097190.9787231,0.80494289886192,0.490997930931146,4082.125,0.893958406121322,0.697769326097175,2797.1575,4082.125,0.25,0.375,0.767722685817498,0.517541556991702,0.893958406121322,0.697769326097175
+"MOBS-GLEAM",2022-12-17,11393303.8117821,1.6624570518978,0.534738845927327,1834.625,1.1580687083961,0.633858777801771,1061.10375,1834.625,0.5,1,0.949113592475093,0.547793681113033,1.1580687083961,0.633858777801771
+"MOBS-GLEAM",2023-01-14,3617933.95601942,0.266194127276707,0.0837657146847685,1051,0.451891237965012,0.227964102703142,543.95625,1051,0.5,1,0.40957817951803,0.159688889867453,0.451891237965012,0.227964102703142
+"PSI-DICE",2022-10-22,4883993.75672922,0.932939317027852,0.478435170624993,1347.125,1.09495942904898,0.675335255044492,1094.49625,1347.125,0,0.375,1.03710226118215,0.614650291847572,1.09495942904898,0.675335255044492
+"PSI-DICE",2022-11-19,38689808.514466,0.757791125874522,0.462236359136077,3987,0.873126659572088,0.681509337207812,3199.25625,3987,0.25,0.625,0.878084841796866,0.591939517471017,0.873126659572088,0.681509337207812
+"PSI-DICE",2022-12-17,1325989.57383748,0.193482132503959,0.0622346376555207,748.75,0.472632797117437,0.258691427337508,679.86125,748.75,0.875,1,0.608107881413205,0.350977646421104,0.472632797117437,0.258691427337508
+"PSI-DICE",2023-01-14,16066879.0057793,1.18214121291188,0.371995072058629,2596.375,1.11634549283673,0.563159179025567,1264.23625,2596.375,0,0.75,0.951921375580667,0.371141030722034,1.11634549283673,0.563159179025567
diff --git a/tests/testthat/testdata/expected-scores/scores_Full season_by_target_end_date.csv b/tests/testthat/testdata/expected-scores/scores_Full season_by_target_end_date.csv
new file mode 100644
index 0000000..4c19d40
--- /dev/null
+++ b/tests/testthat/testdata/expected-scores/scores_Full season_by_target_end_date.csv	
@@ -0,0 +1,49 @@
+"model_id","target_end_date","se_point","se_point_relative_skill","se_point_scaled_relative_skill","ae_point","ae_point_relative_skill","ae_point_scaled_relative_skill","wis","ae_median","interval_coverage_50","interval_coverage_95","wis_relative_skill","wis_scaled_relative_skill","ae_median_relative_skill","ae_median_scaled_relative_skill"
+"FS-base",2022-10-22,216285.49784771,3.86853672481114,1,353,2.07324050161915,1,212.02,353,0,0,1.92239967613821,1,2.07324050161915,1
+"FS-base",2022-10-29,3467107.14440222,3.53885633168634,1,1400,2.02187326095055,1,1203.76,1400,0,0,1.95605930386773,1,2.02187326095055,1
+"FS-base",2022-11-05,11788212.6451863,1.97891050438188,1,2558,1.60154169511087,1,2317.32,2558,0,0,1.67613393183549,1,1.60154169511087,1
+"FS-base",2022-11-12,25361460.9802391,1.85658122072428,1,3668,1.51437249673553,1,3389.625,3668,0,0,1.61492150895569,1,1.51437249673553,1
+"FS-base",2022-11-19,3653912.05049764,4.41757782959963,1,1390,1.90325785111796,1,1078.76,1390,0,0,2.56531239572762,1,1.90325785111796,1
+"FS-base",2022-11-26,61882926.7067664,1.70436957818155,1,5582,1.28433608386802,1,5142.425,5582,0,0.5,1.48220397976422,1,1.28433608386802,1
+"FS-base",2022-12-03,154904704.802538,1.49573443825615,1,8825.5,1.1988137044987,1,8336.7,8825.5,0,0.5,1.3303629191629,1,1.1988137044987,1
+"FS-base",2022-12-10,114363867.133355,1.82656943886595,1,7603.5,1.31743195769032,1,7060.92,7603.5,0,0.5,1.62424624888108,1,1.31743195769032,1
+"FS-base",2022-12-17,3175383.68141869,0.704102187093623,1,1303,0.932532186623287,1,802.62,1303,0,0.5,1.0219261709135,1,0.932532186623287,1
+"FS-base",2022-12-24,10914437.3525853,2.02860323794238,1,2371.5,2.1175276905063,1,1431.24,2371.5,0,1,1.41285865848852,1,2.1175276905063,1
+"FS-base",2022-12-31,10536520.7055104,6.81438673555078,1,2337.5,1.62564875830736,1,1404.75,2337.5,0,1,1.4301951892091,1,1.62564875830736,1
+"FS-base",2023-01-07,60598839.8500084,30.2439038289645,1,5565.5,5.28287394851579,1,4109.59,5565.5,0,0.5,3.34901399077292,1,5.28287394851579,1
+"FS-base",2023-01-14,20835448.3120569,1.75445319649409,1,3249,1.38977013494529,1,2617.75,3249,0,0.5,1.90292745531258,1,1.38977013494529,1
+"FS-base",2023-01-21,40124931.7849381,2.77673880767223,1,4522.5,1.78249219444739,1,3292.555,4522.5,0,0.5,2.27799926546679,1,1.78249219444739,1
+"FS-base",2023-01-28,53187044.9770904,4.18873826922126,1,5202,2.28442232579952,1,3814.825,5202,0,0.5,2.93007181293188,1,2.28442232579952,1
+"FS-base",2023-02-04,58617003.4100323,6.83431778418711,1,5468,3.07955311475367,1,3900.27,5468,0,0.5,3.90397251191362,1,3.07955311475367,1
+"MOBS-GLEAM",2022-10-22,22805.4038560825,0.407903179914191,0.10544120656735,88.5,0.519778426043329,0.25070821529745,88.83,88.5,0.5,0.5,0.805427616410514,0.418969908499198,0.519778426043329,0.25070821529745
+"MOBS-GLEAM",2022-10-29,181128.553200142,0.184876873037054,0.0522419832027924,256.5,0.370436065309869,0.183214285714286,287.81,256.5,0.5,0.5,0.467679128934482,0.239092510134911,0.370436065309869,0.183214285714286
+"MOBS-GLEAM",2022-11-05,3195096.74417106,0.536367191521472,0.271041661729421,903,0.565360496749459,0.353010164190774,783.8,903,0.5,0.5,0.566928078889691,0.338235547960575,0.565360496749459,0.353010164190774
+"MOBS-GLEAM",2022-11-12,8111570.33575524,0.593806057453923,0.319838448663329,1524,0.629199477923923,0.415485278080698,1251.895,1524,0.5,0.5,0.596441247174566,0.369331415717078,0.629199477923923,0.415485278080698
+"MOBS-GLEAM",2022-11-19,229016.022573171,0.276880255999742,0.0626769389651786,435.5,0.59630848500854,0.313309352517986,227.45,435.5,0.5,0.5,0.540880552123038,0.210843931921836,0.59630848500854,0.313309352517986
+"MOBS-GLEAM",2022-11-26,25662609.986773,0.706795461783021,0.414696126257502,3694,0.8499350580094,0.661769974919384,2586.02,3694,0.5,0.5,0.745369963733039,0.502879478067254,0.8499350580094,0.661769974919384
+"MOBS-GLEAM",2022-12-03,88681402.0744181,0.856293082154173,0.57249004920453,6909.5,0.938553429407263,0.782901818593847,5303.975,6909.5,0,0,0.846403452705154,0.636219967133278,0.938553429407263,0.782901818593847
+"MOBS-GLEAM",2022-12-10,49815735.8311282,0.795635045618531,0.435589815907853,5289.5,0.916493238666794,0.695666469389097,3071.185,5289.5,0,0.5,0.706474611788527,0.434955359924769,0.916493238666794,0.695666469389097
+"MOBS-GLEAM",2022-12-17,21404754.7053754,4.74624049069998,6.74084043154503,2955,2.1148370003621,2.26784343821949,1512.54,2955,0.5,1,1.9258231922373,1.88450325185019,2.1148370003621,2.26784343821949
+"MOBS-GLEAM",2022-12-24,23836982.874849,4.43044190741004,2.18398641219949,2976.5,2.65773610406578,1.2551127978073,1524.14,2976.5,0.5,1,1.50456554857934,1.06490875045415,2.65773610406578,1.2551127978073
+"MOBS-GLEAM",2022-12-31,130104.946396882,0.0841440401187458,0.0123479989299352,712.5,0.495518605473366,0.304812834224599,679.09,712.5,0.5,1,0.691390817611683,0.48342409681438,0.495518605473366,0.304812834224599
+"MOBS-GLEAM",2023-01-07,201372.720507006,0.100501877723489,0.0033230458042668,694.5,0.659232046939936,0.124786631928847,528.645,694.5,0.5,1,0.43080684475876,0.128636919984719,0.659232046939936,0.124786631928847
+"MOBS-GLEAM",2023-01-14,7402864.09095083,0.623359688404893,0.355301406529708,1682,0.719480876262846,0.517697753154817,862.48,1682,0.5,1,0.626964710785212,0.329473784738802,0.719480876262846,0.517697753154817
+"MOBS-GLEAM",2023-01-21,4413291.43426512,0.305410054296752,0.109988758558382,1279.5,0.504300445062562,0.282918739635158,665.795,1279.5,0.5,1,0.460639388241491,0.20221226372832,0.504300445062562,0.282918739635158
+"MOBS-GLEAM",2023-01-28,2040800.62469103,0.160722967034098,0.0383702577492334,833,0.365806189425413,0.160130718954248,432.515,833,0.5,1,0.332203970082568,0.113377415739909,0.365806189425413,0.160130718954248
+"MOBS-GLEAM",2023-02-04,614779.674170713,0.0716788545322083,0.0104880774929804,409.5,0.230628566293275,0.0748902706656913,215.035,409.5,0.5,1,0.215239131931724,0.055133362562079,0.230628566293275,0.0748902706656913
+"PSI-DICE",2022-10-22,35430.4657558039,0.633718207265103,0.163813413790466,158,0.927966003557581,0.447592067988669,71.23,158,0,1,0.645847226352819,0.335958871804547,0.927966003557581,0.447592067988669
+"PSI-DICE",2022-10-29,1497472.35088465,1.52846142035425,0.431908299488921,924.5,1.33515844982056,0.660357142857143,672.71,924.5,0,0,1.09312541894137,0.558840632684256,1.33515844982056,0.660357142857143
+"PSI-DICE",2022-11-05,5612203.99621589,0.942131752720016,0.476086084051736,1764,1.1044251564408,0.689601250977326,1454.925,1764,0,0,1.05235753403749,0.627848117653151,1.1044251564408,0.689601250977326
+"PSI-DICE",2022-11-12,12390868.2140605,0.907071294221524,0.488570757958902,2542,1.04949151763951,0.693020719738277,2179.12,2542,0,0.5,1.03819972964429,0.642879374562083,1.04949151763951,0.693020719738277
+"PSI-DICE",2022-11-19,676235.05850397,0.817567845301119,0.185071520375503,643.5,0.881112537549933,0.46294964028777,303.07,643.5,0,1,0.720706392314482,0.280942934480329,0.881112537549933,0.46294964028777
+"PSI-DICE",2022-11-26,30140437.0403912,0.830123051680458,0.487055778457478,3981.5,0.916084578631409,0.713274811895378,3140.365,3981.5,0.5,0.5,0.905149127291555,0.610677841679752,0.916084578631409,0.713274811895378
+"PSI-DICE",2022-12-03,80859897.561446,0.780769916644506,0.521997686671434,6543,0.888769822506943,0.741374426378109,5565.145,6543,0,0.5,0.888080721120448,0.667547710724867,0.888769822506943,0.741374426378109
+"PSI-DICE",2022-12-10,43082664.3975227,0.68809738692792,0.376715701186336,4780,0.828213948544716,0.628657854935227,3788.445,4780,0.5,0.5,0.871468247812224,0.536537023504019,0.828213948544716,0.628657854935227
+"PSI-DICE",2022-12-17,1349506.81439674,0.299236500166799,0.424990158604649,708.5,0.507059903470912,0.543745203376823,399.075,708.5,0.5,1,0.508117398840427,0.497215369664349,0.507059903470912,0.543745203376823
+"PSI-DICE",2022-12-24,598632.205240062,0.111264299812846,0.054847738449684,199,0.177688387269978,0.0839131351465317,476.545,199,1,1,0.470424757140252,0.332959531594981,0.177688387269978,0.0839131351465317
+"PSI-DICE",2022-12-31,2696623.46753519,1.74401357920139,0.255931112641852,1785,1.2414045063438,0.763636363636364,993.31,1785,1,1,1.01130249752148,0.707108026339206,1.2414045063438,0.763636363636364
+"PSI-DICE",2023-01-07,659195.808177916,0.328993998504522,0.0108780268699785,302.5,0.287138508566351,0.0543527086515138,850.515,302.5,1,1,0.693107252636451,0.20695860170966,0.287138508566351,0.0543527086515138
+"PSI-DICE",2023-01-14,10858768.1808033,0.914364800769576,0.521167964239081,2338,1.00008697306928,0.719606032625423,1153.03,2338,0,1,0.83817493794253,0.440466049087957,1.00008697306928,0.719606032625423
+"PSI-DICE",2023-01-21,17039659.6862554,1.1791841684355,0.424665137814682,2822.5,1.11245643312941,0.624101713653952,1377.415,2822.5,0,1,0.952983430267054,0.418342290409727,1.11245643312941,0.624101713653952
+"PSI-DICE",2023-01-28,18860859.1441164,1.48538431720322,0.354613781462017,2725,1.19666490538325,0.523836985774702,1337.56,2725,0,0.5,1.02734643243272,0.350621588146245,1.19666490538325,0.523836985774702
+"PSI-DICE",2023-02-04,17508229.011942,2.04133261587809,0.298688571462278,2500,1.40798880520925,0.457205559619605,1188.94,2500,0,0.5,1.19006865635317,0.304835306273668,1.40798880520925,0.457205559619605
diff --git a/tests/testthat/testdata/expected-scores/scores_Last 5 weeks.csv b/tests/testthat/testdata/expected-scores/scores_Last 5 weeks.csv
new file mode 100644
index 0000000..3cd8cba
--- /dev/null
+++ b/tests/testthat/testdata/expected-scores/scores_Last 5 weeks.csv	
@@ -0,0 +1,4 @@
+"model_id","se_point","se_point_relative_skill","se_point_scaled_relative_skill","ae_point","ae_point_relative_skill","ae_point_scaled_relative_skill","wis","ae_median","interval_coverage_50","interval_coverage_95","wis_relative_skill","wis_scaled_relative_skill","ae_median_relative_skill","ae_median_scaled_relative_skill"
+"FS-base",32248701.2592051,2.516318304043,1,3752.375,1.80028006322584,1,2671.7,3752.375,0,0.625,2.09158420206952,1,1.80028006322584,1
+"MOBS-GLEAM",7505618.88390074,0.585652303605228,0.232741741243249,1442.8125,0.692219348738606,0.384506479229821,802.53,1442.8125,0.5,1,0.628273784364583,0.300381779391399,0.692219348738606,0.384506479229821
+"PSI-DICE",8696434.28980838,0.678569862626817,0.269667737001535,1672.5625,0.802446696625247,0.445734368233452,972.04875,1672.5625,0.4375,0.875,0.760984320523049,0.363831549200883,0.802446696625247,0.445734368233452
diff --git a/tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_horizon.csv b/tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_horizon.csv
new file mode 100644
index 0000000..7de7369
--- /dev/null
+++ b/tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_horizon.csv	
@@ -0,0 +1,13 @@
+"model_id","horizon","se_point","se_point_relative_skill","se_point_scaled_relative_skill","ae_point","ae_point_relative_skill","ae_point_scaled_relative_skill","wis","ae_median","interval_coverage_50","interval_coverage_95","wis_relative_skill","wis_scaled_relative_skill","ae_median_relative_skill","ae_median_scaled_relative_skill"
+"FS-base",0,12005415.9967378,1.17910075563572,1,2276,1.1362016338983,1,1710.185,2276,0,0.5,1.46955530884177,1,1.1362016338983,1
+"FS-base",1,25519684.5687617,1.73557665681884,1,3447,1.54610664512698,1,2361.8975,3447,0,0.75,1.76475125714419,1,1.54610664512698,1
+"FS-base",2,31861782.8413004,4.42711075210795,1,3769.75,2.01285819422435,1,2609.7875,3769.75,0,0.75,2.19078914445878,1,2.01285819422435,1
+"FS-base",3,59607921.6300204,9.85979244980579,1,5516.75,3.40124018345902,1,4004.93,5516.75,0,0.5,3.48430801951263,1,3.40124018345902,1
+"MOBS-GLEAM",0,14403809.3981631,1.41465673076401,1.19977595129373,2318.5,1.15741805280897,1.01867311072056,1187.51,2318.5,0.5,1,1.02042271730993,0.694375169937755,1.15741805280897,1.01867311072056
+"MOBS-GLEAM",1,14125137.1545571,0.96064111818304,0.553499676553504,2128,0.954486492843113,0.617348418914998,1094.9675,2128,0.5,1,0.818132570171667,0.463596536259512,0.954486492843113,0.617348418914998
+"MOBS-GLEAM",2,1085452.78554396,0.150820803773674,0.0340675470343409,772.75,0.412609899751142,0.204987068107965,555.8025,772.75,0.5,1,0.466569053404943,0.212968488813744,0.412609899751142,0.204987068107965
+"MOBS-GLEAM",3,408076.197338859,0.0675001996285132,0.00684600613777045,552,0.340324390496103,0.100058911496805,371.84,552,0.5,1,0.323502556592893,0.0928455678376401,0.340324390496103,0.100058911496805
+"PSI-DICE",0,6104137.49760001,0.5995121816587,0.508448645116394,1523.25,0.760421414251139,0.66926625659051,776.0525,1523.25,0.25,1,0.666858890304222,0.453782777886603,0.760421414251139,0.66926625659051
+"PSI-DICE",1,8819145.94574774,0.599784209529554,0.345582090640068,1510.75,0.677627100123464,0.438279663475486,926.98,1510.75,0.5,1,0.692616474824807,0.392472577662663,0.677627100123464,0.438279663475486
+"PSI-DICE",2,10778741.3058258,1.49767769640803,0.33829686679849,2255,1.2040573587044,0.598182903375555,1165.435,2255,0.5,0.75,0.978325762757436,0.446563178036526,1.2040573587044,0.598182903375555
+"PSI-DICE",3,9083712.41005997,1.50254390000086,0.152391027260463,1401.25,0.863912232214972,0.253999184302352,1019.7275,1401.25,0.5,0.75,0.887167742249569,0.254618058243215,0.863912232214972,0.253999184302352
diff --git a/tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_location.csv b/tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_location.csv
new file mode 100644
index 0000000..cba0496
--- /dev/null
+++ b/tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_location.csv	
@@ -0,0 +1,7 @@
+"model_id","location","se_point","se_point_relative_skill","se_point_scaled_relative_skill","ae_point","ae_point_relative_skill","ae_point_scaled_relative_skill","wis","ae_median","interval_coverage_50","interval_coverage_95","wis_relative_skill","wis_scaled_relative_skill","ae_median_relative_skill","ae_median_scaled_relative_skill"
+"FS-base","01",7934.16128810812,2.64616637050549,1,86.5,1.84494399073852,1,49.975,86.5,0,1,1.73800770724664,1,1.84494399073852,1
+"FS-base","US",64489468.357122,2.516324718005,1,7418.25,1.80001101254491,1,5293.425,7418.25,0,0.25,2.09638683517122,1,1.80001101254491,1
+"MOBS-GLEAM","01",1071.56641993473,0.357384091554102,0.135057302344082,26.625,0.567880158998997,0.307803468208093,15.3625,26.625,0.625,1,0.534270003053056,0.307403701850926,0.567880158998997,0.307803468208093
+"MOBS-GLEAM","US",15010166.2013815,0.585684037969409,0.232753759393085,2859,0.693725809303527,0.385400869477303,1589.6975,2859,0.375,1,0.629577430662491,0.300315485720493,0.693725809303527,0.385400869477303
+"PSI-DICE","01",3170.52744208901,1.05742028544283,0.399604611875116,44.75,0.954465243763572,0.517341040462428,30.96625,44.75,0.375,0.75,1.07693008833469,0.619634817408704,0.954465243763572,0.517341040462428
+"PSI-DICE","US",17389698.0521747,0.6785313658505,0.2696517508235,3300.375,0.800823825771293,0.44489940349813,1913.13125,3300.375,0.5,1,0.757668837558793,0.361416521439333,0.800823825771293,0.44489940349813
diff --git a/tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_reference_date.csv b/tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_reference_date.csv
new file mode 100644
index 0000000..5f5f958
--- /dev/null
+++ b/tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_reference_date.csv	
@@ -0,0 +1,7 @@
+"model_id","reference_date","se_point","se_point_relative_skill","se_point_scaled_relative_skill","ae_point","ae_point_relative_skill","ae_point_scaled_relative_skill","wis","ae_median","interval_coverage_50","interval_coverage_95","wis_relative_skill","wis_scaled_relative_skill","ae_median_relative_skill","ae_median_scaled_relative_skill"
+"FS-base",2022-12-17,21306295.3973807,3.10891393913009,1,2894.375,1.82701375914094,1,1937.05,2894.375,0,0.75,1.73261142871645,1,1.82701375914094,1
+"FS-base",2023-01-14,43191107.1210294,3.1778410567911,1,4610.375,1.98229121430346,1,3406.35,4610.375,0,0.5,2.56485081622142,1,1.98229121430346,1
+"MOBS-GLEAM",2022-12-17,11393303.8117821,1.6624570518978,0.534738845927327,1834.625,1.1580687083961,0.633858777801771,1061.10375,1834.625,0.5,1,0.949113592475093,0.547793681113033,1.1580687083961,0.633858777801771
+"MOBS-GLEAM",2023-01-14,3617933.95601942,0.266194127276707,0.0837657146847685,1051,0.451891237965012,0.227964102703142,543.95625,1051,0.5,1,0.40957817951803,0.159688889867453,0.451891237965012,0.227964102703142
+"PSI-DICE",2022-12-17,1325989.57383748,0.193482132503959,0.0622346376555207,748.75,0.472632797117437,0.258691427337508,679.86125,748.75,0.875,1,0.608107881413205,0.350977646421104,0.472632797117437,0.258691427337508
+"PSI-DICE",2023-01-14,16066879.0057793,1.18214121291188,0.371995072058629,2596.375,1.11634549283673,0.563159179025567,1264.23625,2596.375,0,0.75,0.951921375580667,0.371141030722034,1.11634549283673,0.563159179025567
diff --git a/tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_target_end_date.csv b/tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_target_end_date.csv
new file mode 100644
index 0000000..214da8a
--- /dev/null
+++ b/tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_target_end_date.csv	
@@ -0,0 +1,25 @@
+"model_id","target_end_date","se_point","se_point_relative_skill","se_point_scaled_relative_skill","ae_point","ae_point_relative_skill","ae_point_scaled_relative_skill","wis","ae_median","interval_coverage_50","interval_coverage_95","wis_relative_skill","wis_scaled_relative_skill","ae_median_relative_skill","ae_median_scaled_relative_skill"
+"FS-base",2022-12-17,3175383.68141869,0.704102187093623,1,1303,0.932532186623287,1,802.62,1303,0,0.5,1.0219261709135,1,0.932532186623287,1
+"FS-base",2022-12-24,10914437.3525853,2.02860323794238,1,2371.5,2.1175276905063,1,1431.24,2371.5,0,1,1.41285865848852,1,2.1175276905063,1
+"FS-base",2022-12-31,10536520.7055104,6.81438673555078,1,2337.5,1.62564875830736,1,1404.75,2337.5,0,1,1.4301951892091,1,1.62564875830736,1
+"FS-base",2023-01-07,60598839.8500084,30.2439038289645,1,5565.5,5.28287394851579,1,4109.59,5565.5,0,0.5,3.34901399077292,1,5.28287394851579,1
+"FS-base",2023-01-14,20835448.3120569,1.75445319649409,1,3249,1.38977013494529,1,2617.75,3249,0,0.5,1.90292745531258,1,1.38977013494529,1
+"FS-base",2023-01-21,40124931.7849381,2.77673880767223,1,4522.5,1.78249219444739,1,3292.555,4522.5,0,0.5,2.27799926546679,1,1.78249219444739,1
+"FS-base",2023-01-28,53187044.9770904,4.18873826922126,1,5202,2.28442232579952,1,3814.825,5202,0,0.5,2.93007181293188,1,2.28442232579952,1
+"FS-base",2023-02-04,58617003.4100323,6.83431778418711,1,5468,3.07955311475367,1,3900.27,5468,0,0.5,3.90397251191362,1,3.07955311475367,1
+"MOBS-GLEAM",2022-12-17,21404754.7053754,4.74624049069998,6.74084043154503,2955,2.1148370003621,2.26784343821949,1512.54,2955,0.5,1,1.9258231922373,1.88450325185019,2.1148370003621,2.26784343821949
+"MOBS-GLEAM",2022-12-24,23836982.874849,4.43044190741004,2.18398641219949,2976.5,2.65773610406578,1.2551127978073,1524.14,2976.5,0.5,1,1.50456554857934,1.06490875045415,2.65773610406578,1.2551127978073
+"MOBS-GLEAM",2022-12-31,130104.946396882,0.0841440401187458,0.0123479989299352,712.5,0.495518605473366,0.304812834224599,679.09,712.5,0.5,1,0.691390817611683,0.48342409681438,0.495518605473366,0.304812834224599
+"MOBS-GLEAM",2023-01-07,201372.720507006,0.100501877723489,0.0033230458042668,694.5,0.659232046939936,0.124786631928847,528.645,694.5,0.5,1,0.43080684475876,0.128636919984719,0.659232046939936,0.124786631928847
+"MOBS-GLEAM",2023-01-14,7402864.09095083,0.623359688404893,0.355301406529708,1682,0.719480876262846,0.517697753154817,862.48,1682,0.5,1,0.626964710785212,0.329473784738802,0.719480876262846,0.517697753154817
+"MOBS-GLEAM",2023-01-21,4413291.43426512,0.305410054296752,0.109988758558382,1279.5,0.504300445062562,0.282918739635158,665.795,1279.5,0.5,1,0.460639388241491,0.20221226372832,0.504300445062562,0.282918739635158
+"MOBS-GLEAM",2023-01-28,2040800.62469103,0.160722967034098,0.0383702577492334,833,0.365806189425413,0.160130718954248,432.515,833,0.5,1,0.332203970082568,0.113377415739909,0.365806189425413,0.160130718954248
+"MOBS-GLEAM",2023-02-04,614779.674170713,0.0716788545322083,0.0104880774929804,409.5,0.230628566293275,0.0748902706656913,215.035,409.5,0.5,1,0.215239131931724,0.055133362562079,0.230628566293275,0.0748902706656913
+"PSI-DICE",2022-12-17,1349506.81439674,0.299236500166799,0.424990158604649,708.5,0.507059903470912,0.543745203376823,399.075,708.5,0.5,1,0.508117398840427,0.497215369664349,0.507059903470912,0.543745203376823
+"PSI-DICE",2022-12-24,598632.205240062,0.111264299812846,0.054847738449684,199,0.177688387269978,0.0839131351465317,476.545,199,1,1,0.470424757140252,0.332959531594981,0.177688387269978,0.0839131351465317
+"PSI-DICE",2022-12-31,2696623.46753519,1.74401357920139,0.255931112641852,1785,1.2414045063438,0.763636363636364,993.31,1785,1,1,1.01130249752148,0.707108026339206,1.2414045063438,0.763636363636364
+"PSI-DICE",2023-01-07,659195.808177916,0.328993998504522,0.0108780268699785,302.5,0.287138508566351,0.0543527086515138,850.515,302.5,1,1,0.693107252636451,0.20695860170966,0.287138508566351,0.0543527086515138
+"PSI-DICE",2023-01-14,10858768.1808033,0.914364800769576,0.521167964239081,2338,1.00008697306928,0.719606032625423,1153.03,2338,0,1,0.83817493794253,0.440466049087957,1.00008697306928,0.719606032625423
+"PSI-DICE",2023-01-21,17039659.6862554,1.1791841684355,0.424665137814682,2822.5,1.11245643312941,0.624101713653952,1377.415,2822.5,0,1,0.952983430267054,0.418342290409727,1.11245643312941,0.624101713653952
+"PSI-DICE",2023-01-28,18860859.1441164,1.48538431720322,0.354613781462017,2725,1.19666490538325,0.523836985774702,1337.56,2725,0,0.5,1.02734643243272,0.350621588146245,1.19666490538325,0.523836985774702
+"PSI-DICE",2023-02-04,17508229.011942,2.04133261587809,0.298688571462278,2500,1.40798880520925,0.457205559619605,1188.94,2500,0,0.5,1.19006865635317,0.304835306273668,1.40798880520925,0.457205559619605

From 221b1b0e4bef9b62d5f89c2d0fe3c6ba3d6e2b42 Mon Sep 17 00:00:00 2001
From: "Evan L. Ray" <elray@umass.edu>
Date: Wed, 8 Jan 2025 17:02:51 -0500
Subject: [PATCH 3/5] no spaces in file name

---
 tests/testthat/helper-check_exp_scores_for_window.R           | 4 +++-
 tests/testthat/testdata/create_exp_score_fixtures.R           | 1 +
 .../{scores_Full season.csv => scores_Full_season.csv}        | 0
 ...eason_by_horizon.csv => scores_Full_season_by_horizon.csv} | 0
 ...son_by_location.csv => scores_Full_season_by_location.csv} | 0
 ...ence_date.csv => scores_Full_season_by_reference_date.csv} | 0
 ...end_date.csv => scores_Full_season_by_target_end_date.csv} | 0
 .../{scores_Last 5 weeks.csv => scores_Last_5_weeks.csv}      | 0
 ...eeks_by_horizon.csv => scores_Last_5_weeks_by_horizon.csv} | 0
 ...ks_by_location.csv => scores_Last_5_weeks_by_location.csv} | 0
 ...nce_date.csv => scores_Last_5_weeks_by_reference_date.csv} | 0
 ...nd_date.csv => scores_Last_5_weeks_by_target_end_date.csv} | 0
 12 files changed, 4 insertions(+), 1 deletion(-)
 rename tests/testthat/testdata/expected-scores/{scores_Full season.csv => scores_Full_season.csv} (100%)
 rename tests/testthat/testdata/expected-scores/{scores_Full season_by_horizon.csv => scores_Full_season_by_horizon.csv} (100%)
 rename tests/testthat/testdata/expected-scores/{scores_Full season_by_location.csv => scores_Full_season_by_location.csv} (100%)
 rename tests/testthat/testdata/expected-scores/{scores_Full season_by_reference_date.csv => scores_Full_season_by_reference_date.csv} (100%)
 rename tests/testthat/testdata/expected-scores/{scores_Full season_by_target_end_date.csv => scores_Full_season_by_target_end_date.csv} (100%)
 rename tests/testthat/testdata/expected-scores/{scores_Last 5 weeks.csv => scores_Last_5_weeks.csv} (100%)
 rename tests/testthat/testdata/expected-scores/{scores_Last 5 weeks_by_horizon.csv => scores_Last_5_weeks_by_horizon.csv} (100%)
 rename tests/testthat/testdata/expected-scores/{scores_Last 5 weeks_by_location.csv => scores_Last_5_weeks_by_location.csv} (100%)
 rename tests/testthat/testdata/expected-scores/{scores_Last 5 weeks_by_reference_date.csv => scores_Last_5_weeks_by_reference_date.csv} (100%)
 rename tests/testthat/testdata/expected-scores/{scores_Last 5 weeks_by_target_end_date.csv => scores_Last_5_weeks_by_target_end_date.csv} (100%)

diff --git a/tests/testthat/helper-check_exp_scores_for_window.R b/tests/testthat/helper-check_exp_scores_for_window.R
index 57d6667..5f13f13 100644
--- a/tests/testthat/helper-check_exp_scores_for_window.R
+++ b/tests/testthat/helper-check_exp_scores_for_window.R
@@ -19,9 +19,11 @@ check_exp_scores_for_window <- function(out_path, window_name, model_out_tbl, or
 
     actual_scores <- read.csv(scores_path)
 
+    file_name <- paste0("scores_", window_name, ifelse(is.null(by), "", paste0("_by_", by)), ".csv")
+    file_name <- gsub(" ", "_", file_name)
     expected_scores_path <- testthat::test_path(
       "testdata", "expected-scores",
-      paste0("scores_", window_name, ifelse(is.null(by), "", paste0("_by_", by)), ".csv")
+      file_name
     )
     expected_scores <- read.csv(expected_scores_path)
     if (!include_rel) {
diff --git a/tests/testthat/testdata/create_exp_score_fixtures.R b/tests/testthat/testdata/create_exp_score_fixtures.R
index 78599b0..a72ede6 100644
--- a/tests/testthat/testdata/create_exp_score_fixtures.R
+++ b/tests/testthat/testdata/create_exp_score_fixtures.R
@@ -47,6 +47,7 @@ make_score_fixtures_one_window <- function(window_name, model_out_tbl) {
     }
 
     file_name <- paste0("scores_", window_name, ifelse(is.null(by), "", paste0("_by_", by)), ".csv")
+    file_name <- gsub(" ", "_", file_name)
     write.csv(expected_scores, file = file.path(save_path, file_name), row.names = FALSE)
   }
 }
diff --git a/tests/testthat/testdata/expected-scores/scores_Full season.csv b/tests/testthat/testdata/expected-scores/scores_Full_season.csv
similarity index 100%
rename from tests/testthat/testdata/expected-scores/scores_Full season.csv
rename to tests/testthat/testdata/expected-scores/scores_Full_season.csv
diff --git a/tests/testthat/testdata/expected-scores/scores_Full season_by_horizon.csv b/tests/testthat/testdata/expected-scores/scores_Full_season_by_horizon.csv
similarity index 100%
rename from tests/testthat/testdata/expected-scores/scores_Full season_by_horizon.csv
rename to tests/testthat/testdata/expected-scores/scores_Full_season_by_horizon.csv
diff --git a/tests/testthat/testdata/expected-scores/scores_Full season_by_location.csv b/tests/testthat/testdata/expected-scores/scores_Full_season_by_location.csv
similarity index 100%
rename from tests/testthat/testdata/expected-scores/scores_Full season_by_location.csv
rename to tests/testthat/testdata/expected-scores/scores_Full_season_by_location.csv
diff --git a/tests/testthat/testdata/expected-scores/scores_Full season_by_reference_date.csv b/tests/testthat/testdata/expected-scores/scores_Full_season_by_reference_date.csv
similarity index 100%
rename from tests/testthat/testdata/expected-scores/scores_Full season_by_reference_date.csv
rename to tests/testthat/testdata/expected-scores/scores_Full_season_by_reference_date.csv
diff --git a/tests/testthat/testdata/expected-scores/scores_Full season_by_target_end_date.csv b/tests/testthat/testdata/expected-scores/scores_Full_season_by_target_end_date.csv
similarity index 100%
rename from tests/testthat/testdata/expected-scores/scores_Full season_by_target_end_date.csv
rename to tests/testthat/testdata/expected-scores/scores_Full_season_by_target_end_date.csv
diff --git a/tests/testthat/testdata/expected-scores/scores_Last 5 weeks.csv b/tests/testthat/testdata/expected-scores/scores_Last_5_weeks.csv
similarity index 100%
rename from tests/testthat/testdata/expected-scores/scores_Last 5 weeks.csv
rename to tests/testthat/testdata/expected-scores/scores_Last_5_weeks.csv
diff --git a/tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_horizon.csv b/tests/testthat/testdata/expected-scores/scores_Last_5_weeks_by_horizon.csv
similarity index 100%
rename from tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_horizon.csv
rename to tests/testthat/testdata/expected-scores/scores_Last_5_weeks_by_horizon.csv
diff --git a/tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_location.csv b/tests/testthat/testdata/expected-scores/scores_Last_5_weeks_by_location.csv
similarity index 100%
rename from tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_location.csv
rename to tests/testthat/testdata/expected-scores/scores_Last_5_weeks_by_location.csv
diff --git a/tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_reference_date.csv b/tests/testthat/testdata/expected-scores/scores_Last_5_weeks_by_reference_date.csv
similarity index 100%
rename from tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_reference_date.csv
rename to tests/testthat/testdata/expected-scores/scores_Last_5_weeks_by_reference_date.csv
diff --git a/tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_target_end_date.csv b/tests/testthat/testdata/expected-scores/scores_Last_5_weeks_by_target_end_date.csv
similarity index 100%
rename from tests/testthat/testdata/expected-scores/scores_Last 5 weeks_by_target_end_date.csv
rename to tests/testthat/testdata/expected-scores/scores_Last_5_weeks_by_target_end_date.csv

From 7dd09cbf0f3204219b5a28702fe522b3bac30c68 Mon Sep 17 00:00:00 2001
From: Evan Ray <elray1@users.noreply.github.com>
Date: Wed, 8 Jan 2025 20:22:52 -0500
Subject: [PATCH 4/5] Update
 tests/testthat/testdata/create_exp_score_fixtures.R

Co-authored-by: Zhian N. Kamvar <zkamvar@gmail.com>
---
 tests/testthat/testdata/create_exp_score_fixtures.R | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tests/testthat/testdata/create_exp_score_fixtures.R b/tests/testthat/testdata/create_exp_score_fixtures.R
index a72ede6..780bce7 100644
--- a/tests/testthat/testdata/create_exp_score_fixtures.R
+++ b/tests/testthat/testdata/create_exp_score_fixtures.R
@@ -1,8 +1,3 @@
-library(testthat)
-library(hubData)
-library(hubEvals)
-library(dplyr)
-
 hub_path <- testthat::test_path("testdata", "ecfh")
 model_out_tbl <- hubData::connect_hub(hub_path) |>
   dplyr::collect()

From 8a3632a505d63006b95fc1cb4d806b0d6473cc5b Mon Sep 17 00:00:00 2001
From: "Evan L. Ray" <elray@umass.edu>
Date: Wed, 8 Jan 2025 20:37:16 -0500
Subject: [PATCH 5/5] make the script go again

---
 tests/testthat/testdata/create_exp_score_fixtures.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/testthat/testdata/create_exp_score_fixtures.R b/tests/testthat/testdata/create_exp_score_fixtures.R
index 780bce7..36e3e22 100644
--- a/tests/testthat/testdata/create_exp_score_fixtures.R
+++ b/tests/testthat/testdata/create_exp_score_fixtures.R
@@ -1,8 +1,10 @@
+library(rlang)
+
 hub_path <- testthat::test_path("testdata", "ecfh")
 model_out_tbl <- hubData::connect_hub(hub_path) |>
   dplyr::collect()
 oracle_output <- read.csv(
-  test_path("testdata", "ecfh", "target-data", "oracle-output.csv")
+  testthat::test_path("testdata", "ecfh", "target-data", "oracle-output.csv")
 )
 oracle_output[["target_end_date"]] <- as.Date(oracle_output[["target_end_date"]])