Merge pull request #97 from fab-scm/CAST-dev-week

Add training sample index calculation for LPD
HannaMeyer · Mar 13, 2024 · 6d7c784 · 6d7c784
2 parents b06fe0e + 6f495cd
commit 6d7c784
Show file tree

Hide file tree

Showing 9 changed files with 89 additions and 54 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -46,6 +46,6 @@ Suggests:
     gower, 
     clustMixType,
     testthat (>= 3.0.0)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 VignetteBuilder: knitr
 Config/testthat/edition: 3
diff --git a/R/CAST-package.R b/R/CAST-package.R
@@ -9,7 +9,7 @@
 #' by analysing the similarity between new data and training data.
 #' Methods are described in Meyer et al. (2018); Meyer et al. (2019); Meyer and Pebesma (2021); Milà et al. (2022); Meyer and Pebesma (2022).
 #' @name CAST
-#' @docType package
+#' @docType _PACKAGE
 #' @title 'caret' Applications for Spatial-Temporal Models
 #' @author Hanna Meyer, Carles Milà, Marvin Ludwig, Lan Linnenbrink, Fabian Schumacher
 #' @references

diff --git a/R/aoa.R b/R/aoa.R
@@ -28,6 +28,7 @@
 #' @param useWeight Logical. Only if a model is given. Weight variables according to importance in the model?
 #' @param LPD Logical. Indicates whether the local point density should be calculated or not.
 #' @param maxLPD numeric or integer. Only if \code{LPD = TRUE}. Number of nearest neighbors to be considered for the calculation of the LPD. Either define a number between 0 and 1 to use a percentage of the number of training samples for the LPD calculation or a whole number larger than 1 and smaller than the number of training samples. CAUTION! If not all training samples are considered, a fitted relationship between LPD and error metric will not make sense (@seealso \code{\link{DItoErrormetric}})
+#' @param indices logical. Calculate indices of the training data points that are responsible for the LPD of a new prediction location? Output is a matrix with the dimensions num(raster_cells) x maxLPD. Each row holds the indices of the training data points that are relevant for the specific LPD value at that location. Can be used in combination with exploreAOA(aoa) function from the CASTvis package (\link{https://github.com/fab-scm/CASTvis}) for a better visual interpretation of the results. Note that the matrix can be quite big for examples with a high resolution and a larger number of training samples, which can cause memory issues.
 #' @param verbose Logical. Print progress or not?
 #' @details The Dissimilarity Index (DI), the Local Data Point Density (LPD) and the corresponding Area of Applicability (AOA) are calculated.
 #' If variables are factors, dummy variables are created prior to weighting and distance calculation.
@@ -135,7 +136,6 @@
 #' @aliases aoa
 
 
-
 aoa <- function(newdata,
                 model=NA,
                 trainDI = NA,
@@ -148,6 +148,7 @@ aoa <- function(newdata,
                 useWeight=TRUE,
                 LPD = FALSE,
                 maxLPD = 1,
+                indices = FALSE,
                 verbose = TRUE) {
 
   # handling of different raster formats
@@ -298,7 +299,9 @@ aoa <- function(newdata,
   }
 
   if (calc_LPD == FALSE) {
-    message("Computing DI of new data...")
+    if (verbose) {
+      message("Computing DI of new data...")
+    }
     mindist <- rep(NA, nrow(newdata))
     mindist[okrows] <-
       .mindistfun(newdataCC, train_scaled, method, S_inv)
@@ -318,13 +321,24 @@ aoa <- function(newdata,
 
     DI_out <- rep(NA, nrow(newdata))
     LPD_out <- rep(NA, nrow(newdata))
+    if (indices) {
+      Indices_out <- matrix(NA, nrow = nrow(newdata), ncol = maxLPD)
+    }
     for (i in seq(nrow(newdataCC))) {
       knnDist  <- .knndistfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = maxLPD)
       knnDI <- knnDist / trainDI$trainDist_avrgmean
       knnDI <- c(knnDI)
 
       DI_out[okrows[i]] <- knnDI[1]
       LPD_out[okrows[i]] <- sum(knnDI < trainDI$threshold)
+      knnIndex  <- .knnindexfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = LPD_out[okrows[i]])
+
+      if (indices) {
+        if (LPD_out[okrows[i]] > 0) {
+          Indices_out[okrows[i],1:LPD_out[okrows[i]]] <- knnIndex
+        }
+      }
+
       if (verbose) {
         setTxtProgressBar(pb, i)
       }
@@ -345,6 +359,10 @@ aoa <- function(newdata,
       }
       trainDI$maxLPD <- realMaxLPD
     }
+
+    if (indices) {
+      Indices_out <- Indices_out[,1:trainDI$maxLPD]
+    }
   }
 
   if (verbose) {
@@ -402,6 +420,9 @@ aoa <- function(newdata,
 
   if (calc_LPD == TRUE) {
     result$LPD <- LPD
+    if (indices) {
+      result$indices <- Indices_out
+    }
   }
 
   if (verbose) {
@@ -431,3 +452,17 @@ aoa <- function(newdata,
     }
   }
 
+.knnindexfun <-
+  function (point,
+            reference,
+            method,
+            S_inv = NULL,
+            maxLPD = maxLPD) {
+    if (method == "L2") {
+      # Euclidean Distance
+      return(FNN::knnx.index(reference, point, k = maxLPD))
+    } else if (method == "MD") {
+      # hier muss noch was hin
+    }
+  }
+
diff --git a/R/errorProfiles.R b/R/errorProfiles.R
@@ -23,7 +23,7 @@
 #' Estimating the area of applicability of spatial prediction models.
 #' \doi{10.1111/2041-210X.13650}
 #' @seealso \code{\link{aoa}}
-#' @example
+#' @examples
 #' \dontrun{
 #' library(CAST)
 #' library(sf)

diff --git a/man/CAST.Rd b/man/CAST.Rd
diff --git a/man/aoa.Rd b/man/aoa.Rd
diff --git a/man/errorProfiles.Rd b/man/errorProfiles.Rd
diff --git a/tests/testthat/test-aoa.R b/tests/testthat/test-aoa.R
@@ -32,7 +32,7 @@ loaddata <- function() {
 test_that("AOA works in default: used with raster data and a trained model", {
   dat <- loaddata()
   # calculate the AOA of the trained model for the study area:
-  AOA <- aoa(dat$studyArea, dat$model)
+  AOA <- aoa(dat$studyArea, dat$model, verbose = F)
 
   #test threshold:
   expect_equal(as.numeric(round(AOA$parameters$threshold,5)), 0.38986)
@@ -55,7 +55,7 @@ test_that("AOA works in default: used with raster data and a trained model", {
 
 test_that("AOA works without a trained model", {
   dat <- loaddata()
-  AOA <- aoa(dat$studyArea,train=dat$trainDat,variables=dat$variables)
+  AOA <- aoa(dat$studyArea,train=dat$trainDat,variables=dat$variables, verbose = F)
 
   #test threshold:
   expect_equal(as.numeric(round(AOA$parameters$threshold,5)), 0.52872)
@@ -71,7 +71,7 @@ test_that("AOA works without a trained model", {
 test_that("AOA (including LPD) works with raster data and a trained model", {
   dat <- loaddata()
   # calculate the AOA of the trained model for the study area:
-  AOA <- aoa(dat$studyArea, dat$model, LPD = TRUE, maxLPD = 1)
+  AOA <- aoa(dat$studyArea, dat$model, LPD = TRUE, maxLPD = 1, verbose = F)
 
   #test threshold:
   expect_equal(as.numeric(round(AOA$parameters$threshold,5)), 0.38986)
@@ -94,7 +94,7 @@ test_that("AOA (including LPD) works with raster data and a trained model", {
 
 test_that("AOA (inluding LPD) works without a trained model", {
   dat <- loaddata()
-  AOA <- aoa(dat$studyArea,train=dat$trainDat,variables=dat$variables, LPD = TRUE, maxLPD = 1)
+  AOA <- aoa(dat$studyArea,train=dat$trainDat,variables=dat$variables, LPD = TRUE, maxLPD = 1, verbose = F)
 
   #test threshold:
   expect_equal(as.numeric(round(AOA$parameters$threshold,5)), 0.52872)

diff --git a/tests/testthat/test_trainDI.R b/tests/testthat/test_trainDI.R
@@ -31,7 +31,7 @@ loaddata <- function() {
 test_that("trainDI works in default for a trained model", {
 dat <- loaddata()
 #...then calculate the DI of the trained model:
-DI <- trainDI(model=dat$model)
+DI <- trainDI(model=dat$model, verbose = F)
 
 #test threshold:
 expect_equal(as.numeric(round(DI$threshold,5)), 0.38986)
@@ -50,7 +50,7 @@ expect_equal(as.numeric(colMeans(DI$train)),
 test_that("trainDI (with LPD = TRUE) works in default for a trained model", {
   dat <- loaddata()
   #...then calculate the DI of the trained model:
-  DI <- trainDI(model=dat$model, LPD = TRUE)
+  DI <- trainDI(model=dat$model, LPD = TRUE, verbose = F)
 
   #test threshold:
   expect_equal(as.numeric(round(DI$threshold,5)), 0.38986)