Skip to content

Commit

Permalink
Merge pull request #97 from fab-scm/CAST-dev-week
Browse files Browse the repository at this point in the history
Add training sample index calculation for LPD
  • Loading branch information
HannaMeyer authored Mar 13, 2024
2 parents b06fe0e + 6f495cd commit 6d7c784
Show file tree
Hide file tree
Showing 9 changed files with 89 additions and 54 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,6 @@ Suggests:
gower,
clustMixType,
testthat (>= 3.0.0)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
VignetteBuilder: knitr
Config/testthat/edition: 3
2 changes: 1 addition & 1 deletion R/CAST-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#' by analysing the similarity between new data and training data.
#' Methods are described in Meyer et al. (2018); Meyer et al. (2019); Meyer and Pebesma (2021); Milà et al. (2022); Meyer and Pebesma (2022).
#' @name CAST
#' @docType package
#' @docType _PACKAGE
#' @title 'caret' Applications for Spatial-Temporal Models
#' @author Hanna Meyer, Carles Milà, Marvin Ludwig, Lan Linnenbrink, Fabian Schumacher
#' @references
Expand Down
39 changes: 37 additions & 2 deletions R/aoa.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#' @param useWeight Logical. Only if a model is given. Weight variables according to importance in the model?
#' @param LPD Logical. Indicates whether the local point density should be calculated or not.
#' @param maxLPD numeric or integer. Only if \code{LPD = TRUE}. Number of nearest neighbors to be considered for the calculation of the LPD. Either define a number between 0 and 1 to use a percentage of the number of training samples for the LPD calculation or a whole number larger than 1 and smaller than the number of training samples. CAUTION! If not all training samples are considered, a fitted relationship between LPD and error metric will not make sense (@seealso \code{\link{DItoErrormetric}})
#' @param indices logical. Calculate indices of the training data points that are responsible for the LPD of a new prediction location? Output is a matrix with the dimensions num(raster_cells) x maxLPD. Each row holds the indices of the training data points that are relevant for the specific LPD value at that location. Can be used in combination with exploreAOA(aoa) function from the CASTvis package (\link{https://github.com/fab-scm/CASTvis}) for a better visual interpretation of the results. Note that the matrix can be quite big for examples with a high resolution and a larger number of training samples, which can cause memory issues.
#' @param verbose Logical. Print progress or not?
#' @details The Dissimilarity Index (DI), the Local Data Point Density (LPD) and the corresponding Area of Applicability (AOA) are calculated.
#' If variables are factors, dummy variables are created prior to weighting and distance calculation.
Expand Down Expand Up @@ -135,7 +136,6 @@
#' @aliases aoa



aoa <- function(newdata,
model=NA,
trainDI = NA,
Expand All @@ -148,6 +148,7 @@ aoa <- function(newdata,
useWeight=TRUE,
LPD = FALSE,
maxLPD = 1,
indices = FALSE,
verbose = TRUE) {

# handling of different raster formats
Expand Down Expand Up @@ -298,7 +299,9 @@ aoa <- function(newdata,
}

if (calc_LPD == FALSE) {
message("Computing DI of new data...")
if (verbose) {
message("Computing DI of new data...")
}
mindist <- rep(NA, nrow(newdata))
mindist[okrows] <-
.mindistfun(newdataCC, train_scaled, method, S_inv)
Expand All @@ -318,13 +321,24 @@ aoa <- function(newdata,

DI_out <- rep(NA, nrow(newdata))
LPD_out <- rep(NA, nrow(newdata))
if (indices) {
Indices_out <- matrix(NA, nrow = nrow(newdata), ncol = maxLPD)
}
for (i in seq(nrow(newdataCC))) {
knnDist <- .knndistfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = maxLPD)
knnDI <- knnDist / trainDI$trainDist_avrgmean
knnDI <- c(knnDI)

DI_out[okrows[i]] <- knnDI[1]
LPD_out[okrows[i]] <- sum(knnDI < trainDI$threshold)
knnIndex <- .knnindexfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = LPD_out[okrows[i]])

if (indices) {
if (LPD_out[okrows[i]] > 0) {
Indices_out[okrows[i],1:LPD_out[okrows[i]]] <- knnIndex
}
}

if (verbose) {
setTxtProgressBar(pb, i)
}
Expand All @@ -345,6 +359,10 @@ aoa <- function(newdata,
}
trainDI$maxLPD <- realMaxLPD
}

if (indices) {
Indices_out <- Indices_out[,1:trainDI$maxLPD]
}
}

if (verbose) {
Expand Down Expand Up @@ -402,6 +420,9 @@ aoa <- function(newdata,

if (calc_LPD == TRUE) {
result$LPD <- LPD
if (indices) {
result$indices <- Indices_out
}
}

if (verbose) {
Expand Down Expand Up @@ -431,3 +452,17 @@ aoa <- function(newdata,
}
}

.knnindexfun <-
function (point,
reference,
method,
S_inv = NULL,
maxLPD = maxLPD) {
if (method == "L2") {
# Euclidean Distance
return(FNN::knnx.index(reference, point, k = maxLPD))
} else if (method == "MD") {
# hier muss noch was hin
}
}

2 changes: 1 addition & 1 deletion R/errorProfiles.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#' Estimating the area of applicability of spatial prediction models.
#' \doi{10.1111/2041-210X.13650}
#' @seealso \code{\link{aoa}}
#' @example
#' @examples
#' \dontrun{
#' library(CAST)
#' library(sf)
Expand Down
2 changes: 1 addition & 1 deletion man/CAST.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions man/aoa.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

81 changes: 39 additions & 42 deletions man/errorProfiles.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions tests/testthat/test-aoa.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ loaddata <- function() {
test_that("AOA works in default: used with raster data and a trained model", {
dat <- loaddata()
# calculate the AOA of the trained model for the study area:
AOA <- aoa(dat$studyArea, dat$model)
AOA <- aoa(dat$studyArea, dat$model, verbose = F)

#test threshold:
expect_equal(as.numeric(round(AOA$parameters$threshold,5)), 0.38986)
Expand All @@ -55,7 +55,7 @@ test_that("AOA works in default: used with raster data and a trained model", {

test_that("AOA works without a trained model", {
dat <- loaddata()
AOA <- aoa(dat$studyArea,train=dat$trainDat,variables=dat$variables)
AOA <- aoa(dat$studyArea,train=dat$trainDat,variables=dat$variables, verbose = F)

#test threshold:
expect_equal(as.numeric(round(AOA$parameters$threshold,5)), 0.52872)
Expand All @@ -71,7 +71,7 @@ test_that("AOA works without a trained model", {
test_that("AOA (including LPD) works with raster data and a trained model", {
dat <- loaddata()
# calculate the AOA of the trained model for the study area:
AOA <- aoa(dat$studyArea, dat$model, LPD = TRUE, maxLPD = 1)
AOA <- aoa(dat$studyArea, dat$model, LPD = TRUE, maxLPD = 1, verbose = F)

#test threshold:
expect_equal(as.numeric(round(AOA$parameters$threshold,5)), 0.38986)
Expand All @@ -94,7 +94,7 @@ test_that("AOA (including LPD) works with raster data and a trained model", {

test_that("AOA (inluding LPD) works without a trained model", {
dat <- loaddata()
AOA <- aoa(dat$studyArea,train=dat$trainDat,variables=dat$variables, LPD = TRUE, maxLPD = 1)
AOA <- aoa(dat$studyArea,train=dat$trainDat,variables=dat$variables, LPD = TRUE, maxLPD = 1, verbose = F)

#test threshold:
expect_equal(as.numeric(round(AOA$parameters$threshold,5)), 0.52872)
Expand Down
4 changes: 2 additions & 2 deletions tests/testthat/test_trainDI.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ loaddata <- function() {
test_that("trainDI works in default for a trained model", {
dat <- loaddata()
#...then calculate the DI of the trained model:
DI <- trainDI(model=dat$model)
DI <- trainDI(model=dat$model, verbose = F)

#test threshold:
expect_equal(as.numeric(round(DI$threshold,5)), 0.38986)
Expand All @@ -50,7 +50,7 @@ expect_equal(as.numeric(colMeans(DI$train)),
test_that("trainDI (with LPD = TRUE) works in default for a trained model", {
dat <- loaddata()
#...then calculate the DI of the trained model:
DI <- trainDI(model=dat$model, LPD = TRUE)
DI <- trainDI(model=dat$model, LPD = TRUE, verbose = F)

#test threshold:
expect_equal(as.numeric(round(DI$threshold,5)), 0.38986)
Expand Down

0 comments on commit 6d7c784

Please sign in to comment.