From 422c85ee33f06232f2b665c5c0cee22b32b8979a Mon Sep 17 00:00:00 2001 From: fab-scm Date: Wed, 13 Mar 2024 14:30:24 +0100 Subject: [PATCH 1/3] trainig sample index calculation for LPD --- R/aoa.R | 39 +++++++++++++++++++++++++++++++++++++-- man/aoa.Rd | 3 +++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/R/aoa.R b/R/aoa.R index 8c3fc606..dea5d6c6 100644 --- a/R/aoa.R +++ b/R/aoa.R @@ -28,6 +28,7 @@ #' @param useWeight Logical. Only if a model is given. Weight variables according to importance in the model? #' @param LPD Logical. Indicates whether the local point density should be calculated or not. #' @param maxLPD numeric or integer. Only if \code{LPD = TRUE}. Number of nearest neighbors to be considered for the calculation of the LPD. Either define a number between 0 and 1 to use a percentage of the number of training samples for the LPD calculation or a whole number larger than 1 and smaller than the number of training samples. CAUTION! If not all training samples are considered, a fitted relationship between LPD and error metric will not make sense (@seealso \code{\link{DItoErrormetric}}) +#' @param indices logical. Calculate indices of the training data points that are responsible for the LPD of a new prediction location? Output is a matrix with the dimensions num(raster_cells) x maxLPD. Each row holds the indices of the training data points that are relevant for the specific LPD value at that location. Can be used in combination with exploreAOA(aoa) function from the CASTvis package (\link{https://github.com/fab-scm/CASTvis}) for a better visual interpretation of the results. Note that the matrix can be quite big for examples with a high resolution and a larger number of training samples, which can cause memory issues. #' @param verbose Logical. Print progress or not? #' @details The Dissimilarity Index (DI), the Local Data Point Density (LPD) and the corresponding Area of Applicability (AOA) are calculated. #' If variables are factors, dummy variables are created prior to weighting and distance calculation. @@ -135,7 +136,6 @@ #' @aliases aoa - aoa <- function(newdata, model=NA, trainDI = NA, @@ -148,6 +148,7 @@ aoa <- function(newdata, useWeight=TRUE, LPD = FALSE, maxLPD = 1, + indices = FALSE, verbose = TRUE) { # handling of different raster formats @@ -298,7 +299,9 @@ aoa <- function(newdata, } if (calc_LPD == FALSE) { - message("Computing DI of new data...") + if (verbose) { + message("Computing DI of new data...") + } mindist <- rep(NA, nrow(newdata)) mindist[okrows] <- .mindistfun(newdataCC, train_scaled, method, S_inv) @@ -318,6 +321,9 @@ aoa <- function(newdata, DI_out <- rep(NA, nrow(newdata)) LPD_out <- rep(NA, nrow(newdata)) + if (indices) { + Indices_out <- matrix(NA, nrow = nrow(newdata), ncol = maxLPD) + } for (i in seq(nrow(newdataCC))) { knnDist <- .knndistfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = maxLPD) knnDI <- knnDist / trainDI$trainDist_avrgmean @@ -325,6 +331,14 @@ aoa <- function(newdata, DI_out[okrows[i]] <- knnDI[1] LPD_out[okrows[i]] <- sum(knnDI < trainDI$threshold) + knnIndex <- .knnindexfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = LPD_out[okrows[i]]) + + if (indices) { + if (LPD_out[okrows[i]] > 0) { + Indices_out[okrows[i],1:LPD_out[okrows[i]]] <- knnIndex + } + } + if (verbose) { setTxtProgressBar(pb, i) } @@ -345,6 +359,10 @@ aoa <- function(newdata, } trainDI$maxLPD <- realMaxLPD } + + if (indices) { + Indices_out <- Indices_out[,1:trainDI$maxLPD] + } } if (verbose) { @@ -402,6 +420,9 @@ aoa <- function(newdata, if (calc_LPD == TRUE) { result$LPD <- LPD + if (indices) { + result$indices <- Indices_out + } } if (verbose) { @@ -431,3 +452,17 @@ aoa <- function(newdata, } } +.knnindexfun <- + function (point, + reference, + method, + S_inv = NULL, + maxLPD = maxLPD) { + if (method == "L2") { + # Euclidean Distance + return(FNN::knnx.index(reference, point, k = maxLPD)) + } else if (method == "MD") { + # hier muss noch was hin + } + } + diff --git a/man/aoa.Rd b/man/aoa.Rd index cf0f4d43..0c7b8d26 100644 --- a/man/aoa.Rd +++ b/man/aoa.Rd @@ -17,6 +17,7 @@ aoa( useWeight = TRUE, LPD = FALSE, maxLPD = 1, + indices = FALSE, verbose = TRUE ) } @@ -52,6 +53,8 @@ Relevant if some data points are excluded, e.g. when using \code{\link{nndm}}.} \item{maxLPD}{numeric or integer. Only if \code{LPD = TRUE}. Number of nearest neighbors to be considered for the calculation of the LPD. Either define a number between 0 and 1 to use a percentage of the number of training samples for the LPD calculation or a whole number larger than 1 and smaller than the number of training samples. CAUTION! If not all training samples are considered, a fitted relationship between LPD and error metric will not make sense (@seealso \code{\link{DItoErrormetric}})} +\item{indices}{logical. Calculate indices of the training data points that are responsible for the LPD of a new prediction location? Output is a matrix with the dimensions num(raster_cells) x maxLPD. Each row holds the indices of the training data points that are relevant for the specific LPD value at that location. Can be used in combination with exploreAOA(aoa) function from the CASTvis package (\link{https://github.com/fab-scm/CASTvis}) for a better visual interpretation of the results. Note that the matrix can be quite big for examples with a high resolution and a larger number of training samples, which can cause memory issues.} + \item{verbose}{Logical. Print progress or not?} } \value{ From 0e89732f7d2b888118849c8523ac8f45947760da Mon Sep 17 00:00:00 2001 From: fab-scm Date: Wed, 13 Mar 2024 14:30:55 +0100 Subject: [PATCH 2/3] fix roxygen warnings --- DESCRIPTION | 2 +- R/CAST-package.R | 2 +- R/errorProfiles.R | 2 +- man/CAST.Rd | 2 +- man/errorProfiles.Rd | 81 +++++++++++++++++++++----------------------- 5 files changed, 43 insertions(+), 46 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c95e0e75..1e90bd5c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -46,6 +46,6 @@ Suggests: gower, clustMixType, testthat (>= 3.0.0) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 VignetteBuilder: knitr Config/testthat/edition: 3 diff --git a/R/CAST-package.R b/R/CAST-package.R index cf052907..07fd8d36 100644 --- a/R/CAST-package.R +++ b/R/CAST-package.R @@ -9,7 +9,7 @@ #' by analysing the similarity between new data and training data. #' Methods are described in Meyer et al. (2018); Meyer et al. (2019); Meyer and Pebesma (2021); Milà et al. (2022); Meyer and Pebesma (2022). #' @name CAST -#' @docType package +#' @docType _PACKAGE #' @title 'caret' Applications for Spatial-Temporal Models #' @author Hanna Meyer, Carles Milà, Marvin Ludwig, Lan Linnenbrink, Fabian Schumacher #' @references diff --git a/R/errorProfiles.R b/R/errorProfiles.R index 968902a0..da858dc3 100644 --- a/R/errorProfiles.R +++ b/R/errorProfiles.R @@ -23,7 +23,7 @@ #' Estimating the area of applicability of spatial prediction models. #' \doi{10.1111/2041-210X.13650} #' @seealso \code{\link{aoa}} -#' @example +#' @examples #' \dontrun{ #' library(CAST) #' library(sf) diff --git a/man/CAST.Rd b/man/CAST.Rd index 30ee9746..814ac079 100644 --- a/man/CAST.Rd +++ b/man/CAST.Rd @@ -1,6 +1,6 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/CAST-package.R -\docType{package} +\docType{_PACKAGE} \name{CAST} \alias{CAST} \alias{CAST-package} diff --git a/man/errorProfiles.Rd b/man/errorProfiles.Rd index 9c3187e6..99b8c752 100644 --- a/man/errorProfiles.Rd +++ b/man/errorProfiles.Rd @@ -59,63 +59,60 @@ the AOA threshold changes accordingly. See Meyer and Pebesma (2021) for the full } \examples{ \dontrun{ +library(CAST) +library(sf) +library(terra) +library(caret) - library(CAST) - library(sf) - library(terra) - library(caret) +data(splotdata) +predictors <- terra::rast(system.file("extdata","predictors_chile.tif", package="CAST")) +model <- caret::train(st_drop_geometry(splotdata)[,6:16], splotdata$Species_richness, + ntree = 10, trControl = trainControl(method = "cv", savePredictions = TRUE)) - data(splotdata) - predictors <- terra::rast(system.file("extdata","predictors_chile.tif", package="CAST")) +AOA <- aoa(predictors, model, LPD = TRUE, maxLPD = 1) - model <- caret::train(st_drop_geometry(splotdata)[,6:16], splotdata$Species_richness, ntree = 10, - trControl = trainControl(method = "cv", savePredictions = TRUE)) +### DI ~ error +errormodel_DI <- errorProfiles(model, AOA, variable = "DI") +plot(errormodel_DI) +summary(errormodel_DI) - AOA <- aoa(predictors, model, LPD = TRUE, maxLPD = 1) +expected_error_DI = terra::predict(AOA$DI, errormodel_DI) +plot(expected_error_DI) - ### DI ~ error - errormodel_DI <- errorProfiles(model, AOA, variable = "DI") - plot(errormodel_DI) +### LPD ~ error +errormodel_LPD <- errorProfiles(model, AOA, variable = "LPD") +plot(errormodel_LPD) +summary(errormodel_DI) - expected_error_DI = terra::predict(AOA$DI, errormodel_DI) - plot(expected_error_DI) +expected_error_LPD = terra::predict(AOA$LPD, errormodel_LPD) +plot(expected_error_LPD) - ### LPD ~ error - errormodel_LPD <- errorProfiles(model, AOA, variable = "LPD") - plot(errormodel_LPD) +### geodist ~ error +errormodel_geodist = errorProfiles(model, locations=splotdata, variable = "geodist") +plot(errormodel_geodist) +summary(errormodel_DI) - expected_error_LPD = terra::predict(AOA$LPD, errormodel_LPD) - plot(expected_error_LPD) +dist <- terra::distance(predictors[[1]],vect(splotdata)) +names(dist) <- "geodist" +expected_error_DI <- terra::predict(dist, errormodel_geodist) +plot(expected_error_DI) +### with multiCV = TRUE (for DI ~ error) +errormodel_DI = errorProfiles(model, AOA, multiCV = TRUE, length.out = 3, variable = "DI") +plot(errormodel_DI) - ### geodist ~ error - errormodel_geodist = errorProfiles(model, locations=splotdata, - variable = "geodist") - plot(errormodel_geodist) - - dist <- terra::distance(predictors[[1]],vect(splotdata)) - names(dist) <- "geodist" - expected_error_DI <- terra::predict(dist, errormodel_geodist) - plot(expected_error_DI) - - - ### with multiCV = TRUE (for DI ~ error) - errormodel_DI = errorProfiles(model, AOA, multiCV = TRUE, length.out = 3, variable = "DI") - plot(errormodel_DI) - - expected_error_DI = terra::predict(AOA$DI, errormodel_DI) - plot(expected_error_DI) - - # mask AOA based on new threshold from multiCV - mask_aoa = terra::mask(expected_error_DI, AOA$DI > attr(errormodel_DI, 'AOA_threshold'), - maskvalues = 1) - plot(mask_aoa) - +expected_error_DI = terra::predict(AOA$DI, errormodel_DI) +plot(expected_error_DI) +# mask AOA based on new threshold from multiCV +mask_aoa = terra::mask(expected_error_DI, AOA$DI > attr(errormodel_DI, 'AOA_threshold'), + maskvalues = 1) +plot(mask_aoa) } + } \references{ Meyer, H., Pebesma, E. (2021): Predicting into unknown space? From 6f495cdd28ceae06a99a3f91870f62d714275cca Mon Sep 17 00:00:00 2001 From: fab-scm Date: Wed, 13 Mar 2024 15:18:45 +0100 Subject: [PATCH 3/3] supress progress feedback in and --- tests/testthat/test-aoa.R | 8 ++++---- tests/testthat/test_trainDI.R | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/testthat/test-aoa.R b/tests/testthat/test-aoa.R index 783df669..9b39d8e9 100644 --- a/tests/testthat/test-aoa.R +++ b/tests/testthat/test-aoa.R @@ -32,7 +32,7 @@ loaddata <- function() { test_that("AOA works in default: used with raster data and a trained model", { dat <- loaddata() # calculate the AOA of the trained model for the study area: - AOA <- aoa(dat$studyArea, dat$model) + AOA <- aoa(dat$studyArea, dat$model, verbose = F) #test threshold: expect_equal(as.numeric(round(AOA$parameters$threshold,5)), 0.38986) @@ -55,7 +55,7 @@ test_that("AOA works in default: used with raster data and a trained model", { test_that("AOA works without a trained model", { dat <- loaddata() - AOA <- aoa(dat$studyArea,train=dat$trainDat,variables=dat$variables) + AOA <- aoa(dat$studyArea,train=dat$trainDat,variables=dat$variables, verbose = F) #test threshold: expect_equal(as.numeric(round(AOA$parameters$threshold,5)), 0.52872) @@ -71,7 +71,7 @@ test_that("AOA works without a trained model", { test_that("AOA (including LPD) works with raster data and a trained model", { dat <- loaddata() # calculate the AOA of the trained model for the study area: - AOA <- aoa(dat$studyArea, dat$model, LPD = TRUE, maxLPD = 1) + AOA <- aoa(dat$studyArea, dat$model, LPD = TRUE, maxLPD = 1, verbose = F) #test threshold: expect_equal(as.numeric(round(AOA$parameters$threshold,5)), 0.38986) @@ -94,7 +94,7 @@ test_that("AOA (including LPD) works with raster data and a trained model", { test_that("AOA (inluding LPD) works without a trained model", { dat <- loaddata() - AOA <- aoa(dat$studyArea,train=dat$trainDat,variables=dat$variables, LPD = TRUE, maxLPD = 1) + AOA <- aoa(dat$studyArea,train=dat$trainDat,variables=dat$variables, LPD = TRUE, maxLPD = 1, verbose = F) #test threshold: expect_equal(as.numeric(round(AOA$parameters$threshold,5)), 0.52872) diff --git a/tests/testthat/test_trainDI.R b/tests/testthat/test_trainDI.R index 990906c8..014c579a 100644 --- a/tests/testthat/test_trainDI.R +++ b/tests/testthat/test_trainDI.R @@ -31,7 +31,7 @@ loaddata <- function() { test_that("trainDI works in default for a trained model", { dat <- loaddata() #...then calculate the DI of the trained model: -DI <- trainDI(model=dat$model) +DI <- trainDI(model=dat$model, verbose = F) #test threshold: expect_equal(as.numeric(round(DI$threshold,5)), 0.38986) @@ -50,7 +50,7 @@ expect_equal(as.numeric(colMeans(DI$train)), test_that("trainDI (with LPD = TRUE) works in default for a trained model", { dat <- loaddata() #...then calculate the DI of the trained model: - DI <- trainDI(model=dat$model, LPD = TRUE) + DI <- trainDI(model=dat$model, LPD = TRUE, verbose = F) #test threshold: expect_equal(as.numeric(round(DI$threshold,5)), 0.38986)