From 5251e4838544cabb413d6dcfc3420c502b0aaed5 Mon Sep 17 00:00:00 2001 From: Ludwigm6 Date: Wed, 3 Jan 2024 14:11:50 +0100 Subject: [PATCH] no cache --- vignettes/cast01-CAST-intro-cookfarm.Rmd | 32 ++-- vignettes/cast04-plotgeodist.R | 179 ----------------------- 2 files changed, 16 insertions(+), 195 deletions(-) delete mode 100644 vignettes/cast04-plotgeodist.R diff --git a/vignettes/cast01-CAST-intro-cookfarm.Rmd b/vignettes/cast01-CAST-intro-cookfarm.Rmd index 7bd60fde..d898524b 100644 --- a/vignettes/cast01-CAST-intro-cookfarm.Rmd +++ b/vignettes/cast01-CAST-intro-cookfarm.Rmd @@ -14,7 +14,7 @@ editor_options: --- ```{r setup, echo=FALSE} -knitr::opts_chunk$set(fig.width = 8.83,cache = TRUE) +knitr::opts_chunk$set(fig.width = 8.83,cache = FALSE) user_hanna <- Sys.getenv("USER") %in% c("hanna") ``` @@ -34,14 +34,14 @@ In order to follow this tutorial, I assume that the reader is familiar with the To work with the tutorial, first install the CAST package and load the library: -```{r, message = FALSE, warning=FALSE} +```{r c1, message = FALSE, warning=FALSE} #install.packages("CAST") library(CAST) ``` If you need help, see -```{r, message = FALSE, warning=FALSE} +```{r c2, message = FALSE, warning=FALSE} help(CAST) ``` @@ -52,7 +52,7 @@ The example prediction task for this tutorial is the following: we have a set of To do so, we will work with the cookfarm dataset, described in e.g. [Gasch et al 2015](https://www.sciencedirect.com/science/article/pii/S2211675315000251/) and available via the GSIF package ([Hengl 2017](https://CRAN.R-project.org/package=GSIF)). The dataset included in the CAST package is a re-structured dataset which was used for the analysis in [Meyer et al 2018](https://www.sciencedirect.com/science/article/pii/S1364815217310976). -```{r, message = FALSE, warning=FALSE} +```{r c3, message = FALSE, warning=FALSE} data <- readRDS(system.file("extdata","Cookfarm.RDS",package="CAST")) head(data) ``` @@ -62,7 +62,7 @@ See [Gasch et al 2015](https://www.sciencedirect.com/science/article/pii/S221167 To get an impression on the spatial properties of the dataset, let's have a look on the spatial distribution of the data loggers on the cookfarm: -```{r, message = FALSE, warning=FALSE} +```{r c4, message = FALSE, warning=FALSE} library(sf) data_sp <- unique(data[,c("SOURCEID","Easting","Northing")]) @@ -70,7 +70,7 @@ data_sp <- st_as_sf(data_sp,coords=c("Easting","Northing"),crs=26911) plot(data_sp,axes=T,col="black") ``` -```{r, message = FALSE, warning=FALSE, eval=user_hanna} +```{r c5, message = FALSE, warning=FALSE, eval=user_hanna} #...or plot the data with mapview: library(mapview) mapviewOptions(basemaps = c("Esri.WorldImagery")) @@ -84,7 +84,7 @@ We see that the data are taken at 42 locations (SOURCEID) over the field. The lo To reduce the data to an amount that can be handled in a tutorial, let's restrict the data to the depth of -0.3 and to two weeks of the year 2012. After subsetting let's have an overview on the soil moisture time series measured by the data loggers. -```{r, message = FALSE, warning=FALSE} +```{r c6, message = FALSE, warning=FALSE} library(lubridate) library(ggplot2) trainDat <- data[data$altitude==-0.3& @@ -102,7 +102,7 @@ In the following we will use this subset of the cookfarm data as an example to s To start with, lets use this dataset to create a "default" Random Forest model that predicts soil moisture based on some predictor variables. To keep computation time at a minimum, we don't include hyperparameter tuning (hence mtry was set to 2) which is reasonable as Random Forests are comparably insensitive to tuning. -```{r, message = FALSE, warning=FALSE} +```{r c7, message = FALSE, warning=FALSE} library(caret) predictors <- c("DEM","TWI","Precip_cum","cday", "MaxT_wrcc","Precip_wrcc","BLD", @@ -118,7 +118,7 @@ model <- train(trainDat[,predictors],trainDat$VW, Based on the trained model we can make spatial predictions of soil moisture. To do this we load a multiband raster that contains spatial data of all predictor variables for the 25th of March 2012 (as an example). We then apply the trained model on this data set. -```{r, message = FALSE, warning=FALSE} +```{r c8, message = FALSE, warning=FALSE} library(terra) predictors_sp <- rast(system.file("extdata","predictors_2012-03-25.tif",package="CAST")) prediction <- predict(predictors_sp,model,na.rm=TRUE) @@ -139,7 +139,7 @@ In the example above we used a random k-fold CV that we defined in caret's train To assess the performance of the model let's have a look on the output of the Random CV: -```{r, message = FALSE, warning=FALSE} +```{r c9, message = FALSE, warning=FALSE} model ``` @@ -158,7 +158,7 @@ Note that several suggestions of spatial CV exist. What we call LLO here is just -```{r, message = FALSE, warning=FALSE} +```{r c10, message = FALSE, warning=FALSE} set.seed(10) indices <- CreateSpacetimeFolds(trainDat,spacevar = "SOURCEID", k=3) @@ -177,7 +177,7 @@ Apparently, there is considerable overfitting in the model, causing a good rando Let's have a look at the variable importance ranking of Random Forest and see if we find something suspicious: -```{r, message = FALSE, warning=FALSE} +```{r c11, message = FALSE, warning=FALSE} plot(varImp(model_LLO)) ``` @@ -195,7 +195,7 @@ ffs is doing this job by first training models using all possible pairs of two p So let's run the ffs on our case study using R² as a metric to select the optimal variables. This process will take 1-2 minutes... -```{r, message = FALSE, warning=FALSE} +```{r c12, message = FALSE, warning=FALSE} set.seed(10) ffsmodel_LLO <- ffs(trainDat[,predictors],trainDat$VW,metric="Rsquared", method="rf", tuneGrid=data.frame("mtry"=2), @@ -212,7 +212,7 @@ Using the ffs with LLO CV, the R² could be increased from 0.16 to 0.28. The var Using the plot$\_$ffs function we can visualize how the performance of the model changed depending on the variables being used: -```{r, message = FALSE, warning=FALSE} +```{r c13, message = FALSE, warning=FALSE} plot(ffsmodel_LLO) ``` @@ -221,7 +221,7 @@ Note that the R² features a high standard deviation regardless of the variables What effect does the new model has on the spatial representation of soil moisture? -```{r, message = FALSE, warning=FALSE} +```{r c14, message = FALSE, warning=FALSE} prediction_ffs <- predict(predictors_sp,ffsmodel_LLO,na.rm=TRUE) plot(prediction_ffs) ``` @@ -232,7 +232,7 @@ We see that the variable selection does not only have an effect on the statistic Still it is required to analyse if the model can be applied to the entire study area of if there are locations that are very different in their predictor properties to what the model has learned from. See more details in the vignette on the Area of applicability and [Meyer and Pebesma 2021](https://doi.org/10.1111/2041-210X.13650). -```{r, message = FALSE, warning=FALSE} +```{r c15, message = FALSE, warning=FALSE} ### AOA for which the spatial CV error applies: AOA <- aoa(predictors_sp,ffsmodel_LLO) diff --git a/vignettes/cast04-plotgeodist.R b/vignettes/cast04-plotgeodist.R deleted file mode 100644 index 2c03c1dc..00000000 --- a/vignettes/cast04-plotgeodist.R +++ /dev/null @@ -1,179 +0,0 @@ -## ----setup, include=FALSE----------------------------------------------------- -knitr::opts_chunk$set(echo = TRUE,fig.width=6.2, fig.height=3.4) - - -## ----message = FALSE, warning=FALSE------------------------------------------- -library(CAST) -library(caret) -library(terra) -library(sf) -library(rnaturalearth) -library(ggplot2) - -## ----message = FALSE, warning=FALSE------------------------------------------- -seed <- 10 # random realization -samplesize <- 300 # how many samples will be used? -nparents <- 20 #For clustered samples: How many clusters? -radius <- 500000 # For clustered samples: What is the radius of a cluster? - - -## ----message = FALSE, warning=FALSE------------------------------------------- -ee <- st_crs("+proj=eqearth") -co <- ne_countries(returnclass = "sf") -co.ee <- st_transform(co, ee) - -## ----message = FALSE, warning=FALSE, results='hide'--------------------------- -sf_use_s2(FALSE) -set.seed(seed) -pts_random <- st_sample(co.ee, samplesize) -### See points on the map: -ggplot() + geom_sf(data = co.ee, fill="#00BFC4",col="#00BFC4") + - geom_sf(data = pts_random, color = "#F8766D",size=0.5, shape=3) + - guides(fill = "none", col = "none") + - labs(x = NULL, y = NULL) - - -## ----message = FALSE, warning=FALSE, results='hide'--------------------------- -set.seed(seed) -sf_use_s2(FALSE) -pts_clustered <- clustered_sample(co.ee, samplesize, nparents, radius) - -ggplot() + geom_sf(data = co.ee, fill="#00BFC4",col="#00BFC4") + - geom_sf(data = pts_clustered, color = "#F8766D",size=0.5, shape=3) + - guides(fill = "none", col = "none") + - labs(x = NULL, y = NULL) - - -## ----message = FALSE, warning=FALSE, results='hide'--------------------------- -dist_random <- geodist(pts_random,co.ee, - sampling="Fibonacci") -dist_clstr <- geodist(pts_clustered,co.ee, - sampling="Fibonacci") - -plot(dist_random, unit = "km")+scale_x_log10(labels=round)+ggtitle("Randomly distributed reference data") -plot(dist_clstr, unit = "km")+scale_x_log10(labels=round)+ggtitle("Clustered reference data") - - -## ----message = FALSE, warning=FALSE, results='hide'--------------------------- -randomfolds <- caret::createFolds(1:nrow(pts_clustered)) - -## ----message = FALSE, warning=FALSE, results='hide',echo=FALSE---------------- -for (i in 1:nrow(pts_clustered)){ - pts_clustered$randomCV[i] <- which(unlist(lapply(randomfolds,function(x){sum(x%in%i)}))==1) -} - -ggplot() + geom_sf(data = co.ee, fill="#00BFC4",col="#00BFC4") + - geom_sf(data = pts_clustered, color = rainbow(max(pts_clustered$randomCV))[pts_clustered$randomCV],size=0.5, shape=3) + - guides(fill = FALSE, col = FALSE) + - labs(x = NULL, y = NULL)+ggtitle("random fold membership shown by color") - -## ----message = FALSE, warning=FALSE, results='hide'--------------------------- -dist_clstr <- geodist(pts_clustered,co.ee, - sampling="Fibonacci", - cvfolds= randomfolds) -plot(dist_clstr, unit = "km")+scale_x_log10(labels=round) - - -## ----message = FALSE, warning=FALSE, results='hide'--------------------------- -spatialfolds <- CreateSpacetimeFolds(pts_clustered,spacevar="parent",k=length(unique(pts_clustered$parent))) - -## ----message = FALSE, warning=FALSE, results='hide',echo=FALSE---------------- -ggplot() + geom_sf(data = co.ee, fill="#00BFC4",col="#00BFC4") + - geom_sf(data = pts_clustered, color = rainbow(max(pts_clustered$parent))[pts_clustered$parent],size=0.5, shape=3) + - guides(fill = FALSE, col = FALSE) + - labs(x = NULL, y = NULL)+ ggtitle("spatial fold membership by color") - -## ----message = FALSE, warning=FALSE, results='hide'--------------------------- -dist_clstr <- geodist(pts_clustered,co.ee, - sampling="Fibonacci", - cvfolds= spatialfolds$indexOut) -plot(dist_clstr, unit = "km")+scale_x_log10(labels=round) - - - -## ----message = FALSE, warning=FALSE, results='hide'--------------------------- -# create a spatial CV for the randomly distributed data. Here: -# "leave region-out-CV" -sf_use_s2(FALSE) -pts_random_co <- st_join(st_as_sf(pts_random),co.ee) - - -ggplot() + geom_sf(data = co.ee, fill="#00BFC4",col="#00BFC4") + - geom_sf(data = pts_random_co, aes(color=subregion),size=0.5, shape=3) + - scale_color_manual(values=rainbow(length(unique(pts_random_co$subregion))))+ - guides(fill = FALSE, col = FALSE) + - labs(x = NULL, y = NULL)+ ggtitle("spatial fold membership by color") - - -## ----message = FALSE, warning=FALSE, results='hide'--------------------------- - -spfolds_rand <- CreateSpacetimeFolds(pts_random_co,spacevar = "subregion", - k=length(unique(pts_random_co$subregion))) -dist_rand_sp <- geodist(pts_random_co,co.ee, - sampling="Fibonacci", - cvfolds= spfolds_rand$indexOut) -plot(dist_rand_sp, unit = "km")+scale_x_log10(labels=round) - -## ----message = FALSE, warning=FALSE, results='hide'--------------------------- - - -nndmfolds_clstr <- nndm(pts_clustered, modeldomain=co.ee, samplesize = 2000) -dist_clstr <- geodist(pts_clustered,co.ee, - sampling = "Fibonacci", - cvfolds = nndmfolds_clstr$indx_test, - cvtrain = nndmfolds_clstr$indx_train) -plot(dist_clstr, unit = "km")+scale_x_log10(labels=round) - - -## ----message = FALSE, warning=FALSE, results='hide'--------------------------- - -nndmfolds_rand <- nndm(pts_random_co, modeldomain=co.ee, samplesize = 2000) -dist_rand <- geodist(pts_random_co,co.ee, - sampling = "Fibonacci", - cvfolds = nndmfolds_rand$indx_test, - cvtrain = nndmfolds_rand$indx_train) -plot(dist_rand, unit = "km")+scale_x_log10(labels=round) - - -## ----message = FALSE, warning=FALSE, results='hide'--------------------------- - -knndmfolds_clstr <- knndm(pts_clustered, modeldomain=co.ee, samplesize = 2000) -pts_clustered$knndmCV <- as.character(knndmfolds_clstr$clusters) - -ggplot() + geom_sf(data = co.ee, fill="#00BFC4",col="#00BFC4") + - geom_sf(data = pts_clustered, aes(color=knndmCV),size=0.5, shape=3) + - scale_color_manual(values=rainbow(length(unique(pts_clustered$knndmCV))))+ - guides(fill = FALSE, col = FALSE) + - labs(x = NULL, y = NULL)+ ggtitle("spatial fold membership by color") - - -dist_clstr <- geodist(pts_clustered,co.ee, - sampling = "Fibonacci", - cvfolds = knndmfolds_clstr$indx_test, - cvtrain = knndmfolds_clstr$indx_train) -plot(dist_clstr, unit = "km")+scale_x_log10(labels=round) - - -## ----message = FALSE, warning=FALSE, results='hide'--------------------------- -predictors_global <- rast(system.file("extdata","bioclim_global.tif",package="CAST")) - -plot(predictors_global) - -## ----message = FALSE, warning=FALSE, results='hide'--------------------------- - -# use random CV: -dist_clstr_rCV <- geodist(pts_clustered,predictors_global, - type = "feature", - sampling="Fibonacci", - cvfolds = randomfolds) - -# use spatial CV: -dist_clstr_sCV <- geodist(pts_clustered,predictors_global, - type = "feature", sampling="Fibonacci", - cvfolds = spatialfolds$indexOut) - - -# Plot results: -plot(dist_clstr_rCV)+scale_x_log10()+ggtitle("Clustered reference data and random CV") -plot(dist_clstr_sCV)+scale_x_log10()+ggtitle("Clustered reference data and spatial CV") -