Chapter 6 Model validation
In this section we assess the predictive performance of the selected model via k-fold cross-validation. The data set is divided into k equally sized groups of randomly selected observations (Hijmans 2012); iteratively for each fold, the model is fitted to the remaining observations and the held-out group is used for validation, so that every fold serves once as the validation set.
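For intuition, the sketch below (a toy example with hypothetical data, not part of the analysis) shows how dismo::kfold, used later in this chapter, assigns each observation to one of the k groups:

# Toy example (hypothetical data): assigning 10 observations to 5 folds
library(dismo)
toy <- data.frame(id = 1:10)
set.seed(1)                   # reproducible split
toy$fold <- kfold(toy, k = 5) # each row receives a fold label from 1 to 5
table(toy$fold)               # groups of (roughly) equal size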
First, we load the list of required libraries.
requiredPackages <- c(
  #GENERAL USE LIBRARIES --------#
  "here",       # Library for reproducible workflow
  "rstudioapi", # Library for reproducible workflow
  "stringr",
  "RColorBrewer",
  "ggplot2",
  "dplyr",
  "tidyverse",
  "R.utils",
  "ggpubr",
  "hrbrthemes",
  #SPATIAL DATA --------#
  "rgdal",
  "fields",
  "maps",
  "raster",
  #MODEL FIT --------#
  "scam",
  "plotmo",
  "SDMTools",
  "pkgbuild",
  "dismo"
)
We define and run a function that installs any required packages missing from our system and then loads them all.
install_load_function <- function(pkg){
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg))
    install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)
}
install_load_function(requiredPackages)
## here rstudioapi stringr RColorBrewer ggplot2 dplyr
## TRUE TRUE TRUE TRUE TRUE TRUE
## tidyverse R.utils ggpubr hrbrthemes rgdal fields
## TRUE TRUE TRUE TRUE TRUE TRUE
## maps raster scam plotmo SDMTools pkgbuild
## TRUE TRUE TRUE TRUE TRUE TRUE
## dismo
## TRUE
We define some overall settings.
# General settings for ggplot (black-white background, larger base_size)
theme_set(theme_bw(base_size = 16))
We load output from the selected model saved in the previous step.
load(here::here("models", "selected_model.Rdata"))
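Optionally, a quick sanity check (a minimal sketch, assuming the saved object is the fitted scam model named selected_model) confirms what was loaded:

# Optional sanity check on the loaded object
class(selected_model)                 # should be a "scam" model
summary(selected_model)[["formula"]]  # the formula reused later for refitting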
6.1 Optimum threshold
We generate a data frame with the data used in the selected model and add the predicted values.
# PAdata_enviroment used in the selected model
data <- selected_model$model

# Predict
scgam.pred <- predict(selected_model, newdata=data, type="response")

# Add the prediction to the data object
data$scgam.pred <- as.vector(scgam.pred)
head(data)
## occurrenceStatus BO_sstmean BO2_chlomean_ss BO2_salinitymean_ss scgam.pred
## 1 1 16.99439 1.20410309 35.42666 0.9804524
## 5 1 16.99439 1.20410309 35.42666 0.9804524
## 8 1 25.52963 0.08455528 36.43070 0.4238477
## 10 1 16.99439 1.20410309 35.42666 0.9804524
## 11 1 15.21486 0.72283228 34.20426 0.8449155
## 12 1 16.99439 1.20410309 35.42666 0.9804524
The threshold for presence-absence classification for each species is obtained as the value that maximizes sensitivity plus specificity (Jiménez-Valverde and Lobo 2007). If the result were a range (instead of a single value), we would select the mean value of the range.
# Optimizing the threshold probability
obs <- data$occurrenceStatus
predSCGAM_P <- data$scgam.pred

# Threshold optimizing
myoptim <- optim.thresh(obs, predSCGAM_P)
myoptim
## $min.occurence.prediction
## [1] 0.01156124
##
## $mean.occurence.prediction
## [1] 0.6642745
##
## $`10.percent.omission`
## [1] 0.36
##
## $`sensitivity=specificity`
## [1] 0.44
##
## $`max.sensitivity+specificity`
## [1] 0.42
##
## $maxKappa
## [1] 0.42
##
## $max.prop.correct
## [1] 0.42
##
## $min.ROC.plot.distance
## [1] 0.42
# Select the threshold that maximizes the sum of sensitivity and specificity
myThreshold <- as.numeric(myoptim[["max.sensitivity+specificity"]])
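As noted above, the optimum can in principle be a range of equally good thresholds; a minimal sketch of the mean-of-range rule mentioned earlier:

# Hedged sketch: if the optimum were a range, take the mean of the range
thr <- as.numeric(myoptim[["max.sensitivity+specificity"]])
myThreshold <- if (length(thr) > 1) mean(thr) else thr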
Accuracy indicators, such as AUC (the Area Under the Receiver Operating Characteristic, or ROC, curve), sensitivity (the rate of correctly predicted presences) and specificity (the rate of correctly predicted absences), are first computed for all observations.
# Accuracy values with all observations
accuracy(obs, predSCGAM_P, threshold=myThreshold)
## threshold AUC omission.rate sensitivity specificity prop.correct
## 1 0.42 0.7415229 0.1985506 0.8014494 0.6815964 0.7418159
## Kappa
## 1 0.483323
# Create confusion matrix with all observations
confusion.matrix(obs, predSCGAM_P, threshold=myThreshold)
## obs
## pred 0 1
## 0 10059 2959
## 1 4699 11944
## attr(,"class")
## [1] "confusion.matrix"
6.2 k-fold validation
In this case we use 5-fold cross-validation.
# Number of groups
k <- 5

# Generate groups, stratified by presence/absence
groups <- kfold(data, k, by=data$occurrenceStatus)
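A quick check (a minimal sketch) that presences and absences are spread evenly across the folds:

# Optional check: counts of absences/presences per fold
table(groups, data$occurrenceStatus)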
The model is refitted for each of the 5 folds: in each iteration, 80% of the observations are used for training and the indicators are computed on the held-out 20%. Indicators are then averaged across folds.
# Initialise the confusion matrix and the accuracy table:
myCM <- NULL
myACC <- NULL

# Get the formula of the selected model
formula <- summary(selected_model)[["formula"]]

# Get the smoothing parameters of the selected model
sp <- selected_model$sp

# Loop for each group k
for (j in 1:k) {

  # Preparation of training sites
  p_Training <- data[groups != j, ]

  # Model fit
  selected_model.sp.j <- scam(formula, family=binomial(link="logit"),
                              data=p_Training, sp=c(sp))

  # Predict model on the held-out fold
  p_validacion <- data[groups == j, ]
  selected_model.sp.j.pred <- predict(selected_model.sp.j,
                                      newdata=p_validacion, type="response")
  p_validacion$Pred <- selected_model.sp.j.pred

  # Confusion matrix and accuracy table for fold j
  obs <- p_validacion$occurrenceStatus
  predSCGAM <- p_validacion$Pred
  myCM <- rbind(myCM, as.numeric(confusion.matrix(obs, predSCGAM, threshold=myThreshold)))
  myACC <- rbind(myACC, accuracy(obs, predSCGAM, threshold=myThreshold))
}
# Mean values across k-folds
<-cbind(Threshold=myThreshold,
validation_summarymean_AUC=mean(myACC$AUC),
mean_Omision=mean(myACC$omission.rate),
mean_sensitivity=mean(myACC$sensitivity),
mean_specificity=mean(myACC$specificity),
mean_Prop.Corr=mean(myACC$prop.correct))
validation_summary
## Threshold mean_AUC mean_Omision mean_sensitivity mean_specificity
## [1,] 0.42 0.725464 0.2399716 0.7600284 0.6908995
## mean_Prop.Corr
## [1,] 0.7256674
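The per-fold accuracy table also contains a Kappa column (see the output of accuracy() above); as a minimal sketch, a mean Kappa across folds could be reported in the same way:

# Hedged sketch: mean Kappa across the k folds
mean(myACC$Kappa)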
We save the validation summary object.
save(validation_summary, file = here::here("models/validation_summary.RData"))