Chapter 6 Model validation

In this section, we validate the selected model in order to assess its predictive performance. Validation is conducted via k-fold cross-validation: the data set is divided into k equally sized groups of randomly selected observations (Hijmans 2012) and, iterating over the folds, the model is fitted to the observations outside the current fold and evaluated on the observations held within it.
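
To make the mechanics concrete, below is a minimal sketch of such a split on toy data. It assumes the dismo package, which is used later in this chapter for its kfold() function.

# Toy illustration of a k-fold split: each observation is assigned to one of
# k groups, and each group takes one turn as the held-out validation set
library(dismo)
set.seed(42)
toy <- data.frame(x = rnorm(100))
toy_groups <- kfold(toy, k = 5)
table(toy_groups) # roughly 20 observations per fold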

First, we define the list of required packages.

requiredPackages <- c(
  #GENERAL USE LIBRARIES --------#
  "here",        # Library for reproducible workflow
  "rstudioapi",  # Library for reproducible workflow
  "stringr",
  "RColorBrewer",
  "ggplot2",
  "dplyr",
  "tidyverse",
  "R.utils",
  "ggpubr",
  "hrbrthemes",

  #SPATIAL DATA --------#
  "rgdal",
  "fields",
  "maps",
  "raster",

  #MODEL FIT --------#
  "scam",
  "plotmo",
  "SDMTools",
  "pkgbuild",
  "dismo"
)

We define a function that installs any required packages missing from our system and then loads all of them.

install_load_function <- function(pkg){
  # Identify the packages that are not yet installed
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  # Install any missing packages together with their dependencies
  if (length(new.pkg))
    install.packages(new.pkg, dependencies = TRUE)
  # Load every package, returning TRUE/FALSE for each one
  sapply(pkg, require, character.only = TRUE)
}

install_load_function(requiredPackages)
##         here   rstudioapi      stringr RColorBrewer      ggplot2        dplyr 
##         TRUE         TRUE         TRUE         TRUE         TRUE         TRUE 
##    tidyverse      R.utils       ggpubr   hrbrthemes        rgdal       fields 
##         TRUE         TRUE         TRUE         TRUE         TRUE         TRUE 
##         maps       raster         scam       plotmo     SDMTools     pkgbuild 
##         TRUE         TRUE         TRUE         TRUE         TRUE         TRUE 
##        dismo 
##         TRUE
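
Note that some of these packages (notably rgdal and SDMTools) have since been archived on CRAN, so install.packages() may fail for them. A possible workaround is to install an archived release via the remotes package; the version number below is an assumption, so check the CRAN archive for the correct one.

# SDMTools is archived on CRAN; install an archived release if needed
# (the version number is an assumption -- verify it in the CRAN archive)
# install.packages("remotes")
remotes::install_version("SDMTools", version = "1.1-221.2")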

We define some overall settings.

# General settings for ggplot (black-white background, larger base_size)
theme_set(theme_bw(base_size = 16))

We load the selected model saved in the previous step.

load(here::here("models", "selected_model.Rdata"))

6.1 Optimum threshold

We generate a data frame with the data used to fit the selected model and add the predicted values.

# Presence-absence and environmental data used to fit the selected model
data <- selected_model$model

# Predict on the same data
scgam.pred <- predict(selected_model, newdata = data, type = "response")

# Add the predictions to the data object
data$scgam.pred <- as.vector(scgam.pred)
head(data)
##    occurrenceStatus BO_sstmean BO2_chlomean_ss BO2_salinitymean_ss scgam.pred
## 1                 1   16.99439      1.20410309            35.42666  0.9804524
## 5                 1   16.99439      1.20410309            35.42666  0.9804524
## 8                 1   25.52963      0.08455528            36.43070  0.4238477
## 10                1   16.99439      1.20410309            35.42666  0.9804524
## 11                1   15.21486      0.72283228            34.20426  0.8449155
## 12                1   16.99439      1.20410309            35.42666  0.9804524

The threshold for presence-absence classification for each species is obtained as the value that maximizes the sum of sensitivity and specificity (Jiménez-Valverde and Lobo 2007). If the result were a range (instead of a single value), we would select the mean value of the range.

# Optimizing the threshold probability
obs <- data$occurrenceStatus
predSCGAM_P <- data$scgam.pred
  
# Threshold optimization
myoptim <- optim.thresh(obs, predSCGAM_P)
myoptim
## $min.occurence.prediction
## [1] 0.01156124
## 
## $mean.occurence.prediction
## [1] 0.6642745
## 
## $`10.percent.omission`
## [1] 0.36
## 
## $`sensitivity=specificity`
## [1] 0.44
## 
## $`max.sensitivity+specificity`
## [1] 0.42
## 
## $maxKappa
## [1] 0.42
## 
## $max.prop.correct
## [1] 0.42
## 
## $min.ROC.plot.distance
## [1] 0.42
# Select the threshold that maximizes the sum of sensitivity and specificity
myThreshold <- as.numeric(myoptim[["max.sensitivity+specificity"]])
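
As noted above, optim.thresh() can return a range of values for a criterion rather than a single number. A minimal sketch of how such a range would be averaged (not needed for this run, where a single value was returned):

# If the criterion had returned a range, take its mean value
thr <- myoptim[["max.sensitivity+specificity"]]
if (length(thr) > 1) myThreshold <- mean(as.numeric(thr))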

Accuracy indicators, such as the AUC (area under the receiver operating characteristic, or ROC, curve), sensitivity (the proportion of observed presences correctly predicted) and specificity (the proportion of observed absences correctly predicted), are first computed for all observations.

# Accuracy values with all observations
accuracy(obs, predSCGAM_P, threshold = myThreshold)
##   threshold       AUC omission.rate sensitivity specificity prop.correct
## 1      0.42 0.7415229     0.1985506   0.8014494   0.6815964    0.7418159
##      Kappa
## 1 0.483323
# Create confusion matrix with all observations
confusion.matrix(obs, predSCGAM_P, threshold = myThreshold)
##     obs
## pred     0     1
##    0 10059  2959
##    1  4699 11944
## attr(,"class")
## [1] "confusion.matrix"
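
As a sanity check, these indicators can be recomputed by hand from the confusion matrix, whose rows are predictions and whose columns are observations:

# Recompute the indicators from the confusion matrix entries
cm <- confusion.matrix(obs, predSCGAM_P, threshold = myThreshold)
TN <- cm[1, 1]; FN <- cm[1, 2]; FP <- cm[2, 1]; TP <- cm[2, 2]
TP / (TP + FN)      # sensitivity: 11944 / (11944 + 2959) = 0.8014
TN / (TN + FP)      # specificity: 10059 / (10059 + 4699) = 0.6816
(TP + TN) / sum(cm) # proportion correct: 0.7418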

6.2 k-fold validation

In this case we use 5-fold cross-validation.

# Number of groups
k <- 5

# Generate groups, stratified by presence/absence
groups <- kfold(data, k, by = data$occurrenceStatus)
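
Before fitting, it is worth confirming that the stratified split worked; a quick optional check:

# Each fold should hold roughly 20% of the data with a similar
# presence/absence balance
table(groups, data$occurrenceStatus)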

The model is refitted for each of the 5 folds, using the 80% of observations outside the fold for training, and the accuracy indicators are then computed on the 20% of observations held out in the fold. The indicators are finally averaged across folds.

# Initialise the confusion matrix and the accuracy table
myCM <- NULL
myACC <- NULL

# Get the formula of the selected model
formula <- summary(selected_model)[["formula"]]

# Get the smoothing parameters of the selected model
sp <- selected_model$sp

# Loop over each fold j
for (j in 1:k) {
  # Training data: observations outside fold j
  p_Training <- data[groups != j, ]

  # Refit the model with the selected formula and smoothing parameters
  selected_model.sp.j <- scam(formula, family = binomial(link = "logit"),
                              data = p_Training, sp = sp)

  # Validation data: observations in fold j
  p_validacion <- data[groups == j, ]

  # Predict on the validation data
  selected_model.sp.j.pred <- predict(selected_model.sp.j,
                                      newdata = p_validacion, type = "response")
  p_validacion$Pred <- selected_model.sp.j.pred

  # Confusion matrix and accuracy table for fold j
  obs <- p_validacion$occurrenceStatus
  predSCGAM <- p_validacion$Pred
  myCM <- rbind(myCM, as.numeric(confusion.matrix(obs, predSCGAM, threshold = myThreshold)))
  myACC <- rbind(myACC, accuracy(obs, predSCGAM, threshold = myThreshold))
}
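
Before averaging, it can be informative to inspect the per-fold results, since a large spread across folds would flag an unstable model. A quick optional check:

# Per-fold accuracy table and the spread of each indicator across folds
round(myACC, 3)
apply(myACC[, -1], 2, sd)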
  
# Mean values across the k folds
validation_summary <- cbind(Threshold = myThreshold,
                            mean_AUC = mean(myACC$AUC),
                            mean_Omission = mean(myACC$omission.rate),
                            mean_sensitivity = mean(myACC$sensitivity),
                            mean_specificity = mean(myACC$specificity),
                            mean_Prop.Corr = mean(myACC$prop.correct))

validation_summary
##      Threshold mean_AUC mean_Omission mean_sensitivity mean_specificity
## [1,]      0.42 0.725464     0.2399716        0.7600284        0.6908995
##      mean_Prop.Corr
## [1,]      0.7256674

The cross-validated indicators are only slightly lower than those computed on all observations (a mean AUC of 0.725 versus 0.742), suggesting that the model generalizes reasonably well to unseen data. We save the validation summary object.

save(validation_summary, file = here::here("models", "validation_summary.RData"))

References

Hijmans, RJ. 2012. “Cross-Validation of Species Distribution Models: Removing Spatial Sorting Bias and Calibration with a Null Model.” Ecology 93 (3): 679–88. https://doi.org/10.1890/11-0826.1.
Jiménez-Valverde, A, and JM Lobo. 2007. “Threshold Criteria for Conversion of Probability of Species Presence to Either-or Presence-Absence.” Acta Oecologica 31 (3): 361–69. https://doi.org/10.1016/j.actao.2007.02.001.