Chapter 6 Model validation
In this section we assess the predictive performance of the selected model via k-fold cross-validation. The data set is divided into k equally sized groups of randomly selected observations (Hijmans 2012); iteratively for each fold, the model is fitted to the remaining observations and the held-out group is used for validation, so that every fold serves once as the validation set.
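For intuition, the sketch below (a toy example with hypothetical data, not part of the analysis) shows how dismo::kfold, used later in this chapter, assigns each observation to one of the k groups:

# Toy example (hypothetical data): assigning 10 observations to 5 folds
library(dismo)
toy <- data.frame(id = 1:10)
set.seed(1)                   # reproducible split
toy$fold <- kfold(toy, k = 5) # each row receives a fold label from 1 to 5
table(toy$fold)               # groups of (roughly) equal size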
First, we load the list of required libraries.
requiredPackages <- c(
  #GENERAL USE LIBRARIES --------#
  "here",       # Library for reproducible workflow
  "rstudioapi", # Library for reproducible workflow
  "stringr",
  "RColorBrewer",
  "ggplot2",
  "dplyr",
  "tidyverse",
  "R.utils",
  "ggpubr",
  "hrbrthemes",
  #SPATIAL DATA --------#
  "rgdal",
  "fields",
  "maps",
  "raster",
  #MODEL FIT --------#
  "scam",
  "plotmo",
  "SDMTools",
  "pkgbuild",
  "dismo"
)
We define and run a function that installs any required packages missing from our system and then loads them all.
install_load_function <- function(pkg){
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg))
    install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)
}
install_load_function(requiredPackages)
## here rstudioapi stringr RColorBrewer ggplot2 dplyr
## TRUE TRUE TRUE TRUE TRUE TRUE
## tidyverse R.utils ggpubr hrbrthemes rgdal fields
## TRUE TRUE TRUE TRUE TRUE TRUE
## maps raster scam plotmo SDMTools pkgbuild
## TRUE TRUE TRUE TRUE TRUE TRUE
## dismo
## TRUE
We define some overall settings.
# General settings for ggplot (black-white background, larger base_size)
theme_set(theme_bw(base_size = 16))
We load output from the selected model saved in the previous step.
load(here::here("models", "selected_model.Rdata"))
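Optionally, a quick sanity check (a minimal sketch, assuming the saved object is the fitted scam model named selected_model) confirms what was loaded:

# Optional sanity check on the loaded object
class(selected_model)                 # should be a "scam" model
summary(selected_model)[["formula"]]  # the formula reused later for refitting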
6.1 Optimum threshold
We generate a data frame with the data used in the selected model and add the predicted values.
# PAdata_enviroment used in the selected model
data <- selected_model$model

# Predict
scgam.pred <- predict(selected_model, newdata=data, type="response")

# Add the prediction to the data object
data$scgam.pred <- as.vector(scgam.pred)
head(data)
## occurrenceStatus BO_sstmean BO2_chlomean_ss BO2_salinitymean_ss scgam.pred
## 1 1 16.99439 1.20410309 35.42666 0.9804524
## 5 1 16.99439 1.20410309 35.42666 0.9804524
## 8 1 25.52963 0.08455528 36.43070 0.4238477
## 10 1 16.99439 1.20410309 35.42666 0.9804524
## 11 1 15.21486 0.72283228 34.20426 0.8449155
## 12 1 16.99439 1.20410309 35.42666 0.9804524
The threshold for presence-absence classification for each species is obtained as the value that maximizes sensitivity plus specificity (Jiménez-Valverde and Lobo 2007). If the result were a range (instead of a single value), we would select the mean value of the range.
# Optimizing the threshold probability
obs <- data$occurrenceStatus
predSCGAM_P <- data$scgam.pred

# Threshold optimizing
myoptim <- optim.thresh(obs, predSCGAM_P)
myoptim
## $min.occurence.prediction
## [1] 0.01156124
##
## $mean.occurence.prediction
## [1] 0.6642745
##
## $`10.percent.omission`
## [1] 0.36
##
## $`sensitivity=specificity`
## [1] 0.44
##
## $`max.sensitivity+specificity`
## [1] 0.42
##
## $maxKappa
## [1] 0.42
##
## $max.prop.correct
## [1] 0.42
##
## $min.ROC.plot.distance
## [1] 0.42
# Select the threshold that maximizes the sum of sensitivity and specificity
myThreshold <- as.numeric(myoptim[["max.sensitivity+specificity"]])
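As noted above, the optimum can in principle be a range of equally good thresholds; a minimal sketch of the mean-of-range rule mentioned earlier:

# Hedged sketch: if the optimum were a range, take the mean of the range
thr <- as.numeric(myoptim[["max.sensitivity+specificity"]])
myThreshold <- if (length(thr) > 1) mean(thr) else thr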
Accuracy indicators, such as AUC (the Area Under the Receiver Operating Characteristic, or ROC, curve), sensitivity (the rate of correctly predicted presences) and specificity (the rate of correctly predicted absences), are first computed for all observations.
# Accuracy values with all observations
accuracy(obs, predSCGAM_P, threshold=myThreshold)
## threshold AUC omission.rate sensitivity specificity prop.correct
## 1 0.42 0.7415229 0.1985506 0.8014494 0.6815964 0.7418159
## Kappa
## 1 0.483323
# Create confusion matrix with all observations
confusion.matrix(obs, predSCGAM_P, threshold=myThreshold)
## obs
## pred 0 1
## 0 10059 2959
## 1 4699 11944
## attr(,"class")
## [1] "confusion.matrix"
6.2 k-fold validation
In this case we use 5-fold cross-validation.
# Number of groups
k <- 5

# Generate groups, stratified by presence/absence
groups <- kfold(data, k, by=data$occurrenceStatus)
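A quick check (a minimal sketch) that presences and absences are spread evenly across the folds:

# Optional check: counts of absences/presences per fold
table(groups, data$occurrenceStatus)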
The model is refitted for each of the 5 folds: in each iteration, 80% of the observations are used for training and the indicators are computed on the held-out 20%. Indicators are then averaged across folds.
# Initialise the confusion matrix and the accuracy table:
myCM <- NULL
myACC <- NULL

# Get the formula of the selected model
formula <- summary(selected_model)[["formula"]]

# Get the smoothing parameters of the selected model
sp <- selected_model$sp

# Loop for each group k
for (j in 1:k) {

  # Preparation of training sites
  p_Training <- data[groups != j, ]

  # Model fit
  selected_model.sp.j <- scam(formula, family=binomial(link="logit"),
                              data=p_Training, sp=c(sp))

  # Predict model on the held-out fold
  p_validacion <- data[groups == j, ]
  selected_model.sp.j.pred <- predict(selected_model.sp.j,
                                      newdata=p_validacion, type="response")
  p_validacion$Pred <- selected_model.sp.j.pred

  # Confusion matrix and accuracy table for fold j
  obs <- p_validacion$occurrenceStatus
  predSCGAM <- p_validacion$Pred
  myCM <- rbind(myCM, as.numeric(confusion.matrix(obs, predSCGAM, threshold=myThreshold)))
  myACC <- rbind(myACC, accuracy(obs, predSCGAM, threshold=myThreshold))
}
# Mean values across k-folds
<-cbind(Threshold=myThreshold,
validation_summarymean_AUC=mean(myACC$AUC),
mean_Omision=mean(myACC$omission.rate),
mean_sensitivity=mean(myACC$sensitivity),
mean_specificity=mean(myACC$specificity),
mean_Prop.Corr=mean(myACC$prop.correct))
validation_summary
## Threshold mean_AUC mean_Omision mean_sensitivity mean_specificity
## [1,] 0.42 0.725464 0.2399716 0.7600284 0.6908995
## mean_Prop.Corr
## [1,] 0.7256674
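The per-fold accuracy table also contains a Kappa column (see the output of accuracy() above); as a minimal sketch, a mean Kappa across folds could be reported in the same way:

# Hedged sketch: mean Kappa across the k folds
mean(myACC$Kappa)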
We save the validation summary object.
save(validation_summary, file = here::here("models/validation_summary.RData"))