back to overview  
 

Preparation: Please open your RStudio project and download all new data for the class from Stud-IP (log in first) and place them into your project folder /data/. You can use getwd() to get your current working directory which is your project folder as long as you do not change it. Please install the following R-packages using install.packages() if not done yet:

And load them into your console:

library("dplyr")
library("ggplot2")
library("sf")
library("terra")
library("tidyterra")
library("BIEN")
library("ROCR")
library("performance")

 

 

If you want to visualize this tutorial in the viewer inside RStudio (to save space on your screen) run the following chunk of code:

install.packages("rstudioapi") # install an R-package required for this step
dir <- tempfile()
dir.create(dir)
download.file("https://gift.uni-goettingen.de/mcmmb/index.html",
              destfile = file.path(dir, "index.html"))
download.file("https://gift.uni-goettingen.de/mcmmb/Day3.html",
              destfile = file.path(dir, "Day3.html"))
htmlFile <- file.path(dir, "Day3.html")
rstudioapi::viewer(htmlFile)

Now you can conveniently copy code from the viewer into your script.
 

1. Loading palm data set and setting up for later analyses

Load palm data set

For today’s exercise, we will continue to use the Neotropical palm data set from Kreft, H., Sommer, J.H. & Barthlott, W. (2006) The significance of geographic range size for spatial diversity patterns in Neotropical palms. Ecography, 29, 21-30. If you still haven’t downloaded the dataset, you can find it here

species <- read.csv("data/palms_species_per_gridcell.csv")

 

Data inspection

head(species) 
##   grid_id spp_Id    GENUS EPITHET
## 1   13735    550 Ammandra natalia
## 2   13736    550 Ammandra natalia
## 3   13737    550 Ammandra natalia
## 4   13738    550 Ammandra natalia
## 5   13910    550 Ammandra natalia
## 6   13911    550 Ammandra natalia
str(species)
## 'data.frame':    140860 obs. of  4 variables:
##  $ grid_id: int  13735 13736 13737 13738 13910 13911 13912 13913 13914 14085 ...
##  $ spp_Id : int  550 550 550 550 550 550 550 550 550 550 ...
##  $ GENUS  : chr  "Ammandra" "Ammandra" "Ammandra" "Ammandra" ...
##  $ EPITHET: chr  "natalia" "natalia" "natalia" "natalia" ...

There is no column with the species name so let’s make one by combining the columns Genus and Epithet:

species$species_name <- paste(species$GENUS, species$EPITHET, sep = " ")

 

Create a subset of the palm data set (4 palm species) From now on, we will only work with the following species:
- Geonoma pauciflora
- Geonoma weberbaueri which is actually a synonym of Geonoma undata
- Phytelephas tenuicaulis
- Mauritia flexuosa

Let’s start by first changing the name of Geonoma weberbaueri to Geonoma undata, and then making a subset with these 4 species:

species[which(species$species_name == "Geonoma weberbaueri"), "species"] <- "Geonoma undata"

subset_sp <- c("Geonoma pauciflora", "Geonoma undata",
               "Phytelephas tenuicaulis", "Mauritia flexuosa")

species <- species[which(species$species_name %in% subset_sp),]
unique(species$species_name) # list species in the "new" data set
## [1] "Phytelephas tenuicaulis" "Geonoma undata"         
## [3] "Geonoma pauciflora"      "Mauritia flexuosa"

 

Load GIS shapefile & join it to species distribution data

grid <- st_read("data/30min_grid_select50%.shp")
## Reading layer `30min_grid_select50%' from data source 
##   `P:\POSTDOC\Teaching\MCMM - Macroecology SE\MCMMB\data\30min_grid_select50%.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 6038 features and 3 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -116.0007 ymin: -35 xmax: -35.00066 ymax: 36.5
## CRS:           NA
st_crs(grid)
## Coordinate Reference System: NA

The coordinate reference system (CRS) is not defined. It is actually based on the latitude/longitude system.

st_crs(grid) <- "EPSG:4326"

# Alternatively:
# st_crs(grid) <- "+proj=longlat +ellps=WGS84 +no_defs"
# or if we use the unique EPSG identifier
# st_crs(grid) <- 4326 # https://epsg.io/4326


# Shape file showing the borders of the Neotropics
americas <- st_read("data/americas.shp")  
## Reading layer `americas' from data source 
##   `P:\POSTDOC\Teaching\MCMM - Macroecology SE\MCMMB\data\americas.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 15 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -124.7158 ymin: -55.91972 xmax: -29.84 ymax: 49.37666
## CRS:           NA
# st_crs(americas) <- 3857
st_crs(americas) <- "EPSG:4326"

We can now merge the spatial grid with the species table. Remember to put the spatial table on the left part, not to lose its spatial component.

colnames(grid)[1] <- "grid_id"
grid_species <- left_join(grid, species, by = "grid_id")

ggplot(grid_species[which(grid_species$species_name %in% subset_sp), ]) +
  geom_sf(data = grid, color = "grey90", fill = "grey90") +
  geom_sf(aes(color = species_name, fill = species_name), show.legend = FALSE) +
  geom_sf(data = americas, fill = NA) +
  theme_bw() +
  facet_wrap(~species_name)

2. Point occurrences (presences)

2.1. Load data

For today’s practical, we will also use a new dataset of occurrence data downloaded from BIEN. The file with the occurrences is available via StudIP (‘Data/BIEN_data.csv’).

# Load species occurrence data from your data folder: 
spec_occ <- read.csv("data/BIEN_data.csv")

# Alternatively, you could also download the occurrences directly from the repository (this can take quite some time!)
# spec_occ <- BIEN_occurrence_species(c("Geonoma pauciflora", "Geonoma undata",
#                                       "Phytelephas tenuicaulis", "Mauritia flexuosa"),
#                                     cultivated = FALSE, new.world = TRUE)

Look at the structure of the table, how many observations do we have?

str(spec_occ)
## 'data.frame':    1408 obs. of  10 variables:
##  $ scrubbed_species_binomial  : chr  "Geonoma pauciflora" "Geonoma pauciflora" "Geonoma pauciflora" "Geonoma pauciflora" ...
##  $ latitude                   : num  -24.2 -24 -14.3 -24 -13.6 ...
##  $ longitude                  : num  -46.9 -46.8 -39.2 -46.8 -39.7 ...
##  $ date_collected             : chr  "1987-09-03" "2001-04-16" "1994-02-13" "2001-04-17" ...
##  $ datasource                 : chr  "NY" "SpeciesLink" "GBIF" "GBIF" ...
##  $ dataset                    : chr  "NY" "UNICAMP" "USP" "Escola Superior de Agricultura Luiz de Queiroz, ESALQ" ...
##  $ dataowner                  : chr  "NY" "UNICAMP" "USP" "Escola Superior de Agricultura Luiz de Queiroz, ESALQ" ...
##  $ custodial_institution_codes: chr  "NY" "UNICAMP" "USP" "Escola Superior de Agricultura Luiz de Queiroz, ESALQ" ...
##  $ collection_code            : chr  "Herbarium" "UEC" "SPF" "ESA" ...
##  $ datasource_id              : int  104 1030 3527 2472 3702 3199 2724 1825 2774 3165 ...
summary(spec_occ)
##  scrubbed_species_binomial    latitude          longitude     
##  Length:1408               Min.   :-25.5347   Min.   :-90.22  
##  Class :character          1st Qu.:-12.1398   1st Qu.:-79.13  
##  Mode  :character          Median :  0.2644   Median :-74.84  
##                            Mean   : -1.3158   Mean   :-71.00  
##                            3rd Qu.:  8.0146   3rd Qu.:-68.96  
##                            Max.   : 16.2500   Max.   :-34.87  
##  date_collected      datasource          dataset           dataowner        
##  Length:1408        Length:1408        Length:1408        Length:1408       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  custodial_institution_codes collection_code    datasource_id 
##  Length:1408                 Length:1408        Min.   :  14  
##  Class :character            Class :character   1st Qu.:  47  
##  Mode  :character            Mode  :character   Median : 280  
##                                                 Mean   :1112  
##                                                 3rd Qu.:2472  
##                                                 Max.   :3789

 

Subset data set (spec_occ) to only include records with spatial coordinates

 

spec_occ <- spec_occ[complete.cases(spec_occ[, c("longitude",
                                                 "latitude")]), ]

# Alternative:
#spec_occ <- spec_occ[which(!(is.na(spec_occ$latitude) |
#                               is.na(spec_occ$longitude))), ]
#spec_occ <- spec_occ[which(!is.na(spec_occ$latitude) &
#                             !is.na(spec_occ$longitude)), ]

Count how many records there are for each species

table(spec_occ$scrubbed_species_binomial)
## 
##      Geonoma pauciflora          Geonoma undata       Mauritia flexuosa 
##                     133                     814                     420 
## Phytelephas tenuicaulis 
##                      41

 

2.2. Spatial data.frame

We need to transform the data frame of BIEN occurrences into a spatial data.frame. To do this we can use the function st_as_fs from the package sf and giving as an argument the columns with the latitude and longitude coordinates. Remember to specify the CRS!

spec_occ_sf <- st_as_sf(spec_occ, coords = c("longitude", "latitude"),
                        crs = "EPSG:4326") 

2.3. Combined plot

We can now plot the BIEN occurrences along with our grid of species range!

Side note
For plotting purpouses, all the point shapes you can use in R can be quickly viewed by running this:

ggpubr::show_point_shapes()
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

In order to use the facet on two different datasets, we first need to rename the species column in the species occurrences dataset (spec_occ_sf) with the same name as in the other dataset. Here is an example using dplyr:

spec_occ_sf <- spec_occ_sf%>%
  rename(species_name = scrubbed_species_binomial)

Now we can combine the species ranges derived from the spatial grid with the spatial occurrences:

ggplot(grid_species[which(grid_species$species_name %in% subset_sp), ]) +
  geom_sf(data = grid, color = "grey90", fill = "grey90") +
  geom_sf(data = americas, fill = NA) +
  geom_sf(aes(fill = species_name), color = NA, alpha = 0.5, show.legend = FALSE) +
  geom_sf(data = spec_occ_sf, aes(color = species_name), size = 2, shape = 20,
          show.legend = FALSE) +
  scale_fill_brewer(palette = "Set1") +
  scale_color_brewer(palette = "Set1") +
  theme_bw() +
  facet_wrap(~species_name)

 

3. Create pseudo‐absences (background data)

Pseudo-absences are necessary because occurrence data sets (like BIEN) only record presences and not true absences. Therefore, we must generate pseudo-absences, i.e. possible locations where a species doesn’t occur. For an overview of the different methods and recommendations, see this paper.

In this practical, we will set pseudo-absences by randomly selecting coordinates from the centroids of each grid cell.

First, we extract the centroids of all grid cells:

grid_centroids <- st_centroid(grid)
## Warning: st_centroid assumes attributes are constant over geometries
grid_centroids$present <- 0

As we can see in the next plot, we now have points located in the middle of each grid cell:

ggplot(grid) +
  geom_sf() +
  geom_sf(data = grid_centroids, shape = 3) +
  scale_x_continuous(limits = c(-90, -76)) +
  scale_y_continuous(limits = c(20, 31)) +
  theme_bw()

 

We now create a new data.frame that contains all georeferenced occurrences for our four selected species, and the same number of pseudo-absences for each species. The pseudo-absence locations are randomly extracted from the centroids of all grid cells.

First, we add a column to the occurrence data.frame that mentions that all locations correspond to observed occurrences.

spec_occ_sf$present <- 1

Second, we bind (i.e. combine) a new data.frame for each of the selected species that contains the species occurrences and the same number of randomly selected pseudo-absences.

# Example with the first specie, Geonoma pauciflora: 
# First we would need to retrieve the number of rows (observatons) for this species: 
nrow(spec_occ_sf[which(spec_occ_sf$species_name == subset_sp[1]), ])
## [1] 133
# Then we can sample a random n of centroids based on the same number of rows 
sample(nrow(grid_centroids), 
       nrow(spec_occ_sf[which(spec_occ_sf$species_name == subset_sp[1]), ]), 
       replace = TRUE)
##   [1] 2138 1717 5353 1598 5675  892 3394 2380 5251 4887 4603 4223 2773 3107 5842
##  [16]  142 5876 1215 1621 4128  226 2137  873  106 3353 5972 2791 1198 3050 5276
##  [31] 4982  460 1357 4176 3078  354 5889 2208 4756 2102 1942 3096 5768 4086 4455
##  [46]  961 4103 2617 2807 4002 4928  507 3774  883 3164   58 3520 3690 5742 1716
##  [61] 2276 5711  856 2526 2962 2346 3009 3302  363 3401 4111 4511 1232 4997 5701
##  [76] 2829 3872 3431 2175 4559 2251 5090 4691 5842  144  465 4590 3725 5471 1823
##  [91] 4530 1717 5956 5715 3760 2655 5916 3638  736 1432 1641 5263 1467 4056 3664
## [106] 5270 5813 1592 5601 3055 5861 1226 5780 2327 3365  401  653 2781 5713 2471
## [121] 1376 4503 5716 3020 5824 2080 4795 2082 4253 2879 4876 3317 4105
# We can use this indeces (i.e. the row number) to subset the grid_centroids dataset, keeping the column "present"
grid_centroids[
  sample(nrow(grid_centroids), 
       nrow(spec_occ_sf[which(spec_occ_sf$species_name == subset_sp[1]), ]), 
       replace = TRUE),
  "present"]
## Simple feature collection with 133 features and 1 field
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -114.1805 ymin: -32.75001 xmax: -35.25066 ymax: 34.75
## Geodetic CRS:  WGS 84
## First 10 features:
##      present                    geometry
## 2856       0 POINT (-50.25066 -5.250016)
## 3712       0 POINT (-63.75066 -10.25003)
## 2518       0  POINT (-48.25066 -3.25001)
## 150        0  POINT (-81.75066 32.25002)
## 3040       0 POINT (-49.75066 -6.250019)
## 5873       0 POINT (-58.25066 -30.75002)
## 476        0  POINT (-107.2507 25.25004)
## 1942       0 POINT (-64.75066 0.7500024)
## 4419       0 POINT (-56.75066 -14.75004)
## 2880       0 POINT (-38.25066 -5.250016)
# Finally we can combine this columns with the species_name column using cbind: 
cbind(
  grid_centroids[
  sample(nrow(grid_centroids), 
       nrow(spec_occ_sf[which(spec_occ_sf$species_name == subset_sp[1]), ]), 
       replace = TRUE),
  "present"], 
species_name = subset_sp[1])
## Simple feature collection with 133 features and 2 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -113.2507 ymin: -32.24905 xmax: -38.75066 ymax: 33.75001
## Geodetic CRS:  WGS 84
## First 10 features:
##      present       species_name                    geometry
## 5081       0 Geonoma pauciflora POINT (-58.75066 -20.25004)
## 5961       0 Geonoma pauciflora POINT (-58.25083 -32.24905)
## 3412       0 Geonoma pauciflora POINT (-41.75066 -8.250025)
## 3538       0 Geonoma pauciflora POINT (-65.75066 -9.250027)
## 1936       0 Geonoma pauciflora POINT (-67.75066 0.7500024)
## 2187       0 Geonoma pauciflora POINT (-68.25066 -1.250004)
## 1216       0 Geonoma pauciflora  POINT (-61.75066 8.250025)
## 1492       0 Geonoma pauciflora  POINT (-70.25066 4.750015)
## 4363       0 Geonoma pauciflora POINT (-49.75066 -14.25004)
## 5481       0 Geonoma pauciflora POINT (-63.25066 -24.75004)
# We now do that for each species and then combine the data by rows with rbind, and save the results in a new dataframe named "pseudo_absences_sf"

set.seed(100)
pseudo_absences_sf <-
  rbind(
    cbind(grid_centroids[
      sample(nrow(grid_centroids),
             nrow(spec_occ_sf[which(spec_occ_sf$species_name == subset_sp[1]), ]),
             replace = TRUE),
      "present"],
      species_name = subset_sp[1]),
    cbind(grid_centroids[
      sample(nrow(grid_centroids),
             nrow(spec_occ_sf[which(spec_occ_sf$species_name == subset_sp[2]), ]),
             replace = TRUE),
      "present"],
      species_name = subset_sp[2]),
    cbind(grid_centroids[
      sample(nrow(grid_centroids),
             nrow(spec_occ_sf[which(spec_occ_sf$species_name == subset_sp[3]), ]),
             replace = TRUE),
      "present"],
      species_name = subset_sp[3]),
    cbind(grid_centroids[
      sample(nrow(grid_centroids),
             nrow(spec_occ_sf[which(spec_occ_sf$species_name == subset_sp[4]), ]),
             replace = TRUE),
      "present"],
      species_name = subset_sp[4])
  )

# Now we have a dataframe that contains a number of random pseudoabsence that is equal to the number of actual observation. We can combine this dataframe with the dataframe with the actual observations: 

sdm_sf <- rbind(spec_occ_sf[, c("species_name", "present")],
                pseudo_absences_sf)

We can now plot the presences and generated pseudo-absences for each species:

ggplot() +
  geom_sf(data = grid, color = "grey90", fill = "grey90") +
  geom_sf(data = americas, fill = NA) +
  geom_sf(data = sdm_sf, aes(shape = as.factor(present)), size = 2,
          show.legend = FALSE) +
  # scale_color_viridis_d() +
  scale_shape_manual(values = c("1" = 20,
                                "0" = 4)) +
  scale_y_continuous(limits = c(-35, 36.5)) +
  theme_bw() +
  facet_wrap(~species_name)

 

4. Environmental predictors

4.1. Load raster data

We will use three environmental variables: elevation, temperature (‘bio1’), and precipitation (‘bio12’).
As a reminder, you can download the rasters from StudIP: Elevation Temperature Precipitation

# Import global digital elevation model at 30 arc seconds resolution (this could take a while since the raster files are pretty heavy)
elev <- rast("data/mn30_grd/") + 0 # +0 to treat raster as numeric 
## |---------|---------|---------|---------|=========================================                                          
names(elev) <- "mn30_grd"

plot(elev, main = "Elevation (meters a.s.l.)")

 

# Import climate data from Chelsa at 30 arc seconds resolution -->
temp <- rast("data/CHELSA_bio10_01.tif") # temperature
prec <- rast("data/CHELSA_bio10_12.tif") # precipitation

plot(temp, main = "Mean annual temperature (°C)")

plot(prec, main = "Mean annual precipitation (mm/year)")

 

4.2. Extract environmental data for each presence/absence

For each presence and pseudo-absence, we can extract the corresponding environmental values, using the function terra::extract.

Before doing any extraction, we always need to make sure that our spatial objects and raster are having the same CRS:

crs(elev) == crs(sdm_sf)
## [1] FALSE
# The elevation raster and our spatial object have different csr so we have to re-project the elevation one (this also takes a while)  
elev <- project(x = elev, y = crs(sdm_sf))
## |---------|---------|---------|---------|=========================================                                          
crs(temp) == crs(sdm_sf)
## [1] TRUE
crs(prec) == crs(sdm_sf)
## [1] TRUE

Now that this condition is met, we can extract the values:

sdm_sf$elev <- terra::extract(elev, sdm_sf)[, "mn30_grd"]
sdm_sf$temp <- terra::extract(temp, sdm_sf)[, "CHELSA_bio10_01"]
sdm_sf$prec <- terra::extract(prec, sdm_sf)[, "CHELSA_bio10_12"]

We can check if we obtained some missing values, and remove them:

summary(sdm_sf$elev)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0     151     337     844    1695    4842
summary(sdm_sf$temp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2.05   16.95   23.85   21.29   25.15   28.95       4
summary(sdm_sf$prec)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##      62    1479    2192    2428    3138    6553       4
# Removing the NAs
sdm_sf <- na.omit(sdm_sf)

 

5. Building species distribution models

We here use a Generalized Linear Model to model the presence of our four palm species. Since we model presence/absence data, we fit a logistic regression. A logistic model (or logit model) is a statistical model that models the log-odds of an event as a linear combination of one or more independent variables. When fitting a logistic regression, we use the Binomial distribution family.

5.1. Fit logistic regression models (for each species)

SDM 1: Geonoma pauciflora

logreg_geo_pau <- glm(
  present ~ elev + temp + prec,
  data = sdm_sf[which(sdm_sf$species == "Geonoma pauciflora"), ],
  family = "binomial")
summary(logreg_geo_pau)
## 
## Call:
## glm(formula = present ~ elev + temp + prec, family = "binomial", 
##     data = sdm_sf[which(sdm_sf$species == "Geonoma pauciflora"), 
##         ])
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  6.9499102  1.8291482   3.800 0.000145 ***
## elev        -0.0022231  0.0005415  -4.105 4.04e-05 ***
## temp        -0.2574121  0.0723763  -3.557 0.000376 ***
## prec        -0.0001537  0.0001960  -0.784 0.432880    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 367.36  on 264  degrees of freedom
## Residual deviance: 342.47  on 261  degrees of freedom
## AIC: 350.47
## 
## Number of Fisher Scoring iterations: 4

 

SDM 2: Geonoma undata

logreg_geo_und <- glm(
  present ~ elev + temp + prec,
  data = sdm_sf[which(sdm_sf$species == "Geonoma undata"), ],
  family = "binomial")
summary(logreg_geo_und)
## 
## Call:
## glm(formula = present ~ elev + temp + prec, family = "binomial", 
##     data = sdm_sf[which(sdm_sf$species == "Geonoma undata"), 
##         ])
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.081e+00  1.768e+00  -3.439 0.000583 ***
## elev         2.069e-03  3.542e-04   5.841 5.19e-09 ***
## temp         3.334e-02  6.580e-02   0.507 0.612357    
## prec         1.176e-03  8.834e-05  13.312  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2255.50  on 1626  degrees of freedom
## Residual deviance:  889.96  on 1623  degrees of freedom
## AIC: 897.96
## 
## Number of Fisher Scoring iterations: 6

 

SDM 3: Phytelephas tenuicaulis

logreg_phy_ten <- glm(
  present ~ elev + temp + prec,
  data = sdm_sf[which(sdm_sf$species == "Phytelephas tenuicaulis"), ],
  family = "binomial")
summary(logreg_phy_ten)
## 
## Call:
## glm(formula = present ~ elev + temp + prec, family = "binomial", 
##     data = sdm_sf[which(sdm_sf$species == "Phytelephas tenuicaulis"), 
##         ])
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.243179   7.300651  -0.307    0.759    
## elev        -0.001306   0.002109  -0.619    0.536    
## temp        -0.137937   0.279686  -0.493    0.622    
## prec         0.002152   0.000503   4.278 1.89e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 112.277  on 80  degrees of freedom
## Residual deviance:  62.612  on 77  degrees of freedom
## AIC: 70.612
## 
## Number of Fisher Scoring iterations: 6

 

SDM 4: Mauritia flexuosa

logreg_mau_fle <- glm(
  present ~ elev + temp + prec,
  data = sdm_sf[which(sdm_sf$species == "Mauritia flexuosa"), ],
  family = "binomial")
summary(logreg_mau_fle)
## 
## Call:
## glm(formula = present ~ elev + temp + prec, family = "binomial", 
##     data = sdm_sf[which(sdm_sf$species == "Mauritia flexuosa"), 
##         ])
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.1490514  1.4444749  -5.642 1.69e-08 ***
## elev         0.0001173  0.0003470   0.338    0.735    
## temp         0.2543692  0.0546021   4.659 3.18e-06 ***
## prec         0.0009111  0.0001085   8.401  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1163.10  on 838  degrees of freedom
## Residual deviance:  979.85  on 835  degrees of freedom
## AIC: 987.85
## 
## Number of Fisher Scoring iterations: 5

 

5.2. Check model performance

SDMs predict a continuous response, the probability of occurrence of species. The observations are however binary (presence or absence of a species).
Many performance measures therefore rely on comparisons like “How many presence observations does the model correctly predict as presence”.
To answer this, the continuous probabilities need to be converted into binary predictions.
Depending on the threshold applied to do this, we will have different results and positive/negative rates, as shown in the next contingency table and in the following figures.

True positive rate is also called the Sensitivity and the true negative rate the Specificity.

The threshold chosen affect these two rates:

The most common evaluation statistic that avoids thresholding the data is AUC - the area under the receiver-operating characteristic (ROC) curve. ROC curves are generated by calculating sensitivity (true positive rate) and specificity (true negative rate) for many thresholds along the entire range of predicted probabilities. Then, (1-specificity) is plotted on the x-axis against sensitivity on the y axis. The area under this curve is called the AUC. The further the generated curve deviates from the 1:1 line towards the upper-left corner, the better the model predicts presence/absence of a species. If we would take a random presence and a random absence from our observations and make predictions, than AUC can be interpeted as the chance of assigning a higher predicted occurrence probability to the presence compared to the absence point. Typically, we regard AUC>0.7 as indicating fair predictions (Araujo et al. 2005).

We will assess classification precision of the logistic models using ROC (Receiver Operating Characteristic Curve) and AUC (Area Under Curve) for one species (Mauritia flexuosa). The ROC curve shows the true positive rate (sensitivity) against the false positive rate (1 - specificity) at various threshold values at which the species may be considered present. The AUC is the area under the ROC curve.
 

p <- predict(logreg_mau_fle,
             newdata = sdm_sf[which(sdm_sf$species == "Mauritia flexuosa"), ],
             type = "response")
pr <- prediction(p,
                 sdm_sf[which(sdm_sf$species == "Mauritia flexuosa"), ]$present)

prf <- ROCR::performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf, main = "Receiver Operating Characteristic Curve")
abline(a = 0, b = 1)
text(0.9, 0.75, "low threshold")
text(0.3, 0.05, "high threshold")

 

For the ROC, the curve should be located in the upper left of the figure which means that at intermediate threshold values the true positive rate is much higher than the false positive rate.
 

auc <- ROCR::performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.7275486

For the AUC, the value should be greater than 0.50 (i.e. a good model should be correct more than 50% of the time). The higher the AUC, the better the model is at predicting 0s as 0s and 1s as 1s.
 

We can also assess model performance using commonly used indices, such as R2, AIC, RMSE, LOGLOSS, and PCP.
* The Root Mean Square Error (RMSE) is the square root of the mean squared residuals (prediction errors). Residuals are a measure of how far from the regression line data points are.
* LOGLOSS evaluates how good or bad predicted probabilities are (high is bad, low is good).
* PCP is percentage of correct predictions for models with binary outcomes.

# Geonoma undata model 
model_performance(logreg_geo_und,
                  metrics = c("AIC", "R2", "RMSE", "LOGLOSS", "PCP"))
## # Indices of model performance
## 
## AIC   | Tjur's R2 |  RMSE | Log_loss |   PCP
## --------------------------------------------
## 898.0 |     0.698 | 0.269 |    0.273 | 0.849
#Geonoma pauciflora 
model_performance(logreg_geo_pau,  
                  metrics = c("AIC", "R2", "RMSE", "LOGLOSS", "PCP"))
## # Indices of model performance
## 
## AIC   | Tjur's R2 |  RMSE | Log_loss |   PCP
## --------------------------------------------
## 350.5 |     0.098 | 0.472 |    0.646 | 0.549
# Phytelephas tenuicaulis
model_performance(logreg_phy_ten,
                  metrics = c("AIC", "R2", "RMSE", "LOGLOSS", "PCP"))
## # Indices of model performance
## 
## AIC  | Tjur's R2 |  RMSE | Log_loss |   PCP
## -------------------------------------------
## 70.6 |     0.548 | 0.318 |    0.386 | 0.774
# Mauritia flexuosa
model_performance(logreg_mau_fle,
                  metrics = c("AIC", "R2", "RMSE", "LOGLOSS", "PCP"))
## # Indices of model performance
## 
## AIC   | Tjur's R2 |  RMSE | Log_loss |   PCP
## --------------------------------------------
## 987.8 |     0.196 | 0.448 |    0.584 | 0.598

 

More details on the evaluation of the performance of SDMs here.

6. Predict species distributions and compare to actual distributions

Create new data for predictions
We first crop the raster layers to a smaller extent (i.e. area), and then aggregate them by a factor 4 to make the operations faster (remember Day.1 practical, section 3.2):

new_extent <- ext(-97, -35, -30, 17) # We set up the boundaries of the new extent, based on latitude and longitude coordinates 
elev <- crop(elev, new_extent) # crop raster layers to smaller extent
elev <- aggregate(elev, fact = 4, fun = mean) # aggregate data
## |---------|---------|---------|---------|=========================================                                          
plot(elev, main = "Elevation (meters a.s.l.)")
plot(st_geometry(americas), add = TRUE)

temp <- crop(temp, new_extent)
temp <- aggregate(temp, fact = 4, fun = mean)
## |---------|---------|---------|---------|=========================================                                          
plot(temp, main = "Mean annual temperature (°C x 10)")
plot(st_geometry(americas), add = TRUE)

prec <- crop(prec, new_extent)
prec <- aggregate(prec, fact = 4, fun = mean)
## |---------|---------|---------|---------|=========================================                                          
plot(prec, main = "Mean annual precipitation (mm/year)")
plot(st_geometry(americas), add = TRUE)

 

Predict presences of each species using our previously fitted species distribution models
We first extract values from the 3 rater layers:

new_data <- data.frame(elev = as.numeric(values(elev)),
                       temp = as.numeric(values(temp)),
                       prec = as.numeric(values(prec)))

Then, using the GLMs, we get predicted probabilities of presence for all these raster values:

# predict presences
geo_pau_predict <- terra::predict(logreg_geo_pau, newdata = new_data,
                                  type = "response")
geo_und_predict <- terra::predict(logreg_geo_und, newdata = new_data,
                                  type = "response")
phy_ten_predict <- terra::predict(logreg_phy_ten, newdata = new_data,
                                  type = "response")
mau_fle_predict <- terra::predict(logreg_mau_fle, newdata = new_data,
                                  type = "response")

We now create rasters with the predicted probabilities as values:

# create raster layers with SDM predictions
geo_pau_predict_rst <- rast(new_extent, nrows=nrow(elev), ncols=ncol(elev),
                            vals = geo_pau_predict)
geo_und_predict_rst <- rast(new_extent, nrows=nrow(elev), ncols=ncol(elev),
                            vals = geo_und_predict)
phy_ten_predict_rst <- rast(new_extent, nrows=nrow(elev), ncols=ncol(elev),
                            vals = phy_ten_predict)
mau_fle_predict_rst <- rast(new_extent, nrows=nrow(elev), ncols=ncol(elev),
                            vals = mau_fle_predict)

 

Compare SDM predictions to actual species ranges

# Geonoma Pauciflora

cowplot::plot_grid(
  ggplot(grid_species%>%
         filter(species_name == "Geonoma pauciflora"))+
  geom_sf(fill = "#7fbf7b", color = "#7fbf7b")+
  labs(title = "Geonoma pauciflora")+
  geom_sf(data = spec_occ_sf%>%
            filter(species_name == "Geonoma pauciflora"), color = "darkgreen")+
  geom_sf(data = americas, color = "black", fill = NA)+
  coord_sf(xlim = c(-97, -35), ylim = c(-30, 17))+
  theme_bw(),
  
  ggplot()+
geom_spatraster(data = geo_pau_predict_rst)+
  viridis::scale_fill_viridis(na.value="white", n.breaks = 8, name = "Probability" )+
  labs(title = "SDM Predictions")+
  theme_bw()
)
## <SpatRaster> resampled to 500808 cells.

# Geonoma undata

cowplot::plot_grid(
  ggplot(grid_species%>%
         filter(species_name == "Geonoma undata"))+
  geom_sf(fill = "#7fbf7b", color = "#7fbf7b")+
  labs(title = "Geonoma undata")+
  geom_sf(data = spec_occ_sf%>%
            filter(species_name == "Geonoma undata"), color = "darkgreen")+
  geom_sf(data = americas, color = "black", fill = NA)+
  coord_sf(xlim = c(-97, -35), ylim = c(-30, 17))+
  theme_bw(),
  
  ggplot()+
geom_spatraster(data = geo_und_predict_rst)+
  viridis::scale_fill_viridis(na.value="white", n.breaks = 8, name = "Probability" )+
  labs(title = "SDM Predictions")+
  theme_bw()
)
## <SpatRaster> resampled to 500808 cells.

# Phytelephas tenuicaulis

cowplot::plot_grid(
  ggplot(grid_species%>%
         filter(species_name == "Phytelephas tenuicaulis"))+
  geom_sf(fill = "#7fbf7b", color = "#7fbf7b")+
  labs(title = "Phytelephas tenuicaulis")+
  geom_sf(data = spec_occ_sf%>%
            filter(species_name == "Phytelephas tenuicaulis"), color = "darkgreen")+
  geom_sf(data = americas, color = "black", fill = NA)+
  coord_sf(xlim = c(-97, -35), ylim = c(-30, 17))+
  theme_bw(),
  
  ggplot()+
geom_spatraster(data = phy_ten_predict_rst)+
  viridis::scale_fill_viridis(na.value="white", n.breaks = 8, name = "Probability" )+
  labs(title = "SDM Predictions")+
  theme_bw()
)
## <SpatRaster> resampled to 500808 cells.

# Mauritia flexuosa

cowplot::plot_grid(
  ggplot(grid_species%>%
         filter(species_name == "Mauritia flexuosa"))+
  geom_sf(fill = "#7fbf7b", color = "#7fbf7b")+
  labs(title = "Mauritia flexuosa")+
  geom_sf(data = spec_occ_sf%>%
            filter(species_name == "Mauritia flexuosa"), color = "darkgreen")+
  geom_sf(data = americas, color = "black", fill = NA)+
  coord_sf(xlim = c(-97, -35), ylim = c(-30, 17))+
  theme_bw(),
  
  ggplot()+
geom_spatraster(data = mau_fle_predict_rst)+
  viridis::scale_fill_viridis(na.value="white", n.breaks = 8, name = "Probability" )+
  labs(title = "SDM Predictions")+
  theme_bw()
)
## <SpatRaster> resampled to 500808 cells.

 

For further information about geo-spatial analysis, consult tutorials available via the Coding Club or Geocomputation with R

Some images from the evaluation of the performance of the models were extracted from support.bccvl.org.au

See also the site of Damaris Zurell for more content on the interpretation and evaluation of SDMs.