Preparation: Open your RStudio project and download
new data for today from Stud-IP
(log in first) and unzip them into your project folder ’/data/. You can
use getwd()
to get your current working directory which is
your project folder as long as you do not change it. Please install the
following R-packages using install.packages()
if not done
yet:
dplyr
classInt
colorRamps
performance
sf
ggplot2
cowplot
viridis
If you want to visualize this tutorial in the viewer inside RStudio (to save space on your screen) run the following chunk of code:
install.packages("rstudioapi") # install an R-package required for this step
dir <- tempfile()
dir.create(dir)
download.file("https://gift.uni-goettingen.de/mcmmb/index.html",
destfile = file.path(dir, "index.html"))
download.file("https://gift.uni-goettingen.de/mcmmb/Day4.html",
destfile = file.path(dir, "Day4.html"))
htmlFile <- file.path(dir, "Day4.html")
rstudioapi::viewer(htmlFile)
Now you can conveniently copy code from the viewer into your script.
Load R packages
library("dplyr")
library("classInt")
library("colorRamps")
library("performance")
library("sf")
library("ggplot2")
library("cowplot")
library("viridis")
For today’s exercise, we will continue to use the Neotropical palm
data set from Kreft,
H., Sommer, J.H. & Barthlott, W. (2006) The significance of
geographic range size for spatial diversity patterns in Neotropical
palms. Ecography, 29, 21-30.
species <- read.csv("data/palms_species_per_gridcell.csv")
str(species)
## 'data.frame': 140860 obs. of 4 variables:
## $ grid_id: int 13735 13736 13737 13738 13910 13911 13912 13913 13914 14085 ...
## $ spp_Id : int 550 550 550 550 550 550 550 550 550 550 ...
## $ GENUS : chr "Ammandra" "Ammandra" "Ammandra" "Ammandra" ...
## $ EPITHET: chr "natalia" "natalia" "natalia" "natalia" ...
Let’s make a new column combining ‘Genus’ and ‘Epithet’
species$species <- paste(species$GENUS,species$EPITHET, sep = " ")
n_distinct(species$species)
## [1] 550
Load GIS shapefiles
# gridcells corresponding to palm distribution data and americas coastline
grid <- st_read("data/30min_grid_select50/30min_grid_select50%.shp")
## Reading layer `30min_grid_select50%' from data source
## `C:\Users\Pierre\Work\classes\macroecology\class_content\Gitlab\mcmmb\data\30min_grid_select50\30min_grid_select50%.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 6038 features and 3 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -116.0007 ymin: -35 xmax: -35.00066 ymax: 36.5
## CRS: NA
americas <- st_read("data/americas/americas.shp")
## Reading layer `americas' from data source
## `C:\Users\Pierre\Work\classes\macroecology\class_content\Gitlab\mcmmb\data\americas\americas.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 15 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -124.7158 ymin: -55.91972 xmax: -29.84 ymax: 49.37666
## CRS: NA
# remember to define the Coordinate Reference System (CRS)
st_crs(grid) <- 4326 # "+proj=longlat +ellps=WGS84 +no_defs"
st_crs(americas) <- 4326 # "+proj=longlat +ellps=WGS84 +no_defs"
We can now merge the spatial grid with the species table. Remember to put the spatial table on the left part, not to lose its spatial component.
colnames(grid)[1] <- "grid_id"
grid_species <- left_join(grid, species, by = c("grid_id"))
Count species per gridcell
# ?table
species_num <- data.frame(table(species$grid_id))
head(species_num)
## Var1 Freq
## 1 538 1
## 2 539 1
## 3 540 1
## 4 712 1
## 5 713 1
## 6 714 1
colnames(species_num) <- c("grid_id", "spec_num")
head(species_num)
## grid_id spec_num
## 1 538 1
## 2 539 1
## 3 540 1
## 4 712 1
## 5 713 1
## 6 714 1
str(species_num)
## 'data.frame': 6638 obs. of 2 variables:
## $ grid_id : Factor w/ 6638 levels "538","539","540",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ spec_num: int 1 1 1 1 1 1 1 1 1 1 ...
# change mode of grid_ID to numeric
species_num$grid_id <- as.numeric(as.character(species_num$grid_id))
str(species_num)
## 'data.frame': 6638 obs. of 2 variables:
## $ grid_id : num 538 539 540 712 713 714 715 716 885 886 ...
## $ spec_num: int 1 1 1 1 1 1 1 1 1 1 ...
# Species richness distribution
summary(species_num$spec_num)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.00 16.00 21.22 36.00 82.00
ggplot(species_num, aes(spec_num)) +
geom_histogram(fill = "grey", color = "black") + # , bins = 30
labs(x = "Species number", y = "Count") +
theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Join species richness per grid to the shapefile
grid <- left_join(grid, species_num, by = "grid_id")
head(grid)
## Simple feature collection with 6 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -115.0007 ymin: 35.5 xmax: -113.0007 ymax: 36.5
## Geodetic CRS: WGS 84
## grid_id AREA PORTION spec_num geometry
## 1 538 0.25 100 1 MULTIPOLYGON (((-114.0007 3...
## 2 539 0.25 100 1 MULTIPOLYGON (((-113.5007 3...
## 3 540 0.25 100 1 MULTIPOLYGON (((-113.0007 3...
## 4 712 0.25 100 1 MULTIPOLYGON (((-114.5007 3...
## 5 713 0.25 100 1 MULTIPOLYGON (((-114.0007 3...
## 6 714 0.25 100 1 MULTIPOLYGON (((-113.5007 3...
Make a species richness map
ggplot(grid) +
geom_sf(color = NA, aes(fill = spec_num)) +
geom_sf(data = americas, fill = NA, color = "black") +
labs(title = "Palm species richness") +
scale_fill_viridis_c("Species number") +
ylim(-55, 40) +
theme_void() +
theme(plot.title = element_text(hjust = 0.5,vjust = 10),
plot.margin = margin(1,0,0,0, "cm"))
With a base R approach.
# Defining a color palette
cols <- viridis(n = 10)
colours <- colorRamp(cols)(grid$spec_num/max(grid$spec_num))
colours <- rgb(colours[,1], colours[,2], colours[,3], maxColorValue = 255)
# Plot
plot(st_geometry(grid),
col = colours,
# col = cols[grid$spec_num],
border = NA,
main = "Palm species richness")
plot(st_geometry(americas), add = TRUE)
legend_colours <- colorRamp(cols)(seq(0,1,length.out=8))
legend_colours <- rgb(legend_colours[,1], legend_colours[,2],
legend_colours[,3], maxColorValue = 255)
legend("bottomleft", # position
legend = round(seq(0,max(grid$spec_num),length.out=8)),
title = "Species number",
fill = legend_colours,
cex = 0.7,
bty = "n") # no box around it
Plot with intervals instead of a continuous color scale. First with base R.
# Splitting species richness in 10 intervals
my.class.fr <- classIntervals(grid$spec_num, n = 10, style = "equal",
dataPrecision = 0) # bin data into n quantiles
my.class.fr
## style: equal
## one of 125,595,622,175 possible partitions of this variable into 10 classes
## [1,10) [10,18) [18,26) [26,34) [34,42) [42,50) [50,58) [58,66) [66,74) [74,82]
## 1844 1215 711 492 657 665 327 100 22 5
# Palette of 10 colors
my.pal <- viridis(n = 10)
my.pal
## [1] "#440154FF" "#482878FF" "#3E4A89FF" "#31688EFF" "#26828EFF" "#1F9E89FF"
## [7] "#35B779FF" "#6DCD59FF" "#B4DE2CFF" "#FDE725FF"
# Create color scheme based on the intervals and the palette
my.col.fr <- findColours(my.class.fr, my.pal) # ramp colors based on classInts
# Plot
plot(st_geometry(grid), col = my.col.fr, border = NA,
main = "Palm species richness")
plot(st_geometry(americas), add = TRUE)
legend("bottomleft", # position
legend = names(attr(my.col.fr, "table")),
title = "Species number",
fill = attr(my.col.fr, "palette"),
cex = 0.7,
bty = "n") # no box around it
Second with ggplot2.
grid$spec_num_cat <- cut_interval(x = grid$spec_num, n = 10)
ggplot(grid) +
geom_sf(color = NA, aes(fill = spec_num_cat)) +
geom_sf(data = americas, fill = NA, color = "black") +
labs(title = "Palm species richness") +
scale_fill_viridis_d("Species number") +
ylim(-55, 40) +
theme_void() +
theme(plot.title = element_text(hjust = 0.5,vjust = 10),
plot.margin = margin(1,0,0,0, "cm"))
Our aim here is to add the latitude and longitude of the centroid of each grid cell to the spatial grid. This will be useful for plotting the latitudinal diversity gradient further ahead. Our aim here is to add the longitude and latitude of the centroid of each grid cell to the spatial grid. This will be useful for plotting the latitudinal diversity gradient further ahead.
Calculate latitude and longitude from the grid’s polygons
# convert the polygons to points (mass centroids)
grid_centroids <- st_centroid(grid)
# extract the coordinates of the points
coordinates <- data.frame(st_coordinates(grid_centroids))
str(coordinates)
## 'data.frame': 6038 obs. of 2 variables:
## $ X: num -114 -114 -113 -115 -114 ...
## $ Y: num 36.2 36.2 36.2 35.7 35.7 ...
# add grid_id
coordinates <- data.frame(grid_id = grid_centroids$grid_id,
Long = coordinates$X,
Lat = coordinates$Y)
# join centroid coordinates to the grid shapefile
grid <- left_join(grid, coordinates, by = "grid_id")
head(grid)
## Simple feature collection with 6 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -115.0007 ymin: 35.5 xmax: -113.0007 ymax: 36.5
## Geodetic CRS: WGS 84
## grid_id AREA PORTION spec_num spec_num_cat Long Lat
## 1 538 0.25 100 1 [1,9.1] -114.2507 36.24999
## 2 539 0.25 100 1 [1,9.1] -113.7507 36.24999
## 3 540 0.25 100 1 [1,9.1] -113.2507 36.24999
## 4 712 0.25 100 1 [1,9.1] -114.7507 35.75000
## 5 713 0.25 100 1 [1,9.1] -114.2507 35.75000
## 6 714 0.25 100 1 [1,9.1] -113.7507 35.75000
## geometry
## 1 MULTIPOLYGON (((-114.0007 3...
## 2 MULTIPOLYGON (((-113.5007 3...
## 3 MULTIPOLYGON (((-113.0007 3...
## 4 MULTIPOLYGON (((-114.5007 3...
## 5 MULTIPOLYGON (((-114.0007 3...
## 6 MULTIPOLYGON (((-113.5007 3...
How many of the grid cells are located within the Tropics (within +/- 23.433333)?
# length(grid@data$spec_num[which(grid$Lat > -23.433333 &
# grid$Lat < 23.433333)])
length(which(abs(grid$Lat) < 23.433333))
## [1] 4870
nrow(grid[which(abs(grid$Lat) < 23.433333), ])
## [1] 4870
Compare mean species richness per gridcell within and outside the Tropics
data.frame(
Tropics = mean(grid$spec_num[which(abs(grid$Lat) < 23.433333)]),
nonTropics = mean(grid$spec_num[which(abs(grid$Lat) >= 23.433333)]))
## Tropics nonTropics
## 1 26.6345 3.128425
# alternatively
grid$climate <- ifelse(grid$Lat > -23.433333 & grid$Lat < 23.433333,
"tropical", "non-tropical")
aggregate(grid$spec_num, list(grid$climate), FUN = mean)
## Group.1 x
## 1 non-tropical 3.128425
## 2 tropical 26.634497
How many grid cells have more than 20 species and are located South of 17°S? Can you plot them?
length(grid$spec_num[which(grid$spec_num > 20 & grid$Lat < -17)])
## [1] 50
ggplot(grid) +
geom_sf() +
geom_sf(data = grid[which(grid$spec_num > 20 & grid$Lat < -17), ],
color = "firebrick3", fill = "firebrick3") +
geom_sf(data = americas, color = "black", fill = NA) +
ylim(-55, 40) +
theme_void()
We here build latitudial belts of 0.5 degrees, and calculate the species richness per belt.
# Merging coordinates to species_num data.frame
species_num <- inner_join(species_num, coordinates, by = "grid_id")
range(species_num$Lat, na.rm = TRUE)
## [1] -34.75000 36.24999
Establish latitudinal belts
belts <- seq(round(min(species_num$Lat), 2), round(max(species_num$Lat), 2),
by = 0.5)
belts
## [1] -34.75 -34.25 -33.75 -33.25 -32.75 -32.25 -31.75 -31.25 -30.75 -30.25
## [11] -29.75 -29.25 -28.75 -28.25 -27.75 -27.25 -26.75 -26.25 -25.75 -25.25
## [21] -24.75 -24.25 -23.75 -23.25 -22.75 -22.25 -21.75 -21.25 -20.75 -20.25
## [31] -19.75 -19.25 -18.75 -18.25 -17.75 -17.25 -16.75 -16.25 -15.75 -15.25
## [41] -14.75 -14.25 -13.75 -13.25 -12.75 -12.25 -11.75 -11.25 -10.75 -10.25
## [51] -9.75 -9.25 -8.75 -8.25 -7.75 -7.25 -6.75 -6.25 -5.75 -5.25
## [61] -4.75 -4.25 -3.75 -3.25 -2.75 -2.25 -1.75 -1.25 -0.75 -0.25
## [71] 0.25 0.75 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75
## [81] 5.25 5.75 6.25 6.75 7.25 7.75 8.25 8.75 9.25 9.75
## [91] 10.25 10.75 11.25 11.75 12.25 12.75 13.25 13.75 14.25 14.75
## [101] 15.25 15.75 16.25 16.75 17.25 17.75 18.25 18.75 19.25 19.75
## [111] 20.25 20.75 21.25 21.75 22.25 22.75 23.25 23.75 24.25 24.75
## [121] 25.25 25.75 26.25 26.75 27.25 27.75 28.25 28.75 29.25 29.75
## [131] 30.25 30.75 31.25 31.75 32.25 32.75 33.25 33.75 34.25 34.75
## [141] 35.25 35.75 36.25
length(belts)
## [1] 143
Species richness of one particular latitudinal
belt
Select gridcells based on latitude
species_num[which(species_num$Lat > -25.5 & species_num$Lat < -25),
"spec_num"]
## [1] 1 1 1 1 1 1 1 2 3 3 3 3 3 3 5 6 6 9 7 8 8 7 7 7 7
## [26] 8 8 8 8 11 16 17 14 11
Select gridcells based on latitude taking from belts object
belts[20]
## [1] -25.25
species_num[which(species_num$Lat > belts[20] - 0.25 &
species_num$Lat < belts[20] + 0.25), "spec_num"]
## [1] 1 1 1 1 1 1 1 2 3 3 3 3 3 3 5 6 6 9 7 8 8 7 7 7 7
## [26] 8 8 8 8 11 16 17 14 11
mean(species_num[which(species_num$Lat > belts[20] - 0.25 &
species_num$Lat < belts[20] + 0.25), "spec_num"])
## [1] 6.029412
Species richness across all latitudinal belts (‘for loop approach’)
# for-loop example
for (i in 1:5){
print(i)
Sys.sleep(1)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
# create an empty vector for the mean species counts
species_per_belt_loop <- NA
# loop trough the latitudinal belts and calculate the mean species richness
for (i in 1:length(belts)){
belt_subset <- species_num[which(species_num$Lat > belts[i] - 0.25 &
species_num$Lat < belts[i] + 0.25),
"spec_num"]
species_per_belt_loop[i] <- mean(belt_subset)
}
Species richness across all latitudinal belts (‘apply a function approach’)
# function example
myFunction <- function(x){
if (x > 5) message("Hello")
}
myFunction(5)
myFunction(6)
## Hello
# define the function
species_per_belt_function <- function(x){
mean(species_num[which(species_num$Lat > x - 0.25 &
species_num$Lat < x + 0.25), "spec_num"])
}
# apply the function to one latitudinal belt
species_per_belt_function(-15.75)
## [1] 16.92537
# apply the function to all belts
species_per_belt_apply <- sapply(belts, function(x)
mean(species_num$spec_num[which(species_num$Lat > x - 0.25 &
species_num$Lat < x + 0.25)]))
species_per_belt_apply <- sapply(belts, species_per_belt_function)
Species richness across all latitudinal belts (‘dplyr approach’)
# create belt column
species_num$belt <- round(species_num$Lat*100/25)*25/100 # round at a quarter
species_num <- group_by(species_num, belt)
species_per_belt_dplyr <- summarize(species_num, richness = mean(spec_num))
species_per_belt_dplyr <- species_per_belt_dplyr$richness
species_num <- ungroup(species_num)
Compare loop, apply and dplyr approach
# compare loop and apply approach
all(species_per_belt_apply == species_per_belt_loop)
## [1] TRUE
all(species_per_belt_apply == species_per_belt_dplyr)
## [1] TRUE
species_per_belt <- species_per_belt_apply
Save the results
save(species_num, file = "data/species_num.RData")
save(belts, species_per_belt, file = "data/species_num_belts.RData")
write.csv(species_num, "data/species_num.csv", row.names = FALSE)
In case you had problems creating the species_num
and
species_per_belt
data.frames you can now load our provided
*.Rdata files
load("data/species_num.RData")
load("data/species_num_belts.RData")
Gridcell level
# par(mfrow = c(1, 2))
# plot(species_num$Lat, species_num$spec_num,
# xlab = "Latitude", ylab = "Species number",
# pch = 16, col = "#00000070")
# plot(abs(species_num$Lat), species_num$spec_num,
# xlab = "Absolute latitude", ylab = "Species number",
# pch = 16, col = "#00000070")
# for color codes, see here: https://htmlcolorcodes.com/
# With ggplot2
plot_grid(
ggplot(species_num, aes(Lat, spec_num)) +
geom_point(color = "black", alpha = 0.3) +
labs(x = "Latitude", y = "Species number") +
theme_bw(),
ggplot(species_num, aes(abs(Lat), spec_num)) +
geom_point(color = "black", alpha = 0.3) +
labs(x = "Absolute latitude", y = "Species number") +
theme_bw()
)
What could be the reason for the triangular relationship?
Linear model of species richness ~ latitude
# run a linear model for palm species richness in dependence on latitude
model1 <- lm(species_num$spec_num ~ abs(species_num$Lat))
summary(model1)
##
## Call:
## lm(formula = species_num$spec_num ~ abs(species_num$Lat))
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.000 -6.952 0.901 6.441 50.045
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.30384 0.25271 163.44 <2e-16 ***
## abs(species_num$Lat) -1.38503 0.01498 -92.44 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.17 on 6036 degrees of freedom
## Multiple R-squared: 0.5861, Adjusted R-squared: 0.586
## F-statistic: 8546 on 1 and 6036 DF, p-value: < 2.2e-16
# Add regression line
ggplot(species_num, aes(abs(Lat), spec_num)) +
geom_point(color = "black", alpha = 0.3) +
stat_smooth(method = "lm", color = "firebrick3") +
labs(x = "Absolute latitude", y = "Species number") +
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
Latitudinal belts
model2 <- lm(species_per_belt ~ abs(belts))
summary(model2)
##
## Call:
## lm(formula = species_per_belt ~ abs(belts))
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.059 -3.580 -0.834 3.622 10.044
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.49252 0.70119 57.75 <2e-16 ***
## abs(belts) -1.30523 0.03395 -38.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.195 on 141 degrees of freedom
## Multiple R-squared: 0.9129, Adjusted R-squared: 0.9123
## F-statistic: 1478 on 1 and 141 DF, p-value: < 2.2e-16
model3 <- lm(species_per_belt ~ abs(belts) + I(abs(belts)^2))
summary(model3)
##
## Call:
## lm(formula = species_per_belt ~ abs(belts) + I(abs(belts)^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.1598 -1.2409 0.1004 1.0354 13.0752
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.994483 0.753758 62.35 <2e-16 ***
## abs(belts) -2.393983 0.097118 -24.65 <2e-16 ***
## I(abs(belts)^2) 0.030401 0.002625 11.58 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.009 on 140 degrees of freedom
## Multiple R-squared: 0.9555, Adjusted R-squared: 0.9549
## F-statistic: 1504 on 2 and 140 DF, p-value: < 2.2e-16
# Plot
# par(mfrow = c(1, 2))
# plot(belts, species_per_belt,
# xlab = "Latitude", ylab = "Species number", pch = 16)
# plot(abs(belts), species_per_belt,
# xlab = "Absolute latitude", ylab = "Species number",pch = 16)
# abline(model2, col = "red", lwd = 2)
# points(abs(belts), predict(model3), type = "l", col = "blue", lwd = 2)
# With ggplot2
belt_sp <- data.frame(belts = belts,
spec_num = species_per_belt)
plot_grid(
ggplot(belt_sp, aes(belts, spec_num)) +
geom_point(color = "black") +
labs(x = "Latitude", y = "Species number") +
theme_bw(),
ggplot(belt_sp, aes(abs(belts), spec_num)) +
geom_point(color = "black") +
stat_smooth(method = "lm", formula = y ~ abs(x),
color = "red") +
stat_smooth(method = "lm", formula = y ~ abs(x) + I(abs(x)^2),
color = "blue") +
labs(x = "Absolute latitude", y = "Species number") +
theme_bw()
)
Let’s also assess model performance using the performance package to calculate commonly used indices, such as R2, AIC, RMSE, etc.
AIC(model2, model3)
## df AIC
## model2 3 819.8933
## model3 4 725.8151
compare_performance(model3, model2)
## # Comparison of Model Performance Indices
##
## Name | Model | AIC (weights) | AICc (weights) | BIC (weights) | R2 | R2 (adj.) | RMSE | Sigma
## ---------------------------------------------------------------------------------------------------
## model3 | lm | 725.8 (>.999) | 726.1 (>.999) | 737.7 (>.999) | 0.956 | 0.955 | 2.977 | 3.009
## model2 | lm | 819.9 (<.001) | 820.1 (<.001) | 828.8 (<.001) | 0.913 | 0.912 | 4.166 | 4.195