fullQuery<-dbGetQuery(myConnection, "SELECT allrawdata.STUDY_ID, allrawdata.DAY, allrawdata.MONTH, allrawdata.YEAR, allrawdata.SAMPLE_DESC, allrawdata.PLOT, allrawdata.ID_SPECIES, allrawdata.LATITUDE,
allrawdata.LONGITUDE, sum(allrawdata.ABUNDANCE), sum(allrawdata.BIOMASS), species.GENUS, species.SPECIES, species.GENUS_SPECIES from allrawdata inner join site on allrawdata.STUDY_ID=site.STUDY_ID inner join
species on allrawdata.ID_SPECIES=species.ID_SPECIES group by concat(allrawdata.STUDY_ID, allrawdata.DAY, allrawdata.MONTH, allrawdata.YEAR, allrawdata.SAMPLE_DESC, allrawdata.ID_SPECIES, allrawdata.LATITUDE,
allrawdata.LONGITUDE)")
library(RMySQL)
library(ggplot2)
library(mapdata)
library(vegan)
library(dplyr)
myConnection <- dbConnect(MySQL(), user="username", host="hostname", dbname="database", password="password")
### list the tables in the database
dbListTables(myConnection)
## [1] "abundance" "allrawdata" "biomass" "citation1" "contacts"
## [6] "curation" "datasets" "methods" "sample" "site"
## [11] "species"
### or
dbGetQuery(myConnection, "show tables")
## Tables_in_biopart
## 1 abundance
## 2 allrawdata
## 3 biomass
## 4 citation1
## 5 contacts
## 6 curation
## 7 datasets
## 8 methods
## 9 sample
## 10 site
## 11 species
### list the fields with their associated information in a chosen table
dbGetQuery(myConnection, "show columns from contacts")
## Field Type Null Key Default Extra
## 1 ID_CONTACTS int(11) NO PRI <NA> auto_increment
## 2 STUDY_ID int(11) YES MUL <NA>
## 3 CONTACT_1 varchar(500) YES <NA>
## 4 CONTACT_2 varchar(500) YES <NA>
## 5 CONT_1_MAIL varchar(60) NO <NA>
## 6 CONT_2_MAIL varchar(60) YES <NA>
## 7 LICENSE varchar(200) NO <NA>
## 8 WEB_LINK varchar(200) YES <NA>
## 9 DATA_SOURCE varchar(250) NO <NA>
### get the list of fields from a chosen table
dbListFields(myConnection, "site")
## [1] "ID_SITE" "STUDY_ID" "REALM" "CLIMATE"
## [5] "GENERAL_TREAT" "TREATMENT" "TREAT_COMMENTS" "TREAT_DATE"
## [9] "CEN_LATITUDE" "CEN_LONGITUDE" "HABITAT" "PROTECTED_AREA"
## [13] "AREA" "BIOME_MAP"
### read entire tables - N.B. not recommended unless the number of records is known and is not too large!
dbReadTable(myConnection, "biomass")
## ID_BIOMASS BIOMASS_TYPE
## 1 1 NA
## 2 2 Size
## 3 3 Weight
## 4 9 Relative biomass
## 5 10 Volume
## 6 12 Cover
dbReadTable(myConnection, "abundance")
## ID_ABUNDANCE ABUNDANCE_TYPE
## 1 1 Count
## 2 5 Presence/Absence
## 3 6 MeanCount
## 4 7 Density
## 5 8 NA
#### query the datasets table for the number of entries, as each record corresponds to each individual study
result<-dbGetQuery(myConnection, "SELECT count(*) FROM datasets")
## [1] "There are 361 studies in the database"
#### or interrogate the raw data table for the unique number of Study IDs
result2 <- dbGetQuery(myConnection, "SELECT count(distinct STUDY_ID) FROM allrawdata")
## [1] "There are 361 studies in the database"
dbGetQuery(myConnection, "SELECT count(distinct ID_SPECIES) as PolarSpecies FROM allrawdata inner join site on allrawdata.STUDY_ID = site.STUDY_ID where site.CLIMATE = 'Polar'")
## PolarSpecies
## 1 1258
dbGetQuery(myConnection, "SELECT distinct ID_SPECIES FROM allrawdata inner join site on allrawdata.STUDY_ID = site.STUDY_ID where site.CLIMATE = 'Polar'")
#### select all the central latitudes and longitudes for each study, along with the taxonomic classification and total number of records
result<-dbGetQuery(myConnection, "SELECT CENT_LAT, CENT_LONG, TAXA, TOTAL from datasets")
#### draw a basic world map, add "y" or "n" for display of tropics and polar latitudes
drawWorld<-function(lats) {
world_map<-map_data("world")
g1<-ggplot()+coord_fixed()+xlab("")+ylab("")
g1<-g1+geom_polygon(data=world_map, aes(x=long, y=lat, group=group), colour="gray60", fill="gray60")
g1<-g1+theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(),
panel.background=element_rect(fill="white", colour="white"), axis.line=element_line(colour="white"),
legend.position="none",axis.ticks=element_blank(), axis.text.x=element_blank(), axis.text.y=element_blank())
if(lats=="y") {
g1<-g1+geom_hline(yintercept=23.5, colour="red")+geom_hline(yintercept =-23.5, colour="red")
g1<-g1+geom_hline(yintercept=66.5, colour="darkblue")+geom_hline(yintercept =-66.5, colour="darkblue")
}
else { return(g1) }
return(g1)
}
taxaCol<-c('#ffffff','#ffffbf','#5e4fa2','#f46d43','#3288bd','#abdda4','#a8c614','#d53e4f','#66c2a5','#e6f598','#fee08b','#9e0142','#fdae61')
#### add the point coordinates to the blank map (without polar and tropic latitudinal lines)
points<-drawWorld("n")+geom_point(data=result, aes(x=CENT_LONG, y=CENT_LAT, colour=TAXA, size=TOTAL), alpha=I(0.7))
points<-points+scale_colour_manual(values= taxaCol)+scale_size(range=c(3, 10))
points
#### theme no grid
themeNoGrid<-function() {
theme_bw()+
theme(axis.text=element_text(size=16,color="black"),
axis.title=element_text(size=0,face="bold"),
legend.position="none",
plot.title=element_text(size=18,face="bold", hjust=0.5),
plot.background=element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
panel.border=element_blank(),
axis.line=element_line(color="black")
)
}
#### get studies by taxa
taxaFile<-dbGetQuery(myConnection, "SELECT distinct TAXA, ((count(TAXA)/361)*100) as valueX from datasets group by TAXA")
taxaPlot<-function(x, cols, pos) {
plot<-ggplot(x, aes(factor(TAXA), valueX, fill=TAXA))+
geom_bar(stat="identity", width=0.93,position=position_dodge(width=1),colour="black")+
scale_fill_manual(values= cols)+
scale_y_continuous(position=pos)+themeNoGrid()+
theme(axis.text.x=element_blank())+
geom_text(aes(label=TAXA), vjust=-1, hjust=0.7, angle=0.5)
return(plot)
}
taxaPlot(taxaFile, taxaCol, "left") ##axis is on the left
#### select studies by climate, use an inner join here to link the datasets and site tables
result<-dbGetQuery(myConnection, "SELECT datasets.CENT_LAT, datasets.CENT_LONG, site.CLIMATE FROM datasets inner join site on site.STUDY_ID = datasets.STUDY_ID")
###colours for the climate
climCol<-c('#3288bd','#66c2a5','#abdda4','#e6f598','#fdae61','#f46d43')
#### add the point co-ordinates to the blank map (including polar and tropic latitudinal lines)
points<-drawWorld("y")+geom_point(data=result, aes(x=result$CENT_LONG, y=result$CENT_LAT, colour=result$CLIMATE), size=5, alpha=I(0.7))
points<-points+scale_colour_manual(values= climCol)
points
climPlot<-function(x, cols, pos) {
plot<-ggplot(x, aes(factor(CLIMATE), valueX, fill=CLIMATE))+
geom_bar(stat="identity", width=0.93, position=position_dodge(width=1), colour="black")+
scale_fill_manual(values= cols)+
scale_y_continuous(position=pos)+
themeNoGrid()+
theme(axis.text.x=element_blank())+
geom_text(aes(label=CLIMATE), vjust=-0.8, hjust=0.5, angle=0)
return(plot)
}
#### get studies by climate
climFile<-dbGetQuery(myConnection, "SELECT distinct CLIMATE, ((count(CLIMATE)/361)*100) as valueX from site group by CLIMATE")
climPlot(climFile, climCol, "right") ##axis is on the right
result<-dbGetQuery(myConnection, "SELECT distinct GENUS_SPECIES FROM allrawdata inner join site on site.STUDY_ID = allrawdata.STUDY_ID inner join datasets on datasets.STUDY_ID = allrawdata.STUDY_ID inner join species on species.ID_SPECIES = allrawdata.ID_SPECIES where datasets.TAXA='Mammals' and site.CLIMATE = 'Temperate'")
head(result) #### showing only first entries
## GENUS_SPECIES
## 1 Blarina transitans
## 2 Clethrionomys gapperi
## 3 Napaeozapus insignis
## 4 Peromyscus sp
## 5 Tamias striatus
## 6 Ammospermophilus interpres
dbGetQuery(myConnection, "SELECT count(distinct GENUS_SPECIES) FROM allrawdata inner join site on site.STUDY_ID = allrawdata.STUDY_ID inner join datasets on datasets.STUDY_ID = allrawdata.STUDY_ID inner join species on species.ID_SPECIES = allrawdata.ID_SPECIES where datasets.TAXA='Fish' and site.CLIMATE = 'Tropical'")
## count(distinct GENUS_SPECIES)
## 1 1376
result <- dbGetQuery(myConnection, "SELECT GENUS_SPECIES, allrawdata.ABUNDANCE, allrawdata.LATITUDE, allrawdata.LONGITUDE, allrawdata.YEAR FROM allrawdata inner join species on species.ID_SPECIES = allrawdata.ID_SPECIES where allrawdata.STUDY_ID=302")
head(result)
## GENUS_SPECIES ABUNDANCE LATITUDE LONGITUDE YEAR
## 1 Actinostemon concepcionis 1 -22.379 -49.6803 2002
## 2 Actinostemon concepcionis 1 -22.379 -49.6803 2002
## 3 Actinostemon concepcionis 1 -22.379 -49.6803 2002
## 4 Actinostemon concepcionis 1 -22.379 -49.6803 2002
## 5 Actinostemon concepcionis 1 -22.379 -49.6803 2002
## 6 Actinostemon concepcionis 2 -22.379 -49.6803 2002
write.csv(result, "result.csv") #### save the object as a csv file
dbGetQuery(myConnection, "SELECT count(*) FROM allrawdata inner join datasets on datasets.STUDY_ID = allrawdata.STUDY_ID where datasets.DATA_POINTS>50")
## count(*)
## 1 293311
dbGetQuery(myConnection, "SELECT count(*) FROM allrawdata inner join datasets on datasets.STUDY_ID = allrawdata.STUDY_ID where datasets.NUMBER_OF_SPECIES>100")
## count(*)
## 1 7432315
#### selecting a count rather than the data
dbGetQuery(myConnection, "SELECT count(*) from allrawdata where STUDY_ID=383")
## count(*)
## 1 24
#### selecting a count rather than the data
dbGetQuery(myConnection, "SELECT count(STUDY_ID) from datasets where datasets.NUMBER_LAT_LONG=1")
## count(STUDY_ID)
## 1 158
result<-dbGetQuery(myConnection, "SELECT distinct GENUS_SPECIES, allrawdata.STUDY_ID FROM allrawdata inner join site on site.STUDY_ID = allrawdata.STUDY_ID inner join datasets on datasets.STUDY_ID = allrawdata.STUDY_ID inner join species on species.ID_SPECIES = allrawdata.ID_SPECIES where datasets.ORGANISMS like 'butter%'")
head(result)
## GENUS_SPECIES STUDY_ID
## 1 Acherontia atropos 70
## 2 Agrius convolvuli 70
## 3 Agrotis ipsilon 70
## 4 Autographa bractea 70
## 5 Autographa gamma 70
## 6 Colias crocea 70
dbGetQuery(myConnection, "SELECT count(distinct GENUS_SPECIES) FROM allrawdata inner join site on site.STUDY_ID = allrawdata.STUDY_ID inner join datasets on datasets.STUDY_ID = allrawdata.STUDY_ID inner join species on species.ID_SPECIES = allrawdata.ID_SPECIES where datasets.TAXA = 'Fish' and site.CLIMATE='Temperate'")
## count(distinct GENUS_SPECIES)
## 1 2941
dbDisconnect(myConnection)
## [1] TRUE
fullquery<-read.csv("/user/downloads/biotime/BioTIMEQuery_06_12_2017.csv") #### use the latest version from download site ####
#####fullquery<-read.csv(url(https://zenodo.org/record/1095627) ##adapt this to read the query from Zenodo #####
query<-fullquery%>%
filter(STUDY_ID == 163)
# Maria Dornelas 09.06.2015
rarefysamples<-function(Year, SampleID, Species, Abundance, resamps) {
#######################################################################
# takes as input a Year, SampleID, Species, Abundance and number of resamples
# which should be in dataframe so that elements match
# calculates turnover:
# 1) between each year and the first year
# 2) between pairs of adjacent years
# 3) between each year and the lasy year of the time series
# for the rarefied pooled samples
###########################################################################
rareftab<-data.frame(array(NA,dim=c(0,3)))
# getting vector with number of samples per year
nsamples<-c()
for(y in unique(Year)){
nsamples<-c(nsamples, length(unique(SampleID[Year==y])))
}
t<-1
minsample<-min(nsamples)
for(repeats in 1:resamps){
raref<-data.frame(array(NA,dim=c(1,3)))
names(raref)<-c("Year","Species","Abundance")
for(y in unique(Year)){
#getting samples for this year
samps<-unique(SampleID[Year==y])
# re-sampling samples to equalize number of samples
sam<-as.character(sample(samps,minsample,replace=T))
# getting data that belongs to bootstraped samples
rarefyear<-data.frame(SampleID[which(SampleID %in% sam & Year == y)],Species[which(SampleID %in% sam &Year
==y)],Abundance[which(SampleID %in% sam & Year == y)])
names(rarefyear)<-c("SampleID", "Species", "Abundance")
# calculating pooled abundances of each species to store
spabun<-tapply(as.numeric(rarefyear[,3]),as.character(rarefyear[,2]),sum)
spar<-data.frame(rep(y, length(spabun)),names(spabun),spabun, row.names=NULL)
names(spar)<-c("Year","Species","Abundance")
raref<-rbind(raref,spar)
}
# calculating year by species table of abundance
rareftab<-rbind(rareftab,cbind(rep(repeats,dim(raref)[1]),raref))
}
return (rareftab)
}
TSrf<-list()
IDs<-unique(query$STUDY_ID)
for(i in 1:length(IDs)){
data<-query[query$STUDY_ID==IDs[i],]
TSrf[[i]]<-rarefysamples(data$YEAR, data$SAMPLE_DESC, data$GENUS_SPECIES, data$sum.allrawdata.ABUNDANCE, 1)
}
names(TSrf)<-IDs
rf<-do.call(rbind, TSrf)
rf<-data.frame(rf, ID=rep(names(TSrf), times=unlist(lapply(TSrf, nrow))))
rf<-rf[!is.na(rf$Year),-1]
#### prepare the rarefied output for diversity metric code
t1<-with(rf, tapply(Abundance, list(Year, Species, ID), sum))
t2<-unique(rf[, c("ID", "Year")])
#### produces a list of matrices for each study - in this case is only a single dataset
dsList<-list()
for(i in 1:dim(t1)[3]){
id<-dimnames(t1)[[3]][i]
a<-subset(t2, ID==id)$Year
b<-t1[dimnames(t1)[[1]] %in% as.character(a),,i]
dsList[[i]]<-data.frame(Year = rownames(b), b)
}
names(dsList) <- dimnames(t1)[[3]]
#### replacing NA with zero
for(i in 1:(length(dsList))){
dsList[[i]][is.na(dsList[[i]])]<-0
}
#### Functions to calculate alpha and beta diversity metrics
####calculates species richness and total abundance
getAlpha<-function(x){
data.frame(S=apply(x>0, 1, sum),
N=apply(x, 1, sum))
}
####calculates Jaccard and Morisita-Horn similarity indices
getBeta<-function(x, origin=1, consec=F){
size<-dim(x)[1]-1
ifelse(consec==T, id<-cbind(1:size, 1:size+1), id<-cbind(1:dim(x)[1], origin))
output<-data.frame(Jaccard=as.matrix(1-vegdist(x, method="jaccard"))[id],
MorHorn=as.matrix(1-vegdist(x, method="horn"))[id])
ifelse(consec==T, rownames(output)<-rownames(x)[-1], rownames(output)<-rownames(x))
output
}
#### get the alpha diversity metrics
alphaX<-lapply(dsList, function(x)getAlpha(x[,-1]))
alpha<-do.call(rbind,alphaX)
alpha$Year<-substring(rownames(alpha), nchar(rownames(alpha))-3)
#### get the beta diversity metrics
betaX<-lapply(dsList, function(x)getBeta(x[,-1]))
beta<-do.call(rbind, lapply(betaX, function(x)x[-1,]))
beta$Year<-substring(rownames(beta), nchar(rownames(beta))-3)
themeNoGridAxes <- function() {
theme_bw()+
theme(axis.text=element_text(size=16,color="black"),
axis.title=element_text(size=16,face="bold"),
legend.position="none",
plot.title=element_text(size=18,face="bold", hjust=0.5),
plot.background=element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
panel.border=element_blank(),
axis.line=element_line(color="black")
)
}
#### plot species richness temporal trend with simple regression line
fitSR<-lm(alpha$S~alpha$Year)
srPlot<-ggplot(alpha, aes(x=Year, y=S))+geom_line(size=1, colour="gray30")+
geom_point(aes(x=Year, y=S), colour="darkgreen", size=5)+
scale_x_continuous(breaks=c(1993, 1995, 1997, 1999, 2001, 2003, 2005))+
geom_abline(intercept=fitSR$coef[1], slope=fitSR$coef[2], colour="gray10", size=1, linetype=3)+
themeNoGridAxes()+ylab("S")+xlab("Year")
srPlot
#### plot the Jaccard similarity temporal trend with a simple regression line
fitJ<-lm(beta$Jaccard~beta$Year)
jPlot<-ggplot(beta, aes(x=Year, y=Jaccard))+geom_line(size=1, colour="gray30")+
geom_point(aes(x=Year, y=Jaccard), colour="dodgerblue", size=5)+
scale_x_continuous(breaks=c(1993, 1995, 1997, 1999, 2001, 2003, 2005))+
geom_abline(intercept=fitJ$coef[1], slope=fitJ$coef[2], colour="gray10", size=1, linetype=3)+
themeNoGridAxes()+ylab("Jaccard Similarity")+xlab("Year")+ylim(0, 1)
jPlot
biotimeMeta<-read.csv("~/bioTIMEMetaData_06_12_2017.csv")
points<-drawWorld("y")+geom_point(data=biotimeMeta, aes(x=CENT_LONG, y=CENT_LAT, colour=TAXA, size=TOTAL), alpha=I(0.7))
points<-points+scale_colour_manual(values=taxaCol)+scale_size(range=c(3, 10))
points