# ---- Package setup ----
# Packages are installed only when they are missing, so re-running the script
# does not reinstall anything, and every package is installed *before* it is
# attached.

# Install a CRAN package if it is not available in the current library.
ensure_cran <- function(pkg) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
  }
}

# Install a package from GitHub (`repo` given as "owner/name") if it is not
# available in the current library.
ensure_github <- function(pkg, repo) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    remotes::install_github(repo)
  }
}

cran_packages <- c(
  "remotes", "devtools", "tidyverse", "sf", "rgbif", "leaflet", "tmap",
  "taxize", "data.table", "dplyr", "terra", "stringr", "ranger", "leafem"
)
invisible(lapply(cran_packages, ensure_cran))

# NOTE(review): the previous version re-installed both GitHub packages on every
# run, replacing any CRAN copy of CoordinateCleaner. Call
# remotes::install_github() by hand when an upgrade is actually wanted.
ensure_github("hydrographr", "glowabio/hydrographr")
ensure_github("CoordinateCleaner", "ropensci/CoordinateCleaner")

# Attach order matters: a package attached later masks same-named functions of
# packages attached earlier. The order below mirrors the effective attach order
# of the previous version of this script.
library(tidyverse)
library(sf)
library(rgbif)
library(leaflet)
library(CoordinateCleaner)
library(tmap)
library(remotes)
library(taxize)
library(devtools)
library(hydrographr)
library(data.table)
library(dplyr)
library(terra)
library(tools)
library(stringr)
library(ranger)
library(leafem)



wdir <- "....." # as chosen by the user
setwd(wdir)


# 1. Set up GBIF credentials
# rgbif looks the credentials up in the environment variables GBIF_USER,
# GBIF_PWD and GBIF_EMAIL. Store them in ~/.Renviron (one NAME=value per line),
# never in this script:
#   GBIF_USER="............"     # as chosen by the user
#   GBIF_PWD=".............."    # as chosen by the user
#   GBIF_EMAIL="..............." # as chosen by the user
gbif_vars <- c("GBIF_USER", "GBIF_PWD", "GBIF_EMAIL")
gbif_unset <- gbif_vars[!nzchar(Sys.getenv(gbif_vars))]
if (length(gbif_unset) > 0) {
  warning(
    "GBIF credentials not set: ", paste(gbif_unset, collapse = ", "),
    ". Add them to ~/.Renviron and restart R.",
    call. = FALSE
  )
  # Open ~/.Renviron for editing, but only when someone is there to edit it.
  if (interactive()) {
    usethis::edit_r_environ()
  }
}

# Kept for backward compatibility with code that reads these variables; the
# values now come from the environment instead of being typed into the script.
GBIF_USER <- Sys.getenv("GBIF_USER")
GBIF_PWD <- Sys.getenv("GBIF_PWD")
GBIF_EMAIL <- Sys.getenv("GBIF_EMAIL")

# 2. Taxa of interest
# Exactly one `taxon` vector is active per run; the alternatives below are kept
# commented out so the script can be re-run for the other groups.

taxon <- c(
  "Odonata", "Ephemeroptera", "Plecoptera", "Trichoptera", "Unionida"
)

# Gastropod genera (full list)
# taxon <- c('Pila', 'Gabbiella', 'Sierraia', 'Soapitia',
#            'Assiminea', 'Cleopatra', 'Potadoma',
#            'Pachymelania', 'Melanoides', 'Pseudocleopatra', 'Lymnaea',
#            'Ferrissia', 'Biomphalaria', 'Indoplanorbis', 'Bulinus',
#            'Aplexa', 'Physa')

# Gastropod genera (subset 1)
# taxon <- c('Potadoma', 'Pachymelania', 'Melanoides', 'Pseudocleopatra')

# Gastropod genera (subset 2)
# taxon <- c('Lymnaea', 'Ferrissia', 'Biomphalaria', 'Indoplanorbis',
#            'Bulinus', 'Aplexa', 'Physa')

# taxon <- c('Dugesia', 'Potadoma', 'Pachymelania')

# Dugesia species
# taxon <- c('Dugesia bijuga', 'Dugesia pustulata', 'Dugesia congolensis',
#            'Dugesia gonocephala', 'Dugesia neumanni', 'Dugesia tanganyikae',
#            'Dugesia astrocheta', 'Dugesia didiaphragma',
#            'Dugesia ectophysa', 'Dugesia didiaphragma')

# Coleoptera and Hemiptera families
# taxon <- c('Noteridae', 'Hydraenidae', 'Hygrobiidae', 'Dytiscidae',
#            'Hydrophilidae', 'Naucoridae', 'Notonectidae', 'Gerridae',
#            'Gyrinidae', 'Psephenidae', 'Nepidae', 'Veliidae', 'Pleidae',
#            'Micronectidae', 'Belostomatidae', 'Mesoveliidae')

# Freshwater crab families
# taxon <- c('Potamonautidae', 'Potamidae')


# 3. Look up the GBIF backbone key of each taxon; the keys are what the GBIF
#    occurrence search filters on.
taxonkey <- pull(name_backbone_checklist(taxon), usageKey)


# 4. Countries to download, as ISO 3166-1 alpha-2 codes
code <- c(
  "CD", "CG", "GA", "ST", "CF",
  "CM", "NG", "BJ", "TG", "GH", "CI", "GN", "LR", "SL", "GQ"
)

# 5. Download occurrences

# Start a download on the GBIF servers.
# No user/pwd/email is passed on purpose: occ_download() then looks them up in
# the GBIF_USER, GBIF_PWD and GBIF_EMAIL environment variables (see
# ~/.Renviron), so no secret is stored in this file.
# NOTE(review): a real password used to be hard-coded in this call; treat it as
# exposed and change it on gbif.org.
gbif_download <- occ_download(
  pred_and(
    # remove default geospatial issues
    pred("HAS_GEOSPATIAL_ISSUE", FALSE),
    # keep only records with coordinates
    pred("HAS_COORDINATE", TRUE),
    # remove absent records (currently disabled)
    # pred("OCCURRENCE_STATUS", "PRESENT"),
    # remove fossils and living specimens
    pred_not(pred_in(
      "BASIS_OF_RECORD",
      c("FOSSIL_SPECIMEN", "LIVING_SPECIMEN")
    ))
  ),
  # only records of the taxa selected above
  pred_in("taxonKey", taxonkey),
  # only records from the countries listed in `code`
  pred_in("country", code),
  # records from 1970 onwards (currently disabled)
  # pred_gte("year", 1970),
  format = "SIMPLE_CSV"
)



# block until GBIF has finished preparing the download
occ_download_wait(gbif_download)

# fetch the finished download to disk, then read it into R
raw_downloaded <- occ_download_get(gbif_download) |>
  occ_download_import()

# quick look at what came back
colnames(raw_downloaded)
summary(raw_downloaded)

#--- Keep the target taxa and export them -----
## Exactly one filter is active per run; it must match the `taxon` vector used
## for the download. The alternatives for the other groups are kept below.
target_orders <- c(
  "Odonata", "Ephemeroptera", "Plecoptera", "Trichoptera", "Unionida"
)
spdata <- dplyr::filter(raw_downloaded, order %in% target_orders)

# Gastropod genera (subset 2)
# spdata <- raw_downloaded %>%
#   filter(genus %in% c('Lymnaea', 'Ferrissia', 'Biomphalaria',
#                       'Indoplanorbis', 'Bulinus', 'Aplexa', 'Physa'))

# Gastropod genera (subset 1)
# spdata <- raw_downloaded %>%
#   filter(genus %in% c('Potadoma', 'Pachymelania', 'Melanoides',
#                       'Pseudocleopatra'))

# Freshwater crab families
# spdata <- raw_downloaded %>%
#   filter(family %in% c('Potamonautidae', 'Potamidae'))

# Dugesia species (full list)
# spdata <- raw_downloaded %>%
#   filter(species %in% c('Dugesia bijuga', 'Dugesia pustulata',
#                         'Dugesia congolensis', 'Dugesia gonocephala',
#                         'Dugesia neumanni', 'Dugesia tanganyikae',
#                         'Dugesia astrocheta', 'Dugesia didiaphragma',
#                         'Dugesia ectophysa', 'Dugesia didiaphragma'))

# Dugesia species (without D. bijuga and D. pustulata)
# spdata <- raw_downloaded %>%
#   filter(species %in% c('Dugesia congolensis', 'Dugesia gonocephala',
#                         'Dugesia neumanni', 'Dugesia tanganyikae',
#                         'Dugesia astrocheta', 'Dugesia didiaphragma',
#                         'Dugesia ectophysa', 'Dugesia didiaphragma'))

# Coleoptera and Hemiptera families
# spdata <- raw_downloaded %>%
#   filter(family %in% c('Noteridae', 'Hydraenidae', 'Hygrobiidae',
#                        'Dytiscidae', 'Hydrophilidae', 'Naucoridae',
#                        'Notonectidae', 'Gerridae', 'Gyrinidae',
#                        'Psephenidae', 'Nepidae', 'Veliidae', 'Pleidae',
#                        'Micronectidae', 'Belostomatidae', 'Mesoveliidae'))


# One CSV per group; switch the output file together with the filter above.
write.csv(
  spdata,
  file = "C:/Users/eakindele/projects/EPTO_Unionida.csv",
  row.names = FALSE
)
#write.csv(spdata, row.names = FALSE, "C:/Users/eakindele/projects/Gastropoda.csv")
#write.csv(spdata, row.names = FALSE, "C:/Users/eakindele/projects/Platyhelminthes.csv")
#write.csv(spdata, row.names = FALSE, "C:/Users/eakindele/projects/Coleoptera_Hemiptera.csv")


# ----------------------------------------------------------------------------
#------------ Load all occurrence data -----------
# (The separator above must be a comment: a bare line of dashes is parsed as
# unary minus signs applied to the next statement and breaks the assignment.)
wdir <- "C:/Users/eakindele/projects"
setwd(wdir)

# One file per taxonomic group, as exported by the download step, plus the
# records compiled from published articles and a regional database.
spdata1 <- read.csv("EPTO_Unionida.csv", header = TRUE)
spdata2 <- read.csv("Gastropoda.csv", header = TRUE)
spdata3 <- read.csv("Platyhelminthes.csv", header = TRUE)
spdata4 <- read.csv("Coleoptera_Hemiptera.csv", header = TRUE)
spdata5 <- read.csv("Guineo_Congolian_published articles.csv", header = TRUE)


# Stack everything into one table.
# NOTE(review): rbind() on data frames needs the same column names in every
# input; presumably the published-articles file follows the GBIF column
# layout -- confirm.
spdata <- rbind(spdata1, spdata2, spdata3, spdata4, spdata5)


# Keep an untouched copy of the merged raw data.
# ".................." = wdir chosen by the user
write.csv(spdata, row.names = TRUE, '................../Guineo_Congolian raw data.CSV')



#--- Remove duplicate records -----
# A record is a duplicate when species and both coordinates are identical; the
# first occurrence is kept together with all of its other columns.
spdata <- spdata %>%
  distinct(decimalLongitude, decimalLatitude, species, .keep_all = TRUE)
summary(spdata)


#--- Plot the raw points inside the study-area bounding box ----
# The columns needed for the map go into a separate object. `spdata` itself
# must keep all of its columns, because the data-cleaning step further down
# selects taxonomy and metadata columns from it.
spdata_map <- spdata %>%
  dplyr::select(species, decimalLongitude, decimalLatitude, year)

summary(spdata_map)

#--- Bounding box (min long., min lat., max long., max lat.)
bbox <- c(-14.467, -12.750, 31.241, 13.0300)


m <- leaflet() %>%
  addProviderTiles("Esri.WorldShadedRelief") %>%
  setMaxBounds(bbox[1], bbox[2], bbox[3], bbox[4]) %>%
  addCircles(
    data = spdata_map, color = "purple",
    lat = ~decimalLatitude, lng = ~decimalLongitude
  )


m

# the map contained some sea records at this stage, hence a need for data cleaning

# ----------------------------------------------------------------------------
#------- Data cleaning --------------
# (The separator above must be a comment: a bare line of dashes is parsed as
# unary minus signs applied to the next statement and breaks the assignment.)

#---- select columns of interest
# NOTE(review): GBIF "SIMPLE_CSV" downloads name this column `locality`
# (lower case); `Locality` only works if the merged files were renamed --
# confirm against colnames(spdata).
spdata <- spdata %>%
  dplyr::select(
    phylum, class, order, family, genus, scientificName, species,
    decimalLongitude, decimalLatitude, countryCode, Locality,
    stateProvince, occurrenceStatus, individualCount, gbifID, taxonRank,
    coordinateUncertaintyInMeters, coordinatePrecision, day, month, year,
    speciesKey, basisOfRecord, institutionCode, recordedBy
  )


##----- Coordinates cleaning ---
spdata_clean <- spdata |>
  # Optional precision/uncertainty filters, currently disabled:
  #st_drop_geometry() |>
  #rename_with(tolower)|>  # set lowercase column names to work with CoordinateCleaner
  #filter(coordinateprecision < 0.01 | is.na(coordinateprecision)) |> # below 0.01 and with missing values
  #filter(coordinateuncertaintyinmeters <= 1000 | is.na(coordinateuncertaintyinmeters)) |> # at most 1 km and with missing values
  #filter(!coordinateuncertaintyinmeters %in% c(301,3036,999,9999)) |> # remove with known default values
  cc_cen(buffer = 1000) |> # remove country centroids within 1km
  cc_cap(buffer = 1000) |> # remove capitals centroids within 1km
  cc_inst(buffer = 1000) |> # remove zoo and herbaria within 1km
  cc_sea() |> # remove sea records
  cc_dupl() |> # remove duplicate records
  # Drop records without a species name. Both conditions must hold: with `|`
  # an empty string passed the filter because it is not NA.
  filter(!is.na(species) & species != "")


summary(spdata_clean)
bbox <- c(-14.450, -10.6185, 30.907, 13.0300)

# Map the *cleaned* records, so the effect of the cleaning can be checked
# against the first map.
m <- leaflet() %>%
  addProviderTiles("Esri.WorldShadedRelief") %>%
  setMaxBounds(bbox[1], bbox[2], bbox[3], bbox[4]) %>%
  addCircles(
    data = spdata_clean, color = "purple",
    lat = ~decimalLatitude, lng = ~decimalLongitude
  )


m


# "......" = wdir chosen by the user
write.csv(spdata_clean, row.names = FALSE, '....../New_Guineo_Congolian_data_coordinatecleaned.csv')


# ----------------------------------------------------------------------------
#--------- Taxize the dataset -------------
# (The separator above must be a comment: a bare line of dashes is parsed as
# unary minus signs applied to the next statement.)

# Install myTAI only when it is missing instead of on every run.
if (!requireNamespace("myTAI", quietly = TRUE)) {
  install.packages("myTAI")
}
library("taxize")
library("myTAI")
library(dplyr)

wdir <- "......." # as chosen by the user
setwd(wdir)


lst <- read.csv("New_Guineo_Congolian_data_coordinatecleaned.csv", header = TRUE)

# Phyla
phylum <- unique(lst$phylum)
phylum

# classes
class <- unique(lst$class)
class

# orders list
order <- unique(lst$order)
order

# species list -- this is the vector that gets resolved below, so it must not
# be overwritten by the other look-ups.
lst_unique <- unique(lst$species)
lst_unique
summary(lst_unique)

# subcatchment list
# NOTE(review): the coordinate-cleaned file written by this script has no
# `subcatchment_id` column, in which case this is NULL -- presumably the column
# is added in a step not shown here; confirm.
subcatchment_unique <- unique(lst$subcatchment_id)
subcatchment_unique
summary(subcatchment_unique)

# year list
# NOTE(review): "Togo.csv" is not produced anywhere in this script; it is read
# into its own object so that `lst` keeps the full cleaned data set.
togo <- read.csv("Togo.csv", header = TRUE)
year_unique <- unique(togo$year)
year_unique


#--- TAXIZE

# Data sources known to the resolver, with their ids
sources <- gnr_datasources()
sources
if (interactive()) {
  View(sources)
}


# Resolve the species names against the preferred data sources.
# Preferred data sources (Encyclopedia of Life, Zoobank registered names, GBIF
# backbone taxonomy) -- TODO confirm that ids 11, 51 and 64 are these sources
# by checking them in `sources`.
# `sci` has to be a character vector of names. Before this fix the call
# received the year list (and, in a second copy of the call, the whole data
# frame) because `lst_unique` and `lst` had been overwritten.
tax_out <- gnr_resolve(
  sci = lst_unique,
  preferred_data_sources = c(11, 51, 64),
  best_match_only = TRUE,
  canonical = TRUE,
  highestscore = TRUE
)
tax_out <- as.data.frame(tax_out)
if (interactive()) {
  View(tax_out)
}


# "........." = wdir chosen by the user
write.csv(tax_out, row.names = FALSE, '........./Guineo_Congolian_coordinatecleaned_taxized.csv')


