-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare_metadata.Rmd
131 lines (98 loc) · 3.07 KB
/
prepare_metadata.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
---
title: "Format input files"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Make metadata file
```{r}
library(rgeos)
library(sp)
library(sf)
library(rgdal)
library(tidyverse)
library(ggplot2)
library(viridis)
library(ggmap)
library(geonames)
library(metacoder)
metadata <- read_csv('data/Pram_MetaData.csv')
```
## Reformat metadata
```{r}
metadata <- metadata %>%
rename(strain = isolate_id_orig,
isolate_id = "Isolate_ID(GL)",
gps_coord = "GPS Coordinates")
```
Add "source" column as a hybrid of host and environment
```{r}
metadata <- metadata %>%
mutate(source = paste(ifelse(is.na(Host_genus), "", Host_genus), ifelse(is.na(Host_species), "", Host_species)),
source = trimws(source),
source = ifelse(source == "", Host_Environment, source))
```
fix date format
```{r}
metadata$date <- paste0(metadata$Year, '-01-01')
```
Clean up place names so it is easier to look up coordinates
```{r}
replace_key <- c(
"N.Ireland" = "Northern Ireland"
)
metadata$Country[metadata$Country %in% names(replace_key)] <- replace_key[metadata$Country[metadata$Country %in% names(replace_key)]]
```
save metadata file
```{r}
write_tsv(metadata, file = file.path('data', 'metadata_modified.tsv'))
```
## Make coordinate file
```{r}
# https://gis.stackexchange.com/questions/240018/quickly-assign-coordinates-to-region-names-using-r
options(geonamesUsername = "fosterz")
get_coords <- function(name, group, ...) {
res <- GNsearch(name_equals = name, ...)
if ("fcode" %in% colnames(res)) {
res <- filter(res, name == name, fcode %in% c("AREA", "PCLI", "ADM1"))
}
res <- res[1, ]
out <- tibble(group = group, name = name, lat = res$lat, lon = res$lng)
return(out)
}
coord_data <- bind_rows(
map_dfr(unique(metadata$Country, na.rm = TRUE), get_coords, group = 'Country'),
map_dfr(unique(metadata$State), get_coords, group = 'State', country = 'USA', fcode = "ADM1")
) %>%
filter(!is.na(name))
write_tsv(coord_data, file = file.path("config", "lat_longs.tsv"), col_names = FALSE)
```
## Make dropped strains file
These will be filtered out of the analysis.
First, I will remove the reference strain, since that gets assigned a date in 1980 for some reason and that is confusing:
```{r}
dropped_ids <- c('MT_DQ832718.1')
```
And all IDs associated with a missing location or time:
```{r}
dropped_ids <- c(dropped_ids, metadata$strain[is.na(metadata$Country)])
dropped_ids <- c(dropped_ids, metadata$strain[is.na(metadata$Year)])
```
Ignore isolates with no FASTA sequence:
```{r}
seqs <- read_fasta("data/global_n661_mt.fasta")
dropped_ids <- c(dropped_ids, metadata$strain[! metadata$strain %in% names(seqs)])
```
And write the file, one ID per line:
```{r}
write_lines(dropped_ids, file = file.path("config", "dropped_strains.txt"))
```
## Make color file
```{r}
color_data <- coord_data %>%
select(group, name) %>%
mutate(color = viridis(length(group)))
color_data$color <- substr(color_data$color, start = 1, stop = 7)
write_tsv(color_data, file = file.path("config", "colors.tsv"), col_names = FALSE)
```