-
Notifications
You must be signed in to change notification settings - Fork 1
/
load_PR_data_Informes_Arbovirales.R
227 lines (186 loc) · 9.15 KB
/
load_PR_data_Informes_Arbovirales.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#SV Scarpino
#July 2018
#PDF to CSV for Puerto Rico MoH Arboviral surveillance data (2016-present)
#set working dir
#NOTE(review): rstudioapi::getActiveDocumentContext() presumably only works
#inside an RStudio session - confirm before running this script with Rscript.
#All relative paths used further down ("Raw PDFs/...", "PR_pop_size.txt",
#"Data/...") are resolved against the directory set here.
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))#sets working directory to source file location
#libraries (not included in limits_acc_functions.R)
#pdftools supplies pdf_text(), used by parse_Informes_Arbovirales()
library(pdftools)
#########
#Globals#
#########
#switch for section 1 (downloading the weekly report PDFs)
download_new_Informes_Arbovirales <- FALSE #set to TRUE to download files
#switch for section 5 (writing the assembled table to a csv)
write_new <- FALSE #set to TRUE to save a new csv
#folder holding the raw PDFs; it is pasted directly in front of file names,
#so the trailing "/" is required
Informes_Arbovirales_path <- "Raw PDFs/Informes Arbovirales/"
#numeric form of the current time, embedded in the output csv file name
time_stamp <- as.numeric(Sys.time())
###########
#acc funcs#
###########
#Parse one weekly "Informe Arboviral" PDF into a list of case counts.
#
#Only the first element returned by pdf_text() is inspected (pdf_text is
#expected to return one string per page, i.e. only page 1 is read).
#
#Args:
#  filename: name of the PDF file.
#  path: directory prefix; it is pasted directly in front of filename, so it
#        must already end with a path separator.
#
#Returns a named list:
#  DENV, CHIKV, ZIKV, Flavivirus : cumulative counts (numeric, NA if not found)
#  Semana                        : week label, e.g. "Semanas 1-2"
#  DENV_new, CHIKV_new, ZIKV_new : new counts (numeric, NA if not found)
#  Flavivirus_new                : always NA (only reported cumulatively)
#  suspected_new                 : suspected cases (numeric, NA if unparseable)
parse_Informes_Arbovirales <- function(filename, path){
  #read_count: take the third token of a "LABEL: 1,234 ..." line (split on
  #":" or " "), drop thousands separators plus any extra patterns in `strip`,
  #and convert to numeric. A missing line (NA) yields NA.
  read_count <- function(line, strip = character(0)){
    token <- strsplit(line, "[: ]")[[1]][3]
    for(junk in c(",", strip)){
      token <- gsub(pattern = junk, replacement = "", x = token)
    }
    as.numeric(token)
  }

  pages <- pdf_text(paste0(path, filename))
  page_lines <- strsplit(pages, "\n")[[1]]

  #Semana
  #normalise em dashes and the spacing around hyphens so the week range is a
  #single token, then keep the first two space-separated tokens
  semana_raw <- page_lines[grep("Semanas", page_lines)]
  semana_clean <- gsub(pattern = "\u2014", replacement = "-", semana_raw)
  semana_clean <- gsub(pattern = "- ", replacement = "-", semana_clean)
  semana_clean <- gsub(pattern = " -", replacement = "-", semana_clean)
  semana_clean <- gsub(pattern = " - ", replacement = "-", semana_clean)
  semana_tokens <- unlist(strsplit(x = semana_clean, split = " "))
  semana <- gsub(pattern = ",", replacement = "",
                 paste0(semana_tokens[1:2], collapse = " "))

  #DENV
  #the label sometimes carries a footnote mark before the colon; try each
  #spelling in turn and stop at the first one that matches
  denv <- integer(0)
  for(label in c("DENV:", "DENV¶:", "DENV‣:")){
    denv <- grep(label, page_lines)
    if(length(denv) > 0){
      break
    }
  }
  #for each virus the first matching line holds the new cases and the
  #second matching line holds the cumulative cases
  denv_cases_cum_numbers <- read_count(page_lines[denv[2]])
  denv_cases_new_numbers <- read_count(page_lines[denv[1]])

  #CHIKV
  chik <- grep("CHIKV: ", page_lines)
  chik_cases_cum_numbers <- read_count(page_lines[chik[2]])
  #the new-case token can have "casos" glued onto the number
  chik_cases_new_numbers <- read_count(page_lines[chik[1]], strip = "casos")

  #ZIKV
  zikv <- grep("ZIKV: ", page_lines)
  zikv_cases_cum_numbers <- read_count(page_lines[zikv[2]])
  zikv_cases_new_numbers <- read_count(page_lines[zikv[1]])

  #Flavivirus
  flavi <- grep("Flavivirus: ", page_lines)
  flavi_cases_cum_numbers <- read_count(page_lines[flavi[1]])
  #new flavivirus
  flavi_cases_new_numbers <- NA #as far as I can tell this is only reported as cumulative

  #suspected cases
  #reports use "presuntos", "reportes" or "sospechosos"; map all to one word.
  #This is done on a copy, after the virus lines above have been located.
  suspect_lines <- gsub(pattern = "presuntos", replacement = "sospechosos", page_lines)
  suspect_lines <- gsub(pattern = "reportes", replacement = "sospechosos", suspect_lines)
  suspected_loc <- grep("sospechosos", suspect_lines)
  suspected_raw <- suspect_lines[suspected_loc[2]]
  suspected <- strsplit(x = suspected_raw, split = " ")[[1]][1]
  suspected <- gsub(pattern = ",", "", suspected)
  #as.numeric() never raises an error for an unparseable string: it returns NA
  #with a warning. A non-numeric first token is expected here, so the warning
  #is muffled and NA is returned.
  suspected <- suppressWarnings(as.numeric(suspected))

  return(list("DENV" = denv_cases_cum_numbers, "CHIKV" = chik_cases_cum_numbers, "ZIKV" = zikv_cases_cum_numbers, "Flavivirus" = flavi_cases_cum_numbers, "Semana" = semana, "DENV_new" = denv_cases_new_numbers, "CHIKV_new" = chik_cases_new_numbers, "ZIKV_new" = zikv_cases_new_numbers, "Flavivirus_new" = flavi_cases_new_numbers, "suspected_new" = suspected))
}
###########
#Data Sets#
###########
#1. Download Informes Arbovirales
#Runs only when download_new_Informes_Arbovirales is TRUE. Tries every
#week/year combination of the regular URL scheme; files that cannot be
#fetched are collected in `missed` (most combinations are expected not to
#exist). Two reports with irregular names are then fetched explicitly, and
#the script stops if either of those is unavailable.
if(download_new_Informes_Arbovirales == TRUE){
  #Download data
  years <- c(2016,2017,2018)
  base_file <- "http://www.salud.gov.pr/Estadisticas-Registros-y-Publicaciones/Informes%20Arbovirales/Reporte%20ArboV%20semana%20"

  #download_informe: fetch one report into Informes_Arbovirales_path, keeping
  #the last URL segment (still percent-encoded) as the local file name.
  #Returns list(name = <local file name>, ok = TRUE/FALSE).
  download_informe <- function(url){
    dest_name <- basename(url)
    #mode = "wb": PDFs are binary; the default text mode can corrupt them on Windows
    status <- try(download.file(url = url, destfile = paste0(Informes_Arbovirales_path, dest_name), mode = "wb"), silent = TRUE)
    #download.file signals failure either by an error or by a non-zero return code
    succeeded <- !inherits(status, "try-error") && isTRUE(status == 0)
    list(name = dest_name, ok = succeeded)
  }

  missed <- c()
  for(i in years){
    for(j in 1:54){
      result_ij <- download_informe(paste0(base_file, j, "-", i, ".pdf"))
      if(!result_ij$ok){
        missed <- c(missed, result_ij$name)
      }
    }
  }

  #two files with weird naming conventions
  odd_files <- c(
    "http://www.salud.gov.pr/Estadisticas-Registros-y-Publicaciones/Informes%20Arbovirales/Reporte%20ArboV%20semana%2052-53%202016.pdf",
    "http://www.salud.gov.pr/Estadisticas-Registros-y-Publicaciones/Informes%20Arbovirales/Reporte%20ArboV%20semana%208%202017.pdf"
  )
  for(odd_url in odd_files){
    if(!download_informe(odd_url)$ok){
      stop("Couldn't find file")
    }
  }
}
#2. Extract data from PDFs
#Builds one row per PDF in Informes_Arbovirales_path. Year and week come from
#the file name; every other column comes from parse_Informes_Arbovirales().
#Only files ending in ".pdf" are considered, so stray files in the folder
#(e.g. OS metadata files) are not handed to the PDF parser.
Informes_Arbovirales_files <- list.files(Informes_Arbovirales_path, pattern = "\\.pdf$", ignore.case = TRUE)
data <- matrix(NA, ncol = 12, nrow = length(Informes_Arbovirales_files))
colnames(data) <- c("Year", "Week", "Group", "DENV_cumulative", "CHIKV_cumulative", "ZIKV_cumulative", "Flavivirus_cumulative","DENV_new", "CHIKV_new", "ZIKV_new", "Flavivirus_new", "Suspected_new")
data <- as.data.frame(data)
#seq_along() so that an empty folder means zero iterations (1:0 would run twice)
for(i in seq_along(Informes_Arbovirales_files)){
  file_name.i <- Informes_Arbovirales_files[i]
  #year = the four characters immediately before the ".pdf" extension
  year.i <- substr(x = file_name.i, start = nchar(file_name.i)-7, stop = nchar(file_name.i)-4)
  #week = fourth "%20"-separated piece of the file name, up to the first "-"
  #(for a range such as "52-53" this keeps the first week)
  week.i <- strsplit(x = file_name.i, split = "%20")[[1]][4]
  week.i <- strsplit(x = week.i, split = "-")[[1]][1]
  parsed.i <- parse_Informes_Arbovirales(filename = file_name.i, path = Informes_Arbovirales_path)
  data$Year[i] <- year.i
  data$Week[i] <- week.i
  data$Group[i] <- parsed.i$Semana
  data$DENV_cumulative[i] <- parsed.i$DENV
  data$CHIKV_cumulative[i] <- parsed.i$CHIKV
  data$ZIKV_cumulative[i] <- parsed.i$ZIKV
  data$Flavivirus_cumulative[i] <- parsed.i$Flavivirus
  data$DENV_new[i] <- parsed.i$DENV_new
  data$CHIKV_new[i] <- parsed.i$CHIKV_new
  data$ZIKV_new[i] <- parsed.i$ZIKV_new
  data$Flavivirus_new[i] <- parsed.i$Flavivirus_new
  data$Suspected_new[i] <- parsed.i$suspected_new
}
#3. Order data set
#Sort rows by year (ascending) and, within each year, by numeric week.
#Rows whose Year does not match any of the numeric year values are left out.
years_var <- sort(as.numeric(unique(data$Year)), decreasing = FALSE, na.last = TRUE)
rows_per_year <- lapply(years_var, function(yr){
  in_year <- which(data$Year == yr)
  in_year[order(as.numeric(data$Week)[in_year])]
})
data_order <- unlist(rows_per_year)
data_out <- data[data_order, ]
#4. Population sizes
#from https://en.wikipedia.org/wiki/Demographics_of_Puerto_Rico
# B.R. Mitchell. International historical statistics: the Americas, 1750–2000
# "United Nations Statistics Division – Demographic and Social Statistics". Unstats.un.org. Retrieved 14 October 2017.
# "Archived copy". Archived from the original on 2017-09-27. Retrieved 2017-09-09.
#"Archived copy" (PDF). Archived from the original (PDF) on 2017-10-16. Retrieved 2017-10-03.
#Attaches a population estimate to every row of data_out: tabulated values
#(stored in thousands) for years present in PR_pop_size.txt, plus one value
#extrapolated linearly from the last six tabulated rows.
pop_size_pr <- read.table("PR_pop_size.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
years <- unique(data_out$Year)
use_pops <- which(pop_size_pr$Year %in% years)
#need to predict 2018
#NOTE(review): this assumes the table is sorted by year and ends in 2017, so
#that "one step past the last row" is 2018 - confirm against PR_pop_size.txt.
#tail() replaces the hard-coded row index 103 so the fit keeps using the last
#six rows if rows are ever added to the table.
recent_pops <- tail(pop_size_pr$Averagepopulation.x1000., 6)
x <- seq_along(recent_pops)
mod_18 <- lm(recent_pops ~ x)
pred.18 <- predict(mod_18, newdata = data.frame(x = length(recent_pops) + 1))
pops <- c(pop_size_pr$Averagepopulation.x1000.[use_pops]*1000, pred.18*1000)
tab_years <- table(data_out$Year)
#rep() below pairs pops with years by position, so the counts must agree
if(length(pops) != length(tab_years)){
  stop("Population estimates (", length(pops), ") do not match the number of years in the data (", length(tab_years), ")")
}
data_out$population_est <- rep(pops, tab_years)
#5. Save
#When write_new is TRUE, write data_out to a csv under Data/ whose name
#carries the run time stamp and the years covered.
if(write_new == TRUE){
  years_label <- paste0(years_var, collapse = "-")
  filename <- paste0("Data/Informes_Arbovirales_pop_", time_stamp, "-years-", years_label, ".csv")
  write.csv(x = data_out, file = filename, row.names = FALSE, quote = FALSE)
}