-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_area_date_time_from_notices_add_pdf_name.R
84 lines (60 loc) · 3.44 KB
/
extract_area_date_time_from_notices_add_pdf_name.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
library(pdftools)
library(purrr)
library(textreadr)
# Notices from the new_notices folder (a combination of notices from the
# notices folder and those of similar structure from the Renamed folder).
# NOTE(review): with the two lines below commented out, `notices` is not
# assigned in this script before it is used, so it must already exist in the
# session -- confirm how this section is meant to be run.
#setwd("C:/Users/Chance/Dropbox/My Projects/Elvis/KPLC/notices/new_notices")
#notices = file.path("C:/Users/Chance/Dropbox/My Projects/Elvis/KPLC/notices/new_notices")
# `pattern` is a regular expression: escape the dot and anchor at the end so
# only names that really end in ".pdf" are listed (the bare ".pdf" also matched
# names such as "xpdf_notes.txt").
announcements = list.files(notices, pattern = "\\.pdf$")
#' Extract the blackout area(s) and date-time(s) from one announcement PDF.
#'
#' Reads the PDF with `pdf_text()`, splits the text into lines and keeps the
#' lines starting with "AREA" and "DATE". The cleaned values, together with the
#' PDF name (minus its ".pdf" suffix), are appended to
#' "new_notices_blackouts.csv" in the current working directory. No header row
#' is written.
#'
#' @param y Path to a single announcement PDF.
#' @return The value of `write.table()`; the function is called for its side
#'   effect of appending rows to the CSV file.
extract_area_date_time <- function(y) {
  #y = "18Um0M6UhiIr_Special Interruption - Turkana County.pdf" # new
  pages <- pdf_text(y)
  # Break the page text into individual lines.
  lines <- unlist(strsplit(pages, "\n"))

  # Line(s) holding the blackout area, with the label and stray "\r" removed.
  area <- lines[startsWith(lines, "AREA")]
  area <- gsub("AREA: ", "", area)
  area <- gsub("\r", "", area)

  # Line(s) holding the date and time, with both labels and "\r" removed.
  dateTime <- lines[startsWith(lines, "DATE")]
  dateTime <- gsub("DATE: ", "", dateTime)
  dateTime <- gsub("TIME: ", "", dateTime)
  dateTime <- gsub("\r", "", dateTime)

  blackoutEntry <- data.frame(cbind(area, dateTime))
  # Strip only a literal trailing ".pdf". The previous pattern ".pdf" was an
  # unanchored regex, so it also removed sequences such as "_pdf" or "1pdf"
  # from the middle of a file name.
  pdf_label <- sub("\\.pdf$", "", y)
  blackoutEntry$pdf_name <- rep(pdf_label, nrow(blackoutEntry)) # new

  # Rows are appended without column names, so the file has no header line.
  write.table(blackoutEntry, "new_notices_blackouts.csv", sep = ",",
              append = TRUE, row.names = FALSE, col.names = FALSE)
}
# Run the extractor over every announcement. It is called purely for its side
# effect (appending rows to "new_notices_blackouts.csv"), so walk() is used
# instead of map() to avoid building a list of results.
# NOTE(review): rows are appended, so re-running this section without deleting
# the CSV first will duplicate entries.
walk(announcements, extract_area_date_time)
# The CSV is written without a header row, so it must be read with
# header = FALSE; the default header = TRUE silently turned the first blackout
# entry into column names. Column names match those of `different_blackouts`.
blackouts = read.csv("new_notices_blackouts.csv", header = FALSE,
                     col.names = c("area", "date_time", "pdf_name"))
saveRDS(blackouts, "new_notices_blackouts.rds")
# notices from big fails folder
#setwd("C:/Users/Chance/Dropbox/My Projects/Elvis/KPLC/notices/bigfails")
#notices = file.path("C:/Users/Chance/Dropbox/My Projects/Elvis/KPLC/notices/bigfails")
#announcements = list.files(notices, ".pdf")
#x = read_pdf("s1lGJZH4xkaC_Special Interruption - Parts of Kiambu County.pdf" )# returns empty
# Notices from the new_renamed folder (notices from the Renamed folder that
# have been modified so that each notice is on its own line).
# The working directory is switched to the folder because list.files() returns
# bare file names and the output files are written with relative paths.
setwd("C:/Users/Chance/Dropbox/My Projects/Elvis/KPLC/Renamed/new_renamed")
notices = file.path("C:/Users/Chance/Dropbox/My Projects/Elvis/KPLC/Renamed/new_renamed")
# `pattern` is a regular expression: escape the dot and anchor at the end so
# only names that really end in ".pdf" are listed (the bare ".pdf" also matched
# names such as "xpdf_notes.txt").
announcements = list.files(notices, pattern = "\\.pdf$")
#' Extract every blackout area and date-time from a multi-notice PDF.
#'
#' Reads the PDF with `read_pdf()` (package textreadr) and uses its `text`
#' column. Each element starting with "AREA" is treated as a blackout area, and
#' the element immediately after it is taken as that blackout's date-time.
#' The cleaned values, together with the PDF name (minus its ".pdf" suffix),
#' are appended to "new_renamed_blackouts.csv" in the current working
#' directory. No header row is written.
#'
#' @param y Path to a single PDF.
#' @return The value of `write.table()`; the function is called for its side
#'   effect of appending rows to the CSV file.
extract_area_date_time <- function(y) {
  pdf_lines <- read_pdf(y)$text # from package textreadr

  # Positions of the AREA lines, and their text with the label removed.
  area_idx <- which(startsWith(pdf_lines, "AREA"))
  areas1 <- gsub("AREA: ", "", pdf_lines[area_idx])

  # The date/time is read from the element directly after each AREA line.
  # NOTE(review): this relies on the files having been edited so that the
  # DATE line always follows its AREA line; an AREA on the last line yields NA.
  dates1 <- pdf_lines[area_idx + 1]
  dates1 <- gsub("DATE: ", "", dates1)
  dates1 <- gsub("TIME: ", "", dates1)

  blackoutEntry <- data.frame(cbind(areas1, dates1))
  # Strip only a literal trailing ".pdf". The previous pattern ".pdf" was an
  # unanchored regex, so it also removed sequences such as "_pdf" or "1pdf"
  # from the middle of a file name.
  pdf_label <- sub("\\.pdf$", "", y)
  blackoutEntry$pdf_name <- rep(pdf_label, nrow(blackoutEntry)) # new

  # Rows are appended without column names, so the file has no header line.
  write.table(blackoutEntry, "new_renamed_blackouts.csv", sep = ",",
              append = TRUE, row.names = FALSE, col.names = FALSE)
}
# Run the extractor over every announcement. It is called purely for its side
# effect (appending rows to "new_renamed_blackouts.csv"), so walk() is used
# instead of map() to avoid building a list of results.
# NOTE(review): rows are appended, so re-running this section without deleting
# the CSV first will duplicate entries.
walk(announcements, extract_area_date_time)
# The CSV is written without a header row, so it must be read with
# header = FALSE; the default header = TRUE silently turned the first blackout
# entry into column names. Column names match those of `different_blackouts`.
blackouts = read.csv("new_renamed_blackouts.csv", header = FALSE,
                     col.names = c("area", "date_time", "pdf_name"))
saveRDS(blackouts, "new_renamed_blackouts.rds")
# One notice was laid out completely differently from the rest, so its two
# blackout entries are written out by hand here instead of being parsed.
# NOTE(review): pdf_name keeps its ".pdf" suffix here, whereas the extractor
# functions in this file strip it -- confirm which form downstream code expects.
different_blackouts <- data.frame(
  area = c(
    "Parts of South C and Lang'ata",
    "Parts of Embakasi"
  ),
  date_time = c(
    "Saturday 26.11.2016 8.00 A.M. – 5.00 P.M.",
    "Sunday 27.11.2016 8.00 A.M. – 5.00 P.M."
  ),
  pdf_name = "32.0800-2016-11-26.pdf"
)
saveRDS(different_blackouts, file = "different_blackouts.rds")