-
Notifications
You must be signed in to change notification settings - Fork 1
/
Data Cleaning.R
76 lines (53 loc) · 2.18 KB
/
Data Cleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
## Set Format for Variables in Each File
# Restore the objects stored in CBWomenTextTables.rda into the workspace.
# NOTE(review): presumably this is where `womenTables` (used at the bottom
# of this script) comes from -- confirm against the .rda contents.
load("CBWomenTextTables.rda")
# Integer vector 2001, 2002, ..., 2012. It is not referenced by the
# functions below; presumably the years covered by the tables -- confirm.
years = 2001:2012
findColLocs = function(spacerRow) {
  ## Find column boundaries from the blanks in a spacer row
  ## (a line such as "=== ==== ==" that underlines the header).
  ##
  ## spacerRow: a single character string.
  ## Returns a numeric vector: 0, the position of every blank in
  ## spacerRow and, when the row does not already end in a blank,
  ## nchar(spacerRow) + 1. Consecutive entries bracket one column.

  spaceLocs = gregexpr(" ", spacerRow)[[1]]
  # gregexpr() signals "no match" with -1; drop it so the sentinel is
  # never returned as if it were a column boundary.
  spaceLocs = spaceLocs[spaceLocs > 0]
  rowLength = nchar(spacerRow)

  # A trailing blank already closes the last column, so no extra
  # end-of-row boundary is needed in that case.
  if (rowLength > 0 &&
      substring(spacerRow, rowLength, rowLength) == " ")
    return(c(0, spaceLocs))

  c(0, spaceLocs, rowLength + 1)
}
selectCols = function(shortColNames, headerRow, searchLocs) {
  ## Select the columns I want from the header row,
  ## because some columns are not useful for me.
  ##
  ## shortColNames: character vector of patterns (passed to regexpr())
  ##                identifying the wanted columns in headerRow.
  ## headerRow:     the header line of the table.
  ## searchLocs:    column boundaries, e.g. as returned by findColLocs().
  ## Returns a numeric matrix with 2 rows (start, end position) and one
  ## column per entry of shortColNames, named after those entries.
  ## Columns for names not found in headerRow are NA.

  # vapply() rather than sapply(): the result is always a 2-row numeric
  # matrix, even when shortColNames is empty or no name matches.
  vapply(shortColNames, function(shortName) {
    ## Find the column in the header row corresponding to
    ## the column I want
    startPos = regexpr(shortName, headerRow)[[1]]
    if (startPos == -1)
      return(c(NA_real_, NA_real_))
    # Number of boundaries at or before the match = index of the
    # boundary that opens the column containing the match.
    index = sum(startPos >= searchLocs)
    c(searchLocs[index] + 1, searchLocs[index + 1])
  }, FUN.VALUE = numeric(2))
}
extractVariables =
  function(file, varNames = c("name", "home", "ag", "gun",
                              "net", "time")) {
    ## Extract the requested columns from one fixed-width text table.
    ##
    ## file:     character vector, one element per line of the table.
    ##           It must contain a spacer row starting with "===", with
    ##           the header row on the line directly above it.
    ## varNames: patterns identifying the wanted columns in the
    ##           lower-cased header row.
    ## Returns (invisibly) a character matrix with one row per data
    ## line and one column per entry of varNames (used as column
    ## names). Columns whose name is not found in the header are NA.

    # Find which row is the spacer row.
    eqIndex = grep("^===", file)
    if (length(eqIndex) == 0)
      stop("no spacer row starting with '===' found", call. = FALSE)
    # Only one spacer row can be used; take the first if several match.
    eqIndex = eqIndex[1]
    if (eqIndex == 1)
      stop("spacer row is the first line; no header row above it",
           call. = FALSE)

    # The row before the spacer row is the header row;
    # the rows after the spacer row are the body.
    spacerRow = file[eqIndex]
    headerRow = tolower(file[eqIndex - 1])
    body = file[-seq_len(eqIndex)]

    # Remove footnotes (lines starting with * or #) and blank rows.
    body = body[!grepl("^[[:blank:]]*(\\*|\\#)", body)]
    body = body[!grepl("^[[:blank:]]*$", body)]

    # Obtain the starting and ending positions of the variables.
    searchLocs = findColLocs(spacerRow)
    locCols = selectCols(varNames, headerRow, searchLocs)

    # Fill an explicitly shaped matrix so the result keeps two
    # dimensions even when the body has a single row or none.
    Values = matrix(NA_character_, nrow = length(body),
                    ncol = length(varNames),
                    dimnames = list(NULL, varNames))
    for (i in seq_along(varNames))
      Values[, i] = substr(body, locCols[1, i], locCols[2, i])

    invisible(Values)
  }
# Run extractVariables() with its default varNames on every element of
# womenTables and collect the results in a list.
# NOTE(review): womenTables is not defined in this script; presumably it
# is restored by the load() call at the top -- confirm.
womenMat = lapply(womenTables, extractVariables)
# Write the list of extracted tables to cbWomenTables.rda.
save(womenMat, file = "cbWomenTables.rda")