-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathdata-prep.R
119 lines (98 loc) · 3.28 KB
/
data-prep.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
library(geoR)
# to get lat and lon from Google
library(ggmap)
library(rgdal)
library(tidyverse)
library(DT)
library(knitr)
library(sp)
library(rgeos)
library(ggplot2)
library(ggthemes)
library(outliers)
library(maptools)
getCurrentFileLocation <- function() {
  # Return the directory containing the currently running script.
  # Works both under Rscript (via the "--file=" command-line argument) and
  # interactively in RStudio (via rstudioapi's editor context).
  #
  # BUG FIX: the original split every commandArgs() value on "=" with
  # tidyr::separate(sep = "="), so a script path that itself contains "="
  # was truncated at the second "=". sub() removes only the leading
  # "--file=" prefix and keeps the rest of the path intact.
  args <- commandArgs(trailingOnly = FALSE)
  file_arg <- grep("^--file=", args, value = TRUE)
  this_file <- sub("^--file=", "", file_arg)
  if (length(this_file) == 0) {
    # Not launched via Rscript: fall back to the RStudio editor context.
    this_file <- rstudioapi::getSourceEditorContext()$path
  }
  return(dirname(this_file))
}
# Resolve the script's own directory and make it the working directory so the
# relative path below finds the data file.
# NOTE(review): setwd() mutates session state; building paths with
# file.path(path, ...) would avoid the side effect.
path = getCurrentFileLocation()
setwd(path)
# Raw NYC property-sales export.
# NOTE(review): this same file is overwritten by write.csv() at the bottom of
# the script, so the raw input is destroyed on each run — confirm intended.
db = read.csv('db.csv')
#converting date format (min = 2016-09-01, max=2017-08-31)
# Normalizes SALE.DATE to ISO "YYYY-MM-DD" character strings.
db$SALE.DATE = format(as.Date(db$SALE.DATE))
#db = subset(db,SALE.DATE <= as.Date('2016/12/31'))
# Several numeric columns in the raw export use "-" as a missing-value
# placeholder and may contain "," thousands separators. Parse them all the
# same way through one helper instead of repeating the stanza three times.
#
# Fixes vs. the original three copy-pasted blocks:
#  * "-" is mapped directly to NA instead of "" (as.integer("") yields NA
#    only via a coercion warning);
#  * as.numeric() instead of as.integer() avoids 32-bit integer overflow
#    for values above ~2.1 billion and accepts decimals;
#  * values with thousands separators (e.g. "1,000,000") no longer become NA.
clean_numeric_col <- function(x) {
  x <- as.character(x)
  x[x == "-"] <- NA
  as.numeric(gsub(",", "", x, fixed = TRUE))
}
db$SALE.PRICE <- clean_numeric_col(db$SALE.PRICE)
db$GROSS.SQUARE.FEET <- clean_numeric_col(db$GROSS.SQUARE.FEET)
db$LAND.SQUARE.FEET <- clean_numeric_col(db$LAND.SQUARE.FEET)
# Remove rows with a missing or implausibly low sale price (<= $10):
# these are placeholder transfers, not market sales.
price_ok <- !is.na(db$SALE.PRICE) & db$SALE.PRICE > 10
db <- db[price_ok, ]
# Remove rows with missing or non-positive gross square footage, so the
# per-square-foot ratio below is always well defined.
area_ok <- !is.na(db$GROSS.SQUARE.FEET) & db$GROSS.SQUARE.FEET > 0
db <- db[area_ok, ]
# Derived metric: sale price per gross square foot.
db$PRICE.SQUARE.FEET <- db$SALE.PRICE / db$GROSS.SQUARE.FEET
#we create a function that returns the borough's name, and add it to the database
# Map a NYC borough code to its name.
#
# @param code Borough code as used in the sales data: 1 = Manhattan,
#   2 = Bronx, 3 = Brooklyn, 4 = Queens, 5 = Staten Island.
# @return The borough name, or 'N/I' ("no information") for any other
#   value, including NA.
#
# BUG FIX: the original if/else chain evaluated `if (code == 1)` with an NA
# code, which errors ("missing value where TRUE/FALSE needed"); NA is now
# handled explicitly. switch() replaces the five-branch chain.
borough_name_fun = function(code){
  if (is.na(code)) {
    return('N/I')
  }
  switch(as.character(code),
         '1' = 'Manhattan',
         '2' = 'Bronx',
         '3' = 'Brooklyn',
         '4' = 'Queens',
         '5' = 'Staten Island',
         'N/I')
}
# Attach borough names as a plain character column.
# BUG FIX: the original used lapply(), which returns a list and turns
# BOROUGHNAME into a list-column, so paste0() below relied on implicit
# element-wise coercion. vapply(..., character(1)) guarantees a character
# vector of the correct length and fails loudly on any non-string result.
db$BOROUGHNAME = vapply(db$BOROUGH, borough_name_fun, character(1))
# Full postal-style address used as the geocoding query string.
db$location = paste0(db$ADDRESS, ", ", db$BOROUGHNAME, ", ", db$ZIP.CODE , " - New York")
# The dataset is too big for one Google geocoding batch, so split it into
# five fixed chunks of ~2500 rows that can be geocoded in separate runs.
db$ID <- seq.int(nrow(db))
db1 = subset(db,ID<=2499)
db2 = subset(db,ID>2499 & ID<=4999)
db3 = subset(db,ID>4999 & ID<=7499)
db4 = subset(db,ID>7499 & ID<=9999)
db5 = subset(db,ID>9999)
###
# NOTE(review): this draws a reproducible random sample of ~2500 rows and
# assigns it to db, but db is never used again below — db5 (a fixed slice of
# the ORIGINAL db, created above) is what gets geocoded, and the output is
# rebuilt from db1..db5. Confirm whether the random sample or the fixed
# chunks were the intended geocoding unit; one of the two is dead code.
smp_size = floor(2500/12949 * nrow(db))
set.seed(123)
reduced_sample = sample(seq_len(nrow(db)), size = smp_size)
reduced_sample = db[reduced_sample, ]
db = reduced_sample
#We get longitude and latitude from google API
# Requires a configured Google API key for ggmap::geocode(); network call.
geo = geocode(location = db5$location, output="latlon", source="google")
db5$lon = geo$lon
db5$lat = geo$lat
# NOTE(review): only db5 gained lon/lat columns above, so this rbind() of
# data frames with mismatched columns will error unless db1..db4 were
# geocoded in earlier runs of the script — verify the intended workflow.
x = rbind(db1,db2,db3,db4,db5)
x = subset(x,is.na(lon)==FALSE)
# Coerce every column to character before writing out.
x <- apply(x,2,as.character)
#Creating final db
# Same rbind/filter as above, kept as a data frame for downstream use.
db = rbind(db1,db2,db3,db4,db5)
db = subset(db,is.na(lon)==FALSE)
#Saving into a new file
# NOTE(review): despite the comment, this overwrites the input file
# 'db.csv' read at the top of the script — destructive; consider writing
# to a different filename (e.g. 'db_geocoded.csv').
write.csv(x, file = 'db.csv')