-
Notifications
You must be signed in to change notification settings - Fork 1
/
sample-2B-JL-01.R
111 lines (74 loc) · 3.48 KB
/
sample-2B-JL-01.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# house keeping -----------------------------------------------------------
# NOTE(review): clearing the workspace and silencing every warning are global
# side effects. They are kept because the rest of the script was written
# against them, but warn = -1 also hides parsing/coercion warnings.
rm(list = ls())
options(warn = -1)
# digits.secs: maximum number of fractional-second digits shown when
# date-times are printed.
options(digits.secs = 6)
# packages ----------------------------------------------------------------
# Install pacman only when it is missing, then attach it with library() so a
# failed installation stops the script instead of being silently ignored.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)
p_load(data.table, scales, ggplot2, rstudioapi, fasttime)
# setting up path ---------------------------------------------------------
# setwd() returns the *previous* working directory, so its return value must
# not be used as the script location. Change directory first, then read the
# resulting absolute path back with getwd().
script_file <- getActiveDocumentContext()$path
if (!nzchar(script_file)) {
  stop("Save this script before running it: its location defines the paths.",
       call. = FALSE)
}
setwd(dirname(script_file))
code_path <- getwd()
# Parent of the script folder. dirname() is used instead of a gsub() on the
# folder name, which would treat the name as a regex and strip every
# occurrence in the path. The trailing slash is kept because later code
# appends relative paths with paste0().
rootpath <- paste0(dirname(code_path), "/")
# data cleaning -----------------------------------------------------------
# Declare the type of every column up front so fread() does not have to guess
# them, which saves memory and accelerates loading.
colclass <- c(
  "SYMBOL" = "factor",
  "DATE" = "character",
  "TIME" = "character",
  "BID" = "double",
  "OFR" = "double",
  "BIDSIZ" = "integer",
  "OFRSIZ" = "integer",
  "MODE" = "integer",
  "EX" = "factor"
)
# read the data into memory
quotes <- fread(paste0(rootpath, "data/quotes-1994.csv"), colClasses = colclass)
# convert uppercase column names into lowercase
setnames(quotes, tolower(colnames(quotes)))
str(quotes)
# Check for NA entries column by column; this avoids materialising a full
# rows-by-columns logical matrix the way is.na(quotes) does.
print(any(vapply(quotes, anyNA, logical(1))))
# Check for empty entries. is.null(quotes) only tests the table object itself
# and is always FALSE, so look for empty strings in the character columns,
# where blank fields would otherwise go unnoticed.
print(any(vapply(
  quotes,
  function(column) is.character(column) && !all(nzchar(column)),
  logical(1)
)))
# fastPOSIXct -------------------------------------------------------------
# Lookup table of daylight-saving periods used by to_EST_tz(): one row per
# year (2010-2013), with the period start in column 1 and the period end in
# column 2. cbind() yields a plain numeric matrix, so the values are compared
# as seconds since the epoch.
# NOTE(review): the timestamps look like the UTC instants of the US DST
# transitions; only 2010-2013 are covered - confirm this matches the data.
EST_tz_data <- local({
  dst_begin <- fastPOSIXct(
    c("2010-03-14 07:00:00", "2011-03-13 07:00:00", "2012-03-11 07:00:00", "2013-03-10 07:00:00")
  )
  dst_finish <- fastPOSIXct(
    c("2010-11-07 06:00:00", "2011-11-06 06:00:00", "2012-11-04 06:00:00", "2013-11-03 06:00:00")
  )
  cbind(start = dst_begin, end = dst_finish)
})
# Shift timestamps by a fixed offset that depends on daylight-saving time.
#
# x: date-times (anything as.numeric() turns into seconds since the epoch).
# Returns a POSIXct vector: x + 4 hours for values strictly inside one of the
# periods listed in the global matrix EST_tz_data (column 1 = start,
# column 2 = end), and x + 5 hours for everything else.
to_EST_tz <- function(x) {
  secs <- as.numeric(x)
  in_dst <- rep(FALSE, length(secs))
  # Flag every value that falls strictly inside any listed DST period.
  for (period in seq_len(nrow(EST_tz_data))) {
    period_start <- EST_tz_data[period, 1]
    period_end <- EST_tz_data[period, 2]
    in_dst[which(secs > period_start & secs < period_end)] <- TRUE
  }
  # 14400 s = 4 hours (inside DST), 18000 s = 5 hours (outside DST).
  shifted <- ifelse(in_dst, secs + 14400, secs + 18000)
  structure(shifted, class = c("POSIXct", "POSIXt"))
}
# Example call for to_EST_tz(). `ch` (presumably a character vector of
# timestamps) is not defined anywhere in this script, so the call only runs
# when the user has created it; unguarded, the script would stop here with
# "object 'ch' not found".
if (exists("ch")) {
  to_EST_tz(fastPOSIXct(ch))
}
# POSIXct -----------------------------------------------------------------
# Build one timestamp per quote from the raw date ("%Y%m%d") and time strings.
# The clock values are tagged with the fixed-offset zone "EST"; parsing and
# formatting below use the same zone, so the clock times round-trip unchanged.
# (An earlier fastPOSIXct(..., format = ) attempt was dropped: fastPOSIXct()
# has no `format` argument, and its result was overwritten immediately.)
quotes[, datetime := as.POSIXct(paste(date, time),
                                format = "%Y%m%d %H:%M:%S", tz = "EST")]
# Re-derive date and time as normalised strings. The zone is passed
# explicitly: strftime() defaults to tz = "" (the machine's local zone) and
# ignores the column's "EST" attribute, which would shift the clock times -
# and therefore the trading-hours filter - on machines in another time zone.
quotes[, `:=`(date = format(datetime, "%Y-%m-%d", tz = "EST"),
              time = format(datetime, "%H:%M:%S", tz = "EST"))]
# only keep the quotes in normal trading time and mode which is not 0
quotes <- quotes[time >= "09:30:00" & time <= "16:00:00" & mode != 0, ]
# Check the sequence of timestamps: `false` is TRUE for a symbol when any
# quote is followed by an earlier one. na.rm = TRUE is needed because the
# lead shift ends in NA, which would otherwise turn "no violation" into NA.
quotes[, false := any(shift(datetime, type = "lead") < datetime, na.rm = TRUE),
       by = symbol]
# distinguish outliers ----------------------------------------------------
# A value is flagged as an outlier when its absolute deviation from the mean
# of x, divided by that mean, exceeds 1 (i.e. more than 100%).
# x: numeric vector. Returns a logical vector of the same length.
is.out <- function(x) {
  centre <- mean(x)
  relative_deviation <- abs(x - centre) / centre
  relative_deviation > 1
}
# Flag a quote when either its bid or its offer is an outlier within the same
# symbol and day.
quotes[, outliers := (is.out(bid) | is.out(ofr)), by = .(symbol, date)]
# have a look at all the outliers, you will see some of them are definitely
# errors but some are still not clear.
quotes[outliers == TRUE, ]
# Calculate the time duration of each quote: seconds until the next quote of
# the same symbol on the same day. The unit is fixed to seconds because plain
# POSIXct subtraction lets difftime pick the unit automatically per group,
# which could silently mix seconds and minutes across groups.
quotes[, duration := as.numeric(
  difftime(shift(datetime, type = "lead"), datetime, units = "secs")
), by = .(symbol, date)]
# The last quote of each group has no successor; give it a duration of 0.
quotes[is.na(duration), duration := 0]