# gjw_converter.py - convert the nilm_gjw_data CSV dumps to a nilmtk HDF5 datastore
from __future__ import print_function, division
import pandas as pd
import numpy as np
import os
from os.path import join
import fnmatch
import re
from nilmtk.utils import get_datastore, check_directory_exists
from nilmtk.datastore import Key
from nilmtk.measurement import LEVEL_NAMES
from nilm_metadata import convert_yaml_to_hdf5

column_mapping = {
    'frequency': ('frequency', ''),
    'voltage': ('voltage', ''),
    'W': ('power', 'active'),
    'active': ('power', 'active'),
    'energy': ('energy', 'apparent'),
    'A': ('current', ''),
    'reactive_power': ('power', 'reactive'),
    'apparent_power': ('power', 'apparent'),
    'power_factor': ('pf', ''),
    'PF': ('pf', ''),
    'phase_angle': ('phi', ''),
    'VA': ('power', 'apparent'),
    'VAR': ('power', 'reactive'),
    'reactive': ('power', 'reactive'),
    'VLN': ('voltage', ''),
    'V': ('voltage', ''),
    'f': ('frequency', '')
}
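# Note: this converter itself only produces 'active' and 'reactive' columns;
# the other entries cover further gjw channel names. Each tuple supplies the
# two column levels nilmtk expects (LEVEL_NAMES is ('physical_quantity',
# 'type')), applied in _prepare_data_for_toolkit() below.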
# data for file name manipulation
TYPE_A = "active"
TYPE_R = "reactive"
filename_prefix_mapping = {
    TYPE_A: '4-POWER_REAL_FINE ',
    TYPE_R: '5-POWER_REACTIVE_STANDARD '
}
filename_suffix_mapping = {
    TYPE_A: ' Dump',
    TYPE_R: ' Dump'
}
# DataFrame column names
TIMESTAMP_COLUMN_NAME = "timestamp"
ACTIVE_COLUMN_NAME = "active"
REACTIVE_COLUMN_NAME = "reactive"
type_column_mapping = {
    TYPE_A: ACTIVE_COLUMN_NAME,
    TYPE_R: REACTIVE_COLUMN_NAME
}
TIMEZONE = "Europe/London"  # local time zone
home_dir = '/Users/GJWood/nilm_gjw_data'  # default path to the input data
# regular expression matching
bld_re = re.compile(r'building\d+')  # used to pull the building name from the directory path
bld_nbr_re = re.compile(r'\d+')  # used to pull the building number from the name
iso_date_re = re.compile(r'\d{4}-\d{2}-\d{2}')  # used to pull the date from the file name
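# For example, given a directory such as
# '/Users/GJWood/nilm_gjw_data/building1/elec' (illustrative), bld_re matches
# 'building1', bld_nbr_re then extracts '1' from that name, and iso_date_re
# pulls the 'YYYY-MM-DD' date out of each dump file name.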

def convert_gjw(gjw_path, output_filename):
    """
    Parameters
    ----------
    gjw_path : str
        The root path of the gjw dataset; defaults to home_dir if None.
    output_filename : str
        The destination filename (including path and suffix);
        defaults to <home_dir>/HDF5/nilm_gjw_data.hdf5 if None.

    Expected directory and file structure:

    nilm_gjw_data
        building<1>
            elec
                4-POWER_REAL_FINE <date> Dump.csv
                5-POWER_REACTIVE_STANDARD <date> Dump.csv
                ...
        ...
        building<n>
        HDF5
            nilm_gjw_data.hdf5
        metadata
            building1.yaml
            dataset.yaml
            meter_devices.yaml
        other files
    """
    if gjw_path is None:
        gjw_path = home_dir
    check_directory_exists(gjw_path)
    os.chdir(gjw_path)
    gjw_path = os.getcwd()  # sort out potential issues with slashes or backslashes
    if output_filename is None:
        output_filename = join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5')
    # open the data store
    print('opening datastore', output_filename)
    store = get_datastore(output_filename, 'HDF', mode='w')
    # walk the directory tree from the dataset home directory
    # start with an empty dataframe holding only the column headers
    df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME, REACTIVE_COLUMN_NAME])
    found = False
    for current_dir, _, files in os.walk(gjw_path):
        # the unused second element is the list of subdirectories
        if current_dir.find('.git') != -1 or current_dir.find('.ipynb') != -1:
            # print('Skipping', current_dir)
            continue
        print('checking', current_dir)
        m = bld_re.search(current_dir)
        if m:  # the csv files may be further down the tree, so this section may be repeated
            building_name = m.group()
            building_nbr = int(bld_nbr_re.search(building_name).group())
            meter_nbr = 1
            key = Key(building=building_nbr, meter=meter_nbr)
            for item in fnmatch.filter(files, "4*.csv"):
                # process any .csv files found
                found = True
                ds = iso_date_re.search(item).group()
                # print('found files for date:', ds, end=" ")
                df1 = _read_file_pair(current_dir, ds)  # read a pair of csv files into a dataframe
                df = pd.concat([df, df1])  # concatenate the results into one long dataframe
            if found:
                found = False
                df = _prepare_data_for_toolkit(df)
                _summarise_dataframe(df, 'Prepared for tool kit')
                store.put(str(key), df)
                # clear the dataframe and re-add the column headers
                # df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME, REACTIVE_COLUMN_NAME])
                break  # only 1 folder with .csv files at present
    store.close()
    convert_yaml_to_hdf5(join(gjw_path, 'metadata'), output_filename)
    print("Done converting gjw to HDF5!")

def _read_and_standardise_file(cdir, ds, mtype):
    """
    parameters
    cdir - the directory path where the files may be found
    ds - the date string which identifies the pair of files
    mtype - the type of data to be read (TYPE_A or TYPE_R)

    The filename is constructed using the appropriate prefix and suffix.
    The data is then read, de-duplicated, converted to the correct time zone,
    indexed as a time series and resampled to one reading per second.
    """
    fn = filename_prefix_mapping[mtype] + ds + filename_suffix_mapping[mtype] + '.csv'
    ffn = join(cdir, fn)
    df = pd.read_csv(ffn, names=[TIMESTAMP_COLUMN_NAME, type_column_mapping[mtype]])
    df.drop_duplicates(subset=[TIMESTAMP_COLUMN_NAME], inplace=True)  # remove duplicate rows with the same timestamp
    df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True)  # make the index time based
    df = df.tz_convert(TIMEZONE)  # deal with summer time etc. for the London time zone
    # resample each file individually, as there may be gaps between dumps
    df = df.resample('S').ffill()  # make sure we have a reading for every second
    # resampling removes the timestamp column, so put it back
    df[TIMESTAMP_COLUMN_NAME] = df.index
    df.drop_duplicates(subset=[TIMESTAMP_COLUMN_NAME], inplace=True)
    return df
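
# The raw dump files are assumed (from the read_csv call above) to be
# headerless two-column csv: epoch seconds, then the reading, e.g.
# (values illustrative):
#     1433116800,1250.0
#     1433116801,1248.5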

def _read_file_pair(cdir, ds):
    """
    parameters
    cdir - the directory path where the files may be found
    ds - the date string which identifies the pair of files

    The files are processed individually, then the columns are merged on matching timestamps.
    """
    df1 = _read_and_standardise_file(cdir, ds, TYPE_A)
    # _summarise_dataframe(df1, 'read file: ' + TYPE_A)
    df2 = _read_and_standardise_file(cdir, ds, TYPE_R)
    # _summarise_dataframe(df2, 'read file: ' + TYPE_R)
    df3 = pd.merge(df1, df2, on=TIMESTAMP_COLUMN_NAME, how='outer')  # merge the two column types into one frame
    df3.fillna(value=0, inplace=True)  # the reactive sequence may be missing its initial entries
    # _summarise_dataframe(df3, 'return from merge and fillna')
    first_ts = pd.Timestamp(df3[TIMESTAMP_COLUMN_NAME].iloc[0])
    last_ts = pd.Timestamp(df3[TIMESTAMP_COLUMN_NAME].iloc[-1])
    print(first_ts, "to", last_ts)  # show the first and last entries
    return df3
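
# The outer merge keeps every timestamp seen in either file; where one channel
# has no reading for a given second (typically the start of the reactive dump)
# the resulting NaN is replaced with 0 by the fillna() above.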

def _prepare_data_for_toolkit(df):
    # remove any duplicate timestamps between files
    df.drop_duplicates(subset=[TIMESTAMP_COLUMN_NAME], inplace=True)
    df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME].values, utc=True)  # the column already holds timestamps here
    df = df.tz_convert(TIMEZONE)  # deal with summer time etc. for the London time zone
    df = df.drop(columns=TIMESTAMP_COLUMN_NAME)  # remove the timestamp column
    df.rename(columns=lambda x: column_mapping[x], inplace=True)  # rename from gjw headers to the nilmtk controlled vocabulary
    df.columns.set_names(LEVEL_NAMES, inplace=True)  # needed for column levelling (all converters need this line)
    df = df.apply(pd.to_numeric, errors='coerce')  # make sure everything is numeric
    df = df.dropna()  # drop rows with empty cells
    df = df.astype(np.float32)  # change float64 (the default) to float32
    df = df.sort_index()  # ensure the time series index is sorted
    return df
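
# After this step the frame carries a two-level column index, e.g.
#     ('power', 'active'), ('power', 'reactive')
# with level names ('physical_quantity', 'type'), which is the shape
# store.put() expects for a nilmtk elec meter table.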

def _summarise_dataframe(df, loc):
    print(df.head(4))
    print("...", len(df.index), "rows", loc)
    print(df.tail(4))

def main():
    convert_gjw('c:/Users/GJWood/nilm_gjw_data', None)


if __name__ == '__main__':
    main()