# gjw_converter.py - convert the nilm_gjw_data CSV dumps to a nilmtk HDF5 datastore
from __future__ import print_function, division
import pandas as pd
import numpy as np
import os
from os.path import join
import fnmatch
import re
from nilmtk.utils import get_datastore, check_directory_exists
from nilmtk.datastore import Key
from nilmtk.measurement import LEVEL_NAMES
from nilm_metadata import convert_yaml_to_hdf5

column_mapping = {
    'frequency': ('frequency', ''),
    'voltage': ('voltage', ''),
    'W': ('power', 'active'),
    'active': ('power', 'active'),
    'energy': ('energy', 'apparent'),
    'A': ('current', ''),
    'reactive_power': ('power', 'reactive'),
    'apparent_power': ('power', 'apparent'),
    'power_factor': ('pf', ''),
    'PF': ('pf', ''),
    'phase_angle': ('phi', ''),
    'VA': ('power', 'apparent'),
    'VAR': ('power', 'reactive'),
    'reactive': ('power', 'reactive'),
    'VLN': ('voltage', ''),
    'V': ('voltage', ''),
    'f': ('frequency', '')
}
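# Note: this converter itself only produces 'active' and 'reactive' columns;
# the other entries cover further gjw channel names. Each tuple supplies the
# two column levels nilmtk expects (LEVEL_NAMES is ('physical_quantity',
# 'type')), applied in _prepare_data_for_toolkit() below.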
# data for file name manipulation
TYPE_A = "active"
TYPE_R = "reactive"
filename_prefix_mapping = {
    TYPE_A: '4-POWER_REAL_FINE ',
    TYPE_R: '5-POWER_REACTIVE_STANDARD '
}
filename_suffix_mapping = {
    TYPE_A: ' Dump',
    TYPE_R: ' Dump'
}
# DataFrame column names
TIMESTAMP_COLUMN_NAME = "timestamp"
ACTIVE_COLUMN_NAME = "active"
REACTIVE_COLUMN_NAME = "reactive"
type_column_mapping = {
    TYPE_A: ACTIVE_COLUMN_NAME,
    TYPE_R: REACTIVE_COLUMN_NAME
}
TIMEZONE = "Europe/London"  # local time zone
home_dir = '/Users/GJWood/nilm_gjw_data'  # default path to the input data
# regular expression matching
bld_re = re.compile(r'building\d+')  # used to pull the building name from the directory path
bld_nbr_re = re.compile(r'\d+')  # used to pull the building number from the name
iso_date_re = re.compile(r'\d{4}-\d{2}-\d{2}')  # used to pull the date from the file name
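# For example, given a directory such as
# '/Users/GJWood/nilm_gjw_data/building1/elec' (illustrative), bld_re matches
# 'building1', bld_nbr_re then extracts '1' from that name, and iso_date_re
# pulls the 'YYYY-MM-DD' date out of each dump file name.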

def convert_gjw(gjw_path, output_filename):
    """
    Parameters
    ----------
    gjw_path : str
        The root path of the gjw dataset; defaults to home_dir if None.
    output_filename : str
        The destination filename (including path and suffix);
        defaults to <home_dir>/HDF5/nilm_gjw_data.hdf5 if None.

    Expected directory and file structure:

    nilm_gjw_data
        building<1>
            elec
                4-POWER_REAL_FINE <date> Dump.csv
                5-POWER_REACTIVE_STANDARD <date> Dump.csv
                ...
        ...
        building<n>
        HDF5
            nilm_gjw_data.hdf5
        metadata
            building1.yaml
            dataset.yaml
            meter_devices.yaml
        other files
    """
    if gjw_path is None:
        gjw_path = home_dir
    check_directory_exists(gjw_path)
    os.chdir(gjw_path)
    gjw_path = os.getcwd()  # sort out potential issues with slashes or backslashes
    if output_filename is None:
        output_filename = join(home_dir, 'HDF5', 'nilm_gjw_data.hdf5')
    # open the data store
    print('opening datastore', output_filename)
    store = get_datastore(output_filename, 'HDF', mode='w')
    # walk the directory tree from the dataset home directory
    # start with an empty dataframe holding only the column headers
    df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME, REACTIVE_COLUMN_NAME])
    found = False
    for current_dir, _, files in os.walk(gjw_path):
        # the unused second element is the list of subdirectories
        if current_dir.find('.git') != -1 or current_dir.find('.ipynb') != -1:
            # print('Skipping', current_dir)
            continue
        print('checking', current_dir)
        m = bld_re.search(current_dir)
        if m:  # the csv files may be further down the tree, so this section may be repeated
            building_name = m.group()
            building_nbr = int(bld_nbr_re.search(building_name).group())
            meter_nbr = 1
            key = Key(building=building_nbr, meter=meter_nbr)
            for item in fnmatch.filter(files, "4*.csv"):
                # process any .csv files found
                found = True
                ds = iso_date_re.search(item).group()
                # print('found files for date:', ds, end=" ")
                df1 = _read_file_pair(current_dir, ds)  # read a pair of csv files into a dataframe
                df = pd.concat([df, df1])  # concatenate the results into one long dataframe
            if found:
                found = False
                df = _prepare_data_for_toolkit(df)
                _summarise_dataframe(df, 'Prepared for tool kit')
                store.put(str(key), df)
                # clear the dataframe and re-add the column headers
                # df = pd.DataFrame(columns=[ACTIVE_COLUMN_NAME, REACTIVE_COLUMN_NAME])
                break  # only 1 folder with .csv files at present
    store.close()
    convert_yaml_to_hdf5(join(gjw_path, 'metadata'), output_filename)
    print("Done converting gjw to HDF5!")

def _read_and_standardise_file(cdir, ds, mtype):
    """
    parameters
    cdir - the directory path where the files may be found
    ds - the date string which identifies the pair of files
    mtype - the type of data to be read (TYPE_A or TYPE_R)

    The filename is constructed using the appropriate prefix and suffix.
    The data is then read, de-duplicated, converted to the correct time zone,
    indexed as a time series and resampled to one reading per second.
    """
    fn = filename_prefix_mapping[mtype] + ds + filename_suffix_mapping[mtype] + '.csv'
    ffn = join(cdir, fn)
    df = pd.read_csv(ffn, names=[TIMESTAMP_COLUMN_NAME, type_column_mapping[mtype]])
    df.drop_duplicates(subset=[TIMESTAMP_COLUMN_NAME], inplace=True)  # remove duplicate rows with the same timestamp
    df.index = pd.to_datetime(df.timestamp.values, unit='s', utc=True)  # make the index time based
    df = df.tz_convert(TIMEZONE)  # deal with summer time etc. for the London time zone
    # resample each file individually, as there may be gaps between dumps
    df = df.resample('S').ffill()  # make sure we have a reading for every second
    # resampling removes the timestamp column, so put it back
    df[TIMESTAMP_COLUMN_NAME] = df.index
    df.drop_duplicates(subset=[TIMESTAMP_COLUMN_NAME], inplace=True)
    return df
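
# The raw dump files are assumed (from the read_csv call above) to be
# headerless two-column csv: epoch seconds, then the reading, e.g.
# (values illustrative):
#     1433116800,1250.0
#     1433116801,1248.5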

def _read_file_pair(cdir, ds):
    """
    parameters
    cdir - the directory path where the files may be found
    ds - the date string which identifies the pair of files

    The files are processed individually, then the columns are merged on matching timestamps.
    """
    df1 = _read_and_standardise_file(cdir, ds, TYPE_A)
    # _summarise_dataframe(df1, 'read file: ' + TYPE_A)
    df2 = _read_and_standardise_file(cdir, ds, TYPE_R)
    # _summarise_dataframe(df2, 'read file: ' + TYPE_R)
    df3 = pd.merge(df1, df2, on=TIMESTAMP_COLUMN_NAME, how='outer')  # merge the two column types into one frame
    df3.fillna(value=0, inplace=True)  # the reactive sequence may be missing its initial entries
    # _summarise_dataframe(df3, 'return from merge and fillna')
    first_ts = pd.Timestamp(df3[TIMESTAMP_COLUMN_NAME].iloc[0])
    last_ts = pd.Timestamp(df3[TIMESTAMP_COLUMN_NAME].iloc[-1])
    print(first_ts, "to", last_ts)  # show the first and last entries
    return df3
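
# The outer merge keeps every timestamp seen in either file; where one channel
# has no reading for a given second (typically the start of the reactive dump)
# the resulting NaN is replaced with 0 by the fillna() above.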

def _prepare_data_for_toolkit(df):
    # remove any duplicate timestamps between files
    df.drop_duplicates(subset=[TIMESTAMP_COLUMN_NAME], inplace=True)
    df.index = pd.to_datetime(df[TIMESTAMP_COLUMN_NAME].values, utc=True)  # the column already holds timestamps here
    df = df.tz_convert(TIMEZONE)  # deal with summer time etc. for the London time zone
    df = df.drop(columns=TIMESTAMP_COLUMN_NAME)  # remove the timestamp column
    df.rename(columns=lambda x: column_mapping[x], inplace=True)  # rename from gjw headers to the nilmtk controlled vocabulary
    df.columns.set_names(LEVEL_NAMES, inplace=True)  # needed for column levelling (all converters need this line)
    df = df.apply(pd.to_numeric, errors='coerce')  # make sure everything is numeric
    df = df.dropna()  # drop rows with empty cells
    df = df.astype(np.float32)  # change float64 (the default) to float32
    df = df.sort_index()  # ensure the time series index is sorted
    return df
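
# After this step the frame carries a two-level column index, e.g.
#     ('power', 'active'), ('power', 'reactive')
# with level names ('physical_quantity', 'type'), which is the shape
# store.put() expects for a nilmtk elec meter table.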

def _summarise_dataframe(df, loc):
    print(df.head(4))
    print("...", len(df.index), "rows", loc)
    print(df.tail(4))

def main():
    convert_gjw('c:/Users/GJWood/nilm_gjw_data', None)


if __name__ == '__main__':
    main()