# ADS_Capstone_model_evaluation_Aux.py

# Advanced Data Science - Capstone Project

## Functions Definitions - To Help in the Model Evaluation

### Carlos Granados

# Standard libraries:
import numpy as np
from numpy.random import seed
from math import ceil

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd

from keras.callbacks import Callback
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from ADS_Capstone_model_train_Aux import customNorm, create_trimmed_data_norm

#### Some Functions used in the Model Evaluation:


# Generate an array with random numbers and transform it to a DF
def randDF(nrows, amin=0.75, amax=1.25, nmax=10):
    """
    Build a one-column DataFrame of random scaling factors.

    Every factor equals ``amin + k * (amax - amin) / nmax`` where ``k`` is an
    integer drawn with ``np.random.randint`` from 0 to nmax (inclusive), so
    the factors lie on a regular grid between amin and amax.

    nrows : (int) number of factors (rows) to generate
    amin  : (float) smallest possible factor
    amax  : (float) largest possible factor
    nmax  : (int) number of grid steps between amin and amax
    To Return :
    DataFrame with the single column "alpha" and a default integer index
    """
    step = (amax - amin) / nmax
    # Same draw as a (1, nrows) block; flattened so it maps onto one column
    draws = np.random.randint(nmax + 1, size=(1, nrows)).ravel()
    return pd.DataFrame({"alpha": amin + step * draws})

def locDay(nrows, t0, t1):
    """
    Pick one day at random inside a block of rows and return its row range.

    The block is treated as a sequence of consecutive days, each one holding
    ``dt = int((t1 - t0) / 0.25)`` rows (0.25 is the step between two rows in
    TIME units -- presumably 15 minutes, TODO confirm against the data).

    nrows : (int) number of rows in the block
    t0    : (float) start of the daily time window
    t1    : (float) end of the daily time window
    To Return :
    [nmin, nmax] : positional row range (nmax excluded) of the selected day
    Raises :
    ValueError if the time window holds less than one row (t1 - t0 < 0.25)
    """
    dt = int((t1 - t0) / 0.25)
    if dt <= 0:
        raise ValueError(
            'Time window [{}, {}) is shorter than one 0.25 step'.format(t0, t1))
    # At least one (possibly incomplete) day, so a short block is still usable
    nDays = max(1, int(nrows / dt))
    # Days are numbered 1..nDays; every day, including the last one, is
    # equally likely (randint excludes the upper bound, hence the +1)
    dSel = int(np.random.randint(1, nDays + 1))
    nmin, nmax = (dSel - 1) * dt, dSel * dt
    return [nmin, nmax]

def selDay(df, nMonth, t_min, t_max):
    """
    Extract the rows of one randomly chosen day of a given month.

    Rows are first restricted to the month nMonth and to the time window
    [t_min, t_max); locDay then picks the row range of one day at random.

    df     : (DF) data with the columns 'MONTH', 'TIME' and 'DAY'
    nMonth : (int) month to select
    t_min  : (float) start of the time window (included)
    t_max  : (float) end of the time window (excluded)
    To Return :
    [nday, temp] : first 'DAY' value found in the selection and the selected
                   rows, re-indexed from 0
    Raises :
    ValueError if no row matches the month and the time window
    """
    mask = (df['MONTH'] == nMonth) & (df['TIME'] >= t_min) & (df['TIME'] < t_max)
    temp = df.loc[mask]
    if temp.empty:
        raise ValueError(
            'No data for month {} in the time window [{}, {})'.format(
                nMonth, t_min, t_max))
    nmin, nmax = locDay(temp.shape[0], t_min, t_max)
    # reset_index returns a new frame, so the caller's data is never touched
    temp = temp.iloc[nmin:nmax].reset_index(drop=True)
    nday = temp['DAY'].unique()[0]
    return [nday, temp]

def selTime(df, nMonth, nDay, t_min, t_max):
    """
    Extract the rows of a given month and day inside a time window.

    df     : (DF) data with the columns 'MONTH', 'DAY' and 'TIME'
    nMonth : (int) month to select
    nDay   : (int) day to select
    t_min  : (float) start of the time window (included)
    t_max  : (float) end of the time window (excluded)
    To Return :
    New DF with the matching rows, re-indexed from 0. The input is not
    modified and all its columns are kept.
    """
    mask = ((df['MONTH'] == nMonth) & (df['DAY'] == nDay) &
            (df['TIME'] >= t_min) & (df['TIME'] < t_max))
    # drop=True discards the old index directly; this also works when the
    # index is named or when the data already has a column called 'index'
    return df.loc[mask].reset_index(drop=True)

def df_mult(x0, df, nMonth, nDay, t0):
    """
    Scale the power columns of a DF by a constant factor.

    The DF is modified in place (no copy is made) and the very same object
    is returned; pass a copy if the original values are still needed.

    x0     : (float) factor applied to 'DC_POWER' and 'AC_POWER'
    df     : (DF) data to modify; its index is reset to 0..n-1
    nMonth : not used
    nDay   : not used
    t0     : not used
    To Return :
    The modified DF (same object as df)
    """
    for col in ('DC_POWER', 'AC_POWER'):
        df.loc[:, col] = df[col].mul(x0)
    df.reset_index(drop=True, inplace=True)
    return df

def df_mult_vec(df, nMonth, nDay, t0):
    """
    Scale the power columns of a DF row by row with random factors.

    One random factor per row is generated with randDF (default range) and
    the same factor is applied to 'DC_POWER' and 'AC_POWER' of that row.
    The DF is modified in place and the same object is returned.

    df     : (DF) data to modify; its index is reset to 0..n-1
    nMonth : not used
    nDay   : not used
    t0     : not used
    To Return :
    The modified DF (same object as df)
    """
    # The index must be 0..n-1 so that it lines up with the random factors
    df.reset_index(drop=True, inplace=True)
    alpha = randDF(df.shape[0])['alpha']
    for col in ('DC_POWER', 'AC_POWER'):
        df.loc[:, col] = df[col].mul(alpha, axis='index')
    return df

def saveDF(df_all, df, i):
    """
    Store df in the container df_all under the key i.

    df_all : (dict) container, modified in place
    df     : object to store
    i      : (int) key used to store df
    To Return :
    The next free key (i + 1)
    """
    df_all[i] = df
    return i + 1


def genInterval(icount, dfDict, df, num_df, t0, nDay, nMonth, x0, verbose):
    """
    Function to generate sets of DFs, used to evaluate a trained model.

    For each of the num_df rounds a day is drawn at random between the first
    day of df[0] and that day + nDay (both included). The first DF in df
    whose smallest 'DAY' is the drawn day and whose smallest 'TIME' is t0
    becomes the reference; if none matches, the DF at position
    int(0.30 * len(df)) is used instead. Four DFs are then stored, in order:
    1. a copy of the reference DF ("Normal"),
    2. the reference DF with the power columns multiplied by x0 ("Factor"),
    3. the DF following the reference, multiplied by x0 ("Factor"),
    4. the second DF after the reference, multiplied row by row by random
       factors from randDF ("Random").
    The DFs at positions reference + 1 and reference + 2 must therefore
    exist in df. The DFs in df itself are never modified (copies are used).

    icount : (int) first free key in dfDict
    dfDict : (dict of DFs) stores all generated DFs, modified in place
    df     : (dict/list of DFs, keys 0..len-1) DFs to pick from
    num_df : (int) number of rounds; 4 DFs are stored per round
    t0     : (float) starting time (0.0 - 23.99) of the wanted DF
    nDay   : (int) width of the range of days to draw from
    nMonth : (int) selected month (only forwarded to the helper functions)
    x0     : (float) constant factor, used to modify the power output
    verbose: (int) print (1) or not (0) a description of each stored DF
    To Return :
    [icount, dfDict] : next free key and the filled dict
    """
    def _unchanged(frame):
        return frame

    def _scaled(frame):
        return df_mult(x0, frame, nMonth, nDay, t0)

    def _randomized(frame):
        return df_mult_vec(frame, nMonth, nDay, t0)

    # (offset from the reference DF, label printed if verbose, modification)
    recipe = (
        (0, 'Normal', _unchanged),
        (0, 'Factor', _scaled),
        (1, 'Factor', _scaled),
        (2, 'Random', _randomized),
    )

    day_lo = df[0]['DAY'].min()
    day_hi = day_lo + nDay
    n_frames = len(df)
    fallback = int(n_frames * 0.30)

    for _ in range(int(num_df)):
        target_day = np.random.randint(day_lo, day_hi + 1)
        # Reference DF: first one starting at t0 on the drawn day
        start = fallback
        for pos in range(n_frames):
            candidate = df[pos]
            first_day = candidate['DAY'].min()
            first_time = candidate['TIME'].min()
            if first_day == target_day and first_time == t0:
                start = pos
                break
        # Each DF is copied, modified, stored and (optionally) printed before
        # the next one is touched
        for offset, tag, transform in recipe:
            frame = transform(df[start + offset].copy(deep=True))
            icount = saveDF(dfDict, frame, icount)
            if verbose == 1:
                print(tag)
                print(frame.describe())
    return [icount, dfDict]

def genTest(df_train, df_eval, t_min, nDay, nMonth, x0=0.90, n0=1, n1=1, verbose=0):
    """
    Function to generate a dict of test DFs out of two existing collections.

    genInterval is applied first to the training collection and then to the
    evaluation collection; all generated DFs end up in one dict.

    df_train: collection of DFs used during the training
    df_eval : collection of DFs coming from a different source
    t_min   : (float) starting time
    nDay    : (int) width of the range of days to draw from
    nMonth  : (int) given date (month) to extract the data
    x0      : (float) factor used to scale the power columns
    n0      : (int) number of rounds run on df_train
    n1      : (int) number of rounds run on df_eval
    verbose : (int) 1 to print the description of each generated DF
    To Return :
    df_res  : dict with the generated DFs, with consecutive integer keys
              starting at 0 (training DFs first)
    """
    df_res = {}
    next_key = 0
    for source, n_rounds in ((df_train, n0), (df_eval, n1)):
        next_key, df_res = genInterval(next_key, df_res, source, n_rounds,
                                       t_min, nDay, nMonth, x0, verbose)
    return df_res

# Function to transform a data set to a DF, with the used labels
def npToDF(data):
    """
    Function to transform a np array to a DF, with the used columns.

    data : 2D data with 8 columns, in the order TIME, DAY, MONTH, AMB_TEMP,
           MOD_TEMP, IRRADIATION, AC_POWER, DC_POWER
    To Return :
    DF with the 8 named columns, preceded by an 'index' column holding the
    original row labels (added by reset_index)
    """
    labels = ['TIME', 'DAY', 'MONTH', 'AMB_TEMP', 'MOD_TEMP',
              'IRRADIATION', 'AC_POWER', 'DC_POWER']
    return pd.DataFrame(data, columns=labels).reset_index()

# Function to calculate the loss
def lossCalc(x, x_pred, lossFun='mse', l_min=0.1):
    """
    Function to calculate the loss value of every row and flag anomalies.

    x       : (array) original values, one sample per row (8 columns, as
              expected by npToDF)
    x_pred  : (array) predicted values, same shape as x
    lossFun : (str) name of the loss function: 'mse' (mean squared error),
              'msle' (mean squared logarithmic error); any other value
              gives the mean absolute error
    l_min   : (float) threshold above which a loss is flagged as anomaly
    To Return :
    DF with one row per sample and the columns 'Loss', 'Threshold' (l_min)
    and 'Anomaly' (True if Loss > Threshold)
    """
    # The DF is only needed for its row index
    row_index = npToDF(x).index
    if lossFun == 'mse':
        residual = np.square(x - x_pred)
    elif lossFun == 'msle':
        residual = np.square(np.log(x + 1.0) - np.log(x_pred + 1.0))
    else:
        residual = np.abs(x - x_pred)
    per_row = np.mean(residual, axis=1)
    score = pd.DataFrame(index=row_index).assign(Loss=per_row, Threshold=l_min)
    score['Anomaly'] = score['Loss'] > score['Threshold']
    return score

# Predict function
def predData(df, time_steps, dataTrimming, scaler, model):
    """
    Function to predict values with a trained model.

    The data is prepared with dataTrimming, passed to model.predict, and
    both the prepared data and the prediction are brought back to the 2D
    shape of df (rows x columns).

    df           : data to predict; must expose .shape (rows, ..., columns)
    time_steps   : (int) time steps used in the model training
    dataTrimming : function(df, time_steps, scaler) returning the array
                   given to the model
    scaler       : passed unchanged to dataTrimming
    model        : trained model; model.predict(data, verbose=1) is called
    To Return :
    [data, data_pred] : prepared input and prediction, both of shape
                        (df.shape[0], df.shape[-1])
    Raises :
    ValueError if the prepared data or the prediction does not hold exactly
    rows x columns values
    """
    n, dim = df.shape[0], df.shape[-1]
    data = dataTrimming(df, time_steps, scaler)
    data_pred = model.predict(data, verbose=1)
    # reshape instead of assigning to .shape: the in-place form is
    # discouraged by NumPy and fails on arrays that cannot be reshaped
    # without a copy (e.g. non C-contiguous ones)
    data = np.asarray(data).reshape(n, dim)
    data_pred = np.asarray(data_pred).reshape(n, dim)
    return [data, data_pred]
    
# Function to extract the loss
def scoreAll(df_all, time_steps, scaler, model, lossFun='mse', l_min=0.1):
    """
    Function to print the loss statistics of a model for several DFs.

    For every DF the model prediction is computed (data prepared with
    create_trimmed_data_norm) and the description of the row losses is
    printed. Nothing is returned.

    df_all     : (dict of DFs). All DFs to be evaluated. Each key must be the index...
    time_steps : (int) time steps used in the model training
    scaler     : scaler passed on to the data preparation
    model      : trained model
    lossFun    : (str) name of the loss function, see lossCalc
    l_min      : (float) anomaly threshold, see lossCalc
    """
    for key, frame in df_all.items():
        print('df : {}'.format(key))
        actual, predicted = predData(frame, time_steps,
                                     create_trimmed_data_norm, scaler, model)
        score = lossCalc(actual, predicted, lossFun=lossFun, l_min=l_min)
        print('Loss : {}'.format(score['Loss'].describe()))
        print('---------------------------\n')

# Modification to the Callbacks, in order to save the losses and the accuracy

# Module-wide record of every loss value reported through handleLoss
losses = []

def handleLoss(loss):
    """
    Record a loss value in the module-level list `losses` and print it.

    loss : value to record (stored as given)
    """
    losses.append(loss)
    print(loss)

class LossHistory(Callback):
    """
    Keras callback that records the loss and the accuracy after every batch.

    losses : list with the 'loss' entry of the logs of each batch
    acc    : list with the 'accuracy' entry of the logs of each batch
             (None if the entry is not present in the logs)
    Each loss is also forwarded to handleLoss.
    """

    def on_train_begin(self, logs=None):
        # Start with empty histories at the beginning of every training run
        self.losses = []
        self.acc = []

    def on_batch_end(self, batch, logs=None):
        # logs may be None; a fresh dict avoids sharing a mutable default
        logs = logs or {}
        loss = logs.get('loss')
        self.losses.append(loss)
        self.acc.append(logs.get('accuracy'))
        handleLoss(loss)

# Function to generate a prediction of the model, and return its score...

def predictModel(df, model, dataTrimming, scaler, time_steps, batch_size,
                 lossFun='mse', l_min=0.1):
    """
    Function to generate a prediction of a trained model and return its score.

    df           : (df), contains the data to be predicted
    model        : trained model
    dataTrimming : function(df, time_steps, scaler) used to reshape (and
                   normalize, if needed) the data set; if it is not
                   callable (e.g. None), create_trimmed_data_norm is used
    scaler       : scaler passed on to dataTrimming
    time_steps   : (int) time steps used in the model training
    batch_size   : (int) not used
    lossFun      : (str) name of the loss function, see lossCalc
    l_min        : (float) anomaly threshold, see lossCalc
    To Return :
    DF with the columns 'Loss', 'Threshold' and 'Anomaly' (see lossCalc)
    """
    # Honour the function given by the caller; fall back to the default
    # preparation when nothing usable was passed
    trimmer = dataTrimming if callable(dataTrimming) else create_trimmed_data_norm
    data, data_pred = predData(df, time_steps, trimmer, scaler, model)
    return lossCalc(data, data_pred, lossFun=lossFun, l_min=l_min)

# Function to plot the Losses (and generate a color palette)
# Colors
# Named colors (hex codes) shared by all plots
ggrey = '#424242'     # grey 900
gred = '#E53935'      # red 600
gpurple = '#8E42AA'   # purple 600
gindigo = '#3949AB'   # indigo 600
gblue = '#1E88E5'     # blue 600
gcyan = '#00ACC1'     # cyan 600
gteal = '#00897B'     # teal 600
ggreen = '#43A047'    # green 600
gyellow = '#FDD835'   # yellow 600
gorange = '#F57C00'   # orange 700
gdorange = '#F4511E'  # deep orange 600
gbgrey = '#546E7A'    # blue grey 600
gdpurple = '#512DA8'  # deep purple 700

# Define color palette
def colorPalette(indx=0):
    """
    Function to define the color palette to be used for all plots.

    indx : (int) 0 for the short palette (5 colors), 1 for the long
           palette (12 colors)
    To Return :
    New list of hex color codes, or None for any other value of indx
    """
    if indx == 0:
        return [ggrey, gred, gblue, gteal, gbgrey]
    if indx == 1:
        return [ggrey, gred, gpurple, gindigo, gblue,
                gcyan, gteal, ggreen, gyellow, gdorange,
                gbgrey, gdpurple]
    return None

# Plot losses
def plotLosses(score, c_xmin=0.95, c_xmax=1.0, c_ymin=-1.5, c_ymax=1.5):
    """
    Function to plot the losses of several score DFs one after the other.

    The first entry of score is drawn as the 'Training' curve together with
    a horizontal line at its threshold. Every further entry is appended to
    the right and starts at the last value of the previous curve.

    score  : (dict of DFs) score DFs with the columns 'Loss' and
             'Threshold' (see lossCalc); the first entry is the training
             set, whatever its key
    c_xmin : (float) lower x limit, as a fraction of the number of points
    c_xmax : (float) upper x limit, as a fraction of the number of points
    c_ymin : (float) lower y limit, as a multiple of the largest loss
    c_ymax : (float) upper y limit, as a multiple of the largest loss
    """
    # One legend column for every 10 entries
    n_col = int(len(score) / 10.) + 1
    colors = colorPalette(indx=1)
    fig, ax = plt.subplots(figsize=(10, 5))
    # Number of points: one extra (connecting) point per entry
    n_max = sum(score[key].shape[0] + 1 for key in score)
    j = 1
    j_max = len(colors)
    y_max = 0.0
    # The training curve is recognised by its position, not by the key 0,
    # so dicts with other keys no longer fail on undefined variables
    for pos, key in enumerate(score):
        df = score[key]
        y = df['Loss'].to_numpy()
        y_max = max(y.max(), y_max)
        if pos == 0:
            n0 = len(y) + 1
            val = df['Threshold'].to_numpy()[0]
            ax.plot([1, n_max], [val, val], c=gorange, label='Threshold')
            ax.plot(range(1, n0), y, c=colors[0], label='Training')
            yi_old = y[-1]
        else:
            ni = df.shape[0]
            n1 = n0 + ni + 1
            xi = np.arange(n0, n1)
            # First point repeats the end of the previous curve
            yi = np.zeros(ni + 1)
            yi[0] = yi_old
            yi[1:ni+1] = y[:]
            yi_old = yi[-1]
            labi = "Entry : " + str(key)
            ax.plot(xi, yi, c=colors[j], label=labi)
            # Cycle through the palette, skipping the training color
            j += 1
            if j >= j_max:
                j = 1
            n0 = n1
    x_min = c_xmin * n_max
    x_max = c_xmax * n_max
    y_min = c_ymin * y_max
    y_max = c_ymax * y_max
    ax.legend(ncol=n_col)
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xlabel('epochs')
    ax.set_ylabel('Loss')
    plt.show()

def plotTimeLosses(df, score):
    """
    Function to plot the losses against the time, one figure per group of 4 DFs.

    The entries 0-3, 4-7, ... of df and score are drawn together; the title
    uses the month and day of the first DF of the group and the horizontal
    line marks the threshold of its score. A last group with fewer than 4
    entries is drawn with the entries available; nothing is drawn if df is
    empty.

    df    : (dict of DFs, keys 0..len-1) data with the columns 'TIME',
            'MONTH' and 'DAY'
    score : (dict of DFs, same keys) scores with the columns 'Loss' and
            'Threshold' (see lossCalc)
    """
    n_max = len(df)
    if n_max == 0:
        return
    colors = colorPalette()
    for i in range(0, n_max, 4):
        group = range(i, min(i + 4, n_max))
        time = [df[k]['TIME'] for k in group]
        loss = [score[k]['Loss'] for k in group]
        # Time range covered by the group (within 0 - 24)
        t_min = min(24.0, *(t.min() for t in time))
        t_max = max(0.0, *(t.max() for t in time))
        val = score[i]['Threshold'].min()
        nMonth = df[i]['MONTH'].unique()[0]
        nDay = df[i]['DAY'].unique()[0]
        strTitle = 'Evaluation Data for Month {0} and Day {1}'.format(nMonth, nDay)
        # Define figure
        fig, ax = plt.subplots(figsize=(10, 5))
        # This line is the anomaly threshold, so it is labelled as such
        ax.plot([t_min, t_max], [val, val], c=gorange, label='Threshold')
        for j, (t, l) in enumerate(zip(time, loss)):
            ax.plot(t, l, c=colors[j+1], label='Loss - MSE')
        ax.set_title(strTitle)
        ax.legend()
        ax.set_xlabel('Time')
        ax.set_ylabel('Loss')
        plt.show()


#-------------------------------------------------------------------------------


# Functions used to evaluate the machine learning model      

# Predict function
def predData_LR(df, pipe):
    """
    Function to predict the DC power with a fitted pipeline.

    df   : (DF) data with the columns 'TIME', 'DAY', 'MONTH', 'MOD_TEMP',
           'AMB_TEMP', 'IRRADIATION' (features) and 'DC_POWER' (target)
    pipe : fitted model; pipe.predict is called with the feature columns
    To Return :
    [y_test, y_hat] : target values as an array with one column, and the
                      output of pipe.predict
    """
    features = ['TIME', 'DAY', 'MONTH', 'MOD_TEMP', 'AMB_TEMP', 'IRRADIATION']
    target = ['DC_POWER']
    y_test = df[target].to_numpy()
    y_hat = pipe.predict(df[features])
    return [y_test, y_hat]

def predictModel_LR(df, pipe, cv=1):
    """
    Function to predict values and return the metrics of the prediction.

    df   : (DF) data to evaluate, see predData_LR
    pipe : fitted model
    cv   : (int) selects what is returned:
           1 -> R2, 2 -> [R2, MSE], 3 -> [R2, MSE, MAE]
    To Return :
    The selected metric(s); None for any other value of cv
    """
    actual, predicted = predData_LR(df, pipe)
    r2 = r2_score(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    if cv == 1:
        return r2
    if cv == 2:
        return [r2, mse]
    if cv == 3:
        return [r2, mse, mae]
    return None

# Function to check different score values
def scoreAll_LR(df_all, pipe):
    """
    Function to print the scores (R2, MSE, MAE) of a model for several DFs.

    The metrics of every DF are printed, followed by their mean over all
    DFs. Nothing is returned.

    df_all : (dict of DFs). All DFs to be evaluated. Each key must be the index...
    pipe   : trained model
    """
    # Without data there is no mean to compute (would divide by zero)
    if len(df_all) == 0:
        print('No data sets to evaluate')
        return
    sum_r2 = 0
    sum_mse = 0
    sum_mae = 0
    n_data = 0
    for i in df_all:
        print('df : {}'.format(i))
        df = df_all[i]
        [r2, mse, mae] = predictModel_LR(df, pipe, cv=3)
        sum_r2 += r2
        sum_mse += mse
        sum_mae += mae
        n_data += 1
        print('R2 : {}'.format(r2))
        print('MSE : {}'.format(mse))
        print('MAE : {}'.format(mae))
        print('---------------------------\n')
    print('Mean R2 : {}'.format(sum_r2/n_data))
    print('Mean MSE : {}'.format(sum_mse/n_data))
    print('MeanMAE : {}'.format(sum_mae/n_data))
    print('---------------------------\n')

# Plot losses
def plotR2(score, factor=0.85):
    """
    Function to plot the R2 scores of several data sets.

    The first entry of score holds the R2 values of the training and is
    drawn as the 'Training' curve, together with a horizontal threshold
    line at 0.75. Every further entry holds one R2 value; it is drawn as a
    dashed segment starting at the previous value.

    score  : (dict) first entry: sequence of R2 values (training); other
             entries: a single R2 value each
    factor : (float) the x axis starts at factor * (number of training
             values + 1)
    """
    # Number of entries, and define number of columns in the legend box...
    n_col = int(len(score) / 10.) + 1
    colors = colorPalette(indx=1)
    val = 0.75    # R2 threshold drawn as horizontal line
    fig, ax = plt.subplots(figsize=(10, 5))
    # Number of points: all values plus one connecting point per extra entry
    n_max = sum(np.array(score[key]).size for key in score)
    n_max += len(score) - 1
    j = 1
    j_max = len(colors)
    y_min, y_max = 0.0, 0.0
    # Defaults, so that an empty score or a score holding only the training
    # entry can still be drawn
    n_min = 0
    n1 = None
    # The training entry is recognised by its position, not by the key 0
    for pos, key in enumerate(score):
        if pos == 0:
            r2 = np.atleast_1d(np.array(score[key]))
            n0 = r2.size + 1
            n_min = int(factor * n0)
            ax.plot([1, n_max], [val, val], c=gorange, label='Threshold')
            ax.plot(range(1, n0), r2, c=colors[0], label='Training')
            ax.scatter(range(1, n0), r2, c=colors[0])
            yi_old = r2[-1]
            y_min = min(y_min, r2.min())
            y_max = max(y_max, r2.max())
        else:
            r2 = np.array(score[key])
            n1 = n0 + 2
            xi = np.arange(n0, n1)
            # Segment from the previous value to the value of this entry
            yi = np.zeros(2)
            yi[0] = yi_old
            yi[1] = r2
            yi_old = r2
            labi = "Entry : " + str(key)
            ax.plot(xi, yi, c=colors[j], ls='--', label=labi)
            ax.scatter(xi, yi, c=[colors[j-1], colors[j]])
            y_min = min(y_min, r2)
            y_max = max(y_max, r2)
            # Cycle through the palette, skipping the training color
            j += 1
            if j >= j_max:
                j = 1
            n0 = n1
    if n1 is not None:
        n_max = max(n_max, n1)
    y_min *= 2
    y_max = max(y_max, 3.0)
    ax.legend(ncol=n_col)
    ax.set_xlim(n_min, n_max+2)
    ax.set_ylim(y_min, y_max)
    ax.set_xlabel('Index Data Set')
    ax.set_ylabel('R2 - Score')
    plt.show()