# -*- coding: utf-8 -*-
"""image_aug.py  (originally image_augmentation_ben_pc.ipynb)

Balance the three scenario classes of a training image set by repeatedly
sampling an image from the under-represented class and writing augmented
copies of it (3 rotations + 2 flips per pass) alongside updated metadata.

Originally generated by Colaboratory:
    https://colab.research.google.com/drive/1iHvkV1IOUOCNQH2MeZLeq6bX4--PZkXA
"""

import warnings

warnings.filterwarnings('ignore')

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import skimage.io as io
from skimage.transform import rotate


def _record_augmented(df, sample, image, outpath):
    """Register one augmented image and save it to disk.

    Bumps the sample's ``version_no`` in place, appends a metadata row for
    the new image to *df*, and writes the pixel array to
    ``{outpath}/{File_ID}_{version_no}.png``.

    Parameters
    ----------
    df : pd.DataFrame
        Accumulated training metadata (one row per image).
    sample : np.ndarray
        Single-row array from ``df_x.sample()``; layout after ``reset_index``
        is [index, label, latitude, longitude, year, example_path,
        File_ID, version_no], so ``sample[0][-1]`` is the version counter.
    image : np.ndarray
        The augmented pixel data to save.
    outpath : str
        Directory the PNG is written into.

    Returns
    -------
    pd.DataFrame
        *df* with the new metadata row appended.
    """
    sample[0][-1] += 1                 # next version number for this File_ID
    row = sample[0][1:]                # drop the reset_index() 'index' column
    df_ev = pd.DataFrame(
        [row],
        columns=['label', 'latitude', 'longitude', 'year',
                 'example_path', 'File_ID', 'version_no'],
    )
    file_id = df_ev['File_ID'].iloc[-1]
    version = int(df_ev['version_no'].iloc[-1])
    # Fixed: two of the three original branches wrote 'rain_test_data'
    # (missing leading 't'), inconsistent with the real directory name.
    df_ev['example_path'] = 'train_test_data/train/{p}_{q}.png'.format(
        p=file_id, q=version)
    io.imsave('{h}/{p}_{q}.png'.format(h=outpath, p=file_id, q=version),
              arr=image)
    return pd.concat([df, df_ev], ignore_index=True)


def percentage_matcher(n_iter, df, df_0, df_1, df_2, path, outpath):
    """Even out the class percentages by augmenting the smallest class.

    Each iteration finds the scenario with the fewest images, samples one
    of its images, and writes five augmented versions of it (rotations of
    90/180/270 degrees plus left-right and up-down flips).

    Parameters
    ----------
    n_iter : int
        Number of balancing iterations to run.
    df : pd.DataFrame
        Metadata for all training images.
    df_0, df_1, df_2 : pd.DataFrame
        Per-scenario subsets of *df* (labels 0, 1, 2), each ``reset_index``-ed.
    path : str
        Directory the source images are read from.
    outpath : str
        Directory the augmented images are written to.

    Returns
    -------
    tuple
        ``(lengths, percentages, df)`` — per-class image counts, per-class
        fractions, and the metadata frame extended with the new images.
    """
    lengths = np.array([len(df_0), len(df_1), len(df_2)])
    percentages = lengths / np.sum(lengths)
    scenario_dfs = (df_0, df_1, df_2)

    for _ in range(n_iter):
        # Fixed: the original compared a boolean-masked array to a string
        # ('folders[boolarr] == ...'), which raises an "ambiguous truth
        # value" error whenever two classes tie at the minimum. argmin
        # deterministically picks one class.
        min_idx = int(np.argmin(lengths))
        # Each pass produces exactly 5 new images: 3 rotations + 2 flips.
        lengths[min_idx] += 5
        percentages = lengths / np.sum(lengths)

        sample = np.array(scenario_dfs[min_idx].sample())
        # sample[0][-2] is the File_ID column (see _record_augmented).
        image_path = '{h}/{p}.png'.format(h=path, p=sample[0][-2])
        img = io.imread(image_path)

        for angle in (90, 180, 270):
            rotated = rotate(img, angle=angle, mode='wrap')
            df = _record_augmented(df, sample, rotated, outpath)

        df = _record_augmented(df, sample, np.fliplr(img), outpath)
        df = _record_augmented(df, sample, np.flipud(img), outpath)

    return lengths, percentages, df


def _scenario_subset(df, label):
    """Return the rows of *df* with the given label, reset_index()-ed
    (keeping the old index as an 'index' column, as the sampler expects)."""
    subset = df[df['label'] == label].copy()
    subset.reset_index(inplace=True)
    return subset


def main():
    """Load train.csv, balance the classes, save new_train.csv and a pie chart."""
    path = os.getcwd()
    df = pd.read_csv(path + '/train.csv')
    # Fixed: raw string for the regex — '(\d+)' triggers an
    # invalid-escape-sequence warning on modern Python.
    df['File_ID'] = df.example_path.str.extract(r'(\d+)')
    df['version_no'] = np.zeros(len(df))

    df_0 = _scenario_subset(df, 0)
    df_1 = _scenario_subset(df, 1)
    df_2 = _scenario_subset(df, 2)

    lengths, percentages, df = percentage_matcher(
        30, df, df_0, df_1, df_2,
        path + '/train_test_data' + '/train', path + '/figs')

    df.to_csv(path + '/new_train.csv')
    print(len(df))

    # Visualise the resulting class balance.
    counts = df.groupby('label').size()
    plt.figure(figsize=(8, 8))
    # Fixed: the original passed 'labels=label' where 'label' was an
    # undefined name (NameError); the group keys are the correct labels.
    plt.pie(counts, labels=counts.index, autopct='%1.1f%%',
            shadow=True, startangle=90)
    plt.show()


if __name__ == '__main__':
    main()