# -*- coding: utf-8 -*-
"""image_aug.py  (originally image_augmentation_ben_pc.ipynb)

Balance the three scenario classes of a training image set by repeatedly
sampling an image from the under-represented class and writing augmented
copies of it (3 rotations + 2 flips per pass) alongside updated metadata.

Originally generated by Colaboratory:
    https://colab.research.google.com/drive/1iHvkV1IOUOCNQH2MeZLeq6bX4--PZkXA
"""

import warnings

warnings.filterwarnings('ignore')

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import skimage.io as io
from skimage.transform import rotate


def _record_augmented(df, sample, image, outpath):
    """Register one augmented image and save it to disk.

    Bumps the sample's ``version_no`` in place, appends a metadata row for
    the new image to *df*, and writes the pixel array to
    ``{outpath}/{File_ID}_{version_no}.png``.

    Parameters
    ----------
    df : pd.DataFrame
        Accumulated training metadata (one row per image).
    sample : np.ndarray
        Single-row array from ``df_x.sample()``; layout after ``reset_index``
        is [index, label, latitude, longitude, year, example_path,
        File_ID, version_no], so ``sample[0][-1]`` is the version counter.
    image : np.ndarray
        The augmented pixel data to save.
    outpath : str
        Directory the PNG is written into.

    Returns
    -------
    pd.DataFrame
        *df* with the new metadata row appended.
    """
    sample[0][-1] += 1                 # next version number for this File_ID
    row = sample[0][1:]                # drop the reset_index() 'index' column
    df_ev = pd.DataFrame(
        [row],
        columns=['label', 'latitude', 'longitude', 'year',
                 'example_path', 'File_ID', 'version_no'],
    )
    file_id = df_ev['File_ID'].iloc[-1]
    version = int(df_ev['version_no'].iloc[-1])
    # Fixed: two of the three original branches wrote 'rain_test_data'
    # (missing leading 't'), inconsistent with the real directory name.
    df_ev['example_path'] = 'train_test_data/train/{p}_{q}.png'.format(
        p=file_id, q=version)
    io.imsave('{h}/{p}_{q}.png'.format(h=outpath, p=file_id, q=version),
              arr=image)
    return pd.concat([df, df_ev], ignore_index=True)


def percentage_matcher(n_iter, df, df_0, df_1, df_2, path, outpath):
    """Even out the class percentages by augmenting the smallest class.

    Each iteration finds the scenario with the fewest images, samples one
    of its images, and writes five augmented versions of it (rotations of
    90/180/270 degrees plus left-right and up-down flips).

    Parameters
    ----------
    n_iter : int
        Number of balancing iterations to run.
    df : pd.DataFrame
        Metadata for all training images.
    df_0, df_1, df_2 : pd.DataFrame
        Per-scenario subsets of *df* (labels 0, 1, 2), each ``reset_index``-ed.
    path : str
        Directory the source images are read from.
    outpath : str
        Directory the augmented images are written to.

    Returns
    -------
    tuple
        ``(lengths, percentages, df)`` — per-class image counts, per-class
        fractions, and the metadata frame extended with the new images.
    """
    lengths = np.array([len(df_0), len(df_1), len(df_2)])
    percentages = lengths / np.sum(lengths)
    scenario_dfs = (df_0, df_1, df_2)

    for _ in range(n_iter):
        # Fixed: the original compared a boolean-masked array to a string
        # ('folders[boolarr] == ...'), which raises an "ambiguous truth
        # value" error whenever two classes tie at the minimum. argmin
        # deterministically picks one class.
        min_idx = int(np.argmin(lengths))
        # Each pass produces exactly 5 new images: 3 rotations + 2 flips.
        lengths[min_idx] += 5
        percentages = lengths / np.sum(lengths)

        sample = np.array(scenario_dfs[min_idx].sample())
        # sample[0][-2] is the File_ID column (see _record_augmented).
        image_path = '{h}/{p}.png'.format(h=path, p=sample[0][-2])
        img = io.imread(image_path)

        for angle in (90, 180, 270):
            rotated = rotate(img, angle=angle, mode='wrap')
            df = _record_augmented(df, sample, rotated, outpath)

        df = _record_augmented(df, sample, np.fliplr(img), outpath)
        df = _record_augmented(df, sample, np.flipud(img), outpath)

    return lengths, percentages, df


def _scenario_subset(df, label):
    """Return the rows of *df* with the given label, reset_index()-ed
    (keeping the old index as an 'index' column, as the sampler expects)."""
    subset = df[df['label'] == label].copy()
    subset.reset_index(inplace=True)
    return subset


def main():
    """Load train.csv, balance the classes, save new_train.csv and a pie chart."""
    path = os.getcwd()
    df = pd.read_csv(path + '/train.csv')
    # Fixed: raw string for the regex — '(\d+)' triggers an
    # invalid-escape-sequence warning on modern Python.
    df['File_ID'] = df.example_path.str.extract(r'(\d+)')
    df['version_no'] = np.zeros(len(df))

    df_0 = _scenario_subset(df, 0)
    df_1 = _scenario_subset(df, 1)
    df_2 = _scenario_subset(df, 2)

    lengths, percentages, df = percentage_matcher(
        30, df, df_0, df_1, df_2,
        path + '/train_test_data' + '/train', path + '/figs')

    df.to_csv(path + '/new_train.csv')
    print(len(df))

    # Visualise the resulting class balance.
    counts = df.groupby('label').size()
    plt.figure(figsize=(8, 8))
    # Fixed: the original passed 'labels=label' where 'label' was an
    # undefined name (NameError); the group keys are the correct labels.
    plt.pie(counts, labels=counts.index, autopct='%1.1f%%',
            shadow=True, startangle=90)
    plt.show()


if __name__ == '__main__':
    main()