-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy patheda.py
153 lines (110 loc) · 4.79 KB
/
eda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import prince
import pandas
import webbrowser
import os
# View the original dataset
def loadDataset(csv_path="primary-tumor1.csv", max_rows=339, html_path="data.html"):
    """Render the leading rows of a CSV dataset as HTML and open it in a browser.

    Args:
        csv_path: Path of the CSV file to read.
        max_rows: Number of leading rows to render. ``None`` renders every
            row. Defaults to 339.
        html_path: Path of the HTML file that receives the rendered table.
            An existing file at this path is overwritten.
    """
    # Imported locally so this function does not rely on a file-level import.
    from pathlib import Path

    # Read the dataset into a data table using pandas.
    data_table = pandas.read_csv(csv_path)

    # Build an HTML table from the first ``max_rows`` records.
    html = data_table.iloc[:max_rows].to_html()

    # Save the HTML so the browser has a file to load.
    with open(html_path, "w") as f:
        f.write(html)

    # as_uri() yields a well-formed, percent-encoded file:// URL; plain string
    # concatenation breaks on Windows drive paths and on paths with spaces.
    webbrowser.open(Path(os.path.abspath(html_path)).as_uri())
def get_details(data_frame):
    """Print a structural summary of ``data_frame`` to standard output.

    Prints, in order: the shape, the first ten rows, the ``info()`` report,
    the ``describe()`` statistics and the total number of missing cells.
    When at least one cell is missing, also prints a per-column table of
    missing counts ('Total') and missing fractions ('Percent', a value
    between 0 and 1), most-missing column first, limited to 20 columns.

    Args:
        data_frame: pandas DataFrame to summarise.
    """
    print(data_frame.shape)
    print(data_frame.head(10))
    # info() writes its own report and returns None, so wrapping it in
    # print() would add a stray "None" line to the output.
    data_frame.info()
    print(data_frame.describe())

    # Missing-value report, adapted from
    # https://www.kaggle.com/funkegoodvibe/comprehensive-data-exploration-with-python
    # The per-column counts are computed once and reused below.
    missing_per_column = data_frame.isnull().sum()
    total_missing = int(missing_per_column.sum())
    print("\n\nTotal no of missing values %d \n" % total_missing)

    if total_missing != 0:
        total = missing_per_column.sort_values(ascending=False)
        # Every column has the same number of rows, so the fraction missing
        # is simply the count divided by the row count.
        percent = total / len(data_frame)
        missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
        print(missing_data.head(20))
def check_duplicates(data_frame):
    """Report rows of ``data_frame`` that repeat an earlier row.

    Prints a banner, then either the shape and first rows of the repeated
    records or a message saying none were found.

    Args:
        data_frame: pandas DataFrame to inspect.

    Returns:
        True when at least one repeated record exists, otherwise False.
    """
    print("############## Duplicate records ############## ")

    # duplicated() flags every occurrence after the first one.
    repeated_rows = data_frame[data_frame.duplicated()]
    has_duplicates = repeated_rows.size != 0

    if not has_duplicates:
        print("No duplicate records found")
        return has_duplicates

    print("Size of duplicate Records ", repeated_rows.shape)
    print(repeated_rows.head())
    return has_duplicates
def visualize_class_distribution(data_frame, target_name):
    """Show a count plot of the target column with percentage labels.

    Each bar is annotated with its share of all rows. The y axis carries 11
    evenly spaced ticks from 0 up to the size of the largest class, labelled
    as whole numbers.

    Args:
        data_frame: pandas DataFrame holding the data.
        target_name: Name of the column whose values are counted.
    """
    plt.figure(figsize=(14, 8))

    class_column = data_frame[target_name]
    total = len(class_column)

    # Size of the most frequent class, used as the top of the tick range.
    # Taken from the data rather than from one hard-coded label so the axis
    # is correct for any label set or dtype.
    class_counts = class_column.value_counts()
    largest_class_count = class_counts.max() if not class_counts.empty else 0

    ax = sns.countplot(x=target_name, data=data_frame)

    # Label every bar with its percentage of all rows, slightly above the bar.
    for bar in ax.patches:
        ax.annotate('{:.1f}%'.format(100 * bar.get_height() / total),
                    (bar.get_x() + 0.1, bar.get_height() + 5))

    # 11 ticks (10 steps) from 0 to the largest class count.
    tick_positions = np.linspace(0, largest_class_count, 11)
    ax.yaxis.set_ticks(tick_positions)
    # Relabel the ticks as integers without moving them.
    ax.set_yticklabels(['{:.0f}'.format(position) for position in tick_positions])
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0, ha="right")

    plt.title("Class Distribution")
    plt.xlabel("Class")
    plt.ylabel("Number of instances")
    plt.show()
# Visualizes the feature distribution with box plots.
# source - https://towardsdatascience.com/designing-a-feature-selection-pipeline-in-python-859fec4d1b12
def visualise_feature_distribution(data_frame):
    """Draw horizontal box plots of ``data_frame`` and save them to disk.

    The figure is written to 'Box Plots.png' in the working directory and
    then displayed.

    Args:
        data_frame: Data passed to ``sns.boxplot`` as ``data``.
    """
    # Graph style: white background, light grey edges and labels, no grid.
    plain_style = {
        'axes.facecolor': '1.0',
        'axes.edgecolor': '0.85',
        'grid.color': '0.85',
        'grid.linestyle': '-',
        'axes.labelcolor': '0.4',
        'xtick.color': '0.4',
        'ytick.color': '0.4',
        'axes.grid': False,
    }
    sns.set(font_scale=0.75)
    sns.set_style(plain_style)

    # Tall figure so each box gets its own horizontal band.
    figure, axes = plt.subplots(figsize=(9, 14))
    sns.boxplot(data=data_frame, orient="h", palette="Set2", ax=axes)
    axes.set_xlabel('Feature Value')

    figure.tight_layout()
    figure.savefig('Box Plots.png', dpi=1080)
    plt.show()
def perform_correspondence_analysis(data_frame):
    """Fit a two-component ``prince.MCA`` on ``data_frame`` and plot it.

    Only column points and their labels are drawn; row points are hidden.
    The legend is placed outside the plot, to the right.

    Args:
        data_frame: Data the MCA is fitted on and whose coordinates are
            plotted.
    """
    # NOTE(review): plot_coordinates appears to belong to the matplotlib
    # plotting API of older prince releases - confirm against the installed
    # prince version.
    mca = prince.MCA(
        n_components=2,
        n_iter=3,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=42,  # fixed seed
    )
    fitted_mca = mca.fit(data_frame)

    axes = fitted_mca.plot_coordinates(
        X=data_frame,
        ax=None,
        figsize=(10, 10),
        show_row_points=False,
        row_points_size=0,
        show_row_labels=False,
        show_column_points=True,
        column_points_size=30,
        show_column_labels=True,
        legend_n_cols=1,
    )
    # Anchor the legend to the right of the axes.
    axes.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.show()