-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathselect_feature_panda.py
138 lines (101 loc) · 4.65 KB
/
select_feature_panda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import sys
import commands
import pandas as pd
import numpy as np
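#The first file is used as the feature template; the second file supplies the values, and template features absent from it are added.
#The first file is the original full-feature-set file.
#The second file holds the selected features; template features it lacks are written as zero.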
def check_value_correct(file1,file2,file_combind):
sum_file1=0
for i in open(file1):
if '.C.' not in i:
i=i.replace('\n','')
line1=i.split(',')
#del line1[0]
del line1[len(line1)-1]
sum_line1=0
for j in line1:
sum_line1+=int(j)
sum_file1+=sum_line1
sum_file2=0
for i in open(file2):
if '.C.' not in i:
i=i.replace('\n','')
line2=i.split(',')
#del line2[0]
del line2[len(line2)-1]
sum_line2=0
for j in line2:
sum_line2+=int(j)
sum_file2+=sum_line2
print 'File 1 => ',sum_file1
print 'File 2 => ',sum_file2
sum_file3=0
for i in open(file_combind):
if '.C.' not in i:
i=i.replace('\n','')
line3=i.split(',')
#del line3[0]
del line3[len(line3)-1]
sum_line3=0
for j in line3:
sum_line3+=int(j)
sum_file3+=sum_line3
print 'Sum of file1 and file2 => ',(sum_file1+sum_file2)
print 'File combind => ',sum_file3
def select_feature_from_file1(file1, file2, out_file, n_label_cols=18):
    """Write the rows of ``file2`` using the column layout of ``file1``.

    The columns of ``file1`` act as the feature template.  Every row of
    ``file2`` is projected onto those columns: columns of ``file2`` that are
    not in the template are dropped, template columns missing from ``file2``
    (and any missing values) are filled with 0.  The trailing
    ``n_label_cols`` columns (the GO label columns) are then moved to the
    front and the table is written to ``out_file`` as CSV without an index,
    with floats formatted as whole numbers.

    Args:
        file1: path of the CSV file whose header defines the output columns.
        file2: path of the CSV file supplying the row values.
        out_file: path of the CSV file to write.
        n_label_cols: number of trailing label columns to move to the front.
            Defaults to 18; 0 leaves the column order unchanged.

    Raises:
        ValueError: if ``n_label_cols`` is negative.
    """
    if n_label_cols < 0:
        raise ValueError('n_label_cols must be non-negative')
    print('Select feature column ...')
    # Only the header of file1 matters; its rows are never used.
    template_columns = pd.read_csv(file1).columns
    values = pd.read_csv(file2, low_memory=False)
    # reindex keeps the template's column order, drops extra columns and
    # creates NaN columns for absent features; fillna(0) zeroes those gaps.
    selected = values.reindex(columns=template_columns).fillna(0)
    # Rotate by position so the last n_label_cols columns lead the table.
    # A count of 0, or one larger than the table, leaves the order as is.
    positions = list(range(len(selected.columns)))
    reordered = positions[-n_label_cols:] + positions[:-n_label_cols]
    selected = selected.iloc[:, reordered]
    selected.to_csv(out_file, index=False, float_format='%.0f')
    # Count columns from the frame instead of shelling out to `head`; the
    # reported figure is still "number of header fields minus one".
    print('File combinded contains  %d features' % (len(selected.columns) - 1))
def select_feature_from_file1_forMEKA(file1, file2, out_file):
    """Write the rows of ``file2`` using the column layout of ``file1``.

    The columns of ``file1`` act as the feature template.  Every row of
    ``file2`` is projected onto those columns: columns of ``file2`` that are
    not in the template are dropped, template columns missing from ``file2``
    (and any missing values) are filled with 0.  The column order of the
    template is kept as is, and the table is written to ``out_file`` as CSV
    without an index, with floats formatted as whole numbers.

    Args:
        file1: path of the CSV file whose header defines the output columns.
        file2: path of the CSV file supplying the row values.
        out_file: path of the CSV file to write.
    """
    print('Select feature column ...')
    # Only the header of file1 matters; its rows are never used.
    template_columns = pd.read_csv(file1).columns
    values = pd.read_csv(file2, low_memory=False)
    # reindex keeps the template's column order, drops extra columns and
    # creates NaN columns for absent features; fillna(0) zeroes those gaps.
    selected = values.reindex(columns=template_columns).fillna(0)
    selected.to_csv(out_file, index=False, float_format='%.0f')
    # Count columns from the frame instead of shelling out to `head`; the
    # reported figure is still "number of header fields minus one".
    print('File combinded contains  %d features' % (len(selected.columns) - 1))
#combine_feature_2file(sys.argv[1],sys.argv[2],sys.argv[3])
#select_feature_from_file1('181GO_label_181nonphoto_removeuseless.csv','query_final_meka.csv','selected_feature_final_meka.csv')
#check_value_correct(sys.argv[1],sys.argv[2],'comb.temp')
#python add_absent_of_selected_feature.py test_combind_file1.csv test_combind_file2.csv