-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpart2.py
100 lines (82 loc) · 2.88 KB
/
part2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pandas as pd
from abc import ABC, abstractmethod
# Load the gene table once, at import time: reads the comma-delimited file
# "gene_table.txt" (a relative path) into a module-level DataFrame.
# NOTE(review): none of the classes visible in this file reference `df`
# directly; presumably callers pass it in as the `f` argument -- confirm.
df = pd.read_csv("gene_table.txt", delimiter=",")
class Objective(ABC):
    """Abstract base class for report objects exposing a ``record()`` method."""

    @abstractmethod
    def record(self):
        """Produce this objective's report.

        Abstract: concrete subclasses must override it. The base
        implementation does nothing and returns ``None``.
        """
class Number(Objective):
    """Report the dimensions of the wrapped table.

    ``f`` is stored as given; ``record`` only reads its ``shape`` attribute
    (presumably a pandas DataFrame -- confirm against callers).
    """

    def __init__(self, f):
        self.f = f

    def record(self):
        """Return a one-row DataFrame with columns ``Rows`` and ``Columns``.

        Both values are the string form of ``self.f.shape[0]`` and
        ``self.f.shape[1]`` respectively.
        """
        n_rows = str(self.f.shape[0])
        n_cols = str(self.f.shape[1])
        # Build the single row directly instead of appending an empty
        # placeholder row and trimming it off again with ``head(1)``.
        return pd.DataFrame([[n_rows, n_cols]], columns=['Rows', 'Columns'])
class Semantics(Objective):
    """List the labels obtained by iterating over the wrapped table."""

    def __init__(self, f):
        self.f = f

    def record(self):
        """Return a DataFrame with one column, ``Labels``.

        The column holds every item yielded by iterating ``self.f``, in
        iteration order (for a pandas DataFrame these are its column labels).
        """
        return pd.DataFrame({'Labels': [*self.f]})
class Biotype(Objective):
    """Count genes per biotype.

    ``record`` requires ``self.f`` to provide the columns ``gene_biotype``
    and ``gene_name``.
    """

    def __init__(self, f):
        self.f = f

    def record(self):
        """Return the gene count of each biotype, smallest count first.

        The result has the columns ``gene_biotype`` and ``Number of genes``
        and a fresh positional index. Counts are non-missing ``gene_name``
        entries per group.
        """
        per_biotype = self.f.groupby(['gene_biotype'])['gene_name'].count()
        ascending = per_biotype.sort_values()
        # Name the value column while converting, then move the group key
        # out of the index into an ordinary column.
        return ascending.to_frame('Number of genes').reset_index()
class Associate(Objective):
    """Associate every biotype with the names of its genes.

    ``record`` requires ``self.f`` to provide the columns ``gene_biotype``
    and ``gene_name``.
    """

    def __init__(self, f):
        self.f = f

    def record(self):
        """Return a DataFrame indexed by ``gene_biotype`` with one column.

        The single column, ``Gene_names``, holds for each biotype the list of
        ``gene_name`` values of its rows, in their original row order. Rows
        are ordered by ascending count of non-missing gene names.
        """
        # The counts only decide the order of the output rows.
        counts = self.f.groupby(['gene_biotype'])['gene_name'].count().sort_values()
        # Collect the names of every biotype in one grouped pass rather than
        # re-filtering the whole table once per biotype. A scalar key is used
        # so that iteration yields plain labels instead of 1-tuples.
        names_by_biotype = {
            biotype: list(names)
            for biotype, names in self.f.groupby('gene_biotype')['gene_name']
        }
        gene_lists = [names_by_biotype.get(biotype, []) for biotype in counts.index]
        return pd.DataFrame({'Gene_names': gene_lists}, index=counts.index)
class Chromosome(Objective):
    """Count the distinct chromosome groups of the wrapped table.

    ``record`` requires ``self.f`` to provide a ``chromosome`` column.
    """

    def __init__(self, f):
        self.f = f

    def record(self):
        """Return the text ``Number of chromosomes: <n>``.

        ``<n>`` is the number of groups obtained by grouping on ``chromosome``.
        """
        per_chromosome = self.f.groupby(['chromosome']).count()
        # One result row per group, so the row count is the group count.
        n_groups = len(per_chromosome)
        return f'Number of chromosomes: {n_groups}'
class Genes_Chromosome(Objective):
    """Count genes per chromosome.

    ``record`` requires ``self.f`` to provide the columns ``chromosome`` and
    ``gene_name``.
    """

    def __init__(self, f):
        self.f = f

    def record(self):
        """Return the gene count of each chromosome, smallest count first.

        The result has the columns ``chromosome`` and ``Number of genes`` and
        a fresh positional index. Counts are non-missing ``gene_name`` entries
        per group.
        """
        per_chromosome = self.f.groupby(['chromosome'])['gene_name'].count()
        ascending = per_chromosome.sort_values()
        # Name the value column while converting, then move the group key
        # out of the index into an ordinary column.
        return ascending.to_frame('Number of genes').reset_index()
class Plus(Objective):
    """Per-chromosome share of genes whose ``strand`` value is ``'+'``.

    ``record`` requires ``self.f`` to provide the columns ``chromosome``,
    ``strand`` and ``gene_name``.
    """

    def __init__(self, f):
        self.f = f

    def record(self):
        """Return one row per chromosome, sorted by ``chromosome``.

        Columns, in order: ``chromosome``, ``Plus strand`` (number of rows on
        the ``'+'`` strand, always an integer), ``Number of genes`` (count of
        non-missing ``gene_name`` entries) and ``+ percentage``
        (``Plus strand / Number of genes * 100``, with missing results
        reported as 0).
        """
        totals = self.f.groupby(['chromosome'], as_index=False)['gene_name'].count()
        on_plus = self.f.loc[self.f['strand'] == '+', ['chromosome', 'strand']]
        plus_counts = on_plus.groupby(['chromosome'], as_index=False)['strand'].count()
        # Outer join keeps chromosomes that have no '+' genes at all; the join
        # key is spelled out instead of relying on the shared column name.
        merged = plus_counts.merge(totals, on='chromosome', how='outer')
        # Such chromosomes come back as NaN, which would turn the whole count
        # column into floats; store them as an explicit integer zero instead.
        merged['strand'] = merged['strand'].fillna(0).astype('int64')
        merged['Plus_Percentage'] = (merged['strand'] / merged['gene_name']) * 100
        # A 0/0 percentage is NaN; report it as 0.
        merged = merged.fillna(0).sort_values(by='chromosome')
        return merged.rename(columns={'strand':'Plus strand', 'gene_name':'Number of genes','Plus_Percentage':'+ percentage'})
class Minus(Objective):
    """Per-chromosome share of genes whose ``strand`` value is ``'-'``.

    ``record`` requires ``self.f`` to provide the columns ``chromosome``,
    ``strand`` and ``gene_name``.
    """

    def __init__(self, f):
        self.f = f

    def record(self):
        """Return one row per chromosome, sorted by ``chromosome``.

        Columns, in order: ``chromosome``, ``Minus strand`` (number of rows on
        the ``'-'`` strand, always an integer), ``Number of genes`` (count of
        non-missing ``gene_name`` entries) and ``- percentage``
        (``Minus strand / Number of genes * 100``, with missing results
        reported as 0).
        """
        totals = self.f.groupby(['chromosome'], as_index=False)['gene_name'].count()
        on_minus = self.f.loc[self.f['strand'] == '-', ['chromosome', 'strand']]
        minus_counts = on_minus.groupby(['chromosome'], as_index=False)['strand'].count()
        # Outer join keeps chromosomes that have no '-' genes at all; the join
        # key is spelled out instead of relying on the shared column name.
        merged = minus_counts.merge(totals, on='chromosome', how='outer')
        # Such chromosomes come back as NaN, which would turn the whole count
        # column into floats; store them as an explicit integer zero instead.
        merged['strand'] = merged['strand'].fillna(0).astype('int64')
        merged['Minus_Percentage'] = (merged['strand'] / merged['gene_name']) * 100
        # A 0/0 percentage is NaN; report it as 0.
        merged = merged.fillna(0).sort_values(by='chromosome')
        return merged.rename(columns={'strand':'Minus strand', 'gene_name':'Number of genes','Minus_Percentage':'- percentage'})