-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxa_boxplots.py
179 lines (156 loc) · 6.8 KB
/
xa_boxplots.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import pandas as pd
import numpy as np
import argparse
import glob
import seaborn
from statannot import add_stat_annotation
import matplotlib.pyplot as plt
import itertools
import warnings
warnings.filterwarnings('ignore')
# create the command line parser
parser = argparse.ArgumentParser()
parser.add_argument("-f", "--filepath", default='.', help="path to bed file folder to get data")
args = parser.parse_args()
data = args.filepath
# get list of bed files
all_files = glob.glob(data + "/*.bed")
li = []
for filename in all_files:
temp = filename.replace('.', '_').split("_") #[2:-1]
print(temp)
# for genes unique to a specific experimental condition, ie. non-overlapping group
if len(temp) < 8:
# read bed file into dataframe
df = pd.read_csv(filename, sep="\t", names=['chr', 'start', 'end', 'gene_id', 'score', 'gene_name'])
n = str(len(df.index))
splitname = filename.replace('.', '_').split("_")
print(splitname)
# get file base name, shorter if embryo which is unsexed
if 'e' in temp:
fullfile = "_".join(splitname[2:5])
file1 = "_".join(splitname[2:5])
else:
fullfile = "_".join(splitname[2:6])
file1 = "_".join(splitname[2:6])
# read csv with expression data into dataframe and get gene fold changes
mag_fold1 = pd.read_csv(file1 + ".csv")
gene_ids = df['gene_id'].tolist()
mag_fold1 = mag_fold1[mag_fold1['log2FoldChange'].isin(gene_ids)].astype(float)
# add stuff into df
df['log2FoldChange'] = mag_fold1
df['dataset'] = file1
df['overlap'] = fullfile
# change all chromosome labels to either X or A (autosome)
df['chr'] = df['chr'].map({'2L':'A', '2R':'A', '3L':'A', '3R':'A', '4':'A', 'X':'X'})
# append dataframe to overall list for later visualization
li.append(df)
else: # for intersection of genes between two experimental conditions, ie. overlapping group
# read bed file into dataframe
df = pd.read_csv(filename, sep="\t", names=['chr', 'start', 'end', 'gene_id', 'score', 'gene_name'])
n = str(len(df.index))
splitname = filename.replace('.', '_').split("_")
print(splitname)
# get base names of each files, shorter if embryo which is unsexed
if 'e' in temp:
fullfile = "_".join(splitname[2:8])
file1 = "_".join(splitname[2:5])
file2 = "_".join(splitname[5:8])
else:
fullfile = "_".join(splitname[2:10])
file1 = "_".join(splitname[2:6])
file2 = "_".join(splitname[6:10])
# read csv with expression data for first experimental condition into dataframe and get gene fold changes
mag_fold1 = pd.read_csv(file1 + ".csv")
mag_fold1 = mag_fold1[['log2FoldChange']].astype(float)
# add stuff to df
df['log2FoldChange'] = mag_fold1
df['dataset'] = file1
df['overlap'] = fullfile
# change all chromosome labels to either X or A (autosome)
df['chr'] = df['chr'].map({'2L':'A', '2R':'A', '3L':'A', '3R':'A', '4':'A', 'X':'X'})
# append dataframe for first set to overall list for later visualization
li.append(df)
# read csv with expression data for second experimental condition into dataframe and get gene fold changes
df = pd.read_csv(filename, sep="\t", names=['chr', 'start', 'end', 'gene_id', 'score', 'gene_name'])
n = str(len(df.index))
mag_fold2 = pd.read_csv(file2 + ".csv")
mag_fold2 = mag_fold2[['log2FoldChange']].astype(float)
# add stuff to df
df['log2FoldChange'] = mag_fold2
df['dataset'] = file2
df['overlap'] = fullfile
# change all chromosome labels to either X or A (autosome)
df['chr'] = df['chr'].map({'2L':'A', '2R':'A', '3L':'A', '3R':'A', '4':'A', 'X':'X'})
# append dataframe for second set to overall list for later visualization
li.append(df)
# concatenate list into single larger dataframe
frame = pd.concat(li, axis=0, ignore_index=True)
# get all unique datasets (experimental conditions) from dataframe, frame
unique_datasets = frame['dataset'].unique()
xa_chrom = ['X', 'A']
# make into tuples, ie ('X', clampi_e)
subsets2 = list(itertools.product(unique_datasets, xa_chrom))
print(subsets2)
# create combinations of size 2 between pairs of tuples
subsets3 = list(itertools.combinations(subsets2, 2))
print(subsets3)
box_pairs_lst = []
for pair in subsets3:
tup1,tup2 = pair
ds1,chr1 = tup1
ds2,chr2 = tup2
if ds1 == ds2 or chr1 == chr2:
box_pairs_lst.append(pair)
print(box_pairs_lst)
# get all possible subsets of these combinations
all_ss = []
for L in range(1, len(box_pairs_lst)+1):
for ss in itertools.combinations(box_pairs_lst, L):
all_ss.append(list(ss))
print(ss)
print(all_ss)
# make violin plot of dataset (experimental condition) vs log2FoldChange with X vs A separated
plt.figure(figsize=(8,5))
g = seaborn.violinplot(x="dataset", y="log2FoldChange", data=frame, hue="chr")
# plt.show()
# sort subsets for largest to smallest by length
all_ss_sorted = sorted(all_ss, key=len, reverse=True)
print(all_ss_sorted[0])
print(box_pairs_lst)
# for each of these subsets try to create the statistical significance bars between different groups
# using the Mann-Whitney test
for blabla in all_ss_sorted:
print(blabla)
try:
plt.figure(figsize=(8,5))
g = seaborn.violinplot(x="dataset", y="log2FoldChange", data=frame, hue="chr")
add_stat_annotation(g, data=frame, x="dataset", y="log2FoldChange", hue="chr",
box_pairs=blabla,
test='Mann-Whitney', text_format='star', loc='outside', verbose=2)
break
except ValueError:
continue
plt.tick_params(axis='x', pad=17)
plt.tight_layout()
# add sample size (n: _) under each of the violin groups
medians = frame.groupby(['dataset','chr'])['log2FoldChange'].median()
mins = frame.groupby(['dataset','chr'])['log2FoldChange'].min()
nobs = frame.groupby(['dataset','chr']).apply(lambda x: 'n: {}'.format(len(x)))
ymin,ymax = g.get_ylim()
for ax in plt.gcf().axes:
for tick, label in enumerate(ax.get_xticklabels()):
ax_dat = label.get_text()
for j, ax_chr in enumerate(ax.get_legend_handles_labels()[1]):
if len(ax.get_legend_handles_labels()[1]) == 1:
x_offset = 0
else:
x_offset = (j - 0.5)*0.4
min_val = mins[ax_dat, ax_chr]
num = nobs[ax_dat, ax_chr]
ax.text(tick + x_offset, ymin - abs(0.0686*(ymax-ymin)), num,
horizontalalignment='center', size='medium', color='green', weight='semibold')
fig = g.get_figure()
figtit = max(frame['overlap'].unique(), key=len)
# save figure
fig.savefig('violin_'+figtit+'.png')