-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcleanup.py
153 lines (134 loc) · 5.38 KB
/
cleanup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# Copyright (C) 2019 Federico Ansaloni
# This file is part of TEspeX.
# TEspeX is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# TEspeX is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with TEspeX. If not, see <http://www.gnu.org/licenses/>.
try:
    import argparse
    import sys

    # Keep only the sys.path entries that mention TEspeX_deps, so that modules
    # are resolved exclusively from that environment. When the environment has
    # not been activated no entry matches and sys.path becomes an empty list.
    sys.path = [entry for entry in sys.path if "TEspeX_deps" in entry]

    # These imports are deliberately placed after the sys.path filtering above
    import time
    import os
    import subprocess
    import pandas as pd
except ModuleNotFoundError:
    print("ERROR: it seems like none of your sys.path paths contains the TEspeX_deps one...")
    print("Did you forget to activate TEspeX_deps environment through source activate TEspeX_deps?")
    sys.exit(1)
__version__ = 'part of TEspeX v2.0.1'


# 1.
# define the help function
def help():
    """Parse the command line and return the validated inputs.

    Reads ``--wd`` (working directory of wrapper.py) and ``--job`` (number of
    jobs) from ``sys.argv``.

    Returns:
        tuple: ``(wd, njob)`` where ``wd`` is the absolute path given with
        ``--wd`` and ``njob`` is the integer given with ``--job``.

    Exits:
        With status 1 (after printing an error) when ``--wd`` does not exist
        or is not a directory. argparse itself terminates the program when the
        arguments are malformed or when ``--job`` is smaller than 1.
    """
    # NOTE: rebinding the module-level name ``dir`` is a legacy side effect kept
    # for backward compatibility; callers should rely on the return value.
    global dir
    parser = argparse.ArgumentParser()
    # create argument list
    parser.add_argument('--wd', type=str, help='wrapper.py working directory (--out parameter of wrapper.py)', required=True)
    parser.add_argument('--job', type=int, help='number of jobs (--job parameter of wrapper.py)', required=True)
    parser.add_argument('--version', action='version', version='%(prog)s ' + __version__, help='show the version number and exit')
    # create arguments
    arg = parser.parse_args()
    dir = os.path.abspath(arg.wd)
    njob = arg.job
    # at least one job directory is needed to have something to merge
    if njob < 1:
        parser.error("--job must be a positive integer")
    # check that the working directory exists; exit with a non-zero status so
    # that calling scripts can detect the failure
    if not os.path.exists(dir):
        print("ERROR!\n%s: no such file or directory" % (dir))
        sys.exit(1)
    # the path is later used with os.chdir(), so a regular file is not acceptable
    if not os.path.isdir(dir):
        print("ERROR!\n%s: not a directory" % (dir))
        sys.exit(1)
    return dir, njob
# 2.
# this function takes as input a string containing a shell command and executes it
def bash(command):
    """Run a shell command and abort the script if it fails.

    Args:
        command: the command line, given as a single string that is
            interpreted by the shell (globs and redirections are expanded).

    Returns:
        None. The output of a successful command is captured and discarded.

    Exits:
        With status 1 when the command returns a non-zero code, after printing
        the command together with its captured stderr and stdout.
    """
    print("executing: %s" % (command))
    proc = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if proc.returncode != 0:
        print("Error in", command)
        # errors="replace": reporting a failure must not itself fail on
        # output that is not valid UTF-8
        print(proc.stderr.decode("UTF-8", errors="replace"))
        print(proc.stdout.decode("UTF-8", errors="replace"))
        # non-zero status so that the failure is visible to the caller
        sys.exit(1)
# 3. start to clean!
def clean(dir, jobs):
    """Collect the per-job outputs found under ``dir`` and write merged tables.

    For every job index ``i`` in ``range(jobs)`` the directory ``<dir>/<i>`` is
    processed as follows:

    * ``outfile.txt``, ``mapping_stats.txt`` and ``Log.file.out`` are moved to
      ``<dir>/tmp`` with the job index appended to their name;
    * every remaining non-hidden entry is moved to ``<dir>/mappings``;
    * the job directory is removed.

    The ``mapping_stats`` tables are concatenated, sorted by their ``SRR``
    column and written to ``<dir>/mapping_stats_total.txt``. The ``outfile``
    tables are merged on their ``TE`` column and written to
    ``<dir>/outfile_total.txt`` with ``TE`` first and the other columns in
    sorted order. Entries of ``dir`` matching ``*job*`` or ``sample*txt`` are
    moved to ``<dir>/tmp``.

    All filesystem work is done with Python calls rather than shell command
    strings, so working directories containing spaces or shell metacharacters
    are handled correctly.

    Args:
        dir: working directory that contains the numbered job directories.
        jobs: number of job directories (named ``0`` .. ``jobs - 1``).

    Raises:
        ValueError: if ``jobs`` is smaller than 1.
        OSError: if an expected file or directory is missing, or if a move
            would collide with an entry already present in the destination.
        KeyError: if a table lacks the ``TE`` / ``SRR`` column.

    Side effects:
        The process working directory is changed to ``dir``.
    """
    # Function-scope imports: these stdlib modules are needed only here.
    import glob
    import shutil

    n_jobs = int(jobs)
    if n_jobs < 1:
        raise ValueError("jobs must be a positive integer, got %r" % (jobs,))

    wd = os.path.abspath(dir)
    os.chdir(wd)
    tmp_dir = os.path.join(wd, "tmp")
    mappings_dir = os.path.join(wd, "mappings")
    # exist_ok: do not fail when the directories are left over from a previous run
    os.makedirs(tmp_dir, exist_ok=True)
    os.makedirs(mappings_dir, exist_ok=True)

    # (name inside the job directory, name template inside tmp/)
    per_job_files = (
        ("outfile.txt", "outfile_%d.txt"),
        ("mapping_stats.txt", "mapping_stats_%d.txt"),
        ("Log.file.out", "Log.file_%d.out"),
    )

    stats_frames = []
    tot = None
    for i in range(n_jobs):
        job_dir = os.path.join(wd, str(i))
        # move the per-job files to tmp/ adding the number of the job to their name
        for src_name, dst_template in per_job_files:
            shutil.move(os.path.join(job_dir, src_name),
                        os.path.join(tmp_dir, dst_template % i))
        # whatever is left (hidden entries excluded, as with a shell "*") goes to mappings/
        for entry in sorted(os.listdir(job_dir)):
            if not entry.startswith("."):
                shutil.move(os.path.join(job_dir, entry), mappings_dir)
        shutil.rmtree(job_dir)

        # mapping statistics: collected here, concatenated once after the loop
        stats_frames.append(
            pd.read_csv(os.path.join(tmp_dir, "mapping_stats_%d.txt" % i), header=0, sep='\t'))
        # counts: the first table provides the TE column every table is merged on
        counts = pd.read_csv(os.path.join(tmp_dir, "outfile_%d.txt" % i), header=0, sep='\t')
        if tot is None:
            tot = counts[["TE"]]
        tot = pd.merge(tot, counts, on="TE")

    # move the job and sample files to tmp/; a pattern without matches is not
    # an error, so their absence cannot prevent the merged tables being written
    for pattern in ("*job*", "sample*txt"):
        for name in sorted(glob.glob(pattern)):
            shutil.move(name, tmp_dir)

    # sort mapping statistics and write them to output
    stats = pd.concat(stats_frames, sort=False).sort_values(by=["SRR"])
    stats.to_csv(os.path.join(wd, "mapping_stats_total.txt"), sep='\t', header=True, index=False)

    # sort the count columns, keeping TE as the first one
    ordered = ["TE"] + sorted(col for col in tot.columns if col != "TE")
    tot[ordered].to_csv(os.path.join(wd, "outfile_total.txt"), sep='\t', header=True, index=False)
# main
def main():
    """Parse the command line, enter the working directory and merge the job outputs."""
    workdir, n_jobs = help()
    os.chdir(workdir)
    clean(workdir, n_jobs)


###################################### MAIN
if __name__ == "__main__":
    main()