-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgffBecome_conc.py
executable file
·56 lines (49 loc) · 1.71 KB
/
gffBecome_conc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import pandas as pd
import argparse
def parse_arguments(argv=None):
    """Parse the command-line arguments of the script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point
            relies on.

    Returns:
        argparse.Namespace: Namespace with two string attributes,
        ``gff_file`` (input path) and ``output`` (output path).
    """
    parser = argparse.ArgumentParser(
        description=(
            'Shift the feature coordinates of a multi-contig GFF file so '
            'that the contigs form one concatenated sequence.'
        ),
    )
    parser.add_argument(
        'gff_file',
        type=str,
        help='path to the input GFF file (with ##sequence-region header lines)',
    )
    parser.add_argument(
        'output',
        type=str,
        help='path of the GFF file to write',
    )
    return parser.parse_args(argv)
def merge_gff(path_to_gff, output_path):
    """Shift GFF feature coordinates into one concatenated coordinate space.

    Every ``##sequence-region <name> <start> <end>`` line in the leading
    header contributes its ``<end>`` value to a running total; the offset of
    a contig is the total of all contigs declared before it. That offset is
    added to the start (4th column) and end (5th column) of each feature on
    the contig. The leading ``#`` lines are copied unchanged to the top of
    the output, followed by the features sorted by the first integer found
    in the contig name and then by start coordinate.

    Args:
        path_to_gff: Path of the tab-separated GFF file to read.
        output_path: Path of the file to write; it is opened in ``'w'`` mode.

    Raises:
        ValueError: If a contig name contains no digits (it cannot be
            ordered), or if a feature refers to a contig that has no
            ``##sequence-region`` line in the header.
    """
    # Read the contig-name column as text. With type inference, purely
    # numeric names ("1", "2") become integers, which breaks the ``.str``
    # accessor and can never match the string names parsed from the header.
    gff = pd.read_csv(path_to_gff, sep='\t', comment='#', header=None,
                      dtype={0: str})

    # Sort features by the first run of digits in the contig name, then start.
    contig_number = gff[0].str.extract(r'(\d+)', expand=False)
    unnumbered = contig_number.isna()
    if unnumbered.any():
        names = sorted(gff.loc[unnumbered, 0].astype(str).unique())
        raise ValueError(
            'contig name(s) without a number: {}'.format(', '.join(names))
        )
    gff['contig_number'] = contig_number.astype(int)
    gff = gff.sort_values(['contig_number', 3])

    # Walk the leading header only: keep every comment line verbatim and
    # record, for each declared contig, the summed length of those before it.
    region_pattern = re.compile(r'##sequence-region (\S+) \d+ (\d+)')
    header_lines = []
    contig_offsets = {}
    running_length = 0
    with open(path_to_gff) as handle:
        for line in handle:
            # A header line starts with '#'; a '#' elsewhere (e.g. inside an
            # attribute value) does not make a feature line part of the header.
            if not line.startswith('#'):
                break
            header_lines.append(line)
            match = region_pattern.match(line)
            if match:
                contig_offsets[match.group(1)] = running_length
                running_length += int(match.group(2))

    # Refuse to write NaN coordinates for contigs the header never declared.
    offsets = gff[0].map(contig_offsets)
    undeclared = offsets.isna()
    if undeclared.any():
        names = sorted(gff.loc[undeclared, 0].astype(str).unique())
        raise ValueError(
            'no ##sequence-region header line for contig(s): {}'.format(
                ', '.join(names)
            )
        )
    offsets = offsets.astype(int)

    # Move start and end into the concatenated coordinate space.
    gff[3] += offsets
    gff[4] += offsets

    gff = gff.drop(columns=['contig_number'])
    with open(output_path, 'w') as out:
        out.write(''.join(header_lines))
        gff.to_csv(out, sep='\t', index=False, header=False)
def main():
    """Script entry point: read the CLI arguments and run the GFF merge."""
    cli = parse_arguments()
    # Positional arguments: input GFF path first, output path second.
    merge_gff(cli.gff_file, cli.output)


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()