-
Notifications
You must be signed in to change notification settings - Fork 0
/
data-extraction.py
64 lines (49 loc) · 2.29 KB
/
data-extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import re
import csv
from collections import defaultdict
# Path to the SEED file
seed_file = r"C:/Users/gagou/Desktop/Data science and info/lebrun/pfam_transformer_project/Pfam-A.seed/Pfam-A.seed"
# Dictionary to store the sequences for each family
family_sequences = defaultdict(list)
# Regular expressions to match the family description and sequence count
family_re = re.compile(r"#=GF DE\s+([^\n]+)")
sequence_count_re = re.compile(r"#=GF SQ\s+(\d+)")
pattern = re.compile(r'^([A-Z0-9_]+/\d+-\d+)\s+([A-Z.]+)$')
# Variables to store the current family and sequence count
current_family = None
nb_sequences = 0
list_of_fams = []
# Read the SEED file line by line, specifying the encoding as UTF-8
with open(seed_file, "r", encoding="utf-8") as file:
for line in file:
line.strip()
# If this line contains a family description, update the current family
family_match = family_re.match(line)
if family_match:
current_family = family_match.group(1)
#print(current_family)
# If this line contains the sequence count, update the sequence count
sequence_count_match = sequence_count_re.match(line)
if sequence_count_match:
nb_sequences = int(sequence_count_match.group(1))
if nb_sequences>1:
list_of_fams.append(current_family)
#print(current_family, nb_sequences)
if current_family in list_of_fams:
sequence_match = pattern.match(line)
if sequence_match:
current_sequence = sequence_match.group(2)
family_sequences[current_family].append(current_sequence)
#print(current_sequence)
#print(nb_sequences)
print(f"Kept {sum(len(sequences) for sequences in family_sequences.values())} sequences in {len(family_sequences)} families")
# Save the filtered families to a CSV file
'''
output_file = r"C:/Users/gagou/Desktop/Data science and info/lebrun/pfam_transformer_project/filtered_families.csv"
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
writer = csv.writer(csvfile)
writer.writerow(["sequence", "family"])
for family, sequences in family_sequences.items():
for sequence in sequences:
writer.writerow([sequence, family])
'''