-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathqfomam.cfg
164 lines (120 loc) · 5.32 KB
/
qfomam.cfg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
### Performance and UI options ###
# number of parallel threads to run
threads: 15
# whether to show the tqdm progress bar
progressbar: false
### DB, API and Files options ###
# get gc groups with an Oracle DB SQL query (when false, data is read from a tsv file)
gc_from_sql: true
# retrieve sequences with Oracle DB SQL queries (when false, data is fetched via the protein API)
seq_from_sql: true
# when sequence data is fetched via the protein API, cache the files
cache_sequences_flag: true
# store and cache alignments
cache_alignments_flag: true
# store and cache trees
cache_trees_flag: true
# create pdf files for suggestions
create_pdf_files_flag: true
# where the local copy of the panther data is stored
panther_data_dir: 'PANTHER_Sequence_Classification_files/'
# directory name where the sequences cache can be stored
fasta_dir: 'fasta'
# directory name where the output n_data will be written
n_data_dir: 'n_data'
# directory name where output alignments will optionally be written
aln_data_dir: 'aln_data'
# directory name where trees will optionally be written
tree_data_dir: 'tree_data'
# directory name where pdf files for suggestions will optionally be written
pdf_data_dir: 'pdf_data'
# directory name in which the 0-sized semaphore files are created
semaphores_dir: 'processed'
### DATA version options ###
# PANTHER version identifier
panther_version: 'PTHR17.0'
# UniProt release version
# NOTE: keep the quotes; an unquoted 2022_05 may be read as an integer by YAML 1.1 loaders
up_release: '2022_05'
### Parameters for the analysis ###
# minimum number of different organisms needed to build an alignment
min_taxa_threshold: 3
# number of different organisms that should be in the low-cost clade for acceptance
# (used when the tree has more taxa than this; otherwise min_taxa_threshold is used)
taxa_threshold: 3
# exclude tree_to_ndata solutions with costs higher than this (default=0.01)
tree_max_cost: 0.05
# improvement required to drop a taxon
tree_drop_fact: 1.5
# set to true to work at superfamily level
superfamily_level: false
# whether refseq sequences should be added to orthogroups
add_refseq: false
# skip printing when the suggestion's length and the canonical's length differ by less than this amount
seqlen_difference_threshold: 5
# parameters used when scanning n_data2 files to select suggestions:
# minimum cost difference for individual taxa
min_delta_threshold: 0.005
# minimum cost difference for individual sp taxa
min_delta_sp_threshold: 0.02
# cost difference
suggestion_score_difference: -0.001
# minimum number of taxa
suggestion_taxa_threshold: 3
# minimum number of canonicals
suggestion_min_canon: 1
# max clade cost
suggestion_max_clade_cost: 0.02
# only consider suggestions with zero cost
suggestion_only_zero_cost: false
# weights used for the suggestions scoring function:
# (each weight must stay indented under suggestion_ranking_weights; at column 0
# the entries become top-level keys and the mapping itself is read as null)
suggestion_ranking_weights:
  n_sp: 16.0
  n_tax: 4.0  # was 0.5
  n_canon: 0.0  # was 8
  wn_canon: 12.0
  scaled_prop_f: 4.0
  scaled_p_cost: 8.0
  p_len_diff: 1.0
# set to true to skip (re)processing of orthogroups; useful when you only want to re-score existing n_data files
skip_reprocessing_orthogroups: false
### OUTLIERS options ###
# remove sequences that have been flagged as outliers
remove_outliers: true
# std/mean based outlier detection:
# outliers are sequences whose length is under outliers_detection_threshold_std_lo or beyond
# outliers_detection_threshold_std_hi standard deviations from the mean of sequence lengths
detect_outliers_with_mean_std: false
outliers_detection_threshold_std_lo: 2
outliers_detection_threshold_std_hi: 2
# median based outlier detection:
# outliers are sequences with seqlen under outliers_detection_threshold_median_lo or beyond
# outliers_detection_threshold_median_hi times the median
detect_outliers_with_median: false
outliers_detection_threshold_median_lo: 0.75
outliers_detection_threshold_median_hi: 2
# quartile based outlier detection:
# outliers are sequences with seqlen under outliers_detection_threshold_quart_lo times Q1 or beyond
# outliers_detection_threshold_quart_hi times the Q3 value
detect_outliers_with_quart: false
outliers_detection_threshold_quart_lo: 0.5
outliers_detection_threshold_quart_hi: 2
# canonical seqlen outlier detection:
# outliers are sequences with seqlen under outliers_detection_threshold_can_lo times the can_min_len
# (min seqlen of canonicals in group) or beyond outliers_detection_threshold_can_hi times the
# can_max_len (max seqlen of canonicals in group)
outliers_detection_threshold_can_lo: 0.5
outliers_detection_threshold_can_hi: 2
# median + canonical seqlen outlier detection:
detect_outliers_with_median_and_can_lengths: false
# quartile + canonical seqlen outlier detection:
detect_outliers_with_quart_and_can_lengths: true
### dataframe reading and caching ###
# dump the data into output files marked with the given outstamp
dump_orthogroup_data: false
outstamp: '240304'
# read the cached dataframe from previously dumped files, marked with the given instamp
use_cached_orthogroup_data: true
instamp: '221110'
### SIMULATION ###
# when a filename is specified, an attempt is made to read it in order to simulate the
# genecentric application of ortho2tree using previously created suggestions
sugg_file: false
### ORGANISMS definition ###
# organisms are the names according to the panther sequence classification files
# oscodes are the names according to uniprot (mnemonic species identification code of at most 5 alphanumeric characters)
# use panther organism names as values in the following dict, with tax_id as keys
# (entries must stay indented under tax2org; at column 0 they become top-level keys
# and tax2org itself is read as null)
tax2org:
  9606: human
  10090: mouse
  10116: rat
  9913: cow
  9615: dog
  13616: opossum
  9595: gorilla
  9598: chimpanzee