# qfomam.cfg
# (The file title is kept as a comment: a bare scalar on the first line
# followed by key/value pairs is not a valid YAML document.)

### Performance and UI options ###

# Number of parallel threads to use.
threads: 15

# Show a tqdm progress bar?
progressbar: false

### DB, API and Files options ###

# Fetch gc groups with an Oracle DB SQL query?
# (if false, the data is read from a TSV file)
gc_from_sql: true

# Fetch sequences with Oracle DB SQL queries?
# (if false, the data is retrieved via the protein API)
seq_from_sql: true

# When sequence data comes from the protein API, cache the files?
cache_sequences_flag: true

# Store and cache alignments?
cache_alignments_flag: true

# Store and cache trees?
cache_trees_flag: true

# Create PDF files for suggestions?
create_pdf_files_flag: true

# Where the local copy of the PANTHER data is stored.
panther_data_dir: 'PANTHER_Sequence_Classification_files/'

# Directory name where the sequence cache can be stored.
fasta_dir: 'fasta'

# Directory name where the output n_data will be written.
n_data_dir: 'n_data'

# Directory name where output alignments will optionally be written.
aln_data_dir: 'aln_data'

# Directory name where trees will optionally be written.
tree_data_dir: 'tree_data'

# Directory name where PDF files for suggestions will optionally be written.
pdf_data_dir: 'pdf_data'

# Directory name in which zero-sized semaphore files are created.
semaphores_dir: 'processed'

### DATA version options ###

# PANTHER version to use.
panther_version: 'PTHR17.0'

# UniProt release version to use.
# Keep the quotes: YAML 1.1 loaders such as PyYAML read a bare 2022_05 as the
# integer 202205.
up_release: '2022_05'

### Parameters for the analysis ###

# Minimum number of different organisms required for building an alignment.
min_taxa_threshold: 3

# Number of different organisms that should be in the low-cost clade for
# acceptance. Used when the tree has more taxa than this; otherwise
# min_taxa_threshold is used.
taxa_threshold: 3

# Exclude tree_to_ndata solutions with costs higher than this (default=0.01).
tree_max_cost: 0.05

# Improvement required to drop a taxon.
tree_drop_fact: 1.5

# Set to true to work at superfamily level.
superfamily_level: false

# Add RefSeq sequences to orthogroups?
add_refseq: false

# Skip printing a suggestion when its length and the canonical's length
# differ by less than this amount.
seqlen_difference_threshold: 5

# Parameters used when scanning n_data2 files to select suggestions:
min_delta_threshold: 0.005  # minimum cost difference for individual taxa
min_delta_sp_threshold: 0.02  # minimum cost difference for individual sp taxa
suggestion_score_difference: -0.001  # cost difference
suggestion_taxa_threshold: 3  # minimum number of taxa
suggestion_min_canon: 1  # minimum number of canonicals
suggestion_max_clade_cost: 0.02  # max clade cost
suggestion_only_zero_cost: false  # only consider suggestions with zero cost

# Weights used by the suggestions scoring function:
suggestion_ranking_weights:
  n_sp: 16.0
  n_tax: 4.0  # was 0.5
  n_canon: 0.0  # was 8
  wn_canon: 12.0
  scaled_prop_f: 4.0
  scaled_p_cost: 8.0
  p_len_diff: 1.0

# Set to true to skip (re)processing of orthogroups; useful when you only
# want to re-score existing n_data files.
skip_reprocessing_orthogroups: false

### OUTLIERS options ###

# Remove sequences that have been flagged as outliers?
remove_outliers: true

# Std/mean based outlier detection:
# outliers are sequences whose length is under
# outliers_detection_threshold_std_lo or beyond
# outliers_detection_threshold_std_hi standard deviations from the mean of
# the sequence lengths.
detect_outliers_with_mean_std: false
outliers_detection_threshold_std_lo: 2
outliers_detection_threshold_std_hi: 2

# Median based outlier detection:
# outliers are sequences whose length is under
# outliers_detection_threshold_median_lo times the median or beyond
# outliers_detection_threshold_median_hi times the median.
detect_outliers_with_median: false
outliers_detection_threshold_median_lo: 0.75
outliers_detection_threshold_median_hi: 2

# Quartile based outlier detection:
# outliers are sequences whose length is under
# outliers_detection_threshold_quart_lo times Q1 or beyond
# outliers_detection_threshold_quart_hi times Q3.
detect_outliers_with_quart: false
outliers_detection_threshold_quart_lo: 0.5
outliers_detection_threshold_quart_hi: 2

# Canonical seqlen outlier detection:
# outliers are sequences whose length is under
# outliers_detection_threshold_can_lo times can_min_len (min seqlen of the
# canonicals in the group) or beyond outliers_detection_threshold_can_hi
# times can_max_len (max seqlen of the canonicals in the group).
outliers_detection_threshold_can_lo: 0.5
outliers_detection_threshold_can_hi: 2

# Median + canonical seqlen outlier detection:
detect_outliers_with_median_and_can_lengths: false

# Quartile + canonical seqlen outlier detection:
detect_outliers_with_quart_and_can_lengths: true

### dataframe reading and caching ###

# Dump the data into output files marked with the stamp given in outstamp?
dump_orthogroup_data: false
outstamp: '240304'

# Read the cached dataframe from previously dumped files, marked with the
# stamp given in instamp?
use_cached_orthogroup_data: true
instamp: '221110'

### SIMULATION ###
# If a filename is specified here, the program will attempt to read it in
# order to simulate a gene-centric application of ortho2tree using previously
# created suggestions.
sugg_file: false

### ORGANISMS definition ###
# "organisms" are the names used in the PANTHER sequence classification files.
# "oscodes" are the names used by UniProt (mnemonic species identification
# codes of at most 5 alphanumeric characters).
# In the following dict, use the tax_id as key and the PANTHER organism name
# as value.

tax2org:
    9606: human
    10090: mouse
    10116: rat
    9913: cow
    9615: dog
    13616: opossum
    9595: gorilla
    9598: chimpanzee