diff --git a/fantasia/config.yaml b/fantasia/config.yaml index 5fe8b4e..b98316d 100644 --- a/fantasia/config.yaml +++ b/fantasia/config.yaml @@ -1,62 +1,118 @@ -#System +# ========================== +# 🌍 Global Configuration +# ========================== + +# Maximum number of worker threads for parallel processing. max_workers: 1 + +# Path to the system constants file. constants: "./fantasia/constants.yaml" + +# Monitoring interval in seconds (for processes that require periodic checks). monitor_interval: 5 -# Postgres CONFIGURATION -DB_USERNAME: usuario -DB_PASSWORD: clave -DB_HOST: localhost -DB_PORT: 5432 -DB_NAME: BioData -# Rabbitmq CONFIGURATION -rabbitmq_host: localhost -rabbitmq_user: guest -rabbitmq_password: guest +# ========================== +# 🗄️ Database (PostgreSQL) +# ========================== + +# Credentials and configuration for connecting to the PostgreSQL database. +DB_USERNAME: usuario # Database username. +DB_PASSWORD: clave # Database password. +DB_HOST: localhost # Host where the database server is running. +DB_PORT: 5432 # Port used for the database connection. +DB_NAME: BioData # Name of the database to use. + + +# ========================== +# 📨 Message Queue (RabbitMQ) +# ========================== + +# Configuration for the RabbitMQ message broker. +rabbitmq_host: localhost # RabbitMQ server hostname. +rabbitmq_user: guest # RabbitMQ username for authentication. +rabbitmq_password: guest # RabbitMQ password for authentication. -# Database dump source for information system + +# ========================== +# 🔄 Data Source Configuration +# ========================== + +# URL to download the embeddings database dump. 
embeddings_url: "https://zenodo.org/records/14546346/files/embeddings.tar?download=1" -embeddings_path: ~/fantasia/dumps/ -fantasia_output_h5: ~/fantasia/embeddings/ -fantasia_output_csv: ~/fantasia/results/ -redundancy_temp: ~/fantasia/redundancy_temp/ +# ========================== +# 📂 Directory Configuration +# ========================== + +base_directory: ~/fantasia/ + +directories: + embeddings: ~/fantasia/dumps + hdf5_outputs: ~/fantasia/embeddings + csv_outputs: ~/fantasia/results + redundancy_temp: ~/fantasia/redundancy_temp -# Configuration Parameters +# ========================== +# 🔬 Pipeline Configuration +# ========================== + +# Path to the input FASTA file for protein sequences. fantasia_input_fasta: data_sample/worm_test.fasta + +# Reference tag used for lookup operations. lookup_reference_tag: GOA2022 +# Maximum number of entries to process. limit_per_entry: 100 + +# Prefix for output file names. fantasia_prefix: worm_test_Prot_100_1.2 + +# Threshold for sequence length filtering. length_filter: 5000000 + +# Threshold for redundancy filtering. redundancy_filter: 0 +# Number of sequences to package in each queue batch. sequence_queue_package: 64 + +# ========================== +# 🧬 Embedding Configuration +# ========================== + embedding: + # List of embedding models to use. The numbers correspond to: + # 1 - ESM (Evolutionary Scale Modeling) + # 2 - Prost (Protein Structural Transformer) + # 3 - Prot (Protein Language Model) types: -# - 1 # ESM -# - 2 # Prost + - 1 # ESM + - 2 # Prost - 3 # Prot - distance_threshold: - 1: 1 # Umbral para ESM - 2: 1 # Umbral para Prost - 3: 1.2 # Umbral para Prot - batch_size: - 1: 1 - 2: 1 - 3: 1 - - - - -topgo?: True - + # Distance threshold values for each embedding model. + # This determines how close two embeddings must be to be considered similar. 
def setup_directories(conf):
    """
    Create the directories listed under ``conf["directories"]``.

    Each value of the ``directories`` mapping is treated as a filesystem
    path. A leading ``~`` is expanded to the current user's home directory
    and the directory is created together with any missing parents.
    Directories that already exist are left untouched.

    The configuration itself is never modified: the expanded paths are used
    only for directory creation and are not written back into ``conf``.

    Args:
        conf: Configuration mapping. The optional ``directories`` entry maps
            a logical name to a directory path. A missing or empty (``None``)
            entry means there is nothing to create.

    Returns:
        The same ``conf`` object that was passed in, unchanged.

    Raises:
        OSError: If a directory cannot be created (for example because of
            insufficient permissions).
    """
    # An empty `directories:` key in YAML loads as None rather than {},
    # so normalise it before iterating.
    directories = conf.get("directories") or {}

    # Only the paths are needed; the logical names are irrelevant here.
    for path in directories.values():
        os.makedirs(os.path.expanduser(path), exist_ok=True)

    return conf
# Store batch sizes as a dict - self.fasta_path = conf.get('fantasia_input_fasta') - self.output_csv = conf.get("fantasia_output_csv") + self.batch_sizes = conf['embedding'].get('batch_size', {}) # Store batch sizes as a dict self.fasta_path = conf.get('fantasia_input_fasta') self.length_filter = conf.get('length_filter', None) + + self.fasta_path = conf.get('fantasia_input_fasta') + self.output_csv = conf["directories"]["csv_outputs"] self.output_h5 = os.path.join( - conf.get("fantasia_output_h5"), + conf["directories"]["hdf5_outputs"], f"{conf.get('fantasia_prefix', 'default')}_embeddings_{self.current_date}.h5" ) + self.results = [] diff --git a/fantasia/src/lookup.py b/fantasia/src/lookup.py index d7cbb6c..572f359 100644 --- a/fantasia/src/lookup.py +++ b/fantasia/src/lookup.py @@ -67,12 +67,14 @@ def __init__(self, conf, current_date): super().__init__(conf) self.current_date = current_date self.logger.info("EmbeddingLookUp initialized") + + # Usar rutas desde conf self.h5_path = os.path.join( - conf.get("fantasia_output_h5"), + conf["directories"]["hdf5_outputs"], f"{conf.get('fantasia_prefix', 'default')}_embeddings_{self.current_date}.h5" ) self.output_csv = os.path.join( - conf.get("fantasia_output_csv"), + conf["directories"]["csv_outputs"], f"{conf.get('fantasia_prefix', 'default')}_results_{self.current_date}.csv" ) self.limit_per_entry = self.conf.get("limit_per_entry", 200)