Directories - Rework

CBBIO · Feb 3, 2025 · 6125c11 · 6125c11
1 parent c6f3e7a
commit 6125c11
Show file tree

Hide file tree

Showing 4 changed files with 117 additions and 39 deletions.
diff --git a/fantasia/config.yaml b/fantasia/config.yaml
@@ -1,62 +1,118 @@
-#System
+# ==========================
+# 🌍 Global Configuration
+# ==========================
+
+# Maximum number of worker threads for parallel processing.
 max_workers: 1
+
+# Path to the system constants file.
 constants: "./fantasia/constants.yaml"
+
+# Monitoring interval in seconds (for processes that require periodic checks).
 monitor_interval: 5
 
-# Postgres CONFIGURATION
-DB_USERNAME: usuario
-DB_PASSWORD: clave
-DB_HOST: localhost
-DB_PORT: 5432
-DB_NAME: BioData
 
-# Rabbitmq CONFIGURATION
-rabbitmq_host: localhost
-rabbitmq_user: guest
-rabbitmq_password: guest
+# ==========================
+# 🗄️ Database (PostgreSQL)
+# ==========================
+
+# Credentials and configuration for connecting to the PostgreSQL database.
+DB_USERNAME: usuario         # Database username.
+DB_PASSWORD: clave           # Database password.
+DB_HOST: localhost           # Host where the database server is running.
+DB_PORT: 5432                # Port used for the database connection.
+DB_NAME: BioData             # Name of the database to use.
+
+
+# ==========================
+# 📨 Message Queue (RabbitMQ)
+# ==========================
+
+# Configuration for the RabbitMQ message broker.
+rabbitmq_host: localhost     # RabbitMQ server hostname.
+rabbitmq_user: guest         # RabbitMQ username for authentication.
+rabbitmq_password: guest     # RabbitMQ password for authentication.
 
-# Database dump source for information system
+
+# ==========================
+# 🔄 Data Source Configuration
+# ==========================
+
+# URL to download the embeddings database dump.
 embeddings_url: "https://zenodo.org/records/14546346/files/embeddings.tar?download=1"
 
 
-embeddings_path: ~/fantasia/dumps/
-fantasia_output_h5: ~/fantasia/embeddings/
-fantasia_output_csv: ~/fantasia/results/
-redundancy_temp: ~/fantasia/redundancy_temp/
+# ==========================
+# 📂 Directory Configuration
+# ==========================
+
+base_directory: ~/fantasia/
+
+directories:
+  embeddings: ~/fantasia/dumps
+  hdf5_outputs: ~/fantasia/embeddings
+  csv_outputs: ~/fantasia/results
+  redundancy_temp: ~/fantasia/redundancy_temp
 
 
-# Configuration Parameters
+# ==========================
+# 🔬 Pipeline Configuration
+# ==========================
+
+# Path to the input FASTA file for protein sequences.
 fantasia_input_fasta: data_sample/worm_test.fasta
+
+# Reference tag used for lookup operations.
 lookup_reference_tag: GOA2022
 
+# Maximum number of entries to process.
 limit_per_entry: 100
+
+# Prefix for output file names.
 fantasia_prefix: worm_test_Prot_100_1.2
+
+# Threshold for sequence length filtering.
 length_filter: 5000000
+
+# Threshold for redundancy filtering.
 redundancy_filter: 0
 
+# Number of sequences to package in each queue batch.
 sequence_queue_package: 64
 
+
+# ==========================
+# 🧬 Embedding Configuration
+# ==========================
+
 embedding:
+  # List of embedding models to use. The numbers correspond to:
+  # 1 - ESM (Evolutionary Scale Modeling)
+  # 2 - Prost (Protein Structural Transformer)
+  # 3 - Prot (Protein Language Model)
   types:
-#    - 1 # ESM
-#    - 2 # Prost
+    - 1 # ESM
+    - 2 # Prost
     - 3 # Prot
-  distance_threshold:
-    1: 1 # Umbral para ESM
-    2: 1 # Umbral para Prost
-    3: 1.2 # Umbral para Prot
-  batch_size:
-    1: 1
-    2: 1
-    3: 1
-
-
-
-
-topgo?: True
-
 
+  # Distance threshold values for each embedding model.
+  # This determines how close two embeddings must be to be considered similar.
+  distance_threshold:
+    1: 1.0  # Threshold for ESM
+    2: 1.0  # Threshold for Prost
+    3: 1.2  # Threshold for Prot
 
+  # Batch size for processing embeddings.
+  # Controls how many sequences are processed at once for each embedding model.
+  batch_size:
+    1: 1  # Batch size for ESM
+    2: 1  # Batch size for Prost
+    3: 1  # Batch size for Prot
 
 
+# ==========================
+# 🧠 Functional Analysis
+# ==========================
 
+# Enable or disable the use of TopGO for Gene Ontology enrichment analysis.
+topgo?: true
diff --git a/fantasia/main.py b/fantasia/main.py
@@ -59,6 +59,21 @@ def wait_forever():
         print("Stopping container.")
 
 
+def setup_directories(conf):
+    """
+    Create every directory listed under `directories` without modifying the values in `conf`.
+    """
+    base_directory = os.path.expanduser(conf.get("base_directory", "~/fantasia/"))
+
+    for key, path in conf.get("directories", {}).items():
+        full_path = os.path.expanduser(path)
+        os.makedirs(full_path, exist_ok=True)
+
+    return conf  # Solo crea directorios, sin sobrescribir valores
+
+
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="fantasia: Command Handler")
     parser.add_argument("command", type=str, nargs="?", default=None, help="Command to execute: initialize or run")
@@ -87,6 +102,9 @@ def wait_forever():
 
         # Leer la configuración una sola vez
         conf = read_yaml_config(args.config)
+        conf = setup_directories(conf)
+
+        print(conf)
 
         # Sobrescribir parámetros con los valores del CLI
         if args.fasta:

diff --git a/fantasia/src/embedder.py b/fantasia/src/embedder.py
@@ -74,15 +74,17 @@ def __init__(self, conf, current_date):
         self.base_module_path = 'protein_metamorphisms_is.operation.embedding.proccess.sequence'
         self.fetch_models_info()
         self.sequence_queue_package = conf.get('sequence_queue_package')
-        self.batch_sizes = conf['embedding'].get('batch_size', {})  # Store batch sizes as a dict
-        self.fasta_path = conf.get('fantasia_input_fasta')
-        self.output_csv = conf.get("fantasia_output_csv")
+        self.batch_sizes = conf['embedding'].get('batch_size', {})  # Store batch sizes as a dict
         self.length_filter = conf.get('length_filter', None)
+
+        self.fasta_path = conf.get('fantasia_input_fasta')
+        self.output_csv = conf["directories"]["csv_outputs"]
         self.output_h5 = os.path.join(
-            conf.get("fantasia_output_h5"),
+            conf["directories"]["hdf5_outputs"],
             f"{conf.get('fantasia_prefix', 'default')}_embeddings_{self.current_date}.h5"
         )
 
+
         self.results = []
 
 

diff --git a/fantasia/src/lookup.py b/fantasia/src/lookup.py
@@ -67,12 +67,14 @@ def __init__(self, conf, current_date):
         super().__init__(conf)
         self.current_date = current_date
         self.logger.info("EmbeddingLookUp initialized")
+
+        # Usar rutas desde conf
         self.h5_path = os.path.join(
-            conf.get("fantasia_output_h5"),
+            conf["directories"]["hdf5_outputs"],
             f"{conf.get('fantasia_prefix', 'default')}_embeddings_{self.current_date}.h5"
         )
         self.output_csv = os.path.join(
-            conf.get("fantasia_output_csv"),
+            conf["directories"]["csv_outputs"],
             f"{conf.get('fantasia_prefix', 'default')}_results_{self.current_date}.csv"
         )
         self.limit_per_entry = self.conf.get("limit_per_entry", 200)