Skip to content

Commit

Permalink
Directories - Rework
Browse files Browse the repository at this point in the history
  • Loading branch information
frapercan committed Feb 3, 2025
1 parent c6f3e7a commit 6125c11
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 39 deletions.
122 changes: 89 additions & 33 deletions fantasia/config.yaml
Original file line number Diff line number Diff line change
@@ -1,62 +1,118 @@
#System
# ==========================
# 🌍 Global Configuration
# ==========================

# Maximum number of worker threads for parallel processing.
max_workers: 1

# Path to the system constants file.
constants: "./fantasia/constants.yaml"

# Monitoring interval in seconds (for processes that require periodic checks).
monitor_interval: 5

# Postgres CONFIGURATION
DB_USERNAME: usuario
DB_PASSWORD: clave
DB_HOST: localhost
DB_PORT: 5432
DB_NAME: BioData

# Rabbitmq CONFIGURATION
rabbitmq_host: localhost
rabbitmq_user: guest
rabbitmq_password: guest
# ==========================
# 🗄️ Database (PostgreSQL)
# ==========================

# Credentials and configuration for connecting to the PostgreSQL database.
DB_USERNAME: usuario # Database username.
DB_PASSWORD: clave # Database password.
DB_HOST: localhost # Host where the database server is running.
DB_PORT: 5432 # Port used for the database connection.
DB_NAME: BioData # Name of the database to use.


# ==========================
# 📨 Message Queue (RabbitMQ)
# ==========================

# Configuration for the RabbitMQ message broker.
rabbitmq_host: localhost # RabbitMQ server hostname.
rabbitmq_user: guest # RabbitMQ username for authentication.
rabbitmq_password: guest # RabbitMQ password for authentication.

# Database dump source for information system

# ==========================
# 🔄 Data Source Configuration
# ==========================

# URL to download the embeddings database dump.
embeddings_url: "https://zenodo.org/records/14546346/files/embeddings.tar?download=1"


embeddings_path: ~/fantasia/dumps/
fantasia_output_h5: ~/fantasia/embeddings/
fantasia_output_csv: ~/fantasia/results/
redundancy_temp: ~/fantasia/redundancy_temp/
# ==========================
# 📂 Directory Configuration
# ==========================

base_directory: ~/fantasia/

directories:
embeddings: ~/fantasia/dumps
hdf5_outputs: ~/fantasia/embeddings
csv_outputs: ~/fantasia/results
redundancy_temp: ~/fantasia/redundancy_temp


# Configuration Parameters
# ==========================
# 🔬 Pipeline Configuration
# ==========================

# Path to the input FASTA file for protein sequences.
fantasia_input_fasta: data_sample/worm_test.fasta

# Reference tag used for lookup operations.
lookup_reference_tag: GOA2022

# Maximum number of entries to process.
limit_per_entry: 100

# Prefix for output file names.
fantasia_prefix: worm_test_Prot_100_1.2

# Threshold for sequence length filtering.
length_filter: 5000000

# Threshold for redundancy filtering.
redundancy_filter: 0

# Number of sequences to package in each queue batch.
sequence_queue_package: 64


# ==========================
# 🧬 Embedding Configuration
# ==========================

embedding:
# List of embedding models to use. The numbers correspond to:
# 1 - ESM (Evolutionary Scale Modeling)
# 2 - Prost (Protein Structural Transformer)
# 3 - Prot (Protein Language Model)
types:
# - 1 # ESM
# - 2 # Prost
- 1 # ESM
- 2 # Prost
- 3 # Prot
distance_threshold:
1: 1 # Umbral para ESM
2: 1 # Umbral para Prost
3: 1.2 # Umbral para Prot
batch_size:
1: 1
2: 1
3: 1




topgo?: True


# Distance threshold values for each embedding model.
# This determines how close two embeddings must be to be considered similar.
distance_threshold:
1: 1.0 # Threshold for ESM
2: 1.0 # Threshold for Prost
3: 1.2 # Threshold for Prot

# Batch size for processing embeddings.
# Controls how many sequences are processed at once for each embedding model.
batch_size:
1: 1 # Batch size for ESM
2: 1 # Batch size for Prost
3: 1 # Batch size for Prot


# ==========================
# 🧠 Functional Analysis
# ==========================

# Enable or disable the use of TopGO for Gene Ontology enrichment analysis.
topgo?: true
18 changes: 18 additions & 0 deletions fantasia/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,21 @@ def wait_forever():
print("Stopping container.")


def setup_directories(conf):
    """
    Create every directory listed under ``conf["directories"]``.

    Each configured path has ``~`` expanded before it is created, and
    directories that already exist are left untouched. The values stored in
    ``conf`` are never rewritten, so callers still see the paths exactly as
    they were configured. Only the ``directories`` mapping is consulted;
    ``base_directory`` is not used to build any path.

    Args:
        conf: Configuration mapping. May contain a ``directories`` mapping of
            logical name -> filesystem path. A missing or empty (``None``)
            section means there is nothing to create.

    Returns:
        The same ``conf`` object, unmodified.

    Raises:
        OSError: If a directory cannot be created.
    """
    # An empty `directories:` key is loaded from YAML as None, so fall back
    # to an empty mapping instead of calling `.values()` on None.
    directories = conf.get("directories") or {}

    for path in directories.values():
        os.makedirs(os.path.expanduser(path), exist_ok=True)

    return conf




if __name__ == "__main__":
parser = argparse.ArgumentParser(description="fantasia: Command Handler")
parser.add_argument("command", type=str, nargs="?", default=None, help="Command to execute: initialize or run")
Expand Down Expand Up @@ -87,6 +102,9 @@ def wait_forever():

# Read the configuration only once
conf = read_yaml_config(args.config)
conf = setup_directories(conf)

print(conf)

# Override parameters with the values given on the CLI
if args.fasta:
Expand Down
10 changes: 6 additions & 4 deletions fantasia/src/embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,17 @@ def __init__(self, conf, current_date):
self.base_module_path = 'protein_metamorphisms_is.operation.embedding.proccess.sequence'
self.fetch_models_info()
self.sequence_queue_package = conf.get('sequence_queue_package')
self.batch_sizes = conf['embedding'].get('batch_size', {}) # Store batch sizes as a dict
self.fasta_path = conf.get('fantasia_input_fasta')
self.output_csv = conf.get("fantasia_output_csv")
self.batch_sizes = conf['embedding'].get('batch_size', {}) # Store batch sizes as a dict
self.length_filter = conf.get('length_filter', None)

self.fasta_path = conf.get('fantasia_input_fasta')
self.output_csv = conf["directories"]["csv_outputs"]
self.output_h5 = os.path.join(
conf.get("fantasia_output_h5"),
conf["directories"]["hdf5_outputs"],
f"{conf.get('fantasia_prefix', 'default')}_embeddings_{self.current_date}.h5"
)


self.results = []


Expand Down
6 changes: 4 additions & 2 deletions fantasia/src/lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,14 @@ def __init__(self, conf, current_date):
super().__init__(conf)
self.current_date = current_date
self.logger.info("EmbeddingLookUp initialized")

# Use the paths from conf
self.h5_path = os.path.join(
conf.get("fantasia_output_h5"),
conf["directories"]["hdf5_outputs"],
f"{conf.get('fantasia_prefix', 'default')}_embeddings_{self.current_date}.h5"
)
self.output_csv = os.path.join(
conf.get("fantasia_output_csv"),
conf["directories"]["csv_outputs"],
f"{conf.get('fantasia_prefix', 'default')}_results_{self.current_date}.csv"
)
self.limit_per_entry = self.conf.get("limit_per_entry", 200)
Expand Down

0 comments on commit 6125c11

Please sign in to comment.