From c2e9daeac978ba430b7682bde9fe706f3baa9d9c Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 17 Dec 2024 14:50:45 +0100 Subject: [PATCH] wip --- taca/analysis/analysis_nanopore.py | 6 +- taca/nanopore/ONT_run_classes.py | 95 +++++++++++++++++++++++++----- 2 files changed, 83 insertions(+), 18 deletions(-) diff --git a/taca/analysis/analysis_nanopore.py b/taca/analysis/analysis_nanopore.py index 740bafee..7f0a05cc 100644 --- a/taca/analysis/analysis_nanopore.py +++ b/taca/analysis/analysis_nanopore.py @@ -91,7 +91,7 @@ def process_user_run(ont_user_run: ONT_user_run): # Update StatusDB logger.info(f"{ont_user_run.run_name}: Updating StatusDB...") - ont_user_run.update_db_entry() + ont_user_run.update_db_runs_entry() # Copy HTML report logger.info(f"{ont_user_run.run_name}: Putting HTML report on GenStat...") @@ -166,7 +166,7 @@ def process_qc_run(ont_qc_run: ONT_qc_run): # Update StatusDB logger.info(f"{ont_qc_run.run_name}: Updating StatusDB...") - ont_qc_run.update_db_entry() + ont_qc_run.update_db_runs_entry() # Copy HTML report logger.info(f"{ont_qc_run.run_name}: Putting HTML report on GenStat...") @@ -320,4 +320,4 @@ def ont_updatedb(run_abspath: str): logger.info( f"{ont_run.run_name}: Manually updating StatusDB, ignoring run status..." ) - ont_run.update_db_entry(force_update=True) + ont_run.update_db_runs_entry(force_update=True) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 8446763c..93c219ef 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -157,7 +157,7 @@ def touch_db_entry(self): else: logger.info(f"{self.run_name}: Database entry already exists, skipping.") - def update_db_entry(self, force_update=False): + def update_db_runs_entry(self, force_update=False): """Check run vs statusdb. 
Create or update run entry.""" # If no run document exists in the database, create an ongoing run document @@ -201,6 +201,66 @@ def update_db_entry(self, force_update=False): f"Run {self.run_name} exists in the database as an finished run, do nothing." ) + def update_db_proj_entry(self): + """Add sample info to project summary document. Requires a way to identify the project ID.""" + + # Start by trying to identify a project based on the info available in the rundir + project_id_matches = [] + project_name_matches = [] + + project_id_pattern = r"[Pp]\d{5,7}" + project_name_pattern = r"[A-Z]?[_-]?[a-z]+|_\d{2}_\d{2}[^A-Za-z0-9]" + + # Check MinKNOW experiment name and MinKNOW sample names for project names / IDs + for dirname in [self.experiment_name, self.sample_name]: + project_id_matches.extend(re.findall(project_id_pattern, dirname)) + project_name_matches.extend(re.findall(project_name_pattern, dirname)) + + # Check MinKNOW samplesheet for project IDs + samplesheet_path = self.get_samplesheet_path() + if samplesheet_path: + ss_df = pd.read_csv(samplesheet_path) + if "barcode" in ss_df.columns: + ss_barcodes = list(ss_df["barcode"].unique()) + ss_barcodes.sort() + barcode_nums = [int(bc[-2:]) for bc in ss_barcodes] + # If barcodes are numbered sequentially, write arg as range + if barcode_nums == list(range(1, len(barcode_nums) + 1)): + barcodes_arg = f"{ss_barcodes[0]}:{ss_barcodes[-1]}" + else: + barcodes_arg = ":".join(ss_barcodes) + else: + ss_barcodes = None + + # Assert we found no ambiguity in identifying project ID or name + if len(set(project_id_matches)) <= 1: + project_id = project_id_matches[0] if project_id_matches else None + else: + raise AssertionError("Multiple project IDs identified in run dir.") + + if len(set(project_name_matches)) <= 1: + project_name = project_name_matches[0] if project_name_matches else None + else: + raise AssertionError("Multiple project names identified in run dir.") + + # Get document IDs + doc_id_from_project_id = ( + 
self.db_proj.id_view[project_id] if project_id else None + ) + doc_id_from_project_name = ( + self.db_proj.name_view[project_name] if project_name else None + ) + + # Assert we have a single document ID + if doc_id_from_project_id and doc_id_from_project_name: + assert ( + doc_id_from_project_id == doc_id_from_project_name + ), f"Extracted project ID '{project_id}' and name '{project_name}' do not match." + doc_id = doc_id_from_project_id + elif not doc_id_from_project_id and not doc_id_from_project_name: + raise AssertionError("No project ID or name found from rundir.") + doc_id = doc_id_from_project_id or doc_id_from_project_name + def parse_pore_activity(self, db_update): logger.info(f"{self.run_name}: Parsing pore activity...") @@ -353,6 +413,19 @@ def copy_html_report(self): logger.error(msg) raise RsyncError(msg) + def get_samplesheet_path(self): + # Load samplesheet, if any + ss_glob = glob.glob(f"{self.run_abspath}/sample_sheet*.csv") + if len(ss_glob) == 0: + samplesheet = None + elif len(ss_glob) > 1: + # If multiple samplesheet, use latest one + samplesheet = sorted(ss_glob)[-1] + else: + samplesheet = ss_glob[0] + + return samplesheet + def toulligqc_report(self): """Generate a QC report for the run using ToulligQC and publish it to GenStat.""" @@ -365,6 +438,8 @@ def toulligqc_report(self): ) return None + samplesheet_path = self.get_samplesheet_path() + # Get sequencing summary file glob_summary = glob.glob(f"{self.run_abspath}/sequencing_summary*.txt") assert len(glob_summary) == 1, f"Found {len(glob_summary)} summary files" @@ -386,16 +461,6 @@ def toulligqc_report(self): raise AssertionError(f"No seq data found in {self.run_abspath}") raw_data_format = "pod5" if "pod5" in raw_data_dir else "fast5" - # Load samplesheet, if any - ss_glob = glob.glob(f"{self.run_abspath}/sample_sheet*.csv") - if len(ss_glob) == 0: - samplesheet = None - elif len(ss_glob) > 1: - # If multiple samplesheet, use latest one - samplesheet = ss_glob.sort()[-1] - else: - 
samplesheet = ss_glob[0] - # Run has barcode subdirs barcode_dirs_glob = glob.glob(f"{self.run_abspath}/{raw_data_dir}/barcode*") if len(barcode_dirs_glob) > 0: @@ -406,8 +471,8 @@ def toulligqc_report(self): raw_data_path = f"{self.run_abspath}/{raw_data_dir}" # Determine barcodes - if samplesheet: - ss_df = pd.read_csv(samplesheet) + if samplesheet_path: + ss_df = pd.read_csv(samplesheet_path) if "barcode" in ss_df.columns: ss_barcodes = list(ss_df["barcode"].unique()) ss_barcodes.sort() @@ -428,8 +493,8 @@ def toulligqc_report(self): } if barcode_dirs: command_args["--barcoding"] = "" - if samplesheet and ss_barcodes: - command_args["--samplesheet"] = samplesheet + if samplesheet_path and ss_barcodes: + command_args["--samplesheet"] = samplesheet_path command_args["--barcodes"] = barcodes_arg # Build command list