Merge pull request #113 from cokelaer/main

implement --exclude-pattern option
sequana · Oct 31, 2024 · 4c9eb26 · 4c9eb26
2 parents bda77e2 + b38b14b
commit 4c9eb26
Show file tree

Hide file tree

Showing 6 changed files with 52 additions and 9 deletions.
diff --git a/README.rst b/README.rst
@@ -313,6 +313,8 @@ Changelog
 ========= ======================================================================
 Version   Description
 ========= ======================================================================
+1.1.0     * add exclude_pattern in input data section
+1.0.6     * add py3.12, slight updates wrt slurm
 1.0.5     * introspect slurm files to extract stats
 1.0.4     * add utility function to download and untar a tar.gz file
 1.0.3     * add levenshtein function. some typo corrections.

diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api"
 #maintainer ?#maintainer email
 [tool.poetry]
 name = "sequana_pipetools"
-version = "1.0.6"
+version = "1.1.0"
 description = "A set of tools to help building or using Sequana pipelines"
 authors = ["Sequana Team"]
 license = "BSD-3"

diff --git a/sequana_pipetools/options.py b/sequana_pipetools/options.py
@@ -215,7 +215,9 @@ def deps_callback(ctx, param, value):
             data = fin.read()
         data = data.split()
         data = "\n".join(sorted(data))
-        click.echo(f"Those software will be required for the pipeline to work correctly:\n\n{data}\n")
+        click.echo(
+            f"sequana_{ctx.NAME} will need one or more of these software to work correctly. We recommend you to use --apptainer option so that you do not need to install them manually:\n\n{data}\n"
+        )
         ctx.exit(0)
 
 
@@ -302,11 +304,17 @@ def __init__(self, working_directory="analysis", caller=None):
 
 class ClickInputOptions:
     group_name = "Data"
-    metadata = {"name": group_name, "options": ["--input-directory", "--input-pattern", "--input-readtag"]}
+    metadata = {
+        "name": group_name,
+        "options": ["--input-directory", "--input-pattern", "--input-readtag", "--exclude-pattern"],
+    }
 
-    def __init__(self, input_directory=".", input_pattern="*fastq.gz", add_input_readtag=True, caller=None):
+    def __init__(
+        self, input_directory=".", input_pattern="*fastq.gz", add_input_readtag=True, caller=None, exclude_pattern=None
+    ):
         self.input_directory = input_directory
         self.input_pattern = input_pattern
+        self.exclude_pattern = exclude_pattern
         self.add_input_readtag = add_input_readtag
 
         self.options = [
@@ -327,6 +335,14 @@ def __init__(self, input_directory=".", input_pattern="*fastq.gz", add_input_rea
                 show_default=True,
                 help=f"pattern for the input files ({input_pattern})",
             ),
+            click.option(
+                "--exclude-pattern",
+                "exclude_pattern",
+                default=self.exclude_pattern,
+                type=click.STRING,
+                show_default=True,
+                help=f"pattern for excluding input files ({exclude_pattern})",
+            ),
         ]
 
         if self.add_input_readtag:

diff --git a/sequana_pipetools/sequana_manager.py b/sequana_pipetools/sequana_manager.py
@@ -178,8 +178,11 @@ def fill_data_options(self):
                 cfg.input_directory = os.path.abspath(options["input_directory"])
             if "--input-readtag" in sys.argv:
                 cfg.input_readtag = options["input_readtag"]
+            if "--exclude-pattern" in sys.argv:
+                cfg.exclude_pattern = options["exclude_pattern"]
         else:
             cfg.input_pattern = options.input_pattern
+            cfg.exclude_pattern = options.exclude_pattern
             cfg.input_readtag = options.input_readtag
             cfg.input_directory = os.path.abspath(options.input_directory)
 
@@ -258,6 +261,10 @@ def check_input_files(self, stop_on_error=True):
         cfg = self.config.config
 
         filenames = glob.glob(cfg.input_directory + os.sep + cfg.input_pattern)
+
+        # this code is just informative. Actual run is snaketools.pipeline_manager
+        if cfg.get("exclude_pattern", None) and cfg.get("exclude_pattern"):
+            filenames = [x for x in filenames if cfg.get("exclude_pattern") not in x.split("/")[-1]]
         logger.info(
             f"\u2705 Found {len(filenames)} files matching your input  pattern ({cfg.input_pattern}) in {cfg.input_directory}"
         )

diff --git a/sequana_pipetools/snaketools/file_factory.py b/sequana_pipetools/snaketools/file_factory.py
@@ -71,7 +71,7 @@ class FileFactory:
 
     """
 
-    def __init__(self, pattern, extra_prefixes_to_strip=[], sample_pattern=None, **kwargs):
+    def __init__(self, pattern, extra_prefixes_to_strip=[], sample_pattern=None, exclude_pattern=None, **kwargs):
         """.. rubric:: Constructor
 
         :param pattern: can be a filename, list of filenames, or a global
@@ -92,6 +92,7 @@ def __init__(self, pattern, extra_prefixes_to_strip=[], sample_pattern=None, **k
         self.pattern = pattern
         self.extra_prefixes_to_strip = extra_prefixes_to_strip
         self.sample_pattern = sample_pattern
+        self.exclude_pattern = exclude_pattern
 
         try:
             if os.path.exists(pattern):
@@ -108,6 +109,10 @@ def __init__(self, pattern, extra_prefixes_to_strip=[], sample_pattern=None, **k
         # remove directories if they exist
         self._glob = [x for x in self._glob if not os.path.isdir(x)]
 
+        # remove candidates that have the exclude pattern
+        if self.exclude_pattern:  # pragma: no cover
+            self._glob = [x for x in self._glob if not self.exclude_pattern in x]
+
     def _get_realpaths(self):
         return [os.path.realpath(filename) for filename in self._glob]
 
@@ -156,7 +161,7 @@ def func(filename):
 
                 if filename.startswith(prefix) and filename.endswith(suffix):
                     res = res[len(prefix) : len(res) - len(suffix)]
-                else:
+                else:  # pragma: no cover
                     raise PipetoolsException(f"Your sample pattern does not match the filename {filename}")
             else:
                 res = filename[:]
@@ -246,6 +251,7 @@ def __init__(
         read_tag="_R[12]_",
         extra_prefixes_to_strip=[],
         sample_pattern=None,
+        exclude_pattern=None,
         **kwargs,
     ):
         r""".. rubric:: Constructor
@@ -267,7 +273,10 @@ def __init__(
             and your sample will be only 'A'.
         """
         super(FastQFactory, self).__init__(
-            pattern, extra_prefixes_to_strip=extra_prefixes_to_strip, sample_pattern=sample_pattern
+            pattern,
+            extra_prefixes_to_strip=extra_prefixes_to_strip,
+            sample_pattern=sample_pattern,
+            exclude_pattern=exclude_pattern,
         )
 
         self.read_tag = read_tag

diff --git a/sequana_pipetools/snaketools/pipeline_manager.py b/sequana_pipetools/snaketools/pipeline_manager.py
@@ -157,7 +157,6 @@ def teardown(self, extra_dirs_to_remove=[], extra_files_to_remove=[], outdir="."
         cleaner.add_makefile()
 
         # create the version file given the requirements
-
         if os.path.exists(f"{outdir}/.sequana/tools.txt"):
             with open(f"{outdir}/.sequana/tools.txt", "r") as fin:
                 deps = fin.readlines()
@@ -254,6 +253,10 @@ class PipelineManager(PipelineManagerBase):
         - input_readtag: "_R[12]_"
         - input_pattern: "*.fastq.gz"
 
+    and one optional option:
+
+        - exclude_pattern:
+
     You may omit the input_readtag, which is not required for non-paired data. For instance for
     pacbio and nanopore files, they are not paired and the read tag is not required. Instead, if
     you are dealing with Illumina/MGI data sets, you must provide this field IF AND ONLY IF you want
@@ -342,6 +345,7 @@ def __init__(
         sample_func=None,
         extra_prefixes_to_strip=[],
         sample_pattern=None,
+        exclude_pattern=None,
         **kwargs,
     ):
         """.. rubric:: Constructor
@@ -371,6 +375,7 @@ def __init__(
         # can be provided in the config file or arguments
         self.sample_pattern = cfg.config.get("sample_pattern", sample_pattern)
         self.extra_prefixes_to_strip = cfg.config.get("extra_prefixes_to_strip", extra_prefixes_to_strip)
+        self.exclude_pattern = cfg.config.get("exclude_pattern", exclude_pattern)
 
         # if input_directory is not filled, the input_pattern, if valid, will be used instead and must
         # be provided anyway.
@@ -446,6 +451,7 @@ def _get_fastq_files(self, glob_dir, read_tag):
             read_tag=read_tag,
             extra_prefixes_to_strip=self.extra_prefixes_to_strip,
             sample_pattern=self.sample_pattern,
+            exclude_pattern=self.exclude_pattern,
         )
 
         # check whether it is paired or not. This is just to raise an error when
@@ -470,7 +476,10 @@ def _get_fastq_files(self, glob_dir, read_tag):
 
     def _get_any_files(self, pattern):
         self.ff = FileFactory(
-            pattern, extra_prefixes_to_strip=self.extra_prefixes_to_strip, sample_pattern=self.sample_pattern
+            pattern,
+            extra_prefixes_to_strip=self.extra_prefixes_to_strip,
+            sample_pattern=self.sample_pattern,
+            exclude_pattern=self.exclude_pattern,
         )
 
         # samples contains a correspondence between the sample name and the