From b38b14bff39835df00a5404fcd9acb683dce7a9c Mon Sep 17 00:00:00 2001 From: Thomas Cokelaer Date: Thu, 31 Oct 2024 17:00:34 +0100 Subject: [PATCH] implement --exclude-pattern option --- README.rst | 2 ++ pyproject.toml | 2 +- sequana_pipetools/options.py | 22 ++++++++++++++++--- sequana_pipetools/sequana_manager.py | 7 ++++++ sequana_pipetools/snaketools/file_factory.py | 15 ++++++++++--- .../snaketools/pipeline_manager.py | 13 +++++++++-- 6 files changed, 52 insertions(+), 9 deletions(-) diff --git a/README.rst b/README.rst index 99658ac..ae85067 100644 --- a/README.rst +++ b/README.rst @@ -313,6 +313,8 @@ Changelog ========= ====================================================================== Version Description ========= ====================================================================== +1.1.0 * add exclude_pattern in input data section +1.0.6 * add py3.12, slight updates wrt slurm 1.0.5 * introspect slurm files to extract stats 1.0.4 * add utility function to download and untar a tar.gz file 1.0.3 * add levenshtein function. some typo corrections. diff --git a/pyproject.toml b/pyproject.toml index a83d7fe..fe332db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api" #maintainer ?#maintainer email [tool.poetry] name = "sequana_pipetools" -version = "1.0.6" +version = "1.1.0" description = "A set of tools to help building or using Sequana pipelines" authors = ["Sequana Team"] license = "BSD-3" diff --git a/sequana_pipetools/options.py b/sequana_pipetools/options.py index 3923097..e2ce42a 100644 --- a/sequana_pipetools/options.py +++ b/sequana_pipetools/options.py @@ -215,7 +215,9 @@ def deps_callback(ctx, param, value): data = fin.read() data = data.split() data = "\n".join(sorted(data)) - click.echo(f"Those software will be required for the pipeline to work correctly:\n\n{data}\n") + click.echo( + f"sequana_{ctx.NAME} will need one or more of these software to work correctly. We recommend you to use --apptainer option so that you do not need to install them manually:\n\n{data}\n" + ) ctx.exit(0) @@ -302,11 +304,17 @@ def __init__(self, working_directory="analysis", caller=None): class ClickInputOptions: group_name = "Data" - metadata = {"name": group_name, "options": ["--input-directory", "--input-pattern", "--input-readtag"]} + metadata = { + "name": group_name, + "options": ["--input-directory", "--input-pattern", "--input-readtag", "--exclude-pattern"], + } - def __init__(self, input_directory=".", input_pattern="*fastq.gz", add_input_readtag=True, caller=None): + def __init__( + self, input_directory=".", input_pattern="*fastq.gz", add_input_readtag=True, caller=None, exclude_pattern=None + ): self.input_directory = input_directory self.input_pattern = input_pattern + self.exclude_pattern = exclude_pattern self.add_input_readtag = add_input_readtag self.options = [ @@ -327,6 +335,14 @@ def __init__(self, input_directory=".", input_pattern="*fastq.gz", add_input_rea show_default=True, help=f"pattern for the input files ({input_pattern})", ), + click.option( + "--exclude-pattern", + "exclude_pattern", + default=self.exclude_pattern, + type=click.STRING, + show_default=True, + help=f"pattern for excluding input files ({exclude_pattern})", + ), ] if self.add_input_readtag: diff --git a/sequana_pipetools/sequana_manager.py b/sequana_pipetools/sequana_manager.py index cdad8f4..75190e5 100644 --- a/sequana_pipetools/sequana_manager.py +++ b/sequana_pipetools/sequana_manager.py @@ -178,8 +178,11 @@ def fill_data_options(self): cfg.input_directory = os.path.abspath(options["input_directory"]) if "--input-readtag" in sys.argv: cfg.input_readtag = options["input_readtag"] + if "--exclude-pattern" in sys.argv: + cfg.exclude_pattern = options["exclude_pattern"] else: cfg.input_pattern = options.input_pattern + cfg.exclude_pattern = options.exclude_pattern cfg.input_readtag = options.input_readtag cfg.input_directory = os.path.abspath(options.input_directory) @@ -258,6 +261,10 @@ def check_input_files(self, stop_on_error=True): cfg = self.config.config filenames = glob.glob(cfg.input_directory + os.sep + cfg.input_pattern) + + # this code is just informative. Actual run is snaketools.pipeline_manager + if cfg.get("exclude_pattern", None) and cfg.get("exclude_pattern"): + filenames = [x for x in filenames if cfg.get("exclude_pattern") not in x.split("/")[-1]] logger.info( f"\u2705 Found {len(filenames)} files matching your input pattern ({cfg.input_pattern}) in {cfg.input_directory}" ) diff --git a/sequana_pipetools/snaketools/file_factory.py b/sequana_pipetools/snaketools/file_factory.py index 74a3a3e..d7c5915 100644 --- a/sequana_pipetools/snaketools/file_factory.py +++ b/sequana_pipetools/snaketools/file_factory.py @@ -71,7 +71,7 @@ class FileFactory: """ - def __init__(self, pattern, extra_prefixes_to_strip=[], sample_pattern=None, **kwargs): + def __init__(self, pattern, extra_prefixes_to_strip=[], sample_pattern=None, exclude_pattern=None, **kwargs): """.. rubric:: Constructor :param pattern: can be a filename, list of filenames, or a global @@ -92,6 +92,7 @@ def __init__(self, pattern, extra_prefixes_to_strip=[], sample_pattern=None, **k self.pattern = pattern self.extra_prefixes_to_strip = extra_prefixes_to_strip self.sample_pattern = sample_pattern + self.exclude_pattern = exclude_pattern try: if os.path.exists(pattern): @@ -108,6 +109,10 @@ def __init__(self, pattern, extra_prefixes_to_strip=[], sample_pattern=None, **k # remove directories if they exist self._glob = [x for x in self._glob if not os.path.isdir(x)] + # remove candidates that have the exclude pattern + if self.exclude_pattern: # pragma: no cover + self._glob = [x for x in self._glob if not self.exclude_pattern in x] + def _get_realpaths(self): return [os.path.realpath(filename) for filename in self._glob] @@ -156,7 +161,7 @@ def func(filename): if filename.startswith(prefix) and filename.endswith(suffix): res = res[len(prefix) : len(res) - len(suffix)] - else: + else: # pragma: no cover raise PipetoolsException(f"Your sample pattern does not match the filename {filename}") else: res = filename[:] @@ -246,6 +251,7 @@ def __init__( read_tag="_R[12]_", extra_prefixes_to_strip=[], sample_pattern=None, + exclude_pattern=None, **kwargs, ): r""".. rubric:: Constructor @@ -267,7 +273,10 @@ def __init__( and your sample will be only 'A'. """ super(FastQFactory, self).__init__( - pattern, extra_prefixes_to_strip=extra_prefixes_to_strip, sample_pattern=sample_pattern + pattern, + extra_prefixes_to_strip=extra_prefixes_to_strip, + sample_pattern=sample_pattern, + exclude_pattern=exclude_pattern, ) self.read_tag = read_tag diff --git a/sequana_pipetools/snaketools/pipeline_manager.py b/sequana_pipetools/snaketools/pipeline_manager.py index 0d14f06..b7e1225 100644 --- a/sequana_pipetools/snaketools/pipeline_manager.py +++ b/sequana_pipetools/snaketools/pipeline_manager.py @@ -157,7 +157,6 @@ def teardown(self, extra_dirs_to_remove=[], extra_files_to_remove=[], outdir="." cleaner.add_makefile() # create the version file given the requirements - if os.path.exists(f"{outdir}/.sequana/tools.txt"): with open(f"{outdir}/.sequana/tools.txt", "r") as fin: deps = fin.readlines() @@ -254,6 +253,10 @@ class PipelineManager(PipelineManagerBase): - input_readtag: "_R[12]_" - input_pattern: "*.fastq.gz" + and optional option: + + - exclude_pattern: + You may omit the input_readtag, which is not required for non-paired data. For instance for pacbio and nanopore files, there are not paired and the read tag is not required. Instead, if you are dealing with Illumina/MGI data sets, you must provide this field IF AND ONLY IF you want @@ -342,6 +345,7 @@ def __init__( sample_func=None, extra_prefixes_to_strip=[], sample_pattern=None, + exclude_pattern=None, **kwargs, ): """.. rubric:: Constructor @@ -371,6 +375,7 @@ def __init__( # can be provided in the config file or arguments self.sample_pattern = cfg.config.get("sample_pattern", sample_pattern) self.extra_prefixes_to_strip = cfg.config.get("extra_prefixes_to_strip", extra_prefixes_to_strip) + self.exclude_pattern = cfg.config.get("exclude_pattern", exclude_pattern) # if input_directory is not filled, the input_pattern, if valid, will be used instead and must # be provided anyway. @@ -446,6 +451,7 @@ def _get_fastq_files(self, glob_dir, read_tag): read_tag=read_tag, extra_prefixes_to_strip=self.extra_prefixes_to_strip, sample_pattern=self.sample_pattern, + exclude_pattern=self.exclude_pattern, ) # check whether it is paired or not. This is just to raise an error when @@ -470,7 +476,10 @@ def _get_fastq_files(self, glob_dir, read_tag): def _get_any_files(self, pattern): self.ff = FileFactory( - pattern, extra_prefixes_to_strip=self.extra_prefixes_to_strip, sample_pattern=self.sample_pattern + pattern, + extra_prefixes_to_strip=self.extra_prefixes_to_strip, + sample_pattern=self.sample_pattern, + exclude_pattern=self.exclude_pattern, ) # samples contains a correspondance between the sample name and the