Skip to content

Commit

Permalink
Merge pull request #113 from cokelaer/main
Browse files Browse the repository at this point in the history
implement --exclude-pattern option
  • Loading branch information
cokelaer authored Oct 31, 2024
2 parents bda77e2 + b38b14b commit 4c9eb26
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 9 deletions.
2 changes: 2 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,8 @@ Changelog
========= ======================================================================
Version Description
========= ======================================================================
1.1.0 * add exclude_pattern in input data section
1.0.6 * add py3.12, slight updates wrt slurm
1.0.5 * introspect slurm files to extract stats
1.0.4 * add utility function to download and untar a tar.gz file
1.0.3 * add levenshtein function. some typo corrections.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api"
#maintainer ?#maintainer email
[tool.poetry]
name = "sequana_pipetools"
version = "1.0.6"
version = "1.1.0"
description = "A set of tools to help building or using Sequana pipelines"
authors = ["Sequana Team"]
license = "BSD-3"
Expand Down
22 changes: 19 additions & 3 deletions sequana_pipetools/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,9 @@ def deps_callback(ctx, param, value):
data = fin.read()
data = data.split()
data = "\n".join(sorted(data))
click.echo(f"Those software will be required for the pipeline to work correctly:\n\n{data}\n")
click.echo(
f"sequana_{ctx.NAME} will need one or more of these software to work correctly. We recommend you to use --apptainer option so that you do not need to install them manually:\n\n{data}\n"
)
ctx.exit(0)


Expand Down Expand Up @@ -302,11 +304,17 @@ def __init__(self, working_directory="analysis", caller=None):

class ClickInputOptions:
group_name = "Data"
metadata = {"name": group_name, "options": ["--input-directory", "--input-pattern", "--input-readtag"]}
metadata = {
"name": group_name,
"options": ["--input-directory", "--input-pattern", "--input-readtag", "--exclude-pattern"],
}

def __init__(self, input_directory=".", input_pattern="*fastq.gz", add_input_readtag=True, caller=None):
def __init__(
self, input_directory=".", input_pattern="*fastq.gz", add_input_readtag=True, caller=None, exclude_pattern=None
):
self.input_directory = input_directory
self.input_pattern = input_pattern
self.exclude_pattern = exclude_pattern
self.add_input_readtag = add_input_readtag

self.options = [
Expand All @@ -327,6 +335,14 @@ def __init__(self, input_directory=".", input_pattern="*fastq.gz", add_input_rea
show_default=True,
help=f"pattern for the input files ({input_pattern})",
),
click.option(
"--exclude-pattern",
"exclude_pattern",
default=self.exclude_pattern,
type=click.STRING,
show_default=True,
help=f"pattern for excluding input files ({exclude_pattern})",
),
]

if self.add_input_readtag:
Expand Down
7 changes: 7 additions & 0 deletions sequana_pipetools/sequana_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,11 @@ def fill_data_options(self):
cfg.input_directory = os.path.abspath(options["input_directory"])
if "--input-readtag" in sys.argv:
cfg.input_readtag = options["input_readtag"]
if "--exclude-pattern" in sys.argv:
cfg.exclude_pattern = options["exclude_pattern"]
else:
cfg.input_pattern = options.input_pattern
cfg.exclude_pattern = options.exclude_pattern
cfg.input_readtag = options.input_readtag
cfg.input_directory = os.path.abspath(options.input_directory)

Expand Down Expand Up @@ -258,6 +261,10 @@ def check_input_files(self, stop_on_error=True):
cfg = self.config.config

filenames = glob.glob(cfg.input_directory + os.sep + cfg.input_pattern)

# this code is only informative; the actual run is performed by snaketools.pipeline_manager
if cfg.get("exclude_pattern", None) and cfg.get("exclude_pattern"):
filenames = [x for x in filenames if cfg.get("exclude_pattern") not in x.split("/")[-1]]
logger.info(
f"\u2705 Found {len(filenames)} files matching your input pattern ({cfg.input_pattern}) in {cfg.input_directory}"
)
Expand Down
15 changes: 12 additions & 3 deletions sequana_pipetools/snaketools/file_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class FileFactory:
"""

def __init__(self, pattern, extra_prefixes_to_strip=[], sample_pattern=None, **kwargs):
def __init__(self, pattern, extra_prefixes_to_strip=[], sample_pattern=None, exclude_pattern=None, **kwargs):
""".. rubric:: Constructor
:param pattern: can be a filename, list of filenames, or a global
Expand All @@ -92,6 +92,7 @@ def __init__(self, pattern, extra_prefixes_to_strip=[], sample_pattern=None, **k
self.pattern = pattern
self.extra_prefixes_to_strip = extra_prefixes_to_strip
self.sample_pattern = sample_pattern
self.exclude_pattern = exclude_pattern

try:
if os.path.exists(pattern):
Expand All @@ -108,6 +109,10 @@ def __init__(self, pattern, extra_prefixes_to_strip=[], sample_pattern=None, **k
# remove directories if they exist
self._glob = [x for x in self._glob if not os.path.isdir(x)]

# remove candidates that have the exclude pattern
if self.exclude_pattern: # pragma: no cover
self._glob = [x for x in self._glob if not self.exclude_pattern in x]

def _get_realpaths(self):
return [os.path.realpath(filename) for filename in self._glob]

Expand Down Expand Up @@ -156,7 +161,7 @@ def func(filename):

if filename.startswith(prefix) and filename.endswith(suffix):
res = res[len(prefix) : len(res) - len(suffix)]
else:
else: # pragma: no cover
raise PipetoolsException(f"Your sample pattern does not match the filename {filename}")
else:
res = filename[:]
Expand Down Expand Up @@ -246,6 +251,7 @@ def __init__(
read_tag="_R[12]_",
extra_prefixes_to_strip=[],
sample_pattern=None,
exclude_pattern=None,
**kwargs,
):
r""".. rubric:: Constructor
Expand All @@ -267,7 +273,10 @@ def __init__(
and your sample will be only 'A'.
"""
super(FastQFactory, self).__init__(
pattern, extra_prefixes_to_strip=extra_prefixes_to_strip, sample_pattern=sample_pattern
pattern,
extra_prefixes_to_strip=extra_prefixes_to_strip,
sample_pattern=sample_pattern,
exclude_pattern=exclude_pattern,
)

self.read_tag = read_tag
Expand Down
13 changes: 11 additions & 2 deletions sequana_pipetools/snaketools/pipeline_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,6 @@ def teardown(self, extra_dirs_to_remove=[], extra_files_to_remove=[], outdir="."
cleaner.add_makefile()

# create the version file given the requirements

if os.path.exists(f"{outdir}/.sequana/tools.txt"):
with open(f"{outdir}/.sequana/tools.txt", "r") as fin:
deps = fin.readlines()
Expand Down Expand Up @@ -254,6 +253,10 @@ class PipelineManager(PipelineManagerBase):
- input_readtag: "_R[12]_"
- input_pattern: "*.fastq.gz"
and optional option:
- exclude_pattern:
You may omit the input_readtag, which is not required for non-paired data. For instance,
pacbio and nanopore files are not paired, so the read tag is not required. Instead, if
you are dealing with Illumina/MGI data sets, you must provide this field IF AND ONLY IF you want
Expand Down Expand Up @@ -342,6 +345,7 @@ def __init__(
sample_func=None,
extra_prefixes_to_strip=[],
sample_pattern=None,
exclude_pattern=None,
**kwargs,
):
""".. rubric:: Constructor
Expand Down Expand Up @@ -371,6 +375,7 @@ def __init__(
# can be provided in the config file or arguments
self.sample_pattern = cfg.config.get("sample_pattern", sample_pattern)
self.extra_prefixes_to_strip = cfg.config.get("extra_prefixes_to_strip", extra_prefixes_to_strip)
self.exclude_pattern = cfg.config.get("exclude_pattern", exclude_pattern)

# if input_directory is not filled, the input_pattern, if valid, will be used instead and must
# be provided anyway.
Expand Down Expand Up @@ -446,6 +451,7 @@ def _get_fastq_files(self, glob_dir, read_tag):
read_tag=read_tag,
extra_prefixes_to_strip=self.extra_prefixes_to_strip,
sample_pattern=self.sample_pattern,
exclude_pattern=self.exclude_pattern,
)

# check whether it is paired or not. This is just to raise an error when
Expand All @@ -470,7 +476,10 @@ def _get_fastq_files(self, glob_dir, read_tag):

def _get_any_files(self, pattern):
self.ff = FileFactory(
pattern, extra_prefixes_to_strip=self.extra_prefixes_to_strip, sample_pattern=self.sample_pattern
pattern,
extra_prefixes_to_strip=self.extra_prefixes_to_strip,
sample_pattern=self.sample_pattern,
exclude_pattern=self.exclude_pattern,
)

# samples contains a correspondence between the sample name and the
Expand Down

0 comments on commit 4c9eb26

Please sign in to comment.