Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement --exclude-pattern option #113

Merged
merged 1 commit into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,8 @@ Changelog
========= ======================================================================
Version Description
========= ======================================================================
1.1.0 * add exclude_pattern in input data section
1.0.6 * add py3.12, slight updates wrt slurm
1.0.5 * introspect slurm files to extract stats
1.0.4 * add utility function to download and untar a tar.gz file
1.0.3 * add levenshtein function. some typo corrections.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api"
#maintainer ?#maintainer email
[tool.poetry]
name = "sequana_pipetools"
version = "1.0.6"
version = "1.1.0"
description = "A set of tools to help building or using Sequana pipelines"
authors = ["Sequana Team"]
license = "BSD-3"
Expand Down
22 changes: 19 additions & 3 deletions sequana_pipetools/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,9 @@ def deps_callback(ctx, param, value):
data = fin.read()
data = data.split()
data = "\n".join(sorted(data))
click.echo(f"Those software will be required for the pipeline to work correctly:\n\n{data}\n")
click.echo(
f"sequana_{ctx.NAME} will need one or more of these software to work correctly. We recommend you to use --apptainer option so that you do not need to install them manually:\n\n{data}\n"
)
ctx.exit(0)


Expand Down Expand Up @@ -302,11 +304,17 @@ def __init__(self, working_directory="analysis", caller=None):

class ClickInputOptions:
group_name = "Data"
metadata = {"name": group_name, "options": ["--input-directory", "--input-pattern", "--input-readtag"]}
metadata = {
"name": group_name,
"options": ["--input-directory", "--input-pattern", "--input-readtag", "--exclude-pattern"],
}

def __init__(self, input_directory=".", input_pattern="*fastq.gz", add_input_readtag=True, caller=None):
def __init__(
self, input_directory=".", input_pattern="*fastq.gz", add_input_readtag=True, caller=None, exclude_pattern=None
):
self.input_directory = input_directory
self.input_pattern = input_pattern
self.exclude_pattern = exclude_pattern
self.add_input_readtag = add_input_readtag

self.options = [
Expand All @@ -327,6 +335,14 @@ def __init__(self, input_directory=".", input_pattern="*fastq.gz", add_input_rea
show_default=True,
help=f"pattern for the input files ({input_pattern})",
),
click.option(
"--exclude-pattern",
"exclude_pattern",
default=self.exclude_pattern,
type=click.STRING,
show_default=True,
help=f"pattern for excluding input files ({exclude_pattern})",
),
]

if self.add_input_readtag:
Expand Down
7 changes: 7 additions & 0 deletions sequana_pipetools/sequana_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,11 @@ def fill_data_options(self):
cfg.input_directory = os.path.abspath(options["input_directory"])
if "--input-readtag" in sys.argv:
cfg.input_readtag = options["input_readtag"]
if "--exclude-pattern" in sys.argv:
cfg.exclude_pattern = options["exclude_pattern"]
else:
cfg.input_pattern = options.input_pattern
cfg.exclude_pattern = options.exclude_pattern
cfg.input_readtag = options.input_readtag
cfg.input_directory = os.path.abspath(options.input_directory)

Expand Down Expand Up @@ -258,6 +261,10 @@ def check_input_files(self, stop_on_error=True):
cfg = self.config.config

filenames = glob.glob(cfg.input_directory + os.sep + cfg.input_pattern)

# this filtering is informative only; the actual file selection at run time happens in snaketools.pipeline_manager
if cfg.get("exclude_pattern", None) and cfg.get("exclude_pattern"):
filenames = [x for x in filenames if cfg.get("exclude_pattern") not in x.split("/")[-1]]
logger.info(
f"\u2705 Found {len(filenames)} files matching your input pattern ({cfg.input_pattern}) in {cfg.input_directory}"
)
Expand Down
15 changes: 12 additions & 3 deletions sequana_pipetools/snaketools/file_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class FileFactory:

"""

def __init__(self, pattern, extra_prefixes_to_strip=[], sample_pattern=None, **kwargs):
def __init__(self, pattern, extra_prefixes_to_strip=[], sample_pattern=None, exclude_pattern=None, **kwargs):
""".. rubric:: Constructor

:param pattern: can be a filename, list of filenames, or a global
Expand All @@ -92,6 +92,7 @@ def __init__(self, pattern, extra_prefixes_to_strip=[], sample_pattern=None, **k
self.pattern = pattern
self.extra_prefixes_to_strip = extra_prefixes_to_strip
self.sample_pattern = sample_pattern
self.exclude_pattern = exclude_pattern

try:
if os.path.exists(pattern):
Expand All @@ -108,6 +109,10 @@ def __init__(self, pattern, extra_prefixes_to_strip=[], sample_pattern=None, **k
# remove directories if they exist
self._glob = [x for x in self._glob if not os.path.isdir(x)]

# remove candidates that have the exclude pattern
if self.exclude_pattern: # pragma: no cover
self._glob = [x for x in self._glob if not self.exclude_pattern in x]

def _get_realpaths(self):
return [os.path.realpath(filename) for filename in self._glob]

Expand Down Expand Up @@ -156,7 +161,7 @@ def func(filename):

if filename.startswith(prefix) and filename.endswith(suffix):
res = res[len(prefix) : len(res) - len(suffix)]
else:
else: # pragma: no cover
raise PipetoolsException(f"Your sample pattern does not match the filename {filename}")
else:
res = filename[:]
Expand Down Expand Up @@ -246,6 +251,7 @@ def __init__(
read_tag="_R[12]_",
extra_prefixes_to_strip=[],
sample_pattern=None,
exclude_pattern=None,
**kwargs,
):
r""".. rubric:: Constructor
Expand All @@ -267,7 +273,10 @@ def __init__(
and your sample will be only 'A'.
"""
super(FastQFactory, self).__init__(
pattern, extra_prefixes_to_strip=extra_prefixes_to_strip, sample_pattern=sample_pattern
pattern,
extra_prefixes_to_strip=extra_prefixes_to_strip,
sample_pattern=sample_pattern,
exclude_pattern=exclude_pattern,
)

self.read_tag = read_tag
Expand Down
13 changes: 11 additions & 2 deletions sequana_pipetools/snaketools/pipeline_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,6 @@ def teardown(self, extra_dirs_to_remove=[], extra_files_to_remove=[], outdir="."
cleaner.add_makefile()

# create the version file given the requirements

if os.path.exists(f"{outdir}/.sequana/tools.txt"):
with open(f"{outdir}/.sequana/tools.txt", "r") as fin:
deps = fin.readlines()
Expand Down Expand Up @@ -254,6 +253,10 @@ class PipelineManager(PipelineManagerBase):
- input_readtag: "_R[12]_"
- input_pattern: "*.fastq.gz"

and an optional option:

- exclude_pattern:

You may omit the input_readtag, which is not required for non-paired data. For instance,
pacbio and nanopore files are not paired, so the read tag is not required. Instead, if
you are dealing with Illumina/MGI data sets, you must provide this field IF AND ONLY IF you want
Expand Down Expand Up @@ -342,6 +345,7 @@ def __init__(
sample_func=None,
extra_prefixes_to_strip=[],
sample_pattern=None,
exclude_pattern=None,
**kwargs,
):
""".. rubric:: Constructor
Expand Down Expand Up @@ -371,6 +375,7 @@ def __init__(
# can be provided in the config file or arguments
self.sample_pattern = cfg.config.get("sample_pattern", sample_pattern)
self.extra_prefixes_to_strip = cfg.config.get("extra_prefixes_to_strip", extra_prefixes_to_strip)
self.exclude_pattern = cfg.config.get("exclude_pattern", exclude_pattern)

# if input_directory is not filled, the input_pattern, if valid, will be used instead and must
# be provided anyway.
Expand Down Expand Up @@ -446,6 +451,7 @@ def _get_fastq_files(self, glob_dir, read_tag):
read_tag=read_tag,
extra_prefixes_to_strip=self.extra_prefixes_to_strip,
sample_pattern=self.sample_pattern,
exclude_pattern=self.exclude_pattern,
)

# check whether it is paired or not. This is just to raise an error when
Expand All @@ -470,7 +476,10 @@ def _get_fastq_files(self, glob_dir, read_tag):

def _get_any_files(self, pattern):
self.ff = FileFactory(
pattern, extra_prefixes_to_strip=self.extra_prefixes_to_strip, sample_pattern=self.sample_pattern
pattern,
extra_prefixes_to_strip=self.extra_prefixes_to_strip,
sample_pattern=self.sample_pattern,
exclude_pattern=self.exclude_pattern,
)

# samples contains a correspondence between the sample name and the
Expand Down
Loading