From c6c34f442904a61ec22089f7d4cb4217df3b611d Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 5 Mar 2024 14:41:27 +0100 Subject: [PATCH 01/13] add linters for datatypes - tools should only use available datatypes in inputs, outputs and tests - discourage the use of custom datatypes_conf.xml --- lib/galaxy/tool_util/linters/datatypes.py | 83 +++++++++++++++++++++++ packages/tool_util/setup.cfg | 1 + test/unit/tool_util/test_tool_linters.py | 65 +++++++++++++++++- 3 files changed, 148 insertions(+), 1 deletion(-) create mode 100644 lib/galaxy/tool_util/linters/datatypes.py diff --git a/lib/galaxy/tool_util/linters/datatypes.py b/lib/galaxy/tool_util/linters/datatypes.py new file mode 100644 index 000000000000..c577e0dccd27 --- /dev/null +++ b/lib/galaxy/tool_util/linters/datatypes.py @@ -0,0 +1,83 @@ +import os.path +from typing import ( + Set, + TYPE_CHECKING, +) + +from galaxy import config +from galaxy.tool_util.lint import Linter +from galaxy.util import ( + listify, + parse_xml, +) + +if TYPE_CHECKING: + from galaxy.tool_util.lint import LintContext + from galaxy.tool_util.parser import ToolSource + +DATATYPES_CONF = os.path.join(os.path.dirname(config.__file__), "sample", "datatypes_conf.xml.sample") + + +def _parse_datatypes(datatype_conf_path: str) -> Set[str]: + datatypes = set() + tree = parse_xml(datatype_conf_path) + root = tree.getroot() + for elem in root.findall(f"./registration/datatype"): + extension = elem.get("extension") + datatypes.add(extension) + auto_compressed_types = listify(elem.get("auto_compressed_types", "")) + for act in auto_compressed_types: + datatypes.add(f"{extension}.{act}") + return datatypes + + +class DatatypesCustomConf(Linter): + """ + Check if a custom datatypes_conf.xml is present + """ + + @classmethod + def lint(cls, tool_source: "ToolSource", lint_ctx: "LintContext"): + if not tool_source.source_path: + return + if not (tool_xml := getattr(tool_source, "xml_tree", None)): + return + tool_node = tool_xml.getroot() + tool_dir = os.path.dirname(tool_source.source_path) + datatypes_conf_path = os.path.join(tool_dir, "datatypes_conf.xml") + if os.path.exists(datatypes_conf_path): + lint_ctx.warn( + f"Tool uses a custom datatypes_conf.xml which is discouraged", + linter=cls.name(), + node=tool_node, + ) + + +class ValidDatatypes(Linter): + """ + Check that used datatypes are available + """ + + @classmethod + def lint(cls, tool_source: "ToolSource", lint_ctx: "LintContext"): + tool_xml = getattr(tool_source, "xml_tree", None) + if not tool_xml: + return + # get Galaxy built-in dataypes + datatypes = _parse_datatypes(DATATYPES_CONF) + # add custom tool data types + if tool_source.source_path: + tool_dir = os.path.dirname(tool_source.source_path) + datatypes_conf_path = os.path.join(tool_dir, "datatypes_conf.xml") + if os.path.exists(datatypes_conf_path): + datatypes |= _parse_datatypes(datatypes_conf_path) + for attrib in ["format", "ftype", "ext"]: + for elem in tool_xml.findall(f".//*[@{attrib}]"): + formats = elem.get(attrib, "").split(",") + for format in formats: + if format not in datatypes: + lint_ctx.error( + f"Unknown datatype [{format}] used in {elem.tag} element", + linter=cls.name(), + node=elem, + ) diff --git a/packages/tool_util/setup.cfg b/packages/tool_util/setup.cfg index 848a609a6b2c..b85c01eba4cd 100644 --- a/packages/tool_util/setup.cfg +++ b/packages/tool_util/setup.cfg @@ -33,6 +33,7 @@ version = 23.2.dev0 [options] include_package_data = True install_requires = + galaxy-config galaxy-util>=22.1 conda-package-streaming lxml!=4.2.2 diff --git a/test/unit/tool_util/test_tool_linters.py b/test/unit/tool_util/test_tool_linters.py index 2db30308b950..f22a895727b0 100644 --- a/test/unit/tool_util/test_tool_linters.py +++ b/test/unit/tool_util/test_tool_linters.py @@ -16,6 +16,7 @@ from galaxy.tool_util.linters import ( citations, command, + datatypes, general, help, inputs, @@ -1947,6 +1948,68 @@ def test_xml_order(lint_ctx): assert lint_ctx.error_messages == ["Invalid XML: Element 'wrong_tag': This element is not expected."] +VALID_DATATYPES = """ + + + + + + + + + + + + + + + + + + + + + + + + +""" +DATATYPES_CONF = """ + + + + + +""" + + +def test_valid_datatypes(lint_ctx): + """ + test datatypes linters + """ + with tempfile.TemporaryDirectory() as tmp: + tool_path = os.path.join(tmp, "tool.xml") + datatypes_path = os.path.join(tmp, "datatypes_conf.xml") + with open(tool_path, "w") as tmpf: + tmpf.write(VALID_DATATYPES) + with open(datatypes_path, "w") as tmpf: + tmpf.write(DATATYPES_CONF) + tool_xml, _ = load_with_references(tool_path) + tool_source = XmlToolSource(tool_xml, source_path=tool_path) + run_lint_module(lint_ctx, datatypes, tool_source) + assert not lint_ctx.info_messages + assert not lint_ctx.valid_messages + assert "Tool uses a custom datatypes_conf.xml which is discouraged" in lint_ctx.warn_messages + assert len(lint_ctx.warn_messages) == 1 + assert "Unknown datatype [invalid] used in param" in lint_ctx.error_messages + assert "Unknown datatype [another_invalid] used in data" in lint_ctx.error_messages + assert "Unknown datatype [just_another_invalid] used in when" in lint_ctx.error_messages + assert "Unknown datatype [collection_format] used in collection" in lint_ctx.error_messages + assert "Unknown datatype [invalid] used in param" in lint_ctx.error_messages + assert "Unknown datatype [invalid] used in discover_datasets" in lint_ctx.error_messages + assert len(lint_ctx.error_messages) == 6 + + DATA_MANAGER = """ @@ -2168,7 +2231,7 @@ def test_skip_by_module(lint_ctx): def test_list_linters(): linter_names = Linter.list_listers() # make sure to add/remove a test for new/removed linters if this number changes - assert len(linter_names) == 133 + assert len(linter_names) == 135 assert "Linter" not in linter_names # make sure that linters from all modules are available for prefix in [ From eb61d68db60be0306ae62e46a4ef86a4a06f0259 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 5 Mar 2024 14:49:27 +0100 Subject: [PATCH 02/13] require python 3.8 for tool-util --- packages/tool_util/setup.cfg | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/tool_util/setup.cfg b/packages/tool_util/setup.cfg index b85c01eba4cd..9f532ed401cb 100644 --- a/packages/tool_util/setup.cfg +++ b/packages/tool_util/setup.cfg @@ -9,7 +9,6 @@ classifiers = Natural Language :: English Operating System :: POSIX Programming Language :: Python :: 3 - Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 @@ -45,7 +44,7 @@ install_requires = sortedcontainers typing-extensions packages = find: -python_requires = >=3.7 +python_requires = >=3.8 [options.entry_points] console_scripts = From 3065b36e3ec6d389956a9ffece6d4160e951b51c Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 5 Mar 2024 14:54:30 +0100 Subject: [PATCH 03/13] revert walrus operator usage to allow usage in python 3.7 --- lib/galaxy/tool_util/linters/datatypes.py | 3 ++- packages/tool_util/setup.cfg | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/galaxy/tool_util/linters/datatypes.py b/lib/galaxy/tool_util/linters/datatypes.py index c577e0dccd27..aba7feca96d2 100644 --- a/lib/galaxy/tool_util/linters/datatypes.py +++ b/lib/galaxy/tool_util/linters/datatypes.py @@ -40,7 +40,8 @@ class DatatypesCustomConf(Linter): def lint(cls, tool_source: "ToolSource", lint_ctx: "LintContext"): if not tool_source.source_path: return - if not (tool_xml := getattr(tool_source, "xml_tree", None)): + tool_xml = getattr(tool_source, "xml_tree", None) + if not tool_xml: return tool_node = tool_xml.getroot() tool_dir = os.path.dirname(tool_source.source_path) diff --git a/packages/tool_util/setup.cfg b/packages/tool_util/setup.cfg index 9f532ed401cb..b85c01eba4cd 100644 --- a/packages/tool_util/setup.cfg +++ b/packages/tool_util/setup.cfg @@ -9,6 +9,7 @@ classifiers = Natural Language :: English Operating System :: POSIX Programming Language :: Python :: 3 + Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 @@ -44,7 +45,7 @@ install_requires = sortedcontainers typing-extensions packages = find: -python_requires = >=3.8 +python_requires = >=3.7 [options.entry_points] console_scripts = From 08f2124eb6a748e5b06165822d236157d4225b35 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 5 Mar 2024 15:02:14 +0100 Subject: [PATCH 04/13] fix linter errors --- lib/galaxy/tool_util/linters/datatypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/galaxy/tool_util/linters/datatypes.py b/lib/galaxy/tool_util/linters/datatypes.py index aba7feca96d2..98837d057ccc 100644 --- a/lib/galaxy/tool_util/linters/datatypes.py +++ b/lib/galaxy/tool_util/linters/datatypes.py @@ -22,7 +22,7 @@ def _parse_datatypes(datatype_conf_path: str) -> Set[str]: datatypes = set() tree = parse_xml(datatype_conf_path) root = tree.getroot() - for elem in root.findall(f"./registration/datatype"): + for elem in root.findall("./registration/datatype"): extension = elem.get("extension") datatypes.add(extension) auto_compressed_types = listify(elem.get("auto_compressed_types", "")) @@ -48,7 +48,7 @@ def lint(cls, tool_source: "ToolSource", lint_ctx: "LintContext"): datatypes_conf_path = os.path.join(tool_dir, "datatypes_conf.xml") if os.path.exists(datatypes_conf_path): lint_ctx.warn( - f"Tool uses a custom datatypes_conf.xml which is discouraged", + "Tool uses a custom datatypes_conf.xml which is discouraged", linter=cls.name(), node=tool_node, ) From 99e088a99c11b83eb0b41a1882f39067ad29367a Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 5 Mar 2024 15:54:01 +0100 Subject: [PATCH 05/13] fix return type --- lib/galaxy/tool_util/linters/datatypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/tool_util/linters/datatypes.py b/lib/galaxy/tool_util/linters/datatypes.py index 98837d057ccc..9fd3a08f4c9a 100644 --- a/lib/galaxy/tool_util/linters/datatypes.py +++ b/lib/galaxy/tool_util/linters/datatypes.py @@ -23,7 +23,7 @@ def _parse_datatypes(datatype_conf_path: str) -> Set[str]: tree = parse_xml(datatype_conf_path) root = tree.getroot() for elem in root.findall("./registration/datatype"): - extension = elem.get("extension") + extension = elem.get("extension", "") datatypes.add(extension) auto_compressed_types = listify(elem.get("auto_compressed_types", "")) for act in auto_compressed_types: From d4618c2c73ac6c06e9c25a0fe9015757a1ebf521 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Sat, 18 May 2024 12:08:40 +0200 Subject: [PATCH 06/13] try to add datatypes_conf sample to tool-util package --- packages/tool_util/MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/tool_util/MANIFEST.in b/packages/tool_util/MANIFEST.in index 3c6d371983de..496d970a6c3c 100644 --- a/packages/tool_util/MANIFEST.in +++ b/packages/tool_util/MANIFEST.in @@ -5,3 +5,4 @@ include galaxy/tool_util/toolbox/filters/*.sample include galaxy/tool_util/verify/test_config.sample.yml include galaxy/tool_util/xsd/* include galaxy/tool_util/ontologies/*tsv +include galaxy/config/sample/datatypes_conf.xml.sample \ No newline at end of file From c0263055cb9068a1919afde1c0ad8e9a4f8ae129 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Sat, 18 May 2024 12:13:11 +0200 Subject: [PATCH 07/13] revert addition of config as requirement --- packages/tool_util/setup.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/tool_util/setup.cfg b/packages/tool_util/setup.cfg index b85c01eba4cd..848a609a6b2c 100644 --- a/packages/tool_util/setup.cfg +++ b/packages/tool_util/setup.cfg @@ -33,7 +33,6 @@ version = 23.2.dev0 [options] include_package_data = True install_requires = - galaxy-config galaxy-util>=22.1 conda-package-streaming lxml!=4.2.2 From 6f044a17ef9ddbeab3a5db31249e687310851f76 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Sat, 18 May 2024 12:58:14 +0200 Subject: [PATCH 08/13] remove config import --- lib/galaxy/tool_util/linters/datatypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/galaxy/tool_util/linters/datatypes.py b/lib/galaxy/tool_util/linters/datatypes.py index 9fd3a08f4c9a..b51fd5dee50c 100644 --- a/lib/galaxy/tool_util/linters/datatypes.py +++ b/lib/galaxy/tool_util/linters/datatypes.py @@ -4,7 +4,7 @@ TYPE_CHECKING, ) -from galaxy import config +# from galaxy import config from galaxy.tool_util.lint import Linter from galaxy.util import ( listify, @@ -15,7 +15,7 @@ from galaxy.tool_util.lint import LintContext from galaxy.tool_util.parser import ToolSource -DATATYPES_CONF = os.path.join(os.path.dirname(config.__file__), "sample", "datatypes_conf.xml.sample") +# DATATYPES_CONF = os.path.join(os.path.dirname(config.__file__), "sample", "datatypes_conf.xml.sample") def _parse_datatypes(datatype_conf_path: str) -> Set[str]: From 2bfb7318e664a68dfc9de69fffdf5e328769d434 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Sat, 18 May 2024 13:40:29 +0200 Subject: [PATCH 09/13] try --- lib/galaxy/tool_util/linters/datatypes.py | 2 +- lib/galaxy/tool_util/linters/datatypes_conf.xml.sample | 1 + packages/tool_util/MANIFEST.in | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) create mode 120000 lib/galaxy/tool_util/linters/datatypes_conf.xml.sample diff --git a/lib/galaxy/tool_util/linters/datatypes.py b/lib/galaxy/tool_util/linters/datatypes.py index b51fd5dee50c..534868ed01ec 100644 --- a/lib/galaxy/tool_util/linters/datatypes.py +++ b/lib/galaxy/tool_util/linters/datatypes.py @@ -15,7 +15,7 @@ from galaxy.tool_util.lint import LintContext from galaxy.tool_util.parser import ToolSource -# DATATYPES_CONF = os.path.join(os.path.dirname(config.__file__), "sample", "datatypes_conf.xml.sample") +DATATYPES_CONF = os.path.join(os.path.dirname(__file__), "datatypes_conf.xml.sample") def _parse_datatypes(datatype_conf_path: str) -> Set[str]: diff --git a/lib/galaxy/tool_util/linters/datatypes_conf.xml.sample b/lib/galaxy/tool_util/linters/datatypes_conf.xml.sample new file mode 120000 index 000000000000..6a8d2e103481 --- /dev/null +++ b/lib/galaxy/tool_util/linters/datatypes_conf.xml.sample @@ -0,0 +1 @@ +../../config/sample/datatypes_conf.xml.sample \ No newline at end of file diff --git a/packages/tool_util/MANIFEST.in b/packages/tool_util/MANIFEST.in index 496d970a6c3c..a60b960699b6 100644 --- a/packages/tool_util/MANIFEST.in +++ b/packages/tool_util/MANIFEST.in @@ -5,4 +5,4 @@ include galaxy/tool_util/toolbox/filters/*.sample include galaxy/tool_util/verify/test_config.sample.yml include galaxy/tool_util/xsd/* include galaxy/tool_util/ontologies/*tsv -include galaxy/config/sample/datatypes_conf.xml.sample \ No newline at end of file +include galaxy/tool_util/linters/datatypes_conf.xml.sample \ No newline at end of file From f8478d7aa78450976cff9727106876a9e4db1045 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Mon, 21 Oct 2024 22:20:01 +0200 Subject: [PATCH 10/13] restrict ext, format, ftype in xsd --- lib/galaxy/tool_util/xsd/galaxy.xsd | 30 ++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/lib/galaxy/tool_util/xsd/galaxy.xsd b/lib/galaxy/tool_util/xsd/galaxy.xsd index 96405ed94b84..d1fd80bb08e0 100644 --- a/lib/galaxy/tool_util/xsd/galaxy.xsd +++ b/lib/galaxy/tool_util/xsd/galaxy.xsd @@ -1575,7 +1575,7 @@ used to load typed parameters. This string will be loaded as JSON and its type w attempt to be preserved through API requests to Galaxy. - + This attribute name should be included only with parameters of ``type`` ``data`` for the tool. If this @@ -1720,7 +1720,7 @@ generated as JSON. This can be useful for testing tool outputs that are not file ]]> - + - + The comma-separated list of accepted data formats for this input. The list of supported data formats is contained in the @@ -5639,7 +5639,7 @@ The default is ``galaxy.json``. - + The short name for the output datatype. The valid values for format can be found in @@ -6042,12 +6042,12 @@ More information can be found on Planemo's documentation for Indicates that the entire path of the discovered dataset relative to the specified directory should be available for matching patterns. - + Format (or datatype) of discovered datasets (an alias with ``ext``). - + Format (or datatype) of discovered datasets (an alias with ``format``). @@ -6115,12 +6115,12 @@ Galaxy, including: Indicates that the entire path of the discovered dataset relative to the specified directory should be available for matching patterns. - + Format (or datatype) of discovered datasets (an alias with ``ext``). - + Format (or datatype) of discovered datasets (an alias with ``format``). @@ -7418,7 +7418,7 @@ parameter (e.g. ``value="interval"`` above), or of the deprecated ``input_datase attribute. - + This value must be a supported data type (e.g. ``format="interval"``). See @@ -7844,6 +7844,18 @@ favour of a ``has_size`` assertion. + + + + + + + + + + + + Date: Tue, 22 Oct 2024 09:23:12 +0200 Subject: [PATCH 11/13] fix test: number of linters --- test/unit/tool_util/test_tool_linters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/tool_util/test_tool_linters.py b/test/unit/tool_util/test_tool_linters.py index 6c3a900aea4e..ea84b6c2a79e 100644 --- a/test/unit/tool_util/test_tool_linters.py +++ b/test/unit/tool_util/test_tool_linters.py @@ -2208,7 +2208,7 @@ def test_skip_by_module(lint_ctx): def test_list_linters(): linter_names = Linter.list_listers() # make sure to add/remove a test for new/removed linters if this number changes - assert len(linter_names) == 135 + assert len(linter_names) == 134 assert "Linter" not in linter_names # make sure that linters from all modules are available for prefix in [ From be3c54716147b3251a7c1690dfbad868fa112980 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 22 Oct 2024 09:28:31 +0200 Subject: [PATCH 12/13] document format of extension --- doc/source/dev/data_types.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/dev/data_types.md b/doc/source/dev/data_types.md index fb03c4fe5d04..16b970290be2 100644 --- a/doc/source/dev/data_types.md +++ b/doc/source/dev/data_types.md @@ -36,7 +36,8 @@ tag section of the `datatypes_conf.xml` file. Sample where - `extension` - the data type's Dataset file extension (e.g., `ab1`, `bed`, - `gff`, `qual`, etc.) + `gff`, `qual`, etc.). The extension must consist only of lowercase letters, + numbers, `_`, `-`, and `.`. - `type` - the path to the class for that data type. - `mimetype` - if present (it's optional), the data type's mime type - `display_in_upload` - if present (it's optional and defaults to False), the From 3b702569e1c2dcf3f1f94a622688e77093610b6c Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 22 Oct 2024 09:28:58 +0200 Subject: [PATCH 13/13] fix format --- lib/galaxy/tool_util/xsd/galaxy.xsd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/tool_util/xsd/galaxy.xsd b/lib/galaxy/tool_util/xsd/galaxy.xsd index 33edb06aa50c..d818f2438ab2 100644 --- a/lib/galaxy/tool_util/xsd/galaxy.xsd +++ b/lib/galaxy/tool_util/xsd/galaxy.xsd @@ -8030,7 +8030,7 @@ favour of a ``has_size`` assertion. - +