From dd30a3d4679e57537a6ac2138bd0b3f554ea46c3 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Tue, 2 Jul 2024 03:41:34 -0500 Subject: [PATCH 1/8] Fix path handling Signed-off-by: Ben Sherman --- plugins/nf-prov/src/main/nextflow/prov/BcoRenderer.groovy | 2 +- plugins/nf-prov/src/main/nextflow/prov/DagRenderer.groovy | 2 +- plugins/nf-prov/src/main/nextflow/prov/LegacyRenderer.groovy | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/BcoRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/BcoRenderer.groovy index 4b67e3c..6350e05 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/BcoRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/BcoRenderer.groovy @@ -43,7 +43,7 @@ class BcoRenderer implements Renderer { private PathNormalizer normalizer BcoRenderer(Map opts) { - path = opts.file as Path + path = (opts.file as Path).complete() overwrite = opts.overwrite as Boolean ProvHelper.checkFileOverwrite(path, overwrite) diff --git a/plugins/nf-prov/src/main/nextflow/prov/DagRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/DagRenderer.groovy index 453b650..6cd3448 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/DagRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/DagRenderer.groovy @@ -41,7 +41,7 @@ class DagRenderer implements Renderer { private PathNormalizer normalizer DagRenderer(Map opts) { - path = opts.file as Path + path = (opts.file as Path).complete() overwrite = opts.overwrite as Boolean ProvHelper.checkFileOverwrite(path, overwrite) diff --git a/plugins/nf-prov/src/main/nextflow/prov/LegacyRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/LegacyRenderer.groovy index 6b0f346..12b8972 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/LegacyRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/LegacyRenderer.groovy @@ -37,7 +37,7 @@ class LegacyRenderer implements Renderer { private boolean overwrite LegacyRenderer(Map opts) { - path = opts.file as Path + path = (opts.file as Path).complete() overwrite = opts.overwrite as Boolean ProvHelper.checkFileOverwrite(path, overwrite) From 16f39e0f1f4d3115eee47013926dc4aa9cc27ddc Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Wed, 3 Jul 2024 03:23:15 -0500 Subject: [PATCH 2/8] Release 1.2.3 Signed-off-by: Ben Sherman --- plugins/nf-prov/src/resources/META-INF/MANIFEST.MF | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/nf-prov/src/resources/META-INF/MANIFEST.MF b/plugins/nf-prov/src/resources/META-INF/MANIFEST.MF index 2db3eb5..0f2e693 100644 --- a/plugins/nf-prov/src/resources/META-INF/MANIFEST.MF +++ b/plugins/nf-prov/src/resources/META-INF/MANIFEST.MF @@ -1,6 +1,6 @@ Manifest-Version: 1.0 Plugin-Id: nf-prov -Plugin-Version: 1.2.2 +Plugin-Version: 1.2.3 Plugin-Class: nextflow.prov.ProvPlugin Plugin-Provider: nextflow Plugin-Requires: >=23.04.0 From c92b21f1e1529e4082fd005abe9bb06ce4a3b03d Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Thu, 11 Jul 2024 12:46:09 -0500 Subject: [PATCH 3/8] FIx race condition with workflow events Signed-off-by: Ben Sherman --- .../main/nextflow/prov/ProvObserver.groovy | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy b/plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy index 658de6d..f508959 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy @@ -19,6 +19,8 @@ package nextflow.prov import java.nio.file.FileSystems import java.nio.file.Path import java.nio.file.PathMatcher +import java.util.concurrent.locks.Lock +import java.util.concurrent.locks.ReentrantLock import groovy.transform.CompileStatic import groovy.util.logging.Slf4j @@ -50,6 +52,8 @@ class ProvObserver implements TraceObserver { private Map workflowOutputs = [:] + private Lock lock = new ReentrantLock() + ProvObserver(Map formats, List patterns) { this.renderers = formats.collect( (name, config) -> createRenderer(name, config) ) this.matchers = patterns.collect( pattern -> @@ -82,24 +86,27 @@ class ProvObserver implements TraceObserver { if( !task.isSuccess() ) return - tasks << task + lock.withLock { + tasks << task + } } @Override void onProcessCached(TaskHandler handler, TraceRecord trace) { - tasks << handler.task + lock.withLock { + tasks << handler.task + } } @Override void onFilePublish(Path destination, Path source) { - boolean match = matchers.isEmpty() || matchers.any { matcher -> - matcher.matches(destination) - } - + final match = matchers.isEmpty() || matchers.any { matcher -> matcher.matches(destination) } if( !match ) return - workflowOutputs[source] = destination + lock.withLock { + workflowOutputs[source] = destination + } } @Override @@ -107,9 +114,8 @@ class ProvObserver implements TraceObserver { if( !session.isSuccess() ) return - renderers.each( renderer -> + for( final renderer : renderers ) renderer.render(session, tasks, workflowOutputs) - ) } } From 881881e10464c8531c77a3640f39c906bac3794d Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Thu, 25 Jul 2024 23:30:57 -0500 Subject: [PATCH 4/8] Release 1.2.4 Signed-off-by: Ben Sherman --- plugins/nf-prov/src/resources/META-INF/MANIFEST.MF | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/nf-prov/src/resources/META-INF/MANIFEST.MF b/plugins/nf-prov/src/resources/META-INF/MANIFEST.MF index 0f2e693..1bb2525 100644 --- a/plugins/nf-prov/src/resources/META-INF/MANIFEST.MF +++ b/plugins/nf-prov/src/resources/META-INF/MANIFEST.MF @@ -1,6 +1,6 @@ Manifest-Version: 1.0 Plugin-Id: nf-prov -Plugin-Version: 1.2.3 +Plugin-Version: 1.2.4 Plugin-Class: nextflow.prov.ProvPlugin Plugin-Provider: nextflow Plugin-Requires: >=23.04.0 From 6d7383382b8b228c86c58b1b3a00c9ca212e1ab9 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Tue, 5 Nov 2024 19:09:36 +0100 Subject: [PATCH 5/8] Add passthrough options for BCO (#36) --------- Signed-off-by: Ben Sherman --- BCO.md | 180 ++++++++++++++++++ README.md | 2 +- plugins/nf-prov/build.gradle | 18 +- .../src/main/nextflow/prov/BcoRenderer.groovy | 57 +++++- .../src/main/nextflow/prov/DagRenderer.groovy | 16 +- .../src/resources/META-INF/MANIFEST.MF | 2 +- 6 files changed, 244 insertions(+), 31 deletions(-) create mode 100644 BCO.md diff --git a/BCO.md b/BCO.md new file mode 100644 index 0000000..c709ab9 --- /dev/null +++ b/BCO.md @@ -0,0 +1,180 @@ +# Additional BCO configuration + +*New in version 1.3.0* + +The `bco` format supports additional "pass-through" options for certain BCO fields. These fields cannot be inferred automatically from a pipeline or run, and so must be entered through the config. External systems can use these config options to inject fields automatically. + +The following config options are supported: + +- `prov.formats.bco.provenance_domain.review` +- `prov.formats.bco.provenance_domain.derived_from` +- `prov.formats.bco.provenance_domain.obsolete_after` +- `prov.formats.bco.provenance_domain.embargo` +- `prov.formats.bco.usability_domain` +- `prov.formats.bco.description_domain.keywords` +- `prov.formats.bco.description_domain.xref` +- `prov.formats.bco.execution_domain.external_data_endpoints` +- `prov.formats.bco.execution_domain.environment_variables` + +These options correspond exactly to fields in the BCO JSON schema. Refer to the [BCO User Guide](https://docs.biocomputeobject.org/user_guide/) for more information about these fields. + +*NOTE: The `environment_variables` setting differs from the BCO standard in that it only specifies the variable names. Only the variables specified in this list will be populated in the BCO, if they are present in the execution environment.* + +Here is an example config based on the BCO User Guide: + +```groovy +prov { + formats { + bco { + provenance_domain { + review = [ + [ + "status": "approved", + "reviewer_comment": "Approved by GW staff. Waiting for approval from FDA Reviewer", + "date": "2017-11-12T12:30:48-0400", + "reviewer": [ + "name": "Charles Hadley King", + "affiliation": "George Washington University", + "email": "hadley_king@gwu.edu", + "contribution": "curatedBy", + "orcid": "https://orcid.org/0000-0003-1409-4549" + ] + ], + [ + "status": "approved", + "reviewer_comment": "The revised BCO looks fine", + "date": "2017-12-12T12:30:48-0400", + "reviewer": [ + "name": "Eric Donaldson", + "affiliation": "FDA", + "email": "Eric.Donaldson@fda.hhs.gov", + "contribution": "curatedBy" + ] + ] + ] + derived_from = 'https://example.com/BCO_948701/1.0' + obsolete_after = '2118-09-26T14:43:43-0400' + embargo = [ + "start_time": "2000-09-26T14:43:43-0400", + "end_time": "2000-09-26T14:43:45-0400" + ] + } + usability_domain = [ + "Identify baseline single nucleotide polymorphisms (SNPs)[SO:0000694], (insertions)[SO:0000667], and (deletions)[SO:0000045] that correlate with reduced (ledipasvir)[pubchem.compound:67505836] antiviral drug efficacy in (Hepatitis C virus subtype 1)[taxonomy:31646]", + "Identify treatment emergent amino acid (substitutions)[SO:1000002] that correlate with antiviral drug treatment failure", + "Determine whether the treatment emergent amino acid (substitutions)[SO:1000002] identified correlate with treatment failure involving other drugs against the same virus", + "GitHub CWL example: https://github.com/mr-c/hive-cwl-examples/blob/master/workflow/hive-viral-mutation-detection.cwl#L20" + ] + description_domain { + keywords = [ + "HCV1a", + "Ledipasvir", + "antiviral resistance", + "SNP", + "amino acid substitutions" + ] + xref = [ + [ + "namespace": "pubchem.compound", + "name": "PubChem-compound", + "ids": ["67505836"], + "access_time": "2018-13-02T10:15-05:00" + ], + [ + "namespace": "pubmed", + "name": "PubMed", + "ids": ["26508693"], + "access_time": "2018-13-02T10:15-05:00" + ], + [ + "namespace": "so", + "name": "Sequence Ontology", + "ids": ["SO:000002", "SO:0000694", "SO:0000667", "SO:0000045"], + "access_time": "2018-13-02T10:15-05:00" + ], + [ + "namespace": "taxonomy", + "name": "Taxonomy", + "ids": ["31646"], + "access_time": "2018-13-02T10:15-05:00" + ] + ] + } + execution_domain { + external_data_endpoints = [ + [ + "url": "protocol://domain:port/application/path", + "name": "generic name" + ], + [ + "url": "ftp://data.example.com:21/", + "name": "access to ftp server" + ], + [ + "url": "http://eutils.ncbi.nlm.nih.gov/entrez/eutils", + "name": "access to e-utils web service" + ] + ] + environment_variables = ["HOSTTYPE", "EDITOR"] + } + } + } +} +``` + +Alternatively, you can use params to make it easier for an external system: + +```groovy +prov { + formats { + bco { + provenance_domain { + review = params.bco_provenance_domain_review + derived_from = params.bco_provenance_domain_derived_from + obsolete_after = params.bco_provenance_domain_obsolete_after + embargo = params.bco_provenance_domain_embargo + } + usability_domain = params.bco_usability_domain + description_domain { + keywords = params.bco_description_domain_keywords + xref = params.bco_description_domain_xref + } + execution_domain { + external_data_endpoints = params.bco_execution_domain_external_data_endpoints + environment_variables = params.bco_execution_domain_environment_variables + } + } + } +} +``` + +This way, the pass-through options can be provided as JSON in a [params file](https://nextflow.io/docs/latest/reference/cli.html#run): + +```jsonc +{ + "bco_provenance_domain_review": [ + // ... + ], + "derived_from": "...", + "obsolete_after": "...", + "embargo": { + "start_time": "...", + "end_time": "..." + }, + "bco_usability_domain": [ + // ... + ], + "bco_description_domain_keywords": [ + // ... + ], + "bco_description_domain_xref": [ + // ... + ], + "bco_execution_domain_external_data_endpoints": [ + // ... + ], + "bco_execution_domain_environment_variables": [ + // ... + ] +} +``` diff --git a/README.md b/README.md index e4e815c..b26df7e 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Configuration scope for the desired output formats. The following formats are av - `bco`: Render a [BioCompute Object](https://biocomputeobject.org/). Supports the `file` and `overwrite` options. - Visit the [BCO User Guide](https://docs.biocomputeobject.org/user_guide/) to learn more about this format and how to extend it with information that isn't available to Nextflow. + *New in version 1.3.0*: additional "pass-through" options are available for BCO fields that can't be inferred from the pipeline. See [BCO.md](./BCO.md) for more information. - `dag`: Render the task graph as a Mermaid diagram embedded in an HTML document. Supports the `file` and `overwrite` options. diff --git a/plugins/nf-prov/build.gradle b/plugins/nf-prov/build.gradle index 3eee961..4432fcf 100644 --- a/plugins/nf-prov/build.gradle +++ b/plugins/nf-prov/build.gradle @@ -56,21 +56,17 @@ sourceSets { dependencies { // This dependency is exported to consumers, that is to say found on their compile classpath. - compileOnly 'io.nextflow:nextflow:23.04.0' + compileOnly 'io.nextflow:nextflow:24.10.0' compileOnly 'org.slf4j:slf4j-api:1.7.10' - compileOnly 'org.pf4j:pf4j:3.4.1' - // add here plugins depepencies + compileOnly 'org.pf4j:pf4j:3.12.0' // test configuration - testImplementation "org.codehaus.groovy:groovy:3.0.8" - testImplementation "org.codehaus.groovy:groovy-nio:3.0.8" - testImplementation 'io.nextflow:nextflow:23.04.0' - testImplementation ("org.codehaus.groovy:groovy-test:3.0.8") { exclude group: 'org.codehaus.groovy' } + testImplementation 'io.nextflow:nextflow:24.10.0' testImplementation ("cglib:cglib-nodep:3.3.0") - testImplementation ("org.objenesis:objenesis:3.1") - testImplementation ("org.spockframework:spock-core:2.0-M3-groovy-3.0") { exclude group: 'org.codehaus.groovy'; exclude group: 'net.bytebuddy' } - testImplementation ('org.spockframework:spock-junit4:2.0-M3-groovy-3.0') { exclude group: 'org.codehaus.groovy'; exclude group: 'net.bytebuddy' } - testImplementation ('com.google.jimfs:jimfs:1.1') + testImplementation ("org.objenesis:objenesis:3.2") + testImplementation ("org.spockframework:spock-core:2.3-groovy-4.0") { exclude group: 'org.codehaus.groovy'; exclude group: 'net.bytebuddy' } + testImplementation ('org.spockframework:spock-junit4:2.3-groovy-4.0') { exclude group: 'org.codehaus.groovy'; exclude group: 'net.bytebuddy' } + testImplementation ('com.google.jimfs:jimfs:1.2') // see https://docs.gradle.org/4.1/userguide/dependency_management.html#sec:module_replacement modules { diff --git a/plugins/nf-prov/src/main/nextflow/prov/BcoRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/BcoRenderer.groovy index 6350e05..aededdb 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/BcoRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/BcoRenderer.groovy @@ -23,10 +23,14 @@ import java.time.format.DateTimeFormatter import groovy.json.JsonOutput import groovy.transform.CompileStatic import nextflow.Session +import nextflow.SysEnv +import nextflow.config.Manifest import nextflow.processor.TaskRun import nextflow.script.WorkflowMetadata import nextflow.util.CacheHelper +import static nextflow.config.Manifest.ContributionType + /** * Renderer for the BioCompute Object (BCO) format. * @@ -63,10 +67,21 @@ class BcoRenderer implements Renderer { final nextflowMeta = metadata.nextflow final dateCreated = DateTimeFormatter.ISO_OFFSET_DATE_TIME.format(metadata.start) - final authors = (manifest.author ?: '').tokenize(',')*.trim() + final contributors = getContributors(manifest) final nextflowVersion = nextflowMeta.version.toString() final params = session.config.params as Map + final config = session.config + final review = config.navigate('prov.formats.bco.provenance_domain.review', []) as List> + final derived_from = config.navigate('prov.formats.bco.provenance_domain.derived_from') as String + final obsolete_after = config.navigate('prov.formats.bco.provenance_domain.obsolete_after') as String + final embargo = config.navigate('prov.formats.bco.provenance_domain.embargo') as Map + final usability = config.navigate('prov.formats.bco.usability_domain', []) as List + final keywords = config.navigate('prov.formats.bco.description_domain.keywords', []) as List + final xref = config.navigate('prov.formats.bco.description_domain.xref', []) as List> + final external_data_endpoints = config.navigate('prov.formats.bco.execution_domain.external_data_endpoints', []) as List> + final environment_variables = config.navigate('prov.formats.bco.execution_domain.environment_variables', []) as List + // create BCO manifest final bco = [ "object_id": null, @@ -75,18 +90,20 @@ class BcoRenderer implements Renderer { "provenance_domain": [ "name": manifest.name ?: "", "version": manifest.version ?: "", + "review": review, + "derived_from": derived_from, + "obsolete_after": obsolete_after, + "embargo": embargo, "created": dateCreated, "modified": dateCreated, - "contributors": authors.collect( name -> [ - "contribution": ["authoredBy"], - "name": name - ] ), - "license": "" + "contributors": contributors, + "license": manifest.license ], - "usability_domain": [], + "usability_domain": usability, "extension_domain": [], "description_domain": [ - "keywords": [], + "keywords": keywords, + "xref": xref, "platform": ["Nextflow"], "pipeline_steps": tasks.sort( (task) -> task.id ).collect { task -> [ "step_number": task.id, @@ -112,8 +129,12 @@ class BcoRenderer implements Renderer { ] ] ], - "external_data_endpoints": [], - "environment_variables": [:] + "external_data_endpoints": external_data_endpoints, + "environment_variables": environment_variables.inject([:]) { acc, name -> + if( SysEnv.containsKey(name) ) + acc.put(name, SysEnv.get(name)) + acc + } ], "parametric_domain": params.toConfigObject().flatten().collect( (k, v) -> [ "param": k, @@ -171,4 +192,20 @@ class BcoRenderer implements Renderer { path.text = JsonOutput.prettyPrint(JsonOutput.toJson(bco)) } + private List getContributors(Manifest manifest) { + manifest.contributors.collect { c -> [ + "name": c.name, + "affiliation": c.affiliation, + "email": c.email, + "contribution": c.contribution.collect { ct -> CONTRIBUTION_TYPES[ct] }, + "orcid": c.orcid + ] } + } + + private static Map CONTRIBUTION_TYPES = [ + (ContributionType.AUTHOR) : "authoredBy", + (ContributionType.MAINTAINER) : "curatedBy", + (ContributionType.CONTRIBUTOR) : "curatedBy", + ] + } diff --git a/plugins/nf-prov/src/main/nextflow/prov/DagRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/DagRenderer.groovy index 6cd3448..af23828 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/DagRenderer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/DagRenderer.groovy @@ -64,8 +64,8 @@ class DagRenderer implements Renderer { } private Map getVertices(Set tasks) { - def result = [:] - for( def task : tasks ) { + Map result = [:] + for( final task : tasks ) { final inputs = task.getInputFilesMap() final outputs = ProvHelper.getTaskOutputs(task) @@ -154,7 +154,7 @@ class DagRenderer implements Renderer { } // render task outputs - final outputs = [:] as Map + Map outputs = [:] dag.vertices.each { task, vertex -> vertex.outputs.each { path -> @@ -184,11 +184,11 @@ class DagRenderer implements Renderer { * @param vertices */ private Map getTaskTree(Map vertices) { - def taskTree = [:] + final taskTree = [:] - for( def entry : vertices ) { - def task = entry.key - def vertex = entry.value + for( final entry : vertices ) { + final task = entry.key + final vertex = entry.value // infer subgraph keys from fully qualified process name final result = getSubgraphKeys(task.processor.name) @@ -200,7 +200,7 @@ class DagRenderer implements Renderer { // navigate to given subgraph def subgraph = taskTree - for( def key : keys ) { + for( final key : keys ) { if( key !in subgraph ) subgraph[key] = [:] subgraph = subgraph[key] diff --git a/plugins/nf-prov/src/resources/META-INF/MANIFEST.MF b/plugins/nf-prov/src/resources/META-INF/MANIFEST.MF index 1bb2525..0497576 100644 --- a/plugins/nf-prov/src/resources/META-INF/MANIFEST.MF +++ b/plugins/nf-prov/src/resources/META-INF/MANIFEST.MF @@ -3,4 +3,4 @@ Plugin-Id: nf-prov Plugin-Version: 1.2.4 Plugin-Class: nextflow.prov.ProvPlugin Plugin-Provider: nextflow -Plugin-Requires: >=23.04.0 +Plugin-Requires: >=24.10.0 From 9b5e7bb9342993760ee46cfb8321a6354a2f2d98 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Tue, 5 Nov 2024 19:15:47 +0100 Subject: [PATCH 6/8] Release 1.3.0 Signed-off-by: Ben Sherman --- README.md | 2 ++ plugins/nf-prov/src/resources/META-INF/MANIFEST.MF | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b26df7e..0e68a96 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ Nextflow plugin to render provenance reports for pipeline runs. Now supporting [ The `nf-prov` plugin requires Nextflow version `23.04.0` or later. +*New in version 1.3.0: requires Nextflow 24.10.0 or later.* + To enable and configure `nf-prov`, include the following snippet to your Nextflow config and update as needed. ```groovy diff --git a/plugins/nf-prov/src/resources/META-INF/MANIFEST.MF b/plugins/nf-prov/src/resources/META-INF/MANIFEST.MF index 0497576..3d994c9 100644 --- a/plugins/nf-prov/src/resources/META-INF/MANIFEST.MF +++ b/plugins/nf-prov/src/resources/META-INF/MANIFEST.MF @@ -1,6 +1,6 @@ Manifest-Version: 1.0 Plugin-Id: nf-prov -Plugin-Version: 1.2.4 +Plugin-Version: 1.3.0 Plugin-Class: nextflow.prov.ProvPlugin Plugin-Provider: nextflow Plugin-Requires: >=24.10.0 From 4bbf0c1e0e52b893b18819e64ba90acbf54ea375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20B=C3=A4uerle?= <45968370+famosab@users.noreply.github.com> Date: Thu, 6 Feb 2025 23:44:37 +0900 Subject: [PATCH 7/8] Add Workflow Run RO-crate format (#39) Signed-off-by: fbartusch Signed-off-by: Ben Sherman Co-authored-by: fbartusch Co-authored-by: Ben Sherman --- README.md | 28 +- WRROC.md | 47 + nextflow.config | 8 + .../main/nextflow/prov/PathNormalizer.groovy | 14 +- .../src/main/nextflow/prov/ProvHelper.groovy | 32 + .../main/nextflow/prov/ProvObserver.groovy | 5 +- .../main/nextflow/prov/WrrocRenderer.groovy | 993 ++++++++++++++++++ 7 files changed, 1105 insertions(+), 22 deletions(-) create mode 100644 WRROC.md create mode 100644 plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy diff --git a/README.md b/README.md index 0e68a96..970566a 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ prov { } ``` -Finally, run your Nextflow pipeline. You do not need to modify your pipeline script in order to use the `nf-prov` plugin. The plugin will automatically generate a JSON file with provenance information. +Finally, run your Nextflow pipeline. You do not need to modify your pipeline script in order to use the `nf-prov` plugin. The plugin will automatically produce the specified provenance reports at the end of the workflow run. ## Configuration @@ -44,14 +44,16 @@ Create the provenance report (default: `true` if plugin is loaded). Configuration scope for the desired output formats. The following formats are available: -- `bco`: Render a [BioCompute Object](https://biocomputeobject.org/). Supports the `file` and `overwrite` options. - - *New in version 1.3.0*: additional "pass-through" options are available for BCO fields that can't be inferred from the pipeline. See [BCO.md](./BCO.md) for more information. +- `bco`: Render a [BioCompute Object](https://biocomputeobject.org/). Supports the `file` and `overwrite` options. See [BCO.md](./BCO.md) for more information about the additional config options for BCO. - `dag`: Render the task graph as a Mermaid diagram embedded in an HTML document. Supports the `file` and `overwrite` options. - `legacy`: Render the legacy format originally defined in this plugin (default). Supports the `file` and `overwrite` options. +*New in version 1.4.0* + +- `wrroc`: Render a [Workflow Run RO-Crate](https://www.researchobject.org/workflow-run-crate/). Includes all three profiles (Process, Workflow, and Provenance). See [WRROC.md](./WRROC.md) for more information about the additional config options for WRROC. + Any number of formats can be specified, for example: ```groovy @@ -69,6 +71,8 @@ prov { } ``` +See [nextflow.config](./nextflow.config) for a full example of each provenance format. + `prov.patterns` List of file patterns to include in the provenance report, from the set of published files. By default, all published files are included. @@ -114,16 +118,16 @@ Following these step to package, upload and publish the plugin: 2. Update the `Plugin-Version` field in the following file with the release version: - ```bash - plugins/nf-prov/src/resources/META-INF/MANIFEST.MF - ``` + ```bash + plugins/nf-prov/src/resources/META-INF/MANIFEST.MF + ``` 3. Run the following command to package and upload the plugin in the GitHub project releases page: - ```bash - ./gradlew :plugins:nf-prov:upload - ``` + ```bash + ./gradlew :plugins:nf-prov:upload + ``` -4. Create a pull request against the [nextflow-io/plugins](https://github.com/nextflow-io/plugins/blob/main/plugins.json) - project to make the plugin public accessible to Nextflow app. +4. Create a pull request against the [nextflow-io/plugins](https://github.com/nextflow-io/plugins/blob/main/plugins.json) + project to make the plugin public accessible to Nextflow app. diff --git a/WRROC.md b/WRROC.md new file mode 100644 index 0000000..c26e21a --- /dev/null +++ b/WRROC.md @@ -0,0 +1,47 @@ +# Additional WRROC configuration + +*New in version 1.4.0* + +The `wrroc` format supports additional options to configure certain aspects of the Workflow Run RO-Crate. These fields cannot be inferred automatically from the pipeline or the run, and so must be entered through the config. + +The following config options are supported: + +- `prov.formats.wrroc.agent.contactType` +- `prov.formats.wrroc.agent.email` +- `prov.formats.wrroc.agent.name` +- `prov.formats.wrroc.agent.orcid` +- `prov.formats.wrroc.agent.phone` +- `prov.formats.wrroc.agent.ror` +- `prov.formats.wrroc.organization.contactType` +- `prov.formats.wrroc.organization.email` +- `prov.formats.wrroc.organization.name` +- `prov.formats.wrroc.organization.phone` +- `prov.formats.wrroc.organization.ror` +- `prov.formats.wrroc.license` +- `prov.formats.wrroc.publisher` + +Refer to the [WRROC User Guide](https://www.researchobject.org/workflow-run-crate/) for more information about the associated RO-Crate entities. + +Here is an example config: + +```groovy +prov { + formats { + wrroc { + agent { + name = "John Doe" + orcid = "https://orcid.org/0000-0000-0000-0000" + email = "john.doe@example.org" + phone = "(0)89-99998 000" + contactType = "Researcher" + } + organization { + name = "University of XYZ" + ror = "https://ror.org/000000000" + } + license = "https://spdx.org/licenses/MIT" + publisher = "https://ror.org/000000000" + } + } +} +``` diff --git a/nextflow.config b/nextflow.config index 6219b1b..82b9e4d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -20,5 +20,13 @@ prov { file = "${params.outdir}/manifest.json" overwrite = true } + wrroc { + file = "${params.outdir}/ro-crate-metadata.json" + overwrite = true + } } } + +manifest { + license = "https://spdx.org/licenses/Apache-2.0" +} diff --git a/plugins/nf-prov/src/main/nextflow/prov/PathNormalizer.groovy b/plugins/nf-prov/src/main/nextflow/prov/PathNormalizer.groovy index f0dc26f..5d57dac 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/PathNormalizer.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/PathNormalizer.groovy @@ -32,8 +32,6 @@ class PathNormalizer { private String commitId - private String launchDir - private String projectDir private String workDir @@ -42,14 +40,12 @@ class PathNormalizer { repository = metadata.repository ? new URL(metadata.repository) : null commitId = metadata.commitId projectDir = metadata.projectDir.toUriString() - launchDir = metadata.launchDir.toUriString() workDir = metadata.workDir.toUriString() } /** - * Normalize paths so that local absolute paths become - * relative paths, and local paths derived from remote URLs - * become the URLs. + * Normalize paths against the original remote URL, or + * work directory, where appropriate. * * @param path */ @@ -66,9 +62,9 @@ class PathNormalizer { if( repository && path.startsWith(projectDir) ) return getProjectSourceUrl(path) - // replace launch directory with relative path - if( path.startsWith(launchDir) ) - return path.replace(launchDir + '/', '') + // encode local absolute paths as file URLs + if( path.startsWith('/') ) + return 'file://' + path return path } diff --git a/plugins/nf-prov/src/main/nextflow/prov/ProvHelper.groovy b/plugins/nf-prov/src/main/nextflow/prov/ProvHelper.groovy index 0a3ceba..d4ed030 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/ProvHelper.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/ProvHelper.groovy @@ -19,6 +19,7 @@ package nextflow.prov import java.nio.file.Path import groovy.transform.CompileStatic +import nextflow.Session import nextflow.exception.AbortOperationException import nextflow.file.FileHelper import nextflow.processor.TaskRun @@ -49,6 +50,15 @@ class ProvHelper { } } + /** + * Get the remote file staging directory for a workflow run. + * + * @param session + */ + static Path getStageDir(Session session) { + return session.workDir.resolve("stage-${session.uniqueId}") + } + /** * Get the list of output files for a task. * @@ -98,4 +108,26 @@ class ProvHelper { return result } + /** + * Determine whether a task input file was staged into the work directory. + * + * @param source + * @param session + */ + static boolean isStagedInput(Path source, Session session) { + return source.startsWith(getStageDir(session)) + } + + /** + * Determine whether a task input file was created in the work/tmp/ + * directory (i.e. by a collectFile operator). + * + * @param source + * @param session + */ + static boolean isTmpInput(Path source, Session session) { + final tmpDir = session.workDir.resolve('tmp') + return source.startsWith(tmpDir) + } + } diff --git a/plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy b/plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy index f508959..7dca105 100644 --- a/plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy +++ b/plugins/nf-prov/src/main/nextflow/prov/ProvObserver.groovy @@ -40,7 +40,7 @@ import nextflow.trace.TraceRecord @CompileStatic class ProvObserver implements TraceObserver { - public static final List VALID_FORMATS = ['bco', 'dag', 'legacy'] + public static final List VALID_FORMATS = ['bco', 'dag', 'legacy', 'wrroc'] private Session session @@ -71,6 +71,9 @@ class ProvObserver implements TraceObserver { if( name == 'legacy' ) return new LegacyRenderer(opts) + if( name == 'wrroc' ) + return new WrrocRenderer(opts) + throw new IllegalArgumentException("Invalid provenance format -- valid formats are ${VALID_FORMATS.join(', ')}") } diff --git a/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy new file mode 100644 index 0000000..4a4fc4d --- /dev/null +++ b/plugins/nf-prov/src/main/nextflow/prov/WrrocRenderer.groovy @@ -0,0 +1,993 @@ +/* + * Copyright 2023, Seqera Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package nextflow.prov + +import java.nio.file.FileSystems +import java.nio.file.Files +import java.nio.file.Path +import java.time.LocalDateTime +import java.time.format.DateTimeFormatter + +import groovy.json.JsonOutput +import groovy.json.JsonSlurper +import groovy.transform.CompileStatic +import groovy.util.logging.Slf4j +import nextflow.Session +import nextflow.processor.TaskProcessor +import nextflow.processor.TaskRun +import nextflow.script.ProcessDef +import nextflow.script.ScriptMeta +import nextflow.util.ConfigHelper +import nextflow.util.Duration +import nextflow.util.MemoryUnit +import org.yaml.snakeyaml.Yaml + +/** + * Renderer for the Provenance Run RO Crate format. + * + * @author Ben Sherman + * @author Felix Bartusch + * @author Famke Bäuerle + */ +@Slf4j +@CompileStatic +class WrrocRenderer implements Renderer { + + private static final List README_FILENAMES = List.of("README.md", "README.txt", "readme.md", "readme.txt", "Readme.md", "Readme.txt", "README") + + private Path path + + private boolean overwrite + + @Delegate + private PathNormalizer normalizer + + // List of contact points (people, organizations) to be added + private List contactPoints = [] + + WrrocRenderer(Map opts) { + path = (opts.file as Path).complete() + overwrite = opts.overwrite as Boolean + + ProvHelper.checkFileOverwrite(path, overwrite) + } + + @Override + void render(Session session, Set tasks, Map workflowOutputs) { + // get workflow inputs + final taskLookup = ProvHelper.getTaskLookup(tasks) + final workflowInputs = ProvHelper.getWorkflowInputs(tasks, taskLookup) + + // get workflow metadata + final metadata = session.workflowMetadata + final crateDir = path.getParent() + final projectDir = metadata.projectDir + this.normalizer = new PathNormalizer(metadata) + + final manifest = metadata.manifest + final scriptFile = metadata.getScriptFile() + + final formatter = DateTimeFormatter.ISO_OFFSET_DATE_TIME + final dateStarted = formatter.format(metadata.start) + final dateCompleted = formatter.format(metadata.complete) + final nextflowVersion = metadata.nextflow.version.toString() + final params = session.params + + // parse wrroc configuration + final wrrocOpts = session.config.navigate('prov.formats.wrroc', [:]) as Map + final agent = getAgentInfo(wrrocOpts) + final organization = getOrganizationInfo(wrrocOpts) + final publisherId = getPublisherId(wrrocOpts, agent, organization) + if( organization ) + agent["affiliation"] = ["@id": organization["@id"]] + + // create manifest + final datasetParts = [] + + // -- license + if( wrrocOpts.license ) { + datasetParts.add([ + "@id" : wrrocOpts.license, + "@type": "CreativeWork" + ]) + } + else { + log.warn "Missing license for Workflow Run RO-Crate -- the resulting crate will be invalid" + } + + // -- readme file + for( final fileName : README_FILENAMES ) { + final readmePath = projectDir.resolve(fileName) + if( !Files.exists(readmePath) ) + continue + + readmePath.copyTo(crateDir) + datasetParts.add([ + "@id" : fileName, + "@type" : "File", + "name" : fileName, + "description" : "The README file of the workflow.", + "encodingFormat": getEncodingFormat(readmePath) ?: "text/plain" + ]) + break + } + + // -- main script + final mainScriptId = metadata.scriptFile.name + final softwareApplicationId = "${mainScriptId}#software-application" + final organizeActionId = "${mainScriptId}#organize" + metadata.scriptFile.copyTo(crateDir) + + if( !metadata.repository ) + log.warn "Could not determine pipeline repository URL for Workflow Run RO-Crate -- launch the pipeline with canonical URL (e.g. `nextflow run nextflow-io/hello`) to ensure that the pipeline repository URL is recorded in the crate" + + // -- parameter schema + final schemaPath = scriptFile.getParent().resolve("nextflow_schema.json") + Map paramSchema = [:] + if( Files.exists(schemaPath) ) { + final fileName = schemaPath.name + + schemaPath.copyTo(crateDir) + datasetParts.add([ + "@id" : fileName, + "@type" : "File", + "name" : fileName, + "description" : "The parameter schema of the workflow.", + "encodingFormat": "application/json" + ]) + paramSchema = getParameterSchema(schemaPath) + } + + // -- resolved config + final configPath = crateDir.resolve("nextflow.config") + configPath.text = ConfigHelper.toCanonicalString(session.config, true) + + datasetParts.add([ + "@id" : "nextflow.config", + "@type" : "File", + "name" : "Resolved Nextflow configuration", + "description" : "The resolved Nextflow configuration for the workflow run.", + "encodingFormat": "text/plain" + ]) + + // -- pipeline parameters + // TODO: formal parameters for workflow output targets + final formalParameters = params + .findAll { name, value -> value != null } + .collect { name, value -> + final schema = paramSchema[name] ?: [:] + final type = getParameterType(name, value, schema) + final encoding = type == "File" + ? getEncodingFormat(value as Path) + : null + + if( !type ) + log.warn "Could not determine type of parameter `${name}` for Workflow Run RO-Crate -- the resulting crate will be invalid" + + return withoutNulls([ + "@id" : getFormalParameterId(name), + "@type" : "FormalParameter", + "additionalType": type, + "conformsTo" : ["@id": "https://bioschemas.org/profiles/FormalParameter/1.0-RELEASE"], + "encodingFormat": encoding, + "name" : name, + "defaultValue" : schema.default, + "description" : schema.description, + ]) + } + + final propertyValues = params + .findAll { name, value -> value != null } + .collect { name, value -> + final paramId = getFormalParameterId(name) + final normalized = normalizeParamValue(value) + + return [ + "@id" : "${paramId}/value", + "@type" : "PropertyValue", + "exampleOfWork": ["@id": paramId], + "name" : name, + "value" : normalized + ] + } + + // -- input files + Map paramInputFiles = [:] + + params.each { name, value -> + if( !value ) + return + final schema = paramSchema[name] ?: [:] + final type = getParameterType(name, value, schema) + if( type != "File" ) + return + final source = (value as Path).complete() + // don't try to download remote files + if( source.fileSystem != FileSystems.default ) + return + // don't try to copy local directories + if( !source.isFile() ) + return + paramInputFiles.put(source, name) + } + + final inputFiles = workflowInputs + .findAll { source -> + !ProvHelper.isStagedInput(source, session) && !ProvHelper.isTmpInput(source, session) + } + .collect { source -> + final paramName = paramInputFiles[source] + if( paramName ) { + log.debug "Copying input file specified by `params.${paramName}` into RO-Crate: ${source.toUriString()}" + source.copyTo(crateDir) + } + + withoutNulls([ + "@id" : paramName ? source.name : normalizePath(source), + "@type" : getType(source), + "name" : source.name, + "encodingFormat": getEncodingFormat(source), + ]) + } + + // -- copy local input files specified by params to crate + params.each { name, value -> + if( !value ) + return + final schema = paramSchema[name] ?: [:] + final type = getParameterType(name, value, schema) + if( type == "File" ) { + final source = (value as Path).complete() + // don't try to download remote files + if( source.fileSystem != FileSystems.default ) + return + // don't try to copy local directories + if( !source.isFile() ) + return + inputFiles.add(withoutNulls([ + "@id" : source.name, + "@type" : type, + "description" : "Input file specified by params.${name}", + "encodingFormat": getEncodingFormat(source) + ])) + log.debug "Copying input file specified by params.${name} into RO-Crate: ${source.toUriString()}" + source.copyTo(crateDir) + } + } + + // -- output files + final outputFiles = workflowOutputs + .findAll { source, target -> + // warn about any output files outside of the crate directory + final result = target.startsWith(crateDir) + if( !result ) + log.warn "Excluding workflow output ${target} because it is outside of the RO-Crate directory -- make sure that the workflow output directory and RO-Crate directory are the same" + return result + } + .collect { source, target -> + withoutNulls([ + "@id" : crateDir.relativize(target).toString(), + "@type" : getType(target), + "name" : target.name, + "encodingFormat": getEncodingFormat(target), + ]) + } + + // -- workflow definition + final taskProcessors = tasks + .collect { task -> task.processor } + .unique() + + final processDefs = taskProcessors + .collect { process -> ScriptMeta.get(process.getOwnerScript()) } + .unique() + .collectMany { meta -> + meta.getDefinitions().findAll { defn -> defn instanceof ProcessDef } as List + } + + final processLookup = taskProcessors + .inject([:] as Map) { acc, processor -> + // HACK: when the owner script of a processor defines only one process, that must be the definition + final meta = ScriptMeta.get(processor.getOwnerScript()) + final defs = meta.getDefinitions().findAll { defn -> defn instanceof ProcessDef } as List + final processDef = defs.size() == 1 ? defs.first() : null + if( !processDef ) + log.warn "Could not identify process definition for `${processor.name}` -- resulting RO-Crate may be invalid (hint: define each process in a separate module script to fix this issue)" + acc[processor] = processDef + acc + } + + final moduleSoftwareApplications = processDefs + .collect() { process -> + final result = [ + "@id" : getModuleId(process), + "@type" : "SoftwareApplication", + "name" : process.baseName, + "url" : getModuleUrl(process), + ] + + final metaYaml = getModuleSchema(process) + if( metaYaml ) { + final name = metaYaml.name as String + final tools = metaYaml.getOrDefault('tools', []) as List + final parts = tools.collect { tool -> + final entry = (tool as Map).entrySet().first() + final toolName = entry.key as String + ["@id": getToolId(process.baseName, toolName)] + } + + if( name ) + result.name = name + if( parts ) + result.hasPart = parts + } + + return result + } + + final toolSoftwareApplications = processDefs + .collectMany { process -> + final metaYaml = getModuleSchema(process) + if( !metaYaml ) + return [] + + final tools = metaYaml.getOrDefault('tools', []) as List + return tools + .collect { tool -> + final entry = (tool as Map).entrySet().first() + final toolName = entry.key as String + final toolDescription = (entry.value as Map)?.get('description') as String + return [ + "@id" : getToolId(process.baseName, toolName), + "@type" : "SoftwareApplication", + "name" : toolName, + "description" : toolDescription + ] + } + } + + final howToSteps = taskProcessors + .collect() { process -> + final processDef = processLookup[process] + [ + "@id" : getProcessStepId(process), + "@type" : "HowToStep", + "workExample": processDef ? ["@id": getModuleId(processDef)] : null, + "position" : process.getId() + ] + } + + final controlActions = taskProcessors + .collect() { process -> + final taskIds = tasks + .findAll { task -> task.processor == process } + .collect { task -> ["@id": getTaskId(task)] } + + return [ + "@id" : getProcessControlId(process), + "@type" : "ControlAction", + "instrument": ["@id": getProcessStepId(process)], + "name" : "Orchestrate process ${process.name}", + "object" : taskIds + ] + } + + // -- workflow execution + final stagedInputs = workflowInputs + .findAll { source -> ProvHelper.isStagedInput(source, session) } + .collect { source -> + final name = getStagedInputName(source, session) + + withoutNulls([ + "@id" : "#stage/${name}", + "@type" : "CreativeWork", + "name" : name, + "encodingFormat": getEncodingFormat(source), + ]) + } + + final tmpInputs = workflowInputs + .findAll { source -> ProvHelper.isTmpInput(source, session) } + .collect { source -> + withoutNulls([ + "@id" : "#tmp/${source.name}", + "@type" : "CreativeWork", + "name" : source.name, + "encodingFormat": getEncodingFormat(source), + ]) + } + + final taskCreateActions = tasks + .collect { task -> + final processDef = processLookup[task.processor] + final inputs = task.getInputFilesMap().collect { name, source -> + final id = + source in taskLookup ? getTaskOutputId(taskLookup[source], source) + : ProvHelper.isStagedInput(source, session) ? "#stage/${getStagedInputName(source, session)}" + : ProvHelper.isTmpInput(source, session) ? "#tmp/${source.name}" + : normalizePath(source) + ["@id": id] + } + final outputs = ProvHelper.getTaskOutputs(task).collect { target -> + ["@id": getTaskOutputId(task, target)] + } + final result = [ + "@id" : getTaskId(task), + "@type" : "CreateAction", + "name" : task.name, + "instrument" : processDef ? ["@id": getModuleId(processDef)] : null, + "agent" : agent ? ["@id": agent["@id"]] : null, + "object" : inputs, + "result" : outputs, + "actionStatus": task.exitStatus == 0 ? "http://schema.org/CompletedActionStatus" : "http://schema.org/FailedActionStatus" + ] + if( task.exitStatus != 0 ) + result["error"] = task.stderr + return result + } + + final taskOutputs = tasks.collectMany { task -> + ProvHelper.getTaskOutputs(task).collect { target -> + final name = getTaskOutputName(task, target) + + return withoutNulls([ + "@id" : getTaskOutputId(task, name), + "@type" : "CreativeWork", + "name" : name, + "encodingFormat": getEncodingFormat(target), + ]) + } + } + + final publishCreateActions = workflowOutputs + .collect { source, target -> + final task = taskLookup[source] + final sourceName = getTaskOutputName(task, source) + + return [ + "@id" : "#publish/${task.hash}/${sourceName}", + "@type" : "CreateAction", + "name" : "publish", + "instrument" : ["@id": softwareApplicationId], + "object" : ["@id": getTaskOutputId(task, sourceName)], + "result" : ["@id": crateDir.relativize(target).toString()], + "actionStatus": "http://schema.org/CompletedActionStatus" + ] + } + + final wrroc = [ + "@context": "https://w3id.org/ro/crate/1.1/context", + "@graph" : withoutNulls([ + [ + "@id" : path.name, + "@type" : "CreativeWork", + "about" : ["@id": "./"], + "conformsTo": [ + ["@id": "https://w3id.org/ro/crate/1.1"], + ["@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0"] + ] + ], + withoutNulls([ + "@id" : "./", + "@type" : "Dataset", + "author" : agent ? ["@id": agent["@id"]] : null, + "publisher" : publisherId ? ["@id": publisherId] : null, + "datePublished": getDatePublished(), + "conformsTo" : [ + ["@id": "https://w3id.org/ro/wfrun/process/0.1"], + ["@id": "https://w3id.org/ro/wfrun/workflow/0.1"], + ["@id": "https://w3id.org/ro/wfrun/provenance/0.1"], + ["@id": "https://w3id.org/workflowhub/workflow-ro-crate/1.0"] + ], + "name" : "Workflow run of ${manifest.name ?: metadata.projectName}", + "description": manifest.description ?: null, + "hasPart" : withoutNulls([ + ["@id": mainScriptId], + *asReferences(datasetParts), + *asReferences(inputFiles), + *asReferences(outputFiles) + ]), + "mainEntity" : ["@id": mainScriptId], + "mentions" : [ + ["@id": "#${session.uniqueId}"], + *asReferences(stagedInputs), + *asReferences(tmpInputs), + *asReferences(taskCreateActions), + *asReferences(taskOutputs), + *asReferences(publishCreateActions), + ], + "license" : wrrocOpts.license + ]), + [ + "@id" : "https://w3id.org/ro/wfrun/process/0.1", + "@type" : "CreativeWork", + "name" : "Process Run Crate", + "version": "0.1" + ], + [ + "@id" : "https://w3id.org/ro/wfrun/workflow/0.1", + "@type" : "CreativeWork", + "name" : "Workflow Run Crate", + "version": "0.1" + ], + [ + "@id" : "https://w3id.org/ro/wfrun/provenance/0.1", + "@type" : "CreativeWork", + "name" : "Provenance Run Crate", + "version": "0.1" + ], + [ + "@id" : "https://w3id.org/workflowhub/workflow-ro-crate/1.0", + "@type" : "CreativeWork", + "name" : "Workflow RO-Crate", + "version": "1.0" + ], + withoutNulls([ + "@id" : mainScriptId, + "@type" : ["File", "SoftwareSourceCode", "ComputationalWorkflow", "HowTo"], + "conformsTo" : ["@id": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE"], + "name" : manifest.name ?: metadata.projectName, + "description" : manifest.description, + "programmingLanguage": ["@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow"], + "creator" : manifest.author, + "codeRepository" : metadata.repository, + "version" : metadata.commitId, + "license" : manifest.license, + "url" : metadata.repository ? normalizePath(metadata.scriptFile) : null, + "encodingFormat" : "application/nextflow", + "runtimePlatform" : "Nextflow " + nextflowVersion, + "hasPart" : asReferences(moduleSoftwareApplications), + "input" : asReferences(formalParameters), + "output" : [ + // TODO: workflow output targets + ], + "step" : asReferences(howToSteps), + ]), + [ + "@id" : "https://w3id.org/workflowhub/workflow-ro-crate#nextflow", + "@type" : "ComputerLanguage", + "name" : "Nextflow", + "identifier": "https://www.nextflow.io/", + "url" : "https://www.nextflow.io/", + "version" : nextflowVersion + ], + [ + "@id" : softwareApplicationId, + "@type": "SoftwareApplication", + "name" : "Nextflow ${nextflowVersion}" + ], + *moduleSoftwareApplications, + *toolSoftwareApplications, + *formalParameters, + *howToSteps, + [ + "@id" : organizeActionId, + "@type" : "OrganizeAction", + "agent" : agent ? ["@id": agent["@id"]] : null, + "instrument": ["@id": softwareApplicationId], + "name" : "Run of Nextflow ${nextflowVersion}", + "object" : asReferences(controlActions), + "result" : ["@id": "#${session.uniqueId}"], + "startTime" : dateStarted, + "endTime" : dateCompleted + ], + [ + "@id" : "#${session.uniqueId}", + "@type" : "CreateAction", + "agent" : agent ? ["@id": agent["@id"]] : null, + "name" : "Nextflow workflow run ${session.uniqueId}", + "startTime" : dateStarted, + "endTime" : dateCompleted, + "instrument": ["@id": mainScriptId], + "object" : [ + *asReferences(propertyValues), + *asReferences(inputFiles), + ], + "result" : asReferences(outputFiles) + ], + agent, + organization, + *contactPoints, + *datasetParts, + *propertyValues, + *controlActions, + *stagedInputs, + *tmpInputs, + *taskCreateActions, + *taskOutputs, + *publishCreateActions, + *inputFiles, + *outputFiles, + ]) + ] + + // render manifest to JSON file + path.text = JsonOutput.prettyPrint(JsonOutput.toJson(wrroc)) + } + + private static String getDatePublished() { + return LocalDateTime.now().format(DateTimeFormatter.ISO_DATE) + } + + /** + * Parse information about the agent running the workflow. + * + * @param opts + */ + private Map getAgentInfo(Map opts) { + final result = [:] + + if( !opts.agent ) + return null + + final agentOpts = opts.agent as Map + result["@id"] = agentOpts.getOrDefault("orcid", "agent-1") + result["@type"] = "Person" + if( agentOpts.name ) + result.name = agentOpts.name + + // Check for contact information + if( agentOpts.email || agentOpts.phone ) { + final contactPointId = getContactPointInfo(agentOpts) + if( contactPointId ) + result.contactPoint = ["@id": contactPointId] + } + + return result + } + + /** + * Parse information about the organization of the agent running the workflow. + * + * @param opts + */ + private Map getOrganizationInfo(Map opts) { + final result = [:] + + if( !opts.organization ) + return null + + final orgOpts = opts.organization as Map + result["@id"] = orgOpts.getOrDefault("ror", "organization-1") + result["@type"] = "Organization" + if( orgOpts.name ) + result.name = orgOpts.name + + // Check for contact information + if( orgOpts.email || orgOpts.phone ) { + final contactPointId = getContactPointInfo(orgOpts) + if( contactPointId ) + result.contactPoint = ["@id": contactPointId] + } + + return result + } + + /** + * Parse a contact point and add it to the list of contact points. + * + * @param opts + */ + private String getContactPointInfo(Map opts) { + // Prefer email for the contact point ID + String contactPointId = null + if( opts.email ) + contactPointId = "mailto:" + opts.email + else if( opts.phone ) + contactPointId = opts.phone + + if( !contactPointId ) + return null + + final contactPoint = [:] + contactPoint["@id"] = contactPointId + contactPoint["@type"] = "ContactPoint" + if( opts.contactType ) + contactPoint.contactType = opts.contactType + if( opts.email ) + contactPoint.email = opts.email + if( opts.phone ) + contactPoint.phone = opts.phone + if( opts.orcid ) + contactPoint.url = opts.orcid + if( opts.ror ) + contactPoint.url = opts.ror + + contactPoints.add(contactPoint) + return contactPointId + } + + /** + * Parse information about the RO-Crate publisher. + * + * @param opts + * @param agent + * @param organization + */ + private static String getPublisherId(Map opts, Map agent, Map organization) { + if( !opts.publisher ) + return null + + final publisherId = opts.publisher + + // Check if the publisher id references either the agent or the organization + final agentId = agent?["@id"] + final organizationId = organization?["@id"] + if( publisherId != agentId && publisherId != organizationId ) + return null + + return publisherId + } + + /** + * Noraalize a parameter value. + * + * @param value + */ + private Object normalizeParamValue(Object value) { + switch( value ) { + case Boolean: + case Number: + return value + case CharSequence: + return normalizePath(value.toString()) + case List: + case Map: + return JsonOutput.toJson(value) + case Duration: + return ((Duration) value).toMillis() + case MemoryUnit: + return ((MemoryUnit) value).toBytes() + default: + log.warn "Workflow Run RO-Crate encountered parameter value of type ${value.class.name} -- JSON serialization might be incorrect" + return value + } + } + + /** + * Get the parameter schema of a pipeline as a map. + * + * @param path + */ + private static Map getParameterSchema(Path path) { + final schema = new JsonSlurper().parseText(path.text) as Map + + Map defs = null + if( schema['$defs'] ) + defs = schema['$defs'] as Map + else if( schema['defs'] ) + defs = schema['defs'] as Map + else if( schema['definitions'] ) + defs = schema['definitions'] as Map + + if( !defs ) + return [:] + + final schemaProps = schema.properties as Map ?: [:] + final defsProps = defs.values().collect { defn -> + (defn as Map).properties ?: [:] + } as List + final allProps = [schemaProps] + defsProps + final entries = allProps.collectMany { props -> + (props as Map).entrySet() + } as Map.Entry[] + + return Map.ofEntries(entries) + } + + /** + * Determine the type of a parameter based on its + * schema and/or runtime value. + * + * @param name + * @param value + * @param schema + */ + private static String getParameterType(String name, Object value, Map schema) { + // infer from schema + if( schema ) { + final type = schema.type + final format = schema.format + + switch( type ) { + case "boolean": + return "Boolean" + case "integer": + case "number": + return "Number" + case "string": + return \ + format == "file-path" ? "File" : + format == "directory-path" ? "Dataset" : + "Text" + } + } + + // infer from runtime value + switch( value ) { + case Boolean: + return "Boolean" + case Number: + case Duration: + case MemoryUnit: + return "Number" + case CharSequence: + return "Text" + case List: + case Map: + return "Text" + default: + return null + } + } + + /** + * Get the canonical id of a module script. + * + * @param name + */ + private String getFormalParameterId(String name) { + return "#param/${name}" + } + + /** + * Get the canonical id of a module script. + * + * @param process + */ + private String getModuleId(ProcessDef process) { + return "#module/${process.baseName}" + } + + /** + * Get the canonical url of a module script. + * + * @param process + */ + private String getModuleUrl(ProcessDef process) { + final scriptPath = ScriptMeta.get(process.getOwner()).getScriptPath().normalize() + return normalizePath(scriptPath) + } + + /** + * Get the canonical id of a tool used by a module. + * + * @param moduleName + * @param toolName + */ + private static String getToolId(String moduleName, String toolName) { + return "#module/${moduleName}/${toolName}" + } + + /** + * Get the canonical id of a process in the workflow DAG. + * + * @param process + */ + private static String getProcessControlId(TaskProcessor process) { + return "#process-control/${process.name}" + } + + private static String getProcessStepId(TaskProcessor process) { + return "#process-step/${process.name}" + } + + /** + * Get the relative name of a staged input. + * + * @param source + * @param session + */ + private static String getStagedInputName(Path source, Session session) { + final stageDir = ProvHelper.getStageDir(session) + return stageDir.relativize(source).toString() + } + + /** + * Get the canonical id of a task. + * + * @param task + */ + private static String getTaskId(TaskRun task) { + return "#task/${task.hash}" + } + + /** + * Get the relative name of a task output. + * + * @param task + * @param target + */ + private static String getTaskOutputName(TaskRun task, Path target) { + final workDir = task.workDir.toUriString() + return target.toUriString().replace(workDir + '/', '') + } + + /** + * Get the canonical id of a task output. + * + * @param task + * @param name + */ + private static String getTaskOutputId(TaskRun task, String name) { + return "#task/${task.hash}/${name}" + } + + private static String getTaskOutputId(TaskRun task, Path target) { + return "#task/${task.hash}/${getTaskOutputName(task, target)}" + } + + /** + * Get the nf-core meta.yml of a Nextflow module as a map. + * + * @param process + */ + private static Map getModuleSchema(ProcessDef process) { + final metaFile = ScriptMeta.get(process.getOwner()).getModuleDir().resolve('meta.yml') + return Files.exists(metaFile) + ? new Yaml().load(metaFile.text) as Map + : null + } + + /** + * Get the RO-Crate "@type" of a path based on whether + * it is a file or directory. + * + * @param path + */ + private static String getType(Path path) { + return path.isDirectory() + ? "Dataset" + : "File" + } + + /** + * Get the encodingFormat of a file as MIME Type. + * + * @param path Path to file + * @return the MIME type of the file, or null if it's not a file. + */ + private static String getEncodingFormat(Path path) { + if( !(path && path.exists() && path.isFile()) ) + return null + + String mime = Files.probeContentType(path) + if( mime ) + return mime + + // It seems that YAML has a media type only since beginning of 2024 + // Set this by hand if this is run on older systems: + // https://httptoolkit.com/blog/yaml-media-type-rfc/ + if( ["yml", "yaml"].contains(path.getExtension()) ) + return "application/yaml" + + return null + } + + private static List asReferences(List values) { + return values.collect { value -> ["@id": value["@id"]] } + } + + private static List withoutNulls(List list) { + return list.findAll { v -> v != null } + } + + private static Map withoutNulls(Map map) { + return map.findAll { k, v -> v != null } + } + +} From f2d024097cb690e8093cb809fbd644fd54fc2104 Mon Sep 17 00:00:00 2001 From: Ben Sherman Date: Thu, 6 Feb 2025 08:51:18 -0600 Subject: [PATCH 8/8] Update README and docs Signed-off-by: Ben Sherman --- README.md | 11 ++--------- BCO.md => docs/BCO.md | 0 WRROC.md => docs/WRROC.md | 0 3 files changed, 2 insertions(+), 9 deletions(-) rename BCO.md => docs/BCO.md (100%) rename WRROC.md => docs/WRROC.md (100%) diff --git a/README.md b/README.md index 970566a..02405ae 100644 --- a/README.md +++ b/README.md @@ -4,10 +4,6 @@ Nextflow plugin to render provenance reports for pipeline runs. Now supporting [ ## Getting Started -The `nf-prov` plugin requires Nextflow version `23.04.0` or later. - -*New in version 1.3.0: requires Nextflow 24.10.0 or later.* - To enable and configure `nf-prov`, include the following snippet to your Nextflow config and update as needed. ```groovy @@ -40,11 +36,9 @@ Create the provenance report (default: `true` if plugin is loaded). `prov.formats` -*New in version 1.2.0* - Configuration scope for the desired output formats. The following formats are available: -- `bco`: Render a [BioCompute Object](https://biocomputeobject.org/). Supports the `file` and `overwrite` options. See [BCO.md](./BCO.md) for more information about the additional config options for BCO. +- `bco`: Render a [BioCompute Object](https://biocomputeobject.org/). Supports the `file` and `overwrite` options. See [BCO.md](docs/BCO.md) for more information about the additional config options for BCO. - `dag`: Render the task graph as a Mermaid diagram embedded in an HTML document. Supports the `file` and `overwrite` options. @@ -52,7 +46,7 @@ Configuration scope for the desired output formats. The following formats are av *New in version 1.4.0* -- `wrroc`: Render a [Workflow Run RO-Crate](https://www.researchobject.org/workflow-run-crate/). Includes all three profiles (Process, Workflow, and Provenance). See [WRROC.md](./WRROC.md) for more information about the additional config options for WRROC. +- `wrroc`: Render a [Workflow Run RO-Crate](https://www.researchobject.org/workflow-run-crate/). Includes all three profiles (Process, Workflow, and Provenance). See [WRROC.md](docs/WRROC.md) for more information about the additional config options for WRROC. Any number of formats can be specified, for example: @@ -130,4 +124,3 @@ Following these step to package, upload and publish the plugin: 4. Create a pull request against the [nextflow-io/plugins](https://github.com/nextflow-io/plugins/blob/main/plugins.json) project to make the plugin public accessible to Nextflow app. - diff --git a/BCO.md b/docs/BCO.md similarity index 100% rename from BCO.md rename to docs/BCO.md diff --git a/WRROC.md b/docs/WRROC.md similarity index 100% rename from WRROC.md rename to docs/WRROC.md