Work on checking outputs
dfornika committed Feb 9, 2024
1 parent 07abcb0 commit e444256
Showing 7 changed files with 355 additions and 2 deletions.
264 changes: 264 additions & 0 deletions .github/data/pipeline-provenance.json
@@ -0,0 +1,264 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "array",
"items": {
"anyOf": [
{
"$ref": "#/definitions/PipelineProvenanceRecord",
"description": "A record of the pipeline that was run."
},
{
"$ref": "#/definitions/ProcessProvenanceRecord",
"description": "A record of a process that was run."
},
{
"$ref": "#/definitions/InputFileProvenanceRecord",
"description": "A record of an input file that was used."
}
]
},
"definitions": {
"PipelineProvenanceRecord": {
"title": "PipelineProvenanceRecord",
"type": "object",
"properties": {
"pipeline_name": {
"type": "string",
"description": "Name of the pipeline.",
"examples": [
"BCCDC-PHL/routine-assembly",
"BCCDC-PHL/plasmid-screen"
]
},
"pipeline_version": {
"type": "string",
"description": "Version of the pipeline.",
"examples": [
"v0.1.0",
"0.1.0",
"1",
"2.0-beta"
]
},
"timestamp_analysis_start": {
"type": "string",
"description": "Timestamp for the start of a pipeline run. ISO-8601-formatted date, followed by 'T' and a 24-hour timestamp, assumed to be in the local timezone if not specified. Timezone may be specified with an offset from UTC. Timestamp precision is not guaranteed.",
"format": "date-time",
"examples": [
"2021-12-06T16:12:31.252055",
"2022-01-12T01:22:51-08:00",
"2022-02-04T16:55:03.182-08:00"
]
}
},
"required": [
"pipeline_name",
"pipeline_version"
]
},
"ProcessProvenanceRecord": {
"title": "ProcessProvenanceRecord",
"type": "object",
"properties": {
"process_name": {
"type": "string",
"description": "Name of the process.",
"examples": [
"fastp",
"bwa_mem",
"samtools_mpileup",
"align_reads_to_ref",
"trim_reads",
"CALL_VARIANTS"
]
},
"tools": {
"type": "array",
"description": "The tools used to run the process.",
"items": {
"$ref": "#/definitions/Tool"
},
"examples": [
[
{
"tool_name": "fastp",
"tool_version": "0.20.0",
"subcommand": "trim",
"parameters": [
{
"parameter": "cut_tail",
"value": null
}
]
}
],
[
{
"tool_name": "bwa",
"tool_version": "0.7.17-r1188",
"subcommand": "mem",
"parameters": [
{
"parameter": "exclude_flags",
"value": 1540
},
{
"parameter": "min_base_quality",
"value": 20
}
]
}
]
]
}
},
"required": [
"process_name"
]
},
"Tool": {
"title": "Tool",
"type": "object",
"properties": {
"tool_name": {
"type": "string",
"description": "Name of the tool.",
"examples": [
"fastp",
"bwa",
"samtools",
"bcftools",
"medaka"
]
},
"tool_version": {
"type": "string",
"description": "A number or string associated with a specific snapshot of the development state of a tool. Should (but may not always) map to a tagged release on GitHub or another version control system.",
"examples": [
"0.1.0",
"v0.1.1",
"1.1",
"2",
"0.7.17-r1188"
]
},
"subcommand": {
"type": "string",
"description": "Subcommand of the tool.",
"examples": [
"mem",
"mpileup",
"filter"
]
},
"parameters": {
"type": "array",
"items": {
"$ref": "#/definitions/ToolParameter"
},
"description": "The specific invocation of a process may depend on values that can be varied. These values are the parameters to that process. Each parameter has a name and a value.",
"examples": [
[
{
"parameter": "cut_tail",
"value": null
}
],
[
{
"parameter": "exclude_flags",
"value": 1540
}
],
[
{
"parameter": "min_base_quality",
"value": 20
}
],
[
{
"parameter": "min_coverage",
"value": 10
}
]
]
}
}
},
"ToolParameter": {
"title": "ToolParameter",
"type": "object",
"properties": {
"parameter": {
"type": "string",
"description": "Name of the parameter.",
"examples": [
"cut_tail",
"exclude_flags",
"min_base_quality",
"min_coverage"
]
},
"value": {
"type": "string",
"description": "Value of the parameter, or null if the parameter is a flag without a value.",
"examples": [
"null",
"1540",
"20",
"10"
]
}
},
"required": [
"parameter"
]
},
"InputFileProvenanceRecord": {
"title": "InputFileProvenanceRecord",
"type": "object",
"properties": {
"input_filename": {
"type": "string",
"description": "Filename of an input file.",
"examples": [
"sample-01_R1.fastq.gz",
"sample-01_R2.fastq.gz",
"ref.fa",
"sample-01.bam",
"sample-01.bam.bai"
]
},
"input_path": {
"type": "string",
"description": "Absolute path to an input file, at the time that the pipeline was invoked. May be invalid if the input file is moved or renamed after pipeline invocation.",
"examples":[
"/data/ref_data/ecoli.fa",
"/data/sequence/miseq/210101_M00123_0123_000000000-ABC123/Data/Intensities/BaseCalls/sample-01_S1_L001_R1_001.fastq.gz",
"/data/sequence/miseq/210101_M00123_0123_000000000-ABC123/Data/Intensities/BaseCalls/sample-01_S1_L001_R2_001.fastq.gz"
]
},
"sha256": {
"type": "string",
"description": "The checksum of a file, calculated with the SHA256 algorithm. Files with identical contents have identical checksums. If a single byte differs, the checksums will be completely different.",
"examples":[
"b0534592d61321243897e842a9ea655d396d4496cbf6d926b6c6fea8e06aa98d",
"cc66309103da91e337143eb649196d84ed3ebe2ff08a45b197cd4151d137a167"
]
},
"file_size": {
"type": "integer",
"description": "Size of the file in bytes.",
"examples": [
123456789,
1234567890
]
}
},
"required": [
"input_filename"
]
}
}
}
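
For orientation, a provenance document that conforms to this schema is simply an array mixing the three record types defined above. A minimal sketch of validating such a document with the jsonschema package is shown below; the record values are illustrative, taken from the schema's own examples rather than from a real pipeline run.

#!/usr/bin/env python3
# Sketch: validate an example provenance document against the schema above.
# Record contents are illustrative, drawn from the schema's "examples" fields.
import json

from jsonschema import validate

with open(".github/data/pipeline-provenance.json") as f:
    schema = json.load(f)

provenance = [
    {
        "pipeline_name": "BCCDC-PHL/routine-assembly",
        "pipeline_version": "v0.1.0",
        "timestamp_analysis_start": "2021-12-06T16:12:31.252055",
    },
    {
        "input_filename": "sample-01_R1.fastq.gz",
        "sha256": "b0534592d61321243897e842a9ea655d396d4496cbf6d926b6c6fea8e06aa98d",
    },
    {
        "process_name": "fastp",
        "tools": [
            {
                "tool_name": "fastp",
                "tool_version": "0.20.0",
                "parameters": [
                    {"parameter": "cut_tail"},
                ],
            },
        ],
    },
]

# Raises jsonschema.exceptions.ValidationError if the document does not conform.
validate(instance=provenance, schema=schema)
print("provenance document conforms to the schema")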
9 changes: 9 additions & 0 deletions .github/environments/check-outputs.yml
@@ -0,0 +1,9 @@
name: check-outputs
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- python=3
- jsonschema=4.20.0
- pyyaml=6.0.1
59 changes: 58 additions & 1 deletion .github/scripts/check_outputs.py
@@ -1,12 +1,69 @@
#!/usr/bin/env python3

import argparse
import csv
import glob
import json
import urllib.request

from jsonschema import validate
import yaml


def check_provenance_format_valid(provenance_files, schema):
    """
    Check that each provenance file is valid according to the schema.

    :param provenance_files: Paths to the provenance (YAML) files to check.
    :type provenance_files: list[str]
    :param schema: JSON schema to validate against.
    :type schema: dict
    :return: True if every provenance file validates against the schema, False otherwise.
    :rtype: bool
    """
    for provenance_file in provenance_files:
        with open(provenance_file) as f:
            try:
                provenance = yaml.load(f, Loader=yaml.BaseLoader)
                validate(provenance, schema)
            except Exception:
                # Any parse or validation error fails the whole check.
                return False

    return True


def main(args):
exit(-1)
provenance_schema_url = "https://raw.githubusercontent.com/BCCDC-PHL/pipeline-provenance-schema/main/schema/pipeline-provenance.json"
provenance_schema_path = ".github/data/pipeline-provenance.json"
urllib.request.urlretrieve(provenance_schema_url, provenance_schema_path)

provenance_schema = None
with open(provenance_schema_path) as f:
provenance_schema = json.load(f)

    provenance_files_glob = f"{args.pipeline_outdir}/**/*_provenance.yml"
    provenance_files = glob.glob(provenance_files_glob, recursive=True)

tests = [
{
"test_name": "provenance_format_valid",
"test_result": check_provenance_format_valid(provenance_files, provenance_schema),
}
]

output_fields = [
"test_name",
"test_result"
]

output_path = args.output
with open(output_path, 'w') as f:
writer = csv.DictWriter(f, fieldnames=output_fields)
writer.writeheader()
for test in tests:
if test["test_result"]:
test["test_result"] = "PASS"
else:
test["test_result"] = "FAIL"
writer.writerow(test)


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Check outputs')
parser.add_argument('--pipeline-outdir', type=str, help='Path to the pipeline output directory')
parser.add_argument('-o', '--output', type=str, help='Path to the output file')
args = parser.parse_args()
main(args)
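
The tests list and DictWriter pattern above leaves room for additional checks later: each check only needs to return a boolean and contribute one row to the results CSV. As a sketch (not part of this commit), a hypothetical extra check might look like the following; the function name and glob patterns are assumptions for illustration.

import glob
import os


def check_expected_outputs_exist(pipeline_outdir, expected_glob_patterns):
    """
    Hypothetical extra check: confirm that at least one file matches each
    expected glob pattern under the pipeline output directory.
    """
    for pattern in expected_glob_patterns:
        matches = glob.glob(os.path.join(pipeline_outdir, pattern), recursive=True)
        if len(matches) == 0:
            return False

    return True


# It would then be registered in main() alongside the existing check, e.g.:
# tests.append({
#     "test_name": "expected_outputs_exist",
#     "test_result": check_expected_outputs_exist(args.pipeline_outdir, ["**/*.fa"]),
# })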
14 changes: 14 additions & 0 deletions .github/scripts/check_outputs.sh
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

set -eo pipefail

source ${HOME}/.bashrc

eval "$(conda shell.bash hook)"

conda activate check-outputs


# Make sure the output directory for the results CSV exists
mkdir -p artifacts

.github/scripts/check_outputs.py --pipeline-outdir .github/data/test_output -o artifacts/check_outputs_results.csv

# Print the results, and fail the job if any check reported FAIL
cat artifacts/check_outputs_results.csv
if grep -q 'FAIL' artifacts/check_outputs_results.csv; then
    exit 1
fi
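
The grep at the end acts as the pass/fail gate for the CI job. If richer reporting is ever needed, an equivalent gate in Python (a sketch, not part of this commit) could read the results CSV directly:

#!/usr/bin/env python3
# Sketch: the same pass/fail gate as the grep above, expressed in Python.
# Reads the CSV written by check_outputs.py and exits non-zero on any FAIL.
import csv
import sys

with open("artifacts/check_outputs_results.csv") as f:
    for row in csv.DictReader(f):
        if row["test_result"] == "FAIL":
            print(f"check failed: {row['test_name']}", file=sys.stderr)
            sys.exit(1)

print("all output checks passed")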
3 changes: 3 additions & 0 deletions .github/scripts/create_output_checking_environment.sh
@@ -0,0 +1,3 @@
#!/bin/bash

conda env create -f .github/environments/check-outputs.yml
4 changes: 3 additions & 1 deletion .github/workflows/pull_request.yml
@@ -34,8 +34,10 @@ jobs:
run: bash .github/scripts/simulate_reads.sh
- name: Run Pipeline
run: bash .github/scripts/run_pipeline.sh
- name: Create Output Checking Environment
run: bash .github/scripts/create_output_checking_environment.sh
- name: Check Outputs
run: .github/scripts/check_outputs.py --pipeline-outdir .github/data/test_output
run: .github/scripts/check_outputs.sh
- name: Prepare Artifacts
if: always()
run: bash .github/scripts/prepare_artifacts.sh
4 changes: 4 additions & 0 deletions .gitignore
@@ -1,5 +1,9 @@
*~
.nextflow*
.github/data/assemblies
.github/data/fastq
.github/data/test_output
artifacts
work
test_input
test_output
