From 1678dc03e72f9a4ac472e22894250dbf5e22cb2a Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 29 May 2021 09:47:56 +0200 Subject: [PATCH 1/9] README: hopefully made intro slightly easier to digest --- README.rst | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/README.rst b/README.rst index e1a17d89..595236fd 100644 --- a/README.rst +++ b/README.rst @@ -12,15 +12,21 @@ Schema Salad ------------ -Salad is a schema language for describing JSON or YAML structured -linked data documents. Salad schema describes rules for -preprocessing, structural validation, and hyperlink checking for -documents described by a Salad schema. Salad supports rich data -modeling with inheritance, template specialization, object -identifiers, object references, documentation generation, code -generation, and transformation to RDF_. Salad provides a bridge -between document and record oriented data modeling and the Semantic -Web. +Schema Salad is a schema language for YAML (or JSON) that also lets +you map your YAML data structures into RDF structured linked data +documents via JSON-LD. In other words, a schema validates and +transforms YAML or JSON documents into structured linked data +documents - the missing link between a NoSQL JSON document and a +linked data document that can be reasoned about, e.g. for human and +machine processing. + +Salad schema describes rules for preprocessing, structural validation, +and hyperlink checking for documents described by a Salad +schema. Salad supports rich data modeling with inheritance, template +specialization, object identifiers, object references, documentation +generation, code generation, and transformation to RDF_. Salad +provides a bridge between document and record oriented data modeling +and the Semantic Web. The Schema Salad library is Python 3.6+ only. 
From 3c235764ab9cce53d0b14890f0c1de06b34f8269 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 29 May 2021 12:02:43 +0200 Subject: [PATCH 2/9] .guix-run: Starts a minimal guix container for running schema-salad-tool --- .guix-run | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 .guix-run diff --git a/.guix-run b/.guix-run new file mode 100644 index 00000000..e95ff3cb --- /dev/null +++ b/.guix-run @@ -0,0 +1,18 @@ +#! /bin/sh +# +# This script sets up a Guix container. Make sure guix is in the path +# - after installing Guix (on Debian). +# +# Note that pyshex etc are part of the guix-bioinformatics channel at +# +# https://git.genenetwork.org/guix-bioinformatics/guix-bioinformatics + +env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ guix environment -C guix --ad-hoc git python python-pyyaml python-pycurl python-magic nss-certs python-pyshex --network openssl python-schema-salad python-pytest which less vim python-toml python-iniconfig python-tox python-mypy python-pylint + +# Once in the shell you can do +# --- run tests (takes 1 minute, skips lint and mypy) +# tox +# --- install and run +# python3 setup.py install --user +# ~/.local/bin/schema-salad-tool --help +# From acbc700839f6adb152812e2ff30b26d8b8c21d6f Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 29 May 2021 15:15:01 +0200 Subject: [PATCH 3/9] main: allow for passing in multiple YAML/JSON documents and validate them. print-rdf returns the correct combined document. 
--- schema_salad/main.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/schema_salad/main.py b/schema_salad/main.py index adf364da..5004576f 100644 --- a/schema_salad/main.py +++ b/schema_salad/main.py @@ -189,7 +189,7 @@ def main(argsl: Optional[List[str]] = None) -> int: ) parser.add_argument("schema", type=str, nargs="?", default=None) - parser.add_argument("document", type=str, nargs="?", default=None) + parser.add_argument("document", nargs="*", default=None) parser.add_argument( "--version", "-v", action="store_true", help="Print version", default=None ) @@ -350,21 +350,25 @@ def main(argsl: Optional[List[str]] = None) -> int: print(f"Schema `{args.schema}` is valid") return 0 - # Load target document and resolve refs - try: - uri = args.document - document, doc_metadata = document_loader.resolve_ref( - uri, strict_foreign_properties=args.strict_foreign_properties - ) - except ValidationException as e: - msg = to_one_line_messages(e) if args.print_oneline else str(e) - _logger.error( - "Document `%s` failed validation:\n%s", - args.document, - msg, - exc_info=args.debug, - ) - return 1 + # Load target document and resolve refs. Note that this can now + # take multiple document files. 
doc_metadata only returns the + # metadata for the last document as they should be the same + document = [] + for uri in args.document: + try: + document1, doc_metadata = document_loader.resolve_ref( + uri, strict_foreign_properties=args.strict_foreign_properties + ) + document.append(document1) + except ValidationException as e: + msg = to_one_line_messages(e) if args.print_oneline else str(e) + _logger.error( + "Document `%s` failed validation:\n%s", + uri, + msg, + exc_info=args.debug, + ) + return 1 # Optionally print the document after ref resolution if args.print_pre: From 7221bc149f481668a4c75c969fef9a3c49f6331a Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 29 May 2021 15:17:20 +0200 Subject: [PATCH 4/9] Test: fix test that should return an error on multiple schemas --- schema_salad/tests/test_examples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema_salad/tests/test_examples.py b/schema_salad/tests/test_examples.py index 89ddab4b..a623ff35 100644 --- a/schema_salad/tests/test_examples.py +++ b/schema_salad/tests/test_examples.py @@ -40,7 +40,7 @@ def test_self_validate() -> None: path = get_data("metaschema/metaschema.yml") assert path assert 0 == schema_salad.main.main(argsl=[path]) - assert 0 == schema_salad.main.main(argsl=[path, path]) + assert 1 == schema_salad.main.main(argsl=[path, path]) # passing in 2 schemas should throw an error def test_avro_regression() -> None: From 8c48e9e2cadec447da1b975947fc4129533bc068 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sun, 30 May 2021 10:56:01 +0200 Subject: [PATCH 5/9] Now we support multiple documents rename args.document -> args.documents --- schema_salad/main.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/schema_salad/main.py b/schema_salad/main.py index 5004576f..9eb7173d 100644 --- a/schema_salad/main.py +++ b/schema_salad/main.py @@ -189,7 +189,7 @@ def main(argsl: Optional[List[str]] = None) -> int: ) 
parser.add_argument("schema", type=str, nargs="?", default=None) - parser.add_argument("document", nargs="*", default=None) + parser.add_argument("documents", nargs="*", default=None) parser.add_argument( "--version", "-v", action="store_true", help="Print version", default=None ) @@ -253,11 +253,11 @@ def main(argsl: Optional[List[str]] = None) -> int: return 0 # Optionally print the schema after ref resolution - if not args.document and args.print_pre: + if not args.documents and args.print_pre: print(json_dumps(schema_doc, indent=4)) return 0 - if not args.document and args.print_index: + if not args.documents and args.print_index: print(json_dumps(list(metaschema_loader.idx.keys()), indent=4)) return 0 @@ -333,7 +333,7 @@ def main(argsl: Optional[List[str]] = None) -> int: print(rdfs.serialize(format=args.rdf_serializer).decode("utf-8")) return 0 - if args.print_metadata and not args.document: + if args.print_metadata and not args.documents: print(json_dumps(schema_metadata, indent=4)) return 0 @@ -346,7 +346,7 @@ def main(argsl: Optional[List[str]] = None) -> int: return 0 # If no document specified, all done. - if not args.document: + if not args.documents: print(f"Schema `{args.schema}` is valid") return 0 @@ -354,7 +354,7 @@ def main(argsl: Optional[List[str]] = None) -> int: # take multiple document files. 
doc_metadata only returns the # metadata for the last document as they should be the same document = [] - for uri in args.document: + for uri in args.documents: try: document1, doc_metadata = document_loader.resolve_ref( uri, strict_foreign_properties=args.strict_foreign_properties @@ -390,13 +390,13 @@ def main(argsl: Optional[List[str]] = None) -> int: ) except ValidationException as e: msg2 = to_one_line_messages(e) if args.print_oneline else str(e) - _logger.error(f"While validating document `{args.document}`:\n{msg2}") + _logger.error(f"While validating document `{args.documents}`:\n{msg2}") return 1 # Optionally convert the document to RDF if args.print_rdf: if isinstance(document, (Mapping, MutableSequence)): - printrdf(args.document, document, schema_ctx, args.rdf_serializer) + printrdf(args.documents, document, schema_ctx, args.rdf_serializer) return 0 else: print("Document must be a dictionary or list.") @@ -406,7 +406,7 @@ def main(argsl: Optional[List[str]] = None) -> int: print(json_dumps(doc_metadata, indent=4)) return 0 - print(f"Document `{args.document}` is valid") + print(f"Document `{args.documents}` is valid") return 0 From 77a5e6f7e48c5ac622281760de3a8acb16cf3273 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sun, 30 May 2021 12:11:28 +0200 Subject: [PATCH 6/9] Fix breaking test and add tests for PubSeq --- schema_salad/main.py | 7 + .../tests/data/pubseq/MW084447.1.json | 46 +++ .../tests/data/pubseq/MW343767.1.json | 53 +++ .../tests/data/pubseq/pubseq-schema.yml | 326 ++++++++++++++++++ schema_salad/tests/test_examples.py | 10 + 5 files changed, 442 insertions(+) create mode 100644 schema_salad/tests/data/pubseq/MW084447.1.json create mode 100644 schema_salad/tests/data/pubseq/MW343767.1.json create mode 100644 schema_salad/tests/data/pubseq/pubseq-schema.yml diff --git a/schema_salad/main.py b/schema_salad/main.py index 9eb7173d..795dc56e 100644 --- a/schema_salad/main.py +++ b/schema_salad/main.py @@ -354,11 +354,18 @@ def main(argsl: 
Optional[List[str]] = None) -> int: # take multiple document files. doc_metadata only returns the # metadata for the last document as they should be the same document = [] + ids = {} # check for duplicate use of document id as it creates + # unpredictable output for uri in args.documents: try: document1, doc_metadata = document_loader.resolve_ref( uri, strict_foreign_properties=args.strict_foreign_properties ) + if isinstance(document1, Mapping) and "id" in document1: + doc_id = document1["id"] + if doc_id in ids: + raise ValidationException(f"Document id {doc_id} is duplicated in {uri}!") + ids[doc_id] = True document.append(document1) except ValidationException as e: msg = to_one_line_messages(e) if args.print_oneline else str(e) diff --git a/schema_salad/tests/data/pubseq/MW084447.1.json b/schema_salad/tests/data/pubseq/MW084447.1.json new file mode 100644 index 00000000..b2346010 --- /dev/null +++ b/schema_salad/tests/data/pubseq/MW084447.1.json @@ -0,0 +1,46 @@ +{ + "id": "placeholder", + "update_date": "2020-11-12", + "host": { + "host_species": "http://purl.obolibrary.org/obo/NCBITaxon_9606" + }, + "sample": { + "sample_id": "MW084447.1", + "database": "https://www.ncbi.nlm.nih.gov/genbank/", + "source_database_accession": [ + "http://identifiers.org/insdc/MW084447.1#sequence" + ], + "collection_date": "2020-04-14", + "original_collection_location": "USA", + "collection_location": "http://www.wikidata.org/entity/Q23337", + "country": "USA", + "place": "Salt Lake City" + }, + "virus": { + "virus_strain": "SARS-CoV-2/human/USA/UT-02140/2020", + "virus_species": "http://purl.obolibrary.org/obo/NCBITaxon_2697049" + }, + "technology": { + "alignment_protocol": "bwa v. 0.7.17-r1188", + "sample_sequencing_technology": [ + "http://purl.obolibrary.org/obo/OBI_0000759" + ], + "assembly_method": "http://purl.obolibrary.org/obo/GENEPIO_0002028" + }, + "submitter": { + "authors": [ + "Young,E.L.", + "Oakeson,K.", + "Sangster,A.", + "Hirschi,B.", + "Butz,H." 
+ ], + "submitter_name": [ + " Utah Public Health Laboratory" + ], + "submitter_address": "Utah Public Health Laboratory Infectious Disease submission group, 4431 S 2700 W, Salt Lake City, UT 84129, USA" + }, + "warnings": [ + "Missing specimen_source" + ] +} \ No newline at end of file diff --git a/schema_salad/tests/data/pubseq/MW343767.1.json b/schema_salad/tests/data/pubseq/MW343767.1.json new file mode 100644 index 00000000..4b766d01 --- /dev/null +++ b/schema_salad/tests/data/pubseq/MW343767.1.json @@ -0,0 +1,53 @@ +{ + "id": "placeholder1", + "update_date": "2020-12-08", + "host": { + "host_species": "http://purl.obolibrary.org/obo/NCBITaxon_9606" + }, + "sample": { + "sample_id": "MW343767.1", + "database": "https://www.ncbi.nlm.nih.gov/genbank/", + "source_database_accession": [ + "http://identifiers.org/insdc/MW343767.1#sequence" + ], + "collection_date": "2020-10-27", + "specimen_source": [ + "http://purl.obolibrary.org/obo/NCIT_C155831" + ], + "original_collection_location": "USA", + "collection_location": "http://www.wikidata.org/entity/Q23556", + "country": "USA", + "place": "Atlanta" + }, + "virus": { + "virus_strain": "SARS-CoV-2/human/USA/GA-CDC-7701/2020", + "virus_species": "http://purl.obolibrary.org/obo/NCBITaxon_2697049" + }, + "technology": { + "alignment_protocol": "freebayes v. 1.3 ", + "sample_sequencing_technology": [ + "http://purl.obolibrary.org/obo/OBI_0000759" + ], + "assembly_method": "http://purl.obolibrary.org/obo/GENEPIO_0002028" + }, + "submitter": { + "authors": [ + "Li,Y.", + "Tao,Y.", + "Zhang,J.", + "Queen,K.", + "Uehara,A.", + "Cook,P.", + "Paden,C.R.", + "Wang,H.", + "Tong,S." 
+ ], + "submitter_name": [ + " Respiratory Viruses Branch" + ], + "submitter_address": "Centers for Disease Control and Prevention, 1600 Clifton Rd, Atlanta, GA 30329, USA" + }, + "warnings": [ + + ] +} diff --git a/schema_salad/tests/data/pubseq/pubseq-schema.yml b/schema_salad/tests/data/pubseq/pubseq-schema.yml new file mode 100644 index 00000000..e1947c1a --- /dev/null +++ b/schema_salad/tests/data/pubseq/pubseq-schema.yml @@ -0,0 +1,326 @@ +$base: http://biohackathon.org/bh20-seq-schema +$namespaces: + cc: https://creativecommons.org/ns# + dc: http://purl.org/metadata/dublin_core_elements# + sch: https://schema.org/ + efo: http://www.ebi.ac.uk/efo/ + obo: http://purl.obolibrary.org/obo/ + sio: http://semanticscience.org/resource/ + edam: http://edamontology.org/ + # Vocabulary for clinical care, translational and basic research, + # and public information and administrative activities + evs: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl# + # Software Ontology + swo: http://www.ebi.ac.uk/swo/ + +$graph: + +- name: licenseSchema + type: record + fields: + license_type: + doc: License types as defined in https://wiki.creativecommons.org/images/d/d6/Ccrel-1.0.pdf + type: string + jsonldPredicate: + _id: https://creativecommons.org/ns#License + _type: "@id" + noLinkCheck: true + title: + doc: Attribution title related to data license + type: string? + jsonldPredicate: + _id: http://purl.org/metadata/dublin_core_elements#Title + attribution_name: + doc: Attribution NAME related to data license + type: string? + jsonldPredicate: + _id: https://creativecommons.org/ns#attributionName + attribution_url: + doc: Attribution URL related to data license + type: string? + jsonldPredicate: + _id: https://creativecommons.org/ns#attributionURL + _type: "@id" + noLinkCheck: true + attribution_source: + doc: Attribution source URL related to data license + type: string? 
+ jsonldPredicate: + _id: https://creativecommons.org/ns#attributionSource + _type: "@id" + noLinkCheck: true + +- name: hostSchema + type: record + fields: + host_species: + doc: Host species as defined in NCBITaxon, e.g. http://purl.obolibrary.org/obo/NCBITaxon_9606 for Homo sapiens + type: string + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000532 + _type: "@id" + noLinkCheck: true + host_id: + doc: Identifer for the host. If you submit multiple samples from the same host, use the same host_id for those samples + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000115 + host_sex: + doc: Sex of the host as defined in PATO, expect Male (http://purl.obolibrary.org/obo/PATO_0000384) or Female (http://purl.obolibrary.org/obo/PATO_0000383) or in Intersex (http://purl.obolibrary.org/obo/PATO_0001340) + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/PATO_0000047 + _type: "@id" + noLinkCheck: true + host_age: + doc: Age of the host as number (e.g. 50) + type: int? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/PATO_0000011 + host_age_unit: + doc: Unit of host age e.g. http://purl.obolibrary.org/obo/UO_0000036 + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C42574 + _type: "@id" + noLinkCheck: true + host_health_status: + doc: A condition or state at a particular time, must be one of the following (obo:NCIT_C115935 obo:NCIT_C3833 obo:NCIT_C25269 obo:GENEPIO_0002020 obo:GENEPIO_0001849 obo:NCIT_C28554 obo:NCIT_C37987) + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C25688 + _type: "@id" + noLinkCheck: true + host_treatment: + doc: Process in which the act is intended to modify or alter host status + type: string? + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000727 + host_vaccination: + doc: List of vaccines given to the host + type: string[]? 
+ jsonldPredicate: + _id: http://purl.obolibrary.org/obo/VO_0000002 + ethnicity: + doc: Ethinicity of the host e.g. http://purl.obolibrary.org/obo/HANCESTRO_0010 + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001014 + _type: "@id" + noLinkCheck: true + additional_host_information: + doc: Field for additional host information + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001167 + +- name: sampleSchema + type: record + fields: + sample_id: + doc: Unique sample identifier as defined by the submitter + type: string + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000115 + collection_date: + doc: Date when the sample was taken + type: string + jsonldPredicate: + _id: https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&ns=ncit&code=C164024 + collection_location: + doc: Geographical location where the sample was collected as wikidata reference, e.g. http://www.wikidata.org/entity/Q148 (China) + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/GAZ_00000448 + _type: "@id" + noLinkCheck: true + original_collection_location: + doc: Original geographical location where the sample was collected as a string + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/GAZ_00000448 + country: + doc: Original country location as text + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0001627 + place: + doc: Original place as text + type: string? + jsonldPredicate: + # Geographical region + _id: http://purl.obolibrary.org/obo/GEO_000000372 + collector_name: + doc: Name of the person that took the sample + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0001895 + collecting_institution: + doc: Institute that was responsible for sampling + type: string? 
+ jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C41206 + specimen_source: + doc: Method how the specimen was derived as NCIT IRI, e.g. http://purl.obolibrary.org/obo/NCIT_C155831 (=nasopharyngeal swab) + type: string[]? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0001479 + _type: "@id" + noLinkCheck: true + sample_storage_conditions: + doc: Information about storage of a specified type, e.g. frozen specimen, paraffin, fresh .... + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0001472 + additional_collection_information: + doc: Add additional comment about the circumstances that a sample was taken + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001167 + database: + doc: URI to database, e.g. https://www.ncbi.nlm.nih.gov/genbank/ + type: string? + jsonldPredicate: + _id: https://schema.org/maintainer + source_database_accession: + doc: If data is deposit at a public resource (e.g. Genbank, ENA) enter the Accession Id here. Please use a resolveable URL (e.g. http://identifiers.org/insdc/LC522350.1#sequence) + type: string[]? + jsonldPredicate: + _id: http://edamontology.org/data_2091 + _type: "@id" + noLinkCheck: true + +- name: virusSchema + type: record + fields: + virus_species: + doc: The name of virus species from the NCBI taxonomy database, e.g. http://purl.obolibrary.org/obo/NCBITaxon_2697049 for Severe acute respiratory syndrome coronavirus 2 + type: string + jsonldPredicate: + _id: http://edamontology.org/data_1875 + _type: "@id" + noLinkCheck: true + virus_strain: + doc: Name of the virus strain + type: string? 
+ jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_010055 + +- name: technologySchema + type: record + fields: + sample_sequencing_technology: + doc: Technology device that was used to sequence the sample (e.g Sanger, Nanopore MiniION) + type: string[] + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C153598 + _type: "@id" + noLinkCheck: true + assembly_method: + doc: Assembly method refers to how the reads were assembled into contigs for which either a de novo (http://purl.obolibrary.org/obo/GENEPIO_0001628) or mapping/reference based (http://purl.obolibrary.org/obo/GENEPIO_0002028) strategy is used. + type: string + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/GENEPIO_0000090 + _type: "@id" + alignment_protocol: + doc: Protocol which provides detailed instructions to obtain the assembly + type: string? + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0004917 + sequencing_coverage: + doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. [100]) - if multiple technologies were used multiple float values can be submitted e.g. [100, 20] + type: double[]? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/FLU_0000848 + additional_technology_information: + doc: Field for additional technology information + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001167 + +- name: submitterSchema + type: record + fields: + authors: + doc: Name(s) of the author(s) of the sequence data in the scientific publication + type: string[] + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C42781 + submitter_name: + doc: Name of the submitter(s) of the sequence data + type: string[]? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000116 + submitter_address: + doc: Address of the submitter of the sequence data + type: string? 
+ jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000172 + originating_lab: + doc: Laboratory name or identifier where the sample to sequence was produced + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C37984 + lab_address: + doc: Address of the laboratory where the sample was produced + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C25407 + provider: + doc: Name or identifier of the provider of the sample + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C37900 + submitter_sample_id: + doc: Identifier given to the sample by the submitter + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C164332 + publication: + doc: Reference to the scientifc publication of the sequence (e.g. DOI, pubmed ID, ...) + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C19026 + submitter_orcid: + # An identifier curated by ORCID, Inc. to denote some academic author + doc: ORCID of the submitter as a full URI, e.g. https://orcid.org/0000-0002-1825-0097 + type: string[]? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/APOLLO_SV_00000496 + _type: "@id" + noLinkCheck: true + additional_submitter_information: + doc: Field for additional submitter information + type: string? 
+ jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001167 + + +- name: MainSchema + type: record + documentRoot: true + fields: + host: hostSchema + sample: sampleSchema + virus: virusSchema + technology: technologySchema + submitter: submitterSchema + license: ["null", licenseSchema] + id: + doc: The subject (eg the fasta/fastq file) that the metadata describes + type: string + jsonldPredicate: + _id: "@id" + _type: "@id" + noLinkCheck: true + update_date: + doc: Date when record was last updated + type: string + jsonldPredicate: + _id: https://schema.org/dateModified + warnings: + doc: List processing warnings An error correction is a data transformation objective where the aim is to remove (correct for) erroneous contributions arising from the input data, or the transformation itself. + type: string[]? + jsonldPredicate: + _id: http://www.ebi.ac.uk/swo/objective/SWO_7000012 + _type: string? diff --git a/schema_salad/tests/test_examples.py b/schema_salad/tests/test_examples.py index a623ff35..17e72005 100644 --- a/schema_salad/tests/test_examples.py +++ b/schema_salad/tests/test_examples.py @@ -437,3 +437,13 @@ def test_nullable_links() -> None: ra, _ = ldr.resolve_all(cmap({"link": None}), "http://example.com", checklinks=True) assert {"link": None} == ra + +def test_pubseq_multidoc_example() -> None: + schema = get_data("tests/data/pubseq/pubseq-schema.yml") + doc1 = get_data("tests/data/pubseq/MW084447.1.json") + doc2 = get_data("tests/data/pubseq/MW343767.1.json") + assert schema + assert doc1 + assert doc2 + assert 0 == schema_salad.main.main(argsl=[schema,doc1]) + assert 0 == schema_salad.main.main(argsl=[schema,doc1,doc2]) From 7e3cc032ef0e51fb13878b87f76d68921237d35a Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sun, 30 May 2021 12:14:55 +0200 Subject: [PATCH 7/9] .guix-run: add example --- .guix-run | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.guix-run b/.guix-run index e95ff3cb..93907e24 100644 --- a/.guix-run 
+++ b/.guix-run @@ -15,4 +15,6 @@ env GUIX_PACKAGE_PATH=~/guix-bioinformatics/ guix environment -C guix --ad-hoc g # --- install and run # python3 setup.py install --user # ~/.local/bin/schema-salad-tool --help -# +# --- Example +# ~/.local/bin/schema-salad-tool --debug --print-rdf schema_salad/tests/data/pubseq/pubseq-schema.yml schema_salad/tests/data/pubseq/MW084447.1.json schema_salad/tests/data/pubseq/MW343767.1.json + From a5818c378d9f58d574d4041e719539037354d00c Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sun, 30 May 2021 12:32:33 +0200 Subject: [PATCH 8/9] Add globbing support and update README --- README.rst | 8 ++++++++ schema_salad/main.py | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/README.rst b/README.rst index 595236fd..13965611 100644 --- a/README.rst +++ b/README.rst @@ -71,6 +71,14 @@ Validate a document using a schema:: $ schema-salad-tool myschema.yml mydocument.yml +Validate a JSON document using a schema:: + + $ schema-salad-tool myschema.yml mydocument.json + +Multiple documents and (lazy) expansion can be used:: + + $ schema-salad-tool myschema.yml 'my*.yml' + Generate HTML documentation:: $ schema-salad-tool myschema.yml > myschema.html diff --git a/schema_salad/main.py b/schema_salad/main.py index 795dc56e..e1e356d8 100644 --- a/schema_salad/main.py +++ b/schema_salad/main.py @@ -1,6 +1,7 @@ """Command line interface to schema-salad.""" import argparse +import glob import logging import os import sys @@ -252,6 +253,9 @@ def main(argsl: Optional[List[str]] = None) -> int: makedoc(args) return 0 + # Expand glob patterns; keep arguments that match nothing (URLs, missing files) so they are still loaded and reported + args.documents = [item for pattern in args.documents for item in (glob.glob(pattern) or [pattern])] + # Optionally print the schema after ref resolution if not args.documents and args.print_pre: print(json_dumps(schema_doc, indent=4)) return 0 From 1869e96b9339bcc483657f9800a630c2f3d60503 Mon Sep 17 00:00:00 2001 From: "Michael R. 
Crusoe" <1330696+mr-c@users.noreply.github.com> Date: Sat, 14 Aug 2021 16:05:37 +0200 Subject: [PATCH 9/9] fix merge --- schema_salad/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/schema_salad/main.py b/schema_salad/main.py index 410d898b..7f282630 100644 --- a/schema_salad/main.py +++ b/schema_salad/main.py @@ -268,11 +268,11 @@ def main(argsl: Optional[List[str]] = None) -> int: args.documents = [item for pattern in args.documents for item in (glob.glob(pattern) or [pattern])] # Optionally print the schema after ref resolution - if not args.document and args.print_pre: + if not args.documents and args.print_pre: json_dump(schema_doc, fp=sys.stdout, indent=4) return 0 - if not args.document and args.print_index: + if not args.documents and args.print_index: json_dump(list(metaschema_loader.idx.keys()), fp=sys.stdout, indent=4) return 0 @@ -348,7 +348,7 @@ def main(argsl: Optional[List[str]] = None) -> int: rdfs.serialize(destination=stdout(), format=args.rdf_serializer) return 0 - if args.print_metadata and not args.document: + if args.print_metadata and not args.documents: json_dump(schema_metadata, fp=sys.stdout, indent=4) return 0