diff --git a/deps.edn b/deps.edn index 414bfb7..56c35fb 100644 --- a/deps.edn +++ b/deps.edn @@ -5,6 +5,7 @@ {org.clojure/clojure {:mvn/version "1.12.0"} org.clojure/core.async {:mvn/version "1.6.681"} org.clojure/core.match {:mvn/version "1.1.0"} + org.clojure/data.csv {:mvn/version "1.1.0"} org.clojure/data.xml {:mvn/version "0.2.0-alpha9"} org.clojure/data.zip {:mvn/version "1.1.0"} org.clojure/tools.logging {:mvn/version "1.3.0"} @@ -37,7 +38,7 @@ :main-opts ["-m" "com.eldrix.hermes.cmd.core"]} :dev - {:extra-paths ["cmd" "test"] + {:extra-paths ["cmd" "test/src" "test/resources"] :extra-deps {org.clojure/tools.cli {:mvn/version "1.1.230"} io.pedestal/pedestal.service {:mvn/version "0.7.1"} io.pedestal/pedestal.error {:mvn/version "0.7.1"} @@ -54,7 +55,7 @@ org.apache.lucene/lucene-backward-codecs {:mvn/version "10.0.0"}}} :test - {:extra-paths ["cmd" "test" "test/resources"] + {:extra-paths ["cmd" "test/src" "test/resources"] :extra-deps {org.clojure/test.check {:mvn/version "1.1.1"} com.wsscode/pathom3 {:git/url "https://github.com/wilkerlucio/pathom3.git" :git/sha "2d9d1cf8ccfeee83566c31e776a5ef105b2a1626"} diff --git a/src/com/eldrix/hermes/importer.clj b/src/com/eldrix/hermes/importer.clj index 22a407d..0d97701 100644 --- a/src/com/eldrix/hermes/importer.clj +++ b/src/com/eldrix/hermes/importer.clj @@ -9,6 +9,7 @@ (ns com.eldrix.hermes.importer "Provides import functionality for processing directories of files" (:require [clojure.core.async :as a] + [clojure.data.csv :as csv] [clojure.data.json :as json] [clojure.java.io :as io] [clojure.spec.alpha :as s] @@ -73,7 +74,7 @@ (merge (json/read-str (slurp f) :key-fn keyword :value-fn read-metadata-value)) ;; read in metadaa (update :modules update-keys (fn [x] (-> x name parse-long)))) ;; return all module identifiers as longs (catch Throwable e (log/warn e "Invalid metadata in distribution file" (:name default)) - (assoc default :error "Invalid metadata in distribution file"))))) + (assoc default :error 
"Invalid metadata in distribution file"))))) (defn metadata-files "Returns a list of release package information files from the directory. @@ -90,20 +91,35 @@ (doall (->> (metadata-files dir) (map read-metadata)))) -(defn- process-file +(defprotocol SnomedFile + (parse-filename [this] "Returns structured data about a SNOMED file")) + +(extend-protocol SnomedFile + String + (parse-filename [s] (snomed/parse-snomed-filename s)) + File + (parse-filename [f] (snomed/parse-snomed-filename (.getName f))) + java.net.URL + (parse-filename [url] (snomed/parse-snomed-filename (.getPath url))) + nil + (parse-filename [_] nil)) + +(defn process-file "Process the specified file, streaming batched results to the channel - specified, blocking if channel not being drained. + specified, blocking if channel not being drained. + Parameters: + - f : anything coercible using clojure.java.io/reader Each batch is a map with keys - :type : a type of SNOMED component - :parser : a parser that can take each row and give you data - :headings : a sequence of headings from the original file - :data : a sequence of vectors representing each column." - [filename out-c & {:keys [batch-size] :or {batch-size 1000}}] - (with-open [reader (io/reader filename)] - (let [{:keys [identifier parser filename component]} (snomed/parse-snomed-filename filename)] - (when parser - (let [csv-data (map #(str/split % #"\t") (line-seq reader)) + [f out-c & {:keys [batch-size] :or {batch-size 1000}}] + (let [{:keys [identifier parser filename component]} (parse-filename f)] + (when parser + (with-open [reader (io/reader f)] + (let [csv-data (csv/read-csv reader :separator \tab) headings (first csv-data) data (rest csv-data) batches (->> data @@ -141,12 +157,12 @@ (a/>!! processed-c e))) (a/close! 
raw-c)) (a/pipeline - nthreads - processed-c - (map snomed/parse-batch) - raw-c - true - (fn ex-handler [err] (log/debug "Error during import pipeline: " (ex-data err)) err)) + nthreads + processed-c + (map snomed/parse-batch) + raw-c + true + (fn ex-handler [err] (log/debug "Error during import pipeline: " (ex-data err)) err)) processed-c)) (defn load-snomed diff --git a/test/resources/example-snapshot/README.md b/test/resources/example-snapshot/README.md new file mode 100644 index 0000000..4c516ec --- /dev/null +++ b/test/resources/example-snapshot/README.md @@ -0,0 +1,19 @@ +This is an example distribution with fragments of different release files for testing + +Core terminology files +====================== + +These are 'correct' versions of Concept, Description and Relationship files +- ./Terminology/sct2_Concept_Snapshot_INT_20230131.txt +- ./Terminology/sct2_Description_Snapshot-en_INT_20230131.txt +- ./Terminology/sct2_RelationshipConcreteValues_Snapshot_INT_20230131.txt +- ./Terminology/sct2_Relationship_Snapshot_INT_20230131.txt + +Reference set files +=================== + +- ./Refset/Map/der2_iisssccRefset_ExtendedMapSnapshot_INT_20230131.txt + - This is a 'correct' International release version of an extended map + +- ./Refset/Map/der2_ssRefset_SimpleMapWithDescriptionSnapshot_12345_20241021.txt + - This is a 'custom' simple map reference set with null values for one of the columns diff --git a/test/resources/example-snapshot/Refset/Map/der2_iisssccRefset_ExtendedMapSnapshot_INT_20230131.txt b/test/resources/example-snapshot/Refset/Map/der2_iisssccRefset_ExtendedMapSnapshot_INT_20230131.txt new file mode 100644 index 0000000..2e9b086 --- /dev/null +++ b/test/resources/example-snapshot/Refset/Map/der2_iisssccRefset_ExtendedMapSnapshot_INT_20230131.txt @@ -0,0 +1,5 @@ +id effectiveTime active moduleId refsetId referencedComponentId mapGroup mapPriority mapRule mapAdvice mapTarget correlationId mapCategoryId +00005b30-d2ad-5891-ae82-060d2e20a9fc 20150731 1 
449080006 447562003 211339002 1 1 TRUE ALWAYS S80.8 | POSSIBLE REQUIREMENT FOR AN EXTERNAL CAUSE CODE S80.8 447561005 447637006 +00006f25-3157-5132-b658-25708c9f1290 20150731 1 449080006 447562003 37535007 1 1 TRUE ALWAYS Q99.9 Q99.9 447561005 447637006 +00009ee5-904c-5e9f-a67a-ffe16d847782 20150731 1 449080006 447562003 299741008 1 1 TRUE MAP SOURCE CONCEPT CANNOT BE CLASSIFIED WITH AVAILABLE DATA 447561005 447638001 +0000b488-4995-58e1-a542-a8fc22593548 20220331 1 449080006 447562003 1208339007 1 1 TRUE ALWAYS G11.1 | POSSIBLE REQUIREMENT FOR ADDITIONAL CODE TO FULLY DESCRIBE DISEASE OR CONDITION G11.1 447561005 447637006 diff --git a/test/resources/example-snapshot/Refset/Map/der2_ssRefset_SimpleMapWithDescriptionSnapshot_12345_20241021.txt b/test/resources/example-snapshot/Refset/Map/der2_ssRefset_SimpleMapWithDescriptionSnapshot_12345_20241021.txt new file mode 100644 index 0000000..52ec034 --- /dev/null +++ b/test/resources/example-snapshot/Refset/Map/der2_ssRefset_SimpleMapWithDescriptionSnapshot_12345_20241021.txt @@ -0,0 +1,3 @@ +id effectiveTime active moduleId refsetId referencedComponentId mapTarget mapTargetDescription +000d91ce-aae4-4f9e-ad06-51be576becd6 20241021 1 195941000112101 22671000001102 1434181000001106 ABC01256Q +00280f1a-4c71-4683-8cb4-3e19e97ff9d9 20241021 1 195941000112101 22671000001102 1291061000001109 ABC08795Q diff --git a/test/resources/example-snapshot/Terminology/sct2_Concept_Snapshot_INT_20230131.txt b/test/resources/example-snapshot/Terminology/sct2_Concept_Snapshot_INT_20230131.txt new file mode 100644 index 0000000..4f8195d --- /dev/null +++ b/test/resources/example-snapshot/Terminology/sct2_Concept_Snapshot_INT_20230131.txt @@ -0,0 +1,5 @@ +id effectiveTime active moduleId definitionStatusId +100005 20020131 0 900000000000207008 900000000000074008 +101009 20020131 1 900000000000207008 900000000000074008 +102002 20020131 1 900000000000207008 900000000000074008 +103007 20020131 1 900000000000207008 900000000000074008 diff --git 
a/test/resources/example-snapshot/Terminology/sct2_Description_Snapshot-en_INT_20230131.txt b/test/resources/example-snapshot/Terminology/sct2_Description_Snapshot-en_INT_20230131.txt new file mode 100644 index 0000000..646af88 --- /dev/null +++ b/test/resources/example-snapshot/Terminology/sct2_Description_Snapshot-en_INT_20230131.txt @@ -0,0 +1,5 @@ +id effectiveTime active moduleId conceptId languageCode typeId term caseSignificanceId +101013 20170731 1 900000000000207008 126813005 en 900000000000013009 Neoplasm of anterior aspect of epiglottis 900000000000448009 +102018 20170731 1 900000000000207008 126814004 en 900000000000013009 Neoplasm of junctional region of epiglottis 900000000000448009 +103011 20170731 1 900000000000207008 126815003 en 900000000000013009 Neoplasm of lateral wall of oropharynx 900000000000448009 +104017 20170731 1 900000000000207008 126816002 en 900000000000013009 Neoplasm of posterior wall of oropharynx 900000000000448009 diff --git a/test/resources/example-snapshot/Terminology/sct2_RelationshipConcreteValues_Snapshot_INT_20230131.txt b/test/resources/example-snapshot/Terminology/sct2_RelationshipConcreteValues_Snapshot_INT_20230131.txt new file mode 100644 index 0000000..a0ccd7b --- /dev/null +++ b/test/resources/example-snapshot/Terminology/sct2_RelationshipConcreteValues_Snapshot_INT_20230131.txt @@ -0,0 +1,5 @@ +id effectiveTime active moduleId sourceId value relationshipGroup typeId characteristicTypeId modifierId +13830203029 20210731 1 900000000000207008 830045007 #3 0 1142139005 900000000000011006 900000000000451002 +13830204024 20210731 1 900000000000207008 830064001 #3 0 1142139005 900000000000011006 900000000000451002 +13830205020 20210731 1 900000000000207008 830066004 #3 0 1142139005 900000000000011006 900000000000451002 +13830206021 20210731 1 900000000000207008 830108003 #1 0 1142139005 900000000000011006 900000000000451002 diff --git a/test/resources/example-snapshot/Terminology/sct2_Relationship_Snapshot_INT_20230131.txt 
b/test/resources/example-snapshot/Terminology/sct2_Relationship_Snapshot_INT_20230131.txt new file mode 100644 index 0000000..fcd6c80 --- /dev/null +++ b/test/resources/example-snapshot/Terminology/sct2_Relationship_Snapshot_INT_20230131.txt @@ -0,0 +1,5 @@ +id effectiveTime active moduleId sourceId destinationId relationshipGroup typeId characteristicTypeId modifierId +100022 20090731 0 900000000000207008 100000000 102272007 0 116680003 900000000000011006 900000000000451002 +101021 20020131 1 900000000000207008 10000006 29857009 0 116680003 900000000000011006 900000000000451002 +102025 20020131 1 900000000000207008 10000006 9972008 0 116680003 900000000000011006 900000000000451002 +103024 20030131 0 900000000000207008 1000004 19130008 0 116680003 900000000000011006 900000000000451002 diff --git a/test/src/com/eldrix/hermes/importer_test.clj b/test/src/com/eldrix/hermes/importer_test.clj new file mode 100644 index 0000000..6251f87 --- /dev/null +++ b/test/src/com/eldrix/hermes/importer_test.clj @@ -0,0 +1,77 @@ +(ns com.eldrix.hermes.importer-test + (:require + [clojure.core.async :as async] + [clojure.java.io :as io] + [clojure.spec.gen.alpha :as gen] + [clojure.test :refer [deftest is testing]] + [com.eldrix.hermes.importer :as importer] + [com.eldrix.hermes.rf2 :as rf2]) + (:import (java.time LocalDate))) + +(deftest parse-filename + (testing "nil filename" + (is (nil? (importer/parse-filename nil)))) + (testing "concept filename as string" + (let [{:keys [format version-date content-subtype type country-code identifier component]} (importer/parse-filename "sct2_Concept_Snapshot_INT_20230131.txt")] + (is (= "Concept" component)) + (is (= "INT" country-code)) + (is (= :info.snomed/Concept identifier)) + (is (= "2" format)) + (is (= "sct" type)) + (is (= (LocalDate/of 2023 1 31) version-date)))) + (testing "description filename as URL" + (let [{:keys [identifier]} (importer/parse-filename (java.net.URL. 
"file://Terminology/sct2_Description_Snapshot-en_INT_20230131.txt"))] + (is (= :info.snomed/Description identifier)))) + (testing "relationship concrete values filename as file" + (let [{:keys [identifier]} (importer/parse-filename (io/file "./Terminology/sct2_RelationshipConcreteValues_Snapshot_INT_20230131.txt"))] + (is (= :info.snomed/RelationshipConcreteValues identifier))))) + +(defn import-file + "Import a SNOMED file" + [f] + (let [ch (async/chan)] + (async/thread + (importer/process-file f ch) + (async/close! ch)) + (async/