Skip to content

Commit

Permalink
Fix #69 - improve TSV import
Browse files Browse the repository at this point in the history
  • Loading branch information
wardle committed Nov 21, 2024
1 parent 367811b commit 22aecdf
Show file tree
Hide file tree
Showing 10 changed files with 157 additions and 16 deletions.
5 changes: 3 additions & 2 deletions deps.edn
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
{org.clojure/clojure {:mvn/version "1.12.0"}
org.clojure/core.async {:mvn/version "1.6.681"}
org.clojure/core.match {:mvn/version "1.1.0"}
org.clojure/data.csv {:mvn/version "1.1.0"}
org.clojure/data.xml {:mvn/version "0.2.0-alpha9"}
org.clojure/data.zip {:mvn/version "1.1.0"}
org.clojure/tools.logging {:mvn/version "1.3.0"}
Expand Down Expand Up @@ -37,7 +38,7 @@
:main-opts ["-m" "com.eldrix.hermes.cmd.core"]}

:dev
{:extra-paths ["cmd" "test"]
{:extra-paths ["cmd" "test/src" "test/resources"]
:extra-deps {org.clojure/tools.cli {:mvn/version "1.1.230"}
io.pedestal/pedestal.service {:mvn/version "0.7.1"}
io.pedestal/pedestal.error {:mvn/version "0.7.1"}
Expand All @@ -54,7 +55,7 @@
org.apache.lucene/lucene-backward-codecs {:mvn/version "10.0.0"}}}

:test
{:extra-paths ["cmd" "test" "test/resources"]
{:extra-paths ["cmd" "test/src" "test/resources"]
:extra-deps {org.clojure/test.check {:mvn/version "1.1.1"}
com.wsscode/pathom3 {:git/url "https://github.com/wilkerlucio/pathom3.git"
:git/sha "2d9d1cf8ccfeee83566c31e776a5ef105b2a1626"}
Expand Down
44 changes: 30 additions & 14 deletions src/com/eldrix/hermes/importer.clj
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
(ns com.eldrix.hermes.importer
"Provides import functionality for processing directories of files"
(:require [clojure.core.async :as a]
[clojure.data.csv :as csv]
[clojure.data.json :as json]
[clojure.java.io :as io]
[clojure.spec.alpha :as s]
Expand Down Expand Up @@ -73,7 +74,7 @@
(merge (json/read-str (slurp f) :key-fn keyword :value-fn read-metadata-value)) ;; read in metadaa
(update :modules update-keys (fn [x] (-> x name parse-long)))) ;; return all module identifiers as longs
(catch Throwable e (log/warn e "Invalid metadata in distribution file" (:name default))
(assoc default :error "Invalid metadata in distribution file")))))
(assoc default :error "Invalid metadata in distribution file")))))

(defn metadata-files
"Returns a list of release package information files from the directory.
Expand All @@ -90,20 +91,35 @@
(doall (->> (metadata-files dir)
(map read-metadata))))

(defn- process-file
(defprotocol SnomedFile
"Anything that names a SNOMED release file and can therefore be parsed for
the structured information encoded in that name."
(parse-filename [this] "Returns structured data about a SNOMED file"))

;; Implementations of SnomedFile for each kind of value accepted as a file.
;; Every non-nil case delegates to snomed/parse-snomed-filename with a string.
(extend-protocol SnomedFile
;; A string is handed to the parser unchanged.
String
(parse-filename [s] (snomed/parse-snomed-filename s))
;; A file contributes only its name, i.e. the result of .getName.
;; NOTE(review): presumably java.io.File, imported outside this view - confirm.
File
(parse-filename [f] (snomed/parse-snomed-filename (.getName f)))
;; A URL contributes its whole path component (.getPath), directories included.
java.net.URL
(parse-filename [url] (snomed/parse-snomed-filename (.getPath url)))
;; nil is tolerated and yields nil rather than throwing.
nil
(parse-filename [_] nil))

(defn process-file
"Process the specified file, streaming batched results to the channel
specified, blocking if channel not being drained.
specified, blocking if channel not being drained.
Parameters:
- f : anything coercible using clojure.java.io/reader
Each batch is a map with keys
- :type : a type of SNOMED component
- :parser : a parser that can take each row and give you data
- :headings : a sequence of headings from the original file
- :data : a sequence of vectors representing each column."
[filename out-c & {:keys [batch-size] :or {batch-size 1000}}]
(with-open [reader (io/reader filename)]
(let [{:keys [identifier parser filename component]} (snomed/parse-snomed-filename filename)]
(when parser
(let [csv-data (map #(str/split % #"\t") (line-seq reader))
[f out-c & {:keys [batch-size] :or {batch-size 1000}}]
(let [{:keys [identifier parser filename component]} (parse-filename f)]
(when parser
(with-open [reader (io/reader f)]
(let [csv-data (csv/read-csv reader :separator \tab)
headings (first csv-data)
data (rest csv-data)
batches (->> data
Expand Down Expand Up @@ -141,12 +157,12 @@
(a/>!! processed-c e)))
(a/close! raw-c))
(a/pipeline
nthreads
processed-c
(map snomed/parse-batch)
raw-c
true
(fn ex-handler [err] (log/debug "Error during import pipeline: " (ex-data err)) err))
nthreads
processed-c
(map snomed/parse-batch)
raw-c
true
(fn ex-handler [err] (log/debug "Error during import pipeline: " (ex-data err)) err))
processed-c))

(defn load-snomed
Expand Down
19 changes: 19 additions & 0 deletions test/resources/example-snapshot/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
This is an example distribution with fragments of different release files for testing

Core terminology files
======================

These are 'correct' versions of Concept, Description and Relationship files
- ./Terminology/sct2_Concept_Snapshot_INT_20230131.txt
- ./Terminology/sct2_Description_Snapshot-en_INT_20230131.txt
- ./Terminology/sct2_RelationshipConcreteValues_Snapshot_INT_20230131.txt
- ./Terminology/sct2_Relationship_Snapshot_INT_20230131.txt

Reference set files
===================

- ./Refset/Map/der2_iisssccRefset_ExtendedMapSnapshot_INT_20230131.txt
- This is a 'correct' International release version of an extended map

- ./Refset/Map/der2_ssRefset_SimpleMapWithDescriptionSnapshot_12345_20241021.txt
- This is a 'custom' simple map reference set with null values for one of the columns
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
id effectiveTime active moduleId refsetId referencedComponentId mapGroup mapPriority mapRule mapAdvice mapTarget correlationId mapCategoryId
00005b30-d2ad-5891-ae82-060d2e20a9fc 20150731 1 449080006 447562003 211339002 1 1 TRUE ALWAYS S80.8 | POSSIBLE REQUIREMENT FOR AN EXTERNAL CAUSE CODE S80.8 447561005 447637006
00006f25-3157-5132-b658-25708c9f1290 20150731 1 449080006 447562003 37535007 1 1 TRUE ALWAYS Q99.9 Q99.9 447561005 447637006
00009ee5-904c-5e9f-a67a-ffe16d847782 20150731 1 449080006 447562003 299741008 1 1 TRUE MAP SOURCE CONCEPT CANNOT BE CLASSIFIED WITH AVAILABLE DATA 447561005 447638001
0000b488-4995-58e1-a542-a8fc22593548 20220331 1 449080006 447562003 1208339007 1 1 TRUE ALWAYS G11.1 | POSSIBLE REQUIREMENT FOR ADDITIONAL CODE TO FULLY DESCRIBE DISEASE OR CONDITION G11.1 447561005 447637006
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
id effectiveTime active moduleId refsetId referencedComponentId mapTarget mapTargetDescription
000d91ce-aae4-4f9e-ad06-51be576becd6 20241021 1 195941000112101 22671000001102 1434181000001106 ABC01256Q
00280f1a-4c71-4683-8cb4-3e19e97ff9d9 20241021 1 195941000112101 22671000001102 1291061000001109 ABC08795Q
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
id effectiveTime active moduleId definitionStatusId
100005 20020131 0 900000000000207008 900000000000074008
101009 20020131 1 900000000000207008 900000000000074008
102002 20020131 1 900000000000207008 900000000000074008
103007 20020131 1 900000000000207008 900000000000074008
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
id effectiveTime active moduleId conceptId languageCode typeId term caseSignificanceId
101013 20170731 1 900000000000207008 126813005 en 900000000000013009 Neoplasm of anterior aspect of epiglottis 900000000000448009
102018 20170731 1 900000000000207008 126814004 en 900000000000013009 Neoplasm of junctional region of epiglottis 900000000000448009
103011 20170731 1 900000000000207008 126815003 en 900000000000013009 Neoplasm of lateral wall of oropharynx 900000000000448009
104017 20170731 1 900000000000207008 126816002 en 900000000000013009 Neoplasm of posterior wall of oropharynx 900000000000448009
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
id effectiveTime active moduleId sourceId value relationshipGroup typeId characteristicTypeId modifierId
13830203029 20210731 1 900000000000207008 830045007 #3 0 1142139005 900000000000011006 900000000000451002
13830204024 20210731 1 900000000000207008 830064001 #3 0 1142139005 900000000000011006 900000000000451002
13830205020 20210731 1 900000000000207008 830066004 #3 0 1142139005 900000000000011006 900000000000451002
13830206021 20210731 1 900000000000207008 830108003 #1 0 1142139005 900000000000011006 900000000000451002
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
id effectiveTime active moduleId sourceId destinationId relationshipGroup typeId characteristicTypeId modifierId
100022 20090731 0 900000000000207008 100000000 102272007 0 116680003 900000000000011006 900000000000451002
101021 20020131 1 900000000000207008 10000006 29857009 0 116680003 900000000000011006 900000000000451002
102025 20020131 1 900000000000207008 10000006 9972008 0 116680003 900000000000011006 900000000000451002
103024 20030131 0 900000000000207008 1000004 19130008 0 116680003 900000000000011006 900000000000451002
77 changes: 77 additions & 0 deletions test/src/com/eldrix/hermes/importer_test.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
(ns com.eldrix.hermes.importer-test
(:require
[clojure.core.async :as async]
[clojure.java.io :as io]
[clojure.spec.gen.alpha :as gen]
[clojure.test :refer [deftest is testing]]
[com.eldrix.hermes.importer :as importer]
[com.eldrix.hermes.rf2 :as rf2])
(:import (java.time LocalDate)))

(deftest parse-filename
  ;; Checks importer/parse-filename against each kind of input it is extended
  ;; to: nil, a string, a java.net.URL and a java.io.File.
  (testing "nil filename"
    (is (nil? (importer/parse-filename nil))))
  (testing "concept filename as string"
    (let [{:keys [format version-date type country-code identifier component]}
          (importer/parse-filename "sct2_Concept_Snapshot_INT_20230131.txt")]
      (is (= "Concept" component))
      (is (= "INT" country-code))
      (is (= :info.snomed/Concept identifier))
      (is (= "2" format))
      (is (= "sct" type))
      (is (= (LocalDate/of 2023 1 31) version-date))))
  (testing "description filename as URL"
    ;; Three slashes give an empty authority, so "Terminology" is part of the
    ;; URL path. With only two slashes it would be parsed as the host and the
    ;; directory segment would never reach the filename parser.
    (let [{:keys [identifier]}
          (importer/parse-filename
           (java.net.URL. "file:///Terminology/sct2_Description_Snapshot-en_INT_20230131.txt"))]
      (is (= :info.snomed/Description identifier))))
  (testing "relationship concrete values filename as file"
    (let [{:keys [identifier]}
          (importer/parse-filename
           (io/file "./Terminology/sct2_RelationshipConcreteValues_Snapshot_INT_20230131.txt"))]
      (is (= :info.snomed/RelationshipConcreteValues identifier)))))

(defn import-file
  "Test helper: runs importer/process-file on `f` in a background thread and
  returns the first batch it emits, or nil if nothing was emitted.

  Parameters:
  - f : passed straight to importer/process-file.

  The channel is closed in a `finally` so that an exception inside
  process-file produces a nil result instead of blocking the test forever.
  Any further batches are drained and discarded so the producer thread is
  never left parked on the unbuffered channel."
  [f]
  (let [ch (async/chan)]
    (async/thread
      (try
        (importer/process-file f ch)
        (finally
          (async/close! ch))))
    (let [first-batch (async/<!! ch)]
      ;; Keep taking until the channel reports closed (nil) so the producer
      ;; can always finish.
      (loop []
        (when (some? (async/<!! ch))
          (recur)))
      first-batch)))

(deftest import-concepts
  ;; The first batch read from a concept snapshot file is tagged as Concept.
  (let [batch (import-file (io/resource "example-snapshot/Terminology/sct2_Concept_Snapshot_INT_20230131.txt"))]
    (is (= :info.snomed/Concept (:type batch)))))

(deftest import-refset
  ;; An extended map refset file yields the matching type and all thirteen
  ;; column headings, in file order.
  (let [batch (import-file (io/resource "example-snapshot/Refset/Map/der2_iisssccRefset_ExtendedMapSnapshot_INT_20230131.txt"))
        expected-headings ["id" "effectiveTime" "active" "moduleId" "refsetId"
                           "referencedComponentId" "mapGroup" "mapPriority"
                           "mapRule" "mapAdvice" "mapTarget" "correlationId"
                           "mapCategoryId"]]
    (is (= :info.snomed/ExtendedMapRefset (:type batch)))
    (is (= expected-headings (:headings batch)))))

(deftest import-custom-refset-nil-values
  ;; A custom simple map refset whose final column is empty must still produce
  ;; a value for that column.
  (let [batch (import-file (io/resource "example-snapshot/Refset/Map/der2_ssRefset_SimpleMapWithDescriptionSnapshot_12345_20241021.txt"))
        expected-headings ["id" "effectiveTime" "active" "moduleId" "refsetId"
                           "referencedComponentId" "mapTarget" "mapTargetDescription"]
        expected-first-row ["000d91ce-aae4-4f9e-ad06-51be576becd6" "20241021"
                            "1" "195941000112101" "22671000001102"
                            "1434181000001106" "ABC01256Q" ""]]
    (is (= :info.snomed/SimpleMapRefset (:type batch)))
    (is (= expected-headings (:headings batch)))
    (is (= expected-first-row (first (:data batch)))
        "Empty last column should be returned as empty string")))

;; REPL scratchpad: nothing in this form is evaluated when the namespace loads.
(comment
;; Try the CSV reader on a tab-separated line that ends with a trailing tab.
(require '[clojure.data.csv :as csv])
(csv/read-csv "hi\tthere\tand\thow\tare\tyou?\t" :separator \tab)
;; Look up a fixture on the classpath and inspect what kind of value it is.
(def f (io/resource "example-snapshot/Terminology/sct2_Concept_Snapshot_INT_20230131.txt"))
(type f)
(io/as-file f)
;; Try parse-filename with a string, a URL, the resource above and nil.
(importer/parse-filename "sct2_Concept.txt")
(importer/parse-filename (java.net.URL. "https://wibble.com/sct_Concept_Snapshot_INT_20230131.txt"))
(importer/parse-filename f)
(importer/parse-filename nil)
;; Run process-file by hand on a background thread, closing the channel after.
(def ch (async/chan))
(async/thread
(importer/process-file f ch)
(async/close! ch))
;; Rebinds ch to the channel returned by load-snomed for the example
;; directory, then takes one value from it.
(def ch (importer/load-snomed (io/resource "example-snapshot/")))
(async/<!! ch)

;; Sample generated simple map refset items using a single empty-string field.
(gen/sample (rf2/gen-simple-map-refset {:fields [""]})))

0 comments on commit 22aecdf

Please sign in to comment.