Skip to content

Commit

Permalink
Docs/improve docs (#4)
Browse files Browse the repository at this point in the history
* improving/extending docs

* improve command line help messages
  • Loading branch information
Marc Zimmermann authored Oct 11, 2021
1 parent 59563e5 commit 247bfd9
Show file tree
Hide file tree
Showing 17 changed files with 549 additions and 489 deletions.
54 changes: 53 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,53 @@
Contains proof of concept code of a deidentification pipeline of clinical texts.
A pipeline to deidentify clinical texts/reports in German.

The pipeline code was developed for the needs of the [University Hospital Zurich](https://www.usz.ch/fachbereich/clinical-trials-center/angebote/research-data-service-center/),
but can be adapted fairly easily to other hospital contexts. It takes reports in [JSON format](docs/overview.md#document-import) and
in a first step annotates identifying information such as names, locations, age, dates, organisations and occupations.
In a second step, the annotated text parts are substituted with
other text according to one of several [strategies](docs/overview.md#subst_policies), and the result is re-exported to JSON.

The pipeline to recognize identifying information is rule-based, that is, [various lexica](docs/lexica.md) and
[deterministic rules](docs/overview.md#jape_example) are used. It is based on the [GATE framework](https://gate.ac.uk/).

Some important features:
* parallel execution to scale to large corpora of reports (>100'000 reports)
* test suite to test annotation pipeline, also accessible to non-software developers tuning rules and lexica
* large parts of pipeline tuning can be done without writing code
* annotations contain information to trace back which pipeline step or rule generated the annotation


## Installation

* [Installation and running instructions](docs/installation.md)

## Pipeline Details

* [Pipeline overview](docs/overview.md)
* [About lexica](docs/lexica.md)
* [Pipeline Evaluation Results](docs/usz_pipeline_evaluation.md)


## Adapting the Pipeline

The tool is laid out such that it can be relatively easily adapted to another hospital or another context.
Adaptation to another language is possible in principle, but may require more work.

* More details about [Pipeline components and their configuration](docs/components.md)
* [Description of the rules and how to tune them](docs/rules_and_tuning.md)
* [Tuning tutorial](docs/tuning_tutorial.md) with a heavily simplified pipeline


## Other Functionalities

For simplicity, some additional functionality has been included in this code base that is not related to deidentification per se, but
is still relevant to working with medical reports.

### Diagnosis Extraction Pipeline

* [Diagnosis Extraction Pipeline](docs/structuring.md)


## Development

* [Notes on Development](docs/development.md)
* [Code overview](docs/code_overview.md)
Original file line number Diff line number Diff line change
Expand Up @@ -44,19 +44,20 @@ public class AnnotationCmd extends DbCommands {
@CommandLine.Option(names = {"-o"}, description = "Output corpus dir", required = true)
private File corpusOutputDir = null;

@CommandLine.Option(names = {"-c"}, description = "Config file", required = true)
@CommandLine.Option(names = {"-c"}, description = "Pipeline config file", required = true)
private File propertiesFile = null;

@CommandLine.Option(names = {"-m"}, description = "Marked Corpus Dir")
@CommandLine.Option(names = {"-m"}, description = "Path to marked/hand annotated corpus dir. " +
"Used to measure annotation accuracy of the current pipeline compared to the marked corpus (e.g. gold standard)")
private String markedCorpusDirPath = null;

@CommandLine.Option(names = {"--diagnostics-dir"}, description = "Marked Corpus Dir")
@CommandLine.Option(names = {"--diagnostics-dir"}, description = "Path to diagnostic output (only in conjunction with the -m option)")
private String diagnosticsDirPath = null;

@CommandLine.Option(names = {"--fields-blacklist-eval"}, description = "Path to files giving field blacklist used during evaluation")
private File fieldsBlacklistPath = null;

@CommandLine.Option(names = {"-t"}, description = "Number of threads")
@CommandLine.Option(names = {"-t"}, description = "Number of parallel pipeline annotations (default: number of CPUs available)")
private int threads = -1;

public static final String PHI_ANNOTATION_NAME = "phi-annotations";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@

import java.util.concurrent.Callable;

@CommandLine.Command(mixinStandardHelpOptions = true, description = "Deid entry point", name = "Deid", version="deid dev",
subcommands = {
SubstitutionCmd.class,
AnnotationCmd.class,
ImportCmd.class,
ConversionCmd.class,
DiagnosticsCmd.class,
PipelineTesterCmd.class
}
@CommandLine.Command(mixinStandardHelpOptions = true, description = "Deid entry point", name = "Deid", version = "deid dev",
subcommands = {
AnnotationCmd.class,
SubstitutionCmd.class,
PipelineTesterCmd.class,
ImportCmd.class,
ConversionCmd.class,
DiagnosticsCmd.class
}
)
public class DeidMain implements Callable<Integer> {
@Override
Expand All @@ -28,7 +28,7 @@ public Integer call() {

public static void main(String[] args) {
Integer exitCode = CommandLine.call(new DeidMain(), args);
if(exitCode == null) {
if (exitCode == null) {
System.exit(2);
}
System.exit(exitCode);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import gate.creole.SerialAnalyserController;
import gate.util.GateException;
import org.ratschlab.DeidCmd;
import org.ratschlab.deidentifier.AnnotationCmd;
import org.ratschlab.deidentifier.annotation.features.FeatureKeysDate;
import org.ratschlab.deidentifier.pipelines.PipelineFactory;
import org.ratschlab.deidentifier.sources.KisimFormat;
Expand Down Expand Up @@ -40,22 +41,22 @@ public class SubstitutionCmd extends DeidCmd {
@CommandLine.Option(names = {"-o"}, description = "Output Dir")
private File outputDir = null;

@CommandLine.Option(names = {"--db-config"}, description = "DB config path")
@CommandLine.Option(names = {"--db-config"}, description = "Path to DB config file")
private File databaseConfigPath = null;

@CommandLine.Option(names = {"--doc-type-filter"}, description = "Path to file type list to consider")
private File docTypeFilterPath = null;

@CommandLine.Option(names = {"--annotation-name"}, description = "Annotation set name")
private String finalAnnotationName = "phi-annotations";
@CommandLine.Option(names = {"--annotation-name"}, description = "Annotation set name containing annotations to deidentify")
private String finalAnnotationName = AnnotationCmd.PHI_ANNOTATION_NAME;

@CommandLine.Option(names = {"--rnd-seed"}, description = "Random seed")
@CommandLine.Option(names = {"--rnd-seed"}, description = "Random seed used to pick random values for date shifts (keep the value protected!)")
private int rngSeed = -1;

@CommandLine.Option(names = {"--method"}, description = "Substitution Methods: ${COMPLETION-CANDIDATES}", required = true)
private SubstitutionMethods substMethod = null;

@CommandLine.Option(names = {"-t"}, description = "Number of threads")
@CommandLine.Option(names = {"-t"}, description = "Number of parallel pipelines (default: 1)")
private int threads = 1;

@CommandLine.Option(names = {"--min-days-shift"}, description = "Minimum number of days to shift")
Expand All @@ -79,7 +80,7 @@ public class SubstitutionCmd extends DeidCmd {
@CommandLine.Option(names = {"--fields-blacklist"}, description = "Path to files giving field blacklist")
private File fieldsBlacklistPath = null;

@CommandLine.Option(names = {"--context-window-size"}, description = "Context window size for replacement tags")
@CommandLine.Option(names = {"--context-window-size"}, description = "Context window size for replacement tags (default 0). Number of original tokens to add in replacement tags (in case annotations get split up)")
private int contextWindowForReplacementTags = 0;


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import java.util.stream.Stream;

public class DbCommands extends DeidCmd {
@CommandLine.Option(names = {"-d"}, description = "DB config path")
@CommandLine.Option(names = {"-d"}, description = "Path to DB config file")
protected String databaseConfigPath = "";

@CommandLine.Option(names = {"--doc-type-filter"}, description = "Path to file type list to consider")
Expand Down
Loading

0 comments on commit 247bfd9

Please sign in to comment.