-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patha1_extract_completed_or_terminated_interventional_results_clinical_trials.py
113 lines (92 loc) · 3.24 KB
/
a1_extract_completed_or_terminated_interventional_results_clinical_trials.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import json
from pathlib import Path
from shutil import copy2
from multiprocessing import Pool
from tqdm.auto import tqdm
import logging
def setup_logging():
    """Set up root logging at INFO level with a timestamped message format."""
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)
def load_json_data(file_path: Path) -> dict:
    """Load a JSON object from a file.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed JSON object as a dict. An empty dict is returned, and the
        problem is logged as an error, when the file cannot be read, is not
        valid JSON, or its top-level value is not a JSON object.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
    except json.JSONDecodeError as e:
        logging.error(f"Error decoding JSON from {file_path}: {str(e)}")
        return {}
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort a whole
        # batch run, so any other failure is logged and treated as "no data".
        logging.error(f"Error reading {file_path}: {str(e)}")
        return {}

    # json.load may legally return a list, string, number, ... ; callers are
    # promised a dict, so anything else is reported and mapped to "no data".
    if not isinstance(data, dict):
        logging.error(
            f"Error reading {file_path}: top-level JSON value is "
            f"{type(data).__name__}, expected an object"
        )
        return {}
    return data
def criteria_check(data: dict) -> bool:
    """Check whether a study record meets the extraction criteria.

    A record qualifies when all of the following hold:
      * ``FullStudy.Study.ProtocolSection.StatusModule.OverallStatus`` is
        ``"Completed"`` or ``"Terminated"``;
      * ``FullStudy.Study.ProtocolSection.DesignModule.StudyType`` is
        ``"Interventional"``;
      * ``FullStudy.Study`` contains a ``"ResultsSection"`` key.

    Args:
        data: Parsed JSON of one study record.

    Returns:
        True if every criterion is met. False otherwise, including when a
        required key is missing or a node on the path is not a mapping.
    """
    try:
        study = data["FullStudy"]["Study"]
        protocol = study["ProtocolSection"]
        status = protocol["StatusModule"]["OverallStatus"]
        if status not in ("Completed", "Terminated"):
            return False
        if protocol["DesignModule"]["StudyType"] != "Interventional":
            return False
        return "ResultsSection" in study
    except KeyError as e:
        logging.debug(f"Key error: {str(e)} in data")
        return False
    except TypeError as e:
        # A node that should be a mapping is null, a list, a string, etc.
        # Treat the malformed record as non-matching instead of letting the
        # exception escape and abort the caller's batch.
        logging.debug(f"Unexpected structure: {str(e)} in data")
        return False
def process_file(file_path: Path) -> Path:
    """Return ``file_path`` if its JSON content passes ``criteria_check``.

    The file is loaded with ``load_json_data``. ``None`` is returned when
    the loaded data is empty or does not meet the criteria.
    """
    study = load_json_data(file_path)
    if not study:
        return None
    return file_path if criteria_check(study) else None
def copy_file(source: Path, target_dir: Path):
    """Copy ``source`` into ``target_dir``, keeping its file name.

    Any failure is logged as an error instead of being raised.
    """
    try:
        destination = target_dir / source.name
        copy2(source, destination)
    except Exception as exc:
        logging.error(f"Failed to copy {source} to {target_dir}: {str(exc)}")
def copy_file_to_target(source_target_tuple):
    """Unpack a ``(source, target_dir)`` pair and pass it to ``copy_file``.

    Takes a single argument so it can be mapped over an iterable of pairs.
    """
    src, dest_dir = source_target_tuple
    copy_file(source=src, target_dir=dest_dir)
def main():
    """Filter the downloaded study files and copy the matching ones.

    Recursively collects ``*.json`` files under
    ``data/clinicaltrials_gov/all_cts``, keeps the paths for which
    ``process_file`` returns a path, and copies those files into
    ``data/clinicaltrials_gov/completed_or_terminated_interventional_results_cts``
    (created if it does not exist). Filtering and copying each run in a
    ``multiprocessing.Pool`` with a progress bar.
    """
    setup_logging()
    dir_path = Path("data/clinicaltrials_gov/all_cts")
    target_dir_path = Path(
        "data/clinicaltrials_gov/completed_or_terminated_interventional_results_cts"
    )
    target_dir_path.mkdir(parents=True, exist_ok=True)

    json_paths = list(dir_path.rglob("*.json"))
    with Pool() as pool:
        results = list(
            tqdm(
                pool.imap_unordered(process_file, json_paths),
                total=len(json_paths),
                desc="Filtering",
            )
        )

    # process_file yields None for files that do not qualify.
    completed_files = [path for path in results if path]
    if not completed_files:
        logging.info("No files met the criteria.")
        # Nothing to copy, so do not start a second worker pool.
        return

    with Pool() as pool:
        copy_jobs = ((p, target_dir_path) for p in completed_files)
        # imap_unordered is lazy: drain the iterator so every copy runs
        # before the pool is torn down.
        for _ in tqdm(
            pool.imap_unordered(copy_file_to_target, copy_jobs),
            total=len(completed_files),
            desc="Copying",
        ):
            pass
# Entry point: run main() only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()