# validate.py
# import pydicom
# import numpy as np
# from PIL import Image
#
# tolerance = 1
#
# # Load the DICOM file
# dicom = pydicom.dcmread(r"/home/ubuntu/nlp_project/Code/physionet.org/files/mimic-cxr/2.1.0/files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.dcm")
# dicom_pixels = dicom.pixel_array
#
# # Load the PNG file
# png_image = Image.open(r"/home/ubuntu/nlp_project/Code/physionet.org/files/mimic-cxr/2.1.0/files/p10/p10000032/s50414267/out_png/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.png")
# png_pixels = np.array(png_image)
#
# # Compare shapes
# assert dicom_pixels.shape == png_pixels.shape, "Shapes of DICOM and PNG do not match"
#
# # Compare pixel values
# difference = np.abs(dicom_pixels - png_pixels)
#
# dicom_non_zero_elements = dicom_pixels[dicom_pixels != 0]
# dicom_non_zero_count = len(dicom_non_zero_elements)
#
# # Get non-zero elements in PNG array
# png_non_zero_elements = png_pixels[png_pixels != 0]
# png_non_zero_count = len(png_non_zero_elements)
#
# print(f"Non-zero elements in DICOM: {dicom_non_zero_count}")
# print(f"Non-zero elements in PNG: {png_non_zero_count}")
#
# # If needed, print the actual non-zero elements (be cautious for large arrays)
# print(f"Non-zero elements in DICOM array:\n{dicom_non_zero_elements}")
# print(f"Non-zero elements in PNG array:\n{png_non_zero_elements}")
#
#
# print(f"First 10 non-zero elements in DICOM array:\n{dicom_non_zero_elements[:10]}")
# print(f"First 10 non-zero elements in PNG array:\n{png_non_zero_elements[:10]}")
#
#
# # assert np.all(difference < tolerance), f"Pixel difference exceeds tolerance: {difference.max()}"
import os
import pandas as pd
from tqdm import tqdm
import pydicom
import numpy as np
from PIL import Image
def convert_dicom_to_png(dicom_path, output_path):
    """Convert one DICOM image to an 8-bit PNG.

    The pixel data is min-max scaled so that the darkest pixel maps to 0 and
    the brightest to 255, then written to ``output_path``.

    Args:
        dicom_path: Path of the DICOM file to read.
        output_path: Path of the PNG file to write.

    Returns:
        None. Conversion is best-effort: any failure is reported on stdout
        and swallowed so that a batch run can continue with the next file.
    """
    try:
        # Read the DICOM file
        dicom = pydicom.dcmread(dicom_path)

        # Normalise in float64: doing the subtraction in the native integer
        # dtype can wrap around for signed pixel data with a wide value range.
        pixels = dicom.pixel_array.astype(np.float64)
        lowest = pixels.min()
        value_range = pixels.max() - lowest

        if value_range > 0:
            scaled = (pixels - lowest) / value_range * 255
        else:
            # A constant image has no range to stretch; dividing by zero
            # would fill the output with NaN, so emit an all-black image.
            scaled = np.zeros_like(pixels)

        # Save as PNG
        image = Image.fromarray(scaled.astype(np.uint8))
        image.save(output_path)
    except Exception as e:
        print(f"Error converting {dicom_path} to PNG: {e}")
def extract_findings_and_impression(file_path):
    """Extract the FINDINGS and IMPRESSION sections from a report file.

    Each section starts right after its ``FINDINGS:`` / ``IMPRESSION:``
    header (first occurrence) and runs up to the other header when that
    header appears later in the text, otherwise to the end of the file.

    Args:
        file_path: Path of the plain-text report to read.

    Returns:
        A ``(findings, impression)`` tuple of whitespace-stripped strings.
        A section whose header is absent is returned as ``""``.
    """
    findings_header = "FINDINGS:"
    impression_header = "IMPRESSION:"

    with open(file_path, 'r', encoding="utf-8") as file:
        content = file.read()

    findings_start = content.find(findings_header)
    impression_start = content.find(impression_header)

    def section_text(start, header, other_start):
        # Header not present at all.
        if start == -1:
            return ""
        # Only stop at the other header when it really follows this one;
        # a missing (-1) or earlier header must not be used as a slice end,
        # otherwise text is truncated or the section comes back empty.
        end = other_start if other_start > start else len(content)
        return content[start + len(header):end].strip()

    findings = section_text(findings_start, findings_header, impression_start)
    impression = section_text(impression_start, impression_header, findings_start)
    return findings, impression
# Main logic to create the DataFrame.
#
# The walk below visits <root>/<group>/<patient>/<study>/*.dcm and pairs each
# study folder with a report named <study>.txt stored next to it in the
# patient folder. Every DICOM of a study that has a report is converted to a
# PNG beside the original and recorded together with the report's text.
reports_root_path = input("Enter the root path for reports: ").strip()

# Ensure the path exists
if not os.path.exists(reports_root_path):
    raise FileNotFoundError(f"The specified path does not exist: {reports_root_path}")

data = []

# Sorted listings keep the row order of the CSV reproducible between runs.
grp_folders = sorted(os.listdir(reports_root_path))
for p_grp in grp_folders:
    cxr_path = os.path.join(reports_root_path, p_grp)
    # Stray files at the root (e.g. an index page left by a downloader)
    # cannot be listed; skip them instead of crashing.
    if not os.path.isdir(cxr_path):
        continue

    for p in sorted(os.listdir(cxr_path)):
        res_path = os.path.join(cxr_path, p)
        if not os.path.isdir(res_path):
            continue

        # List the patient folder once and split it into study folders and
        # report files.
        patient_entries = sorted(os.listdir(res_path))
        dicom_dirs = [d for d in patient_entries if os.path.isdir(os.path.join(res_path, d))]
        txt_files = {f for f in patient_entries if f.endswith('.txt') and f.startswith('s')}

        for dicom_dir in dicom_dirs:
            report_file = f"{dicom_dir}.txt"
            # Studies without a matching report are left out entirely.
            if report_file not in txt_files:
                continue

            report_path = os.path.join(res_path, report_file)
            findings, impressions = extract_findings_and_impression(report_path)

            dicom_path = os.path.join(res_path, dicom_dir)
            dicom_files = [
                os.path.join(dicom_path, f)
                for f in sorted(os.listdir(dicom_path))
                if f.endswith('.dcm')
            ]

            for dicom_file in dicom_files:
                dicom_id = os.path.basename(dicom_file)
                # Swap only the file extension; str.replace would also
                # rewrite any '.dcm' that appears earlier in the path.
                png_path = os.path.splitext(dicom_file)[0] + '.png'

                # Convert the DICOM to PNG
                convert_dicom_to_png(dicom_file, png_path)

                data_entry = {
                    "dicom_path": dicom_file,
                    "png_path": png_path,
                    "dicom_id": dicom_id,
                    "findings": findings,
                    "impressions": impressions
                }
                data.append(data_entry)
                print(f"Processed PNG path: {data_entry['png_path']}")

df = pd.DataFrame(data)
print(df.head())
print(f"Total entries: {len(df)}")

# Save the DataFrame to a CSV file
df.to_csv('data_with_png_paths.csv', index=False)