Upload model

dataset.py (CHANGED: +6, -161)
@@ -23,46 +23,39 @@ class StudyIDEDStayIDSubset(Dataset):
     """
     def __init__(
         self,
-        mimic_iv_duckdb_path,
         split,
+        records,
         dataset_dir=None,
         max_images_per_study=None,
         transforms=None,
         images=True,
         columns='study_id, dicom_id, subject_id, findings, impression',
         and_condition='',
-        records=None,
         study_id_inclusion_list=None,
         return_images=True,
         ed_module=True,
         extension='jpg',
-        images_rocksdb_path=None,
-        jpg_lmdb_path=None,
-        jpg_rocksdb_path=None,
     ):
         """
         Argument/s:
-            mimic_iv_duckdb_path - Path to MIMIC-IV DuckDB database.
             split - 'train', 'validate', or 'test'.
             dataset_dir - Dataset directory.
+            records - MIMIC-CXR & MIMIC-IV-ED records class instance.
             max_images_per_study - the maximum number of images per study.
             transforms - torchvision transformations.
             colour_space - PIL target colour space.
             images - flag to return processed images.
             columns - which columns to query on.
             and_condition - AND condition to add to the SQL query.
-            records - MIMIC-IV records class instance.
             study_id_inclusion_list - studies not in this list are excluded.
             return_images - return CXR images for the study as tensors.
             ed_module - use the ED module.
             extension - 'jpg' or 'dcm'.
-            images_rocksdb_path - path to image RocksDB database.
-            jpg_lmdb_path - path to LMDB .jpg database.
-            jpg_rocksdb_path - path to RocksDB .jpg database.
         """
         super(StudyIDEDStayIDSubset, self).__init__()
         self.split = split
         self.dataset_dir = dataset_dir
+        self.records = records
         self.max_images_per_study = max_images_per_study
         self.transforms = transforms
         self.images = images
@@ -71,9 +64,6 @@ class StudyIDEDStayIDSubset(Dataset):
         self.return_images = return_images
         self.ed_module = ed_module
         self.extension = extension
-        self.images_rocksdb_path = images_rocksdb_path
-        self.jpg_lmdb_path = jpg_lmdb_path
-        self.jpg_rocksdb_path = jpg_rocksdb_path
 
         # If max images per study is not set:
         self.max_images_per_study = float('inf') if self.max_images_per_study is None else self.max_images_per_study
@@ -88,49 +78,6 @@ class StudyIDEDStayIDSubset(Dataset):
         if 'physionet.org/files/mimic-cxr/2.0.0/files' not in self.dataset_dir:
             self.dataset_dir = os.path.join(self.dataset_dir, 'physionet.org/files/mimic-cxr/2.0.0/files')
 
-        # Open the RocksDB images database:
-        if self.images_rocksdb_path is not None:
-            import rocksdb
-
-            # Define the column families:
-            column_families = {
-                b'shape': rocksdb.ColumnFamilyOptions(),
-                b'image': rocksdb.ColumnFamilyOptions(),
-            }
-
-            opts = rocksdb.Options()
-            opts.max_open_files = 1e+5
-            self.images_db = rocksdb.DB(self.images_rocksdb_path, opts, column_families=column_families, read_only=True)
-
-            self.shape_handle = self.images_db.get_column_family(b'shape')
-            self.image_handle = self.images_db.get_column_family(b'image')
-
-            self.shape_dtype = np.int32
-            self.image_dtype = np.uint16
-
-        # Prepare the RocksDB .jpg database:
-        if self.jpg_rocksdb_path is not None:
-            import rocksdb
-
-            opts = rocksdb.Options()
-            opts.max_open_files = 1e+5
-
-            self.images_db = rocksdb.DB(self.jpg_rocksdb_path, opts, read_only=True)
-
-        # Prepare the LMDB .jpg database:
-        if self.jpg_lmdb_path is not None:
-
-            print('Loading images using LMDB.')
-
-            # Map size:
-            map_size = int(0.65 * (1024 ** 4))
-            assert isinstance(map_size, int)
-
-            self.env = lmdb.open(self.jpg_lmdb_path, map_size=map_size, lock=False, readonly=True)
-            self.txn = self.env.begin(write=False)
-
-        self.records = EDCXRSubjectRecords(database_path=mimic_iv_duckdb_path) if records is None else records
-
         query = f"""
             SELECT {columns}
             FROM mimic_cxr
@@ -266,115 +213,13 @@ class StudyIDEDStayIDSubset(Dataset):
 
         if self.extension == 'jpg':
 
-            if self.jpg_rocksdb_path is not None:
-
-                # Convert to bytes:
-                key = bytes(dicom_id, 'utf-8')
-
-                # Retrieve image:
-                image = bytearray(self.images_db.get(key))
-                image = torch.frombuffer(image, dtype=torch.uint8)
-                image = decode_image(image)
-
-            elif self.jpg_lmdb_path is not None:
-
-                # Convert to bytes:
-                key = bytes(dicom_id, 'utf-8')
-
-                # Retrieve image:
-                image = bytearray(self.txn.get(key))
-                image = torch.frombuffer(image, dtype=torch.uint8)
-                image = decode_image(image)
-
-            else:
-                image_file_path = mimic_cxr_image_path(self.dataset_dir, subject_id, study_id, dicom_id, self.extension)
-                image = read_image(image_file_path)
+            image_file_path = mimic_cxr_image_path(self.dataset_dir, subject_id, study_id, dicom_id, self.extension)
+            image = read_image(image_file_path)
 
         elif self.extension == 'dcm':
-
-            if self.images_rocksdb_path is not None:
-                key = dicom_id.encode('utf-8')
-
-                # Retrieve the serialized image shape associated with the key:
-                shape_bytes = self.images_db.get((self.shape_handle, key), key)
-                shape = struct.unpack('iii', shape_bytes)
-
-                np.frombuffer(shape_bytes, dtype=self.shape_dtype).reshape(3)
-
-                # Retrieve the serialized image data associated with the key:
-                image_bytes = self.images_db.get((self.image_handle, key), key)
-                image = np.frombuffer(image_bytes, dtype=self.image_dtype).reshape(*shape)
-
-            else:
-                image_file_path = mimic_cxr_image_path(self.dataset_dir, subject_id, study_id, dicom_id, self.extension)
-                image = load_and_preprocess_dcm_uint16(image_file_path)
-
-            # Convert to a torch tensor:
-            image = torch.from_numpy(image)
+            raise NotImplementedError
 
         if self.transforms is not None:
             image = self.transforms(image)
 
         return image
-
-
-if __name__ == '__main__':
-    import time
-
-    from tqdm import tqdm
-
-    num_samples = 20
-
-    datasets = []
-    datasets.append(
-        StudyIDEDStayIDSubset(
-            dataset_dir='/datasets/work/hb-mlaifsp-mm/work/archive',
-            mimic_iv_duckdb_path='/scratch3/nic261/database/mimic_iv_duckdb_rev_b.db',
-            split='train',
-            extension='jpg',
-            ed_module=False,
-        ),
-    )
-
-    datasets.append(
-        StudyIDEDStayIDSubset(
-            dataset_dir='/scratch3/nic261/datasets',
-            mimic_iv_duckdb_path='/scratch3/nic261/database/mimic_iv_duckdb_rev_b.db',
-            split='train',
-            extension='jpg',
-            ed_module=False,
-        ),
-    )
-
-    datasets.append(
-        StudyIDEDStayIDSubset(
-            jpg_lmdb_path='/scratch3/nic261/database/mimic_cxr_jpg_lmdb_rev_a.db',
-            mimic_iv_duckdb_path='/scratch3/nic261/database/mimic_iv_duckdb_rev_b.db',
-            split='train',
-            extension='jpg',
-            ed_module=False,
-        ),
-    )
-
-    datasets.append(
-        StudyIDEDStayIDSubset(
-            jpg_rocksdb_path='/scratch3/nic261/database/mimic_cxr_jpg_rocksdb.db',
-            mimic_iv_duckdb_path='/scratch3/nic261/database/mimic_iv_duckdb_rev_b.db',
-            split='train',
-            extension='jpg',
-            ed_module=False,
-        )
-    )
-
-    assert (datasets[1][0]['images'][0] == datasets[2][0]['images'][0]).all().item()
-    assert (datasets[1][5]['images'][0] == datasets[2][5]['images'][0]).all().item()
-
-    for d in datasets:
-        start_time = time.time()
-        indices = torch.randperm(len(d))[:num_samples]  # Get random indices.
-        for i in tqdm(indices):
-            _ = d[i]
-        end_time = time.time()
-        elapsed_time = end_time - start_time
-        print(f"Elapsed time: {elapsed_time} seconds")
-
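For anyone updating call sites, here is a minimal before/after usage sketch of the constructor change. EDCXRSubjectRecords is the class the removed code instantiated internally from mimic_iv_duckdb_path; its import and all file paths below are placeholders assumed for illustration, not taken from this commit.

# Before this commit (sketch): the dataset opened the DuckDB/LMDB/RocksDB backends itself.
# dataset = StudyIDEDStayIDSubset(
#     mimic_iv_duckdb_path='mimic_iv_duckdb.db',  # placeholder path
#     split='train',
#     dataset_dir='datasets/',                    # placeholder path
# )

# After this commit (sketch): the caller builds the records object and passes it in,
# and images are always read as .jpg files under dataset_dir.
records = EDCXRSubjectRecords(database_path='mimic_iv_duckdb.db')  # placeholder path; import not shown in this diff

dataset = StudyIDEDStayIDSubset(
    split='train',
    records=records,
    dataset_dir='datasets/',  # 'physionet.org/files/mimic-cxr/2.0.0/files' is appended if missing
    extension='jpg',          # 'dcm' now raises NotImplementedError
    ed_module=False,
)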