anicolson committed on
Commit
9ea4504
1 Parent(s): 1838aab

Upload model

Browse files
Files changed (1) hide show
  1. dataset.py +6 -161
dataset.py CHANGED
@@ -23,46 +23,39 @@ class StudyIDEDStayIDSubset(Dataset):
23
  """
24
  def __init__(
25
  self,
26
- mimic_iv_duckdb_path,
27
  split,
 
28
  dataset_dir=None,
29
  max_images_per_study=None,
30
  transforms=None,
31
  images=True,
32
  columns='study_id, dicom_id, subject_id, findings, impression',
33
  and_condition='',
34
- records=None,
35
  study_id_inclusion_list=None,
36
  return_images=True,
37
  ed_module=True,
38
  extension='jpg',
39
- images_rocksdb_path=None,
40
- jpg_lmdb_path=None,
41
- jpg_rocksdb_path=None,
42
  ):
43
  """
44
  Argument/s:
45
- mimic_iv_duckdb_path - Path to MIMIC-IV DuckDB database.
46
  split - 'train', 'validate', or 'test'.
47
  dataset_dir - Dataset directory.
 
48
  max_images_per_study - the maximum number of images per study.
49
  transforms - torchvision transformations.
50
  colour_space - PIL target colour space.
51
  images - flag to return processed images.
52
  columns - which columns to query on.
53
  and_condition - AND condition to add to the SQL query.
54
- records - MIMIC-IV records class instance.
55
  study_id_inclusion_list - studies not in this list are excluded.
56
  return_images - return CXR images for the study as tensors.
57
  ed_module - use the ED module.
58
  extension - 'jpg' or 'dcm'.
59
- images_rocksdb_path - path to image RocksDB database.
60
- jpg_lmdb_path - path to LMDB .jpg database.
61
- jpg_rocksdb_path - path to RocksDB .jpg database.
62
  """
63
  super(StudyIDEDStayIDSubset, self).__init__()
64
  self.split = split
65
  self.dataset_dir = dataset_dir
 
66
  self.max_images_per_study = max_images_per_study
67
  self.transforms = transforms
68
  self.images = images
@@ -71,9 +64,6 @@ class StudyIDEDStayIDSubset(Dataset):
71
  self.return_images = return_images
72
  self.ed_module = ed_module
73
  self.extension = extension
74
- self.images_rocksdb_path = images_rocksdb_path
75
- self.jpg_lmdb_path = jpg_lmdb_path
76
- self.jpg_rocksdb_path = jpg_rocksdb_path
77
 
78
  # If max images per study is not set:
79
  self.max_images_per_study = float('inf') if self.max_images_per_study is None else self.max_images_per_study
@@ -88,49 +78,6 @@ class StudyIDEDStayIDSubset(Dataset):
88
  if 'physionet.org/files/mimic-cxr/2.0.0/files' not in self.dataset_dir:
89
  self.dataset_dir = os.path.join(self.dataset_dir, 'physionet.org/files/mimic-cxr/2.0.0/files')
90
 
91
- # Open the RocksDB images database:
92
- if self.images_rocksdb_path is not None:
93
- import rocksdb
94
-
95
- # Define the column families:
96
- column_families = {
97
- b'shape': rocksdb.ColumnFamilyOptions(),
98
- b'image': rocksdb.ColumnFamilyOptions(),
99
- }
100
-
101
- opts = rocksdb.Options()
102
- opts.max_open_files = 1e+5
103
- self.images_db = rocksdb.DB(self.images_rocksdb_path, opts, column_families=column_families, read_only=True)
104
-
105
- self.shape_handle = self.images_db.get_column_family(b'shape')
106
- self.image_handle = self.images_db.get_column_family(b'image')
107
-
108
- self.shape_dtype = np.int32
109
- self.image_dtype = np.uint16
110
-
111
- # Prepare the RocksDB .jpg database:
112
- if self.jpg_rocksdb_path is not None:
113
- import rocksdb
114
-
115
- opts = rocksdb.Options()
116
- opts.max_open_files = 1e+5
117
-
118
- self.images_db = rocksdb.DB(self.jpg_rocksdb_path, opts, read_only=True)
119
-
120
- # Prepare the LMDB .jpg database:
121
- if self.jpg_lmdb_path is not None:
122
-
123
- print('Loading images using LMDB.')
124
-
125
- # Map size:
126
- map_size = int(0.65 * (1024 ** 4))
127
- assert isinstance(map_size, int)
128
-
129
- self.env = lmdb.open(self.jpg_lmdb_path, map_size=map_size, lock=False, readonly=True)
130
- self.txn = self.env.begin(write=False)
131
-
132
- self.records = EDCXRSubjectRecords(database_path=mimic_iv_duckdb_path) if records is None else records
133
-
134
  query = f"""
135
  SELECT {columns}
136
  FROM mimic_cxr
@@ -266,115 +213,13 @@ class StudyIDEDStayIDSubset(Dataset):
266
 
267
  if self.extension == 'jpg':
268
 
269
- if self.jpg_rocksdb_path is not None:
270
-
271
- # Convert to bytes:
272
- key = bytes(dicom_id, 'utf-8')
273
-
274
- # Retrieve image:
275
- image = bytearray(self.images_db.get(key))
276
- image = torch.frombuffer(image, dtype=torch.uint8)
277
- image = decode_image(image)
278
-
279
- elif self.jpg_lmdb_path is not None:
280
-
281
- # Convert to bytes:
282
- key = bytes(dicom_id, 'utf-8')
283
-
284
- # Retrieve image:
285
- image = bytearray(self.txn.get(key))
286
- image = torch.frombuffer(image, dtype=torch.uint8)
287
- image = decode_image(image)
288
-
289
- else:
290
- image_file_path = mimic_cxr_image_path(self.dataset_dir, subject_id, study_id, dicom_id, self.extension)
291
- image = read_image(image_file_path)
292
 
293
  elif self.extension == 'dcm':
294
- if self.images_rocksdb_path is not None:
295
-
296
- key = dicom_id.encode('utf-8')
297
-
298
- # Retrieve the serialized image shape associated with the key:
299
- shape_bytes = self.images_db.get((self.shape_handle, key), key)
300
- shape = struct.unpack('iii', shape_bytes)
301
-
302
- np.frombuffer(shape_bytes, dtype=self.shape_dtype).reshape(3)
303
-
304
- # Retrieve the serialized image data associated with the key:
305
- image_bytes = self.images_db.get((self.image_handle, key), key)
306
- image = np.frombuffer(image_bytes, dtype=self.image_dtype).reshape(*shape)
307
-
308
- else:
309
- image_file_path = mimic_cxr_image_path(self.dataset_dir, subject_id, study_id, dicom_id, self.extension)
310
- image = load_and_preprocess_dcm_uint16(image_file_path)
311
-
312
- # Convert to a torch tensor:
313
- image = torch.from_numpy(image)
314
 
315
  if self.transforms is not None:
316
  image = self.transforms(image)
317
 
318
  return image
319
-
320
-
321
- if __name__ == '__main__':
322
- import time
323
-
324
- from tqdm import tqdm
325
-
326
- num_samples = 20
327
-
328
- datasets = []
329
- datasets.append(
330
- StudyIDEDStayIDSubset(
331
- dataset_dir='/datasets/work/hb-mlaifsp-mm/work/archive',
332
- mimic_iv_duckdb_path='/scratch3/nic261/database/mimic_iv_duckdb_rev_b.db',
333
- split='train',
334
- extension='jpg',
335
- ed_module=False,
336
- ),
337
- )
338
-
339
- datasets.append(
340
- StudyIDEDStayIDSubset(
341
- dataset_dir='/scratch3/nic261/datasets',
342
- mimic_iv_duckdb_path='/scratch3/nic261/database/mimic_iv_duckdb_rev_b.db',
343
- split='train',
344
- extension='jpg',
345
- ed_module=False,
346
- ),
347
- )
348
-
349
- datasets.append(
350
- StudyIDEDStayIDSubset(
351
- jpg_lmdb_path='/scratch3/nic261/database/mimic_cxr_jpg_lmdb_rev_a.db',
352
- mimic_iv_duckdb_path='/scratch3/nic261/database/mimic_iv_duckdb_rev_b.db',
353
- split='train',
354
- extension='jpg',
355
- ed_module=False,
356
- ),
357
- )
358
-
359
- datasets.append(
360
- StudyIDEDStayIDSubset(
361
- jpg_rocksdb_path='/scratch3/nic261/database/mimic_cxr_jpg_rocksdb.db',
362
- mimic_iv_duckdb_path='/scratch3/nic261/database/mimic_iv_duckdb_rev_b.db',
363
- split='train',
364
- extension='jpg',
365
- ed_module=False,
366
- )
367
- )
368
-
369
- assert (datasets[1][0]['images'][0] == datasets[2][0]['images'][0]).all().item()
370
- assert (datasets[1][5]['images'][0] == datasets[2][5]['images'][0]).all().item()
371
-
372
- for d in datasets:
373
- start_time = time.time()
374
- indices = torch.randperm(len(d))[:num_samples] # Get random indices.
375
- for i in tqdm(indices):
376
- _ = d[i]
377
- end_time = time.time()
378
- elapsed_time = end_time - start_time
379
- print(f"Elapsed time: {elapsed_time} seconds")
380
-
 
23
  """
24
  def __init__(
25
  self,
 
26
  split,
27
+ records,
28
  dataset_dir=None,
29
  max_images_per_study=None,
30
  transforms=None,
31
  images=True,
32
  columns='study_id, dicom_id, subject_id, findings, impression',
33
  and_condition='',
 
34
  study_id_inclusion_list=None,
35
  return_images=True,
36
  ed_module=True,
37
  extension='jpg',
 
 
 
38
  ):
39
  """
40
  Argument/s:
 
41
  split - 'train', 'validate', or 'test'.
42
  dataset_dir - Dataset directory.
43
+ records - MIMIC-CXR & MIMIC-IV-ED records class instance.
44
  max_images_per_study - the maximum number of images per study.
45
  transforms - torchvision transformations.
46
  colour_space - PIL target colour space.
47
  images - flag to return processed images.
48
  columns - which columns to query on.
49
  and_condition - AND condition to add to the SQL query.
 
50
  study_id_inclusion_list - studies not in this list are excluded.
51
  return_images - return CXR images for the study as tensors.
52
  ed_module - use the ED module.
53
  extension - 'jpg' or 'dcm'.
 
 
 
54
  """
55
  super(StudyIDEDStayIDSubset, self).__init__()
56
  self.split = split
57
  self.dataset_dir = dataset_dir
58
+ self.records = records
59
  self.max_images_per_study = max_images_per_study
60
  self.transforms = transforms
61
  self.images = images
 
64
  self.return_images = return_images
65
  self.ed_module = ed_module
66
  self.extension = extension
 
 
 
67
 
68
  # If max images per study is not set:
69
  self.max_images_per_study = float('inf') if self.max_images_per_study is None else self.max_images_per_study
 
78
  if 'physionet.org/files/mimic-cxr/2.0.0/files' not in self.dataset_dir:
79
  self.dataset_dir = os.path.join(self.dataset_dir, 'physionet.org/files/mimic-cxr/2.0.0/files')
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  query = f"""
82
  SELECT {columns}
83
  FROM mimic_cxr
 
213
 
214
  if self.extension == 'jpg':
215
 
216
+ image_file_path = mimic_cxr_image_path(self.dataset_dir, subject_id, study_id, dicom_id, self.extension)
217
+ image = read_image(image_file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
  elif self.extension == 'dcm':
220
+ raise NotImplementedError
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
 
222
  if self.transforms is not None:
223
  image = self.transforms(image)
224
 
225
  return image