Vamsi Thiriveedhi commited on
Commit
b6be892
1 Parent(s): ff79159

create dcm2parquet app

Browse files
Files changed (4) hide show
  1. Dockerfile +39 -0
  2. README.md +5 -6
  3. dcm2parquet.py +143 -0
  4. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official Python runtime as a parent image
FROM python:3.12.3

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user

# Switch to the "user" user so nothing below runs as root
USER user

# Set home to the user's home directory and prefer user-local binaries
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Upgrade pip after switching to "user" to avoid permission issues with Python
RUN pip install --no-cache-dir --upgrade pip

# Copy the current directory contents into the container at $HOME/app, owned by the user
COPY --chown=user . $HOME/app

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Create the .streamlit directory
RUN mkdir -p .streamlit

# Create the config.toml file and raise the websocket maxMessageSize
# (large DICOM selections exceed Streamlit's default limit).
# printf is used instead of `echo "\n"` because echo's escape handling
# varies between shells; printf is POSIX-portable.
RUN printf '[server]\nmaxMessageSize = 2000\n' > .streamlit/config.toml

# Make port 8501 available to the world outside this container
EXPOSE 8501

# Run the converter app when the container launches.
# FIX: this commit adds dcm2parquet.py (not dcm2parquet_app.py); the old
# CMD referenced a script that does not exist in the image.
CMD streamlit run dcm2parquet.py
README.md CHANGED
@@ -1,11 +1,10 @@
1
  ---
2
- title: Dcm2parquet
3
- emoji: 🚀
4
- colorFrom: blue
5
- colorTo: gray
6
  sdk: docker
 
7
  pinned: false
8
  license: mit
9
  ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: dcm2parquet
3
+ emoji: 🚀
4
+ colorFrom: purple
5
+ colorTo: green
6
  sdk: docker
7
+ app_port: 8501
8
  pinned: false
9
  license: mit
10
  ---
 
 
dcm2parquet.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from idc_index import index
3
+ from pathlib import Path
4
+ import pydicom
5
+ import pandas as pd
6
+ import pyarrow as pa
7
+ import pyarrow.parquet as pq
8
+ from tempfile import TemporaryDirectory
9
+ import os
10
+ from pathlib import Path
11
+ import polars
12
+ import pydicom.datadict as dd
13
+
14
# Helper: map a DICOM tag to its human-readable dictionary name.
def get_tag_description(tag, description):
    """Return the data-dictionary name for *tag*.

    Falls back to *description* when the tag is not present in pydicom's
    data dictionary (e.g. private or unregistered tags).
    """
    try:
        entry = dd.get_entry(tag)
    except KeyError:
        return description
    # Third field of the dictionary entry is the element name/description.
    return entry[2]
21
+
22
def convert_value(value):
    """Convert a pydicom element value to Python-native types.

    Multi-values become lists, sequence items become dicts, PersonName
    and UID values become plain strings; anything else is returned as-is.
    """
    # FIX: pydicom's Sequence subclasses MultiValue, so the Sequence test
    # must come first. In the original order the Sequence branch was
    # unreachable and nested sequences fell into the MultiValue branch,
    # returning their Dataset items unconverted.
    if isinstance(value, pydicom.sequence.Sequence):
        return [convert_sequence_item(item) for item in value]
    elif isinstance(value, pydicom.multival.MultiValue):
        return [convert_value(v) for v in value]
    elif isinstance(value, (pydicom.valuerep.PersonName, pydicom.uid.UID)):
        # Not JSON/Parquet-friendly as objects; stringify.
        return str(value)
    else:
        return value
32
+
33
# Strip characters that break downstream column naming.
def sanitize_name(name):
    """Return *name* with '(', ')', ',', spaces and underscores removed."""
    # Single C-level pass instead of five chained str.replace calls.
    return name.translate(str.maketrans('', '', '(), _'))
38
+
39
# Convert one pydicom sequence item (a Dataset) to a plain dict.
def convert_sequence_item(item):
    """Convert a pydicom sequence item to a dict of native Python values.

    Keys are sanitized element names; values are converted recursively
    via convert_value().
    """
    # FIX: use elem.name instead of the DataElement.description() method,
    # which was removed in pydicom 3.0; elem.name returns the same string
    # on pydicom 2.x as well.
    return {
        sanitize_name(get_tag_description(elem.tag, elem.name)): convert_value(elem.value)
        for elem in item
    }
43
+
44
# Drop every character that is not a letter or digit.
def clean_column_name(column_name):
    """Return *column_name* with all non-alphanumeric characters removed."""
    return ''.join(filter(str.isalnum, column_name))
48
+
49
# Serialize complex DICOM elements while preserving nesting.
def serialize_element(value):
    """Recursively convert a DICOM element value, preserving structure.

    Datasets become dicts keyed by sanitized element name, Sequences
    become lists, and leaf values are converted via convert_value().
    """
    if isinstance(value, pydicom.Dataset):
        # FIX: elem.name instead of elem.description() — the description()
        # method was removed in pydicom 3.0; elem.name is equivalent.
        return {
            sanitize_name(get_tag_description(elem.tag, elem.name)): serialize_element(elem.value)
            for elem in value
        }
    elif isinstance(value, pydicom.sequence.Sequence):
        # Convert the Sequence to a list, preserving the nested structure.
        return [serialize_element(item) for item in value]
    else:
        return convert_value(value)
60
+
61
# Extract DICOM header data, serializing complex types while preserving nesting.
def extract_dicom_header(dicom_file):
    """Read the header of *dicom_file* into a flat dict.

    Pixel data is skipped (stop_before_pixels=True). Keys are sanitized,
    alphanumeric-only element names; values are native Python structures.
    """
    ds = pydicom.dcmread(dicom_file, stop_before_pixels=True)
    header_data = {}
    for elem in ds:
        # FIX: elem.name instead of elem.description(); description() was
        # removed in pydicom 3.0 and elem.name returns the same string.
        key = sanitize_name(clean_column_name(get_tag_description(elem.tag, elem.name)))
        header_data[key] = serialize_element(elem.value)
    return header_data
69
+
70
# Persist the headers of many DICOM files as one Parquet table.
def save_dicom_header_to_parquet(dicom_files, parquet_file):
    """Extract the header of each file in *dicom_files* and write the
    combined rows to *parquet_file* (one row per DICOM file)."""
    records = [extract_dicom_header(path) for path in dicom_files]

    frame = pd.DataFrame(records)
    # Sanitize column names for Parquet compatibility.
    frame.columns = [sanitize_name(column) for column in frame.columns]
    pq.write_table(pa.Table.from_pandas(frame), parquet_file)
83
+
84
# Main Streamlit app: pick an IDC cohort, download its DICOM series,
# convert the headers to a Parquet file, and offer that file for download.
st.title("IDC to Parquet Converter")
st.write("Select IDC data to download and convert to Parquet.")

# Fetch the IDC index (a DataFrame with collection/patient/modality columns).
client = index.IDCClient()
index_df = client.index

# Option to choose IDC data.
st.subheader("Choose IDC Data to Process")
collection_ids = index_df["collection_id"].unique()
selected_collection_id = st.selectbox("Select Collection ID", collection_ids)

# NOTE(review): the patient and modality choices are each filtered only by
# collection, not by each other — a chosen patient/modality pair may match
# zero series, leaving the selection below empty. Confirm intended.
patients = index_df[index_df["collection_id"] == selected_collection_id]["PatientID"].unique()
selected_patient_id = st.selectbox("Select Patient ID", patients)

modalities = index_df[index_df["collection_id"] == selected_collection_id]["Modality"].unique()
selected_modality = st.selectbox("Select Modality", modalities)

# Button to process IDC data.
if st.button("Process IDC Data"):
    # Narrow the index to the rows matching every selection.
    selection = index_df[
        (index_df["collection_id"] == selected_collection_id) &
        (index_df["PatientID"] == selected_patient_id) &
        (index_df["Modality"] == selected_modality)
    ]

    series_instance_uids = selection["SeriesInstanceUID"].tolist()

    # with TemporaryDirectory() as temp_dir:
    download_errors = []
    #input_dir = os.path.join(temp_dir, "input_data")
    # Downloads go to a persistent local directory (TemporaryDirectory above
    # was deliberately disabled); reruns reuse and extend its contents.
    input_dir=Path("input_data/")
    os.makedirs(input_dir, exist_ok=True)

    try:
        client.download_from_selection(seriesInstanceUID=series_instance_uids, downloadDir=input_dir)
    except Exception as e:
        # Collected rather than raised so the UI can show a friendly message.
        download_errors.append(f"Error downloading data: {str(e)}")

    if download_errors:
        st.error("\n".join(download_errors))
    else:
        st.success("Data downloaded successfully.")

        # Process every .dcm file found anywhere under the download directory.
        dicom_files = [str(file) for file in input_dir.glob('**/*.dcm')]
        st.write(dicom_files)
        parquet_file = 'dicom_headers.parquet'
        save_dicom_header_to_parquet(dicom_files, parquet_file)

        st.success("Processing complete.")
        # Stream the finished Parquet file back to the browser.
        with open(parquet_file, "rb") as f:
            st.download_button(
                label="Download Processed Results",
                data=f,
                file_name="output_data.parquet",
                mime="application/parquet"
            )
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ pydicom
2
+ polars
3
+ pyarrow
4
+ duckdb
5
+ pandas
6
+ streamlit
7
+ idc-index