Spaces:
Sleeping
Sleeping
Vamsi Thiriveedhi
committed on
Commit
•
b6be892
1
Parent(s):
ff79159
create dcm2parquet app
Browse files- Dockerfile +39 -0
- README.md +5 -6
- dcm2parquet.py +143 -0
- requirements.txt +7 -0
Dockerfile
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Use an official Python runtime as a parent image
FROM python:3.12.3

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user

# Switch to the "user" user
USER user

# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Run pip after setting the user with `USER user` to avoid permission issues with Python
RUN pip install --no-cache-dir --upgrade pip

# Copy the current directory contents into the container at $HOME/app setting the owner to the user
COPY --chown=user . $HOME/app

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Create the .streamlit directory
RUN mkdir -p .streamlit

# Create the config.toml file and set the maxMessageSize.
# printf handles \n portably; `echo "\...\n..."` escape interpretation
# varies between shells, which could leave literal backslashes in the file.
RUN printf '[server]\nmaxMessageSize = 2000\n' > .streamlit/config.toml

# Make port 8501 available to the world outside this container
EXPOSE 8501

# Run the Streamlit app when the container launches.
# FIX: this commit adds dcm2parquet.py (not dcm2parquet_app.py); the original
# CMD referenced a file that does not exist in the image and would fail at start.
# Exec form also lets streamlit receive signals directly for clean shutdown.
CMD ["streamlit", "run", "dcm2parquet.py"]
|
README.md
CHANGED
@@ -1,11 +1,10 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: docker
|
|
|
7 |
pinned: false
|
8 |
license: mit
|
9 |
---
|
10 |
-
|
11 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: dcm2parquet
|
3 |
+
emoji: ??
|
4 |
+
colorFrom: purple
|
5 |
+
colorTo: green
|
6 |
sdk: docker
|
7 |
+
app_port: 8501
|
8 |
pinned: false
|
9 |
license: mit
|
10 |
---
|
|
|
|
dcm2parquet.py
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from idc_index import index
|
3 |
+
from pathlib import Path
|
4 |
+
import pydicom
|
5 |
+
import pandas as pd
|
6 |
+
import pyarrow as pa
|
7 |
+
import pyarrow.parquet as pq
|
8 |
+
from tempfile import TemporaryDirectory
|
9 |
+
import os
|
10 |
+
from pathlib import Path
|
11 |
+
import polars
|
12 |
+
import pydicom.datadict as dd
|
13 |
+
|
14 |
+
# Helper function to get the human-readable description of a DICOM tag
def get_tag_description(tag, description):
    """Return the data-dictionary name for *tag*, or *description* as fallback.

    Looks the tag up in pydicom's data dictionary; the third element of the
    returned entry is the tag's name. Tags absent from the dictionary
    (e.g. private tags) raise KeyError, in which case the caller-supplied
    description is returned unchanged.
    """
    try:
        entry = dd.get_entry(tag)
    except KeyError:
        return description
    return entry[2]
|
21 |
+
|
22 |
+
def convert_value(value):
    """Convert a pydicom element value to Python-native types.

    MultiValue becomes a list of converted items, Sequence becomes a list of
    converted sequence items, PersonName/UID become plain strings, and
    anything else is returned as-is. The isinstance checks are evaluated in
    this order deliberately, matching the original dispatch.
    """
    if isinstance(value, pydicom.multival.MultiValue):
        return [convert_value(item) for item in value]
    if isinstance(value, pydicom.sequence.Sequence):
        return [convert_sequence_item(entry) for entry in value]
    if isinstance(value, (pydicom.valuerep.PersonName, pydicom.uid.UID)):
        return str(value)
    return value
|
32 |
+
|
33 |
+
# Translation table deleting '(', ')', ',', ' ' and '_' in one pass.
_SANITIZE_TABLE = str.maketrans("", "", "(), _")

# Sanitize column or field name for compatibility
def sanitize_name(name):
    """Sanitize a column or field name.

    Removes parentheses, commas, spaces and underscores — exactly the
    characters the previous chain of five str.replace calls removed — in a
    single C-level pass via str.translate.
    """
    return name.translate(_SANITIZE_TABLE)
|
38 |
+
|
39 |
+
# Convert pydicom sequence item to Python native format
def convert_sequence_item(item):
    """Map a sequence item's elements to {sanitized tag name: converted value}."""
    converted = {}
    for element in item:
        key = sanitize_name(get_tag_description(element.tag, element.description()))
        converted[key] = convert_value(element.value)
    return converted
|
43 |
+
|
44 |
+
# Clean column name by removing special characters and spaces
def clean_column_name(column_name):
    """Return *column_name* with only its alphanumeric characters kept."""
    return "".join(filter(str.isalnum, column_name))
|
48 |
+
|
49 |
+
# Serialize complex DICOM elements while preserving nesting
def serialize_element(value):
    """Recursively serialize a DICOM element value, preserving nesting.

    Datasets become dicts keyed by sanitized tag names, Sequences become
    lists, and leaf values are converted via convert_value.
    """
    if isinstance(value, pydicom.Dataset):
        serialized = {}
        for element in value:
            name = sanitize_name(get_tag_description(element.tag, element.description()))
            serialized[name] = serialize_element(element.value)
        return serialized
    if isinstance(value, pydicom.sequence.Sequence):
        return [serialize_element(entry) for entry in value]
    return convert_value(value)
|
60 |
+
|
61 |
+
# Extract DICOM header data and serialize complex types while preserving nesting
def extract_dicom_header(dicom_file):
    """Read a DICOM file's header (pixel data skipped) into a flat dict.

    Keys are tag names passed through clean_column_name then sanitize_name;
    values are recursively serialized to Python-native structures.
    """
    dataset = pydicom.dcmread(dicom_file, stop_before_pixels=True)
    header = {}
    for element in dataset:
        label = get_tag_description(element.tag, element.description())
        header[sanitize_name(clean_column_name(label))] = serialize_element(element.value)
    return header
|
69 |
+
|
70 |
+
# Save DICOM header data to a Parquet file
def save_dicom_header_to_parquet(dicom_files, parquet_file):
    """Extract headers from *dicom_files* and write them to *parquet_file*.

    One row per DICOM file; columns are sanitized tag names.
    """
    rows = [extract_dicom_header(path) for path in dicom_files]
    df = pd.DataFrame(rows)
    # Sanitize column names (defensive: keys from extract_dicom_header are
    # already sanitized, but this also covers any pandas-introduced columns)
    df.columns = [sanitize_name(name) for name in df.columns]
    pq.write_table(pa.Table.from_pandas(df), parquet_file)
|
83 |
+
|
84 |
+
# Main Streamlit app code.
# Flat script re-run top-to-bottom by Streamlit on every interaction; widget
# order below is significant.
st.title("IDC to Parquet Converter")
st.write("Select IDC data to download and convert to Parquet.")

# Fetch IDC index. client.index is used below as a DataFrame with at least
# the columns: collection_id, PatientID, Modality, SeriesInstanceUID.
client = index.IDCClient()
index_df = client.index

# Option to choose IDC data: three cascading selectboxes.
st.subheader("Choose IDC Data to Process")
collection_ids = index_df["collection_id"].unique()
selected_collection_id = st.selectbox("Select Collection ID", collection_ids)

# NOTE(review): patients and modalities are filtered only by collection, not
# by each other, so a patient/modality combination with no series can be
# selected — the button handler below would then process an empty selection.
patients = index_df[index_df["collection_id"] == selected_collection_id]["PatientID"].unique()
selected_patient_id = st.selectbox("Select Patient ID", patients)

modalities = index_df[index_df["collection_id"] == selected_collection_id]["Modality"].unique()
selected_modality = st.selectbox("Select Modality", modalities)

# Button to process IDC data
if st.button("Process IDC Data"):
    # Fetch data from IDC based on selection (AND of all three filters)
    selection = index_df[
        (index_df["collection_id"] == selected_collection_id) &
        (index_df["PatientID"] == selected_patient_id) &
        (index_df["Modality"] == selected_modality)
    ]

    series_instance_uids = selection["SeriesInstanceUID"].tolist()

    # with TemporaryDirectory() as temp_dir:
    download_errors = []
    #input_dir = os.path.join(temp_dir, "input_data")
    # NOTE(review): TemporaryDirectory is disabled, so downloads persist in
    # ./input_data across runs (and users of a shared Space) and accumulate;
    # the glob below will also pick up files from previous runs.
    input_dir=Path("input_data/")
    os.makedirs(input_dir, exist_ok=True)

    try:
        # NOTE(review): input_dir is a Path — confirm idc-index accepts a
        # Path (not only str) for downloadDir.
        client.download_from_selection(seriesInstanceUID=series_instance_uids, downloadDir=input_dir)
    except Exception as e:
        # Broad catch is deliberate at this UI boundary: the message is
        # surfaced to the user via st.error below instead of crashing the app.
        download_errors.append(f"Error downloading data: {str(e)}")

    if download_errors:
        st.error("\n".join(download_errors))
    else:
        st.success("Data downloaded successfully.")

        # Process downloaded DICOM data: every .dcm file under input_dir
        dicom_files = [str(file) for file in input_dir.glob('**/*.dcm')]
        st.write(dicom_files)
        parquet_file = 'dicom_headers.parquet'
        save_dicom_header_to_parquet(dicom_files, parquet_file)

        st.success("Processing complete.")
        # Offer the Parquet file for download; served name differs from the
        # on-disk name (output_data.parquet vs dicom_headers.parquet).
        with open(parquet_file, "rb") as f:
            st.download_button(
                label="Download Processed Results",
                data=f,
                file_name="output_data.parquet",
                mime="application/parquet"
            )
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pydicom
|
2 |
+
polars
|
3 |
+
pyarrow
|
4 |
+
duckdb
|
5 |
+
pandas
|
6 |
+
streamlit
|
7 |
+
idc-index
|