Vamsi Thiriveedhi commited on
Commit
b6be892
1 Parent(s): ff79159

create dcm2parquet app

Browse files
Files changed (4) hide show
  1. Dockerfile +39 -0
  2. README.md +5 -6
  3. dcm2parquet.py +143 -0
  4. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official Python runtime as a parent image
FROM python:3.12.3

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user

# Switch to the "user" user so nothing below runs as root
USER user

# Set home to the user's home directory and prefer user-local binaries
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Upgrade pip after switching to "user" to avoid permission issues with Python
RUN pip install --no-cache-dir --upgrade pip

# Copy the current directory contents into the container at $HOME/app, owned by the user
COPY --chown=user . $HOME/app

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Create the .streamlit directory
RUN mkdir -p .streamlit

# Create the config.toml file and raise the websocket maxMessageSize
# (large DICOM selections exceed Streamlit's default limit).
# printf is used instead of `echo "\n"` because echo's escape handling
# varies between shells; printf is POSIX-portable.
RUN printf '[server]\nmaxMessageSize = 2000\n' > .streamlit/config.toml

# Make port 8501 available to the world outside this container
EXPOSE 8501

# Run the converter app when the container launches.
# FIX: this commit adds dcm2parquet.py (not dcm2parquet_app.py); the old
# CMD referenced a script that does not exist in the image.
CMD streamlit run dcm2parquet.py
README.md CHANGED
@@ -1,11 +1,10 @@
1
  ---
2
- title: Dcm2parquet
3
- emoji: 🚀
4
- colorFrom: blue
5
- colorTo: gray
6
  sdk: docker
 
7
  pinned: false
8
  license: mit
9
  ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: dcm2parquet
3
+ emoji: 🚀
4
+ colorFrom: purple
5
+ colorTo: green
6
  sdk: docker
7
+ app_port: 8501
8
  pinned: false
9
  license: mit
10
  ---
 
 
dcm2parquet.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from idc_index import index
3
+ from pathlib import Path
4
+ import pydicom
5
+ import pandas as pd
6
+ import pyarrow as pa
7
+ import pyarrow.parquet as pq
8
+ from tempfile import TemporaryDirectory
9
+ import os
10
+ from pathlib import Path
11
+ import polars
12
+ import pydicom.datadict as dd
13
+
14
# Helper: map a DICOM tag to its human-readable dictionary name.
def get_tag_description(tag, description):
    """Return the data-dictionary name for *tag*.

    Falls back to *description* when the tag is not present in pydicom's
    data dictionary (e.g. private or unregistered tags).
    """
    try:
        entry = dd.get_entry(tag)
    except KeyError:
        return description
    # Third field of the dictionary entry is the element name/description.
    return entry[2]
21
+
22
def convert_value(value):
    """Convert a pydicom element value to Python-native types.

    Multi-values become lists, sequence items become dicts, PersonName
    and UID values become plain strings; anything else is returned as-is.
    """
    # FIX: pydicom's Sequence subclasses MultiValue, so the Sequence test
    # must come first. In the original order the Sequence branch was
    # unreachable and nested sequences fell into the MultiValue branch,
    # returning their Dataset items unconverted.
    if isinstance(value, pydicom.sequence.Sequence):
        return [convert_sequence_item(item) for item in value]
    elif isinstance(value, pydicom.multival.MultiValue):
        return [convert_value(v) for v in value]
    elif isinstance(value, (pydicom.valuerep.PersonName, pydicom.uid.UID)):
        # Not JSON/Parquet-friendly as objects; stringify.
        return str(value)
    else:
        return value
32
+
33
# Strip characters that break downstream column naming.
def sanitize_name(name):
    """Return *name* with '(', ')', ',', spaces and underscores removed."""
    # Single C-level pass instead of five chained str.replace calls.
    return name.translate(str.maketrans('', '', '(), _'))
38
+
39
# Convert one pydicom sequence item (a Dataset) to a plain dict.
def convert_sequence_item(item):
    """Convert a pydicom sequence item to a dict of native Python values.

    Keys are sanitized element names; values are converted recursively
    via convert_value().
    """
    # FIX: use elem.name instead of the DataElement.description() method,
    # which was removed in pydicom 3.0; elem.name returns the same string
    # on pydicom 2.x as well.
    return {
        sanitize_name(get_tag_description(elem.tag, elem.name)): convert_value(elem.value)
        for elem in item
    }
43
+
44
# Drop every character that is not a letter or digit.
def clean_column_name(column_name):
    """Return *column_name* with all non-alphanumeric characters removed."""
    return ''.join(filter(str.isalnum, column_name))
48
+
49
# Serialize complex DICOM elements while preserving nesting.
def serialize_element(value):
    """Recursively convert a DICOM element value, preserving structure.

    Datasets become dicts keyed by sanitized element name, Sequences
    become lists, and leaf values are converted via convert_value().
    """
    if isinstance(value, pydicom.Dataset):
        # FIX: elem.name instead of elem.description() — the description()
        # method was removed in pydicom 3.0; elem.name is equivalent.
        return {
            sanitize_name(get_tag_description(elem.tag, elem.name)): serialize_element(elem.value)
            for elem in value
        }
    elif isinstance(value, pydicom.sequence.Sequence):
        # Convert the Sequence to a list, preserving the nested structure.
        return [serialize_element(item) for item in value]
    else:
        return convert_value(value)
60
+
61
# Extract DICOM header data, serializing complex types while preserving nesting.
def extract_dicom_header(dicom_file):
    """Read the header of *dicom_file* into a flat dict.

    Pixel data is skipped (stop_before_pixels=True). Keys are sanitized,
    alphanumeric-only element names; values are native Python structures.
    """
    ds = pydicom.dcmread(dicom_file, stop_before_pixels=True)
    header_data = {}
    for elem in ds:
        # FIX: elem.name instead of elem.description(); description() was
        # removed in pydicom 3.0 and elem.name returns the same string.
        key = sanitize_name(clean_column_name(get_tag_description(elem.tag, elem.name)))
        header_data[key] = serialize_element(elem.value)
    return header_data
69
+
70
# Persist the headers of many DICOM files as one Parquet table.
def save_dicom_header_to_parquet(dicom_files, parquet_file):
    """Extract the header of each file in *dicom_files* and write the
    combined rows to *parquet_file* (one row per DICOM file)."""
    records = [extract_dicom_header(path) for path in dicom_files]

    frame = pd.DataFrame(records)
    # Sanitize column names for Parquet compatibility.
    frame.columns = [sanitize_name(column) for column in frame.columns]
    pq.write_table(pa.Table.from_pandas(frame), parquet_file)
83
+
84
# Main Streamlit app: pick an IDC cohort, download its DICOM series,
# convert the headers to a Parquet file, and offer that file for download.
st.title("IDC to Parquet Converter")
st.write("Select IDC data to download and convert to Parquet.")

# Fetch the IDC index (a DataFrame with collection/patient/modality columns).
client = index.IDCClient()
index_df = client.index

# Option to choose IDC data.
st.subheader("Choose IDC Data to Process")
collection_ids = index_df["collection_id"].unique()
selected_collection_id = st.selectbox("Select Collection ID", collection_ids)

# NOTE(review): the patient and modality choices are each filtered only by
# collection, not by each other — a chosen patient/modality pair may match
# zero series, leaving the selection below empty. Confirm intended.
patients = index_df[index_df["collection_id"] == selected_collection_id]["PatientID"].unique()
selected_patient_id = st.selectbox("Select Patient ID", patients)

modalities = index_df[index_df["collection_id"] == selected_collection_id]["Modality"].unique()
selected_modality = st.selectbox("Select Modality", modalities)

# Button to process IDC data.
if st.button("Process IDC Data"):
    # Narrow the index to the rows matching every selection.
    selection = index_df[
        (index_df["collection_id"] == selected_collection_id) &
        (index_df["PatientID"] == selected_patient_id) &
        (index_df["Modality"] == selected_modality)
    ]

    series_instance_uids = selection["SeriesInstanceUID"].tolist()

    # with TemporaryDirectory() as temp_dir:
    download_errors = []
    #input_dir = os.path.join(temp_dir, "input_data")
    # Downloads go to a persistent local directory (TemporaryDirectory above
    # was deliberately disabled); reruns reuse and extend its contents.
    input_dir=Path("input_data/")
    os.makedirs(input_dir, exist_ok=True)

    try:
        client.download_from_selection(seriesInstanceUID=series_instance_uids, downloadDir=input_dir)
    except Exception as e:
        # Collected rather than raised so the UI can show a friendly message.
        download_errors.append(f"Error downloading data: {str(e)}")

    if download_errors:
        st.error("\n".join(download_errors))
    else:
        st.success("Data downloaded successfully.")

        # Process every .dcm file found anywhere under the download directory.
        dicom_files = [str(file) for file in input_dir.glob('**/*.dcm')]
        st.write(dicom_files)
        parquet_file = 'dicom_headers.parquet'
        save_dicom_header_to_parquet(dicom_files, parquet_file)

        st.success("Processing complete.")
        # Stream the finished Parquet file back to the browser.
        with open(parquet_file, "rb") as f:
            st.download_button(
                label="Download Processed Results",
                data=f,
                file_name="output_data.parquet",
                mime="application/parquet"
            )
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ pydicom
2
+ polars
3
+ pyarrow
4
+ duckdb
5
+ pandas
6
+ streamlit
7
+ idc-index