Let's run this bad boy
- Dockerfile +29 -0
- README.md +5 -4
- app.py +150 -3
- requirements.txt +14 -0
Dockerfile
ADDED
@@ -0,0 +1,29 @@
+FROM pytorch/pytorch:2.2.1-cuda11.8-cudnn8-devel
+
+# Set the working directory to /code
+WORKDIR /code
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+# RUN apt-get update && apt-get install -y --no-install-recommends git && \
+#     apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+USER user
+
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+
+# Set the working directory to the user's home directory
+WORKDIR $HOME/app
+
+ENV PYTHONUNBUFFERED=1 \
+    GRADIO_ALLOW_FLAGGING=never \
+    GRADIO_NUM_PORTS=1 \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_THEME=huggingface \
+    SYSTEM=spaces
+
+# Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+COPY --chown=user . $HOME/app
+CMD ["python3", "app.py"]
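For context (not part of this commit): a minimal sanity check that could be run inside the built image, mirroring the nvidia-smi / CUDA checks that are commented out in app.py and confirming the GRADIO_* variables set above. It assumes the container (or the Space hardware) actually exposes a CUDA device; the filename check_env.py is hypothetical.

# check_env.py - hypothetical sketch, not part of this commit.
import os

import torch

print("torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())  # False unless GPU hardware is attached

# Environment variables set in the Dockerfile above
for var in ("GRADIO_SERVER_NAME", "GRADIO_NUM_PORTS", "GRADIO_ALLOW_FLAGGING", "GRADIO_THEME", "SYSTEM"):
    print(var, "=", os.environ.get(var))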
README.md
CHANGED
@@ -1,12 +1,13 @@
 ---
-title:
-emoji:
+title: CudaMS
+emoji: 🧬
 colorFrom: purple
 colorTo: blue
-sdk:
+sdk: docker
 sdk_version: 1.32.2
+app_port: 7860
 app_file: app.py
-pinned:
+pinned: true
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
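A note on the new front matter: sdk: docker makes the Space build from the Dockerfile, and app_port: 7860 must match the port the Gradio server listens on inside the container. app.py below appears to rely on Gradio's default port (7860) together with GRADIO_SERVER_NAME=0.0.0.0 from the Dockerfile; a more explicit, equivalent launch call would look like this hypothetical sketch:

# Hypothetical sketch, not part of this commit: makes the Space networking explicit.
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("placeholder UI")

# server_port must match app_port in README.md; 0.0.0.0 matches GRADIO_SERVER_NAME in the Dockerfile.
demo.launch(server_name="0.0.0.0", server_port=7860)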
app.py
CHANGED
@@ -1,4 +1,151 @@
-import
+import gradio as gr
+import torch
+import os
+from pathlib import Path
+from matchms import Spectrum
+from typing import List, Optional, Literal
+# os.system("nvidia-smi")
+# print("TORCH_CUDA", torch.cuda.is_available())
 
-
-
+def preprocess_spectra(spectra: List[Spectrum]) -> List[Optional[Spectrum]]:
+    from matchms.filtering import select_by_intensity, \
+        normalize_intensities, \
+        select_by_relative_intensity, \
+        reduce_to_number_of_peaks, \
+        select_by_mz, \
+        require_minimum_number_of_peaks
+
+    def process_spectrum(spectrum: Spectrum) -> Optional[Spectrum]:
+        """
+        One of the many ways to preprocess a spectrum - we use this by default.
+        """
+        spectrum = select_by_mz(spectrum, mz_from=10.0, mz_to=1000.0)
+        spectrum = normalize_intensities(spectrum)
+        spectrum = select_by_relative_intensity(spectrum, intensity_from=0.001)
+        spectrum = reduce_to_number_of_peaks(spectrum, n_max=1024)
+        spectrum = require_minimum_number_of_peaks(spectrum, n_required=5)
+        return spectrum
+
+    spectra = list(process_spectrum(s) for s in spectra)  # Some might be None
+    return spectra
+
+def run(r_filepath: Path, q_filepath: Path,
+        tolerance: float = 0.1,
+        mz_power: float = 0.0,
+        intensity_power: float = 1.0,
+        shift: float = 0,
+        batch_size: int = 2048,
+        n_max_peaks: int = 1024,
+        match_limit: int = 2048,
+        array_type: Literal['sparse', 'numpy'] = "numpy",
+        sparse_threshold: float = 0.75):
+    print('\n>>>>', r_filepath, q_filepath, array_type, '\n')
+    # debug = os.getenv('CUDAMS_DEBUG') == '1'
+    # if debug:
+    #     r_filepath = Path('tests/data/pesticides.mgf')
+    #     q_filepath = Path('tests/data/pesticides.mgf')
+
+    assert r_filepath is not None, "Reference file is missing."
+    assert q_filepath is not None, "Query file is missing."
+    import tempfile
+    import numpy as np
+    from cudams.similarity import CudaCosineGreedy
+    from matchms.importing import load_from_mgf
+    from matchms import calculate_scores
+    import matplotlib.pyplot as plt
+
+    refs = preprocess_spectra(list(load_from_mgf(str(r_filepath))))
+    ques = preprocess_spectra(list(load_from_mgf(str(q_filepath))))
+
+    # If we have few spectra, don't make a huge batch
+    if batch_size > max(len(refs), len(ques)):
+        batch_size = max(len(refs), len(ques))
+
+    scores_obj = calculate_scores(
+        refs, ques,
+        similarity_function=CudaCosineGreedy(
+            tolerance=tolerance,
+            mz_power=mz_power,
+            intensity_power=intensity_power,
+            shift=shift,
+            batch_size=batch_size,
+            n_max_peaks=n_max_peaks,
+            match_limit=match_limit,
+            sparse_threshold=sparse_threshold
+        ),
+        array_type=array_type
+    )
+
+    score_vis = tempfile.NamedTemporaryFile(suffix='.jpg', delete=False)
+
+    fig, axs = plt.subplots(1, 2,
+                            figsize=(10, 5),
+                            dpi=150)
+
+    scores = scores_obj.to_array()
+    ax = axs[0]
+    ax.imshow(scores['CudaCosineGreedy_score'])
+
+    ax = axs[1]
+    ax.imshow(scores['CudaCosineGreedy_matches'])
+
+    plt.suptitle("Score and matches")
+    plt.savefig(score_vis.name)
+
+    score = tempfile.NamedTemporaryFile(suffix='.npz', delete=False)
+    np.savez(score.name, scores=scores)
+
+
+    import pickle
+    pickle_ = tempfile.NamedTemporaryFile(suffix='.pickle', delete=False)
+
+    Path(pickle_.name).write_bytes(pickle.dumps(scores_obj))
+    return score.name, score_vis.name, pickle_.name
+
+with gr.Blocks() as demo:
+    gr.Markdown("Run CUDA Cosine Greedy on your MGF files.")
+    with gr.Row():
+        refs = gr.File(label="Upload REFERENCES.mgf",
+                       interactive=True,
+                       value='tests/data/pesticides.mgf')
+        ques = gr.File(label="Upload QUERIES.mgf",
+                       interactive=True,
+                       value='tests/data/pesticides.mgf')
+    with gr.Row():
+        tolerance = gr.Slider(minimum=0, maximum=1, value=0.1, label="Tolerance")
+        mz_power = gr.Slider(minimum=0, maximum=2, value=0.0, label="mz_power")
+        intensity_power = gr.Slider(minimum=0, maximum=2, value=1.0, label="Intensity Power")
+        shift = gr.Slider(minimum=-10, maximum=10, value=0, label="Shift")
+    with gr.Row():
+        batch_size = gr.Number(value=2048, label="Batch Size", info='How many spectra to process pairwise in one step. Limited by GPU memory; the default works well for a T4 GPU.')
+        n_max_peaks = gr.Number(value=1024, label="Maximum Number of Peaks",
+                                info="Some spectra are too large to fit on the GPU, "
+                                     "so we trim them to only the first "
+                                     "n_max_peaks peaks.")
+        match_limit = gr.Number(value=2048, label="Match Limit",
+                                info="Two very similar spectra of size N and M can have N * M matches before filtering. "
+                                     "This doesn't fit on the GPU, so we stop accumulating matches once we have at most match_limit of them. "
+                                     "In practice, a value of 2048 gives more than 99.99% accuracy on GNPS.")
+    with gr.Row():
+        array_type = gr.Radio(['numpy', 'sparse'], value='numpy', type='value',
+                              label='How to handle outputs - if `sparse`, every score below sparse_threshold is discarded; if `numpy`, sparse behaviour is disabled.')
+        sparse_threshold = gr.Slider(minimum=0, maximum=1, value=0.75, label="Sparse Threshold",
+                                     info="For very large comparisons (more than 10k x 10k), the dense score matrix can grow too large for RAM, "
+                                          "while most of the scores aren't useful (near zero). This argument discards all scores below sparse_threshold and returns "
+                                          "the results in SparseStack format."
+                                     )
+    with gr.Row():
+        score_vis = gr.Image()
+
+    with gr.Row():
+        out_npz = gr.File(label="Download similarity matrix (.npz format)",
+                          interactive=False)
+        out_pickle = gr.File(label="Download full `Scores` object (.pickle format)",
+                             interactive=False)
+    btn = gr.Button("Run")
+    btn.click(fn=run, inputs=[refs, ques, tolerance, mz_power, intensity_power, shift,
+                              batch_size, n_max_peaks, match_limit,
+                              array_type, sparse_threshold], outputs=[out_npz, score_vis, out_pickle])
+
+if __name__ == "__main__":
+    demo.launch(debug=True)
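To exercise run() without the web UI (the kind of local debugging hinted at by the commented-out CUDAMS_DEBUG block), a headless call might look like the sketch below. It assumes the repository's tests/data/pesticides.mgf file is present, that a CUDA-capable GPU is available for CudaCosineGreedy, and that app.py is importable from the working directory.

# Hypothetical headless use of run(), not part of this commit.
from pathlib import Path

from app import run  # importing app.py builds the Blocks UI but does not launch it

mgf = Path("tests/data/pesticides.mgf")  # bundled test data, also the default in the UI
npz_path, plot_path, pickle_path = run(mgf, mgf, tolerance=0.1, array_type="numpy")

print("similarity matrix (.npz):", npz_path)
print("score/matches plot (.jpg):", plot_path)
print("pickled Scores object:", pickle_path)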
requirements.txt
ADDED
@@ -0,0 +1,14 @@
+matchms>=0.24.0
+numba
+torch
+rdkit
+pooch
+h5py
+pandas
+tqdm
+pyyaml
+python-dotenv
+joblib
+pytest
+cudams @ git+https://github.com/tornikeo/cudams@main
+gradio
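A quick post-install import check (a sketch, not part of this commit) can catch a broken cudams or matchms pin before the Gradio app even starts:

# Hypothetical post-install smoke test, not part of this commit.
import gradio
import matchms
from cudams.similarity import CudaCosineGreedy  # same import path app.py uses

print("gradio", gradio.__version__)
print("matchms", matchms.__version__)
print("CudaCosineGreedy import OK:", CudaCosineGreedy is not None)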