TornikeO committed
Commit b31ccfa · 1 Parent(s): 660a12b

Let's run this bad boy

Files changed (4)
  1. Dockerfile +29 -0
  2. README.md +5 -4
  3. app.py +150 -3
  4. requirements.txt +14 -0
Dockerfile ADDED
@@ -0,0 +1,29 @@
+ FROM pytorch/pytorch:2.2.1-cuda11.8-cudnn8-devel
+
+ # Set the working directory to /code
+ WORKDIR /code
+ COPY ./requirements.txt /code/requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ # RUN apt-get update && apt-get install -y --no-install-recommends git && \
+ #     apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+ USER user
+
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ ENV PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . $HOME/app
+ CMD ["python3", "app.py"]
README.md CHANGED
@@ -1,12 +1,13 @@
  ---
- title: Cudams
- emoji: 🌍
+ title: CudaMS
+ emoji: 🧬
  colorFrom: purple
  colorTo: blue
- sdk: streamlit
+ sdk: docker
  sdk_version: 1.32.2
+ app_port: 7860
  app_file: app.py
- pinned: false
+ pinned: true
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,4 +1,151 @@
- import streamlit as st

- x = st.slider('Select a value')
- st.write(x, 'squared is', x * x)
+ import gradio as gr
+ import torch
+ import os
+ from pathlib import Path
+ from matchms import Spectrum
+ from typing import List, Optional, Literal
+ # os.system("nvidia-smi")
+ # print("TORCH_CUDA", torch.cuda.is_available())

+ def preprocess_spectra(spectra: List[Spectrum]) -> List[Optional[Spectrum]]:
+     from matchms.filtering import select_by_intensity, \
+         normalize_intensities, \
+         select_by_relative_intensity, \
+         reduce_to_number_of_peaks, \
+         select_by_mz, \
+         require_minimum_number_of_peaks
+
+     def process_spectrum(spectrum: Spectrum) -> Optional[Spectrum]:
+         """
+         One of the many ways to preprocess the spectrum - we use this by default.
+         """
+         spectrum = select_by_mz(spectrum, mz_from=10.0, mz_to=1000.0)
+         spectrum = normalize_intensities(spectrum)
+         spectrum = select_by_relative_intensity(spectrum, intensity_from=0.001)
+         spectrum = reduce_to_number_of_peaks(spectrum, n_max=1024)
+         spectrum = require_minimum_number_of_peaks(spectrum, n_required=5)
+         return spectrum
+
+     spectra = list(process_spectrum(s) for s in spectra)  # Some might be None
+     return spectra
+
+ def run(r_filepath: Path, q_filepath: Path,
+         tolerance: float = 0.1,
+         mz_power: float = 0.0,
+         intensity_power: float = 1.0,
+         shift: float = 0,
+         batch_size: int = 2048,
+         n_max_peaks: int = 1024,
+         match_limit: int = 2048,
+         array_type: Literal['sparse', 'numpy'] = "numpy",
+         sparse_threshold: float = .75):
+     print('\n>>>>', r_filepath, q_filepath, array_type, '\n')
+     # debug = os.getenv('CUDAMS_DEBUG') == '1'
+     # if debug:
+     #     r_filepath = Path('tests/data/pesticides.mgf')
+     #     q_filepath = Path('tests/data/pesticides.mgf')
+
+     assert r_filepath is not None, "Reference file is missing."
+     assert q_filepath is not None, "Query file is missing."
+     import tempfile
+     import numpy as np
+     from cudams.similarity import CudaCosineGreedy
+     from matchms.importing import load_from_mgf
+     from matchms import calculate_scores
+     import matplotlib.pyplot as plt
+
+     refs = preprocess_spectra(list(load_from_mgf(str(r_filepath))))
+     ques = preprocess_spectra(list(load_from_mgf(str(q_filepath))))
+
+     # If we have few spectra, don't make a huge batch
+     if batch_size > max(len(refs), len(ques)):
+         batch_size = max(len(refs), len(ques))
+
+     scores_obj = calculate_scores(
+         refs, ques,
+         similarity_function=CudaCosineGreedy(
+             tolerance=tolerance,
+             mz_power=mz_power,
+             intensity_power=intensity_power,
+             shift=shift,
+             batch_size=batch_size,
+             n_max_peaks=n_max_peaks,
+             match_limit=match_limit,
+             sparse_threshold=sparse_threshold
+         ),
+         array_type=array_type
+     )
+
+     score_vis = tempfile.NamedTemporaryFile(suffix='.jpg', delete=False)
+
+     fig, axs = plt.subplots(1, 2,
+                             figsize=(10, 5),
+                             dpi=150)
+
+     scores = scores_obj.to_array()
+     ax = axs[0]
+     ax.imshow(scores['CudaCosineGreedy_score'])
+
+     ax = axs[1]
+     ax.imshow(scores['CudaCosineGreedy_matches'])
+
+     plt.suptitle("Score and matches")
+     plt.savefig(score_vis.name)
+
+     score = tempfile.NamedTemporaryFile(suffix='.npz', delete=False)
+     np.savez(score.name, scores=scores)
+
+     import pickle
+     pickle_ = tempfile.NamedTemporaryFile(suffix='.pickle', delete=False)
+
+     Path(pickle_.name).write_bytes(pickle.dumps(scores_obj))
+     return score.name, score_vis.name, pickle_.name
+
+ with gr.Blocks() as demo:
+     gr.Markdown("Run Cuda Cosine Greedy on your MGF files.")
+     with gr.Row():
+         refs = gr.File(label="Upload REFERENCES.mgf",
+                        interactive=True,
+                        value='tests/data/pesticides.mgf')
+         ques = gr.File(label="Upload QUERIES.mgf",
+                        interactive=True,
+                        value='tests/data/pesticides.mgf')
+     with gr.Row():
+         tolerance = gr.Slider(minimum=0, maximum=1, value=0.1, label="Tolerance")
+         mz_power = gr.Slider(minimum=0, maximum=2, value=0.0, label="mz_power")
+         intensity_power = gr.Slider(minimum=0, maximum=2, value=1.0, label="Intensity Power")
+         shift = gr.Slider(minimum=-10, maximum=10, value=0, label="Shift")
+     with gr.Row():
+         batch_size = gr.Number(value=2048, label="Batch Size",
+                                info='How many spectra to process pairwise in one step. Limited by GPU memory; the default works well on a T4 GPU.')
+         n_max_peaks = gr.Number(value=1024, label="Maximum Number of Peaks",
+                                 info="Some spectra are too large to fit on the GPU, "
+                                      "so we trim them down to their first "
+                                      "n_max_peaks peaks.")
+         match_limit = gr.Number(value=2048, label="Match Limit",
+                                 info="Two very similar spectra of size N and M can have N * M matches before filtering. "
+                                      "This doesn't fit on the GPU, so we stop accumulating matches once we have at most match_limit of them. "
+                                      "In practice, a value of 2048 gives more than 99.99% accuracy on GNPS.")
+     with gr.Row():
+         array_type = gr.Radio(['numpy', 'sparse'], value='numpy', type='value',
+                               label='How to handle outputs - if sparse, everything with a score below sparse_threshold is discarded. If numpy, sparse behaviour is disabled.')
+         sparse_threshold = gr.Slider(minimum=0, maximum=1, value=0.75, label="Sparse Threshold",
+                                      info="For very large comparisons (more than 10k x 10k), the dense score matrix can grow too large for RAM, "
+                                           "while most of the scores aren't useful (near zero). This argument discards all scores below sparse_threshold and returns "
+                                           "the results in SparseStack format.")
+     with gr.Row():
+         score_vis = gr.Image()
+
+     with gr.Row():
+         out_npz = gr.File(label="Download similarity matrix (.npz format)",
+                           interactive=False)
+         out_pickle = gr.File(label="Download full `Scores` object (.pickle format)",
+                              interactive=False)
+     btn = gr.Button("Run")
+     btn.click(fn=run, inputs=[refs, ques, tolerance, mz_power, intensity_power, shift,
+                               batch_size, n_max_peaks, match_limit,
+                               array_type, sparse_threshold], outputs=[out_npz, score_vis, out_pickle])
+
+ if __name__ == "__main__":
+     demo.launch(debug=True)
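
For reference, the same comparison can also be run headlessly, without the Gradio UI. The following is a minimal sketch that reuses the matchms and cudams calls from app.py above (load_from_mgf, calculate_scores, CudaCosineGreedy) and the bundled tests/data/pesticides.mgf example; it assumes the remaining CudaCosineGreedy parameters keep the same defaults as in run().

from matchms.importing import load_from_mgf
from matchms import calculate_scores
from cudams.similarity import CudaCosineGreedy

# Load the same example file the UI uses as its default value
spectra = list(load_from_mgf("tests/data/pesticides.mgf"))

# Compare the spectra against themselves with the defaults used in run()
scores_obj = calculate_scores(
    spectra, spectra,
    similarity_function=CudaCosineGreedy(tolerance=0.1, match_limit=2048),
    array_type="numpy",
)
scores = scores_obj.to_array()
print(scores['CudaCosineGreedy_score'].shape)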
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ matchms>=0.24.0
+ numba
+ torch
+ rdkit
+ pooch
+ h5py
+ pandas
+ tqdm
+ pyyaml
+ python-dotenv
+ joblib
+ pytest
+ cudams @ git+https://github.com/tornikeo/cudams@main
+ gradio
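
As a quick sanity check inside the built container, one could verify that a GPU is visible to both torch and numba before launching the app. This is only a sketch; it mirrors the commented-out diagnostics at the top of app.py and uses nothing beyond the packages listed above.

import torch
from numba import cuda

# CudaCosineGreedy runs CUDA kernels, so the container needs a visible GPU.
# These checks mirror the commented-out nvidia-smi / torch.cuda.is_available()
# lines in app.py.
print("torch sees CUDA:", torch.cuda.is_available())
print("numba sees CUDA:", cuda.is_available())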