Spaces:
Runtime error
Runtime error
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
import time | |
import plotly.graph_objects as go | |
from scipy.ndimage import gaussian_filter1d | |
from zipfile import ZipFile | |
# Seed NumPy's global random number generator.
np.random.seed(2024)

# Module-level width value (not referenced by the code visible in this file).
width = 600

# Archive that holds one "<uid>.csv" member per identifier; it stays open so
# plot_interactive_scatter can read members from it on demand.
zf = ZipFile("ALL_hum_isoforms_ESM1b_del_sub.zip")

# Entries offered in the selection box: a headerless, tab-separated file whose
# first column is loaded under the name "selection".
uids = pd.read_csv(
    "uniprot_ids_isoforms.tsv.gz",
    sep="\t",
    header=None,
    names=["selection"],
)
# Disabled: del_sub_merge = pd.read_csv("del_sub_data.csv.gz")
def plot_interactive_scatter(uid: str):
    """Load the per-site scores for ``uid`` and build its scatter trace.

    The scores are read from the member ``f"{uid}.csv"`` of the module-level
    archive ``zf``.

    Args:
        uid: Identifier used to form the archive member name.

    Returns:
        A ``(trace, data)`` tuple: an orange marker-only ``go.Scatter`` with
        ``-log10(aPLLR)`` on x and ``avg_LLR`` on y (hover shows the ``site``
        value), and the ``DataFrame`` the trace was built from.

    Raises:
        KeyError: If the archive has no member named ``f"{uid}.csv"``.
    """
    # Close the archive member as soon as it has been parsed instead of
    # leaving the handle open until garbage collection.
    with zf.open(f"{uid}.csv") as member:
        user_data = pd.read_csv(member)

    # Create scatter plot for user-specified data
    user_trace = go.Scatter(
        x=-np.log10(user_data.aPLLR),
        y=user_data.avg_LLR,
        mode='markers',
        name=f"{uid}<br>Data",
        text=user_data.site,
        hoverinfo='text',
        marker=dict(color='orange'))
    return user_trace, user_data
def plot_interactive_line(uid_data: pd.DataFrame, uid: str, score: str, mutation: str,
                          hline1: float, hline2: float):
    """Build a line figure of one score column against row position.

    Args:
        uid_data: Table containing a ``site`` column and the column named by
            ``score``.
        uid: Identifier shown in the figure title.
        score: Name of the column to plot. ``"aPLLR"`` is plotted as
            ``-log10`` of the column; any other column is plotted unchanged.
        mutation: Label used in the figure title and the y-axis title.
        hline1: y-value of the first dashed horizontal reference line.
        hline2: y-value of the second dashed horizontal reference line.

    Returns:
        A ``go.Figure`` holding the line trace and both reference lines.
    """
    values = uid_data[score]
    if score == "aPLLR":
        values = -np.log10(values)

    # Hover shows the site label with the plotted value rounded to 3 decimals.
    hover_text = [f"{x}: {np.round(y, 3)}" for x, y in zip(uid_data.site, values)]

    line_trace = go.Scatter(
        # The x axis is the 1-based row position, not the site label.
        x=np.arange(1, len(uid_data)+1),
        y=values,
        mode='lines',
        text=hover_text,
        hoverinfo='text',
        marker=dict(color='orange')
    )
    line_fig = go.Figure(data=[line_trace])
    line_fig.update_layout(
        title=f"{uid} {mutation} Scores by Position",
        yaxis_title=f'{mutation} Score<br>(More Negative = More Damaging)',
        yaxis=dict(showgrid=False, zeroline=False, showline=False),
        height=300,
        hoverlabel=dict(  # Set hover label font size
            font=dict(size=16)  # Specify the font size of the hover text
        )
    )

    # Dashed reference lines spanning the full plot width (x in paper
    # coordinates, y in data coordinates).
    for hline in [hline1, hline2]:
        line_fig.add_shape(
            type='line',
            x0=0, x1=1, y0=hline, y1=hline,
            xref='paper', yref='y',
            line=dict(color='Black', dash='dash'),
        )
    return line_fig
# Entry picker. The first comma-separated field of the chosen entry is the
# identifier used to look up that entry's CSV and to label the plots.
selection = st.selectbox("", uids.selection, index=26592)
selection_uid = selection.split(",")[0]

# Base dataset
base_data = pd.read_csv("rand_samp_gw_del_sub.csv.gz")

# Create base scatter plot
base_trace = go.Scatter(
    x=-np.log10(base_data.aPLLR),
    y=base_data.avg_LLR,
    mode='markers',
    name='Sample of<br>Genome-Wide<br>Data',
    hoverinfo='none',  # Disable hover information for the base data
    marker=dict(color='grey')
)

# User-specified data
ut, ud = plot_interactive_scatter(selection_uid)

# Combine traces
fig = go.Figure([base_trace, ut])

# Customize layout
fig.update_layout(
    title='Deletion v Substitution Effects',
    xaxis_title='Deletion Score',
    yaxis_title='Substitution Score',
    yaxis=dict(showgrid=False, showline=False, zeroline=False),
    legend=dict(
        font=dict(size=15),  # Specify the font size of the legend text
        bordercolor="grey",
        borderwidth=1
    ),
    hoverlabel=dict(  # Set hover label font size
        font=dict(size=16)  # Specify the font size of the hover text
    )
)
fig.update_yaxes(showgrid=False)

# Percentile cutoffs are hard-coded here rather than computed, to avoid
# reading the entire dataset into memory.
# Deletion cutoffs are x-values on the plotted -log10 scale.
del_bot, del_top = 0.147907659054341, -0.8033614237502615
for del_cutoff in [del_bot, del_top]:
    # Vertical line spanning the full plot height.
    fig.add_shape(
        type='line',
        x0=del_cutoff, x1=del_cutoff, y0=0, y1=1,
        xref='x', yref='paper',
        line=dict(color='Black', width=2)
    )

# Substitution cutoffs are y-values.
sub_bot, sub_top = -12.294105263157894, -4.898842105263157
for sub_cutoff in [sub_bot, sub_top]:
    # Horizontal line spanning the full plot width.
    fig.add_shape(
        type='line',
        x0=0, x1=1, y0=sub_cutoff, y1=sub_cutoff,
        xref='paper', yref='y',
        line=dict(color='Black', width=2),
    )

# Text labels placed at fixed data coordinates.
fig.add_annotation(
    x=2.5,
    y=-18,
    text=r"D<sup>+</sup>S<sup>—</sup>",
    font=dict(color="green", size=24),
    showarrow=False
)
fig.add_annotation(
    x=-1.5,
    y=0.5,
    text=r"D<sup>—</sup>S<sup>+</sup>",
    font=dict(color="red", size=24),
    showarrow=False
)

# Show the scatter plot
st.plotly_chart(fig)

show_line_plots = st.checkbox("Show Deletion and Substitution Effects Alone")
if show_line_plots:
    # Build the per-position figures only when they are going to be shown;
    # Streamlit re-executes this script on every interaction, so building
    # them unconditionally is wasted work whenever the box is unchecked.
    lt_apllr = plot_interactive_line(ud, selection_uid, "aPLLR", "Deletion", del_bot, del_top)
    lt_llr = plot_interactive_line(ud, selection_uid, "avg_LLR", "Substitution", sub_bot, sub_top)
    st.plotly_chart(lt_apllr)
    st.plotly_chart(lt_llr)

# Offer the selected entry's site / aPLLR / avg_LLR columns as a CSV download.
st.download_button(
    label=f"Download {selection_uid} data as CSV",
    data=ud.reset_index(drop=True)[["site", "aPLLR", "avg_LLR"]].to_csv(),
    file_name=f"{selection_uid}_del_sub.csv",
    mime='text/csv'
)

st.markdown("""
**README**:
- Deletion scores are *visualized* on the -log10 scale.
- The genome-wide dataset can be downloaded by clicking [here](https://huggingface.co/spaces/goldmangrant/diff-tol/blob/main/ALL_hum_isoforms_ESM1b_del_sub.zip) (or go to files tab).
- Non-aggregated substitution effects can be downloaded or browsed [here](https://huggingface.co/spaces/ntranoslab/esm_variants).
- Additional supplementary data from the paper can be downloaded [here](https://github.com/ntranoslab/diff-tol).
""")