"""Streamlit page comparing per-site deletion and substitution scores.

The page lets the user pick one entry from ``uniprot_ids_isoforms.tsv.gz``,
overlays that entry's per-site scores on a genome-wide sample in a scatter
plot, optionally shows each score along the sequence, and offers the
selected entry's table as a CSV download.
"""
import time
from zipfile import ZipFile

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import streamlit as st
from scipy.ndimage import gaussian_filter1d

np.random.seed(2024)

# One row per selectable entry; the text before the first comma is used as
# the identifier that names the CSV member inside the archive below.
uids = pd.read_csv(
    "uniprot_ids_isoforms.tsv.gz", names=["selection"], header=None, sep="\t"
)

# Per-entry tables are read on demand from this archive (one "<uid>.csv"
# member per selection) instead of loading a single merged table.
zf = ZipFile("ALL_hum_isoforms_ESM1b_del_sub.zip")
width = 600


def plot_interactive_scatter(uid: str):
    """Load the table for *uid* and build its scatter trace.

    Args:
        uid: Identifier whose ``{uid}.csv`` member is read from ``zf``.

    Returns:
        A ``(trace, data)`` pair: the orange marker trace plotting
        ``-log10(aPLLR)`` against ``avg_LLR`` (hover text is the ``site``
        column), and the DataFrame the trace was built from.

    Raises:
        KeyError: If the archive has no ``{uid}.csv`` member (raised by
            ``ZipFile.open``).
    """
    # Close the archive member as soon as pandas has parsed it.
    with zf.open(f"{uid}.csv") as member:
        user_data = pd.read_csv(member)

    user_trace = go.Scatter(
        x=-np.log10(user_data.aPLLR),
        y=user_data.avg_LLR,
        mode='markers',
        # Plotly renders "<br>" as a line break in legend entries.
        name=f"{uid}<br>Data",
        text=user_data.site,
        hoverinfo='text',
        marker=dict(color='orange'),
    )
    return user_trace, user_data


def plot_interactive_line(uid_data: pd.DataFrame, uid: str, score: str,
                          mutation: str, hline1: float, hline2: float):
    """Build a line figure of one score column along the sequence.

    Args:
        uid_data: Table with a ``site`` column and the column named by
            *score*.
        uid: Identifier shown in the figure title.
        score: Column to plot; ``"aPLLR"`` is shown as ``-log10(value)``,
            any other column is plotted unchanged.
        mutation: Label used in the title and the y-axis title.
        hline1: y-position of the first dashed reference line.
        hline2: y-position of the second dashed reference line.

    Returns:
        A ``plotly.graph_objects.Figure`` with one line trace and two
        dashed horizontal lines spanning the full plot width.
    """
    if score == "aPLLR":
        values = -np.log10(uid_data[score])
    else:
        values = uid_data[score]

    hover_text = [
        f"{site}: {np.round(value, 3)}"
        for site, value in zip(uid_data.site, values)
    ]

    line_trace = go.Scatter(
        # Positions are numbered 1..N in row order.
        x=np.arange(1, len(uid_data) + 1),
        y=values,
        mode='lines',
        text=hover_text,
        hoverinfo='text',
        marker=dict(color='orange'),
    )

    line_fig = go.Figure(data=[line_trace])
    line_fig.update_layout(
        title=f"{uid} {mutation} Scores by Position",
        # "<br>" splits the axis title onto two lines.
        yaxis_title=f'{mutation} Score<br>(More Negative = More Damaging)',
        yaxis=dict(showgrid=False, zeroline=False, showline=False),
        height=300,
        hoverlabel=dict(
            font=dict(size=16),  # font size of the hover text
        ),
    )

    for hline in (hline1, hline2):
        line_fig.add_shape(
            type='line',
            # x in paper coordinates: 0..1 spans the whole plotting area.
            x0=0, x1=1,
            y0=hline, y1=hline,
            xref='paper', yref='y',
            line=dict(color='Black', dash='dash'),
        )
    return line_fig


# The preselected option is the row at this fixed position in ``uids``.
selection = st.selectbox("", uids.selection, index=26592)
selection_uid = selection.split(",")[0]

# Base dataset: background cloud drawn behind the selected entry.
base_data = pd.read_csv("rand_samp_gw_del_sub.csv.gz")

base_trace = go.Scatter(
    x=-np.log10(base_data.aPLLR),
    y=base_data.avg_LLR,
    mode='markers',
    name='Sample of<br>Genome-Wide<br>Data',
    hoverinfo='none',  # no hover information for the base data
    marker=dict(color='grey'),
)

# User-specified data
ut, ud = plot_interactive_scatter(selection_uid)

# Combine traces; the selected entry is drawn on top of the base sample.
fig = go.Figure([base_trace, ut])

fig.update_layout(
    title='Deletion v Substitution Effects',
    xaxis_title='Deletion Score',
    yaxis_title='Substitution Score',
    yaxis=dict(showgrid=False, showline=False, zeroline=False),
    legend=dict(
        font=dict(size=15),  # font size of the legend text
        bordercolor="grey",
        borderwidth=1,
    ),
    hoverlabel=dict(
        font=dict(size=16),  # font size of the hover text
    ),
)
fig.update_yaxes(showgrid=False)

# Percentile cutoffs are hard-coded (extracted ahead of time) to avoid
# reading the entire dataset into memory.
del_bot, del_top = 0.147907659054341, -0.8033614237502615
for del_cutoff in (del_bot, del_top):
    # Vertical line across the full plot height at each deletion cutoff.
    fig.add_shape(
        type='line',
        x0=del_cutoff, x1=del_cutoff,
        y0=0, y1=1,
        xref='x', yref='paper',
        line=dict(color='Black', width=2),
    )

sub_bot, sub_top = -12.294105263157894, -4.898842105263157
for sub_cutoff in (sub_bot, sub_top):
    # Horizontal line across the full plot width at each substitution cutoff.
    fig.add_shape(
        type='line',
        x0=0, x1=1,
        y0=sub_cutoff, y1=sub_cutoff,
        xref='paper', yref='y',
        line=dict(color='Black', width=2),
    )

# Corner labels placed at fixed data coordinates.
fig.add_annotation(
    x=2.5, y=-18,
    text=r"D+S",
    font=dict(color="green", size=24),
    showarrow=False,
)
fig.add_annotation(
    x=-1.5, y=0.5,
    text=r"DS+",
    font=dict(color="red", size=24),
    showarrow=False,
)

# Per-position figures reuse the scatter plot's cutoffs as reference lines.
lt_apllr = plot_interactive_line(
    ud, selection_uid, "aPLLR", "Deletion", del_bot, del_top
)
lt_llr = plot_interactive_line(
    ud, selection_uid, "avg_LLR", "Substitution", sub_bot, sub_top
)

# Show the scatter plot
st.plotly_chart(fig)

show_line_plots = st.checkbox("Show Deletion and Substitution Effects Alone")
if show_line_plots:
    st.plotly_chart(lt_apllr)
    st.plotly_chart(lt_llr)

st.download_button(
    label=f"Download {selection_uid} data as CSV",
    data=ud.reset_index(drop=True)[["site", "aPLLR", "avg_LLR"]].to_csv(),
    file_name=f"{selection_uid}_del_sub.csv",
    mime='text/csv',
)
# README text shown in the app. Each "- " item sits on its own line so that
# Markdown renders the entries as a bullet list instead of one paragraph.
st.markdown("""
**README**:
- Deletion scores are *visualized* on the -log10 scale.
- The genome-wide dataset can be downloaded by clicking [here](https://huggingface.co/spaces/goldmangrant/diff-tol/blob/main/ALL_hum_isoforms_ESM1b_del_sub.zip) (or go to files tab).
- Non-aggregated substitution effects can be downloaded or browsed [here](https://huggingface.co/spaces/ntranoslab/esm_variants).
- Additional supplementary data from the paper can be downloaded [here](https://github.com/ntranoslab/diff-tol).
""")