Spaces:

ntranoslab
/

esm_variants

Running

File size: 4,337 Bytes

985a7dc
cf59915
985a7dc
 
 
 
 
 
 
073b569
985a7dc
f575805
9aa0dab
d37cbca
3977133
985a7dc
 
 
 
 
 
 
 
 
1ed98ce
 
 
 
 
 
 
 
 
 
 
 
985a7dc
d37cbca
985a7dc
c4368d7
 
985a7dc
42683c0
985a7dc
 
8b46687
985a7dc
 
 
 
 
 
 
 
 
 
 
 
 
 
d37cbca
 
ae9cb77
 
 
d37cbca
 
 
ae9cb77
 
 
d37cbca
 
 
ae9cb77
d37cbca
 
 
ae9cb77
 
 
 
d37cbca
ae9cb77
 
 
 
 
 
268ab01
ae9cb77
 
 
 
 
d37cbca
 
 
 
 
268ab01
d37cbca
 
 
 
 
 
fc3b97a
985a7dc
a087ee0
985a7dc
b6f7a3c
39a9e82
8c2f1e1
b6f7a3c
d37cbca
 
87fb83d
1ed98ce
 
 
 
208e06a
1ed98ce
 
a5dc7a3

import streamlit as st
st.set_page_config(layout="wide")
import pandas as pd
import numpy as np
from zipfile import ZipFile

import plotly.express as px
import plotly.graph_objs as go

# Zip archive opened by load_LLR(); members are looked up as `<id>_LLR.csv`
# under the archive's first entry.
LLR_FILE='ALL_hum_isoforms_ESM1b_LLR.zip'

# Isoform table. Its index supplies the id passed to load_LLR(), and its `txt`
# column is matched against the selectbox choice further down in this script.
df=pd.read_csv('isoform_list.csv',index_col=0)
# List of every index value of the isoform table.
uids=list(df.index.values)
# ClinVar annotations; plot_interactive() reads the LLR_file_id,
# ClinicalSignificance and variant columns.
clinvar = pd.read_csv('clinvar.csv.gz')

def load_LLR(uniprot_id):
  '''Loads the LLRs for a given uniprot id. Returns a 20xL dataframe 
     rows are indexed by AA change, 
     (AAorder=['K','R','H','E','D','N','Q','T','S','C','G','A','V','L','I','M','P','Y','F','W'])
     columns indexed by WT_AA+position e.g, "G 12"

     The CSV is read from the LLR_FILE zip archive; the member name is built as
     <first archive entry> + uniprot_id + '_LLR.csv', so the first entry of the
     archive is used as the path prefix.

     Raises KeyError if the archive has no member for uniprot_id.

     Usage example: load_LLR('P01116') or load_LLR('P01116-2')'''
  with ZipFile(LLR_FILE) as archive:
    member = archive.namelist()[0]+uniprot_id+'_LLR.csv'
    # Parse while the archive is still open and close the member handle
    # explicitly, instead of reading from a leaked handle after the archive
    # context has already exited.
    with archive.open(member) as handle:
      return pd.read_csv(handle,index_col=0)
  
def meltLLR(LLR,gene_prefix=None,ignore_pos=False):
  '''Reshapes a wide LLR dataframe into one row per variant.

     LLR: dataframe whose column labels look like "G 12" and whose index holds
          the substituted amino acid (the layout load_LLR documents).
     gene_prefix: when not None, every variant label becomes gene_prefix+'_'+label.
     ignore_pos: when True, the integer 'pos' column is not added.

     Returns a dataframe indexed by variant label (column label with spaces
     removed, followed by the row label, e.g. "G12K") with a 'score' column and,
     unless ignore_pos is set, a 'pos' column parsed from the label.'''
  long_form = LLR.melt(ignore_index=False)
  long_form['variant'] = [
      wt_pos.replace(' ', '') + alt
      for wt_pos, alt in zip(long_form['variable'], long_form.index)
  ]
  table = long_form.rename(columns={'value': 'score'}).set_index('variant')
  if not ignore_pos:
    # Everything between the first and last character of the label is the position.
    table['pos'] = [int(label[1:-1]) for label in table.index]
  table = table.drop(columns='variable')
  if gene_prefix is not None:
    table.index = gene_prefix+'_'+table.index
  return table

def plot_interactive(uniprot_id, show_clinvar=False):
  '''Builds the interactive LLR heatmap for one uniprot id.

     uniprot_id: id passed to load_LLR() and matched against clinvar.LLR_file_id.
     show_clinvar: when True, ClinVar variants labelled 'Pathogenic' or 'Benign'
                   are overlaid as marker traces (pathogenic trace added first).

     Returns the plotly figure.

     Note: the figure title is taken from the module-level `selection`, not from
     an argument, so `selection` must be defined before this is called.'''
  llr = load_LLR(uniprot_id)

  heatmap = px.imshow(
      llr.values,
      x=llr.columns,
      y=llr.index,
      color_continuous_scale='Viridis_r',
      zmin=-20,
      zmax=0,
      labels=dict(y="Amino acid change", x="Protein sequence", color="LLR"),
      template='plotly_white',
      title=selection,
  )
  # Initial view spans x range [0, 99]; the range slider exposes the rest.
  heatmap.update_xaxes(tickangle=-90, range=[0, 99], rangeslider=dict(visible=True), dtick=1)
  heatmap.update_yaxes(dtick=1)
  heatmap.update_layout(
      {'plot_bgcolor': 'rgba(0, 0, 0, 0)', 'paper_bgcolor': 'rgba(0, 0, 0, 0)'},
      font={'family': 'Arial', 'size': 11},
      hoverlabel=dict(font=dict(family='Arial', size=14)),
  )
  heatmap.update_traces(
      hovertemplate="<b>%{x} %{y}</b>" + " (%{z:.2f})" + '<extra></extra>'
  )

  if not show_clinvar:
    return heatmap

  annotated = clinvar[clinvar.LLR_file_id == uniprot_id]
  annotated = annotated[annotated.ClinicalSignificance.isin(['Benign','Pathogenic'])]
  benign = set(annotated[annotated.ClinicalSignificance=='Benign'].variant.values)
  pathogenic = set(annotated[annotated.ClinicalSignificance=='Pathogenic'].variant.values)

  benign_pts = {'x': [], 'y': [], 'llr': []}
  pathogenic_pts = {'x': [], 'y': [], 'llr': []}
  residues = list(llr.index)
  for column in llr.columns:
    # Column labels look like "G 12": drop the separator at position 1.
    stem = column[0] + column[2:]
    for residue in residues:
      variant = stem + residue
      # Benign membership is checked first, as a variant in both sets counts as benign.
      if variant in benign:
        bucket = benign_pts
      elif variant in pathogenic:
        bucket = pathogenic_pts
      else:
        continue
      bucket['x'].append(column)
      bucket['y'].append(residue)
      bucket['llr'].append(llr.loc[residue, column])

  marker_hover = "<b>%{x} %{y}</b>" + " (%{customdata:.2f})" + '<extra></extra>'
  # No marker colour is set here. NOTE(review): the red/green wording of the
  # checkbox label presumably relies on plotly's default colour sequence and on
  # this trace order (pathogenic, then benign) - confirm before reordering.
  for pts in (pathogenic_pts, benign_pts):
    heatmap.add_trace(go.Scatter(
        x=pts['x'],
        y=pts['y'],
        customdata=pts['llr'],
        mode='markers',
        marker=dict(size=8),
        showlegend=False,
        hovertemplate=marker_hover,
    ))

  return heatmap
  

# Protein picker, fed with the isoform table.
# NOTE(review): the default choice is the hard-coded position 6251, which
# presumably requires the option list to have more than 6251 entries - confirm
# against isoform_list.csv whenever that file changes.
selection = st.selectbox("uniprot_id:", df, index= 6251)
# Map the chosen label back to the table index (first row whose `txt` matches).
uid=df[df.txt==selection].index.values[0]

show_clinvar = st.checkbox('show ClinVar annotations (red: pathogenic, green: benign)',value=False)

# plot_interactive() also reads `selection` (defined above) for the figure title.
fig = plot_interactive(uid,show_clinvar=show_clinvar)
# NOTE(review): a fixed width is set here, but the chart is rendered with
# use_container_width=True, which presumably overrides it - confirm.
fig.update_layout(width = 800, height = 600, autosize = False)
st.plotly_chart(fig, use_container_width=True)

# Offers the long-format (meltLLR) table as a CSV named after the selected label.
# load_LLR(uid) is called again here, separately from the call inside plot_interactive().
st.download_button(
     label="Download data as CSV",
     data=meltLLR(load_LLR(uid)).to_csv(),
     file_name=selection+'.csv',
     mime='text/csv',
 )

# Footer pointing to the command-line tool for cases this page does not cover.
st.markdown("""
To obtain ESM effect scores for non-missense mutations (e.g. indels) or non-human proteins, 
please use the [esm-variants command-line tool](https://github.com/ntranoslab/esm-variants).
""")