diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -1,10 +1,9 @@ -#from turtle import shape import streamlit as st -#from st_keyup import st_keyup import pandas as pd import numpy as np from st_aggrid import AgGrid, GridOptionsBuilder,GridUpdateMode,DataReturnMode - +from iteration_utilities import duplicates +from iteration_utilities import unique_everseen import os st.set_page_config(layout="wide") @@ -24,9 +23,114 @@ caution2 = '

Pleas table_edit = '

About Table: Please note that table can be sorted by clicking on any column and Multiple rows can be selected (by clicking check box in first column) to save only those rows.

' caution_genes = '

Please make sure that desired genes from all three lists should be selected to generate Order Ready Table.

' + +#READ INPUT FILES + +cwd=os.getcwd()+'/'+'data/' + +#Here, gene column is modified for non-targeting guides in the format sgID_1|sgID_2 for coherent downstream manipulation +listA = pd.read_csv(cwd+"guides_a_new.csv",index_col=False) +listB = pd.read_csv(cwd+"guides_b_new.csv",index_col=False) +listC = pd.read_csv(cwd+"guides_c_new.csv",index_col=False) + +lista_sz=listA.shape[0] +listb_sz=listB.shape[0] +listc_sz=listC.shape[0] +#st.write(listA.shape) +variantsa1=listA['gene'].unique() +variantsb1=listB['gene'].unique() +variantsc1=listC['gene'].unique() +#Make a comprehensive lsit of genes in all 3 lists (Please not that non-targeting guide names are not same across three lists) +con = np.concatenate((variantsa1, variantsb1, variantsc1)) +variants_s=sorted(np.unique(con)) + +#NOW read GRCh38 and LR guides for stea as identified by LR-Guides pipeline +#Format is: gene (as many entries as number of guides found, both matched and mutated), ref_guide, chr, position, mutated_guide (can also be same as reference), strand, num_mismatcg (excluding leading G), Please note that each guide has trailing NGG +listA_found_ref = pd.read_csv(cwd+"seta_found_ref1.csv",index_col=False) +listA_found_ref = listA_found_ref.sort_values('gene') +lsita_ref_found_sz=listA_found_ref.shape[0] +#remove # from chr# # +listA_found_ref['chr'] = [x.split(' ')[-0] for x in listA_found_ref['chr']] +listA_found_ref.rename(columns = {'strnad':'strand'}, inplace = True) #Also change strnad to strand (was misspelled in LR-Guides pipeline) +#This (all such) file has 2-columns (gene as given in sgID_1/2, ref_guide). 
+listA_notfound_ref = pd.read_csv(cwd+"seta_notfound_ref1.csv",index_col=False) +listA_notfound_ref=listA_notfound_ref.sort_values('gene') +lsita_ref_notfound_sz=listA_notfound_ref.shape[0] +#LR guides +listA_found_lr = pd.read_csv(cwd+"seta_found_LR1.csv",index_col=False) +listA_found_lr=listA_found_lr.sort_values('gene') +lsita_lr_found_sz=listA_found_lr.shape[0] +listA_found_lr.rename(columns = {'strnad':'strand'}, inplace = True) +listA_notfound_lr = pd.read_csv(cwd+"seta_notfound_LR1.csv",index_col=False) +listA_notfound_lr=listA_notfound_lr.sort_values('gene') +lsita_lr_notfound_sz=listA_notfound_lr.shape[0] + +#Also read GRCh38 and LR guides for set b +listB_found_ref = pd.read_csv(cwd+"setb_found_ref1.csv",index_col=False) +listB_found_ref=listB_found_ref.sort_values('gene') +lsitb_ref_found_sz=listB_found_ref.shape[0] +#remove # from chr# # +listB_found_ref['chr'] = [x.split(' ')[-0] for x in listB_found_ref['chr']] +listB_found_ref=listB_found_ref.sort_values('gene') +listB_found_ref.rename(columns = {'strnad':'strand'}, inplace = True) +listB_notfound_ref = pd.read_csv(cwd+"setb_notfound_ref1.csv",index_col=False) +listB_notfound_ref=listB_notfound_ref.sort_values('gene') +lsitb_ref_notfound_sz=listB_notfound_ref.shape[0] + + +listB_found_lr = pd.read_csv(cwd+"setb_found_LR1.csv",index_col=False) +listB_found_lr=listB_found_lr.sort_values('gene') +lsitb_lr_found_sz=listB_found_lr.shape[0] +listB_found_lr.rename(columns = {'strnad':'strand'}, inplace = True) +listB_notfound_lr = pd.read_csv(cwd+"setb_notfound_LR1.csv",index_col=False) +listB_notfound_lr=listB_notfound_lr.sort_values('gene') +lsitb_lr_notfound_sz=listB_notfound_lr.shape[0] + +#Also read GRCh38 and LR guides for set c +listC_found_ref = pd.read_csv(cwd+"setc_found_ref1.csv",index_col=False) +listC_found_ref=listC_found_ref.sort_values('gene') +lsitc_ref_found_sz=listC_found_ref.shape[0] +#remove # from chr# # +listC_found_ref['chr'] = [x.split(' ')[-0] for x in listC_found_ref['chr']] 
+listC_found_ref.rename(columns = {'strnad':'strand'}, inplace = True) +listC_notfound_ref = pd.read_csv(cwd+"setc_notfound_ref1.csv",index_col=False) +listC_notfound_ref=listC_notfound_ref.sort_values('gene') +lsitc_ref_notfound_sz=listC_notfound_ref.shape[0] + +listC_found_lr = pd.read_csv(cwd+"setc_found_LR1.csv",index_col=False) +listC_found_lr=listC_found_lr.sort_values('gene') +lsitc_lr_found_sz=listC_found_lr.shape[0] +listC_found_lr.rename(columns = {'strnad':'strand'}, inplace = True) +listC_notfound_lr = pd.read_csv(cwd+"setc_notfound_LR1.csv",index_col=False) +listC_notfound_lr=listC_notfound_lr.sort_values('gene') +lsitc_lr_notfound_sz=listC_notfound_lr.shape[0] + + +#This for all guides order table +set_start=0 + +regular_lista=listA[~listA['gene'].str.contains('non-targeting')]['sgID_AB']#[['gene','guide_type','protospacer_A','protospacer_B','sgID_AB']] +regular_lista=regular_lista.sort_values() +set_end=regular_lista.shape[0] #18905 +#regular_lista=regular_lista.iloc[set_start:set_end] +non_targeting_lista=listA[listA['gene'].str.contains('non-targeting')]['sgID_AB']#[['gene','guide_type','protospacer_A','protospacer_B','sgID_AB']] +non_targeting_lista=non_targeting_lista.sort_values() +#regular_lista=regular_lista.reset_index() +regular_listb=listB[~listB['gene'].str.contains('non-targeting')]['sgID_AB']#[['gene','guide_type','protospacer_A','protospacer_B','sgID_AB']] +regular_listb=regular_listb.sort_values() +#regular_listb=regular_listb.iloc[set_start:set_end] +non_targeting_listb=listB[listB['gene'].str.contains('non-targeting')]['sgID_AB']#[['gene','guide_type','protospacer_A','protospacer_B','sgID_AB']] +non_targeting_listb=non_targeting_listb.sort_values() + +#regular_listb=regular_listb.reset_index() +regular_listc=listC[~listC['gene'].str.contains('non-targeting')]['sgID_AB']#[['gene','guide_type','protospacer_A','protospacer_B','sgID_AB']] +regular_listc=regular_listc.sort_values() +#regular_listc=regular_listc[set_start:set_end] 
+non_targeting_listc=listC[listC['gene'].str.contains('non-targeting')]['sgID_AB']#[['gene','guide_type','protospacer_A','protospacer_B','sgID_AB']] +non_targeting_listc=non_targeting_listc.sort_values() + +#GENERAL FUNCTIONS def transform(df,str): - # Select columns - #cols = st.multiselect('Please select columns to save current Table as csv file', cols = st.multiselect(str, df.columns.tolist(), df.columns.tolist() @@ -39,19 +143,6 @@ def convert_df(df): def convert_df1(df): return df.to_csv(index=False).encode('utf-8') - -# CSS to inject contained in a string -hide_table_row_index = """ - - """ - -# Inject CSS with Markdown -st.markdown(hide_table_row_index, unsafe_allow_html=True) - - #########TABLE DISPLAY def tbl_disp(dat,var,ref,key,flg=1): dat.reset_index(drop=True, inplace=True) @@ -67,16 +158,11 @@ def tbl_disp(dat,var,ref,key,flg=1): mime='text/csv', #key=key, ) - #st.table(dft) - #st.markdown(table_edit,unsafe_allow_html=True) gb = GridOptionsBuilder.from_dataframe(dat) gb.configure_pagination(enabled=False)#,paginationAutoPageSize=False)#True) #Add pagination gb.configure_default_column(enablePivot=True, enableValue=True, enableRowGroup=True) gb.configure_selection(selection_mode="multiple", use_checkbox=True) gb.configure_column("gene", headerCheckboxSelection = True) - - - gb.configure_side_bar() gridOptions = gb.build() @@ -114,42 +200,32 @@ def tbl_disp(dat,var,ref,key,flg=1): ) return dfs - - -def assemble_tbl(t): - dft = pd.DataFrame(columns=['sgID_1','sgRNA_1','chr_sgRNA_1','position_sgRNA_1','sgID_2','sgRNA_2','chr_sgRNA_2','position_sgRNA_2', 'sgID_1_2']) - for i in range(0,t.shape[0],2): - l1=t.iloc[[i]] - l1.columns=['sgID_1','sgRNA_1','chr_sgRNA_1','position_sgRNA_1','mutated_guide', 'strand', 'num_mismatch'] - - l2=t.iloc[[i+1]] - l2.columns=['sgID_2','sgRNA_2','chr_sgRNA_2','position_sgRNA_2','mutated_guide2', 'strand2', 'num_mismatch2'] - listA_concatenated_match_LR1=pd.concat([l1.reset_index(drop=True),l2.reset_index(drop=True)],axis=1) 
- listA_concatenated_match_LR1=listA_concatenated_match_LR1[['sgID_1','sgRNA_1','chr_sgRNA_1','position_sgRNA_1','sgID_2','sgRNA_2','chr_sgRNA_2','position_sgRNA_2']] - listA_concatenated_match_LR1['sgRNA_1']=listA_concatenated_match_LR1['sgRNA_1'].str.slice(0, 20) - listA_concatenated_match_LR1['sgRNA_2']=listA_concatenated_match_LR1['sgRNA_2'].str.slice(0, 20) - listA_concatenated_match_LR1['sgID_1_2']=listA_concatenated_match_LR1['sgID_1']+"|"+listA_concatenated_match_LR1['sgID_1'] - dft=dft.append(listA_concatenated_match_LR1) - - return dft - def get_lists(ref_list,list_found_ref,list_notfound_ref): - a_ref=[] + #This module retrieves guide_id and searches for guide sequences from the table + #st.table(ref_list) + a_ref=[] + #st.table(ref_list) for i in range(len(ref_list)): - a_ref.append(ref_list.gene.values[i].split('|')[0]) - a_ref.append(ref_list.gene.values[i].split('|')[1]) + a_ref.append(ref_list.sgID_AB.values[i].split('|')[0]) + a_ref.append(ref_list.sgID_AB.values[i].split('|')[1]) set_found0_ref=[] + #st.table(a_ref) for i in range(len(a_ref)): set_found0_ref.append(list_found_ref[list_found_ref['gene']==a_ref[i]]) + #st.write(set_found0_ref) list_concatenated_found_ref = pd.concat(set_found0_ref) - list_concatenated_match_ref = list_concatenated_found_ref[list_concatenated_found_ref.num_mismatch == 0] + list_concatenated_match_ref = list_concatenated_found_ref[list_concatenated_found_ref.num_mismatch == 0] #only select guides with zero mismatches for match list, MISSMATCH LIST LATER #Also remove Alternate loci's data list_concatenated_match_ref = list_concatenated_match_ref[list_concatenated_match_ref['chr'].str.contains('chr')] - + #st.table(list_concatenated_match_ref) #also create new list with both sgRNAs in one row dft=pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) + + guideflg1=1 + #st.table(list_concatenated_match_ref) if list_concatenated_match_ref.shape[0]>0: + guideflg1=0 
t=list_concatenated_match_ref.reset_index(drop=True) #st.table(t) @@ -169,17 +245,27 @@ def get_lists(ref_list,list_found_ref,list_notfound_ref): while i 0: - guideflg1=2 - if seta_notfound1_ref.shape[0]>0: - guideflg1=1 list_concatenated_notfound_ref = pd.concat([seta_notfound0_ref,seta_notfound1_ref]) - #st.table(dft) - #st.table(dft_mut) - return dft, dft_mut,list_concatenated_notfound_ref,list_concatenated_match_ref,list_concatenated_mutated_ref,guideflg1 + + return dft.iloc[:1], dft_mut,list_concatenated_notfound_ref,list_concatenated_match_ref,list_concatenated_mutated_ref,guideflg1 ########### - def get_mutated_res(list_concatenated_mutated_ref): ######### #if list_concatenated_mutated_ref.shape[0]>0: t=list_concatenated_mutated_ref.reset_index(drop=True) + #st.table(t) dft_mut = pd.DataFrame(columns=['sgID_1','sgRNA_1','chr_sgRNA_1','position_sgRNA_1','sgID_2','sgRNA_2','chr_sgRNA_2','position_sgRNA_2', 'sgID_1_2']) c1=['sgID_1','sgRNA_1','chr_sgRNA_1','position_sgRNA_1'] @@ -241,85 +318,11 @@ def get_mutated_res(list_concatenated_mutated_ref): list_concatenated_mutated_ref1['sgRNA_2']='G'+list_concatenated_mutated_ref1['sgRNA_2'].str.slice(1, 20) list_concatenated_mutated_ref1['sgID_1_2']=list_concatenated_mutated_ref1['sgID_1']+"|"+list_concatenated_mutated_ref1['sgID_1'] - dft_mut=dft_mut.append(list_concatenated_mutated_ref1) - return dft_mut + #dft_mut=dft_mut.append(list_concatenated_mutated_ref1) + dft_mut=pd.concat([dft_mut,list_concatenated_mutated_ref1]) - ######### - -#######THIS SECTION ADDED FOR ORDER READY LIST AND REMOVE REPITION FOR NOT_FOUND ENTRUES -def get_lists_ol(ref_list,list_found_ref,list_notfound_ref): - a_ref=[] - for i in range(len(ref_list)): - a_ref.append(ref_list.gene.values[i].split('|')[0]) - a_ref.append(ref_list.gene.values[i].split('|')[1]) - - set_found0_ref=[] - for i in range(len(a_ref)): - set_found0_ref.append(list_found_ref[list_found_ref['gene']==a_ref[i]]) - list_concatenated_found_ref = pd.concat(set_found0_ref) 
- list_concatenated_match_ref = list_concatenated_found_ref[list_concatenated_found_ref.num_mismatch == 0] - #Also remove Alternate loci's data - list_concatenated_match_ref = list_concatenated_match_ref[list_concatenated_match_ref['chr'].str.contains('chr')] - - #also create new list with both sgRNAs in one row - dft=pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) - if list_concatenated_match_ref.shape[0]>0: - t=list_concatenated_match_ref.reset_index(drop=True) - #st.table(t) - - ########## - #check even/odd entries - if t.shape[0]==1: - t1=t.loc[t.index.repeat(2)].reset_index(drop=True) - #st.write(t1) - dft=assemble_tbl(t1) - - elif t.shape[0]%2==0: #even - dft=assemble_tbl(t) - - else: #odd - t1 = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) - i=0 - while i 0] - list_concatenated_mutated_ref=list_concatenated_mutated_ref.sort_values('position') - - #Also remove Alternate loci's data + return dft_mut - list_concatenated_mutated_ref = list_concatenated_mutated_ref[list_concatenated_mutated_ref['chr'].str.contains('chr')] - dft_mut = pd.DataFrame(columns=['sgID_1','sgRNA_1','chr_sgRNA_1','position_sgRNA_1','sgID_2','sgRNA_2','chr_sgRNA_2','position_sgRNA_2', 'sgID_1_2']) - if list_concatenated_mutated_ref.shape[0]>0: - dft_mut = get_mutated_res(list_concatenated_mutated_ref) - #check not found - seta_notfound0_ref=list_notfound_ref[list_notfound_ref['gene']==a_ref[0]] - seta_notfound1_ref=list_notfound_ref[list_notfound_ref['gene']==a_ref[1]] - list_concatenated_notfound_ref = pd.concat([seta_notfound0_ref,seta_notfound1_ref]) - return dft, dft_mut,list_concatenated_notfound_ref,list_concatenated_match_ref,list_concatenated_mutated_ref - ########### - - -#THIS WILL GENERATE ORDER READY TABLE FOR GRCh38 -#THIS WILL GENERATE ORDER READY TABLE FOR CHM13 - -#CHECK IF GUIDE ARE IN NOT FOUND LIST def 
not_found_check(set12,set34,set56,listA_notfound_lr,listB_notfound_lr,listC_notfound_lr): flg11=0 flg12=0 @@ -343,16 +346,21 @@ def not_found_check(set12,set34,set56,listA_notfound_lr,listB_notfound_lr,listC_ flg32=1 return flg11,flg12,flg21,flg22,flg31,flg32 -def order_ready_tbl_CHM13(set12,set34,set56,listA_found_lr,listA_notfound_lr,listB_found_lr,listB_notfound_lr,listC_found_lr,listC_notfound_lr): +def order_ready_tbl_CHM13(set12,set34,set56,listA_found_lr,listA_notfound_lr,listB_found_lr,listB_notfound_lr,listC_found_lr,listC_notfound_lr,ref_sel): + # st.table(set12) + # st.table(set34) + # st.table(set56) dft_order_table=pd.DataFrame(columns=['gene','guide_type','sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) - + dft_notfound_all=pd.DataFrame(columns=['gene','sgID_AB','guide_type','protospacer_A','protospacer_B']) + + #dft_notfound=pd.DataFrame(columns=['gene','ref_guide']) + dft_a = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) dft_b = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) dft_c = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) set12=set12.reset_index(drop = True) set34=set34.reset_index(drop = True) set56=set56.reset_index(drop = True) - for i in range(set12.shape[0]): gene_n=set12[i].split('_')[0] f=not_found_check(set12[i],set34[i],set56[i],listA_notfound_lr,listB_notfound_lr,listC_notfound_lr) @@ -360,43 +368,37 @@ def order_ready_tbl_CHM13(set12,set34,set56,listA_found_lr,listA_notfound_lr,lis #st.write(set12[i],set34[i],set56[i]) #ref_listA=listA[listA['gene']==variant_set.iloc[i]][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listA=listA[listA['sgID_AB']==set12.iloc[i]][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listA = ref_listA[['sgID_AB','guide_type','protospacer_A','protospacer_B']] - - 
ref_listA.columns=['gene','guide_type','protospacer_A','protospacer_B'] + ref_listA=listA[listA['sgID_AB']==set12.iloc[i]][['gene','guide_type','protospacer_A','protospacer_B','sgID_AB']] + ref_listA = ref_listA[['gene','sgID_AB','guide_type','protospacer_A','protospacer_B']] + #st.write(ref_listA) + #ref_listA.columns=['gene','guide_type','protospacer_A','protospacer_B'] resa,res_muta,res_notfounda,list_matcha,list_mutateda,gflga1=get_lists(ref_listA,listA_found_lr,listA_notfound_lr) - dft_a=dft_a.append(ref_listA) + #dft_a=dft_a.append(ref_listA) #listb ref_listB=listB[listB['sgID_AB']==set34.iloc[i]][['guide_type','protospacer_A','protospacer_B','sgID_AB']] ref_listB = ref_listB[['sgID_AB','guide_type','protospacer_A','protospacer_B']] - ref_listB.columns=['gene','guide_type','protospacer_A','protospacer_B'] + #ref_listB.columns=['gene','guide_type','protospacer_A','protospacer_B'] resb,res_mutb,res_notfoundb,list_matchb,list_mutatedb,gflgb1=get_lists(ref_listB,listB_found_lr,listB_notfound_lr) - dft_b=dft_b.append(ref_listB) - #st.table(not resb.empty) - #st.table(res_mutb) - #st.table(resb) + #dft_b=dft_b.append(ref_listB) #listc ref_listC=listC[listC['sgID_AB']==set56.iloc[i]][['guide_type','protospacer_A','protospacer_B','sgID_AB']] ref_listC = ref_listC[['sgID_AB','guide_type','protospacer_A','protospacer_B']] - ref_listC.columns=['gene','guide_type','protospacer_A','protospacer_B'] + #ref_listC.columns=['gene','guide_type','protospacer_A','protospacer_B'] resc,res_mutc,res_notfoundc,list_matchc,list_mutatedc,gflgc1=get_lists(ref_listC,listC_found_lr,listC_notfound_lr) - dft_c=dft_c.append(ref_listC) - - # st.write(set12[i]) - # st.write(set34[i]) - # st.write(set56[i]) - # st.write(f) + #dft_c=dft_c.append(ref_listC) + #st.table(ref_listA) # st.write(gflga1,gflgb1,gflgc1) if gflga1==0: #Also verigy that both guides are different - + #st.table(resa) if resa['sgID_1'][0] != resa['sgID_2'][0]: resa['gene']=gene_n resa['guide_type']='1-2' - 
dft_order_table=dft_order_table.append(resa) + #dft_order_table=dft_order_table.append(resa) + dft_order_table=pd.concat([dft_order_table, resa]) #dft_order_table.concat(resa) else: #it is nutation case, so check next if f[2]==0 or f[3] == 0: #st.write('came in 1') @@ -405,29 +407,51 @@ def order_ready_tbl_CHM13(set12,set34,set56,listA_found_lr,listA_notfound_lr,lis resa['sgID_1_2'] = resa['sgID_1']+"|"+resa['sgID_2'] if f[2]==0: resa['gene']=gene_n - resa['guide_type']=str(gflga1)+"-3" - dft_order_table=dft_order_table.append(resa) + if f[0]==0: + resa['guide_type']="1-3" + else: + resa['guide_type']="2-3" + #dft_order_table=dft_order_table.append(resa) + dft_order_table=pd.concat([dft_order_table,resa]) else: # f[2]==0: resa['gene']=gene_n - resa['guide_type']=str(gflga1)+"-4" - dft_order_table=dft_order_table.append(resa) - - + if f[0]==0: + resa['guide_type']="1-4" + else: + resa['guide_type']="2-4" + #dft_order_table=dft_order_table.append(resa) + dft_order_table=pd.concat([dft_order_table,resa]) + else: + dft_notfound_all=pd.concat([dft_notfound_all,ref_listA], ignore_index = True) + dft_notfound_all=pd.concat([dft_notfound_all,ref_listB], ignore_index = True) + dft_notfound_all=pd.concat([dft_notfound_all,ref_listC], ignore_index = True) + else: + dft_notfound_all=pd.concat([dft_notfound_all,ref_listA], ignore_index = True) + dft_notfound_all=pd.concat([dft_notfound_all,ref_listB], ignore_index = True) + dft_notfound_all=pd.concat([dft_notfound_all,ref_listC], ignore_index = True) + + elif resa.shape[0] >0: #at least one guide is from seta #if resa['sgID_1'][0] != resa['sgID_2'][0]: if f[2]==0 or f[3] == 0: - st.write('came in 1') + #st.write('came in 1') if not resb.empty: # and resb['sgID_1'][0] != resb['sgID_2'][0]: #second guide in from setb resa[['sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2']] = resb[['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1']] resa['sgID_1_2'] = resa['sgID_1']+"|"+resa['sgID_2'] if f[2]==0: resa['gene']=gene_n 
resa['guide_type']=str(gflga1)+"-3" - dft_order_table=dft_order_table.append(resa) + #dft_order_table=dft_order_table.append(resa) + dft_order_table=pd.concat([dft_order_table,resa]) else: # f[2]==0: resa['gene']=gene_n resa['guide_type']=str(gflga1)+"-4" - dft_order_table=dft_order_table.append(resa) + #dft_order_table=dft_order_table.append(resa) + dft_order_table=pd.concat([dft_order_table,resa]) + else: + dft_notfound_all=pd.concat([dft_notfound_all,ref_listA], ignore_index = True) + dft_notfound_all=pd.concat([dft_notfound_all,ref_listB], ignore_index = True) + dft_notfound_all=pd.concat([dft_notfound_all,ref_listC], ignore_index = True) elif f[4]==0 or f[5] == 0: #st.write('came in 2') @@ -439,139 +463,315 @@ def order_ready_tbl_CHM13(set12,set34,set56,listA_found_lr,listA_notfound_lr,lis if f[4]==0: resa['gene']=gene_n resa['guide_type']=str(gflga1)+"-5" - dft_order_table=dft_order_table.append(resa) + #dft_order_table=dft_order_table.append(resa) + dft_order_table=pd.concat([dft_order_table,resa]) else: # f[2]==0: resa['gene']=gene_n resa['guide_type']=str(gflga1)+"-6" - dft_order_table=dft_order_table.append(resa) + #dft_order_table=dft_order_table.append(resa) + dft_order_table=pd.concat([dft_order_table,resa]) + else: + dft_notfound_all=pd.concat([dft_notfound_all,ref_listA], ignore_index = True) + dft_notfound_all=pd.concat([dft_notfound_all,ref_listB], ignore_index = True) + dft_notfound_all=pd.concat([dft_notfound_all,ref_listC], ignore_index = True) + elif resb.shape[0]>0: #at least one guide - #if resb['sgID_1'][0] != resb['sgID_2'][0]: - if f[4]==0 or f[5] == 0: + if gflgb1==0: + if resb['sgID_1'][0] != resb['sgID_2'][0]: + resb['gene']=gene_n + resb['guide_type']='3-4' + #dft_order_table=dft_order_table.append(resb) + dft_order_table=pd.concat([dft_order_table,resb]) + else: + dft_notfound_all=pd.concat([dft_notfound_all,ref_listA], ignore_index = True) + dft_notfound_all=pd.concat([dft_notfound_all,ref_listB], ignore_index = True) + 
dft_notfound_all=pd.concat([dft_notfound_all,ref_listC], ignore_index = True) + + + elif f[4]==0 or f[5] == 0: #if not resc.empty and resc['sgID_1'][0] != resc['sgID_2'][0]: resb[['sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2']] = resc[['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1']] resb['sgID_1_2'] = resb['sgID_1']+"|"+resb['sgID_2'] #dft_order_table=dft_order_table.append(resb) if f[4]==0: resb['gene']=gene_n - resb['guide_type']=str(gflgb1+1)+"-5" - dft_order_table=dft_order_table.append(resb) + resb['guide_type']=str(gflgb1+2)+"-5" + #dft_order_table=dft_order_table.append(resb) + dft_order_table=pd.concat([dft_order_table,resb]) else: # f[2]==0: resb['gene']=gene_n resb['guide_type']=str(gflgb1+2)+"-6" - dft_order_table=dft_order_table.append(resb) + #dft_order_table=dft_order_table.append(resb) + dft_order_table=pd.concat([dft_order_table,resb]) + else: + dft_notfound_all=pd.concat([dft_notfound_all,ref_listA], ignore_index = True) + dft_notfound_all=pd.concat([dft_notfound_all,ref_listB], ignore_index = True) + dft_notfound_all=pd.concat([dft_notfound_all,ref_listC], ignore_index = True) + elif resc.shape[0]>0: #at least one guide - #if f[4]==0 and f[5] == 0: - if resc['sgID_1'][0] != resc['sgID_2'][0]: - resc['gene']=gene_n - resc['guide_type']='5-6' - dft_order_table=dft_order_table.append(resc) + if gflgc1==0: + if resc['sgID_1'][0] != resc['sgID_2'][0]: + resc['gene']=gene_n + resc['guide_type']='5-6' + #dft_order_table=dft_order_table.append(resc) + dft_order_table=pd.concat([dft_order_table,resc]) + else: + dft_notfound_all=pd.concat([dft_notfound_all,ref_listA], ignore_index = True) + dft_notfound_all=pd.concat([dft_notfound_all,ref_listB], ignore_index = True) + dft_notfound_all=pd.concat([dft_notfound_all,ref_listC], ignore_index = True) + else: + dft_notfound_all=pd.concat([dft_notfound_all,ref_listA], ignore_index = True) + dft_notfound_all=pd.concat([dft_notfound_all,ref_listB], ignore_index = True) + 
dft_notfound_all=pd.concat([dft_notfound_all,ref_listC], ignore_index = True) + + + else: + dft_notfound_all=pd.concat([dft_notfound_all,ref_listA], ignore_index = True) + dft_notfound_all=pd.concat([dft_notfound_all,ref_listB], ignore_index = True) + dft_notfound_all=pd.concat([dft_notfound_all,ref_listC], ignore_index = True) + - if dft_order_table.shape[0]>0: - st.write('Order Ready **CHM13** guides List') - tbl_disp(dft_order_table,'select_genes','SetA_CHM13',5) + if dft_order_table.shape[0]>0: + #check total guides found + # st.write(str(set12.shape[0])) + # st.write(str(set34.shape[0])) + # st.write(str(set56.shape[0])) + st.write('**Please note that for guides matching to multiple locations (an example is ABCC6), only first pair is returned**') + szt=set12.shape[0] + szf=dft_order_table.shape[0] + # st.write(str(dft_order_table.shape[0])) + szd=szt-szf + if szd>0: + st.write('Order Ready '+ref_sel+' guides List: '+str(szd)+'/'+str(szt)+' **guides were not found**') + tbl_disp(dft_order_table,'select_genes','SetA_CHM13',5) + else: + st.write('Order Ready '+ref_sel+' guides List') + tbl_disp(dft_order_table,'select_genes','SetA_CHM13',5) else: st.write('**No guides found in ListA, ListB and ListC**') - #st.table(dft_order_table) - -#def get_notfound(): + if dft_notfound_all.shape[0]>0: + st.write('**Guides not found in any lists**') + tbl_disp(dft_notfound_all,'select_genes','SetA_CHM13',6) - -cwd=os.getcwd()+'/'+'data/' - - -listA = pd.read_csv(cwd+"guides_a_new.csv",index_col=False) - -listB = pd.read_csv(cwd+"guides_b_new.csv",index_col=False) -listC = pd.read_csv(cwd+"guides_c_new.csv",index_col=False) - -lista_sz=listA.shape[0] -listb_sz=listB.shape[0] -listc_sz=listC.shape[0] - - -variantsa1=listA['gene'].unique() -variantsb1=listB['gene'].unique() -variantsc1=listC['gene'].unique() - -con = np.concatenate((variantsa1, variantsb1,variantsc1)) - - -#st.write(type(variantsc1)) -variants_s=sorted(np.unique(con)) -#st.write(len(variants_s)) -#also get names 
for non-targetting guides - - -#Also read GRCh38 and LR guides for stea -listA_found_ref = pd.read_csv(cwd+"seta_found_ref1.csv",index_col=False) - - - - - - -lsita_ref_found_sz=listA_found_ref.shape[0] -#remove # from chr# # -listA_found_ref['chr'] = [x.split(' ')[-0] for x in listA_found_ref['chr']] -listA_found_ref.rename(columns = {'strnad':'strand'}, inplace = True) -listA_notfound_ref = pd.read_csv(cwd+"seta_notfound_ref1.csv",index_col=False) - -lsita_ref_notfound_sz=listA_notfound_ref.shape[0] - +def assemble_tbl(t): + dft = pd.DataFrame(columns=['sgID_1','sgRNA_1','chr_sgRNA_1','position_sgRNA_1','sgID_2','sgRNA_2','chr_sgRNA_2','position_sgRNA_2', 'sgID_1_2']) + #for i in range(0,t.shape[0],2): + mid=int(t.shape[0]/2) + for i in range(int(t.shape[0]/2)): + l1=t.iloc[[i]] + l1.columns=['sgID_1','sgRNA_1','chr_sgRNA_1','position_sgRNA_1','mutated_guide', 'strand', 'num_mismatch'] -listA_found_lr = pd.read_csv(cwd+"seta_found_LR1.csv",index_col=False) -lsita_lr_found_sz=listA_found_lr.shape[0] -listA_found_lr.rename(columns = {'strnad':'strand'}, inplace = True) -listA_notfound_lr = pd.read_csv(cwd+"seta_notfound_LR1.csv",index_col=False) -lsita_lr_notfound_sz=listA_notfound_lr.shape[0] + #l2=t.iloc[[i+1]] + l2=t.iloc[[mid]] + l2.columns=['sgID_2','sgRNA_2','chr_sgRNA_2','position_sgRNA_2','mutated_guide2', 'strand2', 'num_mismatch2'] + listA_concatenated_match_LR1=pd.concat([l1.reset_index(drop=True),l2.reset_index(drop=True)],axis=1) + listA_concatenated_match_LR1=listA_concatenated_match_LR1[['sgID_1','sgRNA_1','chr_sgRNA_1','position_sgRNA_1','sgID_2','sgRNA_2','chr_sgRNA_2','position_sgRNA_2']] + listA_concatenated_match_LR1['sgRNA_1']=listA_concatenated_match_LR1['sgRNA_1'].str.slice(0, 20) + listA_concatenated_match_LR1['sgRNA_2']=listA_concatenated_match_LR1['sgRNA_2'].str.slice(0, 20) + listA_concatenated_match_LR1['sgID_1_2']=listA_concatenated_match_LR1['sgID_1']+"|"+listA_concatenated_match_LR1['sgID_2'] + 
#dft=dft.append(listA_concatenated_match_LR1) + dft=pd.concat([dft,listA_concatenated_match_LR1]) + + mid=mid+1 + + return dft + +#Get non-targeting lists +def get_lists_non_targeting(ref_list,list_found_ref,list_notfound_ref): + + #This module retrieves guide_id and searches for guide sequences from the table + #st.table(ref_list) + a_ref=[] + for i in range(len(ref_list)): + a_ref.append(ref_list.sgID_AB.values[i].split('|')[0]) + a_ref.append(ref_list.sgID_AB.values[i].split('|')[1]) -#Also read GRCh38 and LR guides for set b -listB_found_ref = pd.read_csv(cwd+"setb_found_ref1.csv",index_col=False) -lsitb_ref_found_sz=listB_found_ref.shape[0] -#remove # from chr# # -listB_found_ref['chr'] = [x.split(' ')[-0] for x in listB_found_ref['chr']] -listB_found_ref.rename(columns = {'strnad':'strand'}, inplace = True) -listB_notfound_ref = pd.read_csv(cwd+"setb_notfound_ref1.csv",index_col=False) -lsitb_ref_notfound_sz=listB_notfound_ref.shape[0] + set_found0_ref=[] + for i in range(len(a_ref)): + set_found0_ref.append(list_found_ref[list_found_ref['gene']==a_ref[i]]) + list_concatenated_found_ref = pd.concat(set_found0_ref) + list_concatenated_match_ref = list_concatenated_found_ref[list_concatenated_found_ref.num_mismatch == 0] #only select guides with zero mismatches for match list, MISSMATCH LIST LATER + #get matching to Alternating loci's + list_concatenated_match_alt_ref = list_concatenated_match_ref[~list_concatenated_match_ref['chr'].str.contains('chr')] + #Also remove Alternate loci's data + list_concatenated_match_ref = list_concatenated_match_ref[list_concatenated_match_ref['chr'].str.contains('chr')] + #st.table(list_concatenated_match_ref) + #also create new list with both sgRNAs in one row + dft=pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) + if list_concatenated_match_ref.shape[0]>0: + t=list_concatenated_match_ref.reset_index(drop=True) + #st.table(t) + + ########## + #check even/odd entries + if 
t.shape[0]==1: + + t1=t.loc[t.index.repeat(2)].reset_index(drop=True) + #st.write(t1) + dft=assemble_tbl(t1) + + elif t.shape[0]%2==0: #even + dft=assemble_tbl(t) + else: #odd + t1 = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) + i=0 + while i 0] + list_concatenated_mutated_ref=list_concatenated_mutated_ref.sort_values('position') + + #Also remove Alternate loci's data + list_concatenated_mutated_alt_ref = list_concatenated_mutated_ref[~list_concatenated_mutated_ref['chr'].str.contains('chr')] + list_concatenated_mutated_ref = list_concatenated_mutated_ref[list_concatenated_mutated_ref['chr'].str.contains('chr')] + dft_mut = pd.DataFrame(columns=['sgID_1','sgRNA_1','chr_sgRNA_1','position_sgRNA_1','sgID_2','sgRNA_2','chr_sgRNA_2','position_sgRNA_2', 'sgID_1_2']) + + if list_concatenated_mutated_ref.shape[0]>0: + dft_mut = get_mutated_res(list_concatenated_mutated_ref) + #check not found + seta_notfound0_ref=list_notfound_ref[list_notfound_ref['gene']==a_ref[0]] + seta_notfound1_ref=list_notfound_ref[list_notfound_ref['gene']==a_ref[1]] + #st.write(list_notfound_ref[list_notfound_ref['gene']==a_ref[0]]) + #st.write(seta_notfound0_ref) + #st.write(seta_notfound1_ref) + #add guideflg1 to return which guide is found + guideflg1=0 + if seta_notfound0_ref.shape[0]>0: + guideflg1=2 + if seta_notfound1_ref.shape[0]>0: + guideflg1=1 + list_concatenated_notfound_ref = pd.concat([seta_notfound0_ref,seta_notfound1_ref]) + #st.table(a_ref) + #st.table(seta_notfound1_ref) + #st.table(dft) + #st.table(dft_mut) + return dft, dft_mut,list_concatenated_notfound_ref,list_concatenated_match_ref,list_concatenated_mutated_ref,list_concatenated_match_alt_ref,list_concatenated_mutated_alt_ref,guideflg1 + ########### +#Get All Guides Stats +#def process_all_guides(glist,list,ref_type,guide_type): +def process_all_guides(glist,for_list,f_list,nf_list): + #st.write(type(glist)) + #st.table(for_list) + 
#for_list=for_list.reset_index() + variant_set=glist['gene'] + dft_c = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) + dft_resc=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) + dft_res_mutc=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) + dft_notfoundc=pd.DataFrame(columns=['gene','ref_guide']) + df_matched_guides_ref = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) + df_matched_alt_ref = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) + df_mutated_guides_ref = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) + df_mutated_guides_alt_ref = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) + + + #st.table(for_list) + for i in range(variant_set.shape[0]): + #st.write(variant_set.iloc[i]) + ref_listC=for_list[for_list['sgID_AB']==variant_set.iloc[i]][['guide_type','protospacer_A','protospacer_B','sgID_AB']] + ref_listC =ref_listC[['sgID_AB','guide_type','protospacer_A','protospacer_B']] + #st.table(ref_listC) + #st.table(ref_listC) + + res,res_mut,res_notfound,list_match,list_mutated,list_match_alt,list_mutated_alt,gflgc1=get_lists_non_targeting(ref_listC,f_list,nf_list) + + + #dft_c=dft_c.append(ref_listC) + if res.shape[0]>0: + dft_resc=pd.concat([dft_resc,res]) + if res_mut.shape[0]>0: + dft_res_mutc=pd.concat([dft_res_mutc,res_mut]) + if res_notfound.shape[0]>0: + dft_notfoundc= pd.concat([dft_notfoundc,res_notfound]) + if list_match.shape[0]>0: + df_matched_guides_ref= pd.concat([df_matched_guides_ref,list_match]) + if list_mutated.shape[0]>0: + df_mutated_guides_ref= pd.concat([df_mutated_guides_ref,list_mutated]) + 
if list_match_alt.shape[0]>0: + df_matched_alt_ref=pd.concat([df_matched_alt_ref,list_mutated]) + if list_mutated_alt.shape[0]>0: + df_mutated_guides_alt_ref=pd.concat([df_mutated_guides_alt_ref,list_mutated_alt]) + + if df_matched_guides_ref.shape[0]>0: + #st.write(type(df_matched_guides_ref['gene'])) + gl=df_matched_guides_ref['gene'] + dupesm=gl[gl.duplicated()] + if df_mutated_guides_ref.shape[0]>0: + gl=df_mutated_guides_ref['gene'] + dupesmu=gl[gl.duplicated()] + #now check common between matched and mutated + # if dupesm.shape[0]>0 and dupesmu.shape[0]>0: + # common_list = set(dupesm).intersection(dupesmu) + # st.table(common_list) + # st.write('common guides between matched and mutated lists are: '+len(common_list)) + + + if df_matched_guides_ref.shape[0]>0: + if dupesm.shape[0]>0: + st.write('**Matched Guides**: '+str(df_matched_guides_ref.shape[0])+' and: '+str(dupesm.shape[0])+' are repeated guides (matched to multiple locations)') + tbl_disp(df_matched_guides_ref,'select_genes','SetC_GRCh38',17) + #st.table(dupesm,'select_genes','SetC_GRCh38',17) + tbl_disp(dupesm,'select_genes','SetC_GRCh38',17) + else: + st.write('**Matched Guides**: '+str(df_matched_guides_ref.shape[0])) + tbl_disp(df_matched_guides_ref,'select_genes','SetC_GRCh38',17) + + if df_matched_alt_ref.shape[0]>0: + st.write('**Matched Guides to Alt Loci**: '+str(df_matched_alt_ref.shape[0])) + tbl_disp(df_matched_alt_ref,'select_genes','SetC_GRCh38',17) + if df_mutated_guides_ref.shape[0]>0: + #gl=df_mutated_guides_ref['gene'] + #dupesmu=gl[gl.duplicated()] + if dupesmu.shape[0]>0: + st.write('**Mutated Guides (some might have >1 guides)**: '+str(df_mutated_guides_ref.shape[0])+' and: '+str(dupesmu.shape[0])+' are repeated guides') + tbl_disp(df_mutated_guides_ref,'select_genes','SetC_GRCh38',18) + #st.table(dupesmu) + else: + st.write('**Mutated Guides (some might have >1 guides)**: '+str(df_mutated_guides_ref.shape[0])) + tbl_disp(df_mutated_guides_ref,'select_genes','SetC_GRCh38',18) + + 
if df_mutated_guides_alt_ref.shape[0]>0: + st.write('**Mutated Guides to Alt Loci**: '+str(df_mutated_guides_alt_ref.shape[0])) + tbl_disp(df_mutated_guides_alt_ref,'select_genes','SetC_GRCh38',18) -listB_found_lr = pd.read_csv(cwd+"setb_found_LR1.csv",index_col=False) -lsitb_lr_found_sz=listB_found_lr.shape[0] -listB_found_lr.rename(columns = {'strnad':'strand'}, inplace = True) -listB_notfound_lr = pd.read_csv(cwd+"setb_notfound_LR1.csv",index_col=False) -lsitb_lr_notfound_sz=listB_notfound_lr.shape[0] + if dft_notfoundc.shape[0]>0: + st.write('**Guides Not Found**: '+str(dft_notfoundc.shape[0])) + tbl_disp(dft_notfoundc,'select_genes','SetC_GRCh38',19) + +#CALC BASED ON LIST, GUIDE TYPE AND REFERENCE -#Also read GRCh38 and LR guides for set c -listC_found_ref = pd.read_csv(cwd+"setc_found_ref1.csv",index_col=False) -lsitc_ref_found_sz=listC_found_ref.shape[0] -#remove # from chr# # -listC_found_ref['chr'] = [x.split(' ')[-0] for x in listC_found_ref['chr']] -listC_found_ref.rename(columns = {'strnad':'strand'}, inplace = True) -listC_notfound_ref = pd.read_csv(cwd+"setc_notfound_ref1.csv",index_col=False) -lsitc_ref_notfound_sz=listC_notfound_ref.shape[0] +#END GENERAL FUNCTIONS -listC_found_lr = pd.read_csv(cwd+"setc_found_LR1.csv",index_col=False) -lsitc_lr_found_sz=listC_found_lr.shape[0] -listC_found_lr.rename(columns = {'strnad':'strand'}, inplace = True) -listC_notfound_lr = pd.read_csv(cwd+"setc_notfound_LR1.csv",index_col=False) -lsitc_lr_notfound_sz=listC_notfound_lr.shape[0] -#also load all mismatched except non-targe guides -#listA_notfound_lr = pd.read_csv(cwd+"setc_notfound_LR1.csv",index_col=False) seta_all_notmatched_table.csv st.title('Long Read Guides Search') -#st.markdown('**Please select an option from the sidebar**') - -#st.write(variants) +st.write('**Important:** Please note that **MTMR3** is not present in guides_c list, so we have **removed it from list a and list b**') +#tbl_disp(regulara,'variant','ref_guides',0,1) Calc = 
st.sidebar.radio( "", ('ReadME', 'Single/Multiple Guides','All','Not_Found')) - if Calc == 'ReadME': expander = st.expander("How to use this app") #st.header('How to use this app') @@ -608,24 +808,24 @@ elif Calc=='Single/Multiple Guides': 'Please select genes list to start processing', variants_s) Updated=st.form_submit_button(label = 'Update') - listA_concatenated_orig = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) - reflistA_concatenated = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) - reflistB_concatenated = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) - reflistC_concatenated = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) + listA_concatenated_orig = pd.DataFrame(columns=['gene','sgID_AB','guide_type','protospacer_A','protospacer_B']) + reflistA_concatenated = pd.DataFrame(columns=['gene','sgID_AB','guide_type','protospacer_A','protospacer_B']) + reflistB_concatenated = pd.DataFrame(columns=['gene','sgID_AB','guide_type','protospacer_A','protospacer_B']) + reflistC_concatenated = pd.DataFrame(columns=['gene','sgID_AB','guide_type','protospacer_A','protospacer_B']) for variant in multi_genes: - ref_listA=listA[listA['gene']==variant][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listA = ref_listA[['sgID_AB','guide_type','protospacer_A','protospacer_B']] - ref_listA.columns=['gene','guide_type','protospacer_A','protospacer_B'] + ref_listA=listA[listA['gene']==variant][['gene','guide_type','protospacer_A','protospacer_B','sgID_AB']] + ref_listA = ref_listA[['gene','sgID_AB','guide_type','protospacer_A','protospacer_B']] + #ref_listA.columns=['gene','guide_type','protospacer_A','protospacer_B'] reflistA_concatenated=pd.concat([reflistA_concatenated,ref_listA]) - ref_listB=listB[listB['gene']==variant][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listB = 
ref_listB[['sgID_AB','guide_type','protospacer_A','protospacer_B']] - ref_listB.columns=['gene','guide_type','protospacer_A','protospacer_B'] + ref_listB=listB[listB['gene']==variant][['gene','guide_type','protospacer_A','protospacer_B','sgID_AB']] + ref_listB = ref_listB[['gene','sgID_AB','guide_type','protospacer_A','protospacer_B']] + #ref_listB.columns=['gene','guide_type','protospacer_A','protospacer_B'] reflistB_concatenated=pd.concat([reflistB_concatenated,ref_listB]) - ref_listC=listC[listC['gene']==variant][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listC = ref_listC[['sgID_AB','guide_type','protospacer_A','protospacer_B']] - ref_listC.columns=['gene','guide_type','protospacer_A','protospacer_B'] + ref_listC=listC[listC['gene']==variant][['gene','guide_type','protospacer_A','protospacer_B','sgID_AB']] + ref_listC = ref_listC[['gene','sgID_AB','guide_type','protospacer_A','protospacer_B']] + #ref_listC.columns=['gene','guide_type','protospacer_A','protospacer_B'] reflistC_concatenated=pd.concat([reflistC_concatenated,ref_listC]) listA_concatenated_orig = pd.concat([listA_concatenated_orig,ref_listA,ref_listB,ref_listC]) @@ -637,32 +837,44 @@ elif Calc=='Single/Multiple Guides': st.markdown(caution_genes,unsafe_allow_html=True) with st.form(key='columns_in_form_a'): - c2, c3 = st.columns(2) + c2, c3 = st.columns([10,2]) with c2: get_table_order=tbl_disp(listA_concatenated_orig,'variant','ref_guides',111,0) - #multi_genes = st.multiselect( - #'Please select genes list to start processing', - #variants_s) + with c3: + ref_sel = st.radio("Select Reference", + ('CHM13','GRCh38'), + horizontal=True) + Updated1=st.form_submit_button(label = 'Generate Order Ready Table') - - #get_table_order=tbl_disp(listA_concatenated_orig,'variant','ref_guides',1,0) - - - - if not isinstance(get_table_order, type(None)): # and Updated1:# and get_table_order.shape[0]>0: - #if not isinstance(get_table_order, type(None)): - 
variant_set12=get_table_order[get_table_order['guide_type']=='1-2']['gene'] - variant_set34=get_table_order[get_table_order['guide_type']=='3-4']['gene'] - variant_set56=get_table_order[get_table_order['guide_type']=='5-6']['gene'] + if ref_sel=='GRCh38': + + list_founda=listA_found_ref + list_notfounda=listA_notfound_ref + list_foundb=listB_found_ref + list_notfoundb=listB_notfound_ref + list_foundc=listC_found_ref + list_notfoundc=listC_notfound_ref + + else: + list_founda=listA_found_lr + list_notfounda=listA_notfound_lr + list_foundb=listB_found_lr + list_notfoundb=listB_notfound_lr + list_foundc=listC_found_lr + list_notfoundc=listC_notfound_lr + + + variant_set12=get_table_order[get_table_order['guide_type']=='1-2']['sgID_AB'] + variant_set34=get_table_order[get_table_order['guide_type']=='3-4']['sgID_AB'] + variant_set56=get_table_order[get_table_order['guide_type']=='5-6']['sgID_AB'] #st.table(variant_set12) - #st.write(type(variant_set12)) - #if not variant_set12.equals(variant_set34): - # st.write('**Please select Identical Genes From List A and B**') + #st.write(variant_set12) if variant_set12.shape[0]==variant_set34.shape[0]==variant_set56.shape[0]: #########Here we call order ready table #order_ready_tbl_GRCh38(variant_set12,variant_set34,variant_set56) - order_ready_tbl_CHM13(variant_set12,variant_set34,variant_set56,listA_found_lr,listA_notfound_lr,listB_found_lr,listB_notfound_lr,listC_found_lr,listC_notfound_lr) + #order_ready_tbl_CHM13(variant_set12,variant_set34,variant_set56,listA_found_lr,listA_notfound_lr,listB_found_lr,listB_notfound_lr,listC_found_lr,listC_notfound_lr) + order_ready_tbl_CHM13(variant_set12,variant_set34,variant_set56,list_founda,list_notfounda,list_foundb,list_notfoundb,list_foundc,list_notfoundc,ref_sel) ########END ORDER READY TABLE @@ -675,332 +887,218 @@ elif Calc=='Single/Multiple Guides': else: st.markdown("""**Probably Mixed guides are selected from three lists, Please correct the problem and 
re-run**""",unsafe_allow_html=True) - - #Now BUILD Order Ready List - #if dft_lr_resa.shape[0] >0 and dft_lr_resb.shape[0] >0 and dft_lr_resc.shape[0] >0: - # for sgrna in dft_lr_resa else: st.write('**Please select guides and Press Update Button to Begin Processing**') - - - ListARes = st.checkbox('Results For SetA',key=300) - if ListARes:# and not isinstance(get_table, type(None)):#get_table!=None: - #if ListARes and get_table.shape[0]>0: - st.write('**Please select Guides From Table Below to processes from ListA**') - get_table=tbl_disp(reflistA_concatenated,variant,'ref_guides',2,0) - if not isinstance(get_table, type(None)): - #variant_set=get_table[get_table['guide_type']=='1-2']['gene'] - variant_set=get_table['gene'] - dft_a = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) - dft_resa=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) - dft_res_muta=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) - dft_notfounda=pd.DataFrame(columns=['gene','ref_guide']) - df_matched_guides_ref = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) - df_mutated_guides_ref = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) - #CHECK FOR GRCh38 - for i in range(variant_set.shape[0]): - #ref_listA=listA[listA['sgID_AB']==variant_set.iloc[i]['gene']][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listA=listA[listA['sgID_AB']==variant_set.iloc[i]][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listA = ref_listA[['sgID_AB','guide_type','protospacer_A','protospacer_B']] - - ref_listA.columns=['gene','guide_type','protospacer_A','protospacer_B'] - 
res,res_mut,res_notfound,list_match,list_mutated,gflga1=get_lists(ref_listA,listA_found_ref,listA_notfound_ref) - dft_a=dft_a.append(ref_listA) - if res.shape[0]>0: - dft_resa=dft_resa.append(res) - if res_mut.shape[0]>0: - dft_res_muta=dft_res_muta.append(res_mut) - if res_notfound.shape[0]>0: - dft_notfounda= dft_notfounda.append(res_notfound) - if list_match.shape[0]>0: - df_matched_guides_ref= df_matched_guides_ref.append(list_match) - if list_mutated.shape[0]>0: - df_mutated_guides_ref= df_mutated_guides_ref.append(list_mutated) + if 'get_table_order' in locals(): + if not isinstance(get_table_order, type(None)): + reflistA_concatenated=get_table_order[get_table_order['guide_type']=='1-2'] + reflistA_concatenated.drop("_selectedRowNodeInfo",axis=1,inplace=True) + reflistB_concatenated=get_table_order[get_table_order['guide_type']=='3-4'] + reflistB_concatenated.drop("_selectedRowNodeInfo",axis=1,inplace=True) + reflistC_concatenated=get_table_order[get_table_order['guide_type']=='5-6'] + reflistC_concatenated.drop("_selectedRowNodeInfo",axis=1,inplace=True) + + #st.write('**Important:** If a guides is **not** in **found, mutated and not_found list (such as GSTT1), then it is found in Alternative Loci and Removed**') + with st.form(key='columns_in_form_lists'): + c2, c3= st.columns([10,1])#([10,10]) + with c2: + List_Selected = st.selectbox('Please select list', + ('','ListA','ListB','ListC')) + Show_ListResults=st.form_submit_button(label = 'GO') - #st.write('Selected Reference Guides for **Set A**') - #tbl_disp(dft_a,'All','ReferenceGuides',0) - if dft_resa.shape[0]>0: - st.write('Matched to **GRCh38** Reference Guides for **Set A**') - tbl_disp(dft_resa,'select_genes','SetA_GRCh38',3) - elif dft_res_muta.shape[0]>0: - st.write('Mutated to **GRCh38** Reference Guides for **Set A**') - st.markdown(caution1,unsafe_allow_html=True) - tbl_disp(dft_res_muta,'select_genes','SetA_Mutated_GRCh38',4) - if dft_notfounda.shape[0]>0: - st.write('**SetA Guides Not Found 
in GRCh38**') - #tbl_disp(dft_notfound,'select_genes','SetA_Notfound_GRCh38') - st.table(dft_notfounda) - #Now CHECK FOR CHM13 - dft_a = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) - dft_lr_resa=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) - dft_lr_res_muta=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) - dft_lr_notfounda=pd.DataFrame(columns=['gene','ref_guide']) - df_matched_guides_lr = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) - df_mutated_guides_lr = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) - - for i in range(variant_set.shape[0]): - #ref_listA=listA[listA['gene']==variant_set.iloc[i]][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listA=listA[listA['sgID_AB']==variant_set.iloc[i]][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listA = ref_listA[['sgID_AB','guide_type','protospacer_A','protospacer_B']] - - ref_listA.columns=['gene','guide_type','protospacer_A','protospacer_B'] - res,res_mut,res_notfound,list_match,list_mutated,gflga1=get_lists(ref_listA,listA_found_lr,listA_notfound_lr) - dft_a=dft_a.append(ref_listA) - if res.shape[0]>0: - dft_lr_resa=dft_lr_resa.append(res) - if res_mut.shape[0]>0: - dft_lr_res_muta=dft_lr_res_muta.append(res_mut) - if res_notfound.shape[0]>0: - dft_lr_notfounda= dft_lr_notfounda.append(res_notfound) - if list_match.shape[0]>0: - df_matched_guides_lr= df_matched_guides_lr.append(list_match) - if list_mutated.shape[0]>0: - df_mutated_guides_lr= df_mutated_guides_lr.append(list_mutated) - - if dft_lr_resa.shape[0]>0: - st.write('Matched to **CHM13** Reference Guides for **Set A**') - tbl_disp(dft_lr_resa,'select_genes','SetA_CHM13',5) - 
elif dft_lr_res_muta.shape[0]>0: - st.write('Mutated to **CHM13** Reference Guides for **Set A**') - st.markdown(caution1,unsafe_allow_html=True) - tbl_disp(dft_lr_res_muta,'select_genes','SetA_Mutated_CHM13',6) - if dft_lr_notfounda.shape[0]>0: - st.write('**SetA Guides Not Found in CHM13**') - st.table(dft_lr_notfounda) - #NOW MERGE FROM GRCh38 and LR - merged_mutated_set=pd.merge(df_mutated_guides_ref,df_mutated_guides_lr, how="outer",on=["gene","ref_guide","chr"],suffixes=["_GRCh38",'_LR']) - merged_mutated_set = merged_mutated_set[['gene','ref_guide','chr','position_GRCh38','position_LR','strand_GRCh38','strand_LR','mutated_guide_GRCh38','mutated_guide_LR','num_mismatch_GRCh38','num_mismatch_LR']] - merged_match_set=pd.merge(df_matched_guides_ref,df_matched_guides_lr, how="outer",on=["gene","ref_guide","chr"],suffixes=["_GRCh38",'_LR']) - merged_match_set = merged_match_set[['gene','ref_guide','chr','position_GRCh38','position_LR','strand_GRCh38','strand_LR','mutated_guide_GRCh38','mutated_guide_LR','num_mismatch_GRCh38','num_mismatch_LR']] - if merged_match_set.shape[0]>0: - #st.write('**Matched** Guides for **Set C** (*Each guide sequence has a trailing NGG*)') - st.write('**Matched** Guides for **Set A** to both **GRCh38 and CHM13 references** (*Each guide sequence has a trailing NGG* and **leading G even if it is a missmatch**)') - tbl_disp(merged_match_set,'select_genes','SetA_Matched_GRCh38_CHM13',7,0) - - #st.table(merged_match_seta) - elif merged_mutated_set.shape[0]>0: - #st.write('**Missmatched** Guides **Set C** (*Each guide sequence has a trailing NGG*)') - st.write('**Mutated** Guides for **Set A** to both **GRCh38 and CHM13 references** (*Each guide sequence has a trailing NGG* and **leading G even if it is a missmatch**)') - - tbl_disp(merged_mutated_set,'select_genes','SetA_Mutated_GRCh38_CHM13',8,0) - elif ListARes: - st.write("**Please select genes from the above table to begin**") - - ListBRes = st.checkbox('Results For SetB',key=40) - if 
ListBRes: # and not isinstance(get_table, type(None)):#get_table!=None: - st.write('**Please select Guides From Table Below to processes from ListB**') - get_table=tbl_disp(reflistB_concatenated,variant,'ref_guides',9,0) - if not isinstance(get_table, type(None)): - #variant_set=get_table[['gene']] - variant_set=get_table['gene'] - dft_b = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) - dft_resb=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) - dft_res_mutb=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) - dft_notfoundb=pd.DataFrame(columns=['gene','ref_guide']) - df_matched_guides_ref = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) - df_mutated_guides_ref = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) - #CHECK FOR GRCh38 - for i in range(variant_set.shape[0]): - #ref_listB=listB[listB['gene']==variant_set.iloc[i]['gene']][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listB=listB[listB['sgID_AB']==variant_set.iloc[i]][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listB =ref_listB[['sgID_AB','guide_type','protospacer_A','protospacer_B']] - - ref_listB.columns=['gene','guide_type','protospacer_A','protospacer_B'] - res,res_mut,res_notfound,list_match,list_mutated,gflgb1=get_lists(ref_listB,listB_found_ref,listB_notfound_ref) - dft_b=dft_b.append(ref_listB) - if res.shape[0]>0: - dft_resb=dft_resb.append(res) - if res_mut.shape[0]>0: - dft_res_mutb=dft_res_mutb.append(res_mut) - if res_notfound.shape[0]>0: - dft_notfoundb= dft_notfoundb.append(res_notfound) - if list_match.shape[0]>0: - df_matched_guides_ref= df_matched_guides_ref.append(list_match) - if list_mutated.shape[0]>0: - 
df_mutated_guides_ref= df_mutated_guides_ref.append(list_mutated) + #ListARes = st.checkbox('Results For SetA',key=300) + if List_Selected=='ListA':# and not isinstance(get_table, type(None)):#get_table!=None: + ref_list= listA + st.write('**Please select Guides From Table Below to processes from ListA**') + with st.form(key='columns_in_form_listsA'): + c2, c3= st.columns([100,2])#([10,10]) + with c2: + get_table=tbl_disp(reflistA_concatenated,variant,'ref_guides',2,0) + #List_Selected = st.selectbox('Please select list', + #('ListA','ListB','ListC')) + Show_ListResults=st.form_submit_button(label = 'Show ListA Results') - #st.write('Selected Reference Guides for **Set B**') - #tbl_disp(dft_b,'All','ReferenceGuides',0) - if dft_resb.shape[0]>0: - st.write('Matched to **GRCh38** Reference Guides for **Set B**') - tbl_disp(dft_resb,'select_genes','SetB_GRCh38',10) - elif dft_res_mutb.shape[0]>0: - st.write('Mutated to **GRCh38** Reference Guides for **Set B**') - st.markdown(caution1,unsafe_allow_html=True) - tbl_disp(dft_res_mutb,'select_genes','SetB_Mutated_GRCh38',11) - if dft_notfoundb.shape[0]>0: - st.write('**SetB Guides Not Found in GRCh38**') - #tbl_disp(dft_notfound,'select_genes','SetA_Notfound_GRCh38') - st.table(dft_notfoundb) - - #Now CHECK FOR CHM13 - dft_b = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) - dft_lr_resb=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) - dft_lr_res_mutb=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) - dft_lr_notfoundb=pd.DataFrame(columns=['gene','ref_guide']) - df_matched_guides_lr = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) - df_mutated_guides_lr = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 
'num_mismatch']) - - for i in range(variant_set.shape[0]): - #ref_listB=listB[listB['gene']==variant_set.iloc[i]['gene']][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listB=listB[listB['sgID_AB']==variant_set.iloc[i]][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listB=ref_listB[['sgID_AB','guide_type','protospacer_A','protospacer_B']] - - ref_listB.columns=['gene','guide_type','protospacer_A','protospacer_B'] - res,res_mut,res_notfound,list_match,list_mutated,gflgb1=get_lists(ref_listB,listB_found_lr,listB_notfound_lr) - dft_b=dft_b.append(ref_listB) - if res.shape[0]>0: - dft_lr_resb=dft_lr_resb.append(res) - if res_mut.shape[0]>0: - dft_lr_res_mutb=dft_lr_res_mutb.append(res_mut) - if res_notfound.shape[0]>0: - dft_lr_notfoundb= dft_lr_notfoundb.append(res_notfound) - if list_match.shape[0]>0: - df_matched_guides_lr= df_matched_guides_lr.append(list_match) - if list_mutated.shape[0]>0: - df_mutated_guides_lr= df_mutated_guides_lr.append(list_mutated) + #st.write('**Please select Guides From Table Below to processes from ListA**') + #get_table=tbl_disp(reflistA_concatenated,variant,'ref_guides',2,0) + if not isinstance(get_table, type(None)): + if ref_sel=='GRCh38': + list_found=listA_found_ref + list_notfound=listA_notfound_ref + else: + + list_found=listA_found_lr + list_notfound=listA_notfound_lr + + variant_set=get_table['sgID_AB'] + dft_a = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) + dft_resa=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) + dft_res_muta=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) + dft_notfounda=pd.DataFrame(columns=['gene','ref_guide']) + df_matched_guides_ref = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) + 
df_mutated_guides_ref = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) + #CHECK FOR GRCh38 + for i in range(variant_set.shape[0]): + #ref_listA=listA[listA['sgID_AB']==variant_set.iloc[i]['gene']][['guide_type','protospacer_A','protospacer_B','sgID_AB']] + ref_listA=ref_list[ref_list['sgID_AB']==variant_set.iloc[i]][['guide_type','protospacer_A','protospacer_B','sgID_AB']] + ref_listA = ref_listA[['sgID_AB','guide_type','protospacer_A','protospacer_B']] + + #ref_listA.columns=['gene','guide_type','protospacer_A','protospacer_B'] + #st.table(ref_listA) + res,res_mut,res_notfound,list_match,list_mutated,gflga1=get_lists(ref_listA,list_found,list_notfound) + dft_a=dft_a.append(ref_listA) + if res.shape[0]>0: + dft_resa=dft_resa.append(res) + if res_mut.shape[0]>0: + dft_res_muta=dft_res_muta.append(res_mut) + if res_notfound.shape[0]>0: + dft_notfounda= dft_notfounda.append(res_notfound) + if list_match.shape[0]>0: + df_matched_guides_ref= df_matched_guides_ref.append(list_match) + if list_mutated.shape[0]>0: + df_mutated_guides_ref= df_mutated_guides_ref.append(list_mutated) - if dft_lr_resb.shape[0]>0: - st.write('Matched to **CHM13** Reference Guides for **Set B**') - tbl_disp(dft_lr_resb,'select_genes','SetB_CHM13',12) - elif dft_lr_res_mutb.shape[0]>0: - st.write('Mutated to **CHM13** Reference Guides for **Set B**') - st.markdown(caution1,unsafe_allow_html=True) - tbl_disp(dft_lr_res_mutb,'select_genes','SetB_Mutated_CHM13',13) - if dft_lr_notfoundb.shape[0]>0: - st.write('**SetB Guides Not Found in CHM13**') - st.table(dft_lr_notfoundb) - #NOW MERGE FROM GRCh38 and LR - merged_mutated_set=pd.merge(df_mutated_guides_ref,df_mutated_guides_lr, how="outer",on=["gene","ref_guide","chr"],suffixes=["_GRCh38",'_LR']) - merged_mutated_set = 
merged_mutated_set[['gene','ref_guide','chr','position_GRCh38','position_LR','strand_GRCh38','strand_LR','mutated_guide_GRCh38','mutated_guide_LR','num_mismatch_GRCh38','num_mismatch_LR']] - merged_match_set=pd.merge(df_matched_guides_ref,df_matched_guides_lr, how="outer",on=["gene","ref_guide","chr"],suffixes=["_GRCh38",'_LR']) - merged_match_set = merged_match_set[['gene','ref_guide','chr','position_GRCh38','position_LR','strand_GRCh38','strand_LR','mutated_guide_GRCh38','mutated_guide_LR','num_mismatch_GRCh38','num_mismatch_LR']] - if merged_match_set.shape[0]>0: - #st.write('**Matched** Guides for **Set C** (*Each guide sequence has a trailing NGG*)') - st.write('**Matched** Guides for **Set B** to both **GRCh38 and CHM13 references** (*Each guide sequence has a trailing NGG* and **leading G even if it is a missmatch**)') - tbl_disp(merged_match_set,'select_genes','SetB_Matched_GRCh38_CHM13',14,0) - - #st.table(merged_match_seta) - elif merged_mutated_set.shape[0]>0: - #st.write('**Missmatched** Guides **Set C** (*Each guide sequence has a trailing NGG*)') - st.write('**Mutated** Guides for **Set B** to both **GRCh38 and CHM13 references** (*Each guide sequence has a trailing NGG* and **leading G even if it is a missmatch**)') - #st.markdown(caution1,unsafe_allow_html=True) - tbl_disp(merged_mutated_set,'select_genes','SetB_Mutated_GRCh38_CHM13',15,0) - - elif ListBRes: - st.write("**Please select genes from the above table to begin**") - - ListCRes = st.checkbox('Results For SetC',key=50) - if ListCRes: # and not isinstance(get_table, type(None)):#get_table!=None: - #variant_set=get_table[['gene']] - st.write('**Please select Guides From Table Below to processes from ListC**') - get_table=tbl_disp(reflistC_concatenated,variant,'ref_guides',16,0) - if not isinstance(get_table, type(None)): - variant_set=get_table['gene'] - dft_c = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) - dft_resc=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 
'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) - dft_res_mutc=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) - dft_notfoundc=pd.DataFrame(columns=['gene','ref_guide']) - df_matched_guides_ref = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) - df_mutated_guides_ref = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) - #CHECK FOR GRCh38 - for i in range(variant_set.shape[0]): - #ref_listC=listC[listC['gene']==variant_set.iloc[i]['gene']][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listC=listC[listC['sgID_AB']==variant_set.iloc[i]][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listC =ref_listC[['sgID_AB','guide_type','protospacer_A','protospacer_B']] - - ref_listC.columns=['gene','guide_type','protospacer_A','protospacer_B'] - res,res_mut,res_notfound,list_match,list_mutated,gflgc1=get_lists(ref_listC,listC_found_ref,listC_notfound_ref) - dft_c=dft_c.append(ref_listC) - if res.shape[0]>0: - dft_resc=dft_resc.append(res) - if res_mut.shape[0]>0: - dft_res_mutc=dft_res_mutc.append(res_mut) - if res_notfound.shape[0]>0: - dft_notfoundc= dft_notfoundc.append(res_notfound) - if list_match.shape[0]>0: - df_matched_guides_ref= df_matched_guides_ref.append(list_match) - if list_mutated.shape[0]>0: - df_mutated_guides_ref= df_mutated_guides_ref.append(list_mutated) - - #st.write('Selected Reference Guides for **Set C**') - #tbl_disp(dft_c,'All','ReferenceGuides',0) - if dft_resc.shape[0]>0: - st.write('Matched to **GRCh38** Reference Guides for **Set C**') - tbl_disp(dft_resc,'select_genes','SetC_GRCh38',17) - elif dft_res_mutc.shape[0]>0: - st.write('Mutated to **GRCh38** Reference Guides for **Set C**') - st.markdown(caution1,unsafe_allow_html=True) - 
tbl_disp(dft_res_mutc,'select_genes','SetC_Mutated_GRCh38',18) - if dft_notfoundc.shape[0]>0: - st.write('**SetC Guides Not Found in GRCh38**') - #tbl_disp(dft_notfound,'select_genes','SetA_Notfound_GRCh38') - st.table(dft_notfoundc) - - #Now CHECK FOR CHM13 - dft_c = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) - dft_lr_resc=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) - dft_lr_res_mutc=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) - dft_lr_notfoundc=pd.DataFrame(columns=['gene','ref_guide']) - df_matched_guides_lr = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) - df_mutated_guides_lr = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) - - for i in range(variant_set.shape[0]): - #ref_listC=listC[listC['gene']==variant_set.iloc[i]['gene']][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listC=listC[listC['sgID_AB']==variant_set.iloc[i]][['guide_type','protospacer_A','protospacer_B','sgID_AB']] - ref_listC=ref_listC[['sgID_AB','guide_type','protospacer_A','protospacer_B']] - - ref_listC.columns=['gene','guide_type','protospacer_A','protospacer_B'] - res,res_mut,res_notfound,list_match,list_mutated,gflgc1=get_lists(ref_listC,listC_found_lr,listC_notfound_lr) - dft_c=dft_c.append(ref_listC) - if res.shape[0]>0: - dft_lr_resc=dft_lr_resc.append(res) - if res_mut.shape[0]>0: - dft_lr_res_mutc=dft_lr_res_mutc.append(res_mut) - if res_notfound.shape[0]>0: - dft_lr_notfoundc= dft_lr_notfoundc.append(res_notfound) - if list_match.shape[0]>0: - df_matched_guides_lr= df_matched_guides_lr.append(list_match) - if list_mutated.shape[0]>0: - df_mutated_guides_lr= df_mutated_guides_lr.append(list_mutated) + 
#st.write('Selected Reference Guides for **Set A**') + #tbl_disp(dft_a,'All','ReferenceGuides',0) + st.write('**Important:** If a guides is **not** in **found, mutated and not_found list (such as GSTT1), then it is found in Alternative Loci and Removed**') + if dft_resa.shape[0]>0: + st.write('Matched to '+ref_sel+' Reference Guides for **Set A**') + tbl_disp(dft_resa,'select_genes','SetA_GRCh38',3) + elif dft_res_muta.shape[0]>0: + st.write('None of the guides Matched, So reporting **Mutated to** '+ref_sel+' Reference Guides for **Set A**') + st.markdown(caution1,unsafe_allow_html=True) + tbl_disp(dft_res_muta,'select_genes','SetA_Mutated_GRCh38',4) + if dft_notfounda.shape[0]>0: + st.write('**SetA Guides Not Found in '+ref_sel+' (None of the guides are Matched/Mutated)**') + #tbl_disp(dft_notfound,'select_genes','SetA_Notfound_GRCh38') + st.table(dft_notfounda) + + #ListBRes = st.checkbox('Results For SetB',key=40) + if List_Selected=='ListB': # and not isinstance(get_table, type(None)):#get_table!=None: + ref_list= listB + st.write('**Please select Guides From Table Below to processes from ListB**') + with st.form(key='columns_in_form_listsA'): + c2, c3= st.columns([100,2])#([10,10]) + with c2: + get_table=tbl_disp(reflistB_concatenated,variant,'ref_guides',2,0) + Show_ListResults=st.form_submit_button(label = 'Show ListB Results') + if not isinstance(get_table, type(None)): + if ref_sel=='GRCh38': + + list_found=listB_found_ref + list_notfound=listB_notfound_ref + else: + + list_found=listB_found_lr + list_notfound=listB_notfound_lr - if dft_lr_resc.shape[0]>0: - st.write('Matched to **CHM13** Reference Guides for **Set C**') - tbl_disp(dft_lr_resc,'select_genes','SetC_CHM13',19) - elif dft_lr_res_mutc.shape[0]>0: - st.write('Mutated to **CHM13** Reference Guides for **Set C**') - st.markdown(caution1,unsafe_allow_html=True) - tbl_disp(dft_lr_res_mutc,'select_genes','SetC_Mutated_CHM13',20) - if dft_lr_notfoundc.shape[0]>0: - st.write('**SetC Guides Not Found 
in CHM13**') - st.table(dft_lr_notfoundc) - #NOW MERGE FROM GRCh38 and LR - merged_mutated_set=pd.merge(df_mutated_guides_ref,df_mutated_guides_lr, how="outer",on=["gene","ref_guide","chr"],suffixes=["_GRCh38",'_LR']) - merged_mutated_set = merged_mutated_set[['gene','ref_guide','chr','position_GRCh38','position_LR','strand_GRCh38','strand_LR','mutated_guide_GRCh38','mutated_guide_LR','num_mismatch_GRCh38','num_mismatch_LR']] - merged_match_set=pd.merge(df_matched_guides_ref,df_matched_guides_lr, how="outer",on=["gene","ref_guide","chr"],suffixes=["_GRCh38",'_LR']) - merged_match_set = merged_match_set[['gene','ref_guide','chr','position_GRCh38','position_LR','strand_GRCh38','strand_LR','mutated_guide_GRCh38','mutated_guide_LR','num_mismatch_GRCh38','num_mismatch_LR']] - if merged_match_set.shape[0]>0: - #st.write('**Matched** Guides for **Set C** (*Each guide sequence has a trailing NGG*)') - st.write('**Matched** Guides for **Set C** to both **GRCh38 and CHM13 references** (*Each guide sequence has a trailing NGG* and **leading G even if it is a missmatch**)') - tbl_disp(merged_match_set,'select_genes','SetC_Matched_GRCh38_CHM13',21,0) - - #st.table(merged_match_seta) - elif merged_mutated_set.shape[0]>0: - #st.write('**Missmatched** Guides **Set C** (*Each guide sequence has a trailing NGG*)') - st.write('**Mutated** Guides for **Set C** to both **GRCh38 and CHM13 references** (*Each guide sequence has a trailing NGG* and **leading G even if it is a missmatch**)') - #st.markdown(caution1,unsafe_allow_html=True) - tbl_disp(merged_mutated_set,'select_genes','SetC_Mutated_GRCh38_CHM13',22,0) + #variant_set=get_table[['gene']] + variant_set=get_table['sgID_AB'] + dft_b = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) + dft_resb=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) + dft_res_mutb=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 
'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) + dft_notfoundb=pd.DataFrame(columns=['gene','ref_guide']) + df_matched_guides_ref = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) + df_mutated_guides_ref = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) + #CHECK AGAINST THE SELECTED REFERENCE (ref_sel) + for i in range(variant_set.shape[0]): + #ref_listB=listB[listB['gene']==variant_set.iloc[i]['gene']][['guide_type','protospacer_A','protospacer_B','sgID_AB']] + ref_listB=ref_list[ref_list['sgID_AB']==variant_set.iloc[i]][['guide_type','protospacer_A','protospacer_B','sgID_AB']] + ref_listB =ref_listB[['sgID_AB','guide_type','protospacer_A','protospacer_B']] + + #ref_listB.columns=['gene','guide_type','protospacer_A','protospacer_B'] + res,res_mut,res_notfound,list_match,list_mutated,gflgb1=get_lists(ref_listB,list_found,list_notfound) + dft_b=dft_b.append(ref_listB) + if res.shape[0]>0: + dft_resb=dft_resb.append(res) + if res_mut.shape[0]>0: + dft_res_mutb=dft_res_mutb.append(res_mut) + if res_notfound.shape[0]>0: + dft_notfoundb= dft_notfoundb.append(res_notfound) + if list_match.shape[0]>0: + df_matched_guides_ref= df_matched_guides_ref.append(list_match) + if list_mutated.shape[0]>0: + df_mutated_guides_ref= df_mutated_guides_ref.append(list_mutated) + + #st.write('Selected Reference Guides for **Set B**') + #tbl_disp(dft_b,'All','ReferenceGuides',0) + st.write('**Important:** If a guides is **not** in **found, mutated and not_found list (such as GSTT1), then it is found in Alternative Loci and Removed**') + if dft_resb.shape[0]>0: + st.write('Matched to '+ref_sel+' Reference Guides for **Set B**') + tbl_disp(dft_resb,'select_genes','SetB_GRCh38',10) + elif dft_res_mutb.shape[0]>0: + st.write('None of the guides Matched, So reporting **Mutated to** '+ref_sel+' Reference Guides for **Set B**') + 
st.markdown(caution1,unsafe_allow_html=True) + tbl_disp(dft_res_mutb,'select_genes','SetB_Mutated_GRCh38',11) + if dft_notfoundb.shape[0]>0: + st.write('**SetB Guides Not Found in '+ref_sel+' (None of the guides are Matched/Mutated)**') + #tbl_disp(dft_notfound,'select_genes','SetA_Notfound_GRCh38') + st.table(dft_notfoundb) + + + + #ListCRes = st.checkbox('Results For SetC',key=50) + if List_Selected=='ListC': # and not isinstance(get_table, type(None)):#get_table!=None: + ref_list= listC - # if ListARes and ListBRes and ListCRes: - # Order_List = st.checkbox('Generate Order Ready List',key=100) - # if Order_List: - # if dft_lr_resa.shape[0]>0: - # st.table(dft_lr_resa) + st.write('**Please select Guides From Table Below to processes from ListC**') + with st.form(key='columns_in_form_listsA'): + c2, c3= st.columns([100,2])#([10,10]) + with c2: + get_table=tbl_disp(reflistC_concatenated,variant,'ref_guides',2,0) + Show_ListResults=st.form_submit_button(label = 'Show ListC Results') + if not isinstance(get_table, type(None)): + if ref_sel=='GRCh38': + + list_found=listC_found_ref + list_notfound=listC_notfound_ref + else: + + list_found=listC_found_lr + list_notfound=listC_notfound_lr + variant_set=get_table['sgID_AB'] + dft_c = pd.DataFrame(columns=['gene','guide_type','protospacer_A','protospacer_B']) + dft_resc=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) + dft_res_mutc=pd.DataFrame(columns=['sgID_1', 'sgRNA_1', 'chr_sgRNA_1', 'position_sgRNA_1', 'sgID_2', 'sgRNA_2', 'chr_sgRNA_2', 'position_sgRNA_2', 'sgID_1_2']) + dft_notfoundc=pd.DataFrame(columns=['gene','ref_guide']) + df_matched_guides_ref = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) + df_mutated_guides_ref = pd.DataFrame(columns=['gene','ref_guide', 'chr', 'position', 'mutated_guide', 'strand', 'num_mismatch']) + #CHECK AGAINST THE SELECTED REFERENCE (ref_sel) + for i in 
range(variant_set.shape[0]): + #ref_listC=listC[listC['gene']==variant_set.iloc[i]['gene']][['guide_type','protospacer_A','protospacer_B','sgID_AB']] + ref_listC=ref_list[ref_list['sgID_AB']==variant_set.iloc[i]][['guide_type','protospacer_A','protospacer_B','sgID_AB']] + ref_listC =ref_listC[['sgID_AB','guide_type','protospacer_A','protospacer_B']] + + #ref_listC.columns=['gene','guide_type','protospacer_A','protospacer_B'] + res,res_mut,res_notfound,list_match,list_mutated,gflgc1=get_lists(ref_listC,list_found,list_notfound) + dft_c=dft_c.append(ref_listC) + if res.shape[0]>0: + dft_resc=dft_resc.append(res) + if res_mut.shape[0]>0: + dft_res_mutc=dft_res_mutc.append(res_mut) + if res_notfound.shape[0]>0: + dft_notfoundc= dft_notfoundc.append(res_notfound) + if list_match.shape[0]>0: + df_matched_guides_ref= df_matched_guides_ref.append(list_match) + if list_mutated.shape[0]>0: + df_mutated_guides_ref= df_mutated_guides_ref.append(list_mutated) - - elif ListCRes: - st.write("**Please select genes from the above table to begin**") + #st.write('Selected Reference Guides for **Set C**') + #tbl_disp(dft_c,'All','ReferenceGuides',0) + st.write('**Important:** If a guides is **not** in **found, mutated and not_found list (such as GSTT1), then it is found in Alternative Loci and Removed**') + if dft_resc.shape[0]>0: + st.write('Matched to '+ref_sel+' Reference Guides for **Set C**') + tbl_disp(dft_resc,'select_genes','SetC_GRCh38',17) + elif dft_res_mutc.shape[0]>0: + st.write('None of the guides Matched, So reporting **Mutated to** '+ref_sel+' Reference Guides for **Set C**') + st.markdown(caution1,unsafe_allow_html=True) + tbl_disp(dft_res_mutc,'select_genes','SetC_Mutated_GRCh38',18) + if dft_notfoundc.shape[0]>0: + st.write('**SetC Guides Not Found in '+ref_sel+' (None of the guides are Matched/Mutated)**') + #tbl_disp(dft_notfound,'select_genes','SetA_Notfound_GRCh38') + st.table(dft_notfoundc) + + elif Calc=='Not_Found': ListAResNotFound = st.checkbox('Results For 
SetA',key=30) if ListAResNotFound and listA_notfound_lr.shape[0]>0: @@ -1092,4 +1190,93 @@ elif Calc=='Not_Found': tbl_disp(non_targeting_guides_c,'all_not_found','SetA_KOLF2.1',23,0) else: - st.write("**Place Holder for All**") \ No newline at end of file + guidetype = st.radio("Select Guide Type",('Non-targetting','Regular'),horizontal=True) + if guidetype=='Non-targetting': + with st.form(key='columns_in_form_non'): + c2, c3 = st.columns([5,5])#([10,10]) + with c2: + guides_List = st.selectbox('Please select list', + ('ListA','ListB','ListC')) + with c3: + ref_type_sel_non = st.radio("Select Reference", + ('CHM13','GRCh38'), + horizontal=True) + Show_Results_non=st.form_submit_button(label = 'Non-targeting Guides Results') + + if Show_Results_non and guides_List=='ListA': + for_list=listA + if ref_type_sel_non=='GRCh38': + f_list=listA_found_ref + nf_list=listA_notfound_ref + else: + f_list=listA_found_lr + nf_list=listA_notfound_lr + + st.write('Total: '+str(len(non_targeting_lista))+' Non-targeting Guide pairs and '+str(2*len(non_targeting_lista))+' single guides in ListA') + + process_all_guides(pd.DataFrame(pd.Series(non_targeting_lista,name='gene')),for_list,f_list,nf_list) + if Show_Results_non and guides_List=='ListB': + for_list=listB + if ref_type_sel_non=='GRCh38': + f_list=listB_found_ref + nf_list=listB_notfound_ref + else: + f_list=listB_found_lr + nf_list=listB_notfound_lr + + st.write('Total: '+str(len(non_targeting_listb))+' Non-targeting Guide pairs and '+str(2*len(non_targeting_listb))+' single guides in ListB') + process_all_guides(pd.DataFrame(pd.Series(non_targeting_listb,name='gene')),for_list,f_list,nf_list) + if Show_Results_non and guides_List=='ListC': + for_list=listC + if ref_type_sel_non=='GRCh38': + f_list=listC_found_ref + nf_list=listC_notfound_ref + else: + f_list=listC_found_lr + nf_list=listC_notfound_lr + + st.write('Total: '+str(len(non_targeting_listc))+' Non-targeting Guide pairs and '+str(2*len(non_targeting_listc))+' 
single guides in ListC') + process_all_guides(pd.DataFrame(pd.Series(non_targeting_listc,name='gene')),for_list,f_list,nf_list) + + elif guidetype=='Regular': + st.write('**Maximum End Index=** '+str(regular_lista.shape[0])) + with st.form(key='columns_in_form_regular'): + c2, c3, c4 = st.columns([5,5,5])#([10,10]) + with c2: + set_start = int(st.text_input('Start Index', '0')) + with c3: + set_end = int(st.text_input('End Index', str(regular_lista.shape[0]))) + with c4: + ref_type_sel = st.radio("Select Reference", + ('CHM13','GRCh38'), + horizontal=True) + + Show_Results=st.form_submit_button(label = 'Show Regular Guides Results') + if Show_Results:# and guides_List=="ListA": + + regular_listc=regular_listc[set_start:set_end] + regular_listb=regular_listb.iloc[set_start:set_end] + regular_lista=regular_lista.iloc[set_start:set_end] + if ref_type_sel=='GRCh38': + + list_founda=listA_found_ref + list_notfounda=listA_notfound_ref + list_foundb=listB_found_ref + list_notfoundb=listB_notfound_ref + list_foundc=listC_found_ref + list_notfoundc=listC_notfound_ref + + else: + list_founda=listA_found_lr + list_notfounda=listA_notfound_lr + list_foundb=listB_found_lr + list_notfoundb=listB_notfound_lr + list_foundc=listC_found_lr + list_notfoundc=listC_notfound_lr + + dupesq=list(duplicates(listA['gene'])) + non_targetinga=variantsa1[pd.Series(variantsa1).str.contains('non-targeting')] + regulara=variantsa1[~pd.Series(variantsa1).str.contains('non-targeting')] + st.write('Total: '+str(len(regulara))+' Regular Guide (unique genes only) **Excluding:** '+str(len(non_targetinga))+' Non-targeting pairs **and** '+str(len(dupesq))+' Repeated entries (same gene names)') + order_ready_tbl_CHM13(regular_lista,regular_listb,regular_listc,list_founda,list_notfounda,list_foundb,list_notfoundb,list_foundc,list_notfoundc,ref_type_sel) + \ No newline at end of file