kiyer committed on
Commit
2ddd003
·
verified ·
1 Parent(s): 58d5580

added embedding plot

Browse files
Files changed (2) hide show
  1. app.py +40 -3
  2. pfdr_arxiv_cutoff_distances.npy +3 -0
app.py CHANGED
@@ -276,12 +276,15 @@ class RetrievalSystem():
276
  def return_formatted_df(self, top_results, small_df):
277
 
278
  df = pd.DataFrame(small_df)
279
- df = df.drop(columns=['embed','umap_x','umap_y','cite_bibcodes','ref_bibcodes'])
280
  links = ['https://ui.adsabs.harvard.edu/abs/'+i+'/abstract' for i in small_df['bibcode']]
281
  scores = [top_results[i] for i in top_results]
 
282
  df.insert(1,'ADS Link',links,True)
283
  df.insert(2,'Relevance',scores,True)
284
- df = df[['ADS Link','Relevance','date','cites','title','authors','abstract','keywords','ads_id']]
 
 
285
  return df
286
 
287
  # @st.cache_resource
@@ -547,7 +550,39 @@ def evaluate_overall_consensus(query: str, abstracts: List[str]) -> OverallConse
547
 
548
  return response
549
 
 
550
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
551
 
552
 
553
  # ---------------------------------------
@@ -599,7 +634,6 @@ if st.session_state.get('runpfdr'):
599
  question_type_gen = question_type_gen.replace('\n',' \n')
600
  st.markdown(question_type_gen)
601
 
602
- with col2:
603
  with st.spinner("Evaluating abstract consensus"):
604
  with st.expander("Abstract consensus", expanded=True):
605
  consensus_answer = evaluate_overall_consensus(query, [papers_df['abstract'][i] for i in range(len(papers_df))])
@@ -607,6 +641,9 @@ if st.session_state.get('runpfdr'):
607
  st.markdown(consensus_answer.explanation)
608
  st.markdown('Relevance of retrieved papers to answer: %.1f' %consensus_answer.relevance_score)
609
 
 
 
 
610
  session_vars = {
611
  "runtime": "pathfinder_v1_online",
612
  "query": query,
 
276
  def return_formatted_df(self, top_results, small_df):
277
 
278
  df = pd.DataFrame(small_df)
279
+ df = df.drop(columns=['umap_x','umap_y','cite_bibcodes','ref_bibcodes'])
280
  links = ['https://ui.adsabs.harvard.edu/abs/'+i+'/abstract' for i in small_df['bibcode']]
281
  scores = [top_results[i] for i in top_results]
282
+ indices = [i for i in top_results]
283
  df.insert(1,'ADS Link',links,True)
284
  df.insert(2,'Relevance',scores,True)
285
+ df.insert(3,'Indices',indices,True)
286
+ df = df[['ADS Link','Relevance','date','cites','title','authors','abstract','keywords','ads_id','Indices','embed']]
287
+ df.index += 1
288
  return df
289
 
290
  # @st.cache_resource
 
550
 
551
  return response
552
 
553
def calc_outlier_flag(papers_df, top_k, cutoff_adjust = 0.1):
    """Flag papers whose embedding lies far from the centroid of the set.

    Args:
        papers_df: DataFrame with an 'embed' column holding one embedding
            vector per row.
        top_k: Selects the cutoff: entry ``top_k - 1`` of the table stored
            in 'pfdr_arxiv_cutoff_distances.npy' is used. Values outside
            the table are clamped to its first/last entry.
        cutoff_adjust: Amount subtracted from every tabulated cutoff
            (larger values flag more points).

    Returns:
        Boolean numpy array with one entry per row of papers_df; True where
        the Euclidean distance to the centroid exceeds the cutoff. Empty
        when papers_df has no rows.
    """
    pts = np.array(papers_df['embed'].tolist())
    if pts.size == 0:
        # Nothing to measure; avoids a NaN centroid and numpy warnings.
        return np.zeros(0, dtype=bool)

    cut_dist = np.load('pfdr_arxiv_cutoff_distances.npy') - cutoff_adjust

    # Clamp so that top_k=0 does not silently wrap around to the last entry
    # and an oversized top_k falls back to the last tabulated cutoff instead
    # of raising IndexError.
    row = min(max(int(top_k), 1), len(cut_dist)) - 1

    centroid = pts.mean(axis=0)
    dists = np.linalg.norm(pts - centroid, axis=1)
    return dists > cut_dist[row]
562
+
563
def make_embedding_plot(papers_df, consensus_answer):
    """Render the retrieved papers on top of the corpus UMAP scatter.

    The whole corpus is drawn as faint black dots; the retrieved papers are
    drawn as blue markers with a white halo, with outliers (as reported by
    calc_outlier_flag) smaller and more transparent. The figure is handed to
    Streamlit and then closed.

    Args:
        papers_df: DataFrame with an 'Indices' column (positions into the
            corpus 'umap_x'/'umap_y' arrays) and the 'embed' column needed
            by calc_outlier_flag.
        consensus_answer: Object exposing ``consensus`` (str) and
            ``relevance_score`` (number), both shown in the title.
    """
    plt_indices = np.array(papers_df['Indices'].tolist())

    # Load the corpus once per session and reuse it afterwards.
    if 'arxiv_corpus' not in st.session_state:
        st.session_state.arxiv_corpus = load_arxiv_corpus()

    xax = np.array(st.session_state.arxiv_corpus['umap_x'])
    yax = np.array(st.session_state.arxiv_corpus['umap_y'])

    # NOTE(review): top_k is not a parameter; it is read from module scope
    # and must be defined before this function is called.
    outlier_flag = calc_outlier_flag(papers_df, top_k, cutoff_adjust=0.25)
    alphas = np.ones((len(plt_indices),)) * 0.9
    alphas[outlier_flag] = 0.5

    # The title is parsed as mathtext, so a stray '$' typed by the user
    # would break rendering; escape it to a literal dollar sign.
    safe_query = st.session_state.query.replace('$', r'\$')

    fig = plt.figure(figsize=(9,12))
    try:
        plt.scatter(xax, yax, s=1, alpha=0.01, c='k')
        # White halo first, then the coloured marker on top of it.
        plt.scatter(xax[plt_indices], yax[plt_indices], s=300*alphas**2, alpha=alphas, c='w')
        plt.scatter(xax[plt_indices], yax[plt_indices], s=100*alphas**2, alpha=alphas, c='dodgerblue')
        plt.axis([0,20,-4.2,18])
        plt.axis('off')
        plt.title('Query: '+safe_query+'\n'+r'N$_{\rm outliers}: %.0f/%.0f$, Consensus: ' %(np.sum(outlier_flag), len(outlier_flag)) + consensus_answer.consensus + ' (%.1f)' %consensus_answer.relevance_score)
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this each rerun of the app would leak one figure.
        plt.close(fig)
586
 
587
 
588
  # ---------------------------------------
 
634
  question_type_gen = question_type_gen.replace('\n',' \n')
635
  st.markdown(question_type_gen)
636
 
 
637
  with st.spinner("Evaluating abstract consensus"):
638
  with st.expander("Abstract consensus", expanded=True):
639
  consensus_answer = evaluate_overall_consensus(query, [papers_df['abstract'][i] for i in range(len(papers_df))])
 
641
  st.markdown(consensus_answer.explanation)
642
  st.markdown('Relevance of retrieved papers to answer: %.1f' %consensus_answer.relevance_score)
643
 
644
+ with col2:
645
+ make_embedding_plot(papers_df, consensus_answer)
646
+
647
  session_vars = {
648
  "runtime": "pathfinder_v1_online",
649
  "query": query,
pfdr_arxiv_cutoff_distances.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64edda3cf9c3cde63a6dc818f0e6df573dc1ce32217acac1e2bcdfe7f3a4e0e3
3
+ size 928