Commit: Upload rerank.py (fixed rerank to use dates)

rerank.py (CHANGED)
@@ -100,6 +100,7 @@ def crossencoder_rerank_answer(csv_path: str, question: str, top_n=4) -> list:
     contents = articles['content'].tolist()
     uuids = articles['uuid'].tolist()
     titles = articles['title'].tolist()
+    published_dates = articles['published_date'].tolist()
 
     # biencoder retrieval does not have domain
     if 'domain' not in articles:
@@ -109,7 +110,7 @@ def crossencoder_rerank_answer(csv_path: str, question: str, top_n=4) -> list:
 
     cross_inp = [[question, content] for content in contents]
     cross_scores = cross_encoder.predict(cross_inp)
-    scores_sentences = list(zip(cross_scores, contents, uuids, titles, domain))
+    scores_sentences = list(zip(cross_scores, contents, uuids, titles, domain, published_dates))
     scores_sentences = sorted(scores_sentences, key=lambda x: x[0], reverse=True)
 
     out_values = scores_sentences[:top_n]
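For reference, a minimal sketch of what crossencoder_rerank_answer might look like after this change. Only the lines inside the hunks above are confirmed; the pandas CSV loading, the CrossEncoder checkpoint, the else branch for domain, and the return statement fall outside the diff and are assumptions.

import pandas as pd
from sentence_transformers import CrossEncoder

# Assumed model; the checkpoint actually used by rerank.py is not shown in the diff.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def crossencoder_rerank_answer(csv_path: str, question: str, top_n=4) -> list:
    articles = pd.read_csv(csv_path)  # assumed: retrieved articles arrive as a CSV
    contents = articles['content'].tolist()
    uuids = articles['uuid'].tolist()
    titles = articles['title'].tolist()
    published_dates = articles['published_date'].tolist()

    # biencoder retrieval does not have domain
    if 'domain' not in articles:
        domain = [""] * len(contents)
    else:
        domain = articles['domain'].tolist()  # assumed else branch, outside the hunk

    # Score every (question, content) pair with the cross-encoder.
    cross_inp = [[question, content] for content in contents]
    cross_scores = cross_encoder.predict(cross_inp)

    # The date rides along as the last tuple field; sorting still keys on the score.
    scores_sentences = list(zip(cross_scores, contents, uuids, titles, domain, published_dates))
    scores_sentences = sorted(scores_sentences, key=lambda x: x[0], reverse=True)

    out_values = scores_sentences[:top_n]
    return out_values  # assumed: the top_n scored tuples are returned

Appending published_dates as the final zip field leaves the sort key (x[0], the cross-encoder score) untouched, so ranking behavior is unchanged; callers simply receive one extra tuple element.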
@@ -132,7 +133,7 @@ def crossencoder_rerank_sentencewise(csv_path: str, question: str, top_n=10) -> list:
     contents = articles['content'].tolist()
     uuids = articles['uuid'].tolist()
     titles = articles['title'].tolist()
-
+    published_dates = articles['published_date'].tolist()
     if 'domain' not in articles:
         domain = [""] * len(contents)
     else:
@@ -142,16 +143,17 @@ def crossencoder_rerank_sentencewise(csv_path: str, question: str, top_n=10) -> list:
     new_uuids = []
     new_titles = []
     new_domains = []
+    new_published_dates = []
     for idx in range(len(contents)):
         sents = sent_tokenize(contents[idx])
         sentences.extend(sents)
         new_uuids.extend([uuids[idx]] * len(sents))
         new_titles.extend([titles[idx]] * len(sents))
         new_domains.extend([domain[idx]] * len(sents))
-
+        new_published_dates.extend([published_dates[idx]] * len(sents))
     cross_inp = [[question, sent] for sent in sentences]
     cross_scores = cross_encoder.predict(cross_inp)
-    scores_sentences = list(zip(cross_scores, sentences, new_uuids, new_titles, new_domains))
+    scores_sentences = list(zip(cross_scores, sentences, new_uuids, new_titles, new_domains, new_published_dates))
     scores_sentences = sorted(scores_sentences, key=lambda x: x[0], reverse=True)
 
     out_values = scores_sentences[:top_n]
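The sentence-wise variant splits each article into sentences and fans every article-level field out to each sentence, so the published date has to be replicated per sentence as well. A sketch under the same assumptions as above (sent_tokenize is presumed to be NLTK's, and the sentences accumulator is initialized outside the hunks shown):

import pandas as pd
from nltk.tokenize import sent_tokenize  # may need a one-time nltk.download('punkt')
from sentence_transformers import CrossEncoder

cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # assumed model

def crossencoder_rerank_sentencewise(csv_path: str, question: str, top_n=10) -> list:
    articles = pd.read_csv(csv_path)  # assumed
    contents = articles['content'].tolist()
    uuids = articles['uuid'].tolist()
    titles = articles['title'].tolist()
    published_dates = articles['published_date'].tolist()
    if 'domain' not in articles:
        domain = [""] * len(contents)
    else:
        domain = articles['domain'].tolist()  # assumed

    sentences = []  # assumed initialization, outside the hunks
    new_uuids = []
    new_titles = []
    new_domains = []
    new_published_dates = []
    for idx in range(len(contents)):
        # Each sentence inherits the uuid, title, domain, and date of its source article.
        sents = sent_tokenize(contents[idx])
        sentences.extend(sents)
        new_uuids.extend([uuids[idx]] * len(sents))
        new_titles.extend([titles[idx]] * len(sents))
        new_domains.extend([domain[idx]] * len(sents))
        new_published_dates.extend([published_dates[idx]] * len(sents))

    # Score every (question, sentence) pair and keep the best top_n.
    cross_inp = [[question, sent] for sent in sentences]
    cross_scores = cross_encoder.predict(cross_inp)
    scores_sentences = list(zip(cross_scores, sentences, new_uuids, new_titles, new_domains, new_published_dates))
    scores_sentences = sorted(scores_sentences, key=lambda x: x[0], reverse=True)

    out_values = scores_sentences[:top_n]
    return out_values  # assumed return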
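Hypothetical usage, just to show that callers now unpack six-tuples instead of five (the file name and question here are made up):

results = crossencoder_rerank_answer("retrieved_articles.csv", "example question", top_n=4)
for score, content, uuid, title, dom, published_date in results:
    print(f"{score:.3f} [{published_date}] {title}")

Any downstream code that unpacked the old five-element tuples needs the extra published_date field added, or it will raise a ValueError on unpacking.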