Chri12345 committed on
Commit 5c363c5 · verified · 1 Parent(s): 3ff48bc

Upload folder using huggingface_hub

Files changed (3)
  1. README.md +0 -12
  2. app.py +72 -0
  3. requirements.txt +0 -0
README.md CHANGED
@@ -1,12 +0,0 @@
- ---
- title: Best Guess
- emoji: 💻
- colorFrom: gray
- colorTo: yellow
- sdk: streamlit
- sdk_version: 1.40.1
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,72 @@
+ import os
+ from sentence_transformers import SentenceTransformer, util
+ from datasets import load_dataset
+ from transformers import pipeline
+ import streamlit as st
+
+
+ dataset_id = "sentence-transformers/natural-questions"
+ dataset_file = load_dataset(dataset_id, split="train")
+
+ # Use the allenai-specter model with SentenceTransformers
+ model = SentenceTransformer('allenai-specter')
+
+ # Prepare paper texts by combining query and answer fields
+ # (SPECTER-style "title[SEP]abstract" input)
+ paper_texts = [
+     record['query'] + '[SEP]' + record['answer'] for record in dataset_file.select(range(32))
+ ]
+
+ # Compute embeddings for all paper texts
+ corpus_embeddings = model.encode(paper_texts, convert_to_tensor=True, show_progress_bar=True)
+
+ # Function to search for answers given a query
+ def search_papers(query):
+     # Encode the query
+     query_embedding = model.encode(query, convert_to_tensor=True)
+
+     # Perform semantic search
+     search_hits = util.semantic_search(query_embedding, corpus_embeddings)
+     search_hits = search_hits[0]  # Get the hits for the first query
+
+     print("\n\nQuery:", query)
+     print("Most similar answers:")
+     for hit in search_hits[:5]:  # Limit to top 5 results for clarity
+         related_text = dataset_file[int(hit['corpus_id'])]  # Access related record
+         print("{:.2f}\tAnswer: {}".format(
+             hit['score'], related_text['answer']
+         ))
+
+
+ # Summarization pipeline
+ summarizer = pipeline("summarization")
+
+ # Search for the relevant answers and summarize them
+ def search_papers_and_summarize(query, max_summary_length=45):
+     # Encode the query
+     query_embedding = model.encode(query, convert_to_tensor=True)
+
+     # Perform semantic search
+     search_hits = util.semantic_search(query_embedding, corpus_embeddings)
+     search_hits = search_hits[0]  # Get the hits for the first query
+
+     # Collect answers from top hits
+     answers = []
+     for hit in search_hits[:5]:  # Limit to top 5 results
+         related_text = dataset_file[int(hit['corpus_id'])]
+         answers.append(related_text['answer'])
+
+     # Combine answers into a single text for summarization
+     combined_text = " ".join(answers)
+
+     # Summarize the combined text and return the summary text
+     summary = summarizer(combined_text, max_length=max_summary_length, clean_up_tokenization_spaces=True)
+     return summary[0]['summary_text']
+
+
+ title = st.text_input("Ask a question", "What is Wimpy Kid")
+ new_preds = search_papers_and_summarize(title)
+ st.write("The Answer is", new_preds)
+
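Since Streamlit re-executes the whole script on every widget interaction, the dataset download, model load, and corpus encoding in app.py above are repeated on each rerun. A minimal sketch of how those heavyweight steps could be cached with st.cache_resource (available in the Space's sdk_version 1.40.1; the helper name load_search_assets is illustrative, not part of this commit):

    import streamlit as st
    from datasets import load_dataset
    from sentence_transformers import SentenceTransformer

    @st.cache_resource  # runs once per server process, reused across reruns
    def load_search_assets():
        # Hypothetical helper mirroring the setup in app.py above
        dataset = load_dataset("sentence-transformers/natural-questions", split="train")
        model = SentenceTransformer("allenai-specter")
        texts = [r["query"] + "[SEP]" + r["answer"] for r in dataset.select(range(32))]
        embeddings = model.encode(texts, convert_to_tensor=True)
        return dataset, model, embeddings

    dataset_file, model, corpus_embeddings = load_search_assets()

With this pattern, only the text input and search/summarize call run on each rerun; the cached assets are fetched from memory.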
requirements.txt ADDED
File without changes
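Note that requirements.txt is committed empty, while app.py imports sentence_transformers, datasets, and transformers. For those imports to resolve when the Space builds, the file would presumably need entries along these lines (unpinned here; the exact set and versions are an assumption, not part of the commit):

    sentence-transformers
    datasets
    transformers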