Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,48 +1,44 @@
|
|
1 |
import os
|
2 |
from sentence_transformers import SentenceTransformer, util
|
3 |
from datasets import load_dataset
|
4 |
-
from datasets import load_dataset
|
5 |
from transformers import pipeline
|
6 |
import streamlit as st
|
7 |
|
8 |
-
|
9 |
-
|
10 |
-
dataset_id
|
11 |
-
|
12 |
-
|
13 |
-
#
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
#
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
#
|
43 |
-
summarizer = pipeline("summarization")
|
44 |
-
|
45 |
-
# Collect the relevant answers from the search function
|
46 |
def search_papers_and_summarize(query, max_summary_length=45):
|
47 |
# Encode the query
|
48 |
query_embedding = model.encode(query, convert_to_tensor=True)
|
@@ -62,11 +58,12 @@ def search_papers_and_summarize(query, max_summary_length=45):
|
|
62 |
|
63 |
# Summarize the combined text
|
64 |
summary = summarizer(combined_text, max_length=max_summary_length, clean_up_tokenization_spaces=True)
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
|
|
|
1 |
import os
|
2 |
from sentence_transformers import SentenceTransformer, util
|
3 |
from datasets import load_dataset
|
|
|
4 |
from transformers import pipeline
|
5 |
import streamlit as st
|
6 |
|
7 |
+
# Dataset loading is cached so reruns of the Streamlit script don't refetch.
@st.cache_data
def load_data(dataset_id="sentence-transformers/natural-questions", split="train"):
    """Download and return the requested split of *dataset_id*.

    Cached by Streamlit: repeated script reruns reuse the first result.
    """
    dataset = load_dataset(dataset_id, split=split)
    return dataset
11 |
+
|
12 |
+
# The embedding model is a heavyweight resource: build it once per process.
@st.cache_resource
def load_model():
    """Return the cached SPECTER sentence-embedding model."""
    encoder = SentenceTransformer('allenai-specter')
    return encoder
16 |
+
|
17 |
+
# Cache corpus embedding generation
@st.cache_data
def generate_embeddings(_model, _dataset_file, sample_size=32):
    """Encode the first *sample_size* records into corpus embeddings.

    The model and dataset parameters are underscore-prefixed so Streamlit's
    cache skips hashing them: a SentenceTransformer and a datasets.Dataset
    are not hashable by st.cache_data and would raise UnhashableParamError
    on the first call otherwise. Both objects are themselves cached
    (st.cache_resource / st.cache_data), so excluding them from the cache
    key is safe.

    Args:
        _model: SentenceTransformer used to encode the texts (not hashed).
        _dataset_file: dataset with 'query' and 'answer' columns (not hashed).
        sample_size: number of leading records to embed.

    Returns:
        (paper_texts, corpus_embeddings): the '[SEP]'-joined strings and
        their tensor embeddings.
    """
    # Prepare paper texts by combining query and answer fields
    paper_texts = [
        record['query'] + '[SEP]' + record['answer']
        for record in _dataset_file.select(range(sample_size))
    ]
    # Compute embeddings for all paper texts
    return paper_texts, _model.encode(paper_texts, convert_to_tensor=True, show_progress_bar=True)
26 |
+
|
27 |
+
# The summarization pipeline downloads model weights; create it once.
@st.cache_resource
def load_summarizer():
    """Return the cached Hugging Face summarization pipeline."""
    summarization_pipeline = pipeline("summarization")
    return summarization_pipeline
31 |
+
|
32 |
+
# --- Streamlit app: page title and cached resource initialization ---
st.title("Semantic Search with Summarization")

# Every loader below is cached, so script reruns on user input are cheap.
dataset_file = load_data()
model = load_model()
paper_texts, corpus_embeddings = generate_embeddings(model, dataset_file)
summarizer = load_summarizer()
+
|
41 |
+
# Function to search and summarize
|
|
|
|
|
|
|
42 |
def search_papers_and_summarize(query, max_summary_length=45):
|
43 |
# Encode the query
|
44 |
query_embedding = model.encode(query, convert_to_tensor=True)
|
|
|
58 |
|
59 |
# Summarize the combined text
|
60 |
summary = summarizer(combined_text, max_length=max_summary_length, clean_up_tokenization_spaces=True)
|
61 |
+
return summary[0]['summary_text']
|
62 |
+
|
63 |
+
# --- Streamlit input: take a query, search, and show the summary ---
query = st.text_input("Enter your query:", "")
if query:
    st.write("Searching for relevant answers...")
    # Retrieve relevant records and condense them into a short summary.
    summary = search_papers_and_summarize(query)
    st.subheader("Summary")
    st.write(summary)