Chri12345 committed on
Commit
e8c4dfc
·
verified ·
1 Parent(s): 8783826

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -47
app.py CHANGED
@@ -1,48 +1,44 @@
1
  import os
2
  from sentence_transformers import SentenceTransformer, util
3
  from datasets import load_dataset
4
- from datasets import load_dataset
5
  from transformers import pipeline
6
  import streamlit as st
7
 
8
-
9
-
10
- dataset_id = "sentence-transformers/natural-questions"
11
- dataset_file = load_dataset(dataset_id, split="train")
12
-
13
- # Use the allenai-specter model with SentenceTransformers
14
- model = SentenceTransformer('allenai-specter')
15
-
16
- # Prepare paper texts by combining query and answer fields
17
- paper_texts = [
18
- record['query'] + '[SEP]' + record['answer'] for record in dataset_file.select(range(32))
19
- ]
20
-
21
- # Compute embeddings for all paper texts
22
- corpus_embeddings = model.encode(paper_texts, convert_to_tensor=True, show_progress_bar=True)
23
-
24
- # Function to search for answers given a query
25
- def search_papers(query):
26
- # Encode the query
27
- query_embedding = model.encode(query, convert_to_tensor=True)
28
-
29
- # Perform semantic search
30
- search_hits = util.semantic_search(query_embedding, corpus_embeddings)
31
- search_hits = search_hits[0] # Get the hits for the first query
32
-
33
- print("\n\nQuery:", query)
34
- print("Most similar answers:")
35
- for hit in search_hits[:5]: # Limit to top 5 results for clarity
36
- related_text = dataset_file[int(hit['corpus_id'])] # Access related record
37
- print("{:.2f}\tAnswer: {}".format(
38
- hit['score'], related_text['answer']
39
- ))
40
-
41
-
42
- # Summarization pipeline
43
- summarizer = pipeline("summarization")
44
-
45
- # Collect the relevant answers from the search function
46
  def search_papers_and_summarize(query, max_summary_length=45):
47
  # Encode the query
48
  query_embedding = model.encode(query, convert_to_tensor=True)
@@ -62,11 +58,12 @@ def search_papers_and_summarize(query, max_summary_length=45):
62
 
63
  # Summarize the combined text
64
  summary = summarizer(combined_text, max_length=max_summary_length, clean_up_tokenization_spaces=True)
65
- print("Summary:")
66
- print(summary[0]['summary_text'])
67
-
68
-
69
- title = st.text_input("Ask a question", "What is Wimpy Kid")
70
- new_preds = search_papers_and_summarize(title)
71
- st.write("The Answer is", new_preds)
72
-
 
 
1
  import os
2
  from sentence_transformers import SentenceTransformer, util
3
  from datasets import load_dataset
 
4
  from transformers import pipeline
5
  import streamlit as st
6
 
7
# Dataset loading is cached so Streamlit reruns reuse the downloaded split.
@st.cache_data
def load_data(dataset_id="sentence-transformers/natural-questions", split="train"):
    """Return the requested split of a Hugging Face dataset (cached)."""
    return load_dataset(dataset_id, split=split)
11
+
12
# The embedding model is a heavyweight resource; cache it once per session.
@st.cache_resource
def load_model():
    """Return a cached 'allenai-specter' SentenceTransformer instance."""
    return SentenceTransformer('allenai-specter')
16
+
17
# Cache corpus embedding generation. The model and dataset parameters are
# prefixed with "_" so Streamlit skips hashing them: st.cache_data hashes
# every argument to build the cache key, and SentenceTransformer / Dataset
# objects are not hashable, which would raise UnhashableParamError at runtime.
# The effective cache key is therefore sample_size alone.
@st.cache_data
def generate_embeddings(_model, _dataset_file, sample_size=32):
    """Build '[SEP]'-joined query/answer texts for the first sample_size
    records and return (paper_texts, embeddings_tensor).

    NOTE(review): because _dataset_file is excluded from the cache key,
    the cache will not invalidate if a different dataset is passed in.
    """
    # Prepare paper texts by combining query and answer fields
    paper_texts = [
        record['query'] + '[SEP]' + record['answer']
        for record in _dataset_file.select(range(sample_size))
    ]
    # Compute embeddings for all paper texts in one batch
    return paper_texts, _model.encode(paper_texts, convert_to_tensor=True, show_progress_bar=True)
26
+
27
# The summarization pipeline downloads a model on first use; cache the
# constructed pipeline object across Streamlit reruns.
@st.cache_resource
def load_summarizer():
    """Return a cached Hugging Face summarization pipeline."""
    return pipeline("summarization")
31
+
32
# Streamlit app entry: page title plus one-time (cached) resource setup.
st.title("Semantic Search with Summarization")

# Load resources — each helper is cached, so reruns of this script are cheap.
dataset_file = load_data()
model = load_model()
# paper_texts: the '[SEP]'-joined corpus strings; corpus_embeddings: their
# encoded tensor, used below by the semantic-search/summarize flow.
paper_texts, corpus_embeddings = generate_embeddings(model, dataset_file)
summarizer = load_summarizer()
41
+ # Function to search and summarize
 
 
 
42
  def search_papers_and_summarize(query, max_summary_length=45):
43
  # Encode the query
44
  query_embedding = model.encode(query, convert_to_tensor=True)
 
58
 
59
  # Summarize the combined text
60
  summary = summarizer(combined_text, max_length=max_summary_length, clean_up_tokenization_spaces=True)
61
+ return summary[0]['summary_text']
62
+
63
# Streamlit input: search + summarize only when the user has typed something
# (an empty text_input is falsy, so nothing runs on the initial render).
query = st.text_input("Enter your query:", "")
if query:
    st.write("Searching for relevant answers...")
    summary = search_papers_and_summarize(query)
    st.subheader("Summary")
    st.write(summary)