raannakasturi
committed on
Commit
•
9e075cc
1
Parent(s):
a171675
Upload 4 files
Browse files
TempSummary.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sumy.parsers.plaintext import PlaintextParser
|
2 |
+
from sumy.nlp.tokenizers import Tokenizer
|
3 |
+
from sumy.summarizers.text_rank import TextRankSummarizer
|
4 |
+
from sumy.summarizers.luhn import LuhnSummarizer
|
5 |
+
from sumy.summarizers.lsa import LsaSummarizer
|
6 |
+
from sumy.nlp.stemmers import Stemmer
|
7 |
+
from sumy.utils import get_stop_words
|
8 |
+
import nltk
|
9 |
+
from tools import extract_text_from_pdf
|
10 |
+
|
11 |
+
# Language name passed to the sumy tokenizer, stemmer and stop-word lookup.
LANGUAGE = "english"
|
12 |
+
# Number of sentences requested from each summarizer.
SENTENCES_COUNT = 10
|
13 |
+
|
14 |
+
def generate_textrank_summary(research_paper_text):
    """Build an extractive summary of the text with sumy's TextRank summarizer.

    Args:
        research_paper_text: Plain text of the paper to summarize.

    Returns:
        One string containing the sentences the summarizer selected when asked
        for ``SENTENCES_COUNT`` sentences, separated by single spaces.
    """
    # Make sure the NLTK punkt resources are available before tokenizing.
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    parser = PlaintextParser.from_string(research_paper_text, Tokenizer(LANGUAGE))
    summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = summarizer(parser.document, SENTENCES_COUNT)
    # Separate sentences with a space; concatenating them directly produced
    # run-together text such as "first sentence.Second sentence".
    return " ".join(str(sentence) for sentence in sentences)
|
26 |
+
|
27 |
+
def generate_luhn_summary(research_paper_text):
    """Build an extractive summary of the text with sumy's Luhn summarizer.

    Args:
        research_paper_text: Plain text of the paper to summarize.

    Returns:
        One string containing the sentences the summarizer selected when asked
        for ``SENTENCES_COUNT`` sentences, separated by single spaces.
    """
    # Make sure the NLTK punkt resources are available before tokenizing.
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    parser = PlaintextParser.from_string(research_paper_text, Tokenizer(LANGUAGE))
    summarizer = LuhnSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = summarizer(parser.document, SENTENCES_COUNT)
    # Separate sentences with a space; concatenating them directly produced
    # run-together text such as "first sentence.Second sentence".
    return " ".join(str(sentence) for sentence in sentences)
|
39 |
+
|
40 |
+
def generate_lsa_summary(research_paper_text):
    """Build an extractive summary of the text with sumy's LSA summarizer.

    Args:
        research_paper_text: Plain text of the paper to summarize.

    Returns:
        One string containing the sentences the summarizer selected when asked
        for ``SENTENCES_COUNT`` sentences, separated by single spaces.
    """
    # Make sure the NLTK punkt resources are available before tokenizing.
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    parser = PlaintextParser.from_string(research_paper_text, Tokenizer(LANGUAGE))
    summarizer = LsaSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = summarizer(parser.document, SENTENCES_COUNT)
    # Separate sentences with a space; concatenating them directly produced
    # run-together text such as "first sentence.Second sentence".
    return " ".join(str(sentence) for sentence in sentences)
|
52 |
+
|
53 |
+
def generate_temp_summary(pdf_path):
    """Produce an intermediate extractive summary for the PDF at ``pdf_path``.

    The extracted paper text is summarized by the TextRank, Luhn and LSA
    helpers and the three results are combined into a single string.

    Args:
        pdf_path: Path of the PDF, forwarded to ``extract_text_from_pdf``.

    Returns:
        A ``(temp_summary, length_of_research_paper)`` tuple: the combined
        summary text and the length value returned by
        ``extract_text_from_pdf``.
    """
    research_paper_text, length_of_research_paper = extract_text_from_pdf(pdf_path)
    summaries = (
        generate_textrank_summary(research_paper_text),
        generate_luhn_summary(research_paper_text),
        generate_lsa_summary(research_paper_text),
    )
    # Newlines become spaces (deleting them glued together the words on either
    # side of a line break), and the three summaries are space-separated for
    # the same reason.
    temp_summary = " ".join(summary.replace("\n", " ") for summary in summaries)
    return temp_summary, length_of_research_paper
|
app.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from functools import partial
|
2 |
+
from tools import load_llm_model
|
3 |
+
import gradio as gr
|
4 |
+
from main import summarize
|
5 |
+
import subprocess
|
6 |
+
import os
|
7 |
+
|
8 |
+
# Visual theme passed to gr.Blocks further down: Soft base theme with custom
# hues and a list of Google fonts.
theme = gr.themes.Soft(
    font=[
        gr.themes.GoogleFont(family)
        for family in ('Syne', 'Poppins', 'Poppins', 'Poppins')
    ],
    neutral_hue="slate",
    secondary_hue="cyan",
    primary_hue="purple",
)
|
19 |
+
|
20 |
+
# Fetch the GGUF model file when it is not present yet; otherwise reuse the
# local copy.
print("Checking for LLM model...")
if not os.path.exists("Llama-3.2-1B-Instruct-Q8_0.gguf"):
    print("Downloading LLM model...")
    # curl's arguments must be separate list items: a whole command line given
    # as one list element is taken as the executable name and cannot be run.
    # --fail turns HTTP errors into a non-zero exit, --location follows any
    # redirect the server issues, and check=True aborts start-up on failure
    # instead of looping on a download that cannot succeed.
    subprocess.run(
        [
            "curl",
            "--fail",
            "--location",
            "--output",
            "Llama-3.2-1B-Instruct-Q8_0.gguf.part",
            "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q8_0.gguf?download=true",
        ],
        check=True,
    )
    # Give the file its final name only after the transfer finished, so an
    # interrupted download is never mistaken for a usable model.
    os.replace(
        "Llama-3.2-1B-Instruct-Q8_0.gguf.part",
        "Llama-3.2-1B-Instruct-Q8_0.gguf",
    )
    print("LLM model downloaded successfully.")
|
25 |
+
|
26 |
+
# Load the model once and pre-bind it as the first argument of `summarize`,
# so the click handler below is only given the uploaded file.
print("Loading LLM model...")
llm = load_llm_model()
print("Building app...")

summarize_with_llm = partial(summarize, llm)

# NOTE(review): the nesting of the layout containers was reconstructed —
# confirm it matches the intended page layout.
with gr.Blocks(theme=theme, title="PDF Summarizer", fill_height=True) as app:
    with gr.Column():
        with gr.Row():
            pdf_file = gr.File(label="Upload PDF", file_types=['.pdf'])
            with gr.Column():
                summarize_btn = gr.Button(value="Summarize")
                info = gr.Textbox(
                    label="Summarization Info",
                    placeholder="Details regarding summarization will be shown here",
                    interactive=False,
                )
        summary_output = gr.TextArea(
            label="PDF Summary",
            interactive=False,
            show_copy_button=True,
        )
        # The handler's two return values fill the summary area and the info
        # box, in that order.
        # NOTE(review): up to 5 requests may run against the single shared
        # `llm` object — confirm it tolerates concurrent calls.
        summarize_btn.click(
            summarize_with_llm,
            inputs=pdf_file,
            outputs=[summary_output, info],
            api_name="summarize",
            show_progress="full",
            scroll_to_output=True,
            concurrency_limit=5,
            max_batch_size=10,
        )
print("Build Successful. Launching app...")
queued_app = app.queue(default_concurrency_limit=5)
queued_app.launch(show_api=True)
|
main.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from TempSummary import generate_temp_summary
|
2 |
+
import time
|
3 |
+
|
4 |
+
def generate_summary(llm, file):
    """Summarize a research-paper PDF with the chat model.

    An extractive "temporary" summary is generated first and embedded in the
    prompt that is sent to the model.

    Args:
        llm: Object exposing ``create_chat_completion``.
        file: PDF path forwarded to ``generate_temp_summary``.

    Returns:
        A ``(summary, length_of_research_paper)`` tuple: the message content
        of the first returned choice and the length value returned by
        ``generate_temp_summary``.
    """
    print("Generating temporary summary...")
    temp_summary, length_of_research_paper = generate_temp_summary(file)
    print("Temporary summary generated successfully")
    print("Generating final summary...")
    # NOTE(review): the "\\n" sequences reach the model as a literal backslash
    # followed by "n", not as line breaks — left untouched so the prompt the
    # model sees does not change.
    prompt = f"As a text script expert, please help me to write a short text script with the research paper \"{temp_summary}\".You have three tasks, which are:\\n 1.to summarize the text I provided into a Summary . Please answer within 200-300 characters.\\n 2.to summarize the text I provided, using up to seven concise Highlights. Choose appropriate emoji for each Highlight.\\n 3.to summarize the text I provided, using up to seven Key Insights. Each insight should include a brief in-depth analysis. Choose appropriate emoji for each key insights. Using the following template strictly, provide the results for the three tasks:\\n ### Summary\\n ### Highlights -[emoji]\\n ### key Insights -[emoji] .\\n Importantly your output must use language \"English\"\""
    messages = [
        {
            'role': 'system',
            'content': 'You are a helpful research assistant for generating well-formatted summaries from scientific research papers.',
        },
        {
            'role': 'user',
            'content': prompt,
        },
    ]
    response = llm.create_chat_completion(
        messages=messages,
        temperature=0.5,
        top_k=200,
        # top_p is a cumulative-probability cutoff, so the previous 3.0 was out
        # of range; 1.0 is the valid way to leave nucleus filtering off.
        # NOTE(review): lower it if a tighter nucleus (e.g. 0.3) was intended.
        top_p=1.0,
    )
    summary = response['choices'][0]['message']['content']
    return summary, length_of_research_paper
|
23 |
+
|
24 |
+
def summarize(llm, file):
    """Summarize ``file`` with ``llm`` and report how long it took.

    Args:
        llm: Model object forwarded to ``generate_summary``.
        file: PDF path forwarded to ``generate_summary``.

    Returns:
        A ``(summary, info)`` tuple: the cleaned-up summary text and a
        sentence stating the paper length and the elapsed time in minutes.
    """
    start_time = time.time()
    response, length_of_research_paper = generate_summary(llm, file)
    if "**" in response:
        # Turn bold bullet labels into "###" headings, drop the remaining bold
        # markers and collapse doubled line breaks (real and escaped ones).
        response = response.replace("- **", "### ")
        response = response.replace("**", "")
        response = response.replace("\n\n", "\n")
        response = response.replace("\\n\\n", "\\n")
    # Walk the response line by line. Iterating the string directly yields
    # single characters, so the "###" heading check could never match.
    # keepends=True leaves every line's own newline in place.
    pieces = []
    for line in response.splitlines(keepends=True):
        if line.startswith("###") and pieces:
            # Put a blank line before each heading except at the very start.
            # A real newline is used because the summary is displayed as
            # text, where an escaped "\\n" would show up verbatim.
            pieces.append("\n")
        pieces.append(line)
    summary = "".join(pieces)
    end_time = time.time()
    total_time_taken_minutes = round((end_time - start_time) / 60, 3)
    info = f"The research paper of {length_of_research_paper} characters long was summarized in {total_time_taken_minutes} minutes."
    return summary, info
|
tools.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.document_loaders import PyPDFLoader
|
2 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
3 |
+
from llama_cpp import Llama
|
4 |
+
|
5 |
+
def extract_text_from_pdf(pdf_path):
    """Load a PDF and return the main text of the paper plus its length.

    The text from the first "ABSTRACT" marker up to the next "REFERENCES"
    marker is kept. When a marker is missing, the corresponding end of the
    document is used instead, so the caller always receives text from the
    paper itself rather than an error sentence.

    Args:
        pdf_path: Path handed to ``PyPDFLoader``.

    Returns:
        A ``(research_paper_text, length_of_research_paper)`` tuple, where
        the length is ``len(research_paper_text)``.
    """
    loader = PyPDFLoader(pdf_path)
    # load() is used rather than load_and_split(): the latter runs a text
    # splitter, and joining overlapping chunks repeats text at chunk borders.
    pages = loader.load()
    all_text = " ".join(page.page_content for page in pages)

    start_index = all_text.find("ABSTRACT")
    if start_index == -1:
        start_index = 0
    # Search after the start so a "REFERENCES" that appears earlier in the
    # document cannot produce an empty or inverted slice.
    end_index = all_text.find("REFERENCES", start_index)
    if end_index == -1:
        end_index = len(all_text)

    # Return the slice as-is: splitting it into overlapping chunks and
    # concatenating them again would repeat the overlapped characters at each
    # chunk boundary and inflate the reported length.
    research_paper_text = all_text[start_index:end_index]
    length_of_research_paper = len(research_paper_text)
    return research_paper_text, length_of_research_paper
|
20 |
+
|
21 |
+
def load_llm_model():
    """Create the ``Llama`` model from the local GGUF file.

    Returns:
        The constructed ``Llama`` object.

    Raises:
        Exception: Any error raised while loading is printed and re-raised.
    """
    try:
        model = Llama(
            model_path="Llama-3.2-1B-Instruct-Q8_0.gguf",
            n_ctx=50000,
            n_batch=16384,
        )
    except Exception as error:
        # Report the failure, then let the caller see the original exception.
        print(f"Error loading LLM model: {error}")
        raise
    else:
        print("LLM model loaded successfully")
        return model
|