raannakasturi committed on
Commit
9e075cc
1 Parent(s): a171675

Upload 4 files

Browse files
Files changed (4) hide show
  1. TempSummary.py +59 -0
  2. app.py +51 -0
  3. main.py +45 -0
  4. tools.py +33 -0
TempSummary.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sumy.parsers.plaintext import PlaintextParser
2
+ from sumy.nlp.tokenizers import Tokenizer
3
+ from sumy.summarizers.text_rank import TextRankSummarizer
4
+ from sumy.summarizers.luhn import LuhnSummarizer
5
+ from sumy.summarizers.lsa import LsaSummarizer
6
+ from sumy.nlp.stemmers import Stemmer
7
+ from sumy.utils import get_stop_words
8
+ import nltk
9
+ from tools import extract_text_from_pdf
10
+
11
+ LANGUAGE = "english"
12
+ SENTENCES_COUNT = 10
13
+
14
def generate_textrank_summary(research_paper_text):
    """Return an extractive TextRank summary of ``research_paper_text``.

    The text is parsed with sumy's ``PlaintextParser`` using the ``LANGUAGE``
    tokenizer, and ``TextRankSummarizer`` is asked for ``SENTENCES_COUNT``
    sentences.

    Args:
        research_paper_text: Plain text to summarize.

    Returns:
        The selected sentences as one string, separated by single spaces.
    """
    # Request the NLTK tokenizer data quietly; presumably needed by sumy's
    # Tokenizer -- confirm against the sumy/nltk versions in use.
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)

    parser = PlaintextParser.from_string(research_paper_text, Tokenizer(LANGUAGE))
    summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = summarizer(parser.document, SENTENCES_COUNT)

    # Join with a space: the previous `+ ""` concatenation glued the end of one
    # sentence directly onto the start of the next.
    return " ".join(str(sentence) for sentence in sentences)
26
+
27
def generate_luhn_summary(research_paper_text):
    """Return an extractive Luhn summary of ``research_paper_text``.

    The text is parsed with sumy's ``PlaintextParser`` using the ``LANGUAGE``
    tokenizer, and ``LuhnSummarizer`` is asked for ``SENTENCES_COUNT``
    sentences.

    Args:
        research_paper_text: Plain text to summarize.

    Returns:
        The selected sentences as one string, separated by single spaces.
    """
    # Request the NLTK tokenizer data quietly; presumably needed by sumy's
    # Tokenizer -- confirm against the sumy/nltk versions in use.
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)

    parser = PlaintextParser.from_string(research_paper_text, Tokenizer(LANGUAGE))
    summarizer = LuhnSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = summarizer(parser.document, SENTENCES_COUNT)

    # Join with a space: the previous `+ ""` concatenation glued the end of one
    # sentence directly onto the start of the next.
    return " ".join(str(sentence) for sentence in sentences)
39
+
40
def generate_lsa_summary(research_paper_text):
    """Return an extractive LSA summary of ``research_paper_text``.

    The text is parsed with sumy's ``PlaintextParser`` using the ``LANGUAGE``
    tokenizer, and ``LsaSummarizer`` is asked for ``SENTENCES_COUNT``
    sentences.

    Args:
        research_paper_text: Plain text to summarize.

    Returns:
        The selected sentences as one string, separated by single spaces.
    """
    # Request the NLTK tokenizer data quietly; presumably needed by sumy's
    # Tokenizer -- confirm against the sumy/nltk versions in use.
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)

    parser = PlaintextParser.from_string(research_paper_text, Tokenizer(LANGUAGE))
    summarizer = LsaSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = summarizer(parser.document, SENTENCES_COUNT)

    # Join with a space: the previous `+ ""` concatenation glued the end of one
    # sentence directly onto the start of the next.
    return " ".join(str(sentence) for sentence in sentences)
52
+
53
def generate_temp_summary(pdf_path):
    """Build the intermediate extractive summary for the PDF at ``pdf_path``.

    The PDF text is extracted once and summarized with the TextRank, Luhn and
    LSA summarizers; the three results are combined into a single string.

    Args:
        pdf_path: Path of the PDF file, passed to ``extract_text_from_pdf``.

    Returns:
        A ``(temp_summary, length_of_research_paper)`` tuple, where the length
        is the value reported by ``extract_text_from_pdf``.
    """
    research_paper_text, length_of_research_paper = extract_text_from_pdf(pdf_path)
    partial_summaries = (
        generate_textrank_summary(research_paper_text),
        generate_luhn_summary(research_paper_text),
        generate_lsa_summary(research_paper_text),
    )
    # Newlines are stripped as before; the summaries are now separated by a
    # space so the last sentence of one does not run into the first sentence
    # of the next.
    temp_summary = " ".join(
        summary.replace("\n", "") for summary in partial_summaries
    )
    return temp_summary, length_of_research_paper
app.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import partial
2
+ from tools import load_llm_model
3
+ import gradio as gr
4
+ from main import summarize
5
+ import subprocess
6
+ import os
7
+
8
# Gradio theme for the app: Soft base theme with custom hues and a font
# fallback list ('Syne' first, then 'Poppins' for the remaining slots).
theme = gr.themes.Soft(
    primary_hue="purple",
    secondary_hue="cyan",
    neutral_hue="slate",
    font=[
        gr.themes.GoogleFont(family)
        for family in ('Syne', 'Poppins', 'Poppins', 'Poppins')
    ],
)
19
+
20
# Local file name of the GGUF model and the URL it is fetched from.
MODEL_FILENAME = "Llama-3.2-1B-Instruct-Q8_0.gguf"
MODEL_URL = "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q8_0.gguf?download=true"

print("Checking for LLM model...")
if not os.path.exists(MODEL_FILENAME):
    print("Downloading LLM model...")
    # Download to a temporary name first so an interrupted or failed transfer
    # never leaves a truncated file that a later start would accept as the model.
    partial_download = MODEL_FILENAME + ".part"
    # Each argument is its own list element: a single-string command inside a
    # list (without shell=True) is treated as the executable name and fails.
    # --location follows the redirect served by the download URL; --fail and
    # check=True turn HTTP/transfer errors into an exception instead of
    # silently continuing.
    subprocess.run(
        ["curl", "--fail", "--location", "--output", partial_download, MODEL_URL],
        check=True,
    )
    os.replace(partial_download, MODEL_FILENAME)
    print("LLM model downloaded successfully.")

print("Loading LLM model...")
llm = load_llm_model()
print("Building app...")

# Bind the loaded model so the UI callback only needs the uploaded file.
summarize_with_llm = partial(summarize, llm)
31
+
32
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a source whose indentation was lost -- confirm against the intended UI.
with gr.Blocks(theme=theme, title="PDF Summarizer", fill_height=True) as app:
    with gr.Column():
        with gr.Row():
            pdf_file = gr.File(label="Upload PDF", file_types=['.pdf'])
            with gr.Column():
                summarize_btn = gr.Button(value="Summarize")
                info = gr.Textbox(
                    label="Summarization Info",
                    placeholder="Details regarding summarization will be shown here",
                    interactive=False,
                )
        summary_output = gr.TextArea(
            label="PDF Summary",
            interactive=False,
            show_copy_button=True,
        )

    # Clicking the button runs the summarizer on the uploaded file and fills
    # both the summary area and the info box.
    summarize_btn.click(
        summarize_with_llm,
        inputs=pdf_file,
        outputs=[summary_output, info],
        api_name="summarize",
        show_progress="full",
        scroll_to_output=True,
        concurrency_limit=5,
        max_batch_size=10,
    )

print("Build Successful. Launching app...")
app.queue(default_concurrency_limit=5).launch(show_api=True)
main.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from TempSummary import generate_temp_summary
2
+ import time
3
+
4
def generate_summary(llm, file):
    """Produce the LLM-written summary for the given PDF file.

    An intermediate summary is obtained from ``generate_temp_summary`` and
    embedded in a fixed prompt, which is sent to ``llm.create_chat_completion``.

    Args:
        llm: Object exposing ``create_chat_completion`` (see ``load_llm_model``
            in tools.py -- presumably a llama_cpp ``Llama`` instance).
        file: Value forwarded unchanged to ``generate_temp_summary``.

    Returns:
        A ``(summary, length_of_research_paper)`` tuple: the content of the
        first completion choice and the length reported by
        ``generate_temp_summary``.
    """
    print("Generating temporary summary...")
    temp_summary, length_of_research_paper = generate_temp_summary(file)
    print("Temporary summary generated successfully")

    print("Generating final summary...")
    prompt = f"As a text script expert, please help me to write a short text script with the research paper \"{temp_summary}\".You have three tasks, which are:\\n 1.to summarize the text I provided into a Summary . Please answer within 200-300 characters.\\n 2.to summarize the text I provided, using up to seven concise Highlights. Choose appropriate emoji for each Highlight.\\n 3.to summarize the text I provided, using up to seven Key Insights. Each insight should include a brief in-depth analysis. Choose appropriate emoji for each key insights. Using the following template strictly, provide the results for the three tasks:\\n ### Summary\\n ### Highlights -[emoji]\\n ### key Insights -[emoji] .\\n Importantly your output must use language \"English\"\""
    system_message = {
        'role': 'system',
        'content': 'You are a helpful research assistant for generating well-formatted summaries from scientific research papers.',
    }
    user_message = {'role': 'user', 'content': prompt}

    # NOTE(review): top_p is conventionally a probability in (0, 1]; 3.0
    # presumably just disables nucleus filtering -- confirm this is intended.
    completion = llm.create_chat_completion(
        messages=[system_message, user_message],
        temperature=0.5,
        top_k=200,
        top_p=3.0,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content'], length_of_research_paper
23
+
24
def summarize(llm, file):
    """Summarize a PDF with the LLM and report how long it took.

    Args:
        llm: Model object forwarded to ``generate_summary``.
        file: Uploaded PDF forwarded to ``generate_summary``.

    Returns:
        A ``(summary, info)`` tuple: the cleaned-up summary text with a blank
        line before each ``###`` heading, and a sentence stating the paper
        length and the elapsed time in minutes.
    """
    start_time = time.time()
    response, length_of_research_paper = generate_summary(llm, file)

    if "**" in response:
        # Turn bold bullet headings into "###" headings, drop remaining bold
        # markers, and collapse doubled newlines (real and literal "\n").
        response = response.replace("- **", "### ")
        response = response.replace("**", "")
        response = response.replace("\n\n", "\n")
        response = response.replace("\\n\\n", "\\n")

    # Iterate over lines, not over the string itself: iterating a str yields
    # single characters, so the "###" check could never match before.
    formatted_lines = []
    for line in response.splitlines():
        if line.startswith("###") and formatted_lines:
            # Blank line before every heading except a leading one.
            formatted_lines.append("")
        formatted_lines.append(line)
    summary = "\n".join(formatted_lines)

    total_time_taken_minutes = round((time.time() - start_time) / 60, 3)
    info = f"The research paper of {length_of_research_paper} characters long was summarized in {total_time_taken_minutes} minutes."
    return summary, info
tools.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyPDFLoader
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from llama_cpp import Llama
4
+
5
def extract_text_from_pdf(pdf_path):
    """Extract the text between "ABSTRACT" and "REFERENCES" from a PDF.

    The PDF is loaded with ``PyPDFLoader`` and the page contents are joined
    with spaces. The slice from the first "ABSTRACT" up to the first
    "REFERENCES" (both matched case-sensitively) is then run through a
    ``RecursiveCharacterTextSplitter`` and the resulting chunks are
    concatenated.

    Args:
        pdf_path: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        A ``(research_paper_text, length_of_research_paper)`` tuple, the
        length being ``len(research_paper_text)``.
    """
    pages = PyPDFLoader(pdf_path).load_and_split()
    all_text = " ".join(page.page_content for page in pages)

    abstract_at = all_text.find("ABSTRACT")
    references_at = all_text.find("REFERENCES")
    # Both markers must be present, with the abstract before the references.
    if 0 <= abstract_at < references_at:
        relevant_text = all_text[abstract_at:references_at]
    else:
        # NOTE(review): this message becomes the text that is summarized
        # downstream -- confirm that is the intended fallback.
        relevant_text = "Unable to locate the specified sections in the document."

    # NOTE(review): chunks are produced with chunk_overlap=50 and then
    # concatenated, so overlapping text presumably appears twice -- confirm.
    splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
    research_paper_text = "".join(splitter.split_text(relevant_text))
    return research_paper_text, len(research_paper_text)
20
+
21
def load_llm_model():
    """Load the local GGUF model with llama_cpp and return it.

    Returns:
        The constructed ``Llama`` instance.

    Raises:
        Exception: Any error raised while constructing the model is printed
            and then re-raised unchanged.
    """
    model_options = {
        "model_path": "Llama-3.2-1B-Instruct-Q8_0.gguf",
        "n_ctx": 50000,
        "n_batch": 16384,
    }
    try:
        model = Llama(**model_options)
        print("LLM model loaded successfully")
        return model
    except Exception as error:
        # Report the failure for the console log, then let the caller see it.
        print(f"Error loading LLM model: {error}")
        raise