Spaces:

raannakasturi
/

hybrid-researchpaper-summarizer

Sleeping

App Files Files Community

raannakasturi committed on Nov 16, 2024

Commit

9e075cc

•

1 Parent(s): a171675

Upload 4 files

Browse files

Files changed (4) hide show

TempSummary.py +59 -0
app.py +51 -0
main.py +45 -0
tools.py +33 -0

TempSummary.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from sumy.parsers.plaintext import PlaintextParser
+from sumy.nlp.tokenizers import Tokenizer
+from sumy.summarizers.text_rank import TextRankSummarizer
+from sumy.summarizers.luhn import LuhnSummarizer
+from sumy.summarizers.lsa import LsaSummarizer
+from sumy.nlp.stemmers import Stemmer
+from sumy.utils import get_stop_words
+import nltk
+from tools import extract_text_from_pdf
+# Language name handed to sumy's Tokenizer, Stemmer and get_stop_words below.
+LANGUAGE = "english"
+# Number of sentences each extractive summarizer is asked to return.
+SENTENCES_COUNT = 10
+def generate_textrank_summary(research_paper_text):
+    nltk.download('punkt', quiet=True)
+    nltk.download('punkt_tab', quiet=True)
+    parser = PlaintextParser.from_string(research_paper_text, Tokenizer(LANGUAGE))
+    stemmer = Stemmer(LANGUAGE)
+    summarizer = TextRankSummarizer(stemmer)
+    summarizer.stop_words = get_stop_words(LANGUAGE)
+    sentences = summarizer(parser.document, SENTENCES_COUNT)
+    summary = ""
+    for sentence in sentences:
+        summary += str(sentence) + ""
+    return summary
+def generate_luhn_summary(research_paper_text):
+    nltk.download('punkt', quiet=True)
+    nltk.download('punkt_tab', quiet=True)
+    parser = PlaintextParser.from_string(research_paper_text, Tokenizer(LANGUAGE))
+    stemmer = Stemmer(LANGUAGE)
+    summarizer = LuhnSummarizer(stemmer)
+    summarizer.stop_words = get_stop_words(LANGUAGE)
+    sentences = summarizer(parser.document, SENTENCES_COUNT)
+    summary = ""
+    for sentence in sentences:
+        summary += str(sentence) + ""
+    return summary
+def generate_lsa_summary(research_paper_text):
+    nltk.download('punkt', quiet=True)
+    nltk.download('punkt_tab', quiet=True)
+    parser = PlaintextParser.from_string(research_paper_text, Tokenizer(LANGUAGE))
+    stemmer = Stemmer(LANGUAGE)
+    summarizer = LsaSummarizer(stemmer)
+    summarizer.stop_words = get_stop_words(LANGUAGE)
+    sentences = summarizer(parser.document, SENTENCES_COUNT)
+    summary = ""
+    for sentence in sentences:
+        summary += str(sentence) + ""
+    return summary
+def generate_temp_summary(pdf_path):
+    research_paper_text, length_of_research_paper = extract_text_from_pdf(pdf_path)
+    textrank_summary = generate_textrank_summary(research_paper_text)
+    luhn_summary = generate_luhn_summary(research_paper_text)
+    lsa_summary = generate_lsa_summary(research_paper_text)
+    temp_summary = textrank_summary.replace("\n", "") + luhn_summary.replace("\n", "") + lsa_summary.replace("\n", "")
+    return temp_summary, length_of_research_paper

app.py ADDED Viewed

	@@ -0,0 +1,51 @@

+from functools import partial
+from tools import load_llm_model
+import gradio as gr
+from main import summarize
+import subprocess
+import os
+theme = gr.themes.Soft(
+    primary_hue="purple",
+    secondary_hue="cyan",
+    neutral_hue="slate",
+    font=[
+        gr.themes.GoogleFont('Syne'),
+        gr.themes.GoogleFont('Poppins'),
+        gr.themes.GoogleFont('Poppins'),
+        gr.themes.GoogleFont('Poppins')
+    ],
+)
+print("Checking for LLM model...")
+while not os.path.exists("Llama-3.2-1B-Instruct-Q8_0.gguf"):
+    print("Downloading LLM model...")
+    subprocess.run(["curl -o Llama-3.2-1B-Instruct-Q8_0.gguf https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q8_0.gguf?download=true"])
+    print("LLM model downloaded successfully.")
+print("Loading LLM model...")
+llm = load_llm_model()
+print("Building app...")
+summarize_with_llm = partial(summarize, llm)
+# UI definition: a row holding the PDF upload next to a column with the
+# "Summarize" button and a read-only info box, followed by the summary area.
+with gr.Blocks(theme=theme, title="PDF Summarizer", fill_height=True) as app:
+    with gr.Column():
+        with gr.Row():
+            pdf_file = gr.File(label="Upload PDF", file_types=['.pdf'])
+            with gr.Column():
+                summarize_btn = gr.Button(value="Summarize")
+                info = gr.Textbox(label="Summarization Info", placeholder="Details regarding summarization will be shown here", interactive=False)
+        summary_output = gr.TextArea(label="PDF Summary", interactive=False, show_copy_button=True)
+    # Clicking the button calls summarize_with_llm with the uploaded file; its
+    # two return values fill the summary area and the info box, in that order.
+    # NOTE(review): up to 5 calls may run at once and all share the single
+    # `llm` object - confirm the model object is safe to use concurrently.
+    # NOTE(review): max_batch_size presumably only matters when batch=True,
+    # which is not set here - confirm against the Gradio docs.
+    summarize_btn.click(
+        summarize_with_llm,
+        inputs=pdf_file,
+        outputs=[summary_output, info],
+        concurrency_limit=5,
+        scroll_to_output=True,
+        api_name="summarize",
+        show_progress="full",
+        max_batch_size=10,
+    )
+print("Build Successful. Launching app...")
+# Enable the request queue, then start the server with the API page shown.
+app.queue(default_concurrency_limit=5).launch(show_api=True)

main.py ADDED Viewed

	@@ -0,0 +1,45 @@

+from TempSummary import generate_temp_summary
+import time
+def generate_summary(llm, file):
+    print("Generating temporary summary...")
+    temp_summary, length_of_research_paper = generate_temp_summary(file)
+    print("Temporary summary generated successfully")
+    print("Generating final summary...")
+    prompt = f"As a text script expert, please help me to write a short text script with the research paper \"{temp_summary}\".You have three tasks, which are:\\n    1.to summarize the text I provided into a Summary . Please answer  within 200-300 characters.\\n    2.to summarize the text I provided, using up to seven concise Highlights. Choose appropriate emoji for each Highlight.\\n    3.to summarize the text I provided, using up to seven Key Insights. Each insight should include a brief in-depth analysis. Choose appropriate emoji for each key insights. Using the following template strictly, provide the results for the three tasks:\\n    ### Summary\\n    ### Highlights -[emoji]\\n    ### key Insights -[emoji] .\\n  Importantly your output must use language \"English\"\""
+    response = llm.create_chat_completion(
+        messages = [
+            {'role':'system',
+            'content': 'You are a helpful research assistant for generating well-formatted summaries from scientific research papers.'},
+            {'role':'user',
+            'content': prompt}
+        ],
+        temperature=0.5,
+        top_k=200,
+        top_p=3.0,
+    )
+    summary = response['choices'][0]['message']['content']
+    return summary, length_of_research_paper
+def summarize(llm, file):
+    start_time = time.time()
+    response, length_of_research_paper = generate_summary(llm, file)
+    if "**" in response:
+        response = response.replace("- **", "### ")
+        response = response.replace("**", "")
+        response = response.replace("\n\n", "\n")
+        response = response.replace("\\n\\n", "\\n")
+    else:
+        pass
+    summary = ""
+    for line in response:
+        if line.startswith("###"):
+            summary += "\\n\\n" +line
+        else:
+            summary += line
+    end_time = time.time()
+    total_time_taken = end_time - start_time
+    total_time_taken_minutes = total_time_taken / 60
+    total_time_taken_minutes = round(total_time_taken_minutes, 3)
+    info = f"The research paper of {length_of_research_paper} characters long was summarized in {total_time_taken_minutes} minutes."
+    return summary, info

tools.py ADDED Viewed

	@@ -0,0 +1,33 @@

+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from llama_cpp import Llama
+def extract_text_from_pdf(pdf_path):
+    loader = PyPDFLoader(pdf_path)
+    pages = loader.load_and_split()
+    all_text = " ".join([page.page_content for page in pages])
+    start_index = all_text.find("ABSTRACT")
+    end_index = all_text.find("REFERENCES")
+    if start_index != -1 and end_index != -1 and start_index < end_index:
+        relevant_text = all_text[start_index:end_index]
+    else:
+        relevant_text = "Unable to locate the specified sections in the document."
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
+    text_list = text_splitter.split_text(relevant_text)
+    research_paper_text = "".join(text_list)
+    length_of_research_paper = len(research_paper_text)
+    return research_paper_text, length_of_research_paper
+def load_llm_model():
+    try:
+        llm = Llama(
+            model_path="Llama-3.2-1B-Instruct-Q8_0.gguf",
+            n_ctx=50000,
+            n_batch=16384,
+            # verbose=True,
+        )
+        print("LLM model loaded successfully")
+        return llm
+    except Exception as e:
+        print(f"Error loading LLM model: {e}")
+        raise