import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import re

# ---------------------
# Disclaimer
# ---------------------
DISCLAIMER = """
**Disclaimer:**
This application is provided for **research and educational purposes only**.
All summaries are generated using an automated language model and may contain inaccuracies or omissions.
It is not intended to replace professional judgment, peer-reviewed references, or expert consultation.
The authors and developers assume no legal liability for any misuse, misinterpretation, or unintended consequences
arising from the use of this tool. Please use responsibly and cross-check results with credible sources.
"""

# ---------------------
# Model Setup
# ---------------------
MODEL_NAME = "allenai/scibert_scivocab_cased"  # Example tokenizer model; not used by the summarizer below
SUMMARIZATION_MODEL = "allenai/led-base-16384"  # Longformer Encoder-Decoder (LED) with a 16384-token context window

# Load the summarization model and tokenizer once at startup.
summarizer_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL)

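# Optional (assumption: a GPU may or may not be available where this Space
# runs): move the model to CUDA when present. summarize_text() below reads
# summarizer_model.device, so this line is safe to keep or drop.
summarizer_model = summarizer_model.to("cuda" if torch.cuda.is_available() else "cpu")
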
# ---------------------
# Utility Functions
# ---------------------
def extract_text_from_pdf(pdf_file):
    try:
        import PyPDF2
        # Gradio may hand over a file path or a file-like object with a
        # .name attribute; PdfReader accepts a path, so fall back to .name.
        path = getattr(pdf_file, "name", pdf_file)
        reader = PyPDF2.PdfReader(path)
        text = ""
        for page in reader.pages:
            # extract_text() can return None on image-only pages.
            text += (page.extract_text() or "") + "\n"
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"

def clean_text(text):
    # Collapse all whitespace (including newlines) into single spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def summarize_text(text):
    # LED accepts up to 16384 input tokens; anything longer is truncated here.
    inputs = summarizer_tokenizer(text, return_tensors="pt", truncation=True, max_length=16384)
    input_ids = inputs["input_ids"].to(summarizer_model.device)
    attention_mask = inputs["attention_mask"].to(summarizer_model.device)
    # LED uses a sparse attention pattern plus global attention; the usual
    # convention (per the LED documentation) is global attention on the
    # first token.
    global_attention_mask = torch.zeros_like(input_ids)
    global_attention_mask[:, 0] = 1
    with torch.no_grad():
        summary_ids = summarizer_model.generate(
            input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
            num_beams=4,
            length_penalty=2.0,
            max_length=512,
            early_stopping=True,
        )
    summary = summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

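# Optional sketch (not wired into the UI): summarize_text() truncates anything
# past 16384 tokens, so very long PDFs lose their tail. One simple workaround
# is to summarize fixed-size token windows and then summarize the concatenated
# window summaries. summarize_long_text is a hypothetical helper, not part of
# the original app; splitting on raw token counts like this can cut
# mid-sentence, so treat it as a starting point only.
def summarize_long_text(text, chunk_tokens=16384):
    token_ids = summarizer_tokenizer(text, truncation=False)["input_ids"]
    if len(token_ids) <= chunk_tokens:
        return summarize_text(text)
    # Decode each token window back to text and summarize it independently.
    chunk_summaries = []
    for start in range(0, len(token_ids), chunk_tokens):
        chunk = summarizer_tokenizer.decode(
            token_ids[start:start + chunk_tokens], skip_special_tokens=True
        )
        chunk_summaries.append(summarize_text(chunk))
    # Second pass: condense the per-chunk summaries into one final summary.
    return summarize_text(" ".join(chunk_summaries))
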
def extract_methods(text):
    # Very naive approach: clean_text() has already collapsed newlines, so the
    # split below effectively yields sentences rather than paragraphs. Keep
    # any sentence mentioning a method-related keyword as "Key Methods".
    methods_keywords = ["method", "methods", "technique", "procedure", "protocol", "experimental approach"]
    sentences = re.split(r'\n+|\. ', text)
    method_sentences = [s.strip() for s in sentences if any(kw in s.lower() for kw in methods_keywords)]
    if method_sentences:
        return " ".join(method_sentences)
    return "No explicit methods found using simple keyword search."

def extract_references(text):
    # Naive reference heuristic: look for patterns such as
    #   - "Smith et al., 2020"
    #   - "(Smith et al., 2020)"
    #   - numeric citations like [1], [2] at the end of sentences
    # This is a heuristic and may produce false positives.
    references_pattern = r"([A-Z][a-zA-Z]+ et al\.,?\s?\d{4})|(\(\S+ et al\.,?\s?\d{4}\))|(\[\d+\])"
    refs_found = re.findall(references_pattern, text)
    # re.findall returns one tuple per match because of the three groups; flatten them:
    flat_refs = []
    for tup in refs_found:
        for ref in tup:
            if ref:
                flat_refs.append(ref.strip())
    flat_refs = sorted(set(flat_refs))  # remove duplicates; sort for stable output
    if flat_refs:
        return "Possible References Found:\n" + "\n".join(flat_refs)
    return "No explicit references found using simple pattern search."

def analyze_text(text):
    text_clean = clean_text(text)
    if len(text_clean) < 50:
        return "Please provide a longer text snippet or PDF.", "", ""
    summary = summarize_text(text_clean)
    methods = extract_methods(text_clean)
    references = extract_references(text_clean)
    return summary, methods, references

def process_input(pdf_file, text_snippet):
    # If a PDF is provided, extract its text first.
    input_text = ""
    if pdf_file is not None:
        input_text = extract_text_from_pdf(pdf_file)
        # Surface extraction failures instead of summarizing the error message.
        if input_text.startswith("Error reading PDF:"):
            return input_text, "", ""
    # If a text snippet is also provided, append it.
    if text_snippet is not None and text_snippet.strip():
        input_text = input_text + " " + text_snippet.strip()
    if not input_text.strip():
        return "No input provided.", "", ""
    summary, methods, references = analyze_text(input_text)
    return summary, methods, references

# ---------------------
# Gradio Interface
# ---------------------
with gr.Blocks() as demo:
    gr.Markdown("# NeuroLit Explorer")
    gr.Markdown(DISCLAIMER)
    gr.Markdown("**Instructions:** Upload a PDF or paste a text snippet from a neuroscience article, then click 'Summarize'.")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        text_input = gr.Textbox(label="Or Paste Article Text", lines=8)
    summarize_button = gr.Button("Summarize")
    summary_output = gr.Textbox(label="Summary")
    methods_output = gr.Textbox(label="Key Methods")
    references_output = gr.Textbox(label="Relevant References")
    summarize_button.click(
        fn=process_input,
        inputs=[pdf_input, text_input],
        outputs=[summary_output, methods_output, references_output],
    )

demo.launch()