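"""NeuroLit Explorer: a Gradio demo for summarizing neuroscience articles.

Extracts text from an uploaded PDF (or a pasted snippet), generates an
abstractive summary with an LED (Longformer Encoder-Decoder) model, and uses
simple keyword/regex heuristics to surface method descriptions and candidate
references. Requires: gradio, torch, transformers, PyPDF2.
"""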
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import re

# ---------------------
# Disclaimer
# ---------------------
DISCLAIMER = """
**Disclaimer:**  
This application is provided for **research and educational purposes only**. 
All summaries are generated using an automated language model and may contain inaccuracies or omissions. 
This tool is not intended to replace professional judgment, peer-reviewed references, or expert consultation. 
The authors and developers assume no legal liability for any misuse, misinterpretation, or unintended consequences 
arising from the use of this tool. Please use responsibly and cross-check results with credible sources.
"""

# ---------------------
# Model Setup
# ---------------------
# Summarization model with a long context window (LED accepts up to 16,384 input tokens).
SUMMARIZATION_MODEL = "allenai/led-base-16384"

# Load summarization model and tokenizer
summarizer_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL)

# ---------------------
# Utility Functions
# ---------------------
def extract_text_from_pdf(pdf_file):
    """Extract raw text from an uploaded PDF (file path or file object)."""
    try:
        import PyPDF2
        reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for image-only pages; guard against it.
            text += (page.extract_text() or "") + "\n"
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"

def clean_text(text):
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def summarize_text(text):
    inputs = summarizer_tokenizer(text, return_tensors="pt", truncation=True, max_length=16384)
    # LED expects global attention on at least the first token; without it,
    # summary quality degrades on long inputs.
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1
    with torch.no_grad():
        summary_ids = summarizer_model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            global_attention_mask=global_attention_mask,
            num_beams=4,
            length_penalty=2.0,
            max_length=512,
            early_stopping=True
        )
    summary = summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def extract_methods(text):
    # Naive heuristic: split the cleaned text into sentences (clean_text has already
    # collapsed newlines) and keep those containing method-related keywords.
    methods_keywords = ["method", "methods", "technique", "procedure", "protocol", "experimental approach"]
    sentences = re.split(r'\n+|\. ', text)
    method_sentences = [s.strip() for s in sentences if any(kw in s.lower() for kw in methods_keywords)]
    if method_sentences:
        return " ".join(method_sentences)
    else:
        return "No explicit methods found using simple keyword search."

def extract_references(text):
    # A naive approach for references:
    # Look for patterns like "et al., 20XX", "(Author, Year)", or numeric citations [XX].
    # This is a heuristic and may produce false positives.
    # Common patterns: 
    # - Something like "Smith et al., 2020"
    # - (Smith et al., 2020)
    # - [1], [2], etc., at the end of sentences.
    references_pattern = r"([A-Z][a-zA-Z]+ et al\.,?\s?\d{4})|(\(\S+ et al\.,?\s?\d{4}\))|(\[\d+\])"
    refs_found = re.findall(references_pattern, text)
    # refs_found will be a list of tuples due to multiple groups, flatten them:
    flat_refs = []
    for tup in refs_found:
        for ref in tup:
            if ref:
                flat_refs.append(ref.strip())
    flat_refs = list(set(flat_refs))  # remove duplicates
    if flat_refs:
        return "Possible References Found:\n" + "\n".join(flat_refs)
    else:
        return "No explicit references found using simple pattern search."

def analyze_text(text):
    text_clean = clean_text(text)
    if len(text_clean) < 50:
        return "Please provide a longer text snippet or PDF.", "", ""
    
    summary = summarize_text(text_clean)
    methods = extract_methods(text_clean)
    references = extract_references(text_clean)
    return summary, methods, references

def process_input(pdf_file, text_snippet):
    # If a PDF is provided, extract its text first.
    input_text = ""
    if pdf_file is not None:
        input_text = extract_text_from_pdf(pdf_file)
        # Surface PDF extraction errors directly instead of summarizing the error message.
        if input_text.startswith("Error reading PDF:"):
            return input_text, "", ""

    # If a text snippet is also provided, append it.
    if text_snippet is not None and text_snippet.strip():
        input_text = input_text + " " + text_snippet.strip()

    if not input_text.strip():
        return "No input provided.", "", ""

    summary, methods, references = analyze_text(input_text)
    return summary, methods, references

# ---------------------
# Gradio Interface
# ---------------------
with gr.Blocks() as demo:
    gr.Markdown("# NeuroLit Explorer")
    gr.Markdown(DISCLAIMER)
    gr.Markdown("**Instructions:** Upload a PDF or paste a text snippet from a neuroscience article, then click 'Summarize'.")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        text_input = gr.Textbox(label="Or Paste Article Text", lines=10)
    summarize_button = gr.Button("Summarize")
    
    summary_output = gr.Textbox(label="Summary")
    methods_output = gr.Textbox(label="Key Methods")
    references_output = gr.Textbox(label="Relevant References")
    
    summarize_button.click(
        fn=process_input,
        inputs=[pdf_input, text_input],
        outputs=[summary_output, methods_output, references_output]
    )

demo.launch()