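"""NeuroLit Explorer: a Gradio demo for summarizing neuroscience articles.

Extracts text from an uploaded PDF (or a pasted snippet), generates an
abstractive summary with an LED (Longformer Encoder-Decoder) model, and uses
simple keyword/regex heuristics to surface method descriptions and candidate
references. Requires: gradio, torch, transformers, PyPDF2.
"""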
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import re

# ---------------------
# Disclaimer
# ---------------------
DISCLAIMER = """
**Disclaimer:**  
This application is provided for **research and educational purposes only**. 
All summaries are generated using an automated language model and may contain inaccuracies or omissions. 
This tool is not intended to replace professional judgment, peer-reviewed references, or expert consultation. 
The authors and developers assume no legal liability for any misuse, misinterpretation, or unintended consequences 
arising from the use of this tool. Please use responsibly and cross-check results with credible sources.
"""

# ---------------------
# Model Setup
# ---------------------
# Summarization model with a long context window (LED accepts up to 16,384 input tokens).
SUMMARIZATION_MODEL = "allenai/led-base-16384"

# Load summarization model and tokenizer
summarizer_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL)

# ---------------------
# Utility Functions
# ---------------------
def extract_text_from_pdf(pdf_file):
    """Extract raw text from an uploaded PDF (file path or file object)."""
    try:
        import PyPDF2
        reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for image-only pages; guard against it.
            text += (page.extract_text() or "") + "\n"
        return text
    except Exception as e:
        return f"Error reading PDF: {e}"

def clean_text(text):
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def summarize_text(text):
    inputs = summarizer_tokenizer(text, return_tensors="pt", truncation=True, max_length=16384)
    # LED expects global attention on at least the first token; without it,
    # summary quality degrades on long inputs.
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1
    with torch.no_grad():
        summary_ids = summarizer_model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            global_attention_mask=global_attention_mask,
            num_beams=4,
            length_penalty=2.0,
            max_length=512,
            early_stopping=True
        )
    summary = summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def extract_methods(text):
    # Naive heuristic: split the cleaned text into sentences (clean_text has already
    # collapsed newlines) and keep those containing method-related keywords.
    methods_keywords = ["method", "methods", "technique", "procedure", "protocol", "experimental approach"]
    sentences = re.split(r'\n+|\. ', text)
    method_sentences = [s.strip() for s in sentences if any(kw in s.lower() for kw in methods_keywords)]
    if method_sentences:
        return " ".join(method_sentences)
    else:
        return "No explicit methods found using simple keyword search."

def extract_references(text):
    # A naive approach for references:
    # Look for patterns like "et al., 20XX", "(Author, Year)", or numeric citations [XX].
    # This is a heuristic and may produce false positives.
    # Common patterns: 
    # - Something like "Smith et al., 2020"
    # - (Smith et al., 2020)
    # - [1], [2], etc., at the end of sentences.
    references_pattern = r"([A-Z][a-zA-Z]+ et al\.,?\s?\d{4})|(\(\S+ et al\.,?\s?\d{4}\))|(\[\d+\])"
    refs_found = re.findall(references_pattern, text)
    # refs_found will be a list of tuples due to multiple groups, flatten them:
    flat_refs = []
    for tup in refs_found:
        for ref in tup:
            if ref:
                flat_refs.append(ref.strip())
    flat_refs = list(set(flat_refs))  # remove duplicates
    if flat_refs:
        return "Possible References Found:\n" + "\n".join(flat_refs)
    else:
        return "No explicit references found using simple pattern search."

def analyze_text(text):
    text_clean = clean_text(text)
    if len(text_clean) < 50:
        return "Please provide a longer text snippet or PDF.", "", ""
    
    summary = summarize_text(text_clean)
    methods = extract_methods(text_clean)
    references = extract_references(text_clean)
    return summary, methods, references

def process_input(pdf_file, text_snippet):
    # If a PDF is provided, extract its text first.
    input_text = ""
    if pdf_file is not None:
        input_text = extract_text_from_pdf(pdf_file)
        # Surface PDF extraction errors directly instead of summarizing the error message.
        if input_text.startswith("Error reading PDF:"):
            return input_text, "", ""

    # If a text snippet is also provided, append it.
    if text_snippet is not None and text_snippet.strip():
        input_text = input_text + " " + text_snippet.strip()

    if not input_text.strip():
        return "No input provided.", "", ""

    summary, methods, references = analyze_text(input_text)
    return summary, methods, references

# ---------------------
# Gradio Interface
# ---------------------
with gr.Blocks() as demo:
    gr.Markdown("# NeuroLit Explorer")
    gr.Markdown(DISCLAIMER)
    gr.Markdown("**Instructions:** Upload a PDF or paste a text snippet from a neuroscience article, then click 'Summarize'.")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        text_input = gr.Textbox(label="Or Paste Article Text", lines=10)
    summarize_button = gr.Button("Summarize")
    
    summary_output = gr.Textbox(label="Summary")
    methods_output = gr.Textbox(label="Key Methods")
    references_output = gr.Textbox(label="Relevant References")
    
    summarize_button.click(
        fn=process_input,
        inputs=[pdf_input, text_input],
        outputs=[summary_output, methods_output, references_output]
    )

demo.launch()