Rogerjs committed on
Commit
2912f11
1 Parent(s): d538703

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -0
app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
4
+ import re
5
+
6
+ # ---------------------
7
+ # Disclaimer
8
+ # ---------------------
9
# Markdown disclaimer text; the UI passes it to gr.Markdown.
DISCLAIMER = """
**Disclaimer:**
This application is provided for **research and educational purposes only**.
All summaries are generated using an automated language model and may contain inaccuracies or omissions.
It is not intended to replace professional judgment, peer-reviewed references, or expert consultation.
The authors and developers assume no legal liability for any misuse, misinterpretation, or unintended consequences
arising from the use of this tool. Please use responsibly and cross-check results with credible sources.
"""
17
+
18
+ # ---------------------
19
+ # Model Setup
20
+ # ---------------------
21
# NOTE(review): MODEL_NAME is not referenced anywhere in the visible file; it is
# kept so that any external code reading it keeps working.
MODEL_NAME = "allenai/scibert_scivocab_cased"  # Example model for tokenization/embedding
SUMMARIZATION_MODEL = "allenai/led-base-16384"  # Example summarization model with a large context window

# Load the summarization tokenizer and model once at import time so that every
# request reuses the same instances instead of reloading the weights.
summarizer_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_MODEL)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_MODEL)
27
+
28
+ # ---------------------
29
+ # Utility Functions
30
+ # ---------------------
31
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF, one page per line.

    Args:
        pdf_file: Passed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        The page texts concatenated in order, each followed by a newline.
        On any failure (including PyPDF2 not being installed) the string
        ``"Error reading PDF: <reason>"`` is returned instead of raising.
    """
    try:
        # Imported lazily so a missing PyPDF2 is reported like any other read error.
        import PyPDF2

        reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` keeps a page that yields no text from breaking the concatenation.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
    except Exception as e:
        # Deliberate catch-all: callers expect a string back, never an exception.
        return f"Error reading PDF: {e}"
43
+
44
def clean_text(text):
    """Collapse every run of whitespace to a single space and trim both ends.

    Args:
        text: The raw text; ``None`` or an empty string is treated as no text.

    Returns:
        The normalized text, or ``""`` when there is nothing to clean.
    """
    if not text:
        return ""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
48
+
49
def summarize_text(text, max_input_tokens=16384, max_summary_tokens=512):
    """Generate an abstractive summary of ``text`` with the module-level model.

    Args:
        text: The text to summarize.
        max_input_tokens: Inputs longer than this many tokens are truncated.
        max_summary_tokens: ``max_length`` passed to ``generate`` for the summary.

    Returns:
        The decoded summary string with special tokens removed.
    """
    inputs = summarizer_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )
    generate_kwargs = {
        "attention_mask": inputs["attention_mask"],
        "num_beams": 4,
        "length_penalty": 2.0,
        "max_length": max_summary_tokens,
        "early_stopping": True,
    }

    # The LED documentation advises global attention on the first token for
    # summarization. Only LED understands this argument, so it is added
    # conditionally to keep other seq2seq models working.
    if getattr(summarizer_model.config, "model_type", None) == "led":
        global_attention_mask = torch.zeros_like(inputs["input_ids"])
        global_attention_mask[:, 0] = 1
        generate_kwargs["global_attention_mask"] = global_attention_mask

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        summary_ids = summarizer_model.generate(inputs["input_ids"], **generate_kwargs)

    return summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
63
+
64
def analyze_text(text):
    """Clean ``text`` and produce its summary plus placeholder analysis fields.

    In a more elaborate system this could also:
      1. Extract main findings using IE or NER.
      2. Identify methods mentioned.
      3. Extract references (regex patterns for citations).
    Here only a simple summarization is performed.

    Args:
        text: The raw article text.

    Returns:
        A ``(summary, methods, references)`` tuple of strings. When the cleaned
        text is shorter than 50 characters, the first element is a prompt asking
        for more input and the other two are empty.
    """
    text_clean = clean_text(text)
    if len(text_clean) < 50:
        # Always return three values: the caller unpacks a 3-tuple, so a bare
        # string here would raise ValueError during unpacking.
        return "Please provide a longer text snippet or PDF.", "", ""

    summary = summarize_text(text_clean)

    # Dummy logic for key methods and references (in a real app, use NLP-based extraction)
    methods = "Key methods extraction is not yet implemented."
    references = "Reference extraction is not yet implemented."
    return summary, methods, references
80
+
81
def process_input(pdf_file, text_snippet):
    """Build the input text from the UI fields and run the analysis.

    Args:
        pdf_file: The uploaded PDF, or ``None`` when nothing was uploaded.
        text_snippet: The pasted article text; may be ``None`` or blank.

    Returns:
        A ``(summary, methods, references)`` tuple of strings for the three
        output boxes. On a problem, the first element carries the message and
        the other two are empty.
    """
    input_text = ""
    if pdf_file is not None:
        input_text = extract_text_from_pdf(pdf_file)
        # extract_text_from_pdf reports failures as a string with this prefix;
        # show it to the user instead of summarizing the error message.
        if input_text.startswith("Error reading PDF:"):
            return input_text, "", ""

    # If a text snippet is provided, append it to whatever the PDF yielded.
    if text_snippet is not None and text_snippet.strip():
        input_text = input_text + " " + text_snippet.strip()

    if not input_text.strip():
        return "No input provided.", "", ""

    result = analyze_text(input_text)
    # Tolerate a bare message string so a short input shows the message rather
    # than failing on tuple unpacking.
    if isinstance(result, str):
        return result, "", ""
    summary, methods, references = result
    return summary, methods, references
96
+
97
+ # ---------------------
98
+ # Gradio Interface
99
+ # ---------------------
100
# NOTE(review): the nesting below was reconstructed from a flattened diff view.
# It assumes the row holds the two inputs and the button sits below it; confirm.
with gr.Blocks() as demo:
    gr.Markdown("# NeuroLit Explorer")
    gr.Markdown(DISCLAIMER)
    gr.Markdown("**Instructions:** Upload a PDF or paste a text snippet from a neuroscience article, then click 'Summarize'.")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF")
        text_input = gr.Textbox(label="Or Paste Article Text")
    summarize_button = gr.Button("Summarize")

    summary_output = gr.Textbox(label="Summary")
    methods_output = gr.Textbox(label="Key Methods")
    references_output = gr.Textbox(label="Relevant References")

    # One click fills all three output boxes from process_input's 3-tuple.
    summarize_button.click(
        fn=process_input,
        inputs=[pdf_input, text_input],
        outputs=[summary_output, methods_output, references_output],
    )

# Only start the server when run as a script, so importing this module
# (for tests or tooling) does not launch the app.
if __name__ == "__main__":
    demo.launch()