EdoAbati committed on
Commit
7980e1c
•
1 Parent(s): 9cddaca

added feature to parse all text from paper pdf

Browse files
Files changed (1) hide show
  1. app.py +79 -16
app.py CHANGED
@@ -1,24 +1,84 @@
 
 
1
  import re
 
 
2
 
3
  import gradio as gr
4
  import requests
5
  import xmltodict
 
6
  from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
7
  from transformers.pipelines.question_answering import QuestionAnsweringPipeline
8
 
9
  QA_MODEL_NAME = "ixa-ehu/SciBERT-SQuAD-QuAC"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
  def clean_text(text: str) -> str:
13
- text = re.sub("\n", " ", text)
 
 
14
  return text
15
 
16
 
17
- def get_paper_summary(arxiv_id: str) -> str:
18
- paper_url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
19
- response = requests.get(paper_url)
20
- paper_dict = xmltodict.parse(response.content)["feed"]["entry"]
21
- return clean_text(paper_dict["summary"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
 
24
  def get_qa_pipeline(qa_model_name: str = QA_MODEL_NAME) -> QuestionAnsweringPipeline:
@@ -36,24 +96,27 @@ def get_answer(question: str, context: str) -> str:
36
 
37
  demo = gr.Blocks()
38
 
39
-
40
  with demo:
41
- gr.Markdown("# Document QA")
42
 
43
- # Retrieve paper
44
- arxiv_id = gr.Textbox(
45
- label="arXiv Paper ID", placeholder="Insert here the ID of a paper on arXiv"
46
  )
47
- paper_summary = gr.Textbox(label="Paper summary")
48
- fetch_document_button = gr.Button("Get Summary")
 
 
49
  fetch_document_button.click(
50
- fn=get_paper_summary, inputs=arxiv_id, outputs=paper_summary
 
 
51
  )
52
 
53
- # QA on paper
54
  question = gr.Textbox(label="Ask a question about the paper:")
55
- answer = gr.Textbox("Answer:")
56
  ask_button = gr.Button("Ask me πŸ€–")
 
57
  ask_button.click(fn=get_answer, inputs=[question, paper_summary], outputs=answer)
58
 
59
 
 
1
+ from __future__ import annotations
2
+
3
  import re
4
+ from dataclasses import dataclass
5
+ from typing import Tuple
6
 
7
  import gradio as gr
8
  import requests
9
  import xmltodict
10
+ from PyPDF2 import PdfReader
11
  from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
12
  from transformers.pipelines.question_answering import QuestionAnsweringPipeline
13
 
14
  QA_MODEL_NAME = "ixa-ehu/SciBERT-SQuAD-QuAC"
15
# Default location the downloaded PDF is written to before it is parsed.
TEMP_PDF_PATH = "/tmp/arxiv_paper.pdf"
# Full-match pattern for arXiv PDF URLs. Group 3 is the arXiv identifier,
# optionally carrying a version suffix such as "v2". Dots are escaped so they
# match only a literal ".", and the host/path prefix must appear exactly once.
ARXIV_URL_PATTERN = (
    r"(http|https)://(arxiv\.org/pdf/)([0-9]+\.[0-9]+(?:v[0-9]+)?)\.pdf"
)
17
+
18
+
19
def is_valid_url(url: str) -> bool:
    """Return True when *url* matches ARXIV_URL_PATTERN in its entirety."""
    match = re.fullmatch(ARXIV_URL_PATTERN, url)
    return match is not None
21
+
22
+
23
@dataclass
class PaperMetaData:
    """Metadata and text of a single arXiv paper."""

    arxiv_id: str  # identifier passed to the arXiv API query
    title: str  # whitespace-normalised title from the API response
    summary: str  # whitespace-normalised summary from the API response
    text: str  # text supplied by the caller of from_api

    @staticmethod
    def _clean_field(text: str) -> str:
        """Collapse each whitespace run (newlines included) to one space and trim."""
        # r"\s+" already covers "\n", so a single substitution is enough.
        return re.sub(r"\s+", " ", text).strip()

    @classmethod
    def from_api(cls, arxiv_id: str, text: str) -> "PaperMetaData":
        """Build an instance from the arXiv export API.

        Args:
            arxiv_id: Identifier placed in the ``id_list`` query parameter.
            text: Paper text to store unchanged on the instance.

        Returns:
            A new instance with cleaned ``title`` and ``summary`` fields.

        Raises:
            requests.HTTPError: If the API answers with an error status.
            KeyError: If the returned feed contains no entry for ``arxiv_id``.
        """
        paper_url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
        # A timeout keeps a stalled API call from blocking the caller forever.
        response = requests.get(paper_url, timeout=30)
        response.raise_for_status()
        feed = xmltodict.parse(response.content)["feed"]
        if "entry" not in feed:
            raise KeyError(f"No arXiv entry found for id {arxiv_id!r}")
        paper_dict = feed["entry"]
        # Use cls so subclasses get instances of their own type.
        return cls(
            arxiv_id=arxiv_id,
            title=cls._clean_field(paper_dict["title"]),
            summary=cls._clean_field(paper_dict["summary"]),
            text=text,
        )
47
 
48
 
49
def clean_text(text: str) -> str:
    """Normalise text extracted from a PDF into a single line.

    Removes the ``\\x02``/``\\x03`` control characters, re-joins words that were
    hyphenated across a line break, and turns remaining newlines into spaces.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned text.
    """
    text = re.sub(r"[\x02\x03]", "", text)
    # Only undo hyphenation at a line break that follows a word character.
    # Stripping every "-<whitespace>" would also mangle in-line constructs
    # such as "pre- and post-processing" or " - " separators.
    text = re.sub(r"(?<=\w)-[ \t]*\n\s*", "", text)
    return text.replace("\n", " ")
54
 
55
 
56
class PDFPaper:
    """An arXiv paper addressed by the URL of its PDF."""

    def __init__(self, url: str):
        """Validate *url* and extract the arXiv identifier from it.

        Raises:
            ValueError: If *url* does not fully match ARXIV_URL_PATTERN.
        """
        # Match once and reuse the result for both validation and extraction.
        match = re.fullmatch(ARXIV_URL_PATTERN, url)
        if match is None:
            raise ValueError("The URL provided is not a valid arxiv PDF url.")
        self.url = url
        self.arxiv_id = match.group(3)

    def _download(self, download_path: str = TEMP_PDF_PATH) -> None:
        """Download the PDF at ``self.url`` and write it to *download_path*.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        # A timeout keeps a stalled download from blocking the caller forever.
        pdf_r = requests.get(self.url, timeout=60)
        pdf_r.raise_for_status()
        with open(download_path, "wb") as pdf_file:
            pdf_file.write(pdf_r.content)

    def read_text(self, pdf_path: str = TEMP_PDF_PATH) -> str:
        """Download the PDF to *pdf_path* and return its cleaned text."""
        self._download(pdf_path)
        reader = PdfReader(pdf_path)
        # Guard against pages for which no text could be extracted.
        pdf_text = " ".join(page.extract_text() or "" for page in reader.pages)
        return clean_text(pdf_text)

    def get_paper_full_data(self) -> PaperMetaData:
        """Return API metadata for this paper combined with its PDF text."""
        return PaperMetaData.from_api(arxiv_id=self.arxiv_id, text=self.read_text())
77
+
78
+
79
def get_paper_data(url: str) -> Tuple[str, str, str]:
    """Load the paper behind *url* and return ``(title, summary, text)``."""
    paper = PDFPaper(url=url)
    metadata = paper.get_paper_full_data()
    return (metadata.title, metadata.summary, metadata.text)
82
 
83
 
84
  def get_qa_pipeline(qa_model_name: str = QA_MODEL_NAME) -> QuestionAnsweringPipeline:
 
96
 
97
# Gradio UI: import an arXiv paper by URL, then ask questions about it.
demo = gr.Blocks()

with demo:
    gr.Markdown("# arXiv Paper Q&A\nImport an arXiv paper and ask questions about it!")

    gr.Markdown("## 📄 Import the paper on arXiv")
    arxiv_url = gr.Textbox(
        label="arXiv Paper URL", placeholder="Insert here the URL of a paper on arXiv"
    )
    fetch_document_button = gr.Button("Import Paper")
    paper_title = gr.Textbox(label="Paper Title")
    paper_summary = gr.Textbox(label="Paper Summary")
    paper_text = gr.Textbox(label="Paper Text")
    # get_paper_data returns (title, summary, text), matching the outputs order.
    fetch_document_button.click(
        fn=get_paper_data,
        inputs=arxiv_url,
        outputs=[paper_title, paper_summary, paper_text],
    )

    gr.Markdown("## 🤨 Ask a question about the paper")
    question = gr.Textbox(label="Ask a question about the paper:")
    ask_button = gr.Button("Ask me 🤖")
    answer = gr.Textbox(label="Answer:")
    # NOTE(review): the QA context is the summary box, not the full paper
    # text -- confirm this is intended.
    ask_button.click(fn=get_answer, inputs=[question, paper_summary], outputs=answer)
121
 
122