add PDF URL crawling

#12
by qolina - opened
Files changed (1) hide show
  1. app.py +35 -2
app.py CHANGED
@@ -5,6 +5,35 @@ from reference_string_parsing import *
5
  from controlled_summarization import *
6
  from dataset_extraction import *
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
9
  gr.Markdown("# Gradio Demo for SciAssist")
10
  with gr.Tabs():
@@ -16,7 +45,8 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
16
  gr.Markdown(ctrlsum_file_md)
17
  with gr.Row():
18
  with gr.Column():
19
- ctrlsum_file = gr.File(label="Input File")
 
20
  ctrlsum_str = gr.TextArea(label="Input String", max_lines=5)
21
  with gr.Column():
22
  gr.Markdown("* Length 0 will exert no control over length.")
@@ -33,6 +63,9 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
33
  ctrlsum_file_examples = gr.Examples(examples=[["examples/H01-1042_body.txt", 50, "automatic evaluation technique"],["examples/H01-1042.pdf", 0, "automatic evaluation technique"]],
34
  inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords])
35
 
 
 
 
36
  ctrlsum_file_btn.click(
37
  fn=ctrlsum_for_file,
38
  inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_str],
@@ -143,4 +176,4 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
143
  )
144
 
145
 
146
- demo.launch(share=False)
 
5
  from controlled_summarization import *
6
  from dataset_extraction import *
7
 
8
+ import requests
9
def download_pdf(url, dest_folder):
    """
    Download a PDF from a given URL and save it to a specified destination folder.

    Parameters:
    url (str): URL of the PDF
    dest_folder (str): Destination folder to save the downloaded PDF

    Returns:
    str: Path of the saved file inside dest_folder.

    Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.RequestException: On connection failures or timeouts.
    """
    # Function-scope imports so the block does not depend on names that
    # may or may not be re-exported by the file's star imports.
    import os
    from urllib.parse import urlparse

    # Text boxes often deliver surrounding whitespace / a trailing newline.
    url = url.strip()

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(dest_folder, exist_ok=True)

    # Derive the file name from the URL *path* only, so "?query" and
    # "#fragment" parts never end up in the name on disk.
    name = os.path.basename(urlparse(url).path)
    if name in ("", ".", ".."):
        # URL ended with "/" (or similar): fall back to a fixed name instead
        # of trying to open the destination directory itself for writing.
        name = "downloaded.pdf"
    filename = os.path.join(dest_folder, name)

    # (connect, read) timeouts keep a stalled server from hanging the app;
    # the context manager releases the streamed connection when done.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        # Fail loudly on HTTP errors rather than saving an error page as a PDF.
        response.raise_for_status()
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    file.write(chunk)
    return filename
30
+
31
+ # Example Usage
32
+ #url = "https://arxiv.org/pdf/2305.14996.pdf"
33
+ #dest_folder = "./examples/"
34
+ #download_pdf(url, dest_folder)
35
+
36
+
37
  with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
38
  gr.Markdown("# Gradio Demo for SciAssist")
39
  with gr.Tabs():
 
45
  gr.Markdown(ctrlsum_file_md)
46
  with gr.Row():
47
  with gr.Column():
48
+ ctrlsum_url = gr.TextArea(label="PDF URL", max_lines=1)
49
+ ctrlsum_file = gr.File(label="Input File", max_lines=2)
50
  ctrlsum_str = gr.TextArea(label="Input String", max_lines=5)
51
  with gr.Column():
52
  gr.Markdown("* Length 0 will exert no control over length.")
 
63
  ctrlsum_file_examples = gr.Examples(examples=[["examples/H01-1042_body.txt", 50, "automatic evaluation technique"],["examples/H01-1042.pdf", 0, "automatic evaluation technique"]],
64
  inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords])
65
 
66
+ if ctrlsum_url is not None and len(ctrlsum_url) > 4:
67
+ ctrlsum_file = download_pdf(ctrlsum_url, './examples/')
68
+
69
  ctrlsum_file_btn.click(
70
  fn=ctrlsum_for_file,
71
  inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_str],
 
176
  )
177
 
178
 
179
+ demo.launch(share=False)