wing-nus dyxohjl666 committed on
Commit
7236411
1 Parent(s): 3893bea

Support url input (#14)

Browse files

- Support url input (f7eb0178c5732d964fba21fb274dcada7799686d)


Co-authored-by: Yixi Ding <dyxohjl666@users.noreply.huggingface.co>

Files changed (2) hide show
  1. app.py +10 -29
  2. controlled_summarization.py +37 -9
app.py CHANGED
@@ -6,27 +6,6 @@ from controlled_summarization import *
6
  from dataset_extraction import *
7
 
8
  import requests
9
- def download_pdf(url, dest_folder):
10
-
11
- """
12
- Download a PDF from a given URL and save it to a specified destination folder.
13
- Parameters:
14
- url (str): URL of the PDF
15
- dest_folder (str): Destination folder to save the downloaded PDF
16
- """
17
-
18
- if not os.path.exists(dest_folder):
19
- os.makedirs(dest_folder)
20
-
21
- response = requests.get(url, stream=True)
22
- filename = os.path.join(dest_folder, url.split("/")[-1])
23
-
24
- with open(filename, 'wb') as file:
25
- for chunk in response.iter_content(chunk_size=1024):
26
- if chunk:
27
- file.write(chunk)
28
- print(f"Downloaded {url} to {filename}")
29
- return filename
30
 
31
  # Example Usage
32
  #url = "https://arxiv.org/pdf/2305.14996.pdf"
@@ -45,8 +24,8 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
45
  gr.Markdown(ctrlsum_file_md)
46
  with gr.Row():
47
  with gr.Column():
48
- ctrlsum_url = gr.TextArea(label="PDF URL", max_lines=1)
49
- ctrlsum_file = gr.File(label="Input File", max_lines=2)
50
  ctrlsum_str = gr.TextArea(label="Input String", max_lines=5)
51
  with gr.Column():
52
  gr.Markdown("* Length 0 will exert no control over length.")
@@ -63,19 +42,21 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
63
  ctrlsum_file_examples = gr.Examples(examples=[["examples/H01-1042_body.txt", 50, "automatic evaluation technique"],["examples/H01-1042.pdf", 0, "automatic evaluation technique"]],
64
  inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords])
65
 
66
- if len(ctrlsum_url.value) > 4:
67
- ctrlsum_file = download_pdf(ctrlsum_url.value, './cache/')
68
 
69
  ctrlsum_file_btn.click(
70
  fn=ctrlsum_for_file,
71
- inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_str],
72
- outputs=[ctrlsum_file_output, ctrlsum_str]
73
  )
74
  def clear():
75
- return None,0,None
76
 
77
- ctrlsum_file.change(clear, inputs=None,outputs=[ctrlsum_str,ctrlsum_file_length,ctrlsum_file_keywords])
78
 
 
 
 
 
79
  # Reference String Parsing
80
  with gr.TabItem("Reference String Parsing"):
81
  with gr.Box():
 
6
  from dataset_extraction import *
7
 
8
  import requests
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  # Example Usage
11
  #url = "https://arxiv.org/pdf/2305.14996.pdf"
 
24
  gr.Markdown(ctrlsum_file_md)
25
  with gr.Row():
26
  with gr.Column():
27
+ ctrlsum_url = gr.Textbox(label="PDF URL", max_lines=1)
28
+ ctrlsum_file = gr.File(label="Input File")
29
  ctrlsum_str = gr.TextArea(label="Input String", max_lines=5)
30
  with gr.Column():
31
  gr.Markdown("* Length 0 will exert no control over length.")
 
42
  ctrlsum_file_examples = gr.Examples(examples=[["examples/H01-1042_body.txt", 50, "automatic evaluation technique"],["examples/H01-1042.pdf", 0, "automatic evaluation technique"]],
43
  inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords])
44
 
45
+
 
46
 
47
  ctrlsum_file_btn.click(
48
  fn=ctrlsum_for_file,
49
+ inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_str, ctrlsum_url],
50
+ outputs=[ctrlsum_file_output, ctrlsum_str, ctrlsum_file]
51
  )
52
  def clear():
53
+ return None,0,None, None
54
 
 
55
 
56
+ ctrlsum_file.upload(clear, inputs=None,outputs=[ctrlsum_str,ctrlsum_file_length,ctrlsum_file_keywords, ctrlsum_url])
57
+ ctrlsum_url.input(clear, inputs=None, outputs=[ctrlsum_str, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_file])
58
+ ctrlsum_str.input(clear, inputs=None,
59
+ outputs=[ctrlsum_url, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_file])
60
  # Reference String Parsing
61
  with gr.TabItem("Reference String Parsing"):
62
  with gr.Box():
controlled_summarization.py CHANGED
@@ -1,12 +1,35 @@
1
  from typing import List, Tuple
2
  import torch
3
  from SciAssist import Summarization
4
-
 
5
  device = "gpu" if torch.cuda.is_available() else "cpu"
6
 
7
  ctrlsum_pipeline = Summarization(os_name="nt",checkpoint="google/flan-t5-base",device=device)
8
 
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  def ctrlsum_for_str(input,length=None, keywords=None) -> List[Tuple[str, str]]:
11
 
12
  if keywords is not None:
@@ -24,15 +47,20 @@ def ctrlsum_for_str(input,length=None, keywords=None) -> List[Tuple[str, str]]:
24
  return "".join(output)
25
 
26
 
27
- def ctrlsum_for_file(input, length=None, keywords=None, text="") -> List[Tuple[str, str]]:
28
- if input == None:
29
  if text=="":
30
- return None
31
  else:
32
- return ctrlsum_for_str(text,length,keywords),text
33
  else:
34
- filename = input.name
35
- if keywords is not None:
 
 
 
 
 
36
  keywords = keywords.strip().split(",")
37
  if keywords[0] == "":
38
  keywords = None
@@ -47,12 +75,12 @@ def ctrlsum_for_file(input, length=None, keywords=None, text="") -> List[Tuple[s
47
  results = ctrlsum_pipeline.predict(filename,
48
  save_results=False, length=length, keywords=keywords)
49
  else:
50
- return [("File Format Error !", None)]
51
 
52
  output = []
53
  for res in results["summary"]:
54
  output.append(f"{res}\n\n")
55
- return "".join(output), results["raw_text"]
56
 
57
 
58
 
 
1
  from typing import List, Tuple
2
  import torch
3
  from SciAssist import Summarization
4
+ import os
5
+ import requests
6
  device = "gpu" if torch.cuda.is_available() else "cpu"
7
 
8
  ctrlsum_pipeline = Summarization(os_name="nt",checkpoint="google/flan-t5-base",device=device)
9
 
10
 
11
+ def download_pdf(url, dest_folder):
12
+ """
13
+ Download a PDF from a given URL and save it to a specified destination folder.
14
+ Parameters:
15
+ url (str): URL of the PDF
16
+ dest_folder (str): Destination folder to save the downloaded PDF
17
+ """
18
+
19
+ if not os.path.exists(dest_folder):
20
+ os.makedirs(dest_folder)
21
+
22
+ response = requests.get(url, stream=True)
23
+ filename = os.path.join(dest_folder, url.split("/")[-1])
24
+
25
+ with open(filename, 'wb') as file:
26
+ for chunk in response.iter_content(chunk_size=1024):
27
+ if chunk:
28
+ file.write(chunk)
29
+ print(f"Downloaded {url} to {filename}")
30
+ return filename
31
+
32
+
33
  def ctrlsum_for_str(input,length=None, keywords=None) -> List[Tuple[str, str]]:
34
 
35
  if keywords is not None:
 
47
  return "".join(output)
48
 
49
 
50
+ def ctrlsum_for_file(input=None, length=None, keywords="", text="", url="") -> List[Tuple[str, str, str]]:
51
+ if input == None and url == "":
52
  if text=="":
53
+ return None,"Input cannot be left blank.",None
54
  else:
55
+ return ctrlsum_for_str(text,length,keywords),text, None
56
  else:
57
+ filename=""
58
+ if url != "":
59
+ if len(url) > 4:
60
+ filename = download_pdf(url, './cache/')
61
+ else:
62
+ filename = input.name
63
+ if keywords != "":
64
  keywords = keywords.strip().split(",")
65
  if keywords[0] == "":
66
  keywords = None
 
75
  results = ctrlsum_pipeline.predict(filename,
76
  save_results=False, length=length, keywords=keywords)
77
  else:
78
+ return "File Format Error !", None, filename
79
 
80
  output = []
81
  for res in results["summary"]:
82
  output.append(f"{res}\n\n")
83
+ return "".join(output), results["raw_text"], filename
84
 
85
 
86