Spaces:
Runtime error
Runtime error
davidpengg
committed on
Commit
·
f9b19f4
1
Parent(s):
43e0ac1
error handling
Browse files- app.py +8 -1
- download_pdf.py +6 -11
app.py
CHANGED
@@ -11,6 +11,13 @@ examples = [
|
|
11 |
"https://indianculture.gov.in/reports-proceedings/report-village-and-cottage-industries-national-committee-development-backward"
|
12 |
]
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
with gr.Blocks() as app:
|
15 |
gr.Markdown("# <p align='center'>Extract PDF from indianculture[dot]gov[dot]in</p>")
|
16 |
# with gr.Row():
|
@@ -25,7 +32,7 @@ with gr.Blocks() as app:
|
|
25 |
gr.Examples(examples=examples,inputs=landing_page_url,outputs=pdf_file)
|
26 |
|
27 |
landing_page_url_btrn.click(
|
28 |
-
|
29 |
inputs=landing_page_url,
|
30 |
outputs=pdf_file
|
31 |
)
|
|
|
11 |
"https://indianculture.gov.in/reports-proceedings/report-village-and-cottage-industries-national-committee-development-backward"
|
12 |
]
|
13 |
|
14 |
+
def try_download(url):
    """Run ``download(url)`` and re-raise any failure as ``gr.Error``.

    Args:
        url: Value passed straight through to ``download``; in this app it
            is wired to the landing-page URL input of the click handler.

    Returns:
        Whatever ``download(url)`` returns.

    Raises:
        gr.Error: If ``download`` raises any ``Exception``. The message is
            ``str()`` of the original exception.
    """
    try:
        return download(url)
    except Exception as e:
        # Chain explicitly so the original exception is preserved as
        # __cause__ in the traceback instead of only as implicit context.
        raise gr.Error(str(e)) from e
|
20 |
+
|
21 |
with gr.Blocks() as app:
|
22 |
gr.Markdown("# <p align='center'>Extract PDF from indianculture[dot]gov[dot]in</p>")
|
23 |
# with gr.Row():
|
|
|
32 |
gr.Examples(examples=examples,inputs=landing_page_url,outputs=pdf_file)
|
33 |
|
34 |
landing_page_url_btrn.click(
|
35 |
+
try_download,
|
36 |
inputs=landing_page_url,
|
37 |
outputs=pdf_file
|
38 |
)
|
download_pdf.py
CHANGED
@@ -6,22 +6,17 @@ David Peng
|
|
6 |
import requests
|
7 |
from bs4 import BeautifulSoup as bs
|
8 |
from urllib.parse import unquote
|
9 |
-
import time
|
10 |
import os
|
11 |
|
12 |
DEFAULT_TIMEOUT = 10
|
13 |
-
RETURN_CODE = 0
|
14 |
|
15 |
# script borrowed from https://github.com/lalitaalaalitah/Scrape_IndianCulture.Gov.In_Release
|
16 |
def download(book_page_url):
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
if book_page_get.status_code == 200:
|
23 |
-
break
|
24 |
-
time.sleep(10)
|
25 |
book_page_get = requests.get(book_page_url)
|
26 |
parsed_book_page = bs(book_page_get.content, 'html.parser')
|
27 |
class_pdf_in_page = parsed_book_page.find_all('iframe', class_='pdf')
|
@@ -40,4 +35,4 @@ def download(book_page_url):
|
|
40 |
os.system(cmd_for_curl)
|
41 |
return pdf_name
|
42 |
else:
|
43 |
-
|
|
|
6 |
import requests
|
7 |
from bs4 import BeautifulSoup as bs
|
8 |
from urllib.parse import unquote
|
|
|
9 |
import os
|
10 |
|
11 |
DEFAULT_TIMEOUT = 10
|
|
|
12 |
|
13 |
# script borrowed from https://github.com/lalitaalaalitah/Scrape_IndianCulture.Gov.In_Release
|
14 |
def download(book_page_url):
|
15 |
+
try:
|
16 |
+
book_page_get = requests.get(book_page_url, timeout=DEFAULT_TIMEOUT)
|
17 |
+
except Exception:
|
18 |
+
raise Exception("Bad URL!")
|
19 |
+
|
|
|
|
|
|
|
20 |
book_page_get = requests.get(book_page_url)
|
21 |
parsed_book_page = bs(book_page_get.content, 'html.parser')
|
22 |
class_pdf_in_page = parsed_book_page.find_all('iframe', class_='pdf')
|
|
|
35 |
os.system(cmd_for_curl)
|
36 |
return pdf_name
|
37 |
else:
|
38 |
+
raise Exception("Unexpected number of PDFs (=/= 1)!")
|