mynkchaudhry committed on
Commit
9e91a36
·
verified ·
1 Parent(s): 91a21a6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -0
app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from urllib.parse import urljoin, urlparse
5
+ from zipfile import ZipFile
6
+ from io import BytesIO
7
+ import gradio as gr
8
+
9
def download_file(url, session, timeout=30):
    """Download a file and return its content.

    Args:
        url: Absolute URL of the resource to fetch.
        session: Object exposing a ``get(url, timeout=...)`` method, normally
            a ``requests.Session`` shared across downloads.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled server would block the caller forever.

    Returns:
        The response body as ``bytes``, or ``None`` when the request fails
        (the failure is reported on stdout rather than raised, so one broken
        asset does not abort a whole page export).
    """
    try:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None
    return response.content
18
+
19
def _collect_asset_refs(soup):
    """Return the unique src/href values of img, link and script tags, in document order."""
    attr_by_tag = {'img': 'src', 'link': 'href', 'script': 'src'}
    refs = []
    for tag in soup.find_all(list(attr_by_tag)):
        ref = tag.get(attr_by_tag[tag.name])
        if ref:
            refs.append(ref)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(refs))


def _resolve_asset_path(base_dir, asset_url):
    """Return the file path under base_dir for asset_url, or None if it must be skipped."""
    parsed = urlparse(asset_url)
    if parsed.scheme not in ('http', 'https'):
        # data:, mailto:, javascript: ... are not downloadable files.
        return None
    relative_path = parsed.path.lstrip('/')
    if not relative_path or relative_path.endswith('/'):
        # No file name component: the URL points at a "directory".
        return None
    root = os.path.realpath(base_dir)
    candidate = os.path.realpath(os.path.join(root, relative_path))
    # The HTML is untrusted: reject paths (e.g. containing "..") that would
    # resolve outside the working directory.
    if not candidate.startswith(root + os.sep):
        return None
    return candidate


def save_webpage_as_zip(url):
    """Save a webpage and its assets as a ZIP file.

    The page at ``url`` is stored as ``index.html``; every ``img``/``script``
    ``src`` and ``link`` ``href`` found in it is downloaded and stored under
    the path component of its URL.

    Args:
        url: Absolute URL of the page to archive.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the ZIP archive.

    Raises:
        requests.exceptions.RequestException: If the main page cannot be
            fetched. Failures on individual assets are reported on stdout
            and skipped.
    """
    import tempfile  # function-scope import keeps this block self-contained

    with requests.Session() as session:
        response = session.get(url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # A unique, auto-removed working directory: concurrent calls cannot
        # clobber each other and nothing is left behind on errors.
        with tempfile.TemporaryDirectory(prefix='temp_webpage_') as temp_dir:
            with open(os.path.join(temp_dir, 'index.html'), 'wb') as f:
                f.write(response.content)

            for asset in _collect_asset_refs(soup):
                asset_url = urljoin(url, asset)
                asset_full_path = _resolve_asset_path(temp_dir, asset_url)
                if asset_full_path is None:
                    # Truncate: data: URIs can be extremely long.
                    print(f"Skipping asset {asset_url[:100]}")
                    continue

                content = download_file(asset_url, session)
                if not content:
                    continue

                try:
                    os.makedirs(os.path.dirname(asset_full_path), exist_ok=True)
                    if os.path.isdir(asset_full_path):
                        print(f"Skipping directory {asset_full_path}")
                        continue
                    with open(asset_full_path, 'wb') as f:
                        f.write(content)
                except OSError as e:
                    # E.g. one asset is named like another asset's parent
                    # directory, or the name is too long for the filesystem.
                    print(f"Error saving {asset_url[:100]}: {e}")

            zip_buffer = BytesIO()
            with ZipFile(zip_buffer, 'w') as zipf:
                for root, _, files in os.walk(temp_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        zipf.write(file_path, os.path.relpath(file_path, temp_dir))

    zip_buffer.seek(0)
    return zip_buffer
75
+
76
def generate_zip_file(url):
    """Generate ZIP file from a webpage URL.

    Args:
        url: URL of the webpage to archive.

    Returns:
        Filesystem path of the written archive. The file is always named
        ``webpage.zip`` but lives in a freshly created temporary directory,
        so simultaneous requests never overwrite each other's download.
    """
    import tempfile  # function-scope import keeps this block self-contained

    zip_buffer = save_webpage_as_zip(url)
    # NOTE: the directory is intentionally not removed here; the returned
    # path must stay valid until the caller has served the file.
    output_dir = tempfile.mkdtemp(prefix="webpage_zip_")
    temp_zip_path = os.path.join(output_dir, "webpage.zip")
    with open(temp_zip_path, 'wb') as f:
        f.write(zip_buffer.getvalue())
    return temp_zip_path
83
+
84
# Sample URLs offered as one-click examples below the input box.
examples = [
    "https://www.bmw.com/en/index.html",
    "https://www.ferrari.com/en-EN",
    "https://streamlit.io/"
]

# Markdown heading rendered at the top of the page.
DESCRIPTION = """

## Webpage to ZIP Downloader 🔗
"""

with gr.Blocks(theme="gstaff/whiteboard") as demo:  # Custom theme
    gr.Markdown(DESCRIPTION)
    gr.Markdown("Enter a URL to download the webpage and its assets as a ZIP file.")

    url_input = gr.Textbox(label="Website URL", placeholder="Enter a URL (e.g., https://www.example.com)")

    download_button = gr.Button("Download as ZIP")
    output_file = gr.File(label="Download")

    # Clicking the button archives the entered URL and offers the ZIP for download.
    download_button.click(fn=generate_zip_file, inputs=url_input, outputs=output_file)

    # gr.Examples fills url_input itself when an example is clicked, so no
    # separate "set example" callback is needed.
    gr.Examples(
        examples=examples,
        inputs=url_input,
        outputs=output_file,
        fn=generate_zip_file
    )

demo.launch()