# save-web-as-zip / app.py
# Gradio app: fetch a webpage and its img/link/script assets and serve
# them back as a single ZIP download.
# (Hugging Face Spaces page chrome — author, commit hash, "raw/history
# blame", file size — was copy-paste residue, not program text, and has
# been converted into this comment header so the file parses.)
import os
import tempfile
from io import BytesIO
from urllib.parse import urljoin, urlparse
from zipfile import ZipFile

import gradio as gr
import requests
from bs4 import BeautifulSoup
def download_file(url, session, timeout=15):
    """Download a single asset and return its raw bytes.

    Args:
        url: Absolute URL of the asset to fetch.
        session: A ``requests.Session`` (or any object with a compatible
            ``get``) used to perform the request.
        timeout: Seconds to wait for the server before giving up. The
            original call had no timeout, so a single unresponsive host
            could hang the whole app indefinitely.

    Returns:
        The response body as ``bytes``, or ``None`` if the download
        failed for any network/HTTP reason (the error is printed, and
        the caller simply skips the asset).
    """
    try:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None
def save_webpage_as_zip(url):
    """Fetch *url* plus its img/link/script assets and zip them in memory.

    Args:
        url: The page URL to archive.

    Returns:
        A ``BytesIO`` positioned at 0, containing a ZIP with
        ``index.html`` at the root and each asset stored under its
        URL path.

    Raises:
        requests.exceptions.RequestException: if the main page itself
            cannot be fetched (individual asset failures are only
            logged and skipped by ``download_file``).
    """
    session = requests.Session()
    response = session.get(url, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect referenced asset URLs; <link> uses href, <img>/<script> use src.
    assets = []
    for tag in soup.find_all(['img', 'link', 'script']):
        value = tag.get('href' if tag.name == 'link' else 'src')
        if value:
            assets.append(value)

    zip_buffer = BytesIO()
    # TemporaryDirectory guarantees cleanup even if a download raises,
    # and gives each call its own directory — the old fixed
    # 'temp_webpage' dir was shared between concurrent requests and was
    # left behind whenever an error interrupted the manual cleanup.
    with tempfile.TemporaryDirectory() as temp_dir:
        with open(os.path.join(temp_dir, 'index.html'), 'wb') as f:
            f.write(response.content)

        root_abs = os.path.abspath(temp_dir)
        for asset in assets:
            asset_url = urljoin(url, asset)
            asset_path = urlparse(asset_url).path.lstrip('/')
            # Empty or directory-like paths (URL path '/' or '.../')
            # have nothing to save; the old code crashed on the empty case.
            if not asset_path or asset_path.endswith('/'):
                print(f"Skipping directory {asset_path}")
                continue
            asset_full_path = os.path.join(temp_dir, asset_path)
            # Refuse paths that '..' their way out of the temp directory.
            if not os.path.abspath(asset_full_path).startswith(root_abs + os.sep):
                print(f"Skipping unsafe path {asset_path}")
                continue
            os.makedirs(os.path.dirname(asset_full_path), exist_ok=True)
            content = download_file(asset_url, session)
            if content and not os.path.isdir(asset_full_path):
                with open(asset_full_path, 'wb') as f:
                    f.write(content)

        # Zip everything with paths relative to the temp root.
        with ZipFile(zip_buffer, 'w') as zipf:
            for root, _, files in os.walk(temp_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    zipf.write(file_path, os.path.relpath(file_path, temp_dir))

    zip_buffer.seek(0)
    return zip_buffer
def generate_zip_file(url):
    """Build the ZIP archive for *url* and write it to a unique file.

    A fresh ``NamedTemporaryFile`` replaces the original fixed
    ``webpage.zip`` path, which made concurrent Gradio requests
    overwrite each other's downloads.

    Args:
        url: The page URL to archive.

    Returns:
        Filesystem path of the ZIP, as expected by Gradio's File output.
    """
    zip_buffer = save_webpage_as_zip(url)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as f:
        f.write(zip_buffer.getvalue())
        return f.name
# Gradio Interface: one URL textbox, one button, one file output.
with gr.Blocks(theme="bethecloud/storj_theme") as demo:
    gr.Markdown("## Webpage to ZIP Downloader")
    gr.Markdown("Enter a URL to download the webpage and its assets as a ZIP file.")

    url_input = gr.Textbox(
        label="Website URL",
        placeholder="Enter a URL (e.g., https://www.example.com)",
    )
    download_button = gr.Button("Download as ZIP")
    output_file = gr.File(label="Download")

    # NOTE(review): the original `set_example_url` helper was removed —
    # it was never wired to any component, and assigning
    # `url_input.value` at runtime does not update a rendered Gradio UI.
    download_button.click(fn=generate_zip_file, inputs=url_input, outputs=output_file)

demo.launch()