# save-web-as-zip / app.py
# Gradio app: fetch a webpage and its img/link/script assets and serve
# them back as a single ZIP download.
# (Hugging Face Spaces page chrome — author, commit hash, "raw/history
# blame", file size — was copy-paste residue, not program text, and has
# been converted into this comment header so the file parses.)
import os
import tempfile
from io import BytesIO
from urllib.parse import urljoin, urlparse
from zipfile import ZipFile

import gradio as gr
import requests
from bs4 import BeautifulSoup
def download_file(url, session, timeout=15):
    """Download a single asset and return its raw bytes.

    Args:
        url: Absolute URL of the asset to fetch.
        session: A ``requests.Session`` (or any object with a compatible
            ``get``) used to perform the request.
        timeout: Seconds to wait for the server before giving up. The
            original call had no timeout, so a single unresponsive host
            could hang the whole app indefinitely.

    Returns:
        The response body as ``bytes``, or ``None`` if the download
        failed for any network/HTTP reason (the error is printed, and
        the caller simply skips the asset).
    """
    try:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None
def save_webpage_as_zip(url):
    """Fetch *url* plus its img/link/script assets and zip them in memory.

    Args:
        url: The page URL to archive.

    Returns:
        A ``BytesIO`` positioned at 0, containing a ZIP with
        ``index.html`` at the root and each asset stored under its
        URL path.

    Raises:
        requests.exceptions.RequestException: if the main page itself
            cannot be fetched (individual asset failures are only
            logged and skipped by ``download_file``).
    """
    session = requests.Session()
    response = session.get(url, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect referenced asset URLs; <link> uses href, <img>/<script> use src.
    assets = []
    for tag in soup.find_all(['img', 'link', 'script']):
        value = tag.get('href' if tag.name == 'link' else 'src')
        if value:
            assets.append(value)

    zip_buffer = BytesIO()
    # TemporaryDirectory guarantees cleanup even if a download raises,
    # and gives each call its own directory — the old fixed
    # 'temp_webpage' dir was shared between concurrent requests and was
    # left behind whenever an error interrupted the manual cleanup.
    with tempfile.TemporaryDirectory() as temp_dir:
        with open(os.path.join(temp_dir, 'index.html'), 'wb') as f:
            f.write(response.content)

        root_abs = os.path.abspath(temp_dir)
        for asset in assets:
            asset_url = urljoin(url, asset)
            asset_path = urlparse(asset_url).path.lstrip('/')
            # Empty or directory-like paths (URL path '/' or '.../')
            # have nothing to save; the old code crashed on the empty case.
            if not asset_path or asset_path.endswith('/'):
                print(f"Skipping directory {asset_path}")
                continue
            asset_full_path = os.path.join(temp_dir, asset_path)
            # Refuse paths that '..' their way out of the temp directory.
            if not os.path.abspath(asset_full_path).startswith(root_abs + os.sep):
                print(f"Skipping unsafe path {asset_path}")
                continue
            os.makedirs(os.path.dirname(asset_full_path), exist_ok=True)
            content = download_file(asset_url, session)
            if content and not os.path.isdir(asset_full_path):
                with open(asset_full_path, 'wb') as f:
                    f.write(content)

        # Zip everything with paths relative to the temp root.
        with ZipFile(zip_buffer, 'w') as zipf:
            for root, _, files in os.walk(temp_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    zipf.write(file_path, os.path.relpath(file_path, temp_dir))

    zip_buffer.seek(0)
    return zip_buffer
def generate_zip_file(url):
    """Build the ZIP archive for *url* and write it to a unique file.

    A fresh ``NamedTemporaryFile`` replaces the original fixed
    ``webpage.zip`` path, which made concurrent Gradio requests
    overwrite each other's downloads.

    Args:
        url: The page URL to archive.

    Returns:
        Filesystem path of the ZIP, as expected by Gradio's File output.
    """
    zip_buffer = save_webpage_as_zip(url)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as f:
        f.write(zip_buffer.getvalue())
        return f.name
# Gradio Interface: one URL textbox, one button, one file output.
with gr.Blocks(theme="bethecloud/storj_theme") as demo:
    gr.Markdown("## Webpage to ZIP Downloader")
    gr.Markdown("Enter a URL to download the webpage and its assets as a ZIP file.")

    url_input = gr.Textbox(
        label="Website URL",
        placeholder="Enter a URL (e.g., https://www.example.com)",
    )
    download_button = gr.Button("Download as ZIP")
    output_file = gr.File(label="Download")

    # NOTE(review): the original `set_example_url` helper was removed —
    # it was never wired to any component, and assigning
    # `url_input.value` at runtime does not update a rendered Gradio UI.
    download_button.click(fn=generate_zip_file, inputs=url_input, outputs=output_file)

demo.launch()