prithivMLmods committed on
Commit
dd55678
1 Parent(s): 2f0546d

Update standard.txt

Files changed (1)
  1. standard.txt +101 -0
standard.txt CHANGED
@@ -0,0 +1,101 @@
+ import os
+ import requests
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin, urlparse
+ from zipfile import ZipFile
+ from io import BytesIO
+ import gradio as gr
+
+ def download_file(url, session):
+     """Download a file and return its content."""
+     try:
+         response = session.get(url)
+         response.raise_for_status()
+         return response.content
+     except requests.exceptions.RequestException as e:
+         print(f"Error downloading {url}: {e}")
+         return None
+
+ def save_webpage_as_zip(url):
+     """Save a webpage and its assets as a ZIP file."""
+     session = requests.Session()
+     response = session.get(url)
+     response.raise_for_status()
+
+     soup = BeautifulSoup(response.content, 'html.parser')
+     temp_dir = 'temp_webpage'
+     if not os.path.exists(temp_dir):
+         os.makedirs(temp_dir)
+
+     main_html_path = os.path.join(temp_dir, 'index.html')
+     with open(main_html_path, 'wb') as f:
+         f.write(response.content)
+     assets = []
+     for tag in soup.find_all(['img', 'link', 'script']):
+         if tag.name == 'img' and tag.get('src'):
+             assets.append(tag['src'])
+         elif tag.name == 'link' and tag.get('href'):
+             assets.append(tag['href'])
+         elif tag.name == 'script' and tag.get('src'):
+             assets.append(tag['src'])
+
+     # Download and save all assets
+     for asset in assets:
+         asset_url = urljoin(url, asset)
+         asset_path = urlparse(asset_url).path.lstrip('/')
+         asset_full_path = os.path.join(temp_dir, asset_path)
+
+         if not asset_path or asset_path.endswith('/'):
+             print(f"Skipping directory {asset_full_path}")
+             continue
+
+
+         os.makedirs(os.path.dirname(asset_full_path), exist_ok=True)
+
+
+         content = download_file(asset_url, session)
+         if content:
+             if os.path.isdir(asset_full_path):
+                 print(f"Skipping directory {asset_full_path}")
+                 continue
+             with open(asset_full_path, 'wb') as f:
+                 f.write(content)
+
+     zip_buffer = BytesIO()
+     with ZipFile(zip_buffer, 'w') as zipf:
+         for root, _, files in os.walk(temp_dir):
+             for file in files:
+                 file_path = os.path.join(root, file)
+                 zipf.write(file_path, os.path.relpath(file_path, temp_dir))
+
+     for root, _, files in os.walk(temp_dir, topdown=False):
+         for file in files:
+             os.remove(os.path.join(root, file))
+         os.rmdir(root)
+
+     zip_buffer.seek(0)
+     return zip_buffer
+
+ def generate_zip_file(url):
+     """Generate ZIP file from a webpage URL."""
+     zip_buffer = save_webpage_as_zip(url)
+     temp_zip_path = "webpage.zip"
+     with open(temp_zip_path, 'wb') as f:
+         f.write(zip_buffer.read())
+     return temp_zip_path
+
+ with gr.Blocks(theme="bethecloud/storj_theme") as demo:
+     gr.Markdown("## Webpage to ZIP Downloader 🔗")
+     gr.Markdown("Enter a URL to download the webpage and its assets as a ZIP file.")
+
+     url_input = gr.Textbox(label="Website URL", placeholder="Enter a URL (e.g., https://www.example.com)")
+
+     download_button = gr.Button("Download as ZIP")
+     output_file = gr.File(label="Download")
+
+     def set_example_url(url):
+         url_input.value = url
+
+     download_button.click(fn=generate_zip_file, inputs=url_input, outputs=output_file)
+
+ demo.launch()
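
For quick local testing, the download logic can also be exercised without the Gradio UI. A minimal sketch, assuming the file above is saved as webpage_zip.py (a hypothetical module name) and the trailing demo.launch() call is moved under an if __name__ == "__main__": guard so importing the module does not start the web app:

# Call save_webpage_as_zip directly and write the resulting ZIP to disk
from webpage_zip import save_webpage_as_zip

buffer = save_webpage_as_zip("https://www.example.com")
with open("example_page.zip", "wb") as f:
    f.write(buffer.getvalue())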