import streamlit as st
import subprocess
import os
import json
import re


def clean_string_for_filename(s):
    """Clean a string to make it safe for use as a filename."""
    s = re.sub(r"[^\w\s-]", "", s)  # Remove invalid characters
    s = re.sub(r"\s+", "_", s)  # Replace spaces with underscores
    return s.strip("_")


def check_scraping_status(log_file="scraping_status.log"):
    """Return the last status line written by the spider, if any."""
    try:
        with open(log_file, "r") as file:
            lines = file.readlines()
        # Strip the trailing newline so comparisons like == "Scraping running" work
        return lines[-1].strip() if lines else "Scraping not run yet"
    except FileNotFoundError:
        return "Scraping not run yet"  # Log file does not exist; scraping has never run


def run_scraping(url, depth_limit, pagecount_limit):
    # Generate a safe filename based on the URL
    identifier = clean_string_for_filename(url)
    output_filename = f"output_{identifier}.json"
    if os.path.exists(output_filename):
        os.remove(output_filename)
    # Launch the spider in the background without blocking the Streamlit script;
    # passing arguments as a list avoids shell-quoting issues with the URL
    subprocess.Popen([
        "scrapy", "runspider", "homespider.py",
        "-a", f"start_url={url}",
        "-a", f"depth_limit={depth_limit}",
        "-a", f"pagecount_limit={pagecount_limit}",
        "-o", output_filename,
    ])
    st.success(f"Scraping started. Output will be saved to {output_filename}.")
    return output_filename


# Streamlit interface
st.title("Scraping Tool with URL-based Output File")

col1, col2 = st.columns(2)
with col1:
    depth_limit = st.slider("Depth Limit", min_value=1, value=2, max_value=5, step=1)
with col2:
    pagecount_limit = st.slider("Page Count Limit", min_value=10, value=10, max_value=50, step=10)

url = st.text_input("Enter URL", value="https://bsenst.github.io/toscrape/app-website/")

if st.button("Run Scraping"):
    if check_scraping_status() == "Scraping running":
        st.warning("Scraping in progress...")
    else:
        output_filename = run_scraping(url, depth_limit, pagecount_limit)

if st.button("Status Scraping"):
    identifier = clean_string_for_filename(url)
    output_file = f"output_{identifier}.json"
    if check_scraping_status() == "Scraping running":
        st.warning("Scraping is running.")
    elif os.path.exists(output_file):
        try:
            with open(output_file, "r") as f:
                scraped_data = json.load(f)
            page_count = len(scraped_data)
            # Show download button if output file exists
            st.download_button(
                "Download Scraping Output",
                data=json.dumps(scraped_data),
                file_name=output_file,
            )
            # Display number of pages scraped
            st.write(f"{page_count} pages scraped:")
            # Display scraping results
            st.write([(el["url"], el["title"]) for el in scraped_data])
        except Exception as e:
            st.warning(f"Error opening {output_file}: {e}")
    else:
        st.warning("No output file found. Please run the scraping command.")
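
# ---------------------------------------------------------------------------
# homespider.py — a minimal sketch of a spider compatible with the app above.
# Everything below is an assumption inferred from this script (the spider
# arguments, the status-log lines "Scraping running"/"Scraping finished", and
# the "url"/"title" output fields); the real homespider.py may differ.
# Run the app itself with: streamlit run <this_file>.py
# ---------------------------------------------------------------------------
import scrapy


class HomeSpider(scrapy.Spider):
    name = "homespider"

    def __init__(self, start_url=None, depth_limit=2, pagecount_limit=10, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = [start_url] if start_url else []
        self.depth_limit = int(depth_limit)
        self.pagecount_limit = int(pagecount_limit)
        self.pagecount = 0

    def start_requests(self):
        # Mark the crawl as running; the app polls the last line of this file
        with open("scraping_status.log", "a") as f:
            f.write("Scraping running\n")
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        if self.pagecount >= self.pagecount_limit:
            return
        self.pagecount += 1
        # Emit the fields the app displays: url and title
        yield {"url": response.url, "title": response.css("title::text").get()}
        # Scrapy's DepthMiddleware records depth in response.meta;
        # stop following links once the limit is reached
        if response.meta.get("depth", 0) < self.depth_limit:
            for href in response.css("a::attr(href)").getall():
                yield response.follow(href, callback=self.parse)

    def closed(self, reason):
        # Called by Scrapy when the crawl ends; flip the status for the app
        with open("scraping_status.log", "a") as f:
            f.write("Scraping finished\n")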