import streamlit as st
import requests
from bs4 import BeautifulSoup
from lxml import html

def extract_links_with_xpath(content, xpath):
    """Evaluate an XPath expression against HTML and return the matches as text.

    Args:
        content: HTML markup to parse.
        xpath: XPath expression to evaluate on the parsed document.

    Returns:
        On success, a list with one entry per match:
          * element matches (e.g. ``//a[@href]``) contribute ``elem.text``,
            which is ``None`` when the element has no leading text;
          * attribute or text-node matches (e.g. ``//a/@href``,
            ``//a/text()``) contribute the matched string itself;
          * scalar expressions (``count(...)``, ``string(...)``,
            ``boolean(...)``) yield a single-item list holding ``str(value)``.
        On failure, a str starting with ``"Error processing the XPath:"``.
    """
    try:
        tree = html.fromstring(content)
        result = tree.xpath(xpath)

        # Scalar XPath expressions do not return a list; wrap the value so
        # callers always receive a list on success.
        if not isinstance(result, list):
            return [str(result)]

        # Attribute and text() selections come back as strings, which have no
        # ``.text`` attribute; pass them through instead of failing.
        return [
            str(item) if isinstance(item, str) else item.text
            for item in result
        ]
    except Exception as e:
        return f"Error processing the XPath: {e}"


def extract_links_with_css(content, css_selector):
    """Return the text of every element matched by a CSS selector.

    Args:
        content: HTML markup to parse with BeautifulSoup's ``html.parser``.
        css_selector: CSS selector passed to ``soup.select``.

    Returns:
        A list with the ``.text`` of each matching element, or a str starting
        with ``"Error processing the CSS selector:"`` if parsing or selecting
        raised an exception.
    """
    try:
        document = BeautifulSoup(content, 'html.parser')
        texts = []
        for tag in document.select(css_selector):
            texts.append(tag.text)
    except Exception as exc:
        return f"Error processing the CSS selector: {exc}"
    return texts

def extract_urls_from_url(url, timeout=10):
    """Download a web page and return its body as text.

    Despite the name, this function does not extract anything itself; it only
    fetches the page so that one of the extractor functions can process it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on an unresponsive
            server, which would hang the app.

    Returns:
        The response body as a str on success. On any failure (invalid URL,
        connection problem, timeout, HTTP error status) a str starting with
        ``"Error processing the URL:"`` is returned instead.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they are reported as errors.
        response.raise_for_status()
        return response.text
    except Exception as e:
        return f"Error processing the URL: {e}"

# Prefix of the error strings produced by extract_urls_from_url(). That
# function returns a str both for the page HTML and for a failure message, so
# the prefix is the only way to tell the two apart.
_FETCH_ERROR_PREFIX = "Error processing the URL:"


def _choose_extraction():
    """Render the method selector and its expression field.

    Returns:
        A ``(method, expression)`` tuple where ``method`` is ``"XPath"`` or
        ``"CSS Selector"`` and ``expression`` is the text the user entered.
    """
    method = st.selectbox("Wählen Sie die Extraktionsmethode:", ("XPath", "CSS Selector"))
    if method == "XPath":
        expression = st.text_input("Geben Sie den XPath-Ausdruck ein:", placeholder="//a[@href]", value="//a[@href]")
    else:
        expression = st.text_input("Geben Sie den CSS-Selektor ein:", placeholder="a[href]", value="a[href]")
    return method, expression


def _extract_and_show(content, method, expression):
    """Run the chosen extractor on ``content`` and render matches or the error."""
    if method == "XPath":
        links = extract_links_with_xpath(content, expression)
    else:
        links = extract_links_with_css(content, expression)

    # The extractors return a list on success and an error string on failure.
    if not isinstance(links, list):
        st.error(links)
        return

    if not links:
        st.info("Keine Treffer gefunden.")
        return

    # dict.fromkeys removes duplicates while keeping document order; a set
    # would display the results in arbitrary order.
    for link in dict.fromkeys(links):
        st.write(link)


# Streamlit App
st.title("Webseiten-URL-Extraktor")

# Input options: URL or HTML file
input_option = st.radio("Wählen Sie die Eingabemethode:", ("URL", "HTML-Datei hochladen"))

if input_option == "URL":
    url_input = st.text_input(
        "Gib die URL der Webseite ein:",
        placeholder="https://bsenst.github.io/toscrape/",
        value="https://bsenst.github.io/toscrape/",
    )
    extraction_method, custom_input = _choose_extraction()

    if st.button("Extrahieren"):
        if url_input:
            st.write(f"Extrahiere von: {url_input}")
            page_content = extract_urls_from_url(url_input)

            # A fetch failure also arrives as a str, so an isinstance check
            # alone cannot detect it; look for the error prefix instead.
            fetch_failed = (
                not isinstance(page_content, str)
                or page_content.startswith(_FETCH_ERROR_PREFIX)
            )
            if fetch_failed:
                st.error(page_content)
            else:
                _extract_and_show(page_content, extraction_method, custom_input)
        else:
            st.warning("Bitte geben Sie eine gültige URL ein.")

elif input_option == "HTML-Datei hochladen":
    uploaded_file = st.file_uploader("Laden Sie eine HTML-Datei hoch:", type="html")
    extraction_method, custom_input = _choose_extraction()

    if st.button("Extrahieren"):
        if uploaded_file:
            try:
                # getvalue() returns the whole upload regardless of the
                # current read position of the underlying buffer.
                html_content = uploaded_file.getvalue().decode("utf-8")
            except UnicodeDecodeError as e:
                st.error(f"Fehler beim Verarbeiten der HTML-Datei: {e}")
            else:
                st.write("Extrahiere aus der hochgeladenen HTML-Datei...")
                _extract_and_show(html_content, extraction_method, custom_input)
        else:
            st.warning("Bitte laden Sie eine HTML-Datei hoch.")