"""Streamlit app that lists the text of nodes selected from an HTML document.

The HTML comes either from a URL that is downloaded with ``requests`` or from
an uploaded ``.html`` file. The nodes are selected with a user-supplied XPath
expression (evaluated by ``lxml``) or CSS selector (evaluated by
``BeautifulSoup``).
"""

import streamlit as st
import requests
from bs4 import BeautifulSoup
from lxml import html

# Upper bound (seconds) for the HTTP download so an unresponsive server
# cannot block the app forever; ``requests`` has no timeout by default.
REQUEST_TIMEOUT_SECONDS = 10


# Function to extract links using XPath
def extract_links_with_xpath(content, xpath):
    """Return the text of every node that *xpath* selects in *content*.

    Args:
        content: HTML document as a string.
        xpath: XPath expression to evaluate against the parsed document.

    Returns:
        A list with one entry per match: the element's ``.text`` (which may
        be ``None``) for element results, or the string itself for string
        results. If parsing or evaluation fails, an error message string is
        returned instead of a list.
    """
    try:
        tree = html.fromstring(content)
        elements = tree.xpath(xpath)
        # Queries such as ``//a/@href`` yield plain strings rather than
        # elements; pass those through instead of failing on ``.text``.
        return [
            elem if isinstance(elem, str) else elem.text
            for elem in elements
        ]
    except Exception as e:
        return f"Error processing the XPath: {e}"


# Function to extract links using CSS selector
def extract_links_with_css(content, css_selector):
    """Return the text of every element that *css_selector* matches.

    Args:
        content: HTML document as a string.
        css_selector: CSS selector passed to ``BeautifulSoup.select``.

    Returns:
        A list with the ``.text`` of each matched element, or an error
        message string if parsing or selection fails.
    """
    try:
        soup = BeautifulSoup(content, 'html.parser')
        return [a.text for a in soup.select(css_selector)]
    except Exception as e:
        return f"Error processing the CSS selector: {e}"


def _url_error_message(error):
    """Build the message reported when downloading a URL fails."""
    return f"Error processing the URL: {error}"


def _fetch_html(url):
    """Download *url* and return the response body as text.

    Raises:
        requests.RequestException: on connection problems, timeouts,
            malformed URLs or a non-success HTTP status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return response.text


# Function to download the HTML of a webpage
def extract_urls_from_url(url):
    """Return the HTML of *url*, or an error message string on failure.

    Despite its name this function does not extract anything; it only
    downloads the page. Success and failure both produce a ``str``, so the
    caller cannot tell them apart by type. The UI below therefore calls
    ``_fetch_html`` directly and handles the exception.
    """
    try:
        return _fetch_html(url)
    except Exception as e:
        return _url_error_message(e)


# Maps the label shown in the method select box to its extraction function.
_EXTRACTORS = {
    "XPath": extract_links_with_xpath,
    "CSS Selector": extract_links_with_css,
}


def _select_extraction():
    """Render the method/expression widgets and return ``(method, expression)``."""
    extraction_method = st.selectbox(
        "Wählen Sie die Extraktionsmethode:", ("XPath", "CSS Selector")
    )
    custom_input = None
    if extraction_method == "XPath":
        custom_input = st.text_input(
            "Geben Sie den XPath-Ausdruck ein:",
            placeholder="//a[@href]",
            value="//a[@href]",
        )
    elif extraction_method == "CSS Selector":
        custom_input = st.text_input(
            "Geben Sie den CSS-Selektor ein:",
            placeholder="a[href]",
            value="a[href]",
        )
    return extraction_method, custom_input


def _extract_and_show(content, extraction_method, expression):
    """Run the chosen extractor on *content* and render the outcome."""
    links = _EXTRACTORS[extraction_method](content, expression)
    if not isinstance(links, list):
        # The extractors signal failure by returning a message string.
        st.error(links)
        return

    # De-duplicate while keeping document order (a set would reorder the
    # output arbitrarily) and skip elements that have no text at all.
    unique_links = list(
        dict.fromkeys(link for link in links if link is not None)
    )
    if not unique_links:
        st.info("Keine Ergebnisse gefunden.")
        return
    for link in unique_links:
        st.write(link)


# Streamlit App
st.title("Webseiten-URL-Extraktor")

# Input options: URL or HTML file
input_option = st.radio(
    "Wählen Sie die Eingabemethode:", ("URL", "HTML-Datei hochladen")
)

if input_option == "URL":
    url_input = st.text_input(
        "Gib die URL der Webseite ein:",
        placeholder="https://bsenst.github.io/toscrape/",
        value="https://bsenst.github.io/toscrape/",
    )
    extraction_method, custom_input = _select_extraction()

    if st.button("Extrahieren"):
        url = url_input.strip()
        if url:
            st.write(f"Extrahiere von: {url}")
            try:
                page_content = _fetch_html(url)
            except Exception as e:
                # UI boundary: report the failure instead of feeding the
                # error text to the HTML parser as if it were a page.
                st.error(_url_error_message(e))
            else:
                _extract_and_show(page_content, extraction_method, custom_input)
        else:
            st.warning("Bitte geben Sie eine gültige URL ein.")

elif input_option == "HTML-Datei hochladen":
    uploaded_file = st.file_uploader("Laden Sie eine HTML-Datei hoch:", type="html")
    extraction_method, custom_input = _select_extraction()

    if st.button("Extrahieren"):
        if uploaded_file:
            try:
                html_content = uploaded_file.read().decode("utf-8")
                st.write("Extrahiere aus der hochgeladenen HTML-Datei...")
                _extract_and_show(html_content, extraction_method, custom_input)
            except Exception as e:
                # UI boundary: e.g. the upload is not valid UTF-8.
                st.error(f"Fehler beim Verarbeiten der HTML-Datei: {e}")
        else:
            st.warning("Bitte laden Sie eine HTML-Datei hoch.")