|
import streamlit as st |
|
import requests |
|
from bs4 import BeautifulSoup |
|
from lxml import html |
|
|
|
|
|
def extract_links_with_xpath(content, xpath):
    """Evaluate an XPath expression against HTML and collect the results.

    Args:
        content: HTML markup to parse.
        xpath: XPath expression evaluated on the parsed document.

    Returns:
        On success, a list with one entry per XPath result. String results
        (for example from ``//a/@href`` or ``//a/text()``) are returned as
        plain strings, element results contribute their ``.text`` attribute
        (which may be ``None``), and a scalar result such as ``count(//a)``
        is returned as a one-item list holding its string form.
        On failure, a string starting with "Error processing the XPath:".
    """
    try:
        tree = html.fromstring(content)
        results = tree.xpath(xpath)

        # Scalar expressions (count(), string(), boolean()) do not produce a
        # list; wrap the value so callers always receive a list on success.
        if not isinstance(results, list):
            return [str(results)]

        links = []
        for item in results:
            if isinstance(item, str):
                # Attribute and text() results are str subclasses that have
                # no ``.text`` attribute; use the string value itself.
                links.append(str(item))
            else:
                links.append(item.text)
        return links
    except Exception as e:
        # Failures are reported as a message string rather than raised, so
        # the caller can show them to the user.
        return f"Error processing the XPath: {e}"
|
|
|
|
|
|
|
def extract_links_with_css(content, css_selector):
    """Collect the text of every element matched by a CSS selector.

    Args:
        content: HTML markup to parse with BeautifulSoup's ``html.parser``.
        css_selector: CSS selector passed to ``select``.

    Returns:
        On success, a list holding the ``.text`` of each matched element.
        On failure, a string starting with
        "Error processing the CSS selector:".
    """
    try:
        matches = BeautifulSoup(content, 'html.parser').select(css_selector)
        texts = []
        for node in matches:
            texts.append(node.text)
    except Exception as exc:
        # Errors are handed back as a message string instead of being raised.
        return f"Error processing the CSS selector: {exc}"
    return texts
|
|
|
|
|
def extract_urls_from_url(url, timeout=10):
    """Download a page with an HTTP GET request and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The response text on success. On any failure (connection problem,
        timeout, HTTP error status, invalid URL) a string starting with
        "Error processing the URL:". Both outcomes are ``str``, so a caller
        can only tell them apart by that prefix.
    """
    try:
        # Without an explicit timeout requests waits indefinitely on a server
        # that accepts the connection but never answers.
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they are reported too.
        response.raise_for_status()
        return response.text
    except Exception as e:
        # Reported as a message string rather than raised, so the caller can
        # show it to the user.
        return f"Error processing the URL: {e}"
|
|
|
|
|
# --- Streamlit UI ------------------------------------------------------------
# Widgets are declared in the order they should appear on the page: the input
# widget first, then the extraction settings, then the trigger button.

# extract_urls_from_url returns a str for both success and failure; a failed
# fetch is only recognizable by this message prefix.
_FETCH_ERROR_PREFIX = "Error processing the URL:"


def _choose_extraction():
    """Render the extraction-method selector and its expression field.

    Returns:
        A ``(method, expression)`` tuple: the selected method label and the
        text entered for it (``None`` if the method matches neither label).
    """
    method = st.selectbox(
        "Wählen Sie die Extraktionsmethode:", ("XPath", "CSS Selector")
    )
    expression = None
    if method == "XPath":
        expression = st.text_input(
            "Geben Sie den XPath-Ausdruck ein:",
            placeholder="//a[@href]",
            value="//a[@href]",
        )
    elif method == "CSS Selector":
        expression = st.text_input(
            "Geben Sie den CSS-Selektor ein:",
            placeholder="a[href]",
            value="a[href]",
        )
    return method, expression


def _show_results(content, method, expression):
    """Run the chosen extractor on ``content`` and render the outcome.

    A list result is written item by item with duplicates removed; any other
    result is an error message from the extractor and is shown as an error.
    """
    if method == "XPath":
        links = extract_links_with_xpath(content, expression)
    else:
        links = extract_links_with_css(content, expression)

    if not isinstance(links, list):
        st.error(links)
        return

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output follows document order instead of arbitrary set order.
    for url in dict.fromkeys(links):
        st.write(url)


st.title("Webseiten-URL-Extraktor")

input_option = st.radio(
    "Wählen Sie die Eingabemethode:", ("URL", "HTML-Datei hochladen")
)

if input_option == "URL":
    url_input = st.text_input(
        "Gib die URL der Webseite ein:",
        placeholder="https://bsenst.github.io/toscrape/",
        value="https://bsenst.github.io/toscrape/",
    )
    extraction_method, custom_input = _choose_extraction()

    if st.button("Extrahieren"):
        # Ignore stray whitespace from copy/paste; a blank field counts as empty.
        target_url = url_input.strip() if url_input else ""
        if target_url:
            st.write(f"Extrahiere von: {target_url}")
            page_content = extract_urls_from_url(target_url)

            # The helper returns a str either way, so an isinstance check
            # cannot detect a failed fetch; look for its error prefix instead.
            fetch_failed = (
                not isinstance(page_content, str)
                or page_content.startswith(_FETCH_ERROR_PREFIX)
            )
            if fetch_failed:
                st.error(page_content)
            else:
                _show_results(page_content, extraction_method, custom_input)
        else:
            st.warning("Bitte geben Sie eine gültige URL ein.")

elif input_option == "HTML-Datei hochladen":
    uploaded_file = st.file_uploader("Laden Sie eine HTML-Datei hoch:", type="html")
    extraction_method, custom_input = _choose_extraction()

    if st.button("Extrahieren"):
        if uploaded_file:
            try:
                html_content = uploaded_file.read().decode("utf-8")
                st.write("Extrahiere aus der hochgeladenen HTML-Datei...")
                _show_results(html_content, extraction_method, custom_input)
            except Exception as e:
                # Covers unreadable or non-UTF-8 uploads as well as anything
                # raised while rendering the results.
                st.error(f"Fehler beim Verarbeiten der HTML-Datei: {e}")
        else:
            st.warning("Bitte laden Sie eine HTML-Datei hoch.")
|
|