# bsenst's picture
# add XPath and CSS selector options
# 9bc8ed7
import streamlit as st
import requests
from bs4 import BeautifulSoup
from lxml import html
def _xpath_match_to_text(match):
    """Return the display value for a single XPath match."""
    if isinstance(match, str):
        # Attribute values (``@href``) and ``text()`` nodes come back as
        # str subclasses, which have no ``.text`` attribute.
        return str(match)
    if hasattr(match, "text"):
        # Element nodes: keep the original behaviour of returning ``.text``.
        return match.text
    # Numbers, booleans and other non-node results.
    return str(match)


# Function to extract the text of every node matched by an XPath expression
def extract_links_with_xpath(content, xpath):
    """Evaluate *xpath* against HTML *content* and return the matched values.

    Args:
        content: HTML markup handed to ``html.fromstring``.
        xpath: XPath expression evaluated on the parsed tree.

    Returns:
        On success, a list with one entry per match: the ``.text`` of
        element matches, the string itself for string matches such as
        ``//a/@href`` or ``//a/text()``, and ``str(value)`` for anything
        else. An expression that yields a single scalar instead of a list
        produces a one-item list. If parsing or evaluation raises, a string
        starting with ``"Error processing the XPath: "`` is returned
        instead of a list.
    """
    try:
        tree = html.fromstring(content)
        matches = tree.xpath(xpath)
        # XPath functions such as count(...) or string(...) return a bare
        # value; wrap it so it is not iterated character by character or
        # rejected as non-iterable.
        if not isinstance(matches, list):
            matches = [matches]
        return [_xpath_match_to_text(match) for match in matches]
    except Exception as e:
        return f"Error processing the XPath: {e}"
# Collect the text of every element matched by a CSS selector
def extract_links_with_css(content, css_selector):
    """Return the ``.text`` of each element in *content* matching *css_selector*.

    The markup is parsed with ``BeautifulSoup(content, 'html.parser')`` and
    queried with ``select``. On success a list with one entry per matched
    element is returned; if anything raises, a string beginning with
    ``"Error processing the CSS selector: "`` is returned instead.
    """
    try:
        parsed = BeautifulSoup(content, 'html.parser')
        texts = []
        for node in parsed.select(css_selector):
            texts.append(node.text)
    except Exception as err:
        return f"Error processing the CSS selector: {err}"
    return texts
# Function to download the HTML source of a webpage
def extract_urls_from_url(url, timeout=10):
    """Download *url* and return the response body as text.

    Despite its name, this function does not extract URLs itself: it only
    fetches the page and returns ``response.text``.

    Args:
        url: Address passed to ``requests.get``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Defaults to 10.

    Returns:
        The response text on success. If the request raises or
        ``raise_for_status`` reports an HTTP error, a string starting with
        ``"Error processing the URL: "`` is returned instead.
    """
    try:
        # Without a timeout, requests waits indefinitely on an unresponsive
        # server, which would freeze the app.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text
    except Exception as e:
        return f"Error processing the URL: {e}"
# Streamlit App

# Prefix of the string that extract_urls_from_url returns when a fetch fails.
FETCH_ERROR_PREFIX = "Error processing the URL:"


def _choose_extraction():
    """Render the method selector and its expression input; return both values."""
    method = st.selectbox("Wählen Sie die Extraktionsmethode:", ("XPath", "CSS Selector"))
    if method == "XPath":
        expression = st.text_input("Geben Sie den XPath-Ausdruck ein:", placeholder="//a[@href]", value="//a[@href]")
    else:
        expression = st.text_input("Geben Sie den CSS-Selektor ein:", placeholder="a[href]", value="a[href]")
    return method, expression


def _show_results(markup, method, expression):
    """Run the chosen extractor on *markup* and write the results to the page."""
    if method == "XPath":
        links = extract_links_with_xpath(markup, expression)
    else:
        links = extract_links_with_css(markup, expression)
    if isinstance(links, list):
        # dict.fromkeys removes duplicates while keeping the order in which
        # the matches were found; set() would display them in arbitrary order.
        for link in dict.fromkeys(links):
            st.write(link)
    else:
        # The extractors report failures as a message string, not a list.
        st.error(links)


st.title("Webseiten-URL-Extraktor")
# Input options: URL or HTML file
input_option = st.radio("Wählen Sie die Eingabemethode:", ("URL", "HTML-Datei hochladen"))
if input_option == "URL":
    url_input = st.text_input("Gib die URL der Webseite ein:", placeholder="https://bsenst.github.io/toscrape/", value="https://bsenst.github.io/toscrape/")
    extraction_method, custom_input = _choose_extraction()
    if st.button("Extrahieren"):
        if url_input:
            st.write(f"Extrahiere von: {url_input}")
            page_content = extract_urls_from_url(url_input)
            # The fetch helper returns a string for both page markup and
            # failures, so a type check alone cannot tell them apart; the
            # error prefix is what identifies a failed download.
            if isinstance(page_content, str) and not page_content.startswith(FETCH_ERROR_PREFIX):
                _show_results(page_content, extraction_method, custom_input)
            else:
                st.error(page_content)
        else:
            st.warning("Bitte geben Sie eine gültige URL ein.")
elif input_option == "HTML-Datei hochladen":
    uploaded_file = st.file_uploader("Laden Sie eine HTML-Datei hoch:", type="html")
    extraction_method, custom_input = _choose_extraction()
    if st.button("Extrahieren"):
        if uploaded_file:
            try:
                # Decoding raises for files that are not valid UTF-8; the
                # except below reports that to the user.
                html_content = uploaded_file.read().decode("utf-8")
                st.write("Extrahiere aus der hochgeladenen HTML-Datei...")
                _show_results(html_content, extraction_method, custom_input)
            except Exception as e:
                st.error(f"Fehler beim Verarbeiten der HTML-Datei: {e}")
        else:
            st.warning("Bitte laden Sie eine HTML-Datei hoch.")