Webseiten-URL-Extraktor

Running

App Files Files Community

Webseiten-URL-Extraktor / app.py

bsenst

Add XPath and CSS selector options

9bc8ed7 about 2 months ago

raw

history blame contribute delete

4.05 kB

	import streamlit as st
	import requests
	from bs4 import BeautifulSoup
	from lxml import html

	# Function to extract the text of the nodes matched by an XPath expression
	def extract_links_with_xpath(content, xpath):
	    """Evaluate an XPath expression against HTML and return the matched text.

	    Args:
	        content: HTML markup to parse.
	        xpath: XPath expression to evaluate against the parsed document.

	    Returns:
	        On success, a list with one entry per match: the ``.text`` of
	        matched elements (which may be ``None``), the string itself for
	        string results such as ``//a/@href`` or ``//a/text()``, and
	        ``str(value)`` for any other result type. On failure, a single
	        string starting with ``"Error processing the XPath: "``; callers
	        tell the two outcomes apart with ``isinstance(result, list)``.
	    """
	    try:
	        tree = html.fromstring(content)
	        matches = tree.xpath(xpath)

	        # Expressions like ``count(//a)`` or ``string(//h1)`` evaluate to a
	        # single scalar instead of a list; wrap it so it can be reported too.
	        if not isinstance(matches, list):
	            matches = [matches]

	        links = []
	        for match in matches:
	            if isinstance(match, str):
	                # Attribute and text() results are strings and have no
	                # ``.text`` attribute; convert them to plain ``str``.
	                links.append(str(match))
	            elif hasattr(match, "text"):
	                links.append(match.text)
	            else:
	                links.append(str(match))
	        return links
	    except Exception as e:
	        return f"Error processing the XPath: {e}"


	# Function to extract the text of the elements matched by a CSS selector
	def extract_links_with_css(content, css_selector):
	    """Return the text of every element in *content* matching *css_selector*.

	    Args:
	        content: HTML markup to parse with BeautifulSoup's ``html.parser``.
	        css_selector: CSS selector passed to ``select``.

	    Returns:
	        A list with the ``.text`` of each matched element, or, if anything
	        raises, a string starting with
	        ``"Error processing the CSS selector: "``.
	    """
	    try:
	        parsed = BeautifulSoup(content, 'html.parser')
	        matched_tags = parsed.select(css_selector)
	        texts = []
	        for tag in matched_tags:
	            texts.append(tag.text)
	    except Exception as err:
	        # Failures are reported as a string, not raised, so the caller can
	        # display them directly.
	        return f"Error processing the CSS selector: {err}"
	    return texts

	# Function to download the raw HTML of a webpage
	def extract_urls_from_url(url, timeout=10):
	    """Download a web page and return its body as text.

	    Despite its name, this function does not extract anything; it only
	    performs the HTTP GET and hands back the response text.

	    Args:
	        url: Address of the page to download.
	        timeout: Seconds to wait for the server before giving up.

	    Returns:
	        The response text on success. On any failure (including HTTP error
	        status codes, via ``raise_for_status``) a string starting with
	        ``"Error processing the URL: "``. Both outcomes are ``str``, so
	        callers must check for that prefix to detect a failure.
	    """
	    try:
	        # Without a timeout, requests waits indefinitely on an unresponsive
	        # server, which would hang the app.
	        response = requests.get(url, timeout=timeout)
	        response.raise_for_status()
	        return response.text
	    except Exception as e:
	        return f"Error processing the URL: {e}"

	# Streamlit App
	st.title("Webseiten-URL-Extraktor")

	# extract_urls_from_url() returns a str for a successful download AND for a
	# failure, so an isinstance check cannot tell them apart; the fixed prefix of
	# its error message is the only reliable marker.
	_URL_ERROR_PREFIX = "Error processing the URL: "


	def _select_extraction():
	    """Render the extraction-method widgets and return ``(method, expression)``."""
	    method = st.selectbox("Wählen Sie die Extraktionsmethode:", ("XPath", "CSS Selector"))
	    if method == "XPath":
	        expression = st.text_input("Geben Sie den XPath-Ausdruck ein:", placeholder="//a[@href]", value="//a[@href]")
	    else:
	        expression = st.text_input("Geben Sie den CSS-Selektor ein:", placeholder="a[href]", value="a[href]")
	    return method, expression


	def _show_extraction(content, method, expression):
	    """Run the chosen extractor on *content* and render its result or error."""
	    if method == "XPath":
	        links = extract_links_with_xpath(content, expression)
	    else:
	        links = extract_links_with_css(content, expression)

	    # The extractors return a list on success and an error string otherwise.
	    if not isinstance(links, list):
	        st.error(links)
	        return
	    if not links:
	        st.info("Keine Treffer gefunden.")
	        return
	    # dict.fromkeys removes duplicates like set() but keeps document order,
	    # so the output is stable between runs.
	    for link in dict.fromkeys(links):
	        st.write(link)


	# Input options: URL or HTML file
	input_option = st.radio("Wählen Sie die Eingabemethode:", ("URL", "HTML-Datei hochladen"))

	if input_option == "URL":
	    url_input = st.text_input("Gib die URL der Webseite ein:", placeholder="https://bsenst.github.io/toscrape/", value="https://bsenst.github.io/toscrape/")
	    extraction_method, custom_input = _select_extraction()

	    if st.button("Extrahieren"):
	        if url_input:
	            st.write(f"Extrahiere von: {url_input}")
	            page_content = extract_urls_from_url(url_input)

	            download_failed = (
	                not isinstance(page_content, str)
	                or page_content.startswith(_URL_ERROR_PREFIX)
	            )
	            if download_failed:
	                # Show the download error instead of parsing it as HTML.
	                st.error(page_content)
	            else:
	                _show_extraction(page_content, extraction_method, custom_input)
	        else:
	            st.warning("Bitte geben Sie eine gültige URL ein.")

	elif input_option == "HTML-Datei hochladen":
	    uploaded_file = st.file_uploader("Laden Sie eine HTML-Datei hoch:", type="html")
	    extraction_method, custom_input = _select_extraction()

	    if st.button("Extrahieren"):
	        if uploaded_file:
	            try:
	                html_content = uploaded_file.read().decode("utf-8")
	                st.write("Extrahiere aus der hochgeladenen HTML-Datei...")
	                _show_extraction(html_content, extraction_method, custom_input)
	            except Exception as e:
	                # Covers, among other things, uploads that are not valid UTF-8.
	                st.error(f"Fehler beim Verarbeiten der HTML-Datei: {e}")
	        else:
	            st.warning("Bitte laden Sie eine HTML-Datei hoch.")