import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
from requests.sessions import Session
from langdetect import detect
from googletrans import Translator
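# Assumed dependencies (not pinned anywhere in this file): streamlit, requests,
# beautifulsoup4, langdetect, and googletrans. The googletrans release commonly
# used with this Translator().translate(...) API is 4.0.0rc1; treat the exact
# version as an assumption rather than something stated by this app.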
def scrape_visible_text_from_url(url, query_selector=None, email=None, password=None, login_url=None):
    """Scrape visible text from a URL, optionally after logging in, and translate it to English."""
    try:
        session = Session()
        # Handle authentication if credentials are provided
        if email and password and login_url:
            login_data = {
                'email': email,
                'password': password
                # Include other necessary fields as required by the website
            }
            response = session.post(login_url, data=login_data)
            response.raise_for_status()
            # Fetch the target page with the now-authenticated session
            response = session.get(url)
            response.raise_for_status()
        else:
            response = session.get(url)
            response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Remove unwanted tags (keep <header> so its text can be extracted below)
        for tag in soup(["script", "style", "meta", "link", "noscript", "footer", "aside", "nav", "img"]):
            tag.extract()
        # Use query selector if provided
        if query_selector:
            elements = soup.select(query_selector)
            text_content = " ".join([element.get_text() for element in elements])
        else:
            # Extract header content
            header_content = soup.find("header")
            header_text = header_content.get_text() if header_content else ""
            # Extract paragraph content
            paragraph_content = soup.find_all("p")
            paragraph_text = " ".join([p.get_text() for p in paragraph_content])
            text_content = f"{header_text}\n\n{paragraph_text}"
        # Clean up whitespace
        visible_text = re.sub(r'\s+', ' ', text_content).strip()
        # Translate non-English text
        translator = Translator()
        sentences = re.split(r'(?<=[.!?]) +', visible_text)
        translated_sentences = []
        for sentence in sentences:
            try:
                lang = detect(sentence)
                if lang != 'en':
                    translation = translator.translate(sentence, dest='en').text
                    translated_sentences.append(translation)
                else:
                    translated_sentences.append(sentence)
            except Exception:
                translated_sentences.append(sentence)
        translated_text = ' '.join(translated_sentences)
        return translated_text
    except Exception as e:
        st.error(f"Error occurred while scraping the data: {e}")
        return None
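# Minimal usage sketch (assumption: called outside the Streamlit UI, e.g. from a
# REPL or script; the URL below is illustrative only):
#
#     text = scrape_visible_text_from_url("https://example.com")
#     if text:
#         print(text[:500])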
def main():
    st.title("Web Data Scraper")
    url_input = st.text_input("Enter the URL:", "")
    query_selector = st.text_input("Enter a query selector (optional):", "")
    email = st.text_input("Email (if authentication required):", "")
    password = st.text_input("Password (if authentication required):", "", type="password")
    login_url = st.text_input("Enter the login URL (if authentication required):", "")
    if st.button("Load Data"):
        if url_input:
            data = scrape_visible_text_from_url(
                url=url_input,
                query_selector=query_selector if query_selector else None,
                email=email if email else None,
                password=password if password else None,
                login_url=login_url if login_url else None
            )
            if data:
                st.success("Data text successfully scraped!")
                st.subheader("Scraped Text:")
                st.write(data)
            else:
                st.warning("Failed to load data from the URL.")
        else:
            st.warning("Please enter a valid URL.")

if __name__ == "__main__":
    main()
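# To launch the app locally (assuming this file is saved as app.py, the usual
# entry point for a Streamlit Space):
#     streamlit run app.py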