import logging
import re
from urllib.parse import urlparse

import pandas as pd
import scrapy


# Function to clean text
def clean_text(text_list: list[str]) -> str:
    combined_text: str = ' '.join(text_list)
    # Remove scripts, styles, and other unwanted content
    combined_text = re.sub(r'<script.*?>.*?</script>', '', combined_text, flags=re.DOTALL)
    combined_text = re.sub(r'<style.*?>.*?</style>', '', combined_text, flags=re.DOTALL)
    # Remove HTML tags
    combined_text = re.sub(r'<[^>]+>', ' ', combined_text)
    # Collapse runs of whitespace into a single space
    cleaned_text: str = re.sub(r'\s+', ' ', combined_text)
    # Strip leading and trailing whitespace
    cleaned_text = cleaned_text.strip()
    return cleaned_text
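
# Illustrative usage of clean_text (the HTML fragments below are made-up
# examples, not data from this project): script/style blocks and tags are
# stripped, then whitespace is collapsed.
#
#   clean_text(["<p>Hello", "  world</p>", "<script>var x = 1;</script>"])
#   # -> "Hello world"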


class HomepageSpider(scrapy.Spider):
    name = "huggingface-datenwerkzeug"
    custom_settings = {
        'DOWNLOAD_DELAY': 2  # add a download delay for fair scraping practice; the default is 0
    }

    def __init__(self, start_url=None, *args, **kwargs):
        super(HomepageSpider, self).__init__(*args, **kwargs)
        self.start_url = start_url

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Set default depth and page-count limits if not provided
        depth_limit = int(kwargs.get("depth_limit", 2))
        pagecount_limit = int(kwargs.get("pagecount_limit", 10))
        # Update settings dynamically
        crawler.settings.set("DEPTH_LIMIT", depth_limit, priority="cmdline")
        crawler.settings.set(
            "CLOSESPIDER_PAGECOUNT", pagecount_limit, priority="cmdline"
        )
        # Log settings for debugging
        logging.info(f"DEPTH_LIMIT set to: {depth_limit}")
        logging.info(f"CLOSESPIDER_PAGECOUNT set to: {pagecount_limit}")
        return super(HomepageSpider, cls).from_crawler(crawler, *args, **kwargs)
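
    # These limits are ordinary spider arguments; with the scrapy CLI they can
    # be supplied as, for example, -a depth_limit=3 -a pagecount_limit=50
    # (illustrative values).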

    def start_requests(self):
        if not self.start_url:
            self.logger.error("No start_url provided. Use -a start_url=<URL>")
            return
        # Log scraping started/running
        with open("scraping_status.log", "w") as log_file:
            log_file.write("Scraping running")
        parsed_uri = urlparse(self.start_url)
        domain = f"{parsed_uri.scheme}://{parsed_uri.netloc}"
        yield scrapy.Request(
            url=self.start_url, meta={"domain": domain}, callback=self.parse
        )

    def parse(self, response):
        # Scrape the current page
        yield {
            "url": response.url,
            "title": response.xpath("//title/text()").get(),
            "body": clean_text(response.xpath("//body//text()").getall()),
        }
        # Follow internal links
        domain = response.meta["domain"]
        for a_tag in response.xpath("//a[@href]"):
            href = a_tag.attrib["href"]
            absolute_url = response.urljoin(href)
            # Only follow links that are part of the same domain
            if domain in absolute_url:
                yield response.follow(absolute_url, self.parse, meta={"domain": domain})

    def closed(self, reason):
        # Log "scraping finished" to a local file
        with open("scraping_status.log", "w") as log_file:
            log_file.write("Scraping finished")
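

# Minimal sketch of how the spider could be run; the file name, URL and limit
# values below are placeholders for illustration, not taken from the original
# project. With the Scrapy CLI, assuming this file is saved as
# homepage_spider.py:
#
#   scrapy runspider homepage_spider.py -a start_url=https://example.com -o pages.json
#
# Or programmatically via CrawlerProcess:
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(
        HomepageSpider,
        start_url="https://example.com",  # placeholder start URL
        depth_limit=2,                    # forwarded to from_crawler as a kwarg
        pagecount_limit=10,               # forwarded to from_crawler as a kwarg
    )
    # With no FEEDS setting configured, scraped items are only logged.
    process.start()  # blocks until the crawl finishes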