"""Scrapy sitemap spider that runs boilerpy3 over LTU programme pages."""

import re

import scrapy
from boilerpy3 import extractors
from bs4 import BeautifulSoup, Comment


class FullHTMLSpider(scrapy.spiders.SitemapSpider):
    """Visit programme pages listed in the site's sitemaps and emit their text.

    Each accepted page produces one item holding the page URL and the content
    returned by boilerpy3's ``ArticleExtractor``. Pages whose URL contains an
    entry of ``exclude_patterns``, or whose HTML contains the literal text
    ``Discontinued.``, produce no item.
    """

    name = "ltu_programme_crawler"
    # NOTE(review): the domain restriction below is currently disabled.
    # allowed_domains = ["www.ltu.se"]

    # Pointing at robots.txt lets SitemapSpider discover the sitemap URLs
    # declared there.
    sitemap_urls = ["https://www.ltu.se/robots.txt"]

    # (regex, callback) pairs: only sitemap entries whose URL matches the
    # regex are requested and handed to the named callback.
    sitemap_rules = [
        (re.compile(r'\/en\/education\/programme\/[a-zA-Z0-9-]*$'), 'parse'),
    ]

    # Substrings that disqualify a page when found anywhere in its URL.
    exclude_patterns = [
        "sdhog-continuing-part-of-study-programme-non-freshmen-120-300-credits",
        "international-orchestra-academy",
    ]

    def parse(self, response):
        """Turn one programme page into an item, or skip it.

        Args:
            response: The downloaded page; its ``url`` and ``text``
                attributes are read.

        Returns:
            A dict with keys ``'url'`` and ``'content'``, or ``None`` when
            the page is skipped.
        """
        page_url = response.url

        # Guard 1: drop pages whose URL carries an excluded fragment.
        for fragment in self.exclude_patterns:
            if fragment in page_url:
                self.logger.info(
                    "Skipping page due to excluded URL pattern: %s", page_url
                )
                return None

        # Guard 2: drop pages whose markup contains the discontinued marker.
        page_source = response.text
        if "Discontinued." in page_source:
            self.logger.info(
                "Skipping page (contains 'Discontinued.'): %s", page_url
            )
            return None

        # Hand the raw HTML to boilerpy3 and keep whatever text it returns.
        extracted = extractors.ArticleExtractor().get_content(page_source)
        return {'url': page_url, 'content': extracted}