from bs4 import BeautifulSoup, Comment
import scrapy
import re
from boilerpy3 import extractors


class FullHTMLSpider(scrapy.spiders.SitemapSpider):
    name = "ltu_programme_crawler"
    # allowed_domains = ["www.ltu.se"]

    # Discover sitemaps via robots.txt and only follow English programme pages.
    sitemap_urls = ["https://www.ltu.se/robots.txt"]
    sitemap_rules = [(re.compile(r'/en/education/programme/[a-zA-Z0-9-]*$'), 'parse')]

    # URL fragments identifying programme pages that should not be crawled.
    exclude_patterns = [
        "sdhog-continuing-part-of-study-programme-non-freshmen-120-300-credits",
        "international-orchestra-academy",
    ]

    def parse(self, response):
        # Skip the page if its URL contains any excluded pattern.
        if any(pattern in response.url for pattern in self.exclude_patterns):
            self.logger.info("Skipping page due to excluded URL pattern: %s", response.url)
            return

        html = response.text
        # Skip programmes that are marked as discontinued.
        if "Discontinued." in html:
            self.logger.info("Skipping page (contains 'Discontinued.'): %s", response.url)
            return

        # Extract the main article text from the HTML with boilerpy3.
        extractor = extractors.ArticleExtractor()
        content = extractor.get_content(html)
        return {'url': response.url, 'content': content}
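

# A minimal sketch for running the spider standalone, assuming this module is
# executed directly rather than from a Scrapy project (where it would run via
# `scrapy crawl ltu_programme_crawler`). The output file name
# "programmes.json" is an arbitrary choice for illustration.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        # Write the scraped items as a JSON feed.
        "FEEDS": {"programmes.json": {"format": "json"}},
    })
    process.crawl(FullHTMLSpider)
    process.start()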