import re

import scrapy
from boilerpy3 import extractors

class FullHTMLSpider(scrapy.spiders.SitemapSpider):
    """Crawl the English programme pages on www.ltu.se discovered via the sitemap."""

    name = "ltu_programme_crawler"
    # allowed_domains = ["www.ltu.se"]
    sitemap_urls = ["https://www.ltu.se/robots.txt"]
    # Only follow English programme pages, i.e. /en/education/programme/<slug>.
    sitemap_rules = [(re.compile(r'/en/education/programme/[a-zA-Z0-9-]*$'), 'parse')]
    exclude_patterns = [
        "sdhog-continuing-part-of-study-programme-non-freshmen-120-300-credits",
        "international-orchestra-academy",
    ]

    def parse(self, response):
        # Skip the page if its URL contains any excluded pattern.
        if any(pattern in response.url for pattern in self.exclude_patterns):
            self.logger.info("Skipping page due to excluded URL pattern: %s", response.url)
            return
        html = response.text
        # Skip programmes that are no longer offered.
        if "Discontinued." in html:
            self.logger.info("Skipping page (contains 'Discontinued.'): %s", response.url)
            return
        # Strip navigation and boilerplate, keeping only the main article text.
        extractor = extractors.ArticleExtractor()
        content = extractor.get_content(html)
        return {'url': response.url, 'content': content}
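

# A minimal sketch of how this spider could be run directly and its items exported,
# not part of the original file. It assumes Scrapy >= 2.1 (for the FEEDS setting);
# the output file name "programmes.jsonl" is an arbitrary, illustrative choice.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        "FEEDS": {"programmes.jsonl": {"format": "jsonlines"}},
    })
    process.crawl(FullHTMLSpider)
    process.start()  # blocks until the crawl finishes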