bsenst committed on
Commit
28b9e2a
·
verified ·
1 Parent(s): 6e2f002

Create homespider.py

Browse files
Files changed (1) hide show
  1. homespider.py +72 -0
homespider.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import scrapy
3
+ import pandas as pd
4
+ from urllib.parse import urlparse
5
+
6
+
7
class HomepageSpider(scrapy.Spider):
    """Crawl a single website starting from ``start_url``, staying on the same host.

    Yields one item per page with the page URL, the ``<title>`` text, and all
    text nodes under ``<body>``. Crawl breadth is bounded by ``DEPTH_LIMIT``
    and ``CLOSESPIDER_PAGECOUNT``, both settable as spider arguments
    (``-a depth_limit=... -a pagecount_limit=...``). Crawl start/finish is
    recorded in a local ``scraping_status.log`` file.
    """

    name = "dssg-berlin-spider"

    custom_settings = {
        "DOWNLOAD_DELAY": 2  # fair-scraping delay between requests; Scrapy default is 0
    }

    def __init__(self, start_url=None, *args, **kwargs):
        """Store the crawl entry point.

        :param start_url: URL to begin crawling from; supplied on the command
            line via ``-a start_url=<URL>``. If omitted, the spider logs an
            error and performs no requests.
        """
        super(HomepageSpider, self).__init__(*args, **kwargs)
        self.start_url = start_url

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider, injecting depth/page-count limits into settings.

        Spider arguments (``-a``) arrive in ``kwargs``; defaults are a depth
        of 2 and a page-count limit of 10 when not provided.
        """
        depth_limit = int(kwargs.get("depth_limit", 2))
        pagecount_limit = int(kwargs.get("pagecount_limit", 10))

        # "cmdline" priority so these override project/spider-level settings.
        crawler.settings.set("DEPTH_LIMIT", depth_limit, priority="cmdline")
        crawler.settings.set(
            "CLOSESPIDER_PAGECOUNT", pagecount_limit, priority="cmdline"
        )

        # Log effective limits for debugging.
        logging.info(f"DEPTH_LIMIT set to: {depth_limit}")
        logging.info(f"CLOSESPIDER_PAGECOUNT set to: {pagecount_limit}")

        return super(HomepageSpider, cls).from_crawler(crawler, *args, **kwargs)

    def start_requests(self):
        """Kick off the crawl at ``start_url``; abort with an error if unset."""
        if not self.start_url:
            self.logger.error("No start_url provided. Use -a start_url=<URL>")
            return

        # Mark the crawl as running in a local status file.
        with open("scraping_status.log", "w", encoding="utf-8") as log_file:
            log_file.write("Scraping running")

        parsed_uri = urlparse(self.start_url)
        domain = f"{parsed_uri.scheme}://{parsed_uri.netloc}"
        # The origin is carried in request meta so parse() can filter links.
        yield scrapy.Request(
            url=self.start_url, meta={"domain": domain}, callback=self.parse
        )

    def parse(self, response):
        """Yield the page's content item, then follow same-host links.

        :param response: the downloaded page; ``response.meta["domain"]``
            holds the ``scheme://netloc`` origin set in ``start_requests``.
        """
        # Scrape the current page.
        yield {
            "url": response.url,
            "title": response.xpath("//title/text()").get(),
            "body": response.xpath("//body//text()").getall(),
        }

        domain = response.meta["domain"]
        # FIX: compare hosts exactly instead of the previous substring test
        # (`domain in absolute_url`), which also matched external URLs that
        # merely contain the domain, e.g. in a query string or path.
        allowed_netloc = urlparse(domain).netloc
        for a_tag in response.xpath("//a[@href]"):
            href = a_tag.attrib["href"]
            absolute_url = response.urljoin(href)

            # Only follow links on the same host; this also skips mailto:,
            # javascript:, and other schemes whose netloc differs or is empty.
            if urlparse(absolute_url).netloc == allowed_netloc:
                yield response.follow(absolute_url, self.parse, meta={"domain": domain})

    def closed(self, reason):
        """Record crawl completion (any reason) in the local status file."""
        with open("scraping_status.log", "w", encoding="utf-8") as log_file:
            log_file.write("Scraping finished")