k4d3 committed
Commit cee0d63
2 parents: 4a1fbfb d86635a

Merge remote-tracking branch 'refs/remotes/origin/main'

Files changed (1):
  1. crawl/crawl +43 -30
crawl/crawl CHANGED
@@ -42,10 +42,10 @@ def sanitize_filename(filename):
         str: The sanitized filename.
     """
     # Remove invalid characters for Windows file names
-    return re.sub(r'[<>:"/\\|?*]', '', filename)
+    return re.sub(r'[<>:"/\\|?*]', "", filename)
 
 
-def download_image(session, image_url, save_dir):
+def download_image(session, image_url, save_dir, base_url):
     """
     Download an image from a given URL and save it to the specified directory.
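Aside: the character class above covers the printable characters Windows rejects in file names. A minimal standalone check of the sanitizer's behavior (same regex as the diff, hypothetical input):

import re

def sanitize_filename(filename):
    # Same pattern as above: strip Windows-reserved filename characters.
    return re.sub(r'[<>:"/\\|?*]', "", filename)

print(sanitize_filename('img?id=42*<draft>.png'))  # -> 'imgid=42draft.png'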
 
@@ -56,19 +56,25 @@ def download_image(session, image_url, save_dir):
             The URL of the image to download.
         save_dir (str):
             The directory to save the downloaded image.
+        base_url (str):
+            The base URL of the page being crawled.
     """
     try:
         # Ensure the URL has a scheme
-        if not re.match(r'^https?://', image_url):
-            image_url = 'https://' + image_url.lstrip('/')
+        if image_url.startswith(".."):
+            from urllib.parse import urljoin
 
-        image_filename = os.path.basename(image_url).split('?')[0]
+            image_url = urljoin(base_url, image_url)
+        elif not re.match(r"^https?://", image_url):
+            image_url = "https://" + image_url.lstrip("/")
+
+        image_filename = os.path.basename(image_url).split("?")[0]
         sanitized_image_filename = sanitize_filename(image_filename)
         image_path = os.path.join(save_dir, sanitized_image_filename)
 
         response = session.get(image_url, stream=True)
         response.raise_for_status()
-        with open(image_path, 'wb') as image_file:
+        with open(image_path, "wb") as image_file:
             for chunk in response.iter_content(chunk_size=8192):
                 image_file.write(chunk)
         print(f"Saved image: {image_path}")
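Note on the new branch: `urljoin` is what makes page-relative sources like `../images/x.png` fetchable. A runnable sketch of the resolve-then-stream flow this hunk implements, with hypothetical URLs:

import os
from urllib.parse import urljoin

import requests

base_url = "https://example.com/articles/2024/post.html"  # hypothetical page URL
image_url = "../images/cat.png"                           # hypothetical relative src

# Same rule as the new startswith("..") branch above
if image_url.startswith(".."):
    image_url = urljoin(base_url, image_url)
print(image_url)  # https://example.com/articles/images/cat.png

session = requests.Session()
response = session.get(image_url, stream=True)  # stream=True: don't buffer the whole file
response.raise_for_status()                     # raise on 4xx/5xx responses
with open(os.path.basename(image_url), "wb") as image_file:
    for chunk in response.iter_content(chunk_size=8192):
        image_file.write(chunk)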
@@ -90,15 +96,15 @@ def save_result(target_url):
     """
     crawler = create_crawler()
     result = crawler.run(url=target_url)
-    title = result.metadata.get('title', 'untitled')
+    title = result.metadata.get("title", "untitled")
     sanitized_title = sanitize_filename(title).replace(" ", "_")
-
+
     # Choose the appropriate base path based on the operating system
     if platform.system() == "Windows":
         base_path = "E:\\knowledgebase\\Saved Websites\\"
     else:
         base_path = "/home/kade/saved_websites/"
-
+
     save_dir = os.path.join(base_path, sanitized_title)
     os.makedirs(save_dir, exist_ok=True)
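For context, `platform.system()` returns "Windows", "Linux", or "Darwin", so the non-Windows branch also covers macOS. A standalone sketch of the path selection, with a hypothetical title:

import os
import platform

# "Windows", "Linux", or "Darwin" (macOS)
if platform.system() == "Windows":
    base_path = "E:\\knowledgebase\\Saved Websites\\"
else:
    base_path = "/home/kade/saved_websites/"

save_dir = os.path.join(base_path, "Some_Page_Title")  # hypothetical sanitized title
os.makedirs(save_dir, exist_ok=True)  # idempotent: no error if the directory exists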
 
@@ -109,38 +115,45 @@ def save_result(target_url):
     print(f"Saved markdown to {save_path}")
 
     # Save images in parallel
-    if 'images' in result.media and isinstance(result.media['images'], list):
+    if "images" in result.media and isinstance(result.media["images"], list):
         session = requests.Session()
         headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
-                          'AppleWebKit/537.36 (KHTML, like Gecko) '
-                          'Chrome/91.0.4472.124 Safari/537.36',
-            'Referer': target_url,
-            'Accept': ('image/avif,image/webp,image/apng,image/svg+xml,'
-                       'image/*,*/*;q=0.8'),
-            'Accept-Language': 'en-US,en;q=0.9',
-            'Sec-Fetch-Dest': 'image',
-            'Sec-Fetch-Mode': 'no-cors',
-            'Sec-Fetch-Site': 'cross-site',
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/91.0.4472.124 Safari/537.36",
+            "Referer": target_url,
+            "Accept": (
+                "image/avif,image/webp,image/apng,image/svg+xml," "image/*,*/*;q=0.8"
+            ),
+            "Accept-Language": "en-US,en;q=0.9",
+            "Sec-Fetch-Dest": "image",
+            "Sec-Fetch-Mode": "no-cors",
+            "Sec-Fetch-Site": "cross-site",
         }
         session.headers.update(headers)
 
         with ThreadPoolExecutor(max_workers=5) as executor:
             futures = []
-            for image_data in result.media['images']:
-                if 'src' in image_data:
-                    futures.append(executor.submit(download_image,
-                                                   session,
-                                                   image_data['src'],
-                                                   save_dir))
+            for image_data in result.media["images"]:
+                if "src" in image_data:
+                    futures.append(
+                        executor.submit(
+                            download_image,
+                            session,
+                            image_data["src"],
+                            save_dir,
+                            target_url,
+                        )
+                    )
 
             for future in as_completed(futures):
                 future.result()
 
 
 if __name__ == "__main__":
-    if len(sys.argv) != 2:
-        print("Usage: python crawl.py <URL>")
+    if len(sys.argv) < 2:
+        print("Usage: python crawl.py <URL1> <URL2> ... <URLn>")
     else:
-        url = sys.argv[1]
-        save_result(url)
+        urls = sys.argv[1:]
+        for url in urls:
+            save_result(url)
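The submit/as_completed fan-out used here, reduced to a generic, runnable sketch (stub task and hypothetical URLs, not the script's real data):

from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch(url):
    # Stand-in for download_image
    return f"done: {url}"

urls = ["https://example.com/a.png", "https://example.com/b.png"]  # hypothetical

with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(fetch, u) for u in urls]
    for future in as_completed(futures):
        print(future.result())  # .result() re-raises any worker exception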
 
 
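With the argv change, one invocation now queues any number of pages, e.g. `python crawl.py https://example.com/post-1 https://example.com/post-2` (hypothetical URLs). The URLs themselves are crawled sequentially; only the image downloads within each page run in parallel.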