k4d3 committed
Commit d86635a · 1 Parent(s): f86ba1d

trailing whitespace yay

Files changed (1)
  1. crawl/crawl +33 -27
crawl/crawl CHANGED
@@ -42,7 +42,7 @@ def sanitize_filename(filename):
         str: The sanitized filename.
     """
     # Remove invalid characters for Windows file names
-    return re.sub(r'[<>:"/\\|?*]', '', filename)
+    return re.sub(r'[<>:"/\\|?*]', "", filename)
 
 
 def download_image(session, image_url, save_dir, base_url):
@@ -61,19 +61,20 @@ def download_image(session, image_url, save_dir, base_url):
     """
     try:
         # Ensure the URL has a scheme
-        if image_url.startswith('..'):
+        if image_url.startswith(".."):
             from urllib.parse import urljoin
+
             image_url = urljoin(base_url, image_url)
-        elif not re.match(r'^https?://', image_url):
-            image_url = 'https://' + image_url.lstrip('/')
+        elif not re.match(r"^https?://", image_url):
+            image_url = "https://" + image_url.lstrip("/")
 
-        image_filename = os.path.basename(image_url).split('?')[0]
+        image_filename = os.path.basename(image_url).split("?")[0]
         sanitized_image_filename = sanitize_filename(image_filename)
         image_path = os.path.join(save_dir, sanitized_image_filename)
 
         response = session.get(image_url, stream=True)
         response.raise_for_status()
-        with open(image_path, 'wb') as image_file:
+        with open(image_path, "wb") as image_file:
             for chunk in response.iter_content(chunk_size=8192):
                 image_file.write(chunk)
         print(f"Saved image: {image_path}")
@@ -95,15 +96,15 @@ def save_result(target_url):
     """
     crawler = create_crawler()
     result = crawler.run(url=target_url)
-    title = result.metadata.get('title', 'untitled')
+    title = result.metadata.get("title", "untitled")
     sanitized_title = sanitize_filename(title).replace(" ", "_")
-
+
     # Choose the appropriate base path based on the operating system
     if platform.system() == "Windows":
         base_path = "E:\\knowledgebase\\Saved Websites\\"
     else:
         base_path = "/home/kade/saved_websites/"
-
+
     save_dir = os.path.join(base_path, sanitized_title)
     os.makedirs(save_dir, exist_ok=True)
 
@@ -114,31 +115,36 @@ def save_result(target_url):
     print(f"Saved markdown to {save_path}")
 
     # Save images in parallel
-    if 'images' in result.media and isinstance(result.media['images'], list):
+    if "images" in result.media and isinstance(result.media["images"], list):
         session = requests.Session()
         headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
-                          'AppleWebKit/537.36 (KHTML, like Gecko) '
-                          'Chrome/91.0.4472.124 Safari/537.36',
-            'Referer': target_url,
-            'Accept': ('image/avif,image/webp,image/apng,image/svg+xml,'
-                       'image/*,*/*;q=0.8'),
-            'Accept-Language': 'en-US,en;q=0.9',
-            'Sec-Fetch-Dest': 'image',
-            'Sec-Fetch-Mode': 'no-cors',
-            'Sec-Fetch-Site': 'cross-site',
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/91.0.4472.124 Safari/537.36",
+            "Referer": target_url,
+            "Accept": (
+                "image/avif,image/webp,image/apng,image/svg+xml," "image/*,*/*;q=0.8"
+            ),
+            "Accept-Language": "en-US,en;q=0.9",
+            "Sec-Fetch-Dest": "image",
+            "Sec-Fetch-Mode": "no-cors",
+            "Sec-Fetch-Site": "cross-site",
         }
         session.headers.update(headers)
 
         with ThreadPoolExecutor(max_workers=5) as executor:
             futures = []
-            for image_data in result.media['images']:
-                if 'src' in image_data:
-                    futures.append(executor.submit(download_image,
-                                                   session,
-                                                   image_data['src'],
-                                                   save_dir,
-                                                   target_url))
+            for image_data in result.media["images"]:
+                if "src" in image_data:
+                    futures.append(
+                        executor.submit(
+                            download_image,
+                            session,
+                            image_data["src"],
+                            save_dir,
+                            target_url,
+                        )
+                    )
 
             for future in as_completed(futures):
                 future.result()
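
As a side note on the second hunk: the requoting there does not change the URL-handling behavior. A minimal sketch of that branch pulled into a standalone helper for illustration, assuming hypothetical example URLs (the name normalize_image_url is not part of the repo):

    import re
    from urllib.parse import urljoin

    def normalize_image_url(image_url, base_url):
        # Mirrors the scheme handling in download_image: '..'-relative paths
        # resolve against the page URL; scheme-less URLs get https://
        # prepended; full http(s) URLs pass through unchanged.
        if image_url.startswith(".."):
            return urljoin(base_url, image_url)
        if not re.match(r"^https?://", image_url):
            return "https://" + image_url.lstrip("/")
        return image_url

    # Hypothetical examples:
    base = "https://example.com/gallery/page.html"
    print(normalize_image_url("../img/a.png", base))
    # -> https://example.com/img/a.png
    print(normalize_image_url("cdn.example.com/b.jpg", base))
    # -> https://cdn.example.com/b.jpg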