Hansimov committed on
Commit
8c0b736
1 Parent(s): f234ce3

:recycle: [Refactor] Replace output_path with html_path to avoid confusion

Browse files
apis/search_api.py CHANGED
@@ -94,7 +94,7 @@ class SearchAPIApp:
94
  output_parent=query_search_results["query"],
95
  )
96
  html_paths = [
97
- url_and_html_path["output_path"]
98
  for url_and_html_path in url_and_html_path_list
99
  ]
100
 
@@ -109,7 +109,7 @@ class SearchAPIApp:
109
 
110
  for item in url_and_html_path_list:
111
  url = item["url"]
112
- html_path = item["output_path"]
113
  extracted_content = html_path_and_extracted_content_list[
114
  html_paths.index(html_path)
115
  ]["extracted_content"]
 
94
  output_parent=query_search_results["query"],
95
  )
96
  html_paths = [
97
+ str(url_and_html_path["html_path"])
98
  for url_and_html_path in url_and_html_path_list
99
  ]
100
 
 
109
 
110
  for item in url_and_html_path_list:
111
  url = item["url"]
112
+ html_path = str(item["html_path"])
113
  extracted_content = html_path_and_extracted_content_list[
114
  html_paths.index(html_path)
115
  ]["extracted_content"]
networks/google_searcher.py CHANGED
@@ -26,22 +26,22 @@ class GoogleSearcher:
26
  )
27
 
28
  def save_response(self):
29
- if not self.output_path.exists():
30
- self.output_path.parent.mkdir(parents=True, exist_ok=True)
31
- logger.note(f"Saving to: [{self.output_path}]")
32
- with open(self.output_path, "wb") as wf:
33
  wf.write(self.request_response.content)
34
 
35
  def search(self, query, result_num=10, safe=False, overwrite=False):
36
  self.query = query
37
- self.output_path = self.filepath_converter.convert(self.query)
38
  logger.note(f"Searching: [{self.query}]")
39
- if self.output_path.exists() and not overwrite:
40
- logger.success(f"HTML existed: {self.output_path}")
41
  else:
42
  self.send_request(result_num=result_num, safe=safe)
43
  self.save_response()
44
- return self.output_path
45
 
46
 
47
  if __name__ == "__main__":
 
26
  )
27
 
28
  def save_response(self):
29
+ if not self.html_path.exists():
30
+ self.html_path.parent.mkdir(parents=True, exist_ok=True)
31
+ logger.note(f"Saving to: [{self.html_path}]")
32
+ with open(self.html_path, "wb") as wf:
33
  wf.write(self.request_response.content)
34
 
35
  def search(self, query, result_num=10, safe=False, overwrite=False):
36
  self.query = query
37
+ self.html_path = self.filepath_converter.convert(self.query)
38
  logger.note(f"Searching: [{self.query}]")
39
+ if self.html_path.exists() and not overwrite:
40
+ logger.success(f"HTML existed: {self.html_path}")
41
  else:
42
  self.send_request(result_num=result_num, safe=safe)
43
  self.save_response()
44
+ return self.html_path
45
 
46
 
47
  if __name__ == "__main__":
networks/webpage_fetcher.py CHANGED
@@ -34,47 +34,45 @@ class WebpageFetcher:
34
  self.request_response = None
35
 
36
  def save_response(self):
37
- if not self.output_path.exists():
38
- self.output_path.parent.mkdir(parents=True, exist_ok=True)
39
- logger.success(f"Saving to: [{self.output_path}]")
40
 
41
  if self.request_response is None:
42
  return
43
  else:
44
- with open(self.output_path, "wb") as wf:
45
  wf.write(self.request_response.content)
46
 
47
  def fetch(self, url, overwrite=False, output_parent=None):
48
  self.url = url
49
  logger.note(f"Fetching: [{self.url}]")
50
- self.output_path = self.filepath_converter.convert(
51
- self.url, parent=output_parent
52
- )
53
 
54
  if self.is_ignored_host(self.url):
55
  logger.warn(f"Ignore host: [{self.host}]")
56
- return self.output_path
57
 
58
- if self.output_path.exists() and not overwrite:
59
- logger.success(f"HTML existed: [{self.output_path}]")
60
  else:
61
  self.send_request()
62
  self.save_response()
63
- return self.output_path
64
 
65
 
66
  class BatchWebpageFetcher:
67
  def __init__(self):
68
  self.done_count = 0
69
  self.total_count = 0
70
- self.url_and_output_path_list = []
71
 
72
  def fecth_single_webpage(self, url, overwrite=False, output_parent=None):
73
  webpage_fetcher = WebpageFetcher()
74
- output_path = webpage_fetcher.fetch(
75
  url=url, overwrite=overwrite, output_parent=output_parent
76
  )
77
- self.url_and_output_path_list.append({"url": url, "output_path": output_path})
78
  self.done_count += 1
79
  logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
80
 
@@ -94,7 +92,7 @@ class BatchWebpageFetcher:
94
 
95
  for idx, future in enumerate(concurrent.futures.as_completed(futures)):
96
  result = future.result()
97
- return self.url_and_output_path_list
98
 
99
 
100
  if __name__ == "__main__":
 
34
  self.request_response = None
35
 
36
  def save_response(self):
37
+ if not self.html_path.exists():
38
+ self.html_path.parent.mkdir(parents=True, exist_ok=True)
39
+ logger.success(f"Saving to: [{self.html_path}]")
40
 
41
  if self.request_response is None:
42
  return
43
  else:
44
+ with open(self.html_path, "wb") as wf:
45
  wf.write(self.request_response.content)
46
 
47
  def fetch(self, url, overwrite=False, output_parent=None):
48
  self.url = url
49
  logger.note(f"Fetching: [{self.url}]")
50
+ self.html_path = self.filepath_converter.convert(self.url, parent=output_parent)
 
 
51
 
52
  if self.is_ignored_host(self.url):
53
  logger.warn(f"Ignore host: [{self.host}]")
54
+ return self.html_path
55
 
56
+ if self.html_path.exists() and not overwrite:
57
+ logger.success(f"HTML existed: [{self.html_path}]")
58
  else:
59
  self.send_request()
60
  self.save_response()
61
+ return self.html_path
62
 
63
 
64
  class BatchWebpageFetcher:
65
  def __init__(self):
66
  self.done_count = 0
67
  self.total_count = 0
68
+ self.url_and_html_path_list = []
69
 
70
  def fecth_single_webpage(self, url, overwrite=False, output_parent=None):
71
  webpage_fetcher = WebpageFetcher()
72
+ html_path = webpage_fetcher.fetch(
73
  url=url, overwrite=overwrite, output_parent=output_parent
74
  )
75
+ self.url_and_html_path_list.append({"url": url, "html_path": html_path})
76
  self.done_count += 1
77
  logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
78
 
 
92
 
93
  for idx, future in enumerate(concurrent.futures.as_completed(futures)):
94
  result = future.result()
95
+ return self.url_and_html_path_list
96
 
97
 
98
  if __name__ == "__main__":