minko186 committed · Commit 8711f20 · verified · 1 Parent(s): 35710a1

Create google_search.py

Files changed (1)
  1. google_search.py +207 -0
google_search.py ADDED
@@ -0,0 +1,207 @@
import os
import time
from googleapiclient.discovery import build
import asyncio
import httpx
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import html2text
import requests
import unicodedata
import fitz  # PyMuPDF, used to extract text from PDF responses

load_dotenv()

API_KEY = os.environ.get("GOOGLE_SEARCH_API_KEY")
CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")

# Number of pages to scrape
NUM_PAGES = 10

# Load html2text and set up its configuration
h2t = html2text.HTML2Text()
h2t.body_width = 0  # No wrapping
h2t.ignore_links = True  # Ignore hyperlinks
h2t.ignore_images = True  # Ignore images
h2t.ignore_emphasis = True  # Ignore emphasis
h2t.ignore_tables = False  # Include tables
h2t.skip_internal_links = True  # Skip internal links
h2t.skip_external_links = True  # Skip external links
h2t.single_line_break = True  # Use single line breaks
h2t.protect_links = True  # Protect links from being split
h2t.default_image_alt = "[image]"  # Default alt text for images


def clean_html(text):
    text = h2t.handle(text)
    # Remove non-ASCII characters
    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII")
    return text


def build_results_beautifulsoup(url_list):
    print("Starting to scrape URLs...")
    start_time = time.perf_counter()

    # Scrape URLs in list
    soups = asyncio.run(parallel_scrap(url_list))

    scraping_time = time.perf_counter() - start_time
    print(f"Scraping processing time: {scraping_time:.2f} seconds")

    result_content = {}
    count = 0

    print("Starting to process each URL...")
    for url, soup in zip(url_list, soups):
        if count >= NUM_PAGES:
            print(f"Reached the limit of {NUM_PAGES} pages. Stopping processing.")
            break

        if soup:
            print(f"Processing URL: {url}")
            text = clean_html(soup.text)
            if len(text) > 500:
                print(f"Adding content from URL: {url}, content length: {len(text)}")
                result_content[url] = text
                count += 1
            else:
                print(f"Skipped URL: {url}, content too short (length: {len(text)})")
        else:
            print(f"Skipped URL: {url}, no soup content available.")

    print("Finished processing URLs.")
    return result_content


def build_results_extractor(url_list):
    try:
        endpoint = "https://extractorapi.com/api/v1/extractor"
        result_content = {}
        count = 0
        for url in url_list:
            if count >= NUM_PAGES:
                break
            params = {"apikey": os.environ.get("EXTRACTOR_API_KEY"), "url": url}
            r = requests.get(endpoint, params=params)
            if r.status_code == 200:
                text = r.json()["text"]
                if len(text) > 500:
                    result_content[url] = text
                    count += 1
            if r.status_code == 403:
                raise Exception("Error with API; using default implementation instead")
        return result_content

    except Exception as e:
        print(e)
        return build_results_beautifulsoup(url_list)


months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}

domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]


def build_date(year=2024, month="March", day=1):
    # Return YYYYMMDD, zero-padding the day so single-digit days are formatted correctly
    return f"{year}{months[month]}{int(day):02d}"


async def get_url_data(url, client):
    try:
        r = await client.get(url)
        if r.status_code == 200:
            content_type = r.headers.get("Content-Type", "").lower()
            # Detect if the response is a PDF
            if "application/pdf" in content_type or url.lower().endswith(".pdf"):
                pdf_content = await extract_pdf_text(r.content)
                return BeautifulSoup(pdf_content, "html.parser")
            else:
                return BeautifulSoup(r.content, "html.parser")
    except Exception:
        return None


async def extract_pdf_text(content):
    try:
        with fitz.open(stream=content, filetype="pdf") as doc:
            text = ""
            for page in doc:
                text += page.get_text()
            return f"<div>{text}</div>"  # Wrap in a div to make it valid HTML
    except Exception as e:
        print(f"Error extracting PDF text: {str(e)}")
        return "<div>Error extracting PDF text</div>"


async def parallel_scrap(urls):
    async with httpx.AsyncClient(timeout=30) as client:
        tasks = []
        for url in urls:
            tasks.append(get_url_data(url=url, client=client))
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return results


def scrap(urls):
    # Synchronous helper: get_url_data is a coroutine, so calling it with a blocking
    # httpx.Client would only collect un-awaited coroutine objects. Run the async
    # scraper and return its results instead.
    return asyncio.run(parallel_scrap(urls))


def google_search_urls(
    text,
    sorted_date,
    domains_to_include,
    api_key,
    cse_id,
    **kwargs,
):
    service = build("customsearch", "v1", developerKey=api_key)
    results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
    url_list = []
    if "items" in results and len(results["items"]) > 0:
        for link in results["items"]:
            # Keep only results from the user-selected domains;
            # if no filter is given, keep everything
            if domains_to_include and not any(
                ("." + domain) in link["link"] for domain in domains_to_include
            ):
                continue
            url = link["link"]
            if url not in url_list:
                url_list.append(url)
    return url_list


def google_search(
    topic,
    sorted_date,
    domains_to_include,
):
    api_key = os.environ.get("GOOGLE_SEARCH_API_KEY")
    cse_id = os.environ.get("GOOGLE_SEARCH_CSE_ID")
    start_time = time.perf_counter()
    url_list = google_search_urls(
        topic,
        sorted_date,
        domains_to_include,
        api_key,
        cse_id,
    )
    print("Google Search processing time: ", time.perf_counter() - start_time)
    result_content = build_results_beautifulsoup(url_list)
    return result_content
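
For reference, a minimal sketch of how this module might be driven, assuming a .env file provides GOOGLE_SEARCH_API_KEY and GOOGLE_SEARCH_CSE_ID (and optionally EXTRACTOR_API_KEY), and assuming the sorted_date string follows the Programmable Search Engine sort convention date:r:YYYYMMDD:YYYYMMDD. The driver script, the example topic, and the chosen date range are illustrative and not part of the commit.

    # usage_sketch.py -- hypothetical driver, not part of the commit
    from google_search import google_search, build_date

    if __name__ == "__main__":
        # Restrict results to a date range (format assumed from the CSE "sort" parameter)
        sorted_date = f"date:r:{build_date(2024, 'January', 1)}:{build_date(2024, 'March', 31)}"
        results = google_search(
            topic="example search topic",          # hypothetical query
            sorted_date=sorted_date,
            domains_to_include=["edu", "org"],     # or None to keep every domain
        )
        for url, text in results.items():
            print(url, len(text))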