Create google_search.py
google_search.py (new file, +207 lines)
import os
import time
from googleapiclient.discovery import build
import asyncio
import httpx
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import html2text
import requests
import unicodedata
import fitz  # PyMuPDF, used for PDF text extraction

load_dotenv()

API_KEY = os.environ.get("GOOGLE_SEARCH_API_KEY")
CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")

# Number of pages to scrape
NUM_PAGES = 10

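# Example .env layout this module reads (the variable names below are exactly
# the ones looked up via os.environ in this file; the values are placeholders):
#
#   GOOGLE_SEARCH_API_KEY=<your Google Custom Search API key>
#   GOOGLE_SEARCH_CSE_ID=<your programmable search engine ID>
#   EXTRACTOR_API_KEY=<optional, only needed by build_results_extractor>
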
# load html2text and set up configs
h2t = html2text.HTML2Text()
h2t.body_width = 0  # No wrapping
h2t.ignore_links = True  # Ignore hyperlinks
h2t.ignore_images = True  # Ignore images
h2t.ignore_emphasis = True  # Ignore emphasis
h2t.ignore_tables = False  # Include tables
h2t.skip_internal_links = True  # Skip internal links
h2t.skip_external_links = True  # Skip external links
h2t.single_line_break = True  # Use single line breaks
h2t.protect_links = True  # Protect links from being split
h2t.default_image_alt = "[image]"  # Default alt text for images


def clean_html(text):
    text = h2t.handle(text)
    # Remove non-ASCII characters
    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII")
    return text

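# Illustrative sketch of what clean_html is expected to do (exact whitespace
# depends on the html2text version and the settings above):
#
#   clean_html("<p>Café menu <b>2024</b></p>")
#   # -> roughly "Cafe menu 2024\n\n": markup dropped, accents stripped to ASCII

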
def build_results_beautifulsoup(url_list):
    print("Starting to scrape URLs...")
    start_time = time.perf_counter()

    # scrape URLs in list
    soups = asyncio.run(parallel_scrap(url_list))

    scraping_time = time.perf_counter() - start_time
    print(f"Scraping processing time: {scraping_time:.2f} seconds")

    result_content = {}
    count = 0

    print("Starting to process each URL...")
    for url, soup in zip(url_list, soups):
        if count >= NUM_PAGES:
            print(f"Reached the limit of {NUM_PAGES} pages. Stopping processing.")
            break

        if soup:
            print(f"Processing URL: {url}")
            text = clean_html(soup.text)
            if len(text) > 500:
                print(f"Adding content from URL: {url}, content length: {len(text)}")
                result_content[url] = text
                count += 1
            else:
                print(f"Skipped URL: {url}, content too short (length: {len(text)})")
        else:
            print(f"Skipped URL: {url}, no soup content available.")

    print("Finished processing URLs.")
    return result_content

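# Shape of the mapping returned above (implied by the loop logic): at most
# NUM_PAGES entries, each URL keyed to its cleaned text of more than 500
# characters, e.g.
#   {"https://example.com/article": "Cleaned page text ...", ...}

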
def build_results_extractor(url_list):
    try:
        endpoint = "https://extractorapi.com/api/v1/extractor"
        result_content = {}
        count = 0
        for url in url_list:
            if count >= NUM_PAGES:
                break
            params = {"apikey": os.environ.get("EXTRACTOR_API_KEY"), "url": url}
            r = requests.get(endpoint, params=params)
            if r.status_code == 200:
                text = r.json()["text"]
                if len(text) > 500:
                    result_content[url] = text
                    count += 1
            if r.status_code == 403:
                raise Exception("Error with API; using default implementation instead")
        return result_content

    except Exception as e:
        print(e)
        return build_results_beautifulsoup(url_list)


months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}

domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]


def build_date(year=2024, month="March", day=1):
    # Zero-pad the day so the result is always an 8-digit YYYYMMDD string
    return f"{year}{months[month]}{int(day):02d}"

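# Example (assuming the CSE "sort" parameter is given a date-range restrict of
# the form "date:r:<start>:<end>", which is what build_date's YYYYMMDD output
# appears intended for):
#   sorted_date = f"date:r:{build_date(2024, 'January', 1)}:{build_date(2024, 'March', 1)}"
#   # -> "date:r:20240101:20240301"

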
async def get_url_data(url, client):
    try:
        r = await client.get(url)
        if r.status_code == 200:
            content_type = r.headers.get("Content-Type", "").lower()
            # detect if pdf
            if "application/pdf" in content_type or url.lower().endswith(".pdf"):
                pdf_content = await extract_pdf_text(r.content)
                return BeautifulSoup(pdf_content, "html.parser")
            else:
                return BeautifulSoup(r.content, "html.parser")
        # Non-200 responses yield no content
        return None
    except Exception:
        return None


async def extract_pdf_text(content):
    try:
        with fitz.open(stream=content, filetype="pdf") as doc:
            text = ""
            for page in doc:
                text += page.get_text()
            return f"<div>{text}</div>"  # Wrap in a div to make it valid HTML
    except Exception as e:
        print(f"Error extracting PDF text: {str(e)}")
        return "<div>Error extracting PDF text</div>"


async def parallel_scrap(urls):
    async with httpx.AsyncClient(timeout=30) as client:
        tasks = []
        for url in urls:
            tasks.append(get_url_data(url=url, client=client))
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return results


def scrap(urls):
    # Synchronous entry point: run the async scraper to completion and return
    # one BeautifulSoup object (or None) per URL.
    return asyncio.run(parallel_scrap(urls))

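# Both entry points return results in the same order as the input URLs
# (asyncio.gather preserves order), which is what build_results_beautifulsoup
# relies on when it zips url_list and soups together.

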
def google_search_urls(
    text,
    sorted_date,
    domains_to_include,
    api_key,
    cse_id,
    **kwargs,
):
    service = build("customsearch", "v1", developerKey=api_key)
    results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
    url_list = []
    if "items" in results and len(results["items"]) > 0:
        for link in results["items"]:
            # keep only URLs from the user-selected domains (no filtering when
            # domains_to_include is empty or None)
            if domains_to_include and not any(
                ("." + domain) in link["link"] for domain in domains_to_include
            ):
                continue
            url = link["link"]
            if url not in url_list:
                url_list.append(url)
    return url_list

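# Extra keyword arguments are forwarded straight to cse().list(); for example,
# num=10 (the Custom Search API's per-request maximum) could be supplied by the
# caller to control how many results come back.

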
def google_search(
    topic,
    sorted_date,
    domains_to_include,
):
    api_key = os.environ.get("GOOGLE_SEARCH_API_KEY")
    cse_id = os.environ.get("GOOGLE_SEARCH_CSE_ID")
    start_time = time.perf_counter()
    url_list = google_search_urls(
        topic,
        sorted_date,
        domains_to_include,
        api_key,
        cse_id,
    )
    print("Google Search processing time: ", time.perf_counter() - start_time)
    result_content = build_results_beautifulsoup(url_list)
    return result_content
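

# Minimal usage sketch (assumes GOOGLE_SEARCH_API_KEY and GOOGLE_SEARCH_CSE_ID
# are set in the environment, and that the "date:r:<start>:<end>" restrict is
# the sort format this module targets):
if __name__ == "__main__":
    sorted_date = f"date:r:{build_date(2024, 'January', 1)}:{build_date(2024, 'March', 1)}"
    pages = google_search(
        "large language models",  # example query topic
        sorted_date,              # date-restricted sort string for the CSE call
        ["com", "org", "edu"],    # only keep results from these domains
    )
    for url, text in pages.items():
        print(url, len(text))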