# article_writer/plagiarism.py
import os
import time
import asyncio

import httpx
import justext
import newspaper
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from googleapiclient.discovery import build

load_dotenv()
def clean_html(text):
    """Extract the title and main body text from raw HTML."""
    result = ""
    # newspaper needs an Article object; the URL is irrelevant here because
    # the HTML is supplied directly via set_html().
    article = newspaper.Article(url=" ")
    article.set_html(text)
    article.parse()
    result += article.title + "\n"
    # justext strips boilerplate (menus, footers, ads) and keeps content paragraphs.
    paragraphs = justext.justext(text, justext.get_stoplist("English"))
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            result += paragraph.text
    return result
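
# Illustrative usage (a sketch, not called anywhere in this module): clean_html
# expects raw HTML, so a caller would pass the fetched page source, e.g.
#
#     html = httpx.get("https://example.com/article").text  # hypothetical URL
#     body = clean_html(html)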
months = {
"January": "01",
"February": "02",
"March": "03",
"April": "04",
"May": "05",
"June": "06",
"July": "07",
"August": "08",
"September": "09",
"October": "10",
"November": "11",
"December": "12",
}
domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
def build_date(year=2024, month="March", day=1):
    # Zero-pad the day so the result is always an 8-digit YYYYMMDD string.
    return f"{year}{months[month]}{day:02d}"
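
# Example (illustrative only): build_date(2024, "March", 1) -> "20240301".
# Two such values can be combined into the date-range restrict that Google
# Custom Search accepts in its `sort` parameter, e.g.
#
#     sorted_date = f"date:r:{build_date(2024, 'January', 1)}:{build_date(2024, 'March', 1)}"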
async def get_url_data(url, client):
    # Fetch a single URL and parse it; any network error or non-200 status
    # yields None so a bad page never breaks the batch.
    try:
        r = await client.get(url)
        if r.status_code == 200:
            return BeautifulSoup(r.content, "html.parser")
    except Exception:
        pass
    return None
async def parallel_scrap(urls):
    # Fetch all URLs concurrently; failed fetches come back as None (or an
    # exception object, which callers should treat as missing).
    async with httpx.AsyncClient(timeout=30) as client:
        tasks = [get_url_data(url=url, client=client) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
    return results
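
# Illustrative usage from synchronous code (this is how google_search below
# drives it): asyncio.run(parallel_scrap(["https://example.com"]))  # hypothetical URL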
def scrap(urls):
    # Synchronous fallback: fetch and parse each URL in turn with a blocking client.
    soups = []
    with httpx.Client(timeout=30) as client:
        for url in urls:
            try:
                r = client.get(url)
                soups.append(BeautifulSoup(r.content, "html.parser") if r.status_code == 200 else None)
            except Exception:
                soups.append(None)
    return soups
def google_search_urls(
text,
sorted_date,
domains_to_include,
api_key,
cse_id,
**kwargs,
):
service = build("customsearch", "v1", developerKey=api_key)
results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
url_list = []
if "items" in results and len(results["items"]) > 0:
for count, link in enumerate(results["items"]):
# skip user selected domains
if (domains_to_include is None) or not any(
("." + domain) in link["link"] for domain in domains_to_include
):
continue
url = link["link"]
if url not in url_list:
url_list.append(url)
return url_list
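
# Illustrative call (a sketch; real credentials must come from the environment,
# and the query and domain filter below are made-up examples):
#
#     urls = google_search_urls(
#         "deep learning text similarity",
#         "date:r:20240101:20240301",
#         ["edu", "org"],
#         os.environ.get("GOOGLE_SEARCH_API_KEY"),
#         os.environ.get("GOOGLE_SEARCH_CSE_ID"),
#         num=10,  # extra CSE list() parameters pass through **kwargs
#     )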
def google_search(
topic,
sorted_date,
domains_to_include,
):
api_key = os.environ.get("GOOGLE_SEARCH_API_KEY")
cse_id = os.environ.get("GOOGLE_SEARCH_CSE_ID")
start_time = time.perf_counter()
url_list = google_search_urls(
topic,
sorted_date,
domains_to_include,
api_key,
cse_id,
)
print("URLS: ", url_list)
print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
# Scrape URLs in list
start_time = time.perf_counter()
soups = asyncio.run(parallel_scrap(url_list))
print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
result_content = {}
num_pages = 3
count = 0
for url, soup in zip(url_list, soups):
if count >= num_pages:
break
if soup:
            # clean_html expects raw HTML markup, so pass the full document
            # rather than the tag-stripped text.
            text = clean_html(str(soup))
if len(text) > 500:
result_content[url] = text
count += 1
return result_content
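

# Minimal usage sketch, assuming GOOGLE_SEARCH_API_KEY and GOOGLE_SEARCH_CSE_ID
# are set in the environment (.env); the topic, date range, and domain filter
# below are made-up examples, not values taken from this project.
if __name__ == "__main__":
    start, end = build_date(2023, "June", 1), build_date(2023, "December", 31)
    matches = google_search("essay on climate change", f"date:r:{start}:{end}", ["edu", "org"])
    for source_url, content in matches.items():
        print(source_url, len(content))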