# article_writer/plagiarism.py
import os
import time
import asyncio

import httpx
import justext
import newspaper
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from googleapiclient.discovery import build

load_dotenv()
def clean_html(text):
    """Extract the title and main body text from raw HTML."""
    result = ""
    # newspaper needs an Article object; the URL is irrelevant here because
    # the HTML is supplied directly via set_html().
    article = newspaper.Article(url=" ")
    article.set_html(text)
    article.parse()
    result += article.title + "\n"
    # justext strips boilerplate (menus, footers, ads) and keeps content paragraphs.
    paragraphs = justext.justext(text, justext.get_stoplist("English"))
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            result += paragraph.text
    return result
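
# Illustrative usage (a sketch, not called anywhere in this module): clean_html
# expects raw HTML, so a caller would pass the fetched page source, e.g.
#
#     html = httpx.get("https://example.com/article").text  # hypothetical URL
#     body = clean_html(html)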
months = {
"January": "01",
"February": "02",
"March": "03",
"April": "04",
"May": "05",
"June": "06",
"July": "07",
"August": "08",
"September": "09",
"October": "10",
"November": "11",
"December": "12",
}
domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
def build_date(year=2024, month="March", day=1):
    # Zero-pad the day so the result is always an 8-digit YYYYMMDD string.
    return f"{year}{months[month]}{day:02d}"
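
# Example (illustrative only): build_date(2024, "March", 1) -> "20240301".
# Two such values can be combined into the date-range restrict that Google
# Custom Search accepts in its `sort` parameter, e.g.
#
#     sorted_date = f"date:r:{build_date(2024, 'January', 1)}:{build_date(2024, 'March', 1)}"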
async def get_url_data(url, client):
    # Fetch a single URL and parse it; any network error or non-200 status
    # yields None so a bad page never breaks the batch.
    try:
        r = await client.get(url)
        if r.status_code == 200:
            return BeautifulSoup(r.content, "html.parser")
    except Exception:
        pass
    return None
async def parallel_scrap(urls):
    # Fetch all URLs concurrently; failed fetches come back as None (or an
    # exception object, which callers should treat as missing).
    async with httpx.AsyncClient(timeout=30) as client:
        tasks = [get_url_data(url=url, client=client) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
    return results
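
# Illustrative usage from synchronous code (this is how google_search below
# drives it): asyncio.run(parallel_scrap(["https://example.com"]))  # hypothetical URL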
def scrap(urls):
    # Synchronous fallback: fetch and parse each URL in turn with a blocking client.
    soups = []
    with httpx.Client(timeout=30) as client:
        for url in urls:
            try:
                r = client.get(url)
                soups.append(BeautifulSoup(r.content, "html.parser") if r.status_code == 200 else None)
            except Exception:
                soups.append(None)
    return soups
def google_search_urls(
text,
sorted_date,
domains_to_include,
api_key,
cse_id,
**kwargs,
):
service = build("customsearch", "v1", developerKey=api_key)
results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
url_list = []
if "items" in results and len(results["items"]) > 0:
for count, link in enumerate(results["items"]):
# skip user selected domains
if (domains_to_include is None) or not any(
("." + domain) in link["link"] for domain in domains_to_include
):
continue
url = link["link"]
if url not in url_list:
url_list.append(url)
return url_list
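
# Illustrative call (a sketch; real credentials must come from the environment,
# and the query and domain filter below are made-up examples):
#
#     urls = google_search_urls(
#         "deep learning text similarity",
#         "date:r:20240101:20240301",
#         ["edu", "org"],
#         os.environ.get("GOOGLE_SEARCH_API_KEY"),
#         os.environ.get("GOOGLE_SEARCH_CSE_ID"),
#         num=10,  # extra CSE list() parameters pass through **kwargs
#     )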
def google_search(
topic,
sorted_date,
domains_to_include,
):
api_key = os.environ.get("GOOGLE_SEARCH_API_KEY")
cse_id = os.environ.get("GOOGLE_SEARCH_CSE_ID")
start_time = time.perf_counter()
url_list = google_search_urls(
topic,
sorted_date,
domains_to_include,
api_key,
cse_id,
)
print("URLS: ", url_list)
print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
# Scrape URLs in list
start_time = time.perf_counter()
soups = asyncio.run(parallel_scrap(url_list))
print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
result_content = {}
num_pages = 3
count = 0
for url, soup in zip(url_list, soups):
if count >= num_pages:
break
if soup:
            # clean_html expects raw HTML markup, so pass the full document
            # rather than the tag-stripped text.
            text = clean_html(str(soup))
if len(text) > 500:
result_content[url] = text
count += 1
return result_content
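

# Minimal usage sketch, assuming GOOGLE_SEARCH_API_KEY and GOOGLE_SEARCH_CSE_ID
# are set in the environment (.env); the topic, date range, and domain filter
# below are made-up examples, not values taken from this project.
if __name__ == "__main__":
    start, end = build_date(2023, "June", 1), build_date(2023, "December", 31)
    matches = google_search("essay on climate change", f"date:r:{start}:{end}", ["edu", "org"])
    for source_url, content in matches.items():
        print(source_url, len(content))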