Spaces:
Sleeping
Sleeping
from urllib.request import urlopen, Request | |
from googleapiclient.discovery import build | |
import requests | |
import httpx | |
import re | |
from bs4 import BeautifulSoup | |
import re, math | |
from collections import Counter | |
import numpy as np | |
import asyncio | |
import nltk | |
# Download NLTK's 'punkt' resource at import time (presumably the tokenizer
# data needed by sent_tokenize in getSentences -- confirm against the NLTK version in use).
nltk.download('punkt')
# Matches maximal runs of word characters; text_to_vector uses it to split text into tokens.
WORD = re.compile(r"\w+")
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-weight vectors.

    Args:
        vec1: Mapping of term -> numeric weight (e.g. a ``Counter``).
        vec2: Mapping of term -> numeric weight.

    Returns:
        A float: the dot product over the shared terms divided by the
        product of the two Euclidean norms, or ``0.0`` when that product
        is zero.
    """
    shared_terms = set(vec1) & set(vec2)
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    magnitude = norm1 * norm2

    # A zero magnitude means at least one vector is empty or all zeros.
    if magnitude == 0:
        return 0.0
    return float(dot_product) / magnitude
def text_to_vector(text):
    """Turn *text* into a bag-of-words vector.

    Every match of the module-level ``WORD`` pattern is a token; the
    result is a ``Counter`` mapping each token to its occurrence count.
    """
    return Counter(WORD.findall(text))
def cosineSim(text1, text2):
    """Return the cosine similarity between two texts.

    Each text is converted to a word-count vector with ``text_to_vector``
    and the two vectors are compared with ``get_cosine``.
    """
    return get_cosine(text_to_vector(text1), text_to_vector(text2))
def get_soup_requests(url, timeout=30):
    """Fetch *url* with ``requests`` and parse the body as HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        A ``BeautifulSoup`` document when the response status is 200,
        otherwise ``None`` (a failure message is printed).
    """
    try:
        page = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connection errors, timeouts and invalid URLs are reported the
        # same way as a non-200 response instead of crashing the caller.
        print("HTML soup failed")
        return None
    if page.status_code == 200:
        return BeautifulSoup(page.content, "html.parser")
    print("HTML soup failed")
    return None
def get_soup_httpx(url):
    """Fetch *url* with a short-lived ``httpx`` client and parse the body as HTML.

    Returns:
        A ``BeautifulSoup`` document when the response status is OK,
        otherwise ``None``. Any error raised while fetching or parsing is
        reported with a printed message and also yields ``None``.
    """
    try:
        # The context manager closes the client (and its connections)
        # on every path, including when an exception is raised.
        with httpx.Client(timeout=30) as client:
            page = client.get(url)
            if page.status_code == httpx.codes.OK:
                return BeautifulSoup(page.content, "html.parser")
    except Exception:
        # Best-effort scrape: report and fall through to None. Unlike a
        # bare ``except``, this lets KeyboardInterrupt/SystemExit propagate.
        print("HTTPx soup failed")
    return None
def getSentences(text):
    """Split *text* into chunks of two consecutive sentences.

    Sentences come from NLTK's ``sent_tokenize``; each pair is joined with
    a single space. With an odd number of sentences the last chunk holds
    one sentence.
    """
    from nltk.tokenize import sent_tokenize

    sentences = sent_tokenize(text)
    return [
        " ".join(sentences[start : start + 2])
        for start in range(0, len(sentences), 2)
    ]
def _strip_snippet_ellipses(snippet):
    """Trim a leading and a trailing '...' fragment marker from a search snippet."""
    ind = snippet.find("...")
    # An ellipsis shortly after the start: drop everything up to and
    # including the "... " separator.
    if 9 < ind < 20:
        snippet = snippet[ind + len("... ") :]
    ind = snippet.find("...")
    # An ellipsis within the last few characters: cut it off. ``ind`` is -1
    # when no ellipsis remains; without the explicit check, snippets shorter
    # than four characters would lose their last character.
    if ind != -1 and ind > len(snippet) - 5:
        snippet = snippet[:ind]
    return snippet


def googleSearch(
    sentences,
    urlCount,
    scoreArray,
    urlList,
    sorted_date,
    domains_to_skip,
    api_key,
    cse_id,
    **kwargs,
):
    """Search each sentence with Google Custom Search and score the result snippets.

    For every sentence, the first five result items are examined. Items
    whose URL contains ``"." + domain`` for any entry of *domains_to_skip*
    are ignored. For the remaining items, the cosine similarity between the
    sentence and the cleaned snippet is stored in *scoreArray*.

    Args:
        sentences: Sequence of query strings.
        urlCount: Dict mapping URL -> number of times it was returned;
            updated in place.
        scoreArray: List of rows, one per entry of *urlList*, each of
            length ``len(sentences)``; updated in place.
        urlList: List of URLs seen so far, parallel to *scoreArray*;
            updated in place.
        sorted_date: Value passed as the ``sort`` argument of the search.
        domains_to_skip: Iterable of domain strings to exclude.
        api_key: Developer key used to build the search service.
        cse_id: Custom search engine id (``cx``).
        **kwargs: Extra arguments forwarded to ``cse().list``.

    Returns:
        The tuple ``(urlCount, scoreArray)``.
    """
    service = build("customsearch", "v1", developerKey=api_key)
    for i, sentence in enumerate(sentences):
        results = (
            service.cse()
            .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
            .execute()
        )
        items = results.get("items")
        if not items:
            print("Google Search failed")
            continue
        # Only the first five items are considered (skipped domains count
        # towards the five).
        for link in items[:5]:
            url = link["link"]
            # skip user selected domains
            if any(("." + domain) in url for domain in domains_to_skip):
                continue
            # An item without a snippet is scored against an empty string
            # instead of raising KeyError.
            snippet = _strip_snippet_ellipses(link.get("snippet", ""))
            # update cosine similarity between snippet and given text
            if url not in urlList:
                urlList.append(url)
                scoreArray.append([0] * len(sentences))
            urlCount[url] = urlCount.get(url, 0) + 1
            scoreArray[urlList.index(url)][i] = cosineSim(sentence, snippet)
    return urlCount, scoreArray
def getQueries(text, n):
    """Return every run of *n* consecutive whitespace-separated words in *text*.

    Each n-gram is a list of words. The result is empty when *text* has
    fewer than *n* words.
    """
    tokens = text.split()
    window_count = len(tokens) - n + 1
    return [tokens[start : start + n] for start in range(window_count)]
def print2D(array):
    """Print *array* after converting it to a NumPy array."""
    as_matrix = np.array(array)
    print(as_matrix)
def removePunc(text):
    """Return *text* with every character that is neither a word character nor whitespace removed."""
    return re.sub(r"[^\w\s]", "", text)
async def get_url_data(url, client):
    """Fetch *url* with the given async client and parse the body as HTML.

    Returns:
        A ``BeautifulSoup`` document when the response status is 200.
        ``None`` for any other status, or when fetching/parsing raises
        (in which case a failure message is printed).
    """
    try:
        response = await client.get(url)
        if response.status_code != 200:
            return None
        return BeautifulSoup(response.content, "html.parser")
    except Exception:
        print("HTTPx parallel soup failed")
        return None
async def parallel_scrap(urls):
    """Fetch all *urls* concurrently through one shared ``httpx.AsyncClient``.

    Returns:
        A list with one entry per URL, in input order: the value returned
        by ``get_url_data`` for that URL, or the exception object if that
        call raised (``return_exceptions=True``).
    """
    async with httpx.AsyncClient(timeout=30) as client:
        fetches = [get_url_data(url=url, client=client) for url in urls]
        return await asyncio.gather(*fetches, return_exceptions=True)
def matchingScore(sentence, content):
    """Score how much of *sentence* appears in *content*.

    Returns:
        ``1`` if the sentence occurs verbatim in the content, either as
        given or after punctuation is stripped from both. Otherwise the
        fraction of the sentence's 5-word n-grams found in the stripped
        content, or ``0`` if the sentence has no 5-word n-grams.
    """
    if sentence in content:
        return 1

    clean_sentence = removePunc(sentence)
    clean_content = removePunc(content)
    if clean_sentence in clean_content:
        return 1

    grams = getQueries(clean_sentence, 5)
    if len(grams) == 0:
        return 0
    hits = sum(1 for gram in grams if " ".join(gram) in clean_content)
    return hits / len(grams)
async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
    """Score every sentence against one page and store the results in *ScoreArray*.

    For each sentence ``j``, ``ScoreArray[content_idx][j]`` is set to ``1``
    when the punctuation-stripped sentence occurs in the punctuation-stripped
    content, otherwise to the fraction of its 5-word n-grams found there
    (``0`` when the sentence has no 5-word n-grams).

    Args:
        sentences: Sequence of sentence strings.
        content: Page text to search in.
        content_idx: Row of *ScoreArray* that belongs to this page.
        ScoreArray: Matrix of scores; updated in place.

    Returns:
        The same *ScoreArray* object.
    """
    content = removePunc(content)
    n = 5
    for j, sentence in enumerate(sentences):
        sentence = removePunc(sentence)
        if sentence in content:
            ScoreArray[content_idx][j] = 1
            continue
        ngrams = getQueries(sentence, n)
        if not ngrams:
            # Too short for n-gram matching: record 0 and keep going.
            # Returning here would leave the remaining sentences unscored
            # and hand back 0 instead of the matrix.
            ScoreArray[content_idx][j] = 0
            continue
        matched = [x for x in ngrams if " ".join(x) in content]
        ScoreArray[content_idx][j] = len(matched) / len(ngrams)
    print(
        f"Analyzed {content_idx+1} of soups (SOUP SUCCEEDED)........................"
    )
    return ScoreArray
async def parallel_analyze(soups, sentences, ScoreArray):
    """Run ``matchingScoreAsync`` for every successfully scraped page.

    Falsy entries of *soups* are reported as failed and skipped; for the
    others, the page text is scored against *sentences*, with results
    written into the row of *ScoreArray* at the page's index.

    Returns:
        The list produced by ``asyncio.gather``: one entry per analysed
        page (the value returned by ``matchingScoreAsync``, or an
        exception object).
        NOTE(review): this is not the score matrix itself -- confirm that
        callers expect this shape.
    """
    pending = []
    for i, soup in enumerate(soups):
        if not soup:
            print(
                f"Analyzed {i+1} of soups (SOUP FAILED)........................"
            )
            continue
        pending.append(matchingScoreAsync(sentences, soup.text, i, ScoreArray))
    return await asyncio.gather(*pending, return_exceptions=True)
async def parallel_analyze_2(soups, sentences, ScoreArray):
    """Score every sentence against every scraped page with ``matchingScore``.

    Args:
        soups: Sequence of parsed pages; falsy entries are reported as
            failed and keep a row of zeros.
        sentences: Sequence of sentence strings.
        ScoreArray: Existing score matrix; only its dimensions are used
            to size the result. It is not modified.

    Returns:
        A new matrix (list of lists) with the same dimensions as
        *ScoreArray*, where entry ``[i][j]`` is the ``matchingScore`` of
        sentence ``j`` against page ``i``.
    """
    scores = [[0] * len(ScoreArray[0]) for _ in ScoreArray]
    for i, soup in enumerate(soups):
        if soup:
            page_content = soup.text
            for j, sent in enumerate(sentences):
                print(
                    f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
                )
                scores[i][j] = matchingScore(sent, page_content)
        else:
            print(
                f"Analyzed {i+1} of soups (SOUP FAILED)........................"
            )
    # matchingScore is synchronous, so the rows hold plain numbers and there
    # is nothing to await; passing them to asyncio.gather raises TypeError.
    return scores