aliasgerovs committed
Commit 1be431a
Parent(s): 9d99259
Updated
Files changed:
- app.py +401 -0
- requirements.txt +19 -0
- utils.py +250 -0
app.py
ADDED
@@ -0,0 +1,401 @@
from utils import cosineSim, googleSearch, getSentences, parallel_scrap, matchingScore
import gradio as gr
from urllib.request import urlopen, Request
from googleapiclient.discovery import build
import requests
import httpx
import re
from bs4 import BeautifulSoup
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import asyncio
from scipy.special import softmax
from evaluate import load
from datetime import date
import nltk

np.set_printoptions(suppress=True)


def plagiarism_check(
    input,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
):
    api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
    api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
    api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
    api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
    # api_key = "AIzaSyBrx_pgb6A64wPFQXSGQRgGtukoxVV_0Fk"
    cse_id = "851813e81162b4ed4"

    sentences = getSentences(input)
    urlCount = {}
    ScoreArray = []
    urlList = []

    date_from = build_date(year_from, month_from, day_from)
    date_to = build_date(year_to, month_to, day_to)
    sort_date = f"date:r:{date_from}:{date_to}"
    # get list of URLs to check
    urlCount, ScoreArray = googleSearch(
        sentences,
        urlCount,
        ScoreArray,
        urlList,
        sort_date,
        domains_to_skip,
        api_key,
        cse_id,
    )
    print("Number of URLs: ", len(urlCount))
    # print("Old Score Array:\n")
    # print2D(ScoreArray)

    # Scrape URLs in list
    formatted_tokens = []
    soups = asyncio.run(parallel_scrap(urlList))
    print(len(soups))
    print(
        "Successful scraping: "
        + str(len([x for x in soups if x is not None]))
        + " out of "
        + str(len(urlList))
    )

    # Populate matching scores for scraped pages
    for i, soup in enumerate(soups):
        print(f"Analyzing {i+1} of {len(soups)} soups........................")
        if soup:
            page_content = soup.text
            for j, sent in enumerate(sentences):
                score = matchingScore(sent, page_content)
                ScoreArray[i][j] = score

    # ScoreArray = asyncio.run(parallel_analyze_2(soups, sentences, ScoreArray))
    # print("New Score Array:\n")
    # print2D(ScoreArray)

    # Gradio formatting section
    sentencePlag = [False] * len(sentences)
    sentenceToMaxURL = [-1] * len(sentences)
    for j in range(len(sentences)):
        if j > 0:
            maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
        else:
            maxScore = -1
        for i in range(len(ScoreArray)):
            margin = (
                0.1
                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
                else 0
            )
            if ScoreArray[i][j] - maxScore > margin:
                maxScore = ScoreArray[i][j]
                sentenceToMaxURL[j] = i
        if maxScore > 0.5:
            sentencePlag[j] = True

    if (
        (len(sentences) > 1)
        and (sentenceToMaxURL[1] != sentenceToMaxURL[0])
        and (
            ScoreArray[sentenceToMaxURL[0]][0]
            - ScoreArray[sentenceToMaxURL[1]][0]
            < 0.1
        )
    ):
        sentenceToMaxURL[0] = sentenceToMaxURL[1]

    index = np.unique(sentenceToMaxURL)

    urlMap = {}
    for count, i in enumerate(index):
        urlMap[i] = count + 1
    for i, sent in enumerate(sentences):
        formatted_tokens.append(
            (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
        )

    formatted_tokens.append(("\n", None))
    formatted_tokens.append(("\n", None))
    formatted_tokens.append(("\n", None))

    urlScore = {}
    for url in index:
        s = [
            ScoreArray[url][sen]
            for sen in range(len(sentences))
            if sentenceToMaxURL[sen] == url
        ]
        urlScore[url] = sum(s) / len(s)

    for ind in index:
        formatted_tokens.append(
            (
                urlList[ind] + " --- Matching Score: " + str(urlScore[ind]),
                "[" + str(urlMap[ind]) + "]",
            )
        )
        formatted_tokens.append(("\n", None))

    print(f"Formatted Tokens: {formatted_tokens}")

    return formatted_tokens


"""
AI DETECTION SECTION
"""

text_bc_model_path = "polygraf-ai/ai-text-bc-bert-1-4m"
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
text_bc_model = AutoModelForSequenceClassification.from_pretrained(
    text_bc_model_path
)

text_mc_model_path = "polygraf-ai/ai-text-mc-v5-lighter-spec"
text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
text_mc_model = AutoModelForSequenceClassification.from_pretrained(
    text_mc_model_path
)


def remove_special_characters(text):
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return cleaned_text


def predict_bc(model, tokenizer, text):
    tokens = tokenizer(
        text, padding=True, truncation=True, return_tensors="pt"
    )["input_ids"]
    output = model(tokens)
    output_norm = softmax(output.logits.detach().numpy(), 1)[0]
    print("BC Score: ", output_norm)
    bc_score = {"AI": output_norm[1].item(), "HUMAN": output_norm[0].item()}
    return bc_score


def predict_mc(model, tokenizer, text):
    tokens = tokenizer(
        text, padding=True, truncation=True, return_tensors="pt"
    )["input_ids"]
    output = model(tokens)
    output_norm = softmax(output.logits.detach().numpy(), 1)[0]
    print("MC Score: ", output_norm)
    mc_score = {}
    label_map = ["GPT 3.5", "GPT 4", "CLAUDE", "BARD", "LLAMA 2"]
    for score, label in zip(output_norm, label_map):
        mc_score[label.upper()] = score.item()
    return mc_score


def ai_generated_test(input, models):
    cleaned_text = remove_special_characters(input)
    bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text)
    mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text)

    sum_prob = 1 - bc_score["HUMAN"]
    for key, value in mc_score.items():
        mc_score[key] = value * sum_prob

    return bc_score, mc_score


# COMBINED
def main(
    input,
    models,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
):
    bc_score, mc_score = ai_generated_test(input, models)
    formatted_tokens = plagiarism_check(
        input,
        year_from,
        month_from,
        day_from,
        year_to,
        month_to,
        day_to,
        domains_to_skip,
    )
    return (
        bc_score,
        mc_score,
        formatted_tokens,
    )


def build_date(year, month, day):
    return f"{year}{months[month]}{day}"


# START OF GRADIO

title = "Plagiarism Demo"
months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}


with gr.Blocks() as demo:
    today = date.today()
    # dd/mm/YY
    d1 = today.strftime("%d/%B/%Y")
    d1 = d1.split("/")

    model_list = ["GPT 3.5", "GPT 4", "CLAUDE", "BARD", "LLAMA2"]
    domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
    gr.Markdown(
        """
        # Plagiarism Detection Demo
        """
    )
    input_text = gr.Textbox(label="Input text", lines=5, placeholder="")

    with gr.Row():
        with gr.Column():
            only_ai_btn = gr.Button("AI Check")
        with gr.Column():
            only_plagiarism_btn = gr.Button("Plagiarism Check")
        with gr.Column():
            submit_btn = gr.Button("Full Check")
    gr.Markdown(
        """
        ## Output
        """
    )

    with gr.Row():
        models = gr.Dropdown(
            model_list,
            value=model_list,
            multiselect=True,
            label="Models to test against",
        )

    with gr.Row():
        with gr.Column():
            bcLabel = gr.Label(label="Source")
        with gr.Column():
            mcLabel = gr.Label(label="Creator")

    with gr.Group():
        with gr.Row():
            month_from = gr.Dropdown(
                choices=months,
                label="From Month",
                value="January",
                interactive=True,
            )
            day_from = gr.Textbox(label="From Day", value="01")
            year_from = gr.Textbox(label="From Year", value="2000")
            # from_date_button = gr.Button("Submit")
        with gr.Row():
            month_to = gr.Dropdown(
                choices=months,
                label="To Month",
                value=d1[1],
                interactive=True,
            )
            day_to = gr.Textbox(label="To Day", value=d1[0])
            year_to = gr.Textbox(label="To Year", value=d1[2])
            # to_date_button = gr.Button("Submit")
        with gr.Row():
            domains_to_skip = gr.Dropdown(
                domain_list,
                multiselect=True,
                label="Domain To Skip",
            )

    with gr.Row():
        with gr.Column():
            sentenceBreakdown = gr.HighlightedText(
                label="Plagiarism Sentence Breakdown",
                combine_adjacent=True,
                color_map={
                    "[1]": "red",
                    "[2]": "orange",
                    "[3]": "yellow",
                    "[4]": "green",
                },
            )

    submit_btn.click(
        fn=main,
        inputs=[
            input_text,
            models,
            year_from,
            month_from,
            day_from,
            year_to,
            month_to,
            day_to,
            domains_to_skip,
        ],
        outputs=[
            bcLabel,
            mcLabel,
            sentenceBreakdown,
        ],
        api_name="main",
    )

    only_ai_btn.click(
        fn=ai_generated_test,
        inputs=[input_text, models],
        outputs=[
            bcLabel,
            mcLabel,
        ],
        api_name="ai_check",
    )

    only_plagiarism_btn.click(
        fn=plagiarism_check,
        inputs=[
            input_text,
            year_from,
            month_from,
            day_from,
            year_to,
            month_to,
            day_to,
            domains_to_skip,
        ],
        outputs=[
            sentenceBreakdown,
        ],
        api_name="plagiarism_check",
    )

    date_from = ""
    date_to = ""

demo.launch()
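A minimal sketch of exercising the AI-detection path in app.py directly, without launching the Gradio UI. It assumes the module-level models above have loaded; the sample string is made up, and models is passed as None because ai_generated_test currently ignores that argument:

# hypothetical quick check, run after the tokenizers/models above are loaded
sample_text = "This is a short made-up paragraph used only for illustration."
bc, mc = ai_generated_test(sample_text, models=None)
print(bc)  # {"AI": ..., "HUMAN": ...} from the binary classifier
print(mc)  # per-model scores scaled by 1 - P(HUMAN)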
requirements.txt
ADDED
@@ -0,0 +1,19 @@
gradio
python-docx
google-api-python-client
nltk
BeautifulSoup4
scrapingbee
requests
numpy
torch==1.13.0
transformers==4.25.1
transformers-interpret
textstat
scipy
scikit-learn
joblib
evaluate
tensorflow
keras
spacy
utils.py
ADDED
@@ -0,0 +1,250 @@
from urllib.request import urlopen, Request
from googleapiclient.discovery import build
import requests
import httpx
import re
from bs4 import BeautifulSoup
import math
from collections import Counter
import numpy as np
import asyncio
import nltk

nltk.download('punkt')

WORD = re.compile(r"\w+")


# returns cosine similarity of two vectors
# input: two vectors
# output: float between 0 and 1
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())

    # calculating numerator
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    # calculating denominator
    sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
    sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    # checking for divide by zero
    if denominator == 0:
        return 0.0
    else:
        return float(numerator) / denominator


# converts given text into a vector
def text_to_vector(text):
    # uses the regular expression above and gets all words
    words = WORD.findall(text)
    # returns a counter of all the words (count of number of occurrences)
    return Counter(words)


# returns cosine similarity of two texts
# uses: text_to_vector(text) and get_cosine(v1, v2)
def cosineSim(text1, text2):
    vector1 = text_to_vector(text1)
    vector2 = text_to_vector(text2)
    # print vector1, vector2
    cosine = get_cosine(vector1, vector2)
    return cosine


def get_soup_requests(url):
    page = requests.get(url)
    if page.status_code == 200:
        soup = BeautifulSoup(page.content, "html.parser")
        return soup
    print("HTML soup failed")
    return None


def get_soup_httpx(url):
    client = httpx.Client(timeout=30)
    try:
        page = client.get(url)
        if page.status_code == httpx.codes.OK:
            soup = BeautifulSoup(page.content, "html.parser")
            return soup
    except Exception:
        print("HTTPx soup failed")
        return None


def getSentences(text):
    from nltk.tokenize import sent_tokenize

    sents = sent_tokenize(text)
    two_sents = []
    for i in range(len(sents)):
        if (i % 2) == 0:
            two_sents.append(sents[i])
        else:
            two_sents[len(two_sents) - 1] += " " + sents[i]
    return two_sents


def googleSearch(
    sentences,
    urlCount,
    scoreArray,
    urlList,
    sorted_date,
    domains_to_skip,
    api_key,
    cse_id,
    **kwargs,
):
    service = build("customsearch", "v1", developerKey=api_key)
    for i, sentence in enumerate(sentences):
        results = (
            service.cse()
            .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
            .execute()
        )
        if "items" in results and len(results["items"]) > 0:
            for count, link in enumerate(results["items"]):
                # stop after 5 pages
                if count > 4:
                    break
                # skip user-selected domains
                if any(
                    ("." + domain) in link["link"]
                    for domain in domains_to_skip
                ):
                    continue
                # clean up snippet of '...'
                snippet = link["snippet"]
                ind = snippet.find("...")
                if ind < 20 and ind > 9:
                    snippet = snippet[ind + len("... ") :]
                ind = snippet.find("...")
                if ind > len(snippet) - 5:
                    snippet = snippet[:ind]

                # update cosine similarity between snippet and given text
                url = link["link"]
                if url not in urlList:
                    urlList.append(url)
                    scoreArray.append([0] * len(sentences))
                urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
                scoreArray[urlList.index(url)][i] = cosineSim(
                    sentence, snippet
                )
        else:
            print("Google Search failed")
    return urlCount, scoreArray


def getQueries(text, n):
    # return n-grams of size n
    finalq = []
    words = text.split()
    l = len(words)

    for i in range(0, l - n + 1):
        finalq.append(words[i : i + n])

    return finalq


def print2D(array):
    print(np.array(array))


def removePunc(text):
    res = re.sub(r"[^\w\s]", "", text)
    return res


async def get_url_data(url, client):
    try:
        r = await client.get(url)
        # print(r.status_code)
        if r.status_code == 200:
            # print("in")
            soup = BeautifulSoup(r.content, "html.parser")
            return soup
    except Exception:
        print("HTTPx parallel soup failed")
    return None


async def parallel_scrap(urls):
    async with httpx.AsyncClient(timeout=30) as client:
        tasks = []
        for url in urls:
            tasks.append(get_url_data(url=url, client=client))
        results = await asyncio.gather(*tasks, return_exceptions=True)
    return results


def matchingScore(sentence, content):
    if sentence in content:
        return 1
    sentence = removePunc(sentence)
    content = removePunc(content)
    if sentence in content:
        return 1
    else:
        n = 5
        ngrams = getQueries(sentence, n)
        if len(ngrams) == 0:
            return 0
        matched = [x for x in ngrams if " ".join(x) in content]
        return len(matched) / len(ngrams)


async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
    content = removePunc(content)
    for j, sentence in enumerate(sentences):
        sentence = removePunc(sentence)
        if sentence in content:
            ScoreArray[content_idx][j] = 1
        else:
            n = 5
            ngrams = getQueries(sentence, n)
            if len(ngrams) == 0:
                # too short to form n-grams; score 0 and continue with the next sentence
                ScoreArray[content_idx][j] = 0
                continue
            matched = [x for x in ngrams if " ".join(x) in content]
            ScoreArray[content_idx][j] = len(matched) / len(ngrams)
    print(
        f"Analyzed {content_idx+1} of soups (SOUP SUCCEEDED)........................"
    )
    return ScoreArray


async def parallel_analyze(soups, sentences, ScoreArray):
    tasks = []
    for i, soup in enumerate(soups):
        if soup:
            page_content = soup.text
            tasks.append(
                matchingScoreAsync(sentences, page_content, i, ScoreArray)
            )
        else:
            print(
                f"Analyzed {i+1} of soups (SOUP FAILED)........................"
            )
    ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
    return ScoreArray


async def parallel_analyze_2(soups, sentences, ScoreArray):
    scores = [[0] * len(ScoreArray[0]) for i in range(len(ScoreArray))]
    for i, soup in enumerate(soups):
        if soup:
            page_content = soup.text
            for j, sent in enumerate(sentences):
                print(
                    f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
                )
                scores[i][j] = matchingScore(sent, page_content)
        else:
            print(
                f"Analyzed {i+1} of soups (SOUP FAILED)........................"
            )
    # scores are filled in synchronously above, so return them directly
    return scores
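A minimal sketch of how the scoring helpers in utils.py fit together, assuming the module has been imported; both strings below are made-up examples:

sentence = "the quick brown fox jumps over the lazy dog near the river"
page_text = "witnesses saw the quick brown fox jump over the lazy dog near the river"

print(cosineSim(sentence, page_text))      # bag-of-words cosine similarity in [0, 1]
print(matchingScore(sentence, page_text))  # 1 for a verbatim match, else the fraction of matching 5-grams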