aliasgerovs committed on
Commit 1be431a
1 Parent(s): 9d99259
Files changed (3)
  1. app.py +401 -0
  2. requirements.txt +19 -0
  3. utils.py +250 -0
app.py ADDED
@@ -0,0 +1,401 @@
+ from utils import cosineSim, googleSearch, getSentences, parallel_scrap, matchingScore
+ import gradio as gr
+ from urllib.request import urlopen, Request
+ from googleapiclient.discovery import build
+ import requests
+ import httpx
+ import re
+ from bs4 import BeautifulSoup
+ import numpy as np
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import asyncio
+ from scipy.special import softmax
+ from evaluate import load
+ from datetime import date
+ import nltk
+
+ np.set_printoptions(suppress=True)
+
+
+ def plagiarism_check(
+     input,
+     year_from,
+     month_from,
+     day_from,
+     year_to,
+     month_to,
+     day_to,
+     domains_to_skip,
+ ):
+     api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+     api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
+     api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
+     api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
+     # api_key = "AIzaSyBrx_pgb6A64wPFQXSGQRgGtukoxVV_0Fk"
+     cse_id = "851813e81162b4ed4"
+
+     sentences = getSentences(input)
+     urlCount = {}
+     ScoreArray = []
+     urlList = []
+
+     date_from = build_date(year_from, month_from, day_from)
+     date_to = build_date(year_to, month_to, day_to)
+     sort_date = f"date:r:{date_from}:{date_to}"
+     # get list of URLs to check
+     urlCount, ScoreArray = googleSearch(
+         sentences,
+         urlCount,
+         ScoreArray,
+         urlList,
+         sort_date,
+         domains_to_skip,
+         api_key,
+         cse_id,
+     )
+     print("Number of URLs: ", len(urlCount))
+     # print("Old Score Array:\n")
+     # print2D(ScoreArray)
+
+     # Scrape URLs in list
+     formatted_tokens = []
+     soups = asyncio.run(parallel_scrap(urlList))
+     print(len(soups))
+     print(
+         "Successful scraping: "
+         + str(len([x for x in soups if x is not None]))
+         + " out of "
+         + str(len(urlList))
+     )
+
+     # Populate matching scores for scraped pages
+     for i, soup in enumerate(soups):
+         print(f"Analyzing {i+1} of {len(soups)} soups........................")
+         if soup:
+             page_content = soup.text
+             for j, sent in enumerate(sentences):
+                 score = matchingScore(sent, page_content)
+                 ScoreArray[i][j] = score
+
+     # ScoreArray = asyncio.run(parallel_analyze_2(soups, sentences, ScoreArray))
+     # print("New Score Array:\n")
+     # print2D(ScoreArray)
+
+
+     # Gradio formatting section
+     sentencePlag = [False] * len(sentences)
+     sentenceToMaxURL = [-1] * len(sentences)
+     for j in range(len(sentences)):
+         if j > 0:
+             maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
+             sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
+         else:
+             maxScore = -1
+         for i in range(len(ScoreArray)):
+             margin = (
+                 0.1
+                 if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
+                 else 0
+             )
+             if ScoreArray[i][j] - maxScore > margin:
+                 maxScore = ScoreArray[i][j]
+                 sentenceToMaxURL[j] = i
+         if maxScore > 0.5:
+             sentencePlag[j] = True
+
+     if (
+         (len(sentences) > 1)
+         and (sentenceToMaxURL[1] != sentenceToMaxURL[0])
+         and (
+             ScoreArray[sentenceToMaxURL[0]][0]
+             - ScoreArray[sentenceToMaxURL[1]][0]
+             < 0.1
+         )
+     ):
+         sentenceToMaxURL[0] = sentenceToMaxURL[1]
+
+     index = np.unique(sentenceToMaxURL)
+
+     urlMap = {}
+     for count, i in enumerate(index):
+         urlMap[i] = count + 1
+     for i, sent in enumerate(sentences):
+         formatted_tokens.append(
+             (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
+         )
+
+     formatted_tokens.append(("\n", None))
+     formatted_tokens.append(("\n", None))
+     formatted_tokens.append(("\n", None))
+
+     urlScore = {}
+     for url in index:
+         s = [
+             ScoreArray[url][sen]
+             for sen in range(len(sentences))
+             if sentenceToMaxURL[sen] == url
+         ]
+         urlScore[url] = sum(s) / len(s)
+
+     for ind in index:
+         formatted_tokens.append(
+             (
+                 urlList[ind] + " --- Matching Score: " + str(urlScore[ind]),
+                 "[" + str(urlMap[ind]) + "]",
+             )
+         )
+         formatted_tokens.append(("\n", None))
+
+     print(f"Formatted Tokens: {formatted_tokens}")
+
+     return formatted_tokens
+
+
+ """
+ AI DETECTION SECTION
+ """
+
+ text_bc_model_path = "polygraf-ai/ai-text-bc-bert-1-4m"
+ text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
+ text_bc_model = AutoModelForSequenceClassification.from_pretrained(
+     text_bc_model_path
+ )
+
+ text_mc_model_path = "polygraf-ai/ai-text-mc-v5-lighter-spec"
+ text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
+ text_mc_model = AutoModelForSequenceClassification.from_pretrained(
+     text_mc_model_path
+ )
+
+
+ def remove_special_characters(text):
+     cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
+     return cleaned_text
+
+
+ def predict_bc(model, tokenizer, text):
+     tokens = tokenizer(
+         text, padding=True, truncation=True, return_tensors="pt"
+     )["input_ids"]
+     output = model(tokens)
+     output_norm = softmax(output.logits.detach().numpy(), 1)[0]
+     print("BC Score: ", output_norm)
+     bc_score = {"AI": output_norm[1].item(), "HUMAN": output_norm[0].item()}
+     return bc_score
+
+
+ def predict_mc(model, tokenizer, text):
+     tokens = tokenizer(
+         text, padding=True, truncation=True, return_tensors="pt"
+     )["input_ids"]
+     output = model(tokens)
+     output_norm = softmax(output.logits.detach().numpy(), 1)[0]
+     print("MC Score: ", output_norm)
+     mc_score = {}
+     label_map = ["GPT 3.5", "GPT 4", "CLAUDE", "BARD", "LLAMA 2"]
+     for score, label in zip(output_norm, label_map):
+         mc_score[label.upper()] = score.item()
+     return mc_score
+
+
+ def ai_generated_test(input, models):
+     cleaned_text = remove_special_characters(input)
+     bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text)
+     mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text)
+
+     sum_prob = 1 - bc_score["HUMAN"]
+     for key, value in mc_score.items():
+         mc_score[key] = value * sum_prob
+
+     return bc_score, mc_score
+
+
+ # COMBINED
+ def main(
+     input,
+     models,
+     year_from,
+     month_from,
+     day_from,
+     year_to,
+     month_to,
+     day_to,
+     domains_to_skip,
+ ):
+     bc_score, mc_score = ai_generated_test(input, models)
+     formatted_tokens = plagiarism_check(
+         input,
+         year_from,
+         month_from,
+         day_from,
+         year_to,
+         month_to,
+         day_to,
+         domains_to_skip,
+     )
+     return (
+         bc_score,
+         mc_score,
+         formatted_tokens,
+     )
+
+
+ def build_date(year, month, day):
+     return f"{year}{months[month]}{day}"
+
+
+ # START OF GRADIO
+
+ title = "Plagiarism Demo"
+ months = {
+     "January": "01",
+     "February": "02",
+     "March": "03",
+     "April": "04",
+     "May": "05",
+     "June": "06",
+     "July": "07",
+     "August": "08",
+     "September": "09",
+     "October": "10",
+     "November": "11",
+     "December": "12",
+ }
+
+
+ with gr.Blocks() as demo:
+     today = date.today()
+     # dd/mm/YY
+     d1 = today.strftime("%d/%B/%Y")
+     d1 = d1.split("/")
+
+     model_list = ["GPT 3.5", "GPT 4", "CLAUDE", "BARD", "LLAMA2"]
+     domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
+     gr.Markdown(
+         """
+         # Plagiarism Detection Demo
+         """
+     )
+     input_text = gr.Textbox(label="Input text", lines=5, placeholder="")
+
+     with gr.Row():
+         with gr.Column():
+             only_ai_btn = gr.Button("AI Check")
+         with gr.Column():
+             only_plagiarism_btn = gr.Button("Plagiarism Check")
+         with gr.Column():
+             submit_btn = gr.Button("Full Check")
+     gr.Markdown(
+         """
+         ## Output
+         """
+     )
+
+     with gr.Row():
+         models = gr.Dropdown(
+             model_list,
+             value=model_list,
+             multiselect=True,
+             label="Models to test against",
+         )
+
+     with gr.Row():
+         with gr.Column():
+             bcLabel = gr.Label(label="Source")
+         with gr.Column():
+             mcLabel = gr.Label(label="Creator")
+
+     with gr.Group():
+         with gr.Row():
+             month_from = gr.Dropdown(
+                 choices=months,
+                 label="From Month",
+                 value="January",
+                 interactive=True,
+             )
+             day_from = gr.Textbox(label="From Day", value="01")
+             year_from = gr.Textbox(label="From Year", value="2000")
+             # from_date_button = gr.Button("Submit")
+         with gr.Row():
+             month_to = gr.Dropdown(
+                 choices=months,
+                 label="To Month",
+                 value=d1[1],
+                 interactive=True,
+             )
+             day_to = gr.Textbox(label="To Day", value=d1[0])
+             year_to = gr.Textbox(label="To Year", value=d1[2])
+             # to_date_button = gr.Button("Submit")
+         with gr.Row():
+             domains_to_skip = gr.Dropdown(
+                 domain_list,
+                 multiselect=True,
+                 label="Domain To Skip",
+             )
+
+     with gr.Row():
+         with gr.Column():
+             sentenceBreakdown = gr.HighlightedText(
+                 label="Plagiarism Sentence Breakdown",
+                 combine_adjacent=True,
+                 color_map={
+                     "[1]": "red",
+                     "[2]": "orange",
+                     "[3]": "yellow",
+                     "[4]": "green",
+                 },
+             )
+
+     submit_btn.click(
+         fn=main,
+         inputs=[
+             input_text,
+             models,
+             year_from,
+             month_from,
+             day_from,
+             year_to,
+             month_to,
+             day_to,
+             domains_to_skip,
+         ],
+         outputs=[
+             bcLabel,
+             mcLabel,
+             sentenceBreakdown,
+         ],
+         api_name="main",
+     )
+
+     only_ai_btn.click(
+         fn=ai_generated_test,
+         inputs=[input_text, models],
+         outputs=[
+             bcLabel,
+             mcLabel,
+         ],
+         api_name="ai_check",
+     )
+
+     only_plagiarism_btn.click(
+         fn=plagiarism_check,
+         inputs=[
+             input_text,
+             year_from,
+             month_from,
+             day_from,
+             year_to,
+             month_to,
+             day_to,
+             domains_to_skip,
+         ],
+         outputs=[
+             sentenceBreakdown,
+         ],
+         api_name="plagiarism_check",
+     )
+
+     date_from = ""
+     date_to = ""
+
+ demo.launch()
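Note on the date filter: the `sort_date` string assembled in `plagiarism_check` is passed as the `sort` parameter of the Custom Search request, which treats `date:r:<start>:<end>` (with `YYYYMMDD` dates) as a date-range restrict. A minimal sketch of how `build_date` and the dropdown values combine, using hypothetical example values and a trimmed-down copy of the `months` mapping from app.py:

# Sketch only; mirrors build_date() from app.py with hypothetical inputs.
months = {"January": "01", "March": "03"}  # subset of the mapping defined above

def build_date(year, month, day):
    return f"{year}{months[month]}{day}"

date_from = build_date("2000", "January", "01")  # "20000101"
date_to = build_date("2024", "March", "15")      # "20240315"
sort_date = f"date:r:{date_from}:{date_to}"      # "date:r:20000101:20240315"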
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ gradio
+ python-docx
+ google-api-python-client
+ nltk
+ BeautifulSoup4
+ scrapingbee
+ requests
+ numpy
+ torch==1.13.0
+ transformers==4.25.1
+ transformers-interpret
+ textstat
+ scipy
+ scikit-learn
+ joblib
+ evaluate
+ tensorflow
+ keras
+ spacy
utils.py ADDED
@@ -0,0 +1,250 @@
+ from urllib.request import urlopen, Request
+ from googleapiclient.discovery import build
+ import requests
+ import httpx
+ import re
+ from bs4 import BeautifulSoup
+ import re, math
+ from collections import Counter
+ import numpy as np
+ import asyncio
+ import nltk
+
+ nltk.download('punkt')
+
+ WORD = re.compile(r"\w+")
+
+
+ # returns cosine similarity of two vectors
+ # input: two vectors
+ # output: float between 0 and 1
+ def get_cosine(vec1, vec2):
+     intersection = set(vec1.keys()) & set(vec2.keys())
+
+     # calculating numerator
+     numerator = sum([vec1[x] * vec2[x] for x in intersection])
+
+     # calculating denominator
+     sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
+     sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
+     denominator = math.sqrt(sum1) * math.sqrt(sum2)
+
+     # checking for divide by zero
+     if denominator == 0:
+         return 0.0
+     else:
+         return float(numerator) / denominator
+
+
+ # converts given text into a vector
+ def text_to_vector(text):
+     # uses the regular expression above and gets all words
+     words = WORD.findall(text)
+     # returns a Counter of all the words (count of number of occurrences)
+     return Counter(words)
+
+
+ # returns cosine similarity of two texts
+ # uses: text_to_vector(text) and get_cosine(v1, v2)
+ def cosineSim(text1, text2):
+     vector1 = text_to_vector(text1)
+     vector2 = text_to_vector(text2)
+     # print vector1,vector2
+     cosine = get_cosine(vector1, vector2)
+     return cosine
+
+ def get_soup_requests(url):
+     page = requests.get(url)
+     if page.status_code == 200:
+         soup = BeautifulSoup(page.content, "html.parser")
+         return soup
+     print("HTML soup failed")
+     return None
+
+
+ def get_soup_httpx(url):
+     client = httpx.Client(timeout=30)
+     try:
+         page = client.get(url)
+         if page.status_code == httpx.codes.OK:
+             soup = BeautifulSoup(page.content, "html.parser")
+             return soup
+     except Exception:
+         print("HTTPx soup failed")
+     return None
+
+ def getSentences(text):
+     from nltk.tokenize import sent_tokenize
+
+     sents = sent_tokenize(text)
+     two_sents = []
+     for i in range(len(sents)):
+         if (i % 2) == 0:
+             two_sents.append(sents[i])
+         else:
+             two_sents[len(two_sents) - 1] += " " + sents[i]
+     return two_sents
+
+
+ def googleSearch(
+     sentences,
+     urlCount,
+     scoreArray,
+     urlList,
+     sorted_date,
+     domains_to_skip,
+     api_key,
+     cse_id,
+     **kwargs,
+ ):
+     service = build("customsearch", "v1", developerKey=api_key)
+     for i, sentence in enumerate(sentences):
+         results = (
+             service.cse()
+             .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
+             .execute()
+         )
+         if "items" in results and len(results["items"]) > 0:
+             for count, link in enumerate(results["items"]):
+                 # stop after 5 pages
+                 if count > 4:
+                     break
+                 # skip user selected domains
+                 if any(
+                     ("." + domain) in link["link"]
+                     for domain in domains_to_skip
+                 ):
+                     continue
+                 # clean up snippet of '...'
+                 snippet = link["snippet"]
+                 ind = snippet.find("...")
+                 if ind < 20 and ind > 9:
+                     snippet = snippet[ind + len("... ") :]
+                 ind = snippet.find("...")
+                 if ind > len(snippet) - 5:
+                     snippet = snippet[:ind]
+
+                 # update cosine similarity between snippet and given text
+                 url = link["link"]
+                 if url not in urlList:
+                     urlList.append(url)
+                     scoreArray.append([0] * len(sentences))
+                 urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
+                 scoreArray[urlList.index(url)][i] = cosineSim(
+                     sentence, snippet
+                 )
+         else:
+             print("Google Search failed")
+     return urlCount, scoreArray
+
+
+ def getQueries(text, n):
+     # return n-grams of size n
+     finalq = []
+     words = text.split()
+     l = len(words)
+
+     for i in range(0, l - n + 1):
+         finalq.append(words[i : i + n])
+
+     return finalq
+
+
+ def print2D(array):
+     print(np.array(array))
+
+
+ def removePunc(text):
+     res = re.sub(r"[^\w\s]", "", text)
+     return res
+
+
+ async def get_url_data(url, client):
+     try:
+         r = await client.get(url)
+         # print(r.status_code)
+         if r.status_code == 200:
+             # print("in")
+             soup = BeautifulSoup(r.content, "html.parser")
+             return soup
+     except Exception:
+         print("HTTPx parallel soup failed")
+     return None
+
+
+ async def parallel_scrap(urls):
+     async with httpx.AsyncClient(timeout=30) as client:
+         tasks = []
+         for url in urls:
+             tasks.append(get_url_data(url=url, client=client))
+         results = await asyncio.gather(*tasks, return_exceptions=True)
+         return results
+
+
+ def matchingScore(sentence, content):
+     if sentence in content:
+         return 1
+     sentence = removePunc(sentence)
+     content = removePunc(content)
+     if sentence in content:
+         return 1
+     else:
+         n = 5
+         ngrams = getQueries(sentence, n)
+         if len(ngrams) == 0:
+             return 0
+         matched = [x for x in ngrams if " ".join(x) in content]
+         return len(matched) / len(ngrams)
+
+
+ async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
+     content = removePunc(content)
+     for j, sentence in enumerate(sentences):
+         sentence = removePunc(sentence)
+         if sentence in content:
+             ScoreArray[content_idx][j] = 1
+         else:
+             n = 5
+             ngrams = getQueries(sentence, n)
+             if len(ngrams) == 0:
+                 # no n-grams to compare; score 0 instead of aborting the whole loop
+                 ScoreArray[content_idx][j] = 0
+                 continue
+             matched = [x for x in ngrams if " ".join(x) in content]
+             ScoreArray[content_idx][j] = len(matched) / len(ngrams)
+     print(
+         f"Analyzed {content_idx+1} of soups (SOUP SUCCEEDED)........................"
+     )
+     return ScoreArray
+
+
+ async def parallel_analyze(soups, sentences, ScoreArray):
+     tasks = []
+     for i, soup in enumerate(soups):
+         if soup:
+             page_content = soup.text
+             tasks.append(
+                 matchingScoreAsync(sentences, page_content, i, ScoreArray)
+             )
+         else:
+             print(
+                 f"Analyzed {i+1} of soups (SOUP FAILED)........................"
+             )
+     ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
+     return ScoreArray
+
+
+ async def parallel_analyze_2(soups, sentences, ScoreArray):
+     tasks = [[0] * len(ScoreArray[0]) for i in range(len(ScoreArray))]
+     for i, soup in enumerate(soups):
+         if soup:
+             page_content = soup.text
+             for j, sent in enumerate(sentences):
+                 print(
+                     f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
+                 )
+                 tasks[i][j] = matchingScore(sent, page_content)
+         else:
+             print(
+                 f"Analyzed {i+1} of soups (SOUP FAILED)........................"
+             )
+     # matchingScore is synchronous, so `tasks` already holds the computed scores
+     return tasks
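As a quick sanity check of the scoring helpers above, a minimal sketch with hypothetical inputs (importing utils also runs nltk.download('punkt') once):

# Toy illustration of cosineSim and matchingScore from utils.py; inputs are made up.
from utils import cosineSim, matchingScore

sentence = "the quick brown fox jumps over the lazy dog"
page = "An old page that contains the quick brown fox jumps over the lazy dog verbatim."

print(cosineSim(sentence, sentence))  # identical texts -> ~1.0
print(matchingScore(sentence, page))  # exact substring match -> 1
print(matchingScore("entirely different words appear in this sentence", page))  # no shared 5-grams -> 0.0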