minko186 committed
Commit ca02509 · 1 Parent(s): 3460455

clean html text and increase char limit on content

Files changed (3):
  1. app.py +2 -2
  2. plagiarism.py +22 -4
  3. requirements.txt +3 -1
app.py CHANGED
@@ -220,7 +220,7 @@ def ai_check(text: str, option: str):
 
 def generate_prompt(settings: Dict[str, str]) -> str:
     content_string = "\n".join(
-        f"{url.strip()}: \n{content.strip()[:500]}" for url, content in settings["sources"].items()
+        f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in settings["sources"].items()
     )
 
     prompt = f"""
@@ -256,7 +256,7 @@ def generate_prompt(settings: Dict[str, str]) -> str:
 
 def regenerate_prompt(settings: Dict[str, str]) -> str:
     content_string = "\n".join(
-        f"{url.strip()}: \n{content.strip()[:500]}" for url, content in settings["sources"].items()
+        f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in settings["sources"].items()
     )
 
     prompt = f"""
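
For context, a minimal sketch (not part of the commit) of what the raised per-source cap means when content_string is built. The settings dict below is hypothetical, assuming settings["sources"] maps each source URL to its scraped text:

# Hypothetical sources dict; in the app this is filled in by the scraper.
settings = {
    "sources": {
        "https://example.com/article": "word " * 1000,   # ~5000 characters of scraped text
        "https://example.org/post": "A short snippet.",  # well under the cap
    }
}

content_string = "\n".join(
    f"{url.strip()}: \n{content.strip()[:2000]}"  # cap raised from 500 to 2000 chars per source
    for url, content in settings["sources"].items()
)

# The long source is truncated to 2000 characters; the short one is kept whole.
print(len(content_string))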
plagiarism.py CHANGED
@@ -3,6 +3,21 @@ from googleapiclient.discovery import build
 import asyncio
 import httpx
 from bs4 import BeautifulSoup
+import justext
+import newspaper
+
+
+def clean_html(text):
+    result = ""
+    article = newspaper.Article(url=" ")
+    article.set_html(text)
+    article.parse()
+    result += article.title + "\n"
+    paragraphs = justext.justext(text, justext.get_stoplist("English"))
+    for paragraph in paragraphs:
+        if not paragraph.is_boilerplate:
+            result += paragraph.text
+    return result
 
 
 months = {
@@ -55,13 +70,10 @@ def google_search_urls(
     **kwargs,
 ):
     service = build("customsearch", "v1", developerKey=api_key)
-    num_pages = 3
     results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
     url_list = []
     if "items" in results and len(results["items"]) > 0:
         for count, link in enumerate(results["items"]):
-            if count >= num_pages:
-                break
             # skip user selected domains
             if (domains_to_skip is not None) and any(("." + domain) in link["link"] for domain in domains_to_skip):
                 continue
@@ -100,9 +112,15 @@ def google_search(
     soups = asyncio.run(parallel_scrap(url_list))
     print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
     result_content = {}
+    num_pages = 3
+    count = 0
     for url, soup in zip(url_list, soups):
+        if count >= num_pages:
+            break
         if soup:
-            result_content[url] = soup.text
+            text = clean_html(soup.text)
+            result_content[url] = text
+            count += 1
     # for key, value in result_content.items():
     #     print("-------------------URL: ", key)
     #     print(value[:30])
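
A rough, self-contained usage sketch of the new clean_html helper, assuming it is fed raw HTML fetched with httpx (the URL is a placeholder and error handling is omitted); it combines the newspaper3k title extraction with jusText boilerplate removal, as in the commit:

import httpx
import justext
import newspaper


def clean_html(text):
    # Extract the page title with newspaper3k.
    result = ""
    article = newspaper.Article(url=" ")
    article.set_html(text)
    article.parse()
    result += article.title + "\n"
    # Keep only non-boilerplate paragraphs according to jusText.
    paragraphs = justext.justext(text, justext.get_stoplist("English"))
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            result += paragraph.text
    return result


html = httpx.get("https://example.com/some-article").text  # placeholder URL
print(clean_html(html)[:500])  # title followed by boilerplate-free body text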
requirements.txt CHANGED
@@ -10,4 +10,6 @@ language_tool_python
 scipy
 Unidecode
 BeautifulSoup4
-google-api-python-client
+google-api-python-client
+newspaper3k
+jusText