minko186 committed
Commit ca02509 · 1 Parent(s): 3460455

clean html text and increase char limit on content

Files changed (3):
  1. app.py +2 -2
  2. plagiarism.py +22 -4
  3. requirements.txt +3 -1
app.py CHANGED
@@ -220,7 +220,7 @@ def ai_check(text: str, option: str):
 
 def generate_prompt(settings: Dict[str, str]) -> str:
     content_string = "\n".join(
-        f"{url.strip()}: \n{content.strip()[:500]}" for url, content in settings["sources"].items()
+        f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in settings["sources"].items()
     )
 
     prompt = f"""
@@ -256,7 +256,7 @@ def generate_prompt(settings: Dict[str, str]) -> str:
 
 def regenerate_prompt(settings: Dict[str, str]) -> str:
     content_string = "\n".join(
-        f"{url.strip()}: \n{content.strip()[:500]}" for url, content in settings["sources"].items()
+        f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in settings["sources"].items()
     )
 
     prompt = f"""
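
For context, a minimal sketch (not part of the commit) of what the raised per-source cap means when content_string is built. The settings dict below is hypothetical, assuming settings["sources"] maps each source URL to its scraped text:

# Hypothetical sources dict; in the app this is filled in by the scraper.
settings = {
    "sources": {
        "https://example.com/article": "word " * 1000,   # ~5000 characters of scraped text
        "https://example.org/post": "A short snippet.",  # well under the cap
    }
}

content_string = "\n".join(
    f"{url.strip()}: \n{content.strip()[:2000]}"  # cap raised from 500 to 2000 chars per source
    for url, content in settings["sources"].items()
)

# The long source is truncated to 2000 characters; the short one is kept whole.
print(len(content_string))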
plagiarism.py CHANGED
@@ -3,6 +3,21 @@ from googleapiclient.discovery import build
 import asyncio
 import httpx
 from bs4 import BeautifulSoup
+import justext
+import newspaper
+
+
+def clean_html(text):
+    result = ""
+    article = newspaper.Article(url=" ")
+    article.set_html(text)
+    article.parse()
+    result += article.title + "\n"
+    paragraphs = justext.justext(text, justext.get_stoplist("English"))
+    for paragraph in paragraphs:
+        if not paragraph.is_boilerplate:
+            result += paragraph.text
+    return result
 
 
 months = {
@@ -55,13 +70,10 @@ def google_search_urls(
     **kwargs,
 ):
     service = build("customsearch", "v1", developerKey=api_key)
-    num_pages = 3
     results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
     url_list = []
     if "items" in results and len(results["items"]) > 0:
         for count, link in enumerate(results["items"]):
-            if count >= num_pages:
-                break
             # skip user selected domains
             if (domains_to_skip is not None) and any(("." + domain) in link["link"] for domain in domains_to_skip):
                 continue
@@ -100,9 +112,15 @@ def google_search(
     soups = asyncio.run(parallel_scrap(url_list))
     print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
     result_content = {}
+    num_pages = 3
+    count = 0
     for url, soup in zip(url_list, soups):
+        if count >= num_pages:
+            break
         if soup:
-            result_content[url] = soup.text
+            text = clean_html(soup.text)
+            result_content[url] = text
+            count += 1
     # for key, value in result_content.items():
     #     print("-------------------URL: ", key)
     #     print(value[:30])
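
A rough, self-contained usage sketch of the new clean_html helper, assuming it is fed raw HTML fetched with httpx (the URL is a placeholder and error handling is omitted); it combines the newspaper3k title extraction with jusText boilerplate removal, as in the commit:

import httpx
import justext
import newspaper


def clean_html(text):
    # Extract the page title with newspaper3k.
    result = ""
    article = newspaper.Article(url=" ")
    article.set_html(text)
    article.parse()
    result += article.title + "\n"
    # Keep only non-boilerplate paragraphs according to jusText.
    paragraphs = justext.justext(text, justext.get_stoplist("English"))
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            result += paragraph.text
    return result


html = httpx.get("https://example.com/some-article").text  # placeholder URL
print(clean_html(html)[:500])  # title followed by boilerplate-free body text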
requirements.txt CHANGED
@@ -10,4 +10,6 @@ language_tool_python
 scipy
 Unidecode
 BeautifulSoup4
-google-api-python-client
+google-api-python-client
+newspaper3k
+jusText