Spaces:

polygraf-ai
/

copyright_checker

Sleeping

App Files Files Community

aliasgerovs committed on Jun 5, 2024

Commit

2019311

1 Parent(s): 41a6a33

Added modalities to the input.

Browse files

Files changed (5) hide show

app.py +18 -4
audio.py +25 -0
nohup.out +0 -0
requirements.txt +3 -1
utils.py +58 -3

app.py CHANGED Viewed

@@ -7,9 +7,11 @@ from analysis import depth_analysis
 from predictors import predict_quillbot
 from plagiarism import plagiarism_check, build_date, html_highlight
 from highlighter import segmented_higlighter
-from utils import extract_text_from_pdf, len_validator
 import yaml
 from functools import partial
 np.set_printoptions(suppress=True)
@@ -115,13 +117,25 @@ with gr.Blocks() as demo:
             fn=extract_text_from_pdf, inputs=file_input, outputs=input_text
         )
     char_count = gr.Textbox(label="Minumum Character Limit Check")
     input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
     with gr.Row():
-        btn = gr.Button("Bias Buster")
-        out = gr.Textbox(label="Bias Corrected Full Input", interactive=False)
-        corrections_output = gr.Textbox(label="Bias Corrections", interactive=False)
         btn.click(fn=update_main, inputs=input_text, outputs=[out, corrections_output])
     with gr.Row():

 from predictors import predict_quillbot
 from plagiarism import plagiarism_check, build_date, html_highlight
 from highlighter import segmented_higlighter
+from utils import extract_text_from_pdf, len_validator, extract_text_from_html
 import yaml
 from functools import partial
+from audio import assemblyai_transcribe
+import yt_dlp
 np.set_printoptions(suppress=True)
             fn=extract_text_from_pdf, inputs=file_input, outputs=input_text
         )
+    with gr.Row():
+        url_input = gr.Textbox(
+            label="Input Page URL to check", lines=1, placeholder="")
+        url_input.change(
+            fn=extract_text_from_html, inputs=url_input, outputs=input_text)
+        audio_url_input = gr.Textbox(label="Input YouTube URL to check", lines=1, placeholder="")
+        audio_url_input.change(
+            fn=assemblyai_transcribe, inputs=audio_url_input, outputs=input_text
+        )
     char_count = gr.Textbox(label="Minumum Character Limit Check")
     input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
     with gr.Row():
+        btn = gr.Button("Deception Filter")
+        out = gr.Textbox(label="Corrected Full Input", interactive=False)
+        corrections_output = gr.Textbox(label="Corrections", interactive=False)
         btn.click(fn=update_main, inputs=input_text, outputs=[out, corrections_output])
     with gr.Row():

audio.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import requests
+import json
+import time
+import yaml
+import yt_dlp
+import assemblyai as aai
+with open("config.yaml", "r") as file:
+    params = yaml.safe_load(file)
+transcriber = aai.Transcriber()
+aai.settings.api_key = params["ASSEMBLY_AI_TOKEN"]
+def assemblyai_transcribe(audio_url):
+    if audio_url is None:
+        return ""
+    with yt_dlp.YoutubeDL() as ydl:
+        info = ydl.extract_info(audio_url, download=False)
+    for format in info["formats"][::-1]:
+        if format["resolution"] == "audio only" and format["ext"] == "m4a":
+            url = format["url"]
+            break
+    transcript = transcriber.transcribe(url)
+    return transcript.text

nohup.out CHANGED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt CHANGED Viewed

@@ -32,4 +32,6 @@ optimum[onnxruntime]
 emoji==1.6.1
 matplotlib
 tf-keras
-seaborn

 emoji==1.6.1
 matplotlib
 tf-keras
+seaborn
+yt-dlp
+assemblyai

utils.py CHANGED Viewed

@@ -50,7 +50,6 @@ with open("config.yaml", "r") as file:
     params = yaml.safe_load(file)
 text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
 text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
@@ -71,5 +70,61 @@ def extract_text_from_pdf(pdf_path):
     return text
-WORD = re.compile(r"\w+")
-model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

     params = yaml.safe_load(file)
 text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
 text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
     return text
+def format_headings(text):
+    lines = text.split(" ")
+    formatted_lines = []
+    heading = ""
+    for line in lines:
+        if line and line.isupper():
+            heading += line + " "
+        else:
+            if heading != "" and len(heading) > 10:
+                formatted = (
+                    "\n"
+                    + heading[: len(heading) - 2]
+                    + "\n"
+                    + heading[len(heading) - 2 :]
+                    if heading.strip().endswith(" A")
+                    else "\n" + heading + "\n"
+                )
+                formatted_lines.append(formatted.strip(" "))
+            elif heading != "":
+                formatted_lines.append(heading.strip())
+            formatted_lines.append(line.strip())
+            heading = ""
+    return " ".join(formatted_lines)
+def format_live_site(text):
+    # insert a newline between lowercase and uppercase letters
+    formatted_text = re.sub(r"([a-z])([A-Z])", r"\1\n\2", text)
+    # format the "What's included" items
+    formatted_text = re.sub(
+        r"([a-z])(\d+\.\d+[MK])", r"\1\n\2 ", formatted_text
+    )
+    # place headings in all caps on their own line
+    formatted_text = format_headings(formatted_text)
+    # ddd a space after ':', ';', ',', '!', '?' if they are followed by a character
+    formatted_text = re.sub(r"([:;,!?])(\S)", r"\1 \2", formatted_text)
+    return formatted_text
+def extract_text_from_html(url):
+    try:
+        r = requests.get(url)
+        if r.status_code == 200:
+            soup = BeautifulSoup(r.content, "html.parser")
+    except Exception:
+        return "Unable to extract URL"
+    def remove_tags(soup):
+        # parse html content
+        for data in soup(["style", "script", "code", "a"]):
+            # Remove tags
+            data.decompose()
+        # return data by retrieving the tag content
+        return " ".join(soup.stripped_strings)
+    text = remove_tags(soup)
+    text = format_live_site(text)
+    return text