Spaces:
Sleeping
Sleeping
aliasgerovs
committed on
Commit
·
2019311
1
Parent(s):
41a6a33
Added modalities to the input.
Browse files- app.py +18 -4
- audio.py +25 -0
- nohup.out +0 -0
- requirements.txt +3 -1
- utils.py +58 -3
app.py
CHANGED
@@ -7,9 +7,11 @@ from analysis import depth_analysis
|
|
7 |
from predictors import predict_quillbot
|
8 |
from plagiarism import plagiarism_check, build_date, html_highlight
|
9 |
from highlighter import segmented_higlighter
|
10 |
-
from utils import extract_text_from_pdf, len_validator
|
11 |
import yaml
|
12 |
from functools import partial
|
|
|
|
|
13 |
|
14 |
|
15 |
np.set_printoptions(suppress=True)
|
@@ -115,13 +117,25 @@ with gr.Blocks() as demo:
|
|
115 |
fn=extract_text_from_pdf, inputs=file_input, outputs=input_text
|
116 |
)
|
117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
char_count = gr.Textbox(label="Minumum Character Limit Check")
|
119 |
input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
|
120 |
|
121 |
with gr.Row():
|
122 |
-
btn = gr.Button("
|
123 |
-
out = gr.Textbox(label="
|
124 |
-
corrections_output = gr.Textbox(label="
|
125 |
btn.click(fn=update_main, inputs=input_text, outputs=[out, corrections_output])
|
126 |
|
127 |
with gr.Row():
|
|
|
7 |
from predictors import predict_quillbot
|
8 |
from plagiarism import plagiarism_check, build_date, html_highlight
|
9 |
from highlighter import segmented_higlighter
|
10 |
+
from utils import extract_text_from_pdf, len_validator, extract_text_from_html
|
11 |
import yaml
|
12 |
from functools import partial
|
13 |
+
from audio import assemblyai_transcribe
|
14 |
+
import yt_dlp
|
15 |
|
16 |
|
17 |
np.set_printoptions(suppress=True)
|
|
|
117 |
fn=extract_text_from_pdf, inputs=file_input, outputs=input_text
|
118 |
)
|
119 |
|
120 |
+
|
121 |
+
with gr.Row():
|
122 |
+
url_input = gr.Textbox(
|
123 |
+
label="Input Page URL to check", lines=1, placeholder="")
|
124 |
+
url_input.change(
|
125 |
+
fn=extract_text_from_html, inputs=url_input, outputs=input_text)
|
126 |
+
|
127 |
+
audio_url_input = gr.Textbox(label="Input YouTube URL to check", lines=1, placeholder="")
|
128 |
+
audio_url_input.change(
|
129 |
+
fn=assemblyai_transcribe, inputs=audio_url_input, outputs=input_text
|
130 |
+
)
|
131 |
+
|
132 |
char_count = gr.Textbox(label="Minumum Character Limit Check")
|
133 |
input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
|
134 |
|
135 |
with gr.Row():
|
136 |
+
btn = gr.Button("Deception Filter")
|
137 |
+
out = gr.Textbox(label="Corrected Full Input", interactive=False)
|
138 |
+
corrections_output = gr.Textbox(label="Corrections", interactive=False)
|
139 |
btn.click(fn=update_main, inputs=input_text, outputs=[out, corrections_output])
|
140 |
|
141 |
with gr.Row():
|
audio.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import json
|
3 |
+
import time
|
4 |
+
import yaml
|
5 |
+
import yt_dlp
|
6 |
+
import assemblyai as aai
|
7 |
+
|
8 |
+
# Load runtime configuration; the AssemblyAI token is read from config.yaml.
with open("config.yaml", "r") as file:
    params = yaml.safe_load(file)

# Configure the API key BEFORE building the transcriber. The AssemblyAI SDK
# documentation sets `aai.settings.api_key` first and only then constructs
# `aai.Transcriber()`; constructing it earlier presumably relies on the key
# already being available some other way (e.g. an environment variable).
aai.settings.api_key = params["ASSEMBLY_AI_TOKEN"]
transcriber = aai.Transcriber()
|
13 |
+
|
14 |
+
def assemblyai_transcribe(audio_url):
    """Transcribe the audio track behind a video URL and return its text.

    The URL is resolved with yt_dlp in metadata-only mode (nothing is
    downloaded), the direct URL of an audio-only m4a stream is selected, and
    that stream URL is handed to the module-level AssemblyAI ``transcriber``.

    Args:
        audio_url: URL of the page to transcribe. ``None``, an empty string
            or a whitespace-only string is treated as "no input".

    Returns:
        ``transcript.text`` as produced by the transcriber, or "" when there
        is no input or no audio-only m4a stream is available for the URL.
    """
    # A cleared textbox delivers "" rather than None, so guard both.
    if audio_url is None or not audio_url.strip():
        return ""

    with yt_dlp.YoutubeDL() as ydl:
        info = ydl.extract_info(audio_url, download=False)

    # Scan the formats back to front (same order as before) and take the
    # first audio-only m4a entry. `.get` avoids a KeyError on entries that
    # lack a field, and the None default avoids using an unbound variable
    # when nothing matches.
    formats = (info or {}).get("formats") or []
    stream_url = next(
        (
            fmt["url"]
            for fmt in reversed(formats)
            if fmt.get("resolution") == "audio only"
            and fmt.get("ext") == "m4a"
            and "url" in fmt
        ),
        None,
    )
    if stream_url is None:
        return ""

    transcript = transcriber.transcribe(stream_url)
    return transcript.text
|
nohup.out
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
CHANGED
@@ -32,4 +32,6 @@ optimum[onnxruntime]
|
|
32 |
emoji==1.6.1
|
33 |
matplotlib
|
34 |
tf-keras
|
35 |
-
seaborn
|
|
|
|
|
|
32 |
emoji==1.6.1
|
33 |
matplotlib
|
34 |
tf-keras
|
35 |
+
seaborn
|
36 |
+
yt-dlp
|
37 |
+
assemblyai
|
utils.py
CHANGED
@@ -50,7 +50,6 @@ with open("config.yaml", "r") as file:
|
|
50 |
params = yaml.safe_load(file)
|
51 |
|
52 |
text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
|
53 |
-
|
54 |
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
|
55 |
|
56 |
|
@@ -71,5 +70,61 @@ def extract_text_from_pdf(pdf_path):
|
|
71 |
return text
|
72 |
|
73 |
|
74 |
-
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
params = yaml.safe_load(file)
|
51 |
|
52 |
text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
|
|
|
53 |
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
|
54 |
|
55 |
|
|
|
70 |
return text
|
71 |
|
72 |
|
73 |
+
def format_headings(text):
    """Put runs of ALL-CAPS words on their own line.

    The text is split on single spaces. Consecutive tokens for which
    ``str.isupper()`` is true are collected into a run. When the run ends it
    is emitted as follows:

    * runs longer than 10 characters (counting the trailing space that is
      kept after every collected word) are wrapped in newlines;
    * if such a run ends in a standalone "A", that "A" is moved to the line
      after the heading, since it is most likely the first word of the
      following sentence rather than part of the heading;
    * shorter runs are emitted unchanged (whitespace-stripped).

    A run that reaches the end of the text is emitted as well; previously it
    was silently dropped, losing those words.

    Args:
        text: The text to reformat.

    Returns:
        The reformatted text, with pieces re-joined by single spaces.
    """

    def _render(run):
        # `run` always ends with the space appended after its last word.
        if len(run) <= 10:
            return run.strip()
        if run.strip().endswith(" A"):
            # Split off the final "A " so it starts the next line.
            rendered = "\n" + run[:-2] + "\n" + run[-2:]
        else:
            rendered = "\n" + run + "\n"
        return rendered.strip(" ")

    pieces = []
    heading = ""
    for word in text.split(" "):
        if word and word.isupper():
            heading += word + " "
            continue
        if heading:
            pieces.append(_render(heading))
            heading = ""
        pieces.append(word.strip())

    # Flush a heading that runs up to the end of the text.
    if heading:
        pieces.append(_render(heading))

    return " ".join(pieces)
|
96 |
+
|
97 |
+
|
98 |
+
def format_live_site(text):
    """Re-insert line breaks and spacing into text scraped from a web page.

    Steps, in order:
      1. break the line wherever a lowercase letter is directly followed by
         an uppercase letter;
      2. break before figures such as ``1.5M`` / ``2.3K`` that are glued to
         a preceding lowercase letter, and add a space after the figure;
      3. move ALL-CAPS headings onto their own line via ``format_headings``;
      4. add a space after ``: ; , ! ?`` when a non-space character follows.

    Args:
        text: Raw text extracted from the page.

    Returns:
        The reformatted text.
    """
    # (pattern, replacement) pairs applied one after another before the
    # heading pass.
    line_break_rules = (
        (r"([a-z])([A-Z])", r"\1\n\2"),
        (r"([a-z])(\d+\.\d+[MK])", r"\1\n\2 "),
    )
    cleaned = text
    for pattern, replacement in line_break_rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    cleaned = format_headings(cleaned)

    # Punctuation spacing runs last so it also applies to the heading output.
    return re.sub(r"([:;,!?])(\S)", r"\1 \2", cleaned)
|
110 |
+
|
111 |
+
|
112 |
+
def extract_text_from_html(url):
    """Download a web page and return its visible text, tidied for analysis.

    The page is fetched with ``requests`` and parsed with BeautifulSoup's
    ``html.parser``. ``<style>``, ``<script>``, ``<code>`` and ``<a>``
    elements are removed together with their content, the remaining strings
    are joined with single spaces, and the result is passed through
    ``format_live_site``.

    Args:
        url: Address of the page. ``None``, an empty string or a
            whitespace-only string is treated as "no input".

    Returns:
        The extracted text; "" when there is no input; or the message
        "Unable to extract URL" when the request fails, the response status
        is not 200, or the page cannot be parsed.
    """
    # A cleared textbox delivers "": return nothing instead of an error text.
    if url is None or not url.strip():
        return ""

    try:
        # Bounded wait so an unresponsive server cannot hang the callback.
        r = requests.get(url, timeout=10)
        if r.status_code != 200:
            # Previously fell through with `soup` unbound and crashed below.
            return "Unable to extract URL"
        soup = BeautifulSoup(r.content, "html.parser")
    except Exception:
        # Deliberate best-effort boundary: any fetch/parse failure is
        # reported to the caller as a message rather than raised.
        return "Unable to extract URL"

    def remove_tags(soup):
        # Drop elements whose content is not readable page text.
        for data in soup(["style", "script", "code", "a"]):
            data.decompose()
        # Join what is left, with surrounding whitespace stripped per string.
        return " ".join(soup.stripped_strings)

    text = remove_tags(soup)
    text = format_live_site(text)
    return text
|