aliasgerovs committed
Commit 2019311 · 1 Parent(s): 41a6a33

Added modalities to the input.

Files changed (5)
  1. app.py +18 -4
  2. audio.py +25 -0
  3. nohup.out +0 -0
  4. requirements.txt +3 -1
  5. utils.py +58 -3
app.py CHANGED

@@ -7,9 +7,11 @@ from analysis import depth_analysis
 from predictors import predict_quillbot
 from plagiarism import plagiarism_check, build_date, html_highlight
 from highlighter import segmented_higlighter
-from utils import extract_text_from_pdf, len_validator
+from utils import extract_text_from_pdf, len_validator, extract_text_from_html
 import yaml
 from functools import partial
+from audio import assemblyai_transcribe
+import yt_dlp
 
 
 np.set_printoptions(suppress=True)

@@ -115,13 +117,25 @@ with gr.Blocks() as demo:
         fn=extract_text_from_pdf, inputs=file_input, outputs=input_text
     )
 
+
+    with gr.Row():
+        url_input = gr.Textbox(
+            label="Input Page URL to check", lines=1, placeholder="")
+        url_input.change(
+            fn=extract_text_from_html, inputs=url_input, outputs=input_text)
+
+        audio_url_input = gr.Textbox(label="Input YouTube URL to check", lines=1, placeholder="")
+        audio_url_input.change(
+            fn=assemblyai_transcribe, inputs=audio_url_input, outputs=input_text
+        )
+
     char_count = gr.Textbox(label="Minumum Character Limit Check")
     input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
 
     with gr.Row():
-        btn = gr.Button("Bias Buster")
-        out = gr.Textbox(label="Bias Corrected Full Input", interactive=False)
-        corrections_output = gr.Textbox(label="Bias Corrections", interactive=False)
+        btn = gr.Button("Deception Filter")
+        out = gr.Textbox(label="Corrected Full Input", interactive=False)
+        corrections_output = gr.Textbox(label="Corrections", interactive=False)
     btn.click(fn=update_main, inputs=input_text, outputs=[out, corrections_output])
 
     with gr.Row():
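
Reviewer note: Gradio's `change` event fires on every edit to a textbox, so a YouTube URL wired through `audio_url_input.change` can kick off yt-dlp metadata fetches and AssemblyAI calls while the address is still being typed. A minimal alternative sketch, not part of this commit, that runs only when the user presses Enter (Gradio's `submit` event):

    audio_url_input = gr.Textbox(label="Input YouTube URL to check", lines=1)
    # submit fires once, on Enter, rather than on every keystroke like change
    audio_url_input.submit(
        fn=assemblyai_transcribe, inputs=audio_url_input, outputs=input_text
    )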
audio.py ADDED

@@ -0,0 +1,25 @@
+import requests
+import json
+import time
+import yaml
+import yt_dlp
+import assemblyai as aai
+
+with open("config.yaml", "r") as file:
+    params = yaml.safe_load(file)
+
+transcriber = aai.Transcriber()
+aai.settings.api_key = params["ASSEMBLY_AI_TOKEN"]
+
+def assemblyai_transcribe(audio_url):
+    if audio_url is None:
+        return ""
+    with yt_dlp.YoutubeDL() as ydl:
+        info = ydl.extract_info(audio_url, download=False)
+
+    for format in info["formats"][::-1]:
+        if format["resolution"] == "audio only" and format["ext"] == "m4a":
+            url = format["url"]
+            break
+    transcript = transcriber.transcribe(url)
+    return transcript.text
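
Reviewer note: if the yt-dlp metadata contains no audio-only m4a format, the `for` loop above finishes without ever binding `url`, and `transcriber.transcribe(url)` raises `NameError`; the `requests`, `json`, and `time` imports are also unused. A defensive sketch of the same lookup, not part of this commit, using `for`/`else` to report the miss:

    def assemblyai_transcribe(audio_url):
        if not audio_url:
            return ""
        with yt_dlp.YoutubeDL() as ydl:
            # fetch stream metadata only; nothing is downloaded
            info = ydl.extract_info(audio_url, download=False)
        for fmt in reversed(info["formats"]):
            if fmt.get("resolution") == "audio only" and fmt.get("ext") == "m4a":
                stream_url = fmt["url"]
                break
        else:
            # no matching stream: fail readably instead of with NameError
            return "No m4a audio-only stream found for this URL"
        return transcriber.transcribe(stream_url).text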
nohup.out CHANGED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED

@@ -32,4 +32,6 @@ optimum[onnxruntime]
 emoji==1.6.1
 matplotlib
 tf-keras
-seaborn
+seaborn
+yt-dlp
+assemblyai
utils.py CHANGED

@@ -50,7 +50,6 @@ with open("config.yaml", "r") as file:
     params = yaml.safe_load(file)
 
 text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
-
 text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
 
 
@@ -71,5 +70,61 @@ def extract_text_from_pdf(pdf_path):
     return text
 
 
-WORD = re.compile(r"\w+")
-model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+def format_headings(text):
+    lines = text.split(" ")
+    formatted_lines = []
+    heading = ""
+    for line in lines:
+        if line and line.isupper():
+            heading += line + " "
+        else:
+            if heading != "" and len(heading) > 10:
+                formatted = (
+                    "\n"
+                    + heading[: len(heading) - 2]
+                    + "\n"
+                    + heading[len(heading) - 2 :]
+                    if heading.strip().endswith(" A")
+                    else "\n" + heading + "\n"
+                )
+                formatted_lines.append(formatted.strip(" "))
+            elif heading != "":
+                formatted_lines.append(heading.strip())
+            formatted_lines.append(line.strip())
+            heading = ""
+    return " ".join(formatted_lines)
+
+
+def format_live_site(text):
+    # insert a newline between lowercase and uppercase letters
+    formatted_text = re.sub(r"([a-z])([A-Z])", r"\1\n\2", text)
+    # format the "What's included" items
+    formatted_text = re.sub(
+        r"([a-z])(\d+\.\d+[MK])", r"\1\n\2 ", formatted_text
+    )
+    # place headings in all caps on their own line
+    formatted_text = format_headings(formatted_text)
+    # add a space after ':', ';', ',', '!', '?' if they are followed by a character
+    formatted_text = re.sub(r"([:;,!?])(\S)", r"\1 \2", formatted_text)
+    return formatted_text
+
+
+def extract_text_from_html(url):
+    try:
+        r = requests.get(url)
+        if r.status_code == 200:
+            soup = BeautifulSoup(r.content, "html.parser")
+    except Exception:
+        return "Unable to extract URL"
+
+    def remove_tags(soup):
+        # parse html content
+        for data in soup(["style", "script", "code", "a"]):
+            # remove tags
+            data.decompose()
+        # return data by retrieving the tag content
+        return " ".join(soup.stripped_strings)
+
+    text = remove_tags(soup)
+    text = format_live_site(text)
+    return text
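
Reviewer note: in `extract_text_from_html`, a response with a non-200 status code leaves `soup` unbound, so the later `remove_tags(soup)` call raises `NameError` instead of returning the error string. A sketch of the same extractor with that path closed, not part of this commit (the `timeout` value is an added safeguard, and `requests`/`BeautifulSoup` are assumed to be imported earlier in utils.py):

    def extract_text_from_html(url):
        try:
            r = requests.get(url, timeout=10)  # timeout keeps a dead host from hanging the UI
            r.raise_for_status()  # treat non-200 responses as failures too
            soup = BeautifulSoup(r.content, "html.parser")
        except Exception:
            return "Unable to extract URL"
        # drop style/script/code/anchor tags, then join the visible strings
        for tag in soup(["style", "script", "code", "a"]):
            tag.decompose()
        return format_live_site(" ".join(soup.stripped_strings))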