Spaces:
Runtime error
Runtime error
thealphamerc
committed on
Commit
•
47d9326
1
Parent(s):
7a97be1
Remove unnecessary files
Browse files- .gitignore +0 -0
- app.py +118 -83
- data/audio.wav +0 -0
- data/audio2.mp3 +0 -0
- flagged/Audio file/0.wav +0 -0
- flagged/log.csv +0 -2
- script.py +87 -0
- trans.py +0 -122
.gitignore
ADDED
File without changes
|
app.py
CHANGED
@@ -1,87 +1,122 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
# Required third party packages: whisper
|
5 |
-
# See instructions for setup here: https://github.com/openai/whisper#setup
|
6 |
-
# - You can use the below command to pull the repo and install dependencies, then just put this script in the repo directory:
|
7 |
-
# pip install git+https://github.com/openai/whisper.git
|
8 |
-
|
9 |
-
import whisper
|
10 |
-
import io
|
11 |
-
import time
|
12 |
import os
|
13 |
-
import
|
14 |
-
import
|
15 |
-
|
16 |
-
# Choose model to use by uncommenting
|
17 |
-
# modelName = "tiny.en"
|
18 |
-
modelName = "base.en"
|
19 |
-
# modelName = "small.en"
|
20 |
-
# modelName = "medium.en"
|
21 |
-
# modelName = "large-v2"
|
22 |
-
|
23 |
-
# Other Variables
|
24 |
-
# (bool) Whether to export the segment data to a json file. Will include word level timestamps if word_timestamps is True.
|
25 |
-
exportTimestampData = True
|
26 |
-
outputFolder = "Output"
|
27 |
-
|
28 |
-
# ----- Select variables for transcribe method -----
|
29 |
-
# audio: path to audio file
|
30 |
-
verbose = True # (bool): Whether to display the text being decoded to the console. If True, displays all the details, If False, displays minimal details. If None, does not display anything
|
31 |
-
language = "english" # Language of audio file
|
32 |
-
# (bool): Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.
|
33 |
-
word_timestamps = False
|
34 |
-
# initial_prompt="" # (optional str): Optional text to provide as a prompt for the first window. This can be used to provide, or "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns to make it more likely to predict those word correctly.
|
35 |
-
|
36 |
-
# -------------------------------------------------------------------------
|
37 |
-
print(f"Using Model: {modelName}")
|
38 |
-
filePath = input("Path to File Being Transcribed: ")
|
39 |
-
filePath = filePath.strip("\"")
|
40 |
-
if not os.path.exists(filePath):
|
41 |
-
print("Problem Getting File...")
|
42 |
-
input("Press Enter to Exit...")
|
43 |
-
exit()
|
44 |
-
|
45 |
-
# If output folder does not exist, create it
|
46 |
-
if not os.path.exists(outputFolder):
|
47 |
-
os.makedirs(outputFolder)
|
48 |
-
print("Created Output Folder.\n")
|
49 |
-
|
50 |
-
# Get filename stem using pathlib (filename without extension)
|
51 |
-
fileNameStem = pathlib.Path(filePath).stem
|
52 |
-
|
53 |
-
resultFileName = f"{fileNameStem}.txt"
|
54 |
-
jsonFileName = f"{fileNameStem}.json"
|
55 |
-
|
56 |
-
model = whisper.load_model(modelName)
|
57 |
-
start = time.time()
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
# ---------------------------------------------------
|
63 |
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from subprocess import call
|
3 |
+
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
import os
|
5 |
+
# from transformers.pipelines.audio_utils import ffmpeg_read
|
6 |
+
import whisper
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
+
|
9 |
+
# Application logger: INFO and above, formatted as
# "<timestamp>;<level>;<message>" and emitted through one stream handler.
formatter = logging.Formatter(
    fmt="%(asctime)s;%(levelname)s;%(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)

logger = logging.getLogger("whisper-jax-app")
logger.setLevel(logging.INFO)
logger.addHandler(ch)


# Tuning constants. In the code visible in this file only FILE_LIMIT_MB is
# read (by transcribe, as the upload size cap in megabytes); the others are
# defined but not referenced here.
BATCH_SIZE = 16
CHUNK_LENGTH_S = 30
NUM_PROC = 8
FILE_LIMIT_MB = 1000
YT_ATTEMPT_LIMIT = 3
|
24 |
+
|
25 |
+
|
26 |
+
def run_cmd(command):
    """Echo ``command`` to stdout and run it with ``subprocess.call``.

    Args:
        command: Passed unchanged to ``call``; ``inference`` supplies an
            argument list such as ``['tts', '--text', text]``.

    Returns:
        None. The return code of the command is discarded.

    Raises:
        SystemExit: With status 1 when a ``KeyboardInterrupt`` arrives while
            the command is being started or run.
    """
    # Imported here because the module never imports ``sys``; without it the
    # interrupt handler below failed with NameError instead of exiting.
    import sys

    try:
        print(command)
        call(command)
    except KeyboardInterrupt:
        print("Process interrupted")
        sys.exit(1)
|
33 |
+
|
34 |
+
|
35 |
+
def inference(text):
    """Run the ``tts`` command-line tool on ``text``.

    The command ``tts --text <text>`` is built as an argument list and handed
    to ``run_cmd``.

    Args:
        text: Text forwarded as the value of the ``--text`` option.

    Returns:
        The literal path ``'tts_output.wav'``. It is returned without
        checking that the command succeeded or that the file exists.
    """
    run_cmd(['tts', '--text', text])
    return 'tts_output.wav'
|
39 |
+
|
40 |
+
|
41 |
+
# Whisper "base" checkpoint, loaded once at import time and used by
# transcribe() for every request.
model = whisper.load_model("base")

# Components and page text for the file-upload interface. The audio
# component hands the handler a file path (type="filepath").
inputs = gr.components.Audio(type="filepath", label="Add audio file")
outputs = gr.components.Textbox()
title = "Audio To text⚡️"
# The app transcribes audio into text; the earlier wording described
# text-to-speech, which is not what the interfaces below do.
description = "An example of using Whisper to transcribe speech from an audio file into text."
article = ""
# Only referenced by the commented-out gr.Interface call at the end of the file.
examples = [
    [""]
]
|
51 |
+
|
52 |
+
|
53 |
+
def transcribe(inputs):
    """Transcribe the audio file at path ``inputs`` and return its text.

    Args:
        inputs: Path of the audio file, or ``None`` when nothing was
            submitted.

    Returns:
        The ``"text"`` entry of the result produced by ``model.transcribe``,
        or a user-facing message when no file was submitted or the file is
        larger than ``FILE_LIMIT_MB`` megabytes.
    """
    print('Inputs: ', inputs)

    # Guard: nothing was uploaded or recorded.
    if inputs is None:
        logger.warning("No audio file")
        return "No audio file submitted! Please upload an audio file before submitting your request."

    # Guard: file size in megabytes (bytes / 1024**2) against the module cap.
    size_mb = os.path.getsize(inputs) / (1024 * 1024)
    if size_mb > FILE_LIMIT_MB:
        logger.warning("Max file size exceeded")
        return f"File size exceeds file size limit. Got file of size {size_mb:.2f}MB for a limit of {FILE_LIMIT_MB}MB."

    # The file path goes straight to the model. The language is fixed to
    # 'hindi', word timestamps are off and verbose decoding output is on.
    transcription = model.transcribe(
        audio=inputs,
        language='hindi',
        word_timestamps=False,
        verbose=True,
    )

    text = transcription["text"]
    print(text)
    return text
|
75 |
+
|
76 |
+
|
77 |
+
# Keyword arguments common to both interfaces.
_shared_interface_options = dict(
    allow_flagging="never",
    title=title,
    description=description,
    article=article,
)

# Tab 1: transcribe an uploaded audio file using the shared components.
audio_chunked = gr.Interface(
    fn=transcribe,
    inputs=inputs,
    outputs=outputs,
    **_shared_interface_options,
)

# Tab 2: transcribe a microphone recording.
# NOTE(review): this tab builds its components from gr.inputs / gr.outputs
# while the upload tab uses gr.components - confirm both namespaces exist in
# the Gradio version this Space installs.
_microphone_input = gr.inputs.Audio(
    source="microphone", optional=True, type="filepath")
_transcription_output = gr.outputs.Textbox(label="Transcription").style(
    show_copy_button=True)
microphone_chunked = gr.Interface(
    fn=transcribe,
    inputs=[_microphone_input],
    outputs=[_transcription_output],
    **_shared_interface_options,
)

# Present the two interfaces as tabs of a single Blocks app.
demo = gr.Blocks()
with demo:
    gr.TabbedInterface(
        [audio_chunked, microphone_chunked],
        ["Audio File", "Microphone"],
    )

# Queue settings as given: concurrency_count=1, max_size=5.
demo.queue(concurrency_count=1, max_size=5)
demo.launch(show_api=False)
|
109 |
+
|
110 |
+
|
111 |
+
# gr.Interface(
|
112 |
+
# inference,
|
113 |
+
# inputs,
|
114 |
+
# outputs,
|
115 |
+
# verbose=True,
|
116 |
+
# title=title,
|
117 |
+
# description=description,
|
118 |
+
# article=article,
|
119 |
+
# examples=examples,
|
120 |
+
# enable_queue=True,
|
121 |
+
|
122 |
+
# ).launch(share=True, debug=True)
|
data/audio.wav
DELETED
Binary file (172 kB)
|
|
data/audio2.mp3
DELETED
Binary file (35.4 kB)
|
|
flagged/Audio file/0.wav
DELETED
Binary file (693 kB)
|
|
flagged/log.csv
DELETED
@@ -1,2 +0,0 @@
|
|
1 |
-
Audio file,Transcription,timestamp
|
2 |
-
Audio file/0.wav,No audio file submitted! Please upload an audio file before submitting your request.,2023-04-26 23:19:33.132801
|
|
|
|
|
|
script.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Basic script for using the OpenAI Whisper model to transcribe a video file. You can uncomment whichever model you want to use.
|
2 |
+
# Author: ThioJoe ( https://github.com/ThioJoe )
|
3 |
+
|
4 |
+
# Required third party packages: whisper
|
5 |
+
# See instructions for setup here: https://github.com/openai/whisper#setup
|
6 |
+
# - You can use the below command to pull the repo and install dependencies, then just put this script in the repo directory:
|
7 |
+
# pip install git+https://github.com/openai/whisper.git
|
8 |
+
|
9 |
+
import whisper
|
10 |
+
import io
|
11 |
+
import time
|
12 |
+
import os
|
13 |
+
import json
|
14 |
+
import pathlib
|
15 |
+
|
16 |
+
# ----- Model selection: leave exactly one assignment uncommented -----
# modelName = "tiny.en"
modelName = "base.en"
# modelName = "small.en"
# modelName = "medium.en"
# modelName = "large-v2"

# ----- Output settings -----
# (bool) When True, the segment data is also exported to a JSON file. It
# includes word-level timestamps if word_timestamps is True.
exportTimestampData = True
outputFolder = "Output"

# ----- Arguments forwarded to model.transcribe -----
# verbose (bool or None): True displays all details while decoding, False
# displays minimal details, None displays nothing.
verbose = True
# language: language of the audio file.
language = "english"
# word_timestamps (bool): extract word-level timestamps using the
# cross-attention pattern and dynamic time warping, and include them for each
# word in each segment.
word_timestamps = False
# initial_prompt = ""  # (optional str) prompt text for the first window, e.g.
# custom vocabulary or proper nouns, to make those words more likely to be
# predicted correctly.

# -------------------------------------------------------------------------
print(f"Using Model: {modelName}")

# Ask for the input path and drop double-quote characters from both ends.
filePath = input("Path to File Being Transcribed: ").strip("\"")
if not os.path.exists(filePath):
    print("Problem Getting File...")
    input("Press Enter to Exit...")
    exit()

# Create the output folder if it does not exist yet.
if not os.path.exists(outputFolder):
    os.makedirs(outputFolder)
    print("Created Output Folder.\n")

# Output files are named after the input file's stem (name without extension).
fileNameStem = pathlib.Path(filePath).stem
resultFileName = f"{fileNameStem}.txt"
jsonFileName = f"{fileNameStem}.json"

model = whisper.load_model(modelName)

# The timer brackets only the transcribe call; model loading is excluded.
start = time.time()
result = model.transcribe(
    audio=filePath,
    language=language,
    word_timestamps=word_timestamps,
    verbose=verbose,
)
end = time.time()
elapsed = float(end - start)

# Save the transcription text to a file.
print("\nWriting transcription to file...")
textOutputPath = os.path.join(outputFolder, resultFileName)
with open(textOutputPath, "w", encoding="utf-8") as file:
    file.write(result["text"])
print("Finished writing transcription file.")

# Save the segments data to a JSON file.
if exportTimestampData == True:
    print("\nWriting segment data to file...")
    jsonOutputPath = os.path.join(outputFolder, jsonFileName)
    with open(jsonOutputPath, "w", encoding="utf-8") as file:
        segmentsData = result["segments"]
        json.dump(segmentsData, file, indent=4)
    print("Finished writing segment data file.")

# Report the elapsed transcription time in minutes, rounded to two decimals.
elapsedMinutes = f"{round(elapsed / 60, 2)}"
print(f"\nElapsed Time With {modelName} Model: {elapsedMinutes} Minutes")

input("Press Enter to exit...")
exit()
|
trans.py
DELETED
@@ -1,122 +0,0 @@
|
|
1 |
-
import logging
|
2 |
-
from subprocess import call
|
3 |
-
import gradio as gr
|
4 |
-
import os
|
5 |
-
# from transformers.pipelines.audio_utils import ffmpeg_read
|
6 |
-
import whisper
|
7 |
-
|
8 |
-
|
9 |
-
logger = logging.getLogger("whisper-jax-app")
|
10 |
-
logger.setLevel(logging.INFO)
|
11 |
-
ch = logging.StreamHandler()
|
12 |
-
ch.setLevel(logging.INFO)
|
13 |
-
formatter = logging.Formatter(
|
14 |
-
"%(asctime)s;%(levelname)s;%(message)s", "%Y-%m-%d %H:%M:%S")
|
15 |
-
ch.setFormatter(formatter)
|
16 |
-
logger.addHandler(ch)
|
17 |
-
|
18 |
-
|
19 |
-
BATCH_SIZE = 16
|
20 |
-
CHUNK_LENGTH_S = 30
|
21 |
-
NUM_PROC = 8
|
22 |
-
FILE_LIMIT_MB = 1000
|
23 |
-
YT_ATTEMPT_LIMIT = 3
|
24 |
-
|
25 |
-
|
26 |
-
def run_cmd(command):
|
27 |
-
try:
|
28 |
-
print(command)
|
29 |
-
call(command)
|
30 |
-
except KeyboardInterrupt:
|
31 |
-
print("Process interrupted")
|
32 |
-
sys.exit(1)
|
33 |
-
|
34 |
-
|
35 |
-
def inference(text):
|
36 |
-
cmd = ['tts', '--text', text]
|
37 |
-
run_cmd(cmd)
|
38 |
-
return 'tts_output.wav'
|
39 |
-
|
40 |
-
|
41 |
-
model = whisper.load_model("base")
|
42 |
-
|
43 |
-
inputs = gr.components.Audio(type="filepath", label="Add audio file")
|
44 |
-
outputs = gr.components.Textbox()
|
45 |
-
title = "Audio To text⚡️"
|
46 |
-
description = "An example of using TTS to generate speech from text."
|
47 |
-
article = ""
|
48 |
-
examples = [
|
49 |
-
[""]
|
50 |
-
]
|
51 |
-
|
52 |
-
|
53 |
-
def transcribe(inputs):
|
54 |
-
print('Inputs: ', inputs)
|
55 |
-
# print('Text: ', text)
|
56 |
-
# progress(0, desc="Loading audio file...")
|
57 |
-
if inputs is None:
|
58 |
-
logger.warning("No audio file")
|
59 |
-
return "No audio file submitted! Please upload an audio file before submitting your request."
|
60 |
-
file_size_mb = os.stat(inputs).st_size / (1024 * 1024)
|
61 |
-
if file_size_mb > FILE_LIMIT_MB:
|
62 |
-
logger.warning("Max file size exceeded")
|
63 |
-
return f"File size exceeds file size limit. Got file of size {file_size_mb:.2f}MB for a limit of {FILE_LIMIT_MB}MB."
|
64 |
-
|
65 |
-
# with open(inputs, "rb") as f:
|
66 |
-
# inputs = f.read()
|
67 |
-
|
68 |
-
# load audio and pad/trim it to fit 30 seconds
|
69 |
-
result = model.transcribe(audio=inputs, language='hindi',
|
70 |
-
word_timestamps=False, verbose=True)
|
71 |
-
# ---------------------------------------------------
|
72 |
-
|
73 |
-
print(result["text"])
|
74 |
-
return result["text"]
|
75 |
-
|
76 |
-
|
77 |
-
audio_chunked = gr.Interface(
|
78 |
-
fn=transcribe,
|
79 |
-
inputs=inputs,
|
80 |
-
outputs=outputs,
|
81 |
-
allow_flagging="never",
|
82 |
-
title=title,
|
83 |
-
description=description,
|
84 |
-
article=article,
|
85 |
-
)
|
86 |
-
|
87 |
-
microphone_chunked = gr.Interface(
|
88 |
-
fn=transcribe,
|
89 |
-
inputs=[
|
90 |
-
gr.inputs.Audio(source="microphone",
|
91 |
-
optional=True, type="filepath"),
|
92 |
-
],
|
93 |
-
outputs=[
|
94 |
-
gr.outputs.Textbox(label="Transcription").style(
|
95 |
-
show_copy_button=True),
|
96 |
-
],
|
97 |
-
allow_flagging="never",
|
98 |
-
title=title,
|
99 |
-
description=description,
|
100 |
-
article=article,
|
101 |
-
)
|
102 |
-
|
103 |
-
demo = gr.Blocks()
|
104 |
-
with demo:
|
105 |
-
gr.TabbedInterface([audio_chunked, microphone_chunked], [
|
106 |
-
"Audio File", "Microphone"])
|
107 |
-
demo.queue(concurrency_count=1, max_size=5)
|
108 |
-
demo.launch(show_api=False)
|
109 |
-
|
110 |
-
|
111 |
-
# gr.Interface(
|
112 |
-
# inference,
|
113 |
-
# inputs,
|
114 |
-
# outputs,
|
115 |
-
# verbose=True,
|
116 |
-
# title=title,
|
117 |
-
# description=description,
|
118 |
-
# article=article,
|
119 |
-
# examples=examples,
|
120 |
-
# enable_queue=True,
|
121 |
-
|
122 |
-
# ).launch(share=True, debug=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|