thealphamerc commited on
Commit
7a97be1
0 Parent(s):

Initial commit 🎉

Browse files
Output/audio.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Quatlin, quatlin quatlin quatlin quatlin. Anti-six.
Output/audio2.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ to gain life in all that...
Output/audio3.json ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": 0,
4
+ "seek": 0,
5
+ "start": 0.0,
6
+ "end": 1.52,
7
+ "text": " Come and sit on a rock.",
8
+ "tokens": [
9
+ 50363,
10
+ 7911,
11
+ 290,
12
+ 1650,
13
+ 319,
14
+ 257,
15
+ 3881,
16
+ 13,
17
+ 50439
18
+ ],
19
+ "temperature": 0.0,
20
+ "avg_logprob": -0.34572365704704733,
21
+ "compression_ratio": 1.356164383561644,
22
+ "no_speech_prob": 0.01958448439836502
23
+ },
24
+ {
25
+ "id": 1,
26
+ "seek": 0,
27
+ "start": 1.52,
28
+ "end": 5.08,
29
+ "text": " Overlooking the river's blow, he wears a hat and some glasses.",
30
+ "tokens": [
31
+ 50439,
32
+ 3827,
33
+ 11534,
34
+ 262,
35
+ 7850,
36
+ 338,
37
+ 6611,
38
+ 11,
39
+ 339,
40
+ 17326,
41
+ 257,
42
+ 6877,
43
+ 290,
44
+ 617,
45
+ 15232,
46
+ 13,
47
+ 50617
48
+ ],
49
+ "temperature": 0.0,
50
+ "avg_logprob": -0.34572365704704733,
51
+ "compression_ratio": 1.356164383561644,
52
+ "no_speech_prob": 0.01958448439836502
53
+ },
54
+ {
55
+ "id": 2,
56
+ "seek": 0,
57
+ "start": 5.08,
58
+ "end": 7.36,
59
+ "text": " A smile on his face.",
60
+ "tokens": [
61
+ 50617,
62
+ 317,
63
+ 8212,
64
+ 319,
65
+ 465,
66
+ 1986,
67
+ 13,
68
+ 50731
69
+ ],
70
+ "temperature": 0.0,
71
+ "avg_logprob": -0.34572365704704733,
72
+ "compression_ratio": 1.356164383561644,
73
+ "no_speech_prob": 0.01958448439836502
74
+ },
75
+ {
76
+ "id": 3,
77
+ "seek": 0,
78
+ "start": 7.36,
79
+ "end": 8.56,
80
+ "text": " He's not lost.",
81
+ "tokens": [
82
+ 50731,
83
+ 679,
84
+ 338,
85
+ 407,
86
+ 2626,
87
+ 13,
88
+ 50791
89
+ ],
90
+ "temperature": 0.0,
91
+ "avg_logprob": -0.34572365704704733,
92
+ "compression_ratio": 1.356164383561644,
93
+ "no_speech_prob": 0.01958448439836502
94
+ },
95
+ {
96
+ "id": 4,
97
+ "seek": 0,
98
+ "start": 8.56,
99
+ "end": 10.4,
100
+ "text": " The water rushes by.",
101
+ "tokens": [
102
+ 50791,
103
+ 383,
104
+ 1660,
105
+ 38596,
106
+ 416,
107
+ 13,
108
+ 50883
109
+ ],
110
+ "temperature": 0.0,
111
+ "avg_logprob": -0.34572365704704733,
112
+ "compression_ratio": 1.356164383561644,
113
+ "no_speech_prob": 0.01958448439836502
114
+ },
115
+ {
116
+ "id": 5,
117
+ "seek": 0,
118
+ "start": 10.4,
119
+ "end": 12.08,
120
+ "text": " A constant sound.",
121
+ "tokens": [
122
+ 50883,
123
+ 317,
124
+ 6937,
125
+ 2128,
126
+ 13,
127
+ 50967
128
+ ],
129
+ "temperature": 0.0,
130
+ "avg_logprob": -0.34572365704704733,
131
+ "compression_ratio": 1.356164383561644,
132
+ "no_speech_prob": 0.01958448439836502
133
+ },
134
+ {
135
+ "id": 6,
136
+ "seek": 0,
137
+ "start": 12.08,
138
+ "end": 13.68,
139
+ "text": " It takes in the view.",
140
+ "tokens": [
141
+ 50967,
142
+ 632,
143
+ 2753,
144
+ 287,
145
+ 262,
146
+ 1570,
147
+ 13,
148
+ 51047
149
+ ],
150
+ "temperature": 0.0,
151
+ "avg_logprob": -0.34572365704704733,
152
+ "compression_ratio": 1.356164383561644,
153
+ "no_speech_prob": 0.01958448439836502
154
+ },
155
+ {
156
+ "id": 7,
157
+ "seek": 0,
158
+ "start": 13.68,
159
+ "end": 14.48,
160
+ "text": " The mountains.",
161
+ "tokens": [
162
+ 51047,
163
+ 383,
164
+ 12269,
165
+ 13,
166
+ 51087
167
+ ],
168
+ "temperature": 0.0,
169
+ "avg_logprob": -0.34572365704704733,
170
+ "compression_ratio": 1.356164383561644,
171
+ "no_speech_prob": 0.01958448439836502
172
+ }
173
+ ]
Output/audio3.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Come and sit on a rock. Overlooking the river's blow, he wears a hat and some glasses. A smile on his face. He's not lost. The water rushes by. A constant sound. It takes in the view. The mountains.
app.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Basic script for using the OpenAI Whisper model to transcribe a video file. You can uncomment whichever model you want to use.
2
+ # Author: ThioJoe ( https://github.com/ThioJoe )
3
+
4
+ # Required third party packages: whisper
5
+ # See instructions for setup here: https://github.com/openai/whisper#setup
6
+ # - You can use the below command to pull the repo and install dependencies, then just put this script in the repo directory:
7
+ # pip install git+https://github.com/openai/whisper.git
8
+
9
+ import whisper
10
+ import io
11
+ import time
12
+ import os
13
+ import json
14
+ import pathlib
15
+
16
+ # Choose model to use by uncommenting
17
+ # modelName = "tiny.en"
18
+ modelName = "base.en"
19
+ # modelName = "small.en"
20
+ # modelName = "medium.en"
21
+ # modelName = "large-v2"
22
+
23
+ # Other Variables
24
+ # (bool) Whether to export the segment data to a json file. Will include word level timestamps if word_timestamps is True.
25
+ exportTimestampData = True
26
+ outputFolder = "Output"
27
+
28
+ # ----- Select variables for transcribe method -----
29
+ # audio: path to audio file
30
+ verbose = True # (bool): Whether to display the text being decoded to the console. If True, displays all the details, If False, displays minimal details. If None, does not display anything
31
+ language = "english" # Language of audio file
32
+ # (bool): Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.
33
+ word_timestamps = False
34
+ # initial_prompt="" # (optional str): Optional text to provide as a prompt for the first window. This can be used to provide, or "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns to make it more likely to predict those word correctly.
35
+
36
+ # -------------------------------------------------------------------------
37
+ print(f"Using Model: {modelName}")
38
+ filePath = input("Path to File Being Transcribed: ")
39
+ filePath = filePath.strip("\"")
40
+ if not os.path.exists(filePath):
41
+ print("Problem Getting File...")
42
+ input("Press Enter to Exit...")
43
+ exit()
44
+
45
+ # If output folder does not exist, create it
46
+ if not os.path.exists(outputFolder):
47
+ os.makedirs(outputFolder)
48
+ print("Created Output Folder.\n")
49
+
50
+ # Get filename stem using pathlib (filename without extension)
51
+ fileNameStem = pathlib.Path(filePath).stem
52
+
53
+ resultFileName = f"{fileNameStem}.txt"
54
+ jsonFileName = f"{fileNameStem}.json"
55
+
56
+ model = whisper.load_model(modelName)
57
+ start = time.time()
58
+
59
+ # ---------------------------------------------------
60
+ result = model.transcribe(audio=filePath, language=language,
61
+ word_timestamps=word_timestamps, verbose=verbose)
62
+ # ---------------------------------------------------
63
+
64
+ end = time.time()
65
+ elapsed = float(end - start)
66
+
67
+ # Save transcription text to file
68
+ print("\nWriting transcription to file...")
69
+ with open(os.path.join(outputFolder, resultFileName), "w", encoding="utf-8") as file:
70
+ file.write(result["text"])
71
+ print("Finished writing transcription file.")
72
+
73
+ # Sav
74
+ # e the segments data to json file
75
+ # if word_timestamps == True:
76
+ if exportTimestampData == True:
77
+ print("\nWriting segment data to file...")
78
+ with open(os.path.join(outputFolder, jsonFileName), "w", encoding="utf-8") as file:
79
+ segmentsData = result["segments"]
80
+ json.dump(segmentsData, file, indent=4)
81
+ print("Finished writing segment data file.")
82
+
83
+ elapsedMinutes = str(round(elapsed/60, 2))
84
+ print(f"\nElapsed Time With {modelName} Model: {elapsedMinutes} Minutes")
85
+
86
+ input("Press Enter to exit...")
87
+ exit()
data/audio.wav ADDED
Binary file (172 kB). View file
 
data/audio2.mp3 ADDED
Binary file (35.4 kB). View file
 
data/audio3.wav ADDED
Binary file (695 kB). View file
 
flagged/Audio file/0.wav ADDED
Binary file (693 kB). View file
 
flagged/log.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Audio file,Transcription,timestamp
2
+ Audio file/0.wav,No audio file submitted! Please upload an audio file before submitting your request.,2023-04-26 23:19:33.132801
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
1
+ openai-whisper
2
+ gradio==3.27.0
trans.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
+ import os
+ import sys
+ from subprocess import call
+
+ import gradio as gr
+ # from transformers.pipelines.audio_utils import ffmpeg_read
+ import whisper
7
+
8
+
9
+ logger = logging.getLogger("whisper-jax-app")
10
+ logger.setLevel(logging.INFO)
11
+ ch = logging.StreamHandler()
12
+ ch.setLevel(logging.INFO)
13
+ formatter = logging.Formatter(
14
+ "%(asctime)s;%(levelname)s;%(message)s", "%Y-%m-%d %H:%M:%S")
15
+ ch.setFormatter(formatter)
16
+ logger.addHandler(ch)
17
+
18
+
19
+ BATCH_SIZE = 16
20
+ CHUNK_LENGTH_S = 30
21
+ NUM_PROC = 8
22
+ FILE_LIMIT_MB = 1000
23
+ YT_ATTEMPT_LIMIT = 3
24
+
25
+
26
+ def run_cmd(command):
27
+ try:
28
+ print(command)
29
+ call(command)
30
+ except KeyboardInterrupt:
31
+ print("Process interrupted")
32
+ sys.exit(1)
33
+
34
+
35
+ def inference(text):
36
+ cmd = ['tts', '--text', text]
37
+ run_cmd(cmd)
38
+ return 'tts_output.wav'
39
+
40
+
41
+ model = whisper.load_model("base")
42
+
43
+ inputs = gr.components.Audio(type="filepath", label="Add audio file")
44
+ outputs = gr.components.Textbox()
45
+ title = "Audio To text⚡️"
46
+ description = "An example of using TTS to generate speech from text."
47
+ article = ""
48
+ examples = [
49
+ [""]
50
+ ]
51
+
52
+
53
+ def transcribe(inputs):
54
+ print('Inputs: ', inputs)
55
+ # print('Text: ', text)
56
+ # progress(0, desc="Loading audio file...")
57
+ if inputs is None:
58
+ logger.warning("No audio file")
59
+ return "No audio file submitted! Please upload an audio file before submitting your request."
60
+ file_size_mb = os.stat(inputs).st_size / (1024 * 1024)
61
+ if file_size_mb > FILE_LIMIT_MB:
62
+ logger.warning("Max file size exceeded")
63
+ return f"File size exceeds file size limit. Got file of size {file_size_mb:.2f}MB for a limit of {FILE_LIMIT_MB}MB."
64
+
65
+ # with open(inputs, "rb") as f:
66
+ # inputs = f.read()
67
+
68
+ # load audio and pad/trim it to fit 30 seconds
69
+ result = model.transcribe(audio=inputs, language='hindi',
70
+ word_timestamps=False, verbose=True)
71
+ # ---------------------------------------------------
72
+
73
+ print(result["text"])
74
+ return result["text"]
75
+
76
+
77
+ audio_chunked = gr.Interface(
78
+ fn=transcribe,
79
+ inputs=inputs,
80
+ outputs=outputs,
81
+ allow_flagging="never",
82
+ title=title,
83
+ description=description,
84
+ article=article,
85
+ )
86
+
87
+ microphone_chunked = gr.Interface(
88
+ fn=transcribe,
89
+ inputs=[
90
+ gr.inputs.Audio(source="microphone",
91
+ optional=True, type="filepath"),
92
+ ],
93
+ outputs=[
94
+ gr.outputs.Textbox(label="Transcription").style(
95
+ show_copy_button=True),
96
+ ],
97
+ allow_flagging="never",
98
+ title=title,
99
+ description=description,
100
+ article=article,
101
+ )
102
+
103
+ demo = gr.Blocks()
104
+ with demo:
105
+ gr.TabbedInterface([audio_chunked, microphone_chunked], [
106
+ "Audio File", "Microphone"])
107
+ demo.queue(concurrency_count=1, max_size=5)
108
+ demo.launch(show_api=False)
109
+
110
+
111
+ # gr.Interface(
112
+ # inference,
113
+ # inputs,
114
+ # outputs,
115
+ # verbose=True,
116
+ # title=title,
117
+ # description=description,
118
+ # article=article,
119
+ # examples=examples,
120
+ # enable_queue=True,
121
+
122
+ # ).launch(share=True, debug=True)