thealphamerc committed on
Commit 47d9326
1 Parent(s): 7a97be1

Remove unnecessary files

Files changed (8)
  1. .gitignore +0 -0
  2. app.py +118 -83
  3. data/audio.wav +0 -0
  4. data/audio2.mp3 +0 -0
  5. flagged/Audio file/0.wav +0 -0
  6. flagged/log.csv +0 -2
  7. script.py +87 -0
  8. trans.py +0 -122
.gitignore ADDED
File without changes
app.py CHANGED
@@ -1,87 +1,122 @@
- # Basic script for using the OpenAI Whisper model to transcribe a video file. You can uncomment whichever model you want to use.
- # Author: ThioJoe ( https://github.com/ThioJoe )
-
- # Required third party packages: whisper
- # See instructions for setup here: https://github.com/openai/whisper#setup
- # - You can use the below command to pull the repo and install dependencies, then just put this script in the repo directory:
- # pip install git+https://github.com/openai/whisper.git
-
- import whisper
- import io
- import time
  import os
- import json
- import pathlib
-
- # Choose model to use by uncommenting
- # modelName = "tiny.en"
- modelName = "base.en"
- # modelName = "small.en"
- # modelName = "medium.en"
- # modelName = "large-v2"
-
- # Other Variables
- # (bool) Whether to export the segment data to a json file. Will include word level timestamps if word_timestamps is True.
- exportTimestampData = True
- outputFolder = "Output"
-
- # ----- Select variables for transcribe method -----
- # audio: path to audio file
- verbose = True # (bool): Whether to display the text being decoded to the console. If True, displays all the details, If False, displays minimal details. If None, does not display anything
- language = "english" # Language of audio file
- # (bool): Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.
- word_timestamps = False
- # initial_prompt="" # (optional str): Optional text to provide as a prompt for the first window. This can be used to provide, or "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns to make it more likely to predict those word correctly.
-
- # -------------------------------------------------------------------------
- print(f"Using Model: {modelName}")
- filePath = input("Path to File Being Transcribed: ")
- filePath = filePath.strip("\"")
- if not os.path.exists(filePath):
-     print("Problem Getting File...")
-     input("Press Enter to Exit...")
-     exit()
-
- # If output folder does not exist, create it
- if not os.path.exists(outputFolder):
-     os.makedirs(outputFolder)
-     print("Created Output Folder.\n")
-
- # Get filename stem using pathlib (filename without extension)
- fileNameStem = pathlib.Path(filePath).stem
-
- resultFileName = f"{fileNameStem}.txt"
- jsonFileName = f"{fileNameStem}.json"
-
- model = whisper.load_model(modelName)
- start = time.time()

- # ---------------------------------------------------
- result = model.transcribe(audio=filePath, language=language,
-                           word_timestamps=word_timestamps, verbose=verbose)
  # ---------------------------------------------------

- end = time.time()
- elapsed = float(end - start)
-
- # Save transcription text to file
- print("\nWriting transcription to file...")
- with open(os.path.join(outputFolder, resultFileName), "w", encoding="utf-8") as file:
-     file.write(result["text"])
- print("Finished writing transcription file.")
-
- # Sav
- # e the segments data to json file
- # if word_timestamps == True:
- if exportTimestampData == True:
-     print("\nWriting segment data to file...")
-     with open(os.path.join(outputFolder, jsonFileName), "w", encoding="utf-8") as file:
-         segmentsData = result["segments"]
-         json.dump(segmentsData, file, indent=4)
-     print("Finished writing segment data file.")
-
- elapsedMinutes = str(round(elapsed/60, 2))
- print(f"\nElapsed Time With {modelName} Model: {elapsedMinutes} Minutes")
-
- input("Press Enter to exit...")
- exit()
+ import logging
+ import sys
+ from subprocess import call
+ import gradio as gr
  import os
+ # from transformers.pipelines.audio_utils import ffmpeg_read
+ import whisper

+
+ logger = logging.getLogger("whisper-jax-app")
+ logger.setLevel(logging.INFO)
+ ch = logging.StreamHandler()
+ ch.setLevel(logging.INFO)
+ formatter = logging.Formatter(
+     "%(asctime)s;%(levelname)s;%(message)s", "%Y-%m-%d %H:%M:%S")
+ ch.setFormatter(formatter)
+ logger.addHandler(ch)
+
+
+ BATCH_SIZE = 16
+ CHUNK_LENGTH_S = 30
+ NUM_PROC = 8
+ FILE_LIMIT_MB = 1000
+ YT_ATTEMPT_LIMIT = 3
+
+
+ def run_cmd(command):
+     try:
+         print(command)
+         call(command)
+     except KeyboardInterrupt:
+         print("Process interrupted")
+         sys.exit(1)
+
+
+ def inference(text):
+     cmd = ['tts', '--text', text]
+     run_cmd(cmd)
+     return 'tts_output.wav'
+
+
+ model = whisper.load_model("base")
+
+ inputs = gr.components.Audio(type="filepath", label="Add audio file")
+ outputs = gr.components.Textbox()
+ title = "Audio To text⚡️"
+ description = "An example of using TTS to generate speech from text."
+ article = ""
+ examples = [
+     [""]
+ ]
+
+
+ def transcribe(inputs):
+     print('Inputs: ', inputs)
+     # print('Text: ', text)
+     # progress(0, desc="Loading audio file...")
+     if inputs is None:
+         logger.warning("No audio file")
+         return "No audio file submitted! Please upload an audio file before submitting your request."
+     file_size_mb = os.stat(inputs).st_size / (1024 * 1024)
+     if file_size_mb > FILE_LIMIT_MB:
+         logger.warning("Max file size exceeded")
+         return f"File size exceeds file size limit. Got file of size {file_size_mb:.2f}MB for a limit of {FILE_LIMIT_MB}MB."
+
+     # with open(inputs, "rb") as f:
+     #     inputs = f.read()
+
+     # load audio and pad/trim it to fit 30 seconds
+     result = model.transcribe(audio=inputs, language='hindi',
+                               word_timestamps=False, verbose=True)
  # ---------------------------------------------------

+     print(result["text"])
+     return result["text"]
+
+
+ audio_chunked = gr.Interface(
+     fn=transcribe,
+     inputs=inputs,
+     outputs=outputs,
+     allow_flagging="never",
+     title=title,
+     description=description,
+     article=article,
+ )
+
+ microphone_chunked = gr.Interface(
+     fn=transcribe,
+     inputs=[
+         gr.inputs.Audio(source="microphone",
+                         optional=True, type="filepath"),
+     ],
+     outputs=[
+         gr.outputs.Textbox(label="Transcription").style(
+             show_copy_button=True),
+     ],
+     allow_flagging="never",
+     title=title,
+     description=description,
+     article=article,
+ )
+
+ demo = gr.Blocks()
+ with demo:
+     gr.TabbedInterface([audio_chunked, microphone_chunked], [
+         "Audio File", "Microphone"])
+ demo.queue(concurrency_count=1, max_size=5)
+ demo.launch(show_api=False)
+
+
+ # gr.Interface(
+ #     inference,
+ #     inputs,
+ #     outputs,
+ #     verbose=True,
+ #     title=title,
+ #     description=description,
+ #     article=article,
+ #     examples=examples,
+ #     enable_queue=True,
+
+ # ).launch(share=True, debug=True)
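
For reference, the transcription path that the new app.py exposes through Gradio can also be exercised on its own. A minimal sketch, assuming the openai-whisper package is installed and using a hypothetical local file sample.wav:

import whisper

# Same model and transcribe arguments as the Gradio transcribe() handler above.
model = whisper.load_model("base")
result = model.transcribe(audio="sample.wav", language="hindi",
                          word_timestamps=False, verbose=True)
print(result["text"])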
data/audio.wav DELETED
Binary file (172 kB)
 
data/audio2.mp3 DELETED
Binary file (35.4 kB)
 
flagged/Audio file/0.wav DELETED
Binary file (693 kB)
 
flagged/log.csv DELETED
@@ -1,2 +0,0 @@
- Audio file,Transcription,timestamp
- Audio file/0.wav,No audio file submitted! Please upload an audio file before submitting your request.,2023-04-26 23:19:33.132801

script.py ADDED
@@ -0,0 +1,87 @@
+ # Basic script for using the OpenAI Whisper model to transcribe a video file. You can uncomment whichever model you want to use.
+ # Author: ThioJoe ( https://github.com/ThioJoe )
+
+ # Required third party packages: whisper
+ # See instructions for setup here: https://github.com/openai/whisper#setup
+ # - You can use the below command to pull the repo and install dependencies, then just put this script in the repo directory:
+ # pip install git+https://github.com/openai/whisper.git
+
+ import whisper
+ import io
+ import time
+ import os
+ import json
+ import pathlib
+
+ # Choose model to use by uncommenting
+ # modelName = "tiny.en"
+ modelName = "base.en"
+ # modelName = "small.en"
+ # modelName = "medium.en"
+ # modelName = "large-v2"
+
+ # Other Variables
+ # (bool) Whether to export the segment data to a json file. Will include word level timestamps if word_timestamps is True.
+ exportTimestampData = True
+ outputFolder = "Output"
+
+ # ----- Select variables for transcribe method -----
+ # audio: path to audio file
+ verbose = True # (bool): Whether to display the text being decoded to the console. If True, displays all the details, If False, displays minimal details. If None, does not display anything
+ language = "english" # Language of audio file
+ # (bool): Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.
+ word_timestamps = False
+ # initial_prompt="" # (optional str): Optional text to provide as a prompt for the first window. This can be used to provide, or "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns to make it more likely to predict those word correctly.
+
+ # -------------------------------------------------------------------------
+ print(f"Using Model: {modelName}")
+ filePath = input("Path to File Being Transcribed: ")
+ filePath = filePath.strip("\"")
+ if not os.path.exists(filePath):
+     print("Problem Getting File...")
+     input("Press Enter to Exit...")
+     exit()
+
+ # If output folder does not exist, create it
+ if not os.path.exists(outputFolder):
+     os.makedirs(outputFolder)
+     print("Created Output Folder.\n")
+
+ # Get filename stem using pathlib (filename without extension)
+ fileNameStem = pathlib.Path(filePath).stem
+
+ resultFileName = f"{fileNameStem}.txt"
+ jsonFileName = f"{fileNameStem}.json"
+
+ model = whisper.load_model(modelName)
+ start = time.time()
+
+ # ---------------------------------------------------
+ result = model.transcribe(audio=filePath, language=language,
+                           word_timestamps=word_timestamps, verbose=verbose)
+ # ---------------------------------------------------
+
+ end = time.time()
+ elapsed = float(end - start)
+
+ # Save transcription text to file
+ print("\nWriting transcription to file...")
+ with open(os.path.join(outputFolder, resultFileName), "w", encoding="utf-8") as file:
+     file.write(result["text"])
+ print("Finished writing transcription file.")
+
+ # Save the segments data to json file
+ # if word_timestamps == True:
+ if exportTimestampData == True:
+     print("\nWriting segment data to file...")
+     with open(os.path.join(outputFolder, jsonFileName), "w", encoding="utf-8") as file:
+         segmentsData = result["segments"]
+         json.dump(segmentsData, file, indent=4)
+     print("Finished writing segment data file.")
+
+ elapsedMinutes = str(round(elapsed/60, 2))
+ print(f"\nElapsed Time With {modelName} Model: {elapsedMinutes} Minutes")
+
+ input("Press Enter to exit...")
+ exit()
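
When exportTimestampData is True, script.py writes the segment list to Output/<input name>.json. A minimal sketch of reading that file back, assuming a hypothetical Output/example.json produced by the script:

import json

# Load the segment data written by script.py (hypothetical path).
with open("Output/example.json", encoding="utf-8") as file:
    segments = json.load(file)

# Each Whisper segment includes start/end times in seconds and the decoded text.
for segment in segments:
    print(f"[{segment['start']:.2f}s -> {segment['end']:.2f}s] {segment['text'].strip()}")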
trans.py DELETED
@@ -1,122 +0,0 @@
- import logging
- from subprocess import call
- import gradio as gr
- import os
- # from transformers.pipelines.audio_utils import ffmpeg_read
- import whisper
-
-
- logger = logging.getLogger("whisper-jax-app")
- logger.setLevel(logging.INFO)
- ch = logging.StreamHandler()
- ch.setLevel(logging.INFO)
- formatter = logging.Formatter(
-     "%(asctime)s;%(levelname)s;%(message)s", "%Y-%m-%d %H:%M:%S")
- ch.setFormatter(formatter)
- logger.addHandler(ch)
-
-
- BATCH_SIZE = 16
- CHUNK_LENGTH_S = 30
- NUM_PROC = 8
- FILE_LIMIT_MB = 1000
- YT_ATTEMPT_LIMIT = 3
-
-
- def run_cmd(command):
-     try:
-         print(command)
-         call(command)
-     except KeyboardInterrupt:
-         print("Process interrupted")
-         sys.exit(1)
-
-
- def inference(text):
-     cmd = ['tts', '--text', text]
-     run_cmd(cmd)
-     return 'tts_output.wav'
-
-
- model = whisper.load_model("base")
-
- inputs = gr.components.Audio(type="filepath", label="Add audio file")
- outputs = gr.components.Textbox()
- title = "Audio To text⚡️"
- description = "An example of using TTS to generate speech from text."
- article = ""
- examples = [
-     [""]
- ]
-
-
- def transcribe(inputs):
-     print('Inputs: ', inputs)
-     # print('Text: ', text)
-     # progress(0, desc="Loading audio file...")
-     if inputs is None:
-         logger.warning("No audio file")
-         return "No audio file submitted! Please upload an audio file before submitting your request."
-     file_size_mb = os.stat(inputs).st_size / (1024 * 1024)
-     if file_size_mb > FILE_LIMIT_MB:
-         logger.warning("Max file size exceeded")
-         return f"File size exceeds file size limit. Got file of size {file_size_mb:.2f}MB for a limit of {FILE_LIMIT_MB}MB."
-
-     # with open(inputs, "rb") as f:
-     #     inputs = f.read()
-
-     # load audio and pad/trim it to fit 30 seconds
-     result = model.transcribe(audio=inputs, language='hindi',
-                               word_timestamps=False, verbose=True)
-     # ---------------------------------------------------
-
-     print(result["text"])
-     return result["text"]
-
-
- audio_chunked = gr.Interface(
-     fn=transcribe,
-     inputs=inputs,
-     outputs=outputs,
-     allow_flagging="never",
-     title=title,
-     description=description,
-     article=article,
- )
-
- microphone_chunked = gr.Interface(
-     fn=transcribe,
-     inputs=[
-         gr.inputs.Audio(source="microphone",
-                         optional=True, type="filepath"),
-     ],
-     outputs=[
-         gr.outputs.Textbox(label="Transcription").style(
-             show_copy_button=True),
-     ],
-     allow_flagging="never",
-     title=title,
-     description=description,
-     article=article,
- )
-
- demo = gr.Blocks()
- with demo:
-     gr.TabbedInterface([audio_chunked, microphone_chunked], [
-         "Audio File", "Microphone"])
- demo.queue(concurrency_count=1, max_size=5)
- demo.launch(show_api=False)
-
-
- # gr.Interface(
- #     inference,
- #     inputs,
- #     outputs,
- #     verbose=True,
- #     title=title,
- #     description=description,
- #     article=article,
- #     examples=examples,
- #     enable_queue=True,
-
- # ).launch(share=True, debug=True)