Spaces:

thealphamerc
/

audio-to-text

Runtime error

App Files Files Community

thealphamerc committed on Apr 26, 2023

Commit

7a97be1

0 Parent(s):

Initial commit 🎉

Browse files

Files changed (12) hide show

Output/audio.txt +1 -0
Output/audio2.txt +1 -0
Output/audio3.json +173 -0
Output/audio3.txt +1 -0
app.py +87 -0
data/audio.wav +0 -0
data/audio2.mp3 +0 -0
data/audio3.wav +0 -0
flagged/Audio file/0.wav +0 -0
flagged/log.csv +2 -0
requirements.txt +2 -0
trans.py +122 -0

Output/audio.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ Quatlin, quatlin quatlin quatlin quatlin. Anti-six.

Output/audio2.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ to gain life in all that...

Output/audio3.json ADDED Viewed

	@@ -0,0 +1,173 @@

+[
+    {
+        "id": 0,
+        "seek": 0,
+        "start": 0.0,
+        "end": 1.52,
+        "text": " Come and sit on a rock.",
+        "tokens": [
+            50363,
+            7911,
+            290,
+            1650,
+            319,
+            257,
+            3881,
+            13,
+            50439
+        ],
+        "temperature": 0.0,
+        "avg_logprob": -0.34572365704704733,
+        "compression_ratio": 1.356164383561644,
+        "no_speech_prob": 0.01958448439836502
+    },
+    {
+        "id": 1,
+        "seek": 0,
+        "start": 1.52,
+        "end": 5.08,
+        "text": " Overlooking the river's blow, he wears a hat and some glasses.",
+        "tokens": [
+            50439,
+            3827,
+            11534,
+            262,
+            7850,
+            338,
+            6611,
+            11,
+            339,
+            17326,
+            257,
+            6877,
+            290,
+            617,
+            15232,
+            13,
+            50617
+        ],
+        "temperature": 0.0,
+        "avg_logprob": -0.34572365704704733,
+        "compression_ratio": 1.356164383561644,
+        "no_speech_prob": 0.01958448439836502
+    },
+    {
+        "id": 2,
+        "seek": 0,
+        "start": 5.08,
+        "end": 7.36,
+        "text": " A smile on his face.",
+        "tokens": [
+            50617,
+            317,
+            8212,
+            319,
+            465,
+            1986,
+            13,
+            50731
+        ],
+        "temperature": 0.0,
+        "avg_logprob": -0.34572365704704733,
+        "compression_ratio": 1.356164383561644,
+        "no_speech_prob": 0.01958448439836502
+    },
+    {
+        "id": 3,
+        "seek": 0,
+        "start": 7.36,
+        "end": 8.56,
+        "text": " He's not lost.",
+        "tokens": [
+            50731,
+            679,
+            338,
+            407,
+            2626,
+            13,
+            50791
+        ],
+        "temperature": 0.0,
+        "avg_logprob": -0.34572365704704733,
+        "compression_ratio": 1.356164383561644,
+        "no_speech_prob": 0.01958448439836502
+    },
+    {
+        "id": 4,
+        "seek": 0,
+        "start": 8.56,
+        "end": 10.4,
+        "text": " The water rushes by.",
+        "tokens": [
+            50791,
+            383,
+            1660,
+            38596,
+            416,
+            13,
+            50883
+        ],
+        "temperature": 0.0,
+        "avg_logprob": -0.34572365704704733,
+        "compression_ratio": 1.356164383561644,
+        "no_speech_prob": 0.01958448439836502
+    },
+    {
+        "id": 5,
+        "seek": 0,
+        "start": 10.4,
+        "end": 12.08,
+        "text": " A constant sound.",
+        "tokens": [
+            50883,
+            317,
+            6937,
+            2128,
+            13,
+            50967
+        ],
+        "temperature": 0.0,
+        "avg_logprob": -0.34572365704704733,
+        "compression_ratio": 1.356164383561644,
+        "no_speech_prob": 0.01958448439836502
+    },
+    {
+        "id": 6,
+        "seek": 0,
+        "start": 12.08,
+        "end": 13.68,
+        "text": " It takes in the view.",
+        "tokens": [
+            50967,
+            632,
+            2753,
+            287,
+            262,
+            1570,
+            13,
+            51047
+        ],
+        "temperature": 0.0,
+        "avg_logprob": -0.34572365704704733,
+        "compression_ratio": 1.356164383561644,
+        "no_speech_prob": 0.01958448439836502
+    },
+    {
+        "id": 7,
+        "seek": 0,
+        "start": 13.68,
+        "end": 14.48,
+        "text": " The mountains.",
+        "tokens": [
+            51047,
+            383,
+            12269,
+            13,
+            51087
+        ],
+        "temperature": 0.0,
+        "avg_logprob": -0.34572365704704733,
+        "compression_ratio": 1.356164383561644,
+        "no_speech_prob": 0.01958448439836502
+    }
+]

Output/audio3.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ Come and sit on a rock. Overlooking the river's blow, he wears a hat and some glasses. A smile on his face. He's not lost. The water rushes by. A constant sound. It takes in the view. The mountains.

app.py ADDED Viewed

	@@ -0,0 +1,87 @@

+# Basic script for using the OpenAI Whisper model to transcribe a video file. You can uncomment whichever model you want to use.
+# Author: ThioJoe ( https://github.com/ThioJoe )
+# Required third party packages: whisper
+# See instructions for setup here: https://github.com/openai/whisper#setup
+#   - You can use the below command to pull the repo and install dependencies, then just put this script in the repo directory:
+#     pip install git+https://github.com/openai/whisper.git
+import whisper
+import io
+import time
+import os
+import json
+import pathlib
+# Choose model to use by uncommenting
+# modelName = "tiny.en"
+modelName = "base.en"
+# modelName = "small.en"
+# modelName = "medium.en"
+# modelName = "large-v2"
+# Other Variables
+# (bool) Whether to export the segment data to a json file. Will include word level timestamps if word_timestamps is True.
+exportTimestampData = True
+outputFolder = "Output"
+#  ----- Select variables for transcribe method  -----
+# audio: path to audio file
+verbose = True  # (bool): Whether to display the text being decoded to the console. If True, displays all the details, If False, displays minimal details. If None, does not display anything
+language = "english"  # Language of audio file
+# (bool): Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.
+word_timestamps = False
+# initial_prompt="" # (optional str): Optional text to provide as a prompt for the first window. This can be used to provide, or "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns to make it more likely to predict those word correctly.
+#  -------------------------------------------------------------------------
+print(f"Using Model: {modelName}")
+filePath = input("Path to File Being Transcribed: ")
+filePath = filePath.strip("\"")
+if not os.path.exists(filePath):
+    print("Problem Getting File...")
+    input("Press Enter to Exit...")
+    exit()
+# If output folder does not exist, create it
+if not os.path.exists(outputFolder):
+    os.makedirs(outputFolder)
+    print("Created Output Folder.\n")
+# Get filename stem using pathlib (filename without extension)
+fileNameStem = pathlib.Path(filePath).stem
+resultFileName = f"{fileNameStem}.txt"
+jsonFileName = f"{fileNameStem}.json"
+model = whisper.load_model(modelName)
+start = time.time()
+#  ---------------------------------------------------
+result = model.transcribe(audio=filePath, language=language,
+                          word_timestamps=word_timestamps, verbose=verbose)
+#  ---------------------------------------------------
+end = time.time()
+elapsed = float(end - start)
+# Save transcription text to file
+print("\nWriting transcription to file...")
+with open(os.path.join(outputFolder, resultFileName), "w", encoding="utf-8") as file:
+    file.write(result["text"])
+print("Finished writing transcription file.")
+# Sav
+# e the segments data to json file
+# if word_timestamps == True:
+if exportTimestampData == True:
+    print("\nWriting segment data to file...")
+    with open(os.path.join(outputFolder, jsonFileName), "w", encoding="utf-8") as file:
+        segmentsData = result["segments"]
+        json.dump(segmentsData, file, indent=4)
+    print("Finished writing segment data file.")
+elapsedMinutes = str(round(elapsed/60, 2))
+print(f"\nElapsed Time With {modelName} Model: {elapsedMinutes} Minutes")
+input("Press Enter to exit...")
+exit()

data/audio.wav ADDED Viewed

Binary file (172 kB). View file

data/audio2.mp3 ADDED Viewed

Binary file (35.4 kB). View file

data/audio3.wav ADDED Viewed

Binary file (695 kB). View file

flagged/Audio file/0.wav ADDED Viewed

Binary file (693 kB). View file

flagged/log.csv ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ Audio file,Transcription,timestamp
2	+ Audio file/0.wav,No audio file submitted! Please upload an audio file before submitting your request.,2023-04-26 23:19:33.132801

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ whisper
2	+ gradio===3.27.0

trans.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import logging
+from subprocess import call
+import gradio as gr
+import os
+# from transformers.pipelines.audio_utils import ffmpeg_read
+import whisper
+# Module logger: INFO level, one stream handler, records formatted as
+# "timestamp;level;message" with a "YYYY-MM-DD HH:MM:SS" timestamp.
+logger = logging.getLogger("whisper-jax-app")
+logger.setLevel(logging.INFO)
+ch = logging.StreamHandler()
+ch.setLevel(logging.INFO)
+formatter = logging.Formatter(
+    "%(asctime)s;%(levelname)s;%(message)s", "%Y-%m-%d %H:%M:%S")
+ch.setFormatter(formatter)
+logger.addHandler(ch)
+# Tuning constants. Only FILE_LIMIT_MB is read by the code shown in this file
+# (transcribe() compares the upload size in megabytes against it); the others
+# are not referenced in the visible code.
+BATCH_SIZE = 16
+CHUNK_LENGTH_S = 30
+NUM_PROC = 8
+FILE_LIMIT_MB = 1000
+YT_ATTEMPT_LIMIT = 3
+def run_cmd(command):
+    try:
+        print(command)
+        call(command)
+    except KeyboardInterrupt:
+        print("Process interrupted")
+        sys.exit(1)
+def inference(text):
+    cmd = ['tts', '--text', text]
+    run_cmd(cmd)
+    return 'tts_output.wav'
+model = whisper.load_model("base")
+inputs = gr.components.Audio(type="filepath", label="Add audio file")
+outputs = gr.components.Textbox()
+title = "Audio To text⚡️"
+description = "An example of using TTS to generate speech from text."
+article = ""
+examples = [
+    [""]
+]
+def transcribe(inputs):
+    print('Inputs: ', inputs)
+    # print('Text: ', text)
+    # progress(0, desc="Loading audio file...")
+    if inputs is None:
+        logger.warning("No audio file")
+        return "No audio file submitted! Please upload an audio file before submitting your request."
+    file_size_mb = os.stat(inputs).st_size / (1024 * 1024)
+    if file_size_mb > FILE_LIMIT_MB:
+        logger.warning("Max file size exceeded")
+        return f"File size exceeds file size limit. Got file of size {file_size_mb:.2f}MB for a limit of {FILE_LIMIT_MB}MB."
+    # with open(inputs, "rb") as f:
+    #     inputs = f.read()
+    # load audio and pad/trim it to fit 30 seconds
+    result = model.transcribe(audio=inputs, language='hindi',
+                              word_timestamps=False, verbose=True)
+#  ---------------------------------------------------
+    print(result["text"])
+    return result["text"]
+audio_chunked = gr.Interface(
+    fn=transcribe,
+    inputs=inputs,
+    outputs=outputs,
+    allow_flagging="never",
+    title=title,
+    description=description,
+    article=article,
+)
+microphone_chunked = gr.Interface(
+    fn=transcribe,
+    inputs=[
+        gr.inputs.Audio(source="microphone",
+                        optional=True, type="filepath"),
+    ],
+    outputs=[
+        gr.outputs.Textbox(label="Transcription").style(
+            show_copy_button=True),
+    ],
+    allow_flagging="never",
+    title=title,
+    description=description,
+    article=article,
+)
+demo = gr.Blocks()
+with demo:
+    gr.TabbedInterface([audio_chunked, microphone_chunked], [
+                       "Audio File", "Microphone"])
+demo.queue(concurrency_count=1, max_size=5)
+demo.launch(show_api=False)
+# gr.Interface(
+#     inference,
+#     inputs,
+#     outputs,
+#     verbose=True,
+#     title=title,
+#     description=description,
+#     article=article,
+#     examples=examples,
+#     enable_queue=True,
+# ).launch(share=True, debug=True)