jessica07 commited on
Commit
8554b58
1 Parent(s): b645b5a

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +34 -0
  2. deepgram_transcribe.py +129 -0
  3. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from pathlib import Path
3
+ from deepgram_transcribe import process
4
+ import os
5
+
6
def upload_file(files):
    """Collect the on-disk paths of the uploaded files.

    Each item of *files* is a Gradio file wrapper exposing a ``.name``
    attribute holding its temporary path; the list of those paths is
    returned in upload order.
    """
    paths = []
    for uploaded in files:
        paths.append(uploaded.name)
    return paths
9
+
10
def process_submit(title, file, upload_button, progress=gr.Progress()):
    """Gradio submit callback: transcribe *file* and return the result zip.

    *title* and *upload_button* exist only because the Interface passes
    every input component to the callback; they are not used here.
    *progress* is injected by Gradio and forwarded to the worker so it
    can report fine-grained progress.
    """
    progress(0, desc="Starting...")
    return process(file, progress)
14
+
15
# --- Gradio UI, built at import time -------------------------------------
# NOTE(review): `track_title` is created but never attached to the
# Interface below — confirm whether it is meant to be an input/output or
# can be removed.
track_title = gr.Markdown(
"""
You can track your progress here
""")

# Displays the uploaded files' paths once the upload button fires.
file_output = gr.File()
upload_button = gr.UploadButton("Click to upload a file", file_types=["audio","video"], file_count="multiple")

# Page heading / instructions.
title = gr.Markdown(
"""
# Playground
Upload your audio file here.
""")

# NOTE(review): gr.Interface is used here like gr.Blocks (as a context
# manager) with components constructed *outside* the context, and a
# Markdown display component is passed as an *input* — verify this
# wiring actually renders and submits as intended on the deployed
# Gradio version. `allow_flagging` is deprecated in newer Gradio
# releases in favor of `flagging_mode`.
with gr.Interface(fn=process_submit, inputs=[title, file_output, upload_button], outputs="file", allow_flagging="never") as demo:
    # Wire the upload button: selected files -> their temp paths -> file_output.
    upload_button.upload(upload_file, upload_button, file_output)

demo.launch()
33
+
34
+
deepgram_transcribe.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # this is a script that transcribes a downloaded YouTube video using Deepgram
2
+ # the audio should be cleaned with UVR5 first, so the file is flac
3
+ # it will upload the full length of interview or podcast to deepgram
4
+ # and will return the speaker id. User must manually listen audio clip to find out which speaker is wanted
5
+ # discard remaining speakers and short length audio
6
+ #
7
+
8
+ import os
9
+ from dotenv import load_dotenv
10
+ from pydub import AudioSegment
11
+ import math
12
+ from os.path import join
13
+ import shutil
14
+
15
+ from deepgram import (
16
+ DeepgramClient,
17
+ PrerecordedOptions,
18
+ FileSource,
19
+ )
20
+
21
def process(audio_file, progress):
    """Transcribe *audio_file* with Deepgram, split it into per-sentence
    clips grouped by diarized speaker, and zip the results.

    Parameters
    ----------
    audio_file : str
        Path to the audio file to transcribe (expected to be cleaned
        with UVR5 first — see the module header).
    progress : callable
        Gradio-style progress reporter: called with a float in [0, 1]
        and exposing ``.tqdm`` for per-item iteration progress.

    Returns
    -------
    str
        ``"output.zip"`` — archive (written to the current working
        directory) containing one folder per speaker plus a
        pipe-delimited index file.

    Raises
    ------
    Exception
        Re-raises whatever the Deepgram SDK raises on a failed
        transcription request.
    """
    load_dotenv("myenv-variable.env")

    AUDIO_FILE = audio_file  # path of the audio to slice
    TAGS = "vogue"           # source tag embedded in generated file names

    API_KEY = os.getenv('API_DEEPGRAM')
    original_parent_folder = os.getcwd()
    print(original_parent_folder)

    # All per-speaker folders and the index file live under ./output.
    speaker_folder = join(original_parent_folder, "output")
    # Fix: os.mkdir raised if the folder already existed from a prior run.
    os.makedirs(speaker_folder, exist_ok=True)

    deepgram = DeepgramClient(API_KEY)

    with open(AUDIO_FILE, "rb") as file:
        buffer_data = file.read()

    payload: FileSource = {
        "buffer": buffer_data,
    }

    # Configure Deepgram: diarize=True is required to split by speaker.
    options = PrerecordedOptions(
        model="nova-2",
        smart_format=True,
        filler_words=True,
        diarize=True
    )

    progress(0.20)

    try:
        response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
    except Exception as e:
        # Fix: the original printed the error and fell through, then
        # crashed with a NameError on `response`; surface the real
        # failure to the caller instead.
        print(e)
        raise

    progress(0.30)

    audio = AudioSegment.from_file(AUDIO_FILE)
    data = response

    paragraphs = data['results']['channels'][0]['alternatives'][0]['paragraphs']['paragraphs']

    csv_data = [["filename", "speaker", "text", "start_time", "end_time", "duration"]]

    i = 1  # running clip counter used in generated file names

    progress(0.40)

    for paragraph in progress.tqdm(paragraphs, desc="Generating..."):
        sentences = paragraph['sentences']
        for text in sentences:
            # Convert the sentence bounds to ms with a 5 ms buffer on
            # each side. Fix: clamp the start at 0 — for the very first
            # sentence floor(0)-5 was negative, and pydub interprets a
            # negative slice index as "from the end of the audio".
            start_time_ms = max(0, math.floor(text['start']*1000)-5)
            end_time_ms = math.ceil(text['end']*1000)+5
            duration_s = round(text['end']-text['start'], 3)
            duration_ms = str(end_time_ms-start_time_ms).zfill(6)
            speaker_id = paragraph['speaker']

            folder_path = join(speaker_folder, "Speaker_"+str(speaker_id))
            os.makedirs(folder_path, exist_ok=True)

            # Manual alias for a known speaker id; note the folder keeps
            # the numeric name, only the file names use the alias.
            if speaker_id == 10:
                speaker_id = "Tayr"

            # Slice the audio segment for this sentence.
            segment = audio[start_time_ms:end_time_ms]

            # Relative name recorded in the index file.
            file_name = join("wavs", f"{TAGS}_Speaker_{speaker_id}_i{str(i).zfill(3)}_d{duration_ms}.wav")

            # Export the clip into the speaker's folder.
            temp_folder = join(folder_path, f"{TAGS}_Speaker_{speaker_id}_i{str(i).zfill(3)}_d{duration_ms}.wav")
            segment.export(temp_folder, format="wav")

            csv_data.append([file_name, speaker_id, text['text'], start_time_ms, end_time_ms, duration_s])

            i += 1

    # Pipe-delimited index of every exported clip.
    csv_filename = join(speaker_folder, f"{TAGS}_output.txt")

    with open(csv_filename, 'w', encoding='utf-8') as file:
        for row in csv_data:
            # One '|'-separated line per clip.
            row_string = '|'.join(str(item) for item in row)
            file.write(row_string + '\n')

    progress(0.90)

    # Zip the whole output tree; the archive lands in the cwd.
    shutil.make_archive("output", 'zip', speaker_folder)
    print(f"Data written to {csv_filename}")

    progress(1.00)
    return "output.zip"
129
+
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ python-dotenv
2
+ deepgram-sdk==3.2.6
+ pydub