File size: 6,346 Bytes
a647c50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4cf661d
a647c50
 
 
 
 
 
 
 
 
 
4cf661d
a647c50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import spaces
import gradio as gr
import base64
import librosa
from extractors.asrdiarization.asr_extractor import ASRExtractorConfig, ASRExtractor
from indexify_extractor_sdk import Content

# Upper bound, in minutes, on the audio length this demo accepts; longer
# audio is rejected with an error instead of being transcribed.
MAX_AUDIO_MINUTES = 60

# Single extractor instance created at import time and shared by all requests.
asr_extractor = ASRExtractor()

def check_audio(audio_filepath):
	"""
	Reject audio that exceeds the demo's length limit.

	The file is loaded through librosa (``sr=None, mono=True``) only to measure
	its duration; the file itself is not converted or rewritten.

	Returns the given ``audio_filepath`` unchanged when the audio is within
	``MAX_AUDIO_MINUTES``; raises ``gr.Error`` otherwise.
	"""
	samples, sample_rate = librosa.load(audio_filepath, sr=None, mono=True)
	minutes = librosa.get_duration(y=samples, sr=sample_rate) / 60.0
	too_long = minutes > MAX_AUDIO_MINUTES

	if too_long:
		message = (
			f"This demo can transcribe up to {MAX_AUDIO_MINUTES} minutes of audio. "
			"If you wish, you may trim the audio using the Audio viewer in Step 1 "
			"(click on the scissors icon to start trimming audio)."
		)
		raise gr.Error(message)

	return audio_filepath

@spaces.GPU
def transcribe(audio_filepath, task, batch_size, chunk_length_s, sampling_rate, language, num_speakers, min_speakers, max_speakers, assisted):
	"""
	Run the ASR extractor on an audio file and return its first text output.

	Args:
		audio_filepath: Path to the uploaded or recorded audio, or None when
			the user supplied nothing.
		task, batch_size, chunk_length_s, sampling_rate, language,
		num_speakers, min_speakers, max_speakers, assisted: Forwarded
			unchanged, by keyword, to ``ASRExtractorConfig``.

	Returns:
		The ``data`` of the first content item produced by the extractor,
		decoded as UTF-8.

	Raises:
		gr.Error: If no audio was provided, if the audio is longer than the
			limit enforced by ``check_audio``, or if the extractor produced
			no output.
	"""
	if audio_filepath is None:
		raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")

	audio_filepath = check_audio(audio_filepath)

	# The audio bytes are handed to the extractor as a base64-encoded string.
	with open(audio_filepath, "rb") as f:
		encoded_audio = base64.b64encode(f.read()).decode("utf-8")

	# NOTE(review): the content type is always "audio/mpeg" regardless of the
	# actual file format -- confirm the extractor does not depend on it.
	content = Content(content_type="audio/mpeg", data=encoded_audio)
	config = ASRExtractorConfig(
		task=task,
		batch_size=batch_size,
		chunk_length_s=chunk_length_s,
		sampling_rate=sampling_rate,
		language=language,
		num_speakers=num_speakers,
		min_speakers=min_speakers,
		max_speakers=max_speakers,
		assisted=assisted,
	)

	result = asr_extractor.extract(content, config)

	# Only the first item is used. A default is passed to next() so an empty
	# result becomes a readable UI error instead of a bare StopIteration.
	text_content = next((item.data.decode("utf-8") for item in result), None)
	if text_content is None:
		raise gr.Error("The extractor returned no output for this audio.")

	return text_content

with gr.Blocks(title="ASR + diarization + speculative decoding with Indexify") as audio_demo:

	# Page header: title, product blurb and GitHub call-to-action.
	gr.HTML("<h1 style='text-align: center'>ASR + diarization + speculative decoding with <a href='https://getindexify.ai/'>Indexify</a></h1>")
	gr.HTML("<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>")
	gr.HTML("<h3 style='text-align: center'>If you like this demo, please ⭐ Star us on <a href='https://github.com/tensorlakeai/indexify' target='_blank'>GitHub</a>!</h3>")

	with gr.Row():

		# Left column: audio input (step 1) and extractor parameters (step 2).
		with gr.Column():
			gr.HTML(
				"<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
				"<p style='color: #A0A0A0;'>Use this demo for audio files only up to 60 mins long. "
				"You can transcribe longer files and try various other extractors locally with "
				"<a href='https://getindexify.ai/'>Indexify</a>.</p>"
			)

			# type="filepath" hands the callback a path on disk rather than raw samples.
			audio_file = gr.Audio(type="filepath", sources=["microphone", "upload"])

			gr.HTML("<p><b>Step 2:</b> Choose the parameters or leave to default.</p>")

			task = gr.Dropdown(
				label="Task:",
				info="passed to the ASR pipeline",
				choices=["transcribe", "translate"],
				value="transcribe",
			)

			with gr.Column():
				batch_size = gr.Number(
					label="Batch Size:",
					info="for assisted generation the `batch_size` must be set to 1",
					value=24,
				)
				chunk_length_s = gr.Number(
					label="Chunk Length:",
					info="passed to the ASR pipeline",
					value=30,
				)
				sampling_rate = gr.Number(
					label="Sampling Rate:",
					info="`sampling_rate` indicates the sampling rate of the audio to process and is used for preprocessing",
					value=16000,
				)
				# No explicit default value is set for the language.
				language = gr.Dropdown(
					label="Language:",
					info="passed to the ASR pipeline",
					choices=[
						"english", "chinese", "german", "spanish", "russian", "korean", "french", "japanese",
						"portuguese", "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian",
						"indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay",
						"czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu",
						"croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak",
						"telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada",
						"estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian",
						"bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala",
						"khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian",
						"tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese",
						"haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar",
						"tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa",
						"bashkir", "javanese", "sundanese", "cantonese", "burmese", "valencian", "flemish", "haitian",
						"letzeburgesch", "pushto", "panjabi", "moldavian", "moldovan", "sinhalese", "castilian", "mandarin",
					],
				)
				# The three speaker-count fields are created without a default value.
				num_speakers = gr.Number(
					label="Number of Speakers:",
					info="passed to diarization pipeline",
				)
				min_speakers = gr.Number(
					label="Minimum Speakers:",
					info="passed to diarization pipeline",
				)
				max_speakers = gr.Number(
					label="Maximum Speakers:",
					info="passed to diarization pipeline",
				)
				assisted = gr.Checkbox(
					label="Assisted?",
					info="the `assisted` flag tells the pipeline whether to use speculative decoding",
					value=False,
				)

		# Right column: run button (step 3) and the extractor's text output.
		with gr.Column():

			gr.HTML("<p><b>Step 3:</b> Run the extractor.</p>")

			go_button = gr.Button(
				variant="primary",  # "primary" makes the button stand out (default is "secondary")
				value="Run extractor",
			)

			model_output_text_box = gr.Textbox(
				elem_id="model_output_text_box",
				label="Extractor Output",
			)

	# Footer credits.
	with gr.Row():

		gr.HTML(
			"<p style='text-align: center'>"
			"Developed with 🫶 by <a href='https://getindexify.io/' target='_blank'>Indexify</a> | "
			"a <a href='https://www.tensorlake.ai/' target='_blank'>Tensorlake</a> product"
			"</p>"
		)

	# Wire the button to `transcribe`; the inputs are listed in the same order
	# as that function's parameters.
	go_button.click(
		fn=transcribe,
		inputs=[
			audio_file,
			task,
			batch_size,
			chunk_length_s,
			sampling_rate,
			language,
			num_speakers,
			min_speakers,
			max_speakers,
			assisted,
		],
		outputs=[model_output_text_box],
	)

# Wrap the audio page in a single-tab interface using the Soft theme.
demo = gr.TabbedInterface(
	interface_list=[audio_demo],
	tab_names=["Audio Extraction"],
	theme=gr.themes.Soft(),
)

# Enable the queue, then launch the app.
demo.queue()
demo.launch()