Zeimoto committed
Commit
5cc4f06
1 Parent(s): 2ea871a

update app.py: add transcribe function

Files changed (1)
  1. app.py +57 -37
app.py CHANGED
@@ -5,40 +5,60 @@ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 from datasets import load_dataset
 import torch
 
-# x = st.slider('Select a value')
-# st.write(x, 'squared is', x * x)
-
-wav_audio_data = st_audiorec()
-
-if wav_audio_data is not None:
-    st.audio(wav_audio_data, format='audio/wav')
-
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-
-model_id = "openai/whisper-large-v3"
-
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
-)
-model.to(device)
-
-processor = AutoProcessor.from_pretrained(model_id)
-
-pipe = pipeline(
-    "automatic-speech-recognition",
-    model=model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    max_new_tokens=128,
-    chunk_length_s=30,
-    batch_size=16,
-    return_timestamps=True,
-    torch_dtype=torch_dtype,
-    device=device,
-)
-
-dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
-sample = dataset[0]["audio"]
-result = pipe(sample)
-print(result["text"])
+pipe = None
+audio_sample: bytes = None
+audio_transcription: str = None
+
+def main ():
+
+    init_model()
+    # x = st.slider('Select a value')
+    # st.write(x, 'squared is', x * x)
+
+    wav_audio_data = st_audiorec()
+
+    if wav_audio_data is not None:
+        st.audio(wav_audio_data, format='audio/wav')
+
+    dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
+    sample = dataset[0]["audio"]
+    st.write('Sample:', sample)
+
+async def init_model ():
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+    model_id = "openai/whisper-large-v3"
+
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+    )
+    model.to(device)
+
+    processor = AutoProcessor.from_pretrained(model_id)
+
+    pipe = pipeline(
+        "automatic-speech-recognition",
+        model=model,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
+        max_new_tokens=128,
+        chunk_length_s=30,
+        batch_size=16,
+        return_timestamps=True,
+        torch_dtype=torch_dtype,
+        device=device,
+    )
+
+async def transcribe (audio_sample: bytes, pipe) -> str:
+
+    # dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
+    # sample = dataset[0]["audio"]
+    result = pipe(audio_sample)
+    print(result)
+
+    st.write('Result', result["text"])
+    return result["text"]
+
+if __name__ == "__main__":
+    main()
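
As committed, `init_model` binds `pipe` to a local name (the module-level `pipe` stays `None`), the two `async` functions are never awaited, and `transcribe` is never called on the recorded audio. Below is a minimal sketch of one way the pieces could be wired together, assuming synchronous loading, Streamlit's `st.cache_resource`, and the `streamlit-audiorec` package; `load_pipe` is a hypothetical helper name, not part of the commit.

# Sketch only: one possible wiring of the commit's transcribe flow.
# Assumes streamlit, streamlit-audiorec, transformers, and torch are installed,
# and that ffmpeg is available so the ASR pipeline can decode raw WAV bytes.
import streamlit as st
import torch
from st_audiorec import st_audiorec
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

@st.cache_resource  # load the model once per server process, not on every rerun
def load_pipe():
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    return pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=16,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
    )

def transcribe(audio_sample: bytes, pipe) -> str:
    # The transformers ASR pipeline accepts raw audio file bytes and decodes
    # them internally (via ffmpeg), so the recorder output can be passed as-is.
    result = pipe(audio_sample)
    st.write('Result', result["text"])
    return result["text"]

def main():
    pipe = load_pipe()
    wav_audio_data = st_audiorec()
    if wav_audio_data is not None:
        st.audio(wav_audio_data, format='audio/wav')
        transcribe(wav_audio_data, pipe)

if __name__ == "__main__":
    main()

Caching the pipeline matters in Streamlit because the whole script reruns on every interaction; without it, a `main()`-level `init_model()` would reload the Whisper weights on each rerun.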