pustozerov committed on
Commit
8f18caf
·
1 Parent(s): f508160

Database with examples was completely moved to the HuggingFace cloud.

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +17 -12
  3. requirements.txt +4 -2
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  /data/user_data/
2
  /info/transcripts/
 
 
1
  /data/user_data/
2
  /info/transcripts/
3
+ /info/configs/manifests/
app.py CHANGED
@@ -1,30 +1,35 @@
1
  import glob
2
  import random
3
  import os
 
4
  import soundfile as sf
5
  import streamlit as st
6
  from pydub import AudioSegment
 
 
7
 
8
  from modules.diarization.nemo_diarization import diarization
9
 
 
 
 
 
10
  st.title('Call Transcription demo')
11
  st.subheader('This simple demo shows the possibilities of the ASR and NLP in the task of '
12
  'automatic speech recognition and diarization. It works with mp3, ogg and wav files. You can randomly '
13
  'pickup a set of images from the built-in database or try uploading your own files.')
 
14
 
15
-
16
- if st.button('Try random samples from the database'):
17
- folder = "data/datasets/crema_d_diarization_chunks"
18
- os.makedirs(folder, exist_ok=True)
19
- list_all_audio = glob.glob("data/datasets/crema_d_diarization_chunks/*.wav")
20
- chosen_files = sorted(random.sample(list_all_audio, 1))
21
- file_name = os.path.basename(chosen_files[0]).split(".")[0]
22
- audio_file = open(chosen_files[0], 'rb')
23
- audio_bytes = audio_file.read()
24
- st.audio(audio_bytes)
25
- f = sf.SoundFile(chosen_files[0])
26
  st.write("Starting transcription. Estimated processing time: %0.1f seconds" % (f.frames / (f.samplerate * 5)))
27
- result = diarization(chosen_files[0])
28
  with open("info/transcripts/pred_rttms/" + file_name + ".txt") as f:
29
  transcript = f.read()
30
  st.write("Transcription completed.")
 
1
  import glob
2
  import random
3
  import os
4
+ import numpy as np
5
  import soundfile as sf
6
  import streamlit as st
7
  from pydub import AudioSegment
8
+ from datasets import load_dataset
9
+ from scipy.io.wavfile import write
10
 
11
  from modules.diarization.nemo_diarization import diarization
12
 
13
+ FOLDER_WAV = "data/user_data"
14
+ SAMPLE_RATE = 16000
15
+ dataset = load_dataset("pustozerov/crema_d_diarization", split='validation')
16
+
17
  st.title('Call Transcription demo')
18
  st.subheader('This simple demo shows the possibilities of the ASR and NLP in the task of '
19
  'automatic speech recognition and diarization. It works with mp3, ogg and wav files. You can randomly '
20
  'pickup a set of images from the built-in database or try uploading your own files.')
21
+ if st.button('Try a random sample from the database'):
22
 
23
+ shuffled_dataset = dataset.shuffle(seed=random.randint(0, 100))
24
+ file_name = str(shuffled_dataset["file"][0]).split(".")[0]
25
+ audio_bytes = np.array(shuffled_dataset["data"][0])
26
+ audio_bytes_scaled = np.int16(audio_bytes / np.max(np.abs(audio_bytes)) * 32767)
27
+ write(os.path.join(FOLDER_WAV, file_name + '.wav'), rate=SAMPLE_RATE, data=audio_bytes_scaled)
28
+ f = sf.SoundFile(os.path.join(FOLDER_WAV, file_name + '.wav'))
29
+ audio_file = open(os.path.join(FOLDER_WAV, file_name + '.wav'), 'rb')
30
+ st.audio(audio_file.read())
 
 
 
31
  st.write("Starting transcription. Estimated processing time: %0.1f seconds" % (f.frames / (f.samplerate * 5)))
32
+ result = diarization(os.path.join(FOLDER_WAV, file_name + '.wav'))
33
  with open("info/transcripts/pred_rttms/" + file_name + ".txt") as f:
34
  transcript = f.read()
35
  st.write("Transcription completed.")
requirements.txt CHANGED
@@ -33,10 +33,12 @@ sentencepiece==0.1.96
33
  SoundFile==0.10.3.post1
34
  spacy==3.4.0
35
  speechbrain @ git+https://github.com/speechbrain/speechbrain.git
36
- streamlit==1.10.0
37
  torch==1.12.0
38
  torchaudio==0.12.0
39
  transformers==4.20.0
40
  webdataset==0.1.62
41
  Cython==0.29.14
42
- youtokentome
 
 
 
33
  SoundFile==0.10.3.post1
34
  spacy==3.4.0
35
  speechbrain @ git+https://github.com/speechbrain/speechbrain.git
36
+ streamlit~=1.11.1
37
  torch==1.12.0
38
  torchaudio==0.12.0
39
  transformers==4.20.0
40
  webdataset==0.1.62
41
  Cython==0.29.14
42
+ youtokentome
43
+ datasets~=2.4.0
44
+ NEMO~=4.1.1