Commit fea5bdc
Parent(s):
Duplicate from asafaya/arabic-audio-transcription
Co-authored-by: Ali Safaya <asafaya@users.noreply.huggingface.co>
- .gitattributes +35 -0
- README.md +47 -0
- app.py +170 -0
- output.wav +3 -0
- requirements.txt +4 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+output.wav filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,47 @@
+---
+title: Arabic Audio Transcription
+emoji: 🎙️
+colorFrom: blue
+colorTo: blue
+sdk: gradio
+app_file: app.py
+pinned: false
+license: cc-by-nc-4.0
+duplicated_from: asafaya/arabic-audio-transcription
+---
+
+# Configuration
+
+`title`: _string_
+Display title for the Space
+
+`emoji`: _string_
+Space emoji (emoji-only character allowed)
+
+`colorFrom`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+`colorTo`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+`sdk`: _string_
+Can be either `gradio`, `streamlit`, or `static`
+
+`sdk_version`: _string_
+Only applicable for `streamlit` SDK.
+See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+
+`app_file`: _string_
+Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
+Path is relative to the root of the repository.
+
+`models`: _List[string]_
+HF model IDs (like "gpt2" or "deepset/roberta-base-squad2") used in the Space.
+Will be parsed automatically from your code if not specified here.
+
+`datasets`: _List[string]_
+HF dataset IDs (like "common_voice" or "oscar-corpus/OSCAR-2109") used in the Space.
+Will be parsed automatically from your code if not specified here.
+
+`pinned`: _boolean_
+Whether the Space stays on top of your list.
app.py
ADDED
@@ -0,0 +1,170 @@
+
+import shutil
+import os
+import tempfile
+
+from collections import OrderedDict
+from glob import glob
+
+import numpy
+import torch
+import torchaudio
+import torchaudio.functional as F
+
+from pydub import AudioSegment
+from tqdm import tqdm
+
+from speechbrain.pretrained import VAD
+from speechbrain.pretrained import EncoderASR
+
+import gradio as gr
+
+tempdir = tempfile.mkdtemp()
+
+def read_and_resample(filename, outdir):
+    # convert the upload to a 16 kHz mono wav (via ffmpeg) and load it
+
+    AudioSegment.from_file(filename).export(f"{filename}.wav", format='wav', parameters=["-ar", "16000", "-ac", '1'])
+    filename = f"{filename}.wav"
+
+    signal, sr = torchaudio.load(filename)
+    if sr != 16_000:
+        # downsample to 16 kHz and mono
+        resampled = F.resample(signal, sr, 16_000, lowpass_filter_width=128).mean(dim=0).view(1, -1).cpu()
+    else:
+        resampled = signal.mean(dim=0).view(1, -1).cpu()
+
+    # base filename (without extension) used for the temporary chunk files:
+    filename = os.path.basename(filename).split(".")[0]
+
+    # yield chunks of 60 minutes (60 * 60 seconds at 16 kHz).
+    c_size = 60 * 60 * 16_000
+    for i, c in enumerate(range(0, resampled.shape[1], c_size)):
+        tempaudio = os.path.join(outdir, f"{filename}-{i}.wav")
+
+        # save to tmp dir:
+        torchaudio.save(tempaudio, resampled[:, c:c+c_size], 16_000)
+        yield (tempaudio, resampled[:, c:c+c_size])
+
+
+def segment_file(VAD, id, prefix, filename, resampled, output_dir):
+
+    min_chunk_size = 4  # seconds
+    max_allowed_length = 12  # seconds
+    margin = 0.15
+
+    with torch.no_grad():
+        audio_info = VAD.get_speech_segments(filename, apply_energy_VAD=True, len_th=0.5,
+                                             deactivation_th=0.4, double_check=False, close_th=0.25)
+
+    # save segments:
+    s = -1
+    for _s, _e in audio_info:
+        _s, _e = _s.item(), _e.item()
+
+        _s = max(0, _s - margin)
+        e = min(resampled.size(1) / 16_000, _e + margin)
+
+        if s == -1:
+            s = _s
+
+        chunk_length = e - s
+        if chunk_length > min_chunk_size:
+
+            no_chunks = int(numpy.ceil(chunk_length / max_allowed_length))
+            starts = numpy.linspace(s, e, no_chunks + 1).tolist()
+
+            if chunk_length > max_allowed_length:
+                print("WARNING: segment too long:", chunk_length)
+                print(no_chunks, starts)
+
+            for x in range(no_chunks):
+
+                start = starts[x]
+                end = starts[x + 1]
+
+                local_chunk_length = end - start
+
+                print(f"Saving segment: {start:08.2f}-{end:08.2f}, with length: {local_chunk_length:05.2f} secs")
+                fname = f"{id}-{prefix}-{start:08.2f}-{end:08.2f}.wav"
+
+                # convert from seconds to samples:
+                start = int(start * 16_000)
+                end = int(end * 16_000)
+
+                # save segment:
+                torchaudio.save(os.path.join(output_dir, fname), resampled[:, start:end], 16_000)
+            s = -1
+
+
+def format_time(secs: float):
+    m, s = divmod(secs, 60)
+    h, m = divmod(m, 60)
+    return "%d:%02d:%02d,%03d" % (h, m, s, int(secs * 1000 % 1000))
+
+asr_model = EncoderASR.from_hparams(source="asafaya/hubert-large-arabic-transcribe")
+vad_model = VAD.from_hparams(source="speechbrain/vad-crdnn-libriparty")
+
+def main(filename, generate_srt=False):
+    try:
+        AudioSegment.from_file(filename)
+    except Exception:
+        return "Please upload a valid audio file"
+
+    outdir = os.path.join(tempdir, filename.split("/")[-1].split(".")[0])
+    if not os.path.exists(outdir):
+        os.mkdir(outdir)
+
+    print("Applying VAD to", filename)
+
+    # directory to save the VAD segments
+    segments_dir = os.path.join(outdir, "segments")
+    if os.path.exists(segments_dir):
+        raise Exception(f"Segments directory already exists: {segments_dir}")
+    os.mkdir(segments_dir)
+    print("Saving segments to", segments_dir)
+
+    for c, (tempaudio, resampled) in enumerate(read_and_resample(filename, outdir)):
+        print(f"Segmenting file: {filename}, with length: {resampled.shape[1] / 16_000:05.2f} secs: {tempaudio}")
+        segment_file(vad_model, os.path.basename(tempaudio), c, tempaudio, resampled, segments_dir)
+        # os.remove(tempaudio)
+
+    transcriptions = OrderedDict()
+    files = glob(os.path.join(segments_dir, "*.wav"))
+    print("Start transcribing")
+    for f in tqdm(sorted(files)):
+        try:
+            transcriptions[os.path.basename(f).replace(".wav", "")] = asr_model.transcribe_file(f)
+            # os.remove(os.path.basename(f))
+        except Exception as e:
+            print(e)
+            print("Error transcribing file {}".format(f))
+            print("Skipping...")
+
+    # shutil.rmtree(outdir)
+
+    fo = ""
+    for i, key in enumerate(transcriptions):
+        line = key
+
+        # segment names look like: segment-0-00148.72-00156.97
+        start_sec = float(line.split("-")[-2])
+        end_sec = float(line.split("-")[-1])
+        if len(line) < 2: continue
+
+        if generate_srt:
+            fo += ("{}\n".format(i+1))
+            fo += ("{} --> ".format(format_time(start_sec)))
+            fo += ("{}\n".format(format_time(end_sec)))
+
+        fo += ("{}\n".format(transcriptions[key]))
+        fo += ("\n") if generate_srt else ""
+
+    return fo
+
+outputs = gr.outputs.Textbox(label="Transcription")
+
+title = "Arabic Speech Transcription"
+description = "Simply upload your audio."
+
+gr.Interface(main, [gr.inputs.Audio(label="Arabic Audio File", type="filepath"), "checkbox"], outputs, title=title, description=description, enable_queue=True).launch()
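As a rough usage sketch (not part of this commit): the pipeline above can be exercised without the Gradio UI by calling main() directly on the bundled output.wav sample, assuming the gr.Interface(...).launch() line at the bottom of app.py is commented out or moved behind an `if __name__ == "__main__":` guard before importing.

    # hypothetical local smoke test; output.wav ships with this Space
    from app import main

    # generate_srt=True returns numbered "start --> end" blocks instead of plain lines
    srt_text = main("output.wav", generate_srt=True)
    print(srt_text)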
output.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f7174c9be1bd14e7bda67f0578c1c0f75a2270017065ea0e79381c6f406e005
+size 320078
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+speechbrain==0.5.13
+transformers==4.22.2
+pydub==0.25.1
+gradio==3.8.2