birgermoell committed on
Commit
993f0db
1 Parent(s): 4bc4813

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +49 -0
  2. packages.txt +1 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
3
+ import torch
4
+ import numpy as np
5
+ import soundfile as sf
6
+ import io
7
+
8
# Page header: app title followed by a one-line usage hint.
st.title("Syllables per Second Calculator")
st.write(
    "Upload an audio file to calculate the number of 'p', 't', and 'k' syllables per second."
)
10
+
11
def get_syllables_per_second(audio_file):
    """Estimate how many 'p', 't' and 'k' phonemes are spoken per second.

    The recording is transcribed to phonemes with the
    ``facebook/wav2vec2-xlsr-53-espeak-cv-ft`` CTC model. Every decoded
    character equal to 'p', 't' or 'k' is counted, and the count is divided
    by the time between the start of the first match and the end of the
    last match.

    Args:
        audio_file: Binary file-like object (anything exposing ``read()``)
            containing audio that ``soundfile`` can decode.

    Returns:
        Matches per second, or 0 when nothing matched or the measured time
        span is empty.
    """
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")

    audio_input, sample_rate = sf.read(io.BytesIO(audio_file.read()))

    # Down-mix any multi-channel recording (not only stereo) to mono so the
    # processor always receives a single 1-D waveform.
    if audio_input.ndim > 1:
        audio_input = np.mean(audio_input, axis=1)

    # NOTE(review): the waveform is passed at its native rate; the checkpoint
    # presumably expects 16 kHz input, so other rates may reduce recognition
    # accuracy -- confirm, and resample upstream if needed.
    input_values = processor(audio_input, return_tensors="pt").input_values

    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids, output_char_offsets=True)
    offsets = transcription['char_offsets']

    # Character offsets are indices of CTC output frames, not audio samples.
    # One frame covers `inputs_to_logits_ratio` input samples, so this is the
    # duration of a single frame in seconds for the audio that was fed in.
    seconds_per_frame = model.config.inputs_to_logits_ratio / sample_rate

    # Keep only the offsets of the target phonemes.
    syllable_offsets = [item for item in offsets[0] if item['char'] in ('p', 't', 'k')]

    if syllable_offsets:  # at least one target phoneme was found
        first_syllable_time = syllable_offsets[0]['start_offset'] * seconds_per_frame
        last_syllable_time = syllable_offsets[-1]['end_offset'] * seconds_per_frame
        # Duration from the first to the last syllable
        syllable_duration = last_syllable_time - first_syllable_time
    else:
        syllable_duration = 0

    syllable_count = len(syllable_offsets)
    syllables_per_second = syllable_count / syllable_duration if syllable_duration > 0 else 0

    return syllables_per_second
43
+
44
# Upload widget; only WAV files are accepted.
uploaded_file = st.file_uploader("Choose an audio file", type=["wav"])

# Nothing is computed until the user has supplied a file.
if uploaded_file is not None:
    # Show a spinner while the audio is being analysed.
    with st.spinner("Processing the audio file..."):
        result = get_syllables_per_second(uploaded_file)
        st.write("Syllables per second: ", result)
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ espeak
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch
2
+ numpy
3
+ transformers
4
+ soundfile
5
+ phonemizer