clr committed on
Commit
e8949c5
·
1 Parent(s): 62e3c32

Update graph.py

Browse files
Files changed (1) hide show
  1. graph.py +33 -14
graph.py CHANGED
@@ -4,17 +4,17 @@ from scipy import signal
4
  import librosa
5
  import subprocess
6
  import matplotlib.pyplot as plt
 
7
 
8
 
9
-
10
- def readwav(wav_path):
11
- wav, sr = sf.read(wav_path, dtype=np.float32)
12
- if len(wav.shape) == 2:
13
- wav = wav.mean(1)
14
  if sr != 16000:
15
- wlen = int(wav.shape[0] / sr * 16000)
16
- wav = signal.resample(wav, wlen)
17
- return wav
18
 
19
 
20
  def normalise_transcript(xcp):
@@ -25,7 +25,20 @@ def normalise_transcript(xcp):
25
 
26
 
27
 
28
- def get_pitch_tracks(wav_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  print('FILE PATH:', wav_path)
30
  f0_data = subprocess.run(["REAPER/build/reaper", "-i", wav_path, '-f', '/dev/stdout', '-a'],capture_output=True).stdout
31
  print('PLAIN:',f0_data)
@@ -35,19 +48,25 @@ def get_pitch_tracks(wav_path):
35
  #print(f0_data)
36
  f0_data = [l.split(' ') for l in f0_data[:-1]] # the last line is other info
37
  f0_data = [ [float(t), float(f)] for t,v,f in f0_data if v=='1']
 
 
 
 
38
  return f0_data
39
 
40
 
 
 
41
  # transcript could be from a corpus with the wav file,
42
  # input by the user,
43
  # or from a previous speech recognition process
44
- def align_and_graph(wav_path, transcript, aligner_function):
45
 
46
  plt.close('all')
47
-
48
 
49
  # fetch data
50
- speech = readwav(wav_path)
51
  w_align, seg_align = aligner_function(speech,normalise_transcript(transcript))
52
 
53
 
@@ -55,7 +74,7 @@ def align_and_graph(wav_path, transcript, aligner_function):
55
  rec_start = w_align[0][1]
56
  rec_end = w_align[-1][2]
57
 
58
- f0_data = get_pitch_tracks(wav_path)
59
  if f0_data:
60
  f_max = max([f0 for t,f0 in f0_data]) + 50
61
  else:
@@ -85,7 +104,7 @@ def align_and_graph(wav_path, transcript, aligner_function):
85
 
86
 
87
 
88
- w, sr = librosa.load(wav_path)
89
  fr_l = 2048 # librosa default
90
  h_l = 512 # default
91
  rmse = librosa.feature.rms(y=w, frame_length = fr_l, hop_length = h_l)
 
4
  import librosa
5
  import subprocess
6
  import matplotlib.pyplot as plt
7
+ from pydub import AudioSegment
8
 
9
 
10
def readaud(sound_path):
    """Read an audio file and return mono float32 samples at 16 kHz.

    Multi-channel audio is collapsed to mono by averaging the channels,
    and anything not already at 16 kHz is resampled with scipy.
    """
    samples, rate = sf.read(sound_path, dtype=np.float32)
    # soundfile yields a (frames, channels) array for multi-channel input
    if len(samples.shape) == 2:
        samples = samples.mean(1)
    if rate != 16000:
        # scale the sample count proportionally to land exactly on 16 kHz
        new_len = int(samples.shape[0] / rate * 16000)
        samples = signal.resample(samples, new_len)
    return samples
18
 
19
 
20
  def normalise_transcript(xcp):
 
25
 
26
 
27
 
28
+ def get_pitch_tracks(sound_path):
29
+
30
# NOTE(review): split('.')[-1] returns the extension WITHOUT the leading dot
# (e.g. 'wav'), so the `== '.wav'` / `!= '.wav'` comparisons below can never
# be true — every input, including .wav files, takes the pydub conversion path
# and the temporary file is always created and removed.
+ orig_ftype = sound_path.split('.')[-1]
31
+
32
+ if orig_ftype == '.wav':
33
+ wav_path = sound_path
34
+
35
+ else:
36
+ aud_data = AudioSegment.from_file(sound_path, orig_ftype)
37
# NOTE(review): the stdout of `pwd` ends with a trailing newline, which gets
# embedded in tmp_path; os.getcwd() would avoid both the subprocess call and
# the newline — confirm before relying on this path.
+ curdir = subprocess.run(["pwd"], capture_output=True, text=True).stdout
38
+ tmp_path = f'{curdir}/tmp.wav'
39
+ aud_data.export(tmp_path, format="wav")
40
+ wav_path = tmp_path
41
+
42
  print('FILE PATH:', wav_path)
43
  f0_data = subprocess.run(["REAPER/build/reaper", "-i", wav_path, '-f', '/dev/stdout', '-a'],capture_output=True).stdout
44
  print('PLAIN:',f0_data)

48
  #print(f0_data)
49
  f0_data = [l.split(' ') for l in f0_data[:-1]] # the last line is other info
50
  f0_data = [ [float(t), float(f)] for t,v,f in f0_data if v=='1']
51
+
52
+ if orig_ftype != '.wav':
53
+ subprocess.run(["rm", tmp_path])
54
+
55
  return f0_data
56
 
57
 
58
+
59
+
60
  # transcript could be from a corpus with the wav file,
61
  # input by the user,
62
  # or from a previous speech recognition process
63
+ def align_and_graph(sound_path, transcript, aligner_function):
64
 
65
  plt.close('all')
66
+
67
 
68
  # fetch data
69
+ speech = readaud(sound_path)
70
  w_align, seg_align = aligner_function(speech,normalise_transcript(transcript))
71
 
72
 
 
74
  rec_start = w_align[0][1]
75
  rec_end = w_align[-1][2]
76
 
77
+ f0_data = get_pitch_tracks(sound_path)
78
  if f0_data:
79
  f_max = max([f0 for t,f0 in f0_data]) + 50
80
  else:
 
104
 
105
 
106
 
107
+ w, sr = librosa.load(sound_path)
108
  fr_l = 2048 # librosa default
109
  h_l = 512 # default
110
  rmse = librosa.feature.rms(y=w, frame_length = fr_l, hop_length = h_l)