clr committed on
Commit
e8949c5
·
1 Parent(s): 62e3c32

Update graph.py

Browse files
Files changed (1) hide show
  1. graph.py +33 -14
graph.py CHANGED
@@ -4,17 +4,17 @@ from scipy import signal
4
  import librosa
5
  import subprocess
6
  import matplotlib.pyplot as plt
 
7
 
8
 
9
-
10
- def readwav(wav_path):
11
- wav, sr = sf.read(wav_path, dtype=np.float32)
12
- if len(wav.shape) == 2:
13
- wav = wav.mean(1)
14
  if sr != 16000:
15
- wlen = int(wav.shape[0] / sr * 16000)
16
- wav = signal.resample(wav, wlen)
17
- return wav
18
 
19
 
20
  def normalise_transcript(xcp):
@@ -25,7 +25,20 @@ def normalise_transcript(xcp):
25
 
26
 
27
 
28
- def get_pitch_tracks(wav_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  print('FILE PATH:', wav_path)
30
  f0_data = subprocess.run(["REAPER/build/reaper", "-i", wav_path, '-f', '/dev/stdout', '-a'],capture_output=True).stdout
31
  print('PLAIN:',f0_data)
@@ -35,19 +48,25 @@ def get_pitch_tracks(wav_path):
35
  #print(f0_data)
36
  f0_data = [l.split(' ') for l in f0_data[:-1]] # the last line is other info
37
  f0_data = [ [float(t), float(f)] for t,v,f in f0_data if v=='1']
 
 
 
 
38
  return f0_data
39
 
40
 
 
 
41
  # transcript could be from a corpus with the wav file,
42
  # input by the user,
43
  # or from a previous speech recognition process
44
- def align_and_graph(wav_path, transcript, aligner_function):
45
 
46
  plt.close('all')
47
-
48
 
49
  # fetch data
50
- speech = readwav(wav_path)
51
  w_align, seg_align = aligner_function(speech,normalise_transcript(transcript))
52
 
53
 
@@ -55,7 +74,7 @@ def align_and_graph(wav_path, transcript, aligner_function):
55
  rec_start = w_align[0][1]
56
  rec_end = w_align[-1][2]
57
 
58
- f0_data = get_pitch_tracks(wav_path)
59
  if f0_data:
60
  f_max = max([f0 for t,f0 in f0_data]) + 50
61
  else:
@@ -85,7 +104,7 @@ def align_and_graph(wav_path, transcript, aligner_function):
85
 
86
 
87
 
88
- w, sr = librosa.load(wav_path)
89
  fr_l = 2048 # librosa default
90
  h_l = 512 # default
91
  rmse = librosa.feature.rms(y=w, frame_length = fr_l, hop_length = h_l)
 
4
  import librosa
5
  import subprocess
6
  import matplotlib.pyplot as plt
7
+ from pydub import AudioSegment
8
 
9
 
10
def readaud(sound_path):
    """Read an audio file and return mono float32 samples at 16 kHz.

    Multi-channel audio is collapsed to mono by averaging the channels,
    and anything not already at 16 kHz is resampled with scipy.
    """
    samples, rate = sf.read(sound_path, dtype=np.float32)
    # soundfile yields a (frames, channels) array for multi-channel input
    if len(samples.shape) == 2:
        samples = samples.mean(1)
    if rate != 16000:
        # scale the sample count proportionally to land exactly on 16 kHz
        new_len = int(samples.shape[0] / rate * 16000)
        samples = signal.resample(samples, new_len)
    return samples
18
 
19
 
20
  def normalise_transcript(xcp):
 
25
 
26
 
27
 
28
+ def get_pitch_tracks(sound_path):
29
+
30
# NOTE(review): split('.')[-1] returns the extension WITHOUT the leading dot
# (e.g. 'wav'), so the `== '.wav'` / `!= '.wav'` comparisons below can never
# be true — every input, including .wav files, takes the pydub conversion path
# and the temporary file is always created and removed.
+ orig_ftype = sound_path.split('.')[-1]
31
+
32
+ if orig_ftype == '.wav':
33
+ wav_path = sound_path
34
+
35
+ else:
36
+ aud_data = AudioSegment.from_file(sound_path, orig_ftype)
37
# NOTE(review): the stdout of `pwd` ends with a trailing newline, which gets
# embedded in tmp_path; os.getcwd() would avoid both the subprocess call and
# the newline — confirm before relying on this path.
+ curdir = subprocess.run(["pwd"], capture_output=True, text=True).stdout
38
+ tmp_path = f'{curdir}/tmp.wav'
39
+ aud_data.export(tmp_path, format="wav")
40
+ wav_path = tmp_path
41
+
42
  print('FILE PATH:', wav_path)
43
  f0_data = subprocess.run(["REAPER/build/reaper", "-i", wav_path, '-f', '/dev/stdout', '-a'],capture_output=True).stdout
44
  print('PLAIN:',f0_data)

48
  #print(f0_data)
49
  f0_data = [l.split(' ') for l in f0_data[:-1]] # the last line is other info
50
  f0_data = [ [float(t), float(f)] for t,v,f in f0_data if v=='1']
51
+
52
+ if orig_ftype != '.wav':
53
+ subprocess.run(["rm", tmp_path])
54
+
55
  return f0_data
56
 
57
 
58
+
59
+
60
  # transcript could be from a corpus with the wav file,
61
  # input by the user,
62
  # or from a previous speech recognition process
63
+ def align_and_graph(sound_path, transcript, aligner_function):
64
 
65
  plt.close('all')
66
+
67
 
68
  # fetch data
69
+ speech = readaud(sound_path)
70
  w_align, seg_align = aligner_function(speech,normalise_transcript(transcript))
71
 
72
 
 
74
  rec_start = w_align[0][1]
75
  rec_end = w_align[-1][2]
76
 
77
+ f0_data = get_pitch_tracks(sound_path)
78
  if f0_data:
79
  f_max = max([f0 for t,f0 in f0_data]) + 50
80
  else:
 
104
 
105
 
106
 
107
+ w, sr = librosa.load(sound_path)
108
  fr_l = 2048 # librosa default
109
  h_l = 512 # default
110
  rmse = librosa.feature.rms(y=w, frame_length = fr_l, hop_length = h_l)