ylacombe's picture
Upload 17 files
db36668 verified
import torch
import penn
# Here we'll use a 10 millisecond hopsize
hopsize = .01
# Provide a sensible frequency range given your domain and model
fmin = 30.
fmax = 1000.
# Select a checkpoint to use for inference. Selecting None will
# download and use FCNF0++ pretrained on MDB-stem-synth and PTDB
checkpoint = None
# Centers frames at hopsize / 2, 3 * hopsize / 2, 5 * hopsize / 2, ...
center = 'half-hop'
# (Optional) Linearly interpolate unvoiced regions below periodicity threshold
interp_unvoiced_at = .065
def pitch_apply(batch, rank=None, audio_column_name="audio", output_column_name="utterance_pitch", penn_batch_size=4096):
if isinstance(batch[audio_column_name], list):
utterance_pitch_mean = []
utterance_pitch_std = []
for sample in batch[audio_column_name]:
# Infer pitch and periodicity
pitch, periodicity = penn.from_audio(
torch.tensor(sample["array"][None, :]).float(),
sample["sampling_rate"],
hopsize=hopsize,
fmin=fmin,
fmax=fmax,
checkpoint=checkpoint,
batch_size=penn_batch_size,
center=center,
interp_unvoiced_at=interp_unvoiced_at,
gpu=(rank or 0)% torch.cuda.device_count() if rank else rank
)
utterance_pitch_mean.append(pitch.mean().cpu())
utterance_pitch_std.append(pitch.std().cpu())
batch[f"{output_column_name}_mean"] = utterance_pitch_mean
batch[f"{output_column_name}_std"] = utterance_pitch_std
else:
sample = batch[audio_column_name]
pitch, periodicity = penn.from_audio(
torch.tensor(sample["array"][None, :]).float(),
sample["sampling_rate"],
hopsize=hopsize,
fmin=fmin,
fmax=fmax,
checkpoint=checkpoint,
batch_size=penn_batch_size,
center=center,
interp_unvoiced_at=interp_unvoiced_at,
gpu=(rank or 0)% torch.cuda.device_count() if rank else rank
)
batch[f"{output_column_name}_mean"] = pitch.mean().cpu()
batch[f"{output_column_name}_std"] = pitch.std().cpu()
return batch