import os
import torch
import shutil
import librosa
import warnings
import numpy as np
import gradio as gr
import librosa.display
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from collections import Counter
from PIL import Image
from tqdm import tqdm
from model import net, MODEL_DIR

MODEL = net()
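# Assumption: net() returns a torch.nn.Module with its trained weights already
# loaded from MODEL_DIR; switching to eval mode keeps dropout and batch-norm
# layers deterministic during inference.
MODEL.eval()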
def most_common_element(input_list):
    counter = Counter(input_list)
    mce, _ = counter.most_common(1)[0]
    return mce
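# wav_to_mel slices the log-mel spectrogram of the recording into fixed-width
# chunks of roughly `width` seconds each and saves every chunk to ./tmp as a
# JPG, so the image classifier can later vote over all chunks.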
def wav_to_mel(audio_path: str, width=0.18):
    os.makedirs("./tmp", exist_ok=True)
    try:
        y, sr = librosa.load(audio_path, sr=48000)
        non_silent = y
        mel_spec = librosa.feature.melspectrogram(y=non_silent, sr=sr)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        dur = librosa.get_duration(y=non_silent, sr=sr)
        total_frames = log_mel_spec.shape[1]
        # Number of spectrogram frames that correspond to `width` seconds of audio.
        step = int(width * total_frames / dur)
        count = int(total_frames / step)
        # Center the chunks so leftover frames are split evenly at both ends.
        begin = int(0.5 * (total_frames - count * step))
        end = begin + step * count
        for i in tqdm(range(begin, end, step), desc="Converting wav to jpgs..."):
            librosa.display.specshow(log_mel_spec[:, i : i + step])
            plt.axis("off")
            plt.savefig(
                f"./tmp/{os.path.basename(audio_path)[:-4]}_{i}.jpg",
                bbox_inches="tight",
                pad_inches=0.0,
            )
            plt.close()
    except Exception as e:
        print(f"Error converting {audio_path} : {e}")
def embed_img(img_path, input_size=224):
    transform = transforms.Compose(
        [
            transforms.Resize([input_size, input_size]),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]
    )
    img = Image.open(img_path).convert("RGB")
    return transform(img).unsqueeze(0)
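# inference converts the uploaded recording into spectrogram chunks, classifies
# each chunk independently, and returns the audio file name together with the
# majority-vote piano label.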
def inference(wav_path, folder_path="./tmp"):
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
    if not wav_path:
        return None, "请输入音频 Please upload an audio file!"
    wav_to_mel(wav_path)
    outputs = []
    all_files = os.listdir(folder_path)
    for file_name in all_files:
        if file_name.lower().endswith(".jpg"):
            file_path = os.path.join(folder_path, file_name)
            img_tensor = embed_img(file_path)
            with torch.no_grad():
                output: torch.Tensor = MODEL(img_tensor)
            # Take the index of the highest logit as this chunk's predicted class.
            pred_id = torch.max(output.data, 1)[1].item()
            outputs.append(pred_id)
    max_count_item = most_common_element(outputs)
    shutil.rmtree(folder_path)
    return os.path.basename(wav_path), translate[classes[max_count_item]]
if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    translate = {
        "PearlRiver": "Pearl River",
        "YoungChang": "YOUNG CHANG",
        "Steinway-T": "STEINWAY Theater",
        "Hsinghai": "HSINGHAI",
        "Kawai": "KAWAI",
        "Steinway": "STEINWAY",
        "Kawai-G": "KAWAI Grand",
        "Yamaha": "YAMAHA",
    }
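    # Note: this assumes the key order of `translate` matches the class-index
    # order the model was trained with, since predictions index into `classes`.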
    classes = list(translate.keys())
    example_wavs = []
    for cls in classes:
        example_wavs.append(f"{MODEL_DIR}/examples/{cls}.wav")
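    # Build the Gradio demo: an Interface around `inference` plus a Markdown
    # citation block, wrapped in Blocks so both can be laid out together.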
    with gr.Blocks() as demo:
        gr.Interface(
            fn=inference,
            inputs=gr.Audio(
                type="filepath", label="上传钢琴录音 Upload a piano recording"
            ),
            outputs=[
                gr.Textbox(label="音频文件名 Audio filename", show_copy_button=True),
                gr.Textbox(
                    label="钢琴分类结果 Piano classification result",
                    show_copy_button=True,
                ),
            ],
            examples=example_wavs,
            cache_examples=False,
            allow_flagging="never",
            title="建议录音时长保持在 3s 左右, 过长会影响识别效率<br>It is recommended to keep the recording to about 3 s; much longer recordings will slow down recognition.",
        )
        gr.Markdown(
            """
# 引用 Cite
```bibtex
@inproceedings{Zhou2023AHE,
  author    = {Monan Zhou and Shangda Wu and Shaohua Ji and Zijin Li and Wei Li},
  title     = {A Holistic Evaluation of Piano Sound Quality},
  booktitle = {Proceedings of the 10th Conference on Sound and Music Technology (CSMT)},
  year      = {2023},
  publisher = {Springer Singapore},
  address   = {Singapore}
}
```"""
        )

    demo.launch()