This notebook contains the code for the figures and evaluation of Appendix A of the dissertation:
Helena Cuesta. Data-driven Pitch Content Description of Choral Singing Recordings. PhD thesis, Universitat Pompeu Fabra. 2022 (to appear).
We compare the performance of Late/Deep, U-Net-Harm, and VoasCNN — models proposed for multi-pitch estimation, multi-pitch streaming, and voice assignment, respectively — applied to four-part vocal ensembles.
For this demo, we consider the song "Hoy comamos y bebamos" from the Cantoría Dataset.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import librosa
import librosa.display
import mir_eval
import os
from IPython.display import Audio, display
FS = 22050
HOPSIZE = 256
F_MIN = 32.7
NUM_FEATURES = 360
BINS_PER_OCTAVE = 60
NUM_OCTAVES = 6
OVER_SAMPLE = 5
def mf0_to_plot_array(times, freqs):
'''Function written by R. Bittner
'''
plot_times = []
plot_freqs = []
for t, freq in zip(times, freqs):
for f in freq:
plot_times.append(t)
plot_freqs.append(f)
return plot_times, plot_freqs
def compute_cqt(x):
cqt = np.abs(librosa.cqt(
y=x, sr=FS, hop_length=HOPSIZE, fmin=F_MIN, n_bins=NUM_FEATURES,
bins_per_octave=BINS_PER_OCTAVE
))
cqt_db = librosa.amplitude_to_db(cqt)
cqt_db = (cqt_db - cqt_db.min()) / (cqt_db.max() - cqt_db.min())
return cqt_db
def compute_hcqt(x):
cqtm = []
for h in [1,2,3,4,5]:
C = librosa.cqt(y=x, sr=FS, hop_length=HOPSIZE,
fmin=F_MIN * h,
n_bins=(NUM_OCTAVES * OVER_SAMPLE * 12),
bins_per_octave=BINS_PER_OCTAVE)
C, P = librosa.magphase(C)
C = librosa.amplitude_to_db(C)
C = (C - C.min()) / (C.max() - C.min())
cqtm.append(C)
cqtm = np.asarray(cqtm).astype(np.float32)
cqtm = np.moveaxis(cqtm, 0, -1)
return cqtm
def get_freq_grid():
"""Get the hcqt frequency grid
"""
freq_grid = librosa.cqt_frequencies(
NUM_OCTAVES * 12 * OVER_SAMPLE, F_MIN, bins_per_octave=BINS_PER_OCTAVE)
return freq_grid
def get_time_grid(n_time_frames):
"""Get the hcqt time grid
"""
time_grid = librosa.core.frames_to_time(
range(n_time_frames), sr=FS, hop_length=HOPSIZE
)
return time_grid
def mpe_to_plot(est_times, est_freqs):
times = []
freqs = []
for t, fqs in zip(est_times, est_freqs):
for f in fqs:
times.append(t)
freqs.append(f)
return np.array(times), np.array(freqs)
def plot_grids(n_frames, sr=22050):
freq_grid = librosa.cqt_frequencies(
n_bins=360,
fmin=32.7,
bins_per_octave=60
)
time_grid = librosa.frames_to_time(
frames = np.arange(n_frames),
sr=sr,
hop_length=256
)
return freq_grid, time_grid
def mono_prep(times, freqs):
freqs = list(freqs)
for i, (tms, fqs) in enumerate(zip(times, freqs)):
if fqs <= 0:
freqs[i] = np.array([])
else:
freqs[i] = np.array([fqs])
return times, freqs
Audio example: Cantoria_HCB_Mix.wav
y, sr = librosa.load("./data/Audio/Cantoria_HCB_Mix.wav", sr=44100)
display(Audio(y, rate=sr))