Multi-pitch estimation, multi-pitch streaming, and voice assignment

Late/Deep, U-Net-Harm, VoasCNN, and VoasCLSTM on the Cantoría Dataset

This notebook contains the code for the figures and the evaluation in Appendix A of the dissertation:

Helena Cuesta. Data-driven Pitch Content Description of Choral Singing Recordings. PhD thesis, Universitat Pompeu Fabra, 2022 (to appear).

We compare the performance of Late/Deep, U-Net-Harm, VoasCNN, and VoasCLSTM, the models proposed for multi-pitch estimation, multi-pitch streaming, and voice assignment, applied to four-part vocal ensembles.

For this demo, we consider the song Hoy comamos y bebamos from the Cantoría Dataset.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import librosa
import librosa.display
import mir_eval

import os

from IPython.display import Audio, display
In [2]:
FS = 22050              # analysis sample rate (Hz)
HOPSIZE = 256           # hop length in samples (~11.6 ms at FS)
F_MIN = 32.7            # lowest CQT frequency (C1, in Hz)
NUM_FEATURES = 360      # total number of CQT bins
BINS_PER_OCTAVE = 60    # 5 bins per semitone
NUM_OCTAVES = 6
OVER_SAMPLE = 5         # NUM_OCTAVES * 12 * OVER_SAMPLE = NUM_FEATURES

def mf0_to_plot_array(times, freqs):
    '''Flatten per-frame frequency lists into parallel time and frequency
    lists for scatter plotting. Function written by R. Bittner.
    '''
    plot_times = []
    plot_freqs = []
    for t, freq in zip(times, freqs):
        for f in freq:
            plot_times.append(t)
            plot_freqs.append(f)
    return plot_times, plot_freqs

def compute_cqt(x):
    '''Magnitude CQT of a signal, in dB, min-max normalised to [0, 1].'''

    cqt = np.abs(librosa.cqt(
        y=x, sr=FS, hop_length=HOPSIZE, fmin=F_MIN, n_bins=NUM_FEATURES,
        bins_per_octave=BINS_PER_OCTAVE
    ))

    cqt_db = librosa.amplitude_to_db(cqt)
    cqt_db = (cqt_db - cqt_db.min()) / (cqt_db.max() - cqt_db.min())

    return cqt_db

def compute_hcqt(x):
    '''Harmonic CQT (HCQT): one CQT per harmonic h in [1..5] of F_MIN,
    each converted to dB and min-max normalised to [0, 1].
    Returns an array of shape (freq_bins, time_frames, harmonics).
    '''
    cqtm = []
    for h in [1, 2, 3, 4, 5]:
        C = librosa.cqt(y=x, sr=FS, hop_length=HOPSIZE,
                        fmin=F_MIN * h,
                        n_bins=(NUM_OCTAVES * OVER_SAMPLE * 12),
                        bins_per_octave=BINS_PER_OCTAVE)

        C, _ = librosa.magphase(C)

        C = librosa.amplitude_to_db(C)
        C = (C - C.min()) / (C.max() - C.min())
        cqtm.append(C)

    cqtm = np.asarray(cqtm).astype(np.float32)
    cqtm = np.moveaxis(cqtm, 0, -1)  # harmonic axis last

    return cqtm


def get_freq_grid():
    """Get the HCQT frequency grid."""
    freq_grid = librosa.cqt_frequencies(
        n_bins=NUM_OCTAVES * 12 * OVER_SAMPLE, fmin=F_MIN,
        bins_per_octave=BINS_PER_OCTAVE)
    return freq_grid


def get_time_grid(n_time_frames):
    """Get the HCQT time grid."""
    time_grid = librosa.frames_to_time(
        np.arange(n_time_frames), sr=FS, hop_length=HOPSIZE
    )
    return time_grid

def mpe_to_plot(est_times, est_freqs):
    '''Same flattening as mf0_to_plot_array above, but returns numpy arrays.'''

    times = []
    freqs = []

    for t, fqs in zip(est_times, est_freqs):
        for f in fqs:
            times.append(t)
            freqs.append(f)
    return np.array(times), np.array(freqs)
        
def plot_grids(n_frames, sr=FS):
    '''Frequency and time grids for plotting, matching the CQT
    settings defined above.
    '''
    freq_grid = librosa.cqt_frequencies(
        n_bins=NUM_FEATURES,
        fmin=F_MIN,
        bins_per_octave=BINS_PER_OCTAVE
    )

    time_grid = librosa.frames_to_time(
        frames=np.arange(n_frames),
        sr=sr,
        hop_length=HOPSIZE
    )

    return freq_grid, time_grid


def mono_prep(times, freqs):
    '''Convert a monophonic f0 track (0 or negative = unvoiced) into the
    mir_eval multipitch format: one (possibly empty) array per frame.
    '''
    freqs = list(freqs)
    for i, fqs in enumerate(freqs):
        if fqs <= 0:
            freqs[i] = np.array([])
        else:
            freqs[i] = np.array([fqs])

    return times, freqs
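
The mono_prep output plugs directly into mir_eval's multipitch metrics. Here is a minimal usage sketch, not part of the original notebook; the two CSV paths are hypothetical placeholders for a per-voice reference annotation and a model estimate.

In [ ]:
# Sketch with hypothetical file names: per-voice f0 stored as (time, frequency) CSVs.
ref_times, ref_f0 = mir_eval.io.load_time_series('soprano_ref.csv', delimiter=',')
est_times, est_f0 = mir_eval.io.load_time_series('soprano_est.csv', delimiter=',')

# Wrap each voiced frame's f0 in a length-1 array (empty when unvoiced),
# as mir_eval.multipitch expects.
ref_times, ref_freqs = mono_prep(ref_times, ref_f0)
est_times, est_freqs = mono_prep(est_times, est_f0)

scores = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)
print(scores['Precision'], scores['Recall'], scores['Accuracy'])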

Input song: Hoy comamos y bebamos

Cantoria_HCB_Mix.wav

In [3]:
# Load the full mix for listening; the analysis functions above use FS = 22050.
y, sr = librosa.load("./data/Audio/Cantoria_HCB_Mix.wav", sr=44100)
display(Audio(y, rate=sr))
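
To preview the input representation the models consume, here is a minimal sketch, not a cell from the original notebook: it resamples the mix to the analysis rate FS and plots its CQT with the helpers defined above.

In [ ]:
# Illustrative sketch: resample to the analysis rate and visualise the CQT.
y_22k = librosa.resample(y, orig_sr=sr, target_sr=FS)
cqt_db = compute_cqt(y_22k)

freq_grid, time_grid = plot_grids(cqt_db.shape[1])

plt.figure(figsize=(12, 4))
librosa.display.specshow(cqt_db, x_coords=time_grid, y_coords=freq_grid,
                         x_axis='time', y_axis='cqt_hz', sr=FS,
                         hop_length=HOPSIZE)
plt.title('CQT: Cantoria_HCB_Mix')
plt.tight_layout()
plt.show()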