Multi-pitch estimation, multi-pitch streaming, and voice assignment

Late/Deep, U-Net-Harm, VoasCNN, and VoasCLSTM on the Cantoría Dataset

This notebook contains the code for the figures and evaluation of Appendix A of the dissertation:

Helena Cuesta. Data-driven Pitch Content Description of Choral Singing Recordings. PhD thesis, Universitat Pompeu Fabra. 2022 (to appear).

We compare the performance of Late/Deep, U-Net-Harm, VoasCNN, and VoasCLSTM, the models proposed in the dissertation for multi-pitch estimation, multi-pitch streaming, and voice assignment, applied to four-part vocal ensembles.

For this demo, we consider the song Hoy comamos y bebamos from the Cantoría Dataset.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import librosa
import librosa.display
import mir_eval

import os

from IPython.display import Audio, display
In [2]:
FS = 22050
HOPSIZE = 256
F_MIN = 32.7
NUM_FEATURES = 360
BINS_PER_OCTAVE = 60
NUM_OCTAVES = 6
OVER_SAMPLE = 5

def mf0_to_plot_array(times, freqs):
    '''Function written by R. Bittner
    '''
    plot_times = []
    plot_freqs = []
    for t, freq in zip(times, freqs):
        for f in freq:
            plot_times.append(t)
            plot_freqs.append(f)
    return plot_times, plot_freqs

def compute_cqt(x):
    ## magnitude CQT of the input signal
    cqt = np.abs(librosa.cqt(
            y=x, sr=FS, hop_length=HOPSIZE, fmin=F_MIN, n_bins=NUM_FEATURES,
        bins_per_octave=BINS_PER_OCTAVE
        ))

    ## convert to dB and min-max normalize to [0, 1]
    cqt_db = librosa.amplitude_to_db(cqt)
    cqt_db = (cqt_db - cqt_db.min()) / (cqt_db.max() - cqt_db.min())
    
    return cqt_db

def compute_hcqt(x):
    ## harmonic CQT: one CQT per harmonic h of F_MIN, stacked along the last axis
    cqtm = []
    for h in [1, 2, 3, 4, 5]:
        C = librosa.cqt(y=x, sr=FS, hop_length=HOPSIZE,
                fmin=F_MIN * h,
                n_bins=(NUM_OCTAVES * OVER_SAMPLE * 12),
                bins_per_octave=BINS_PER_OCTAVE)

        C, P = librosa.magphase(C)

        ## convert to dB and min-max normalize each harmonic layer to [0, 1]
        C = librosa.amplitude_to_db(C)
        C = (C - C.min()) / (C.max() - C.min())
        cqtm.append(C)

    cqtm = np.asarray(cqtm).astype(np.float32)
    cqtm = np.moveaxis(cqtm, 0, -1)  # shape: (freq_bins, n_frames, n_harmonics)

    return cqtm


def get_freq_grid():
    """Get the hcqt frequency grid
    """
    freq_grid = librosa.cqt_frequencies(
        n_bins=NUM_OCTAVES * 12 * OVER_SAMPLE, fmin=F_MIN,
        bins_per_octave=BINS_PER_OCTAVE)
    return freq_grid


def get_time_grid(n_time_frames):
    """Get the hcqt time grid
    """
    time_grid = librosa.core.frames_to_time(
        range(n_time_frames), sr=FS, hop_length=HOPSIZE
    )
    return time_grid

def mpe_to_plot(est_times, est_freqs):
    
    times = []
    freqs = []
    
    for t, fqs in zip(est_times, est_freqs):
        for f in fqs:
            times.append(t)
            freqs.append(f)
    return np.array(times), np.array(freqs)
        
def plot_grids(n_frames, sr=22050):
    
    freq_grid = librosa.cqt_frequencies(
        n_bins=NUM_FEATURES,
        fmin=F_MIN,
        bins_per_octave=BINS_PER_OCTAVE
    )
    
    time_grid = librosa.frames_to_time(
        frames=np.arange(n_frames),
        sr=sr,
        hop_length=HOPSIZE
    )
    
    return freq_grid, time_grid


def mono_prep(times, freqs):
    """Convert a monophonic F0 track into mir_eval's ragged multi-pitch
    format: one (possibly empty) array of active F0 values per frame.
    """
    freqs = list(freqs)
    for i, fqs in enumerate(freqs):
        if fqs <= 0:
            freqs[i] = np.array([])
        else:
            freqs[i] = np.array([fqs])
    
    return times, freqs

Input song: Hoy comamos y bebamos

Cantoria_HCB_Mix.wav

In [3]:
## load at FS (22050 Hz) so the sampling rate matches the one assumed by compute_cqt/compute_hcqt
y, sr = librosa.load("./data/Audio/Cantoria_HCB_Mix.wav", sr=FS)
display(Audio(y, rate=sr))

Input features viz

In [4]:
cqt = compute_cqt(y)
hcqt = compute_hcqt(y)
In [5]:
fig, ax = plt.subplots(2, 1, figsize=(15,7))

img = librosa.display.specshow(cqt, sr=FS, hop_length=HOPSIZE, x_axis="time", 
                               y_axis="cqt_hz", bins_per_octave=BINS_PER_OCTAVE, ax=ax[0])

ax[0].tick_params(axis='both', labelsize=15)
ax[0].set_ylabel(ylabel="Frequency (Hz)", fontsize=20)
ax[0].set_xlabel(xlabel="")

img = librosa.display.specshow(cqt, sr=FS, hop_length=HOPSIZE, x_axis="time", 
                               y_axis="cqt_hz", bins_per_octave=BINS_PER_OCTAVE, ax=ax[1])

ax[1].tick_params(axis='both', labelsize=15)
ax[1].set_ylabel(ylabel="Frequency (Hz)", fontsize=20)
ax[1].set_xlabel(xlabel="Time (sec)", fontsize=20)
ax[1].set_xlim([5, 15])

plt.tight_layout()
plt.savefig("HCB_input_cqt.png", dpi=200)
In [6]:
fig, ax = plt.subplots(2, 2, figsize=(15,7))

img = librosa.display.specshow(hcqt[:,:,0], sr=FS, hop_length=HOPSIZE, x_axis="time", 
                               y_axis="cqt_hz", bins_per_octave=BINS_PER_OCTAVE, ax=ax[0,0])

ax[0,0].tick_params(axis='both', labelsize=15)
ax[0,0].set_ylabel(ylabel="Frequency (Hz)", fontsize=20)
ax[0,0].set_xlabel(xlabel="")
ax[0,0].set_title("|H[1]|", fontsize=20)

img = librosa.display.specshow(hcqt[:,:,0], sr=FS, hop_length=HOPSIZE, x_axis="time", 
                               y_axis="cqt_hz", bins_per_octave=BINS_PER_OCTAVE, ax=ax[1,0])

ax[1,0].tick_params(axis='both', labelsize=15)
ax[1,0].set_ylabel(ylabel="Frequency (Hz)", fontsize=20)
ax[1,0].set_xlabel(xlabel="Time (sec)", fontsize=20)
ax[1,0].set_xlim([5, 15])


img = librosa.display.specshow(hcqt[:,:,2], sr=FS, hop_length=HOPSIZE, x_axis="time", 
                               y_axis="cqt_hz", fmin=32.7*3, bins_per_octave=BINS_PER_OCTAVE, ax=ax[0,1])

ax[0,1].tick_params(axis='both', labelsize=15)
ax[0,1].set_ylabel(ylabel="Frequency (Hz)", fontsize=20)
ax[0,1].set_xlabel(xlabel="")
ax[0,1].set_title("|H[3]|", fontsize=20)

img = librosa.display.specshow(hcqt[:,:,2], sr=FS, hop_length=HOPSIZE, x_axis="time", 
                               y_axis="cqt_hz",  fmin=32.7*3, bins_per_octave=BINS_PER_OCTAVE, ax=ax[1,1])

ax[1,1].tick_params(axis='both', labelsize=15)
ax[1,1].set_ylabel(ylabel="Frequency (Hz)", fontsize=20)
ax[1,1].set_xlabel(xlabel="Time (sec)", fontsize=20)
ax[1,1].set_xlim([5, 15])


plt.tight_layout()
plt.savefig("HCB_input_hcqt.png", dpi=200)

Reference F0 labels

The reference F0 trajectories were automatically extracted with pYIN and then manually corrected.
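
As a reference for how such per-voice trajectories can be obtained, here is a minimal sketch using librosa's pyin implementation. The per-voice stem path, the frequency range, and the CSV layout are assumptions for illustration, not the exact pipeline used to build the dataset.

## hypothetical per-voice stem path; pyin is run on each singer separately
y_s, _ = librosa.load("./data/Audio/Cantoria_HCB_S.wav", sr=FS)

f0, voiced_flag, voiced_prob = librosa.pyin(
    y_s, fmin=librosa.note_to_hz("C2"), fmax=librosa.note_to_hz("C6"),
    sr=FS, hop_length=HOPSIZE)

times = librosa.times_like(f0, sr=FS, hop_length=HOPSIZE)
f0 = np.where(voiced_flag, f0, 0.0)  # 0 Hz marks unvoiced frames, as in the CSVs below
np.savetxt("Cantoria_HCB_S_pyin.csv", np.vstack([times, f0]).T, delimiter=",")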

In [7]:
## load f0 labels
ref_soprano = pd.read_csv("./data/F0_manual/Cantoria_HCB_S.csv", header=None).values
ref_alto = pd.read_csv("./data/F0_manual/Cantoria_HCB_A.csv", header=None).values
ref_tenor = pd.read_csv("./data/F0_manual/Cantoria_HCB_T.csv", header=None).values
ref_bass = pd.read_csv("./data/F0_manual/Cantoria_HCB_B.csv", header=None).values

fig, ax = plt.subplots(1, 1, figsize=(15, 7))

ax.plot(ref_soprano[:, 0], ref_soprano[:, 1], ".", markersize=5, color="tab:orange", label="Soprano")
ax.plot(ref_alto[:, 0], ref_alto[:, 1], ".", markersize=5, color="tab:green", label="Alto")
ax.plot(ref_tenor[:, 0], ref_tenor[:, 1], ".", markersize=5, color="tab:red", label="Tenor")
ax.plot(ref_bass[:, 0], ref_bass[:, 1], ".", markersize=5, color="tab:blue", label="Bass")

ax.set_ylabel("Frequency (Hz)", fontsize=20)
ax.set_xlabel("Time (sec)", fontsize=20)
ax.tick_params(axis="both", labelsize=15)
ax.set_ylim([80, 500])
ax.legend(fontsize=15, ncol=4, markerscale=3)

plt.savefig("HCB_F0_reference_full.png", dpi=200)

fig, ax = plt.subplots(1, 1, figsize=(15, 7))

ax.plot(ref_soprano[:, 0], ref_soprano[:, 1], ".", markersize=5, color="tab:orange", label="Soprano")
ax.plot(ref_alto[:, 0], ref_alto[:, 1], ".", markersize=5, color="tab:green", label="Alto")
ax.plot(ref_tenor[:, 0], ref_tenor[:, 1], ".", markersize=5, color="tab:red", label="Tenor")
ax.plot(ref_bass[:, 0], ref_bass[:, 1], ".", markersize=5, color="tab:blue", label="Bass")

ax.set_ylabel("Frequency (Hz)", fontsize=20)
ax.set_xlabel("Time (sec)", fontsize=20)
ax.tick_params(axis="both", labelsize=15)
ax.set_ylim([80, 500])
ax.legend(fontsize=15, ncol=4, markerscale=3)
ax.set_xlim([5, 15])
plt.savefig("HCB_F0_reference_zoom.png", dpi=200)

Multiple F0 estimation with Late/Deep

Figures with the polyphonic pitch salience representation estimated by Late/Deep, and the resulting multi-pitch stream after post-processing, plotted on top of the reference F0 labels.

For better visualization, we zoom the x-axis to the segment between 5 and 15 seconds.
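
The post-processed multi-pitch CSV is loaded precomputed below. For illustration, here is a minimal sketch of a typical peak-picking decoder over a salience map; the helper name salience_to_mpe and the 0.3 threshold are assumptions, not necessarily the post-processing used here.

import scipy.signal

def salience_to_mpe(salience, thresh=0.3):
    ## per-frame peak picking: every salience peak above `thresh` becomes
    ## one F0 estimate for that frame
    freq_grid, time_grid = plot_grids(salience.shape[1])
    est_freqs = []
    for frame in salience.T:
        peaks, _ = scipy.signal.find_peaks(frame, height=thresh)
        est_freqs.append(freq_grid[peaks])
    return time_grid, est_freqs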

In [8]:
## load Late/Deep prediction, previously computed

ld_salience = np.load("./data/LD/Cantoria_HCB_ld_mpe.npy")
freq_grid, time_grid = plot_grids(ld_salience.shape[1])

fig, ax = plt.subplots(1, 1, figsize=(15, 7))
img = librosa.display.specshow(ld_salience, sr=22050, hop_length=256, x_axis="time",
                               y_axis="cqt_hz", bins_per_octave=60, ax=ax)
ax.tick_params(axis='both', labelsize=15)
ax.set_ylabel(ylabel="Frequency (Hz)", fontsize=20)
ax.set_xlabel(xlabel="Time (sec)", fontsize=20)
ax.set_xlim([5, 15])
plt.tight_layout()
plt.savefig("HCB_salience_ld_zoom.png", dpi=200)

## load post-processed prediction
pred_times_, pred_freqs_ = mir_eval.io.load_ragged_time_series("./data/LD/Cantoria_HCB_ld_mpe.csv")

pred_times, pred_freqs = mpe_to_plot(pred_times_, pred_freqs_)


## plot references
fig, ax = plt.subplots(1, 1, figsize=(15, 7))

ax.plot(ref_soprano[:, 0], ref_soprano[:, 1], ".", markersize=10, color="tab:blue", label="Soprano")
ax.plot(ref_alto[:, 0], ref_alto[:, 1], ".", markersize=10, color="darkgreen", label="Alto")
ax.plot(ref_tenor[:, 0], ref_tenor[:, 1], ".", markersize=10, color="darkred", label="Tenor")
ax.plot(ref_bass[:, 0], ref_bass[:, 1], ".", markersize=10, color="darkorange", label="Bass")


## plot Late/Deep prediction
ax.plot(pred_times, pred_freqs, ".", markersize=6, color="black", label="L/D prediction")


ax.set_ylabel("Frequency (Hz)", fontsize=20)
ax.set_xlabel("Time (sec)", fontsize=20)
ax.tick_params(axis="both", labelsize=15)
ax.set_ylim([80, 500])
ax.legend(fontsize=15, ncol=5, markerscale=3)

ax.set_xlim([5, 15])
plt.tight_layout()
plt.savefig("HCB_F0_ld_zoom.png", dpi=200)

Numerical evaluation

Using mir_eval's multi-pitch evaluation metrics.

In [9]:
## construct the multi-pitch reference following mir_eval's ragged format:
## one array of active F0 values per time frame
ref_freqs = list(np.vstack([
    ref_soprano[:,1],
    ref_alto[:, 1],
    ref_tenor[:, 1],
    ref_bass[:, 1]
]).transpose())

ref_times = ref_soprano[:,0]
## drop unvoiced (0 Hz) entries from each reference frame
for i, fqs in enumerate(ref_freqs):
    ref_freqs[i] = np.array([f for f in fqs if f > 0])

## review predictions: discard spurious estimates below 20 Hz

pred_freqs_ = list(pred_freqs_)
for i, fqs in enumerate(pred_freqs_):
    if fqs.size >= 1:
        pred_freqs_[i] = np.array([f for f in fqs if f > 20])



metrics = mir_eval.multipitch.evaluate(ref_times, ref_freqs, pred_times_, pred_freqs_)
metrics["F-Score"] = 2*(metrics["Precision"] * metrics["Recall"]) / (metrics["Precision"] + metrics["Recall"])


print("MPE Results:\n")
print("Precision={}".format(metrics["Precision"]))
print("Recall={}".format(metrics["Recall"]))
print("Accuracy={}".format(metrics["Accuracy"]))
print("F-Score={}".format(metrics["F-Score"]))
/Users/helenacuesta/anaconda3/envs/mf0/lib/python3.6/site-packages/mir_eval/multipitch.py:410: UserWarning: Estimate times not equal to reference times. Resampling to common time base.
  warnings.warn("Estimate times not equal to reference times. "
MPE Results:

Precision=0.963500061447708
Recall=0.8762713758801833
Accuracy=0.8481176979662484
F-Score=0.9178178412549753

Multiple F0 streaming with U-Net-Harm

Figures with the four per-voice monophonic pitch salience representations estimated by U-Net-Harm, and the resulting F0 contours after post-processing.

For better visualization, we zoom the x-axis to the segment between 5 and 15 seconds.
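
The per-voice F0 CSVs are also loaded precomputed below. A plausible decoder for a monophonic salience map is a per-frame argmax with a voicing threshold; this sketch and its 0.2 threshold are assumptions, not necessarily the decoding used here.

def salience_to_f0(salience, thresh=0.2):
    ## per-frame argmax; frames whose maximum salience falls below `thresh`
    ## are marked unvoiced (0 Hz)
    freq_grid, time_grid = plot_grids(salience.shape[1])
    f0 = freq_grid[salience.argmax(axis=0)]
    f0[salience.max(axis=0) < thresh] = 0.0
    return time_grid, f0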

In [10]:
sop_sal = np.load("./data/Unetharm/Cantoria_HCB_unh_s.npy")
alt_sal = np.load("./data/Unetharm/Cantoria_HCB_unh_a.npy")
ten_sal = np.load("./data/Unetharm/Cantoria_HCB_unh_t.npy")
bass_sal = np.load("./data/Unetharm/Cantoria_HCB_unh_b.npy")

## plot predicted saliences
fig, ax = plt.subplots(2, 2, figsize=(15, 10))

### soprano \hat{Y}_S
img = librosa.display.specshow(sop_sal, sr=22050, hop_length=256, x_axis="time",
                               y_axis="cqt_hz", bins_per_octave=60, ax=ax[0,0])
ax[0,0].tick_params(axis='both', labelsize=15)
ax[0,0].set_ylabel(ylabel="Frequency (Hz)", fontsize=20)
ax[0,0].set_xlabel(xlabel="")

### alto \hat{Y}_A
img = librosa.display.specshow(alt_sal, sr=22050, hop_length=256, x_axis="time",
                               y_axis="cqt_hz", bins_per_octave=60, ax=ax[0,1])
ax[0,1].tick_params(axis='both', labelsize=15)
ax[0,1].set_ylabel(ylabel="")
ax[0,1].set_xlabel(xlabel="")

### tenor \hat{Y}_T
img = librosa.display.specshow(ten_sal, sr=22050, hop_length=256, x_axis="time",
                               y_axis="cqt_hz", bins_per_octave=60, ax=ax[1,0])
ax[1,0].tick_params(axis='both', labelsize=15)
ax[1,0].set_ylabel(ylabel="Frequency (Hz)", fontsize=20)
ax[1,0].set_xlabel(xlabel="Time (sec)", fontsize=20)

### bass \hat{Y}_B
img = librosa.display.specshow(bass_sal, sr=22050, hop_length=256, x_axis="time",
                               y_axis="cqt_hz", bins_per_octave=60, ax=ax[1,1])
ax[1,1].tick_params(axis='both', labelsize=15)
ax[1,1].set_ylabel(ylabel="")

ax[1,1].set_xlabel(xlabel="Time (sec)", fontsize=20)

ax[0,0].set_xlim([5, 15])
ax[0,1].set_xlim([5, 15])
ax[1,0].set_xlim([5, 15])
ax[1,1].set_xlim([5, 15])

plt.tight_layout()

plt.savefig("HCB_salience_unetharm_zoom.png", dpi=200)
In [11]:
sop_f0 = pd.read_csv("./data/Unetharm/Cantoria_HCB_unh_s.csv", header=None).values
alt_f0 = pd.read_csv("./data/Unetharm/Cantoria_HCB_unh_a.csv", header=None).values
ten_f0 = pd.read_csv("./data/Unetharm/Cantoria_HCB_unh_t.csv", header=None).values
bass_f0 = pd.read_csv("./data/Unetharm/Cantoria_HCB_unh_b.csv", header=None).values

## plot predicted F0 contours against the reference F0 labels

fig, ax = plt.subplots(1, 1, figsize=(15, 7))

### plot reference 
ax.plot(ref_soprano[:, 0], ref_soprano[:, 1], ".", markersize=10, color="darkblue", label="Soprano")
ax.plot(ref_alto[:, 0], ref_alto[:, 1], ".", markersize=10, color="darkgreen", label="Alto")
ax.plot(ref_tenor[:, 0], ref_tenor[:, 1], ".", markersize=10, color="darkred", label="Tenor")
ax.plot(ref_bass[:, 0], ref_bass[:, 1], ".", markersize=10, color="darkorange", label="Bass")

### plot predictions
ax.plot(sop_f0[:, 0], sop_f0[:, 1], ".", markersize=6, color="lightskyblue", label="Pred. Soprano")
ax.plot(alt_f0[:, 0], alt_f0[:, 1], ".", markersize=6, color="yellowgreen", label="Pred. Alto")
ax.plot(ten_f0[:, 0], ten_f0[:, 1], ".", markersize=6, color="red", label="Pred. Tenor")
ax.plot(bass_f0[:, 0], bass_f0[:, 1], ".", markersize=6, color="gold", label="Pred. Bass")

ax.set_ylabel("Frequency (Hz)", fontsize=20)
ax.set_xlabel("Time (sec)", fontsize=20)
ax.tick_params(axis="both", labelsize=15)
ax.set_ylim([80, 500])
ax.legend(fontsize=15, ncol=4, markerscale=3)

ax.set_xlim([5, 15])
plt.tight_layout()
plt.savefig("HCB_F0_unetharm_zoom.png", dpi=200)

Numerical evaluation
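
The four per-voice blocks below repeat the same pattern. As a reference, a sketch of an equivalent helper; eval_voice is not part of the original notebook.

def eval_voice(name, ref, est_times, est_f0):
    ## melody extraction metrics
    mel = mir_eval.melody.evaluate(ref[:, 0], ref[:, 1], est_times, est_f0)
    ## multi-pitch metrics on the same monophonic tracks
    r_t, r_f = mono_prep(ref[:, 0], ref[:, 1])
    e_t, e_f = mono_prep(est_times, est_f0)
    mpe = mir_eval.multipitch.evaluate(r_t, r_f, e_t, e_f)
    p, r = mpe["Precision"], mpe["Recall"]
    mpe["F-Score"] = 2 * p * r / (p + r)
    print("{} Results Melody:\nRPA={}\nOverall Acc={}\n".format(
        name, mel["Raw Pitch Accuracy"], mel["Overall Accuracy"]))
    print("{} Results MPE:\nPrecision={}\nRecall={}\nAccuracy={}\nF-Score={}\n".format(
        name, p, r, mpe["Accuracy"], mpe["F-Score"]))
    return mel, mpe

## e.g. eval_voice("SOPRANO", ref_soprano, sop_f0[:, 0], sop_f0[:, 1])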

In [13]:
## soprano eval
### melody extraction metrics
soprano_metrics = mir_eval.melody.evaluate(
    ref_soprano[:,0],
    ref_soprano[:,1],
    sop_f0[:,0],
    sop_f0[:,1]
)

### multipitch metrics
ref_times, ref_freqs = mono_prep(ref_soprano[:,0], ref_soprano[:,1])
est_times, est_freqs = mono_prep(sop_f0[:,0], sop_f0[:,1])

soprano_metrics_mpe = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)

soprano_metrics_mpe["F-Score"] = 2*(soprano_metrics_mpe["Precision"] * soprano_metrics_mpe["Recall"]) / \
(soprano_metrics_mpe["Precision"] + soprano_metrics_mpe["Recall"])


print("SOPRANO Results Melody:")
print("RPA={}".format(soprano_metrics["Raw Pitch Accuracy"]))
print("Overall Acc={}".format(soprano_metrics["Overall Accuracy"]))
print("\n")
print("SOPRANO Results MPE:")
print("Precision={}".format(soprano_metrics_mpe["Precision"]))
print("Recall={}".format(soprano_metrics_mpe["Recall"]))
print("Accuracy={}".format(soprano_metrics_mpe["Accuracy"]))
print("F-Score={}".format(soprano_metrics_mpe["F-Score"]))

print("\n\n")

## alto eval
### melody extraction metrics
alto_metrics = mir_eval.melody.evaluate(
    ref_alto[:,0],
    ref_alto[:,1],
    alt_f0[:,0],
    alt_f0[:,1]
)

### multipitch metrics
ref_times, ref_freqs = mono_prep(ref_alto[:,0], ref_alto[:,1])
est_times, est_freqs = mono_prep(alt_f0[:,0], alt_f0[:,1])

alto_metrics_mpe = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)

alto_metrics_mpe["F-Score"] = 2*(alto_metrics_mpe["Precision"] * alto_metrics_mpe["Recall"]) / \
(alto_metrics_mpe["Precision"] + alto_metrics_mpe["Recall"])


print("ALTO Results Melody:")
print("RPA={}".format(alto_metrics["Raw Pitch Accuracy"]))
print("Overall Acc={}".format(alto_metrics["Overall Accuracy"]))
print("\n")
print("ALTO Results MPE:")
print("Precision={}".format(alto_metrics_mpe["Precision"]))
print("Recall={}".format(alto_metrics_mpe["Recall"]))
print("Accuracy={}".format(alto_metrics_mpe["Accuracy"]))
print("F-Score={}".format(alto_metrics_mpe["F-Score"]))

print("\n\n")

## tenor eval
### melody extraction metrics
tenor_metrics = mir_eval.melody.evaluate(
    ref_tenor[:,0],
    ref_tenor[:,1],
    ten_f0[:,0],
    ten_f0[:,1]
)

### multipitch metrics
ref_times, ref_freqs = mono_prep(ref_tenor[:,0], ref_tenor[:,1])
est_times, est_freqs = mono_prep(ten_f0[:,0], ten_f0[:,1])

tenor_metrics_mpe = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)

tenor_metrics_mpe["F-Score"] = 2*(tenor_metrics_mpe["Precision"] * tenor_metrics_mpe["Recall"]) / \
(tenor_metrics_mpe["Precision"] + tenor_metrics_mpe["Recall"])


print("TENOR Results Melody:")
print("RPA={}".format(tenor_metrics["Raw Pitch Accuracy"]))
print("Overall Acc={}".format(tenor_metrics["Overall Accuracy"]))
print("\n")
print("TENOR Results MPE:")
print("Precision={}".format(tenor_metrics_mpe["Precision"]))
print("Recall={}".format(tenor_metrics_mpe["Recall"]))
print("Accuracy={}".format(tenor_metrics_mpe["Accuracy"]))
print("F-Score={}".format(tenor_metrics_mpe["F-Score"]))

print("\n\n")

## Bass eval
### melody extraction metrics
bass_metrics = mir_eval.melody.evaluate(
    ref_bass[:,0],
    ref_bass[:,1],
    bass_f0[:,0],
    bass_f0[:,1]
)

### multipitch metrics
ref_times, ref_freqs = mono_prep(ref_bass[:,0], ref_bass[:,1])
est_times, est_freqs = mono_prep(bass_f0[:,0], bass_f0[:,1])

bass_metrics_mpe = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)

bass_metrics_mpe["F-Score"] = 2*(bass_metrics_mpe["Precision"] * bass_metrics_mpe["Recall"]) / \
(bass_metrics_mpe["Precision"] + bass_metrics_mpe["Recall"])


print("BASS Results Melody:")
print("RPA={}".format(bass_metrics["Raw Pitch Accuracy"]))
print("Overall Acc={}".format(bass_metrics["Overall Accuracy"]))
print("\n")
print("BASS Results MPE:")
print("Precision={}".format(bass_metrics_mpe["Precision"]))
print("Recall={}".format(bass_metrics_mpe["Recall"]))
print("Accuracy={}".format(bass_metrics_mpe["Accuracy"]))
print("F-Score={}".format(bass_metrics_mpe["F-Score"]))
/Users/helenacuesta/anaconda3/envs/mf0/lib/python3.6/site-packages/mir_eval/multipitch.py:410: UserWarning: Estimate times not equal to reference times. Resampling to common time base.
  warnings.warn("Estimate times not equal to reference times. "
SOPRANO Results Melody:
RPA=0.6623771642489471
Overall Acc=0.7132822477650064


SOPRANO Results MPE:
Precision=0.8004484304932735
Recall=0.6682264857276556
Accuracy=0.5728038507821901
F-Score=0.7283856159143076



ALTO Results Melody:
RPA=0.6982131039046989
Overall Acc=0.7407407407407407


ALTO Results MPE:
Precision=0.7090182141270547
Recall=0.7041694242223693
Accuracy=0.5462947116207427
F-Score=0.7065855008301052



TENOR Results Melody:
RPA=0.7336909871244636
Overall Acc=0.7557471264367817


TENOR Results MPE:
Precision=0.8000925497454882
Recall=0.7420600858369099
Accuracy=0.6259956553222302
F-Score=0.7699844132709863



BASS Results Melody:
RPA=0.8549807996385814
Overall Acc=0.8590357598978289


BASS Results MPE:
Precision=0.8634290925672594
Recall=0.8554325728484301
Accuracy=0.7534818941504178
F-Score=0.8594122319301032

Late/Deep + Voice Assignment

We use the Late/Deep output as input to the proposed VA models, VoasCNN and VoasCLSTM, and plot the outputs as we did for U-Net-Harm above.

Late/Deep + VoasCNN

In [14]:
sop_sal = np.load("./data/VA/VoasCNN/Cantoria_HCB_ld_mpe_s.npy")
alt_sal = np.load("./data/VA/VoasCNN/Cantoria_HCB_ld_mpe_a.npy")
ten_sal = np.load("./data/VA/VoasCNN/Cantoria_HCB_ld_mpe_t.npy")
bass_sal = np.load("./data/VA/VoasCNN/Cantoria_HCB_ld_mpe_b.npy")

## plot predicted saliences
fig, ax = plt.subplots(2, 2, figsize=(15, 10))

### soprano \hat{Y}_S
img = librosa.display.specshow(sop_sal, sr=22050, hop_length=256, x_axis="time",
                               y_axis="cqt_hz", bins_per_octave=60, ax=ax[0,0])
ax[0,0].tick_params(axis='both', labelsize=15)
ax[0,0].set_ylabel(ylabel="Frequency (Hz)", fontsize=20)
ax[0,0].set_xlabel(xlabel="")

### alto \hat{Y}_A
img = librosa.display.specshow(alt_sal, sr=22050, hop_length=256, x_axis="time",
                               y_axis="cqt_hz", bins_per_octave=60, ax=ax[0,1])
ax[0,1].tick_params(axis='both', labelsize=15)
ax[0,1].set_ylabel(ylabel="")
ax[0,1].set_xlabel(xlabel="")

### tenor \hat{Y}_T
img = librosa.display.specshow(ten_sal, sr=22050, hop_length=256, x_axis="time",
                               y_axis="cqt_hz", bins_per_octave=60, ax=ax[1,0])
ax[1,0].tick_params(axis='both', labelsize=15)
ax[1,0].set_ylabel(ylabel="Frequency (Hz)", fontsize=20)
ax[1,0].set_xlabel(xlabel="Time (sec)", fontsize=20)

### bass \hat{Y}_B
img = librosa.display.specshow(bass_sal, sr=22050, hop_length=256, x_axis="time",
                               y_axis="cqt_hz", bins_per_octave=60, ax=ax[1,1])
ax[1,1].tick_params(axis='both', labelsize=15)
ax[1,1].set_ylabel(ylabel="")

ax[1,1].set_xlabel(xlabel="Time (sec)", fontsize=20)

ax[0,0].set_xlim([5, 15])
ax[0,1].set_xlim([5, 15])
ax[1,0].set_xlim([5, 15])
ax[1,1].set_xlim([5, 15])

plt.tight_layout()

plt.savefig("HCB_salience_voascnn_zoom.png", dpi=200)
In [15]:
## read VoasCNN output F0s (each column in the CSV file corresponds to one voice)

f0s = pd.read_csv("./data/VA/VoasCNN/Cantoria_HCB_ld_mpe.csv", header=None).values
times = f0s[:, 0]
sop_f0 = f0s[:, 1]
alt_f0 = f0s[:, 2]
ten_f0 = f0s[:, 3]
bass_f0 = f0s[:, 4]

## plot predicted F0 contours against the reference F0 labels

fig, ax = plt.subplots(1, 1, figsize=(15, 7))

### plot reference 
ax.plot(ref_soprano[:, 0], ref_soprano[:, 1], ".", markersize=10, color="darkblue", label="Soprano")
ax.plot(ref_alto[:, 0], ref_alto[:, 1], ".", markersize=10, color="darkgreen", label="Alto")
ax.plot(ref_tenor[:, 0], ref_tenor[:, 1], ".", markersize=10, color="darkred", label="Tenor")
ax.plot(ref_bass[:, 0], ref_bass[:, 1], ".", markersize=10, color="darkorange", label="Bass")


### plot predictions
ax.plot(times, sop_f0, ".", markersize=6, color="lightskyblue", label="Pred. Soprano")
ax.plot(times, alt_f0, "x", markersize=6, color="yellowgreen", label="Pred. Alto")
ax.plot(times, ten_f0, ".", markersize=6, color="red", label="Pred. Tenor")
ax.plot(times, bass_f0, ".", markersize=6, color="gold", label="Pred. Bass")

ax.set_ylabel("Frequency (Hz)", fontsize=20)
ax.set_xlabel("Time (sec)", fontsize=20)
ax.tick_params(axis="both", labelsize=15)
ax.set_ylim([80, 500])
ax.legend(fontsize=15, ncol=4, markerscale=3)

ax.set_xlim([5, 15])
plt.tight_layout()

plt.savefig("HCB_F0_voascnn_zoom.png", dpi=200)

Numerical evaluation

In [16]:
## soprano eval
### melody extraction metrics
soprano_metrics = mir_eval.melody.evaluate(
    ref_soprano[:,0],
    ref_soprano[:,1],
    times,
    sop_f0
)

### multipitch metrics
ref_times, ref_freqs = mono_prep(ref_soprano[:,0], ref_soprano[:,1])
est_times, est_freqs = mono_prep(times, sop_f0)

soprano_metrics_mpe = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)

soprano_metrics_mpe["F-Score"] = 2*(soprano_metrics_mpe["Precision"] * soprano_metrics_mpe["Recall"]) / \
(soprano_metrics_mpe["Precision"] + soprano_metrics_mpe["Recall"])


print("SOPRANO Results Melody:")
print("RPA={}".format(soprano_metrics["Raw Pitch Accuracy"]))
print("Overall Acc={}".format(soprano_metrics["Overall Accuracy"]))
print("\n")
print("SOPRANO Results MPE:")
print("Precision={}".format(soprano_metrics_mpe["Precision"]))
print("Recall={}".format(soprano_metrics_mpe["Recall"]))
print("Accuracy={}".format(soprano_metrics_mpe["Accuracy"]))
print("F-Score={}".format(soprano_metrics_mpe["F-Score"]))

print("\n\n")

## alto eval
### melody extraction metrics
alto_metrics = mir_eval.melody.evaluate(
    ref_alto[:,0],
    ref_alto[:,1],
    times,
    alt_f0
)

### multipitch metrics
ref_times, ref_freqs = mono_prep(ref_alto[:,0], ref_alto[:,1])
est_times, est_freqs = mono_prep(times, alt_f0)

alto_metrics_mpe = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)

alto_metrics_mpe["F-Score"] = 2*(alto_metrics_mpe["Precision"] * alto_metrics_mpe["Recall"]) / \
(alto_metrics_mpe["Precision"] + alto_metrics_mpe["Recall"])


print("ALTO Results Melody:")
print("RPA={}".format(alto_metrics["Raw Pitch Accuracy"]))
print("Overall Acc={}".format(alto_metrics["Overall Accuracy"]))
print("\n")
print("ALTO Results MPE:")
print("Precision={}".format(alto_metrics_mpe["Precision"]))
print("Recall={}".format(alto_metrics_mpe["Recall"]))
print("Accuracy={}".format(alto_metrics_mpe["Accuracy"]))
print("F-Score={}".format(alto_metrics_mpe["F-Score"]))

print("\n\n")

## tenor eval
### melody extraction metrics
tenor_metrics = mir_eval.melody.evaluate(
    ref_tenor[:,0],
    ref_tenor[:,1],
    times,
    ten_f0
)

### multipitch metrics
ref_times, ref_freqs = mono_prep(ref_tenor[:,0], ref_tenor[:,1])
est_times, est_freqs = mono_prep(times, ten_f0)

tenor_metrics_mpe = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)

tenor_metrics_mpe["F-Score"] = 2*(tenor_metrics_mpe["Precision"] * tenor_metrics_mpe["Recall"]) / \
(tenor_metrics_mpe["Precision"] + tenor_metrics_mpe["Recall"])


print("TENOR Results Melody:")
print("RPA={}".format(tenor_metrics["Raw Pitch Accuracy"]))
print("Overall Acc={}".format(tenor_metrics["Overall Accuracy"]))
print("\n")
print("TENOR Results MPE:")
print("Precision={}".format(tenor_metrics_mpe["Precision"]))
print("Recall={}".format(tenor_metrics_mpe["Recall"]))
print("Accuracy={}".format(tenor_metrics_mpe["Accuracy"]))
print("F-Score={}".format(tenor_metrics_mpe["F-Score"]))

print("\n\n")

## Bass eval
### melody extraction metrics
bass_metrics = mir_eval.melody.evaluate(
    ref_bass[:,0],
    ref_bass[:,1],
    times,
    bass_f0
)

### multipitch metrics
ref_times, ref_freqs = mono_prep(ref_bass[:,0], ref_bass[:,1])
est_times, est_freqs = mono_prep(times, bass_f0)

bass_metrics_mpe = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)

bass_metrics_mpe["F-Score"] = 2*(bass_metrics_mpe["Precision"] * bass_metrics_mpe["Recall"]) / \
(bass_metrics_mpe["Precision"] + bass_metrics_mpe["Recall"])


print("BASS Results Melody:")
print("RPA={}".format(bass_metrics["Raw Pitch Accuracy"]))
print("Overall Acc={}".format(bass_metrics["Overall Accuracy"]))
print("\n")
print("BASS Results MPE:")
print("Precision={}".format(bass_metrics_mpe["Precision"]))
print("Recall={}".format(bass_metrics_mpe["Recall"]))
print("Accuracy={}".format(bass_metrics_mpe["Accuracy"]))
print("F-Score={}".format(bass_metrics_mpe["F-Score"]))
/Users/helenacuesta/anaconda3/envs/mf0/lib/python3.6/site-packages/mir_eval/multipitch.py:410: UserWarning: Estimate times not equal to reference times. Resampling to common time base.
  warnings.warn("Estimate times not equal to reference times. "
SOPRANO Results Melody:
RPA=0.812821712681329
Overall Acc=0.8314176245210728


SOPRANO Results MPE:
Precision=0.871896722939424
Recall=0.8217126813289658
Accuracy=0.7331941544885178
F-Score=0.8460611900746807



ALTO Results Melody:
RPA=0.3816457092433267
Overall Acc=0.5106960408684547


ALTO Results MPE:
Precision=0.41439427805096113
Recall=0.40900066181336864
Accuracy=0.2591919474346428
F-Score=0.41167980459642506



TENOR Results Melody:
RPA=0.5815450643776824
Overall Acc=0.6567688378033205


TENOR Results MPE:
Precision=0.7542005420054201
Recall=0.5972103004291845
Accuracy=0.49991018501886114
F-Score=0.6665868263473054



BASS Results Melody:
RPA=0.7544612604472555
Overall Acc=0.7878352490421456


BASS Results MPE:
Precision=0.8852114525873391
Recall=0.7612378585949853
Accuracy=0.6928453947368421
F-Score=0.8185572018460043

Late/Deep + VoasCLSTM

In [17]:
sop_sal = np.load("./data/VA/VoasCLSTM/Cantoria_HCB_ld_mpe_s.npy")
alt_sal = np.load("./data/VA/VoasCLSTM/Cantoria_HCB_ld_mpe_a.npy")
ten_sal = np.load("./data/VA/VoasCLSTM/Cantoria_HCB_ld_mpe_t.npy")
bass_sal = np.load("./data/VA/VoasCLSTM/Cantoria_HCB_ld_mpe_b.npy")

## plot predicted saliences
fig, ax = plt.subplots(2, 2, figsize=(15, 10))

### soprano \hat{Y}_S
img = librosa.display.specshow(sop_sal, sr=22050, hop_length=256, x_axis="time",
                               y_axis="cqt_hz", bins_per_octave=60, ax=ax[0,0])
ax[0,0].tick_params(axis='both', labelsize=15)
ax[0,0].set_ylabel(ylabel="Frequency (Hz)", fontsize=20)
ax[0,0].set_xlabel(xlabel="")

### alto \hat{Y}_A
img = librosa.display.specshow(alt_sal, sr=22050, hop_length=256, x_axis="time",
                               y_axis="cqt_hz", bins_per_octave=60, ax=ax[0,1])
ax[0,1].tick_params(axis='both', labelsize=15)
ax[0,1].set_ylabel(ylabel="")
ax[0,1].set_xlabel(xlabel="")

### tenor \hat{Y}_T
img = librosa.display.specshow(ten_sal, sr=22050, hop_length=256, x_axis="time",
                               y_axis="cqt_hz", bins_per_octave=60, ax=ax[1,0])
ax[1,0].tick_params(axis='both', labelsize=15)
ax[1,0].set_ylabel(ylabel="Frequency (Hz)", fontsize=20)
ax[1,0].set_xlabel(xlabel="Time (sec)", fontsize=20)

### bass \hat{Y}_B
img = librosa.display.specshow(bass_sal, sr=22050, hop_length=256, x_axis="time",
                               y_axis="cqt_hz", bins_per_octave=60, ax=ax[1,1])
ax[1,1].tick_params(axis='both', labelsize=15)
ax[1,1].set_ylabel(ylabel="")

ax[1,1].set_xlabel(xlabel="Time (sec)", fontsize=20)

ax[0,0].set_xlim([5, 15])
ax[0,1].set_xlim([5, 15])
ax[1,0].set_xlim([5, 15])
ax[1,1].set_xlim([5, 15])

plt.tight_layout()

plt.savefig("HCB_salience_voasclstm_zoom.png", dpi=200)
In [18]:
## read VoasCLSTM output F0s (each column in the CSV file corresponds to one voice)

f0s = pd.read_csv("./data/VA/VoasCLSTM/Cantoria_HCB_ld_mpe.csv", header=None).values
times = f0s[:, 0]
sop_f0 = f0s[:, 1]
alt_f0 = f0s[:, 2]
ten_f0 = f0s[:, 3]
bass_f0 = f0s[:, 4]

## plot predicted F0 contours against the reference F0 labels

fig, ax = plt.subplots(1, 1, figsize=(15, 7))

### plot reference 
ax.plot(ref_soprano[:, 0], ref_soprano[:, 1], ".", markersize=10, color="darkblue", label="Soprano")
ax.plot(ref_alto[:, 0], ref_alto[:, 1], ".", markersize=10, color="darkgreen", label="Alto")
ax.plot(ref_tenor[:, 0], ref_tenor[:, 1], ".", markersize=10, color="darkred", label="Tenor")
ax.plot(ref_bass[:, 0], ref_bass[:, 1], ".", markersize=10, color="darkorange", label="Bass")


### plot predictions
ax.plot(times, sop_f0, ".", markersize=6, color="lightskyblue", label="Pred. Soprano")
ax.plot(times, alt_f0, "x", markersize=6, color="yellowgreen", label="Pred. Alto")
ax.plot(times, ten_f0, ".", markersize=6, color="red", label="Pred. Tenor")
ax.plot(times, bass_f0, ".", markersize=6, color="gold", label="Pred. Bass")

ax.set_ylabel("Frequency (Hz)", fontsize=20)
ax.set_xlabel("Time (sec)", fontsize=20)
ax.tick_params(axis="both", labelsize=15)
ax.set_ylim([80, 500])
ax.legend(fontsize=15, ncol=4, markerscale=3)

ax.set_xlim([5, 15])
plt.tight_layout()

plt.savefig("HCB_F0_voasclstm_zoom.png", dpi=200)

Numerical evaluation

In [19]:
## soprano eval
### melody extraction metrics
soprano_metrics = mir_eval.melody.evaluate(
    ref_soprano[:,0],
    ref_soprano[:,1],
    times,
    sop_f0
)

### multipitch metrics
ref_times, ref_freqs = mono_prep(ref_soprano[:,0], ref_soprano[:,1])
est_times, est_freqs = mono_prep(times, sop_f0)

soprano_metrics_mpe = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)

soprano_metrics_mpe["F-Score"] = 2*(soprano_metrics_mpe["Precision"] * soprano_metrics_mpe["Recall"]) / \
(soprano_metrics_mpe["Precision"] + soprano_metrics_mpe["Recall"])


print("SOPRANO Results Melody:")
print("RPA={}".format(soprano_metrics["Raw Pitch Accuracy"]))
print("Overall Acc={}".format(soprano_metrics["Overall Accuracy"]))
print("\n")
print("SOPRANO Results MPE:")
print("Precision={}".format(soprano_metrics_mpe["Precision"]))
print("Recall={}".format(soprano_metrics_mpe["Recall"]))
print("Accuracy={}".format(soprano_metrics_mpe["Accuracy"]))
print("F-Score={}".format(soprano_metrics_mpe["F-Score"]))

print("\n\n")

## alto eval
### melody extraction metrics
alto_metrics = mir_eval.melody.evaluate(
    ref_alto[:,0],
    ref_alto[:,1],
    times,
    alt_f0
)

### multipitch metrics
ref_times, ref_freqs = mono_prep(ref_alto[:,0], ref_alto[:,1])
est_times, est_freqs = mono_prep(times, alt_f0)

alto_metrics_mpe = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)

alto_metrics_mpe["F-Score"] = 2*(alto_metrics_mpe["Precision"] * alto_metrics_mpe["Recall"]) / \
(alto_metrics_mpe["Precision"] + alto_metrics_mpe["Recall"])


print("ALTO Results Melody:")
print("RPA={}".format(alto_metrics["Raw Pitch Accuracy"]))
print("Overall Acc={}".format(alto_metrics["Overall Accuracy"]))
print("\n")
print("ALTO Results MPE:")
print("Precision={}".format(alto_metrics_mpe["Precision"]))
print("Recall={}".format(alto_metrics_mpe["Recall"]))
print("Accuracy={}".format(alto_metrics_mpe["Accuracy"]))
print("F-Score={}".format(alto_metrics_mpe["F-Score"]))

print("\n\n")

## tenor eval
### melody extraction metrics
tenor_metrics = mir_eval.melody.evaluate(
    ref_tenor[:,0],
    ref_tenor[:,1],
    times,
    ten_f0
)

### multipitch metrics
ref_times, ref_freqs = mono_prep(ref_tenor[:,0], ref_tenor[:,1])
est_times, est_freqs = mono_prep(times, ten_f0)

tenor_metrics_mpe = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)

tenor_metrics_mpe["F-Score"] = 2*(tenor_metrics_mpe["Precision"] * tenor_metrics_mpe["Recall"]) / \
(tenor_metrics_mpe["Precision"] + tenor_metrics_mpe["Recall"])


print("TENOR Results Melody:")
print("RPA={}".format(tenor_metrics["Raw Pitch Accuracy"]))
print("Overall Acc={}".format(tenor_metrics["Overall Accuracy"]))
print("\n")
print("TENOR Results MPE:")
print("Precision={}".format(tenor_metrics_mpe["Precision"]))
print("Recall={}".format(tenor_metrics_mpe["Recall"]))
print("Accuracy={}".format(tenor_metrics_mpe["Accuracy"]))
print("F-Score={}".format(tenor_metrics_mpe["F-Score"]))

print("\n\n")

## Bass eval
### melody extraction metrics
bass_metrics = mir_eval.melody.evaluate(
    ref_bass[:,0],
    ref_bass[:,1],
    times,
    bass_f0
)

### multipitch metrics
ref_times, ref_freqs = mono_prep(ref_bass[:,0], ref_bass[:,1])
est_times, est_freqs = mono_prep(times, bass_f0)

bass_metrics_mpe = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)

bass_metrics_mpe["F-Score"] = 2*(bass_metrics_mpe["Precision"] * bass_metrics_mpe["Recall"]) / \
(bass_metrics_mpe["Precision"] + bass_metrics_mpe["Recall"])


print("BASS Results Melody:")
print("RPA={}".format(bass_metrics["Raw Pitch Accuracy"]))
print("Overall Acc={}".format(bass_metrics["Overall Accuracy"]))
print("\n")
print("BASS Results MPE:")
print("Precision={}".format(bass_metrics_mpe["Precision"]))
print("Recall={}".format(bass_metrics_mpe["Recall"]))
print("Accuracy={}".format(bass_metrics_mpe["Accuracy"]))
print("F-Score={}".format(bass_metrics_mpe["F-Score"]))
/Users/helenacuesta/anaconda3/envs/mf0/lib/python3.6/site-packages/mir_eval/multipitch.py:410: UserWarning: Estimate times not equal to reference times. Resampling to common time base.
  warnings.warn("Estimate times not equal to reference times. "
SOPRANO Results Melody:
RPA=0.6413196069255966
Overall Acc=0.7335568326947637


SOPRANO Results MPE:
Precision=0.9403389830508475
Recall=0.6490407112774919
Accuracy=0.623370786516854
F-Score=0.7679955703211517



ALTO Results Melody:
RPA=0.414074564306199
Overall Acc=0.5287356321839081


ALTO Results MPE:
Precision=0.44462025316455694
Recall=0.4339289653651004
Accuracy=0.28140200286123035
F-Score=0.4392095567712404



TENOR Results Melody:
RPA=0.5675965665236051
Overall Acc=0.6566091954022989


TENOR Results MPE:
Precision=0.6569908079342042
Recall=0.582832618025751
Accuracy=0.4468575189206976
F-Score=0.617693882192404



BASS Results Melody:
RPA=0.7000225886604924
Overall Acc=0.7796934865900383


BASS Results MPE:
Precision=0.9775980087118855
Recall=0.7097357126722386
Accuracy=0.6983774172038231
F-Score=0.8224054443135714

Post-processing VoasCNN's output with Viterbi decoding

In [33]:
sop_sal = np.load("./data/VA/VoasCNN/Cantoria_HCB_ld_mpe_s.npy")
alt_sal = np.load("./data/VA/VoasCNN/Cantoria_HCB_ld_mpe_a.npy")
ten_sal = np.load("./data/VA/VoasCNN/Cantoria_HCB_ld_mpe_t.npy")
bass_sal = np.load("./data/VA/VoasCNN/Cantoria_HCB_ld_mpe_b.npy")
In [43]:
## the VoasCNN CSV is only used here for its time stamps; the pitch values
## come from the Viterbi decoding above

f0s = pd.read_csv("./data/VA/VoasCNN/Cantoria_HCB_ld_mpe.csv", header=None).values
times = f0s[:, 0]
sop_f0 = predicted_pitch_sop
alt_f0 = predicted_pitch_alt
ten_f0 = predicted_pitch_ten
bass_f0 = predicted_pitch_bass

## plot predicted F0 contours against the reference F0 labels

fig, ax = plt.subplots(1, 1, figsize=(15, 7))

### plot reference 
ax.plot(ref_soprano[:, 0], ref_soprano[:, 1], ".", markersize=10, color="darkblue", label="Soprano")
ax.plot(ref_alto[:, 0], ref_alto[:, 1], ".", markersize=10, color="darkgreen", label="Alto")
ax.plot(ref_tenor[:, 0], ref_tenor[:, 1], ".", markersize=10, color="darkred", label="Tenor")
ax.plot(ref_bass[:, 0], ref_bass[:, 1], ".", markersize=10, color="darkorange", label="Bass")


### plot predictions
ax.plot(times, predicted_pitch_sop, ".", markersize=6, color="lightskyblue", label="Pred. Soprano")
ax.plot(times, predicted_pitch_alt, "x", markersize=6, color="yellowgreen", label="Pred. Alto")
ax.plot(times, predicted_pitch_ten, ".", markersize=6, color="red", label="Pred. Tenor")
ax.plot(times, predicted_pitch_bass, ".", markersize=6, color="gold", label="Pred. Bass")

ax.set_ylabel("Frequency (Hz)", fontsize=20)
ax.set_xlabel("Time (sec)", fontsize=20)
ax.tick_params(axis="both", labelsize=15)
ax.set_ylim([80, 500])
ax.legend(fontsize=15, ncol=4, markerscale=3)

ax.set_xlim([5, 15])
plt.tight_layout()
plt.savefig("HCB_Cantoria_voascnn_viterbi_zoom.png", dpi=200)
In [41]:
## soprano eval
### melody extraction metrics
soprano_metrics = mir_eval.melody.evaluate(
    ref_soprano[:,0],
    ref_soprano[:,1],
    times,
    predicted_pitch_sop
)

### multipitch metrics
ref_times, ref_freqs = mono_prep(ref_soprano[:,0], ref_soprano[:,1])
est_times, est_freqs = mono_prep(times, predicted_pitch_sop)

soprano_metrics_mpe = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)

soprano_metrics_mpe["F-Score"] = 2*(soprano_metrics_mpe["Precision"] * soprano_metrics_mpe["Recall"]) / \
(soprano_metrics_mpe["Precision"] + soprano_metrics_mpe["Recall"])


print("SOPRANO Results Melody:")
print("RPA={}".format(soprano_metrics["Raw Pitch Accuracy"]))
print("Overall Acc={}".format(soprano_metrics["Overall Accuracy"]))
print("\n")
print("SOPRANO Results MPE:")
print("Precision={}".format(soprano_metrics_mpe["Precision"]))
print("Recall={}".format(soprano_metrics_mpe["Recall"]))
print("Accuracy={}".format(soprano_metrics_mpe["Accuracy"]))
print("F-Score={}".format(soprano_metrics_mpe["F-Score"]))

print("\n\n")

## alto eval
### melody extraction metrics
alto_metrics = mir_eval.melody.evaluate(
    ref_alto[:,0],
    ref_alto[:,1],
    times,
    predicted_pitch_alt
)

### multipitch metrics
ref_times, ref_freqs = mono_prep(ref_alto[:,0], ref_alto[:,1])
est_times, est_freqs = mono_prep(times, predicted_pitch_alt)

alto_metrics_mpe = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)

alto_metrics_mpe["F-Score"] = 2*(alto_metrics_mpe["Precision"] * alto_metrics_mpe["Recall"]) / \
(alto_metrics_mpe["Precision"] + alto_metrics_mpe["Recall"])


print("ALTO Results Melody:")
print("RPA={}".format(alto_metrics["Raw Pitch Accuracy"]))
print("Overall Acc={}".format(alto_metrics["Overall Accuracy"]))
print("\n")
print("ALTO Results MPE:")
print("Precision={}".format(alto_metrics_mpe["Precision"]))
print("Recall={}".format(alto_metrics_mpe["Recall"]))
print("Accuracy={}".format(alto_metrics_mpe["Accuracy"]))
print("F-Score={}".format(alto_metrics_mpe["F-Score"]))

print("\n\n")

## tenor eval
### melody extraction metrics
tenor_metrics = mir_eval.melody.evaluate(
    ref_tenor[:,0],
    ref_tenor[:,1],
    times,
    predicted_pitch_ten
)

### multipitch metrics
ref_times, ref_freqs = mono_prep(ref_tenor[:,0], ref_tenor[:,1])
est_times, est_freqs = mono_prep(times, predicted_pitch_ten)

tenor_metrics_mpe = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)

tenor_metrics_mpe["F-Score"] = 2*(tenor_metrics_mpe["Precision"] * tenor_metrics_mpe["Recall"]) / \
(tenor_metrics_mpe["Precision"] + tenor_metrics_mpe["Recall"])


print("TENOR Results Melody:")
print("RPA={}".format(tenor_metrics["Raw Pitch Accuracy"]))
print("Overall Acc={}".format(tenor_metrics["Overall Accuracy"]))
print("\n")
print("TENOR Results MPE:")
print("Precision={}".format(tenor_metrics_mpe["Precision"]))
print("Recall={}".format(tenor_metrics_mpe["Recall"]))
print("Accuracy={}".format(tenor_metrics_mpe["Accuracy"]))
print("F-Score={}".format(tenor_metrics_mpe["F-Score"]))

print("\n\n")

## Bass eval
### melody extraction metrics
bass_metrics = mir_eval.melody.evaluate(
    ref_bass[:,0],
    ref_bass[:,1],
    times,
    predicted_pitch_bass
)

### multipitch metrics
ref_times, ref_freqs = mono_prep(ref_bass[:,0], ref_bass[:,1])
est_times, est_freqs = mono_prep(times, predicted_pitch_bass)

bass_metrics_mpe = mir_eval.multipitch.evaluate(ref_times, ref_freqs, est_times, est_freqs)

bass_metrics_mpe["F-Score"] = 2*(bass_metrics_mpe["Precision"] * bass_metrics_mpe["Recall"]) / \
(bass_metrics_mpe["Precision"] + bass_metrics_mpe["Recall"])


print("BASS Results Melody:")
print("RPA={}".format(bass_metrics["Raw Pitch Accuracy"]))
print("Overall Acc={}".format(bass_metrics["Overall Accuracy"]))
print("\n")
print("BASS Results MPE:")
print("Precision={}".format(bass_metrics_mpe["Precision"]))
print("Recall={}".format(bass_metrics_mpe["Recall"]))
print("Accuracy={}".format(bass_metrics_mpe["Accuracy"]))
print("F-Score={}".format(bass_metrics_mpe["F-Score"]))
/Users/helenacuesta/anaconda3/envs/mf0/lib/python3.6/site-packages/mir_eval/multipitch.py:410: UserWarning: Estimate times not equal to reference times. Resampling to common time base.
  warnings.warn("Estimate times not equal to reference times. "
SOPRANO Results Melody:
RPA=0.9199812821712682
Overall Acc=0.6278735632183908


SOPRANO Results MPE:
Precision=0.6268966618750998
Recall=0.9183434721572298
Accuracy=0.5937972768532527
F-Score=0.7451352634076887



ALTO Results Melody:
RPA=0.6898301345687183
Overall Acc=0.49936143039591313


ALTO Results MPE:
Precision=0.4984826704999201
Recall=0.6885065078314582
Accuracy=0.4067509448716278
F-Score=0.5782842319807301



TENOR Results Melody:
RPA=0.8886266094420601
Overall Acc=0.6612388250319284


TENOR Results MPE:
Precision=0.6626736942980355
Recall=0.890343347639485
Accuracy=0.6126698168930892
F-Score=0.7598205292555626



BASS Results Melody:
RPA=0.7779534673593856
Overall Acc=0.5499680715197957


BASS Results MPE:
Precision=0.5487941223446734
Recall=0.7761463745199909
Accuracy=0.4738003309431881
F-Score=0.6429640718562875