# -*- coding: utf-8 -*-
"""
This script implements the so-called MMSEv, a speaker embedding compensation method
for vocal effort-robust speaker verification purposes. MMSEv has been reported
in the following manuscript:
    
    I. López-Espejo, S. Prieto, A. Ortega, and E. Lleida: "Improved Vocal Effort
    Transfer Vector Estimation for Vocal Effort-Robust Speaker Verification",
    2023.

This implementation corresponds to the shouted-normal scenario.

@author: Iván López-Espejo (ivl@es.aau.dk)
"""


import numpy as np
import scipy.io as sio
from sklearn.mixture import GaussianMixture
from scipy.stats import multivariate_normal
from sklearn.decomposition import PCA


# Data loading.
# np.load on an .npz archive returns a lazily-read NpzFile that keeps the
# underlying file open; the context manager closes it once both arrays have
# been read into memory.
with np.load('Embeddings_Shouted.npz') as data:
    afiles = data['new_audio_paths']  # For shouted.
    xv = data['xvectors']

# ---------------
# Hyperparameters
# ---------------
no_pca_comp = 16  # No. of PCA dimensions.
no_gauss = 8  # No. of GMM components.
# ---------------


fpl = 24  # No. of utterances per speaker and vocal effort mode.
ndl = 22  # No. of speakers.

def _mmse_transfer_vectors(gmm, y_pca, dim):
    """Return the MMSE estimate of the transfer vector for each row of y_pca.

    The GMM models the joint vector z = (v, y) in the PCA domain, where the
    first ``dim`` coordinates are the vocal effort transfer vector v and the
    remaining ones are the shouted embedding y. As in the original
    formulation of this script, only the diagonals of the cross-covariance
    block S_vy and of the covariance block S_yy of every component are used.

    Parameters
    ----------
    gmm : fitted mixture exposing ``weights_``, ``means_`` and full
        ``covariances_`` (indexed as components x features x features).
    y_pca : array of shape (n_embeddings, dim) with the observed embeddings.
    dim : number of PCA dimensions (size of v and of y).

    Returns
    -------
    Array of shape (n_embeddings, dim) with the estimated transfer vectors.
    """
    mu_v = gmm.means_[:, :dim]
    mu_y = gmm.means_[:, dim:]
    cov_vy = np.diagonal(gmm.covariances_[:, :dim, dim:], axis1=1, axis2=2)
    var_y = np.diagonal(gmm.covariances_[:, dim:, dim:], axis1=1, axis2=2)

    # Deviation of every embedding from every component mean:
    # shape (n_embeddings, n_components, dim).
    diff = y_pca[:, np.newaxis, :] - mu_y[np.newaxis, :, :]

    # Partial estimates u_v + S_vy * inv(S_yy) * (y - u_y). With diagonal
    # matrices the product reduces to an element-wise ratio.
    partial = mu_v[np.newaxis, :, :] + (cov_vy / var_y)[np.newaxis, :, :] * diff

    # Combination weights, proportional to weight_k * N(y; u_y, S_yy).
    # They are computed in the log domain: evaluating the Gaussian density
    # directly can underflow to zero for all components at once, which would
    # turn the normalization into 0/0. The (2*pi)^(-dim/2) factor is shared
    # by all components and cancels in the normalization, so it is omitted.
    log_w = (np.log(gmm.weights_) - 0.5 * np.sum(np.log(var_y), axis=1))[np.newaxis, :]
    log_w = log_w - 0.5 * np.sum(diff ** 2 / var_y[np.newaxis, :, :], axis=2)
    log_w = log_w - np.max(log_w, axis=1, keepdims=True)
    w = np.exp(log_w)
    w = w / np.sum(w, axis=1, keepdims=True)

    # Final estimate: weighted combination of the partial estimates.
    return np.sum(w[:, :, np.newaxis] * partial, axis=1)


# Leave-one-speaker-out cross-validation.

# Embeddings as a float64 matrix with one row per utterance (float64 matches
# the dtype the data previously acquired when appended to a float64 buffer).
# NOTE(review): assumes one embedding per entry of "afiles" - confirm.
emb_dim = xv.shape[-1]
xv_rows = np.asarray(xv, dtype=np.float64).reshape(-1, emb_dim)

# Speaker index of every utterance, parsed once from characters 1-2 of the
# file name instead of once per held-out speaker.
spk_ids = np.array([int(cfile[1:3]) for cfile in afiles])

total_est = np.empty((0, emb_dim))
total_files = np.empty(0)
for si in range(ndl):

    print(str(si + 1))

    # We prepare the data for speaker "si": that speaker's utterances are the
    # test set, everything else is training material.
    is_test = spk_ids == si + 1
    te_files = afiles[is_test]
    te_data = xv_rows[is_test]
    tr_data = xv_rows[~is_test]

    # We re-arrange the data to comfortably work with them: even positions
    # are taken as shouted and odd positions as the paired normal utterance.
    # NOTE(review): this relies on the ordering of the input file - confirm.
    tr_shout, tr_norm = tr_data[0::2], tr_data[1::2]
    te_shout, te_norm = te_data[0::2], te_data[1::2]
    te_shoutf, te_normf = te_files[0::2], te_files[1::2]

    # ------------------------------------------------------------------------
    # WE BELOW APPLY THE METHODOLOGY FOR SPEAKER "si".
    # ------------------------------------------------------------------------

    # Dimensionality reduction by means of PCA.
    pca = PCA(n_components=no_pca_comp)
    pca.fit(np.concatenate((tr_norm, tr_shout), axis=0))
    # We transform the training data.
    tr_pca_s = pca.transform(tr_shout)
    tr_pca_r = pca.transform(tr_shout - tr_norm)  # Vocal effort transfer vector v = y - x.

    # We model the joint distribution of z = (v, y) by means of a GMM in the PCA domain.
    # NOTE: no random_state is passed, so the fitted mixture may differ
    # between runs.
    z = np.concatenate((tr_pca_r, tr_pca_s), axis=1)
    gm = GaussianMixture(n_components=no_gauss).fit(z)

    # MMSE estimation exploiting a GMM.
    te_pca_s = pca.transform(te_shout)  # Test data to the PCA domain.
    est_pca = _mmse_transfer_vectors(gm, te_pca_s, no_pca_comp)

    est = pca.inverse_transform(est_pca)  # Leaving the PCA domain.
    est = te_shout - est  # Embedding compensation as in x = y - v.

    # We accumulate the result: compensated shouted embeddings first, then
    # the untouched normal embeddings of the same speaker.
    total_files = np.append(total_files, te_shoutf)
    total_est = np.append(total_est, est, axis=0)
    total_files = np.append(total_files, te_normf)
    total_est = np.append(total_est, te_norm, axis=0)

# The results are written to two .mat files (file names and embeddings) whose
# names share a tag encoding the GMM size and the PCA dimensionality.
out_tag = f'_MMSEv_GMM{no_gauss}_PCA{no_pca_comp}_Shouted.mat'
sio.savemat('Files' + out_tag, {'total_files': total_files})
sio.savemat('Embeddings' + out_tag, {'total_est': total_est})