Skip to content

Speech Recognition & Features

MFCC extraction, mel filterbank construction, cepstrum/power-spectrum conversion, and Linear Discriminant Analysis.

v_melcepst

V_MELCEPST - Calculate the mel cepstrum of a signal.

v_melcepst

v_melcepst(
    s,
    fs=11025,
    w="M",
    nc=12,
    p=None,
    n=None,
    inc=None,
    fl=0,
    fh=0.5,
) -> tuple[ndarray, ndarray]

Calculate the mel cepstrum of a signal.

Parameters:

Name Type Description Default
s array_like

Speech signal.

required
fs float

Sample rate in Hz.

11025
w str

Mode string: 'R' : rectangular window 'N' : Hanning window 'M' : Hamming window (default) 'p' : filters act in power domain 'a' : filters act in absolute magnitude domain (default) '0' : include 0th cepstral coefficient 'E' : include log energy 'd' : include delta coefficients 'D' : include delta-delta coefficients

'M'
nc int

Number of cepstral coefficients excluding 0th.

12
p int

Number of filters in filterbank. Default: floor(3*log(fs)).

None
n int

FFT length. Default: power of 2 < 0.03*fs.

None
inc int

Frame increment. Default: n//2.

None
fl float

Low end of lowest filter as fraction of fs.

0
fh float

High end of highest filter as fraction of fs.

0.5

Returns:

Name Type Description
c ndarray

Mel cepstrum output (one frame per row).

tc ndarray

Time of each frame centre in samples.

Source code in pyvoicebox/v_melcepst.py
def v_melcepst(s, fs=11025, w='M', nc=12, p=None, n=None, inc=None, fl=0, fh=0.5) -> tuple[np.ndarray, np.ndarray]:
    """Calculate the mel cepstrum of a signal.

    Parameters
    ----------
    s : array_like
        Speech signal.
    fs : float
        Sample rate in Hz.
    w : str
        Mode string:
          'R' : rectangular window
          'N' : Hanning window
          'M' : Hamming window (default)
          'p' : filters act in power domain
          'a' : filters act in absolute magnitude domain (default)
          '0' : include 0th cepstral coefficient
          'E' : include log energy
          'd' : include delta coefficients
          'D' : include delta-delta coefficients
    nc : int
        Number of cepstral coefficients excluding 0th.
    p : int, optional
        Number of filters in filterbank. Default: floor(3*log(fs)).
    n : int, optional
        FFT length. Default: power of 2 < 0.03*fs.
    inc : int, optional
        Frame increment. Default: n//2.
    fl : float
        Low end of lowest filter as fraction of fs.
    fh : float
        High end of highest filter as fraction of fs.

    Returns
    -------
    c : ndarray
        Mel cepstrum output (one frame per row).
    tc : ndarray
        Time of each frame centre in samples.
    """
    # Work on a flat 1-D float copy of the input signal.
    s = np.asarray(s, dtype=float).ravel()

    # Fill in defaults that depend on the sample rate.
    if w is None or w == '':
        w = 'M'
    if p is None:
        p = int(np.floor(3 * np.log(fs)))  # default filterbank size
    if n is None:
        n = int(2 ** np.floor(np.log2(0.03 * fs)))  # largest power of 2 below a 30 ms frame
    if inc is None:
        inc = n // 2  # 50% frame overlap

    # Window the signal and split it into frames of length n.
    if 'R' in w:
        z, tc, *_ = v_enframe(s, n, inc)  # rectangular: pass frame length only
    elif 'N' in w:
        # Hanning window with non-zero endpoints (indices 1..n over n+1)
        win = 0.5 - 0.5 * np.cos(2 * np.pi * np.arange(1, n + 1) / (n + 1))
        z, tc, *_ = v_enframe(s, win, inc)
    else:
        # Hamming window (default)
        win = 0.54 - 0.46 * np.cos(2 * np.pi * np.arange(n) / (n - 1))
        z, tc, *_ = v_enframe(s, win, inc)

    # Real FFT of each frame; frames are the columns of f.
    f = v_rfft(z.T, n)  # columns are frames
    # f shape: (1+n//2, nframes)

    # Mel filterbank matrix and the 1-based FFT bin range it covers.
    m_bank, _, a_idx, b_idx = v_melbankm(p, n, fs, fl, fh, w)

    # Power spectrum restricted to the bins the filterbank spans
    # (a_idx/b_idx are 1-based, hence the -1 when slicing).
    pw = f[a_idx - 1:b_idx, :] * np.conj(f[a_idx - 1:b_idx, :])  # power spectrum
    pw = np.real(pw)
    pth = np.max(pw) * 1e-20  # tiny floor to keep log() finite

    if 'p' in w:
        # 'p': filters act on the power spectrum.
        y = np.log(np.maximum(m_bank[:, a_idx - 1:b_idx].toarray() @ pw, pth))
    else:
        # Default: filters act on the magnitude spectrum (floor scaled accordingly).
        ath = np.sqrt(pth)
        y = np.log(np.maximum(m_bank[:, a_idx - 1:b_idx].toarray() @ np.abs(f[a_idx - 1:b_idx, :]), ath))

    # DCT of the log filterbank outputs gives the mel cepstrum.
    c = v_rdct(y).T  # one frame per row
    nf = c.shape[0]  # number of frames
    nc_total = nc + 1  # coefficient count including the 0th

    # Truncate or zero-pad each frame to nc+1 coefficients.
    if p > nc_total:
        c = c[:, :nc_total]
    elif p < nc_total:
        c = np.hstack([c, np.zeros((nf, nc_total - p))])

    if '0' not in w:
        c = c[:, 1:]  # remove 0th coefficient
        nc_total -= 1

    if 'E' in w:
        # Prepend the log frame energy as an extra feature column.
        log_energy = np.log(np.maximum(np.sum(pw, axis=0), pth))
        c = np.column_stack([log_energy, c])
        nc_total += 1

    # Delta ('d') and delta-delta ('D') features: convolve each coefficient
    # track with FIR differentiators after replicating the edge frames,
    # then trim the filter transients so nf rows remain.
    if 'D' in w:
        vf = np.array([4, 3, 2, 1, 0, -1, -2, -3, -4]) / 60.0  # 9-tap delta filter
        af = np.array([1, 0, -1]) / 2.0  # 3-tap acceleration filter
        ww = np.ones(5, dtype=int)
        # Pad with 5 copies of the first and last frames.
        cx = np.vstack([c[ww - 1, :], c, c[nf * np.ones(5, dtype=int) - 1, :]])
        vx = np.zeros((nf + 10, nc_total))
        for col in range(nc_total):
            vx[:, col] = np.convolve(cx[:, col], vf, mode='full')[:nf + 10]
        vx = vx[8:, :]  # remove initial transient

        ax = np.zeros((nf + 2, nc_total))
        for col in range(nc_total):
            ax[:, col] = np.convolve(vx[:, col], af, mode='full')[:nf + 2]
        ax = ax[2:, :]  # remove initial transient
        vx = vx[1:nf + 1, :]  # trim vx to nf rows

        if 'd' in w:
            c = np.hstack([c, vx, ax])  # append deltas and delta-deltas
        else:
            c = np.hstack([c, ax])  # append delta-deltas only
    elif 'd' in w:
        # Deltas only: same 9-tap filter with 4-frame edge padding.
        vf = np.array([4, 3, 2, 1, 0, -1, -2, -3, -4]) / 60.0
        ww = np.ones(4, dtype=int)
        cx = np.vstack([c[ww - 1, :], c, c[nf * np.ones(4, dtype=int) - 1, :]])
        vx = np.zeros((nf + 8, nc_total))
        for col in range(nc_total):
            vx[:, col] = np.convolve(cx[:, col], vf, mode='full')[:nf + 8]
        vx = vx[8:, :]  # remove initial transient
        c = np.hstack([c, vx])

    return c, tc

v_melbankm

V_MELBANKM - Determine matrix for a mel/erb/bark-spaced filterbank.

v_melbankm

v_melbankm(
    p=None, n=256, fs=11025, fl=0, fh=0.5, w="tz"
) -> tuple[Any, ndarray, int, int]

Determine matrix for a mel/erb/bark-spaced filterbank.

Parameters:

Name Type Description Default
p int or float

Number of filters or filter spacing in k-mel/bark/erb. Default: ceil(4.6*log10(fs)).

None
n int

Length of FFT.

256
fs float

Sample rate in Hz.

11025
fl float

Low end of lowest filter as fraction of fs (or Hz if 'h'/'H' in w).

0
fh float

High end of highest filter as fraction of fs.

0.5
w str

Options string (see MATLAB docs).

'tz'

Returns:

Name Type Description
x scipy.sparse matrix

Filterbank matrix (p, 1+floor(n/2)) or (p, mx-mn+1).

mc ndarray

Filterbank centre frequencies in mel/erb/bark.

mn int

Lowest FFT bin with non-zero coefficient.

mx int

Highest FFT bin with non-zero coefficient.

Source code in pyvoicebox/v_melbankm.py
def v_melbankm(p=None, n=256, fs=11025, fl=0, fh=0.5, w='tz') -> tuple[Any, np.ndarray, int, int]:
    """Determine matrix for a mel/erb/bark-spaced filterbank.

    Parameters
    ----------
    p : int or float, optional
        Number of filters or filter spacing in k-mel/bark/erb.
        Default: ceil(4.6*log10(fs)).
    n : int
        Length of FFT.
    fs : float
        Sample rate in Hz.
    fl : float
        Low end of lowest filter as fraction of fs (or Hz if 'h'/'H' in w).
    fh : float
        High end of highest filter as fraction of fs.
    w : str
        Options string (see MATLAB docs).

    Returns
    -------
    x : scipy.sparse matrix
        Filterbank matrix (p, 1+floor(n/2)) or (p, mx-mn+1).
    mc : ndarray
        Filterbank centre frequencies in mel/erb/bark.
    mn : int
        Lowest FFT bin with non-zero coefficient.
    mx : int
        Highest FFT bin with non-zero coefficient.
    """
    sfact = 2 - int('s' in w)  # 1 if single sided else 2

    # Select the frequency warping: 'l'=log10, 'e'=erb, 'b'=bark, 'f'=linear;
    # if none of these appears in w the default is mel.
    wr = ' '
    for ch in w:
        if ch in 'lebf':
            wr = ch

    # 'h'/'H': fl and fh are given in Hz rather than as fractions of fs.
    if 'h' in w or 'H' in w:
        mflh = np.array([fl, fh], dtype=float)
    else:
        mflh = np.array([fl * fs, fh * fs], dtype=float)

    # Unless 'H' (limits already in warped units), warp the Hz limits.
    if 'H' not in w:
        if wr == 'f':
            pass  # linear frequency: no warping needed
        elif wr == 'l':
            if fl <= 0:
                raise ValueError("Low frequency limit must be >0 for 'l' option")
            mflh = np.log10(mflh)
        elif wr == 'e':
            mflh = v_frq2erb(mflh)[0]
        elif wr == 'b':
            mflh = v_frq2bark(mflh)[0]
        else:
            mflh = v_frq2mel(mflh)[0]

    melrng = mflh[1] - mflh[0]  # filterbank span in warped units
    fn2 = n // 2  # highest FFT bin_0 (Nyquist)

    if p is None:
        p = int(np.ceil(4.6 * np.log10(fs)))  # default filter count

    if 'c' in w:
        # 'c': fl/fh give the centres of the outer filters, not the edges.
        if p < 1:
            p = int(round(melrng / (p * 1000))) + 1  # fractional p is a spacing in k-units
        melinc = melrng / (p - 1)
        mflh = mflh + np.array([-1, 1]) * melinc  # widen limits outward to the true edges
    else:
        if p < 1:
            p = int(round(melrng / (p * 1000))) - 1  # fractional p is a spacing in k-units
        melinc = melrng / (p + 1)

    # Unwarp the outermost filter boundaries into (fractional) FFT bin numbers.
    edges = mflh[0] + np.array([0, 1, p, p + 1]) * melinc
    if wr == 'f':
        blim = edges * n / fs
    elif wr == 'l':
        blim = 10.0 ** edges * n / fs
    elif wr == 'e':
        blim = v_erb2frq(edges)[0] * n / fs
    elif wr == 'b':
        blim = v_bark2frq(edges)[0] * n / fs
    else:
        blim = v_mel2frq(edges)[0] * n / fs

    mc = mflh[0] + (np.arange(1, p + 1)) * melinc  # mel centre frequencies
    b1 = int(np.floor(blim[0])) + 1  # lowest FFT bin_0 required
    b4 = min(fn2, int(np.ceil(blim[3])) - 1)  # highest FFT bin_0 required

    # Express each required FFT bin as a fractional filter index pf.
    bins = np.arange(b1, b4 + 1)
    freqs = bins * fs / n
    if wr == 'f':
        pf = (freqs - mflh[0]) / melinc
    elif wr == 'l':
        pf = (np.log10(freqs) - mflh[0]) / melinc
    elif wr == 'e':
        pf = (v_frq2erb(freqs)[0] - mflh[0]) / melinc
    elif wr == 'b':
        pf = (v_frq2bark(freqs)[0] - mflh[0]) / melinc
    else:
        pf = (v_frq2mel(freqs)[0] - mflh[0]) / melinc

    # Remove entries that fell just outside [0, p+1) due to rounding.
    if len(pf) > 0 and pf[0] < 0:
        pf = pf[1:]
        b1 = b1 + 1
    if len(pf) > 0 and pf[-1] >= p + 1:
        pf = pf[:-1]
        b4 = b4 - 1

    fp = np.floor(pf).astype(int)  # integer filter index below each bin
    pm = pf - fp  # fractional part: weight on the upper of the two filters

    # k2: first bin with fp > 0; k3: last bin with fp < p; k4: last bin overall
    # (all 0-based indices into fp/pm).
    k2_arr = np.where(fp > 0)[0]
    k2 = k2_arr[0] if len(k2_arr) > 0 else len(fp)
    k3_arr = np.where(fp < p)[0]
    k3 = k3_arr[-1] if len(k3_arr) > 0 else -1
    k4 = len(fp) - 1

    if 'y' in w:
        # 'y': extend the outer filters so every FFT bin is covered and
        # total power is preserved; output then spans bins 1..fn2+1.
        mn = 1
        mx = fn2 + 1
        # Build sparse matrix with preserved power
        r_list = []
        c_list = []
        v_list = []
        # Left edge: bins before k2 go entirely into the first filter.
        for i in range(k2 + b1):
            r_list.append(0)
            c_list.append(i)
            v_list.append(1.0)
        # Middle: contribution to the upper filter of each pair (weight pm).
        for i in range(k2, k3 + 1):
            r_list.append(fp[i])
            c_list.append(i + b1)
            v_list.append(pm[i])
        # Middle: contribution to the lower filter of each pair (weight 1-pm).
        for i in range(k2, k3 + 1):
            r_list.append(fp[i] - 1)
            c_list.append(i + b1)
            v_list.append(1.0 - pm[i])
        # Right edge: bins after k3 go entirely into the last filter.
        for i in range(k3 + b1 + 1, fn2 + 1):
            r_list.append(p - 1)
            c_list.append(i)
            v_list.append(1.0)
        r_arr = np.array(r_list)
        c_arr = np.array(c_list)
        v_arr = np.array(v_list, dtype=float)
    else:
        # Standard case: triangular filters over the required bins only.
        r1 = fp[:k3 + 1]  # filter number (0-based)
        c1 = np.arange(k3 + 1)
        v1 = pm[:k3 + 1]

        r2 = fp[k2:k4 + 1] - 1  # filter number (0-based)
        c2 = np.arange(k2, k4 + 1)
        v2 = 1.0 - pm[k2:k4 + 1]

        r_arr = np.concatenate([r1, r2])
        c_arr = np.concatenate([c1, c2])
        v_arr = np.concatenate([v1, v2])

        mn = b1 + 1  # lowest FFT bin_1
        mx = b4 + 1  # highest FFT bin_1

    # Fold negative-frequency bins back onto their positive-frequency mirrors.
    if b1 < 0:
        c_arr = np.abs(c_arr + b1 - 1) - b1 + 1

    # Optional filter shapes: 'n' = Hanning, 'm' = Hamming (default triangular).
    if 'n' in w:
        v_arr = 0.5 - 0.5 * np.cos(v_arr * np.pi)
    elif 'm' in w:
        v_arr = 0.5 - 0.46 / 1.08 * np.cos(v_arr * np.pi)

    # Double all except DC and Nyquist (unless 's' requested single-sided).
    if sfact == 2:
        msk = (c_arr + mn > 2) & (c_arr + mn < n - fn2 + 2)
        v_arr[msk] = 2.0 * v_arr[msk]

    # Assemble the sparse filterbank matrix.
    # Convert the 1-based column indices to 0-based for the full matrix.
    x = sp.csr_matrix((v_arr, (r_arr, c_arr + mn - 1)),
                       shape=(p, 1 + fn2))

    # 'u': normalise each filter to unit sum (empty filters left untouched).
    if 'u' in w:
        sx = np.array(x.sum(axis=1)).ravel()
        sx[sx == 0] = 1.0
        x = sp.diags(1.0 / sx) @ x

    return x, mc, mn, mx

v_cep2pow

V_CEP2POW - Convert cepstral means and variances to the power domain.

v_cep2pow

v_cep2pow(u, v, mode='c') -> tuple[ndarray, ndarray]

Convert cepstral means and variances to the power domain.

Parameters:

Name Type Description Default
u array_like

Vector giving the cepstral means with u[0] the 0th cepstral coefficient.

required
v array_like

Cepstral covariance matrix or vector containing the diagonal elements.

required
mode str

'c' : pow=exp(irdct(cep)) [default] 'i' : pow=exp(cep) [no transformation]

'c'

Returns:

Name Type Description
m ndarray

Row vector giving means in the power domain.

c ndarray

Covariance matrix in the power domain.

Source code in pyvoicebox/v_cep2pow.py
def v_cep2pow(u, v, mode='c') -> tuple[np.ndarray, np.ndarray]:
    """Convert cepstral means and variances to the power domain.

    Treats the (log-power or cepstral) features as Gaussian and maps their
    first two moments through the exponential, i.e. log-normal moment
    matching. Inverse of ``v_pow2cep``.

    Parameters
    ----------
    u : array_like
        Vector giving the cepstral means with u[0] the 0th cepstral coefficient.
    v : array_like
        Cepstral covariance matrix or vector containing the diagonal elements.
    mode : str, optional
        'c' : pow=exp(irdct(cep)) [default]
        'i' : pow=exp(cep) [no transformation]

    Returns
    -------
    m : ndarray
        Row vector giving means in the power domain.
    c : ndarray
        Covariance matrix in the power domain.
    """
    mean_vec = np.ravel(np.asarray(u, dtype=float))
    cov = np.asarray(v, dtype=float)
    # A vector of diagonal elements is promoted to a full covariance matrix.
    if cov.ndim == 1 or min(cov.shape) == 1:
        cov = np.diag(cov.ravel())

    if 'i' in mode:
        # Already in the log-power domain: no DCT needed.
        p_vec = mean_vec.copy()
        q_mat = cov.copy()
    else:
        # Default: undo the DCT on the mean and on both axes of the covariance.
        p_vec = v_irdct(mean_vec)
        q_mat = v_irdct(v_irdct(cov).T)

    # Log-normal moments: E[exp(X)] and Cov[exp(X)] for Gaussian X.
    m = np.exp(p_vec + 0.5 * np.diag(q_mat))
    c = np.outer(m, m) * (np.exp(q_mat) - 1.0)
    return m, c

v_pow2cep

V_POW2CEP - Convert power domain means and variances to the cepstral domain.

v_pow2cep

v_pow2cep(m, c, mode='c') -> tuple[ndarray, ndarray]

Convert power domain means and variances to the cepstral domain.

Parameters:

Name Type Description Default
m array_like

Vector giving means in the power domain.

required
c array_like

Covariance matrix in the power domain (or diag(c) if diagonal).

required
mode str

'c' : pow=exp(irdct(cep)) [default] 'i' : pow=exp(cep) [no transformation]

'c'

Returns:

Name Type Description
u ndarray

Row vector giving the cepstral means with u[0] the 0th cepstral coefficient.

v ndarray

Cepstral covariance matrix.

Source code in pyvoicebox/v_pow2cep.py
def v_pow2cep(m, c, mode='c') -> tuple[np.ndarray, np.ndarray]:
    """Convert power domain means and variances to the cepstral domain.

    Applies log-normal moment matching in reverse: recovers the mean and
    covariance of the underlying Gaussian from the moments of its
    exponential. Inverse of ``v_cep2pow``.

    Parameters
    ----------
    m : array_like
        Vector giving means in the power domain.
    c : array_like
        Covariance matrix in the power domain (or diag(c) if diagonal).
    mode : str, optional
        'c' : pow=exp(irdct(cep)) [default]
        'i' : pow=exp(cep) [no transformation]

    Returns
    -------
    u : ndarray
        Row vector giving the cepstral means with u[0] the 0th cepstral coefficient.
    v : ndarray
        Cepstral covariance matrix.
    """
    mean_pow = np.ravel(np.asarray(m, dtype=float))
    cov_pow = np.asarray(c, dtype=float)
    # A vector of diagonal elements is promoted to a full covariance matrix.
    if cov_pow.ndim == 1 or min(cov_pow.shape) == 1:
        cov_pow = np.diag(cov_pow.ravel())

    # Gaussian moments of log(power): inverse of the log-normal mapping.
    q = np.log(1.0 + cov_pow / np.outer(mean_pow, mean_pow))
    p = np.log(mean_pow) - 0.5 * np.diag(q)

    if 'i' in mode:
        # Stay in the log-power domain.
        return p, q

    # Default: transform the mean and both covariance axes to the DCT domain.
    u = v_rdct(p)
    v = v_rdct(v_rdct(q).T)
    return u, v

v_ldatrace

V_LDATRACE - LDA transform to maximize trace discriminant.

v_ldatrace

v_ldatrace(
    b, w=None, n=None, c=None
) -> tuple[ndarray, ndarray, ndarray, ndarray]

Calculate an LDA transform to maximize trace discriminant.

Parameters:

Name Type Description Default
b ndarray

Between-class covariance matrix (m, m).

required
w ndarray

Within-class covariance matrix (m, m). Default: identity.

None
n int

Number of columns in output matrix A. Default: m.

None
c ndarray

Pre-specified columns of A (m, r). Default: None.

None

Returns:

Name Type Description
a ndarray

Transformation matrix (m, n): y = a.T @ x.

f ndarray

Incremental gain in discriminant for successive columns.

B ndarray

Between-class covariance of y.

W ndarray

Within-class covariance of y.

Source code in pyvoicebox/v_ldatrace.py
def v_ldatrace(b, w=None, n=None, c=None) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Calculate an LDA transform to maximize trace discriminant.

    Parameters
    ----------
    b : ndarray
        Between-class covariance matrix (m, m).
    w : ndarray, optional
        Within-class covariance matrix (m, m). Default: identity.
    n : int, optional
        Number of columns in output matrix A. Default: m.
    c : ndarray, optional
        Pre-specified columns of A (m, r). Default: None.

    Returns
    -------
    a : ndarray
        Transformation matrix (m, n): y = a.T @ x.
    f : ndarray
        Incremental gain in discriminant for successive columns
        (empty when n == 0).
    B : ndarray
        Between-class covariance of y.
    W : ndarray
        Within-class covariance of y.
    """
    b = np.asarray(b, dtype=float)
    m = b.shape[0]

    if w is None:
        w = np.eye(m)
    else:
        w = np.asarray(w, dtype=float)

    if n is None:
        n = m

    r = 0
    if c is not None:
        c = np.asarray(c, dtype=float)
        r = c.shape[1]

    # Ensure f is always defined: the constrained branch below only assigns
    # it when n > 0, which previously raised NameError at return for n == 0.
    f = np.zeros(0)

    if r > 0:
        # Constrained case: the first r columns of A are fixed to c; the
        # remaining n-r columns are chosen in the w-whitened null space of c.
        a = np.zeros((m, n))
        if n > r:
            g = linalg.cholesky(w, lower=False)  # w = g.T @ g, g upper triangular
            ginv = linalg.solve_triangular(g, np.eye(m))
            # Null space of c.T @ inv(g.T): right singular vectors beyond rank r.
            ct_ginv = c.T @ ginv.T
            _, _, vt = linalg.svd(ct_ginv, full_matrices=True)
            v_null = vt[min(r, m):, :].T
            v = ginv @ v_null
            # Maximise the between-class scatter within the null space.
            p_mat, _, _ = linalg.svd(v.T @ b @ v, full_matrices=True)
            a[:, r:n] = v @ p_mat[:, :n - r]
            a[:, :r] = c
        else:
            a = c[:, :n]

        if n > 0:
            # Per-column discriminant gains after whitening A by chol(A'WA).
            aw = a.T @ w @ a
            ari = a @ linalg.solve_triangular(
                linalg.qr(linalg.cholesky(aw, lower=False))[1],
                np.eye(n)
            )
            f = np.diag(ari.T @ b @ ari)
    else:
        # Unconstrained case: solve the generalized eigenproblem b x = f w x
        # and keep the n eigenvectors with the largest eigenvalues.
        eigenvalues, eigenvectors = linalg.eig(b, w)
        eigenvalues = np.real(eigenvalues)
        idx = np.argsort(-eigenvalues)
        a = np.real(eigenvectors[:, idx[:n]])
        f = eigenvalues[idx[:n]]

    B_out = a.T @ b @ a
    W_out = a.T @ w @ a

    return a, f, B_out, W_out