Skip to content

Speech Synthesis

Text-to-speech and glottal flow models.

v_sapisynth

V_SAPISYNTH - SAPI speech synthesis (stub).

The original routine wraps Microsoft SAPI, which is Windows-specific; this Python port is a stub that always raises NotImplementedError.

v_sapisynth

v_sapisynth(*args, **kwargs) -> None

Synthesize speech using Microsoft SAPI.

This is a Windows-specific function that interfaces with Microsoft SAPI. For cross-platform text-to-speech, consider using pyttsx3 or gTTS.

Raises:

Type Description
NotImplementedError

SAPI is Windows-specific.

Source code in pyvoicebox/v_sapisynth.py
def v_sapisynth(*args, **kwargs) -> None:
    """Placeholder for the SAPI-based speech synthesiser.

    Microsoft SAPI is only available on Windows, so this port does not
    implement the routine: every call fails immediately, whatever
    arguments are supplied.  For portable text-to-speech, consider
    pyttsx3 or gTTS instead.

    Raises
    ------
    NotImplementedError
        Always, because SAPI is Windows-specific.
    """
    reason = "v_sapisynth is Windows/SAPI-specific. "
    suggestion = "Consider using pyttsx3 or gTTS for cross-platform TTS."
    raise NotImplementedError(reason + suggestion)

v_glotros

V_GLOTROS - Rosenberg glottal model.

v_glotros

v_glotros(d, t=None, p=None) -> ndarray

Rosenberg glottal model.

Parameters:

Name Type Description Default
d int

Derivative order (0, 1, or 2).

required
t array_like

Time in fractions of a cycle. Default: np.arange(100) / 100 (i.e. 0, 0.01, ..., 0.99).

None
p array_like

Parameters: p[0]=closure time, p[1]=+ve/-ve slope ratio. Default: [0.6, 0.5].

None

Returns:

Name Type Description
u ndarray

Output waveform (derivative of flow waveform if d>0).

Source code in pyvoicebox/v_glotros.py
def v_glotros(d, t=None, p=None) -> np.ndarray:
    """Evaluate the Rosenberg glottal flow model or one of its derivatives.

    One cycle has unit length.  The waveform is a raised-cosine rise up to
    ``p[0] / (1 + p[1])``, a quarter-cosine fall from there to ``p[0]``,
    and zero for the remainder of the cycle.

    Parameters
    ----------
    d : int
        Derivative order (0, 1, or 2).
    t : array_like, optional
        Time in fractions of a cycle; values are wrapped modulo 1.
        Default: ``np.arange(100) / 100``.
    p : array_like, optional
        Parameters: p[0]=closure time, p[1]=+ve/-ve slope ratio.
        Missing trailing entries are filled from the default [0.6, 0.5].

    Returns
    -------
    u : ndarray
        Output waveform (derivative of flow waveform if d>0).

    Raises
    ------
    ValueError
        If ``d`` is not 0, 1, or 2.
    """
    # Phase within the cycle.
    if t is None:
        phase = np.arange(100) / 100.0
    else:
        phase = np.mod(np.asarray(t, dtype=float), 1.0)

    # Complete a partially specified parameter vector from the defaults.
    defaults = np.array([0.6, 0.5])
    if p is None:
        p = defaults.copy()
    else:
        p = np.asarray(p, dtype=float).ravel()
        n_given = len(p)
        if n_given < 2:
            p = np.concatenate([p, defaults[n_given:2]])

    closure = p[0]
    peak = closure / (1.0 + p[1])          # instant where rise hands over to fall
    rising = phase < peak
    falling = (phase < closure) & ~rising
    w_rise = np.pi / peak                  # half a cosine period spans the rise
    w_fall = 0.5 * np.pi / (closure - peak)  # quarter period spans the fall
    offset = w_fall * peak

    # Cosine/sine arguments shared by every derivative order.
    arg_rise = w_rise * phase[rising]
    arg_fall = w_fall * phase[falling] - offset

    out = np.zeros_like(phase)
    if d == 0:
        out[rising] = 0.5 * (1.0 - np.cos(arg_rise))
        out[falling] = np.cos(arg_fall)
    elif d == 1:
        out[rising] = 0.5 * w_rise * np.sin(arg_rise)
        out[falling] = -w_fall * np.sin(arg_fall)
    elif d == 2:
        out[rising] = 0.5 * w_rise**2 * np.cos(arg_rise)
        out[falling] = -w_fall**2 * np.cos(arg_fall)
    else:
        raise ValueError('Derivative must be 0, 1, or 2')

    return out

v_glotlf

V_GLOTLF - Liljencrants-Fant glottal model.

v_glotlf

v_glotlf(d=0, t=None, p=None) -> tuple[ndarray, dict]

Liljencrants-Fant glottal model.

Parameters:

Name Type Description Default
d int

Derivative order (0, 1, or 2). Default: 0.

0
t array_like

Time in fractions of a cycle. Default: np.arange(100) / 100 (i.e. 0, 0.01, ..., 0.99).

None
p array_like

Parameters [te, E0/Ee, 1-tp/te]. Default: [0.6, 0.1, 0.2].

None

Returns:

Name Type Description
u ndarray

Output waveform.

q dict

Structure with glottal model parameters.

Source code in pyvoicebox/v_glotlf.py
def v_glotlf(d=0, t=None, p=None) -> tuple[np.ndarray, dict]:
    """Liljencrants-Fant glottal model.

    The cycle is normalised to unit length (``t0 = 0``, ``tc = 1``).  For
    ``tt < te`` the waveform follows an exponentially growing sinusoid;
    for ``tt >= te`` it follows an exponential return phase whose time
    constant is found by Newton iteration.

    Parameters
    ----------
    d : int, optional
        Derivative order (0, 1, or 2). Default: 0.
    t : array_like, optional
        Time in fractions of a cycle; only the fractional part is used.
        Default: ``np.arange(100) / 100``.
    p : array_like, optional
        Parameters [te, E0/Ee, 1-tp/te].  Missing trailing entries are
        filled from the default [0.6, 0.1, 0.2].

    Returns
    -------
    u : ndarray
        Output waveform.
    q : dict
        Structure with glottal model parameters.  Keys: ``'Up'``,
        ``'E0'``, ``'Ei'``, ``'Ee'``, ``'alpha'`` (growth rate ``a``),
        ``'epsilon'`` (``1 / rb``), ``'omega'`` (``wa``), ``'t0'`` (0),
        ``'ti'``, ``'tp'`` (``pi / wa``), ``'te'``, ``'ta'``,
        ``'tc'`` (1) and ``'Utc'``.

    Raises
    ------
    ValueError
        If the requested parameters are not feasible: the growth rate is
        undefined (``-p[1] * sin(wa * te)`` is not strictly positive or
        is NaN) or the Newton iteration does not converge.  Also raised
        if ``d`` is not 0, 1, or 2.
    """
    infeasible = 'Requested glottal waveform parameters are not feasible'

    if t is None:
        tt = np.arange(100) / 100.0
    else:
        t = np.asarray(t, dtype=float)
        tt = t - np.floor(t)

    u = np.zeros_like(tt)
    de = np.array([0.6, 0.1, 0.2])
    if p is None:
        p = de.copy()
    else:
        p = np.asarray(p, dtype=float).ravel()
        if len(p) < 3:
            p = np.concatenate([p, de[len(p):3]])

    # Calculate parameters
    te = p[0]
    mtc = te - 1.0
    e0 = 1.0
    wa = np.pi / (te * (1.0 - p[2]))

    # The growth rate needs the log of a strictly positive number.  Without
    # this guard a non-positive (or NaN) argument yields NaN, which would
    # slip through the convergence test below and produce an all-NaN result.
    log_arg = -p[1] * np.sin(wa * te)
    if not log_arg > 0.0:
        raise ValueError(infeasible)
    a = -np.log(log_arg) / te

    denom = a**2 + wa**2
    inta = e0 * ((wa / np.tan(wa * te) - a) / p[1] + wa) / denom

    rb0 = p[1] * inta
    rb = rb0

    # Newton iteration for closure time constant
    thresh = 1e-9
    err = 1.0
    for _ in range(15):
        kk = 1.0 - np.exp(mtc / rb)
        err = rb + mtc * (1.0 / kk - 1.0) - rb0
        derr = 1.0 - (1.0 - kk) * (mtc / rb / kk)**2
        rb = rb - err / derr
        if abs(err) < thresh:
            break

    # Written as "not <=" so that a NaN residual is rejected as well.
    if not abs(err) <= thresh:
        raise ValueError(infeasible)

    decay_end = np.exp(mtc / rb)
    e1 = 1.0 / (p[1] * (1.0 - decay_end))
    ta_mask = tt < te
    tb_mask = ~ta_mask
    t_open = tt[ta_mask]
    t_ret = tt[tb_mask]

    if d == 0:
        u[ta_mask] = e0 * (
            np.exp(a * t_open)
            * (a * np.sin(wa * t_open) - wa * np.cos(wa * t_open))
            + wa
        ) / denom
        u[tb_mask] = e1 * (
            decay_end * (t_ret - 1.0 - rb) + np.exp((te - t_ret) / rb) * rb
        )
    elif d == 1:
        u[ta_mask] = e0 * np.exp(a * t_open) * np.sin(wa * t_open)
        u[tb_mask] = e1 * (decay_end - np.exp((te - t_ret) / rb))
    elif d == 2:
        u[ta_mask] = e0 * np.exp(a * t_open) * (
            a * np.sin(wa * t_open) + wa * np.cos(wa * t_open)
        )
        u[tb_mask] = e1 * np.exp((te - t_ret) / rb) / rb
    else:
        raise ValueError('Derivative must be 0, 1, or 2')

    # Build parameter structure
    ti = (np.pi + np.arctan(-wa / a)) / wa
    tp = np.pi / wa
    q = {
        'Up': e0 * wa * (np.exp(a * tp) + 1.0) / denom,
        'E0': 1.0,
        'Ei': e0 * np.exp(a * ti) * np.sin(wa * ti),
        'Ee': 1.0 / p[1],
        'alpha': a,
        'epsilon': 1.0 / rb,
        'omega': wa,
        't0': 0.0,
        'ti': ti,
        'tp': tp,
        'te': te,
        'ta': rb / (p[1] * e1),
        'tc': 1.0,
        'Utc': -err / p[1],
    }

    return u, q