Skip to content

Information Theory

Entropy calculation and Huffman coding.

v_huffman

V_HUFFMAN - Calculate a D-ary Huffman code.

v_huffman

v_huffman(p, a='01') -> tuple[list, ndarray, float]

Calculate a D-ary Huffman code.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `p` | array_like | Vector of symbol probabilities. | required |
| `a` | str or list | Alphabet characters. Length determines code order. | `'01'` (binary) |

Returns:

| Name | Type | Description |
|------|------|-------------|
| `cc` | list | Code for each symbol (list of strings or lists). |
| `ll` | ndarray | Code lengths for each symbol. |
| `l` | float | Average code length. |

Source code in pyvoicebox/v_huffman.py
def v_huffman(p, a='01') -> tuple[list, np.ndarray, float]:
    """Calculate a D-ary Huffman code.

    Parameters
    ----------
    p : array_like
        Vector of symbol probabilities.
    a : str or list, optional
        Alphabet characters. Length determines code order. Default '01' (binary).

    Returns
    -------
    cc : list
        Code for each symbol (list of strings or lists).
    ll : ndarray
        Code lengths for each symbol.
    l : float
        Average code length.

    Notes
    -----
    Zero-probability dummy symbols are appended so that the code tree is a
    full d-ary tree (a full tree needs n ≡ 1 (mod d-1) leaves).  The average
    length is weighted by ``p`` and normalized by ``sum(p)``, so unnormalized
    probability vectors are accepted.
    """
    p = np.asarray(p, dtype=float).ravel()
    n_sym = len(p)
    d = len(a)  # code order (arity); must be >= 2

    # Pad with zero-probability dummy symbols to make the code tree full.
    nx = n_sym + (1 - n_sym) % (d - 1)
    if nx < n_sym:
        nx += d - 1
    px = np.zeros(nx)
    px[:n_sym] = p

    cl = (nx - 1) // (d - 1)  # maximum potential code length
    # cd[s, i] holds symbol s's node rank at merge level i; ranks < d are
    # actual code digits (the node was merged at that level).
    cd = np.zeros((nx, cl), dtype=int)
    qx = px.copy()            # live node probabilities
    ix = np.arange(nx)        # current node index of each original symbol

    for i in range(cl - 1, -1, -1):
        nc = 1 + (i + 1) * (d - 1)  # number of live nodes at this level
        # rank[j] = position of node j in ascending probability order;
        # the d smallest (ranks 0..d-1) are merged at this level.
        jx = np.argsort(qx[:nc])
        rank = np.empty(nc, dtype=int)
        rank[jx] = np.arange(nc)

        # Record each symbol's rank as its (candidate) code digit, then remap
        # its node index: merged nodes collapse to index 0, the rest shift
        # down by d-1.  (Vectorized; every symbol's node satisfies ix < nc.)
        live = ix < nc
        cd[live, i] = rank[ix[live]]
        ix[live] = np.maximum(rank[ix[live]] - d + 1, 0)

        # Merge the d smallest probabilities into one node and re-sort.
        sq = np.sort(qx[:nc])
        merged = np.sort(np.append(sq[d:], sq[:d].sum()))
        qx = np.zeros_like(qx)
        qx[:len(merged)] = merged

    # Digits >= d mark levels where the symbol's node was not merged; the
    # remaining digits (< d) spell the codeword from root to leaf.
    cc = []
    ll = np.zeros(n_sym, dtype=int)
    for i in range(n_sym):
        digits = cd[i, cd[i, :] < d]
        ll[i] = len(digits)
        cc.append(''.join(a[c] for c in digits))

    # Average code length, tolerating an all-zero probability vector.
    total_p = np.sum(p)
    l = float(np.dot(p, ll) / total_p) if total_p > 0 else 0.0

    return cc, ll, l

v_entropy

V_ENTROPY - Shannon entropy of discrete and sampled continuous distributions.

v_entropy

v_entropy(
    p, dim=None, cond=None, arg=None, step=None
) -> ndarray

Calculate the entropy of discrete and sampled continuous distributions.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `p` | array_like | Probability array. | required |
| `dim` | int or list of int | Dimensions along which to evaluate entropy. Default: first non-singleton. | `None` |
| `cond` | list of int | Dimensions to use as conditional variables. | `None` |
| `arg` | list of int | Dimensions to use as parameters in output. | `None` |
| `step` | float or list of float | Sample increment for continuous distributions. | `None` |

Returns:

| Name | Type | Description |
|------|------|-------------|
| `h` | ndarray | Entropy value(s). |

Source code in pyvoicebox/v_entropy.py
def _default_entropy_dims(shape, exclude=()):
    """Return the first non-singleton dimension of *shape* as a one-element
    list, skipping dimensions listed in *exclude*; fall back to ``[0]``."""
    if not shape:  # 0-d (scalar) input
        return [0]
    # Threshold degenerates to 1 when every dimension is singleton, matching
    # the original "dimension with size >= min(2, max(shape))" rule.
    thresh = min(2, max(shape))
    for i, size in enumerate(shape):
        if size >= thresh and i not in exclude:
            return [i]
    return [0]


def v_entropy(p, dim=None, cond=None, arg=None, step=None) -> np.ndarray:
    """Calculate the entropy of discrete and sampled continuous distributions.

    Parameters
    ----------
    p : array_like
        Probability array.
    dim : int or list of int, optional
        Dimensions along which to evaluate entropy. Default: first non-singleton.
    cond : list of int, optional
        Dimensions to use as conditional variables.
    arg : list of int, optional
        Dimensions to use as parameters in output.
    step : float or list of float, optional
        Sample increment for continuous distributions.

    Returns
    -------
    h : ndarray
        Entropy value(s), in bits.

    Notes
    -----
    Probabilities are normalized by their sum along the evaluated dimensions,
    so unnormalized arrays are accepted.  When ``cond`` is given, conditional
    entropy is computed by the chain rule H(X|Y) = H(X,Y) - H(Y) over the
    ``dim`` and ``cond`` dimensions.
    """
    p = np.asarray(p, dtype=float)
    ndim = p.ndim

    # Per-dimension log2(step) corrections for sampled continuous
    # distributions; zeros (i.e. step == 1) when no step is supplied.
    # Missing trailing entries are filled with the first step value.
    if step is None:
        stp = np.zeros(max(ndim, 1))
    else:
        step = np.atleast_1d(step)
        stp = np.full(max(ndim, 1), step[0], dtype=float)
        stp[:len(step)] = step
        stp = np.log2(stp)

    # NOTE(review): `arg` is validated here but not yet used on the
    # unconditional path; it is only forwarded through the recursion.
    if arg is None:
        arg = []
    else:
        arg = [a for a in np.atleast_1d(arg) if a >= 0]

    if cond is None:
        cond = []
    else:
        cond = [c for c in np.atleast_1d(cond) if c >= 0]

    # Normalize `dim` to a list of non-negative axes for BOTH branches.
    # (Previously the conditional branch crashed with TypeError when `dim`
    # was None or a scalar int, since it did list(dim) unconditionally.)
    if dim is None:
        dim = _default_entropy_dims(p.shape, exclude=cond)
    else:
        dim = [d for d in np.atleast_1d(dim) if d >= 0]

    if cond:
        # Conditional entropy via the chain rule: H(X|Y) = H(X,Y) - H(Y).
        # Forward `step` so the continuous-distribution correction is not
        # silently dropped (it cancels on the cond dims, leaving stp[dim]).
        return (v_entropy(p, dim + list(cond), arg=arg, step=step)
                - v_entropy(p, list(cond), arg=arg, step=step))

    s = p.shape
    sd = 1  # total size of the evaluated dimensions
    for d in dim:
        if d < len(s):
            sd *= s[d]

    if sd == 1:
        # All selected dimensions are singleton: treat every entry as an
        # independent Bernoulli probability.  The (q == 0)/(q == 1) terms
        # guard log2 against 0, implementing the 0*log(0) = 0 convention.
        q = p.ravel()
        h = -np.log2(q + (q == 0)) * q - np.log2(1 - q + (q == 1)) * (1 - q)
        return h

    if p.ndim == 1:
        q = p
        sq = np.sum(q)
        # Normalize by the total so unnormalized vectors are accepted.
        h = np.sum(-np.log2(q + (q == 0)) * q) / sq + np.log2(sq)
    else:
        # Sum along the evaluated dimensions; remaining dims stay in output.
        axes = tuple(dim)
        sq = np.sum(p, axis=axes, keepdims=True)
        h = np.sum(-np.log2(p + (p == 0)) * p, axis=axes, keepdims=True) / sq + np.log2(sq)
        h = np.squeeze(h)

    # Continuous-distribution correction: add log2(step) for each evaluated
    # dimension (guard against dims beyond stp's length).
    valid = [d for d in dim if d < stp.size]
    h = h + np.sum(stp[valid])
    return h