Skip to content

Information Theory

Entropy calculation and Huffman coding.

v_huffman

V_HUFFMAN - Calculate a D-ary Huffman code.

v_huffman

v_huffman(p, a='01') -> tuple[list, ndarray, float]

Calculate a D-ary Huffman code.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `p` | array_like | Vector of symbol probabilities. | required |
| `a` | str or list | Alphabet characters. Length determines code order. | `'01'` (binary) |

Returns:

| Name | Type | Description |
|------|------|-------------|
| `cc` | list | Code for each symbol (list of strings or lists). |
| `ll` | ndarray | Code lengths for each symbol. |
| `l` | float | Average code length. |

Source code in pyvoicebox/v_huffman.py
def v_huffman(p, a='01') -> tuple[list, np.ndarray, float]:
    """Calculate a D-ary Huffman code.

    Parameters
    ----------
    p : array_like
        Vector of symbol probabilities.
    a : str or list, optional
        Alphabet characters. Length determines code order. Default '01' (binary).

    Returns
    -------
    cc : list
        Code for each symbol (list of strings or lists).
    ll : ndarray
        Code lengths for each symbol.
    l : float
        Average code length.

    Notes
    -----
    Zero-probability dummy symbols are appended so that the code tree is a
    full d-ary tree (a full tree needs n ≡ 1 (mod d-1) leaves).  The average
    length is weighted by ``p`` and normalized by ``sum(p)``, so unnormalized
    probability vectors are accepted.
    """
    p = np.asarray(p, dtype=float).ravel()
    n_sym = len(p)
    d = len(a)  # code order (arity); must be >= 2

    # Pad with zero-probability dummy symbols to make the code tree full.
    nx = n_sym + (1 - n_sym) % (d - 1)
    if nx < n_sym:
        nx += d - 1
    px = np.zeros(nx)
    px[:n_sym] = p

    cl = (nx - 1) // (d - 1)  # maximum potential code length
    # cd[s, i] holds symbol s's node rank at merge level i; ranks < d are
    # actual code digits (the node was merged at that level).
    cd = np.zeros((nx, cl), dtype=int)
    qx = px.copy()            # live node probabilities
    ix = np.arange(nx)        # current node index of each original symbol

    for i in range(cl - 1, -1, -1):
        nc = 1 + (i + 1) * (d - 1)  # number of live nodes at this level
        # rank[j] = position of node j in ascending probability order;
        # the d smallest (ranks 0..d-1) are merged at this level.
        jx = np.argsort(qx[:nc])
        rank = np.empty(nc, dtype=int)
        rank[jx] = np.arange(nc)

        # Record each symbol's rank as its (candidate) code digit, then remap
        # its node index: merged nodes collapse to index 0, the rest shift
        # down by d-1.  (Vectorized; every symbol's node satisfies ix < nc.)
        live = ix < nc
        cd[live, i] = rank[ix[live]]
        ix[live] = np.maximum(rank[ix[live]] - d + 1, 0)

        # Merge the d smallest probabilities into one node and re-sort.
        sq = np.sort(qx[:nc])
        merged = np.sort(np.append(sq[d:], sq[:d].sum()))
        qx = np.zeros_like(qx)
        qx[:len(merged)] = merged

    # Digits >= d mark levels where the symbol's node was not merged; the
    # remaining digits (< d) spell the codeword from root to leaf.
    cc = []
    ll = np.zeros(n_sym, dtype=int)
    for i in range(n_sym):
        digits = cd[i, cd[i, :] < d]
        ll[i] = len(digits)
        cc.append(''.join(a[c] for c in digits))

    # Average code length, tolerating an all-zero probability vector.
    total_p = np.sum(p)
    l = float(np.dot(p, ll) / total_p) if total_p > 0 else 0.0

    return cc, ll, l

v_entropy

V_ENTROPY - Shannon entropy of discrete and sampled continuous distributions.

v_entropy

v_entropy(
    p, dim=None, cond=None, arg=None, step=None
) -> ndarray

Calculate the entropy of discrete and sampled continuous distributions.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `p` | array_like | Probability array. | required |
| `dim` | int or list of int | Dimensions along which to evaluate entropy. Default: first non-singleton. | `None` |
| `cond` | list of int | Dimensions to use as conditional variables. | `None` |
| `arg` | list of int | Dimensions to use as parameters in output. | `None` |
| `step` | float or list of float | Sample increment for continuous distributions. | `None` |

Returns:

| Name | Type | Description |
|------|------|-------------|
| `h` | ndarray | Entropy value(s). |

Source code in pyvoicebox/v_entropy.py
def _default_entropy_dims(shape, exclude=()):
    """Return the first non-singleton dimension of *shape* as a one-element
    list, skipping dimensions listed in *exclude*; fall back to ``[0]``."""
    if not shape:  # 0-d (scalar) input
        return [0]
    # Threshold degenerates to 1 when every dimension is singleton, matching
    # the original "dimension with size >= min(2, max(shape))" rule.
    thresh = min(2, max(shape))
    for i, size in enumerate(shape):
        if size >= thresh and i not in exclude:
            return [i]
    return [0]


def v_entropy(p, dim=None, cond=None, arg=None, step=None) -> np.ndarray:
    """Calculate the entropy of discrete and sampled continuous distributions.

    Parameters
    ----------
    p : array_like
        Probability array.
    dim : int or list of int, optional
        Dimensions along which to evaluate entropy. Default: first non-singleton.
    cond : list of int, optional
        Dimensions to use as conditional variables.
    arg : list of int, optional
        Dimensions to use as parameters in output.
    step : float or list of float, optional
        Sample increment for continuous distributions.

    Returns
    -------
    h : ndarray
        Entropy value(s), in bits.

    Notes
    -----
    Probabilities are normalized by their sum along the evaluated dimensions,
    so unnormalized arrays are accepted.  When ``cond`` is given, conditional
    entropy is computed by the chain rule H(X|Y) = H(X,Y) - H(Y) over the
    ``dim`` and ``cond`` dimensions.
    """
    p = np.asarray(p, dtype=float)
    ndim = p.ndim

    # Per-dimension log2(step) corrections for sampled continuous
    # distributions; zeros (i.e. step == 1) when no step is supplied.
    # Missing trailing entries are filled with the first step value.
    if step is None:
        stp = np.zeros(max(ndim, 1))
    else:
        step = np.atleast_1d(step)
        stp = np.full(max(ndim, 1), step[0], dtype=float)
        stp[:len(step)] = step
        stp = np.log2(stp)

    # NOTE(review): `arg` is validated here but not yet used on the
    # unconditional path; it is only forwarded through the recursion.
    if arg is None:
        arg = []
    else:
        arg = [a for a in np.atleast_1d(arg) if a >= 0]

    if cond is None:
        cond = []
    else:
        cond = [c for c in np.atleast_1d(cond) if c >= 0]

    # Normalize `dim` to a list of non-negative axes for BOTH branches.
    # (Previously the conditional branch crashed with TypeError when `dim`
    # was None or a scalar int, since it did list(dim) unconditionally.)
    if dim is None:
        dim = _default_entropy_dims(p.shape, exclude=cond)
    else:
        dim = [d for d in np.atleast_1d(dim) if d >= 0]

    if cond:
        # Conditional entropy via the chain rule: H(X|Y) = H(X,Y) - H(Y).
        # Forward `step` so the continuous-distribution correction is not
        # silently dropped (it cancels on the cond dims, leaving stp[dim]).
        return (v_entropy(p, dim + list(cond), arg=arg, step=step)
                - v_entropy(p, list(cond), arg=arg, step=step))

    s = p.shape
    sd = 1  # total size of the evaluated dimensions
    for d in dim:
        if d < len(s):
            sd *= s[d]

    if sd == 1:
        # All selected dimensions are singleton: treat every entry as an
        # independent Bernoulli probability.  The (q == 0)/(q == 1) terms
        # guard log2 against 0, implementing the 0*log(0) = 0 convention.
        q = p.ravel()
        h = -np.log2(q + (q == 0)) * q - np.log2(1 - q + (q == 1)) * (1 - q)
        return h

    if p.ndim == 1:
        q = p
        sq = np.sum(q)
        # Normalize by the total so unnormalized vectors are accepted.
        h = np.sum(-np.log2(q + (q == 0)) * q) / sq + np.log2(sq)
    else:
        # Sum along the evaluated dimensions; remaining dims stay in output.
        axes = tuple(dim)
        sq = np.sum(p, axis=axes, keepdims=True)
        h = np.sum(-np.log2(p + (p == 0)) * p, axis=axes, keepdims=True) / sq + np.log2(sq)
        h = np.squeeze(h)

    # Continuous-distribution correction: add log2(step) for each evaluated
    # dimension (guard against dims beyond stp's length).
    valid = [d for d in dim if d < stp.size]
    h = h + np.sum(stp[valid])
    return h