Skip to content

Audio File Input/Output

Read and write a variety of audio file formats.

Read and write

v_readwav

V_READWAV - Read a .WAV format sound file.

Uses the soundfile library for core WAV I/O, preserving the MATLAB function signature for compatibility.

v_readwav

v_readwav(
    filename, mode="p", nmax=-1, nskip=0
) -> tuple[ndarray, int]

Read a .WAV format sound file.

Parameters:

Name Type Description Default
filename str

Path to the WAV file (with or without .wav extension).

required
mode str

Scaling mode string. Default is 'p'. 'p' : Scaled so +-1 equals full scale (default). 'r' : Raw unscaled data (integer values). 's' : Auto scale to make data peak = +-1. 'q' : Scaled to make 0dBm0 be unity mean square.

'p'
nmax int

Maximum number of samples to read. -1 for unlimited (default).

-1
nskip int

Number of samples to skip from start. Default is 0.

0

Returns:

Name Type Description
y ndarray

Data matrix of shape (samples, channels).

fs int

Sample frequency in Hz.

Source code in pyvoicebox/v_readwav.py
def v_readwav(filename, mode='p', nmax=-1, nskip=0) -> tuple[np.ndarray, int]:
    """Read a .WAV format sound file.

    Parameters
    ----------
    filename : str
        Path to the WAV file (with or without .wav extension).
    mode : str, optional
        Scaling mode string. Default is 'p'. The first of the following
        characters found in the string selects the scaling:
            'p' : Scaled so +-1 equals full scale (default).
            'r' : Raw unscaled data (integer values).
            's' : Auto scale to make data peak = +-1.
            'q' : Scaled to make 0dBm0 be unity mean square.
    nmax : int, optional
        Maximum number of samples to read. -1 for unlimited (default).
    nskip : int, optional
        Number of samples to skip from start. Default is 0.

    Returns
    -------
    y : ndarray
        Data matrix of shape (samples, channels), dtype float64.
    fs : int
        Sample frequency in Hz.

    Notes
    -----
    In 'r' mode, integer PCM files are read into 32-bit containers and
    shifted back down to the file's own bit depth, so that e.g. a 24-bit
    file yields values in -2**23 .. 2**23-1.  Non-PCM subtypes (float,
    A-law, mu-law, ...) are returned as soundfile's float values.
    """
    import os

    # Honour the documented "with or without .wav extension" contract.
    if (isinstance(filename, str) and not os.path.isfile(filename)
            and os.path.isfile(filename + '.wav')):
        filename = filename + '.wav'

    info = sf.info(filename)
    fs = info.samplerate
    subtype = info.subtype

    # Frame range to read
    start = nskip
    stop = info.frames if nmax < 0 else min(start + nmax, info.frames)

    # First scaling character in the mode string wins; default is 'p'
    sc = next((c for c in (mode or 'p') if c in 'prsq'), 'p')

    # soundfile left-justifies integer samples in the requested container,
    # so reading as int32 multiplies an N-bit sample by 2**(32-N).
    raw_shift = {'PCM_S8': 24, 'PCM_U8': 24, 'PCM_16': 16,
                 'PCM_24': 8, 'PCM_32': 0}

    if sc == 'r' and subtype in raw_shift:
        y, _ = sf.read(filename, start=start, stop=stop, dtype='int32',
                       always_2d=True)
        # Exact: every value is a multiple of 2**shift
        y = y.astype(np.float64) / float(2 ** raw_shift[subtype])
    else:
        # soundfile normalizes to +-1 when reading as float64
        y, _ = sf.read(filename, start=start, stop=stop, dtype='float64',
                       always_2d=True)

        if sc == 's':
            # Auto scale to peak = +-1 (skip empty or all-zero data)
            peak = np.max(np.abs(y)) if y.size else 0.0
            if peak > 0:
                y = y / peak
        elif sc == 'q':
            # 0dBm0 scaling (ITU G.711): mu-law files use 2.03761563,
            # everything else uses the A-law factor 2.03033976.
            y = y * (2.03761563 if subtype == 'ULAW' else 2.03033976)
        # 'p' mode (and 'r' on non-PCM data): values used as read

    return y, fs

v_writewav

V_WRITEWAV - Write a .WAV format sound file.

Uses the soundfile library for core WAV I/O.

v_writewav

v_writewav(d, fs, filename, mode='s') -> None

Write a .WAV format sound file.

Parameters:

Name Type Description Default
d array_like

Data to write. Shape (samples,) or (samples, channels).

required
fs int

Sample frequency in Hz.

required
filename str

Output filename (with or without .wav extension).

required
mode str

Mode string controlling format and scaling. Default is 's'. 's' : Auto scale to make data peak = +-1 (default). 'r' : Raw unscaled data. 'p' : Scaled so +-1 equals full scale. 'q' : Scaled to 0dBm0. '16': 16 bit PCM data (default bit depth). '8' : 8 bit PCM data. '24': 24 bit PCM data. '32': 32 bit PCM data. 'v' : 32-bit floating point. 'V' : 64-bit floating point. 'a' : 8-bit A-law PCM. 'u' : 8-bit mu-law PCM.

's'
Source code in pyvoicebox/v_writewav.py
def v_writewav(d, fs, filename, mode='s') -> None:
    """Write a .WAV format sound file.

    Parameters
    ----------
    d : array_like
        Data to write. Shape (samples,) or (samples, channels). A single
        row is treated as one channel and transposed.
    fs : int
        Sample frequency in Hz.
    filename : str
        Output filename (with or without .wav extension).
    mode : str, optional
        Mode string controlling format and scaling. Default is 's'.
            's' : Auto scale to make data peak = +-1 (default).
            'r' : Raw unscaled data.
            'p' : Scaled so +-1 equals full scale.
            'q' : Scaled to 0dBm0.
            '16': 16 bit PCM data (default bit depth).
            '8' : 8 bit PCM data.
            '24': 24 bit PCM data.
            '32': 32 bit PCM data.
            'v' : 32-bit floating point.
            'V' : 64-bit floating point.
            'a' : 8-bit A-law PCM.
            'u' : 8-bit mu-law PCM.
    """
    import os

    d = np.asarray(d, dtype=float)
    if d.ndim == 1:
        d = d.reshape(-1, 1)
    elif d.ndim == 2 and d.shape[0] == 1:
        d = d.T

    if not mode:
        mode = 's'

    # Determine subtype (bit depth / format); soundfile does the encoding
    if 'v' in mode:
        subtype = 'FLOAT'
    elif 'V' in mode:
        subtype = 'DOUBLE'
    elif 'a' in mode:
        subtype = 'ALAW'
    elif 'u' in mode:
        subtype = 'ULAW'
    else:
        # The first run of digits in the mode string is the bit depth
        bits = None
        for i, ch in enumerate(mode):
            if ch.isdigit():
                j = i
                while j < len(mode) and mode[j].isdigit():
                    j += 1
                bits = int(mode[i:j])
                break
        # WAV stores 8-bit PCM as unsigned; unknown depths fall back to 16
        bit_map = {8: 'PCM_U8', 16: 'PCM_16', 24: 'PCM_24', 32: 'PCM_32'}
        subtype = bit_map.get(bits, 'PCM_16')

    # First scaling character in the mode string wins; default is 's'
    sc = next((c for c in mode if c in 'prsq'), 's')

    # Scale data into the +-1 range that soundfile expects
    if sc == 's':
        peak = np.max(np.abs(d)) if d.size else 0.0
        if peak > 0:
            d = d / peak
    elif sc == 'q':
        # 0dBm0 factor (ITU G.711): mu-law uses 2.03761563, else A-law's
        d = d / (2.03761563 if subtype == 'ULAW' else 2.03033976)
    elif sc == 'r' and subtype not in ('FLOAT', 'DOUBLE'):
        # Raw integers: divide by the full-scale value of the target depth.
        # Companded formats are treated as 16-bit linear input.
        # NOTE(review): raw 8-bit input is assumed signed (-128..127) - confirm.
        full_scale = {'PCM_U8': 2 ** 7, 'PCM_16': 2 ** 15,
                      'PCM_24': 2 ** 23, 'PCM_32': 2 ** 31}
        d = d / full_scale.get(subtype, 2 ** 15)
    # 'p' mode: data already in +-1 range

    # Append .wav only when the final path component has no extension
    # (a dot in a directory name must not suppress it).
    if not os.path.splitext(filename)[1]:
        filename = filename + '.wav'

    sf.write(filename, d, fs, subtype=subtype)

v_readhtk

V_READHTK - Read an HTK parameter file.

v_readhtk

v_readhtk(file) -> tuple[ndarray, float, int, int, str]

Read an HTK parameter file.

Parameters:

Name Type Description Default
file str

Path to the HTK file.

required

Returns:

Name Type Description
d ndarray

Data: column vector for waveforms, one row per frame for other types.

fp float

Frame period in seconds.

dt int

Base data type (0-12).

tc int

Full type code including modifiers.

t str

Text version of type code, e.g. 'LPC_C_K'.

Source code in pyvoicebox/v_readhtk.py
def v_readhtk(file) -> tuple[np.ndarray, float, int, int, str]:
    """Read an HTK parameter file.

    Parameters
    ----------
    file : str
        Path to the HTK file.

    Returns
    -------
    d : ndarray
        Data: column vector for waveforms, one row per frame for other types.
    fp : float
        Frame period in seconds.
    dt : int
        Base data type (0-12).
    tc : int
        Full type code including modifiers.
    t : str
        Text version of type code, e.g. 'LPC_C_K'.
    """
    cc = _HTK_SUFFIXES
    nhb = len(cc)   # number of modifier (suffix) bits
    ndt = 6         # number of bits holding the base data type

    with open(file, 'rb') as f:
        # 12-byte big-endian header: frame count, period (100 ns units),
        # bytes per frame, type code
        nf, period, by, tc = struct.unpack('>iihh', f.read(12))
        fp = period * 1e-7

        # The type code is stored as int16; reinterpret it as unsigned
        if tc < 0:
            tc = tc + 65536

        # hd[i] is modifier bit i (bit ndt+i of tc); dt is the low ndt bits
        hd = [(tc >> (ndt + i)) & 1 for i in range(nhb)]
        dt = tc & ((1 << ndt) - 1)

        # IREFC hack: a file longer than its header implies is taken to be
        # compressed LPREFC carrying scale/bias vectors.
        if dt == 5:
            f.seek(0, 2)
            flen = f.tell()
            f.seek(12, 0)
            if flen > 14 + by * nf:
                dt = 2
                hd[4] = 1  # set compressed flag (index 4 = _C)
                nf = nf + 4

        if dt in (0, 5, 10):
            # 16-bit data for waveforms, IREFC and DISCRETE
            ncol = by // 2
            raw = np.frombuffer(f.read(nf * ncol * 2), dtype='>i2')
            d = raw.reshape(nf, ncol).astype(np.float64)
            if dt == 5:
                d = d / 32767.0
        elif hd[4]:
            # Compressed: the frame count includes 4 frames' worth of
            # float32 scale and bias vectors ahead of the int16 data
            nf = nf - 4
            ncol = by // 2
            scales = np.frombuffer(f.read(ncol * 4), dtype='>f4').astype(np.float64)
            biases = np.frombuffer(f.read(ncol * 4), dtype='>f4').astype(np.float64)
            raw = np.frombuffer(f.read(nf * ncol * 2), dtype='>i2').astype(np.float64)
            raw = raw.reshape(nf, ncol)
            d = (raw + biases[np.newaxis, :]) / scales[np.newaxis, :]
        else:
            # Uncompressed float32 data
            ncol = by // 4
            raw = np.frombuffer(f.read(nf * ncol * 4), dtype='>f4')
            d = raw.reshape(nf, ncol).astype(np.float64)

    # Build text type string: base kind followed by each set modifier
    kind_idx = min(dt, len(_HTK_KINDS) - 1)
    t = _HTK_KINDS[kind_idx]
    for flag, suffix in zip(hd, cc):
        if flag > 0:
            t += '_' + suffix

    return d, fp, dt, tc, t

v_writehtk

V_WRITEHTK - Write data in HTK format.

v_writehtk

v_writehtk(file, d, fp, tc) -> None

Write data in HTK format.

Parameters:

Name Type Description Default
file str

Path to the output file.

required
d array_like

Data to write: one row per frame.

required
fp float

Frame period in seconds.

required
tc int

Type code (see v_readhtk for details).

required
Source code in pyvoicebox/v_writehtk.py
def v_writehtk(file, d, fp, tc) -> None:
    """Write data in HTK format.

    Parameters
    ----------
    file : str
        Path to the output file.
    d : array_like
        Data to write: one row per frame.
    fp : float
        Frame period in seconds.
    tc : int
        Type code (see v_readhtk for details).

    Raises
    ------
    ValueError
        If compression is requested together with the IREFC format, or if
        a frame would occupy more than 32767 bytes. Both checks run before
        the output file is opened, so no partial file is left behind.
    """
    d = np.asarray(d, dtype=np.float64)

    # Silently ignore checksum request: clear the 4096 flag
    tc = tc & ~4096

    if d.ndim == 1:
        d = d.reshape(-1, 1)

    nf, nv = d.shape

    # hd[i] is modifier bit i (bit ndt+i of tc); dt is the low ndt bits
    nhb = 10
    ndt = 6
    hd = [(tc >> (ndt + i)) & 1 for i in range(nhb)]
    dt = tc & ((1 << ndt) - 1)
    compressed = bool(hd[4])

    # If waveform is a row vector (nf==1, dt==0), treat as column
    if dt == 0 and nf == 1:
        d = d.T
        nf, nv = d.shape

    # Waveform, IREFC, DISCRETE and compressed data are stored as int16;
    # everything else as float32.
    as_int16 = dt in (0, 5, 10) or compressed

    # Validate before touching the file system
    if dt == 5 and compressed:
        raise ValueError('Cannot use compression with IREFC format')
    nby = nv * (2 if as_int16 else 4)
    if nby > 32767:
        raise ValueError(
            f'byte count of frame is {nby} which exceeds 32767 '
            '(is data transposed?)')

    if compressed:
        # Per-column linear map of [min, max] onto [-32767, 32767];
        # constant columns keep scale 1 and map to 0.
        dx = np.max(d, axis=0)
        dn = np.min(d, axis=0)
        a = np.ones(nv)
        b = dx.copy()
        mk = dx > dn
        a[mk] = 65534.0 / (dx[mk] - dn[mk])
        b[mk] = 0.5 * (dx[mk] + dn[mk]) * a[mk]
        d = d * a[np.newaxis, :] - b[np.newaxis, :]
        nf = nf + 4  # scale and bias vectors count as 4 extra frames

    if dt == 5:
        d = d * 32767

    # The type code is stored as int16
    tc_write = tc - 65536 if tc > 32767 else tc

    header = struct.pack('>iihh', nf, round(fp * 1e7), nby, tc_write)

    with open(file, 'wb') as f:
        f.write(header)
        if as_int16:
            if compressed:
                # Compression factors: all scales, then all biases
                f.write(np.asarray(a, dtype='>f4').tobytes())
                f.write(np.asarray(b, dtype='>f4').tobytes())
            # Saturate rather than wrap on out-of-range samples;
            # rows (frames) are written in row-major order.
            int_data = np.clip(np.round(d), -32768, 32767)
            f.write(int_data.astype('>i2').tobytes())
        else:
            f.write(d.astype('>f4').tobytes())

Read only

v_readsph

V_READSPH - Read a SPHERE/TIMIT format sound file.

The SPHERE (SPeech HEader REsources) format is used by NIST for speech corpora like TIMIT.

v_readsph

v_readsph(filename, mode='p', nmax=-1, nskip=0) -> tuple

Read a SPHERE/TIMIT format sound file.

Parameters:

Name Type Description Default
filename str

Path to the SPH file (with or without .sph extension).

required
mode str

Scaling/format mode string. Default is 'p'. 'p' : Scaled so +-1 equals full scale (default). 'r' : Raw unscaled data (integer values). 's' : Auto scale to make data peak = +-1. 'l' : Force little endian byte order. 'b' : Force big endian byte order. 'w' : Also read .wrd annotation file. 't' : Also read .phn phonetic transcription file.

'p'
nmax int

Maximum number of samples to read. -1 for unlimited.

-1
nskip int

Number of samples to skip from start.

0

Returns:

Name Type Description
y ndarray

Data matrix of shape (samples, channels).

fs int

Sample frequency in Hz.

wrd list of tuple, optional

Word annotations [(start_time, end_time, text), ...]. Only returned if 'w' in mode.

phn list of tuple, optional

Phoneme annotations [(start_time, end_time, text), ...]. Only returned if 't' in mode.

ffx dict

File information dictionary.

Source code in pyvoicebox/v_readsph.py
def _sph_read_header(fid):
    """Parse the SPHERE header at the start of *fid*.

    Returns ``(fmt, hlen, hdr)``: the format tag from the first line, the
    declared header length in bytes, and a dict of the header fields.
    Raises IOError if the preamble is malformed or the file ends before
    an ``end_head`` line is found.
    """
    first_line = fid.read(16)
    if len(first_line) < 16:
        raise IOError("File does not begin with a SPHERE header")

    fmt = first_line[:7].decode('ascii', errors='replace').strip()
    try:
        hlen = int(first_line[8:15].decode('ascii').strip())
    except ValueError:
        raise IOError("File does not begin with a SPHERE header") from None

    hdr = {}
    while True:
        raw_line = fid.readline()
        if not raw_line:
            # EOF before end_head: stop instead of looping forever
            raise IOError("SPHERE header is missing its end_head marker")
        line_str = raw_line.decode('ascii', errors='replace').strip()

        # Skip blank lines and ';' comments
        if not line_str or line_str.startswith(';'):
            continue

        parts = line_str.split(None, 2)
        if len(parts) < 2:
            if 'end_head' in line_str:
                break
            continue

        field_name, type_spec = parts[0], parts[1]
        value_str = parts[2] if len(parts) >= 3 else ''

        if type_spec.startswith('-s'):
            # String field: "-sN" carries the string length N
            try:
                slen = int(type_spec[2:])
                hdr[field_name] = value_str[:slen].strip()
            except ValueError:
                hdr[field_name] = value_str.strip()
        elif type_spec.startswith('-i'):
            try:
                hdr[field_name] = int(value_str)
            except ValueError:
                hdr[field_name] = 0
        elif type_spec.startswith('-r'):
            try:
                hdr[field_name] = float(value_str)
            except ValueError:
                hdr[field_name] = 0.0
        else:
            hdr[field_name] = value_str

    return fmt, hlen, hdr


def v_readsph(filename, mode='p', nmax=-1, nskip=0) -> tuple:
    """Read a SPHERE/TIMIT format sound file.

    Parameters
    ----------
    filename : str
        Path to the SPH file (with or without .sph extension; a .wav
        extension is also tried).
    mode : str, optional
        Scaling/format mode string. Default is 'p'.
        'p' : Scaled so +-1 equals full scale (default).
        'r' : Raw unscaled data (integer values).
        's' : Auto scale to make data peak = +-1.
        'l' : Force little endian byte order.
        'b' : Force big endian byte order.
        'w' : Also read .wrd annotation file.
        't' : Also read .phn phonetic transcription file.
    nmax : int, optional
        Maximum number of samples to read. -1 for unlimited.
    nskip : int, optional
        Number of samples to skip from start. Negative values are
        treated as 0.

    Returns
    -------
    y : ndarray
        Data matrix of shape (samples, channels).
    fs : int
        Sample frequency in Hz.
    wrd : list of tuple, optional
        Word annotations [(start_time, end_time, text), ...].
        Only returned if 'w' in mode.
    phn : list of tuple, optional
        Phoneme annotations [(start_time, end_time, text), ...].
        Only returned if 't' in mode.
    ffx : dict
        File information dictionary.

    Raises
    ------
    FileNotFoundError
        If the file cannot be found under any of the tried names.
    IOError
        If the header is malformed or has no ``end_head`` line.
    ValueError
        If the sample size is not 1, 2 or 4 bytes.
    """
    if not mode:
        mode = 'p'

    # First scaling character in the mode string wins; default is 'p'
    sc = next((c for c in mode if c in 'prs'), 'p')

    # Find file
    if not os.path.isfile(filename):
        for ext in ('.sph', '.wav'):
            if os.path.isfile(filename + ext):
                filename = filename + ext
                break
        else:
            raise FileNotFoundError(f"Cannot open {filename} for input")

    # An explicit 'l'/'b' overrides the header ('l' wins if both are given)
    if 'l' in mode:
        forced_bo = '<'
    elif 'b' in mode:
        forced_bo = '>'
    else:
        forced_bo = None

    with open(filename, 'rb') as fid:
        fmt, hlen, hdr = _sph_read_header(fid)

        # Byte order: forced by mode, else from the header, else little
        if forced_bo is not None:
            bo = forced_bo
        elif 'sample_byte_format' in hdr:
            bo = '<' if hdr['sample_byte_format'].startswith('0') else '>'
        else:
            bo = '<'

        # Extract key parameters
        nsamp = hdr.get('sample_count', 0)
        nchan = hdr.get('channel_count', 1)
        nbytes = hdr.get('sample_n_bytes', 2)
        nbits = hdr.get('sample_sig_bits', 16)
        fs = hdr.get('sample_rate', 1)

        # Determine coding
        coding = hdr.get('sample_coding', 'pcm')
        is_ulaw = 'ulaw' in coding.lower() or 'mu-law' in coding.lower()

        if nsamp == 0:
            # No sample count in the header: infer it from the file size
            fid.seek(0, 2)
            file_size = fid.tell()
            nsamp = (file_size - hlen) // (nchan * nbytes)

        # Clamp so a negative skip cannot seek back into the header
        start = max(nskip, 0)
        ksamples = nsamp - start
        if nmax >= 0:
            ksamples = min(nmax, ksamples)

        if ksamples > 0:
            fid.seek(hlen + nchan * nbytes * start)
            nread = nchan * ksamples

            if nbytes == 1:
                raw = np.frombuffer(fid.read(nread), dtype=np.uint8)
                if is_ulaw:
                    from pyvoicebox.v_pcmu2lin import v_pcmu2lin
                    y = v_pcmu2lin(raw.astype(float))
                    pk = 2.005649
                else:
                    # 8-bit PCM is offset binary
                    y = raw.astype(float) - 128
                    pk = 128
            elif nbytes == 2:
                y = np.frombuffer(fid.read(nread * 2),
                                  dtype=np.dtype(bo + 'i2')).astype(float)
                pk = 32768
            elif nbytes == 4:
                y = np.frombuffer(fid.read(nread * 4),
                                  dtype=np.dtype(bo + 'i4')).astype(float)
                pk = 2**31
            else:
                raise ValueError(f"Unsupported sample size: {nbytes} bytes")

            # Scale
            if sc == 's':
                peak = np.max(np.abs(y))
                if peak > 0:
                    y = y / peak
            elif sc == 'p':
                # NOTE(review): 1-byte mu-law data is left unscaled in 'p'
                # mode although pk is set for it - confirm this is intended.
                if not is_ulaw or nbytes > 1:
                    y = y / pk
            # 'r' mode: no scaling

            # Samples are interleaved by channel
            if nchan > 1:
                y = y.reshape(-1, nchan)
            else:
                y = y[:, np.newaxis]
        else:
            y = np.array([]).reshape(0, nchan)

    ffx = {
        'filename': filename,
        'header': hdr,
        'format': fmt,
        'sample_count': nsamp,
        'channel_count': nchan,
        'sample_n_bytes': nbytes,
        'sample_sig_bits': nbits,
        'sample_rate': fs,
    }

    # Optional annotations sit between fs and ffx in the returned tuple
    results = [y, fs]

    if 'w' in mode:
        results.append(_read_annotation(filename, 'wrd', fs))

    if 't' in mode:
        results.append(_read_annotation(filename, 'phn', fs))

    results.append(ffx)
    return tuple(results)

v_readaif

V_READAIF - Read a .AIF (AIFF) format sound file.

Uses the soundfile library for robust AIFF reading.

v_readaif

v_readaif(
    filename, mode="p", nmax=-1, nskip=0
) -> tuple[ndarray, int]

Read a .AIF (AIFF) format sound file.

Parameters:

Name Type Description Default
filename str

Path to the AIF file (with or without .aif extension).

required
mode str

Scaling mode string. Default is 'p'. 'p' : Scaled so +-1 equals full scale (default). 'r' : Raw unscaled data (integer values). 's' : Auto scale to make data peak = +-1.

'p'
nmax int

Maximum number of samples to read. -1 for unlimited (default).

-1
nskip int

Number of samples to skip from start. Default is 0.

0

Returns:

Name Type Description
y ndarray

Data matrix of shape (samples, channels).

fs int

Sample frequency in Hz.

Source code in pyvoicebox/v_readaif.py
def v_readaif(filename, mode='p', nmax=-1, nskip=0) -> tuple[np.ndarray, int]:
    """Read a .AIF (AIFF) format sound file.

    Parameters
    ----------
    filename : str
        Path to the AIF file (with or without .aif/.aiff extension).
    mode : str, optional
        Scaling mode string. Default is 'p'.
        'p' : Scaled so +-1 equals full scale (default).
        'r' : Raw unscaled data (integer values).
        's' : Auto scale to make data peak = +-1.
    nmax : int, optional
        Maximum number of samples to read. -1 for unlimited (default).
    nskip : int, optional
        Number of samples to skip from start. Default is 0.

    Returns
    -------
    y : ndarray
        Data matrix of shape (samples, channels). In 'r' mode on integer
        PCM files this is an int32 array holding values at the file's own
        bit depth; otherwise float64.
    fs : int
        Sample frequency in Hz.

    Raises
    ------
    FileNotFoundError
        If the file cannot be found under any of the tried names.
    """
    import soundfile as sf
    import os

    # Try with and without extension
    if not os.path.isfile(filename):
        for ext in ('.aif', '.aiff'):
            if os.path.isfile(filename + ext):
                filename = filename + ext
                break
        else:
            raise FileNotFoundError(f"Cannot open {filename} for input")

    info = sf.info(filename)
    fs = info.samplerate

    start = nskip
    stop = info.frames if nmax < 0 else min(start + nmax, info.frames)

    # First scaling character in the mode string wins; default is 'p'
    sc = next((c for c in (mode or 'p') if c in 'prsq'), 'p')

    # soundfile left-justifies integer samples in the requested container,
    # so reading as int32 multiplies an N-bit sample by 2**(32-N).
    raw_shift = {'PCM_S8': 24, 'PCM_U8': 24, 'PCM_16': 16,
                 'PCM_24': 8, 'PCM_32': 0}

    if sc == 'r' and info.subtype in raw_shift:
        y, _ = sf.read(filename, start=start, stop=stop, dtype='int32',
                       always_2d=True)
        # Arithmetic shift back down to the file's bit depth (exact)
        y = y >> raw_shift[info.subtype]
    else:
        # Read as float (normalized to [-1, 1]); also used for raw mode on
        # non-integer encodings, where there are no integer values to return
        y, _ = sf.read(filename, start=start, stop=stop, dtype='float64',
                       always_2d=True)

        if sc == 's':
            peak = np.max(np.abs(y)) if y.size else 0.0
            if peak > 0:
                y = y / peak

    return y, fs

v_readau

V_READAU - Read a SUN .AU format sound file.

Uses the soundfile library for robust AU file reading.

v_readau

v_readau(filename, mode='') -> tuple[ndarray, int, dict]

Read a SUN .AU format sound file.

Parameters:

Name Type Description Default
filename str

Path to the AU file (with or without .au extension).

required
mode str

Mode string: 't' : trim leading and trailing silences 'h' : read header only

''

Returns:

Name Type Description
y ndarray

Data matrix with one channel per column. If mode='h', returns header parameters as a dict.

fs int

Sample frequency in Hz.

h dict

Header parameters: 'sample_rate', 'num_channels', 'frames', 'format', 'subtype'.

Source code in pyvoicebox/v_readau.py
def v_readau(filename, mode='') -> tuple[np.ndarray, int, dict]:
    """Read a SUN .AU format sound file.

    Parameters
    ----------
    filename : str
        Path to the AU file (with or without .au extension).
    mode : str, optional
        Mode string:
        't' : trim leading and trailing silences
        'h' : read header only

    Returns
    -------
    y : ndarray
        Data matrix with one channel per column.
        If 'h' is in mode, the header dict is returned here instead.
    fs : int
        Sample frequency in Hz.
    h : dict
        Header parameters:
        'sample_rate', 'num_channels', 'frames', 'format', 'subtype'.

    Raises
    ------
    FileNotFoundError
        If neither ``filename`` nor ``filename + '.au'`` exists.
    """
    import soundfile as sf
    import os

    mode = mode or ''

    if not os.path.isfile(filename):
        if os.path.isfile(filename + '.au'):
            filename = filename + '.au'
        else:
            raise FileNotFoundError(f"Cannot open {filename} for input")

    info = sf.info(filename)
    fs = info.samplerate
    h = {
        'sample_rate': info.samplerate,
        'num_channels': info.channels,
        'frames': info.frames,
        'format': info.format,
        'subtype': info.subtype,
    }

    if 'h' in mode:
        # Header-only request: no sample data is read
        return h, fs, h

    y, _ = sf.read(filename, dtype='float64', always_2d=True)

    if 't' in mode and y.shape[0] > 0:
        # Trim leading and trailing silence: keep the span between the
        # first and last frames whose energy exceeds 1e-4 of the maximum
        energy = np.sum(y ** 2, axis=1)
        threshold = np.max(energy) * 1e-4
        nonsilent = np.flatnonzero(energy > threshold)
        if nonsilent.size > 0:
            y = y[nonsilent[0]:nonsilent[-1] + 1, :]

    return y, fs, h

v_readflac

V_READFLAC - Read a .FLAC format sound file.

Uses the soundfile library for FLAC decoding.

v_readflac

v_readflac(filename, mode='p') -> tuple[ndarray, int]

Read a .FLAC format sound file.

Parameters:

Name Type Description Default
filename str

Path to the FLAC file.

required
mode str

Scaling mode string. Default is 'p'. 'p' : Scaled so +-1 equals full scale (default). 'r' : Raw unscaled data (integer values). 's' : Auto scale to make data peak = +-1.

'p'

Returns:

Name Type Description
y ndarray

Data matrix of shape (samples, channels).

fs int

Sample frequency in Hz.

Source code in pyvoicebox/v_readflac.py
def v_readflac(filename, mode='p') -> tuple[np.ndarray, int]:
    """Read a .FLAC format sound file.

    Parameters
    ----------
    filename : str
        Path to the FLAC file (with or without .flac extension).
    mode : str, optional
        Scaling mode string. Default is 'p'.
        'p' : Scaled so +-1 equals full scale (default).
        'r' : Raw unscaled data (integer values).
        's' : Auto scale to make data peak = +-1.

    Returns
    -------
    y : ndarray
        Data matrix of shape (samples, channels). In 'r' mode on integer
        PCM files this is an int32 array holding values at the file's own
        bit depth; otherwise float64.
    fs : int
        Sample frequency in Hz.

    Raises
    ------
    FileNotFoundError
        If neither ``filename`` nor ``filename + '.flac'`` exists.
    """
    import soundfile as sf
    import os

    if not os.path.isfile(filename):
        if os.path.isfile(filename + '.flac'):
            filename = filename + '.flac'
        else:
            raise FileNotFoundError(f"Cannot open {filename} for input")

    # First scaling character in the mode string wins; default is 'p'
    sc = next((c for c in (mode or 'p') if c in 'prsq'), 'p')

    info = sf.info(filename)
    fs = info.samplerate

    # soundfile left-justifies integer samples in the requested container,
    # so reading as int32 multiplies an N-bit sample by 2**(32-N).
    raw_shift = {'PCM_S8': 24, 'PCM_U8': 24, 'PCM_16': 16,
                 'PCM_24': 8, 'PCM_32': 0}

    if sc == 'r' and info.subtype in raw_shift:
        y, _ = sf.read(filename, dtype='int32', always_2d=True)
        # Arithmetic shift back down to the file's bit depth (exact)
        y = y >> raw_shift[info.subtype]
    else:
        y, _ = sf.read(filename, dtype='float64', always_2d=True)

        if sc == 's':
            peak = np.max(np.abs(y)) if y.size else 0.0
            if peak > 0:
                y = y / peak

    return y, fs

v_readsfs

V_READSFS - Read a .SFS (Speech Filing System) format sound file.

The SFS format was developed by Mark Huckvale at UCL for speech research. This is a simplified Python reader for the most common data types.

v_readsfs

v_readsfs(
    filename, ty=1, sub=-1, mode="p", nmax=-1, nskip=0
) -> tuple[ndarray, float, dict]

Read a .SFS format sound file.

Parameters:

Name Type Description Default
filename str

Path to the SFS file.

required
ty int

Type of data item: 0=main header, 1=speech, 2=laryngograph, 5=annotation. Default is 1.

1
sub int

Instance of type ty: 0=first, -1=last (default).

-1
mode str

Mode string. Default is 'p'. 'i' : Force integer data to be at least 16 bits.

'p'
nmax int

Maximum number of samples to read. -1 for unlimited.

-1
nskip int

Number of samples to skip from start.

0

Returns:

Name Type Description
y ndarray

Data array. For speech data, column vector.

fs float

Sample frequency in Hz.

hd dict

Header information.

Source code in pyvoicebox/v_readsfs.py
def v_readsfs(filename, ty=1, sub=-1, mode='p', nmax=-1, nskip=0) -> tuple[np.ndarray, float, dict]:
    """Read a .SFS format sound file.

    Parameters
    ----------
    filename : str
        Path to the SFS file.
    ty : int, optional
        Type of data item: 0=main header, 1=speech, 2=laryngograph,
        5=annotation. Default is 1.
    sub : int, optional
        Instance of type ty: 0=first, -1=last (default). Any other value
        selects the first item of type ``ty`` whose subtype equals ``sub``.
    mode : str, optional
        Mode string. Default is 'p'.
        'i' : Force integer data to be at least 16 bits.
    nmax : int, optional
        Maximum number of samples to read. -1 for unlimited.
    nskip : int, optional
        Number of samples to skip from start.

    Returns
    -------
    y : ndarray
        Data array of floats with shape (frames, frame_size). Empty when
        the main header is requested, when no frames remain after
        ``nskip``/``nmax``, when the item's data is flagged as not present,
        or when the item is structured.
    fs : float
        Sample frequency in Hz (reciprocal of the frame duration), or 0.0
        when the frame duration is not positive.
    hd : dict
        Header information. For ``ty=0`` it only holds 'file_type'.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist.
    IOError
        If the 512-byte main header cannot be read.
    ValueError
        If the file is not a 'UC2' SFS file, the requested item cannot be
        found, or the item declares an unsupported data size.
    """
    if not os.path.isfile(filename):
        raise FileNotFoundError(f"Cannot open {filename} for input")

    with open(filename, 'rb') as fid:
        # Read main header
        t = fid.read(512)
        if len(t) < 512:
            raise IOError(f"Cannot read header from SFS file {filename}")

        if t[:3] != b'UC2':
            raise ValueError(f"{filename} is not an SFS file type UC2")

        # Item list entries are
        # (type, subtype, data length in bytes, header offset, byte-order flag);
        # entry 0 stands for the main header itself.
        itemlist = [(0, 1, 0, 0, t[511])]

        # Walk the chain of 512-byte item headers, skipping each item's data.
        for _ in range(1, 200):
            pos = fid.tell()
            t = fid.read(512)
            if len(t) < 512:
                break

            # Each item carries its own byte-order flag in the last header byte.
            item_bo = t[511]
            ibo = '>' if item_bo == 0 else '<'

            item_type, item_subtype = struct.unpack_from(ibo + '2i', t, 388)
            item_length = struct.unpack_from(ibo + 'i', t, 412)[0]

            if abs(item_type) > 29:
                break

            itemlist.append((item_type, item_subtype, item_length, pos, item_bo))

            # Skip data
            fid.seek(item_length, 1)

        # Find requested item
        if ty == 0:
            it = 0
        else:
            matches = [i for i, item in enumerate(itemlist) if item[0] == ty]
            if not matches:
                raise ValueError(f"Cannot find item type {ty} in file {filename}")
            if sub == 0:
                it = matches[0]
            elif sub == -1:
                it = matches[-1]
            else:
                matches_sub = [i for i in matches if itemlist[i][1] == sub]
                if not matches_sub:
                    raise ValueError(f"Cannot find item {ty}.{sub} in file {filename}")
                it = matches_sub[0]

        y = np.array([])
        fs = 0.0
        hd = {}

        if it == 0:
            # Read main header info
            fid.seek(0)
            mb = fid.read(512)
            hd['file_type'] = _zerotrim(mb[0:4])
            return y, fs, hd

        # Read item data
        lit = itemlist[it]
        ibo = '>' if lit[4] == 0 else '<'

        fid.seek(lit[3])  # seek to item header
        ihdr = fid.read(512)

        # Text fields of the item header
        processing_history = _zerotrim(ihdr[0:256])
        parameters = _zerotrim(ihdr[256:384])
        comment = _zerotrim(ihdr[436:456])

        # Numeric header fields: eight 32-bit integers at offset 384, then
        # frame duration (double), data-present flag (int), time offset (double).
        hdr = np.zeros(11)
        hdr[0:8] = struct.unpack_from(ibo + '8i', ihdr, 384)
        hdr[8] = struct.unpack_from(ibo + 'd', ihdr, 416)[0]
        if hdr[8] > 0:
            fs = 1.0 / hdr[8]
        hdr[9] = struct.unpack_from(ibo + 'i', ihdr, 424)[0]
        hdr[10] = struct.unpack_from(ibo + 'd', ihdr, 428)[0]

        hd = {
            'processing_history': processing_history,
            'parameters': parameters,
            'comment': comment,
            'data_type': int(hdr[1]),
            'subtype': int(hdr[2]),
            'floating': int(hdr[3]),
            'datasize': int(hdr[4]),
            'framesize': int(hdr[5]),
            'numframes': int(hdr[6]),
            'data_length': int(hdr[7]),
            'frame_duration': hdr[8],
            'data_present': int(hdr[9]),
            'time_offset': hdr[10],
            'sample_rate': fs,
        }

        # Read data
        ksamples = int(hdr[6]) - nskip
        if nmax >= 0:
            ksamples = min(nmax, ksamples)

        if ksamples > 0 and int(hdr[9]) == 1:
            ds = int(hdr[4])  # data size in bytes
            fsz = int(hdr[5])  # frame size

            if int(hdr[3]) >= 0:  # non-structured
                if int(hdr[3]) > 0:  # floating point
                    # Build the dtype from the item's byte-order flag so that
                    # files written on a machine of the other endianness
                    # decode correctly.
                    if ds == 4:
                        np_dtype = np.dtype(ibo + 'f4')
                    elif ds == 8:
                        np_dtype = np.dtype(ibo + 'f8')
                    else:
                        raise ValueError("Invalid data size in SFS file")
                else:  # integer
                    if ds == 1 and 'i' not in mode:
                        np_dtype = np.dtype(np.uint8)
                    elif ds <= 2:
                        # NOTE(review): 8-bit data read in 'i' mode is
                        # reinterpreted as 16-bit words, hence the halved
                        # frame size -- confirm against the reference
                        # implementation.
                        np_dtype = np.dtype(ibo + 'i2')
                        fsz = int(np.ceil(fsz * ds / 2))
                    elif ds == 4:
                        np_dtype = np.dtype(ibo + 'i4')
                    else:
                        raise ValueError("Invalid data size in SFS file")

                # Size every byte offset from the dtype actually read, so the
                # seek/read lengths stay consistent with `fsz` even when the
                # element width differs from the header's `ds`.
                itemsize = np_dtype.itemsize

                # Seek to data start
                fid.seek(lit[3] + 512 + nskip * fsz * itemsize)
                nd = fsz * ksamples
                raw = fid.read(nd * itemsize)
                y = np.frombuffer(raw, dtype=np_dtype, count=nd)
                y = y.astype(float)
                if fsz > 1:
                    y = y.reshape(ksamples, fsz)
                else:
                    y = y[:, np.newaxis]

    return y, fs, hd

v_readcnx

V_READCNX - Read a .CNX format sound file.

This is the format of the BT Connex-S1 alphabet database.

v_readcnx

v_readcnx(filename, mode='') -> tuple[ndarray, float, dict]

Read a .CNX format sound file.

Parameters:

Name Type Description Default
filename str

Path to the CNX file (with or without .cnx extension).

required
mode str

Mode string. Recognised flags: 't' trims the data to the start/end samples indicated in the header; 'h' reads the header only.

''

Returns:

Name Type Description
y ndarray

Column vector containing the waveform (16-bit samples converted to float).

fs float

Sample frequency in Hz.

h dict

Header parameters: 'num_samples' = number of samples in file; 'status' = 0 (good) or 1 (bad); 'start_sample' = start sample number; 'end_sample' = ending sample number; 'speaker_id' = speaker identification number; 'speaker_age' = speaker age group; 'speaker_sex' = 0 (male) or 1 (female); 'ascii_char' = ASCII character code; 'repetition' = repetition number.

Source code in pyvoicebox/v_readcnx.py
def v_readcnx(filename, mode='') -> tuple[np.ndarray, float, dict]:
    """Read a .CNX format sound file.

    Parameters
    ----------
    filename : str
        Path to the CNX file (with or without .cnx extension).
    mode : str, optional
        Mode string:
        't' : trim to start/end samples indicated in header
        'h' : read header only

    Returns
    -------
    y : ndarray
        Column vector containing the waveform: little-endian 16-bit samples
        converted to float. Empty (1-D) when 'h' is in ``mode``.
    fs : float
        Sample frequency in Hz.
    h : dict
        Header parameters:
        'num_samples' : number of samples in file
        'status' : 0=good, 1=bad
        'start_sample' : start sample number
        'end_sample' : ending sample number
        'speaker_id' : speaker identification number
        'speaker_age' : speaker age group
        'speaker_sex' : 0=male, 1=female
        'ascii_char' : ascii character
        'repetition' : repetition number
        Fields that are missing from the header or cannot be parsed are 0.

    Raises
    ------
    FileNotFoundError
        If neither ``filename`` nor ``filename + '.cnx'`` exists.
    IOError
        If the 512-byte header cannot be read.
    ValueError
        If the header has no '|' delimiter or the sample frequency field
        is not a number.
    """
    if not os.path.isfile(filename):
        if os.path.isfile(filename + '.cnx'):
            filename = filename + '.cnx'
        else:
            raise FileNotFoundError(f"Cannot open {filename} for input")

    # (field name, 0-based delimiter index that closes the field, special)
    # Original MATLAB ix (1-based):
    # ix=[17 71; 18 0; 19 0; 10 0; 12 0; 13 77; 15 -1; 16 0]
    # special > 0 : field is 0 when its first byte equals that code, else 1
    # special == -1 : field is the raw value of its first byte
    # special == 0 : field is a decimal integer
    fields = (
        ('status', 16, 71),        # check against 'G' (71)
        ('start_sample', 17, 0),
        ('end_sample', 18, 0),
        ('speaker_id', 9, 0),
        ('speaker_age', 11, 0),
        ('speaker_sex', 12, 77),   # check against 'M' (77)
        ('ascii_char', 14, -1),    # raw byte
        ('repetition', 15, 0),
    )

    with open(filename, 'rb') as fid:
        hdr = fid.read(512)
        if len(hdr) != 512:
            raise IOError(f"Cannot read header from connex file {filename}")

        # Positions of the pipe delimiters, searching from byte 4 onwards
        pipe = ord('|')
        delimiters = [i for i in range(4, len(hdr)) if hdr[i] == pipe]
        if not delimiters:
            raise ValueError(
                f"No field delimiters in header of connex file {filename}")

        # Sample frequency: characters from byte 16 up to the first delimiter
        fs = float(hdr[16:delimiters[0]].decode('ascii', errors='replace').strip())

        h = {}
        for name, del_idx, special in fields:
            h[name] = 0  # value kept when the field is missing or unparsable

            # A truncated header may not contain this field at all.
            if del_idx > len(delimiters):
                continue

            start = delimiters[del_idx - 1] + 1 if del_idx > 0 else 4
            end = delimiters[del_idx] if del_idx < len(delimiters) else len(hdr)

            # The value is whatever follows the first '=' in the field
            _, eq, value_bytes = hdr[start:end].partition(b'=')
            if not eq:
                continue

            if special == -1:
                h[name] = value_bytes[0] if value_bytes else 0
            elif special > 0:
                first = value_bytes[0] if value_bytes else 0
                h[name] = 0 if first == special else 1
            else:
                val_str = value_bytes.decode('ascii', errors='replace').strip()
                try:
                    h[name] = int(val_str) if val_str else 0
                except ValueError:
                    h[name] = 0

        # Whole 16-bit samples stored after the 512-byte header
        fid.seek(0, 2)
        num_samples = (fid.tell() - 512) // 2

        if 'h' in mode:
            h['num_samples'] = num_samples
            return np.array([]), fs, h

        if 't' in mode:
            # Read only the inclusive range start_sample..end_sample
            start_samp = h['start_sample']
            count = max(h['end_sample'] - start_samp + 1, 0)
            fid.seek(512 + 2 * start_samp, 0)  # byte offset of start_sample
            raw = fid.read(count * 2)
        else:
            fid.seek(512, 0)
            raw = fid.read()

        # Drop a stray trailing byte so the buffer holds whole samples only;
        # this matches the floor division used for num_samples.
        if len(raw) % 2:
            raw = raw[:-1]
        y = np.frombuffer(raw, dtype='<i2')

        h['num_samples'] = num_samples

    y = y.astype(float)
    if y.ndim == 1:
        y = y[:, np.newaxis]

    return y, fs, h