Audio File Input/Output¶

Read and write a variety of audio file formats.

Read and write¶

v_readwav ¶

V_READWAV - Read a .WAV format sound file.

Uses the soundfile library for core WAV I/O, preserving the MATLAB function signature for compatibility.

v_readwav ¶

v_readwav(
    filename, mode="p", nmax=-1, nskip=0
) -> tuple[ndarray, int]

Read a .WAV format sound file.

Parameters:

Name	Type	Description	Default
`filename`	`str`	Path to the WAV file (with or without .wav extension).	required
`mode`	`str`	Scaling mode string. Default is 'p'. 'p' : Scaled so +-1 equals full scale (default). 'r' : Raw unscaled data (integer values). 's' : Auto scale to make data peak = +-1. 'q' : Scaled to make 0dBm0 be unity mean square.	`'p'`
`nmax`	`int`	Maximum number of samples to read. -1 for unlimited (default).	`-1`
`nskip`	`int`	Number of samples to skip from start. Default is 0.	`0`

Returns:

Name	Type	Description
`y`	`ndarray`	Data matrix of shape (samples, channels).
`fs`	`int`	Sample frequency in Hz.

Source code in pyvoicebox/v_readwav.py

def v_readwav(filename, mode='p', nmax=-1, nskip=0) -> tuple[np.ndarray, int]:
    """Read a .WAV format sound file.

    Parameters
    ----------
    filename : str
        Path to the WAV file. If it is a string that does not name an
        existing file but ``filename + '.wav'`` does, the latter is used.
    mode : str, optional
        Scaling mode string; the first of these characters found is used.
        Default is 'p'.
            'p' : Scaled so +-1 equals full scale (default).
            'r' : Raw unscaled data (integer values).
            's' : Auto scale to make data peak = +-1.
            'q' : Scaled to make 0dBm0 be unity mean square.
    nmax : int, optional
        Maximum number of samples to read. -1 for unlimited (default).
    nskip : int, optional
        Number of samples to skip from start. Default is 0.

    Returns
    -------
    y : ndarray
        float64 data matrix of shape (samples, channels).
    fs : int
        Sample frequency in Hz.
    """
    import os

    # Honour the documented "with or without .wav extension" contract.
    # Only plain strings are probed so that other inputs are passed to
    # soundfile untouched.
    if (isinstance(filename, str) and not os.path.isfile(filename)
            and os.path.isfile(filename + '.wav')):
        filename = filename + '.wav'

    info = sf.info(filename)
    fs = info.samplerate
    subtype = info.subtype

    # Frame range to read
    start = nskip
    stop = info.frames if nmax < 0 else min(start + nmax, info.frames)

    # First scaling character in the mode string wins; default is 'p'
    sc = next((c for c in (mode or 'p') if c in 'prsq'), 'p')

    if sc == 'r':
        # soundfile scales integer reads to the full range of the requested
        # dtype, so narrower PCM words arrive multiplied by 2**shift.
        # Map each PCM subtype to (read dtype, shift to undo).
        raw_layout = {
            'PCM_S8': ('int16', 8),
            'PCM_U8': ('int16', 8),
            'PCM_16': ('int16', 0),
            'PCM_24': ('int32', 8),
            'PCM_32': ('int32', 0),
        }
        layout = raw_layout.get(subtype)
        if layout is None:
            # Floating point and companded files have no integer
            # representation here: return the decoded values unchanged.
            y, _ = sf.read(filename, start=start, stop=stop, dtype='float64',
                           always_2d=True)
        else:
            dtype, shift = layout
            y, _ = sf.read(filename, start=start, stop=stop, dtype=dtype,
                           always_2d=True)
            y = y.astype(np.float64) / (1 << shift)
    else:
        # Read as float64, soundfile normalizes integer formats to +-1
        y, _ = sf.read(filename, start=start, stop=stop, dtype='float64',
                       always_2d=True)

        if sc == 's':
            # Auto scale to peak = +-1 (nothing to do for empty/silent data)
            peak = np.max(np.abs(y)) if y.size else 0.0
            if peak > 0:
                y = y / peak
        elif sc == 'q':
            # Scale to 0dBm0 (ITU G.711): mu-law files use 2.03761563,
            # everything else uses 2.03033976.
            y = y * (2.03761563 if subtype == 'ULAW' else 2.03033976)
        # 'p' mode: already +-1 from soundfile

    # always_2d=True guarantees the (samples, channels) shape
    return y, fs

v_writewav ¶

V_WRITEWAV - Write a .WAV format sound file.

Uses the soundfile library for core WAV I/O.

v_writewav ¶

v_writewav(d, fs, filename, mode='s') -> None

Write a .WAV format sound file.

Parameters:

Name	Type	Description	Default
`d`	`array_like`	Data to write. Shape (samples,) or (samples, channels).	required
`fs`	`int`	Sample frequency in Hz.	required
`filename`	`str`	Output filename (with or without .wav extension).	required
`mode`	`str`	Mode string controlling format and scaling. Default is 's'. 's' : Auto scale to make data peak = +-1 (default). 'r' : Raw unscaled data. 'p' : Scaled so +-1 equals full scale. 'q' : Scaled to 0dBm0. '16': 16 bit PCM data (default bit depth). '8' : 8 bit PCM data. '24': 24 bit PCM data. '32': 32 bit PCM data. 'v' : 32-bit floating point. 'V' : 64-bit floating point. 'a' : 8-bit A-law PCM. 'u' : 8-bit mu-law PCM.	`'s'`

Source code in pyvoicebox/v_writewav.py

def v_writewav(d, fs, filename, mode='s') -> None:
    """Write a .WAV format sound file.

    Parameters
    ----------
    d : array_like
        Data to write. Shape (samples,) or (samples, channels). A 2-D array
        with a single row is treated as one channel of samples.
    fs : int
        Sample frequency in Hz.
    filename : str
        Output filename. '.wav' is appended when the final path component
        has no extension.
    mode : str, optional
        Mode string controlling format and scaling. Default is 's'.
            's' : Auto scale to make data peak = +-1 (default).
            'r' : Raw unscaled data.
            'p' : Scaled so +-1 equals full scale.
            'q' : Scaled to 0dBm0.
            '16': 16 bit PCM data (default bit depth).
            '8' : 8 bit PCM data.
            '24': 24 bit PCM data.
            '32': 32 bit PCM data.
            'v' : 32-bit floating point.
            'V' : 64-bit floating point.
            'a' : 8-bit A-law PCM.
            'u' : 8-bit mu-law PCM.
    """
    import os
    import re

    d = np.asarray(d, dtype=float)
    if d.ndim == 1:
        d = d.reshape(-1, 1)
    elif d.ndim == 2 and d.shape[0] == 1:
        d = d.T

    if not mode:
        mode = 's'

    # Determine the soundfile subtype (bit depth / encoding)
    subtype = 'PCM_16'  # default
    if 'v' in mode:
        subtype = 'FLOAT'
    elif 'V' in mode:
        subtype = 'DOUBLE'
    elif 'a' in mode:
        subtype = 'ALAW'  # companding is done by soundfile on write
    elif 'u' in mode:
        subtype = 'ULAW'
    else:
        # First run of digits in the mode string selects the bit depth
        digits = re.search(r'\d+', mode)
        if digits:
            bit_map = {8: 'PCM_U8', 16: 'PCM_16', 24: 'PCM_24', 32: 'PCM_32'}
            subtype = bit_map.get(int(digits.group()), 'PCM_16')

    # First scaling character in the mode string wins; default is 's'
    sc = next((c for c in mode if c in 'prsq'), 's')

    # Bring the data into the +-1 range soundfile expects
    if sc == 's':
        # Guard against empty input, for which np.max would raise
        peak = np.max(np.abs(d)) if d.size else 0.0
        if peak > 0:
            d = d / peak
    elif sc == 'q':
        # 0dBm0 factor: mu-law uses 2.03761563, everything else 2.03033976
        d = d / (2.03761563 if subtype == 'ULAW' else 2.03033976)
    elif sc == 'r':
        # Raw integers are divided by the full-scale value of the target
        # word size. Companded output keeps the earlier behaviour of
        # treating raw input as 16-bit linear samples; float output is
        # written as given.
        raw_full_scale = {
            'PCM_U8': 2 ** 7,
            'PCM_16': 2 ** 15,
            'PCM_24': 2 ** 23,
            'PCM_32': 2 ** 31,
            'ALAW': 2 ** 15,
            'ULAW': 2 ** 15,
        }
        if subtype in raw_full_scale:
            d = d / raw_full_scale[subtype]
    # 'p' mode: data already in +-1 range

    # Append .wav only when the file name itself lacks an extension; a dot
    # elsewhere in the path (e.g. './out' or 'v1.2/out') must not count.
    if not os.path.splitext(filename)[1]:
        filename = filename + '.wav'

    sf.write(filename, d, fs, subtype=subtype)

v_readhtk ¶

V_READHTK - Read an HTK parameter file.

v_readhtk ¶

v_readhtk(file) -> tuple[ndarray, float, int, int, str]

Read an HTK parameter file.

Parameters:

Name	Type	Description	Default
`file`	`str`	Path to the HTK file.	required

Returns:

Name	Type	Description
`d`	`ndarray`	Data: column vector for waveforms, one row per frame for other types.
`fp`	`float`	Frame period in seconds.
`dt`	`int`	Base data type (0-12).
`tc`	`int`	Full type code including modifiers.
`t`	`str`	Text version of type code, e.g. 'LPC_C_K'.

Source code in pyvoicebox/v_readhtk.py

def v_readhtk(file) -> tuple[np.ndarray, float, int, int, str]:
    """Read an HTK parameter file.

    Parameters
    ----------
    file : str
        Path to the HTK file.

    Returns
    -------
    d : ndarray
        Data: column vector for waveforms, one row per frame for other types.
    fp : float
        Frame period in seconds.
    dt : int
        Base data type (0-12).
    tc : int
        Full type code including modifiers.
    t : str
        Text version of type code, e.g. 'LPC_C_K'.
    """
    suffixes = _HTK_SUFFIXES
    n_flags = len(suffixes)
    base_bits = 6  # the low 6 bits of the type code hold the base data type

    with open(file, 'rb') as f:
        # 12-byte big-endian header: frame count, frame period in units of
        # 100 ns, bytes per frame, type code
        nf, period, by, tc = struct.unpack('>iihh', f.read(12))
        fp = period * 1e-7

        # The type code is stored as int16; reinterpret it as unsigned
        if tc < 0:
            tc += 65536

        # One modifier flag per suffix, starting just above the base type
        flags = [(tc >> (base_bits + i)) & 1 for i in range(n_flags)]
        dt = tc & ((1 << base_bits) - 1)

        # IREFC hack: a file longer than its header implies is read as
        # compressed LPREFC instead
        if dt == 5:
            f.seek(0, 2)
            flen = f.tell()
            f.seek(12, 0)
            if flen > 14 + by * nf:
                dt = 2
                flags[4] = 1  # set compressed flag (index 4 = _C)
                nf = nf + 4

        if dt in (0, 5, 10):
            # 16-bit data for waveforms, IREFC and DISCRETE
            ncol = by // 2
            raw = np.frombuffer(f.read(nf * ncol * 2), dtype='>i2')
            d = raw.reshape(nf, ncol).astype(np.float64)
            if dt == 5:
                d = d / 32767.0
        elif flags[4]:
            # Compressed data: per-column float32 scales and biases come
            # first and are counted as 4 frames in the header
            nf = nf - 4
            ncol = by // 2
            scales = np.frombuffer(f.read(ncol * 4), dtype='>f4').astype(np.float64)
            biases = np.frombuffer(f.read(ncol * 4), dtype='>f4').astype(np.float64)
            raw = np.frombuffer(f.read(nf * ncol * 2), dtype='>i2').astype(np.float64)
            d = (raw.reshape(nf, ncol) + biases[np.newaxis, :]) / scales[np.newaxis, :]
        else:
            # Uncompressed float32 data
            ncol = by // 4
            raw = np.frombuffer(f.read(nf * ncol * 4), dtype='>f4')
            d = raw.reshape(nf, ncol).astype(np.float64)

    # Text form: base kind name followed by one '_X' per set modifier flag.
    # Base types beyond the table fall back to its last entry.
    t = _HTK_KINDS[min(dt, len(_HTK_KINDS) - 1)]
    t += ''.join('_' + suffix for suffix, on in zip(suffixes, flags) if on)

    return d, fp, dt, tc, t

v_writehtk ¶

V_WRITEHTK - Write data in HTK format.

v_writehtk ¶

v_writehtk(file, d, fp, tc) -> None

Write data in HTK format.

Parameters:

Name	Type	Description	Default
`file`	`str`	Path to the output file.	required
`d`	`array_like`	Data to write: one row per frame.	required
`fp`	`float`	Frame period in seconds.	required
`tc`	`int`	Type code (see v_readhtk for details).	required

Source code in pyvoicebox/v_writehtk.py

def v_writehtk(file, d, fp, tc) -> None:
    """Write data in HTK format.

    Parameters
    ----------
    file : str
        Path to the output file.
    d : array_like
        Data to write: one row per frame. A 1-D input is treated as one
        value per frame; a single-row waveform (base type 0) is transposed
        into a column.
    fp : float
        Frame period in seconds.
    tc : int
        Type code (see v_readhtk for details). The checksum flag (4096) is
        silently cleared.

    Raises
    ------
    ValueError
        If compression is requested for the IREFC base type (5), or if a
        single frame would occupy more than 32767 bytes. Both are checked
        before the output file is opened, so a rejected call does not leave
        a partial file behind.
    """
    d = np.asarray(d, dtype=np.float64)

    # Silently ignore checksum request: clear the 4096 flag
    tc = tc & ~4096

    if d.ndim == 1:
        d = d.reshape(-1, 1)

    # Low 6 bits: base data type. Bit 10 (value 1024): compressed flag.
    dt = tc & 63
    compressed = bool((tc >> 10) & 1)

    nf, nv = d.shape

    # If waveform is a row vector (nf==1, dt==0), treat as column
    if dt == 0 and nf == 1:
        d = d.T
        nf, nv = d.shape

    # Waveform, IREFC, DISCRETE and compressed data are stored as int16;
    # everything else as float32.
    as_int16 = compressed or dt in (0, 5, 10)

    # Validate everything up front so no file is created on failure
    if dt == 5 and compressed:
        raise ValueError('Cannot use compression with IREFC format')
    nby = nv * (2 if as_int16 else 4)
    if nby > 32767:
        raise ValueError(
            f'byte count of frame is {nby} which exceeds 32767 '
            '(is data transposed?)')

    if compressed:
        # Map each column linearly onto +-32767; constant columns keep a
        # unit scale and use their value as the bias.
        if nf:
            dx = np.max(d, axis=0)
            dn = np.min(d, axis=0)
        else:
            # No frames: nothing to measure, use the identity mapping
            dx = dn = np.zeros(nv)
        a = np.ones(nv)
        b = dx.copy()
        mk = dx > dn
        a[mk] = 65534.0 / (dx[mk] - dn[mk])
        b[mk] = 0.5 * (dx[mk] + dn[mk]) * a[mk]
        d = d * a[np.newaxis, :] - b[np.newaxis, :]
        # The scale and bias vectors are counted as 4 extra frames
        nf = nf + 4
    elif dt == 5:
        d = d * 32767

    # The type code is stored as a signed 16-bit value
    tc_write = tc - 65536 if tc > 32767 else tc

    header = struct.pack('>iihh', nf, round(fp * 1e7), nby, tc_write)

    if as_int16:
        # Saturate rather than wrap values outside the int16 range
        payload = np.clip(np.round(d), -32768, 32767).astype('>i2').tobytes()
        if compressed:
            payload = (a.astype('>f4').tobytes()
                       + b.astype('>f4').tobytes()
                       + payload)
    else:
        payload = d.astype('>f4').tobytes()

    with open(file, 'wb') as f:
        f.write(header)
        f.write(payload)

Read only¶

v_readsph ¶

V_READSPH - Read a SPHERE/TIMIT format sound file.

The SPHERE (SPeech HEader REsources) format is used by NIST for speech corpora like TIMIT.

v_readsph ¶

v_readsph(filename, mode='p', nmax=-1, nskip=0) -> tuple

Read a SPHERE/TIMIT format sound file.

Parameters:

Name	Type	Description	Default
`filename`	`str`	Path to the SPH file (with or without .sph extension).	required
`mode`	`str`	Scaling/format mode string. Default is 'p'. 'p' : Scaled so +-1 equals full scale (default). 'r' : Raw unscaled data (integer values). 's' : Auto scale to make data peak = +-1. 'l' : Force little endian byte order. 'b' : Force big endian byte order. 'w' : Also read .wrd annotation file. 't' : Also read .phn phonetic transcription file.	`'p'`
`nmax`	`int`	Maximum number of samples to read. -1 for unlimited.	`-1`
`nskip`	`int`	Number of samples to skip from start.	`0`

Returns:

Name	Type	Description
`y`	`ndarray`	Data matrix of shape (samples, channels).
`fs`	`int`	Sample frequency in Hz.
`wrd`	`list of tuple, optional`	Word annotations [(start_time, end_time, text), ...]. Only returned if 'w' in mode.
`phn`	`list of tuple, optional`	Phoneme annotations [(start_time, end_time, text), ...]. Only returned if 't' in mode.
`ffx`	`dict`	File information dictionary.

Source code in pyvoicebox/v_readsph.py

def v_readsph(filename, mode='p', nmax=-1, nskip=0) -> tuple:
    """Read a SPHERE/TIMIT format sound file.

    Parameters
    ----------
    filename : str
        Path to the SPH file. If it does not exist, ``filename + '.sph'``
        and then ``filename + '.wav'`` are tried.
    mode : str, optional
        Scaling/format mode string. Default is 'p'.
        'p' : Scaled so +-1 equals full scale (default).
        'r' : Raw unscaled data (integer values).
        's' : Auto scale to make data peak = +-1.
        'l' : Force little endian byte order.
        'b' : Force big endian byte order.
        'w' : Also read .wrd annotation file.
        't' : Also read .phn phonetic transcription file.
    nmax : int, optional
        Maximum number of samples to read. -1 for unlimited.
    nskip : int, optional
        Number of samples to skip from start.

    Returns
    -------
    y : ndarray
        Data matrix of shape (samples, channels).
    fs : int
        Sample frequency in Hz.
    wrd : list of tuple, optional
        Word annotations [(start_time, end_time, text), ...].
        Only returned if 'w' in mode.
    phn : list of tuple, optional
        Phoneme annotations [(start_time, end_time, text), ...].
        Only returned if 't' in mode.
    ffx : dict
        File information dictionary.

    The values are returned as a single tuple in the order listed above.

    Raises
    ------
    FileNotFoundError
        If the file cannot be found under any of the tried names.
    IOError
        If the file does not start with a SPHERE header or the header has
        no ``end_head`` line.
    ValueError
        If the sample size is not 1, 2 or 4 bytes.
    """
    if not mode:
        mode = 'p'

    # First scaling character in the mode string wins; default is 'p'
    sc = next((c for c in mode if c in 'prs'), 'p')

    # Find file
    if not os.path.isfile(filename):
        if os.path.isfile(filename + '.sph'):
            filename = filename + '.sph'
        elif os.path.isfile(filename + '.wav'):
            filename = filename + '.wav'
        else:
            raise FileNotFoundError(f"Cannot open {filename} for input")

    with open(filename, 'rb') as fid:
        # The first 16 bytes hold the format tag and the header length
        first_line = fid.read(16)
        if len(first_line) < 16:
            raise IOError("File does not begin with a SPHERE header")

        fmt = first_line[:7].decode('ascii', errors='replace').strip()
        try:
            hlen = int(first_line[8:15].decode('ascii').strip())
        except ValueError:
            raise IOError("File does not begin with a SPHERE header")

        hdr = _parse_sph_header(fid)

        # Byte order: an explicit 'l'/'b' in mode wins, then the header's
        # sample_byte_format (leading '0' means little endian), then little.
        if 'l' in mode:
            bo = '<'
        elif 'b' in mode:
            bo = '>'
        elif 'sample_byte_format' in hdr:
            bo = '<' if hdr['sample_byte_format'].startswith('0') else '>'
        else:
            bo = '<'

        # Extract key parameters
        nsamp = hdr.get('sample_count', 0)
        nchan = hdr.get('channel_count', 1)
        nbytes = hdr.get('sample_n_bytes', 2)
        nbits = hdr.get('sample_sig_bits', 16)
        fs = hdr.get('sample_rate', 1)

        # Determine coding
        coding = hdr.get('sample_coding', 'pcm').lower()
        is_ulaw = 'ulaw' in coding or 'mu-law' in coding

        if nsamp == 0:
            # No usable sample count in the header: derive it from file size
            fid.seek(0, 2)
            nsamp = (fid.tell() - hlen) // (nchan * nbytes)

        # Number of sample frames to read
        start = nskip
        ksamples = nsamp - start
        if nmax >= 0:
            ksamples = min(nmax, ksamples)

        if ksamples > 0:
            fid.seek(hlen + nchan * nbytes * start)
            nread = nchan * ksamples

            if nbytes == 1:
                raw = np.frombuffer(fid.read(nread), dtype=np.uint8)
                if is_ulaw:
                    from pyvoicebox.v_pcmu2lin import v_pcmu2lin
                    y = v_pcmu2lin(raw.astype(float))
                    pk = 2.005649
                else:
                    y = raw.astype(float) - 128
                    pk = 128
            elif nbytes == 2:
                y = np.frombuffer(fid.read(nread * 2),
                                  dtype=np.dtype(bo + 'i2')).astype(float)
                pk = 32768
            elif nbytes == 4:
                y = np.frombuffer(fid.read(nread * 4),
                                  dtype=np.dtype(bo + 'i4')).astype(float)
                pk = 2**31
            else:
                raise ValueError(f"Unsupported sample size: {nbytes} bytes")

            # Scale
            if sc == 's':
                peak = np.max(np.abs(y))
                if peak > 0:
                    y = y / peak
            elif sc == 'p':
                # NOTE(review): one-byte mu-law data is left unscaled here,
                # so pk = 2.005649 is never applied -- confirm this is
                # intended given v_pcmu2lin's output range.
                if not is_ulaw or nbytes > 1:
                    y = y / pk
            # 'r' mode: no scaling

            if nchan > 1:
                y = y.reshape(-1, nchan)
            else:
                y = y[:, np.newaxis]
        else:
            y = np.array([]).reshape(0, nchan)

    ffx = {
        'filename': filename,
        'header': hdr,
        'format': fmt,
        'sample_count': nsamp,
        'channel_count': nchan,
        'sample_n_bytes': nbytes,
        'sample_sig_bits': nbits,
        'sample_rate': fs,
    }

    # Optional annotations sit between fs and the file information
    results = [y, fs]

    if 'w' in mode:
        results.append(_read_annotation(filename, 'wrd', fs))

    if 't' in mode:
        results.append(_read_annotation(filename, 'phn', fs))

    results.append(ffx)
    return tuple(results)


def _parse_sph_header(fid):
    """Parse SPHERE header field lines from ``fid`` up to the ``end_head`` line.

    Each field line has the form ``name type value`` where the type is
    ``-sN`` (string of N characters), ``-i`` (integer) or ``-r`` (real).
    Blank lines and lines starting with ';' are skipped. Integer and real
    values that fail to parse become 0 and 0.0.

    Returns a dict mapping field names to values. Raises IOError if the
    file ends before an ``end_head`` line is seen.
    """
    fields = {}
    while True:
        raw = fid.readline()
        if not raw:
            # End of file: without this check a header lacking end_head
            # would be scanned forever.
            raise IOError("SPHERE header has no end_head line")
        text = raw.decode('ascii', errors='replace').strip()

        if not text or text.startswith(';'):
            continue

        parts = text.split(None, 2)
        if len(parts) < 2:
            if 'end_head' in text:
                return fields
            continue

        name, type_spec = parts[0], parts[1]
        value = parts[2] if len(parts) >= 3 else ''

        if type_spec.startswith('-s'):
            # String type, optionally with a declared length
            try:
                width = int(type_spec[2:])
            except ValueError:
                fields[name] = value.strip()
            else:
                fields[name] = value[:width].strip()
        elif type_spec.startswith('-i'):
            try:
                fields[name] = int(value)
            except ValueError:
                fields[name] = 0
        elif type_spec.startswith('-r'):
            try:
                fields[name] = float(value)
            except ValueError:
                fields[name] = 0.0
        else:
            fields[name] = value

v_readaif ¶

V_READAIF - Read a .AIF (AIFF) format sound file.

Uses the soundfile library for robust AIFF reading.

v_readaif ¶

v_readaif(
    filename, mode="p", nmax=-1, nskip=0
) -> tuple[ndarray, int]

Read a .AIF (AIFF) format sound file.

Parameters:

Name	Type	Description	Default
`filename`	`str`	Path to the AIF file (with or without .aif extension).	required
`mode`	`str`	Scaling mode string. Default is 'p'. 'p' : Scaled so +-1 equals full scale (default). 'r' : Raw unscaled data (integer values). 's' : Auto scale to make data peak = +-1.	`'p'`
`nmax`	`int`	Maximum number of samples to read. -1 for unlimited (default).	`-1`
`nskip`	`int`	Number of samples to skip from start. Default is 0.	`0`

Returns:

Name	Type	Description
`y`	`ndarray`	Data matrix of shape (samples, channels).
`fs`	`int`	Sample frequency in Hz.

Source code in pyvoicebox/v_readaif.py

def v_readaif(filename, mode='p', nmax=-1, nskip=0) -> tuple[np.ndarray, int]:
    """Read a .AIF (AIFF) format sound file.

    Parameters
    ----------
    filename : str
        Path to the AIF file. If it does not exist, ``filename + '.aif'``
        and then ``filename + '.aiff'`` are tried.
    mode : str, optional
        Scaling mode string. Default is 'p'.
        'p' : Scaled so +-1 equals full scale (default).
        'r' : Raw unscaled data (integer values).
        's' : Auto scale to make data peak = +-1.
    nmax : int, optional
        Maximum number of samples to read. -1 for unlimited (default).
    nskip : int, optional
        Number of samples to skip from start. Default is 0.

    Returns
    -------
    y : ndarray
        Data matrix of shape (samples, channels). int32 in 'r' mode,
        float64 otherwise.
    fs : int
        Sample frequency in Hz.

    Raises
    ------
    FileNotFoundError
        If the file cannot be found under any of the tried names.
    """
    import soundfile as sf
    import os

    # Try with and without extension
    if not os.path.isfile(filename):
        if os.path.isfile(filename + '.aif'):
            filename = filename + '.aif'
        elif os.path.isfile(filename + '.aiff'):
            filename = filename + '.aiff'
        else:
            raise FileNotFoundError(f"Cannot open {filename} for input")

    info = sf.info(filename)
    fs = info.samplerate

    start = nskip
    stop = info.frames if nmax < 0 else min(start + nmax, info.frames)

    # First scaling character in the mode string wins; default is 'p'.
    # 'q' is accepted but applies no extra scaling here.
    sc = next((c for c in (mode or 'p') if c in 'prsq'), 'p')

    if sc == 'r':
        y, _ = sf.read(filename, start=start, stop=stop, dtype='int32')
        # soundfile scales integer reads to the full int32 range, so PCM
        # words narrower than 32 bits arrive shifted left. Shift them back
        # to recover the integer values stored in the file.
        pcm_bits = {'PCM_S8': 8, 'PCM_U8': 8, 'PCM_16': 16,
                    'PCM_24': 24}.get(info.subtype)
        if pcm_bits is not None:
            y = y >> (32 - pcm_bits)
    else:
        # Read as float (normalized to [-1, 1])
        y, _ = sf.read(filename, start=start, stop=stop, dtype='float64')

        if sc == 's':
            # Nothing to scale for empty or silent data
            peak = np.max(np.abs(y)) if y.size else 0.0
            if peak > 0:
                y = y / peak

    # Mono files come back 1-D; always return (samples, channels)
    if y.ndim == 1:
        y = y[:, np.newaxis]

    return y, fs

v_readau ¶

V_READAU - Read a SUN .AU format sound file.

Uses the soundfile library for robust AU file reading.

v_readau ¶

v_readau(filename, mode='') -> tuple[ndarray, int, dict]

Read a SUN .AU format sound file.

Parameters:

Name	Type	Description	Default
`filename`	`str`	Path to the AU file (with or without .au extension).	required
`mode`	`str`	Mode string: 't' : trim leading and trailing silences 'h' : read header only	`''`

Returns:

Name	Type	Description
`y`	`ndarray`	Data matrix with one channel per column. If mode='h', returns header parameters as a dict.
`fs`	`int`	Sample frequency in Hz.
`h`	`dict`	Header parameters: 'sample_rate', 'num_channels', 'frames', 'format', 'subtype'.

Source code in pyvoicebox/v_readau.py

def v_readau(filename, mode='') -> tuple[np.ndarray, int, dict]:
    """Read a SUN .AU format sound file.

    Parameters
    ----------
    filename : str
        Path to the AU file. If it does not exist, ``filename + '.au'``
        is tried.
    mode : str, optional
        Mode string:
        't' : trim leading and trailing silences
        'h' : read header only

    Returns
    -------
    y : ndarray
        float64 data matrix with one channel per column.
        If 'h' is in mode, the header dict ``h`` is returned here instead
        and no sample data is read.
    fs : int
        Sample frequency in Hz.
    h : dict
        Header parameters as reported by soundfile:
        'sample_rate', 'num_channels', 'frames', 'format', 'subtype'.

    Raises
    ------
    FileNotFoundError
        If the file cannot be found under either name.
    """
    import soundfile as sf
    import os

    if not os.path.isfile(filename):
        if os.path.isfile(filename + '.au'):
            filename = filename + '.au'
        else:
            raise FileNotFoundError(f"Cannot open {filename} for input")

    info = sf.info(filename)
    fs = info.samplerate
    h = {
        'sample_rate': info.samplerate,
        'num_channels': info.channels,
        'frames': info.frames,
        'format': info.format,
        'subtype': info.subtype,
    }

    if 'h' in mode:
        return h, fs, h

    y, _ = sf.read(filename, dtype='float64')

    # Mono files come back 1-D; always return (samples, channels)
    if y.ndim == 1:
        y = y[:, np.newaxis]

    # Trimming an empty file would make np.max raise, so skip it
    if 't' in mode and y.shape[0]:
        # A frame is silent when its energy summed over channels is at
        # most 1e-4 of the loudest frame's energy.
        energy = np.sum(y ** 2, axis=1)
        threshold = np.max(energy) * 1e-4
        nonsilent = np.flatnonzero(energy > threshold)
        if nonsilent.size:
            y = y[nonsilent[0]:nonsilent[-1] + 1, :]

    return y, fs, h

v_readflac ¶

V_READFLAC - Read a .FLAC format sound file.

Uses the soundfile library for FLAC decoding.

v_readflac ¶

v_readflac(filename, mode='p') -> tuple[ndarray, int]

Read a .FLAC format sound file.

Parameters:

Name	Type	Description	Default
`filename`	`str`	Path to the FLAC file.	required
`mode`	`str`	Scaling mode string. Default is 'p'. 'p' : Scaled so +-1 equals full scale (default). 'r' : Raw unscaled data (integer values). 's' : Auto scale to make data peak = +-1.	`'p'`

Returns:

Name	Type	Description
`y`	`ndarray`	Data matrix of shape (samples, channels).
`fs`	`int`	Sample frequency in Hz.

Source code in pyvoicebox/v_readflac.py

def v_readflac(filename, mode='p') -> tuple[np.ndarray, int]:
    """Read a .FLAC format sound file.

    Parameters
    ----------
    filename : str
        Path to the FLAC file. If it does not exist,
        ``filename + '.flac'`` is tried.
    mode : str, optional
        Scaling mode string. Default is 'p'.
        'p' : Scaled so +-1 equals full scale (default).
        'r' : Raw unscaled data (integer values).
        's' : Auto scale to make data peak = +-1.

    Returns
    -------
    y : ndarray
        Data matrix of shape (samples, channels). int32 in 'r' mode,
        float64 otherwise.
    fs : int
        Sample frequency in Hz.

    Raises
    ------
    FileNotFoundError
        If the file cannot be found under either name.
    """
    import soundfile as sf
    import os

    if not os.path.isfile(filename):
        if os.path.isfile(filename + '.flac'):
            filename = filename + '.flac'
        else:
            raise FileNotFoundError(f"Cannot open {filename} for input")

    # First scaling character in the mode string wins; default is 'p'.
    # 'q' is accepted but applies no extra scaling here.
    sc = next((c for c in (mode or 'p') if c in 'prsq'), 'p')

    info = sf.info(filename)
    fs = info.samplerate

    if sc == 'r':
        y, _ = sf.read(filename, dtype='int32')
        # soundfile scales integer reads to the full int32 range, so PCM
        # words narrower than 32 bits arrive shifted left. Shift them back
        # to recover the integer values stored in the file.
        pcm_bits = {'PCM_S8': 8, 'PCM_U8': 8, 'PCM_16': 16,
                    'PCM_24': 24}.get(info.subtype)
        if pcm_bits is not None:
            y = y >> (32 - pcm_bits)
    else:
        y, _ = sf.read(filename, dtype='float64')

        if sc == 's':
            # Nothing to scale for empty or silent data
            peak = np.max(np.abs(y)) if y.size else 0.0
            if peak > 0:
                y = y / peak

    # Mono files come back 1-D; always return (samples, channels)
    if y.ndim == 1:
        y = y[:, np.newaxis]

    return y, fs

v_readsfs ¶

V_READSFS - Read a .SFS (Speech Filing System) format sound file.

The SFS format was developed by Mark Huckvale at UCL for speech research. This is a simplified Python reader for the most common data types.

v_readsfs ¶

v_readsfs(
    filename, ty=1, sub=-1, mode="p", nmax=-1, nskip=0
) -> tuple[ndarray, float, dict]

Read a .SFS format sound file.

Parameters:

Name	Type	Description	Default
`filename`	`str`	Path to the SFS file.	required
`ty`	`int`	Type of data item: 0=main header, 1=speech, 2=laryngograph, 5=annotation. Default is 1.	`1`
`sub`	`int`	Instance of type ty: 0=first, -1=last (default).	`-1`
`mode`	`str`	Mode string. Default is 'p'. 'i' : Force integer data to be at least 16 bits.	`'p'`
`nmax`	`int`	Maximum number of samples to read. -1 for unlimited.	`-1`
`nskip`	`int`	Number of samples to skip from start.	`0`

Returns:

Name	Type	Description
`y`	`ndarray`	Data array. For speech data, column vector.
`fs`	`float`	Sample frequency in Hz.
`hd`	`dict`	Header information.

Source code in pyvoicebox/v_readsfs.py

def v_readsfs(filename, ty=1, sub=-1, mode='p', nmax=-1, nskip=0) -> tuple[np.ndarray, float, dict]:
    """Read a .SFS format sound file.

    The file is a 512-byte main header followed by a sequence of items,
    each made of a 512-byte item header and its data. The item headers are
    scanned first, then the requested item is decoded.

    Parameters
    ----------
    filename : str
        Path to the SFS file.
    ty : int, optional
        Type of data item: 0=main header, 1=speech, 2=laryngograph,
        5=annotation. Default is 1.
    sub : int, optional
        Instance of type ty: 0=first, -1=last (default). Any other value
        selects the first item of type ty whose subtype equals sub.
    mode : str, optional
        Mode string. Default is 'p'.
        'i' : Force integer data to be at least 16 bits.
    nmax : int, optional
        Maximum number of samples to read. -1 for unlimited.
    nskip : int, optional
        Number of samples to skip from start. Negative values are treated
        as 0.

    Returns
    -------
    y : ndarray
        Data array of floats with shape (samples, framesize). Empty when
        ty is 0, when the item holds no data, or when the item is
        structured.
    fs : float
        Sample frequency in Hz (reciprocal of the frame duration), or 0.0
        if the frame duration is not positive.
    hd : dict
        Header information. For ty=0 it only contains 'file_type'.

    Raises
    ------
    FileNotFoundError
        If filename does not exist.
    IOError
        If the 512-byte main header cannot be read.
    ValueError
        If the file is not of type UC2, the requested item is missing, or
        the item declares an unsupported data size.
    """
    if not os.path.isfile(filename):
        raise FileNotFoundError(f"Cannot open {filename} for input")

    # A negative skip would seek backwards into the item header.
    nskip = max(nskip, 0)

    with open(filename, 'rb') as fid:
        # Read main header
        main_hdr = fid.read(512)
        if len(main_hdr) < 512:
            raise IOError(f"Cannot read header from SFS file {filename}")

        if main_hdr[:3] != b'UC2':
            raise ValueError(f"{filename} is not an SFS file type UC2")

        # Each entry: (type, subtype, data length, header position, byte order).
        # Entry 0 stands for the main header itself.
        itemlist = [(0, 1, 0, 0, main_hdr[511])]

        for _ in range(1, 200):
            pos = fid.tell()
            t = fid.read(512)
            if len(t) < 512:
                break

            # The last header byte is the byte-order flag: 0 means big-endian.
            item_bo = t[511]
            ibo = '>' if item_bo == 0 else '<'

            item_type, item_subtype = struct.unpack(ibo + '2i', t[388:396])
            item_length = struct.unpack(ibo + 'i', t[412:416])[0]

            if abs(item_type) > 29:
                break

            itemlist.append((item_type, item_subtype, item_length, pos, item_bo))

            # Skip over the item data to the next item header
            fid.seek(item_length, 1)

        y = np.array([])
        fs = 0.0

        if ty == 0:
            # Only the main header was requested
            return y, fs, {'file_type': _zerotrim(main_hdr[0:4])}

        # Find requested item
        matches = [i for i, item in enumerate(itemlist) if item[0] == ty]
        if not matches:
            raise ValueError(f"Cannot find item type {ty} in file {filename}")
        if sub == 0:
            it = matches[0]
        elif sub == -1:
            it = matches[-1]
        else:
            matches_sub = [i for i in matches if itemlist[i][1] == sub]
            if not matches_sub:
                raise ValueError(f"Cannot find item {ty}.{sub} in file {filename}")
            it = matches_sub[0]

        lit = itemlist[it]
        ibo = '>' if lit[4] == 0 else '<'

        fid.seek(lit[3])  # seek to item header
        ihdr = fid.read(512)

        # Eight consecutive 32-bit integers start at byte 384; the first is unused here.
        (_, data_type, subtype, floating, datasize,
         framesize, numframes, data_length) = struct.unpack(ibo + '8i', ihdr[384:416])
        frame_duration = struct.unpack(ibo + 'd', ihdr[416:424])[0]
        data_present = struct.unpack(ibo + 'i', ihdr[424:428])[0]
        time_offset = struct.unpack(ibo + 'd', ihdr[428:436])[0]

        if frame_duration > 0:
            fs = 1.0 / frame_duration

        hd = {
            'processing_history': _zerotrim(ihdr[0:256]),
            'parameters': _zerotrim(ihdr[256:384]),
            'comment': _zerotrim(ihdr[436:456]),
            'data_type': data_type,
            'subtype': subtype,
            'floating': floating,
            'datasize': datasize,
            'framesize': framesize,
            'numframes': numframes,
            'data_length': data_length,
            'frame_duration': frame_duration,
            'data_present': data_present,
            'time_offset': time_offset,
            'sample_rate': fs,
        }

        # Read data
        ksamples = numframes - nskip
        if nmax >= 0:
            ksamples = min(nmax, ksamples)

        # Structured items (floating < 0) are not decoded.
        if ksamples > 0 and data_present == 1 and floating >= 0:
            fsz = framesize  # elements per frame in the chosen dtype

            if floating > 0:  # floating point
                # The dtype must carry the item's byte order, otherwise
                # big-endian files are decoded with the native order.
                if datasize == 4:
                    np_dtype = np.dtype(ibo + 'f4')
                elif datasize == 8:
                    np_dtype = np.dtype(ibo + 'f8')
                else:
                    raise ValueError("Invalid data size in SFS file")
            else:  # integer
                if datasize == 1 and 'i' not in mode:
                    np_dtype = np.dtype(np.uint8)
                elif datasize <= 2:
                    # Widen to 16 bits: the frame is re-counted in 2-byte units.
                    np_dtype = np.dtype(ibo + 'i2')
                    fsz = int(np.ceil(fsz * datasize / 2))
                elif datasize == 4:
                    np_dtype = np.dtype(ibo + 'i4')
                else:
                    raise ValueError("Invalid data size in SFS file")

            # Sizes are derived from the dtype actually read, so the byte
            # count stays right when 1-byte data is widened to 16 bits.
            itemsize = np_dtype.itemsize
            fid.seek(lit[3] + 512 + nskip * fsz * itemsize)
            nd = fsz * ksamples
            raw = fid.read(nd * itemsize)
            y = np.frombuffer(raw, dtype=np_dtype, count=nd).astype(float)
            if fsz > 1:
                y = y.reshape(ksamples, fsz)
            else:
                y = y[:, np.newaxis]

    return y, fs, hd

v_readcnx ¶

V_READCNX - Read a .CNX format sound file.

This is the format of the BT Connex-S1 alphabet database.

v_readcnx ¶

v_readcnx(filename, mode='') -> tuple[ndarray, float, dict]

Read a .CNX format sound file.

Parameters:

Name	Type	Description	Default
`filename`	`str`	Path to the CNX file (with or without .cnx extension).	required
`mode`	`str`	Mode string. 't' : trim to the start/end samples indicated in the header. 'h' : read the header only.	`''`

Returns:

Name	Type	Description
`y`	`ndarray`	Column vector containing the waveform (16-bit sample values returned as floats).
`fs`	`float`	Sample frequency in Hz.
`h`	`dict`	Header parameters: 'num_samples' : number of samples in file 'status' : 0=good, 1=bad 'start_sample' : start sample number 'end_sample' : ending sample number 'speaker_id' : speaker identification number 'speaker_age' : speaker age group 'speaker_sex' : 0=male, 1=female 'ascii_char' : ascii character 'repetition' : repetition number

Source code in pyvoicebox/v_readcnx.py

def v_readcnx(filename, mode='') -> tuple[np.ndarray, float, dict]:
    """Read a .CNX format sound file.

    The file is a 512-byte text header of '|'-delimited ``name=value``
    fields followed by little-endian 16-bit samples.

    Parameters
    ----------
    filename : str
        Path to the CNX file (with or without .cnx extension).
    mode : str, optional
        Mode string:
        't' : trim to start/end samples indicated in header
        'h' : read header only

    Returns
    -------
    y : ndarray
        Column vector of shape (samples, 1) containing the waveform as
        floats (values of the 16-bit samples). Empty 1-D array in 'h' mode.
    fs : float
        Sample frequency in Hz.
    h : dict
        Header parameters:
        'status' : 0=good, 1=bad
        'start_sample' : start sample number
        'end_sample' : ending sample number
        'speaker_id' : speaker identification number
        'speaker_age' : speaker age group
        'speaker_sex' : 0=male, 1=female
        'ascii_char' : ascii character (byte value)
        'repetition' : repetition number
        'num_samples' : number of samples in file
        Fields that are absent or unparsable are reported as 0.

    Raises
    ------
    FileNotFoundError
        If neither filename nor filename + '.cnx' exists.
    IOError
        If the 512-byte header cannot be read.
    ValueError
        If the header contains no field delimiter or the sample frequency
        is not a number.
    """
    if not os.path.isfile(filename):
        if os.path.isfile(filename + '.cnx'):
            filename = filename + '.cnx'
        else:
            raise FileNotFoundError(f"Cannot open {filename} for input")

    # (name, delimiter index, special): the field lies between delimiters
    # index-1 and index (0-based). special > 0 compares the first value
    # byte against that character code, -1 keeps the raw byte, 0 parses
    # an integer.
    # Original MATLAB ix (1-based):
    # ix=[17 71; 18 0; 19 0; 10 0; 12 0; 13 77; 15 -1; 16 0]
    fields = (
        ('status', 16, 71),        # 'G' (71) means good
        ('start_sample', 17, 0),
        ('end_sample', 18, 0),
        ('speaker_id', 9, 0),
        ('speaker_age', 11, 0),
        ('speaker_sex', 12, 77),   # 'M' (77) means male
        ('ascii_char', 14, -1),
        ('repetition', 15, 0),
    )

    with open(filename, 'rb') as fid:
        hdr = fid.read(512)
        if len(hdr) != 512:
            raise IOError(f"Cannot read header from connex file {filename}")

        # Positions of the pipe characters, searched from byte 4 onwards
        pipe = ord('|')
        delimiters = [i for i in range(4, len(hdr)) if hdr[i] == pipe]
        if not delimiters:
            raise ValueError(f"No field delimiters in connex file {filename}")

        # Sample frequency: text from byte 16 up to the first delimiter
        fs = float(hdr[16:delimiters[0]].decode('ascii', errors='replace').strip())

        h = {}
        for name, del_idx, special in fields:
            if del_idx > len(delimiters):
                # Header has too few fields: treat like any other missing value
                h[name] = 0
                continue

            start = delimiters[del_idx - 1] + 1
            end = delimiters[del_idx] if del_idx < len(delimiters) else len(hdr)
            field_bytes = hdr[start:end]

            eq_pos = field_bytes.find(b'=')
            if eq_pos < 0:
                h[name] = 0
                continue

            value_bytes = field_bytes[eq_pos + 1:]
            first = value_bytes[0] if value_bytes else 0

            if special == -1:
                h[name] = first
            elif special > 0:
                h[name] = 0 if first == special else 1
            else:
                val_str = value_bytes.decode('ascii', errors='replace').strip()
                try:
                    h[name] = int(val_str) if val_str else 0
                except ValueError:
                    h[name] = 0

        # Derive the sample count from the file size before any early
        # return, so header-only mode reports it too.
        fid.seek(0, 2)
        h['num_samples'] = (fid.tell() - 512) // 2

        if 'h' in mode:
            return np.array([]), fs, h

        if 't' in mode:
            # Read only the samples start_sample..end_sample (inclusive)
            start_samp = h['start_sample']
            count = max(h['end_sample'] - start_samp + 1, 0)
            fid.seek(512 + 2 * start_samp, 0)
            raw = fid.read(count * 2)
        else:
            fid.seek(512, 0)
            raw = fid.read()

    # Drop a dangling odd byte so the buffer is a whole number of samples
    raw = raw[:len(raw) - len(raw) % 2]
    y = np.frombuffer(raw, dtype='<i2').astype(float)

    return y[:, np.newaxis], fs, h