# Source code for edgel3.cli

import os
import sys
from argparse import ArgumentParser, RawDescriptionHelpFormatter, ArgumentTypeError

try:
    # The ABC aliases in ``collections`` were removed in Python 3.10;
    # ``collections.abc`` is the supported location.
    from collections.abc import Iterable
except ImportError:  # very old interpreters without ``collections.abc``
    from collections import Iterable

import sklearn.decomposition

from edgel3.edgel3_exceptions import EdgeL3Error
from edgel3.core import process_file
from edgel3.models import load_embedding_model

def positive_float(value):
    """An argparse type method for accepting only positive floats.

    Parameters
    ----------
    value : str
        Raw command-line token (or anything else ``float()`` accepts).

    Returns
    -------
    fvalue : float
        The parsed value, guaranteed to be strictly greater than zero.

    Raises
    ------
    ArgumentTypeError
        If ``value`` cannot be converted to a float, or if the converted
        value is zero, negative or NaN.
    """

    try:
        fvalue = float(value)
    except (ValueError, TypeError) as e:
        raise ArgumentTypeError('Expected a positive float, error message: '
                                '{}'.format(e))
    # ``not fvalue > 0`` instead of ``fvalue <= 0``: every comparison with NaN
    # is False, so the latter would silently accept "nan" as a positive float.
    if not fvalue > 0:
        raise ArgumentTypeError('Expected a positive float')
    return fvalue

def positive_int(value):
    """Argparse ``type`` callable that only lets strictly positive integers through.

    Parameters
    ----------
    value : str
        Raw command-line token (or anything else ``int()`` accepts).

    Returns
    -------
    int
        The converted value, always greater than zero.

    Raises
    ------
    ArgumentTypeError
        If ``value`` cannot be converted by ``int()`` or the result is
        zero or negative.
    """
    try:
        parsed = int(value)
    except (ValueError, TypeError) as err:
        message = 'Expected a positive int, error message: {}'.format(err)
        raise ArgumentTypeError(message)

    # Happy path first; anything that is not strictly positive is rejected.
    if parsed > 0:
        return parsed
    raise ArgumentTypeError('Expected a positive int')

def get_file_list(input_list):
    """Get list of files from the list of inputs.

    Parameters
    ----------
    input_list : iterable of str
        Paths to files and/or directories. A directory contributes the
        regular files located directly inside it; sub-directories are not
        descended into.

    Returns
    -------
    file_list : list of str
        Absolute paths of all files found. Items appear in the order of
        ``input_list``; files coming from a directory are listed in sorted
        name order.

    Raises
    ------
    ArgumentTypeError
        If ``input_list`` is a string or is not iterable.
    EdgeL3Error
        If an item is neither an existing file nor an existing directory.
    """

    if not isinstance(input_list, Iterable) or isinstance(input_list, str):
        raise ArgumentTypeError('input_list must be iterable (and not string)')

    file_list = []
    for item in input_list:
        if os.path.isfile(item):
            file_list.append(os.path.abspath(item))
        elif os.path.isdir(item):
            # Resolve the directory once so its entries are absolute, just
            # like explicitly listed files.
            root = os.path.abspath(item)
            # os.listdir() yields entries in arbitrary order; sort them so the
            # processing order is reproducible across runs and platforms.
            for fname in sorted(os.listdir(root)):
                path = os.path.join(root, fname)
                if os.path.isfile(path):
                    file_list.append(path)
        else:
            raise EdgeL3Error('Could not find {}'.format(item))

    return file_list

def run(
        inputs,
        output_dir=None,
        suffix=None,
        model_type='sparse',
        emb_dim=128,
        retrain_type='ft',
        sparsity=95.45,
        center=True,
        hop_size=0.1,
        verbose=False
    ):
    """
    Computes and saves L3 embedding for given inputs.

    Parameters
    ----------
    inputs : list of str, or str
        File/directory path or list of file/directory paths to be processed
    output_dir : str or None
        Path to directory for saving output files. If None, output files will
        be saved to the directory containing the input file.
    suffix : str or None
        String to be appended to the output filename, i.e. <base filename>_<suffix>.npy.
        If None, then no suffix will be added, i.e. <base filename>.npy.
    model_type : {sea, sparse}
        Type of smaller version of L3 model.
        If ``sea`` is selected, the audio model is a UST specialized (SEA) model.
        ``sparse`` gives a sparse L3 model with the desired ``sparsity``.
    emb_dim : {512, 256, 128, 64}
        Desired embedding dimension of the UST specialized embedding approximated (SEA) models.
    retrain_type : str
        Type of retraining after sparsification of the L3 audio. Finetuned model is returned for ``ft``
        and ``kd`` gives knowledge distilled sparse audio.
    sparsity : {95.45, 53.5, 63.5, 72.3, 87.0}
        The desired sparsity to be achieved for the audio model of L3.
        Sparsity of 95.45 corresponds to the EdgeL3 model.
    center : boolean
        If True, pads beginning of signal so timestamps correspond
        to center of window.
    hop_size : float
        Hop size in seconds.
    verbose : boolean
        If True, print progress messages to stdout.

    Returns
    -------
    None

    Raises
    ------
    EdgeL3Error
        If ``inputs`` is neither a string nor an iterable, or if a given path
        does not exist.
    SystemExit
        If no files are found for the given inputs.
    """
    if isinstance(inputs, str):
        # A lone path gets the same treatment as a list of paths, so a
        # directory is expanded into its files and a missing path is reported.
        file_list = get_file_list([inputs])
    elif isinstance(inputs, Iterable):
        file_list = get_file_list(inputs)
    else:
        raise EdgeL3Error('Invalid input: {}'.format(str(inputs)))

    if not file_list:
        print('Edgel3: No WAV files found in {}. Aborting.'.format(str(inputs)))
        sys.exit(-1)

    # Load the model once and reuse it for every file.
    model = load_embedding_model(model_type, emb_dim, retrain_type, sparsity)

    # Process all files in the arguments
    for filepath in file_list:
        if verbose:
            print('Edgel3: Processing: {}'.format(filepath))
        process_file(filepath,
                     output_dir=output_dir,
                     suffix=suffix,
                     model=model,
                     center=center,
                     hop_size=hop_size,
                     verbose=verbose)
    if verbose:
        print('Edgel3: Done!')


def parse_args(args):
    """
    Parse the command-line arguments of the EdgeL3 CLI.

    Parameters
    ----------
    args : list of str
        Argument tokens, without the program name (e.g. ``sys.argv[1:]``).

    Returns
    -------
    argparse.Namespace
        Parsed options with attributes ``inputs``, ``output_dir``, ``suffix``,
        ``model_type``, ``emb_dim``, ``retrain_type``, ``model_sparsity``,
        ``no_centering``, ``hop_size`` and ``quiet``.
    """
    parser = ArgumentParser(description='Extracts audio embeddings from pruned Look, Listen, and Learn models (Arandjelovic and Zisserman 2017).')

    parser.add_argument('inputs', nargs='+',
                        help='Path or paths to files to process, or path to '
                             'a directory of files to process.')

    parser.add_argument('--output-dir', '-o', default=None,
                        help='Directory to save the output file(s); '
                             'if not given, the output will be '
                             'saved to the same directory as the input WAV '
                             'file(s).')

    # Adjacent string literals are concatenated verbatim, so the separating
    # space has to be part of one of the fragments.
    parser.add_argument('--suffix', '-x', default=None,
                        help='String to append to the output filenames. '
                             'If not provided, no suffix is added.')

    parser.add_argument('--model-type', '-mtype', type=str, default='sparse',
                        choices=['sea', 'sparse'],
                        help='Type of edge L3 model')

    parser.add_argument('--emb-dim', '-e', type=positive_int, default=128,
                        choices=[512, 256, 128, 64],
                        help='Embedding dimension of the UST SEA model. '
                             'Ignored for `sparse` models.')

    parser.add_argument('--retrain-type', '-retrain', type=str, default='ft',
                        choices=['ft', 'kd'],
                        help='The type of retraining after L3 audio is sparsified')

    parser.add_argument('--model-sparsity', '-sp', type=positive_float, default=95.45,
                        choices=[95.45, 53.5, 63.5, 72.3, 87.0],
                        help='Overall model sparsity desired in L3')

    parser.add_argument('--no-centering', '-n', action='store_true', default=False,
                        help='Do not pad signal; timestamps will correspond to '
                             'the beginning of each analysis window.')

    parser.add_argument('--hop-size', '-t', type=positive_float, default=0.1,
                        help='Hop size in seconds for processing audio files.')

    parser.add_argument('--quiet', '-q', action='store_true', default=False,
                        help='Suppress all non-error messages to stdout.')

    return parser.parse_args(args)


def main():
    """
    Extracts audio embeddings from smaller versions of Look, Listen, and Learn models (Arandjelovic and Zisserman 2017).

    Command-line entry point: parses ``sys.argv[1:]`` and forwards the
    resulting options to ``run``.
    """
    args = parse_args(sys.argv[1:])
    verbose = not args.quiet

    # Echo the parsed options only when output is wanted, so that --quiet
    # really suppresses all non-error messages.
    if verbose:
        print(args)

    run(
        args.inputs,
        output_dir=args.output_dir,
        suffix=args.suffix,
        model_type=args.model_type,
        emb_dim=args.emb_dim,
        retrain_type=args.retrain_type,
        sparsity=args.model_sparsity,
        # The CLI exposes the negated flags; run() takes the positive ones.
        center=not args.no_centering,
        hop_size=args.hop_size,
        verbose=verbose
    )