Source code for edgel3.models

import os
import warnings
import sklearn.decomposition

with warnings.catch_warnings():
    # Suppress TF and Keras warnings when importing
    warnings.simplefilter("ignore")
    from kapre.time_frequency import Spectrogram, Melspectrogram
    from keras.layers import (
        Input, Conv2D, BatchNormalization, MaxPooling2D,
        Flatten, Activation, Lambda
    )
    from keras.models import Model
    import keras.regularizers as regularizers


def load_embedding_model(model_type, emb_dim, retrain_type, sparsity):
    """
    Build the requested EdgeL3 audio embedding model and load its weights.

    A new model object is constructed on every call; the constructor is
    looked up in ``MODELS`` and the weights file is resolved with
    ``load_embedding_model_path``.

    Parameters
    ----------
    model_type : {sea, sparse}
        Type of smaller version of L3 model. If 'sea' is selected, the audio
        model is a UST specialized (SEA) model. 'sparse' gives a sparse L3
        model with the desired 'sparsity'.
    emb_dim : {512, 256, 128, 64}
        Desired embedding dimension of the UST specialized embedding
        approximated (SEA) models.
    retrain_type : 'ft' or 'kd'
        Type of retraining for the sparsified weights of L3 audio model. 'ft'
        chooses the fine-tuning method and 'kd' returns knowledge distilled
        model.
    sparsity : {95.45, 53.5, 63.5, 72.3, 87.0}
        The desired sparsity of audio model.

    Returns
    -------
    model : keras.models.Model
        Model object with weights loaded.

    Raises
    ------
    KeyError
        If ``model_type`` is not a key of ``MODELS``.
    """
    # Only the graph construction is wrapped: warnings raised while the
    # layers are being built are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        build_network = MODELS[model_type]
        # Every constructor receives both options and ignores what it does
        # not need.
        model = build_network(emb_dim=emb_dim, sparsity=sparsity)

    weights_file = load_embedding_model_path(
        model_type, emb_dim, retrain_type, sparsity
    )
    model.load_weights(weights_file)
    return model
def load_embedding_model_path(model_type, emb_dim, retrain_type, sparsity):
    """
    Returns the local path to the model weights file for the requested model.

    The file is expected to live next to this module. No check is made that
    the file actually exists.

    Parameters
    ----------
    model_type : {sea, sparse}
        Type of smaller version of L3 model. If 'sea' is selected, the audio
        model is a UST specialized (SEA) model. Any other value selects the
        sparse L3 model file for the given 'retrain_type' and 'sparsity'.
    emb_dim : {512, 256, 128, 64}
        Desired embedding dimension of the UST specialized embedding
        approximated (SEA) models. Only used when ``model_type == 'sea'``.
    retrain_type : 'ft' or 'kd'
        Type of retraining for the sparsified weights of L3 audio model. 'ft'
        chooses the fine-tuning method and 'kd' returns knowledge distilled
        model. Only used for non-'sea' models.
    sparsity : {95.45, 53.5, 63.5, 72.3, 87.0}
        Desired sparsity of the audio model. Only used for non-'sea' models.
        Plain integers are treated as the equivalent float (``87`` -> ``87.0``).

    Returns
    -------
    output_path : str
        Path to given model object
    """
    if model_type == 'sea':
        filename = 'edgel3_sea_ust_audio_emb_{}.h5'.format(emb_dim)
    else:
        # The sparsity is interpolated verbatim into the file name, so the
        # integer 87 would produce "..._87.h5" while the documented value
        # 87.0 produces "..._87.0.h5". Normalise plain ints (bool excluded)
        # so that equal numbers always resolve to the same file.
        if isinstance(sparsity, int) and not isinstance(sparsity, bool):
            sparsity = float(sparsity)
        filename = 'edgel3_{}_audio_sparsity_{}.h5'.format(retrain_type, sparsity)

    return os.path.join(os.path.dirname(__file__), filename)
def _construct_sparsified_audio_network(**kwargs):
    """
    Returns an uninitialized model object for a sparsified network with a
    Melspectrogram input (with 256 frequency bins).

    Parameters
    ----------
    **kwargs
        Ignored. Accepted so that every constructor registered in ``MODELS``
        can be called with the same keyword arguments.

    Returns
    -------
    model : keras.models.Model
        Model object.
    """
    weight_decay = 1e-5
    n_dft = 2048
    n_mels = 256
    n_hop = 242
    asr = 48000
    audio_window_dur = 1

    def conv3x3(n_filters, **layer_kwargs):
        # All convolutions share kernel size, padding, initializer and L2
        # penalty; a fresh regularizer object is created for each layer.
        return Conv2D(n_filters, (3, 3), padding='same',
                      kernel_initializer='he_normal',
                      kernel_regularizer=regularizers.l2(weight_decay),
                      **layer_kwargs)

    # INPUT
    x_a = Input(shape=(1, asr * audio_window_dur), dtype='float32')

    # MELSPECTROGRAM PREPROCESSING
    y_a = Melspectrogram(n_dft=n_dft, n_hop=n_hop, n_mels=n_mels,
                         sr=asr, power_melgram=1.0, htk=True,
                         return_decibel_melgram=True, padding='same')(x_a)
    y_a = BatchNormalization()(y_a)

    # CONV BLOCKS 1-3: two (conv -> batch norm -> relu) stages followed by a
    # 2x2 max pool. Layers are created in the same order as before so that
    # auto-generated layer names and layer ordering stay unchanged for
    # weight loading.
    for n_filters in (64, 128, 256):
        for _ in range(2):
            y_a = conv3x3(n_filters)(y_a)
            y_a = BatchNormalization()(y_a)
            y_a = Activation('relu')(y_a)
        y_a = MaxPooling2D(pool_size=(2, 2), strides=2)(y_a)

    # CONV BLOCK 4: the second convolution is the named embedding layer and,
    # in this network, is NOT followed by batch norm / relu.
    y_a = conv3x3(512)(y_a)
    y_a = BatchNormalization()(y_a)
    y_a = Activation('relu')(y_a)
    y_a = conv3x3(512, name='audio_embedding_layer')(y_a)

    # Pool over the full extent of axes 1 and 2 of the embedding layer
    # output; the size is read from the tensor instead of being hard-coded.
    pool_size_a_4 = tuple(y_a.get_shape().as_list()[1:3])
    y_a = MaxPooling2D(pool_size=pool_size_a_4)(y_a)
    y_a = Flatten()(y_a)

    return Model(inputs=x_a, outputs=y_a)
def _construct_ust_specialized_audio_network(emb_dim=128, **kwargs):
    """
    Returns an uninitialized model object for a UST specialized audio network
    with a Melspectrogram input (with 64 frequency bins).

    Parameters
    ----------
    emb_dim : {512, 256, 128, 64}
        Embedding dimension; selects by how much the number of filters of
        each conv block is divided.
    **kwargs
        Ignored.

    Returns
    -------
    model : keras.models.Model
        Model object.

    Raises
    ------
    KeyError
        If ``emb_dim`` is not one of the supported values.
    """
    l2_strength = 1e-5
    sample_rate = 8000  # original L3 has 48000
    window_seconds = 1
    mel_options = dict(
        n_dft=1024,   # original L3 has 2048
        n_hop=160,    # original L3 has 242
        n_mels=64,    # original L3 has 256
        sr=sample_rate,
        power_melgram=1.0,
        htk=True,
        return_decibel_melgram=True,
        padding='same',
    )

    # Per conv block divisor applied to the filter counts (64, 128, 256, 512),
    # keyed by the requested embedding dimension.
    filter_divisors = {
        512: (1, 1, 1, 1),
        256: (2, 2, 2, 2),
        128: (2, 2, 2, 4),
        64: (2, 2, 2, 8),
    }

    def conv_bn_relu(tensor, n_filters, **conv_kwargs):
        # 3x3 'same' convolution with L2 penalty, then batch norm and relu.
        tensor = Conv2D(n_filters, (3, 3), padding='same',
                        kernel_initializer='he_normal',
                        kernel_regularizer=regularizers.l2(l2_strength),
                        **conv_kwargs)(tensor)
        tensor = BatchNormalization()(tensor)
        return Activation('relu')(tensor)

    # INPUT
    audio_in = Input(shape=(1, sample_rate * window_seconds), dtype='float32')

    # MELSPECTROGRAM PREPROCESSING
    net = Melspectrogram(**mel_options)(audio_in)
    net = BatchNormalization()(net)

    # Looked up only now so that an unsupported emb_dim fails at the same
    # point of the construction as it always did.
    divisors = filter_divisors[emb_dim]

    # CONV BLOCKS 1-3
    for base_filters, divisor in zip((64, 128, 256), divisors):
        n_filters = base_filters // divisor
        net = conv_bn_relu(net, n_filters)
        net = conv_bn_relu(net, n_filters)
        net = MaxPooling2D(pool_size=(2, 2), strides=2)(net)

    # CONV BLOCK 4: the second convolution is the named embedding layer
    n_filters = 512 // divisors[3]
    net = conv_bn_relu(net, n_filters)
    net = conv_bn_relu(net, n_filters, name='audio_embedding_layer')

    # Pool over the full extent of axes 1 and 2 of the last activation
    full_extent = tuple(net.get_shape().as_list()[1:3])
    net = MaxPooling2D(pool_size=full_extent)(net)
    net = Flatten()(net)

    return Model(inputs=audio_in, outputs=net)
# Dispatch table: maps a ``model_type`` string to the function that builds the
# corresponding audio network (without weights).
MODELS = dict(
    sparse=_construct_sparsified_audio_network,
    sea=_construct_ust_specialized_audio_network,
)