Training of the model

Extra lib instalation

In [ ]:
!pip install speechpy
!pip install soundfile
!pip install tables

Lib imports

In [1]:
import numpy as np
import pandas as pd
import soundfile as sf
import scipy.io.wavfile as wav
import speechpy
import json
import os
import math
import tensorflow.keras as k
import dask.dataframe as dd
from IPython.display import display, Markdown
from time import sleep
from pprint import pprint
from multiprocessing import Queue, Process, Pool

%config IPCompleter.greedy=True

For audio processing I will use speechpy as it is the fastest of the well known libraries:

speed_comp

Source: sonopy

Processing function for dataset creation

In [2]:
def df_col2numpy(df, col_names):
    ser = df.apply(lambda row: np.array([row[col] for col in col_names]).flatten(), axis=1)
    arr = np.array(ser.values.tolist())
    return arr

def df_col2series(df, col_names):
    if len(col_names) == 1:
        ser = df[col_names[0]].map(lambda cell: np.array([cell]).flatten())
    ser = df.apply(lambda row: np.array([row[col] for col in col_names]).flatten(), axis=1)
    return ser

def file2mfcc(file_name, frame_length=0.20, frame_stride=0.1, recreate=False):
    """ recreate: whether to recreate existing .npy MFCC"""
    
    dir_name = file_name[:file_name.index('blocks')]
    file_wav, file_ogg = None, None
    
#     check for existing .wav or .npy cache
    for file in os.listdir(dir_name):
#         if file.endswith('.wav'):
#             file_wav = os.path.join(dir_name, file)
#         if file.endswith('.npy') and not recreate:
#             return np.load(os.path.join(dir_name, file))
        if file.endswith('.mfcc') and not recreate:
            return pd.read_hdf((os.path.join(dir_name, file)))
            
#     if none .wav found, create it
    for file in os.listdir(dir_name):
        if file.endswith('.ogg'):
            file_ogg = os.path.join(dir_name, file)
            if not file_wav:
                data, samplerate = sf.read(file_ogg)
                file_wav = f'{file_ogg[:-4]}.wav'
                sf.write(file_wav, data, samplerate)

    fs, signal = wav.read(file_wav)
    
#     Stereo to mono
    if signal.shape[1] == 2:
        signal = (signal[:, 0] + signal[:, 1]) / 2
    else:
        signal = signal[:, 0]

    # Pre-emphasize
    signal_preemphasized = speechpy.processing.preemphasis(signal, cof=0.98)

    # Extract MFCC features
    mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=frame_length, 
                                 frame_stride=frame_stride, num_filters=40, fft_length=512,
                                 low_frequency=0, high_frequency=None, num_cepstral=13)
    
#     Normalize
    mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301,variance_normalization=True)
    
#     Cache results and clean .wav to save space
#     np.save(f'{file_wav[:-4]}.mfcc.npy', mfcc_cmvn)
    if file_ogg:
        os.remove(file_wav)

#     Recalculate the time differences
    index = np.arange(0, (len(mfcc) - 0.5) * frame_stride, frame_stride) + frame_length
    df = pd.DataFrame(data=mfcc_cmvn, index=index).apply(np.array, axis=1)
    df.to_hdf(f'{file_wav[:-4]}.mfcc', 'mfcc', mode='w', format='fixed')
    return df

def process_cell(cell, side):
    res = cell[:, :, side, :]
    
    mx = res.max()
    mx_index = np.unravel_index(res.argmax(), res.shape)
    pred = [None for _ in range(3)]
    
    for dim in range(3):
#         pred[dim] = np.zeros(res.shape[dim] + 1)
        pred[dim] = np.zeros(res.shape[dim])
        
        if mx < 0.5:
#             pred[dim][-1] = 1
            pass
        else:
            pred[dim][mx_index[dim]] = 1
    
    if mx < 0.5:
        res = cell[:, :, (side+1) % 2, :]
    
        mx = res.max()
        mx_index = np.unravel_index(res.argmax(), res.shape)
        for dim in range(3):
            pred[dim][mx_index[dim]] = 1
        
    return pred

def change_output(df: pd.DataFrame):
    left, right = df.apply(lambda cell: process_cell(cell, 0)), df.apply(lambda cell: process_cell(cell, 1))
    
    left = pd.DataFrame(left.to_list(), columns=[f'l_dim{x}' for x in range(3)], index=left.index)
    right = pd.DataFrame(right.to_list(), columns=[f'r_dim{x}' for x in range(3)], index=right.index)
    
    return left.join(right)
    
def process_file(file_path, recreate=False):
    """ Processing needed to be done per file """
    print(f'Processing {file_path}')
    try:
        df = pd.read_pickle(file_path)
    
        # all in one serialization (99.5 % with bad metric)
        #     df['output'] = df_col2series(df, ['output'])
        df = df.join(change_output(df['output']))
        df['shifted'] = df['output'].shift(1, fill_value=[np.zeros(df['output'].iloc[0].shape)])
        
        df['times'] = df_col2series(df, ['prev', 'next'])
        df['name'] = f'{file_path}'

        mfcc = file2mfcc(file_path, recreate=recreate)
        mfcc.name = 'mfcc'
        round_index = mfcc.index.values[1] - mfcc.index.values[0]
        df.index = np.floor(df['time'] / round_index).astype(int)
        mfcc.index = (mfcc.index / round_index).astype(int)

        df = df.join(mfcc)
        df.index = df['time']
        df = df.dropna()
    except Exception as e:
        print(f'Caught Error: {e}')
        return None

    return df

def pocess_df(df, X_cols, y_cols):
    """ Post processing on the whole DF """
    
    # Add shifter y (predictions)
    y_cols_shifted = [f'{x}_shifted' for x in y_cols]
    shifted = df[y_cols].groupby('name').shift(1)
    df[y_cols_shifted] = shifted
    df = df.dropna()
    
    return df
    
file = '../data/Army Of The Night/blocks/Expert.pkl'
file2mfcc(file, recreate=False)
process_file(file).iloc[0]
Processing ../data/Army Of The Night/blocks/Expert.pkl
Out[2]:
output     [[[[0. 1. 0. 0. 0. 0. 0. 0. 0.], [0. 0. 0. 0. ...
time                                                 3.08211
prev                                                 3.08211
next                                               0.0821053
l_dim0                                       [1.0, 0.0, 0.0]
l_dim1                                  [1.0, 0.0, 0.0, 0.0]
l_dim2         [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
r_dim0                                       [1.0, 0.0, 0.0]
r_dim1                                  [1.0, 0.0, 0.0, 0.0]
r_dim2         [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
shifted    [[[[0. 0. 0. 0. 0. 0. 0. 0. 0.], [0. 0. 0. 0. ...
times              [3.0821052631578945, 0.08210526315789446]
name             ../data/Army Of The Night/blocks/Expert.pkl
mfcc       [0.36976948, -1.4347119, 0.909154, 0.62588304,...
Name: 3.0821052631578945, dtype: object

Generate urls to blocks

In [3]:
def get_file_paths(path, hard_max):
    """ 
    Create a list of all pregenerated blocks files 
    Search in full subtree of :path:
    """
    
    file_paths = []
    counter = 0
    
    for root, dirs, files in os.walk(path, topdown=False):
        if counter > hard_max:
                break
        for name in files:
            if root[-6:] == 'blocks':
                print(f'#{counter:5} {root}/{name}')
                file_paths.append(os.path.join(root, name))
                
                counter += 1
                
    return file_paths

Set which cols to use as X and Y

In [19]:
X_list = []
Y = []
HARD_MAX = 20000
path = '../data'

y_cols = [f'{side}_dim{i}' for side in 'rl' for i in range(3) ]
# X_cols = ['times', 'mfcc'] + y_cols
X_cols = y_cols

columns = ['name', 'time'] + X_cols + y_cols

Create and save training data

In [ ]:
multi_core = True
if multi_core:
    # Embarrassingly parallel problem, but RAM heavy
    pool = Pool(processes=None)
    X_list = pool.map(process_file, get_file_paths(path, HARD_MAX))
else:
    X_list = [process_file(x) for x in get_file_paths(path, HARD_MAX)]

X_list = [x for x in X_list if x is not None]
        
print(f'Passes {len(X_list):6}/{HARD_MAX:6} hard max')
X = pd.DataFrame(pd.concat(X_list), columns=columns)
X = X.set_index(['name', 'time'])

X.to_pickle(os.path.join(path, 'X_saved.pkl'))

Load training data

In [6]:
X = pd.read_pickle(os.path.join(path, 'X_saved.gzip'))
print(f'Loaded {len(X)} rows')
Out[6]:
804317

Training approaches

Generator training approach

Advantages

  • Every batch can have different length
    • No crop of songs in batches needed
    • Minimal padding
    • $\Rightarrow$ model learns from songs with the context of beginning and end

Disadvantages

  • Less convinient then standard model.fit(X, y, **kwargs)
  • Big memory movement overhead
    • Would be even more significant if trained on a GPU

Standard training approach

  • Different lengths solved by generating snippets of songs of len $N$
    • Effectively creates more versitile dataset, since songs are "starting" at different places

Advantages

  • No padding and no crop
    • No crop of songs in batches needed
    • Minimal padding
    • $\Rightarrow$ model learns from songs with the context of beginning and end
  • All "heavy altilery" from TF can be used

Disadvantages

  • No differentitation between beggining and end of the song
    • $\Rightarrow$ could be solved by adding procentage column which indicates in interval $(0, 1)$ where the beat lies in the song

Generator training approach

Helper functions for train_generator

In [31]:
precision = 2

def pp(row):
    print('*' * 69)
    print(row)
    
def round_up(num:float, prec: int) -> int:
    return int(math.ceil((10 ** -prec) * num) / (10 ** -prec))
    
def get_len_category(song, prec: int=precision):
    return int(round_up(len(song), 2))

def get_mask(X, prec: int=precision):
    mask = X.groupby('name').apply(get_len_category)
    return mask.to_dict()

def create_batch(group, ceil_len, verbose=True):
    if verbose:
        print(f'Creating batch of ceil_len {ceil_len:6} with {len(group)} rows')
    ceil_len = int(ceil_len)
    
    batch = []
    for name, song in group.groupby('name'):
        empty_row = song.head(1).squeeze().apply(np.zeros_like)
#         print(f'{ceil_len} | {len(song)} | {empty_row}')
        df_to_add = pd.DataFrame([empty_row] * (ceil_len - len(song)))
        batch.append(pd.concat([song, df_to_add]))

    return pd.concat(batch)

# LESON: Don't forget about the NaNs!


def list2numpy(batch, col_name):
    return np.array(batch.groupby('name')[col_name].apply(list).to_list())

Show bucketing results

In [32]:
grouped = X.groupby(get_mask(X), level=0)

adjust = 6
stats = []
print(f'{"from":>{adjust}} ‒ {"to":>{adjust}}: {"# of songs":>{adjust*2}}')
      
for name, group in grouped:
    print(f'{name - 10 ** precision:{adjust}} ‒ {name:{adjust}}: {len(group.groupby("name").groups):{adjust*2}}')
    stats.append({'from': name - 10 ** precision, 'to': name, '# of songs': len(group.groupby("name").groups)})
          
# print in your favorite way
# pd.DataFrame(stats, columns=['from', 'to', '# of songs'])
  from ‒     to:   # of songs
     0 ‒    100:            7
   100 ‒    200:           27
   200 ‒    300:          105
   300 ‒    400:          245
   400 ‒    500:          310
   500 ‒    600:          261
   600 ‒    700:          187
   700 ‒    800:          125
   800 ‒    900:           73
   900 ‒   1000:           49
  1000 ‒   1100:           31
  1100 ‒   1200:           16
  1200 ‒   1300:            4
  1300 ‒   1400:            6
  1400 ‒   1500:            5
  1500 ‒   1600:            3
  1600 ‒   1700:            1
  1700 ‒   1800:            1
  2200 ‒   2300:            1
In [40]:
def train_generator(df, X_cols, y_cols, verbose=True):
    grouped = X.groupby(get_mask(df), level=0)
    
#     p = Pool(2)  # slowe because of memory
#     batches = p.starmap(create_batch, [(group, ceil_len) for ceil_len, group in grouped])
#     grouped = list(grouped)[:2]
    batches = [create_batch(group, ceil_len, verbose) for ceil_len, group in grouped]
    batches = [batch for batch in batches if len(batch.groupby('name').groups) > 8]
    
    while True:
        for batch in batches:
            yield [list2numpy(batch, col) for col in X_cols],\
                  [list2numpy(batch, col) for col in y_cols]

# test generated shapes
generator = train_generator(X, X_cols, y_cols)
for x, y in generator:
    print(f'x.shapes {[np.array(x_t).shape for x_t in x]}')
    print(f'y.shape {[np.array(y_t).shape for y_t in y]}\n')
    break
Creating batch of ceil_len    100 with 295 rows
Creating batch of ceil_len    200 with 4541 rows
Creating batch of ceil_len    300 with 27009 rows
Creating batch of ceil_len    400 with 86592 rows
Creating batch of ceil_len    500 with 140161 rows
Creating batch of ceil_len    600 with 143226 rows
Creating batch of ceil_len    700 with 120583 rows
Creating batch of ceil_len    800 with 93800 rows
Creating batch of ceil_len    900 with 61609 rows
Creating batch of ceil_len   1000 with 45718 rows
Creating batch of ceil_len   1100 with 32074 rows
Creating batch of ceil_len   1200 with 18189 rows
Creating batch of ceil_len   1300 with 5017 rows
Creating batch of ceil_len   1400 with 8151 rows
Creating batch of ceil_len   1500 with 7166 rows
Creating batch of ceil_len   1600 with 4540 rows
Creating batch of ceil_len   1700 with 1698 rows
Creating batch of ceil_len   1800 with 1729 rows
Creating batch of ceil_len   2300 with 2219 rows
x.shapes [(27, 200, 3), (27, 200, 4), (27, 200, 9), (27, 200, 3), (27, 200, 4), (27, 200, 9)]
y.shape [(27, 200, 3), (27, 200, 4), (27, 200, 9), (27, 200, 3), (27, 200, 4), (27, 200, 9)]

Model

In [41]:
from tensorflow.keras.layers import Dense, LSTM, Flatten, Input, Activation, TimeDistributed, concatenate
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model
In [62]:
def get_model(X, X_cols, y_cols):
    demo_row = X.iloc[0]
    X_shapes = [demo_row[col].shape[0] for col in X_cols]
    y_shapes = [demo_row[col].shape[0] for col in y_cols]
    
#     in1 = Input(shape=(None, 216)) # last blocks
#     in2 = Input(shape=(None, 2))   # time difference of previous and next beat
    inputs = [Input(shape=(None, shape)) for shape in X_shapes]
    
    time_dist = [TimeDistributed(Dense(shape, activation='sigmoid'))(inputs[i]) for i, shape in enumerate(X_shapes)]
#     out = time_dist
#     x1 = TimeDistributed(Dense(50, activation='elu'))(in1)
#     x2 = TimeDistributed(Dense(3, activation='elu'))(in2)
    
    out = concatenate(time_dist, axis=-1)
    out = LSTM(64, return_sequences=True)(out)
    out = LSTM(64, return_sequences=True)(out)
#     out = LSTM(128, return_sequences=True)(out)
#     out = TimeDistributed(Dense(216, activation='sigmoid'))(out)
    outputs = [TimeDistributed(Dense(shape, activation='softmax'), name=col)(out) for shape, col in zip(y_shapes, y_cols)]

    model = Model(inputs=inputs, outputs=outputs)
    
    model.compile(optimizer='rmsprop',
#                   loss='binary_crossentropy',
                  loss='categorical_crossentropy',
                  loss_weights=[1,2,3,1,2,3],
                  metrics=['accuracy', 'categorical_crossentropy'])
    
    return model

Get a new model and show it

In [63]:
model = get_model(X, X_cols, y_cols)
model.summary()
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_79 (InputLayer)           (None, None, 3)      0                                            
__________________________________________________________________________________________________
input_80 (InputLayer)           (None, None, 4)      0                                            
__________________________________________________________________________________________________
input_81 (InputLayer)           (None, None, 9)      0                                            
__________________________________________________________________________________________________
input_82 (InputLayer)           (None, None, 3)      0                                            
__________________________________________________________________________________________________
input_83 (InputLayer)           (None, None, 4)      0                                            
__________________________________________________________________________________________________
input_84 (InputLayer)           (None, None, 9)      0                                            
__________________________________________________________________________________________________
time_distributed_78 (TimeDistri (None, None, 3)      12          input_79[0][0]                   
__________________________________________________________________________________________________
time_distributed_79 (TimeDistri (None, None, 4)      20          input_80[0][0]                   
__________________________________________________________________________________________________
time_distributed_80 (TimeDistri (None, None, 9)      90          input_81[0][0]                   
__________________________________________________________________________________________________
time_distributed_81 (TimeDistri (None, None, 3)      12          input_82[0][0]                   
__________________________________________________________________________________________________
time_distributed_82 (TimeDistri (None, None, 4)      20          input_83[0][0]                   
__________________________________________________________________________________________________
time_distributed_83 (TimeDistri (None, None, 9)      90          input_84[0][0]                   
__________________________________________________________________________________________________
concatenate_12 (Concatenate)    (None, None, 32)     0           time_distributed_78[0][0]        
                                                                 time_distributed_79[0][0]        
                                                                 time_distributed_80[0][0]        
                                                                 time_distributed_81[0][0]        
                                                                 time_distributed_82[0][0]        
                                                                 time_distributed_83[0][0]        
__________________________________________________________________________________________________
lstm_3 (LSTM)                   (None, None, 64)     24832       concatenate_12[0][0]             
__________________________________________________________________________________________________
lstm_4 (LSTM)                   (None, None, 64)     33024       lstm_3[0][0]                     
__________________________________________________________________________________________________
r_dim0 (TimeDistributed)        (None, None, 3)      195         lstm_4[0][0]                     
__________________________________________________________________________________________________
r_dim1 (TimeDistributed)        (None, None, 4)      260         lstm_4[0][0]                     
__________________________________________________________________________________________________
r_dim2 (TimeDistributed)        (None, None, 9)      585         lstm_4[0][0]                     
__________________________________________________________________________________________________
l_dim0 (TimeDistributed)        (None, None, 3)      195         lstm_4[0][0]                     
__________________________________________________________________________________________________
l_dim1 (TimeDistributed)        (None, None, 4)      260         lstm_4[0][0]                     
__________________________________________________________________________________________________
l_dim2 (TimeDistributed)        (None, None, 9)      585         lstm_4[0][0]                     
==================================================================================================
Total params: 60,180
Trainable params: 60,180
Non-trainable params: 0
__________________________________________________________________________________________________

Train model on generator

In [64]:
def important_metric(metric_name):
    return 'val' in metric_name and 'acc' in metric_name
        

def train_on_generator(X, X_cols, y_cols, verbose_level=1, model=None, epochs=300):
    if not model:
        model = get_model(X, X_cols, y_cols)
    
    acc_results = {}
    stats_len = 30

    generator = train_generator(X, X_cols, y_cols, verbose_level>=2)
    for i, (x, y) in enumerate(generator):
        if i > epochs:
            break
        res = model.fit(x, y, batch_size=128, validation_split=0.1, verbose=verbose_level>=3)
        
        if verbose_level >= 1:
            if i % stats_len == 0:
                total_acc = (np.array(list(acc_results.values())) / stats_len).mean()
                display(Markdown(f'### Batch {i:4} | {total_acc:4.4}'))
                pprint([f'{key:30}: {val/stats_len}' for key, val in acc_results.items()])

                acc_results = {key: val[0] for key, val in res.history.items() if important_metric(key)}
            else:
                for key in acc_results:
                    acc_results[key] += res.history[key][0]
    
    
    acc_results = {key: val[0] for key, val in res.history.items() if important_metric(key)}
    total_acc = (np.array(list(acc_results.values()))).mean()
    display(Markdown(f'### Last epoch {i:4} results | {total_acc:4.4}'))
    pprint(acc_results)
    
    return model
    
# Test train_on_generator
# train_on_generator(X, X_cols, y_cols, 2, None, 2)
Creating batch of ceil_len    100 with 388 rows
Creating batch of ceil_len    200 with 4614 rows
Creating batch of ceil_len    300 with 27004 rows
Creating batch of ceil_len    400 with 87647 rows
Creating batch of ceil_len    500 with 139251 rows
Creating batch of ceil_len    600 with 144965 rows
Creating batch of ceil_len    700 with 118096 rows
Creating batch of ceil_len    800 with 93775 rows
Creating batch of ceil_len    900 with 62536 rows
Creating batch of ceil_len   1000 with 44869 rows
Creating batch of ceil_len   1100 with 31043 rows
Creating batch of ceil_len   1200 with 18173 rows
Creating batch of ceil_len   1300 with 5013 rows
Creating batch of ceil_len   1400 with 8145 rows
Creating batch of ceil_len   1500 with 8661 rows
Creating batch of ceil_len   1600 with 3037 rows
Creating batch of ceil_len   1700 with 1697 rows
Creating batch of ceil_len   1800 with 1728 rows
Creating batch of ceil_len   2300 with 2218 rows
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:20: RuntimeWarning: Mean of empty slice.

Batch 0 | nan

[]

Last epoch 3 results | 0.4378

{'val_l_dim0_acc': 0.7486,
 'val_l_dim1_acc': 0.2624,
 'val_l_dim2_acc': 0.3018,
 'val_r_dim0_acc': 0.7461,
 'val_r_dim1_acc': 0.2684,
 'val_r_dim2_acc': 0.2997}
Out[64]:
<tensorflow.python.keras.engine.training.Model at 0x7fc77b659e48>
In [65]:
# X.join(X.shift)
y_cols_shifted = [f'{x}_shifted' for x in y_cols]
shifted = X[y_cols].groupby('name').shift(1)
X[y_cols_shifted] = shifted
X = X.dropna()
/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py:3391: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
In [67]:
# Sanity check, try if the model performs well on _identity_
train_on_generator(X, y_cols, y_cols, verbose_level=1, model=None, epochs=300)

model = train_on_generator(X, ['times', 'mfcc'] + y_cols_shifted, y_cols, verbose_level=2, model=model, epochs=490)
Creating batch of ceil_len    100 with 381 rows
Creating batch of ceil_len    200 with 4587 rows
Creating batch of ceil_len    300 with 27799 rows
Creating batch of ceil_len    400 with 87299 rows
Creating batch of ceil_len    500 with 139643 rows
Creating batch of ceil_len    600 with 144401 rows
Creating batch of ceil_len    700 with 118113 rows
Creating batch of ceil_len    800 with 92250 rows
Creating batch of ceil_len    900 with 62462 rows
Creating batch of ceil_len   1000 with 45821 rows
Creating batch of ceil_len   1100 with 30013 rows
Creating batch of ceil_len   1200 with 18157 rows
Creating batch of ceil_len   1300 with 5009 rows
Creating batch of ceil_len   1400 with 8139 rows
Creating batch of ceil_len   1500 with 8655 rows
Creating batch of ceil_len   1600 with 3035 rows
Creating batch of ceil_len   1700 with 1696 rows
Creating batch of ceil_len   1800 with 1727 rows
Creating batch of ceil_len   2300 with 2217 rows
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:20: RuntimeWarning: Mean of empty slice.
/opt/conda/lib/python3.7/site-packages/numpy/core/_methods.py:85: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)

Batch 0 | nan

[]

Batch 30 | 0.5183

['val_r_dim0_acc                : 0.7540727615356445',
 'val_r_dim1_acc                : 0.36608778635660805',
 'val_r_dim2_acc                : 0.42783355712890625',
 'val_l_dim0_acc                : 0.757870864868164',
 'val_l_dim1_acc                : 0.3757168769836426',
 'val_l_dim2_acc                : 0.4283662478129069']

Batch 60 | 0.52

['val_r_dim0_acc                : 0.7600526809692383',
 'val_r_dim1_acc                : 0.3661651611328125',
 'val_r_dim2_acc                : 0.4345332145690918',
 'val_l_dim0_acc                : 0.7642615000406902',
 'val_l_dim1_acc                : 0.36000216801961266',
 'val_l_dim2_acc                : 0.4347590128580729']

Batch 90 | 0.5266

['val_r_dim0_acc                : 0.7600261688232421',
 'val_r_dim1_acc                : 0.3766444206237793',
 'val_r_dim2_acc                : 0.444332218170166',
 'val_l_dim0_acc                : 0.7642174402872721',
 'val_l_dim1_acc                : 0.3688096364339193',
 'val_l_dim2_acc                : 0.4454135258992513']

Batch 120 | 0.5275

['val_r_dim0_acc                : 0.7584291458129883',
 'val_r_dim1_acc                : 0.37539917627970376',
 'val_r_dim2_acc                : 0.44953041076660155',
 'val_l_dim0_acc                : 0.7623305638631185',
 'val_l_dim1_acc                : 0.3684961954752604',
 'val_l_dim2_acc                : 0.4508045196533203']

Batch 150 | 0.5229

['val_r_dim0_acc                : 0.753141975402832',
 'val_r_dim1_acc                : 0.3714120546976725',
 'val_r_dim2_acc                : 0.4376091957092285',
 'val_l_dim0_acc                : 0.7576244354248047',
 'val_l_dim1_acc                : 0.3780789375305176',
 'val_l_dim2_acc                : 0.43938868840535483']

Batch 180 | 0.5289

['val_r_dim0_acc                : 0.759404182434082',
 'val_r_dim1_acc                : 0.37351264953613283',
 'val_r_dim2_acc                : 0.4466133117675781',
 'val_l_dim0_acc                : 0.7630348205566406',
 'val_l_dim1_acc                : 0.3851865450541178',
 'val_l_dim2_acc                : 0.4456220944722494']

Batch 210 | 0.5328

['val_r_dim0_acc                : 0.7601642608642578',
 'val_r_dim1_acc                : 0.38237762451171875',
 'val_r_dim2_acc                : 0.4560214678446452',
 'val_l_dim0_acc                : 0.7643567403157552',
 'val_l_dim1_acc                : 0.3774712880452474',
 'val_l_dim2_acc                : 0.4561563491821289']

Batch 240 | 0.532

['val_r_dim0_acc                : 0.7550404866536459',
 'val_r_dim1_acc                : 0.3808454513549805',
 'val_r_dim2_acc                : 0.45687707265218097',
 'val_l_dim0_acc                : 0.7591039021809896',
 'val_l_dim1_acc                : 0.38339579900105797',
 'val_l_dim2_acc                : 0.45652631123860676']

Batch 270 | 0.5305

['val_r_dim0_acc                : 0.7555030186971029',
 'val_r_dim1_acc                : 0.3786564826965332',
 'val_r_dim2_acc                : 0.4546598116556803',
 'val_l_dim0_acc                : 0.7594136555989583',
 'val_l_dim1_acc                : 0.38031047185262046',
 'val_l_dim2_acc                : 0.45472421646118166']

Batch 300 | 0.5423

['val_r_dim0_acc                : 0.7609608968098959',
 'val_r_dim1_acc                : 0.3859834671020508',
 'val_r_dim2_acc                : 0.467801570892334',
 'val_l_dim0_acc                : 0.7647853215535482',
 'val_l_dim1_acc                : 0.40865278244018555',
 'val_l_dim2_acc                : 0.46591355005900065']

Batch 330 | 0.5399

['val_r_dim0_acc                : 0.7569397608439128',
 'val_r_dim1_acc                : 0.3860780715942383',
 'val_r_dim2_acc                : 0.4670612335205078',
 'val_l_dim0_acc                : 0.7603558858235677',
 'val_l_dim1_acc                : 0.4015027046203613',
 'val_l_dim2_acc                : 0.4672680854797363']

Batch 360 | 0.5381

['val_r_dim0_acc                : 0.7544565200805664',
 'val_r_dim1_acc                : 0.3819138526916504',
 'val_r_dim2_acc                : 0.4585309664408366',
 'val_l_dim0_acc                : 0.7581957499186198',
 'val_l_dim1_acc                : 0.4155714670817057',
 'val_l_dim2_acc                : 0.4601314544677734']

Batch 390 | 0.5448

['val_r_dim0_acc                : 0.7607501983642578',
 'val_r_dim1_acc                : 0.3876008669535319',
 'val_r_dim2_acc                : 0.4658071517944336',
 'val_l_dim0_acc                : 0.7644286473592122',
 'val_l_dim1_acc                : 0.42104330062866213',
 'val_l_dim2_acc                : 0.4691755930582682']

Batch 420 | 0.5491

['val_r_dim0_acc                : 0.7607670466105143',
 'val_r_dim1_acc                : 0.3846865653991699',
 'val_r_dim2_acc                : 0.4738899230957031',
 'val_l_dim0_acc                : 0.7641873677571615',
 'val_l_dim1_acc                : 0.4314267158508301',
 'val_l_dim2_acc                : 0.47943398157755535']

Batch 450 | 0.5527

['val_r_dim0_acc                : 0.7591227849324544',
 'val_r_dim1_acc                : 0.39608370463053383',
 'val_r_dim2_acc                : 0.4806464831034342',
 'val_l_dim0_acc                : 0.7628245671590169',
 'val_l_dim1_acc                : 0.42985604604085287',
 'val_l_dim2_acc                : 0.48740644454956056']

Batch 480 | 0.5478

['val_r_dim0_acc                : 0.7539368947347005',
 'val_r_dim1_acc                : 0.394324525197347',
 'val_r_dim2_acc                : 0.46978670756022134',
 'val_l_dim0_acc                : 0.7582558949788412',
 'val_l_dim1_acc                : 0.43149509429931643',
 'val_l_dim2_acc                : 0.47922709782918294']

Last epoch 491 results | 0.5797

{'val_l_dim0_acc': 0.76798075,
 'val_l_dim1_acc': 0.4439423,
 'val_l_dim2_acc': 0.55115384,
 'val_r_dim0_acc': 0.7653846,
 'val_r_dim1_acc': 0.4266346,
 'val_r_dim2_acc': 0.52307695}

Empirical findings

  • If the y_cols_shifted input is not provided, model tends to learn and stay on most common value of each classification.
  • Sanity check porforms quickly over 90 % acc

Improvements to be tested

  1. More normalization of the data
    • Force the model to catch the underlying principle and not "mean"
  2. Flip L / R hand and horizontal mirorring and rotations
  3. Flip vertically with rotation
  4. If one hand not used, mirror the other hand instead of "0"
  5. Instead of generator, create snippets of 100 beats
    • Easier GPU training
    • Train it on gColab

Hand evaluation

  • Is needed since good results can be caused by a wrongly chosen matric!
In [70]:
import copy

generator = train_generator(X, ['times', 'mfcc'] + y_cols_shifted, y_cols)
x, y = generator.__next__()
prediction = model.predict(x)
Creating batch of ceil_len    100 with 381 rows
Creating batch of ceil_len    200 with 4587 rows
Creating batch of ceil_len    300 with 27799 rows
Creating batch of ceil_len    400 with 87299 rows
Creating batch of ceil_len    500 with 139643 rows
Creating batch of ceil_len    600 with 144401 rows
Creating batch of ceil_len    700 with 118113 rows
Creating batch of ceil_len    800 with 92250 rows
Creating batch of ceil_len    900 with 62462 rows
Creating batch of ceil_len   1000 with 45821 rows
Creating batch of ceil_len   1100 with 30013 rows
Creating batch of ceil_len   1200 with 18157 rows
Creating batch of ceil_len   1300 with 5009 rows
Creating batch of ceil_len   1400 with 8139 rows
Creating batch of ceil_len   1500 with 8655 rows
Creating batch of ceil_len   1600 with 3035 rows
Creating batch of ceil_len   1700 with 1696 rows
Creating batch of ceil_len   1800 with 1727 rows
Creating batch of ceil_len   2300 with 2217 rows
In [71]:
f, t = 0, 20
for dim, (p, y_t) in enumerate(zip(prediction, y)):    
    df = pd.DataFrame(p[0][f:t])
    df = df.eq(df.where(df != 0).max(1), axis=0).astype(int)
    df.index.name = y_cols[dim]
    df_y = pd.DataFrame(y_t[0][f:t]).astype(int)
    df = df.join(df_y, rsuffix='_true')
    display(df)
0 1 2 0_true 1_true 2_true
r_dim0
0 1 0 0 0 0 1
1 1 0 0 1 0 0
2 1 0 0 1 0 0
3 1 0 0 1 0 0
4 1 0 0 0 0 1
5 1 0 0 0 0 1
6 1 0 0 0 0 1
7 1 0 0 0 0 1
8 1 0 0 0 0 1
9 1 0 0 1 0 0
10 1 0 0 1 0 0
11 1 0 0 1 0 0
12 1 0 0 1 0 0
13 1 0 0 1 0 0
14 1 0 0 0 0 1
15 1 0 0 0 0 1
16 1 0 0 0 0 1
17 1 0 0 0 0 1
18 1 0 0 1 0 0
19 1 0 0 1 0 0
0 1 2 3 0_true 1_true 2_true 3_true
r_dim1
0 0 0 1 0 0 0 0 1
1 0 0 1 0 0 0 1 0
2 0 0 1 0 0 0 1 0
3 0 0 0 1 0 0 1 0
4 0 0 0 1 0 0 0 1
5 0 0 1 0 0 0 0 1
6 0 0 1 0 1 0 0 0
7 0 0 1 0 0 0 0 1
8 0 0 1 0 1 0 0 0
9 0 0 0 1 0 0 1 0
10 0 0 0 1 0 0 1 0
11 0 1 0 0 0 1 0 0
12 0 0 1 0 0 0 1 0
13 0 0 0 1 0 0 1 0
14 0 0 0 1 0 0 0 1
15 0 0 1 0 1 0 0 0
16 0 0 1 0 0 0 0 1
17 0 0 1 0 1 0 0 0
18 0 0 0 1 0 0 1 0
19 0 0 1 0 0 1 0 0
0 1 2 3 4 5 6 7 8 0_true 1_true 2_true 3_true 4_true 5_true 6_true 7_true 8_true
r_dim2
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
2 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
3 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
4 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
5 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
6 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
7 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
8 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
9 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
10 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
11 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
12 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
13 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
14 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
15 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
16 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
17 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0
18 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
19 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
0 1 2 0_true 1_true 2_true
l_dim0
0 1 0 0 0 0 1
1 1 0 0 1 0 0
2 1 0 0 1 0 0
3 1 0 0 1 0 0
4 1 0 0 0 0 1
5 1 0 0 0 0 1
6 1 0 0 0 0 1
7 1 0 0 0 0 1
8 1 0 0 0 0 1
9 1 0 0 1 0 0
10 1 0 0 1 0 0
11 1 0 0 1 0 0
12 1 0 0 1 0 0
13 1 0 0 1 0 0
14 1 0 0 0 0 1
15 1 0 0 0 0 1
16 1 0 0 0 0 1
17 1 0 0 0 0 1
18 1 0 0 1 0 0
19 1 0 0 1 0 0
0 1 2 3 0_true 1_true 2_true 3_true
l_dim1
0 0 0 1 0 1 0 0 0
1 0 1 0 0 0 1 0 0
2 1 0 0 0 0 1 0 0
3 1 0 0 0 0 1 0 0
4 1 0 0 0 1 0 0 0
5 0 1 0 0 0 0 0 1
6 1 0 0 0 1 0 0 0
7 0 1 0 0 0 0 0 1
8 1 0 0 0 1 0 0 0
9 0 1 0 0 0 1 0 0
10 1 0 0 0 0 0 1 0
11 0 1 0 0 0 1 0 0
12 1 0 0 0 0 1 0 0
13 1 0 0 0 0 1 0 0
14 1 0 0 0 0 0 0 1
15 1 0 0 0 1 0 0 0
16 0 1 0 0 0 0 0 1
17 1 0 0 0 1 0 0 0
18 0 1 0 0 0 1 0 0
19 1 0 0 0 0 1 0 0
0 1 2 3 4 5 6 7 8 0_true 1_true 2_true 3_true 4_true 5_true 6_true 7_true 8_true
l_dim2
0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
2 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
3 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
4 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
5 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
6 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
7 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
8 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
9 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
10 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
11 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
12 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
13 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
14 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
15 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
16 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
17 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
18 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
19 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
In [ ]:
def eval_generator():
    while True:
        for x1, x2, y in zip(X['blocks'][300:], X['times'][300:], Y[300:]):
            yield [np.array([x1, ]), np.array([x2, ])], np.array([y, ])
            
model = get_model()
model.batch_size = 8            

model.fit_generator(train_generator(), steps_per_epoch=300, epochs=1, verbose=1, 
                    use_multiprocessing=False)
model.evaluate_generator(eval_generator(), steps=19, )

Song snippets training

In [ ]: