!pip install speechpy
!pip install soundfile
!pip install tables
import numpy as np
import pandas as pd
import soundfile as sf
import scipy.io.wavfile as wav
import speechpy
import json
import os
import math
import tensorflow.keras as k
import dask.dataframe as dd
from IPython.display import display, Markdown
from time import sleep
from pprint import pprint
from multiprocessing import Queue, Process, Pool
%config IPCompleter.greedy=True
For audio processing I will use speechpy,
as it is the fastest of the well-known libraries
(source: the sonopy benchmarks).
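A minimal timing sketch to back this claim up; it assumes librosa is also installed, and the absolute numbers vary with machine and signal length:
import timeit
import librosa
fs = 44100
noise = np.random.uniform(-1, 1, fs * 10)  # 10 s of random noise
t_speechpy = timeit.timeit(lambda: speechpy.feature.mfcc(noise, sampling_frequency=fs), number=5)
t_librosa = timeit.timeit(lambda: librosa.feature.mfcc(y=noise.astype(np.float32), sr=fs), number=5)
print(f'speechpy: {t_speechpy:.2f} s | librosa: {t_librosa:.2f} s')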
def df_col2numpy(df, col_names):
    """ Flatten the given columns of each row into one 2-D numpy array (rows x features) """
    ser = df.apply(lambda row: np.array([row[col] for col in col_names]).flatten(), axis=1)
    arr = np.array(ser.values.tolist())
    return arr
def df_col2series(df, col_names):
    """ Flatten the given columns of each row into one Series of 1-D numpy arrays """
    if len(col_names) == 1:
        ser = df[col_names[0]].map(lambda cell: np.array([cell]).flatten())
    else:
        ser = df.apply(lambda row: np.array([row[col] for col in col_names]).flatten(), axis=1)
    return ser
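A quick sanity check on toy data (the columns 'a' and 'b' are made up for this illustration):
demo = pd.DataFrame({'a': [np.array([1, 2]), np.array([3, 4])],
                     'b': [np.array([5, 6]), np.array([7, 8])]})
df_col2numpy(demo, ['a', 'b'])  # -> array([[1, 2, 5, 6], [3, 4, 7, 8]])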
def file2mfcc(file_name, frame_length=0.20, frame_stride=0.1, recreate=False):
""" recreate: whether to recreate existing .npy MFCC"""
dir_name = file_name[:file_name.index('blocks')]
file_wav, file_ogg = None, None
    # check for an existing .mfcc cache
    for file in os.listdir(dir_name):
        if file.endswith('.mfcc') and not recreate:
            return pd.read_hdf(os.path.join(dir_name, file))
    # if no .wav is found, create it from the .ogg
for file in os.listdir(dir_name):
if file.endswith('.ogg'):
file_ogg = os.path.join(dir_name, file)
if not file_wav:
data, samplerate = sf.read(file_ogg)
file_wav = f'{file_ogg[:-4]}.wav'
sf.write(file_wav, data, samplerate)
fs, signal = wav.read(file_wav)
    # Stereo to mono (a 1-D signal is already mono)
    if signal.ndim == 2:
        signal = signal.mean(axis=1)
    # Pre-emphasize
    signal_preemphasized = speechpy.processing.preemphasis(signal, cof=0.98)
    # Extract MFCC features from the pre-emphasized signal
    mfcc = speechpy.feature.mfcc(signal_preemphasized, sampling_frequency=fs, frame_length=frame_length,
                                 frame_stride=frame_stride, num_filters=40, fft_length=512,
                                 low_frequency=0, high_frequency=None, num_cepstral=13)
# Normalize
    mfcc_cmvn = speechpy.processing.cmvnw(mfcc, win_size=301, variance_normalization=True)
    # Clean up the intermediate .wav to save space
    if file_ogg:
        os.remove(file_wav)
    # Index each MFCC frame by its end time in seconds
    index = np.arange(0, (len(mfcc) - 0.5) * frame_stride, frame_stride) + frame_length
    ser = pd.DataFrame(data=mfcc_cmvn, index=index).apply(np.array, axis=1)
    # Cache the result next to the audio file
    ser.to_hdf(f'{file_wav[:-4]}.mfcc', 'mfcc', mode='w', format='fixed')
    return ser
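To make the frame-to-time mapping concrete: with frame_length=0.20 and frame_stride=0.1, frame i spans [0.1*i, 0.1*i + 0.2] seconds, so the index above stores each frame's end time. A quick check for 9 frames:
frame_length, frame_stride, n_frames = 0.20, 0.1, 9
np.arange(0, (n_frames - 0.5) * frame_stride, frame_stride) + frame_length
# -> [0.2, 0.3, ..., 1.0]; the -0.5 keeps float rounding from adding an extra frame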
def process_cell(cell, side):
    """ Turn one probability cell into three one-hot vectors, one per dimension.
    If the given :side: has no confident prediction (max < 0.5),
    fall back to the other side's argmax. """
    res = cell[:, :, side, :]
    mx = res.max()
    mx_index = np.unravel_index(res.argmax(), res.shape)
    pred = [None for _ in range(3)]
    for dim in range(3):
        pred[dim] = np.zeros(res.shape[dim])
        if mx >= 0.5:
            pred[dim][mx_index[dim]] = 1
if mx < 0.5:
res = cell[:, :, (side+1) % 2, :]
mx = res.max()
mx_index = np.unravel_index(res.argmax(), res.shape)
for dim in range(3):
pred[dim][mx_index[dim]] = 1
return pred
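A toy sanity check with a hypothetical 2x2x2x2 cell holding one confident entry:
cell = np.zeros((2, 2, 2, 2))
cell[1, 0, 0, 1] = 0.9
process_cell(cell, 0)  # -> [array([0., 1.]), array([1., 0.]), array([0., 1.])]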
def change_output(df: pd.DataFrame):
    """ Split the raw output cells into six one-hot columns (left/right x 3 dimensions) """
left, right = df.apply(lambda cell: process_cell(cell, 0)), df.apply(lambda cell: process_cell(cell, 1))
left = pd.DataFrame(left.to_list(), columns=[f'l_dim{x}' for x in range(3)], index=left.index)
right = pd.DataFrame(right.to_list(), columns=[f'r_dim{x}' for x in range(3)], index=right.index)
return left.join(right)
def process_file(file_path, recreate=False):
""" Processing needed to be done per file """
print(f'Processing {file_path}')
try:
df = pd.read_pickle(file_path)
    # all-in-one serialization (reached 99.5 %, but with a misleading metric)
    # df['output'] = df_col2series(df, ['output'])
df = df.join(change_output(df['output']))
df['shifted'] = df['output'].shift(1, fill_value=[np.zeros(df['output'].iloc[0].shape)])
df['times'] = df_col2series(df, ['prev', 'next'])
df['name'] = f'{file_path}'
mfcc = file2mfcc(file_path, recreate=recreate)
mfcc.name = 'mfcc'
    # Align block times with MFCC frames: both indexes become frame numbers
    round_index = mfcc.index.values[1] - mfcc.index.values[0]
    df.index = np.floor(df['time'] / round_index).astype(int)
    mfcc.index = (mfcc.index / round_index).astype(int)
df = df.join(mfcc)
df.index = df['time']
df = df.dropna()
except Exception as e:
print(f'Caught Error: {e}')
return None
return df
def process_df(df, X_cols, y_cols):
    """ Post-processing on the whole DF """
    # Add shifted y (the previous step's targets as extra inputs)
    y_cols_shifted = [f'{x}_shifted' for x in y_cols]
shifted = df[y_cols].groupby('name').shift(1)
df[y_cols_shifted] = shifted
df = df.dropna()
return df
file = '../data/Army Of The Night/blocks/Expert.pkl'
file2mfcc(file, recreate=False)
process_file(file).iloc[0]
def get_file_paths(path, hard_max):
"""
Create a list of all pregenerated blocks files
Search in full subtree of :path:
"""
file_paths = []
counter = 0
for root, dirs, files in os.walk(path, topdown=False):
if counter > hard_max:
break
for name in files:
            if root.endswith('blocks'):
print(f'#{counter:5} {root}/{name}')
file_paths.append(os.path.join(root, name))
counter += 1
return file_paths
X_list = []
Y = []
HARD_MAX = 20000
path = '../data'
y_cols = [f'{side}_dim{i}' for side in 'rl' for i in range(3)]
# X_cols = ['times', 'mfcc'] + y_cols
X_cols = y_cols
columns = ['name', 'time'] + X_cols + y_cols
multi_core = True
if multi_core:
# Embarrassingly parallel problem, but RAM heavy
pool = Pool(processes=None)
X_list = pool.map(process_file, get_file_paths(path, HARD_MAX))
else:
X_list = [process_file(x) for x in get_file_paths(path, HARD_MAX)]
X_list = [x for x in X_list if x is not None]
print(f'Passed {len(X_list):6}/{HARD_MAX:6} files (hard max)')
X = pd.DataFrame(pd.concat(X_list), columns=columns)
X = X.set_index(['name', 'time'])
X.to_pickle(os.path.join(path, 'X_saved.pkl'))
X = pd.read_pickle(os.path.join(path, 'X_saved.pkl'))
print(f'Loaded {len(X)} rows')
# The goal shape: eventually just model.fit(X, y, **kwargs)
precision = 2
def pp(row):
print('*' * 69)
print(row)
def round_up(num: float, prec: int) -> int:
    """ Round :num: up to the nearest 10**prec """
    return int(math.ceil((10 ** -prec) * num) / (10 ** -prec))
def get_len_category(song, prec: int=precision):
    return round_up(len(song), prec)
def get_mask(X, prec: int=precision):
    """ Map each song name to its padded-length category """
    mask = X.groupby('name').apply(get_len_category)
    return mask.to_dict()
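Quick check of the rounding helpers:
round_up(123, 2)  # -> 200: ceil to the nearest 10**2
get_len_category([0] * 123)  # -> 200, the padded length for a 123-frame song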
def create_batch(group, ceil_len, verbose=True):
if verbose:
print(f'Creating batch of ceil_len {ceil_len:6} with {len(group)} rows')
ceil_len = int(ceil_len)
batch = []
for name, song in group.groupby('name'):
empty_row = song.head(1).squeeze().apply(np.zeros_like)
# print(f'{ceil_len} | {len(song)} | {empty_row}')
df_to_add = pd.DataFrame([empty_row] * (ceil_len - len(song)))
batch.append(pd.concat([song, df_to_add]))
return pd.concat(batch)
# LESSON: Don't forget about the NaNs!
def list2numpy(batch, col_name):
    """ Stack one padded column into a (songs, timesteps, features) array """
    return np.array(batch.groupby('name')[col_name].apply(list).to_list())
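Shape sanity check on a toy frame; the 'name' index level mimics the real X:
toy = pd.DataFrame({'mfcc': [np.zeros(13)] * 4},
                   index=pd.MultiIndex.from_product([['a', 'b'], [0, 1]], names=['name', 'time']))
list2numpy(toy, 'mfcc').shape  # -> (2, 2, 13): (songs, timesteps, features)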
grouped = X.groupby(get_mask(X), level=0)
adjust = 6
stats = []
print(f'{"from":>{adjust}} ‒ {"to":>{adjust}}: {"# of songs":>{adjust*2}}')
for name, group in grouped:
print(f'{name - 10 ** precision:{adjust}} ‒ {name:{adjust}}: {len(group.groupby("name").groups):{adjust*2}}')
stats.append({'from': name - 10 ** precision, 'to': name, '# of songs': len(group.groupby("name").groups)})
# print in your favorite way
# pd.DataFrame(stats, columns=['from', 'to', '# of songs'])
def train_generator(df, X_cols, y_cols, verbose=True):
    grouped = df.groupby(get_mask(df), level=0)
    # p = Pool(2)  # slower because of memory pressure
    # batches = p.starmap(create_batch, [(group, ceil_len) for ceil_len, group in grouped])
    batches = [create_batch(group, ceil_len, verbose) for ceil_len, group in grouped]
    # Drop batches with too few songs
    batches = [batch for batch in batches if len(batch.groupby('name').groups) > 8]
while True:
for batch in batches:
yield [list2numpy(batch, col) for col in X_cols],\
[list2numpy(batch, col) for col in y_cols]
# test generated shapes
generator = train_generator(X, X_cols, y_cols)
for x, y in generator:
print(f'x.shapes {[np.array(x_t).shape for x_t in x]}')
print(f'y.shape {[np.array(y_t).shape for y_t in y]}\n')
break
from tensorflow.keras.layers import Dense, LSTM, Flatten, Input, Activation, TimeDistributed, concatenate
from tensorflow.keras.models import Sequential, Model
def get_model(X, X_cols, y_cols):
demo_row = X.iloc[0]
X_shapes = [demo_row[col].shape[0] for col in X_cols]
y_shapes = [demo_row[col].shape[0] for col in y_cols]
# in1 = Input(shape=(None, 216)) # last blocks
# in2 = Input(shape=(None, 2)) # time difference of previous and next beat
inputs = [Input(shape=(None, shape)) for shape in X_shapes]
time_dist = [TimeDistributed(Dense(shape, activation='sigmoid'))(inputs[i]) for i, shape in enumerate(X_shapes)]
# out = time_dist
# x1 = TimeDistributed(Dense(50, activation='elu'))(in1)
# x2 = TimeDistributed(Dense(3, activation='elu'))(in2)
out = concatenate(time_dist, axis=-1)
out = LSTM(64, return_sequences=True)(out)
out = LSTM(64, return_sequences=True)(out)
# out = LSTM(128, return_sequences=True)(out)
# out = TimeDistributed(Dense(216, activation='sigmoid'))(out)
outputs = [TimeDistributed(Dense(shape, activation='softmax'), name=col)(out) for shape, col in zip(y_shapes, y_cols)]
model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='rmsprop',
                  # loss='binary_crossentropy',
                  loss='categorical_crossentropy',
                  loss_weights=[1, 2, 3, 1, 2, 3],
                  metrics=['accuracy', 'categorical_crossentropy'])
return model
model = get_model(X, X_cols, y_cols)
model.summary()
def important_metric(metric_name):
return 'val' in metric_name and 'acc' in metric_name
def train_on_generator(X, X_cols, y_cols, verbose_level=1, model=None, epochs=300):
if not model:
model = get_model(X, X_cols, y_cols)
acc_results = {}
stats_len = 30
generator = train_generator(X, X_cols, y_cols, verbose_level>=2)
for i, (x, y) in enumerate(generator):
if i > epochs:
break
res = model.fit(x, y, batch_size=128, validation_split=0.1, verbose=verbose_level>=3)
if verbose_level >= 1:
if i % stats_len == 0:
total_acc = (np.array(list(acc_results.values())) / stats_len).mean()
display(Markdown(f'### Batch {i:4} | {total_acc:4.4}'))
pprint([f'{key:30}: {val/stats_len}' for key, val in acc_results.items()])
acc_results = {key: val[0] for key, val in res.history.items() if important_metric(key)}
else:
for key in acc_results:
acc_results[key] += res.history[key][0]
acc_results = {key: val[0] for key, val in res.history.items() if important_metric(key)}
total_acc = (np.array(list(acc_results.values()))).mean()
display(Markdown(f'### Last epoch {i:4} results | {total_acc:4.4}'))
pprint(acc_results)
return model
# Test train_on_generator
# train_on_generator(X, X_cols, y_cols, 2, None, 2)
# Add the shifted targets (the same logic process_df encapsulates), applied to the full X
y_cols_shifted = [f'{x}_shifted' for x in y_cols]
shifted = X[y_cols].groupby('name').shift(1)
X[y_cols_shifted] = shifted
X = X.dropna()
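A toy illustration of the per-song shift; the first row of each song becomes NaN and is then dropped:
toy = pd.DataFrame({'y': [1, 2, 3, 4]},
                   index=pd.MultiIndex.from_product([['song_a', 'song_b'], [0, 1]],
                                                    names=['name', 'time']))
toy['y'].groupby('name').shift(1)  # -> [NaN, 1, NaN, 3]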
# Sanity check: the model should perform well on the _identity_ task (predicting y from y)
train_on_generator(X, y_cols, y_cols, verbose_level=1, model=None, epochs=300)
model = train_on_generator(X, ['times', 'mfcc'] + y_cols_shifted, y_cols, verbose_level=2, model=model, epochs=490)
y_cols_shifted
If the shifted output is not provided as an input, the model tends to learn and stay on the most common value of each classification.
generator = train_generator(X, ['times', 'mfcc'] + y_cols_shifted, y_cols)
x, y = generator.__next__()
prediction = model.predict(x)
f, t = 0, 20  # frame range to display
for dim, (p, y_t) in enumerate(zip(prediction, y)):
    df = pd.DataFrame(p[0][f:t])
    # Binarize: 1 at the row-wise maximum, 0 elsewhere
    df = df.eq(df.where(df != 0).max(1), axis=0).astype(int)
df.index.name = y_cols[dim]
df_y = pd.DataFrame(y_t[0][f:t]).astype(int)
df = df.join(df_y, rsuffix='_true')
display(df)
def eval_generator():
    # NOTE: still assumes the older fixed 'blocks'/'times' column layout
    while True:
        for x1, x2, y in zip(X['blocks'][300:], X['times'][300:], Y[300:]):
            yield [np.array([x1, ]), np.array([x2, ])], np.array([y, ])
model = get_model(X, X_cols, y_cols)
model.batch_size = 8
model.fit_generator(train_generator(X, X_cols, y_cols), steps_per_epoch=300, epochs=1, verbose=1,
                    use_multiprocessing=False)
model.evaluate_generator(eval_generator(), steps=19)