import os
import numpy as np
import matplotlib.pyplot as plt
from skimage.transform import resize
from skimage import data
import IPython.display as ipyd
import tensorflow as tf
from libs import utils, gif, datasets, dataset_utils, vae, dft
%matplotlib inline
plt.style.use('ggplot')
dataset = 'gtzan_music_speech'
# This assumes the GTZAN Music/Speech dataset has already been downloaded
# and unpacked into this folder.
# Collect every .wav file in the music folder:
music_dir = os.path.join(dataset, 'music_speech', 'music_wav')
music = [os.path.join(music_dir, file_i)
         for file_i in os.listdir(music_dir)
         if file_i.endswith('.wav')]
# Collect every .wav file in the speech folder:
speech_dir = os.path.join(dataset, 'music_speech', 'speech_wav')
speech = [os.path.join(speech_dir, file_i)
          for file_i in os.listdir(speech_dir)
          if file_i.endswith('.wav')]
print(music[:10], speech[:10])
file_i = music[1]
s = utils.load_audio(file_i)
plt.plot(s)
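utils.load_audio comes from the course's libs package. If you don't have it, a minimal stand-in might look like this (a sketch, assuming 16-bit mono WAV files; load_audio_sketch is a hypothetical name):
from scipy.io import wavfile

def load_audio_sketch(filename):
    # Read the sample rate and int16 samples, then normalize to [-1, 1]
    sr, samples = wavfile.read(filename)
    return samples.astype(np.float32) / 32768.0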
fft_size = 512
hop_size = 256
print("s:",s)
print("\ns shape:", s.shape)
re, im = dft.dft_np(s, hop_size=hop_size, fft_size=fft_size)
print("\nreal:", re[:5])
print("\nimaginary:", im[:5])
print("\nreal shape, imaginary shape:", re.shape, im.shape)
mag, phs = dft.ztoc(re, im)
print("\nMagnitude:",mag[:5])
print("\nPhase:",phs[:5])
print("\nMagnitude Shape:",mag.shape)
print("\nPhase Shape:",phs.shape)
plt.figure(figsize=(4,10))
plt.imshow(mag)
plt.figure(figsize=(20, 8))
# Log-scale the magnitudes (the small epsilon avoids log(0))
plt.imshow(np.log(mag.T + 1e-10))
plt.xlabel('Time')
plt.ylabel('Frequency Bin')
# The sample rate for this dataset is 22050 Hz
sr = 22050
# Calculate how many hops there are in a second, which tells us
# how many frames of magnitudes we have per second
n_frames_per_second = sr // hop_size
print("n_frames_per_second:",n_frames_per_second)
# We want 500 milliseconds of audio in our window
n_frames = n_frames_per_second // 2
print("n_frames:",n_frames)
# And we'll move our window by 250 ms at a time
frame_hops = n_frames_per_second // 4
print("frame_hops:",frame_hops)
# We'll therefore have this many sliding windows:
n_hops = (len(mag) - n_frames) // frame_hops
print("n_hops:",n_hops)
Xs = []
ys = []
for hop_i in range(n_hops):
    # Create the current sliding window
    frames = mag[(hop_i * frame_hops):(hop_i * frame_hops + n_frames)]
    # Store it with a new 3rd axis, on a logarithmic scale
    # (add a small value to avoid log of 0)
    Xs.append(np.log(np.abs(frames[..., np.newaxis]) + 1e-10))
    ys.append(0)
print("Xs:",Xs[:1])
print("\nys:",ys[:1])
print("\nXs shape:",np.shape(Xs))
print("\nys shape:",np.shape(ys))
plt.figure(figsize=(20,8))
plt.imshow(Xs[0][..., 0])
plt.title('label:{}'.format(ys[0]))
plt.figure(figsize=(20,8))
plt.imshow(Xs[1][..., 0])
plt.title('label:{}'.format(ys[1]))
# Store every magnitude frame and its label of being music: 0 or speech: 1
Xs, ys = [], []
# for music
for i in music:
    s = utils.load_audio(i)
    # take the DFT
    re, im = dft.dft_np(s, fft_size=fft_size, hop_size=hop_size)
    # convert complex (cartesian) to polar
    mag, phs = dft.ztoc(re, im)
    # no. of sliding windows:
    n_hops = (len(mag) - n_frames) // frame_hops
    # extract them all:
    for hop_i in range(n_hops):
        # Get the current sliding window
        frames = mag[(hop_i * frame_hops):(hop_i * frame_hops + n_frames)]
        # take the log magnitudes:
        this_X = np.log(np.abs(frames[..., np.newaxis]) + 1e-10)
        # And store it:
        Xs.append(this_X)
        # store the correct label (music = 0):
        ys.append(0)
# for speech (note the label is 1 here, not 0)
for i in speech:
    s = utils.load_audio(i)
    re, im = dft.dft_np(s, fft_size=fft_size, hop_size=hop_size)
    mag, phs = dft.ztoc(re, im)
    n_hops = (len(mag) - n_frames) // frame_hops
    for hop_i in range(n_hops):
        frames = mag[(hop_i * frame_hops):(hop_i * frame_hops + n_frames)]
        this_X = np.log(np.abs(frames[..., np.newaxis]) + 1e-10)
        Xs.append(this_X)
        ys.append(1)
Xs = np.array(Xs)
ys = np.array(ys)
print(Xs.shape, ys.shape)
# Describe the input shape of the network
n_observations, n_height, n_width, n_channels = Xs.shape
Using the Dataset helper class in libs/datasets.py, we split Xs and ys into training, validation, and testing sets. The one_hot parameter specifies whether the ys should be converted to one-hot vectors. Mini-batches can then be fetched with the object's next_batch() method, which also shuffles the dataset.
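One-hot encoding simply turns each integer label into an indicator vector, so music (0) becomes [1, 0] and speech (1) becomes [0, 1]. A sketch of what the helper presumably does (to_one_hot_sketch is a hypothetical name):
def to_one_hot_sketch(labels, n_classes=2):
    one_hot = np.zeros((len(labels), n_classes), dtype=np.float32)
    one_hot[np.arange(len(labels)), labels] = 1.0
    return one_hot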
ds = datasets.Dataset(Xs=Xs, ys=ys, split=[0.8, 0.1, 0.1], one_hot=True)
Xs_i, ys_i = next(ds.train.next_batch())
print(Xs_i.shape, ys_i.shape)
# visualize first element
plt.figure(figsize=(20,8))
plt.imshow(Xs_i[0, :, :, 0])
plt.title('label:{}'.format(ys_i[0]))
tf.reset_default_graph()
# create placeholders
# Xs is a 4D array: (n_observations, n_height, n_width, n_channels)
X = tf.placeholder(name='X', shape=[None, n_height, n_width, n_channels], dtype=tf.float32)
y = tf.placeholder(name='y', shape=[None, 2], dtype=tf.float32)
n_filters = [32,32,32,32]
H = X
for layer_i, n_filters_i in enumerate(n_filters):
    H, W = utils.conv2d(
        # H, n_filters_i, k_h=3, k_w=3, d_h=2, d_w=2,
        H, n_filters_i, k_h=10, k_w=10, d_h=4, d_w=4,
        name=str(layer_i))
    H = tf.nn.relu(H)
    print(H.get_shape().as_list())
# connect the last convolutional layer to a fully connected network
fc, W = utils.linear(H, 100, str(len(n_filters)))
Y_pred, W = utils.linear(fc, 2, str(len(n_filters) + 1), tf.nn.softmax)
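utils.conv2d and utils.linear are thin course wrappers around the usual TF 1.x ops. Each convolutional layer above roughly corresponds to the following (a sketch, assuming SAME padding and a truncated-normal initializer; conv2d_sketch is hypothetical):
def conv2d_sketch(x, n_output, k_h=10, k_w=10, d_h=4, d_w=4, name='conv'):
    n_input = x.get_shape().as_list()[-1]
    with tf.variable_scope(name):
        W = tf.get_variable('W', shape=[k_h, k_w, n_input, n_output],
                            initializer=tf.truncated_normal_initializer(stddev=0.02))
        b = tf.get_variable('b', shape=[n_output],
                            initializer=tf.zeros_initializer())
        h = tf.nn.conv2d(x, W, strides=[1, d_h, d_w, 1], padding='SAME') + b
    return h, W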
# define the loss function and calculate the cost over the entire batch
loss = utils.binary_cross_entropy(Y_pred, y)
cost = tf.reduce_mean(tf.reduce_sum(loss, 1))
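utils.binary_cross_entropy presumably implements the standard per-element formula, with a small epsilon inside the logs for numerical stability. Expressed directly:
eps = 1e-12
loss_sketch = -(y * tf.log(Y_pred + eps) +
                (1.0 - y) * tf.log(1.0 - Y_pred + eps))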
# measure of accuracy
predicted_y = tf.argmax(Y_pred, 1)
actual_y = tf.argmax(y, 1)
correct_prediction = tf.equal(predicted_y, actual_y)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# defining optimizer
learning_rate = 0.001
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
# Optionally, log the cost to TensorBoard (commented out below):
'''
from datetime import datetime
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir="tf_logs"
logdir = "{}/run-{}/".format(root_logdir, now)
bce_summary = tf.summary.scalar('COST', cost)
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
'''
n_epochs = 3
batch_size = 120
# total number of training windows for this run (hardcoded); only used
# by the commented-out TensorBoard logging below
n_batch = 15360 // batch_size
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for epoch_i in range(n_epochs):
    print('Epoch: ', epoch_i)
    # training set
    this_accuracy = 0
    its = 0
    for Xs_i, ys_i in ds.train.next_batch(batch_size):
        this_accuracy += sess.run([accuracy, optimizer], feed_dict={
            X: Xs_i, y: ys_i})[0]
        its += 1
        print(this_accuracy / its)
        # summary_str = bce_summary.eval(feed_dict={X: Xs_i, y: ys_i}, session=sess)
        # step = epoch_i * n_batch + its
        # file_writer.add_summary(summary_str, step)
    print('Training accuracy: ', this_accuracy / its)
    # validation set
    this_accuracy = 0
    its = 0
    for Xs_i, ys_i in ds.valid.next_batch(batch_size):
        this_accuracy += sess.run(accuracy, feed_dict={
            X: Xs_i, y: ys_i})
        its += 1
    print('Validation accuracy: ', this_accuracy / its)
#file_writer.close()
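After training, you would normally also report accuracy on the held-out test split. A sketch, assuming ds.test exposes the same next_batch() interface as ds.train and ds.valid:
test_accuracy = 0
its = 0
for Xs_i, ys_i in ds.test.next_batch(batch_size):
    test_accuracy += sess.run(accuracy, feed_dict={X: Xs_i, y: ys_i})
    its += 1
print('Test accuracy: ', test_accuracy / its)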