These notes follow the Python programming assignments of Stanford's CS231n course as their main thread, working through the course's core content together with some of the mathematical derivations. This is the second article in the series.
About CS231n
The full name of the course is CS231n: Convolutional Neural Networks for Visual Recognition. It is offered by the Stanford Vision Lab. Note that when people talk about CS231n today, they usually mean the most recent version from the winter 2016 term (January through March).
Course Description
Computer vision has become ubiquitous in society, with applications in search and retrieval, image understanding, mobile apps, mapping and navigation, medicine and drug discovery, drones, and self-driving cars. At the core of these applications are visual recognition tasks such as image classification, localization, and detection. Recent developments in neural network ("deep learning") approaches have greatly advanced the performance of these state-of-the-art visual recognition systems.
This course is a deep dive into the details of deep learning architectures, with a focus on learning end-to-end models for visual recognition tasks, particularly image classification. During the 10-week course, students will learn to implement, train, and debug their own neural networks and gain a detailed understanding of cutting-edge research in computer vision. The final assignment involves training a convolutional neural network with several million parameters and applying it to the largest image classification dataset (ImageNet). We will focus on teaching how to set up image recognition problems, the learning algorithms (e.g., backpropagation), practical engineering tricks for training and fine-tuning the networks, and on guiding students through the hands-on assignments and the final course project.
Course videos
Assignment 1
02
Python Programming Tasks (Linear Classifiers)
· The IDE I use is PyCharm.
· For the linear classifier part of Assignment 1, we need to complete linear_svm.py, softmax.py, and linear_classifier.py. Once they are done, you can use the code in svm.ipynb and softmax.ipynb to debug your models, pick the best one, and then evaluate its classification accuracy on the test set.
· Assignment 1 uses the CIFAR-10 image dataset, which you can also download here.
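The helper module data_utils.py (with its load_CIFAR10 function) ships with the assignment and is used by all of the scripts below. For readers who want to see roughly what it does, here is a minimal sketch assuming the standard CIFAR-10 "python version" batch format; the official helper differs in details:

```python
# A rough, minimal sketch of a load_CIFAR10-style helper (not the official
# data_utils.py; assumes the standard CIFAR-10 python batch files).
import os
import cPickle as pickle
import numpy as np

def load_CIFAR_batch(filename):
    """Load one batch file into X of shape (10000, 32, 32, 3) and y of shape (10000,)."""
    with open(filename, 'rb') as f:
        datadict = pickle.load(f)
    X = datadict['data'].reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype('float')
    y = np.array(datadict['labels'])
    return X, y

def load_CIFAR10(root):
    """Load the five training batches and the test batch from directory root."""
    xs, ys = [], []
    for b in xrange(1, 6):
        X, y = load_CIFAR_batch(os.path.join(root, 'data_batch_%d' % b))
        xs.append(X)
        ys.append(y)
    X_train, y_train = np.concatenate(xs), np.concatenate(ys)
    X_test, y_test = load_CIFAR_batch(os.path.join(root, 'test_batch'))
    return X_train, y_train, X_test, y_test
```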
The code for linear_svm.py is as follows:
```python
__coauthor__ = 'Deeplayer'
# 5.19.2016

import numpy as np


def svm_loss_naive(W, X, y, reg):
    """
    Inputs:
    - W: A numpy array of shape (D, C) containing weights.
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c means
      that X[i] has label c, where 0 <= c < C.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss as single float
    - gradient with respect to weights W; an array of same shape as W
    """
    dW = np.zeros(W.shape)    # initialize the gradient as zero

    # compute the loss and the gradient
    num_classes = W.shape[1]
    num_train = X.shape[0]
    loss = 0.0
    for i in xrange(num_train):
        scores = X[i].dot(W)
        correct_class_score = scores[y[i]]
        for j in xrange(num_classes):
            if j == y[i]:
                continue
            margin = scores[j] - correct_class_score + 1    # note delta = 1
            if margin > 0:
                loss += margin
                dW[:, y[i]] += -X[i, :]    # compute the correct_class gradients
                dW[:, j] += X[i, :]        # compute the wrong_class gradients

    # Right now the loss is a sum over all training examples, but we want it
    # to be an average instead so we divide by num_train.
    loss /= num_train
    dW /= num_train
    # Add regularization to the loss.
    loss += 0.5 * reg * np.sum(W * W)
    dW += reg * W
    return loss, dW
```
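For reference, the per-example loss and gradient that the nested loops above compute can be written as follows. With scores $s = x_i W$ (so $s_j = w_j^\top x_i$, where $w_j$ is the $j$-th column of $W$) and margin $\Delta = 1$:

$$L_i = \sum_{j\neq y_i}\max(0,\; s_j - s_{y_i} + \Delta), \qquad
\frac{\partial L_i}{\partial w_j} = \mathbb{1}\{s_j - s_{y_i} + \Delta > 0\}\, x_i \;\; (j\neq y_i), \qquad
\frac{\partial L_i}{\partial w_{y_i}} = -\Big(\sum_{j\neq y_i}\mathbb{1}\{s_j - s_{y_i} + \Delta > 0\}\Big)\, x_i.$$

The full loss averages $L_i$ over the $N$ training examples and adds the regularization term $\tfrac{1}{2}\,\mathrm{reg}\,\lVert W\rVert^2$, whose gradient contributes the extra $\mathrm{reg}\cdot W$.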
```python
def svm_loss_vectorized(W, X, y, reg):
    """
    Structured SVM loss function, vectorized implementation.
    Inputs and outputs are the same as svm_loss_naive.
    """
    loss = 0.0
    dW = np.zeros(W.shape)    # initialize the gradient as zero
    scores = X.dot(W)         # N by C
    num_train = X.shape[0]
    num_classes = W.shape[1]
    scores_correct = scores[np.arange(num_train), y]              # 1 by N
    scores_correct = np.reshape(scores_correct, (num_train, 1))   # N by 1
    margins = scores - scores_correct + 1.0    # N by C
    margins[np.arange(num_train), y] = 0.0
    margins[margins <= 0] = 0.0
    loss += np.sum(margins) / num_train
    loss += 0.5 * reg * np.sum(W * W)

    # compute the gradient
    margins[margins > 0] = 1.0
    row_sum = np.sum(margins, axis=1)    # 1 by N
    margins[np.arange(num_train), y] = -row_sum
    dW += np.dot(X.T, margins) / num_train + reg * W    # D by C
    return loss, dW
```
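The vectorized version computes the same gradient with a single matrix product. Writing $\text{margin}_{ij} = s_{ij} - s_{i,y_i} + 1$ and defining the matrix $M$ (the reused `margins` array in the code),

$$M_{ij} = \begin{cases}\mathbb{1}\{\text{margin}_{ij} > 0\}, & j \neq y_i\\[2pt] -\sum_{k\neq y_i}\mathbb{1}\{\text{margin}_{ik} > 0\}, & j = y_i,\end{cases}
\qquad \nabla_W L = \frac{1}{N}X^\top M + \mathrm{reg}\cdot W,$$

which is exactly the `np.dot(X.T, margins) / num_train + reg * W` line above.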
The code for softmax.py is as follows:

```python
__coauthor__ = 'Deeplayer'
# 5.19.2016

import numpy as np


def softmax_loss_naive(W, X, y, reg):
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)       # D by C
    dW_each = np.zeros_like(W)
    num_train, dim = X.shape
    num_class = W.shape[1]
    f = X.dot(W)                # N by C
    # Considering the Numeric Stability
    f_max = np.reshape(np.max(f, axis=1), (num_train, 1))    # N by 1
    prob = np.exp(f - f_max) / np.sum(np.exp(f - f_max), axis=1, keepdims=True)    # N by C
    y_trueClass = np.zeros_like(prob)
    y_trueClass[np.arange(num_train), y] = 1.0
    for i in xrange(num_train):
        for j in xrange(num_class):
            loss += -(y_trueClass[i, j] * np.log(prob[i, j]))
            dW_each[:, j] = -(y_trueClass[i, j] - prob[i, j]) * X[i, :]
        dW += dW_each
    loss /= num_train
    loss += 0.5 * reg * np.sum(W * W)
    dW /= num_train
    dW += reg * W
    return loss, dW
def softmax_loss_vectorized(W, X, y, reg):
    """
    Softmax loss function, vectorized version.
    Inputs and outputs are the same as softmax_loss_naive.
    """
    # Initialize the loss and gradient to zero.
    loss = 0.0
    dW = np.zeros_like(W)    # D by C
    num_train, dim = X.shape

    f = X.dot(W)    # N by C
    # Considering the Numeric Stability
    f_max = np.reshape(np.max(f, axis=1), (num_train, 1))    # N by 1
    prob = np.exp(f - f_max) / np.sum(np.exp(f - f_max), axis=1, keepdims=True)
    y_trueClass = np.zeros_like(prob)
    y_trueClass[range(num_train), y] = 1.0    # N by C
    loss += -np.sum(y_trueClass * np.log(prob)) / num_train + 0.5 * reg * np.sum(W * W)
    dW += -np.dot(X.T, y_trueClass - prob) / num_train + reg * W
    return loss, dW
```
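Both softmax implementations rely on the same two facts. First, subtracting the row-wise maximum before exponentiating does not change the probabilities but avoids overflow (the "Numeric Stability" comment):

$$p_{ij} = \frac{e^{f_{ij} - \max_k f_{ik}}}{\sum_c e^{f_{ic} - \max_k f_{ik}}} = \frac{e^{f_{ij}}}{\sum_c e^{f_{ic}}}.$$

Second, with $Y$ the one-hot label matrix (`y_trueClass`) and $P$ the probability matrix (`prob`), the cross-entropy loss and its gradient are

$$L = -\frac{1}{N}\sum_i \log p_{i,y_i} + \frac{1}{2}\,\mathrm{reg}\,\lVert W\rVert^2, \qquad
\nabla_W L = -\frac{1}{N}X^\top(Y - P) + \mathrm{reg}\cdot W,$$

which is the vectorized `dW` expression above.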
The code for linear_classifier.py is as follows:

```python
__coauthor__ = 'Deeplayer'
# 5.19.2016

from linear_svm import *
from softmax import *


class LinearClassifier(object):

    def __init__(self):
        self.W = None

    def train(self, X, y, learning_rate=1e-3, reg=1e-5, num_iters=100,
              batch_size=200, verbose=True):
        """
        Train this linear classifier using stochastic gradient descent.

        Inputs:
        - X: A numpy array of shape (N, D) containing training data; there are N
          training samples each of dimension D.
        - y: A numpy array of shape (N,) containing training labels; y[i] = c
          means that X[i] has label 0 <= c < C for C classes.
        - learning_rate: (float) learning rate for optimization.
        - reg: (float) regularization strength.
        - num_iters: (integer) number of steps to take when optimizing
        - batch_size: (integer) number of training examples to use at each step.
        - verbose: (boolean) If true, print progress during optimization.

        Outputs:
        A list containing the value of the loss function at each training iteration.
        """
        num_train, dim = X.shape
        # assume y takes values 0...K-1 where K is number of classes
        num_classes = np.max(y) + 1
        if self.W is None:
            # lazily initialize W
            self.W = 0.001 * np.random.randn(dim, num_classes)    # D by C

        # Run stochastic gradient descent (Mini-Batch) to optimize W
        loss_history = []
        for it in xrange(num_iters):
            X_batch = None
            y_batch = None
            # Sampling with replacement is faster than sampling without replacement.
            sample_index = np.random.choice(num_train, batch_size, replace=False)
            X_batch = X[sample_index, :]    # batch_size by D
            y_batch = y[sample_index]       # (batch_size,)
            # evaluate loss and gradient
            loss, grad = self.loss(X_batch, y_batch, reg)
            loss_history.append(loss)
            # perform parameter update
            self.W += -learning_rate * grad
            if verbose and it % 100 == 0:
                print 'Iteration %d / %d: loss %f' % (it, num_iters, loss)

        return loss_history

    def predict(self, X):
        """
        Use the trained weights of this linear classifier to predict labels for
        data points.

        Inputs:
        - X: D x N array of training data. Each column is a D-dimensional point.

        Returns:
        - y_pred: Predicted labels for the data in X. y_pred is a 1-dimensional
          array of length N, and each element is an integer giving the
          predicted class.
        """
        y_pred = np.zeros(X.shape[1])    # 1 by N
        y_pred = np.argmax(np.dot(self.W.T, X), axis=0)
        return y_pred

    def loss(self, X_batch, y_batch, reg):
        """
        Compute the loss function and its derivative.
        Subclasses will override this.

        Inputs:
        - X_batch: A numpy array of shape (N, D) containing a minibatch of N
          data points; each point has dimension D.
        - y_batch: A numpy array of shape (N,) containing labels for the minibatch.
        - reg: (float) regularization strength.

        Returns: A tuple containing:
        - loss as a single float
        - gradient with respect to self.W; an array of the same shape as W
        """
        pass


class LinearSVM(LinearClassifier):
    """ A subclass that uses the Multiclass SVM loss function """

    def loss(self, X_batch, y_batch, reg):
        return svm_loss_vectorized(self.W, X_batch, y_batch, reg)


class Softmax(LinearClassifier):
    """ A subclass that uses the Softmax + Cross-entropy loss function """

    def loss(self, X_batch, y_batch, reg):
        return softmax_loss_vectorized(self.W, X_batch, y_batch, reg)
```

Below I paste the code for tuning the hyperparameters to get the best model, along with some results and figures:

1、 LinearClassifier_svm_start.py

```python
__coauthor__ = 'Deeplayer'
# 5.20.2016

import numpy as np
import matplotlib.pyplot as plt
import math
from linear_classifier import *
from data_utils import load_CIFAR10

# Load the raw CIFAR-10 data.
cifar10_dir = 'E:/PycharmProjects/ML/CS231n/cifar-10-batches-py'    # u should change this
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

# As a sanity check, we print out the size of the training and test data.
print 'Training data shape: ', X_train.shape      # (50000,32,32,3)
print 'Training labels shape: ', y_train.shape    # (50000L,)
print 'Test data shape: ', X_test.shape           # (10000,32,32,3)
print 'Test labels shape: ', y_test.shape         # (10000L,)
print

# Visualize some examples from the dataset.
# We show a few examples of training images from each class.
classes = ['plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck']
num_classes = len(classes)
samples_per_class = 7
for y, cls in enumerate(classes):
    idxs = np.flatnonzero(y_train == y)
    idxs = np.random.choice(idxs, samples_per_class, replace=False)
    for i, idx in enumerate(idxs):
        plt_idx = i * num_classes + y + 1
        plt.subplot(samples_per_class, num_classes, plt_idx)
        plt.imshow(X_train[idx].astype('uint8'))
        plt.axis('off')
        if i == 0:
            plt.title(cls)
plt.show()

# Split the data into train, val, and test sets.
num_training = 49000
num_validation = 1000
num_test = 1000

mask = range(num_training, num_training + num_validation)
X_val = X_train[mask]      # (1000,32,32,3)
y_val = y_train[mask]      # (1000,)
mask = range(num_training)
X_train = X_train[mask]    # (49000,32,32,3)
y_train = y_train[mask]    # (49000,)
mask = range(num_test)
X_test = X_test[mask]      # (1000,32,32,3)
y_test = y_test[mask]      # (1000,)

# Preprocessing1: reshape the image data into rows
X_train = np.reshape(X_train, (X_train.shape[0], -1))    # (49000,3072)
X_val = np.reshape(X_val, (X_val.shape[0], -1))          # (1000,3072)
X_test = np.reshape(X_test, (X_test.shape[0], -1))       # (1000,3072)

# Preprocessing2: subtract the mean image
mean_image = np.mean(X_train, axis=0)    # (3072,)
X_train -= mean_image
X_val -= mean_image
X_test -= mean_image

# Visualize the mean image
plt.figure(figsize=(4, 4))
plt.imshow(mean_image.reshape((32, 32, 3)).astype('uint8'))
plt.show()

# Bias trick, extending the data
X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])    # (49000,3073)
X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))])          # (1000,3073)
X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])       # (1000,3073)

# Use the validation set to tune hyperparameters (regularization strength
# and learning rate).
learning_rates = [1e-7, 5e-5]
regularization_strengths = [5e4, 1e5]
results = {}
best_val = -1     # The highest validation accuracy that we have seen so far.
best_svm = None   # The LinearSVM object that achieved the highest validation rate.
iters = 1500
for lr in learning_rates:
    for rs in regularization_strengths:
        svm = LinearSVM()
        svm.train(X_train, y_train, learning_rate=lr, reg=rs, num_iters=iters)
        Tr_pred = svm.predict(X_train.T)
        acc_train = np.mean(y_train == Tr_pred)
        Val_pred = svm.predict(X_val.T)
        acc_val = np.mean(y_val == Val_pred)
        results[(lr, rs)] = (acc_train, acc_val)
        if best_val < acc_val:
            best_val = acc_val
            best_svm = svm

# print results
for lr, reg in sorted(results):
    train_accuracy, val_accuracy = results[(lr, reg)]
    print 'lr %e reg %e train accuracy: %f val accuracy: %f' % (
        lr, reg, train_accuracy, val_accuracy)
print 'Best validation accuracy achieved during validation: %f' % best_val    # around 38.2%

# Visualize the learned weights for each class
w = best_svm.W[:-1, :]    # strip out the bias
w = w.reshape(32, 32, 3, 10)
w_min, w_max = np.min(w), np.max(w)
classes = ['plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck']
for i in xrange(10):
    plt.subplot(2, 5, i + 1)
    # Rescale the weights to be between 0 and 255
    wimg = 255.0 * (w[:, :, :, i].squeeze() - w_min) / (w_max - w_min)
    plt.imshow(wimg.astype('uint8'))
    plt.axis('off')
    plt.title(classes[i])
plt.show()

# Evaluate the best svm on test set
Ts_pred = best_svm.predict(X_test.T)
test_accuracy = np.mean(y_test == Ts_pred)    # around 37.1%
print 'LinearSVM on raw pixels of CIFAR-10 final test set accuracy: %f' % test_accuracy
```

Below are visualizations of some of the raw images, the mean image, and the learned weights:

figure_1.png (sample training images from each class): https://cdn.static.attains.cn/app/developer-bbs/upload/1723357560990427998.jpeg

figure_2.png (the mean image): https://cdn.static.attains.cn/app/developer-bbs/upload/1723357561157026543.png

figure_3.png (learned weights for each class): https://cdn.static.attains.cn/app/developer-bbs/upload/1723357561398669180.jpeg
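One detail that is easy to miss: LinearClassifier.train takes the data as an N x D array, but predict expects a D x N array (one column per sample, see its docstring), which is why the scripts pass the transposed matrices to predict. For example:

```python
# X_val is (1000, 3073), so pass its transpose (3073, 1000) to predict()
y_val_pred = best_svm.predict(X_val.T)
print 'validation accuracy: %f' % np.mean(y_val == y_val_pred)
```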
2、 LinearClassifier_softmax_start.py

```python
__coauthor__ = 'Deeplayer'
# 5.20.2016

import numpy as np
from data_utils import load_CIFAR10
from linear_classifier import *


def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000):
    """
    Load the CIFAR-10 dataset from disk and perform preprocessing to prepare
    it for the linear classifier. These are the same steps as we used for the SVM,
    but condensed to a single function.
    """
    # Load the raw CIFAR-10 data
    cifar10_dir = 'E:/PycharmProjects/ML/CS231n/cifar-10-batches-py'    # make a change
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # subsample the data
    mask = range(num_training, num_training + num_validation)
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = range(num_training)
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = range(num_test)
    X_test = X_test[mask]
    y_test = y_test[mask]

    # Preprocessing: reshape the image data into rows
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_val = np.reshape(X_val, (X_val.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))

    # subtract the mean image
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image

    # add bias dimension and transform into columns
    X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
    X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))])
    X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])

    return X_train, y_train, X_val, y_val, X_test, y_test
# Invoke the above function to get our data.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()

# Use the validation set to tune hyperparameters (regularization strength
# and learning rate).
results = {}
best_val = -1
best_softmax = None
learning_rates = [1e-7, 5e-7]
regularization_strengths = [5e4, 1e4]
iters = 1500
for lr in learning_rates:
    for rs in regularization_strengths:
        softmax = Softmax()
        softmax.train(X_train, y_train, learning_rate=lr, reg=rs, num_iters=iters)
        Tr_pred = softmax.predict(X_train.T)
        acc_train = np.mean(y_train == Tr_pred)
        Val_pred = softmax.predict(X_val.T)
        acc_val = np.mean(y_val == Val_pred)
        results[(lr, rs)] = (acc_train, acc_val)
        if best_val < acc_val:
            best_val = acc_val
            best_softmax = softmax

# Print out results.
for lr, reg in sorted(results):
    train_accuracy, val_accuracy = results[(lr, reg)]
    print 'lr %e reg %e train accuracy: %f val accuracy: %f' % (
        lr, reg, train_accuracy, val_accuracy)
print 'best validation accuracy achieved during cross-validation: %f' % best_val    # around 38.9%

# Evaluate the best softmax on test set.
Ts_pred = best_softmax.predict(X_test.T)
test_accuracy = np.mean(y_test == Ts_pred)    # around 37.4%
print 'Softmax on raw pixels of CIFAR-10 final test set accuracy: %f' % test_accuracy
```
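Since train() records the loss at every iteration, plotting the returned loss history is a quick way to sanity-check the learning rate. A minimal sketch (the learning rate and regularization strength here are just one combination from the grid above, not necessarily the best one):

```python
import matplotlib.pyplot as plt

# Re-train one configuration and plot the per-iteration loss returned by train()
softmax = Softmax()
loss_hist = softmax.train(X_train, y_train, learning_rate=1e-7, reg=5e4,
                          num_iters=1500, verbose=False)
plt.plot(loss_hist)
plt.xlabel('Iteration number')
plt.ylabel('Loss value')
plt.show()
```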
Finally, taking the SVM as an example, let's compare the speed of the vectorized and non-vectorized implementations:
--> naive_vs_vectorized.py
```python
__coauthor__ = 'Deeplayer'
# 5.20.2016

import time
import numpy as np
from linear_svm import *
from data_utils import load_CIFAR10


def get_CIFAR10_data(num_training=49000, num_dev=500):
    # Load the raw CIFAR-10 data
    cifar10_dir = 'E:/PycharmProjects/ML/CS231n/cifar-10-batches-py'    # make a change
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
    mask = range(num_training)
    X_train = X_train[mask]
    mask = np.random.choice(num_training, num_dev, replace=False)
    X_dev = X_train[mask]
    y_dev = y_train[mask]
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_dev = np.reshape(X_dev, (X_dev.shape[0], -1))
    mean_image = np.mean(X_train, axis=0)
    X_dev -= mean_image
    X_dev = np.hstack([X_dev, np.ones((X_dev.shape[0], 1))])
    return X_dev, y_dev
X_dev, y_dev = get_CIFAR10_data()
# generate a random SVM weight matrix of small numbers
W = np.random.randn(3073, 10) * 0.0001
tic = time.time()
loss_naive, grad_naive = svm_loss_naive(W, X_dev, y_dev, 0.00001)
toc = time.time()
print 'Naive loss and gradient: computed in %fs' % (toc - tic) # around 0.198s
tic = time.time()
loss_vectorized, grad_vectorized = svm_loss_vectorized(W, X_dev, y_dev, 0.00001)
toc = time.time()
print 'Vectorized loss and gradient: computed in %fs' % (toc - tic)    # around 0.005s
```
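Speed aside, it is worth confirming that the two implementations actually agree. Continuing from the timing code above, the losses should match to floating-point precision and the gradient difference should be essentially zero:

```python
# Compare the naive and vectorized results computed above
print 'loss difference: %f' % abs(loss_naive - loss_vectorized)
print 'gradient difference: %f' % np.linalg.norm(grad_naive - grad_vectorized, ord='fro')
```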