import numpy as np
from skimage.feature import hog
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.svm import SVC


def build_vocabulary(images, vocab_size):
    """
    This function should sample HOG descriptors from the training images,
    cluster them with kmeans, and then return the cluster centers.

    Inputs:
        images: An n x 256 x 256 numpy matrix where n is the number of images;
            each index along the first dimension gives the 256x256 matrix
            representing the corresponding image.
        vocab_size: an integer indicating the number of words desired for the
            bag of words vocab set.

    Outputs:
        A vocab_size x (z*z*9) array (see below) which contains the cluster
        centers that result from the K Means clustering.

    You'll need to generate HOG features using the skimage.feature.hog() function.
    The documentation is available here:
    http://scikit-image.org/docs/dev/api/skimage.feature.html#skimage.feature.hog

    However, the documentation is a bit confusing, so we will highlight some
    important arguments to consider:

        cells_per_block: The hog function breaks the image into evenly-sized
            blocks, which are further broken down into cells, each made of
            pixels_per_cell pixels (see below). Setting this parameter tells the
            function how many cells to include in each block. This is a tuple of
            width and height. Your SIFT implementation, which had a total of
            16 cells, was equivalent to setting this argument to (4, 4).

        pixels_per_cell: This controls the width and height of each cell
            (in pixels). Like cells_per_block, it is a tuple. In your SIFT
            implementation, each cell was 4 pixels by 4 pixels, so (4, 4).

        feature_vector: This argument is a boolean which tells the function
            what shape it should use for the return array. When set to True,
            it returns one long array. We recommend setting it to True and
            reshaping the result rather than working with the default value,
            as the default is very confusing.

    It is up to you to choose your cells per block and pixels per cell. Choose
    values that generate reasonably-sized feature vectors and produce good
    classification results. For each cell, HOG produces a histogram (feature
    vector) of length 9. We want one feature vector per block. To do this we
    can append the histograms for each cell together. Let's say you set
    cells_per_block = (z, z). This means that the length of your feature vector
    for the block will be z*z*9.

    With feature_vector=True, hog() will return one long np array containing every
    cell histogram concatenated end to end. We want to break this up into a
    list of (z*z*9) block feature vectors. We can do this using a really nifty numpy
    function. When using np.reshape, you can set the length of one dimension to
    -1, which tells numpy to make this dimension as big as it needs to be to
    accommodate all of the data, based on the other dimensions. So if we want to
    break our long np array (long_boi) into rows of z*z*9 feature vectors, we can
    use small_bois = long_boi.reshape(-1, z*z*9).

    The number of feature vectors that come from this reshape depends on the size
    of the image you give to hog(); it will fit as many blocks as it can on the
    image. You can choose to resize (or crop) each image to a consistent size
    (therefore creating the same number of feature vectors per image), or you
    can find feature vectors in the original-sized image.

    ONE MORE THING
    If we returned all the features we found as our vocabulary, we would have an
    absolutely massive vocabulary. That would make matching inefficient AND
    inaccurate! So we use K Means clustering to find a much smaller (vocab_size)
    number of representative points. We recommend using sklearn.cluster.KMeans
    to do this. Note that this can take a VERY LONG TIME to complete (upwards
    of ten minutes for large numbers of features and large max_iter), so set
    the max_iter argument to something low (we used 100) and be patient. You
    may also find success setting the "tol" argument (see the documentation for
    details).
    """
    N, _, _ = images.shape
    cells_per_block = 1
    pixels_per_cell = 16
    # With 1x1 cells per block and 16x16-pixel cells, hog() yields
    # (256 / 16)^2 = 256 block feature vectors of length 9 per 256x256 image.
    features_per_img = int((256 / pixels_per_cell) ** 2)

    features = np.zeros((N * features_per_img, cells_per_block * cells_per_block * 9))
    print("Extracting features to build vocabulary.")
    for i in range(N):
        # hog() returns one long vector; reshape it into one row per block.
        features[features_per_img * i: features_per_img * (i + 1), :] = \
            hog(images[i, :, :],
                cells_per_block=(cells_per_block, cells_per_block),
                pixels_per_cell=(pixels_per_cell, pixels_per_cell),
                feature_vector=True).reshape(-1, cells_per_block * cells_per_block * 9)
    print("Features extracted.")

    print("Clustering features to create bag of words.")
    kmeans = KMeans(n_clusters=vocab_size, max_iter=100, verbose=True).fit(features)
    print("Done clustering.")
    return kmeans.cluster_centers_
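

# A minimal usage sketch (an assumption, not part of the assignment code):
# get_bags_of_words() below expects the vocabulary returned here to have been
# saved to 'vocab.npy' beforehand, e.g.
#
#     vocab = build_vocabulary(train_images, vocab_size)  # train_images is hypothetical
#     np.save('vocab.npy', vocab)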


def get_bags_of_words(images):
    """
    This function should take in an array of images and calculate a bag of
    words histogram for each image, then return those histograms in an array.

    Inputs:
        images: An n x 256 x 256 numpy matrix where n is the number of images;
            each index along the first dimension gives the 256x256 matrix
            representing the corresponding image.

    Outputs:
        An n x d numpy matrix, where d is the size of the histogram built for
        each image.

    Use the same hog function to extract feature vectors as before (see
    build_vocabulary). It is important that you use the same hog settings for
    both build_vocabulary and get_bags_of_words! Otherwise, you will end up
    with different feature representations between your vocab and your test
    images, and you won't be able to match anything at all!

    After getting the feature vectors for an image, you will build up a
    histogram that represents what words are contained within the image.
    For each feature, find the closest vocab word, then add 1 to the histogram
    at the index of that word. For example, if the closest vector in the vocab
    is the 103rd word, then you should add 1 to the 103rd histogram bin. Your
    histogram should have as many bins as there are vocabulary words.

    Suggested functions: scipy.spatial.distance.cdist, np.argsort,
    np.linalg.norm, skimage.feature.hog
    """
    vocab = np.load('vocab.npy')
    print('Loaded vocab from file.')

    N, _, _ = images.shape
    cells_per_block = 1
    # Note: pixels_per_cell here (4) differs from the value used in build_vocabulary
    # (16); the per-block feature length (9) still matches the vocabulary entries.
    pixels_per_cell = 4
    features_per_img = int((256 / pixels_per_cell) ** 2)

    features = np.zeros((N * features_per_img, cells_per_block * cells_per_block * 9))
    print("Extracting features to match them against bag of words.")
    for i in range(N):
        features[features_per_img * i: features_per_img * (i + 1), :] = \
            hog(images[i, :, :],
                cells_per_block=(cells_per_block, cells_per_block),
                pixels_per_cell=(pixels_per_cell, pixels_per_cell),
                feature_vector=True).reshape(-1, cells_per_block * cells_per_block * 9)
    print("Features extracted.")

    print("Creating histogram for each image using the bag of words.")
    # Index of the nearest vocabulary word for every block feature vector.
    nearest_features = np.argmin(cdist(features, vocab), axis=1)
    vocab_size = vocab.shape[0]
    images_histogram = np.zeros((N, vocab_size))
    for i, img_feature in enumerate(np.array_split(nearest_features, N)):
        # Count word occurrences; the explicit range keeps bin j aligned with word j.
        images_histogram[i, :] = np.histogram(img_feature, bins=vocab_size,
                                              range=(0, vocab_size), density=False)[0]
    print("Histograms done.")

    # L1-normalise each histogram so images are comparable regardless of feature count.
    row_sums = images_histogram.sum(axis=1)
    images_histogram = images_histogram / row_sums[:, np.newaxis]
    return images_histogram
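

# Design note: np.histogram with an explicit range is used above so that bin j
# stays aligned with vocabulary word j; np.bincount(img_feature, minlength=vocab_size)
# would be an equivalent way to build each row of the histogram matrix.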


def svm_classify(train_image_feats, train_labels, test_image_feats):
    """
    This function will predict a category for every test image by training
    15 many-versus-one linear SVM classifiers on the training data, then
    using those learned classifiers on the testing data.

    Inputs:
        train_image_feats: An n x d numpy array, where n is the number of training
            examples, and d is the image descriptor vector size.
        train_labels: An n x 1 Python list containing the corresponding ground
            truth labels for the training data.
        test_image_feats: An m x d numpy array, where m is the number of test
            images and d is the image descriptor vector size.

    Outputs:
        An m x 1 numpy array of strings, where each string is the predicted label
        for the corresponding image in test_image_feats.

    We suggest you look at the sklearn.svm module, including the LinearSVC
    class. With the right arguments, you can get a 15-class SVM as described
    above in just one call! Be sure to read the documentation carefully.
    """
print("Classifying features.")
clf = SVC(C=4, kernel='rbf', gamma='scale')
clf.fit(train_image_feats, train_labels)
predicted_labels = clf.predict(test_image_feats)
print("Classification done.")
return predicted_labels
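

if __name__ == "__main__":
    # Minimal smoke-test sketch (an assumption, not part of the assignment driver):
    # runs the full pipeline on small random "images" just to check that the pieces
    # fit together. Note that it overwrites 'vocab.npy' in the working directory.
    rng = np.random.default_rng(0)
    train_imgs = rng.random((4, 256, 256))
    test_imgs = rng.random((2, 256, 256))

    np.save('vocab.npy', build_vocabulary(train_imgs, vocab_size=10))
    train_feats = get_bags_of_words(train_imgs)
    test_feats = get_bags_of_words(test_imgs)
    print(svm_classify(train_feats, ['coast', 'coast', 'forest', 'forest'], test_feats))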