# Source code for flexmatcher.classify.nGramClassifier

from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn import linear_model
from flexmatcher.classify import Classifier
import numpy as np


class NGramClassifier(Classifier):

    """Classify data-points using counts of n-gram sequence of words or chars.

    The NGramClassifier uses n-grams of words or characters (based on user
    preference) and extracts features with either a CountVectorizer or a
    HashingVectorizer (based on user preference) to train a classifier. It
    uses a LogisticRegression classifier as its training model.

    Attributes:
        labels (ndarray): Vector storing the labels of each data-point.
            Set by fit().
        features (ndarray): Dense matrix storing the extracted features.
            Set by fit().
        vectorizer (object): Vectorizer for transforming text to features. It
            will be either of type CountVectorizer or HashingVectorizer.
        lrm (LogisticRegression): The classifier instance trained on the
            full training data. Set by fit().
        num_classes (int): Number of classes/columns to match to.
        all_classes (ndarray): Sorted array of all possible classes.
    """

    def __init__(self, ngram_range=(1, 1), analyzer='word', count=True,
                 n_features=200):
        """Initializes the classifier.

        Args:
            ngram_range (tuple): Pair of ints specifying the range of ngrams.
            analyzer (string): Determines what type of analyzer to be used.
                Setting it to 'word' will consider each word as a unit of
                language and 'char' will consider each character as a unit
                of language.
            count (boolean): If True a CountVectorizer is used (n-gram
                counts); otherwise a HashingVectorizer is used.
            n_features (int): Maximum number of features used. Passed as
                ``max_features`` to CountVectorizer or as ``n_features`` to
                HashingVectorizer.
        """
        # checking what type of vectorizer to create
        if count:
            self.vectorizer = CountVectorizer(analyzer=analyzer,
                                              ngram_range=ngram_range,
                                              max_features=n_features)
        else:
            # NOTE(review): the HashingVectorizer is built with its default
            # normalisation/binarisation settings; confirm those defaults
            # really give the "binary presence" features callers expect.
            self.vectorizer = HashingVectorizer(analyzer=analyzer,
                                                ngram_range=ngram_range,
                                                n_features=n_features)

    def fit(self, data):
        """Extract features from the training data and train the model.

        Args:
            data (dataframe): Training data (values and their correct column).
                The 'value' column holds the text and the 'class' column
                holds the label of each data-point.
        """
        self.labels = np.array(data['class'])
        self.num_classes = len(data['class'].unique())
        self.all_classes = np.sort(np.unique(self.labels))
        values = list(data['value'])
        # the features are kept dense so that predict_training can index
        # rows of the matrix directly with the fold indices
        self.features = self.vectorizer.fit_transform(values).toarray()
        # training the classifier
        self.lrm = linear_model.LogisticRegression(class_weight='balanced')
        self.lrm.fit(self.features, self.labels)

    def predict_training(self, folds=5):
        """Do cross-validation and return probabilities for each data-point.

        Must be called after fit(), since it reads the stored features and
        labels.

        Args:
            folds (int): Number of folds used for prediction on training data.

        Returns:
            ndarray: One row per training data-point and one column per
            class in ``all_classes``; each row is predicted by a model that
            was trained on the folds not containing that data-point.
        """
        partial_clf = linear_model.LogisticRegression(class_weight='balanced')
        # sized with all_classes so that it always matches the width of the
        # rows produced by predict_proba_ordered
        prediction = np.zeros((self.features.shape[0],
                               self.all_classes.size))
        skf = StratifiedKFold(n_splits=folds)
        for train_index, test_index in skf.split(self.features, self.labels):
            # prepare the training and test data
            training_features = self.features[train_index]
            test_features = self.features[test_index]
            training_labels = self.labels[train_index]
            # fitting the model and predicting
            partial_clf.fit(training_features, training_labels)
            curr_pred = partial_clf.predict_proba(test_features)
            # a fold may not have seen every class, so the columns are
            # re-aligned against the full list of classes
            prediction[test_index] = \
                self.predict_proba_ordered(curr_pred, partial_clf.classes_)
        return prediction

    def predict_proba_ordered(self, probs, classes):
        """Fills out the probability matrix with classes that were missing.

        Args:
            probs (ndarray): Matrix of probabilities, output of
                predict_proba; one column per entry of ``classes``.
            classes (ndarray): Classes the columns of ``probs`` refer to
                (from clf.classes_). Each one is expected to be present in
                ``self.all_classes``.

        Returns:
            ndarray: Matrix with one column per class in
            ``self.all_classes``; columns of classes that are missing from
            ``classes`` are left as zeros.
        """
        # np.float64 is used instead of the np.float alias, which is no
        # longer available in recent NumPy releases
        proba_ordered = np.zeros((probs.shape[0], self.all_classes.size),
                                 dtype=np.float64)
        # map every entry of `classes` to its column index in all_classes
        sorter = np.argsort(self.all_classes)
        idx = sorter[np.searchsorted(self.all_classes, classes, sorter=sorter)]
        proba_ordered[:, idx] = probs
        return proba_ordered

    def predict(self, data):
        """Predict the class for a new given data.

        Must be called after fit().

        Args:
            data (dataframe): Dataframe of values to predict the column for.
                The text is read from the 'value' column.

        Returns:
            ndarray: Output of predict_proba of the trained model; one row
            per value in ``data``.
        """
        values = list(data['value'])
        features = self.vectorizer.transform(values).toarray()
        return self.lrm.predict_proba(features)