from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from sklearn.model_selection import StratifiedKFold
from flexmatcher.classify import Classifier
from sklearn import linear_model
import numpy as np
class CharDistClassifier(Classifier):

    """Classify the data-point using counts of character types in the data.

    The CharDistClassifier extracts 7 simple features from each value: the
    total number of characters, and the count and fraction of digit,
    alphabetical and white-space characters. It then trains a logistic
    regression on top of these features.

    Attributes:
        labels (ndarray): Vector storing the label of each data-point.
        features (ndarray): Matrix storing the extracted features.
        clf (LogisticRegression): The classifier instance.
        num_classes (int): Number of classes/columns to match to.
        all_classes (ndarray): Sorted array of all possible classes.
    """

    def __init__(self):
        """Initializes the classifier."""
        self.clf = linear_model.LogisticRegression(class_weight='balanced')

    @staticmethod
    def _extract_features(data):
        """Build the character-distribution feature matrix for ``data``.

        Args:
            data (dataframe): Data with a 'value' column holding the values
                to featurize.

        Returns:
            ndarray: Float matrix with one row per value and 7 columns, in
            this order: length, digit fraction, digit count, alpha fraction,
            alpha count, white-space fraction, white-space count.
        """
        rows = []
        for val in data['value']:
            length = len(val)
            digit_num = sum(char.isdigit() for char in val)
            alpha_num = sum(char.isalpha() for char in val)
            space_num = sum(char.isspace() for char in val)
            # An empty value has all counts equal to 0; dividing by 1 keeps
            # every fraction at 0 without a division-by-zero.
            denom = length if length else 1
            rows.append((length,
                         digit_num / denom, digit_num,
                         alpha_num / denom, alpha_num,
                         space_num / denom, space_num))
        # reshape keeps the matrix 2-D (0 x 7) even when there are no rows.
        return np.array(rows, dtype=float).reshape(-1, 7)

    def fit(self, data):
        """Extracts features and labels from the data and fits a model.

        Args:
            data (dataframe): Training data (values and their correct column),
                with the values in 'value' and the labels in 'class'.
        """
        self.labels = np.array(data['class'])
        self.all_classes = np.sort(np.unique(self.labels))
        # Derived from all_classes so the width of the matrix allocated in
        # predict_training always matches what predict_proba_ordered returns.
        self.num_classes = len(self.all_classes)
        self.features = self._extract_features(data)
        # training the classifier
        self.clf.fit(self.features, self.labels)

    def predict_training(self, folds=5):
        """Do cross-validation and return probabilities for each data-point.

        Must be called after ``fit``, which stores the features and labels
        used here.

        Args:
            folds (int): Number of folds used for prediction on training data.

        Returns:
            ndarray: Matrix with one row per training data-point and one
            column per class in ``all_classes``.
        """
        partial_clf = linear_model.LogisticRegression(class_weight='balanced')
        prediction = np.zeros((len(self.features), self.num_classes))
        skf = StratifiedKFold(n_splits=folds)
        for train_index, test_index in skf.split(self.features, self.labels):
            # prepare the training and test data
            training_features = self.features[train_index]
            test_features = self.features[test_index]
            training_labels = self.labels[train_index]
            # fitting the model and predicting
            partial_clf.fit(training_features, training_labels)
            curr_pred = partial_clf.predict_proba(test_features)
            # A fold's model only reports the classes it was fitted on, so
            # its output is mapped onto the full set of class columns.
            prediction[test_index] = \
                self.predict_proba_ordered(curr_pred, partial_clf.classes_)
        return prediction

    def predict_proba_ordered(self, probs, classes):
        """Fills out the probability matrix with classes that were missing.

        Args:
            probs (ndarray): Matrix of probabilities, output of predict_proba.
            classes (ndarray): Classes matching the columns of ``probs``
                (e.g. ``clf.classes_``).

        Returns:
            ndarray: Matrix with one column per entry of ``all_classes``;
            columns for classes absent from ``classes`` are left at 0.
        """
        # The builtin float replaces the np.float alias, which newer NumPy
        # releases no longer provide; the resulting dtype is the same.
        proba_ordered = np.zeros((probs.shape[0], self.all_classes.size),
                                 dtype=float)
        sorter = np.argsort(self.all_classes)
        idx = sorter[np.searchsorted(self.all_classes, classes, sorter=sorter)]
        proba_ordered[:, idx] = probs
        return proba_ordered

    def predict(self, data):
        """Predict the class for a new given data.

        Args:
            data (dataframe): Dataframe of values to predict the column for.

        Returns:
            ndarray: Output of the fitted classifier's ``predict_proba`` on
            the features extracted from ``data``.
        """
        features = self._extract_features(data)
        return self.clf.predict_proba(features)