In response to Issue #67: Naive Bayes Classifier added along with a unit-testing file; also included a test_use_cases.py file to compare the accuracy of naive Bayes models between numpy-ml and scikit-learn using the dataset in the file wine.data #75

Open · wants to merge 1 commit into base: master
222 changes: 114 additions & 108 deletions numpy_ml/linear_models/naive_bayes.py
@@ -2,8 +2,8 @@


class GaussianNBClassifier:
def __init__(self, eps=1e-6):
r"""
def __init__(self):
"""
A naive Bayes classifier for real-valued data.

Notes
@@ -46,102 +46,113 @@ def __init__(self, eps=1e-6):
associated with class :math:`c`, :math:`\mu_c` and :math:`\Sigma_c`
(where :math:`1 \leq c \leq K`), are estimated via MLE from the set of
training examples with label :math:`c`.
"""
pass

Parameters
----------
eps : float
A value added to the variance to prevent numerical error. Default
is 1e-6.
# Separate the dataset into a subset of data for each class

Attributes
----------
parameters : dict
Dictionary of model parameters: "mean", the `(K, M)` array of
feature means under each class, "sigma", the `(K, M)` array of
feature variances under each class, and "prior", the `(K,)` array of
empirical prior probabilities for each class label.
hyperparameters : dict
Dictionary of model hyperparameters
labels : :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
An array containing the unique class labels for the training
examples.
def separate_classes(self, X, y):
"""
self.labels = None
self.hyperparameters = {"eps": eps}
self.parameters = {
"mean": None, # shape: (K, M)
"sigma": None, # shape: (K, M)
"prior": None, # shape: (K,)
}
Separates the dataset into a subset of data for each class.
Parameters:
------------
X : array, the feature rows
y : list, the target label for each row
Returns:
A dictionary with the class labels as keys and the corresponding feature rows as values.
"""
separated_classes = {}
for i in range(len(X)):
feature_values = X[i]
class_name = y[i]
if class_name not in separated_classes:
separated_classes[class_name] = []
separated_classes[class_name].append(feature_values)
return separated_classes
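As a quick illustration (toy data; the array values here are hypothetical, not part of the PR), separate_classes groups the feature rows by their labels:

import numpy as np

X = np.array([[1.0, 2.0],
              [1.2, 1.9],
              [5.0, 6.0],
              [5.1, 6.2]])
y = np.array(["a", "a", "b", "b"])

clf = GaussianNBClassifier()
groups = clf.separate_classes(X, y)
# groups maps each label to its rows:
# {"a": [array([1. , 2. ]), array([1.2, 1.9])],
#  "b": [array([5. , 6. ]), array([5.1, 6.2])]}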

# Standard deviation and mean are required for the (Gaussian) distribution function

def stat_info(self, X):
"""
Calculates the standard deviation and mean of each feature column.
Parameters:
------------
X : array, the feature rows for one class
Yields:
For each feature column, a dictionary with 'std' and 'mean' as keys and
that column's standard deviation and mean as values.
"""
for feature in zip(*X):
yield {
'std' : np.std(feature),
'mean' : np.mean(feature)
}
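Continuing the toy example above, stat_info is a generator that yields one dict of per-column statistics for each of the `M` feature columns (values shown up to floating-point rounding):

stats = list(clf.stat_info(groups["a"]))
# column 0 holds [1.0, 1.2], column 1 holds [2.0, 1.9]
# stats == [{'std': 0.1, 'mean': 1.1}, {'std': 0.05, 'mean': 1.95}]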

def fit(self, X, y):
"""
Fit the model parameters via maximum likelihood.

Notes
-----
The model parameters are stored in the :py:attr:`parameters` attribute.
The model parameters are stored in the :py:attr:`class_summary` attribute.
The following keys are present:

mean: :py:class:`ndarray <numpy.ndarray>` of shape `(K, M)`
Feature means for each of the `K` label classes
sigma: :py:class:`ndarray <numpy.ndarray>` of shape `(K, M)`
Feature variances for each of the `K` label classes
prior : :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
prior_proba : float
Prior probability of each of the `K` label classes, estimated
empirically from the training data
summary : list of `M` dicts, each with 'std' and 'mean' keys
The per-feature standard deviations and means for each of the
`K` label classes

Parameters
----------
X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
A dataset consisting of `N` examples, each of dimension `M`
y: :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
y : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
The class label for each of the `N` examples in `X`

Returns
-------
self: object
self.class_summary : dict
Dictionary with the prior probability, means, and standard deviations of each class
"""
P = self.parameters
H = self.hyperparameters

self.labels = np.unique(y)

K = len(self.labels)
N, M = X.shape
separated_classes = self.separate_classes(X, y)
self.class_summary = {}

P["mean"] = np.zeros((K, M))
P["sigma"] = np.zeros((K, M))
P["prior"] = np.zeros((K,))
for class_name, feature_values in separated_classes.items():
self.class_summary[class_name] = {
'prior_proba': len(feature_values)/len(X),
'summary': [i for i in self.stat_info(feature_values)],
}
return self.class_summary
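For the toy data above, the fitted structure would look like this (values rounded; shown only as an aside, not part of the diff):

clf.fit(X, y)
# clf.class_summary == {
#   "a": {"prior_proba": 0.5,
#         "summary": [{'std': 0.1, 'mean': 1.1}, {'std': 0.05, 'mean': 1.95}]},
#   "b": {"prior_proba": 0.5,
#         "summary": [{'std': 0.05, 'mean': 5.05}, {'std': 0.1, 'mean': 6.1}]},
# }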

for i, c in enumerate(self.labels):
X_c = X[y == c, :]
# Gaussian distribution function

P["mean"][i, :] = np.mean(X_c, axis=0)
P["sigma"][i, :] = np.var(X_c, axis=0) + H["eps"]
P["prior"][i] = X_c.shape[0] / N
return self

def predict(self, X):
def distribution(self, x, mean, std):
"""
Use the trained classifier to predict the class label for each example
in **X**.
Computes the Gaussian (normal) probability density function for one feature value.

Parameters
----------
X: :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
A dataset of `N` examples, each of dimension `M`
x : float, the value of the feature
mean : float, the mean of the feature under the current class
std : float, the standard deviation of the feature under the current class

Returns
-------
labels : :py:class:`ndarray <numpy.ndarray>` of shape `(N)`
The predicted class labels for each example in `X`
--------
f : float, the normal probability density of `x`
"""
return self.labels[self._log_posterior(X).argmax(axis=1)]

def _log_posterior(self, X):
r"""
Compute the (unnormalized) log posterior for each class.
exponent = np.exp(-((x-mean)**2 / (2*std**2)))
f = exponent / (np.sqrt(2*np.pi)*std)
return f
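A quick sanity check (not part of the PR): at `x == mean` the density should equal the peak of the normal curve, 1 / (sqrt(2 * pi) * std); the same value can be cross-checked against scipy.stats.norm.pdf(x, loc=mean, scale=std) if SciPy is available.

clf = GaussianNBClassifier()
p = clf.distribution(0.0, mean=0.0, std=1.0)
# p ≈ 0.3989, i.e. 1 / np.sqrt(2 * np.pi), the standard normal peak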

# Predict the class label for each example

def predict(self, X):
"""
Use the trained classifier to predict the class label for each example
in **X**.

Parameters
----------
Expand All @@ -150,62 +161,57 @@ def _log_posterior(self, X):

Returns
-------
log_posterior : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
Unnormalized log posterior probability of each class for each
example in `X`
MAPs : list of length `N`
The predicted class labels for each example in `X`
"""
K = len(self.labels)
log_posterior = np.zeros((X.shape[0], K))
for i in range(K):
log_posterior[:, i] = self._log_class_posterior(X, i)
return log_posterior

def _log_class_posterior(self, X, class_idx):
r"""
Compute the (unnormalized) log posterior for the label at index
`class_idx` in :py:attr:`labels`.
# Choose the class with the maximum a posteriori (MAP) probability

Notes
-----
Unnormalized log posterior for example :math:`\mathbf{x}_i` and class
:math:`c` is::
MAPs = []

.. math::
for row in X:
joint_proba = {}

for class_name, features in self.class_summary.items():
total_features = len(features['summary'])
likelihood = 1

\log P(y_i = c \mid \mathbf{x}_i, \theta)
&\propto \log P(y=c \mid \theta) +
\log P(\mathbf{x}_i \mid y_i = c, \theta) \\
&\propto \log P(y=c \mid \theta) +
\sum_{j=1}^M \log P(x_j \mid y_i = c, \theta)
for idx in range(total_features):
feature = row[idx]
mean = features['summary'][idx]['mean']
stdev = features['summary'][idx]['std']
normal_proba = self.distribution(feature, mean, stdev)
likelihood *= normal_proba
prior_proba = features['prior_proba']
joint_proba[class_name] = prior_proba * likelihood

In the Gaussian naive Bayes model, the feature likelihood for class
:math:`c`, :math:`P(\mathbf{x}_i \mid y_i = c, \theta)` is assumed to
be normally distributed
MAP = max(joint_proba, key=joint_proba.get)
MAPs.append(MAP)

.. math::
return MAPs
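Putting the pieces together on the toy data from above: fit builds class_summary and predict returns a plain Python list of MAP labels. One design note: unlike the removed _log_posterior implementation, this version multiplies raw densities (likelihood *= normal_proba), which can underflow when `M` is large; summing np.log of the densities would be the numerically safer equivalent.

clf = GaussianNBClassifier()
clf.fit(X, y)                          # populates clf.class_summary
preds = clf.predict(np.array([[1.1, 2.0]]))
# preds == ['a'], the label whose prior * likelihood is largest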

\mathbf{x}_i \mid y_i = c, \theta \sim \mathcal{N}(\mu_c, \Sigma_c)
# Calculate the model's accuracy

def accuracy(self, y_test, y_pred):
"""
Calculates the model's accuracy by comparing true and predicted labels.

Parameters
----------
X: :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
A dataset of `N` examples, each of dimension `M`
class_idx : int
The index of the current class in :py:attr:`labels`
------------
y_test: :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
The true class label for each of the `N` examples in `X`
y_pred: :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
The predicted class label for each of the `N` examples in `X`

Returns
-------
log_class_posterior : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
Unnormalized log probability of the label at index `class_idx`
in :py:attr:`labels` for each example in `X`
--------
acc : float, between 0 and 1, the fraction of correct predictions.
The accuracy of the numpy-ml GaussianNB model on the given labels
"""
P = self.parameters
mu = P["mean"][class_idx]
prior = P["prior"][class_idx]
sigsq = P["sigma"][class_idx]

# log likelihood = log X | N(mu, sigsq)
log_likelihood = -0.5 * np.sum(np.log(2 * np.pi * sigsq))
log_likelihood -= 0.5 * np.sum(((X - mu) ** 2) / sigsq, axis=1)
return log_likelihood + np.log(prior)

n_correct = 0  # count of predictions that match the true label
for y_t, y_p in zip(y_test, y_pred):
if y_t == y_p:
n_correct += 1
acc = n_correct / len(y_test)
return acc
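For context, the PR title mentions a test_use_cases.py that compares this model against scikit-learn on the wine data. A minimal sketch of that kind of comparison (an assumption: this loads the wine dataset via scikit-learn rather than reading the wine.data file the PR ships):

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

data = load_wine()
X_tr, X_te, y_tr, y_te = train_test_split(
    data.data, data.target, test_size=0.3, random_state=0
)

ours = GaussianNBClassifier()
ours.fit(X_tr, y_tr)
acc_ours = ours.accuracy(y_te, ours.predict(X_te))

ref = GaussianNB().fit(X_tr, y_tr)
acc_ref = ref.score(X_te, y_te)
# Both fit one Gaussian per class and feature, so the two
# accuracies should be close.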