# DSCI552 Assignment 4 Naive Bayes Classifier and Theory Questions solution

\$25.00

Original Work ?

## Description

5/5 - (1 vote)

For this assignment you will implement a Naive Bayes classifier that follows the scikit-learn classifier API, with `fit`, `predict`, and `score` methods.

The Naive Bayes classifier takes as a parameter the density function used in the likelihood calculation:

• `normal`: Normal density function
• `knn`: K nearest neighbor density function

Most of the code has already been written for you. You only need to fill in the missing parts between the markers

`## Insert your code BEGIN` and `## Insert your code END`.

``````from functools import partial

import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import norm``````
class NaiveBayesClassifier:
    """Naive Bayes classifier with a selectable per-feature density estimate.

    The class follows the scikit-learn convention of ``fit``, ``predict`` and
    ``score``. Features are treated as conditionally independent given the
    class, so the class-conditional likelihood of an instance is the product
    of one-dimensional densities, one per feature.

    Two density estimates are supported through ``likelihood``:

    * ``'normal'``: a Gaussian fitted per class and per feature.
    * ``'knn'``: the k-nearest-neighbour density estimate (Alpaydin Eq 8.8).

    Notation used in the shape comments:
        K = number of unique classes
        N = number of instances passed to ``predict``
        d = number of inputs (input dimensionality)
    """

    # Single source for the message so ``fit`` and ``generate_likelihoods``
    # report an unsupported ``likelihood`` identically.
    _INVALID_LIKELIHOOD = 'Invalid value for likelihood. Must be "normal" or "knn".'

    def __init__(self, likelihood='normal', k=None):
        """
        :param likelihood: ``'normal'`` or ``'knn'``; validated in ``fit``
        :param k: number of neighbours, required when ``likelihood='knn'``
        """
        self.likelihood = likelihood

        # Numpy array of unique classes, shape = (K,)
        self.classes = None

        # Numpy array of class priors, P(C), shape = (K,)
        self.priors = None

        # Numpy array of likelihoods, P(x|C), shape = (N, K)
        self.likelihoods = None

        # Numpy array of posterior probabilities, P(C|x), shape = (N, K)
        self.posteriors = None

        ## For the Gaussian density
        # means, shape = (K, d)
        self.avgs = None
        # variances, shape = (K, d)
        self.vars = None

        ## For the knn density
        # number of neighbors to use
        self.k = k
        # training X, kept because the knn estimate needs the raw samples
        self.X_train = None
        # training y
        self.y_train = None

    def generate_classes(self, y):
        """
        Generate the classes based on y, and store in self.classes

        :param y: array of class targets
        """
        # np.unique returns the labels sorted in ascending order.
        self.classes = np.unique(y)

    def generate_priors(self, y):
        """
        Compute the prior probabilities and store in self.priors

        The prior of a class is its relative frequency in ``y``. The entries
        are ordered like ``self.classes``.

        :param y: array of class targets
        """
        y = np.asarray(y)
        if self.classes is None:
            self.generate_classes(y)
        # (n, 1) == (1, K) -> (n, K) membership table; column sums are counts.
        counts = (y[:, np.newaxis] == self.classes[np.newaxis, :]).sum(axis=0)
        self.priors = counts / y.shape[0]

    def knn_density_function(self, x_train, x_predict):
        """
        Implements k-nearest neighbor density estimate (Alpaydin Eq 8.8):
        p(x) = k / (2 * n * d_k(x)), where n is the number of training
        points and d_k(x) is the distance from x to its k-th nearest one.

        :param x_train: 1d numpy array
        :param x_predict: 1d numpy array
        :returns: probabilities at x_predict, shape = x_predict.shape
        :raises ValueError: if ``self.k`` exceeds the number of training points
        """
        x_train = np.asarray(x_train, dtype=float)
        x_predict = np.asarray(x_predict, dtype=float)
        n_train = x_train.shape[0]
        if self.k > n_train:
            raise ValueError(
                'k must not exceed the number of training samples per class.'
            )

        # Distance from every query point to every training point, then the
        # k-th smallest distance per query point.
        dist = np.abs(x_predict[:, np.newaxis] - x_train[np.newaxis, :])
        dist_k = np.partition(dist, self.k - 1, axis=1)[:, self.k - 1]

        # A zero distance (the query coincides with k training points) gives
        # an infinite estimate; it is capped at 1 together with every other
        # value above 1, as Eq 8.8 is not bounded by 1.
        with np.errstate(divide='ignore'):
            density = self.k / (2.0 * n_train * dist_k)
        return np.minimum(density, 1.0)

    # Gaussian part
    def generate_avgs(self, X, y):
        """
        Return mean for each class and for each attribute

        :returns: numpy array, shape = (K, d), rows ordered like self.classes
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        return np.array([X[y == aclass].mean(axis=0) for aclass in self.classes])

    def generate_vars(self, X, y):
        """
        Return variance for each class and for each attribute

        The population variance (divisor n) is used. A feature that is
        constant within a class has zero variance, for which the Gaussian
        density is not defined.

        :returns: numpy array, shape = (K, d), rows ordered like self.classes
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        return np.array([X[y == aclass].var(axis=0) for aclass in self.classes])

    def generate_guassian_likelihoods(self, X):
        """
        Gaussian class-conditional likelihoods.

        :param X: numpy array, shape = (N, d)
        :returns: numpy array P(x|C), shape = (N, K)
        """
        X = np.asarray(X, dtype=float)
        stds = np.sqrt(self.vars)
        # (N, 1, d) against (K, d) broadcasts to (N, K, d): one density per
        # instance, class and feature.
        densities = norm.pdf(X[:, np.newaxis, :], loc=self.avgs, scale=stds)
        # Naive Bayes assumption: multiply the per-feature densities.
        return densities.prod(axis=2)

    def generate_knn_likelihoods(self, X):
        """
        k-nearest-neighbour class-conditional likelihoods.

        :param X: numpy array, shape = (N, d)
        :returns: numpy array P(x|C), shape = (N, K)
        """
        X = np.asarray(X, dtype=float)
        likelihoods = np.ones((X.shape[0], len(self.classes)))
        for i, aclass in enumerate(self.classes):
            # Only the training samples of this class enter its density.
            members = self.X_train[self.y_train == aclass]
            for attr in range(X.shape[1]):
                likelihoods[:, i] *= self.knn_density_function(
                    members[:, attr], X[:, attr]
                )
        return likelihoods

    def fit(self, X, y):
        """
        Estimate the classes, the priors and the density parameters.

        :param X: training inputs, shape = (n_samples, d)
        :param y: training targets, shape = (n_samples,)
        :returns: self
        :raises ValueError: for an unsupported ``likelihood``, inconsistent
            or empty inputs, or a missing/invalid ``k`` in knn mode
        """
        if self.likelihood not in ('normal', 'knn'):
            raise ValueError(self._INVALID_LIKELIHOOD)

        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError('X must be a 2-D array of shape (n_samples, n_features).')
        if X.shape[0] == 0 or X.shape[0] != y.shape[0]:
            raise ValueError('X and y must be non-empty and have the same number of rows.')

        # define the classes in ascending order
        self.generate_classes(y)
        # compute the prior probabilities
        self.generate_priors(y)

        if self.likelihood == 'normal':
            # per-class, per-feature Gaussian parameters
            self.avgs = self.generate_avgs(X, y)
            self.vars = self.generate_vars(X, y)
        else:
            if self.k is None or self.k < 1:
                raise ValueError('k must be a positive integer when likelihood="knn".')
            # the knn estimate is non-parametric: keep the training data
            self.X_train = X
            self.y_train = y
        return self

    def generate_likelihoods(self, X):
        """
        Compute P(x|C) with the configured density and store it.

        :param X: numpy array, shape = (N, d)
        :returns: numpy array, shape = (N, K)
        :raises ValueError: for an unsupported ``likelihood``
        """
        if self.likelihood == "normal":
            self.likelihoods = self.generate_guassian_likelihoods(X)
        elif self.likelihood == "knn":
            self.likelihoods = self.generate_knn_likelihoods(X)
        else:
            raise ValueError(self._INVALID_LIKELIHOOD)
        return self.likelihoods

    def predict(self, X):
        """
        Predict the most probable class for every row of X.

        Stores the likelihoods in ``self.likelihoods`` and the posteriors in
        ``self.posteriors`` (both of shape (N, K)).

        :param X: numpy array, shape = (N, d)
        :returns: numpy array of predicted class labels, shape = (N,)
        :raises RuntimeError: if called before ``fit``
        """
        if self.classes is None:
            raise RuntimeError('This classifier is not fitted yet; call fit first.')

        X = np.asarray(X, dtype=float)
        self.likelihoods = self.generate_likelihoods(X)

        # Bayes' rule: P(C|x) = P(x|C) P(C) / sum_C P(x|C) P(C)
        joint = self.likelihoods * self.priors
        evidence = joint.sum(axis=1, keepdims=True)
        # Rows whose evidence is zero (every likelihood vanished) cannot be
        # normalised; they keep the prior instead of turning into NaN.
        posteriors = np.tile(self.priors.astype(float), (joint.shape[0], 1))
        np.divide(joint, evidence, out=posteriors, where=evidence > 0)
        self.posteriors = posteriors

        prediction = self.classes[np.argmax(self.posteriors, axis=1)]
        return prediction

    def score(self, X, y, sample_weight=None):
        """
        Return the accuracy of ``predict(X)`` against the true labels ``y``.

        :param X: numpy array, shape = (N, d)
        :param y: true labels, shape = (N,)
        :param sample_weight: optional per-sample weights
        """
        # accuracy_score expects (y_true, y_pred) in that order.
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
# Load the Iris data set. The import lives here because the file's import
# block does not bring ``load_iris`` into scope.
from sklearn.datasets import load_iris

iris = load_iris()
x = iris['data']
y = iris['target']


def print_class_samples(title, values):
    """Print ``title`` followed by five rows starting at rows 0, 50 and 100.

    NOTE(review): the offsets presumably pick the first samples of each Iris
    class (50 consecutive samples per class) -- confirm against the data set.
    """
    print(title)
    for start in (0, 50, 100):
        print(values[start:start + 5])


# --- Gaussian (normal) likelihood -------------------------------------------

# Create an instance of the classifier with a normal likelihood distribution
clf = NaiveBayesClassifier(likelihood='normal')

# Fit the classifier to the training data
clf.fit(x, y)

# Use the classifier to make predictions (here: on the training data itself)
y_pred = clf.predict(x)

# Evaluate the accuracy of the classifier
accuracy = clf.score(x, y)
print('Accuracy:', accuracy)

np.set_printoptions(precision=3)

print("\nmeans:\n", clf.avgs)

print("\nvariances:\n", clf.vars)

print('\nprior probability:\n', clf.priors)

print_class_samples('\nlikelihoods:', clf.likelihoods)
print_class_samples('\nposteriors:', clf.posteriors)
print_class_samples('\npredictions:', y_pred)

# --- k-nearest-neighbour likelihood -----------------------------------------

# Create an instance of the classifier with a knn likelihood (k = 3)
clf = NaiveBayesClassifier(likelihood='knn', k=3)

# Fit the classifier to the training data
clf.fit(x, y)

# Use the classifier to make predictions (here: on the training data itself)
y_pred = clf.predict(x)

# Evaluate the accuracy of the classifier
accuracy = clf.score(x, y)
print('Accuracy:', accuracy)

print('prior probability:\n', clf.priors)

print_class_samples('\nlikelihoods:', clf.likelihoods)
print_class_samples('\nposteriors:', clf.posteriors)
print_class_samples('\npredictions:', y_pred)

DSCI - 552: Machine Learning for Data Science
Assignment - 4
Theory Questions
1) How can we detect outliers after hierarchical clustering?
2) In building a regression tree, instead of the mean we can use the median,
and instead of minimizing the squared error we can minimize the absolute
error. Why does this help in the case of noise?``````