The formula of Bayes' theorem is very simple:

$$P(Y \mid X) = \frac{P(X \mid Y)\,P(Y)}{P(X)}$$

It is often used to solve classification problems, with $X$ a sample's feature vector and $Y$ its class.
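To make the formula concrete with made-up numbers: suppose 30% of incoming email is spam, the word "free" appears in 60% of spam, and in 10% of non-spam. Then

$$P(\text{spam} \mid \text{free}) = \frac{0.6 \times 0.3}{0.6 \times 0.3 + 0.1 \times 0.7} = \frac{0.18}{0.25} = 0.72$$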
Its Chinese name, 朴素贝叶斯 (literally "plain Bayes"), is the more descriptive one, and everyone knows what "naive" means: the naivety of Naive Bayes lies in the assumption that all attributes (i.e. features) are conditionally independent of one another given the class. This can be expressed as:

$$P(X \mid Y) = P(x_1 \mid Y)\,P(x_2 \mid Y)\cdots P(x_n \mid Y) = \prod_{i=1}^{n} P(x_i \mid Y)$$
From this, Bayes' theorem above can be rewritten as:

$$P(Y \mid X) = \frac{P(Y)\prod_{i=1}^{n} P(x_i \mid Y)}{P(X)}$$
Take the class $Y$ that maximizes this posterior as the predicted class of the sample:

$$\hat{y} = \arg\max_{Y} P(Y \mid X)$$

For a given sample, $P(X)$ does not change across candidate classes, so it suffices to compare the numerators $P(Y)\prod_{i=1}^{n} P(x_i \mid Y)$.
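As a minimal sketch of this decision rule (the priors and likelihoods below are made-up numbers for two hypothetical classes, not values from the iris data):

import numpy as np

# hypothetical class priors p(y)
py = {'A': 0.6, 'B': 0.4}
# hypothetical likelihoods p(xi|y) for the two observed feature values
pxy = {'A': [0.5, 0.2],
       'B': [0.3, 0.7]}

# compare p(y) * p(x1|y) * p(x2|y) for each class
scores = {yi: py[yi] * np.prod(pxy[yi]) for yi in py}
print(scores)                       # {'A': 0.06, 'B': 0.084}, up to float rounding
print(max(scores, key=scores.get))  # 'B' wins despite its smaller prior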
'''
Naive Bayes Model
'''
import numpy as np
from sklearn.datasets import load_iris
from collections import defaultdict
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20


def load_data():
    '''
    Load the iris data
    '''
    data = load_iris()
    return data['data'], data['target']


class NBClassifier(object):

    def __init__(self):
        self.y = []                    # set of labels
        self.x = []                    # set of observed values for each attribute
        self.py = defaultdict(float)   # probability distribution of the labels
        self.pxy = defaultdict(dict)   # probability distribution of each attribute under each label
        self.n = 5                     # number of discretization levels

    def prob(self, element, arr):
        '''
        Compute how often an element appears in a list
        '''
        prob = 0.0
        for a in arr:
            if element == a:
                prob += 1 / len(arr)
        if prob == 0.0:
            prob = 0.001  # smooth zero counts so one unseen value cannot zero out the whole product
        return prob

    def get_set(self, x, y):
        self.y = list(set(y))
        for i in range(x.shape[1]):
            self.x.append(list(set(x[:, i])))  # record the value set of each column

    def fit(self, x, y):
        '''
        Train the model
        '''
        x = self.preprocess(x)
        self.get_set(x, y)
        # 1. estimate p(y)
        for yi in self.y:
            self.py[yi] = self.prob(yi, y)
        # 2. estimate p(x|y)
        for yi in self.y:
            for i in range(x.shape[1]):
                sample = x[y == yi, i]  # column i of the samples labelled yi
                # probability distribution of this column under label yi
                pxy = [self.prob(xi, sample) for xi in self.x[i]]
                self.pxy[yi][i] = pxy
        print("train score", self.score(x, y))

    def predict_one(self, x):
        '''
        Predict a single sample
        '''
        max_prob = 0.0
        max_yi = self.y[0]
        for yi in self.y:
            prob_y = self.py[yi]
            for i in range(len(x)):
                if x[i] in self.x[i]:
                    prob_x_y = self.pxy[yi][i][self.x[i].index(x[i])]  # p(xi|y)
                else:
                    prob_x_y = 0.001  # value level never seen during training
                prob_y *= prob_x_y  # accumulate p(x1|y)p(x2|y)...p(xn|y)p(y)
            if prob_y > max_prob:
                max_prob = prob_y
                max_yi = yi
        return max_yi

    def predict(self, samples):
        '''
        Predict a batch of samples
        '''
        samples = self.preprocess(samples)
        y_list = []
        for m in range(samples.shape[0]):
            yi = self.predict_one(samples[m, :])
            y_list.append(yi)
        return np.array(y_list)

    def preprocess(self, x):
        '''
        Because the value sets of different features differ hugely in size,
        parts of the probability matrix become sparse; discretize the data first
        '''
        x = np.array(x, dtype=float)  # work on a copy so the caller's array is not modified in place
        for i in range(x.shape[1]):
            x[:, i] = self.step(x[:, i], self.n)
        return x

    def step(self, arr, n):
        '''
        Discretize a column into n levels
        '''
        ma = max(arr)
        mi = min(arr)
        for i in range(len(arr)):
            for j in range(n):
                a = mi + (ma - mi) * (j / n)
                b = mi + (ma - mi) * ((j + 1) / n)
                if a <= arr[i] <= b:
                    arr[i] = j + 1
                    break
        return arr

    def score(self, x, y):
        y_test = self.predict(x)
        score = 0.0
        for i in range(len(y)):
            if y_test[i] == y[i]:
                score += 1 / len(y)
        return score


if __name__ == "__main__":
    x, y = load_data()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=100)
    clf = NBClassifier()
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    print('test score', score)
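As a sanity check (not part of the original source), scikit-learn ships its own Naive Bayes estimators; GaussianNB models each feature with a class-conditional Gaussian instead of discretizing, and can be run on the same split for comparison:

from sklearn.naive_bayes import GaussianNB

x, y = load_data()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=100)
gnb = GaussianNB()  # one Gaussian per feature per class, no manual binning needed
gnb.fit(x_train, y_train)
print('sklearn GaussianNB test score', gnb.score(x_test, y_test))

The hand-rolled classifier trades precision for simplicity by binning each feature into five levels; GaussianNB keeps the features continuous.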
Source code address: