The formula of Bayes' theorem is very simple:

$$P(Y \mid X) = \frac{P(X \mid Y)\,P(Y)}{P(X)}$$

It is often used to solve classification problems, with $X$ a sample's feature vector and $Y$ its class.
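To make the formula concrete with made-up numbers: suppose 30% of incoming email is spam, the word "free" appears in 60% of spam, and in 10% of non-spam. Then

$$P(\text{spam} \mid \text{free}) = \frac{0.6 \times 0.3}{0.6 \times 0.3 + 0.1 \times 0.7} = \frac{0.18}{0.25} = 0.72$$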
Its Chinese name, 朴素贝叶斯 (literally "plain Bayes"), is the more descriptive one, and everyone knows what "naive" means: the naivety of Naive Bayes lies in the assumption that all attributes (i.e. features) are conditionally independent of one another given the class. This can be expressed as:

$$P(X \mid Y) = P(x_1 \mid Y)\,P(x_2 \mid Y)\cdots P(x_n \mid Y) = \prod_{i=1}^{n} P(x_i \mid Y)$$
From this, Bayes' theorem above can be rewritten as:

$$P(Y \mid X) = \frac{P(Y)\prod_{i=1}^{n} P(x_i \mid Y)}{P(X)}$$
Take the class $Y$ that maximizes this posterior as the predicted class of the sample:

$$\hat{y} = \arg\max_{Y} P(Y \mid X)$$

For a given sample, $P(X)$ does not change across candidate classes, so it suffices to compare the numerators $P(Y)\prod_{i=1}^{n} P(x_i \mid Y)$.
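As a minimal sketch of this decision rule (the priors and likelihoods below are made-up numbers for two hypothetical classes, not values from the iris data):

import numpy as np

# hypothetical class priors p(y)
py = {'A': 0.6, 'B': 0.4}
# hypothetical likelihoods p(xi|y) for the two observed feature values
pxy = {'A': [0.5, 0.2],
       'B': [0.3, 0.7]}

# compare p(y) * p(x1|y) * p(x2|y) for each class
scores = {yi: py[yi] * np.prod(pxy[yi]) for yi in py}
print(scores)                       # {'A': 0.06, 'B': 0.084}, up to float rounding
print(max(scores, key=scores.get))  # 'B' wins despite its smaller prior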
'''
Naive Bayes Model
'''
import numpy as np
from sklearn.datasets import load_iris
from collections import defaultdict
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20


def load_data():
    '''
    Load the iris data
    '''
    data = load_iris()
    return data['data'], data['target']


class NBClassifier(object):

    def __init__(self):
        self.y = []                    # set of labels
        self.x = []                    # set of observed values for each attribute
        self.py = defaultdict(float)   # probability distribution of the labels
        self.pxy = defaultdict(dict)   # probability distribution of each attribute under each label
        self.n = 5                     # number of discretization levels

    def prob(self, element, arr):
        '''
        Compute how often an element appears in a list
        '''
        prob = 0.0
        for a in arr:
            if element == a:
                prob += 1 / len(arr)
        if prob == 0.0:
            prob = 0.001  # smooth zero counts so one unseen value cannot zero out the whole product
        return prob

    def get_set(self, x, y):
        self.y = list(set(y))
        for i in range(x.shape[1]):
            self.x.append(list(set(x[:, i])))  # record the value set of each column

    def fit(self, x, y):
        '''
        Train the model
        '''
        x = self.preprocess(x)
        self.get_set(x, y)
        # 1. estimate p(y)
        for yi in self.y:
            self.py[yi] = self.prob(yi, y)
        # 2. estimate p(x|y)
        for yi in self.y:
            for i in range(x.shape[1]):
                sample = x[y == yi, i]  # column i of the samples labelled yi
                # probability distribution of this column under label yi
                pxy = [self.prob(xi, sample) for xi in self.x[i]]
                self.pxy[yi][i] = pxy
        print("train score", self.score(x, y))

    def predict_one(self, x):
        '''
        Predict a single sample
        '''
        max_prob = 0.0
        max_yi = self.y[0]
        for yi in self.y:
            prob_y = self.py[yi]
            for i in range(len(x)):
                if x[i] in self.x[i]:
                    prob_x_y = self.pxy[yi][i][self.x[i].index(x[i])]  # p(xi|y)
                else:
                    prob_x_y = 0.001  # value level never seen during training
                prob_y *= prob_x_y  # accumulate p(x1|y)p(x2|y)...p(xn|y)p(y)
            if prob_y > max_prob:
                max_prob = prob_y
                max_yi = yi
        return max_yi

    def predict(self, samples):
        '''
        Predict a batch of samples
        '''
        samples = self.preprocess(samples)
        y_list = []
        for m in range(samples.shape[0]):
            yi = self.predict_one(samples[m, :])
            y_list.append(yi)
        return np.array(y_list)

    def preprocess(self, x):
        '''
        Because the value sets of different features differ hugely in size,
        parts of the probability matrix become sparse; discretize the data first
        '''
        x = np.array(x, dtype=float)  # work on a copy so the caller's array is not modified in place
        for i in range(x.shape[1]):
            x[:, i] = self.step(x[:, i], self.n)
        return x

    def step(self, arr, n):
        '''
        Discretize a column into n levels
        '''
        ma = max(arr)
        mi = min(arr)
        for i in range(len(arr)):
            for j in range(n):
                a = mi + (ma - mi) * (j / n)
                b = mi + (ma - mi) * ((j + 1) / n)
                if a <= arr[i] <= b:
                    arr[i] = j + 1
                    break
        return arr

    def score(self, x, y):
        y_test = self.predict(x)
        score = 0.0
        for i in range(len(y)):
            if y_test[i] == y[i]:
                score += 1 / len(y)
        return score


if __name__ == "__main__":
    x, y = load_data()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=100)
    clf = NBClassifier()
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    print('test score', score)
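As a sanity check (not part of the original source), scikit-learn ships its own Naive Bayes estimators; GaussianNB models each feature with a class-conditional Gaussian instead of discretizing, and can be run on the same split for comparison:

from sklearn.naive_bayes import GaussianNB

x, y = load_data()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=100)
gnb = GaussianNB()  # one Gaussian per feature per class, no manual binning needed
gnb.fit(x_train, y_train)
print('sklearn GaussianNB test score', gnb.score(x_test, y_test))

The hand-rolled classifier trades precision for simplicity by binning each feature into five levels; GaussianNB keeps the features continuous.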
Source code address: