This article describes feature selection and extraction with Python's scikit-learn library, and is shared for your reference, as follows:
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 21 10:57:29 2017
@author: Floating heart
"""
# Filter-based feature selection
# Variance criterion: a feature whose values barely vary carries little
# discriminative information and can therefore be dropped.
from sklearn.feature_selection import VarianceThreshold

x = [[100, 1, 2, 3], [100, 4, 5, 6], [100, 7, 8, 9], [101, 11, 12, 13]]
selector = VarianceThreshold(1)  # keep only features with variance > 1
selector.fit(x)
selector.variances_                     # variance computed for each feature
selector.transform(x)                   # data with low-variance features removed
selector.get_support(True)              # indices of the features that survived
selector.inverse_transform(selector.transform(x))  # back to the original shape;
# the removed features are filled in with 0
# Univariate feature selection
from sklearn.feature_selection import SelectKBest, f_classif

x = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [3, 3, 3, 3, 3], [1, 1, 1, 1, 1]]
y = [0, 1, 0, 1]
# Keep the 3 highest-scoring features, scored with the ANOVA F-value
selector = SelectKBest(score_func=f_classif, k=3)
selector.fit(x, y)
selector.scores_   # F-score of each feature
selector.pvalues_  # corresponding p-values
# With True, get_support returns the indices of the selected features;
# with False it returns a boolean mask marking the selected ones.
selector.get_support(True)
selector.transform(x)  # data reduced to the selected features
# Wrapper-based feature selection (recursive feature elimination)
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC   # SVM serves as the scoring estimator
from sklearn.datasets import load_iris  # sample data set

iris = load_iris()
x = iris.data
y = iris.target
estimator = LinearSVC()
# Recursively drop the weakest feature until 2 remain
selector = RFE(estimator=estimator, n_features_to_select=2)
selector.fit(x, y)
selector.n_features_  # how many features were kept
selector.support_     # boolean mask of the kept features
selector.ranking_     # feature ranking; selected features are ranked 1
# NOTE: feature selection does not necessarily improve predictive
# performance — compare a model trained on the full data against one
# trained on the selected features.
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC
# sklearn.cross_validation was removed in scikit-learn 0.20; its
# train_test_split also never accepted stratify=, so the original call
# would have crashed. model_selection is the correct module.
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

# Load data
iris = load_iris()
X = iris.data
y = iris.target

# Feature selection: keep 2 of the 4 iris features
estimator = LinearSVC()
selector = RFE(estimator=estimator, n_features_to_select=2)
X_t = selector.fit_transform(X, y)

# Split train/test sets; identical random_state so both versions of the
# data get the same row split and the scores are comparable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0, stratify=y)
x_train_t, x_test_t, y_train_t, y_test_t = train_test_split(
    X_t, y, test_size=0.25, random_state=0, stratify=y)

clf = LinearSVC()
clf_t = LinearSVC()
clf.fit(x_train, y_train)
clf_t.fit(x_train_t, y_train_t)
print('origin dataset test score:', clf.score(x_test, y_test))
# origin dataset test score: 0.973684210526
print('selected Dataset:test score:', clf_t.score(x_test_t, y_test_t))
# selected Dataset:test score: 0.947368421053
from sklearn.feature_selection import RFECV
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris

# RFECV: recursive feature elimination with cross-validation, which
# picks the number of features to keep automatically.
iris = load_iris()
x = iris.data
y = iris.target
estimator = LinearSVC()
selector = RFECV(estimator=estimator, cv=3)
selector.fit(x, y)
selector.n_features_  # number of features selected
selector.support_     # mask of the selected features
selector.ranking_     # selected features are ranked 1
selector.grid_scores_  # CV scores (NOTE: removed in sklearn >= 1.2; use cv_results_ there)
# Embedded feature selection
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.datasets import load_digits

digits = load_digits()
x = digits.data
y = digits.target
# An L1-penalised linear SVM yields sparse coefficients; features whose
# importance falls below the mean are discarded ('mean' threshold).
estimator = LinearSVC(penalty='l1', dual=False)
selector = SelectFromModel(estimator=estimator, threshold='mean')
selector.fit(x, y)
selector.transform(x)               # data reduced to the kept features
selector.threshold_                 # the concrete threshold that was applied
selector.get_support(indices=True)  # indices of the kept features
# scikit-learn's Pipeline chains several learners into one estimator,
# typically: data standardisation --> feature extraction --> predictor.
# Every step except the last must provide a transform method (used for
# normalisation, regularisation, feature extraction, ...); only the
# final step performs the prediction.
# Learner pipeline (Pipeline)
from sklearn.svm import LinearSVC
from sklearn.datasets import load_digits
# sklearn.cross_validation was removed in scikit-learn 0.20;
# model_selection is the current module.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


def test_Pipeline(data):
    """Fit an L1-SVM -> LogisticRegression pipeline and print its score.

    data: tuple (x_train, x_test, y_train, y_test) as produced by
          train_test_split.
    """
    x_train, x_test, y_train, y_test = data
    # NOTE(review): using a bare LinearSVC as an intermediate step relies
    # on its (long-deprecated) transform method; modern sklearn requires
    # wrapping it in SelectFromModel — confirm against the installed version.
    steps = [('linear_svm', LinearSVC(C=1, penalty='l1', dual=False)),
             ('logisticregression', LogisticRegression(C=1))]
    pipeline = Pipeline(steps)
    pipeline.fit(x_train, y_train)
    print('named steps', pipeline.named_steps)
    print('pipeline score', pipeline.score(x_test, y_test))


if __name__ == '__main__':
    data = load_digits()
    x = data.data
    y = data.target
    test_Pipeline(train_test_split(x, y, test_size=0.25,
                                   random_state=0, stratify=y))
Readers interested in more Python-related content can check out the following topics on this site: "Python Data Structures and Algorithms Tutorial", "Summary of Python Coding Tips", "Summary of Python Function Usage Tips", "Summary of Python String Manipulation Tips" and "Python Getting Started and Advanced Classic Tutorials".
I hope this article helps you with Python programming.
Recommended Posts