python 特徵篩選

2021-09-26 10:38:08 字數 2699 閱讀 8231

from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2

from sklearn.datasets import load_iris

import pandas as pd

# Load the iris data as (features, target); return_X_y=True gives plain arrays.
X, y = load_iris(return_X_y=True)

X_df = pd.DataFrame(X, columns=list("abcd"))

# chi2 returns (chi-square statistics, p-values) per feature.
# Store the statistics under a new name so we do not shadow the imported chi2.
chi2_stats, pval = chi2(X_df, y)

# Map each feature name to its chi-square score.
dict_feature = {}

for name, score in zip(X_df.columns.values, chi2_stats):

    dict_feature[name] = score

# Sort the dict items by score, highest first.

ls = sorted(dict_feature.items(), key=lambda item: item[1], reverse=True)

# Number of features to keep.

k = 2

ls_new_feature = []

# Take the names of the top-k scoring features.
for i in range(k):
    ls_new_feature.append(ls[i][0])

x_new = X_df[ls_new_feature]

from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2

from sklearn.datasets import load_iris

import pandas as pd

from sklearn.feature_selection import mutual_info_classif

# mutual_info_classif measures mutual information between each feature
# and a discrete (classification) target.

X, y = load_iris(return_X_y=True)

X_df = pd.DataFrame(X, columns=list("abcd"))

# Columns we want to treat as discrete features.
feature_cat = ["a", "d"]

discrete_features = []

feature = X_df.columns.values.tolist()

# Convert the categorical column names to their positional indices,
# which is the form discrete_features expects.
for name in feature_cat:

    if name in feature:
        discrete_features.append(feature.index(name))

mu = mutual_info_classif(X_df, y, discrete_features=discrete_features,
                         n_neighbors=3, copy=True, random_state=None)

# Map each feature name to its mutual-information score.
dict_feature = {}

for name, score in zip(X_df.columns.values, mu):

    dict_feature[name] = score

# Sort the dict items by score, highest first.

ls = sorted(dict_feature.items(), key=lambda item: item[1], reverse=True)

# Number of features to keep.

k = 2

ls_new_feature = []

# Take the names of the top-k scoring features.
for i in range(k):
    ls_new_feature.append(ls[i][0])

x_new = X_df[ls_new_feature]

from sklearn.datasets import load_iris

import pandas as pd

from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)

X_df = pd.DataFrame(X, columns=list("abcd"))

# L1-regularised logistic regression zeroes out weak features; SelectFromModel
# keeps only features whose coefficients survive. The default lbfgs solver
# does not support penalty="l1", so use liblinear.
sf = SelectFromModel(estimator=LogisticRegression(penalty="l1", C=0.1,
                                                  solver="liblinear"),
                     threshold=None,
                     prefit=False,
                     norm_order=1)

sf.fit(X_df, y)

# get_support() is a boolean mask over columns; index the frame with it.
x_new = X_df[X_df.columns.values[sf.get_support()]]

from sklearn.feature_selection import VarianceThreshold

from sklearn.datasets import load_iris

import pandas as pd

X, y = load_iris(return_X_y=True)

X_df = pd.DataFrame(X, columns=list("abcd"))

# Recommended for numeric features; for categorical features consider the
# per-category proportion instead of raw variance.

ts = 0.5

vt = VarianceThreshold(threshold=ts)

vt.fit(X_df)

# Inspect each feature's variance.

dict_variance = {}

for name, var in zip(X_df.columns.values, vt.variances_):

    dict_variance[name] = var

# Collect the names of the features that were kept.

ls = list()

for name, var in dict_variance.items():

    if var >= ts:
        ls.append(name)

x_new = pd.DataFrame(vt.fit_transform(X_df), columns=ls)

特徵工程 之 特徵篩選

從現有的m個特徵中選出n個特徵 (n < m),降低特徵維度減少計算量的同時,使模型效果達到最優。在實際業務中,用於模型中的特徵維度往往很高,幾萬維。如一些ctr預估問題中,特徵維度高達上億維,維度過高會增大模型計算複雜度。但實際情況是,並不是每個特徵對模型的預測都是有效果的,所以需要去除一些不必要的特徵,從...

機器學習 特徵工程 特徵篩選

1 冗餘 部分特徵相關度太高,消耗計算效能,影響決策樹分支的選擇。2 雜訊 部分特徵對預測結果有負影響 3 降維 減少特徵數量 降維,使模型泛化能力更強,減少過擬合 4 特徵選擇與降維的關係 特徵選擇只篩選掉原本特徵裡和結果預測關係不大的,後者做特徵的計算組合構成新特徵。svd pca降維也能解決一定...

隨機森林特徵篩選

剛看到一篇介紹特徵篩選的文章,裡面介紹基於模型的特徵排名,附加了乙個隨機森林的python程式,感覺挺好,趕緊mark下來。程式使用了skliearn機器學習庫,資料集為boston房屋 資料,源程式如下所示 fromsklearn.cross validationimportcross val s...