支援向量機SVM案例與調參

import datetime
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Load the breast-cancer classification data set shipped with scikit-learn.
data = load_breast_cancer()
x = data.data
y = data.target
# Quick sanity checks worth running interactively: x.shape and np.unique(y)
# (np.unique returns the distinct label values).

# The features are measured on different scales and their distributions are
# skewed (inspect with
# pd.DataFrame(x).describe([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]).T),
# so standardize every feature before training the SVMs.
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so test-set statistics leak into the scaling. Kept as in
# the original tutorial because the grid search further down fits on the
# standardized `x` directly.
x = StandardScaler().fit_transform(x)
# `data` is rebound to a DataFrame view of the standardized features so the
# describe() call above can be repeated on the scaled values.
data = pd.DataFrame(x)

# Visual check of the first two standardized features, coloured by class.
plt.scatter(x[:, 0], x[:, 1], c=y)
plt.show()

# A fixed random_state makes every run produce the same train/test partition.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=420)

分別使用 linear、poly、rbf、sigmoid 核函式進行訓練，並比較各自的準確率與耗時（其中 poly 在本例中跑不出結果，故程式中略過）。

# Compare the test accuracy and training time of several SVC kernels.
# The "poly" kernel is left out: per the original author it does not finish
# running on this example.
kernels = ["linear", "rbf", "sigmoid"]
for kernel in kernels:
    time0 = time()  # time() returns the current timestamp in seconds
    clf = SVC(
        kernel=kernel,
        gamma="auto",
        cache_size=5000,  # kernel cache size in MB (scikit-learn default: 200)
    ).fit(xtrain, ytrain)
    print("the accuracy under kernel %s is %f" % (kernel, clf.score(xtest, ytest)))
    # Format the elapsed seconds as minutes:seconds:microseconds. %M and %S are
    # the minute/second codes (%m would be the month), and interpreting the
    # duration in UTC keeps the minutes correct in time zones whose offset is
    # not a whole number of hours.
    elapsed = datetime.datetime.fromtimestamp(time() - time0, tz=datetime.timezone.utc)
    print(elapsed.strftime("%M:%S:%f"))

調整 rbf 核函式的 gamma 引數，並畫出學習曲線

# Learning curve: test accuracy of an RBF-kernel SVC as a function of gamma.
score = []
gamma_range = np.logspace(-10, 1, 50)  # 50 values evenly spaced on a log scale
for i in gamma_range:
    clf = SVC(kernel="rbf", gamma=i, cache_size=5000).fit(xtrain, ytrain)
    # Record the accuracy for this gamma; without it the list stays empty and
    # max(score) below would raise.
    score.append(clf.score(xtest, ytest))
# Best accuracy and the (first) gamma value that achieves it.
print(max(score), gamma_range[score.index(max(score))])
plt.plot(gamma_range, score)
plt.show()

對多項式核函式 poly 進行多引數調參：由於很難直觀理解 gamma、degree、coef0 這些引數對訓練結果的影響，所以採用網格搜尋法。

# Grid search over gamma and coef0 for a polynomial-kernel SVC (degree fixed
# at 1), timing the whole search.
time0 = time()
gamma_range = np.logspace(-10, 1, 20)
coef0_range = np.linspace(0, 5, 10)
param_grid = dict(gamma=gamma_range, coef0=coef0_range)
# Cross-validation scheme: 5 independent random splits, each holding out 30%
# of the samples for validation while preserving the class proportions.
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=420)
grid = GridSearchCV(
    SVC(kernel="poly", degree=1, cache_size=5000),
    param_grid=param_grid,
    cv=cv,
)
grid.fit(x, y)
print(
    "the best parameters are %s with a score of %0.5f"
    % (grid.best_params_, grid.best_score_)
)
# Elapsed time as minutes:seconds:microseconds (%M/%S, not %m = month);
# evaluated in UTC so the local time-zone offset cannot shift the minutes.
elapsed = datetime.datetime.fromtimestamp(time() - time0, tz=datetime.timezone.utc)
print(elapsed.strftime("%M:%S:%f"))

為了解決軟間隔問題，引入鬆弛係數 C，以下分別在 linear 與 rbf 核函式下對 C 進行調參。

# Learning curve: test accuracy of a linear-kernel SVC as a function of C.
score = []
c_range = np.linspace(0.01, 30, 50)
for i in c_range:
    # SVC spells its regularization parameter with a capital "C"; a lowercase
    # keyword is rejected with a TypeError.
    clf = SVC(kernel="linear", C=i, cache_size=5000).fit(xtrain, ytrain)
    score.append(clf.score(xtest, ytest))
# Best accuracy and the (first) C value that achieves it.
print(max(score), c_range[score.index(max(score))])
plt.plot(c_range, score)
plt.show()
# Same C sweep, now with the RBF kernel.
# NOTE(review): gamma=0.0127 is presumably the best value found by the gamma
# learning curve earlier in the script - confirm before reusing.
score = []
c_range = np.linspace(0.01, 30, 50)
for i in c_range:
    # SVC spells its regularization parameter with a capital "C".
    clf = SVC(kernel="rbf", C=i, gamma=0.0127, cache_size=5000).fit(xtrain, ytrain)
    score.append(clf.score(xtest, ytest))
# Best accuracy and the (first) C value that achieves it.
print(max(score), c_range[score.index(max(score))])
plt.plot(c_range, score)
plt.show()

支援向量機（SVM）

簡介／術語：支援向量機（SVM）是一個類分類器，正式的定義是一個能夠將不同類樣本在樣本空間分隔的超平面。換句話說，給定一些標記（label）好的訓練樣本（監督式學習），SVM 演算法輸出一個最優化的分隔超平面。首先我們假定有一個未知的欲分類的集合，可以進行分割，但是我們不知道分割的函式（超平面，也叫真實...

支援向量機SVM

支援向量機 SVM（Support Vector Machine）是機器學習領域的一個有監督的學習模型。一、簡介：支援向量機建立在統計學習理論的基礎之上。統計學習理論（Statistical Learning Theory，簡稱 SLT）是一種處理小樣本的統計理論，為研究有限樣本情況下的統計模式識別和更廣...

SVM支援向量機

在機器學習領域，很多時候會用到分類的一些演算法，例如 KNN、貝葉斯。我們可以把分類的樣本簡單粗暴地分為兩種型別：線性可分和非線性可分。可以使用一個非常簡單的例子來解釋什麼是線性可分，什麼是線性不可分。（a）線性可分的 2 類樣本；（b）非線性可分的 2 類樣本。已知一個線性可分的資料集，其中 x 表示一個 n 維向量，當...

支援向量機SVM案例與調參

支援向量機（SVM）

支援向量機SVM

SVM支援向量機

相關推薦