機器學習實戰之KMeans

2021-06-15 08:37:41 字數 3254 閱讀 4142

from numpy import *

def loaddataset(filename):
    """Load a tab-delimited numeric data file.

    Args:
        filename: path to a text file, one sample per line, fields
            separated by tabs, every field parseable as float.

    Returns:
        A list of rows, each row a list of floats.
    """
    datamat = []
    # `with` guarantees the file handle is closed (the original leaked it).
    with open(filename) as fr:
        for line in fr:
            curline = line.strip().split('\t')
            if curline == ['']:
                continue  # skip blank lines instead of crashing on float('')
            # The original computed map(float, ...) but never appended it,
            # always returning an empty result; fixed here.
            datamat.append([float(v) for v in curline])
    return datamat

def disteclude(veca, vecb):
    """Return the Euclidean distance between two row vectors.

    Args:
        veca, vecb: numpy matrices (or arrays) of the same shape.

    Returns:
        The Euclidean distance as a float scalar.
    """
    # .A converts a numpy matrix to an ndarray so that `*` below is
    # elementwise; the original's lowercase `.a` raises AttributeError.
    diff = (veca - vecb).A
    return sqrt(sum(diff * diff))

def randcent(dataset, k):
    """Create k random centroids inside the data's bounding box.

    Args:
        dataset: (m, n) numpy matrix of samples.
        k: number of centroids to generate.

    Returns:
        A (k, n) numpy matrix; each coordinate is uniform in that
        column's [min, max) range.
    """
    n = shape(dataset)[1]
    centroids = mat(zeros((k, n)))
    for j in range(n):
        # Per-column min and span; removed the py2 debug `print` that the
        # original left in (a SyntaxError under Python 3).
        minj = min(dataset[:, j])
        rangej = float(max(dataset[:, j]) - minj)
        # numpy.random.rand(k, 1): column of uniform samples in [0, 1).
        centroids[:, j] = minj + rangej * random.rand(k, 1)
    return centroids

def kmeans(dataset, k, distmeas=disteclude, createcent=randcent):
    """Standard (Lloyd's) k-means clustering.

    Args:
        dataset: (m, n) numpy matrix, one sample per row.
        k: number of clusters.
        distmeas: distance function taking two row vectors.
        createcent: function (dataset, k) -> initial (k, n) centroids.

    Returns:
        (centroids, clusterassment): the final (k, n) centroid matrix and
        an (m, 2) matrix holding, per sample, its cluster index (col 0)
        and squared distance to that centroid (col 1).
    """
    m = shape(dataset)[0]
    clusterassment = mat(zeros((m, 2)))
    centroids = createcent(dataset, k)
    clusterchanged = True  # original's lowercase `true` is a NameError
    while clusterchanged:
        # Reset once per sweep and latch True on any reassignment.  The
        # original recomputed the flag for every sample, so only the LAST
        # sample decided whether to iterate again — a convergence bug.
        clusterchanged = False
        for i in range(m):
            mindist = inf
            minindex = -1
            # Find the nearest centroid for sample i.
            for j in range(k):
                distji = distmeas(centroids[j, :], dataset[i, :])
                if distji < mindist:
                    mindist = distji
                    minindex = j
            if clusterassment[i, 0] != minindex:
                clusterchanged = True
            clusterassment[i, :] = minindex, mindist ** 2
        # Recompute each centroid as the mean of its assigned samples.
        for cent in range(k):
            ptsinclust = dataset[nonzero(clusterassment[:, 0].A == cent)[0]]
            if shape(ptsinclust)[0] > 0:  # guard: mean of empty slice is NaN
                centroids[cent, :] = mean(ptsinclust, axis=0)
    return centroids, clusterassment

# dataset = loaddataset('testset.txt')

# kmeans(mat(dataset), 3, disteclude, randcent)

def bikmeans(dataset, k, distmeas=disteclude):
    """Bisecting k-means: repeatedly split the cluster whose 2-way split
    gives the largest total-SSE reduction until k clusters exist.

    Args:
        dataset: (m, n) numpy matrix, one sample per row.
        k: desired number of clusters.
        distmeas: distance function taking two row vectors.

    Returns:
        (centlist, clusterassment): a list of k centroid coordinate lists
        and an (m, 2) matrix of (cluster index, squared distance) rows.
    """
    m = shape(dataset)[0]
    clusterassment = mat(zeros((m, 2)))
    # Start with a single cluster whose centroid is the global mean.
    centroid0 = mean(dataset, axis=0).tolist()[0]
    centlist = [centroid0]
    for j in range(m):
        clusterassment[j, 1] = distmeas(mat(centroid0), dataset[j, :]) ** 2
    while len(centlist) < k:
        lowestsse = inf
        bestcenttosplit = -1  # guard: may stay -1 if nothing can be split
        for i in range(len(centlist)):
            ptsincurrcluster = dataset[nonzero(clusterassment[:, 0].A == i)[0], :]
            if shape(ptsincurrcluster)[0] < 2:
                continue  # cannot bisect a cluster of fewer than 2 points
            centroidmat, splitclustass = kmeans(ptsincurrcluster, 2, distmeas)
            # SSE if we split cluster i, plus SSE of everything else.
            ssesplit = sum(splitclustass[:, 1])
            ssenotsplit = sum(clusterassment[nonzero(clusterassment[:, 0].A != i)[0], 1])
            print("ssesplit, and notsplit:", ssesplit, ssenotsplit)
            if (ssesplit + ssenotsplit) < lowestsse:
                bestcenttosplit = i
                bestnewcents = centroidmat
                bestclustass = splitclustass.copy()
                lowestsse = ssesplit + ssenotsplit
        if bestcenttosplit == -1:
            break  # no improving split exists; avoid an infinite loop
        # After the split, sub-cluster 1 gets a brand-new index and
        # sub-cluster 0 keeps the index of the cluster that was split.
        bestclustass[nonzero(bestclustass[:, 0].A == 1)[0], 0] = len(centlist)
        bestclustass[nonzero(bestclustass[:, 0].A == 0)[0], 0] = bestcenttosplit
        print('the bestcenttosplit is:', bestcenttosplit)
        print('the len of bestcustass is:', len(bestclustass))
        centlist[bestcenttosplit] = bestnewcents[0, :].tolist()[0]
        # BUG FIX: the original never appended the second new centroid, so
        # len(centlist) never grew and the while loop never terminated.
        centlist.append(bestnewcents[1, :].tolist()[0])
        # Write the relabelled sub-assignments back for the split cluster.
        clusterassment[nonzero(clusterassment[:, 0].A == bestcenttosplit)[0], :] = bestclustass
    return centlist, clusterassment

# datmat = mat(loaddataset('testset2.txt'))

# centlist, mynewassments = bikmeans(datmat, 3)

# print centlist

機器學習實戰Kmeans

from numpy import import matplotlib.pyplot as plt import pandas as pd load dataset url names sepal length sepal width petal length petal width class d...

機器學習實戰之K Means聚類

俗話說的好 物以類聚,人以群分 今天我們要講的聚類演算法很大程度上可以印證此話。聚類是一種非監督學習,什麼是非監督學習?與之前學習的分類和回歸不同 監督學習 監督學習是有有label標籤的,而非監督學習沒有。我們再回到聚類上,聚類是把相似的物件歸到同一簇中,有點像全自動分類。聚類的應用場景有很多,例...

機器學習機器學習實戰 kmeans

簡介 聚類演算法是一種無監督學習,它將相似的物件歸類到同一簇中。聚類的方法可以應用所有的物件,簇內的物件越相似,聚類效果也就越好。聚類和分類的最大不同之處在於,分類的目標是已知的,聚類是完全無監督學習,類別沒有像分類那樣被預先定義出來,所以叫做無監督學習。kmeans演算法是實際中最常用的聚類演算法...