Clustering in Machine Learning




Example:

1. Use the KMeans algorithm to cluster the 2D data automatically, and predict the class of the point V1=80, V2=60;
2. Compute the prediction accuracy and correct the predicted labels if needed (a remapping sketch follows this list);
3. Repeat steps 1-2 with the KNN and MeanShift algorithms.
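
Because KMeans and MeanShift assign arbitrary cluster IDs, the predicted labels usually have to be remapped to the original label IDs before accuracy_score gives a meaningful number. A minimal sketch of one automatic remapping (the helper name remap_labels and the majority-vote mapping are my own illustration, not part of the original code):

import numpy as np
import pandas as pd

def remap_labels(y_true, y_cluster):
    # map each cluster ID to the most frequent true label inside that cluster
    mapping = {c: pd.Series(y_true[y_cluster == c]).mode()[0]
               for c in np.unique(y_cluster)}
    return np.array([mapping[c] for c in y_cluster])

The hard-coded if/elif correction used later in this post does the same thing by hand, after reading the mapping off pd.value_counts.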

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import MeanShift,estimate_bandwidth
data = pd.read_csv(r'D:\tencent\qicq\data.csv')
data.head()

# plot the raw (unlabeled) data
fig1 = plt.figure()
plt.scatter(data.loc[:,'V1'],data.loc[:,'V2'])
plt.title('unlabeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.show()

# define X, y
X = data.drop(['labels'],axis=1)
y = data.loc[:,'labels']
pd.value_counts(y)

# plot the data colored by the original labels
fig2 = plt.figure()
label0 = plt.scatter(X.loc[:,'V1'][y==0],X.loc[:,'V2'][y==0])
label1 = plt.scatter(X.loc[:,'V1'][y==1],X.loc[:,'V2'][y==1])
label2 = plt.scatter(X.loc[:,'V1'][y==2],X.loc[:,'V2'][y==2])
plt.title('labeled data')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))
plt.show()

# set up and fit the KMeans model
KM = KMeans(n_clusters=3,random_state=0)
KM.fit(X)
centers = KM.cluster_centers_
print(centers)

# predict on the training data and on the test point V1=80, V2=60
y_predict = KM.predict(X)
print(pd.value_counts(y_predict))
y_predict_test = KM.predict([[80,60]])
print(y_predict_test)

# raw accuracy against the original labels (cluster IDs are arbitrary, so this can be low)
accuracy = accuracy_score(y,y_predict)
print(accuracy)
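
# Added illustration (not in the original post): the KMeans cluster IDs are
# arbitrary, so a fairer accuracy is obtained after remapping each predicted
# cluster to its majority true label.
y_km_corrected = pd.Series(y_predict).map(
    {c: y[y_predict == c].mode()[0] for c in np.unique(y_predict)})
print('corrected kmeans accuracy:', accuracy_score(y, y_km_corrected))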
# visualize the KMeans result (left) next to the original labels (right)
fig3 = plt.subplot(121)
label0 = plt.scatter(X.loc[:,'V1'][y_predict==0],X.loc[:,'V2'][y_predict==0])
label1 = plt.scatter(X.loc[:,'V1'][y_predict==1],X.loc[:,'V2'][y_predict==1])
label2 = plt.scatter(X.loc[:,'V1'][y_predict==2],X.loc[:,'V2'][y_predict==2])
plt.title('predicted labels (KMeans)')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))
plt.scatter(centers[:,0],centers[:,1])

fig4 = plt.subplot(122)
label0 = plt.scatter(X.loc[:,'V1'][y==0],X.loc[:,'V2'][y==0])
label1 = plt.scatter(X.loc[:,'V1'][y==1],X.loc[:,'V2'][y==1])
label2 = plt.scatter(X.loc[:,'V1'][y==2],X.loc[:,'V2'][y==2])
plt.title('original labels')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))
plt.scatter(centers[:,0],centers[:,1])
plt.show()

# establish a KNN model
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(X,y)
# predict based on the test data V1=80, V2=60
y_predict_knn_test = KNN.predict([[80,60]])
y_predict_knn = KNN.predict(X)
print(y_predict_knn_test)
print('knn accuracy:',accuracy_score(y,y_predict_knn))
print(pd.value_counts(y_predict_knn),pd.value_counts(y))

# visualize the KNN result (left) next to the original labels (right)
fig5 = plt.subplot(121)
label0 = plt.scatter(X.loc[:,'V1'][y_predict_knn==0],X.loc[:,'V2'][y_predict_knn==0])
label1 = plt.scatter(X.loc[:,'V1'][y_predict_knn==1],X.loc[:,'V2'][y_predict_knn==1])
label2 = plt.scatter(X.loc[:,'V1'][y_predict_knn==2],X.loc[:,'V2'][y_predict_knn==2])
plt.title('KNN predictions')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))
plt.scatter(centers[:,0],centers[:,1])

fig6 = plt.subplot(122)
label0 = plt.scatter(X.loc[:,'V1'][y==0],X.loc[:,'V2'][y==0])
label1 = plt.scatter(X.loc[:,'V1'][y==1],X.loc[:,'V2'][y==1])
label2 = plt.scatter(X.loc[:,'V1'][y==2],X.loc[:,'V2'][y==2])
plt.title('original labels')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))
plt.scatter(centers[:,0],centers[:,1])
plt.show()

# meanshift model
# obtain the bandwidth
bw = estimate_bandwidth(X,n_samples=500)
print(bw)
# establish the meanshift model un-supervised model
ms = MeanShift(bandwidth=bw)
ms.fit(X)
y_predict_ms = ms.predict(X)
print(pd.value_counts(y_predict_ms),pd.value_counts(y))
# visualize the data and results
fig7 = plt.subplot(121)
label0 = plt.scatter(X.loc[:,'V1'][y_predict_ms==0],X.loc[:,'V2'][y_predict_ms==0])
label1 = plt.scatter(X.loc[:,'V1'][y_predict_ms==1],X.loc[:,'V2'][y_predict_ms==1])
label2 = plt.scatter(X.loc[:,'V1'][y_predict_ms==2],X.loc[:,'V2'][y_predict_ms==2])
plt.title('MeanShift predictions (uncorrected)')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))
plt.scatter(centers[:,0],centers[:,1])

fig8 = plt.subplot(122)
label0 = plt.scatter(X.loc[:,'V1'][y==0],X.loc[:,'V2'][y==0])
label1 = plt.scatter(X.loc[:,'V1'][y==1],X.loc[:,'V2'][y==1])
label2 = plt.scatter(X.loc[:,'V1'][y==2],X.loc[:,'V2'][y==2])
plt.title('original labels')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))
plt.scatter(centers[:,0],centers[:,1])
plt.show()
# correct the MeanShift labels so they line up with the original label IDs;
# the 0<->2 swap below is read off the value counts printed above
y_corrected_ms = []
for i in y_predict_ms:
    if i==0:
        y_corrected_ms.append(2)
    elif i==1:
        y_corrected_ms.append(1)
    else:
        y_corrected_ms.append(0)
print(pd.value_counts(y_corrected_ms),pd.value_counts(y))
y_corrected_ms = np.array(y_corrected_ms)
print(type(y_corrected_ms))

# visualize the data and results
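# Added illustration (not in the original post): accuracy after the correction,
# assuming the 0<->2 swap above matches this dataset
print('corrected meanshift accuracy:', accuracy_score(y, y_corrected_ms))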
fig9 = plt.subplot(121)
label0 = plt.scatter(X.loc[:,'V1'][y_corrected_ms==0],X.loc[:,'V2'][y_corrected_ms==0])
label1 = plt.scatter(X.loc[:,'V1'][y_corrected_ms==1],X.loc[:,'V2'][y_corrected_ms==1])
label2 = plt.scatter(X.loc[:,'V1'][y_corrected_ms==2],X.loc[:,'V2'][y_corrected_ms==2])
plt.title('MeanShift predictions (corrected)')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))
plt.scatter(centers[:,0],centers[:,1])

fig10 = plt.subplot(122)
label0 = plt.scatter(X.loc[:,'V1'][y==0],X.loc[:,'V2'][y==0])
label1 = plt.scatter(X.loc[:,'V1'][y==1],X.loc[:,'V2'][y==1])
label2 = plt.scatter(X.loc[:,'V1'][y==2],X.loc[:,'V2'][y==2])
plt.title('original labels')
plt.xlabel('V1')
plt.ylabel('V2')
plt.legend((label0,label1,label2),('label0','label1','label2'))
plt.scatter(centers[:,0],centers[:,1])
plt.show()

Homework:

Cluster the given 2D dataset:
1) Partition the data with the KMeans, KNN, and MeanShift methods;
2) Compute the accuracy of each algorithm; if the labels need correction, correct them first and then compute the accuracy;
3) Show the original data, the KMeans predictions, the KNN predictions, and the MeanShift predictions as four subplots of one figure, with the cluster centers marked on each subplot (a note on the centers follows this list);
4) Predict the three points [20,90], [60,40], [70,10] with each of the three methods.
Dataset: week8_cluster_homework_data.csv
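
A small detail for item 3): each method has its own centers. The code below only plots the KMeans centers (KM.cluster_centers_) in every subplot; a sketch of how the others could be obtained (the per-class-mean idea for KNN is my own illustration, since a KNN classifier has no centroids of its own):

# KMeans centroids
km_centers = KM.cluster_centers_
# MeanShift centroids, available after ms.fit(X)
ms_centers = ms.cluster_centers_
# KNN has no centroids; the mean of each labeled class is one reasonable stand-in
knn_centers = X.groupby(y).mean().values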

 

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import MeanShift,estimate_bandwidth
data = pd.read_csv(r'D:\tencent\qicq\week8_cluster_homework_data.csv')
data.head()

# plot the raw (unlabeled) data
fig1 = plt.figure()
plt.scatter(data.loc[:,'x1'],data.loc[:,'x2'])
plt.title('unlabeled data')
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()

# define X, y
X = data.drop(['y'],axis=1)
y = data.loc[:,'y']
pd.value_counts(y)

# plot the data colored by the original labels
fig2 = plt.figure()
label0 = plt.scatter(X.loc[:,'x1'][y==0],X.loc[:,'x2'][y==0])
label1 = plt.scatter(X.loc[:,'x1'][y==1],X.loc[:,'x2'][y==1])
plt.title('labeled data')
plt.xlabel('x1')
plt.ylabel('x2')
plt.legend((label0,label1),('label0','label1'))
plt.show()

# set up and fit the KMeans model (two clusters for this dataset)
KM = KMeans(n_clusters=2,random_state=0)
KM.fit(X)
centers = KM.cluster_centers_
print(centers)

y_predict = KM.predict(X)
print(pd.value_counts(y_predict))
y_predict_test = KM.predict([[80,60]])
print(y_predict_test)

# raw accuracy (cluster IDs may need remapping, as in the example above)
accuracy = accuracy_score(y,y_predict)
print(accuracy)
# visualize the data and results
fig, axs = plt.subplots(2, 2)

# top-left: KMeans predictions
label0 = axs[0, 0].scatter(X.loc[:,'x1'][y_predict==0],X.loc[:,'x2'][y_predict==0])
label1 = axs[0, 0].scatter(X.loc[:,'x1'][y_predict==1],X.loc[:,'x2'][y_predict==1])
axs[0, 0].set_title('KMeans predictions')
axs[0, 0].set_xlabel('x1')
axs[0, 0].set_ylabel('x2')
axs[0, 0].legend((label0,label1),('label0','label1'))
axs[0, 0].scatter(centers[:,0],centers[:,1])

# top-right: original labels
label0 = axs[0, 1].scatter(X.loc[:,'x1'][y==0],X.loc[:,'x2'][y==0])
label1 = axs[0, 1].scatter(X.loc[:,'x1'][y==1],X.loc[:,'x2'][y==1])
axs[0, 1].set_title('original labels')
axs[0, 1].set_xlabel('x1')
axs[0, 1].set_ylabel('x2')
axs[0, 1].legend((label0,label1),('label0','label1'))
axs[0, 1].scatter(centers[:,0],centers[:,1])

# establish a KNN model
KNN = KNeighborsClassifier(n_neighbors=2)
KNN.fit(X,y)
y_predict_knn = KNN.predict(X)

# bottom-left: KNN predictions
label0 = axs[1, 0].scatter(X.loc[:,'x1'][y_predict_knn==0],X.loc[:,'x2'][y_predict_knn==0])
label1 = axs[1, 0].scatter(X.loc[:,'x1'][y_predict_knn==1],X.loc[:,'x2'][y_predict_knn==1])
axs[1, 0].set_title('KNN predictions')
axs[1, 0].set_xlabel('x1')
axs[1, 0].set_ylabel('x2')
axs[1, 0].legend((label0,label1),('label0','label1'))
axs[1, 0].scatter(centers[:,0],centers[:,1])

# meanshift model
# obtain the bandwidth
bw = estimate_bandwidth(X,n_samples=500)
print(bw)
# establish the meanshift model un-supervised model
ms = MeanShift(bandwidth=bw)
ms.fit(X)
y_predict_ms = ms.predict(X)
print(pd.value_counts(y_predict_ms),pd.value_counts(y))
# correct the MeanShift labels; choose the mapping by comparing
# pd.value_counts(y_predict_ms) with pd.value_counts(y) for this dataset
# (the 0<->1 swap below is only one possible mapping)
y_corrected_ms = []
for i in y_predict_ms:
    if i==0:
        y_corrected_ms.append(1)
    else:
        y_corrected_ms.append(0)
print(pd.value_counts(y_corrected_ms),pd.value_counts(y))
y_corrected_ms = np.array(y_corrected_ms)
print(type(y_corrected_ms))

# visualize the data and results
# bottom-right: MeanShift predictions (after correction)
label0 = axs[1, 1].scatter(X.loc[:,'x1'][y_corrected_ms==0],X.loc[:,'x2'][y_corrected_ms==0])
label1 = axs[1, 1].scatter(X.loc[:,'x1'][y_corrected_ms==1],X.loc[:,'x2'][y_corrected_ms==1])
axs[1, 1].set_title('MeanShift predictions')
axs[1, 1].set_xlabel('x1')
axs[1, 1].set_ylabel('x2')
axs[1, 1].legend((label0,label1),('label0','label1'))
axs[1, 1].scatter(centers[:,0],centers[:,1])
plt.show()

# predict the three homework points with each method
y_predict_test = KM.predict([[20,90],[60,40],[70,10]])
print(f'KM:{y_predict_test}')
y_predict_test = KNN.predict([[20,90],[60,40],[70,10]])
print(f'KNN:{y_predict_test}')
y_predict_test = ms.predict([[20,90],[60,40],[70,10]])
print(f'ms:{y_predict_test}')
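
# Added illustration (not in the original post): accuracy for the other two
# methods, as item 2) of the homework asks; this assumes y_corrected_ms holds
# the remapped MeanShift labels from above
print('knn accuracy:', accuracy_score(y, y_predict_knn))
print('meanshift accuracy:', accuracy_score(y, y_corrected_ms))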

Results:
