基于Python的鸢尾花聚类与分类
1 导入必要的库
from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
2 加载数据
# Load the Iris dataset into a DataFrame: one column per feature name,
# followed by a 'target' column holding the class labels.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)
3 数据探索
# Pairwise relationship plots of the DataFrame columns, coloured by 'target'.
sns.pairplot(data=df, hue='target')
plt.show()
图3-1 特征两两关系散点图
4 聚类分析
# Fit K-Means with 3 clusters on the measurement columns only.
# The feature columns are selected explicitly rather than by dropping
# 'target': this section appends a 'cluster' column to df, so a re-run that
# merely dropped 'target' would fit the model on its own previous labels.
kmeans = KMeans(n_clusters=3, random_state=0).fit(df[iris.feature_names])
df['cluster'] = kmeans.labels_

# Visualise the clustering on the first two features, coloured by cluster id.
plt.scatter(
    df['sepal length (cm)'],
    df['sepal width (cm)'],
    c=df['cluster'],
    cmap='viridis',
)
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.title('K-Means Clustering of Iris Dataset')
plt.show()
图4-1 聚类结果
# Silhouette analysis of the fitted K-Means model.
# Use the measurement columns only: by this point df also carries the
# 'cluster' column, so dropping just 'target' would feed the cluster labels
# themselves into the distance computation and distort the score.
features = df[iris.feature_names]
n_clusters = kmeans.n_clusters

# Mean silhouette coefficient over all samples.
score = silhouette_score(features, kmeans.labels_)
print(f"Silhouette Coefficient: {score}")

# Silhouette coefficient of each individual sample.
sample_silhouette_values = silhouette_samples(features, kmeans.labels_)

# Silhouette plot: one horizontal band per cluster, stacked vertically.
plt.figure(figsize=(10, 5))
y_lower = 10
for i in range(n_clusters):
    # Gather the silhouette values of the samples assigned to cluster i and
    # sort them. Boolean-mask indexing returns a copy, so the in-place sort
    # does not disturb sample_silhouette_values.
    ith_cluster_silhouette_values = sample_silhouette_values[kmeans.labels_ == i]
    ith_cluster_silhouette_values.sort()

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i

    color = plt.cm.nipy_spectral(float(i) / n_clusters)
    plt.fill_betweenx(
        np.arange(y_lower, y_upper),
        0,
        ith_cluster_silhouette_values,
        facecolor=color,
        alpha=0.7,
    )

    # Label the band with its cluster number at its vertical midpoint.
    plt.text(
        -0.05,
        y_lower + 0.5 * size_cluster_i,
        str(i),
        color=color,
        fontweight='bold',
        verticalalignment='center',
    )

    # Leave a gap of 10 rows before the next cluster's band.
    y_lower = y_upper + 10

plt.xlabel('Silhouette Coefficient')
plt.ylabel('Cluster Label')
plt.title('Silhouette Plot')
plt.show()
图4-2 轮廓图
5 决策树分类
# Split into training and test sets (30% held out for testing).
# Select the measurement columns explicitly: df also contains the 'cluster'
# column added by the K-Means section, and dropping only 'target' would leak
# those labels into the classifier and leave it with more input columns than
# the names in iris.feature_names used when the tree is plotted.
X_train, X_test, y_train, y_test = train_test_split(
    df[iris.feature_names],
    df['target'],
    test_size=0.3,
    random_state=42,
)

# Initialise and fit the decision tree classifier.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict the labels of the test set.
y_pred = clf.predict(X_test)
# Draw the confusion matrix of the test-set predictions as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

# Print the classification report for the same predictions.
report = classification_report(y_test, y_pred)
print(report)
# Visualise the fitted decision tree.
# plot_tree and plt are already imported at the top of the file, so the
# duplicated mid-script imports (one of which had been fused with the
# plt.figure call into invalid syntax) are dropped.
plt.figure(figsize=(20, 10))
plot_tree(
    clf,
    filled=True,
    feature_names=iris.feature_names,
    class_names=list(iris.target_names),
    rounded=True,
    fontsize=9,
)
plt.show()
图5-1 混淆矩阵
图5-2 决策树模型结构