# Clustering
# Introduction
Clustering is an unsupervised learning method that can be used to discover underlying patterns in the structure of the data. For example, it can be used to group unlabelled data.
# K-Means Clustering
Requirements: numerical and normalized features
```python
import numpy as np
import scipy as sp
import pandas as pd
from IPython.display import display, HTML

df = pd.read_csv('data_students.csv')
cols = df.columns
# print out and display the dataframe as an HTML table
display(HTML(df.head(10).to_html()))

# replace missing values in numerical variables with the column mean
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Hours on Assignments"] = df["Hours on Assignments"].fillna(df["Hours on Assignments"].mean())
df["Hours on Games"] = df["Hours on Games"].fillna(df["Hours on Games"].mean())
df["Exam"] = df["Exam"].fillna(df["Exam"].mean())
df["Grade"] = df["Grade"].fillna(df["Grade"].mean())

# check again whether there are missing values
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype, ',', df[i].isnull().any())

# remove the ID column
df = df.drop('ID', axis=1)
# print out and display the dataframe as an HTML table
display(HTML(df.head(10).to_html()))
```
```python
# Data preprocessing
print('Column Datatypes:\n', df.dtypes)

# convert all nominal variables to binary (dummy) variables
df_raw = df.copy(deep=True)
df_knn = df.copy(deep=True)
# create new binary columns
df_dummies = pd.get_dummies(df_knn[['Degree', 'Nationality']])
# add them to the dataframe
df_knn = df_knn.join(df_dummies)
# drop the original columns
df_knn = df_knn.drop('Degree', axis=1)
df_knn = df_knn.drop('Nationality', axis=1)
display('Data Example:', HTML(df_knn.head(10).to_html()))

# normalize all numerical features
# find numeric columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
cols_numeric = df_knn.select_dtypes(include=numerics).columns.tolist()
print('Selected numerical columns:\n', cols_numeric)
# min-max normalization to scale each column to [0, 1]
for col in cols_numeric:
    df_knn[col] = (df_knn[col] - df_knn[col].min()) / (df_knn[col].max() - df_knn[col].min())

# ignore the label column for clustering
df_kmeans = df_knn.drop('GradeLetter', axis=1)
display(HTML(df_kmeans.head(10).to_html()))
```
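The manual loop above is equivalent to scikit-learn's MinMaxScaler; a minimal alternative sketch, shown only for reference (the loop already produces the same scaling):

```python
from sklearn.preprocessing import MinMaxScaler

# fit_transform scales each selected column to [0, 1], like the loop above
scaler = MinMaxScaler()
df_knn[cols_numeric] = scaler.fit_transform(df_knn[cols_numeric])
```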
```python
# KMeans clustering
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cluster import KMeans
# API: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
kmeans = KMeans(n_clusters=4, random_state=1, max_iter=200)
kmeans.fit(df_kmeans)
y_pred = kmeans.predict(df_kmeans)
plt.scatter(df_kmeans['Exam'], df_kmeans['Grade'], c=y_pred, cmap='viridis')
plt.show()

# get the cluster labels and add them back to the original data
opt = kmeans.labels_
df_knn['Cluster'] = opt
display('Data:', HTML(df_knn.tail(10).to_html()))

# try different values of K and find the best K for KMeans
# assumption: the smaller the SSE, the better
Sum_of_squared_distances = []
K = range(1, 15)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(df_kmeans)
    Sum_of_squared_distances.append(km.inertia_)

# plot K against SSE; the elbow of the curve marks the optimal k
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
```
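Reading the elbow off the plot is subjective. As a rough programmatic cross-check, you can pick the k where the SSE curve bends most sharply, i.e. where its second difference is largest. A minimal sketch, assuming `Sum_of_squared_distances` and `K` from the loop above (this heuristic is one option, not the only way to choose k):

```python
import numpy as np

# second difference of the SSE curve: a large value marks a sharp bend
sse = np.array(Sum_of_squared_distances)
second_diff = sse[:-2] - 2 * sse[1:-1] + sse[2:]
# +1 because second_diff[i] corresponds to K[i+1]
best_k = K[int(np.argmax(second_diff)) + 1]
print('Elbow estimate for k:', best_k)
```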
# DBSCAN - Density-based Clustering
```python
from sklearn.cluster import DBSCAN
# API: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
df_dbscan = df_knn.drop(['GradeLetter', 'Cluster'], axis=1)
display('Data:', HTML(df_dbscan.tail(10).to_html()))

# labels_ is a NumPy array holding the cluster label assigned to each data point
db_default = DBSCAN(eps=0.2, min_samples=3).fit(df_dbscan)
labels = db_default.labels_
df_dbscan['Cluster'] = labels
display('Data after clustering:', HTML(df_dbscan.tail(10).to_html()))

# Visualize the clusters.
# Build the label-to-colour mapping; DBSCAN marks noise points with the label -1.
# You need to figure out how many clusters were produced, then assign that many colours.
colours = {}
colours[0] = 'r'
colours[1] = 'g'
colours[2] = 'b'
colours[-1] = 'k'
# build the colour vector for each data point
cvec = [colours[label] for label in labels]

plt.figure(figsize=(9, 9))
# empty scatters, used only as handles for the legend
r = plt.scatter([], [], color='r')
g = plt.scatter([], [], color='g')
b = plt.scatter([], [], color='b')
k = plt.scatter([], [], color='k')
# plot Age on the X-axis and Exam on the Y-axis,
# coloured according to the colour vector defined above
plt.scatter(df_dbscan['Age'], df_dbscan['Exam'], c=cvec)
# build the legend
plt.legend((r, g, b, k), ('Label 0', 'Label 1', 'Label 2', 'Label -1'))
plt.show()
```
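The hard-coded colour dictionary breaks if DBSCAN produces more than three clusters. A minimal sketch of a mapping that adapts to however many clusters were found, assuming `labels` and `df_dbscan` from above (noise keeps the label -1 and is drawn in black):

```python
import matplotlib.pyplot as plt

# number of clusters found, excluding the noise label -1
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print('Clusters found:', n_clusters)

# one colour per cluster from a discretized colormap; black for noise
cmap = plt.get_cmap('viridis', max(n_clusters, 1))
colour_map = {c: cmap(c) for c in range(n_clusters)}
colour_map[-1] = 'k'
cvec = [colour_map[label] for label in labels]
plt.scatter(df_dbscan['Age'], df_dbscan['Exam'], c=cvec)
plt.show()
```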
# Hierarchical Clustering
```python
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc
# API: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
df_hc = df_knn.drop(['GradeLetter', 'Cluster'], axis=1)
display('Data:', HTML(df_hc.tail(10).to_html()))

# plot the dendrogram
plt.figure(figsize=(8, 8))
plt.title('Visualising the data')
Dendrogram = shc.dendrogram(shc.linkage(df_hc, method='single'))

# cluster based on the dendrogram;
# choose the best K based on the elbow method introduced above
# (the `metric` parameter was named `affinity` in scikit-learn versions before 1.2)
cluster = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='single')
cls = cluster.fit_predict(df_hc)
print(cluster.labels_)

# visualize the clustering
plt.figure(figsize=(6, 6))
plt.scatter(df_hc['Age'], df_hc['Exam'], c=cls, cmap='rainbow')
plt.show()
```
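AgglomerativeClustering has no `inertia_` attribute, so to compare its training result with KMeans by SSE you have to compute the SSE yourself. A minimal sketch, assuming `df_hc` and the labels `cls` from `fit_predict` above:

```python
import numpy as np

# SSE: squared distance of each point to its cluster centroid
X = df_hc.to_numpy(dtype=float)
sse = 0.0
for label in np.unique(cls):
    members = X[cls == label]
    centroid = members.mean(axis=0)
    sse += ((members - centroid) ** 2).sum()
print('Hierarchical clustering SSE:', sse)
```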
# Association Rules
Install the mlxtend library first. Start the Anaconda prompt, then install the package with conda by running one of the following:

```
conda install -c conda-forge mlxtend
conda install -c conda-forge/label/gcc7 mlxtend
conda install -c conda-forge/label/cf201901 mlxtend
```

To fix install issues on Windows, copy the DLL files libcrypto-1_1-x64.* and libssl-1_1-x64.* from "your Anaconda3 folder\Library\bin" to "your Anaconda3 folder\DLLs".
```python
# load data
import numpy as np
import scipy as sp
import pandas as pd
from IPython.display import display, HTML

df = pd.read_csv('data_students.csv')
cols = df.columns
# print out and display the dataframe as an HTML table
display(HTML(df.head(10).to_html()))

# replace missing values in numerical variables with the column mean
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Hours on Assignments"] = df["Hours on Assignments"].fillna(df["Hours on Assignments"].mean())
df["Hours on Games"] = df["Hours on Games"].fillna(df["Hours on Games"].mean())
df["Exam"] = df["Exam"].fillna(df["Exam"].mean())
df["Grade"] = df["Grade"].fillna(df["Grade"].mean())

# check again whether there are missing values
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype, ',', df[i].isnull().any())

# remove the ID column
df = df.drop('ID', axis=1)
display(HTML(df.head(10).to_html()))

# Data preprocessing
print('Column Datatypes:\n', df.dtypes)

# convert all numerical variables to nominal variables (3 equal-width bins each)
df_nb = df.copy(deep=True)
df_nb['Gender'] = df_nb['Gender'].astype(str)
df_nb['Age'] = pd.cut(df_nb['Age'], 3)
df_nb['Hours on Readings'] = pd.cut(df_nb['Hours on Readings'], 3)
df_nb['Hours on Assignments'] = pd.cut(df_nb['Hours on Assignments'], 3)
df_nb['Hours on Games'] = pd.cut(df_nb['Hours on Games'], 3)
df_nb['Hours on Internet'] = pd.cut(df_nb['Hours on Internet'], 3)
df_nb['Exam'] = pd.cut(df_nb['Exam'], 3)
df_nb['Grade'] = pd.cut(df_nb['Grade'], 3)
display('Data Example', HTML(df_nb.head(5).to_html()))
```
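pd.cut returns Interval objects, which below become items of the form `Exam=<interval>`. If you prefer readable item names in the mined rules, pd.cut also accepts explicit bin labels; a minimal sketch with hypothetical label names (not part of the original preprocessing):

```python
# hypothetical human-readable bin labels instead of raw intervals
df_nb['Exam'] = pd.cut(df['Exam'], 3, labels=['low', 'mid', 'high'])
```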
```python
# Association Rule Mining
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
import matplotlib.pyplot as plt

print(df_nb.dtypes)
# convert all columns to strings, prefixed with the column name
# so that each value becomes a distinct item
df_nb = df_nb.astype(str)
df_nb['Gender'] = 'Gender=' + df_nb['Gender'].astype(str)
df_nb['Age'] = 'Age=' + df_nb['Age'].astype(str)
df_nb['Hours on Readings'] = 'Readings=' + df_nb['Hours on Readings'].astype(str)
df_nb['Hours on Assignments'] = 'Assignments=' + df_nb['Hours on Assignments'].astype(str)
df_nb['Hours on Games'] = 'Games=' + df_nb['Hours on Games'].astype(str)
df_nb['Hours on Internet'] = 'Internet=' + df_nb['Hours on Internet'].astype(str)
df_nb['Exam'] = 'Exam=' + df_nb['Exam'].astype(str)
df_nb['Grade'] = 'Grade=' + df_nb['Grade'].astype(str)
print(df_nb.dtypes)

# convert the dataframe to a list of transactions (one list of items per row)
df_arr = df_nb.stack().groupby(level=0).apply(list).tolist()

# encode the lists as transactions: one boolean (TRUE/FALSE) column per item
te = TransactionEncoder()
df_transactions = te.fit_transform(df_arr)
df_rules = pd.DataFrame(df_transactions, columns=te.columns_)
display('Data Example', HTML(df_rules.head(5).to_html()))

# API: http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
frequent_itemsets = apriori(df_rules, min_support=0.45, use_colnames=True)
# API: http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
display('Rules', HTML(rules.to_html()))
```
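association_rules returns a dataframe with support, confidence, and lift columns, so ranking or filtering the rules is ordinary pandas work. A minimal sketch for surfacing candidate interesting rules, assuming the `rules` dataframe from above (the lift threshold of 1.0 is an illustrative choice):

```python
# keep rules where antecedent and consequent are positively associated,
# then rank by confidence and lift
interesting = rules[rules['lift'] > 1.0]
interesting = interesting.sort_values(['confidence', 'lift'], ascending=False)
display('Top rules', HTML(interesting.head(10).to_html()))
```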
# Outlier Detection
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import LocalOutlierFactor
# API: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html

# Data preprocessing: prepare a numerical feature matrix, preferably normalized
print('Column Datatypes:\n', df.dtypes)

# convert all nominal variables to binary (dummy) variables
df_raw = df.copy(deep=True)
df_knn = df.copy(deep=True)
# create new binary columns
df_dummies = pd.get_dummies(df_knn[['Degree', 'Nationality']])
# add them to the dataframe
df_knn = df_knn.join(df_dummies)
# drop the original columns
df_knn = df_knn.drop('Degree', axis=1)
df_knn = df_knn.drop('Nationality', axis=1)
display('Data Example:', HTML(df_knn.head(10).to_html()))

# normalize all numerical features
# find numeric columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
cols_numeric = df_knn.select_dtypes(include=numerics).columns.tolist()
print('Selected numerical columns:\n', cols_numeric)
# min-max normalization to scale each column to [0, 1]
for col in cols_numeric:
    df_knn[col] = (df_knn[col] - df_knn[col].min()) / (df_knn[col].max() - df_knn[col].min())
df_knn = df_knn.drop("GradeLetter", axis=1)
display(HTML(df_knn.head(10).to_html()))

# plot the data points
plt.scatter(df_knn["Age"], df_knn["Exam"], color="b")
plt.grid()
plt.show()

# model specification
model1 = LocalOutlierFactor(n_neighbors=3, metric="euclidean")
# model fitting
y_pred = model1.fit_predict(df_knn)
# filter outlier indices (LOF labels outliers as -1)
outlier_index = np.where(y_pred == -1)
print("outlier indices: ", outlier_index)
# filter outlier values
outlier_values = df_knn.iloc[outlier_index]
# plot the normalized data points, with the outliers in red
plt.scatter(df_knn["Age"], df_knn["Exam"], color="b")
plt.scatter(outlier_values["Age"], outlier_values["Exam"], color="r")
plt.grid()
plt.show()
```
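Removing the LOF outliers and refitting KMeans shows whether the outliers were inflating the SSE, which is also what the assignment below asks you to check. A minimal sketch, assuming `df_knn` and the LOF predictions `y_pred` from above, and using k=4 as in the KMeans example earlier:

```python
from sklearn.cluster import KMeans

# LOF labels inliers as 1 and outliers as -1
df_inliers = df_knn[y_pred == 1]

# refit KMeans with and without the outliers and compare SSE (inertia_);
# note that dropping points lowers SSE mechanically, so also compare per-point SSE
km_all = KMeans(n_clusters=4, random_state=1).fit(df_knn)
km_in = KMeans(n_clusters=4, random_state=1).fit(df_inliers)
print('SSE with outliers:   ', km_all.inertia_)
print('SSE without outliers:', km_in.inertia_)
print('Per-point SSE without outliers:', km_in.inertia_ / len(df_inliers))
```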
# Assignment
# Clustering
- Ignore the column related to loan term
- Perform KMeans, DBSCAN, and hierarchical clustering
- Determine the best K in KMeans and use the same K in hierarchical clustering
- Evaluate the training process of these models by comparing SSE
- Evaluate the outputs by fusing them into your previous classification task
# Association Rules
- Produce the rules by trying different confidence and support values
- Pick the top interesting/useful rules and explain them, e.g., why they are valuable or interesting
# Outlier Detection
- Identify outliers by using LOF
- Re-run KMeans clustering to see whether SSE can be reduced