# Clustering
# Introduction
Clustering is an unsupervised learning method that can be used to discover underlying patterns in the structure of the data.
For example, it can be used to group unlabelled data into sets of similar records.
# K-Means Clustering
Requirements: numerical and normalized features
import numpy as np
import scipy as sp
import pandas as pd
from IPython.display import display, HTML

# Load the student records from the CSV file.
df = pd.read_csv('data_students.csv')
# Column names are captured before 'ID' is dropped, so the missing-value
# report below still iterates over every loaded column.
cols = df.columns

# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))

# Replace missing values in these numerical variables with the column mean.
# The filled column is assigned back to df instead of calling
# fillna(inplace=True) on df[col]: the in-place form operates on a temporary
# selection (chained assignment), which pandas warns about and which leaves
# df unchanged when Copy-on-Write is enabled.
for col in ["Age", "Hours on Assignments", "Hours on Games", "Exam", "Grade"]:
    df[col] = df[col].fillna(df[col].mean())

# check again whether there are missing values
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype, ',', df[i].isnull().any())

# remove column ID
# axis is passed by keyword: the positional form drop('ID', 1) is rejected
# by pandas 2.0 and later.
df = df.drop('ID', axis=1)

# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))
# Data preprocessing
print('Column Datatypes:\n', df.dtypes)

# convert all nominal variables to binary variables
# Two independent deep copies: df_raw is left as loaded, df_knn is transformed.
df_raw = df.copy(deep=True)
df_knn = df.copy(deep=True)

# create new binary (one-hot) columns for the nominal variables
df_dummies = pd.get_dummies(df_knn[['Degree', 'Nationality']])
# add them to dataframe
df_knn = df_knn.join(df_dummies)
# drop original columns
df_knn = df_knn.drop(['Degree', 'Nationality'], axis=1)
display('Data Example:', HTML(df_knn.head(10).to_html()))

# Normalize all numerical features
# find numeric columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
cols_numeric = df_knn.select_dtypes(include=numerics).columns.tolist()
print('Selected numerical columns:\n', cols_numeric)

# min-max normalization to scale [0, 1]
for col in cols_numeric:
    col_min = df_knn[col].min()
    col_range = df_knn[col].max() - col_min
    if col_range == 0:
        # A constant column would give 0/0 = NaN for every row, and the
        # clustering estimators fitted later reject NaN input. Such a column
        # carries no information, so map it to 0.
        df_knn[col] = 0.0
    else:
        df_knn[col] = (df_knn[col] - col_min) / col_range

# We ignore the label column
df_kmeans = df_knn.drop('GradeLetter', axis=1)
display(HTML(df_kmeans.head(10).to_html()))
# KMeans clustering
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cluster import KMeans

# API, https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
# Build a seeded 4-cluster model and fit it in one expression
# (fit() returns the estimator itself).
kmeans = KMeans(
    n_clusters=4,
    random_state=1,
    max_iter=200,
).fit(df_kmeans)

# Predict the cluster of every training row and use it to colour an
# Exam-vs-Grade scatter plot.
y_pred = kmeans.predict(df_kmeans)
plt.scatter(
    df_kmeans['Exam'],
    df_kmeans['Grade'],
    c=y_pred,
    cmap='viridis',
)

# get the cluster labels and add them back to the data frame that still
# holds the GradeLetter column
opt = kmeans.labels_
df_knn['Cluster'] = opt
display('Data:', HTML(df_knn.tail(10).to_html()))
# try different K values and find the best K for KMeans
# Assumption: the smaller the SSE (inertia), the better
Sum_of_squared_distances = []
K = range(1, 15)
for k in K:
    # A fixed seed makes the elbow curve reproducible between runs; without
    # it each run starts from different random centroids and the curve
    # (and the apparent elbow) can shift.
    km = KMeans(n_clusters=k, random_state=1)
    km = km.fit(df_kmeans)
    # inertia_ is the fitted model's sum of squared distances.
    Sum_of_squared_distances.append(km.inertia_)

# Plot K against SSE and observe which K is better.
# In the plot, the elbow of the curve is the optimal k.
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
# DBSCAN - Density-based Clustering
from sklearn.cluster import DBSCAN
# API: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html

# Cluster on the feature columns only: drop the GradeLetter label and the
# K-Means assignment stored in 'Cluster'.
df_dbscan = df_knn.drop(['GradeLetter', 'Cluster'], axis=1)
display('Data:', HTML(df_dbscan.tail(10).to_html()))

# Numpy array of all the cluster labels assigned to each data point
db_default = DBSCAN(eps=0.2, min_samples=3).fit(df_dbscan)
labels = db_default.labels_
df_dbscan['Cluster'] = labels
display('Data after clustering:', HTML(df_dbscan.tail(10).to_html()))

# Visualize the clusters
# Building the label to colour mapping.
# DBSCAN chooses the number of clusters itself, so the mapping is derived
# from the labels actually produced instead of assuming exactly {0, 1, 2, -1};
# a fixed mapping raises KeyError as soon as a fourth cluster appears.
# Label -1 is drawn in black, the first three clusters keep red/green/blue,
# and the palette is reused cyclically if there are more clusters than colours.
base_colours = ['r', 'g', 'b', 'c', 'm', 'y',
                'tab:orange', 'tab:purple', 'tab:brown', 'tab:pink']
colours = {}
for label in sorted(set(labels)):
    if label == -1:
        colours[label] = 'k'
    else:
        colours[label] = base_colours[label % len(base_colours)]

# Plot Age on the X-axis and Exam on the Y-axis, one scatter call per label
# so that every legend entry corresponds to points that are really present.
# Cluster labels come first in ascending order, label -1 last.
plt.figure(figsize=(9, 9))
for label in sorted(colours, key=lambda value: (value == -1, value)):
    members = df_dbscan[df_dbscan['Cluster'] == label]
    plt.scatter(members['Age'], members['Exam'],
                color=colours[label], label='Label {}'.format(label))

# Building the legend from the labelled scatter calls above
plt.legend()
plt.show()
# Hierarchical Clustering
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc
# API, https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html

# Cluster on the feature columns only: drop the GradeLetter label and the
# K-Means assignment stored in 'Cluster'.
df_hc = df_knn.drop(['GradeLetter', 'Cluster'], axis=1)
display('Data:', HTML(df_hc.tail(10).to_html()))

# Plot Dendrogram (single-linkage)
plt.figure(figsize=(8, 8))
plt.title('Visualising the data')
Dendrogram = shc.dendrogram(shc.linkage(df_hc, method='single'))

# Clustering based on the Dendrogram
# choose best K based on elbow method introduced above
# The distance metric is left at its default (Euclidean). The former
# `affinity=` keyword was deprecated in scikit-learn 1.2 and removed in 1.4,
# so omitting it keeps the same behaviour on both old and new releases.
cluster = AgglomerativeClustering(n_clusters=3, linkage='single')
cls = cluster.fit_predict(df_hc)
print(cluster.labels_)

# Visualizing the clustering
# Plot from df_hc, the frame that was actually clustered, so this cell does
# not depend on the DBSCAN cell having been run first.
plt.figure(figsize=(6, 6))
plt.scatter(df_hc['Age'], df_hc['Exam'],
            c=cls, cmap='rainbow')
plt.show()
# Association Rules
Install the mlxtend library first.
Start the Anaconda Prompt. To install this package with conda, run one of the following:
conda install -c conda-forge mlxtend
conda install -c conda-forge/label/gcc7 mlxtend
conda install -c conda-forge/label/cf201901 mlxtend
To fix install issues on Windows,
copy the following DLL files: libcrypto-1_1-x64.*
from "your Anaconda3 folder\Library\bin" to "your Anaconda3 folder\DLLs"
# load data
import numpy as np
import scipy as sp
import pandas as pd
from IPython.display import display, HTML

# Reload the student records from the CSV file.
df = pd.read_csv('data_students.csv')
# Column names are captured before 'ID' is dropped, so the missing-value
# report below still iterates over every loaded column.
cols = df.columns

# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))

# Replace missing values in these numerical variables with the column mean.
# The filled column is assigned back to df instead of calling
# fillna(inplace=True) on df[col]: the in-place form operates on a temporary
# selection (chained assignment), which pandas warns about and which leaves
# df unchanged when Copy-on-Write is enabled.
for col in ["Age", "Hours on Assignments", "Hours on Games", "Exam", "Grade"]:
    df[col] = df[col].fillna(df[col].mean())

# check again whether there are missing values
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype, ',', df[i].isnull().any())

# remove column ID
# axis is passed by keyword: the positional form drop('ID', 1) is rejected
# by pandas 2.0 and later.
df = df.drop('ID', axis=1)

# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))

# Data preprocessing
print('Column Datatypes:\n', df.dtypes)
# convert all numerical variables to nominal variables
df_nb = df.copy(deep=True)

# Gender is turned into text directly; it is not binned.
df_nb['Gender'] = df_nb['Gender'].astype(str)

# Every other numerical column is cut into 3 equal-width intervals.
columns_to_bin = [
    'Age',
    'Hours on Readings',
    'Hours on Assignments',
    'Hours on Games',
    'Hours on Internet',
    'Exam',
    'Grade',
]
for column_name in columns_to_bin:
    df_nb[column_name] = pd.cut(df_nb[column_name], 3)

display('Data Example', HTML(df_nb.head(5).to_html()))
# Association Rule Mining
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
import matplotlib.pyplot as plt

print(df_nb.dtypes)

# convert all columns to strings
df_nb = df_nb.astype(str)

# Tag each value with a short name for its column, so the originating
# column stays visible in every item (e.g. 'Age=...' vs 'Exam=...').
item_prefixes = [
    ('Gender', 'Gender='),
    ('Age', 'Age='),
    ('Hours on Readings', 'Readings='),
    ('Hours on Assignments', 'Assignments='),
    ('Hours on Games', 'Games='),
    ('Hours on Internet', 'Internet='),
    ('Exam', 'Exam='),
    ('Grade', 'Grade='),
]
for column_name, prefix in item_prefixes:
    df_nb[column_name] = prefix + df_nb[column_name].astype(str)
print(df_nb.dtypes)

# convert data frame to lists: one list of cell values per row
items_per_row = df_nb.stack().groupby(level=0).apply(list)
df_arr = items_per_row.tolist()

# Encode lists to transactions
te = TransactionEncoder()
df_transactions = te.fit_transform(df_arr)

# convert the values to booleans (TRUE and FALSE), one column per item
df_rules = pd.DataFrame(df_transactions, columns=te.columns_)
display('Data Example', HTML(df_rules.head(5).to_html()))

# API, http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
frequent_itemsets = apriori(
    df_rules,
    min_support=0.45,
    use_colnames=True,
)

# API, http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.7,
)
display('Rules', HTML(rules.to_html()))
# Outlier Detection
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import LocalOutlierFactor
# API, https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor

# Data preprocessing
# Prepare a numerical feature matrix, better to be normalized
print('Column Datatypes:\n', df.dtypes)

# convert all nominal variables to binary variables
# Two independent deep copies: df_raw is left as loaded, df_knn is transformed.
df_raw = df.copy(deep=True)
df_knn = df.copy(deep=True)

# create new binary (one-hot) columns for the nominal variables
df_dummies = pd.get_dummies(df_knn[['Degree', 'Nationality']])
# add them to dataframe
df_knn = df_knn.join(df_dummies)
# drop original columns
df_knn = df_knn.drop(['Degree', 'Nationality'], axis=1)
display('Data Example:', HTML(df_knn.head(10).to_html()))

# Normalize all numerical features
# find numeric columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
cols_numeric = df_knn.select_dtypes(include=numerics).columns.tolist()
print('Selected numerical columns:\n', cols_numeric)

# min-max normalization to scale [0, 1]
for col in cols_numeric:
    col_min = df_knn[col].min()
    col_range = df_knn[col].max() - col_min
    if col_range == 0:
        # A constant column would give 0/0 = NaN for every row, which the
        # estimator fitted below cannot handle; map it to 0 instead.
        df_knn[col] = 0.0
    else:
        df_knn[col] = (df_knn[col] - col_min) / col_range

# axis is passed by keyword: the positional form drop("GradeLetter", 1) is
# rejected by pandas 2.0 and later.
df_knn = df_knn.drop("GradeLetter", axis=1)
display(HTML(df_knn.head(10).to_html()))

# API, https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor
# plot data points
plt.scatter(df_knn["Age"], df_knn["Exam"], color="b")
plt.grid()
plt.show()

# model specification
model1 = LocalOutlierFactor(n_neighbors=3, metric="euclidean")
# model fitting
y_pred = model1.fit_predict(df_knn)

# filter outlier index
outlier_index = np.where(y_pred == -1)  # negative values are outliers
print("outlier indices: ", outlier_index)

# filter outlier values
outlier_values = df_knn.iloc[outlier_index]

# plot data
# All points are drawn from the normalized df_knn, the same frame the
# outliers are taken from. Drawing the background from the raw df would put
# the blue points on the original Age/Exam scale and the red outliers on the
# [0, 1] scale, so the two layers would not line up.
plt.scatter(df_knn["Age"], df_knn["Exam"], color="b")
# plot outlier values
plt.scatter(outlier_values["Age"], outlier_values["Exam"], color="r")
plt.grid()
plt.show()
# Assignment
# Clustering
- Ignore the column related to loan term
  忽略与贷款期限相关的列
- Perform Kmeans, DBSCAN and hierarchical clustering
  执行 Kmeans, DBSCAN 和层次聚类
- Determine the best K in Kmeans and use the same K in hierarchical clustering
  确定 Kmeans 中的最佳 K,并在层次聚类中使用相同的 K
- Evaluate the training process of these models by comparing SSE
  通过比较 SSE 评估这些模型的训练过程
- Evaluate the outputs by fusing them into your previous classification task
# Association Rules
- Produce the rules by trying different confidence and support values
  通过尝试不同的置信度和支持值来生成规则
- Pick the top interesting/useful rules and explain them, e.g., why they are valuable or interesting
  找出最有趣 / 有用的规则,并解释它们,例如,为什么它们有价值或有趣
# Outlier Detection
- Identify outliers by using LOF
  利用 LOF 识别离群值
- Re-run Kmeans clustering to see whether SSE can be reduced
  重新运行 Kmeans 聚类,看看 SSE 是否可以降低