# Clustering
# Introduction
Clustering is an unsupervised learning method that can be used to discover underlying patterns in the structure of the data. For example, it can be used to group unlabelled data.
# K-Means Clustering
Requirements: numerical and normalized features
```python
import numpy as np
import scipy as sp
import pandas as pd
from IPython.display import display, HTML

df = pd.read_csv('data_students.csv')
cols = df.columns
# print out and display the dataframe as an HTML table
display(HTML(df.head(10).to_html()))

# replace missing values in numerical variables with the column mean
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Hours on Assignments"] = df["Hours on Assignments"].fillna(df["Hours on Assignments"].mean())
df["Hours on Games"] = df["Hours on Games"].fillna(df["Hours on Games"].mean())
df["Exam"] = df["Exam"].fillna(df["Exam"].mean())
df["Grade"] = df["Grade"].fillna(df["Grade"].mean())

# check again whether there are missing values
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype, ',', df[i].isnull().any())

# remove the ID column
df = df.drop('ID', axis=1)
# print out and display the dataframe as an HTML table
display(HTML(df.head(10).to_html()))
```
```python
# Data preprocessing
print('Column Datatypes:\n', df.dtypes)

# convert all nominal variables to binary (dummy) variables
df_raw = df.copy(deep=True)
df_knn = df.copy(deep=True)
# create new binary columns
df_dummies = pd.get_dummies(df_knn[['Degree', 'Nationality']])
# add them to the dataframe
df_knn = df_knn.join(df_dummies)
# drop the original columns
df_knn = df_knn.drop('Degree', axis=1)
df_knn = df_knn.drop('Nationality', axis=1)
display('Data Example:', HTML(df_knn.head(10).to_html()))

# normalize all numerical features
# find numeric columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
cols_numeric = df_knn.select_dtypes(include=numerics).columns.tolist()
print('Selected numerical columns:\n', cols_numeric)
# min-max normalization to scale each column to [0, 1]
for col in cols_numeric:
    df_knn[col] = (df_knn[col] - df_knn[col].min()) / (df_knn[col].max() - df_knn[col].min())

# ignore the label column for clustering
df_kmeans = df_knn.drop('GradeLetter', axis=1)
display(HTML(df_kmeans.head(10).to_html()))
```
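The manual loop above is equivalent to scikit-learn's MinMaxScaler; a minimal alternative sketch, shown only for reference (the loop already produces the same scaling):

```python
from sklearn.preprocessing import MinMaxScaler

# fit_transform scales each selected column to [0, 1], like the loop above
scaler = MinMaxScaler()
df_knn[cols_numeric] = scaler.fit_transform(df_knn[cols_numeric])
```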
```python
# KMeans clustering
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cluster import KMeans
# API: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
kmeans = KMeans(n_clusters=4, random_state=1, max_iter=200)
kmeans.fit(df_kmeans)
y_pred = kmeans.predict(df_kmeans)
plt.scatter(df_kmeans['Exam'], df_kmeans['Grade'], c=y_pred, cmap='viridis')
plt.show()

# get the cluster labels and add them back to the original data
opt = kmeans.labels_
df_knn['Cluster'] = opt
display('Data:', HTML(df_knn.tail(10).to_html()))

# try different values of K and find the best K for KMeans
# assumption: the smaller the SSE, the better
Sum_of_squared_distances = []
K = range(1, 15)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(df_kmeans)
    Sum_of_squared_distances.append(km.inertia_)

# plot K against SSE; the elbow of the curve marks the optimal k
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
```
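Reading the elbow off the plot is subjective. As a rough programmatic cross-check, you can pick the k where the SSE curve bends most sharply, i.e. where its second difference is largest. A minimal sketch, assuming `Sum_of_squared_distances` and `K` from the loop above (this heuristic is one option, not the only way to choose k):

```python
import numpy as np

# second difference of the SSE curve: a large value marks a sharp bend
sse = np.array(Sum_of_squared_distances)
second_diff = sse[:-2] - 2 * sse[1:-1] + sse[2:]
# +1 because second_diff[i] corresponds to K[i+1]
best_k = K[int(np.argmax(second_diff)) + 1]
print('Elbow estimate for k:', best_k)
```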
# DBSCAN - Density-based Clustering
```python
from sklearn.cluster import DBSCAN
# API: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
df_dbscan = df_knn.drop(['GradeLetter', 'Cluster'], axis=1)
display('Data:', HTML(df_dbscan.tail(10).to_html()))

# labels_ is a NumPy array holding the cluster label assigned to each data point
db_default = DBSCAN(eps=0.2, min_samples=3).fit(df_dbscan)
labels = db_default.labels_
df_dbscan['Cluster'] = labels
display('Data after clustering:', HTML(df_dbscan.tail(10).to_html()))

# Visualize the clusters.
# Build the label-to-colour mapping; DBSCAN marks noise points with the label -1.
# You need to figure out how many clusters were produced, then assign that many colours.
colours = {}
colours[0] = 'r'
colours[1] = 'g'
colours[2] = 'b'
colours[-1] = 'k'
# build the colour vector for each data point
cvec = [colours[label] for label in labels]

plt.figure(figsize=(9, 9))
# empty scatters, used only as handles for the legend
r = plt.scatter([], [], color='r')
g = plt.scatter([], [], color='g')
b = plt.scatter([], [], color='b')
k = plt.scatter([], [], color='k')
# plot Age on the X-axis and Exam on the Y-axis,
# coloured according to the colour vector defined above
plt.scatter(df_dbscan['Age'], df_dbscan['Exam'], c=cvec)
# build the legend
plt.legend((r, g, b, k), ('Label 0', 'Label 1', 'Label 2', 'Label -1'))
plt.show()
```
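The hard-coded colour dictionary breaks if DBSCAN produces more than three clusters. A minimal sketch of a mapping that adapts to however many clusters were found, assuming `labels` and `df_dbscan` from above (noise keeps the label -1 and is drawn in black):

```python
import matplotlib.pyplot as plt

# number of clusters found, excluding the noise label -1
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print('Clusters found:', n_clusters)

# one colour per cluster from a discretized colormap; black for noise
cmap = plt.get_cmap('viridis', max(n_clusters, 1))
colour_map = {c: cmap(c) for c in range(n_clusters)}
colour_map[-1] = 'k'
cvec = [colour_map[label] for label in labels]
plt.scatter(df_dbscan['Age'], df_dbscan['Exam'], c=cvec)
plt.show()
```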
# Hierarchical Clustering
```python
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc
# API: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
df_hc = df_knn.drop(['GradeLetter', 'Cluster'], axis=1)
display('Data:', HTML(df_hc.tail(10).to_html()))

# plot the dendrogram
plt.figure(figsize=(8, 8))
plt.title('Visualising the data')
Dendrogram = shc.dendrogram(shc.linkage(df_hc, method='single'))

# cluster based on the dendrogram;
# choose the best K based on the elbow method introduced above
# (the `metric` parameter was named `affinity` in scikit-learn versions before 1.2)
cluster = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='single')
cls = cluster.fit_predict(df_hc)
print(cluster.labels_)

# visualize the clustering
plt.figure(figsize=(6, 6))
plt.scatter(df_hc['Age'], df_hc['Exam'], c=cls, cmap='rainbow')
plt.show()
```
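AgglomerativeClustering has no `inertia_` attribute, so to compare its training result with KMeans by SSE you have to compute the SSE yourself. A minimal sketch, assuming `df_hc` and the labels `cls` from `fit_predict` above:

```python
import numpy as np

# SSE: squared distance of each point to its cluster centroid
X = df_hc.to_numpy(dtype=float)
sse = 0.0
for label in np.unique(cls):
    members = X[cls == label]
    centroid = members.mean(axis=0)
    sse += ((members - centroid) ** 2).sum()
print('Hierarchical clustering SSE:', sse)
```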
# Association Rules
Install the mlxtend library first. Start the Anaconda prompt, then install the package with conda by running one of the following:

```
conda install -c conda-forge mlxtend
conda install -c conda-forge/label/gcc7 mlxtend
conda install -c conda-forge/label/cf201901 mlxtend
```

To fix install issues on Windows, copy the DLL files libcrypto-1_1-x64.* and libssl-1_1-x64.* from "your Anaconda3 folder\Library\bin" to "your Anaconda3 folder\DLLs".
```python
# load data
import numpy as np
import scipy as sp
import pandas as pd
from IPython.display import display, HTML

df = pd.read_csv('data_students.csv')
cols = df.columns
# print out and display the dataframe as an HTML table
display(HTML(df.head(10).to_html()))

# replace missing values in numerical variables with the column mean
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Hours on Assignments"] = df["Hours on Assignments"].fillna(df["Hours on Assignments"].mean())
df["Hours on Games"] = df["Hours on Games"].fillna(df["Hours on Games"].mean())
df["Exam"] = df["Exam"].fillna(df["Exam"].mean())
df["Grade"] = df["Grade"].fillna(df["Grade"].mean())

# check again whether there are missing values
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype, ',', df[i].isnull().any())

# remove the ID column
df = df.drop('ID', axis=1)
display(HTML(df.head(10).to_html()))

# Data preprocessing
print('Column Datatypes:\n', df.dtypes)

# convert all numerical variables to nominal variables (3 equal-width bins each)
df_nb = df.copy(deep=True)
df_nb['Gender'] = df_nb['Gender'].astype(str)
df_nb['Age'] = pd.cut(df_nb['Age'], 3)
df_nb['Hours on Readings'] = pd.cut(df_nb['Hours on Readings'], 3)
df_nb['Hours on Assignments'] = pd.cut(df_nb['Hours on Assignments'], 3)
df_nb['Hours on Games'] = pd.cut(df_nb['Hours on Games'], 3)
df_nb['Hours on Internet'] = pd.cut(df_nb['Hours on Internet'], 3)
df_nb['Exam'] = pd.cut(df_nb['Exam'], 3)
df_nb['Grade'] = pd.cut(df_nb['Grade'], 3)
display('Data Example', HTML(df_nb.head(5).to_html()))
```
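pd.cut returns Interval objects, which below become items of the form `Exam=<interval>`. If you prefer readable item names in the mined rules, pd.cut also accepts explicit bin labels; a minimal sketch with hypothetical label names (not part of the original preprocessing):

```python
# hypothetical human-readable bin labels instead of raw intervals
df_nb['Exam'] = pd.cut(df['Exam'], 3, labels=['low', 'mid', 'high'])
```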
```python
# Association Rule Mining
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
import matplotlib.pyplot as plt

print(df_nb.dtypes)
# convert all columns to strings, prefixed with the column name
# so that each value becomes a distinct item
df_nb = df_nb.astype(str)
df_nb['Gender'] = 'Gender=' + df_nb['Gender'].astype(str)
df_nb['Age'] = 'Age=' + df_nb['Age'].astype(str)
df_nb['Hours on Readings'] = 'Readings=' + df_nb['Hours on Readings'].astype(str)
df_nb['Hours on Assignments'] = 'Assignments=' + df_nb['Hours on Assignments'].astype(str)
df_nb['Hours on Games'] = 'Games=' + df_nb['Hours on Games'].astype(str)
df_nb['Hours on Internet'] = 'Internet=' + df_nb['Hours on Internet'].astype(str)
df_nb['Exam'] = 'Exam=' + df_nb['Exam'].astype(str)
df_nb['Grade'] = 'Grade=' + df_nb['Grade'].astype(str)
print(df_nb.dtypes)

# convert the dataframe to a list of transactions (one list of items per row)
df_arr = df_nb.stack().groupby(level=0).apply(list).tolist()

# encode the lists as transactions: one boolean (TRUE/FALSE) column per item
te = TransactionEncoder()
df_transactions = te.fit_transform(df_arr)
df_rules = pd.DataFrame(df_transactions, columns=te.columns_)
display('Data Example', HTML(df_rules.head(5).to_html()))

# API: http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
frequent_itemsets = apriori(df_rules, min_support=0.45, use_colnames=True)
# API: http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
display('Rules', HTML(rules.to_html()))
```
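association_rules returns a dataframe with support, confidence, and lift columns, so ranking or filtering the rules is ordinary pandas work. A minimal sketch for surfacing candidate interesting rules, assuming the `rules` dataframe from above (the lift threshold of 1.0 is an illustrative choice):

```python
# keep rules where antecedent and consequent are positively associated,
# then rank by confidence and lift
interesting = rules[rules['lift'] > 1.0]
interesting = interesting.sort_values(['confidence', 'lift'], ascending=False)
display('Top rules', HTML(interesting.head(10).to_html()))
```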
# Outlier Detection
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import LocalOutlierFactor
# API: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html

# Data preprocessing: prepare a numerical feature matrix, preferably normalized
print('Column Datatypes:\n', df.dtypes)

# convert all nominal variables to binary (dummy) variables
df_raw = df.copy(deep=True)
df_knn = df.copy(deep=True)
# create new binary columns
df_dummies = pd.get_dummies(df_knn[['Degree', 'Nationality']])
# add them to the dataframe
df_knn = df_knn.join(df_dummies)
# drop the original columns
df_knn = df_knn.drop('Degree', axis=1)
df_knn = df_knn.drop('Nationality', axis=1)
display('Data Example:', HTML(df_knn.head(10).to_html()))

# normalize all numerical features
# find numeric columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
cols_numeric = df_knn.select_dtypes(include=numerics).columns.tolist()
print('Selected numerical columns:\n', cols_numeric)
# min-max normalization to scale each column to [0, 1]
for col in cols_numeric:
    df_knn[col] = (df_knn[col] - df_knn[col].min()) / (df_knn[col].max() - df_knn[col].min())
df_knn = df_knn.drop("GradeLetter", axis=1)
display(HTML(df_knn.head(10).to_html()))

# plot the data points
plt.scatter(df_knn["Age"], df_knn["Exam"], color="b")
plt.grid()
plt.show()

# model specification
model1 = LocalOutlierFactor(n_neighbors=3, metric="euclidean")
# model fitting
y_pred = model1.fit_predict(df_knn)
# filter outlier indices (LOF labels outliers as -1)
outlier_index = np.where(y_pred == -1)
print("outlier indices: ", outlier_index)
# filter outlier values
outlier_values = df_knn.iloc[outlier_index]
# plot the normalized data points, with the outliers in red
plt.scatter(df_knn["Age"], df_knn["Exam"], color="b")
plt.scatter(outlier_values["Age"], outlier_values["Exam"], color="r")
plt.grid()
plt.show()
```
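Removing the LOF outliers and refitting KMeans shows whether the outliers were inflating the SSE, which is also what the assignment below asks you to check. A minimal sketch, assuming `df_knn` and the LOF predictions `y_pred` from above, and using k=4 as in the KMeans example earlier:

```python
from sklearn.cluster import KMeans

# LOF labels inliers as 1 and outliers as -1
df_inliers = df_knn[y_pred == 1]

# refit KMeans with and without the outliers and compare SSE (inertia_);
# note that dropping points lowers SSE mechanically, so also compare per-point SSE
km_all = KMeans(n_clusters=4, random_state=1).fit(df_knn)
km_in = KMeans(n_clusters=4, random_state=1).fit(df_inliers)
print('SSE with outliers:   ', km_all.inertia_)
print('SSE without outliers:', km_in.inertia_)
print('Per-point SSE without outliers:', km_in.inertia_ / len(df_inliers))
```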
# Assignment
# Clustering
- Ignore the column related to loan term
- Perform KMeans, DBSCAN, and hierarchical clustering
- Determine the best K in KMeans and use the same K in hierarchical clustering
- Evaluate the training process of these models by comparing SSE
- Evaluate the outputs by fusing them into your previous classification task
# Association Rules
- Produce the rules by trying different confidence and support values
- Pick the top interesting/useful rules and explain them, e.g., why they are valuable or interesting
# Outlier Detection
- Identify outliers by using LOF
- Re-run KMeans clustering to see whether SSE can be reduced