# Introduction
Classification is a kind of supervised learning. It is used to predict the value of a nominal variable, which is also called the 'label'. The factors used for prediction are called features.
import numpy as np
import scipy as sp
import pandas as pd
from IPython.display import display, HTML

df = pd.read_csv('data_students.csv')
cols = df.columns
# print out and display the dataframe as an HTML table
display(HTML(df.head(10).to_html()))
# replace missing values in numerical variables with the column mean
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Hours on Assignments"] = df["Hours on Assignments"].fillna(df["Hours on Assignments"].mean())
df["Hours on Games"] = df["Hours on Games"].fillna(df["Hours on Games"].mean())
df["Exam"] = df["Exam"].fillna(df["Exam"].mean())
df["Grade"] = df["Grade"].fillna(df["Grade"].mean())
# check again whether there are missing values
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype, ',', df[i].isnull().any())
# remove the ID column, since an identifier carries no predictive information
df = df.drop('ID', axis=1)
# print out and display the dataframe as an HTML table
display(HTML(df.head(10).to_html()))
# KNN Classifier
Requirements: 1) numerical features; 2) normalized features.
Parameters: the distance measure and the value of K. A short sketch of why normalization matters for the distance computation follows.
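As a quick, hedged illustration (the numbers below are made up, not taken from data_students.csv): without min-max scaling, the feature with the larger range dominates the Euclidean distance that KNN relies on.

import numpy as np

# toy points as (Age, Exam): hypothetical values for illustration only
a = np.array([20.0, 95.0])
b = np.array([22.0, 40.0])
c = np.array([45.0, 93.0])
# raw distances: the Exam axis (range ~40-95) dominates the Age axis
print('raw d(a,b) =', np.linalg.norm(a - b))  # large, driven almost entirely by Exam
print('raw d(a,c) =', np.linalg.norm(a - c))
# min-max scale each feature to [0, 1] using the toy min/max per column
pts = np.vstack([a, b, c])
scaled = (pts - pts.min(axis=0)) / (pts.max(axis=0) - pts.min(axis=0))
print('scaled d(a,b) =', np.linalg.norm(scaled[0] - scaled[1]))
print('scaled d(a,c) =', np.linalg.norm(scaled[0] - scaled[2]))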
# Data Preprocessing
print('Column Datatypes:\n', df.dtypes)
# convert all nominal variables to binary (dummy) variables
df_raw = df.copy(deep=True)
df_knn = df.copy(deep=True)
# create new binary columns
df_dummies = pd.get_dummies(df_knn[['Degree', 'Nationality']])
# add them to the dataframe
df_knn = df_knn.join(df_dummies)
# drop the original columns
df_knn = df_knn.drop('Degree', axis=1)
df_knn = df_knn.drop('Nationality', axis=1)
display('Data Example:', HTML(df_knn.head(10).to_html()))
# drop the extra binary columns, since N categories only need N-1 binary columns
print(df_knn.columns)
df_knn = df_knn.drop('Degree_ BS', axis=1)
df_knn = df_knn.drop('Nationality_ China', axis=1)
display('Data Example:', HTML(df_knn.head(10).to_html()))
# normalize all numerical features: min-max normalization to scale into [0, 1]
for col in df_knn.columns:
    if col != 'GradeLetter':
        # exclude GradeLetter, since it is the label in our data
        df_knn[col] = (df_knn[col] - df_knn[col].min()) / (df_knn[col].max() - df_knn[col].min())
display(HTML(df_knn.head(10).to_html()))
# Build KNN models and evaluate the models
# Note: for demo and teaching purposes, we present evaluations based on both hold-out and N-fold cross validation
# By hold-out evaluation
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns
# preprocess the label: encode the nominal grades as integers
from sklearn import preprocessing
y = df_knn['GradeLetter']  # the label as nominal values
le = preprocessing.LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)  # encode nominal labels to integers
print(y_encoded)
df_knn['GradeLetter'] = y_encoded
x = df_knn.drop('GradeLetter', axis=1)
y = df_knn['GradeLetter']
display(HTML(df_knn.head(10).to_html()))
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2)
# Visualize the training set
plt.figure(1)
plt.scatter(x_train['Grade'], x_train['Exam'], c=y_train, alpha=0.8)
plt.xlabel('Grade')
plt.ylabel('Exam')
plt.title('Visualization of Training Set')
plt.show()
plt.close()
# build and evaluate models
from sklearn import neighbors
from sklearn.metrics import accuracy_score
# API: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
# API for KNeighborsClassifier:
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
for k in range(1, 24, 2):
    clf = neighbors.KNeighborsClassifier(k, weights='uniform')
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print('K =', k, ', Accuracy: ', accuracy_score(y_test, y_pred),
          ', Precision: ', precision_score(y_test, y_pred, average='micro'),
          ', Recall: ', recall_score(y_test, y_pred, average='micro'))
# note: there is also the option average='macro', which calculates the metric for each label and then returns the average
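# hedged toy illustration of the difference (these labels are made up, not from our data):
# 'micro' pools all decisions across classes; 'macro' averages the per-class scores equally
y_true_demo = [0, 0, 0, 0, 1, 1]
y_pred_demo = [0, 0, 0, 0, 0, 1]
print('micro precision:', precision_score(y_true_demo, y_pred_demo, average='micro'))  # 5/6: pooled TP / predicted
print('macro precision:', precision_score(y_true_demo, y_pred_demo, average='macro'))  # (0.8 + 1.0) / 2 = 0.9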
# Visualize the best model on the test set
clf = neighbors.KNeighborsClassifier(1, weights='uniform')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
plt.figure(2)
plt.scatter(x_test['Grade'], x_test['Exam'], c=y_pred, alpha=0.8)
plt.xlabel('Grade')
plt.ylabel('Exam')
plt.title('Visualization of Testing Set')
plt.show()
plt.close()
# By N-fold cross validation
from sklearn.model_selection import cross_val_score
for k in range(1, 24, 2):
    clf = neighbors.KNeighborsClassifier(k, weights='uniform')
    acc = cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
    print('K =', k, ', Accuracy: ', acc)
# Naive Bayes Classifier
Requirements: 1) nominal features; 2) the assumption of conditional independence among features given the class. A toy sketch of the resulting computation follows.
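A minimal sketch of the naive Bayes rule P(y|x) ∝ P(y) · Π P(x_i | y): the independence assumption is what lets us multiply one likelihood per feature. All probabilities below are hypothetical, chosen only for illustration.

# made-up probabilities for a two-feature example (not estimated from data_students.csv)
p_pass = 0.6                                           # prior P(y = Pass)
p_degree_given_pass, p_degree_given_fail = 0.5, 0.25   # P(Degree = MS | y)
p_nat_given_pass, p_nat_given_fail = 0.4, 0.5          # P(Nationality = China | y)
score_pass = p_pass * p_degree_given_pass * p_nat_given_pass          # 0.12
score_fail = (1 - p_pass) * p_degree_given_fail * p_nat_given_fail    # 0.05
print('P(Pass | x) =', score_pass / (score_pass + score_fail))        # ~0.706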
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score
# Pre-processing
print('Column data types:\n', df_raw.dtypes)
df_nb = df_raw.copy(deep=True)
# convert numerical variables to categorical data, e.g., Age
df_nb['Gender'] = df_nb['Gender'].astype(str)
df_nb['Age'] = pd.cut(df_nb['Age'], 3)
df_nb['Hours on Readings'] = pd.cut(df_nb['Hours on Readings'], 3)
df_nb['Hours on Assignments'] = pd.cut(df_nb['Hours on Assignments'], 3)
df_nb['Hours on Games'] = pd.cut(df_nb['Hours on Games'], 3)
df_nb['Hours on Internet'] = pd.cut(df_nb['Hours on Internet'], 3)
df_nb['Exam'] = pd.cut(df_nb['Exam'], 3)
df_nb['Grade'] = pd.cut(df_nb['Grade'], 3)
display('Data Example', HTML(df_nb.head(5).to_html()))
# by hold-out evaluation
y = df_nb['GradeLetter']
le = preprocessing.LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)  # encode nominal labels to integers
# transform categorical data to numerical data, i.e., one-hot encoding
print(df_nb.dtypes)
df_nb = pd.get_dummies(df_nb.drop('GradeLetter', axis=1))
df_nb['GradeLetter'] = y_encoded
display(HTML(df_nb.head(5).to_html()))
print('starting model build and evaluations...')
# API for GaussianNB:
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB
# drop the label column from the features, so the model cannot peek at the answer
x_train, x_test, y_train, y_test = train_test_split(df_nb.drop('GradeLetter', axis=1), y_encoded, test_size=0.2)
clf = GaussianNB()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# in the following coding examples, we use accuracy only as the example metric
print("Accuracy by Hold-out Eval:", accuracy_score(y_test, y_pred))
# by N-fold evaluation
y = df_nb['GradeLetter']
x = df_nb.drop('GradeLetter', axis=1)
clf = GaussianNB()
acc = cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
print("Accuracy by N-fold Cross Validation:", acc)
# Decision Trees and Random Forest
Preprocessing: 1) encode the labels; 2) convert numerical variables to categorical data, then encode them.
Tip: this is the same preprocessing as the operations in Naive Bayes. A small sketch of the entropy criterion used below follows.
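Since the tree below uses criterion='entropy', here is a minimal sketch (toy class counts, not from the dataset) of the entropy measure that drives the splits: H = -Σ p·log2(p) over the class proportions at a node.

import numpy as np

def entropy(counts):
    # H = -sum(p * log2(p)) over the class proportions at a node
    p = np.array(counts, dtype=float)
    p = p / p.sum()
    p = p[p > 0]  # 0 * log(0) is treated as 0
    return -(p * np.log2(p)).sum()

print(entropy([5, 5]))   # 1.0: a perfectly mixed node
print(entropy([9, 1]))   # ~0.469: a nearly pure node
print(entropy([10, 0]))  # 0.0: a pure node, nothing left to split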
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
# API for DecisionTreeClassifier:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html?highlight=decisiontreeclassifier#sklearn.tree.DecisionTreeClassifier
# by hold-out evaluation
# drop the label column from the features to avoid leaking the answer into the model
x_train, x_test, y_train, y_test = train_test_split(df_nb.drop('GradeLetter', axis=1), y_encoded, test_size=0.2)
clf = DecisionTreeClassifier(criterion='entropy', max_depth=10)  # note: there are many more parameters in the API
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print('Tree Accuracy by hold-out evaluation: ', acc)
# by N-fold cross validation
acc = cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
print("Tree Accuracy by N-fold Cross Validation:", acc)
# Example of bagging decision trees, which approximates a random forest
# (a true random forest additionally subsamples the features at each split)
tree = DecisionTreeClassifier()
# Note: you can evaluate the single tree or the bagged ensemble
# API: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html?highlight=baggingclassifier
bag = BaggingClassifier(tree, n_estimators=100, max_samples=0.8, random_state=1)
acc = cross_val_score(bag, x, y, cv=5, scoring='accuracy').mean()
print("Bagged-Trees Accuracy by N-fold Cross Validation:", acc)
# SVM
Preprocessing: the same requirements as KNN, although normalization is not strictly required. The hedged toy example below illustrates the role of the C parameter before the main code.
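A sketch on synthetic data (make_blobs, not the course dataset): a large C approximates a hard margin, while a small C yields a soft margin that tolerates points inside the margin, which typically shows up as more support vectors.

from sklearn.svm import SVC
from sklearn.datasets import make_blobs

# two overlapping toy clusters
X_toy, y_toy = make_blobs(n_samples=100, centers=2, cluster_std=2.0, random_state=0)
for C in (1e10, 0.01):  # large C -> hard margin; small C -> soft margin
    toy_clf = SVC(kernel='linear', C=C).fit(X_toy, y_toy)
    # a softer margin tolerates more points inside it, so more support vectors
    print('C =', C, ', support vectors:', toy_clf.n_support_.sum())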
from sklearn.svm import SVC
# API for SVC:
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html?highlight=svc#sklearn.svm.SVC
# by hold-out evaluation
# drop the label column from the features to avoid label leakage
x_train, x_test, y_train, y_test = train_test_split(df_knn.drop('GradeLetter', axis=1), y_encoded, test_size=0.2)
clf = SVC(kernel='linear', C=1E10)  # large C -> hard margin; small C -> soft margin
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy by hold-out evaluation: ', acc)
x = df_knn.drop('GradeLetter', axis=1)
y = df_knn['GradeLetter']
# by N-fold cross validation
clf = SVC(kernel='poly', C=1E10)
acc = cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
print("Accuracy (poly kernel) by N-fold Cross Validation:", acc)
clf = SVC(kernel='rbf', C=1E10)
acc = cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
print("Accuracy (rbf kernel) by N-fold Cross Validation:", acc)
# Logistic Regression
Preprocessing: the same requirements as KNN, although normalization is not strictly required. A short probability-output sketch follows the code.
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# by hold-out evaluation
# drop the label column from the features to avoid label leakage
x_train, x_test, y_train, y_test = train_test_split(df_knn.drop('GradeLetter', axis=1), y_encoded, test_size=0.2)
clf = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000)  # raise max_iter so lbfgs converges
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy by hold-out evaluation: ', acc)
x = df_knn.drop('GradeLetter', axis=1)
y = df_knn['GradeLetter']
# by N-fold cross validation
clf = LogisticRegression(max_iter=1000)
acc = cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
print("Accuracy by N-fold Cross Validation:", acc)
# Neural Networks
Preprocessing: the same requirements as KNN, although normalization is not strictly required. See the sketch after the code for the meaning of hidden_layer_sizes.
from sklearn.neural_network import MLPClassifier
# by hold-out evaluation
# drop the label column from the features to avoid label leakage
x_train, x_test, y_train, y_test = train_test_split(df_knn.drop('GradeLetter', axis=1), y_encoded, test_size=0.2)
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(200,), random_state=1)
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy by hold-out evaluation: ', acc)
x = df_knn.drop('GradeLetter', axis=1)
y = df_knn['GradeLetter']
# by N-fold cross validation
clf = MLPClassifier(solver='lbfgs', alpha=1e-4, hidden_layer_sizes=(100, 8), random_state=1)
acc = cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
print("Accuracy by N-fold Cross Validation:", acc)
# Imbalance Solutions
Note that imbalance solutions may only be applied to the training set; resampling before the train-test split would leak duplicated (or synthetic) copies of test samples into training.
import numpy as np
import scipy as sp
import pandas as pd
from IPython.display import display, HTML
from collections import Counter

df = pd.read_csv('data_students.csv')
cols = df.columns
# print out and display the dataframe as an HTML table
display(HTML(df.head(10).to_html()))
# check the degree of imbalance in the labels
cf = df['GradeLetter'].value_counts()
crf = df['GradeLetter'].value_counts() / df.shape[0]
print("\nClass frequency:\n", cf, "\n\nClass relative frequency:\n", crf)
# get features and labels
x = df.drop('GradeLetter', axis=1)
y = df['GradeLetter']
# Install the library imblearn on Anaconda
# https://anaconda.org/conda-forge/imbalanced-learn
from imblearn.over_sampling import RandomOverSampler
# http://glemaitre.github.io/imbalanced-learn/generated/imblearn.over_sampling.RandomOverSampler.html
from imblearn.over_sampling import SMOTE
# https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html
from imblearn.under_sampling import RandomUnderSampler
# http://glemaitre.github.io/imbalanced-learn/generated/imblearn.under_sampling.RandomUnderSampler.html
ros = RandomOverSampler(random_state=10)
print('Original dataset shape {}'.format(Counter(y)))
# fit_resample both fits the sampler and returns the resampled data; a separate fit() call is unnecessary
x_resampled, y_resampled = ros.fit_resample(x, y)
print('After oversampling dataset shape {}'.format(Counter(y_resampled)))
print('Original dataset shape {}'.format(Counter(y)))
rus = RandomUnderSampler(random_state=30)
x_resampled, y_resampled = rus.fit_resample(x, y)
print('After undersampling dataset shape {}'.format(Counter(y_resampled)))
# get features and labels; SMOTE can only be applied to numerical features
x = df_knn.drop('GradeLetter', axis=1)
y = df_knn['GradeLetter']
sm = SMOTE(k_neighbors=2)
print('Original dataset shape {}'.format(Counter(y)))
x_resampled, y_resampled = sm.fit_resample(x, y)
print('After oversampling by SMOTE dataset shape {}'.format(Counter(y_resampled)))
# Note: imbalance solutions are only applied to the training set.
# For N-fold evaluation, you have to split the data into train-test folds first, then apply the solution inside each fold
from sklearn.model_selection import KFold
# N-fold data split
# API: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
# assume the last column is the label and the other columns are features
X = df_knn.loc[:, df_knn.columns != 'GradeLetter']
y = df_knn.loc[:, 'GradeLetter']
print(X.columns)
print(type(X))
print(type(y))
kf = KFold(n_splits=5, shuffle=True)
data_5folds = []
for train_index, test_index in kf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    # get the actual data by positional index
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # save the data of this fold
    fold = [x_train, x_test, y_train, y_test]
    # add each fold's data into the list of 5 folds
    data_5folds.append(fold)
for k in range(1, 24, 2):
    acc_5folds = []
    for x_train, x_test, y_train, y_test in data_5folds:
        # oversample the training fold only; the test fold stays untouched
        ros = RandomOverSampler(random_state=10)
        x_resampled, y_resampled = ros.fit_resample(x_train, y_train)
        clf = neighbors.KNeighborsClassifier(k, weights='uniform')
        clf.fit(x_resampled, y_resampled)
        y_pred = clf.predict(x_test)
        acc = accuracy_score(y_test, y_pred)
        acc_5folds.append(acc)
    print('k = ', k, 'Accuracy on 5-folds: ', np.mean(acc_5folds))
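As a hedged alternative to the manual fold loop above, imbalanced-learn provides a Pipeline whose samplers run during fit only, so plain cross_val_score resamples each training fold and leaves each test fold untouched. A minimal sketch, assuming the same X and y as above:

from imblearn.pipeline import make_pipeline
# the sampler step is applied only when the pipeline is fit, i.e., only on training folds
pipe = make_pipeline(RandomOverSampler(random_state=10),
                     neighbors.KNeighborsClassifier(5, weights='uniform'))
acc = cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()
print('K = 5, pipeline CV accuracy:', acc)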