Loading the dataset
We use the Iris dataset that ships with scikit-learn.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
X.shape
# (150, 4)
y.shape
# (150,)
Implementation
Inspecting y reveals that the labels are sorted in ascending order (as shown below), so we cannot simply take the first n samples as the training set and the rest as the test set: the resulting model would be trained and evaluated on different classes and would certainly be unreliable. Instead, we can use np.random.permutation to obtain a randomly shuffled array of indices, choose a training/test split ratio (here the test set gets 0.2), and use NumPy fancy indexing to slice out fully randomized training and test sets.
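For reference, printing y makes the ordering concrete. The Iris dataset has 50 samples per class, stored consecutively (output abridged here):
y
# array([0, 0, 0, ..., 0, 1, 1, 1, ..., 1, 2, 2, 2, ..., 2])
# (50 samples of class 0, then 50 of class 1, then 50 of class 2)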
# Get a randomly shuffled array of indices
shuffle_indexes = np.random.permutation(len(X))
# array([106, 61, 127, 139, 128, 0, 108, 142, 92, 20, 113, 17, 11,
# 88, 140, 116, 18, 89, 104, 57, 38, 135, 131, 99, 133, 66,
# 40, 121, 86, 93, 134, 145, 39, 52, 98, 50, 90, 24, 51,
# 2, 120, 55, 75, 107, 67, 59, 36, 80, 119, 82, 143, 69,
# 137, 81, 14, 19, 64, 65, 28, 60, 77, 7, 105, 47, 83,
# 138, 109, 3, 71, 63, 31, 44, 102, 43, 146, 122, 110, 21,
# 79, 29, 12, 53, 136, 148, 58, 149, 42, 114, 41, 97, 4,
# 118, 85, 26, 96, 94, 48, 45, 8, 103, 16, 101, 130, 54,
# 91, 27, 9, 132, 129, 23, 95, 78, 124, 74, 46, 25, 62,
# 30, 6, 117, 115, 33, 111, 15, 76, 112, 72, 68, 5, 56,
# 49, 144, 125, 37, 32, 13, 70, 84, 10, 1, 123, 100, 22,
# 34, 147, 35, 73, 141, 126, 87])
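Note that this output changes on every run. If you want a reproducible shuffle while experimenting, you can seed NumPy's global generator first (a minimal sketch; the seed value 666 is arbitrary):
np.random.seed(666)  # fix the global RNG state; any integer works
shuffle_indexes = np.random.permutation(len(X))  # now identical across runs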
# Set the test-set ratio
test_ratio = 0.2
# len(X) * test_ratio may be a float, so truncate it to an int
test_size = int(len(X)*test_ratio)
# Indices for the test and training sets
test_indexes = shuffle_indexes[:test_size]
train_indexes = shuffle_indexes[test_size:]
# Fancy indexing gives us the training and test data
X_train = X[train_indexes]
y_train = y[train_indexes]
X_test = X[test_indexes]
y_test = y[test_indexes]
print(X_train.shape)
# (120, 4)
print(y_train.shape)
# (120,)
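As a quick sanity check (my own addition, not part of the original walkthrough), we can confirm that the two index sets are disjoint and together cover every sample exactly once:
# The test and train indices must not overlap...
assert len(np.intersect1d(train_indexes, test_indexes)) == 0
# ...and together they must account for all 150 samples
assert len(train_indexes) + len(test_indexes) == len(X)
print(X_test.shape, y_test.shape)
# (30, 4) (30,)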
A custom train_test_split
import numpy as np

def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Split X and y into X_train, X_test, y_train, y_test according to test_ratio."""
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"

    if seed is not None:  # "if seed:" would silently ignore seed=0
        np.random.seed(seed)

    shuffled_indexes = np.random.permutation(len(X))
    test_size = int(len(X) * test_ratio)
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    X_train = X[train_indexes]
    y_train = y[train_indexes]
    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, X_test, y_train, y_test
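Because the function reseeds NumPy's global generator, passing the same seed twice should yield identical splits. A quick check (assuming the function defined above):
X_tr1, X_te1, y_tr1, y_te1 = train_test_split(X, y, test_ratio=0.2, seed=666)
X_tr2, X_te2, y_tr2, y_te2 = train_test_split(X, y, test_ratio=0.2, seed=666)
assert np.array_equal(X_te1, X_te2) and np.array_equal(y_te1, y_te2)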
# Use our own implementation (packaged in model_selection.py)
from model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)
# This imports our own kNN implementation, not scikit-learn's
from kNN import KNNClassifier
my_knn_clf = KNNClassifier(k=3)
my_knn_clf.fit(X_train,y_train)
y_predict = my_knn_clf.predict(X_test)
# array([1, 2, 2, 0, 1, 2, 0, 2, 0, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 0, 2, 1,
# 2, 1, 0, 0, 1, 2, 2, 0])
y_test
# array([1, 2, 2, 0, 1, 2, 0, 2, 0, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 0, 2, 1,
# 2, 1, 0, 0, 1, 2, 2, 0])
# With y_predict and y_test in hand, compare them to measure accuracy
sum(y_predict == y_test)
# 30
# Accuracy
sum(y_predict == y_test)/len(y_test)
# 1.0
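Equivalently, since the boolean comparison yields an array of 0s and 1s, np.mean gives the accuracy directly. Wrapping this in a small helper (a hypothetical accuracy_score, named after scikit-learn's metric of the same name) keeps the evaluation reusable:
import numpy as np

def accuracy_score(y_true, y_predict):
    """Fraction of predictions that match the true labels."""
    assert y_true.shape[0] == y_predict.shape[0], \
        "the size of y_true must be equal to the size of y_predict"
    return np.mean(y_true == y_predict)

accuracy_score(y_test, y_predict)
# 1.0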
Using scikit-learn's train_test_split
scikit-learn ships with its own data-splitting helper, which we can call directly:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
print(X_train.shape)
print(y_train.shape)
# (120, 4)
# (120,)
print(X_test.shape)
print(y_test.shape)
# (30, 4)
# (30,)
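scikit-learn's version also supports stratified splitting: passing stratify=y preserves the class proportions of y in both subsets, which matters for imbalanced datasets. With Iris's three equal classes, each class contributes exactly 10 samples to the 30-sample test set:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666, stratify=y)
print(np.bincount(y_test))
# [10 10 10]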