Machine Learning Exercise: MNIST Data Augmentation
Exercise: Write a function that can shift an MNIST image one pixel in any direction (up, down, left, or right). Then, for each image in the training set, create four shifted copies (one per direction) and add them to the training set. Finally, train a model on this expanded training set and measure its accuracy on the test set. You should find that the model now performs even better! This technique of artificially growing the training set is called data augmentation or training-set expansion.
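Before the reference solution, here is a minimal sketch of the shift operation itself, assuming SciPy is available. It uses scipy.ndimage.shift, which fills the vacated border with zeros (a true shift rather than a wrap-around); the helper name shift_image is ours, not part of the exercise.

import numpy as np
from scipy.ndimage import shift

def shift_image(image_1d, dx, dy):
    # Reshape the flattened 784-vector to 28x28, shift it by
    # (dy rows, dx columns), pad the exposed border with zeros
    # (black pixels), then flatten back to a 784-vector.
    image = image_1d.reshape(28, 28)
    shifted = shift(image, [dy, dx], cval=0)
    return shifted.reshape(-1)

For example, shift_image(X_train[0], 0, 1) moves the first digit one pixel down.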
Reference code:
from sklearn.datasets import fetch_openml  # fetch_mldata was removed from scikit-learn; fetch_openml replaces it
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score
import numpy as np

# Load MNIST (70,000 images of 28x28 = 784 pixels); as_frame=False returns NumPy arrays
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist["data"], mnist["target"]

# The first 60,000 samples are the training set, the last 10,000 the test set
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

# Shuffle the training set
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Reshape each flattened image back to 28x28 for shifting
train_array = [X_train[i].reshape(28, 28) for i in range(X_train.shape[0])]
print('train_size =', len(train_array))

def data_en(data_s, direc='u'):
    """Shift each 28x28 image one pixel in the given direction and return flattened copies.

    The row (or column) pushed off one edge is re-appended on the opposite edge.
    Because MNIST digits sit inside a blank border, this wrap-around is effectively
    the same as shifting with zero padding."""
    size = len(data_s)
    en_ret = np.zeros((size, 784))
    for i in range(size):
        if direc == 'u':    # rows move up: first row wraps to the bottom
            trans_data = np.append(data_s[i][1:, :], data_s[i][0:1, :], axis=0)
        elif direc == 'd':  # rows move down: last row wraps to the top
            trans_data = np.append(data_s[i][-1:, :], data_s[i][:-1, :], axis=0)
        elif direc == 'l':  # columns move left: first column wraps to the right
            trans_data = np.append(data_s[i][:, 1:], data_s[i][:, 0:1], axis=1)
        elif direc == 'r':  # columns move right: last column wraps to the left
            trans_data = np.append(data_s[i][:, -1:], data_s[i][:, :-1], axis=1)
        else:
            raise ValueError("direc must be one of 'u', 'd', 'l', 'r'")
        en_ret[i] = trans_data.reshape(1, -1)
    return en_ret

# Build the four shifted copies and stack them with the originals: 5 x 60,000 = 300,000 samples
X_trainu = data_en(train_array, 'u')
X_traind = data_en(train_array, 'd')
X_trainl = data_en(train_array, 'l')
X_trainr = data_en(train_array, 'r')
X_trainA = np.concatenate((X_train, X_trainu, X_traind, X_trainl, X_trainr), axis=0)
y_trainA = np.concatenate((y_train, y_train, y_train, y_train, y_train), axis=0)
print(X_trainA.shape)
print(y_trainA.shape)

# Baseline: KNN trained on the original 60,000 images
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)
y_pred = knn_clf.predict(X_test)
ps = precision_score(y_test, y_pred, average=None)
print('ps =', ps, np.average(ps))

# Augmented: the same model trained on the 300,000-image expanded set
knn_clfA = KNeighborsClassifier()
knn_clfA.fit(X_trainA, y_trainA)
y_pred = knn_clfA.predict(X_test)
psA = precision_score(y_test, y_pred, average=None)
print('psA =', psA, np.average(psA))
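Note that the exercise asks for test-set accuracy, while the reference code reports per-class precision. A one-line addition, sketched below with sklearn.metrics.accuracy_score, would report accuracy directly after each predict call:

from sklearn.metrics import accuracy_score

# Fraction of test digits classified correctly (the metric the exercise asks for)
print('accuracy =', accuracy_score(y_test, y_pred))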
Output:
train_size = 60000
(300000, 784)
(300000,)
ps = [0.96340257 0.95450716 0.98216056 0.96442688 0.9762151 0.96528555 0.98130841 0.96108949 0.98809524 0.95626243] 0.96927533865705706
psA = [0.97887324 0.95850974 0.98624754 0.97628458 0.97841727 0.9698324 0.98130841 0.96605238 0.99353448 0.96915423] 0.97582142737164035
As the exercise predicts, augmentation improves the model: the average per-class precision rises from 0.9692 to 0.9758.