Section 1: Vanishing Gradient Problem
Neural network class
A class that builds a multi-layer neural network.
import numpy as np
from common import layers
from collections import OrderedDict
from common import functions
from data.mnist import load_mnist
import matplotlib.pyplot as plt


class MultiLayerNet:
    '''
    input_size: number of nodes in the input layer
    hidden_size_list: list with the number of nodes of each hidden layer
    output_size: number of nodes in the output layer
    activation: activation function
    weight_init_std: weight initialization method
    '''
    def __init__(self, input_size, hidden_size_list, output_size, activation='relu', weight_init_std='relu'):
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size_list = hidden_size_list
        self.hidden_layer_num = len(hidden_size_list)
        self.params = {}

        # Initialize the weights
        self.__init_weight(weight_init_std)

        # Build the layers; only sigmoid and relu are supported
        activation_layer = {'sigmoid': layers.Sigmoid, 'relu': layers.Relu}
        self.layers = OrderedDict()  # stored in insertion order
        for idx in range(1, self.hidden_layer_num + 1):
            self.layers['Affine' + str(idx)] = layers.Affine(self.params['W' + str(idx)], self.params['b' + str(idx)])
            self.layers['Activation_function' + str(idx)] = activation_layer[activation]()

        idx = self.hidden_layer_num + 1
        self.layers['Affine' + str(idx)] = layers.Affine(self.params['W' + str(idx)], self.params['b' + str(idx)])

        self.last_layer = layers.SoftmaxWithLoss()

    def __init_weight(self, weight_init_std):
        all_size_list = [self.input_size] + self.hidden_size_list + [self.output_size]
        for idx in range(1, len(all_size_list)):
            scale = weight_init_std
            if str(weight_init_std).lower() in ('relu', 'he'):
                # He initialization: sqrt(2 / fan_in)
                scale = np.sqrt(2.0 / all_size_list[idx - 1])
            elif str(weight_init_std).lower() in ('sigmoid', 'xavier'):
                # Xavier initialization: sqrt(1 / fan_in)
                scale = np.sqrt(1.0 / all_size_list[idx - 1])

            self.params['W' + str(idx)] = scale * np.random.randn(all_size_list[idx - 1], all_size_list[idx])
            self.params['b' + str(idx)] = np.zeros(all_size_list[idx])

    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    def loss(self, x, d):
        y = self.predict(x)

        # No weight-decay term is accumulated in this simple version, so weight_decay stays 0
        weight_decay = 0
        for idx in range(1, self.hidden_layer_num + 2):
            W = self.params['W' + str(idx)]

        return self.last_layer.forward(y, d) + weight_decay

    def accuracy(self, x, d):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if d.ndim != 1:
            d = np.argmax(d, axis=1)

        accuracy = np.sum(y == d) / float(x.shape[0])
        return accuracy

    def gradient(self, x, d):
        # forward
        self.loss(x, d)

        # backward
        dout = 1
        dout = self.last_layer.backward(dout)

        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # Collect the gradients stored in each Affine layer
        grad = {}
        for idx in range(1, self.hidden_layer_num + 2):
            grad['W' + str(idx)] = self.layers['Affine' + str(idx)].dW
            grad['b' + str(idx)] = self.layers['Affine' + str(idx)].db

        return grad
Since this cell only defines the class, there is no output.
Vanishing gradient problem
Using the class above, we reproduce the vanishing gradient problem.
Conditions
- Input layer: 784 nodes; hidden layers: 40 and 20 nodes; output layer: 10 nodes
- Activation function: sigmoid
- Weight initialization: Gaussian distribution (standard deviation 0.01)
- A configuration in which training is expected to fail
# データの読み込み (x_train, d_train), (x_test, d_test) = load_mnist(normalize=True, one_hot_label=True) print("データ読み込み完了") network = MultiLayerNet(input_size=784, hidden_size_list=[40, 20], output_size=10, activation='sigmoid', weight_init_std=0.01) iters_num = 2000 train_size = x_train.shape[0] batch_size = 100 learning_rate = 0.1 train_loss_list = [] accuracies_train = [] accuracies_test = [] plot_interval=10 for i in range(iters_num): batch_mask = np.random.choice(train_size, batch_size) x_batch = x_train[batch_mask] d_batch = d_train[batch_mask] # 勾配 grad = network.gradient(x_batch, d_batch) for key in ('W1', 'W2', 'W3', 'b1', 'b2', 'b3'): network.params[key] -= learning_rate * grad[key] loss = network.loss(x_batch, d_batch) train_loss_list.append(loss) if (i + 1) % plot_interval == 0: accr_test = network.accuracy(x_test, d_test) accuracies_test.append(accr_test) accr_train = network.accuracy(x_batch, d_batch) accuracies_train.append(accr_train) print('Generation: ' + str(i+1) + '. 正答率(トレーニング) = ' + str(accr_train)) print(' : ' + str(i+1) + '. 正答率(テスト) = ' + str(accr_test)) lists = range(0, iters_num, plot_interval) plt.plot(lists, accuracies_train, label="training set") plt.plot(lists, accuracies_test, label="test set") plt.legend(loc="lower right") plt.title("accuracy") plt.xlabel("count") plt.ylabel("accuracy") plt.ylim(0, 1.0) # グラフの表示 plt.show() |
Output
Generations 10 to 1900 are omitted.
Graph
Even as training progresses, no learning takes place; the accuracy stays flat.
ReLU – Gauss
Activation function: ReLU; weights initialized from a Gaussian (normal) distribution.
# データの読み込み (x_train, d_train), (x_test, d_test) = load_mnist(normalize=True, one_hot_label=True) print("データ読み込み完了") network = MultiLayerNet(input_size=784, hidden_size_list=[40, 20], output_size=10, activation='relu', weight_init_std=0.01) iters_num = 2000 train_size = x_train.shape[0] batch_size = 100 learning_rate = 0.1 train_loss_list = [] accuracies_train = [] accuracies_test = [] plot_interval=10 for i in range(iters_num): batch_mask = np.random.choice(train_size, batch_size) x_batch = x_train[batch_mask] d_batch = d_train[batch_mask] # 勾配 grad = network.gradient(x_batch, d_batch) for key in ('W1', 'W2', 'W3', 'b1', 'b2', 'b3'): network.params[key] -= learning_rate * grad[key] loss = network.loss(x_batch, d_batch) train_loss_list.append(loss) if (i + 1) % plot_interval == 0: accr_test = network.accuracy(x_test, d_test) accuracies_test.append(accr_test) accr_train = network.accuracy(x_batch, d_batch) accuracies_train.append(accr_train) print('Generation: ' + str(i+1) + '. 正答率(トレーニング) = ' + str(accr_train)) print(' : ' + str(i+1) + '. 正答率(テスト) = ' + str(accr_test)) lists = range(0, iters_num, plot_interval) plt.plot(lists, accuracies_train, label="training set") plt.plot(lists, accuracies_test, label="test set") plt.legend(loc="lower right") plt.title("accuracy") plt.xlabel("count") plt.ylabel("accuracy") plt.ylim(0, 1.0) # グラフの表示 plt.show() |
Output
Up to around iteration 500 no learning occurs, but after that the accuracy clearly rises.
The ReLU activation avoids the vanishing gradient problem.
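The reason can be seen directly in the size of the activation derivatives that get multiplied together during backpropagation. The short check below is a standalone sketch (independent of the classes above): the sigmoid derivative is at most 0.25, so stacking sigmoid layers shrinks the gradient multiplicatively, while the ReLU derivative is 1 for all active units.

import numpy as np

x = np.linspace(-5, 5, 101)

# Derivative of the sigmoid: s(x) * (1 - s(x)); its maximum is 0.25 at x = 0
s = 1.0 / (1.0 + np.exp(-x))
d_sigmoid = s * (1.0 - s)

# Derivative of ReLU: 1 for x > 0, 0 otherwise
d_relu = (x > 0).astype(float)

print('max sigmoid derivative:', d_sigmoid.max())   # 0.25
print('after 3 sigmoid layers:', 0.25 ** 3)         # upper bound on the gradient factor
print('max ReLU derivative   :', d_relu.max())      # 1.0, so active paths do not shrink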
sigmoid – Xavier
Activation function: sigmoid; weights initialized with Xavier initialization.
# データの読み込み (x_train, d_train), (x_test, d_test) = load_mnist(normalize=True, one_hot_label=True) print("データ読み込み完了") network = MultiLayerNet(input_size=784, hidden_size_list=[40, 20], output_size=10, activation='sigmoid', weight_init_std='Xavier') iters_num = 2000 train_size = x_train.shape[0] batch_size = 100 learning_rate = 0.1 train_loss_list = [] accuracies_train = [] accuracies_test = [] plot_interval=10 for i in range(iters_num): batch_mask = np.random.choice(train_size, batch_size) x_batch = x_train[batch_mask] d_batch = d_train[batch_mask] # 勾配 grad = network.gradient(x_batch, d_batch) for key in ('W1', 'W2', 'W3', 'b1', 'b2', 'b3'): network.params[key] -= learning_rate * grad[key] loss = network.loss(x_batch, d_batch) train_loss_list.append(loss) if (i + 1) % plot_interval == 0: accr_test = network.accuracy(x_test, d_test) accuracies_test.append(accr_test) accr_train = network.accuracy(x_batch, d_batch) accuracies_train.append(accr_train) print('Generation: ' + str(i+1) + '. 正答率(トレーニング) = ' + str(accr_train)) print(' : ' + str(i+1) + '. 正答率(テスト) = ' + str(accr_test)) lists = range(0, iters_num, plot_interval) plt.plot(lists, accuracies_train, label="training set") plt.plot(lists, accuracies_test, label="test set") plt.legend(loc="lower right") plt.title("accuracy") plt.xlabel("count") plt.ylabel("accuracy") plt.ylim(0, 1.0) # グラフの表示 plt.show() |
Output
Accuracy starts improving from an early stage of training.
Even with the sigmoid activation, Xavier initialization avoids the vanishing gradient problem.
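The effect of the initialization scale can also be checked directly by pushing random data through a few sigmoid layers and looking at the spread of the activations. This is a minimal sketch under assumed settings (5 layers of width 40, chosen only for illustration), not part of the experiment above.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

np.random.seed(0)
node_num = 40
x = np.random.randn(100, node_num)          # a dummy batch of 100 samples

for init in ('std 0.01', 'Xavier'):
    a = x
    for layer in range(5):
        if init == 'std 0.01':
            W = 0.01 * np.random.randn(node_num, node_num)
        else:
            # Xavier initialization: scale = sqrt(1 / fan_in)
            W = np.random.randn(node_num, node_num) / np.sqrt(node_num)
        a = sigmoid(np.dot(a, W))
    # With std 0.01 the activations collapse to ~0.5 with almost no spread,
    # so the backpropagated gradients vanish; Xavier keeps a useful spread.
    print(init, '-> std of final activations:', a.std())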
ReLU – He
Activation function: ReLU; weights initialized with He initialization.
# データの読み込み (x_train, d_train), (x_test, d_test) = load_mnist(normalize=True, one_hot_label=True) print("データ読み込み完了") network = MultiLayerNet(input_size=784, hidden_size_list=[40, 20], output_size=10, activation='relu', weight_init_std='He') iters_num = 2000 train_size = x_train.shape[0] batch_size = 100 learning_rate = 0.1 train_loss_list = [] accuracies_train = [] accuracies_test = [] plot_interval=10 for i in range(iters_num): batch_mask = np.random.choice(train_size, batch_size) x_batch = x_train[batch_mask] d_batch = d_train[batch_mask] # 勾配 grad = network.gradient(x_batch, d_batch) for key in ('W1', 'W2', 'W3', 'b1', 'b2', 'b3'): network.params[key] -= learning_rate * grad[key] loss = network.loss(x_batch, d_batch) train_loss_list.append(loss) if (i + 1) % plot_interval == 0: accr_test = network.accuracy(x_test, d_test) accuracies_test.append(accr_test) accr_train = network.accuracy(x_batch, d_batch) accuracies_train.append(accr_train) print('Generation: ' + str(i+1) + '. 正答率(トレーニング) = ' + str(accr_train)) print(' : ' + str(i+1) + '. 正答率(テスト) = ' + str(accr_test)) lists = range(0, iters_num, plot_interval) plt.plot(lists, accuracies_train, label="training set") plt.plot(lists, accuracies_test, label="test set") plt.legend(loc="lower right") plt.title("accuracy") plt.xlabel("count") plt.ylabel("accuracy") plt.ylim(0, 1.0) # グラフの表示 plt.show() |
Output
Batch normalization
Batch normalization class
import numpy as np
from collections import OrderedDict
from common import layers
from data.mnist import load_mnist
import matplotlib.pyplot as plt
from multi_layer_net import MultiLayerNet
from common import optimizer


# Batch normalization layer
class BatchNormalization:
    '''
    gamma: scale coefficient
    beta: offset
    momentum: inertia for the running statistics
    running_mean: mean used at test time
    running_var: variance used at test time
    '''
    def __init__(self, gamma, beta, momentum=0.9, running_mean=None, running_var=None):
        self.gamma = gamma
        self.beta = beta
        self.momentum = momentum
        self.input_shape = None
        self.running_mean = running_mean
        self.running_var = running_var

        # Intermediate data used by backward
        self.batch_size = None
        self.xc = None
        self.std = None
        self.dgamma = None
        self.dbeta = None

    def forward(self, x, train_flg=True):
        if self.running_mean is None:
            N, D = x.shape
            self.running_mean = np.zeros(D)
            self.running_var = np.zeros(D)

        if train_flg:
            mu = x.mean(axis=0)                 # mean
            xc = x - mu                         # center x
            var = np.mean(xc**2, axis=0)        # variance
            std = np.sqrt(var + 10e-7)          # scaling
            xn = xc / std

            self.batch_size = x.shape[0]
            self.xc = xc
            self.xn = xn
            self.std = std
            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * mu   # running average of the mean
            self.running_var = self.momentum * self.running_var + (1 - self.momentum) * var    # running average of the variance
        else:
            xc = x - self.running_mean
            xn = xc / np.sqrt(self.running_var + 10e-7)

        out = self.gamma * xn + self.beta
        return out

    def backward(self, dout):
        dbeta = dout.sum(axis=0)
        dgamma = np.sum(self.xn * dout, axis=0)
        dxn = self.gamma * dout
        dxc = dxn / self.std
        dstd = -np.sum((dxn * self.xc) / (self.std * self.std), axis=0)
        dvar = 0.5 * dstd / self.std
        dxc += (2.0 / self.batch_size) * self.xc * dvar
        dmu = np.sum(dxc, axis=0)
        dx = dxc - dmu / self.batch_size

        self.dgamma = dgamma
        self.dbeta = dbeta

        return dx
Training with batch normalization
Activation function: ReLU; weights initialized with He initialization.
# データの読み込み (x_train, d_train), (x_test, d_test) = load_mnist(normalize=True, one_hot_label=True) print("データ読み込み完了") network = MultiLayerNet(input_size=784, hidden_size_list=[40, 20], output_size=10, activation='relu', weight_init_std='He') iters_num = 2000 train_size = x_train.shape[0] batch_size = 100 learning_rate = 0.1 train_loss_list = [] accuracies_train = [] accuracies_test = [] plot_interval=10 for i in range(iters_num): batch_mask = np.random.choice(train_size, batch_size) x_batch = x_train[batch_mask] d_batch = d_train[batch_mask] # 勾配 grad = network.gradient(x_batch, d_batch) for key in ('W1', 'W2', 'W3', 'b1', 'b2', 'b3'): network.params[key] -= learning_rate * grad[key] loss = network.loss(x_batch, d_batch) train_loss_list.append(loss) if (i + 1) % plot_interval == 0: accr_test = network.accuracy(x_test, d_test) accuracies_test.append(accr_test) accr_train = network.accuracy(x_batch, d_batch) accuracies_train.append(accr_train) print('Generation: ' + str(i+1) + '. 正答率(トレーニング) = ' + str(accr_train)) print(' : ' + str(i+1) + '. 正答率(テスト) = ' + str(accr_test)) lists = range(0, iters_num, plot_interval) plt.plot(lists, accuracies_train, label="training set") plt.plot(lists, accuracies_test, label="test set") plt.legend(loc="lower right") plt.title("accuracy") plt.xlabel("count") plt.ylabel("accuracy") plt.ylim(0, 1.0) # グラフの表示 plt.show() |
Output
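As a quick sanity check of the BatchNormalization class defined above, the sketch below feeds it one random mini-batch and confirms that during training every feature of the output has roughly zero mean and unit variance; gamma = 1 and beta = 0 are chosen here so that the output equals the normalized input.

import numpy as np

np.random.seed(0)
x = 10 * np.random.randn(100, 20) + 5      # mini-batch: 100 samples, 20 features

bn = BatchNormalization(gamma=np.ones(20), beta=np.zeros(20))
out = bn.forward(x, train_flg=True)

print('input  mean / std:', x.mean(axis=0)[:3], x.std(axis=0)[:3])
print('output mean / std:', out.mean(axis=0)[:3], out.std(axis=0)[:3])   # ~0 and ~1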
Section 2: Learning-Rate Optimization Methods
SGD
import sys, os sys.path.append(os.pardir) # 親ディレクトリのファイルをインポートするための設定 import numpy as np from collections import OrderedDict from common import layers from data.mnist import load_mnist import matplotlib.pyplot as plt from multi_layer_net import MultiLayerNet # データの読み込み (x_train, d_train), (x_test, d_test) = load_mnist(normalize=True, one_hot_label=True) print("データ読み込み完了") # batch_normalizationの設定 ================================ # use_batchnorm = True use_batchnorm = False # ==================================================== network = MultiLayerNet(input_size=784, hidden_size_list=[40, 20], output_size=10, activation='sigmoid', weight_init_std=0.01, use_batchnorm=use_batchnorm) iters_num = 1000 train_size = x_train.shape[0] batch_size = 100 learning_rate = 0.01 train_loss_list = [] accuracies_train = [] accuracies_test = [] plot_interval=10 for i in range(iters_num): batch_mask = np.random.choice(train_size, batch_size) x_batch = x_train[batch_mask] d_batch = d_train[batch_mask] # 勾配 grad = network.gradient(x_batch, d_batch) for key in ('W1', 'W2', 'W3', 'b1', 'b2', 'b3'): network.params[key] -= learning_rate * grad[key] loss = network.loss(x_batch, d_batch) train_loss_list.append(loss) if (i + 1) % plot_interval == 0: accr_test = network.accuracy(x_test, d_test) accuracies_test.append(accr_test) accr_train = network.accuracy(x_batch, d_batch) accuracies_train.append(accr_train) print('Generation: ' + str(i+1) + '. 正答率(トレーニング) = ' + str(accr_train)) print(' : ' + str(i+1) + '. 正答率(テスト) = ' + str(accr_test)) lists = range(0, iters_num, plot_interval) plt.plot(lists, accuracies_train, label="training set") plt.plot(lists, accuracies_test, label="test set") plt.legend(loc="lower right") plt.title("accuracy") plt.xlabel("count") plt.ylabel("accuracy") plt.ylim(0, 1.0) # グラフの表示 plt.show() |
We confirmed that plain stochastic gradient descent does not learn well under these conditions.
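For reference, the in-place update used in the loop above is exactly the plain SGD rule. Below is a minimal sketch of the same rule packaged as an optimizer class (roughly the interface that common.optimizer.SGD, used later in Section 3, is assumed to provide):

class SGD:
    def __init__(self, learning_rate=0.01):
        self.learning_rate = learning_rate

    def update(self, params, grad):
        # w <- w - eta * dL/dw for every parameter
        for key in params.keys():
            params[key] -= self.learning_rate * grad[key]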
Momentum
# データの読み込み (x_train, d_train), (x_test, d_test) = load_mnist(normalize=True, one_hot_label=True) print("データ読み込み完了") # batch_normalizationの設定 ================================ # use_batchnorm = True use_batchnorm = False # ==================================================== network = MultiLayerNet(input_size=784, hidden_size_list=[40, 20], output_size=10, activation='sigmoid', weight_init_std=0.01, use_batchnorm=use_batchnorm) iters_num = 1000 train_size = x_train.shape[0] batch_size = 100 learning_rate = 0.01 # 慣性 momentum = 0.9 train_loss_list = [] accuracies_train = [] accuracies_test = [] plot_interval=10 for i in range(iters_num): batch_mask = np.random.choice(train_size, batch_size) x_batch = x_train[batch_mask] d_batch = d_train[batch_mask] # 勾配 grad = network.gradient(x_batch, d_batch) if i == 0: v = {} for key in ('W1', 'W2', 'W3', 'b1', 'b2', 'b3'): if i == 0: v[key] = np.zeros_like(network.params[key]) v[key] = momentum * v[key] - learning_rate * grad[key] network.params[key] += v[key] loss = network.loss(x_batch, d_batch) train_loss_list.append(loss) if (i + 1) % plot_interval == 0: accr_test = network.accuracy(x_test, d_test) accuracies_test.append(accr_test) accr_train = network.accuracy(x_batch, d_batch) accuracies_train.append(accr_train) print('Generation: ' + str(i+1) + '. 正答率(トレーニング) = ' + str(accr_train)) print(' : ' + str(i+1) + '. 正答率(テスト) = ' + str(accr_test)) lists = range(0, iters_num, plot_interval) plt.plot(lists, accuracies_train, label="training set") plt.plot(lists, accuracies_test, label="test set") plt.legend(loc="lower right") plt.title("accuracy") plt.xlabel("count") plt.ylabel("accuracy") plt.ylim(0, 1.0) # グラフの表示 plt.show() |
Output
Momentum also fails to raise the accuracy in this setting.
AdaGrad
# データの読み込み (x_train, d_train), (x_test, d_test) = load_mnist(normalize=True, one_hot_label=True) print("データ読み込み完了") # batch_normalizationの設定 ======================= # use_batchnorm = True use_batchnorm = False # ==================================================== network = MultiLayerNet(input_size=784, hidden_size_list=[40, 20], output_size=10, activation='sigmoid', weight_init_std=0.01, use_batchnorm=use_batchnorm) iters_num = 1000 train_size = x_train.shape[0] batch_size = 100 learning_rate = 0.1 train_loss_list = [] accuracies_train = [] accuracies_test = [] plot_interval=10 for i in range(iters_num): batch_mask = np.random.choice(train_size, batch_size) x_batch = x_train[batch_mask] d_batch = d_train[batch_mask] # 勾配 grad = network.gradient(x_batch, d_batch) if i == 0: h = {} for key in ('W1', 'W2', 'W3', 'b1', 'b2', 'b3'): if i == 0: h[key] = np.full_like(network.params[key], 1e-4) else: h[key] += np.square(grad[key]) network.params[key] -= learning_rate * grad[key] / (np.sqrt(h[key])) loss = network.loss(x_batch, d_batch) train_loss_list.append(loss) if (i + 1) % plot_interval == 0: accr_test = network.accuracy(x_test, d_test) accuracies_test.append(accr_test) accr_train = network.accuracy(x_batch, d_batch) accuracies_train.append(accr_train) print('Generation: ' + str(i+1) + '. 正答率(トレーニング) = ' + str(accr_train)) print(' : ' + str(i+1) + '. 正答率(テスト) = ' + str(accr_test)) lists = range(0, iters_num, plot_interval) plt.plot(lists, accuracies_train, label="training set") plt.plot(lists, accuracies_test, label="test set") plt.legend(loc="lower right") plt.title("accuracy") plt.xlabel("count") plt.ylabel("accuracy") plt.ylim(0, 1.0) # グラフの表示 plt.show() |
The start is slow, but by iteration 1000 training has progressed and good accuracy is also obtained on the validation data.
RMSProp
# データの読み込み (x_train, d_train), (x_test, d_test) = load_mnist(normalize=True, one_hot_label=True) print("データ読み込み完了") # batch_normalizationの設定 ================================ # use_batchnorm = True use_batchnorm = False # ==================================================== network = MultiLayerNet(input_size=784, hidden_size_list=[40, 20], output_size=10, activation='sigmoid', weight_init_std=0.01, use_batchnorm=use_batchnorm) iters_num = 1000 train_size = x_train.shape[0] batch_size = 100 learning_rate = 0.01 decay_rate = 0.99 train_loss_list = [] accuracies_train = [] accuracies_test = [] plot_interval=10 for i in range(iters_num): batch_mask = np.random.choice(train_size, batch_size) x_batch = x_train[batch_mask] d_batch = d_train[batch_mask] # 勾配 grad = network.gradient(x_batch, d_batch) if i == 0: h = {} for key in ('W1', 'W2', 'W3', 'b1', 'b2', 'b3'): if i == 0: h[key] = np.zeros_like(network.params[key]) h[key] *= decay_rate h[key] += (1 - decay_rate) * np.square(grad[key]) network.params[key] -= learning_rate * grad[key] / (np.sqrt(h[key]) + 1e-7) loss = network.loss(x_batch, d_batch) train_loss_list.append(loss) if (i + 1) % plot_interval == 0: accr_test = network.accuracy(x_test, d_test) accuracies_test.append(accr_test) accr_train = network.accuracy(x_batch, d_batch) accuracies_train.append(accr_train) print('Generation: ' + str(i+1) + '. 正答率(トレーニング) = ' + str(accr_train)) print(' : ' + str(i+1) + '. 正答率(テスト) = ' + str(accr_test)) lists = range(0, iters_num, plot_interval) plt.plot(lists, accuracies_train, label="training set") plt.plot(lists, accuracies_test, label="test set") plt.legend(loc="lower right") plt.title("accuracy") plt.xlabel("count") plt.ylabel("accuracy") plt.ylim(0, 1.0) # グラフの表示 plt.show() |
Output
With RMSProp, good accuracy is obtained.
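The only difference between the AdaGrad and RMSProp loops above is how the accumulator h is updated: AdaGrad keeps adding squared gradients forever, so its effective step size can only shrink, while RMSProp lets old gradients decay so the step size can recover. A side-by-side sketch of a single update for one parameter (function names are illustrative):

import numpy as np

def adagrad_step(W, grad, h, learning_rate=0.1, eps=1e-7):
    # h accumulates all past squared gradients
    h = h + grad ** 2
    W = W - learning_rate * grad / (np.sqrt(h) + eps)
    return W, h

def rmsprop_step(W, grad, h, learning_rate=0.01, decay_rate=0.99, eps=1e-7):
    # h is an exponential moving average, so old gradients are forgotten
    h = decay_rate * h + (1 - decay_rate) * grad ** 2
    W = W - learning_rate * grad / (np.sqrt(h) + eps)
    return W, h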
Adam
# データの読み込み (x_train, d_train), (x_test, d_test) = load_mnist(normalize=True, one_hot_label=True) print("データ読み込み完了") # batch_normalizationの設定 ================================ # use_batchnorm = True use_batchnorm = False # ==================================================== network = MultiLayerNet(input_size=784, hidden_size_list=[40, 20], output_size=10, activation='sigmoid', weight_init_std=0.01, use_batchnorm=use_batchnorm) iters_num = 1000 train_size = x_train.shape[0] batch_size = 100 learning_rate = 0.01 beta1 = 0.9 beta2 = 0.999 train_loss_list = [] accuracies_train = [] accuracies_test = [] plot_interval=10 for i in range(iters_num): batch_mask = np.random.choice(train_size, batch_size) x_batch = x_train[batch_mask] d_batch = d_train[batch_mask] # 勾配 grad = network.gradient(x_batch, d_batch) if i == 0: m = {} v = {} learning_rate_t = learning_rate * np.sqrt(1.0 - beta2 ** (i + 1)) / (1.0 - beta1 ** (i + 1)) for key in ('W1', 'W2', 'W3', 'b1', 'b2', 'b3'): if i == 0: m[key] = np.zeros_like(network.params[key]) v[key] = np.zeros_like(network.params[key]) m[key] += (1 - beta1) * (grad[key] - m[key]) v[key] += (1 - beta2) * (grad[key] ** 2 - v[key]) network.params[key] -= learning_rate_t * m[key] / (np.sqrt(v[key]) + 1e-7) if (i + 1) % plot_interval == 0: accr_test = network.accuracy(x_test, d_test) accuracies_test.append(accr_test) accr_train = network.accuracy(x_batch, d_batch) accuracies_train.append(accr_train) loss = network.loss(x_batch, d_batch) train_loss_list.append(loss) print('Generation: ' + str(i+1) + '. 正答率(トレーニング) = ' + str(accr_train)) print(' : ' + str(i+1) + '. 正答率(テスト) = ' + str(accr_test)) lists = range(0, iters_num, plot_interval) plt.plot(lists, accuracies_train, label="training set") plt.plot(lists, accuracies_test, label="test set") plt.legend(loc="lower right") plt.title("accuracy") plt.xlabel("count") plt.ylabel("accuracy") plt.ylim(0, 1.0) # グラフの表示 plt.show() |
Output
Adam also works well.
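The in-loop updates m[key] += (1 - beta1) * (grad[key] - m[key]) and v[key] += (1 - beta2) * (grad[key] ** 2 - v[key]) are just a rewritten form of the usual exponential moving averages, and learning_rate_t folds in the bias correction. A standalone sketch of one Adam step in the more common notation (assumed equivalent to the loop above):

import numpy as np

def adam_step(W, grad, m, v, t, learning_rate=0.01, beta1=0.9, beta2=0.999, eps=1e-7):
    # Moving averages of the gradient and of the squared gradient
    m = beta1 * m + (1 - beta1) * grad         # same as m += (1 - beta1) * (grad - m)
    v = beta2 * v + (1 - beta2) * grad ** 2    # same as v += (1 - beta2) * (grad**2 - v)
    # Bias-corrected step size, as in learning_rate_t above (t starts at 1)
    lr_t = learning_rate * np.sqrt(1.0 - beta2 ** t) / (1.0 - beta1 ** t)
    W = W - lr_t * m / (np.sqrt(v) + eps)
    return W, m, v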
Section 3: Overfitting
No countermeasures
import numpy as np from collections import OrderedDict from common import layers from data.mnist import load_mnist import matplotlib.pyplot as plt from multi_layer_net import MultiLayerNet from common import optimizer (x_train, d_train), (x_test, d_test) = load_mnist(normalize=True) print("データ読み込み完了") # 過学習を再現するために、学習データを削減 x_train = x_train[:300] d_train = d_train[:300] network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100, 100, 100], output_size=10) optimizer = optimizer.SGD(learning_rate=0.01) iters_num = 1000 train_size = x_train.shape[0] batch_size = 100 train_loss_list = [] accuracies_train = [] accuracies_test = [] plot_interval=10 for i in range(iters_num): batch_mask = np.random.choice(train_size, batch_size) x_batch = x_train[batch_mask] d_batch = d_train[batch_mask] grad = network.gradient(x_batch, d_batch) optimizer.update(network.params, grad) loss = network.loss(x_batch, d_batch) train_loss_list.append(loss) if (i+1) % plot_interval == 0: accr_train = network.accuracy(x_train, d_train) accr_test = network.accuracy(x_test, d_test) accuracies_train.append(accr_train) accuracies_test.append(accr_test) print('Generation: ' + str(i+1) + '. 正答率(トレーニング) = ' + str(accr_train)) print(' : ' + str(i+1) + '. 正答率(テスト) = ' + str(accr_test)) lists = range(0, iters_num, plot_interval) plt.plot(lists, accuracies_train, label="training set") plt.plot(lists, accuracies_test, label="test set") plt.legend(loc="lower right") plt.title("accuracy") plt.xlabel("count") plt.ylabel("accuracy") plt.ylim(0, 1.0) # グラフの表示 plt.show() |
Output
The training data reaches 100% accuracy while the test data stays around 70%, showing a clear tendency to overfit.
L2 regularization
from common import optimizer (x_train, d_train), (x_test, d_test) = load_mnist(normalize=True) print("データ読み込み完了") # 過学習を再現するために、学習データを削減 x_train = x_train[:300] d_train = d_train[:300] network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100, 100, 100], output_size=10) iters_num = 1000 train_size = x_train.shape[0] batch_size = 100 learning_rate=0.01 train_loss_list = [] accuracies_train = [] accuracies_test = [] plot_interval=10 hidden_layer_num = network.hidden_layer_num # 正則化強度設定 ====================================== weight_decay_lambda = 0.1 # ================================================= for i in range(iters_num): batch_mask = np.random.choice(train_size, batch_size) x_batch = x_train[batch_mask] d_batch = d_train[batch_mask] grad = network.gradient(x_batch, d_batch) weight_decay = 0 for idx in range(1, hidden_layer_num+1): grad['W' + str(idx)] = network.layers['Affine' + str(idx)].dW + weight_decay_lambda * network.params['W' + str(idx)] grad['b' + str(idx)] = network.layers['Affine' + str(idx)].db network.params['W' + str(idx)] -= learning_rate * grad['W' + str(idx)] network.params['b' + str(idx)] -= learning_rate * grad['b' + str(idx)] weight_decay += 0.5 * weight_decay_lambda * np.sqrt(np.sum(network.params['W' + str(idx)] ** 2)) # L2正則化のコード loss = network.loss(x_batch, d_batch) + weight_decay train_loss_list.append(loss) if (i+1) % plot_interval == 0: accr_train = network.accuracy(x_train, d_train) accr_test = network.accuracy(x_test, d_test) accuracies_train.append(accr_train) accuracies_test.append(accr_test) print('Generation: ' + str(i+1) + '. 正答率(トレーニング) = ' + str(accr_train)) print(' : ' + str(i+1) + '. 正答率(テスト) = ' + str(accr_test)) lists = range(0, iters_num, plot_interval) plt.plot(lists, accuracies_train, label="training set") plt.plot(lists, accuracies_test, label="test set") plt.legend(loc="lower right") plt.title("accuracy") plt.xlabel("count") plt.ylabel("accuracy") plt.ylim(0, 1.0) # グラフの表示 plt.show() |
Output
With the regularization term added, the network no longer reaches 100% accuracy on the training set, but little improvement is seen on the test set.
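In the standard formulation the two regularizers differ only in the penalty added to the loss and in the extra term added to the weight gradient. A minimal illustration (W and weight_decay_lambda here are dummy values):

import numpy as np

W = np.array([[0.5, -1.2], [0.0, 2.0]])
weight_decay_lambda = 0.1

# L2 (weight decay): penalty = lambda/2 * sum(W^2), gradient term = lambda * W
l2_penalty  = 0.5 * weight_decay_lambda * np.sum(W ** 2)
l2_grad_add = weight_decay_lambda * W

# L1: penalty = lambda * sum(|W|), gradient term = lambda * sign(W)
l1_penalty  = weight_decay_lambda * np.sum(np.abs(W))
l1_grad_add = weight_decay_lambda * np.sign(W)

print('L2 penalty:', l2_penalty, ' L1 penalty:', l1_penalty)
# L1 pushes small weights exactly to zero (sparse W); L2 only shrinks them.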
L1 regularization
(x_train, d_train), (x_test, d_test) = load_mnist(normalize=True) print("データ読み込み完了") # 過学習を再現するために、学習データを削減 x_train = x_train[:300] d_train = d_train[:300] network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100, 100, 100], output_size=10) iters_num = 1000 train_size = x_train.shape[0] batch_size = 100 learning_rate=0.1 train_loss_list = [] accuracies_train = [] accuracies_test = [] plot_interval=10 hidden_layer_num = network.hidden_layer_num # 正則化強度設定 ====================================== weight_decay_lambda = 0.005 # ================================================= for i in range(iters_num): batch_mask = np.random.choice(train_size, batch_size) x_batch = x_train[batch_mask] d_batch = d_train[batch_mask] grad = network.gradient(x_batch, d_batch) weight_decay = 0 for idx in range(1, hidden_layer_num+1): grad['W' + str(idx)] = network.layers['Affine' + str(idx)].dW + weight_decay_lambda * np.sign(network.params['W' + str(idx)]) grad['b' + str(idx)] = network.layers['Affine' + str(idx)].db network.params['W' + str(idx)] -= learning_rate * grad['W' + str(idx)] network.params['b' + str(idx)] -= learning_rate * grad['b' + str(idx)] weight_decay += weight_decay_lambda * np.sum(np.abs(network.params['W' + str(idx)])) loss = network.loss(x_batch, d_batch) + weight_decay train_loss_list.append(loss) if (i+1) % plot_interval == 0: accr_train = network.accuracy(x_train, d_train) accr_test = network.accuracy(x_test, d_test) accuracies_train.append(accr_train) accuracies_test.append(accr_test) print('Generation: ' + str(i+1) + '. 正答率(トレーニング) = ' + str(accr_train)) print(' : ' + str(i+1) + '. 正答率(テスト) = ' + str(accr_test)) lists = range(0, iters_num, plot_interval) plt.plot(lists, accuracies_train, label="training set") plt.plot(lists, accuracies_test, label="test set") plt.legend(loc="lower right") plt.title("accuracy") plt.xlabel("count") plt.ylabel("accuracy") plt.ylim(0, 1.0) # グラフの表示 plt.show() |
With the regularization term added, the training accuracy no longer reaches 100%.
Dropout
class Dropout:
    def __init__(self, dropout_ratio=0.5):
        self.dropout_ratio = dropout_ratio
        self.mask = None

    def forward(self, x, train_flg=True):
        if train_flg:
            # Keep each unit with probability (1 - dropout_ratio)
            self.mask = np.random.rand(*x.shape) > self.dropout_ratio
            return x * self.mask
        else:
            # At test time all units are used, scaled by the keep probability
            return x * (1.0 - self.dropout_ratio)

    def backward(self, dout):
        # Gradients flow only through the units that were kept in forward
        return dout * self.mask
from common import optimizer (x_train, d_train), (x_test, d_test) = load_mnist(normalize=True) print("データ読み込み完了") # 過学習を再現するために、学習データを削減 x_train = x_train[:300] d_train = d_train[:300] # ドロップアウト設定 ====================================== use_dropout = True dropout_ratio = 0.15 # ==================================================== network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100, 100, 100], output_size=10, weight_decay_lambda=weight_decay_lambda, use_dropout = use_dropout, dropout_ratio = dropout_ratio) optimizer = optimizer.SGD(learning_rate=0.01) # optimizer = optimizer.Momentum(learning_rate=0.01, momentum=0.9) # optimizer = optimizer.AdaGrad(learning_rate=0.01) # optimizer = optimizer.Adam() iters_num = 1000 train_size = x_train.shape[0] batch_size = 100 train_loss_list = [] accuracies_train = [] accuracies_test = [] plot_interval=10 for i in range(iters_num): batch_mask = np.random.choice(train_size, batch_size) x_batch = x_train[batch_mask] d_batch = d_train[batch_mask] grad = network.gradient(x_batch, d_batch) optimizer.update(network.params, grad) loss = network.loss(x_batch, d_batch) train_loss_list.append(loss) if (i+1) % plot_interval == 0: accr_train = network.accuracy(x_train, d_train) accr_test = network.accuracy(x_test, d_test) accuracies_train.append(accr_train) accuracies_test.append(accr_test) print('Generation: ' + str(i+1) + '. 正答率(トレーニング) = ' + str(accr_train)) print(' : ' + str(i+1) + '. 正答率(テスト) = ' + str(accr_test)) lists = range(0, iters_num, plot_interval) plt.plot(lists, accuracies_train, label="training set") plt.plot(lists, accuracies_test, label="test set") plt.legend(loc="lower right") plt.title("accuracy") plt.xlabel("count") plt.ylabel("accuracy") plt.ylim(0, 1.0) # グラフの表示 plt.show() |
Because learning is slower, it has not converged within the given iterations, but the accuracy rises steadily.
Compared with the two previous cases the accuracy rises more gently: with dropout, each mini-batch is processed by a different randomly thinned sub-network, which acts much like training on a larger variety of data.
Since this is effectively like having more data to fit, optimization becomes correspondingly harder and the accuracy improves more slowly.
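This behaviour can be checked with the Dropout class defined above: during training roughly dropout_ratio of the units are zeroed on every forward pass, so each mini-batch is effectively handled by a different thinned sub-network. A minimal sketch:

import numpy as np

np.random.seed(0)
drop = Dropout(dropout_ratio=0.15)
x = np.ones((1, 1000))                       # 1000 dummy activations, all equal to 1

out_train = drop.forward(x, train_flg=True)
out_test  = drop.forward(x, train_flg=False)

print('fraction of units dropped:', 1.0 - out_train.mean())   # ~0.15
print('test-time output scale   :', out_test.mean())          # 0.85 = 1 - dropout_ratio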
Dropout + L1
from common import optimizer (x_train, d_train), (x_test, d_test) = load_mnist(normalize=True) print("データ読み込み完了") # 過学習を再現するために、学習データを削減 x_train = x_train[:300] d_train = d_train[:300] # ドロップアウト設定 ====================================== use_dropout = True dropout_ratio = 0.08 # ==================================================== network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100, 100, 100], output_size=10, use_dropout = use_dropout, dropout_ratio = dropout_ratio) iters_num = 1000 train_size = x_train.shape[0] batch_size = 100 learning_rate=0.01 train_loss_list = [] accuracies_train = [] accuracies_test = [] hidden_layer_num = network.hidden_layer_num plot_interval=10 # 正則化強度設定 ====================================== weight_decay_lambda=0.004 # ================================================= for i in range(iters_num): batch_mask = np.random.choice(train_size, batch_size) x_batch = x_train[batch_mask] d_batch = d_train[batch_mask] grad = network.gradient(x_batch, d_batch) weight_decay = 0 for idx in range(1, hidden_layer_num+1): grad['W' + str(idx)] = network.layers['Affine' + str(idx)].dW + weight_decay_lambda * np.sign(network.params['W' + str(idx)]) grad['b' + str(idx)] = network.layers['Affine' + str(idx)].db network.params['W' + str(idx)] -= learning_rate * grad['W' + str(idx)] network.params['b' + str(idx)] -= learning_rate * grad['b' + str(idx)] weight_decay += weight_decay_lambda * np.sum(np.abs(network.params['W' + str(idx)])) loss = network.loss(x_batch, d_batch) + weight_decay train_loss_list.append(loss) if (i+1) % plot_interval == 0: accr_train = network.accuracy(x_train, d_train) accr_test = network.accuracy(x_test, d_test) accuracies_train.append(accr_train) accuracies_test.append(accr_test) print('Generation: ' + str(i+1) + '. 正答率(トレーニング) = ' + str(accr_train)) print(' : ' + str(i+1) + '. 正答率(テスト) = ' + str(accr_test)) lists = range(0, iters_num, plot_interval) plt.plot(lists, accuracies_train, label="training set") plt.plot(lists, accuracies_test, label="test set") plt.legend(loc="lower right") plt.title("accuracy") plt.xlabel("count") plt.ylabel("accuracy") plt.ylim(0, 1.0) # グラフの表示 plt.show() |
Combining dropout with L1 regularization improves the accuracy, and training completes within the 1000 iterations.
Section 4: Convolutional Neural Network Concepts
Preparing classes and functions
A function for processing image data efficiently.
import pickle
import numpy as np
from collections import OrderedDict
from common import layers
from common import optimizer
from data.mnist import load_mnist
import matplotlib.pyplot as plt


# Convert image data into a 2-D array
'''
input_data: input values
filter_h: filter height
filter_w: filter width
stride: stride
pad: padding
'''
def im2col(input_data, filter_h, filter_w, stride=1, pad=0):
    # N: number, C: channel, H: height, W: width
    N, C, H, W = input_data.shape
    out_h = (H + 2 * pad - filter_h) // stride + 1
    out_w = (W + 2 * pad - filter_w) // stride + 1

    img = np.pad(input_data, [(0, 0), (0, 0), (pad, pad), (pad, pad)], 'constant')
    col = np.zeros((N, C, filter_h, filter_w, out_h, out_w))

    for y in range(filter_h):
        y_max = y + stride * out_h
        for x in range(filter_w):
            x_max = x + stride * out_w
            col[:, :, y, x, :, :] = img[:, :, y:y_max:stride, x:x_max:stride]

    # (N, C, filter_h, filter_w, out_h, out_w) -> (N, out_h, out_w, C, filter_h, filter_w)
    col = col.transpose(0, 4, 5, 1, 2, 3)
    col = col.reshape(N * out_h * out_w, -1)
    return col
Checking the operation
# Check what im2col does
input_data = np.random.rand(2, 1, 4, 4) * 100 // 1  # number, channel, height, width
print('==========  input_data  ===========\n', input_data)
print('==============================')
filter_h = 3
filter_w = 3
stride = 1
pad = 0
col = im2col(input_data, filter_h=filter_h, filter_w=filter_w, stride=stride, pad=pad)
print('=============  col  ==============\n', col)
print('==============================')
This is a transformation that makes the convolution operation fast.
Laying out the data that each filter position multiplies as a single row allows the convolution to be computed quickly as one matrix product.
This example assumes a 3×3 filter.
The first row of the output col is identical to the top-left 3×3 patch of the first input image.
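This can be verified programmatically: the first row of col should equal the flattened top-left 3×3 patch of the first image. A minimal check, reusing input_data and col from the cell above:

# Top-left 3x3 patch of the first image (single channel), flattened row by row
patch = input_data[0, 0, 0:3, 0:3].flatten()

print(patch)
print(col[0])                              # first row of the im2col output
print(np.array_equal(patch, col[0]))       # True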
Running the col2im function, the convolution class, the pooling class, and the SimpleConvNet class.
# 2次元配列を画像データに変換 def col2im(col, input_shape, filter_h, filter_w, stride=1, pad=0): # N: number, C: channel, H: height, W: width N, C, H, W = input_shape # 切り捨て除算 out_h = (H + 2 * pad - filter_h)//stride + 1 out_w = (W + 2 * pad - filter_w)//stride + 1 col = col.reshape(N, out_h, out_w, C, filter_h, filter_w).transpose(0, 3, 4, 5, 1, 2) # (N, filter_h, filter_w, out_h, out_w, C) img = np.zeros((N, C, H + 2 * pad + stride - 1, W + 2 * pad + stride - 1)) for y in range(filter_h): y_max = y + stride * out_h for x in range(filter_w): x_max = x + stride * out_w img[:, :, y:y_max:stride, x:x_max:stride] += col[:, :, y, x, :, :] return img[:, :, pad:H + pad, pad:W + pad] class Convolution: # W: フィルター, b: バイアス def __init__(self, W, b, stride=1, pad=0): self.W = W self.b = b self.stride = stride self.pad = pad # 中間データ(backward時に使用) self.x = None self.col = None self.col_W = None # フィルター・バイアスパラメータの勾配 self.dW = None self.db = None def forward(self, x): # FN: filter_number, C: channel, FH: filter_height, FW: filter_width FN, C, FH, FW = self.W.shape N, C, H, W = x.shape # 出力値のheight, width out_h = 1 + int((H + 2 * self.pad - FH) / self.stride) out_w = 1 + int((W + 2 * self.pad - FW) / self.stride) # xを行列に変換 col = im2col(x, FH, FW, self.stride, self.pad) # フィルターをxに合わせた行列に変換 col_W = self.W.reshape(FN, -1).T out = np.dot(col, col_W) + self.b # 計算のために変えた形式を戻す out = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2) self.x = x self.col = col self.col_W = col_W return out def backward(self, dout): FN, C, FH, FW = self.W.shape dout = dout.transpose(0, 2, 3, 1).reshape(-1, FN) self.db = np.sum(dout, axis=0) self.dW = np.dot(self.col.T, dout) self.dW = self.dW.transpose(1, 0).reshape(FN, C, FH, FW) dcol = np.dot(dout, self.col_W.T) # dcolを画像データに変換 dx = col2im(dcol, self.x.shape, FH, FW, self.stride, self.pad) return dx class Pooling: def __init__(self, pool_h, pool_w, stride=1, pad=0): self.pool_h = pool_h self.pool_w = pool_w self.stride = stride self.pad = pad self.x = None self.arg_max = None def forward(self, x): N, C, H, W = x.shape out_h = int(1 + (H - self.pool_h) / self.stride) out_w = int(1 + (W - self.pool_w) / self.stride) # xを行列に変換 col = im2col(x, self.pool_h, self.pool_w, self.stride, self.pad) # プーリングのサイズに合わせてリサイズ col = col.reshape(-1, self.pool_h*self.pool_w) # 行ごとに最大値を求める arg_max = np.argmax(col, axis=1) out = np.max(col, axis=1) # 整形 out = out.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2) self.x = x self.arg_max = arg_max return out def backward(self, dout): dout = dout.transpose(0, 2, 3, 1) pool_size = self.pool_h * self.pool_w dmax = np.zeros((dout.size, pool_size)) dmax[np.arange(self.arg_max.size), self.arg_max.flatten()] = dout.flatten() dmax = dmax.reshape(dout.shape + (pool_size,)) dcol = dmax.reshape(dmax.shape[0] * dmax.shape[1] * dmax.shape[2], -1) dx = col2im(dcol, self.x.shape, self.pool_h, self.pool_w, self.stride, self.pad) return dx class SimpleConvNet: # conv - relu - pool - affine - relu - affine - softmax def __init__(self, input_dim=(1, 28, 28), conv_param={'filter_num':30, 'filter_size':5, 'pad':0, 'stride':1}, hidden_size=100, output_size=10, weight_init_std=0.01): filter_num = conv_param['filter_num'] filter_size = conv_param['filter_size'] filter_pad = conv_param['pad'] filter_stride = conv_param['stride'] input_size = input_dim[1] conv_output_size = (input_size - filter_size + 2 * filter_pad) / filter_stride + 1 pool_output_size = int(filter_num * (conv_output_size / 2) * (conv_output_size / 2)) # 重みの初期化 self.params = {} self.params['W1'] = weight_init_std * 
np.random.randn(filter_num, input_dim[0], filter_size, filter_size) self.params['b1'] = np.zeros(filter_num) self.params['W2'] = weight_init_std * np.random.randn(pool_output_size, hidden_size) self.params['b2'] = np.zeros(hidden_size) self.params['W3'] = weight_init_std * np.random.randn(hidden_size, output_size) self.params['b3'] = np.zeros(output_size) # レイヤの生成 self.layers = OrderedDict() self.layers['Conv1'] = layers.Convolution(self.params['W1'], self.params['b1'], conv_param['stride'], conv_param['pad']) self.layers['Relu1'] = layers.Relu() self.layers['Pool1'] = layers.Pooling(pool_h=2, pool_w=2, stride=2) self.layers['Affine1'] = layers.Affine(self.params['W2'], self.params['b2']) self.layers['Relu2'] = layers.Relu() self.layers['Affine2'] = layers.Affine(self.params['W3'], self.params['b3']) self.last_layer = layers.SoftmaxWithLoss() def predict(self, x): for key in self.layers.keys(): x = self.layers[key].forward(x) return x def loss(self, x, d): y = self.predict(x) return self.last_layer.forward(y, d) def accuracy(self, x, d, batch_size=100): if d.ndim != 1 : d = np.argmax(d, axis=1) acc = 0.0 for i in range(int(x.shape[0] / batch_size)): tx = x[i*batch_size:(i+1)*batch_size] td = d[i*batch_size:(i+1)*batch_size] y = self.predict(tx) y = np.argmax(y, axis=1) acc += np.sum(y == td) return acc / x.shape[0] def gradient(self, x, d): # forward self.loss(x, d) # backward dout = 1 dout = self.last_layer.backward(dout) layers = list(self.layers.values()) layers.reverse() for layer in layers: dout = layer.backward(dout) # 設定 grad = {} grad['W1'], grad['b1'] = self.layers['Conv1'].dW, self.layers['Conv1'].db grad['W2'], grad['b2'] = self.layers['Affine1'].dW, self.layers['Affine1'].db grad['W3'], grad['b3'] = self.layers['Affine2'].dW, self.layers['Affine2'].db return grad |
Image classification (MNIST)
We perform image classification using the MNIST dataset.
from common import optimizer # データの読み込み (x_train, d_train), (x_test, d_test) = load_mnist(flatten=False) print("データ読み込み完了") # 処理に時間のかかる場合はデータを削減 x_train, d_train = x_train[:5000], d_train[:5000] x_test, d_test = x_test[:1000], d_test[:1000] network = SimpleConvNet(input_dim=(1,28,28), conv_param = {'filter_num': 30, 'filter_size': 5, 'pad': 0, 'stride': 1}, hidden_size=100, output_size=10, weight_init_std=0.01) optimizer = optimizer.Adam() iters_num = 1000 train_size = x_train.shape[0] batch_size = 100 train_loss_list = [] accuracies_train = [] accuracies_test = [] plot_interval=10 for i in range(iters_num): batch_mask = np.random.choice(train_size, batch_size) x_batch = x_train[batch_mask] d_batch = d_train[batch_mask] grad = network.gradient(x_batch, d_batch) optimizer.update(network.params, grad) loss = network.loss(x_batch, d_batch) train_loss_list.append(loss) if (i+1) % plot_interval == 0: accr_train = network.accuracy(x_train, d_train) accr_test = network.accuracy(x_test, d_test) accuracies_train.append(accr_train) accuracies_test.append(accr_test) print('Generation: ' + str(i+1) + '. 正答率(トレーニング) = ' + str(accr_train)) print(' : ' + str(i+1) + '. 正答率(テスト) = ' + str(accr_test)) lists = range(0, iters_num, plot_interval) plt.plot(lists, accuracies_train, label="training set") plt.plot(lists, accuracies_test, label="test set") plt.legend(loc="lower right") plt.title("accuracy") plt.xlabel("count") plt.ylabel("accuracy") plt.ylim(0, 1.0) # グラフの表示 plt.show() |
The accuracy reaches about 99% on the training data and about 95% on the test data, a good result.
The run took about ten minutes to complete.
Section 5: State-of-the-Art CNN (AlexNet)
There is no corresponding source code for this section, so implementation is omitted.
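For reference, the AlexNet architecture can still be summarized as a layer list. The outline below is only an illustrative summary of the commonly cited 2012 configuration (five convolutional layers followed by three fully connected layers, ReLU activations, dropout in the fully connected part); exact padding and parameter counts are simplified.

# Illustrative outline of AlexNet (ILSVRC 2012), input 224x224x3
alexnet_layers = [
    ('conv1', '96 filters, 11x11, stride 4, ReLU'),
    ('pool1', 'max pooling 3x3, stride 2'),
    ('conv2', '256 filters, 5x5, ReLU'),
    ('pool2', 'max pooling 3x3, stride 2'),
    ('conv3', '384 filters, 3x3, ReLU'),
    ('conv4', '384 filters, 3x3, ReLU'),
    ('conv5', '256 filters, 3x3, ReLU'),
    ('pool3', 'max pooling 3x3, stride 2'),
    ('fc6',   '4096 units, ReLU, dropout 0.5'),
    ('fc7',   '4096 units, ReLU, dropout 0.5'),
    ('fc8',   '1000 units, softmax'),
]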