# 推导 (derivation of the equations used below): https://zybuluo.com/hanbingtao/note/581764
import random
import numpy as np
import math

# Private generator used only for parameter initialisation.  It is seeded
# once, so runs stay reproducible, successive draws differ from each other,
# and the caller's global ``np.random`` state is left untouched.
_RNG = np.random.RandomState(0)


def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), applied element-wise."""
    return 1. / (1 + np.exp(-x))


def sigmoid_derivative(values):
    """Return ``values * (1 - values)``.

    This equals the derivative of the sigmoid when ``values`` already holds
    sigmoid *outputs* (not the pre-activation inputs).
    """
    return values*(1-values)


def tanh(x):
    """Return tanh(x), computed as 2 / (1 + exp(-2x)) - 1."""
    return 2.0 / (1.0 + np.exp(-2 * x)) - 1.0


def tanh_derivative(values):
    """Return ``1 - values ** 2``.

    This equals the derivative of tanh when ``values`` already holds tanh
    *outputs* (not the pre-activation inputs).
    """
    return 1. - values ** 2


def rand_arr(a, b, *args):
    """Create a uniform random array with values in [a, b) and shape ``args``.

    The previous implementation called ``np.random.seed(0)`` on every call.
    That made every array of a given shape identical (so all gate weight
    matrices started out equal) and reset the global NumPy RNG as a side
    effect.  Values are now drawn from a module-private generator that is
    seeded once with 0.
    """
    return _RNG.rand(*args) * (b - a) + a
class LstmParam:
    """Trainable LSTM parameters together with their accumulated gradients.

    For each of the four gates (g, i, f, o) the object holds a weight matrix
    ``w<gate>`` of shape (mem_cell_ct, x_dim + mem_cell_ct), a bias vector
    ``b<gate>`` of length mem_cell_ct, and matching ``*_diff`` arrays that
    accumulate the derivative of the loss w.r.t. that parameter.
    """

    # Attribute names in the order they are created and updated.
    _WEIGHT_NAMES = ("wg", "wi", "wf", "wo")
    _BIAS_NAMES = ("bg", "bi", "bf", "bo")

    def __init__(self, mem_cell_ct, x_dim):
        self.mem_cell_ct = mem_cell_ct
        self.x_dim = x_dim
        # Each weight matrix acts on the concatenation [x(t), h(t-1)].
        concat_len = x_dim + mem_cell_ct

        # Weight matrices, then bias terms, drawn from [-0.1, 0.1).
        for weight_name in self._WEIGHT_NAMES:
            setattr(self, weight_name,
                    rand_arr(-0.1, 0.1, mem_cell_ct, concat_len))
        for bias_name in self._BIAS_NAMES:
            setattr(self, bias_name, rand_arr(-0.1, 0.1, mem_cell_ct))

        # Gradient accumulators start at zero.
        for weight_name in self._WEIGHT_NAMES:
            setattr(self, weight_name + "_diff",
                    np.zeros((mem_cell_ct, concat_len)))
        for bias_name in self._BIAS_NAMES:
            setattr(self, bias_name + "_diff", np.zeros(mem_cell_ct))

    def apply_diff(self, lr = 1):
        """Take one gradient-descent step of size ``lr`` and clear the gradients."""
        every_param = self._WEIGHT_NAMES + self._BIAS_NAMES

        for param_name in every_param:
            value = getattr(self, param_name)
            value -= lr * getattr(self, param_name + "_diff")
            setattr(self, param_name, value)

        # Reset diffs to zero with freshly allocated arrays.
        for param_name in every_param:
            setattr(self, param_name + "_diff",
                    np.zeros_like(getattr(self, param_name)))
class LstmState:
    """Activations of one LSTM time step plus the gradients it passes back."""

    def __init__(self, mem_cell_ct, x_dim):
        # x_dim is not used by this constructor; it is kept in the signature
        # so existing callers keep working.
        self.g = np.zeros(mem_cell_ct)  # candidate cell input (tanh)
        self.i = np.zeros(mem_cell_ct)  # input gate
        self.f = np.zeros(mem_cell_ct)  # forget gate
        self.o = np.zeros(mem_cell_ct)  # output gate
        self.s = np.zeros(mem_cell_ct)  # cell state
        self.h = np.zeros(mem_cell_ct)  # node output
        # Gradients w.r.t. h(t-1) and s(t-1), filled in by LstmNode.top_diff_is.
        self.bottom_diff_h = np.zeros_like(self.h)
        self.bottom_diff_s = np.zeros_like(self.s)


class LstmNode:
    """One unrolled LSTM time step sharing its parameters with every other step."""

    def __init__(self, lstm_param, lstm_state):
        # store reference to parameters and to activations
        self.state = lstm_state
        self.param = lstm_param
        # non-recurrent input concatenated with recurrent input
        self.xc = None

    def bottom_data_is(self, x, s_prev = None, h_prev = None):
        """Run the forward pass for input ``x``.

        ``s_prev`` / ``h_prev`` are the previous step's cell state and output;
        when omitted (first node of the network) zeros are used.  Results are
        written to ``self.state``; ``self.xc``, ``self.s_prev`` and
        ``self.h_prev`` are saved for the backward pass.
        """
        # Identity test on purpose: ``array == None`` compares element-wise
        # and the resulting array cannot be used as an ``if`` condition
        # (ValueError for arrays with more than one element).
        if s_prev is None:
            s_prev = np.zeros_like(self.state.s)
        if h_prev is None:
            h_prev = np.zeros_like(self.state.h)
        # save data for use in backprop
        self.s_prev = s_prev
        self.h_prev = h_prev

        # concatenate x(t) and h(t-1)
        xc = np.hstack((x, h_prev))
        # candidate cell input
        self.state.g = np.tanh(np.dot(self.param.wg, xc) + self.param.bg)
        # input gate
        self.state.i = sigmoid(np.dot(self.param.wi, xc) + self.param.bi)
        # forget gate
        self.state.f = sigmoid(np.dot(self.param.wf, xc) + self.param.bf)
        # output gate
        self.state.o = sigmoid(np.dot(self.param.wo, xc) + self.param.bo)
        # cell-state update
        self.state.s = self.state.g * self.state.i + s_prev * self.state.f
        # final output (this variant applies no tanh to s; top_diff_is matches)
        self.state.h = self.state.s * self.state.o

        self.xc = xc

    def top_diff_is(self, top_diff_h, top_diff_s):
        """Run the backward pass for this step.

        ``top_diff_h`` is the loss gradient w.r.t. this node's output h and
        ``top_diff_s`` the gradient w.r.t. its cell state coming from the next
        step.  Parameter gradients are *added* to ``self.param.*_diff``; the
        gradients for the previous step are stored in
        ``self.state.bottom_diff_s`` and ``self.state.bottom_diff_h``.
        """
        # notice that top_diff_s is carried along the constant error carousel
        ds = self.state.o * top_diff_h + top_diff_s
        do = self.state.s * top_diff_h
        di = self.state.g * ds
        dg = self.state.i * ds
        df = self.s_prev * ds

        # diffs w.r.t. vector inside sigma / tanh function
        di_input = sigmoid_derivative(self.state.i) * di
        df_input = sigmoid_derivative(self.state.f) * df
        do_input = sigmoid_derivative(self.state.o) * do
        dg_input = tanh_derivative(self.state.g) * dg

        # diffs w.r.t. inputs
        self.param.wi_diff += np.outer(di_input, self.xc)
        self.param.wf_diff += np.outer(df_input, self.xc)
        self.param.wo_diff += np.outer(do_input, self.xc)
        self.param.wg_diff += np.outer(dg_input, self.xc)
        self.param.bi_diff += di_input
        self.param.bf_diff += df_input
        self.param.bo_diff += do_input
        self.param.bg_diff += dg_input

        # compute bottom diff
        dxc = np.zeros_like(self.xc)
        dxc += np.dot(self.param.wi.T, di_input)
        dxc += np.dot(self.param.wf.T, df_input)
        dxc += np.dot(self.param.wo.T, do_input)
        dxc += np.dot(self.param.wg.T, dg_input)

        # save bottom diffs; the tail of dxc (past x_dim) belongs to h(t-1)
        self.state.bottom_diff_s = ds * self.state.f
        self.state.bottom_diff_h = dxc[self.param.x_dim:]
class LstmNetwork():
    """A chain of LstmNode objects, one per element of the input sequence."""

    def __init__(self, lstm_param):
        self.lstm_param = lstm_param
        self.lstm_node_list = []
        # input sequence
        self.x_list = []

    def y_list_is(self, y_list, loss_layer):
        """Back-propagate the loss for the target sequence ``y_list``.

        Accumulates gradients in ``self.lstm_param`` but does *NOT* change the
        parameters themselves; call ``self.lstm_param.apply_diff()`` for that.
        ``loss_layer`` must provide ``loss(pred, label)`` and
        ``bottom_diff(pred, label)``.  Returns the loss summed over all steps.
        """
        assert len(y_list) == len(self.x_list)
        nodes = self.lstm_node_list
        last = len(self.x_list) - 1

        # The final time step only receives a gradient from its own label;
        # nothing downstream depends on its cell state, so diff_s is zero.
        tail = nodes[last]
        loss = loss_layer.loss(tail.state.h, y_list[last])
        diff_h = loss_layer.bottom_diff(tail.state.h, y_list[last])
        diff_s = np.zeros(self.lstm_param.mem_cell_ct)
        tail.top_diff_is(diff_h, diff_s)

        # Earlier steps add the gradient flowing back from their successor
        # to diff_h and carry diff_s along the constant error carousel.
        for idx in range(last - 1, -1, -1):
            node = nodes[idx]
            loss += loss_layer.loss(node.state.h, y_list[idx])
            diff_h = loss_layer.bottom_diff(node.state.h, y_list[idx])
            successor = nodes[idx + 1].state
            diff_h += successor.bottom_diff_h
            diff_s = successor.bottom_diff_s
            node.top_diff_is(diff_h, diff_s)

        return loss

    def x_list_clear(self):
        """Forget the stored inputs; already created nodes are kept for reuse."""
        self.x_list = []

    def x_list_add(self, x):
        """Append input ``x`` and run the forward pass of its time step."""
        self.x_list.append(x)
        if len(self.x_list) > len(self.lstm_node_list):
            # need to add new lstm node, create new state mem
            fresh_state = LstmState(self.lstm_param.mem_cell_ct,
                                    self.lstm_param.x_dim)
            self.lstm_node_list.append(LstmNode(self.lstm_param, fresh_state))

        # index of the most recent x input
        idx = len(self.x_list) - 1
        if idx == 0:
            # no recurrent inputs yet
            self.lstm_node_list[idx].bottom_data_is(x)
            return

        previous = self.lstm_node_list[idx - 1].state
        s_prev = previous.s
        h_prev = previous.h
        self.lstm_node_list[idx].bottom_data_is(x, s_prev, h_prev)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 | class LstmLayer(object): def __init__(self, input_width, state_width, learning_rate): self.input_width = input_width self.state_width = state_width self.learning_rate = learning_rate # 门的激活函数 self.gate_activator = SigmoidActivator() # 输出的激活函数 self.output_activator = TanhActivator() # 当前时刻初始化为t0 self.times = 0 # 各个时刻的单元状态向量c self.c_list = self.init_state_vec() # 各个时刻的输出向量h self.h_list = self.init_state_vec() # 各个时刻的遗忘门f self.f_list = self.init_state_vec() # 各个时刻的输入门i self.i_list = self.init_state_vec() # 各个时刻的输出门o self.o_list = self.init_state_vec() # 各个时刻的即时状态c~ self.ct_list = self.init_state_vec() # 遗忘门权重矩阵Wfh, Wfx, 偏置项bf self.Wfh, self.Wfx, self.bf = ( self.init_weight_mat()) # 输入门权重矩阵Wfh, Wfx, 偏置项bf self.Wih, self.Wix, self.bi = ( self.init_weight_mat()) # 输出门权重矩阵Wfh, Wfx, 偏置项bf self.Woh, self.Wox, 
self.bo = ( self.init_weight_mat()) # 单元状态权重矩阵Wfh, Wfx, 偏置项bf self.Wch, self.Wcx, self.bc = ( self.init_weight_mat()) def init_state_vec(self): ''' 初始化保存状态的向量 ''' state_vec_list = [] state_vec_list.append(np.zeros( (self.state_width, 1))) return state_vec_list def init_weight_mat(self): ''' 初始化权重矩阵 ''' Wh = np.random.uniform(-1e-4, 1e-4, (self.state_width, self.state_width)) Wx = np.random.uniform(-1e-4, 1e-4, (self.state_width, self.input_width)) b = np.zeros((self.state_width, 1)) return Wh, Wx, b def forward(self, x): ''' 根据式1-式6进行前向计算 ''' self.times += 1 # 遗忘门 fg = self.calc_gate(x, self.Wfx, self.Wfh, self.bf, self.gate_activator) self.f_list.append(fg) # 输入门 ig = self.calc_gate(x, self.Wix, self.Wih, self.bi, self.gate_activator) self.i_list.append(ig) # 输出门 og = self.calc_gate(x, self.Wox, self.Woh, self.bo, self.gate_activator) self.o_list.append(og) # 即时状态 ct = self.calc_gate(x, self.Wcx, self.Wch, self.bc, self.output_activator) self.ct_list.append(ct) # 单元状态 c = fg * self.c_list[self.times - 1] + ig * ct self.c_list.append(c) # 输出 h = og * self.output_activator.forward(c) self.h_list.append(h) def calc_gate(self, x, Wx, Wh, b, activator): ''' 计算门 ''' h = self.h_list[self.times - 1] # 上次的LSTM输出 net = np.dot(Wh, h) + np.dot(Wx, x) + b gate = activator.forward(net) return gate def backward(self, x, delta_h, activator): ''' 实现LSTM训练算法 ''' self.calc_delta(delta_h, activator) self.calc_gradient(x) def update(self): ''' 按照梯度下降,更新权重 ''' self.Wfh -= self.learning_rate * self.Whf_grad self.Wfx -= self.learning_rate * self.Whx_grad self.bf -= self.learning_rate * self.bf_grad self.Wih -= self.learning_rate * self.Whi_grad self.Wix -= self.learning_rate * self.Whi_grad self.bi -= self.learning_rate * self.bi_grad self.Woh -= self.learning_rate * self.Wof_grad self.Wox -= self.learning_rate * self.Wox_grad self.bo -= self.learning_rate * self.bo_grad self.Wch -= self.learning_rate * self.Wcf_grad self.Wcx -= self.learning_rate * self.Wcx_grad self.bc -= 
self.learning_rate * self.bc_grad def calc_delta(self, delta_h, activator): # 初始化各个时刻的误差项 self.delta_h_list = self.init_delta() # 输出误差项 self.delta_o_list = self.init_delta() # 输出门误差项 self.delta_i_list = self.init_delta() # 输入门误差项 self.delta_f_list = self.init_delta() # 遗忘门误差项 self.delta_ct_list = self.init_delta() # 即时输出误差项 # 保存从上一层传递下来的当前时刻的误差项 self.delta_h_list[-1] = delta_h # 迭代计算每个时刻的误差项 for k in range(self.times, 0, -1): self.calc_delta_k(k) def init_delta(self): ''' 初始化误差项 ''' delta_list = [] for i in range(self.times + 1): delta_list.append(np.zeros( (self.state_width, 1))) return delta_list def calc_delta_k(self, k): ''' 根据k时刻的delta_h,计算k时刻的delta_f、 delta_i、delta_o、delta_ct,以及k-1时刻的delta_h ''' # 获得k时刻前向计算的值 ig = self.i_list[k] og = self.o_list[k] fg = self.f_list[k] ct = self.ct_list[k] c = self.c_list[k] c_prev = self.c_list[k-1] tanh_c = self.output_activator.forward(c) delta_k = self.delta_h_list[k] # 根据式9计算delta_o delta_o = (delta_k * tanh_c * self.gate_activator.backward(og)) delta_f = (delta_k * og * (1 - tanh_c * tanh_c) * c_prev * self.gate_activator.backward(fg)) delta_i = (delta_k * og * (1 - tanh_c * tanh_c) * ct * self.gate_activator.backward(ig)) delta_ct = (delta_k * og * (1 - tanh_c * tanh_c) * ig * self.output_activator.backward(ct)) delta_h_prev = ( np.dot(delta_o.transpose(), self.Woh) + np.dot(delta_i.transpose(), self.Wih) + np.dot(delta_f.transpose(), self.Wfh) + np.dot(delta_ct.transpose(), self.Wch) ).transpose() # 保存全部delta值 self.delta_h_list[k-1] = delta_h_prev self.delta_f_list[k] = delta_f self.delta_i_list[k] = delta_i self.delta_o_list[k] = delta_o self.delta_ct_list[k] = delta_ct def calc_gradient(self, x): # 初始化遗忘门权重梯度矩阵和偏置项 self.Wfh_grad, self.Wfx_grad, self.bf_grad = ( self.init_weight_gradient_mat()) # 初始化输入门权重梯度矩阵和偏置项 self.Wih_grad, self.Wix_grad, self.bi_grad = ( self.init_weight_gradient_mat()) # 初始化输出门权重梯度矩阵和偏置项 self.Woh_grad, self.Wox_grad, self.bo_grad = ( self.init_weight_gradient_mat()) # 初始化单元状态权重梯度矩阵和偏置项 
self.Wch_grad, self.Wcx_grad, self.bc_grad = ( self.init_weight_gradient_mat()) # 计算对上一次输出h的权重梯度 for t in range(self.times, 0, -1): # 计算各个时刻的梯度 (Wfh_grad, bf_grad, Wih_grad, bi_grad, Woh_grad, bo_grad, Wch_grad, bc_grad) = ( self.calc_gradient_t(t)) # 实际梯度是各时刻梯度之和 self.Wfh_grad += Wfh_grad self.bf_grad += bf_grad self.Wih_grad += Wih_grad self.bi_grad += bi_grad self.Woh_grad += Woh_grad self.bo_grad += bo_grad self.Wch_grad += Wch_grad self.bc_grad += bc_grad # 计算对本次输入x的权重梯度 xt = x.transpose() self.Wfx_grad = np.dot(self.delta_f_list[-1], xt) self.Wix_grad = np.dot(self.delta_i_list[-1], xt) self.Wox_grad = np.dot(self.delta_o_list[-1], xt) self.Wcx_grad = np.dot(self.delta_ct_list[-1], xt) def init_weight_gradient_mat(self): ''' 初始化权重矩阵 ''' Wh_grad = np.zeros((self.state_width, self.state_width)) Wx_grad = np.zeros((self.state_width, self.input_width)) b_grad = np.zeros((self.state_width, 1)) return Wh_grad, Wx_grad, b_grad def calc_gradient_t(self, t): ''' 计算每个时刻t权重的梯度 ''' h_prev = self.h_list[t-1].transpose() Wfh_grad = np.dot(self.delta_f_list[t], h_prev) bf_grad = self.delta_f_list[t] Wih_grad = np.dot(self.delta_i_list[t], h_prev) bi_grad = self.delta_f_list[t] Woh_grad = np.dot(self.delta_o_list[t], h_prev) bo_grad = self.delta_f_list[t] Wch_grad = np.dot(self.delta_ct_list[t], h_prev) bc_grad = self.delta_ct_list[t] return Wfh_grad, bf_grad, Wih_grad, bi_grad, \ Woh_grad, bo_grad, Wch_grad, bc_grad def reset_state(self): # 当前时刻初始化为t0 self.times = 0 # 各个时刻的单元状态向量c self.c_list = self.init_state_vec() # 各个时刻的输出向量h self.h_list = self.init_state_vec() # 各个时刻的遗忘门f self.f_list = self.init_state_vec() # 各个时刻的输入门i self.i_list = self.init_state_vec() # 各个时刻的输出门o self.o_list = self.init_state_vec() # 各个时刻的即时状态c~ self.ct_list = self.init_state_vec() def data_set(): x = [np.array([[1], [2], [3]]), np.array([[2], [3], [4]])] d = np.array([[1], [2]]) return x, d def gradient_check(): ''' 梯度检查 ''' # 设计一个误差函数,取所有节点输出项之和 error_function = lambda o: o.sum() lstm = 
LstmLayer(3, 2, 1e-3) # 计算forward值 x, d = data_set() lstm.forward(x[0]) lstm.forward(x[1]) # 求取sensitivity map sensitivity_array = np.ones(lstm.h_list[-1].shape, dtype=np.float64) # 计算梯度 lstm.backward(x[1], sensitivity_array, IdentityActivator()) # 检查梯度 epsilon = 10e-4 for i in range(lstm.Wfh.shape[0]): for j in range(lstm.Wfh.shape[1]): lstm.Wfh[i,j] += epsilon lstm.reset_state() lstm.forward(x[0]) lstm.forward(x[1]) err1 = error_function(lstm.h_list[-1]) lstm.Wfh[i,j] -= 2*epsilon lstm.reset_state() lstm.forward(x[0]) lstm.forward(x[1]) err2 = error_function(lstm.h_list[-1]) expect_grad = (err1 - err2) / (2 * epsilon) lstm.Wfh[i,j] += epsilon print 'weights(%d,%d): expected - actural %.4e - %.4e' % ( i, j, expect_grad, lstm.Wfh_grad[i,j]) return lstm def test(): l = LstmLayer(3, 2, 1e-3) x, d = data_set() l.forward(x[0]) l.forward(x[1]) l.backward(x[1], d, IdentityActivator()) return l |