赞
踩
# 导入数据及画图函数 def load_data(path, transpose=True): data = sio.loadmat(path) y = data.get('y') # (5000,1) y = y.reshape(y.shape[0]) # make it back to column vector X = data.get('X') # (5000,400) if transpose: # for this dataset, you need a transpose to get the orientation right X = np.array([im.reshape((20, 20)).T for im in X]) # and I flat the image again to preserve the vector presentation X = np.array([im.reshape(400) for im in X]) return X, y X, _ = load_data('ex4data1.mat') def plot_100_image(X): """ sample 100 image and show them assume the image is square X : (5000, 400) """ size = int(np.sqrt(X.shape[1])) # sample 100 image, reshape, reorg it sample_idx = np.random.choice(np.arange(X.shape[0]), 100) # 100*400 sample_images = X[sample_idx, :] fig, ax_array = plt.subplots(nrows=10, ncols=10, sharey=True, sharex=True, figsize=(8, 8)) for r in range(10): for c in range(10): ax_array[r, c].matshow(sample_images[10 * r + c].reshape((size, size)), cmap=matplotlib.cm.binary) plt.xticks(np.array([])) plt.yticks(np.array([])) plot_100_image(X) plt.show()
# 数据读取与数据处理 X_raw, y_raw = load_data('ex4data1.mat', transpose=False) X = np.insert(X_raw, 0, np.ones(X_raw.shape[0]), axis=1) # 增加全部为1的一列 print('X.shape:', X.shape) # X.shape: (5000, 401) def expand_y(y): # 相当于是实现了独热编码的功能 # """expand 5000*1 into 5000*10 # where y=10 -> [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]: ndarray # """ res = [] for i in y: y_array = np.zeros(10) y_array[i - 1] = 1 res.append(y_array) return np.array(res) y = expand_y(y_raw) print('y.shape:', y.shape) # y.shape: (5000, 10)
np.ravel:对于[[1,2,3],[4,5,6]],会变成[1,2,3,4,5,6]# 序列化和反序列化 # 读取权重 def load_weight(path): data = sio.loadmat(path) return data['Theta1'], data['Theta2'] t1, t2 = load_weight('ex4weights.mat') print('t1.shape:', t1.shape) print('t2.shape:', t2.shape) # 定义序列化函数 # 在这个nn架构中,我们有theta1(25,401),theta2(10,26),它们的梯度是delta1,delta2 def serialize(a, b): # 将权重序列化,变成一个列向量 return np.concatenate((np.ravel(a), np.ravel(b))) theta = serialize(t1, t2) # 扁平化参数,25*401+10*26=10285 print('theta.shape:', theta.shape) # 定义反序列化函数 def deserialize(seq): # """into ndarray of (25, 401), (10, 26)""" return seq[:25 * 401].reshape(25, 401), seq[25 * 401:].reshape(10, 26)
# 前向传播 def sigmoid(z): return 1 / (1 + np.exp(-z)) def feed_forward(theta, X): """apply to architecture 400+1 * 25+1 *10 X: 5000 * 401 """ t1, t2 = deserialize(theta) # t1: (25,401) t2: (10,26) m = X.shape[0] # 样本数 a1 = X # 5000 * 401 z2 = a1 @ t1.T # 5000 * 25 a2 = np.insert(sigmoid(z2), 0, np.ones(m), axis=1) # 5000*26 z3 = a2 @ t2.T # 5000 * 10 h = sigmoid(z3) # 5000*10, this is h_theta(X) return a1, z2, a2, z3, h # you need all those for backprop _, _, _, _, h = feed_forward(theta, X) print('h.shape:', h.shape)
# 代价函数 def cost(theta, X, y): # """calculate cost # y: (m, k) ndarray # """ m = X.shape[0] # get the data size m _, _, _, _, h = feed_forward(theta, X) # np.multiply is pairwise operation pair_computation = -np.multiply(y, np.log(h)) - np.multiply((1 - y), np.log(1 - h)) # (5000,10) pair_computation_sum = pair_computation.sum() return pair_computation_sum / m print('cost(theta, X, y):', cost(theta, X, y)) # 正则化代价函数 def regularized_cost(theta, X, y, l=1): """the first column of t1 and t2 is intercept theta, ignore them when you do regularization""" t1, t2 = deserialize(theta) # t1: (25,401) t2: (10,26) m = X.shape[0] reg_t1 = (l / (2 * m)) * np.power(t1[:, 1:], 2).sum() # this is how you ignore first col reg_t2 = (l / (2 * m)) * np.power(t2[:, 1:], 2).sum() return cost(theta, X, y) + reg_t1 + reg_t2 print('regularized_cost(theta, X, y):', regularized_cost(theta, X, y))
1、反向传播过程的阐述详见:5-反向传播算法过程梳理_骑着蜗牛环游深度学习世界的博客-CSDN博客
2、借助李宏毅老师的讲解加深对反向传播过程的理解,然后从代价反向一层层的通过权重传递的思想(即吴恩达老师给出的算法)看懂该部分的代码实现
# 1、计算激活函数的导数
def sigmoid_gradient(z):
# 下面的计算结果等同于对激活函数求导的结果
return np.multiply(sigmoid(z), (1 - sigmoid(z)))

# 2、计算theta的梯度 def gradient(theta, X, y): # initialize t1, t2 = deserialize(theta) # t1: (25,401) t2: (10,26) m = X.shape[0] # 初始化梯度,即每个参数的偏导数 delta1 = np.zeros(t1.shape) # (25, 401) delta2 = np.zeros(t2.shape) # (10, 26) # 进行前向传播 a1, z2, a2, z3, h = feed_forward(theta, X) ''' a1.shape: (5000, 401) z2.shape: (5000, 25) a2.shape: (5000, 26) z3.shape: (5000, 10) h.shape: (5000, 10) ''' for i in range(m): a1i = a1[i, :] # (1, 401) z2i = z2[i, :] # (1, 25) a2i = a2[i, :] # (1, 26) hi = h[i, :] # (1, 10) yi = y[i, :] # (1, 10) d3i = hi - yi # (1, 10),实际值与预测值之间的误差,也就是代价 z2i = np.insert(z2i, 0, np.ones(1)) # make it (1, 26) to compute d2i d2i = np.multiply(t2.T @ d3i, sigmoid_gradient(z2i)) # (1, 26) # careful with np vector transpose delta2 += np.matrix(d3i).T @ np.matrix(a2i) # (1, 10).T @ (1, 26) -> (10, 26) delta1 += np.matrix(d2i[1:]).T @ np.matrix(a1i) # (1, 25).T @ (1, 401) -> (25, 401) delta1 = delta1 / m delta2 = delta2 / m return serialize(delta1, delta2)
def regularized_gradient(theta, X, y, l=1):
"""don't regularize theta of bias terms"""
m = X.shape[0]
delta1, delta2 = deserialize(gradient(theta, X, y))
t1, t2 = deserialize(theta)
t1[:, 0] = 0 # 这一步就是为了不把正则化作用到偏置上
reg_term_d1 = (l / m) * t1
delta1 = delta1 + reg_term_d1
t2[:, 0] = 0
reg_term_d2 = (l / m) * t2
delta2 = delta2 + reg_term_d2
return serialize(delta1, delta2)
1、回顾梯度检验的概念:在代价函数上沿着切线的方向选择离两个非常近的点然后计算两个点的平均值用以估计梯度,然后与反向传播得出的梯度进行比较
2、当
θ
\theta
θ是一个向量时,需要分别对其中的每一个分量进行检验,因此代码中使用expand_array方法对参数进行了维度的扩展,为的就是在梯度检验时,一次只检验一个分量。
梯度下降检验的思想如图2所示。

# 梯度检验 def expand_array(arr): """replicate array into matrix [1, 2, 3] [[1, 2, 3], [1, 2, 3], [1, 2, 3]] """ # turn matrix back to ndarray return np.array(np.matrix(np.ones(arr.shape[0])).T @ np.matrix(arr)) def gradient_checking(theta, X, y, epsilon, regularized=False): def a_numeric_grad(plus, minus, regularized=False): """calculate a partial gradient with respect to 1 theta""" if regularized: return (regularized_cost(plus, X, y) - regularized_cost(minus, X, y)) / (epsilon * 2) else: return (cost(plus, X, y) - cost(minus, X, y)) / (epsilon * 2) theta_matrix = expand_array(theta) # expand to (10285, 10285) epsilon_matrix = np.identity(len(theta)) * epsilon plus_matrix = theta_matrix + epsilon_matrix minus_matrix = theta_matrix - epsilon_matrix # calculate numerical gradient with respect to each theta # 得到每一个theta的梯度检验值 numeric_grad = np.array([a_numeric_grad(plus_matrix[i], minus_matrix[i], regularized) for i in range(len(theta))]) # analytical grad will depend on if you want it to be regularized or not # 得到经由梯度下降算法计算之后的偏导数 analytic_grad = regularized_gradient(theta, X, y) if regularized else gradient(theta, X, y) # If you have a correct implementation, and assuming you used EPSILON = 0.0001 # the diff below should be less than 1e-9 # this is how original matlab code do gradient checking diff = np.linalg.norm(numeric_grad - analytic_grad) / np.linalg.norm(numeric_grad + analytic_grad) print( 'If your backpropagation implementation is correct,\nthe relative difference will be smaller than 10e-9 (assume epsilon=0.0001).\nRelative Difference: {}\n'.format( diff)) # 开始进行梯度检验 gradient_checking(theta, X, y, epsilon=0.0001) # 这个运行很慢,谨慎运行
# 训练 # 定义参数初始化函数 def random_init(size): return np.random.uniform(-0.12, 0.12, size) def nn_training(X, y): """regularized version the architecture is hard coded here... won't generalize """ init_theta = random_init(10285) # 25*401 + 10*26 res = opt.minimize(fun=regularized_cost, x0=init_theta, args=(X, y, 1), method='TNC', jac=regularized_gradient, options={'maxiter': 400}) return res res = nn_training(X, y) # 慢 print('res:\n', res)
success的结果是false
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。