• Implementing neural network layers step by step with numpy


    1. First, prepare the data

    import numpy as np

    if __name__ == "__main__":
        # 10 samples with 2 features each; the last column is the binary label
        data = np.array([[2, 1, 0],
                         [2, 2, 0],
                         [5, 4, 1],
                         [4, 5, 1],
                         [2, 3, 0],
                         [3, 2, 0],
                         [6, 5, 1],
                         [4, 1, 0],
                         [6, 3, 1],
                         [7, 4, 1]])

        x = data[:, :-1]   # features, shape (10, 2)
        y = data[:, -1]    # labels, shape (10,)

        for epoch in range(1000):
            ...
    

    2. Implementing the Softmax + CrossEntropy layer

    Differentiating the softmax layer on its own is a bit cumbersome; it is more convenient to differentiate softmax + cross-entropy together.

    For an input vector $(x_1, x_2, x_3)$, the corresponding loss is:

    $$L = -\sum_{i=1}^{C} y_i \ln p_i = -(y_1 \ln p_1 + y_2 \ln p_2 + y_3 \ln p_3)$$

    where $y_i$ is the ground truth (a one-hot vector) and $p_i$ is the output probability.

    $$p_1=\frac{e^{x_1}}{e^{x_1}+e^{x_2}+e^{x_3}},\quad p_2=\frac{e^{x_2}}{e^{x_1}+e^{x_2}+e^{x_3}},\quad p_3=\frac{e^{x_3}}{e^{x_1}+e^{x_2}+e^{x_3}}$$
    Then the partial derivative is:

    $$\begin{aligned}
    \frac{\partial L}{\partial x_1} &= -y_1\frac{1}{p_1}\frac{\partial p_1}{\partial x_1} - y_2\frac{1}{p_2}\frac{\partial p_2}{\partial x_1} - y_3\frac{1}{p_3}\frac{\partial p_3}{\partial x_1} \\
    &= -y_1\frac{1}{p_1}\cdot\frac{e^{x_1}(e^{x_1}+e^{x_2}+e^{x_3})-e^{x_1}e^{x_1}}{(e^{x_1}+e^{x_2}+e^{x_3})^2} - y_2\frac{1}{p_2}\cdot\frac{-e^{x_2}e^{x_1}}{(e^{x_1}+e^{x_2}+e^{x_3})^2} - y_3\frac{1}{p_3}\cdot\frac{-e^{x_3}e^{x_1}}{(e^{x_1}+e^{x_2}+e^{x_3})^2} \\
    &= -y_1\frac{1}{p_1}(p_1 p_2 + p_1 p_3) - y_2\frac{1}{p_2}(-p_1 p_2) - y_3\frac{1}{p_3}(-p_1 p_3) \\
    &= -y_1(p_2+p_3) + y_2 p_1 + y_3 p_1 \\
    &= -y_1(1-p_1) + y_2 p_1 + y_3 p_1 \\
    &= y_1(p_1-1) + y_2 p_1 + y_3 p_1
    \end{aligned}$$

    Similarly:

    $$\frac{\partial L}{\partial x_2}=y_1 p_2+y_2(p_2-1)+y_3 p_2,\qquad \frac{\partial L}{\partial x_3}=y_1 p_3+y_2 p_3+y_3(p_3-1)$$

    When $y_1=1$, the gradient is $(p_1-1,\ p_2,\ p_3)$; when $y_2=1$, the gradient is $(p_1,\ p_2-1,\ p_3)$.

    For example, if the predicted probabilities are $(0.2, 0.3, 0.5)$ and the label is $(0, 0, 1)$, the gradient is $(0.2, 0.3, -0.5)$.
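
    As a quick sanity check, the closed-form gradient $p - y$ can be compared against a finite-difference estimate. A minimal sketch (the logits below are made-up values, purely for illustration):

    import numpy as np

    x = np.array([0.5, 0.9, 1.4])   # made-up logits
    y = np.array([0.0, 0.0, 1.0])   # one-hot label

    def loss_fn(x):
        p = np.exp(x - np.max(x))
        p /= p.sum()
        return -np.sum(y * np.log(p))

    p = np.exp(x - np.max(x))
    p /= p.sum()
    analytic = p - y                # the gradient derived above

    eps = 1e-6
    numeric = np.array([(loss_fn(x + eps * np.eye(3)[i]) - loss_fn(x - eps * np.eye(3)[i])) / (2 * eps)
                        for i in range(3)])
    print(analytic, numeric)        # the two should agree closely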

    The Python code is:

    Note that when computing the softmax you should use np.exp(x - np.max(x, axis=1, keepdims=True)) to prevent overflow in the exponentials.

    import numpy as np


    class Softmax:
        def __init__(self, n_classes):
            self.n_classes = n_classes

        def forward(self, x, y):
            # x: (b, c) logits, y: (b,) integer labels
            prob = np.exp(x - np.max(x, axis=1, keepdims=True))
            prob /= np.sum(prob, axis=1, keepdims=True)

            # pick out the probabilities at the positions where y == 1
            loss = -np.sum(np.log(prob[np.arange(len(y)), y])) / len(y)

            self.grad = prob.copy()
            self.grad[np.arange(len(y)), y] -= 1
            # the layers below use np.sum rather than np.mean, so averaging
            # over the batch once here is enough
            self.grad /= len(y)

            return prob, loss

        def backward(self):
            return self.grad
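
    A quick usage sketch on the toy data from step 1 (assuming x and y from step 1 are available; the random weight matrix is only a stand-in for a real linear layer):

    np.random.seed(0)
    logits = x @ np.random.rand(2, 2)   # (10, 2) made-up logits
    sm = Softmax(n_classes=2)
    prob, loss = sm.forward(logits, y)
    grad = sm.backward()                # (10, 2), equals (prob - one_hot(y)) / len(y)
    print(prob.shape, loss, grad.shape)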
    

    3. A standalone CrossEntropy layer

    The Python code is:

    import numpy as np


    class Entropy:
        def __init__(self, n_classes):
            self.n_classes = n_classes
            self.grad = None

        def forward(self, x, y):
            # x: (b, c) probabilities, y: (b,) integer labels
            b = y.shape[0]
            one_hot_y = np.zeros((b, self.n_classes))
            one_hot_y[np.arange(b), y] = 1
            # dL/dx = -y/x, averaged over the batch
            self.grad = -one_hot_y / x / b
            return np.sum(-one_hot_y * np.log(x)) / b

        def backward(self):
            return self.grad
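
    A minimal usage sketch, reusing the probabilities from the worked example in step 2 (the layer expects already-normalized probabilities and integer class labels):

    probs = np.array([[0.2, 0.3, 0.5]])
    labels = np.array([2])
    ce = Entropy(n_classes=3)
    loss = ce.forward(probs, labels)
    grad = ce.backward()                # -(1/p) at the true class, divided by the batch size
    print(loss, grad)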
    

    4. A standalone Softmax layer

    import numpy as np
    from einops import repeat, rearrange, einsum


    class Softmax:
        def __init__(self):
            self.output = None

        def forward(self, x):
            # x: (b, c); subtract the row-wise max for numerical stability
            x_exp = np.exp(x - np.max(x, axis=1, keepdims=True))
            self.output = x_exp / np.sum(x_exp, axis=1, keepdims=True)
            return self.output

        def backward(self, prev_grad):
            # Jacobian of softmax: dp_i/dx_j = p_i * (delta_ij - p_j), shape (b, c, c)
            b, c = self.output.shape
            o = repeat(self.output, 'b c -> b c r', r=c)      # o[b, i, j] = p_i
            I = repeat(np.eye(c), 'c1 c2 -> b c1 c2', b=b)    # identity per sample
            self.grad = o * (I - rearrange(o, 'b c1 c2 -> b c2 c1'))
            # chain rule: dL/dx_j = sum_i prev_grad_i * dp_i/dx_j
            return einsum(prev_grad, self.grad, 'b c1, b c1 c2 -> b c2')
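
    Chaining this standalone Softmax with the Entropy layer from the previous step should reproduce the combined layer's gradient $(p - y)/b$. A sketch, assuming both classes are defined as above (the logits are made-up values):

    logits = np.array([[1.0, 2.0, 0.5]])
    labels = np.array([1])

    softmax = Softmax()
    entropy = Entropy(n_classes=3)

    prob = softmax.forward(logits)
    loss = entropy.forward(prob, labels)
    grad = softmax.backward(entropy.backward())
    print(grad)                         # should equal prob - one_hot(labels) here, since b = 1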
    

    5. The Linear layer

    Note that $d_w$ is used to update $w$, but what gets passed back to the previous layer is $d_x$. The previous layer needs $dL/d_{out}$, and since this layer's input $x$ is exactly the previous layer's output, $dL/d_{out} = dL/dx$.

    import numpy as np
    from einops import einsum


    class Linear:
        def __init__(self, in_channels, out_channels, lr):
            self.lr = lr
            self.w = np.random.rand(in_channels, out_channels)
            self.b = np.random.rand(out_channels)

        def forward(self, x):
            self.x = x
            return x @ self.w + self.b

        def backward(self, prev_grad):
            # prev_grad: (b, out) = dL/d_out of this layer
            dx = einsum(prev_grad, self.w, 'b o, i o -> b i')   # dL/dx = prev_grad @ w^T
            dw = einsum(self.x, prev_grad, 'b i, b o -> i o')   # dL/dw = x^T @ prev_grad
            db = np.sum(prev_grad, axis=0)

            self.w -= self.lr * dw
            self.b -= self.lr * db

            # pass dx to the previous layer: it needs dL/d_out, and this layer's
            # input x is exactly the previous layer's output, so dL/d_out = dL/dx
            return dx
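
    A quick shape check for the layer (the sizes and the all-ones upstream gradient are illustrative only):

    np.random.seed(0)
    layer = Linear(in_channels=2, out_channels=3, lr=0.1)
    x_in = np.random.rand(4, 2)             # batch of 4 samples
    out = layer.forward(x_in)               # (4, 3)
    dx = layer.backward(np.ones_like(out))  # upstream gradient of ones, just to exercise the shapes
    print(out.shape, dx.shape)              # (4, 3) and (4, 2)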
    
    

    6. Complete training code

    from einops import einsum
    import numpy as np
    
    
    class Softmax:
        def __init__(self, train=True):
            self.grad = None
            self.train = train
    
        def forward(self, x, y):
            prob = np.exp(x-np.max(x, axis=1, keepdims=True))
            prob /= np.sum(prob, axis=1, keepdims=True)
    
            if self.train:
                loss = -np.sum(np.log(prob[range(len(y)), y]))/len(y)
    
                self.grad = prob.copy()
                self.grad[range(len(y)), y] -= 1
                self.grad /= len(y)
                return prob, loss
    
            else:
                return prob
    
        def backward(self):
            return self.grad
    
    
    class Linear:
        def __init__(self, in_channels, out_channels, lr):
            self.w = np.random.rand(in_channels, out_channels)
            self.b = np.random.rand(out_channels)
            self.lr = lr
    
        def forward(self, x):
            self.x = x
            output = einsum(x, self.w, 'b c1, c1 c2 -> b c2') + self.b
            return output
    
        def backward(self, prev_grad):
            # dw = x^T @ prev_grad updates the weights; dx = prev_grad @ w^T is returned
            # to the previous layer (its dL/d_out, see section 5)
            dw = einsum(self.x, prev_grad, 'b c1, b c2 -> c1 c2')
            dx = einsum(prev_grad, self.w, 'b c2, c1 c2 -> b c1')

            self.w -= self.lr * dw
            self.b -= self.lr * np.sum(prev_grad, axis=0)
            return dx
    
    
    class Network:
        def __init__(self, in_channels, out_channels, n_classes, lr):
            self.lr = lr
            self.linear = Linear(in_channels, out_channels, lr)
            self.softmax = Softmax()
    
        def forward(self, x, y=None):
            out = self.linear.forward(x)
            out = self.softmax.forward(out, y)
            return out
    
        def backward(self):
            grad = self.softmax.backward()
            grad = self.linear.backward(grad)
    
            return grad
    
    
    if __name__ == "__main__":
        data = np.array([[2, 1, 0],
                         [2, 2, 0],
                         [5, 4, 1],
                         [4, 5, 1],
                         [2, 3, 0],
                         [3, 2, 0],
                         [6, 5, 1],
                         [4, 1, 0],
                         [6, 3, 1],
                         [7, 4, 1]])
    
        # x = np.concatenate([np.array([[1]] * data.shape[0]), data[:, :2]], axis=1)
        x = data[:, :-1]
        y = data[:, -1:].flatten()
    
        net = Network(2, 2, 2, 0.1)
        # loss_fn = CrossEntropy(n_classes=2)
        for epoch in range(500):
            prob, loss = net.forward(x, y)
            # loss = loss_fn.forward(out, y)
            # grad_ = loss_fn.backward()
            grad = net.backward()
            print(loss)
    
        net.softmax.train = False
        print(net.forward(np.array([[0, 0], [0, 4], [8, 6], [10, 10]])), y)
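
    To turn the predicted probabilities into class labels, take the argmax over the class axis, e.g. continuing the main block above:

        prob = net.forward(np.array([[0, 0], [0, 4], [8, 6], [10, 10]]))
        pred = np.argmax(prob, axis=1)  # predicted class index per sample
        print(pred)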
    
  • Original article: https://blog.csdn.net/u012897374/article/details/139655924