if __name__ == "__main__":
    data = np.array([[2, 1, 0],
                     [2, 2, 0],
                     [5, 4, 1],
                     [4, 5, 1],
                     [2, 3, 0],
                     [3, 2, 0],
                     [6, 5, 1],
                     [4, 1, 0],
                     [6, 3, 1],
                     [7, 4, 1]])
    x = data[:, :-1]
    y = data[:, -1]
    for epoch in range(1000):
        ...
Differentiating the softmax layer on its own is somewhat cumbersome; it is more convenient to differentiate softmax and cross-entropy together.
For an input vector $(x_1, x_2, x_3)$, the corresponding loss is:
$$L = -\sum_{i=1}^{C} y_i \ln p_i = -(y_1 \ln p_1 + y_2 \ln p_2 + y_3 \ln p_3)$$
where $y_i$ is the ground truth (a one-hot vector) and $p_i$ is the output probability.
$$p_1=\frac{e^{x_1}}{e^{x_1}+e^{x_2}+e^{x_3}},\qquad p_2=\frac{e^{x_2}}{e^{x_1}+e^{x_2}+e^{x_3}},\qquad p_3=\frac{e^{x_3}}{e^{x_1}+e^{x_2}+e^{x_3}}$$
The partial derivative with respect to $x_1$ is then:
$$\begin{aligned}
\frac{\partial L}{\partial x_1} &= -y_1\frac{1}{p_1}\frac{\partial p_1}{\partial x_1} - y_2\frac{1}{p_2}\frac{\partial p_2}{\partial x_1} - y_3\frac{1}{p_3}\frac{\partial p_3}{\partial x_1}\\
&= -y_1\frac{1}{p_1}\cdot\frac{e^{x_1}(e^{x_1}+e^{x_2}+e^{x_3})-e^{x_1}e^{x_1}}{(e^{x_1}+e^{x_2}+e^{x_3})^2} - y_2\frac{1}{p_2}\cdot\frac{-e^{x_2}e^{x_1}}{(e^{x_1}+e^{x_2}+e^{x_3})^2} - y_3\frac{1}{p_3}\cdot\frac{-e^{x_3}e^{x_1}}{(e^{x_1}+e^{x_2}+e^{x_3})^2}\\
&= -y_1\frac{1}{p_1}(p_1 p_2 + p_1 p_3) - y_2\frac{1}{p_2}(-p_1 p_2) - y_3\frac{1}{p_3}(-p_1 p_3)\\
&= -y_1(p_2 + p_3) + y_2 p_1 + y_3 p_1\\
&= -y_1(1 - p_1) + y_2 p_1 + y_3 p_1\\
&= y_1(p_1 - 1) + y_2 p_1 + y_3 p_1
\end{aligned}$$
Similarly:
$$\frac{\partial L}{\partial x_2}=y_1 p_2+y_2(p_2-1)+y_3 p_2,\qquad \frac{\partial L}{\partial x_3}=y_1 p_3+y_2 p_3+y_3(p_3-1)$$
When $y_1=1$, the gradient is $(p_1-1, p_2, p_3)$; when $y_2=1$, the gradient is $(p_1, p_2-1, p_3)$.
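In other words, since $y$ is one-hot ($y_1+y_2+y_3=1$), the whole gradient collapses into a single vector form, which is what the code below implements:

$$\frac{\partial L}{\partial x} = p - y$$

For example, $\frac{\partial L}{\partial x_1} = y_1(p_1-1) + y_2 p_1 + y_3 p_1 = p_1(y_1+y_2+y_3) - y_1 = p_1 - y_1$.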
For example, if the predicted probabilities are $(0.2, 0.3, 0.5)$ and the label is $(0, 0, 1)$, the gradient is $(0.2, 0.3, -0.5)$.
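This example can be double-checked numerically with a finite-difference sketch (a standalone snippet; the names softmax_ce, num_grad, etc. are only for illustration):

import numpy as np

def softmax_ce(x, y):
    # fused softmax + cross-entropy loss for a single sample
    p = np.exp(x - np.max(x))
    p /= p.sum()
    return -np.sum(y * np.log(p))

x = np.log(np.array([0.2, 0.3, 0.5]))   # logits whose softmax is exactly (0.2, 0.3, 0.5)
y = np.array([0.0, 0.0, 1.0])

# numerical gradient via central differences
eps = 1e-6
num_grad = np.array([
    (softmax_ce(x + eps * np.eye(3)[i], y) - softmax_ce(x - eps * np.eye(3)[i], y)) / (2 * eps)
    for i in range(3)
])
print(num_grad)   # approximately [0.2, 0.3, -0.5], matching p - y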
The Python code is:
Note that when computing the softmax, use np.exp(x - np.max(x, axis=1, keepdims=True)) to prevent overflow in the exponential.
class Softmax:
    def __init__(self, n_classes):
        self.n_classes = n_classes

    def forward(self, x, y):
        # subtract the row-wise max for numerical stability
        prob = np.exp(x - np.max(x, axis=1, keepdims=True))
        prob /= np.sum(prob, axis=1, keepdims=True)
        # pick out the probabilities at the positions where y == 1
        loss = -np.sum(np.log(prob[np.arange(len(y)), y])) / len(y)
        self.grad = prob.copy()
        self.grad[np.arange(len(y)), y] -= 1
        """
        The later layers accumulate gradients with np.sum rather than np.mean,
        so averaging over the batch once here is enough.
        """
        self.grad /= len(y)
        return prob, loss

    def backward(self):
        return self.grad
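As a quick illustration of the max-subtraction trick mentioned above (a standalone sketch with made-up logits):

import numpy as np

logits = np.array([[1000.0, 1001.0, 1002.0]])    # large logits overflow a naive softmax

naive = np.exp(logits)                           # overflows to inf, so inf / inf -> nan
print(naive / naive.sum(axis=1, keepdims=True))  # [[nan nan nan]]

shifted = np.exp(logits - np.max(logits, axis=1, keepdims=True))
print(shifted / shifted.sum(axis=1, keepdims=True))  # [[0.09003057 0.24472847 0.66524096]]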
Alternatively, softmax and cross-entropy can be implemented as separate layers. The Python code for the cross-entropy layer is:
class Entropy:
    def __init__(self, n_classes):
        self.n_classes = n_classes
        self.grad = None

    def forward(self, x, y):
        # x: (b, c) probabilities, y: (b) integer labels
        b = y.shape[0]
        one_hot_y = np.zeros((b, self.n_classes))
        one_hot_y[range(len(y)), y] = 1
        # dL/dp = -y / p, averaged over the batch
        self.grad = -one_hot_y / x / b
        return np.mean(np.sum(-one_hot_y * np.log(x), axis=1))

    def backward(self):
        return self.grad
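A small usage sketch for this layer (assuming the Entropy class above and numpy imported as np; the probabilities and labels are made up):

probs = np.array([[0.2, 0.3, 0.5],
                  [0.7, 0.2, 0.1]])
labels = np.array([2, 0])

ce = Entropy(n_classes=3)
loss = ce.forward(probs, labels)   # mean of -log(0.5) and -log(0.7), about 0.525
grad = ce.backward()               # -1/p at the label positions, divided by the batch size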
from einops import repeat, rearrange, einsum

class Softmax:
    def __init__(self):
        self.output = None

    def forward(self, x):
        # x: (b, c)
        x_exp = np.exp(x - np.max(x, axis=1, keepdims=True))
        self.output = x_exp / np.sum(x_exp, axis=1, keepdims=True)
        return self.output

    def backward(self, prev_grad):
        b, c = self.output.shape
        # Jacobian of softmax: dp_i/dx_j = p_i * (delta_ij - p_j), shape (b, c, c)
        o = repeat(self.output, 'b c -> b c r', r=c)
        I = repeat(np.eye(c), 'c1 c2 -> b c1 c2', b=b)
        self.grad = o * (I - rearrange(o, 'b c1 c2 -> b c2 c1'))
        # chain rule: dL/dx_j = sum_i dL/dp_i * dp_i/dx_j
        return einsum(self.grad, prev_grad, 'b c1 c2, b c1 -> b c2')
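A quick consistency check (a sketch, assuming the Entropy class and this Softmax class are both defined and numpy is imported as np): chaining the cross-entropy gradient through the full softmax Jacobian should reproduce the fused (prob - one_hot) / batch gradient derived above.

np.random.seed(0)
logits = np.random.randn(4, 3)
labels = np.array([0, 2, 1, 1])

sm = Softmax()
ce = Entropy(n_classes=3)

probs = sm.forward(logits)
loss = ce.forward(probs, labels)
grad_via_jacobian = sm.backward(ce.backward())

one_hot = np.zeros_like(probs)
one_hot[range(4), labels] = 1
grad_fused = (probs - one_hot) / 4

print(np.allclose(grad_via_jacobian, grad_fused))  # True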
Note that $w$ is updated using $d_w$, but what gets passed back to the previous layer is $d_x$: the previous layer needs $dL/d_{out}$, and since this layer's input $x$ is exactly the previous layer's output, $dL/d_{out} = dL/dx$.
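For reference, with $out = xW + b$ (where $x$ has shape $(b, c_{in})$ and $W$ has shape $(c_{in}, c_{out})$), the chain rule gives

$$\frac{\partial L}{\partial W} = x^\top \frac{\partial L}{\partial out},\qquad \frac{\partial L}{\partial x} = \frac{\partial L}{\partial out}\, W^\top,\qquad \frac{\partial L}{\partial b} = \sum_{\text{batch}} \frac{\partial L}{\partial out}$$

which is exactly what the Linear layer below computes.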
class Linear:
    def __init__(self, in_channels, out_channels, lr):
        self.lr = lr
        self.w = np.random.rand(in_channels, out_channels)
        self.b = np.random.rand(out_channels)

    def forward(self, x):
        self.x = x
        return x @ self.w + self.b

    def backward(self, prev_grad):
        # prev_grad: (b, c_out)
        dx = einsum(prev_grad, self.w, 'b c2, c1 c2 -> b c1')   # dL/dx = prev_grad @ w.T
        dw = einsum(self.x, prev_grad, 'b c1, b c2 -> c1 c2')   # dL/dw = x.T @ prev_grad
        db = np.sum(prev_grad, axis=0)
        self.w -= self.lr * dw
        self.b -= self.lr * db
        """
        Pass dx back to the previous layer: that layer needs dL/d_out,
        and this layer's input x is exactly the previous layer's output,
        so dL/d_out = dL/dx.
        """
        return dx
from einops import *
import numpy as np

class Softmax:
    def __init__(self, train=True):
        self.grad = None
        self.train = train

    def forward(self, x, y):
        prob = np.exp(x - np.max(x, axis=1, keepdims=True))
        prob /= np.sum(prob, axis=1, keepdims=True)
        if self.train:
            loss = -np.sum(np.log(prob[range(len(y)), y])) / len(y)
            self.grad = prob.copy()
            self.grad[range(len(y)), y] -= 1
            self.grad /= len(y)
            return prob, loss
        else:
            return prob

    def backward(self):
        return self.grad
class Linear:
    def __init__(self, in_channels, out_channels, lr):
        self.w = np.random.rand(in_channels, out_channels)
        self.b = np.random.rand(out_channels)
        self.lr = lr

    def forward(self, x):
        self.x = x
        output = einsum(x, self.w, 'b c1, c1 c2 -> b c2') + self.b
        return output

    def backward(self, prev_grad):
        # compute dL/dx before w is updated, then pass it back to the previous layer
        dx = einsum(prev_grad, self.w, 'b c2, c1 c2 -> b c1')
        dw = einsum(self.x, prev_grad, 'b c1, b c2 -> c1 c2')
        self.w -= self.lr * dw
        self.b -= self.lr * np.sum(prev_grad, axis=0)
        return dx
class Network:
    def __init__(self, in_channels, out_channels, n_classes, lr):
        self.lr = lr
        self.linear = Linear(in_channels, out_channels, lr)
        self.softmax = Softmax()

    def forward(self, x, y=None):
        out = self.linear.forward(x)
        out = self.softmax.forward(out, y)
        return out

    def backward(self):
        grad = self.softmax.backward()
        grad = self.linear.backward(grad)
        return grad
if __name__ == "__main__":
    data = np.array([[2, 1, 0],
                     [2, 2, 0],
                     [5, 4, 1],
                     [4, 5, 1],
                     [2, 3, 0],
                     [3, 2, 0],
                     [6, 5, 1],
                     [4, 1, 0],
                     [6, 3, 1],
                     [7, 4, 1]])
    # x = np.concatenate([np.array([[1]] * data.shape[0]), data[:, :2]], axis=1)
    x = data[:, :-1]
    y = data[:, -1:].flatten()
    net = Network(2, 2, 2, 0.1)
    # loss_fn = CrossEntropy(n_classes=2)
    for epoch in range(500):
        prob, loss = net.forward(x, y)
        # loss = loss_fn.forward(out, y)
        # grad_ = loss_fn.backward()
        grad = net.backward()
        print(loss)
    net.softmax.train = False
    print(net.forward(np.array([[0, 0], [0, 4], [8, 6], [10, 10]])), y)
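A small follow-up sketch (assuming net, x, and y from the main block above): once the softmax is in eval mode, hard class predictions can be read off with argmax, e.g. to check accuracy on the training points.

preds = np.argmax(net.forward(x), axis=1)      # eval mode returns probabilities only
print("train accuracy:", np.mean(preds == y))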