if __name__ == "__main__":
    data = np.array([[2, 1, 0],
                     [2, 2, 0],
                     [5, 4, 1],
                     [4, 5, 1],
                     [2, 3, 0],
                     [3, 2, 0],
                     [6, 5, 1],
                     [4, 1, 0],
                     [6, 3, 1],
                     [7, 4, 1]])
    x = data[:, :-1]
    y = data[:, -1]
    for epoch in range(1000):
        ...
Differentiating the softmax layer on its own is somewhat cumbersome; it is more convenient to differentiate softmax and cross-entropy together.
For an input vector $(x_1, x_2, x_3)$, the corresponding loss is:
$$L = -\sum_{i=1}^{C} y_i \ln p_i = -(y_1 \ln p_1 + y_2 \ln p_2 + y_3 \ln p_3)$$
where $y_i$ is the ground truth (a one-hot vector) and $p_i$ is the output probability.
$$p_1=\frac{e^{x_1}}{e^{x_1}+e^{x_2}+e^{x_3}},\qquad p_2=\frac{e^{x_2}}{e^{x_1}+e^{x_2}+e^{x_3}},\qquad p_3=\frac{e^{x_3}}{e^{x_1}+e^{x_2}+e^{x_3}}$$
The partial derivative with respect to $x_1$ is then:
$$\begin{aligned}
\frac{\partial L}{\partial x_1} &= -y_1\frac{1}{p_1}\frac{\partial p_1}{\partial x_1} - y_2\frac{1}{p_2}\frac{\partial p_2}{\partial x_1} - y_3\frac{1}{p_3}\frac{\partial p_3}{\partial x_1}\\
&= -y_1\frac{1}{p_1}\cdot\frac{e^{x_1}(e^{x_1}+e^{x_2}+e^{x_3})-e^{x_1}e^{x_1}}{(e^{x_1}+e^{x_2}+e^{x_3})^2} - y_2\frac{1}{p_2}\cdot\frac{-e^{x_2}e^{x_1}}{(e^{x_1}+e^{x_2}+e^{x_3})^2} - y_3\frac{1}{p_3}\cdot\frac{-e^{x_3}e^{x_1}}{(e^{x_1}+e^{x_2}+e^{x_3})^2}\\
&= -y_1\frac{1}{p_1}(p_1 p_2 + p_1 p_3) - y_2\frac{1}{p_2}(-p_1 p_2) - y_3\frac{1}{p_3}(-p_1 p_3)\\
&= -y_1(p_2 + p_3) + y_2 p_1 + y_3 p_1\\
&= -y_1(1 - p_1) + y_2 p_1 + y_3 p_1\\
&= y_1(p_1 - 1) + y_2 p_1 + y_3 p_1
\end{aligned}$$
Similarly:
$$\frac{\partial L}{\partial x_2}=y_1 p_2+y_2(p_2-1)+y_3 p_2,\qquad \frac{\partial L}{\partial x_3}=y_1 p_3+y_2 p_3+y_3(p_3-1)$$
When $y_1=1$, the gradient is $(p_1-1, p_2, p_3)$; when $y_2=1$, the gradient is $(p_1, p_2-1, p_3)$.
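In other words, since $y$ is one-hot ($y_1+y_2+y_3=1$), the whole gradient collapses into a single vector form, which is what the code below implements:

$$\frac{\partial L}{\partial x} = p - y$$

For example, $\frac{\partial L}{\partial x_1} = y_1(p_1-1) + y_2 p_1 + y_3 p_1 = p_1(y_1+y_2+y_3) - y_1 = p_1 - y_1$.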
For example, if the predicted probabilities are $(0.2, 0.3, 0.5)$ and the label is $(0, 0, 1)$, the gradient is $(0.2, 0.3, -0.5)$.
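This example can be double-checked numerically with a finite-difference sketch (a standalone snippet; the names softmax_ce, num_grad, etc. are only for illustration):

import numpy as np

def softmax_ce(x, y):
    # fused softmax + cross-entropy loss for a single sample
    p = np.exp(x - np.max(x))
    p /= p.sum()
    return -np.sum(y * np.log(p))

x = np.log(np.array([0.2, 0.3, 0.5]))   # logits whose softmax is exactly (0.2, 0.3, 0.5)
y = np.array([0.0, 0.0, 1.0])

# numerical gradient via central differences
eps = 1e-6
num_grad = np.array([
    (softmax_ce(x + eps * np.eye(3)[i], y) - softmax_ce(x - eps * np.eye(3)[i], y)) / (2 * eps)
    for i in range(3)
])
print(num_grad)   # approximately [0.2, 0.3, -0.5], matching p - y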
The Python code is:
Note that when computing the softmax, use np.exp(x - np.max(x, axis=1, keepdims=True)) to prevent overflow in the exponential.
class Softmax:
    def __init__(self, n_classes):
        self.n_classes = n_classes

    def forward(self, x, y):
        # subtract the row-wise max for numerical stability
        prob = np.exp(x - np.max(x, axis=1, keepdims=True))
        prob /= np.sum(prob, axis=1, keepdims=True)
        # pick out the probabilities at the positions where y == 1
        loss = -np.sum(np.log(prob[np.arange(len(y)), y])) / len(y)
        self.grad = prob.copy()
        self.grad[np.arange(len(y)), y] -= 1
        """
        The later layers accumulate gradients with np.sum rather than np.mean,
        so averaging over the batch once here is enough.
        """
        self.grad /= len(y)
        return prob, loss

    def backward(self):
        return self.grad
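As a quick illustration of the max-subtraction trick mentioned above (a standalone sketch with made-up logits):

import numpy as np

logits = np.array([[1000.0, 1001.0, 1002.0]])    # large logits overflow a naive softmax

naive = np.exp(logits)                           # overflows to inf, so inf / inf -> nan
print(naive / naive.sum(axis=1, keepdims=True))  # [[nan nan nan]]

shifted = np.exp(logits - np.max(logits, axis=1, keepdims=True))
print(shifted / shifted.sum(axis=1, keepdims=True))  # [[0.09003057 0.24472847 0.66524096]]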
Alternatively, softmax and cross-entropy can be implemented as separate layers. The Python code for the cross-entropy layer is:
class Entropy:
    def __init__(self, n_classes):
        self.n_classes = n_classes
        self.grad = None

    def forward(self, x, y):
        # x: (b, c) probabilities, y: (b) integer labels
        b = y.shape[0]
        one_hot_y = np.zeros((b, self.n_classes))
        one_hot_y[range(len(y)), y] = 1
        # dL/dp = -y / p, averaged over the batch
        self.grad = -one_hot_y / x / b
        return np.mean(np.sum(-one_hot_y * np.log(x), axis=1))

    def backward(self):
        return self.grad
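A small usage sketch for this layer (assuming the Entropy class above and numpy imported as np; the probabilities and labels are made up):

probs = np.array([[0.2, 0.3, 0.5],
                  [0.7, 0.2, 0.1]])
labels = np.array([2, 0])

ce = Entropy(n_classes=3)
loss = ce.forward(probs, labels)   # mean of -log(0.5) and -log(0.7), about 0.525
grad = ce.backward()               # -1/p at the label positions, divided by the batch size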
from einops import repeat, rearrange, einsum

class Softmax:
    def __init__(self):
        self.output = None

    def forward(self, x):
        # x: (b, c)
        x_exp = np.exp(x - np.max(x, axis=1, keepdims=True))
        self.output = x_exp / np.sum(x_exp, axis=1, keepdims=True)
        return self.output

    def backward(self, prev_grad):
        b, c = self.output.shape
        # Jacobian of softmax: dp_i/dx_j = p_i * (delta_ij - p_j), shape (b, c, c)
        o = repeat(self.output, 'b c -> b c r', r=c)
        I = repeat(np.eye(c), 'c1 c2 -> b c1 c2', b=b)
        self.grad = o * (I - rearrange(o, 'b c1 c2 -> b c2 c1'))
        # chain rule: dL/dx_j = sum_i dL/dp_i * dp_i/dx_j
        return einsum(self.grad, prev_grad, 'b c1 c2, b c1 -> b c2')
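A quick consistency check (a sketch, assuming the Entropy class and this Softmax class are both defined and numpy is imported as np): chaining the cross-entropy gradient through the full softmax Jacobian should reproduce the fused (prob - one_hot) / batch gradient derived above.

np.random.seed(0)
logits = np.random.randn(4, 3)
labels = np.array([0, 2, 1, 1])

sm = Softmax()
ce = Entropy(n_classes=3)

probs = sm.forward(logits)
loss = ce.forward(probs, labels)
grad_via_jacobian = sm.backward(ce.backward())

one_hot = np.zeros_like(probs)
one_hot[range(4), labels] = 1
grad_fused = (probs - one_hot) / 4

print(np.allclose(grad_via_jacobian, grad_fused))  # True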
Note that $w$ is updated using $d_w$, but what gets passed back to the previous layer is $d_x$: the previous layer needs $dL/d_{out}$, and since this layer's input $x$ is exactly the previous layer's output, $dL/d_{out} = dL/dx$.
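For reference, with $out = xW + b$ (where $x$ has shape $(b, c_{in})$ and $W$ has shape $(c_{in}, c_{out})$), the chain rule gives

$$\frac{\partial L}{\partial W} = x^\top \frac{\partial L}{\partial out},\qquad \frac{\partial L}{\partial x} = \frac{\partial L}{\partial out}\, W^\top,\qquad \frac{\partial L}{\partial b} = \sum_{\text{batch}} \frac{\partial L}{\partial out}$$

which is exactly what the Linear layer below computes.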
class Linear:
    def __init__(self, in_channels, out_channels, lr):
        self.lr = lr
        self.w = np.random.rand(in_channels, out_channels)
        self.b = np.random.rand(out_channels)

    def forward(self, x):
        self.x = x
        return x @ self.w + self.b

    def backward(self, prev_grad):
        # prev_grad: (b, c_out)
        dx = einsum(prev_grad, self.w, 'b c2, c1 c2 -> b c1')   # dL/dx = prev_grad @ w.T
        dw = einsum(self.x, prev_grad, 'b c1, b c2 -> c1 c2')   # dL/dw = x.T @ prev_grad
        db = np.sum(prev_grad, axis=0)
        self.w -= self.lr * dw
        self.b -= self.lr * db
        """
        Pass dx back to the previous layer: that layer needs dL/d_out,
        and this layer's input x is exactly the previous layer's output,
        so dL/d_out = dL/dx.
        """
        return dx
from einops import *
import numpy as np

class Softmax:
    def __init__(self, train=True):
        self.grad = None
        self.train = train

    def forward(self, x, y):
        prob = np.exp(x - np.max(x, axis=1, keepdims=True))
        prob /= np.sum(prob, axis=1, keepdims=True)
        if self.train:
            loss = -np.sum(np.log(prob[range(len(y)), y])) / len(y)
            self.grad = prob.copy()
            self.grad[range(len(y)), y] -= 1
            self.grad /= len(y)
            return prob, loss
        else:
            return prob

    def backward(self):
        return self.grad
class Linear:
    def __init__(self, in_channels, out_channels, lr):
        self.w = np.random.rand(in_channels, out_channels)
        self.b = np.random.rand(out_channels)
        self.lr = lr

    def forward(self, x):
        self.x = x
        output = einsum(x, self.w, 'b c1, c1 c2 -> b c2') + self.b
        return output

    def backward(self, prev_grad):
        # compute dL/dx before w is updated, then pass it back to the previous layer
        dx = einsum(prev_grad, self.w, 'b c2, c1 c2 -> b c1')
        dw = einsum(self.x, prev_grad, 'b c1, b c2 -> c1 c2')
        self.w -= self.lr * dw
        self.b -= self.lr * np.sum(prev_grad, axis=0)
        return dx
class Network:
    def __init__(self, in_channels, out_channels, n_classes, lr):
        self.lr = lr
        self.linear = Linear(in_channels, out_channels, lr)
        self.softmax = Softmax()

    def forward(self, x, y=None):
        out = self.linear.forward(x)
        out = self.softmax.forward(out, y)
        return out

    def backward(self):
        grad = self.softmax.backward()
        grad = self.linear.backward(grad)
        return grad
if __name__ == "__main__":
    data = np.array([[2, 1, 0],
                     [2, 2, 0],
                     [5, 4, 1],
                     [4, 5, 1],
                     [2, 3, 0],
                     [3, 2, 0],
                     [6, 5, 1],
                     [4, 1, 0],
                     [6, 3, 1],
                     [7, 4, 1]])
    # x = np.concatenate([np.array([[1]] * data.shape[0]), data[:, :2]], axis=1)
    x = data[:, :-1]
    y = data[:, -1:].flatten()
    net = Network(2, 2, 2, 0.1)
    # loss_fn = CrossEntropy(n_classes=2)
    for epoch in range(500):
        prob, loss = net.forward(x, y)
        # loss = loss_fn.forward(out, y)
        # grad_ = loss_fn.backward()
        grad = net.backward()
        print(loss)
    net.softmax.train = False
    print(net.forward(np.array([[0, 0], [0, 4], [8, 6], [10, 10]])), y)
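A small follow-up sketch (assuming net, x, and y from the main block above): once the softmax is in eval mode, hard class predictions can be read off with argmax, e.g. to check accuracy on the training points.

preds = np.argmax(net.forward(x), axis=1)      # eval mode returns probabilities only
print("train accuracy:", np.mean(preds == y))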