目录
批量梯度下降(Batch Gradient Descent)
随机梯度下降(Stochastic Gradient Descent)
小批量随机梯度下降(Mini-Batch Stochastic Gradient Descent)
动量法
adagrad
rmsprop
adadelta
adam
假设拟合
:第i个样本的值
m :样本数量
:学习率
批量梯度下降(Batch Gradient Descent)
每次迭代用所有的样本
损失函数为
梯度
参数更新
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
"""Batch gradient descent on a synthetic linear-regression problem.

Every parameter update uses the gradient averaged over the whole data set.
"""
import numpy as np

# Model to recover: y = 2*x1 - 3.4*x2 + 4.2
true_w = [2, -3.4]
true_b = 4.2
# 1000 samples
nums_input = 1000
# NOTE: despite its name, this is the number of features per sample
# (one column per entry of true_w).
nums_example = len(true_w)

features = np.random.normal(0, 1, [nums_input, nums_example])
target = np.sum(features * true_w, 1) + true_b
# Add Gaussian noise (standard deviation 0.02) to the labels
target = target + np.random.normal(0, 0.02, target.shape)
target = target.squeeze()

# Number of training epochs
nums_epoch = 100
w = np.zeros(nums_example)
b = 0.0
# Learning rate
lr = 0.1

for epoch in range(nums_epoch):
    # Residuals of the current model on all samples, shape (nums_input,)
    error = features @ w + b - target

    # Update parameters.  The gradient is computed with NumPy reductions
    # instead of Python's builtin sum() and a per-feature index loop, which
    # iterate over the arrays element by element in the interpreter.
    w = w - lr * (features.T @ error) / nums_input
    b = b - lr * np.mean(error)

    print(epoch, [w, b], [true_w, true_b])


随机梯度下降(Stochastic Gradient Descent)
每次迭代用一个样本
损失函数为
梯度
参数更新
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import numpy as np
import random

# Model to recover: y = 2*x1 - 3.4*x2 + 4.2
true_w = [2, -3.4]
true_b = 4.2
# 1000 samples; nums_example holds len(true_w), i.e. one column per weight
nums_input = 1000
nums_example = len(true_w)

# Standard-normal inputs, then labels from the true model
features = np.random.normal(0, 1, (nums_input, nums_example))
target = (features * true_w).sum(axis=1) + true_b
# Add Gaussian noise (standard deviation 0.02) and drop singleton axes
target = np.squeeze(target + np.random.normal(0, 0.02, target.shape))

# Number of training epochs
nums_epoch = 100
# Parameters start at zero
w = [0, 0]
b = 0
# Learning rate
lr = 0.1


def data_iter(features, target):
    """Yield ``(feature_row, label)`` pairs one at a time in random order.

    A fresh permutation of the sample indices is drawn with
    ``random.shuffle`` on every call, and each sample is yielded exactly
    once.  Nothing is yielded when ``features`` is empty.
    """
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)
    # Walk the shuffled permutation itself.  Looping over range(num_examples)
    # and indexing with the loop counter ignored the shuffle, so the samples
    # were always visited in their stored order.
    for i in indices:
        yield features[i], target[i]


for epoch in range(nums_epoch):
    for X, y in data_iter(features, target):
        # Prediction for this one sample: start from b and add w[k] * X[k]
        # term by term, left to right.
        y_hat = sum((w_k * x_k for w_k, x_k in zip(w, X)), b)
        residual = y_hat - y
        # Single-sample gradient step on every weight, then on the bias
        for k, w_k in enumerate(w):
            w[k] = w_k - lr * residual * X[k]
        b = b - lr * residual
    print(epoch, [w, b], [true_w, true_b])


小批量随机梯度下降(Mini-Batch Stochastic Gradient Descent)
每次迭代用一个批次的样本
损失函数为
B是批次
是批次大小
梯度
参数更新
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import numpy as np
import random
from IPython import display
from matplotlib import pyplot as plt

# Figure output format and size
display.set_matplotlib_formats('svg')
plt.rcParams['figure.figsize'] = (3.5, 2.5)

# Load the tab-separated data file and standardise every column
# (subtract the column mean, divide by the column standard deviation).
data = np.genfromtxt(r'E:\d2l-zh\data\airfoil_self_noise.dat', delimiter='\t')
data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
# First 1500 rows: every column but the last is an input, the last column
# is the label, kept as a column vector.
features = data[:1500, :-1]
target = data[:1500, -1:]

# Small random weights (one row per feature) and a zero bias
w = np.random.normal(0, 0.01, (features.shape[1], 1))
b = 0

nums_epoch = 50
lr = 0.01
batch_size = 10

# Loss of the untrained model on all samples is the first point of the curve
y_hat = features @ w + b
loss = np.mean(0.5 * np.square(y_hat - target))
ls_loss = [loss]


def data_iter(batch_size, features, target):
    """Yield shuffled mini-batches as ``(features, target)`` pairs.

    The sample indices are shuffled once per call with ``random.shuffle``
    and consumed ``batch_size`` at a time, so every sample lands in exactly
    one batch.  The last batch is shorter when the sample count is not a
    multiple of ``batch_size``.
    """
    total = len(features)
    order = list(range(total))
    random.shuffle(order)
    for start in range(0, total, batch_size):
        # A slice stops at the end of the list, so the final batch needs
        # no explicit clamp.
        picked = order[start:start + batch_size]
        yield features[picked], target[picked]


for epoch in range(nums_epoch):
    for batch_i, (X, y) in enumerate(data_iter(batch_size, features, target)):
        y_hat = np.matmul(X, w) + b
        # Residuals as a column vector, one row per sample in the batch
        error = y_hat - y
        loss = np.mean(0.5 * error ** 2)

        # Update parameters.  error is (n, 1) and X is (n, d), so error * X
        # scales each sample's row by its own residual; the mean over axis 0
        # is the per-feature gradient.  Multiplying the (n, 1) residual by
        # the 1-D slice X[:, i] instead broadcasts to an (n, n) matrix whose
        # mean is mean(error) * mean(X[:, i]), which is not the gradient.
        grad_w = np.mean(error * X, axis=0)[:, np.newaxis]
        w = w - lr * grad_w
        b = b - lr * np.mean(error)

        # Record the loss on all samples every 100 processed samples
        if (batch_i + 1) * batch_size % 100 == 0:
            ls_loss.append(np.mean(0.5 * (np.matmul(features, w) + b - target) ** 2))
    print(ls_loss[-1])

# Loss curve, with the recorded points spread evenly over the epochs
plt.plot(np.linspace(0, nums_epoch, len(ls_loss)), ls_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()


动量法
使用了指数加权平均(EMA)
:EMA参数
:动量
对过去 的动量做移动平均
如果,就是普通的小批量梯度下降
损失函数为
梯度
更新动量
更新参数
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import numpy as np
import random
from IPython import display
from matplotlib import pyplot as plt

# Figure output format and size
display.set_matplotlib_formats('svg')
plt.rcParams['figure.figsize'] = (3.5, 2.5)

# Load the tab-separated data file and standardise every column
data = np.genfromtxt(r'E:\d2l-zh\data\airfoil_self_noise.dat', delimiter='\t')
data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
# First 1500 rows: inputs are all columns but the last, the label is the
# last column kept as a column vector.
features = data[:1500, :-1]
target = data[:1500, -1:]

# Small random weights (one row per feature) and a zero bias
w = np.random.normal(0, 0.01, (features.shape[1], 1))
b = 0
# Momentum (velocity) buffers, zero-initialised, one per parameter
V_w = np.zeros_like(w)
V_b = np.zeros(1)

nums_epoch = 2
lr = 0.02
# Momentum coefficient
momentum = 0.5
batch_size = 10

# Loss of the untrained model on all samples is the first point of the curve
y_hat = features @ w + b
loss = np.mean(0.5 * np.square(y_hat - target))
ls_loss = [loss]


def data_iter(batch_size, features, target):
    """Yield shuffled mini-batches as ``(features, target)`` pairs.

    The sample indices are shuffled once per call with ``random.shuffle``
    and consumed ``batch_size`` at a time, so every sample lands in exactly
    one batch.  The last batch is shorter when the sample count is not a
    multiple of ``batch_size``.
    """
    total = len(features)
    order = list(range(total))
    random.shuffle(order)
    for start in range(0, total, batch_size):
        # A slice stops at the end of the list, so the final batch needs
        # no explicit clamp.
        picked = order[start:start + batch_size]
        yield features[picked], target[picked]


for epoch in range(nums_epoch):
    for step, (X, y) in enumerate(data_iter(batch_size, features, target), start=1):
        y_hat = np.matmul(X, w) + b
        residual = y_hat - y
        loss = np.mean(0.5 * residual ** 2)

        # Update parameters: the velocity is the decayed previous velocity
        # plus the learning-rate-scaled batch gradient, and the parameter
        # moves by the whole velocity.
        grad_w = np.mean(residual * X, axis=0).reshape(-1, 1)
        V_w = momentum * V_w + lr * grad_w
        w = w - V_w

        grad_b = np.mean(residual)
        V_b = momentum * V_b + lr * grad_b
        b = b - V_b

        # Record the loss on all samples every 100 processed samples
        if step * batch_size % 100 == 0:
            full_residual = np.matmul(features, w) + b - target
            ls_loss.append(np.mean(0.5 * full_residual ** 2))
    print(ls_loss[-1])

# Loss curve, with the recorded points spread evenly over the epochs
epoch_axis = np.linspace(0, nums_epoch, len(ls_loss))
plt.plot(epoch_axis, ls_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()


adagrad
损失函数为
梯度
更新
让每个参数的学习率都不一样,当梯度过大的时候,学习率就会变小,学习率本身也会慢慢衰减
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import numpy as np
import random
from IPython import display
from matplotlib import pyplot as plt

# Figure output format and size
display.set_matplotlib_formats('svg')
plt.rcParams['figure.figsize'] = (3.5, 2.5)

# Load the tab-separated data file and standardise every column
data = np.genfromtxt(r'E:\d2l-zh\data\airfoil_self_noise.dat', delimiter='\t')
data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
# First 1500 rows: inputs are all columns but the last, the label is the
# last column kept as a column vector.
features = data[:1500, :-1]
target = data[:1500, -1:]

# Small random weights (one row per feature) and a zero bias
w = np.random.normal(0, 0.01, (features.shape[1], 1))
b = 0
# Added under the square root in the update step
eps = 1e-6

# Accumulators for the squared gradients, one per parameter
S_w = np.zeros_like(w)
S_b = np.zeros(1)

nums_epoch = 10
lr = 0.5
batch_size = 10

# Loss of the untrained model on all samples is the first point of the curve
y_hat = features @ w + b
loss = np.mean(0.5 * np.square(y_hat - target))
ls_loss = [loss]


def data_iter(batch_size, features, target):
    """Yield shuffled mini-batches as ``(features, target)`` pairs.

    The sample indices are shuffled once per call with ``random.shuffle``
    and consumed ``batch_size`` at a time, so every sample lands in exactly
    one batch.  The last batch is shorter when the sample count is not a
    multiple of ``batch_size``.
    """
    total = len(features)
    order = list(range(total))
    random.shuffle(order)
    for start in range(0, total, batch_size):
        # A slice stops at the end of the list, so the final batch needs
        # no explicit clamp.
        picked = order[start:start + batch_size]
        yield features[picked], target[picked]


for epoch in range(nums_epoch):
    for step, (X, y) in enumerate(data_iter(batch_size, features, target), start=1):
        y_hat = np.matmul(X, w) + b
        residual = y_hat - y
        loss = np.mean(0.5 * residual ** 2)

        # Update parameters: accumulate the squared gradient, then scale the
        # step of each parameter by lr / sqrt(accumulator + eps).
        grad_w = np.mean(residual * X, axis=0).reshape(-1, 1)
        S_w = S_w + np.square(grad_w)
        scale_w = lr / np.sqrt(S_w + eps)
        w = w - scale_w * grad_w

        grad_b = np.mean(residual)
        S_b = S_b + np.square(grad_b)
        scale_b = lr / np.sqrt(S_b + eps)
        b = b - scale_b * grad_b

        # Record the loss on all samples every 100 processed samples
        if step * batch_size % 100 == 0:
            full_residual = np.matmul(features, w) + b - target
            ls_loss.append(np.mean(0.5 * full_residual ** 2))
    print(ls_loss[-1])

# Loss curve, with the recorded points spread evenly over the epochs
epoch_axis = np.linspace(0, nums_epoch, len(ls_loss))
plt.plot(epoch_axis, ls_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()


rmsprop
:EMA参数
损失函数为
梯度
更新
简单来说就是对S做EMA,防止衰减过快
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import numpy as np
import random
from IPython import display
from matplotlib import pyplot as plt

# Figure output format and size
display.set_matplotlib_formats('svg')
plt.rcParams['figure.figsize'] = (3.5, 2.5)

# Load the tab-separated data file and standardise every column
data = np.genfromtxt(r'E:\d2l-zh\data\airfoil_self_noise.dat', delimiter='\t')
data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
# First 1500 rows: inputs are all columns but the last, the label is the
# last column kept as a column vector.
features = data[:1500, :-1]
target = data[:1500, -1:]

# Small random weights (one row per feature) and a zero bias
w = np.random.normal(0, 0.01, (features.shape[1], 1))
b = 0
# Added under the square root in the update step
eps = 1e-6

# Moving averages of the squared gradients, one per parameter
S_w = np.zeros_like(w)
S_b = np.zeros(1)

nums_epoch = 2
lr = 0.01
# Decay factor of the moving average
gamma = 0.9
batch_size = 10

# Loss of the untrained model on all samples is the first point of the curve
y_hat = features @ w + b
loss = np.mean(0.5 * np.square(y_hat - target))
ls_loss = [loss]


def data_iter(batch_size, features, target):
    """Yield shuffled mini-batches as ``(features, target)`` pairs.

    The sample indices are shuffled once per call with ``random.shuffle``
    and consumed ``batch_size`` at a time, so every sample lands in exactly
    one batch.  The last batch is shorter when the sample count is not a
    multiple of ``batch_size``.
    """
    total = len(features)
    order = list(range(total))
    random.shuffle(order)
    for start in range(0, total, batch_size):
        # A slice stops at the end of the list, so the final batch needs
        # no explicit clamp.
        picked = order[start:start + batch_size]
        yield features[picked], target[picked]


for epoch in range(nums_epoch):
    for step, (X, y) in enumerate(data_iter(batch_size, features, target), start=1):
        y_hat = np.matmul(X, w) + b
        residual = y_hat - y
        loss = np.mean(0.5 * residual ** 2)

        # Update parameters: blend the squared gradient into its moving
        # average, then scale the step by lr / sqrt(average + eps).
        grad_w = np.mean(residual * X, axis=0).reshape(-1, 1)
        fresh_w = (1 - gamma) * grad_w * grad_w
        S_w = gamma * S_w + fresh_w
        w = w - lr / np.sqrt(S_w + eps) * grad_w

        grad_b = np.mean(residual)
        fresh_b = (1 - gamma) * grad_b * grad_b
        S_b = gamma * S_b + fresh_b
        b = b - lr / np.sqrt(S_b + eps) * grad_b

        # Record the loss on all samples every 100 processed samples
        if step * batch_size % 100 == 0:
            full_residual = np.matmul(features, w) + b - target
            ls_loss.append(np.mean(0.5 * full_residual ** 2))
    print(ls_loss[-1])

# Loss curve, with the recorded points spread evenly over the epochs
epoch_axis = np.linspace(0, nums_epoch, len(ls_loss))
plt.plot(epoch_axis, ls_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()


adadelta
:EMA参数
损失函数为
梯度
更新
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import numpy as np
import random
from IPython import display
from matplotlib import pyplot as plt

# Figure output format and size
display.set_matplotlib_formats('svg')
plt.rcParams['figure.figsize'] = (3.5, 2.5)

# Load the tab-separated data file and standardise every column
data = np.genfromtxt(r'E:\d2l-zh\data\airfoil_self_noise.dat', delimiter='\t')
data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
# First 1500 rows: inputs are all columns but the last, the label is the
# last column kept as a column vector.
features = data[:1500, :-1]
target = data[:1500, -1:]

# Small random weights (one row per feature) and a zero bias
w = np.random.normal(0, 0.01, (features.shape[1], 1))
b = 0
# Added under both square roots in the update step
eps = 1e-6

# Moving averages of the squared gradients (S_*) and of the squared
# parameter steps (delta_*), all zero-initialised
S_w = np.zeros_like(w)
S_b = np.zeros(1)
delta_w = np.zeros_like(w)
delta_b = np.zeros(1)

nums_epoch = 20
# Decay factor shared by both moving averages
rho = 0.5
batch_size = 10

# Loss of the untrained model on all samples is the first point of the curve
y_hat = features @ w + b
loss = np.mean(0.5 * np.square(y_hat - target))
ls_loss = [loss]


def data_iter(batch_size, features, target):
    """Yield shuffled mini-batches as ``(features, target)`` pairs.

    The sample indices are shuffled once per call with ``random.shuffle``
    and consumed ``batch_size`` at a time, so every sample lands in exactly
    one batch.  The last batch is shorter when the sample count is not a
    multiple of ``batch_size``.
    """
    total = len(features)
    order = list(range(total))
    random.shuffle(order)
    for start in range(0, total, batch_size):
        # A slice stops at the end of the list, so the final batch needs
        # no explicit clamp.
        picked = order[start:start + batch_size]
        yield features[picked], target[picked]


for epoch in range(nums_epoch):
    for batch_i, (X, y) in enumerate(data_iter(batch_size, features, target)):
        y_hat = np.matmul(X, w) + b
        loss = np.mean(0.5 * (y_hat - y) ** 2)

        # Update parameters.  S_* is the moving average of squared gradients
        # and delta_* the moving average of squared applied steps; the step
        # is the gradient rescaled by sqrt(delta + eps) / sqrt(S + eps).
        grad_w = np.mean((y_hat - y) * X, 0)[:, np.newaxis]
        S_w = rho * S_w + (1 - rho) * grad_w * grad_w
        g_w = np.sqrt(delta_w + eps) / np.sqrt(S_w + eps) * grad_w
        w = w - g_w
        delta_w = rho * delta_w + (1 - rho) * g_w * g_w

        grad_b = np.mean(y_hat - y)
        S_b = rho * S_b + (1 - rho) * grad_b * grad_b
        g_b = np.sqrt(delta_b + eps) / np.sqrt(S_b + eps) * grad_b
        b = b - g_b
        # Accumulate the squared step g_b, mirroring delta_w.  Accumulating
        # b * b (the parameter value itself) tied the step size to the
        # magnitude of the bias instead of to its recent updates.
        delta_b = rho * delta_b + (1 - rho) * g_b * g_b

        # Record the loss on all samples every 100 processed samples
        if (batch_i + 1) * batch_size % 100 == 0:
            ls_loss.append(np.mean(0.5 * (np.matmul(features, w) + b - target) ** 2))
    print(ls_loss[-1])

# Loss curve, with the recorded points spread evenly over the epochs
plt.plot(np.linspace(0, nums_epoch, len(ls_loss)), ls_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()


adam
损失函数为
梯度
更新
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import numpy as np
import random
from IPython import display
from matplotlib import pyplot as plt

# Figure output format and size
display.set_matplotlib_formats('svg')
plt.rcParams['figure.figsize'] = (3.5, 2.5)

# Load the tab-separated data file and standardise every column
data = np.genfromtxt(r'E:\d2l-zh\data\airfoil_self_noise.dat', delimiter='\t')
data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
# First 1500 rows: inputs are all columns but the last, the label is the
# last column kept as a column vector.
features = data[:1500, :-1]
target = data[:1500, -1:]

# Small random weights (one row per feature) and a zero bias
w = np.random.normal(0, 0.01, (features.shape[1], 1))
b = 0
# Added to the denominator of the update step
eps = 1e-8

# Moving averages of the gradients (v_*) and of the squared gradients (S_*)
v_w = np.zeros_like(w)
v_b = np.zeros(1)
S_w = np.zeros_like(w)
S_b = np.zeros(1)
# Step counter used by the bias correction; starts at 1
t = 1

nums_epoch = 2
lr = 0.01
# Decay factors of the two moving averages
beta1 = 0.9
beta2 = 0.999
batch_size = 10

# Loss of the untrained model on all samples is the first point of the curve
y_hat = features @ w + b
loss = np.mean(0.5 * np.square(y_hat - target))
ls_loss = [loss]


def data_iter(batch_size, features, target):
    """Yield shuffled mini-batches as ``(features, target)`` pairs.

    The sample indices are shuffled once per call with ``random.shuffle``
    and consumed ``batch_size`` at a time, so every sample lands in exactly
    one batch.  The last batch is shorter when the sample count is not a
    multiple of ``batch_size``.
    """
    total = len(features)
    order = list(range(total))
    random.shuffle(order)
    for start in range(0, total, batch_size):
        # A slice stops at the end of the list, so the final batch needs
        # no explicit clamp.
        picked = order[start:start + batch_size]
        yield features[picked], target[picked]


for epoch in range(nums_epoch):
    for step, (X, y) in enumerate(data_iter(batch_size, features, target), start=1):
        y_hat = np.matmul(X, w) + b
        residual = y_hat - y
        loss = np.mean(0.5 * residual ** 2)

        # Bias-correction denominators for step t, shared by w and b
        corr1 = 1 - beta1 ** t
        corr2 = 1 - beta2 ** t

        # Update parameters: refresh both moving averages, bias-correct
        # them, then step by lr * v_hat / (sqrt(s_hat) + eps).
        grad_w = np.mean(residual * X, axis=0).reshape(-1, 1)
        v_w = beta1 * v_w + (1 - beta1) * grad_w
        S_w = beta2 * S_w + (1 - beta2) * grad_w * grad_w
        v_w_hat = v_w / corr1
        s_w_hat = S_w / corr2
        w = w - lr * v_w_hat / (np.sqrt(s_w_hat) + eps)

        grad_b = np.mean(residual)
        v_b = beta1 * v_b + (1 - beta1) * grad_b
        S_b = beta2 * S_b + (1 - beta2) * grad_b * grad_b
        v_b_hat = v_b / corr1
        s_b_hat = S_b / corr2
        b = b - lr * v_b_hat / (np.sqrt(s_b_hat) + eps)
        t += 1

        # Record the loss on all samples every 100 processed samples
        if step * batch_size % 100 == 0:
            full_residual = np.matmul(features, w) + b - target
            ls_loss.append(np.mean(0.5 * full_residual ** 2))
    print(ls_loss[-1])

# Loss curve, with the recorded points spread evenly over the epochs
epoch_axis = np.linspace(0, nums_epoch, len(ls_loss))
plt.plot(epoch_axis, ls_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

