优化算法

释放双眼,带上耳机,听听看~!

目录

批量梯度下降(Batch Gradient Descent)

随机梯度下降(Stochastic Gradient Descent)

小批量随机梯度下降(Mini-Batch Stochastic Gradient Descent)

动量法

adagrad

rmsprop

adadelta

adam


假设拟合 $\hat{y} = w^\top x + b$

$x^{(i)}, y^{(i)}$ :第i个样本的特征与目标值

m :样本数量

$\eta$ :学习率

批量梯度下降(Batch Gradient Descent)

每次迭代用所有的样本

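损失函数为 $L(w, b) = \frac{1}{2m}\sum_{i=1}^{m}\left(\hat{y}^{(i)} - y^{(i)}\right)^2$

梯度 $\nabla_w L = \frac{1}{m}\sum_{i=1}^{m}\left(\hat{y}^{(i)} - y^{(i)}\right)x^{(i)}$,$\nabla_b L = \frac{1}{m}\sum_{i=1}^{m}\left(\hat{y}^{(i)} - y^{(i)}\right)$

参数更新 $w \leftarrow w - \eta\nabla_w L$,$b \leftarrow b - \eta\nabla_b L$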


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
"""Batch gradient descent: fit y = 2*x1 - 3.4*x2 + 4.2 using all samples per step."""
import numpy as np

# Ground-truth parameters of the linear model being fitted.
true_w = [2, -3.4]
true_b = 4.2
# NOTE: these two names read as if swapped; they are kept unchanged.
# nums_input is the number of samples, nums_example the number of features.
nums_input = 1000
nums_example = len(true_w)

features = np.random.normal(0, 1, [nums_input, nums_example])
target = np.sum(features * true_w, 1) + true_b
# Add Gaussian noise (std 0.02) to the targets.
target = target + np.random.normal(0, 0.02, target.shape)
target = target.squeeze()

# Number of passes over the data; each pass is exactly one parameter update.
nums_epoch = 100
w = [0, 0]
b = 0
# Learning rate.
lr = 0.1

for epoch in range(nums_epoch):
    # Residuals of every sample under the parameters from the previous epoch.
    y_hat = np.sum(features * w, 1) + b
    error = y_hat - target

    # All components read the same `error`, so the update is simultaneous.
    for i in range(len(w)):
        w[i] = w[i] - lr * np.mean(error * features[:, i])
    b = b - lr * np.mean(error)

    print(epoch, [w, b], [true_w, true_b])

随机梯度下降(Stochastic Gradient Descent)

每次迭代用一个样本

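损失函数为 $L_i(w, b) = \frac{1}{2}\left(\hat{y}^{(i)} - y^{(i)}\right)^2$

梯度 $\nabla_w L_i = \left(\hat{y}^{(i)} - y^{(i)}\right)x^{(i)}$,$\nabla_b L_i = \hat{y}^{(i)} - y^{(i)}$

参数更新 $w \leftarrow w - \eta\nabla_w L_i$,$b \leftarrow b - \eta\nabla_b L_i$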


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
"""Stochastic gradient descent: fit y = 2*x1 - 3.4*x2 + 4.2 one sample per step."""
import numpy as np
import random

# Ground-truth parameters of the linear model being fitted.
true_w = [2, -3.4]
true_b = 4.2
# NOTE: these two names read as if swapped; they are kept unchanged.
# nums_input is the number of samples, nums_example the number of features.
nums_input = 1000
nums_example = len(true_w)

features = np.random.normal(0, 1, [nums_input, nums_example])
target = np.sum(features * true_w, 1) + true_b
# Add Gaussian noise (std 0.02) to the targets.
target = target + np.random.normal(0, 0.02, target.shape)
target = target.squeeze()

# Number of passes over the data.
nums_epoch = 100
w = [0, 0]
b = 0
# Learning rate.
lr = 0.1


def data_iter(features, target):
    """Yield ``(features[k], target[k])`` one sample at a time in shuffled order.

    A new permutation of the sample indices is drawn on every call.
    """
    indices = list(range(len(features)))
    random.shuffle(indices)
    # Walk the shuffled indices; looping over range(n) directly would ignore
    # the shuffle and visit the samples in storage order.
    for k in indices:
        yield features[k], target[k]


for epoch in range(nums_epoch):
    for X, y in data_iter(features, target):
        # Prediction for this single sample.
        y_hat = b
        for w_i, x_i in zip(w, X):
            y_hat += w_i * x_i
        error = y_hat - y

        # Every component uses the same `error`, computed before any update.
        for i in range(len(w)):
            w[i] = w[i] - lr * error * X[i]
        b = b - lr * error
    print(epoch, [w, b], [true_w, true_b])

小批量随机梯度下降(Mini-Batch Stochastic Gradient Descent)

每次迭代用一个批次的样本

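损失函数为 $L_B(w, b) = \frac{1}{2|B|}\sum_{i \in B}\left(\hat{y}^{(i)} - y^{(i)}\right)^2$

B是批次

$|B|$ 是批次大小

梯度 $\nabla_w L_B = \frac{1}{|B|}\sum_{i \in B}\left(\hat{y}^{(i)} - y^{(i)}\right)x^{(i)}$,$\nabla_b L_B = \frac{1}{|B|}\sum_{i \in B}\left(\hat{y}^{(i)} - y^{(i)}\right)$

参数更新 $w \leftarrow w - \eta\nabla_w L_B$,$b \leftarrow b - \eta\nabla_b L_B$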

优化算法


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
"""Mini-batch stochastic gradient descent for linear regression.

Trains on the first 1500 rows of the airfoil self-noise data and plots the
loss over that whole slice, recorded every 100 samples within an epoch.
"""
import numpy as np
import random
from IPython import display
from matplotlib import pyplot as plt

display.set_matplotlib_formats('svg')
plt.rcParams['figure.figsize'] = (3.5, 2.5)

data = np.genfromtxt(r'E:\d2l-zh\data\airfoil_self_noise.dat', delimiter='\t')
# Standardise every column (features and target alike).
data = (data - data.mean(axis=0)) / data.std(axis=0)
features = data[:1500, :-1]
# np.newaxis keeps the target as a column, matching np.matmul(features, w).
target = data[:1500, -1, np.newaxis]

w = np.random.normal(0, 0.01, [features.shape[1], 1])
b = 0

nums_epoch = 50
lr = 0.01
batch_size = 10

# Loss over the whole training slice before any update.
y_hat = np.matmul(features, w) + b
loss = np.mean(0.5 * (y_hat - target) ** 2)
ls_loss = [loss]


def data_iter(batch_size, features, target):
    """Yield ``(features, target)`` mini-batches in a freshly shuffled order.

    The last batch is shorter when the sample count is not a multiple of
    ``batch_size``.
    """
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)
    for start in range(0, num_examples, batch_size):
        # Slicing clamps at the end of the list, so no explicit min() is needed.
        batch = indices[start:start + batch_size]
        yield features[batch], target[batch]


for epoch in range(nums_epoch):
    for batch_i, (X, y) in enumerate(data_iter(batch_size, features, target)):
        # Column of residuals, one row per sample in the batch.
        error = np.matmul(X, w) + b - y

        # Multiply the residual column by the full X so each sample's features
        # are scaled by that sample's own residual. Multiplying by the 1-D
        # column X[:, i] instead would broadcast to (batch, batch) and average
        # an outer product, i.e. mean(error) * mean(x_i).
        w = w - lr * np.mean(error * X, axis=0)[:, np.newaxis]
        b = b - lr * np.mean(error)

        # Record the full-data loss every 100 samples.
        if (batch_i + 1) * batch_size % 100 == 0:
            ls_loss.append(np.mean(0.5 * (np.matmul(features, w) + b - target) ** 2))
            print(ls_loss[-1])

plt.plot(np.linspace(0, nums_epoch, len(ls_loss)), ls_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

动量法

使用了指数加权平均(EMA)

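$\gamma$ :EMA参数

$v$ :动量

对过去 $\frac{1}{1-\gamma}$ 步的动量做移动平均

如果 $\gamma = 0$,就是普通的小批量梯度下降

损失函数为 $L_B(w, b) = \frac{1}{2|B|}\sum_{i \in B}\left(\hat{y}^{(i)} - y^{(i)}\right)^2$

梯度 $g = \frac{1}{|B|}\sum_{i \in B}\left(\hat{y}^{(i)} - y^{(i)}\right)x^{(i)}$(对 $b$ 同理,去掉 $x^{(i)}$)

更新动量 $v \leftarrow \gamma v + \eta g$

更新参数 $w \leftarrow w - v$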


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
"""Mini-batch SGD with momentum for linear regression.

Trains on the first 1500 rows of the airfoil self-noise data and plots the
loss over that whole slice, recorded every 100 samples within an epoch.
"""
import numpy as np
import random
from IPython import display
from matplotlib import pyplot as plt

display.set_matplotlib_formats('svg')
plt.rcParams['figure.figsize'] = (3.5, 2.5)

data = np.genfromtxt(r'E:\d2l-zh\data\airfoil_self_noise.dat', delimiter='\t')
# Standardise every column (features and target alike).
data = (data - data.mean(axis=0)) / data.std(axis=0)
features = data[:1500, :-1]
# np.newaxis keeps the target as a column, matching np.matmul(features, w).
target = data[:1500, -1, np.newaxis]

w = np.random.normal(0, 0.01, [features.shape[1], 1])
b = 0
# Velocity (momentum) accumulators, one per parameter.
V_w = np.zeros([features.shape[1], 1])
V_b = np.zeros(1)

nums_epoch = 2
lr = 0.02
# Decay factor applied to the previous velocity.
momentum = 0.5
batch_size = 10

# Loss over the whole training slice before any update.
y_hat = np.matmul(features, w) + b
loss = np.mean(0.5 * (y_hat - target) ** 2)
ls_loss = [loss]


def data_iter(batch_size, features, target):
    """Yield ``(features, target)`` mini-batches in a freshly shuffled order.

    The last batch is shorter when the sample count is not a multiple of
    ``batch_size``.
    """
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)
    for start in range(0, num_examples, batch_size):
        # Slicing clamps at the end of the list, so no explicit min() is needed.
        batch = indices[start:start + batch_size]
        yield features[batch], target[batch]


for epoch in range(nums_epoch):
    for batch_i, (X, y) in enumerate(data_iter(batch_size, features, target)):
        # Column of residuals, one row per sample in the batch.
        error = np.matmul(X, w) + b - y

        # Velocity = learning-rate-scaled gradient + decayed previous velocity.
        grad_w = np.mean(error * X, axis=0)[:, np.newaxis]
        V_w = lr * grad_w + momentum * V_w
        w = w - V_w

        grad_b = np.mean(error)
        V_b = lr * grad_b + momentum * V_b
        b = b - V_b

        # Record the full-data loss every 100 samples.
        if (batch_i + 1) * batch_size % 100 == 0:
            ls_loss.append(np.mean(0.5 * (np.matmul(features, w) + b - target) ** 2))
            print(ls_loss[-1])

plt.plot(np.linspace(0, nums_epoch, len(ls_loss)), ls_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

adagrad

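损失函数为 $L_B(w, b) = \frac{1}{2|B|}\sum_{i \in B}\left(\hat{y}^{(i)} - y^{(i)}\right)^2$

梯度 $g = \frac{1}{|B|}\sum_{i \in B}\left(\hat{y}^{(i)} - y^{(i)}\right)x^{(i)}$(对 $b$ 同理,去掉 $x^{(i)}$)

更新 $s \leftarrow s + g \odot g$,$w \leftarrow w - \frac{\eta}{\sqrt{s + \epsilon}} \odot g$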

让每个参数的学习率都不一样,当梯度过大的时候,学习率就会变小,学习率本身也会慢慢衰减


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
"""AdaGrad for linear regression on mini-batches.

Trains on the first 1500 rows of the airfoil self-noise data and plots the
loss over that whole slice, recorded every 100 samples within an epoch.
"""
import numpy as np
import random
from IPython import display
from matplotlib import pyplot as plt

display.set_matplotlib_formats('svg')
plt.rcParams['figure.figsize'] = (3.5, 2.5)

data = np.genfromtxt(r'E:\d2l-zh\data\airfoil_self_noise.dat', delimiter='\t')
# Standardise every column (features and target alike).
data = (data - data.mean(axis=0)) / data.std(axis=0)
features = data[:1500, :-1]
# np.newaxis keeps the target as a column, matching np.matmul(features, w).
target = data[:1500, -1, np.newaxis]

w = np.random.normal(0, 0.01, [features.shape[1], 1])
b = 0
# Keeps the square root's argument away from zero.
eps = 1e-6

# Running sums of squared gradients, one entry per parameter.
S_w = np.zeros([features.shape[1], 1])
S_b = np.zeros(1)

nums_epoch = 10
lr = 0.5
batch_size = 10

# Loss over the whole training slice before any update.
y_hat = np.matmul(features, w) + b
loss = np.mean(0.5 * (y_hat - target) ** 2)
ls_loss = [loss]


def data_iter(batch_size, features, target):
    """Yield ``(features, target)`` mini-batches in a freshly shuffled order.

    The last batch is shorter when the sample count is not a multiple of
    ``batch_size``.
    """
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)
    for start in range(0, num_examples, batch_size):
        # Slicing clamps at the end of the list, so no explicit min() is needed.
        batch = indices[start:start + batch_size]
        yield features[batch], target[batch]


for epoch in range(nums_epoch):
    for batch_i, (X, y) in enumerate(data_iter(batch_size, features, target)):
        # Column of residuals, one row per sample in the batch.
        error = np.matmul(X, w) + b - y

        # S only ever grows, so each parameter's effective step lr / sqrt(S)
        # shrinks as its squared gradients accumulate.
        grad_w = np.mean(error * X, axis=0)[:, np.newaxis]
        S_w = S_w + grad_w * grad_w
        w = w - lr / np.sqrt(S_w + eps) * grad_w

        grad_b = np.mean(error)
        S_b = S_b + grad_b * grad_b
        b = b - lr / np.sqrt(S_b + eps) * grad_b

        # Record the full-data loss every 100 samples.
        if (batch_i + 1) * batch_size % 100 == 0:
            ls_loss.append(np.mean(0.5 * (np.matmul(features, w) + b - target) ** 2))
            print(ls_loss[-1])

plt.plot(np.linspace(0, nums_epoch, len(ls_loss)), ls_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

rmsprop

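$\gamma$ :EMA参数

损失函数为 $L_B(w, b) = \frac{1}{2|B|}\sum_{i \in B}\left(\hat{y}^{(i)} - y^{(i)}\right)^2$

梯度 $g = \frac{1}{|B|}\sum_{i \in B}\left(\hat{y}^{(i)} - y^{(i)}\right)x^{(i)}$(对 $b$ 同理,去掉 $x^{(i)}$)

更新 $s \leftarrow \gamma s + (1 - \gamma)\, g \odot g$,$w \leftarrow w - \frac{\eta}{\sqrt{s + \epsilon}} \odot g$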

简单来说就是对S做EMA,防止衰减过快


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
"""RMSProp for linear regression on mini-batches.

Trains on the first 1500 rows of the airfoil self-noise data and plots the
loss over that whole slice, recorded every 100 samples within an epoch.
"""
import numpy as np
import random
from IPython import display
from matplotlib import pyplot as plt

display.set_matplotlib_formats('svg')
plt.rcParams['figure.figsize'] = (3.5, 2.5)

data = np.genfromtxt(r'E:\d2l-zh\data\airfoil_self_noise.dat', delimiter='\t')
# Standardise every column (features and target alike).
data = (data - data.mean(axis=0)) / data.std(axis=0)
features = data[:1500, :-1]
# np.newaxis keeps the target as a column, matching np.matmul(features, w).
target = data[:1500, -1, np.newaxis]

w = np.random.normal(0, 0.01, [features.shape[1], 1])
b = 0
# Keeps the square root's argument away from zero.
eps = 1e-6

# Exponential moving averages of squared gradients, one entry per parameter.
S_w = np.zeros([features.shape[1], 1])
S_b = np.zeros(1)

nums_epoch = 2
lr = 0.01
# Weight given to the previous moving average.
gamma = 0.9
batch_size = 10

# Loss over the whole training slice before any update.
y_hat = np.matmul(features, w) + b
loss = np.mean(0.5 * (y_hat - target) ** 2)
ls_loss = [loss]


def data_iter(batch_size, features, target):
    """Yield ``(features, target)`` mini-batches in a freshly shuffled order.

    The last batch is shorter when the sample count is not a multiple of
    ``batch_size``.
    """
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)
    for start in range(0, num_examples, batch_size):
        # Slicing clamps at the end of the list, so no explicit min() is needed.
        batch = indices[start:start + batch_size]
        yield features[batch], target[batch]


for epoch in range(nums_epoch):
    for batch_i, (X, y) in enumerate(data_iter(batch_size, features, target)):
        # Column of residuals, one row per sample in the batch.
        error = np.matmul(X, w) + b - y

        # S is a moving average rather than a running sum, so old squared
        # gradients are forgotten at rate gamma.
        grad_w = np.mean(error * X, axis=0)[:, np.newaxis]
        S_w = gamma * S_w + (1 - gamma) * grad_w * grad_w
        w = w - lr / np.sqrt(S_w + eps) * grad_w

        grad_b = np.mean(error)
        S_b = gamma * S_b + (1 - gamma) * grad_b * grad_b
        b = b - lr / np.sqrt(S_b + eps) * grad_b

        # Record the full-data loss every 100 samples.
        if (batch_i + 1) * batch_size % 100 == 0:
            ls_loss.append(np.mean(0.5 * (np.matmul(features, w) + b - target) ** 2))
            print(ls_loss[-1])

plt.plot(np.linspace(0, nums_epoch, len(ls_loss)), ls_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

adadelta

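$\rho$ :EMA参数

损失函数为 $L_B(w, b) = \frac{1}{2|B|}\sum_{i \in B}\left(\hat{y}^{(i)} - y^{(i)}\right)^2$

梯度 $g = \frac{1}{|B|}\sum_{i \in B}\left(\hat{y}^{(i)} - y^{(i)}\right)x^{(i)}$(对 $b$ 同理,去掉 $x^{(i)}$)

更新 $s \leftarrow \rho s + (1 - \rho)\, g \odot g$,$g' = \sqrt{\frac{\Delta + \epsilon}{s + \epsilon}} \odot g$,$w \leftarrow w - g'$,$\Delta \leftarrow \rho\Delta + (1 - \rho)\, g' \odot g'$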


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
"""AdaDelta for linear regression on mini-batches.

Trains on the first 1500 rows of the airfoil self-noise data and plots the
loss over that whole slice, recorded every 100 samples within an epoch.
"""
import numpy as np
import random
from IPython import display
from matplotlib import pyplot as plt

display.set_matplotlib_formats('svg')
plt.rcParams['figure.figsize'] = (3.5, 2.5)

data = np.genfromtxt(r'E:\d2l-zh\data\airfoil_self_noise.dat', delimiter='\t')
# Standardise every column (features and target alike).
data = (data - data.mean(axis=0)) / data.std(axis=0)
features = data[:1500, :-1]
# np.newaxis keeps the target as a column, matching np.matmul(features, w).
target = data[:1500, -1, np.newaxis]

w = np.random.normal(0, 0.01, [features.shape[1], 1])
b = 0
# Keeps both square roots' arguments away from zero.
eps = 1e-6

# Moving averages of squared gradients (S_*) and of squared steps (delta_*).
S_w = np.zeros([features.shape[1], 1])
S_b = np.zeros(1)
delta_w = np.zeros([features.shape[1], 1])
delta_b = np.zeros(1)

nums_epoch = 20
# Weight given to the previous moving averages; there is no learning rate.
rho = 0.5
batch_size = 10

# Loss over the whole training slice before any update.
y_hat = np.matmul(features, w) + b
loss = np.mean(0.5 * (y_hat - target) ** 2)
ls_loss = [loss]


def data_iter(batch_size, features, target):
    """Yield ``(features, target)`` mini-batches in a freshly shuffled order.

    The last batch is shorter when the sample count is not a multiple of
    ``batch_size``.
    """
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)
    for start in range(0, num_examples, batch_size):
        # Slicing clamps at the end of the list, so no explicit min() is needed.
        batch = indices[start:start + batch_size]
        yield features[batch], target[batch]


for epoch in range(nums_epoch):
    for batch_i, (X, y) in enumerate(data_iter(batch_size, features, target)):
        # Column of residuals, one row per sample in the batch.
        error = np.matmul(X, w) + b - y

        # The step is the gradient rescaled by sqrt(delta) / sqrt(S).
        grad_w = np.mean(error * X, axis=0)[:, np.newaxis]
        S_w = rho * S_w + (1 - rho) * grad_w * grad_w
        g_w = np.sqrt(delta_w + eps) / np.sqrt(S_w + eps) * grad_w
        w = w - g_w
        delta_w = rho * delta_w + (1 - rho) * g_w * g_w

        grad_b = np.mean(error)
        S_b = rho * S_b + (1 - rho) * grad_b * grad_b
        g_b = np.sqrt(delta_b + eps) / np.sqrt(S_b + eps) * grad_b
        b = b - g_b
        # Accumulate the squared step g_b, not the parameter b, mirroring delta_w.
        delta_b = rho * delta_b + (1 - rho) * g_b * g_b

        # Record the full-data loss every 100 samples.
        if (batch_i + 1) * batch_size % 100 == 0:
            ls_loss.append(np.mean(0.5 * (np.matmul(features, w) + b - target) ** 2))
            print(ls_loss[-1])

plt.plot(np.linspace(0, nums_epoch, len(ls_loss)), ls_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

adam

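损失函数为 $L_B(w, b) = \frac{1}{2|B|}\sum_{i \in B}\left(\hat{y}^{(i)} - y^{(i)}\right)^2$

梯度 $g = \frac{1}{|B|}\sum_{i \in B}\left(\hat{y}^{(i)} - y^{(i)}\right)x^{(i)}$(对 $b$ 同理,去掉 $x^{(i)}$)

更新 $v \leftarrow \beta_1 v + (1 - \beta_1)\, g$,$s \leftarrow \beta_2 s + (1 - \beta_2)\, g \odot g$,$\hat{v} = \frac{v}{1 - \beta_1^t}$,$\hat{s} = \frac{s}{1 - \beta_2^t}$,$w \leftarrow w - \frac{\eta\,\hat{v}}{\sqrt{\hat{s}} + \epsilon}$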

优化算法


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
"""Adam for linear regression on mini-batches.

Trains on the first 1500 rows of the airfoil self-noise data and plots the
loss over that whole slice, recorded every 100 samples within an epoch.
"""
import numpy as np
import random
from IPython import display
from matplotlib import pyplot as plt

display.set_matplotlib_formats('svg')
plt.rcParams['figure.figsize'] = (3.5, 2.5)

data = np.genfromtxt(r'E:\d2l-zh\data\airfoil_self_noise.dat', delimiter='\t')
# Standardise every column (features and target alike).
data = (data - data.mean(axis=0)) / data.std(axis=0)
features = data[:1500, :-1]
# np.newaxis keeps the target as a column, matching np.matmul(features, w).
target = data[:1500, -1, np.newaxis]

w = np.random.normal(0, 0.01, [features.shape[1], 1])
b = 0
# Added to the denominator so the step never divides by zero.
eps = 1e-8

# Moving averages of the gradient (v_*) and of the squared gradient (S_*).
v_w = np.zeros([features.shape[1], 1])
v_b = np.zeros(1)
S_w = np.zeros([features.shape[1], 1])
S_b = np.zeros(1)
# Update counter used by the bias correction; starts at 1.
t = 1

nums_epoch = 2
lr = 0.01
beta1 = 0.9
beta2 = 0.999
batch_size = 10

# Loss over the whole training slice before any update.
y_hat = np.matmul(features, w) + b
loss = np.mean(0.5 * (y_hat - target) ** 2)
ls_loss = [loss]


def data_iter(batch_size, features, target):
    """Yield ``(features, target)`` mini-batches in a freshly shuffled order.

    The last batch is shorter when the sample count is not a multiple of
    ``batch_size``.
    """
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)
    for start in range(0, num_examples, batch_size):
        # Slicing clamps at the end of the list, so no explicit min() is needed.
        batch = indices[start:start + batch_size]
        yield features[batch], target[batch]


for epoch in range(nums_epoch):
    for batch_i, (X, y) in enumerate(data_iter(batch_size, features, target)):
        # Column of residuals, one row per sample in the batch.
        error = np.matmul(X, w) + b - y

        # Both averages start at zero; dividing by (1 - beta ** t) undoes
        # that bias, which matters most while t is small.
        grad_w = np.mean(error * X, axis=0)[:, np.newaxis]
        v_w = beta1 * v_w + (1 - beta1) * grad_w
        S_w = beta2 * S_w + (1 - beta2) * grad_w * grad_w
        v_w_bias_corr = v_w / (1 - beta1 ** t)
        s_w_bias_corr = S_w / (1 - beta2 ** t)
        g_w = lr * v_w_bias_corr / (np.sqrt(s_w_bias_corr) + eps)
        w = w - g_w

        grad_b = np.mean(error)
        v_b = beta1 * v_b + (1 - beta1) * grad_b
        S_b = beta2 * S_b + (1 - beta2) * grad_b * grad_b
        v_b_bias_corr = v_b / (1 - beta1 ** t)
        s_b_bias_corr = S_b / (1 - beta2 ** t)
        g_b = lr * v_b_bias_corr / (np.sqrt(s_b_bias_corr) + eps)
        b = b - g_b
        t = t + 1

        # Record the full-data loss every 100 samples.
        if (batch_i + 1) * batch_size % 100 == 0:
            ls_loss.append(np.mean(0.5 * (np.matmul(features, w) + b - target) ** 2))
            print(ls_loss[-1])

plt.plot(np.linspace(0, nums_epoch, len(ls_loss)), ls_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

 

给TA打赏
共{{data.count}}人
人已打赏
安全经验

独立博客怎样申请谷歌Adsense

2021-10-11 16:36:11

安全经验

安全咨询服务

2022-1-12 14:11:49

个人中心
购物车
优惠劵
今日签到
有新私信 私信列表
搜索