赞
踩
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Use the GPU when available, otherwise fall back to CPU so the script
# still runs on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# FashionMNIST is 28x28; upscale to 224x224 so the AlexNet layer sizes work.
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
])

train_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=True, download=True, transform=mnist_transform)
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=32, shuffle=True, num_workers=0)

test_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=False, download=True, transform=mnist_transform)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=32, shuffle=True, num_workers=0)

# AlexNet adapted to single-channel input; trailing comments track the
# spatial size of the feature map.
net = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=96, kernel_size=11, stride=4, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),                      # 54x54 -> 26x26
    nn.Conv2d(96, 256, kernel_size=5, padding=2), nn.ReLU(),    # 26x26
    nn.MaxPool2d(kernel_size=3, stride=2),                      # -> 12x12
    nn.Conv2d(256, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),                      # -> 5x5, 384*5*5 = 9600
    nn.Flatten(),
    nn.Linear(9600, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 10))


def init_weights(m):
    """Initialize every Linear layer with small Gaussian weights."""
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)


net.apply(init_weights)
# Move the model to the device once, before training (the original moved it
# inside the batch loop on every iteration).
net = net.to(device)

optimizer = optim.SGD(net.parameters(), lr=0.12)
# CrossEntropyLoss takes unnormalized logits and applies log-softmax itself.
loss = nn.CrossEntropyLoss(reduction='mean')

epoch = 10
losses = []
for i in range(epoch):
    net.train()
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
        # .item() detaches the scalar; accumulating the tensor itself keeps
        # every batch's autograd graph alive and leaks GPU memory.
        loss_sum += loss_temp.item()
    losses.append(loss_sum / len(train_set))
    print("epoch: ", i, "loss=", loss_sum)

acc = 0
net.eval()  # disable Dropout during evaluation (was missing in the original)
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += (y_hat.argmax(dim=1) == y).sum().item()
print("测试集准确度", acc / len(test_set))

plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
'''
epoch: 0 loss= 1712.237548828125
epoch: 1 loss= 659.957275390625
epoch: 2 loss= 551.9481201171875
epoch: 3 loss= 484.9080810546875
epoch: 4 loss= 433.572265625
epoch: 5 loss= 393.9735107421875
epoch: 6 loss= 361.9253234863281
epoch: 7 loss= 331.5981140136719
epoch: 8 loss= 302.447265625
epoch: 9 loss= 276.08563232421875
测试集准确度 tensor(0.9043, device='cuda:0')
'''
比lenet效果更好,alexnet学习能力更强。
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Use the GPU when available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# FashionMNIST upscaled to 224x224 for the AlexNet-style layer sizes.
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
])

train_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=True, download=True, transform=mnist_transform)
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=32, shuffle=True, num_workers=0)

test_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=False, download=True, transform=mnist_transform)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=32, shuffle=True, num_workers=0)

# Ablated AlexNet: two of the 384-channel convs and one of the 4096-unit
# FC layers are removed (kept as comments to show the experiment).
net = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=96, kernel_size=11, stride=4, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),                      # 54x54 -> 26x26
    nn.Conv2d(96, 256, kernel_size=5, padding=2), nn.ReLU(),    # 26x26
    nn.MaxPool2d(kernel_size=3, stride=2),                      # -> 12x12
    nn.Conv2d(256, 384, kernel_size=3, padding=1), nn.ReLU(),
    # nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
    # nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),                      # -> 5x5, 384*5*5 = 9600
    nn.Flatten(),
    nn.Linear(9600, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    # nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 10))


def init_weights(m):
    """Initialize every Linear layer with small Gaussian weights."""
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)


net.apply(init_weights)
net = net.to(device)  # move once, before training (was inside the batch loop)

optimizer = optim.SGD(net.parameters(), lr=0.12)
loss = nn.CrossEntropyLoss(reduction='mean')

epoch = 10
losses = []
for i in range(epoch):
    net.train()
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
        # .item() detaches the scalar so per-batch graphs are freed.
        loss_sum += loss_temp.item()
    losses.append(loss_sum / len(train_set))
    print("epoch: ", i, "loss=", loss_sum)

acc = 0
net.eval()  # disable Dropout for evaluation (was missing in the original)
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += (y_hat.argmax(dim=1) == y).sum().item()
print("测试集准确度", acc / len(test_set))

plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

# Recorded results (10 epochs):
# epoch: 0 loss= 980.55  ...  epoch: 9 loss= 247.34
# 测试集准确度 0.9116999506950378
随着batchsize增大,GPU所使用的内存也增多,占用率随着增大。
图片数据和中间结果?
卷积层
VGG更深,卷积层更多,卷积是一件很贵的事情。
那么VGG的网络不能直接使用,需要据此修改一下,调整一下尺寸变化的过程。
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Use the GPU when available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
])

train_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=True, download=True, transform=mnist_transform)
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=64, shuffle=True, num_workers=0)

test_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=False, download=True, transform=mnist_transform)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=64, shuffle=True, num_workers=0)


def vgg_block(num_convs, in_channels, out_channels):
    """Return num_convs (3x3 conv + ReLU) layers followed by a 2x2 max pool."""
    block = []
    for _ in range(num_convs):
        block.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
        block.append(nn.ReLU())
        in_channels = out_channels  # later convs in the block keep the width
    block.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*block)


# (num_convs, out_channels) per stage; channels divided by 4 to shrink the
# network for FashionMNIST.
con_arc_11 = [(1, 64 // 4), (1, 128 // 4), (2, 256 // 4), (2, 512 // 4), (2, 512 // 4)]  # vgg11
con_arc_16 = [(2, 64 // 4), (2, 128 // 4), (3, 256 // 4), (3, 512 // 4), (3, 512 // 4)]  # vgg16
con_arc_19 = [(2, 64 // 4), (2, 128 // 4), (4, 256 // 4), (4, 512 // 4), (4, 512 // 4)]  # vgg19


def vgg(con_arc, in_channels):
    """Build a VGG net: conv stages from con_arc plus a 3-layer classifier."""
    net = []
    for num_convs, out_channels in con_arc:
        net.append(vgg_block(num_convs, in_channels, out_channels))
        in_channels = out_channels
    return nn.Sequential(
        *net, nn.Flatten(),
        nn.Linear(6272, 4096), nn.ReLU(), nn.Dropout(p=0.5),  # 128 * 7 * 7 = 6272
        nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(p=0.5),
        nn.Linear(4096, 10))


net = vgg(con_arc_16, 1)

# Trace a dummy input through each stage to verify the shapes.
x = torch.zeros((1, 1, 224, 224))
for layer in net:
    x = layer(x)
    print(layer.__class__.__name__, "\t输出的格式为: ", x.shape)
print("vgg16的结构为:", net)

net = net.to(device)  # move once, before training (was inside the batch loop)
optimizer = optim.SGD(net.parameters(), lr=0.12)
loss = nn.CrossEntropyLoss(reduction='mean')

epoch = 10
losses = []
for i in range(epoch):
    net.train()
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
        # .item() detaches the scalar so per-batch autograd graphs are freed.
        loss_sum += loss_temp.item()
    losses.append(loss_sum / len(train_set))
    print("epoch: ", i, "loss=", loss_sum)

acc = 0
net.eval()  # disable Dropout for evaluation (was missing in the original)
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += (y_hat.argmax(dim=1) == y).sum().item()
print("测试集准确度", acc / len(test_set))

plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

'''
epoch: 0 loss= 2160.221923828125
epoch: 1 loss= 790.5088500976562
epoch: 2 loss= 310.9482116699219
epoch: 3 loss= 251.4301300048828
epoch: 4 loss= 213.0938262939453
epoch: 5 loss= 185.2352294921875
epoch: 6 loss= 162.23760986328125
epoch: 7 loss= 138.5640869140625
epoch: 8 loss= 118.95980072021484
epoch: 9 loss= 102.10594177246094
测试集准确度 0.9230999946594238
'''
依旧使用SGD,学习率调为0.12, epoch 15
epoch: 0 loss= 949.5301513671875
epoch: 1 loss= 541.0523071289062
epoch: 2 loss= 335.288818359375
epoch: 3 loss= 265.7752990722656
epoch: 4 loss= 236.0061492919922
epoch: 5 loss= 214.9103546142578
epoch: 6 loss= 200.35089111328125
epoch: 7 loss= 186.38710021972656
epoch: 8 loss= 173.8882293701172
epoch: 9 loss= 164.55499267578125
epoch: 10 loss= 157.58424377441406
epoch: 11 loss= 150.84255981445312
epoch: 12 loss= 145.72850036621094
epoch: 13 loss= 139.71893310546875
epoch: 14 loss= 134.76446533203125
测试集准确度 0.8855999708175659
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Use the GPU when available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
])

train_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=True, download=True, transform=mnist_transform)
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=128, shuffle=True, num_workers=0)

test_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=False, download=True, transform=mnist_transform)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=128, shuffle=True, num_workers=0)


def Nin_block(in_channels, out_channels, padding, stride, kernel_size):
    """A NiN block: one spatial conv followed by two 1x1 convs acting as a
    per-pixel MLP that mixes channel information."""
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, padding=padding, stride=stride,
                  kernel_size=kernel_size), nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, kernel_size=1), nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, kernel_size=1), nn.ReLU())


def Nin():
    """NiN: four blocks with max pooling between them; global average pooling
    over the 10 class channels replaces the fully connected classifier."""
    return nn.Sequential(
        Nin_block(1, 96, stride=4, kernel_size=11, padding=0),
        nn.MaxPool2d(kernel_size=3, stride=2),
        Nin_block(96, 256, stride=1, kernel_size=5, padding=2),
        nn.MaxPool2d(kernel_size=3, stride=2),
        Nin_block(256, 384, stride=1, kernel_size=3, padding=1),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Dropout(0.5),
        Nin_block(384, 10, stride=1, kernel_size=3, padding=1),
        nn.AdaptiveAvgPool2d((1, 1)),
        nn.Flatten())  # drop the trailing 1x1 spatial dims


net = Nin()


def init_weights(layer):
    """Xavier init for conv/linear weights — NiN barely trains without it."""
    if type(layer) == nn.Linear or type(layer) == nn.Conv2d:
        nn.init.xavier_uniform_(layer.weight)


net.apply(init_weights)
print("Nin的结构为:", net)

net = net.to(device)  # move once, before training (was inside the batch loop)
optimizer = optim.SGD(net.parameters(), lr=0.1)
loss = nn.CrossEntropyLoss(reduction='mean')

epoch = 10
losses = []
for i in range(epoch):
    net.train()
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
        # .item() detaches the scalar so per-batch autograd graphs are freed.
        loss_sum += loss_temp.item()
    losses.append(loss_sum / len(train_set))
    print("epoch: ", i, "loss=", loss_sum)

acc = 0
net.eval()  # disable Dropout for evaluation (was missing in the original)
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += (y_hat.argmax(dim=1) == y).sum().item()
print("测试集准确度", acc / len(test_set))

plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
epoch: 0 loss= 993.8709716796875
epoch: 1 loss= 533.2142944335938
epoch: 2 loss= 361.1051330566406
epoch: 3 loss= 277.5993347167969
epoch: 4 loss= 232.45095825195312
epoch: 5 loss= 205.37686157226562
epoch: 6 loss= 187.60452270507812
epoch: 7 loss= 176.19105529785156
epoch: 8 loss= 165.3572540283203
epoch: 9 loss= 157.9745330810547
测试集准确度 0.8700000047683716
俩个可以多次融合通道信息
换成一个:
Nin的结构为: Sequential( (0): Sequential( (0): Conv2d(1, 96, kernel_size=(11, 11), stride=(4, 4)) (1): ReLU() (2): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1)) (3): ReLU() ) (1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False) (2): Sequential( (0): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2)) (1): ReLU() (2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1)) (3): ReLU() ) (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False) (4): Sequential( (0): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): ReLU() (2): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1)) (3): ReLU() ) (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False) (6): Dropout(p=0.5, inplace=False) (7): Sequential( (0): Conv2d(384, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): ReLU() (2): Conv2d(10, 10, kernel_size=(1, 1), stride=(1, 1)) (3): ReLU() ) (8): AdaptiveAvgPool2d(output_size=(1, 1)) (9): Flatten(start_dim=1, end_dim=-1) ) epoch: 0 loss= 847.5736694335938 epoch: 1 loss= 356.9827575683594 epoch: 2 loss= 261.1847839355469 epoch: 3 loss= 224.17762756347656 epoch: 4 loss= 196.07040405273438 epoch: 5 loss= 180.1049041748047 epoch: 6 loss= 171.22512817382812 epoch: 7 loss= 160.30470275878906 epoch: 8 loss= 153.3402099609375 epoch: 9 loss= 147.3529052734375 测试集准确度 0.8526999950408936
换成一个效果略低,俩个可以增加模型的非线性表达能力。
参数量:$96\times(1\times11\times11+1)+2\times(1\times1\times96\times96)+256\times(96\times5\times5+1)+2\times(1\times1\times256\times256)+384\times(256\times3\times3+1)+2\times(1\times1\times384\times384)+10\times(384\times3\times3+1)+2\times(1\times1\times10\times10)=1995284$
计算量:$(54\times54\times96\times11\times11)+2\times(54\times54\times96\times96\times96)+(26\times26\times256\times5\times5)+2\times(26\times26\times256\times256\times256)+(12\times12\times384\times3\times3)+(12\times12\times384\times384\times384)+(10\times384\times5\times5\times3\times3)+2\times(10\times5\times5\times10\times10)\approx2.469\times10^{10}$
信息损失过大?
训练结果
epoch: 0 loss= 1633.828857421875
epoch: 1 loss= 803.0449829101562
epoch: 2 loss= 605.2086181640625
epoch: 3 loss= 472.7867126464844
epoch: 4 loss= 409.02154541015625
epoch: 5 loss= 368.7477722167969
epoch: 6 loss= 339.0345458984375
epoch: 7 loss= 315.32861328125
epoch: 8 loss= 297.1580810546875
epoch: 9 loss= 287.3929748535156
测试集准确度 0.880899965763092
只有前面的block1缩小了图片的大小,则最小图片大小为55,经过77卷积变33,再经过3 * 3的池化变11.
NiN是采用1×1卷积替换全连接,VGG则是用多个小卷积核替代大的卷积核。(GoogLeNet也采用了1×1卷积先降低通道维数、减少计算参数的方式)
我认为是可以的,偏置最终也会被作为平均值中的一部分然后被减掉。
使用 :1.0 准确度 85.7%
不使用: 0.3 73.3%
只留了最后俩个batch-norm
lr:0.3 51.3% 测试集准确度大起大落不稳定
只留了前俩个batch-norm
lr:0.3 80.9% 测试集准确度较为稳定,说明相较于加在后面还是应该加在前面,可以确保数值稳定性。
# LeNet-style network with Sigmoid activations and average pooling, plus a
# Dropout(0.5) inserted after the second conv layer (the experiment discussed
# below; note Dropout is placed *before* the activation here).
net = nn.Sequential(
nn.Conv2d(1, 6, kernel_size=5), nn.Sigmoid(),
nn.AvgPool2d(kernel_size=2, stride=2),
nn.Conv2d(6, 16, kernel_size=5), nn.Dropout(0.5), nn.Sigmoid(),
nn.AvgPool2d(kernel_size=2, stride=2), nn.Flatten(),
nn.Linear(256, 120), nn.Sigmoid(),  # assumes 28x28 input: 16*4*4 = 256 after flatten
nn.Linear(120, 84), nn.Sigmoid(),
nn.Linear(84, 10))
lr:0.3 测试准确度 60.6%
个人所见,尽管俩个网络的构建思路不同,Inception是为了使用不同的卷积核去提取不同的特征(同时使用少的参数),但是Resnet模块是为了保证更深的网络表达的函数囊括浅层的函数,使得更深的网络有意义。但是在架构上,可以将Resnet看作是一种特殊的Inception模块。
贴一下Resnet 18的训练结果
epoch: 0 loss= 274.33294677734375 epoch: 1 loss= 123.11670684814453 epoch: 2 loss= 98.39700317382812 epoch: 3 loss= 81.01783752441406 epoch: 4 loss= 65.5454330444336 epoch: 5 loss= 51.666439056396484 epoch: 6 loss= 37.66799545288086 epoch: 7 loss= 27.368988037109375 epoch: 8 loss= 18.34890365600586 epoch: 9 loss= 10.552614212036133 epoch: 10 loss= 9.15225601196289 epoch: 11 loss= 5.566713809967041 epoch: 12 loss= 2.6198325157165527 epoch: 13 loss= 0.49261292815208435 epoch: 14 loss= 0.2144828885793686 测试集准确度 0.9327999949455261
实现一下Resnet-34:
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Use the GPU when available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
])

train_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=True, download=True, transform=mnist_transform)
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=128, shuffle=True, num_workers=0)

test_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=False, download=True, transform=mnist_transform)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=128, shuffle=True, num_workers=0)


class ResBlock(nn.Module):
    """Residual block: two 3x3 conv+BN layers with a skip connection.

    When it is the first block of a downsampling stage (first_block and not
    b2) the first conv uses stride 2 and a 1x1 conv projects the skip path
    to match channels/resolution; otherwise the input is added unchanged.
    """

    def __init__(self, in_channels, out_channels, b2=False, first_block=True):
        super().__init__()
        if first_block and not b2:
            stride = 2
            self.conv_one = nn.Conv2d(in_channels, out_channels, kernel_size=1,
                                      stride=stride, padding=0)
        else:
            self.conv_one = None
            stride = 1
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))
        if self.conv_one is not None:
            return F.relu(y + self.conv_one(x))
        return F.relu(y + x)


def ResBlocks(nums, b2, in_channels, out_channels):
    """Stack `nums` ResBlocks; only the first may change channels/stride."""
    block = []
    for i in range(nums):
        if i == 0:
            block.append(ResBlock(in_channels, out_channels, b2, first_block=True))
        else:
            block.append(ResBlock(out_channels, out_channels, b2, first_block=False))
    return nn.Sequential(*block)


# ResNet-34 layout: stem + stages of 3/4/6/3 blocks.
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
                   nn.BatchNorm2d(64), nn.ReLU(),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
b2 = ResBlocks(3, True, 64, 64)     # b2=True: no extra downsampling after the max pool
b3 = ResBlocks(4, False, 64, 128)
b4 = ResBlocks(6, False, 128, 256)
b5 = ResBlocks(3, False, 256, 512)
net = nn.Sequential(b1, b2, b3, b4, b5,
                    nn.AdaptiveAvgPool2d((1, 1)), nn.Flatten(),
                    nn.Linear(512, 10))

# Trace a dummy input through each stage to verify the shapes.
x = torch.zeros((1, 1, 224, 224))
for layer in net:
    x = layer(x)
    print(layer.__class__.__name__, "\t输出的形状为:", x.shape)


def init_weights(layer):
    """Xavier initialization for conv/linear weights."""
    if type(layer) == nn.Linear or type(layer) == nn.Conv2d:
        nn.init.xavier_uniform_(layer.weight)


net.apply(init_weights)
print("Resnet18的结构为:", net)  # NOTE(review): label says 18 but this is ResNet-34

net = net.to(device)  # move once, before training (was inside the batch loop)
optimizer = optim.SGD(net.parameters(), lr=0.12)
loss = nn.CrossEntropyLoss(reduction='mean')

epoch = 10
losses = []
for i in range(epoch):
    net.train()
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
        # .item() detaches the scalar so per-batch autograd graphs are freed.
        loss_sum += loss_temp.item()
    losses.append(loss_sum / len(train_set))
    print("epoch: ", i, "loss=", loss_sum)

acc = 0
net.eval()  # use BatchNorm running stats for evaluation (was missing)
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += (y_hat.argmax(dim=1) == y).sum().item()
print("测试集准确度", acc / len(test_set))

plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

# Recorded results (10 epochs):
# epoch: 0 loss= 379.17  ...  epoch: 9 loss= 34.20
# 测试集准确度 0.9125999808311462
x = torch.zeros((1, 1, 224, 224)) class bottleneck(nn.Module): def __init__(self, c_num, conv_skip = True, stride = 1): super().__init__() self.conv_layer = nn.Sequential( nn.Conv2d(c_num[0], c_num[1], kernel_size=1, padding=0, stride=1), nn.BatchNorm2d(c_num[1]), nn.ReLU(), nn.Conv2d(c_num[1], c_num[1], kernel_size=3, padding=1, stride=stride), nn.BatchNorm2d(c_num[1]), nn.ReLU(), nn.Conv2d(c_num[1], c_num[2], kernel_size=1, padding=0, stride=1), nn.BatchNorm2d(c_num[2]), nn.ReLU()) if(conv_skip): self.conv_skip = nn.Conv2d(c_num[0], c_num[2], kernel_size=1, padding=0, stride=stride) else: self.conv_skip = None def forward(self, x): y = self.conv_layer(x) if(self.conv_skip): out = y + self.conv_skip(x) else: out = y + x return out def bottle_block(block_num, c_num, b2 = False): block = [] for i in range(block_num): if(i == 0 and not b2): block.append(bottleneck(c_num, True, stride=2)) else: block.append(bottleneck([c_num[2], c_num[1], c_num[2]], False)) return nn.Sequential(*block) b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3), nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) b2 = bottle_block(3, [64, 64, 256]) b3 = bottle_block(4, [256, 128, 512]) b4 = bottle_block(6, [512, 256, 1024]) b5 = bottle_block(3, [1024, 512, 2048]) net = nn.Sequential(b1, b2, b3, b4, b5, nn.AdaptiveAvgPool2d((1,1)), nn.Flatten(), nn.Linear(2048, 10)) for layer in net: x = layer(x) print(layer.__class__.__name__, "\t输出的形状为:", x.shape)
Sequential 输出的形状为: torch.Size([1, 64, 56, 56])
Sequential 输出的形状为: torch.Size([1, 256, 28, 28])
Sequential 输出的形状为: torch.Size([1, 512, 14, 14])
Sequential 输出的形状为: torch.Size([1, 1024, 7, 7])
Sequential 输出的形状为: torch.Size([1, 2048, 4, 4])
AdaptiveAvgPool2d 输出的形状为: torch.Size([1, 2048, 1, 1])
Flatten 输出的形状为: torch.Size([1, 2048])
Linear 输出的形状为: torch.Size([1, 10])
先训练一下Densenet
epoch: 0 loss= 574.0405883789062 epoch: 1 loss= 304.017578125 epoch: 2 loss= 242.1274871826172 epoch: 3 loss= 208.28538513183594 epoch: 4 loss= 185.4270782470703 epoch: 5 loss= 170.85366821289062 epoch: 6 loss= 156.8195343017578 epoch: 7 loss= 142.7688446044922 epoch: 8 loss= 132.4578857421875 epoch: 9 loss= 120.2202377319336 epoch: 10 loss= 110.72298431396484 epoch: 11 loss= 103.44509887695312 epoch: 12 loss= 94.1830825805664 epoch: 13 loss= 88.94744110107422 epoch: 14 loss= 80.26952362060547 测试集准确度 0.877299964427948
将平均汇聚改为最大汇聚:
def conv_dense(in_channels, out_channels): return nn.Sequential( nn.BatchNorm2d(in_channels), nn.ReLU(), nn.Conv2d(in_channels, 128, kernel_size=1, padding=0, stride=1), nn.BatchNorm2d(128), nn.ReLU(), nn.Conv2d(128, out_channels, kernel_size=3, stride=1, padding=1)) class DesBlock(nn.Module): def __init__(self, nums, in_channels, out_channels): super().__init__() self.blocks =[] self.blocks = self.blocks for i in range(nums): self.blocks.append(conv_dense(i*out_channels + in_channels, out_channels)) self.net = nn.Sequential(*self.blocks) def forward(self, x): for block in self.net: y = block(x) x = torch.cat((x, y), dim=1) return x def transition(in_channels): out_channels = in_channels//2 return nn.Sequential( nn.BatchNorm2d(in_channels),nn.ReLU(), nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0, stride=1), nn.MaxPool2d(kernel_size=2, stride=2) ) epoch: 0 loss= 515.3784790039062 epoch: 1 loss= 282.50244140625 epoch: 2 loss= 232.67477416992188 epoch: 3 loss= 202.9937286376953 epoch: 4 loss= 178.63426208496094 epoch: 5 loss= 161.4530029296875 epoch: 6 loss= 147.14990234375 epoch: 7 loss= 129.9204864501953 epoch: 8 loss= 117.783203125 epoch: 9 loss= 106.5365219116211 epoch: 10 loss= 94.76472473144531 epoch: 11 loss= 87.81078338623047 epoch: 12 loss= 78.73601531982422 epoch: 13 loss= 69.39637756347656 epoch: 14 loss= 62.99326705932617 测试集准确度 0.9085999727249146
准确度更高。
重复使用了特征图,使得无需重复学习的参数。
是这样的,可能因为要保存许多的特征图,6G显存不太够,需要降低batchsize。
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Use the GPU when available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# FashionMNIST upscaled to 224x224 so the AlexNet layer sizes work.
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
])

train_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=True, download=True, transform=mnist_transform)
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=32, shuffle=True, num_workers=0)

test_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=False, download=True, transform=mnist_transform)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=32, shuffle=True, num_workers=0)

# AlexNet adapted to single-channel input; trailing comments track the
# spatial size of the feature map.
net = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=96, kernel_size=11, stride=4, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),                      # 54x54 -> 26x26
    nn.Conv2d(96, 256, kernel_size=5, padding=2), nn.ReLU(),    # 26x26
    nn.MaxPool2d(kernel_size=3, stride=2),                      # -> 12x12
    nn.Conv2d(256, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),                      # -> 5x5, 384*5*5 = 9600
    nn.Flatten(),
    nn.Linear(9600, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 10))


def init_weights(m):
    """Initialize every Linear layer with small Gaussian weights."""
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)


net.apply(init_weights)
net = net.to(device)  # move once, before training (was inside the batch loop)

optimizer = optim.SGD(net.parameters(), lr=0.12)
# CrossEntropyLoss takes unnormalized logits and applies log-softmax itself.
loss = nn.CrossEntropyLoss(reduction='mean')

epoch = 10
losses = []
for i in range(epoch):
    net.train()
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
        # .item() detaches the scalar; accumulating the tensor itself keeps
        # every batch's autograd graph alive and leaks GPU memory.
        loss_sum += loss_temp.item()
    losses.append(loss_sum / len(train_set))
    print("epoch: ", i, "loss=", loss_sum)

acc = 0
net.eval()  # disable Dropout during evaluation (was missing in the original)
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += (y_hat.argmax(dim=1) == y).sum().item()
print("测试集准确度", acc / len(test_set))

plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
'''
epoch: 0 loss= 1712.237548828125
epoch: 1 loss= 659.957275390625
epoch: 2 loss= 551.9481201171875
epoch: 3 loss= 484.9080810546875
epoch: 4 loss= 433.572265625
epoch: 5 loss= 393.9735107421875
epoch: 6 loss= 361.9253234863281
epoch: 7 loss= 331.5981140136719
epoch: 8 loss= 302.447265625
epoch: 9 loss= 276.08563232421875
测试集准确度 tensor(0.9043, device='cuda:0')
'''
比lenet效果更好,alexnet学习能力更强。
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Use the GPU when available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
])

train_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=True, download=True, transform=mnist_transform)
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=32, shuffle=True, num_workers=0)

test_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=False, download=True, transform=mnist_transform)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=32, shuffle=True, num_workers=0)

# Ablated AlexNet: two of the 384-channel convs and one of the 4096-unit
# FC layers are removed (kept as comments to show the experiment).
net = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=96, kernel_size=11, stride=4, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),                      # 54x54 -> 26x26
    nn.Conv2d(96, 256, kernel_size=5, padding=2), nn.ReLU(),    # 26x26
    nn.MaxPool2d(kernel_size=3, stride=2),                      # -> 12x12
    nn.Conv2d(256, 384, kernel_size=3, padding=1), nn.ReLU(),
    # nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
    # nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),                      # -> 5x5, 384*5*5 = 9600
    nn.Flatten(),
    nn.Linear(9600, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    # nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 10))


def init_weights(m):
    """Initialize every Linear layer with small Gaussian weights."""
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)


net.apply(init_weights)
net = net.to(device)  # move once, before training (was inside the batch loop)

optimizer = optim.SGD(net.parameters(), lr=0.12)
loss = nn.CrossEntropyLoss(reduction='mean')

epoch = 10
losses = []
for i in range(epoch):
    net.train()
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
        # .item() detaches the scalar so per-batch autograd graphs are freed.
        loss_sum += loss_temp.item()
    losses.append(loss_sum / len(train_set))
    print("epoch: ", i, "loss=", loss_sum)

acc = 0
net.eval()  # disable Dropout for evaluation (was missing in the original)
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += (y_hat.argmax(dim=1) == y).sum().item()
print("测试集准确度", acc / len(test_set))

plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

# Recorded results (10 epochs):
# epoch: 0 loss= 980.55  ...  epoch: 9 loss= 247.34
# 测试集准确度 0.9116999506950378
随着batchsize增大,GPU所使用的内存也增多,占用率随着增大。
图片数据和中间结果?
卷积层
VGG更深,卷积层更多,卷积是一件很贵的事情。
那么VGG的网络不能直接使用,需要据此修改一下,调整一下尺寸变化的过程。
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Fall back to CPU when no GPU is present (original hard-coded cuda:0).
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
])
train_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=64, shuffle=True, num_workers=0)
test_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=64, shuffle=True, num_workers=0)


def vgg_block(num_convs, in_channels, out_channels):
    """One VGG stage: num_convs 3x3 convs (stride 1, pad 1) + a halving max pool."""
    block = []
    for _ in range(num_convs):
        block.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
        block.append(nn.ReLU())
        in_channels = out_channels
    block.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*block)


# (num_convs, out_channels) per stage; channel counts divided by 4 to fit GPU memory.
con_arc_11 = [(1, 64 // 4), (1, 128 // 4), (2, 256 // 4), (2, 512 // 4), (2, 512 // 4)]  # vgg11
con_arc_16 = [(2, 64 // 4), (2, 128 // 4), (3, 256 // 4), (3, 512 // 4), (3, 512 // 4)]  # vgg16
con_arc_19 = [(2, 64 // 4), (2, 128 // 4), (4, 256 // 4), (4, 512 // 4), (4, 512 // 4)]  # vgg19


def vgg(con_arc, in_channels):
    """Assemble a VGG net: conv stages from con_arc plus a 3-layer FC head.

    6272 = (512 // 4) * 7 * 7: last-stage channels times the 7x7 map left from
    a 224x224 input after five halvings.
    """
    stages = []
    for num_convs, out_channels in con_arc:
        stages.append(vgg_block(num_convs, in_channels, out_channels))
        in_channels = out_channels
    return nn.Sequential(
        *stages, nn.Flatten(),
        nn.Linear(6272, 4096), nn.ReLU(), nn.Dropout(p=0.5),
        nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(p=0.5),
        nn.Linear(4096, 10))


net = vgg(con_arc_16, 1)

# Sanity probe: print every stage's output shape for a dummy input.
x = torch.zeros((1, 1, 224, 224))
for layer in net:
    x = layer(x)
    print(layer.__class__.__name__, "\t输出的格式为: ", x.shape)
print("vgg16的结构为:", net)

net = net.to(device)  # move the model once, not inside the batch loop
optimizer = optim.SGD(net.parameters(), lr=0.12)
loss = nn.CrossEntropyLoss(reduction='mean')

epoch = 10
losses = []
for i in range(epoch):
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        # .item() detaches; accumulating the tensor would retain the graph.
        loss_sum += loss_temp.item()
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
    losses.append(loss_sum / len(train_loader))  # mean per-batch loss
    print("epoch: ", i, "loss=", loss_sum)

acc = 0
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += torch.sum(y_hat.argmax(dim=1).type(y.dtype) == y)
print("测试集准确度", (acc / len(test_set)).item())

plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
'''
epoch: 0 loss= 2160.221923828125
epoch: 1 loss= 790.5088500976562
epoch: 2 loss= 310.9482116699219
epoch: 3 loss= 251.4301300048828
epoch: 4 loss= 213.0938262939453
epoch: 5 loss= 185.2352294921875
epoch: 6 loss= 162.23760986328125
epoch: 7 loss= 138.5640869140625
epoch: 8 loss= 118.95980072021484
epoch: 9 loss= 102.10594177246094
测试集准确度 0.9230999946594238
'''
依旧使用SGD,学习率调为0.12, epoch 15
epoch: 0 loss= 949.5301513671875
epoch: 1 loss= 541.0523071289062
epoch: 2 loss= 335.288818359375
epoch: 3 loss= 265.7752990722656
epoch: 4 loss= 236.0061492919922
epoch: 5 loss= 214.9103546142578
epoch: 6 loss= 200.35089111328125
epoch: 7 loss= 186.38710021972656
epoch: 8 loss= 173.8882293701172
epoch: 9 loss= 164.55499267578125
epoch: 10 loss= 157.58424377441406
epoch: 11 loss= 150.84255981445312
epoch: 12 loss= 145.72850036621094
epoch: 13 loss= 139.71893310546875
epoch: 14 loss= 134.76446533203125
测试集准确度 0.8855999708175659
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Fall back to CPU when no GPU is present (original hard-coded cuda:0).
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
])
train_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=128, shuffle=True, num_workers=0)
test_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=128, shuffle=True, num_workers=0)


def Nin_block(in_channels, out_channels, padding, stride, kernel_size):
    """NiN block: one spatial conv followed by two 1x1 convs (a per-pixel MLP)."""
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, padding=padding, stride=stride,
                  kernel_size=kernel_size), nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, kernel_size=1), nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, kernel_size=1), nn.ReLU())


def Nin():
    """NiN: four blocks; the last emits 10 channels (one per class) that are
    reduced to logits by global average pooling — no fully-connected head."""
    return nn.Sequential(
        Nin_block(1, 96, stride=4, kernel_size=11, padding=0),
        nn.MaxPool2d(kernel_size=3, stride=2),
        Nin_block(96, 256, stride=1, kernel_size=5, padding=2),
        nn.MaxPool2d(kernel_size=3, stride=2),
        Nin_block(256, 384, stride=1, kernel_size=3, padding=1),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Dropout(0.5),
        Nin_block(384, 10, stride=1, kernel_size=3, padding=1),
        nn.AdaptiveAvgPool2d((1, 1)),
        nn.Flatten())  # drop the trailing 1x1 spatial dims


net = Nin()


def init_weights(layer):
    """Xavier init matters here: NiN barely trains from the default init."""
    if isinstance(layer, (nn.Linear, nn.Conv2d)):
        nn.init.xavier_uniform_(layer.weight)


net.apply(init_weights)
print("Nin的结构为:", net)

net = net.to(device)  # move the model once, not inside the batch loop
optimizer = optim.SGD(net.parameters(), lr=0.1)
loss = nn.CrossEntropyLoss(reduction='mean')

epoch = 10
losses = []
for i in range(epoch):
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        # .item() detaches; accumulating the tensor would retain the graph.
        loss_sum += loss_temp.item()
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
    losses.append(loss_sum / len(train_loader))  # mean per-batch loss
    print("epoch: ", i, "loss=", loss_sum)

acc = 0
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += torch.sum(y_hat.argmax(dim=1).type(y.dtype) == y)
print("测试集准确度", (acc / len(test_set)).item())

plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
epoch: 0 loss= 993.8709716796875
epoch: 1 loss= 533.2142944335938
epoch: 2 loss= 361.1051330566406
epoch: 3 loss= 277.5993347167969
epoch: 4 loss= 232.45095825195312
epoch: 5 loss= 205.37686157226562
epoch: 6 loss= 187.60452270507812
epoch: 7 loss= 176.19105529785156
epoch: 8 loss= 165.3572540283203
epoch: 9 loss= 157.9745330810547
测试集准确度 0.8700000047683716
两个1×1卷积可以多次融合通道信息
换成一个:
Nin的结构为: Sequential( (0): Sequential( (0): Conv2d(1, 96, kernel_size=(11, 11), stride=(4, 4)) (1): ReLU() (2): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1)) (3): ReLU() ) (1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False) (2): Sequential( (0): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2)) (1): ReLU() (2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1)) (3): ReLU() ) (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False) (4): Sequential( (0): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): ReLU() (2): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1)) (3): ReLU() ) (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False) (6): Dropout(p=0.5, inplace=False) (7): Sequential( (0): Conv2d(384, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (1): ReLU() (2): Conv2d(10, 10, kernel_size=(1, 1), stride=(1, 1)) (3): ReLU() ) (8): AdaptiveAvgPool2d(output_size=(1, 1)) (9): Flatten(start_dim=1, end_dim=-1) ) epoch: 0 loss= 847.5736694335938 epoch: 1 loss= 356.9827575683594 epoch: 2 loss= 261.1847839355469 epoch: 3 loss= 224.17762756347656 epoch: 4 loss= 196.07040405273438 epoch: 5 loss= 180.1049041748047 epoch: 6 loss= 171.22512817382812 epoch: 7 loss= 160.30470275878906 epoch: 8 loss= 153.3402099609375 epoch: 9 loss= 147.3529052734375 测试集准确度 0.8526999950408936
换成一个效果略低,两个1×1卷积可以增加模型的非线性表达能力。
参数 96 ( 1 ∗ 11 ∗ 11 + 1 ) + ( 1 ∗ 1 ∗ 96 ∗ 96 ) ∗ 2 + 256 ∗ ( 96 ∗ 5 ∗ 5 + 1 ) + ( 1 ∗ 1 ∗ 256 ∗ 256 ) ∗ 2 + 384 ( 256 ∗ 3 ∗ 3 + 1 ) + ( 1 ∗ 1 ∗ 384 ∗ 384 ) ∗ 2 + 10 ( 384 ∗ 3 ∗ 3 + 1 ) + 2 ∗ ( 1 ∗ 1 ∗ 10 ∗ 10 ) = 1995284 96(1*11*11+1)+(1*1*96*96)*2+256*(96*5*5+1)+(1*1*256*256)*2+384(256*3*3+1)+(1*1*384*384)*2+10(384*3*3+1)+2*(1*1*10*10)=1995284 96(1∗11∗11+1)+(1∗1∗96∗96)∗2+256∗(96∗5∗5+1)+(1∗1∗256∗256)∗2+384(256∗3∗3+1)+(1∗1∗384∗384)∗2+10(384∗3∗3+1)+2∗(1∗1∗10∗10)=1995284
计算量: ( 54 ∗ 54 ∗ 96 ∗ 11 ∗ 11 ) + ( 54 ∗ 54 ∗ 96 ∗ 96 ∗ 96 ) ∗ 2 + ( 26 ∗ 26 ∗ 256 ∗ 5 ∗ 5 ) + ( 26 ∗ 26 ∗ 256 ∗ 256 ∗ 256 ) ∗ 2 + ( 12 ∗ 12 ∗ 384 ∗ 3 ∗ 3 ) + ( 12 ∗ 12 ∗ 384 ∗ 384 ∗ 384 ) + ( 10 ∗ 384 ∗ 5 ∗ 5 ∗ 3 ∗ 3 ) + ( 10 ∗ 5 ∗ 5 ∗ 10 ∗ 10 ) ∗ 2 = 2.469 ∗ 1 0 10 (54*54*96*11*11)+(54*54*96*96*96)*2+(26*26*256*5*5)+(26*26*256*256*256)*2+(12*12*384*3*3)+(12*12*384*384*384)+(10*384*5*5*3*3)+(10*5*5*10*10)*2 = 2.469*10^{10} (54∗54∗96∗11∗11)+(54∗54∗96∗96∗96)∗2+(26∗26∗256∗5∗5)+(26∗26∗256∗256∗256)∗2+(12∗12∗384∗3∗3)+(12∗12∗384∗384∗384)+(10∗384∗5∗5∗3∗3)+(10∗5∗5∗10∗10)∗2=2.469∗1010
信息损失过大?
训练结果
epoch: 0 loss= 1633.828857421875
epoch: 1 loss= 803.0449829101562
epoch: 2 loss= 605.2086181640625
epoch: 3 loss= 472.7867126464844
epoch: 4 loss= 409.02154541015625
epoch: 5 loss= 368.7477722167969
epoch: 6 loss= 339.0345458984375
epoch: 7 loss= 315.32861328125
epoch: 8 loss= 297.1580810546875
epoch: 9 loss= 287.3929748535156
测试集准确度 0.880899965763092
只有前面的block1缩小了图片的大小,则最小图片大小为5×5:经过7×7卷积变3×3,再经过3×3的池化变1×1。
NiN是采用1×1卷积替换全连接,VGG则是用多个小卷积核替代大的卷积核。(GoogLeNet也采用了1×1卷积先降低通道维数以减少计算参数的方式)
我认为是可以的,偏置最终也会被作为平均值的一部分然后被减掉。
使用 :1.0 准确度 85.7%
不使用: 0.3 73.3%
只留了最后两个batch-norm
lr:0.3 51.3% 测试集准确度大起大落不稳定
只留了前两个batch-norm
lr:0.3 80.9% 测试集准确度较为稳定,说明相较于加在后面还是应该加在前面,可以确保数值稳定性。
# LeNet variant: sigmoid activations, average pooling, and a Dropout inserted
# after the second conv. Expects 28x28 single-channel input, emits 10 logits.
_lenet_layers = [
    nn.Conv2d(1, 6, kernel_size=5),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),   # 24x24 -> 12x12
    nn.Conv2d(6, 16, kernel_size=5),
    nn.Dropout(0.5),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),   # 8x8 -> 4x4
    nn.Flatten(),                            # 16 * 4 * 4 = 256
    nn.Linear(256, 120),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    nn.Sigmoid(),
    nn.Linear(84, 10),
]
net = nn.Sequential(*_lenet_layers)
lr:0.3 测试准确度 60.6%
个人所见,尽管俩个网络的构建思路不同,Inception是为了使用不同的卷积核去提取不同的特征(同时使用少的参数),但是Resnet模块是为了保证更深的网络表达的函数囊括浅层的函数,使得更深的网络有意义。但是在架构上,可以将Resnet看作是一种特殊的Inception模块。
贴一下Resnet 18的训练结果
epoch: 0 loss= 274.33294677734375 epoch: 1 loss= 123.11670684814453 epoch: 2 loss= 98.39700317382812 epoch: 3 loss= 81.01783752441406 epoch: 4 loss= 65.5454330444336 epoch: 5 loss= 51.666439056396484 epoch: 6 loss= 37.66799545288086 epoch: 7 loss= 27.368988037109375 epoch: 8 loss= 18.34890365600586 epoch: 9 loss= 10.552614212036133 epoch: 10 loss= 9.15225601196289 epoch: 11 loss= 5.566713809967041 epoch: 12 loss= 2.6198325157165527 epoch: 13 loss= 0.49261292815208435 epoch: 14 loss= 0.2144828885793686 测试集准确度 0.9327999949455261
实现一下Resnet-34:
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Fall back to CPU when no GPU is present (original hard-coded cuda:0).
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
])
train_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=128, shuffle=True, num_workers=0)
test_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=128, shuffle=True, num_workers=0)


class ResBlock(nn.Module):
    """Basic residual block: two 3x3 convs with BN, skip connection, ReLU.

    When it opens a stage other than b2 (first_block and not b2) it halves
    H/W with stride 2 and matches channels via a 1x1 projection on the skip;
    otherwise the skip is the identity (requires in_channels == out_channels).
    """

    def __init__(self, in_channels, out_channels, b2=False, first_block=True):
        super().__init__()
        if first_block and not b2:
            stride = 2
            self.conv_one = nn.Conv2d(in_channels, out_channels,
                                      kernel_size=1, stride=stride, padding=0)
        else:
            stride = 1
            self.conv_one = None  # identity skip
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))
        # Explicit None test — relying on Module truthiness is fragile.
        skip = x if self.conv_one is None else self.conv_one(x)
        return F.relu(y + skip)


def ResBlocks(nums, b2, in_channels, out_channels):
    """Stack nums ResBlocks; only the first may change channels/resolution."""
    block = []
    for i in range(nums):
        if i == 0:
            block.append(ResBlock(in_channels, out_channels, b2, first_block=True))
        else:
            block.append(ResBlock(out_channels, out_channels, b2, first_block=False))
    return nn.Sequential(*block)


b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
                   nn.BatchNorm2d(64), nn.ReLU(),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
# ResNet-34 stage depths: 3, 4, 6, 3.
b2 = ResBlocks(3, True, 64, 64)
b3 = ResBlocks(4, False, 64, 128)
b4 = ResBlocks(6, False, 128, 256)
b5 = ResBlocks(3, False, 256, 512)
net = nn.Sequential(b1, b2, b3, b4, b5,
                    nn.AdaptiveAvgPool2d((1, 1)), nn.Flatten(), nn.Linear(512, 10))

# Sanity probe: print every stage's output shape for a dummy input.
x = torch.zeros((1, 1, 224, 224))
for layer in net:
    x = layer(x)
    print(layer.__class__.__name__, "\t输出的形状为:", x.shape)


def init_weights(layer):
    """Xavier init — random defaults train poorly for these nets."""
    if isinstance(layer, (nn.Linear, nn.Conv2d)):
        nn.init.xavier_uniform_(layer.weight)


net.apply(init_weights)
# Fixed label: this network is ResNet-34 (the original printed "Resnet18").
print("Resnet34的结构为:", net)

net = net.to(device)  # move the model once, not inside the batch loop
optimizer = optim.SGD(net.parameters(), lr=0.12)
loss = nn.CrossEntropyLoss(reduction='mean')

epoch = 10
losses = []
for i in range(epoch):
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        # .item() detaches; accumulating the tensor would retain the graph.
        loss_sum += loss_temp.item()
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
    losses.append(loss_sum / len(train_loader))  # mean per-batch loss
    print("epoch: ", i, "loss=", loss_sum)

acc = 0
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += torch.sum(y_hat.argmax(dim=1).type(y.dtype) == y)
print("测试集准确度", (acc / len(test_set)).item())

plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

# Original run:
# epoch: 0 loss= 379.17  epoch: 1 loss= 152.39  epoch: 2 loss= 121.78
# epoch: 3 loss= 103.70  epoch: 4 loss= 89.29   epoch: 5 loss= 75.79
# epoch: 6 loss= 62.75   epoch: 7 loss= 53.15   epoch: 8 loss= 42.18
# epoch: 9 loss= 34.20   测试集准确度 0.9126
x = torch.zeros((1, 1, 224, 224)) class bottleneck(nn.Module): def __init__(self, c_num, conv_skip = True, stride = 1): super().__init__() self.conv_layer = nn.Sequential( nn.Conv2d(c_num[0], c_num[1], kernel_size=1, padding=0, stride=1), nn.BatchNorm2d(c_num[1]), nn.ReLU(), nn.Conv2d(c_num[1], c_num[1], kernel_size=3, padding=1, stride=stride), nn.BatchNorm2d(c_num[1]), nn.ReLU(), nn.Conv2d(c_num[1], c_num[2], kernel_size=1, padding=0, stride=1), nn.BatchNorm2d(c_num[2]), nn.ReLU()) if(conv_skip): self.conv_skip = nn.Conv2d(c_num[0], c_num[2], kernel_size=1, padding=0, stride=stride) else: self.conv_skip = None def forward(self, x): y = self.conv_layer(x) if(self.conv_skip): out = y + self.conv_skip(x) else: out = y + x return out def bottle_block(block_num, c_num, b2 = False): block = [] for i in range(block_num): if(i == 0 and not b2): block.append(bottleneck(c_num, True, stride=2)) else: block.append(bottleneck([c_num[2], c_num[1], c_num[2]], False)) return nn.Sequential(*block) b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3), nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) b2 = bottle_block(3, [64, 64, 256]) b3 = bottle_block(4, [256, 128, 512]) b4 = bottle_block(6, [512, 256, 1024]) b5 = bottle_block(3, [1024, 512, 2048]) net = nn.Sequential(b1, b2, b3, b4, b5, nn.AdaptiveAvgPool2d((1,1)), nn.Flatten(), nn.Linear(2048, 10)) for layer in net: x = layer(x) print(layer.__class__.__name__, "\t输出的形状为:", x.shape)
Sequential 输出的形状为: torch.Size([1, 64, 56, 56])
Sequential 输出的形状为: torch.Size([1, 256, 28, 28])
Sequential 输出的形状为: torch.Size([1, 512, 14, 14])
Sequential 输出的形状为: torch.Size([1, 1024, 7, 7])
Sequential 输出的形状为: torch.Size([1, 2048, 4, 4])
AdaptiveAvgPool2d 输出的形状为: torch.Size([1, 2048, 1, 1])
Flatten 输出的形状为: torch.Size([1, 2048])
Linear 输出的形状为: torch.Size([1, 10])
先训练一下Densenet
epoch: 0 loss= 574.0405883789062 epoch: 1 loss= 304.017578125 epoch: 2 loss= 242.1274871826172 epoch: 3 loss= 208.28538513183594 epoch: 4 loss= 185.4270782470703 epoch: 5 loss= 170.85366821289062 epoch: 6 loss= 156.8195343017578 epoch: 7 loss= 142.7688446044922 epoch: 8 loss= 132.4578857421875 epoch: 9 loss= 120.2202377319336 epoch: 10 loss= 110.72298431396484 epoch: 11 loss= 103.44509887695312 epoch: 12 loss= 94.1830825805664 epoch: 13 loss= 88.94744110107422 epoch: 14 loss= 80.26952362060547 测试集准确度 0.877299964427948
将平均汇聚改为最大汇聚:
def conv_dense(in_channels, out_channels): return nn.Sequential( nn.BatchNorm2d(in_channels), nn.ReLU(), nn.Conv2d(in_channels, 128, kernel_size=1, padding=0, stride=1), nn.BatchNorm2d(128), nn.ReLU(), nn.Conv2d(128, out_channels, kernel_size=3, stride=1, padding=1)) class DesBlock(nn.Module): def __init__(self, nums, in_channels, out_channels): super().__init__() self.blocks =[] self.blocks = self.blocks for i in range(nums): self.blocks.append(conv_dense(i*out_channels + in_channels, out_channels)) self.net = nn.Sequential(*self.blocks) def forward(self, x): for block in self.net: y = block(x) x = torch.cat((x, y), dim=1) return x def transition(in_channels): out_channels = in_channels//2 return nn.Sequential( nn.BatchNorm2d(in_channels),nn.ReLU(), nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0, stride=1), nn.MaxPool2d(kernel_size=2, stride=2) ) epoch: 0 loss= 515.3784790039062 epoch: 1 loss= 282.50244140625 epoch: 2 loss= 232.67477416992188 epoch: 3 loss= 202.9937286376953 epoch: 4 loss= 178.63426208496094 epoch: 5 loss= 161.4530029296875 epoch: 6 loss= 147.14990234375 epoch: 7 loss= 129.9204864501953 epoch: 8 loss= 117.783203125 epoch: 9 loss= 106.5365219116211 epoch: 10 loss= 94.76472473144531 epoch: 11 loss= 87.81078338623047 epoch: 12 loss= 78.73601531982422 epoch: 13 loss= 69.39637756347656 epoch: 14 loss= 62.99326705932617 测试集准确度 0.9085999727249146
准确度更高。
重复使用了特征图,使得无需用额外的参数重复学习相同的特征。
是这样的,可能因为要保存许多的特征图,6G显存不太够,需要降低batchsize。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。