当前位置:   article > 正文

基于Pytorch的DDP训练Mnist数据集

基于Pytorch的DDP训练Mnist数据集

        在前几期的博文《Pytorch分布式训练:DDP》中,我们讲了 PyTorch 的 DDP,但当时的 demo 使用的是自制的虚拟数据集。这期文章我们改用 MNIST 数据集进行测试,并进一步完善代码。

快速开始

        1.  我们修改一下 main 函数,在 main 函数中导入 MNIST 数据。测试集相关的代码我注释掉了,需要的话可以取消注释。

  def main(rank, world_size, max_epochs, batch_size):
      """Entry point of one DDP worker: train ``Net`` on the MNIST training split.

      Args:
          rank: Index of this worker; also passed to ``Trainer`` as the GPU id.
          world_size: Total number of worker processes.
          max_epochs: Number of epochs to train for.
          batch_size: Batch size of the per-process ``DataLoader``.
      """
      ddp_setup(rank, world_size)
      try:
          # Let rank 0 download the dataset first; the other ranks wait at the
          # barrier so that several processes never write the same files at once.
          if rank != 0:
              torch.distributed.barrier()
          train_dataset = datasets.MNIST(root="./MNIST", train=True, transform=data_tf, download=True)
          if rank == 0:
              torch.distributed.barrier()

          # DistributedSampler hands each process its own shard of the data, so
          # the DataLoader itself must not shuffle.
          train_dataloader = DataLoader(train_dataset,
                                        batch_size=batch_size,
                                        shuffle=False,
                                        sampler=DistributedSampler(train_dataset))
          model = Net()
          # optimzer = torch.optim.Adam(model.parameters(), lr=1e-3)
          optimzer = torch.optim.SGD(model.parameters(), lr=1e-2)
          trainer = Trainer(model=model, gpu_id=rank, optimizer=optimzer, train_dataloader=train_dataloader)
          trainer.train(max_epochs)
      finally:
          # Always tear the process group down, even when training raised.
          destroy_process_group()

      # Optional evaluation on the test split (disabled); uncomment to enable.
      # test_dataset = datasets.MNIST(root="./MNIST", train=False, transform=data_tf, download=True)
      # test_dataloader = DataLoader(test_dataset,
      #                              batch_size=32,
      #                              shuffle=False)
      # evaluation(model=model, test_dataloader=test_dataloader)

        2.  修改模型结构,非常简单的一个网络

  class Net(nn.Module):
      """Small CNN classifier: two conv/pool stages followed by two linear layers.

      The flattening step in ``forward`` expects 64 * 5 * 5 features per sample
      after the second pooling stage; the final layer produces 10 outputs.
      """

      def __init__(self):
          super().__init__()
          self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3)
          self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
          self.conv2_drop = nn.Dropout2d()
          self.fc1 = nn.Linear(in_features=64 * 5 * 5, out_features=500)
          self.fc2 = nn.Linear(in_features=500, out_features=10)

      def forward(self, x):
          """Return the raw (un-normalised) class scores for the batch ``x``."""
          # Stage 1: conv -> 2x2 max-pool -> ReLU.
          stage1 = F.relu(F.max_pool2d(self.conv1(x), kernel_size=2))
          # Stage 2: conv -> channel dropout -> 2x2 max-pool -> ReLU.
          dropped = self.conv2_drop(self.conv2(stage1))
          stage2 = F.relu(F.max_pool2d(dropped, kernel_size=2))
          # Flatten to (N, 64 * 5 * 5) for the fully connected head.
          flat = stage2.view(-1, self.fc1.in_features)
          hidden = F.relu(self.fc1(flat))
          # Functional dropout is only active while the module is in training mode.
          hidden = F.dropout(hidden, training=self.training)
          return self.fc2(hidden)

        3. 完整代码如下,其中增加了计算准确率的功能。这部分代码其实可以单独封装成一个函数,我这里偷懒没有做……

  1. """
  2. pytorch分布式训练结构
  3. """
  4. from time import time
  5. import os
  6. import torch.nn as nn
  7. import torch
  8. import torch.nn.functional as F
  9. from torch.utils.data import DataLoader
  10. from torchvision import datasets, transforms
  11. # 多gpu训练所需的包
  12. import torch.multiprocessing as mp
  13. from torch.utils.data.distributed import DistributedSampler
  14. from torch.nn.parallel import DistributedDataParallel as DDP
  15. from torch.distributed import init_process_group, destroy_process_group
  def ddp_setup(rank, world_size):
      """Initialise the process group for one worker and select its GPU.

      Called once in every spawned process.

      Args:
          rank: Index of this process; also used as the CUDA device index.
          world_size: Total number of processes in the group.
      """
      # Rendezvous address and port read by init_process_group.
      os.environ.update({"MASTER_ADDR": "localhost", "MASTER_PORT": "12355"})
      # init_process_group(backend="nccl", rank=rank, world_size=world_size)
      init_process_group(world_size=world_size, rank=rank, backend="gloo")
      torch.cuda.set_device(rank)
  # Preprocessing applied to every image: convert to a tensor, then normalise
  # the single channel with mean 0.5 and standard deviation 0.5.
  data_tf = transforms.Compose([
      transforms.ToTensor(),
      transforms.Normalize(mean=[0.5], std=[0.5]),
  ])
  class Net(nn.Module):
      """Small CNN classifier: two conv/pool stages followed by two linear layers.

      The flattening step in ``forward`` expects 64 * 5 * 5 features per sample
      after the second pooling stage; the final layer produces 10 outputs.
      """

      def __init__(self):
          super().__init__()
          self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3)
          self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
          self.conv2_drop = nn.Dropout2d()
          self.fc1 = nn.Linear(in_features=64 * 5 * 5, out_features=500)
          self.fc2 = nn.Linear(in_features=500, out_features=10)

      def forward(self, x):
          """Return the raw (un-normalised) class scores for the batch ``x``."""
          # Stage 1: conv -> 2x2 max-pool -> ReLU.
          stage1 = F.relu(F.max_pool2d(self.conv1(x), kernel_size=2))
          # Stage 2: conv -> channel dropout -> 2x2 max-pool -> ReLU.
          dropped = self.conv2_drop(self.conv2(stage1))
          stage2 = F.relu(F.max_pool2d(dropped, kernel_size=2))
          # Flatten to (N, 64 * 5 * 5) for the fully connected head.
          flat = stage2.view(-1, self.fc1.in_features)
          hidden = F.relu(self.fc1(flat))
          # Functional dropout is only active while the module is in training mode.
          hidden = F.dropout(hidden, training=self.training)
          return self.fc2(hidden)
  class Trainer:
      """Run the DDP training loop for one process and report training accuracy.

      Args:
          model: Module to train; it is moved to ``gpu_id`` and wrapped in DDP.
          train_dataloader: Iterable of ``(inputs, labels)`` batches. If its
              sampler provides ``set_epoch`` it is called once per epoch.
          optimizer: Optimizer built on ``model``'s parameters.
          gpu_id: Device the model and every batch are moved to.
      """

      def __init__(self, model, train_dataloader, optimizer, gpu_id):
          self.gpu_id = gpu_id
          self.train_dataloader = train_dataloader
          self.optimizer = optimizer
          # nn.Module.to() modifies the module in place and returns it.
          self.model = DDP(model.to(gpu_id), device_ids=[gpu_id])
          self.criterion = torch.nn.CrossEntropyLoss()

      def _run_batch(self, xs, ys):
          """Do one optimisation step; return ``(batch size, number correct)``."""
          self.optimizer.zero_grad()
          output = self.model(xs)
          loss = self.criterion(output, ys)
          loss.backward()
          self.optimizer.step()
          predicted = output.argmax(dim=1)
          return ys.size(0), (predicted == ys).sum()

      def _run_epoch(self, epoch):
          """Train on every batch once and print this process's accuracy."""
          # Make sure dropout is active, whatever mode the model was left in.
          self.model.train()
          # Re-seed the sampler so the data is shuffled differently each epoch.
          sampler = getattr(self.train_dataloader, "sampler", None)
          if hasattr(sampler, "set_epoch"):
              sampler.set_epoch(epoch)
          sample_nums = 0
          train_correct = 0
          for xs, ys in self.train_dataloader:
              xs = xs.to(self.gpu_id)
              ys = ys.to(self.gpu_id)
              sample_num, correct = self._run_batch(xs, ys)
              sample_nums += sample_num
              train_correct += correct
          # int() accepts both the initial 0 and the accumulated 0-dim tensor,
          # so an empty dataloader no longer crashes here.
          accuracy = int(train_correct) / sample_nums * 100 if sample_nums else 0.0
          print(f"train_acc: {accuracy:.3f}")

      def _save_checkpoint(self, epoch):
          """Save the unwrapped model's state dict to ./params/checkpoint_<epoch>.pt."""
          ckp = self.model.module.state_dict()
          # torch.save does not create missing directories.
          os.makedirs("./params", exist_ok=True)
          PATH = f"./params/checkpoint_{epoch}.pt"
          torch.save(ckp, PATH)

      def train(self, max_epoch: int):
          """Run ``max_epoch`` training epochs."""
          for epoch in range(max_epoch):
              self._run_epoch(epoch)
              # Checkpointing is disabled; uncomment to save from GPU 0 only.
              # if self.gpu_id == 0:
              #     self._save_checkpoint(epoch)
  def evaluation(model, test_dataloader, device="cuda:0"):
      """Print the classification accuracy of ``model`` on ``test_dataloader``.

      Args:
          model: Module returning one score per class for each input sample.
          test_dataloader: Iterable of ``(inputs, labels)`` batches.
          device: Device the model and every batch are moved to.
      """
      model.eval()
      model.to(device)
      sample_nums = 0
      correct = 0
      # Inference only: skip building the autograd graph.
      with torch.no_grad():
          for xs, ys in test_dataloader:
              xs = xs.to(device)
              ys = ys.to(device)
              predicted = model(xs).argmax(dim=1)
              sample_nums += ys.size(0)
              correct += (predicted == ys).sum()
      # int() accepts both the initial 0 and the accumulated 0-dim tensor,
      # so an empty dataloader reports 0 instead of crashing.
      accuracy = int(correct) / sample_nums * 100 if sample_nums else 0.0
      print(f"test_acc: {accuracy:.3f}")
  def main(rank, world_size, max_epochs, batch_size):
      """Entry point of one DDP worker: train ``Net`` on the MNIST training split.

      Args:
          rank: Index of this worker; also passed to ``Trainer`` as the GPU id.
          world_size: Total number of worker processes.
          max_epochs: Number of epochs to train for.
          batch_size: Batch size of the per-process ``DataLoader``.
      """
      ddp_setup(rank, world_size)
      try:
          # Let rank 0 download the dataset first; the other ranks wait at the
          # barrier so that several processes never write the same files at once.
          if rank != 0:
              torch.distributed.barrier()
          train_dataset = datasets.MNIST(root="./MNIST", train=True, transform=data_tf, download=True)
          if rank == 0:
              torch.distributed.barrier()

          # DistributedSampler hands each process its own shard of the data, so
          # the DataLoader itself must not shuffle.
          train_dataloader = DataLoader(train_dataset,
                                        batch_size=batch_size,
                                        shuffle=False,
                                        sampler=DistributedSampler(train_dataset))
          model = Net()
          # optimzer = torch.optim.Adam(model.parameters(), lr=1e-3)
          optimzer = torch.optim.SGD(model.parameters(), lr=1e-2)
          trainer = Trainer(model=model, gpu_id=rank, optimizer=optimzer, train_dataloader=train_dataloader)
          trainer.train(max_epochs)
      finally:
          # Always tear the process group down, even when training raised.
          destroy_process_group()

      # Optional evaluation on the test split (disabled); uncomment to enable.
      # test_dataset = datasets.MNIST(root="./MNIST", train=False, transform=data_tf, download=True)
      # test_dataloader = DataLoader(test_dataset,
      #                              batch_size=32,
      #                              shuffle=False)
      # evaluation(model=model, test_dataloader=test_dataloader)
  if __name__ == "__main__":
      # Script entry point: spawn one training process per visible GPU and
      # print the total wall-clock time in seconds.
      start_time = time()
      max_epochs = 50
      batch_size = 128
      world_size = torch.cuda.device_count()
      if world_size == 0:
          # With no GPU there is no worker to spawn; fail with a clear message
          # instead of passing nprocs=0 to mp.spawn.
          raise SystemExit("No CUDA device found: DDP training needs at least one GPU.")
      # mp.spawn passes the process index as the first argument of main (rank).
      mp.spawn(main, args=(world_size, max_epochs, batch_size), nprocs=world_size)
      print(time() - start_time)

训练测试

        我简单的测试了一下单卡和多卡的GPU性能(一张3090、一张3090ti),表格如下:

        在数据量较小的前提下双卡对单卡优势不明显,加大epoch才能看出明显差距。

结尾

如果不出意外DDP的内容已经结束了,后续发现什么好玩的继续发出来
如果觉得文章对你有用请点赞、关注  ->> 你的点赞对我太有用了
群内交流更多技术
130856474  <--  在这里

 

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/article/detail/47386
推荐阅读
相关标签
  

闽ICP备14008679号