%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.pylab as plab
from PIL import Image, ImageDraw
import numpy as np
import pandas as pd
import os
import copy
import collections
from sklearn.model_selection import StratifiedShuffleSplit
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split, Subset
import torchvision.transforms as transforms
from torchvision import models,utils, datasets
import torch.nn.functional as F
from torch import optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchsummary import summary
# Compute device: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# Worker-process count intended for the DataLoaders (num_workers):
# 0 on Windows (os.name == 'nt'), 4 on every other platform.
workers = 4 if os.name != 'nt' else 0
# Display an image tensor
def img_show(img, y=None, color=True, title='default'):
    """Draw a channel-first image tensor in its own matplotlib figure.

    Args:
        img: Tensor laid out as (channels, height, width). It is converted
            with ``img.numpy()``, so it has to be a CPU tensor.
        y: Optional label(s). When given, the figure title becomes
            ``'<title> label: <y>'``; otherwise no title is set.
        color: Unused; kept so existing callers keep working.
        title: Prefix of the figure title (only used when ``y`` is given).

    Returns:
        Always ``True``.
    """
    npimg = img.numpy()
    # matplotlib wants (H, W, C); the tensor is (C, H, W).
    npimg_tr = np.transpose(npimg, (1, 2, 0))
    # A fresh figure per call so consecutive calls do not draw over each other.
    plt.figure()
    # The transposed array used to be computed and then dropped; draw it.
    plt.imshow(npimg_tr)
    if y is not None:
        plt.title(f'{title} label: {str(y)}')
    plt.show()
    return True
# Tile a few random samples into one grid image (torchvision.utils.make_grid)
def make_grid_image(ori_ds, grid_size=4):
    """Pick ``grid_size`` random samples and tile their images in one row.

    Args:
        ori_ds: Indexable dataset whose items hold the image tensor at
            position 0 and the label at position 1.
        grid_size: Number of samples to draw. Indices come from
            ``np.random.randint``, i.e. they are drawn with replacement, so
            the same sample can appear more than once.

    Returns:
        ``(x_grid, y_grid)``: the tiled image produced by
        ``utils.make_grid`` and the list of labels in the same order.
    """
    rnd_inds = np.random.randint(0, len(ori_ds), grid_size)
    print("image indices:", rnd_inds)
    # Fetch every sample exactly once: indexing the dataset runs its
    # transform, so fetching image and label separately doubled the work.
    samples = [ori_ds[i] for i in rnd_inds]
    x_grid = [sample[0] for sample in samples]
    y_grid = [sample[1] for sample in samples]
    x_grid = utils.make_grid(x_grid, nrow=grid_size, padding=2)
    return x_grid, y_grid
# Compute the per-channel mean and standard deviation of a dataset
def caculate_channels_mean_std(cal_ds):
    """Return per-channel mean and std statistics of ``cal_ds``.

    For every image the mean and the standard deviation are taken over the
    two spatial axes, and those per-image values are then averaged over the
    dataset. The reported std is therefore the average of the per-image
    stds, not the std of all pixels pooled together.

    Args:
        cal_ds: Iterable of ``(image_tensor, label)`` pairs where each image
            is a channel-first tensor supporting ``.numpy()``.

    Returns:
        ``(mean_rgb, std_rgb)``: two lists with one value per channel.

    Raises:
        ValueError: If ``cal_ds`` yields no samples.
    """
    per_image_mean = []
    per_image_std = []
    # Single pass over the dataset passed in (the function used to read the
    # global ``train_ds`` and to iterate it twice).
    for x, _ in cal_ds:
        arr = x.numpy()
        per_image_mean.append(np.mean(arr, axis=(1, 2)))
        per_image_std.append(np.std(arr, axis=(1, 2)))
    if not per_image_mean:
        raise ValueError("cal_ds is empty; cannot compute channel statistics")
    # Channel count is taken from the data instead of being fixed to 3.
    n_channels = len(per_image_mean[0])
    mean_rgb = [np.mean([m[c] for m in per_image_mean]) for c in range(n_channels)]
    std_rgb = [np.mean([s[c] for s in per_image_std]) for c in range(n_channels)]
    print(f'each channel mean : {mean_rgb}')
    print(f'each channel std : {std_rgb}')
    return mean_rgb, std_rgb
# Read the current learning rate from an optimizer
def get_lr(opt):
    """Return the ``'lr'`` value of the optimizer's first parameter group.

    Returns ``None`` when ``opt.param_groups`` is empty.
    """
    first_group = next(iter(opt.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']
# Helper functions used by the training loop
# Number of correct predictions in a batch (compared with the ground truth)
def metrics_batch(output, target):
    """Count how many rows of ``output`` predict the class in ``target``.

    The predicted class of a row is the index of its largest value along
    dimension 1. ``target`` is reshaped to the prediction's shape before
    the comparison.

    Returns:
        The number of matching predictions as a Python number.
    """
    predicted = torch.argmax(output, dim=1, keepdim=True)
    hits = torch.eq(predicted, target.view_as(predicted))
    return hits.sum().item()
# Loss (and optional optimizer step) for a single batch
def loss_batch(loss_func, output, target, opt=None):
    """Compute loss and hit count for one batch; train when ``opt`` is given.

    Args:
        loss_func: Callable ``loss_func(output, target)`` returning a scalar
            tensor.
        output: Model output for the batch.
        target: Ground-truth labels for the batch.
        opt: Optional optimizer. When provided, gradients are reset, the
            loss is back-propagated and one optimizer step is taken. When
            ``None`` the batch is only evaluated.

    Returns:
        ``(loss_value, corrects)``: the loss as a Python float and the
        number of correct predictions from ``metrics_batch``.
    """
    loss = loss_func(output, target)
    # Count correct predictions before the weights change.
    metric_b = metrics_batch(output, target)
    if opt is not None:
        # The update step was missing, so the model never learned anything.
        opt.zero_grad()
        loss.backward()
        opt.step()
    return loss.item(), metric_b
# Loss and accuracy over one epoch
def loss_epoch(model, loss_func, dataset_dl, sanity_check=False, opt=None):
    """Run ``model`` over every batch of ``dataset_dl`` once.

    Args:
        model: Callable model; it is invoked as ``model(xb)``.
        loss_func: Loss callable forwarded to ``loss_batch``.
        dataset_dl: Data loader yielding ``(xb, yb)`` batches and exposing
            ``.dataset`` (its length is used as the normalizer).
        sanity_check: When ``True`` only the first batch is processed. The
            totals are still divided by the full dataset size, so the
            returned values are only useful as a smoke test.
        opt: Optional optimizer forwarded to ``loss_batch``; when given the
            model is updated after every batch.

    Returns:
        ``(loss, metric)``: summed batch losses and summed correct
        predictions, each divided by ``len(dataset_dl.dataset)``.
    """
    running_loss = 0.0
    running_metric = 0.0
    len_data = len(dataset_dl.dataset)
    for xb, yb in dataset_dl:
        # Move the batch to the module-level ``device``.
        xb = xb.to(device)
        yb = yb.to(device)
        # Forward pass (this call was missing, leaving ``output`` undefined).
        output = model(xb)
        loss_b, metric_b = loss_batch(loss_func, output, yb, opt)
        running_loss += loss_b
        if metric_b is not None:
            running_metric += metric_b
        # In sanity-check mode stop after the first batch.
        if sanity_check is True:
            break
    loss = running_loss / float(len_data)
    metric = running_metric / float(len_data)
    return loss, metric
# Plot loss and accuracy curves
def show_loss_acc(num_epochs, loss_hist, metric_hist):
    """Plot the training/validation loss and accuracy, one figure each.

    Args:
        num_epochs: Number of epochs; the x axis runs from 1 to this value.
        loss_hist: Dict with ``"train"`` and ``"val"`` sequences holding one
            loss value per epoch.
        metric_hist: Dict with ``"train"`` and ``"val"`` sequences holding
            one accuracy value per epoch.

    Each sequence must contain exactly ``num_epochs`` entries, otherwise
    matplotlib rejects the mismatching x/y lengths.
    """
    epochs = range(1, num_epochs + 1)
    panels = (
        ("Train-Val Loss", "Loss", loss_hist),
        ("Train-Val Accuracy", "Accuracy", metric_hist),
    )
    for title, ylabel, hist in panels:
        # Separate figures: previously only titles/labels were set on one
        # shared axes and no curve was ever drawn.
        plt.figure()
        plt.title(title)
        plt.plot(epochs, hist["train"], label="train")
        plt.plot(epochs, hist["val"], label="val")
        plt.ylabel(ylabel)
        plt.xlabel("Training Epochs")
        plt.legend()
        plt.show()
# Instantiate the datasets
# Directory where the data is stored
path2data = "./data/multi_class/"
# Create the directory (including parents) when it is missing. The former
# existence check had no body, so nothing was ever created.
os.makedirs(path2data, exist_ok=True)
# Initial transform: only convert the images to tensors
data_transformer = transforms.Compose([transforms.ToTensor()])
# Load STL10 (downloaded on first use)
train_ds = datasets.STL10(path2data, split='train', download=True, transform=data_transformer)
test0_ds = datasets.STL10(path2data, split='test', download=True, transform=data_transformer)
# Inspect the data shapes
print(train_ds.data.shape)
print(test0_ds.data.shape)
# Inspect the number of samples per class
y_train = [y for _, y in train_ds]
counter_train = collections.Counter(y_train)
print(counter_train)
print(train_ds.classes)
# Split the original test set into a test part (80%) and a validation part
# (20%), stratified by label; random_state=0 keeps the split reproducible.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
indices = list(range(len(test0_ds)))
y_test0 = [y for _, y in test0_ds]
# n_splits=1, so this loop body runs once and leaves the single split in
# test_index / val_index.
for test_index, val_index in sss.split(indices, y_test0):
    print("test:", test_index, "\nval:", val_index)
# Both subsets are views on test0_ds and therefore share its transform.
val_ds = Subset(test0_ds, val_index)
test_ds = Subset(test0_ds, test_index)
print(len(val_ds), len(test_ds))
Files already downloaded and verified
Files already downloaded and verified
(5000, 3, 96, 96)
(8000, 3, 96, 96)
Counter({1: 500, 5: 500, 6: 500, 3: 500, 9: 500, 7: 500, 4: 500, 8: 500, 0: 500, 2: 500})
['airplane', 'bird', 'car', 'cat', 'deer', 'dog', 'horse', 'monkey', 'ship', 'truck']
test: [2096 4321 2767 ... 3206 3910 2902]
val: [6332 6852 1532 ... 5766 4469 1011]
1600 6400
# Show a few random training and validation samples
train_grid, train_y_grid = make_grid_image(ori_ds=train_ds)
img_show(img=train_grid, y=train_y_grid, title='Train')
val_grid, val_y_grid = make_grid_image(ori_ds=val_ds)
img_show(img=val_grid, y=val_y_grid, title='Val')
train grid
val grid
定义转换器进行数据处理 transforms
# Build the final dataset transforms
# Channel statistics of the training data, used for normalization below
mean_rgb, std_rgb = caculate_channels_mean_std(train_ds)
# Assigning a new transform replaces the ToTensor() the datasets were
# created with, so it has to be listed again ahead of Normalize.
train_transformer = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean_rgb, std_rgb),
])
# Evaluation data is normalized with the training statistics as well.
test0_transformer = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean_rgb, std_rgb),
])
# Swap in the new transforms; val_ds and test_ds are Subsets of test0_ds,
# so they pick up test0_transformer automatically.
train_ds.transform = train_transformer
test0_ds.transform = test0_transformer
# Look at some samples after the new transforms
train_grid, train_y_grid = make_grid_image(train_ds)
img_show(train_grid, train_y_grid, title='Train after trans')
val_grid, val_y_grid = make_grid_image(val_ds)
img_show(val_grid, val_y_grid, title='Val after trans')
each channel mean : [0.4467106, 0.43980986, 0.40664646]
each channel std : [0.22414584, 0.22148906, 0.22389975]
image indices: [2732 2607 1653 3264]
torch.Size([3, 100, 394])
image indices: [ 835 763 1383 1033]
torch.Size([3, 100, 394])
train after trans.png
val after trans.png
实例化dataloader 及加载预训练模型(resnet18)
# Instantiate the data loaders; ``workers`` is the platform-dependent
# worker count defined at the top of the file.
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True, num_workers=workers)
val_dl = DataLoader(val_ds, batch_size=64, shuffle=False, num_workers=workers)
# Load a ResNet-18 with pretrained weights (pretrained=True)
model_resnet18 = models.resnet18(pretrained=True)
# Print the model
# print(model_resnet18)
# Replace the classification head: 1000 outputs --> 10
num_classes = 10
num_ftrs = model_resnet18.fc.in_features
model_resnet18.fc = nn.Linear(num_ftrs, num_classes)
# loss_epoch moves every batch to ``device``; the model has to live there
# too or the forward pass fails on GPU machines.
model_resnet18 = model_resnet18.to(device)
# Show the layer-by-layer summary, running the probe input on the same device
summary(model_resnet18, input_size=(3, 224, 224), device=device)
Layer (type) Output Shape Param #
Conv2d-1 [-1, 64, 112, 112] 9,408
BatchNorm2d-2 [-1, 64, 112, 112] 128
ReLU-3 [-1, 64, 112, 112] 0
MaxPool2d-4 [-1, 64, 56, 56] 0
Conv2d-5 [-1, 64, 56, 56] 36,864
BatchNorm2d-6 [-1, 64, 56, 56] 128
ReLU-7 [-1, 64, 56, 56] 0
Conv2d-8 [-1, 64, 56, 56] 36,864
BatchNorm2d-9 [-1, 64, 56, 56] 128
ReLU-10 [-1, 64, 56, 56] 0
BasicBlock-11 [-1, 64, 56, 56] 0
Conv2d-12 [-1, 64, 56, 56] 36,864
BatchNorm2d-13 [-1, 64, 56, 56] 128
ReLU-14 [-1, 64, 56, 56] 0
Conv2d-15 [-1, 64, 56, 56] 36,864
BatchNorm2d-16 [-1, 64, 56, 56] 128
ReLU-17 [-1, 64, 56, 56] 0
BasicBlock-18 [-1, 64, 56, 56] 0
Conv2d-19 [-1, 128, 28, 28] 73,728
BatchNorm2d-20 [-1, 128, 28, 28] 256
ReLU-21 [-1, 128, 28, 28] 0
Conv2d-22 [-1, 128, 28, 28] 147,456
BatchNorm2d-23 [-1, 128, 28, 28] 256
Conv2d-24 [-1, 128, 28, 28] 8,192
BatchNorm2d-25 [-1, 128, 28, 28] 256
ReLU-26 [-1, 128, 28, 28] 0
BasicBlock-27 [-1, 128, 28, 28] 0
Conv2d-28 [-1, 128, 28, 28] 147,456
BatchNorm2d-29 [-1, 128, 28, 28] 256
ReLU-30 [-1, 128, 28, 28] 0
Conv2d-31 [-1, 128, 28, 28] 147,456
BatchNorm2d-32 [-1, 128, 28, 28] 256
ReLU-33 [-1, 128, 28, 28] 0
BasicBlock-34 [-1, 128, 28, 28] 0
Conv2d-35 [-1, 256, 14, 14] 294,912
BatchNorm2d-36 [-1, 256, 14, 14] 512
ReLU-37 [-1, 256, 14, 14] 0
Conv2d-38 [-1, 256, 14, 14] 589,824
BatchNorm2d-39 [-1, 256, 14, 14] 512
Conv2d-40 [-1, 256, 14, 14] 32,768
BatchNorm2d-41 [-1, 256, 14, 14] 512
ReLU-42 [-1, 256, 14, 14] 0
BasicBlock-43 [-1, 256, 14, 14] 0
Conv2d-44 [-1, 256, 14, 14] 589,824
BatchNorm2d-45 [-1, 256, 14, 14] 512
ReLU-46 [-1, 256, 14, 14] 0
Conv2d-47 [-1, 256, 14, 14] 589,824
BatchNorm2d-48 [-1, 256, 14, 14] 512
ReLU-49 [-1, 256, 14, 14] 0
BasicBlock-50 [-1, 256, 14, 14] 0
Conv2d-51 [-1, 512, 7, 7] 1,179,648
BatchNorm2d-52 [-1, 512, 7, 7] 1,024
ReLU-53 [-1, 512, 7, 7] 0
Conv2d-54 [-1, 512, 7, 7] 2,359,296
BatchNorm2d-55 [-1, 512, 7, 7] 1,024
Conv2d-56 [-1, 512, 7, 7] 131,072
BatchNorm2d-57 [-1, 512, 7, 7] 1,024
ReLU-58 [-1, 512, 7, 7] 0
BasicBlock-59 [-1, 512, 7, 7] 0
Conv2d-60 [-1, 512, 7, 7] 2,359,296
BatchNorm2d-61 [-1, 512, 7, 7] 1,024
ReLU-62 [-1, 512, 7, 7] 0
Conv2d-63 [-1, 512, 7, 7] 2,359,296
BatchNorm2d-64 [-1, 512, 7, 7] 1,024
ReLU-65 [-1, 512, 7, 7] 0
BasicBlock-66 [-1, 512, 7, 7] 0
AdaptiveAvgPool2d-67 [-1, 512, 1, 1] 0
Linear-68 [-1, 10] 5,130
Total params: 11,181,642
Trainable params: 11,181,642
Non-trainable params: 0
Input size (MB): 0.57
Forward/backward pass size (MB): 62.79
Params size (MB): 42.65
Estimated Total Size (MB): 106.01
# Visualize the filters of the first convolution layer
# Take only the FIRST parameter tensor of the model (expected to be the
# weight of the first convolution). Looping over all parameters without
# stopping left ``w`` bound to the last one instead.
w = next(iter(model_resnet18.parameters())).data.cpu()
print(w.shape)
# Affine rescale: the minimum weight maps to 0 and a weight of 0 maps to
# 0.5. The maximum is not bounded by 1 unless the weights are symmetric.
min_w = torch.min(w)
w1 = (-1/(2*min_w))*w + 0.5
print(torch.min(w1).item(), torch.max(w1).item())
# Build the grid image from every filter, 8 per row
x_grid = list(w1)
x_grid = utils.make_grid(x_grid, nrow=8, padding=1)
print(x_grid.shape)
# Display the grid
img_show(x_grid)
The first layer outputs
# Main training / validation routine
def train_val(model, params):
    """Train ``model``, validate after every epoch and keep the best weights.

    Args:
        model: Model to train; it is updated in place.
        params: Dict with the keys
            ``"num_epochs"`` (int), ``"loss_func"`` (loss callable),
            ``"optimizer"``, ``"train_dl"``, ``"val_dl"`` (data loaders),
            ``"sanity_check"`` (bool, forwarded to ``loss_epoch``),
            ``"lr_scheduler"`` (stepped once per epoch) and
            ``"path2weights"`` (file the best ``state_dict`` is saved to).

    Returns:
        ``(model, loss_history, metric_history)``: the model loaded with the
        weights of the epoch with the lowest validation loss, and two dicts
        with ``"train"`` / ``"val"`` lists holding one value per epoch.
    """
    # Unpack the parameters
    num_epochs = params["num_epochs"]
    loss_func = params["loss_func"]
    opt = params["optimizer"]
    train_dl = params["train_dl"]
    val_dl = params["val_dl"]
    sanity_check = params["sanity_check"]
    lr_scheduler = params["lr_scheduler"]
    path2weights = params["path2weights"]

    # Per-epoch loss values
    loss_history = {
        "train": [],
        "val": [],
    }
    # Per-epoch accuracy values
    metric_history = {
        "train": [],
        "val": [],
    }
    # Snapshot of the best weights seen so far
    best_model_wts = copy.deepcopy(model.state_dict())
    # Start from +inf so the first epoch always counts as an improvement
    best_loss = float('inf')

    # torch.save does not create missing directories.
    weights_dir = os.path.dirname(path2weights)
    if weights_dir:
        os.makedirs(weights_dir, exist_ok=True)

    for epoch in range(num_epochs):
        # Current learning rate
        current_lr = get_lr(opt)
        print('Epoch {}/{}, current lr={}'.format(epoch, num_epochs - 1, current_lr))
        # Training pass
        model.train()
        train_loss, train_metric = loss_epoch(model, loss_func, train_dl, sanity_check, opt)
        loss_history["train"].append(train_loss)
        metric_history["train"].append(train_metric)
        # Validation pass: eval mode, no gradients, no optimizer
        model.eval()
        with torch.no_grad():
            val_loss, val_metric = loss_epoch(model, loss_func, val_dl, sanity_check)
        # Keep the weights whenever the validation loss improves
        if val_loss < best_loss:
            best_loss = val_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            # Persist them to the configured path
            torch.save(model.state_dict(), path2weights)
            print("Copied best model weights!")
        loss_history["val"].append(val_loss)
        metric_history["val"].append(val_metric)
        # Advance the learning-rate schedule
        lr_scheduler.step()
        print("train loss: %.6f, dev loss: %.6f, accuracy: %.2f" %(train_loss,val_loss,100*val_metric))
    # Restore the best weights before handing the model back
    model.load_state_dict(best_model_wts)
    return model, loss_history, metric_history
# Loss function. reduction="sum" because loss_epoch adds up the batch
# losses and divides by the dataset size itself.
loss_fn = nn.CrossEntropyLoss(reduction="sum")
# Optimizer
opt = optim.Adam(model_resnet18.parameters(), lr=1e-4)
# Learning-rate schedule
lr_scheduler = CosineAnnealingLR(opt,T_max=5,eta_min=1e-6)
params_train = {
    "num_epochs": 10,
    "optimizer": opt,
    # The loss object is named loss_fn; ``loss_func`` was never defined.
    "loss_func": loss_fn,
    "train_dl": train_dl,
    "val_dl": val_dl,
    "sanity_check": False,
    "lr_scheduler": lr_scheduler,
    "path2weights": "./models/resnet18.pt",
}
# Train and validate the model; the histories feed the plots further down
model_resnet18, loss_hist, metric_hist = train_val(model_resnet18, params_train)
Epoch 0/9, current lr=0.0001
Copied best model weights!
train loss: 0.960643, dev loss: 0.460767, accuracy: 84.50
Epoch 1/9, current lr=9.05463412215599e-05
Copied best model weights!
train loss: 0.427234, dev loss: 0.383167, accuracy: 87.06
Epoch 2/9, current lr=6.57963412215599e-05
Copied best model weights!
train loss: 0.276291, dev loss: 0.354399, accuracy: 87.44
Epoch 3/9, current lr=3.52036587784401e-05
Copied best model weights!
train loss: 0.192877, dev loss: 0.335165, accuracy: 88.31
Epoch 4/9, current lr=1.0453658778440105e-05
Copied best model weights!
train loss: 0.158006, dev loss: 0.333249, accuracy: 88.25
Epoch 5/9, current lr=1e-06
Copied best model weights!
train loss: 0.153824, dev loss: 0.326157, accuracy: 89.00
Epoch 6/9, current lr=1.0453658778440102e-05
train loss: 0.144040, dev loss: 0.328791, accuracy: 88.88
Epoch 7/9, current lr=3.520365877844009e-05
Copied best model weights!
train loss: 0.135462, dev loss: 0.325485, accuracy: 88.56
Epoch 8/9, current lr=6.579634122155988e-05
train loss: 0.105906, dev loss: 0.360124, accuracy: 88.38
Epoch 9/9, current lr=9.054634122155989e-05
train loss: 0.101154, dev loss: 0.391994, accuracy: 87.88
# Plot the training results
show_loss_acc(params_train['num_epochs'], loss_hist, metric_hist)
# Print the CUDA memory statistics (only meaningful when a GPU is present)
if torch.cuda.is_available():
    print(torch.cuda.memory_summary())
| PyTorch CUDA memory summary, device ID 0 |
| CUDA OOMs: 0 | cudaMalloc retries: 0 |
| Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed |
| Allocated memory | 353214 KB | 712633 KB | 2865 GB | 2865 GB |
| from large pool | 326272 KB | 682240 KB | 2846 GB | 2846 GB |
| from small pool | 26942 KB | 40619 KB | 19 GB | 19 GB |
| Active memory | 353214 KB | 712633 KB | 2865 GB | 2865 GB |
| from large pool | 326272 KB | 682240 KB | 2846 GB | 2846 GB |
| from small pool | 26942 KB | 40619 KB | 19 GB | 19 GB |
| GPU reserved memory | 737280 KB | 737280 KB | 982 MB | 268288 KB |
| from large pool | 694272 KB | 694272 KB | 930 MB | 258048 KB |
| from small pool | 43008 KB | 43008 KB | 52 MB | 10240 KB |
| Non-releasable memory | 162882 KB | 167026 KB | 2405 GB | 2405 GB |
| from large pool | 157056 KB | 165760 KB | 2384 GB | 2384 GB |
| from small pool | 5826 KB | 10363 KB | 20 GB | 20 GB |
| Allocations | 556 | 800 | 555489 | 554933 |
| from large pool | 64 | 115 | 225764 | 225700 |
| from small pool | 492 | 720 | 329725 | 329233 |
| Active allocs | 556 | 800 | 555489 | 554933 |
| from large pool | 64 | 115 | 225764 | 225700 |
| from small pool | 492 | 720 | 329725 | 329233 |
| GPU reserved segments | 41 | 41 | 51 | 10 |
| from large pool | 20 | 20 | 25 | 5 |
| from small pool | 21 | 21 | 26 | 5 |
| Non-releasable allocs | 62 | 70 | 331818 | 331756 |
| from large pool | 14 | 15 | 121327 | 121313 |
| from small pool | 48 | 60 | 210491 | 210443 |
# Release GPU memory
# Check that the name exists instead of testing the module's truthiness, so
# re-running this cell after the deletion does not raise NameError.
if 'model_resnet18' in globals():
    del model_resnet18
# NOTE: ``opt`` was built from model_resnet18.parameters() and still
# references those tensors; delete it too if the memory must really go.
# Hand cached, unused blocks back to the CUDA driver.
if torch.cuda.is_available():
    torch.cuda.empty_cache()