本文从MNIST数据集的下载开始,详细介绍如何在PaddlePaddle中基于高层API完成MNIST数据集的训练、评估与预测。
第一步:将MNIST数据下载到本地,下载链接:http://yann.lecun.com/exdb/mnist/ 。下载后可以得到四个文件:
- train-images-idx3-ubyte.gz: 训练集图像数据
- train-labels-idx1-ubyte.gz: 训练集标签
- t10k-images-idx3-ubyte.gz: 测试集图像数据
- t10k-labels-idx1-ubyte.gz: 测试集标签
第二步:将下载的四个文件以Numpy ndarray类型载入内存。解压文件并读取数据的过程非常标准,大家可以直接用下面的范例程序(运行前需要先导入 gzip、struct 以及 numpy as np,参见文末的完整代码)。
# train-images-idx3-ubyte 文件格式, 参考:http://yann.lecun.com/exdb/mnist/
'''
[offset] [type] [value] [description]
0000 32 bit integer 0x00000803(2051) magic number
0004 32 bit integer 60000 number of images
0008 32 bit integer 28 number of rows
0012 32 bit integer 28 number of columns
0016 unsigned byte ?? pixel
0017 unsigned byte ?? pixel
........
xxxx unsigned byte ?? pixel
Pixels are organized row-wise. Pixel values are 0 to 255.
0 means background (white), 255 means foreground (black).
'''
def load_images(image_file):
    """Load a gzip-compressed MNIST IDX3 image file into a NumPy array.

    The file starts with a 16-byte big-endian header (magic number 2051,
    number of images, rows, columns) followed by one unsigned byte per
    pixel, stored row-wise.

    Args:
        image_file: Path to a ``*-images-idx3-ubyte.gz`` file.

    Returns:
        np.ndarray of dtype float32 with shape (num_images, rows, cols).
        Pixel values are returned unscaled (0 to 255).

    Raises:
        ValueError: If the magic number is not 2051, or the file holds
            fewer pixel bytes than the header announces.
    """
    header_fmt = '>IIII'
    # Read the whole decompressed *.gz payload at once.
    with gzip.open(image_file, 'rb') as f:
        buf = f.read()
    # Parse the header fields.
    magic, num_images, rows, cols = struct.unpack_from(header_fmt, buf, 0)
    if magic != 2051:
        raise ValueError(
            'not an IDX3 image file (magic number %d, expected 2051): %s'
            % (magic, image_file))
    offset = struct.calcsize(header_fmt)
    length = int(num_images * rows * cols)
    # Read the pixel bytes straight from the buffer; unpacking them with
    # struct would first build a tuple of millions of Python ints.
    pixels = np.frombuffer(buf, dtype=np.uint8, count=length, offset=offset)
    # astype() copies, so the returned array is writable and independent
    # of the read-only byte buffer.
    return pixels.astype('float32').reshape(num_images, rows, cols)
# train-labels-idx1-ubyte.gz 文件格式
'''
[offset] [type] [value] [description]
0000 32 bit integer 0x00000801(2049) magic number (MSB first)
0004 32 bit integer 60000 number of items
0008 unsigned byte ?? label
0009 unsigned byte ?? label
........
xxxx unsigned byte ?? label
The labels values are 0 to 9.
'''
def load_labels(label_file):
    """Load a gzip-compressed MNIST IDX1 label file into a NumPy array.

    The file starts with an 8-byte big-endian header (magic number 2049,
    number of items) followed by one unsigned byte per label.

    Args:
        label_file: Path to a ``*-labels-idx1-ubyte.gz`` file.

    Returns:
        np.ndarray of dtype int64 with shape (num_labels,).

    Raises:
        ValueError: If the magic number is not 2049, or the file holds
            fewer label bytes than the header announces.
    """
    header_fmt = '>II'
    # Read the whole decompressed *.gz payload at once.
    with gzip.open(label_file, 'rb') as f:
        buf = f.read()
    # Parse the header fields.
    magic, num_labels = struct.unpack_from(header_fmt, buf, 0)
    if magic != 2049:
        raise ValueError(
            'not an IDX1 label file (magic number %d, expected 2049): %s'
            % (magic, label_file))
    offset = struct.calcsize(header_fmt)
    # Read the label bytes straight from the buffer instead of unpacking
    # them one by one into a tuple of Python ints.
    raw = np.frombuffer(buf, dtype=np.uint8, count=int(num_labels),
                        offset=offset)
    # astype() copies, so the result is a writable int64 array.
    return raw.astype('int64')
可以用下面的代码来测试图像数据的读入
# Smoke test: load all four files, print their shapes and plot one sample
# from the training set next to one from the test set.
import matplotlib.pyplot as plt

train_images = load_images('train-images-idx3-ubyte.gz')
test_images = load_images('t10k-images-idx3-ubyte.gz')
# Labels are reshaped into a column, one row per sample.
train_labels = load_labels('train-labels-idx1-ubyte.gz').reshape(-1, 1)
test_labels = load_labels('t10k-labels-idx1-ubyte.gz').reshape(-1, 1)
print(train_images.shape, train_labels.shape, test_images.shape, test_labels.shape)

idx = 5  # index of the sample shown in both panels
fig = plt.figure()
# Left panel: training image; right panel: test image.
for position, images in enumerate((train_images, test_images), start=1):
    plt.subplot(1, 2, position)
    plt.imshow(images[idx], cmap='rainbow')
plt.show()
(60000, 28, 28) (60000, 1) (10000, 28, 28) (10000, 1)
第三步:使用飞桨提供的paddle.io.Dataset基类,将数据封装为可迭代的数据源。
# Load the four MNIST files into memory.
train_images = load_images('train-images-idx3-ubyte.gz')
test_images = load_images('t10k-images-idx3-ubyte.gz')
# Reshape the labels to (N, 1) so that each sample carries a one-element
# label array; batches then have the [batch_size, 1] label layout that
# paddle.metric.Accuracy documents.
train_labels = load_labels('train-labels-idx1-ubyte.gz').reshape(-1, 1)
test_labels = load_labels('t10k-labels-idx1-ubyte.gz').reshape(-1, 1)
# Normalize pixel values from [0, 255] to [0, 1].
train_images = train_images / 255.0
test_images = test_images / 255.0
# Dataset sizes, used later when constructing the Dataset objects.
num_train_samples = train_images.shape[0]
num_test_samples = test_images.shape[0]
import paddle
from paddle.io import Dataset
class TrainDataSet(Dataset):
    """Training set exposed through the paddle.io.Dataset interface.

    Samples are read from the module-level ``train_images`` and
    ``train_labels`` arrays.
    """

    def __init__(self, num_samples):
        """Store the number of samples this dataset reports.

        Args:
            num_samples: Value returned by ``len()`` for this dataset.
        """
        super(TrainDataSet, self).__init__()
        self.num_samples = num_samples

    def __getitem__(self, index):
        """Return the single sample at ``index`` as (image, label)."""
        return train_images[index], train_labels[index]

    def __len__(self):
        """Return the total number of samples in the dataset."""
        return self.num_samples
class TestDataSet(Dataset):
    """Test set exposed through the paddle.io.Dataset interface.

    Samples are read from the module-level ``test_images`` and
    ``test_labels`` arrays.
    """

    def __init__(self, num_samples):
        """Store the number of samples this dataset reports.

        Args:
            num_samples: Value returned by ``len()`` for this dataset.
        """
        super(TestDataSet, self).__init__()
        self.num_samples = num_samples

    def __getitem__(self, index):
        """Return the single sample at ``index`` as (image, label)."""
        return test_images[index], test_labels[index]

    def __len__(self):
        """Return the total number of samples in the dataset."""
        return self.num_samples
# Instantiate the datasets defined above, one for training and one for testing.
train_dataset = TrainDataSet(num_samples=num_train_samples)
test_dataset = TestDataSet(num_samples=num_test_samples)
第四步:针对顺序的线性网络结构,使用飞桨提供的Sequential类来快速完成组网,这样可以减少类的定义等代码编写。
# Define the network: flatten each image, then apply one hidden layer
# (784 -> 512) with ReLU and dropout, and a 10-way output layer.
mnist = paddle.nn.Sequential(
    paddle.nn.Flatten(),
    paddle.nn.Linear(in_features=784, out_features=512),
    paddle.nn.ReLU(),
    paddle.nn.Dropout(p=0.2),
    paddle.nn.Linear(in_features=512, out_features=10),
)
第五步:生成模型实例,并完成损失函数、优化方法和评估方法的配置。
# Wrap the network in a paddle.Model instance so that it can be
# configured, trained and validated through the high-level API.
model = paddle.Model(mnist)
# Training configuration: optimizer, loss function and accuracy metric.
model.prepare(
    optimizer=paddle.optimizer.Adam(parameters=model.parameters()),
    loss=paddle.nn.CrossEntropyLoss(),
    metrics=paddle.metric.Accuracy(),
)
最后一步:用fit()方法启动训练,evaluate()方法实现评估,predict()方法实现预测
# Train the model on the training set.
model.fit(train_data=train_dataset, epochs=5, batch_size=100, verbose=1)
# Validate the trained model on the test set with evaluate().
eval_result = model.evaluate(eval_data=test_dataset, verbose=0)
print(eval_result)
# Run inference over the test set with predict().
test_result = model.predict(test_data=test_dataset)
完整可运行的代码如下
import gzip
import struct
import numpy as np
# train-images-idx3-ubyte 文件格式, 参考:http://yann.lecun.com/exdb/mnist/
'''
[offset] [type] [value] [description]
0000 32 bit integer 0x00000803(2051) magic number
0004 32 bit integer 60000 number of images
0008 32 bit integer 28 number of rows
0012 32 bit integer 28 number of columns
0016 unsigned byte ?? pixel
0017 unsigned byte ?? pixel
........
xxxx unsigned byte ?? pixel
Pixels are organized row-wise. Pixel values are 0 to 255.
0 means background (white), 255 means foreground (black).
'''
def load_images(image_file):
    """Load a gzip-compressed MNIST IDX3 image file into a NumPy array.

    The file starts with a 16-byte big-endian header (magic number 2051,
    number of images, rows, columns) followed by one unsigned byte per
    pixel, stored row-wise.

    Args:
        image_file: Path to a ``*-images-idx3-ubyte.gz`` file.

    Returns:
        np.ndarray of dtype float32 with shape (num_images, rows, cols).
        Pixel values are returned unscaled (0 to 255).

    Raises:
        ValueError: If the magic number is not 2051, or the file holds
            fewer pixel bytes than the header announces.
    """
    header_fmt = '>IIII'
    # Read the whole decompressed *.gz payload at once.
    with gzip.open(image_file, 'rb') as f:
        buf = f.read()
    # Parse the header fields.
    magic, num_images, rows, cols = struct.unpack_from(header_fmt, buf, 0)
    if magic != 2051:
        raise ValueError(
            'not an IDX3 image file (magic number %d, expected 2051): %s'
            % (magic, image_file))
    offset = struct.calcsize(header_fmt)
    length = int(num_images * rows * cols)
    # Read the pixel bytes straight from the buffer; unpacking them with
    # struct would first build a tuple of millions of Python ints.
    pixels = np.frombuffer(buf, dtype=np.uint8, count=length, offset=offset)
    # astype() copies, so the returned array is writable and independent
    # of the read-only byte buffer.
    return pixels.astype('float32').reshape(num_images, rows, cols)
# train-labels-idx1-ubyte.gz 文件格式
'''
[offset] [type] [value] [description]
0000 32 bit integer 0x00000801(2049) magic number (MSB first)
0004 32 bit integer 60000 number of items
0008 unsigned byte ?? label
0009 unsigned byte ?? label
........
xxxx unsigned byte ?? label
The labels values are 0 to 9.
'''
def load_labels(label_file):
    """Load a gzip-compressed MNIST IDX1 label file into a NumPy array.

    The file starts with an 8-byte big-endian header (magic number 2049,
    number of items) followed by one unsigned byte per label.

    Args:
        label_file: Path to a ``*-labels-idx1-ubyte.gz`` file.

    Returns:
        np.ndarray of dtype int64 with shape (num_labels,).

    Raises:
        ValueError: If the magic number is not 2049, or the file holds
            fewer label bytes than the header announces.
    """
    header_fmt = '>II'
    # Read the whole decompressed *.gz payload at once.
    with gzip.open(label_file, 'rb') as f:
        buf = f.read()
    # Parse the header fields.
    magic, num_labels = struct.unpack_from(header_fmt, buf, 0)
    if magic != 2049:
        raise ValueError(
            'not an IDX1 label file (magic number %d, expected 2049): %s'
            % (magic, label_file))
    offset = struct.calcsize(header_fmt)
    # Read the label bytes straight from the buffer instead of unpacking
    # them one by one into a tuple of Python ints.
    raw = np.frombuffer(buf, dtype=np.uint8, count=int(num_labels),
                        offset=offset)
    # astype() copies, so the result is a writable int64 array.
    return raw.astype('int64')
# Load the images and normalize pixel values from [0, 255] to [0, 1].
train_images = load_images('train-images-idx3-ubyte.gz') / 255.0
test_images = load_images('t10k-images-idx3-ubyte.gz') / 255.0
# Load the labels as columns, one row per sample.
train_labels = load_labels('train-labels-idx1-ubyte.gz').reshape(-1, 1)
test_labels = load_labels('t10k-labels-idx1-ubyte.gz').reshape(-1, 1)
# Dataset sizes, used when constructing the Dataset objects.
num_train_samples = len(train_images)
num_test_samples = len(test_images)
import paddle
from paddle.io import Dataset
class TrainDataSet(Dataset):
    """Training set exposed through the paddle.io.Dataset interface.

    Samples are read from the module-level ``train_images`` and
    ``train_labels`` arrays.
    """

    def __init__(self, num_samples):
        """Store the number of samples this dataset reports.

        Args:
            num_samples: Value returned by ``len()`` for this dataset.
        """
        super(TrainDataSet, self).__init__()
        self.num_samples = num_samples

    def __getitem__(self, index):
        """Return the single sample at ``index`` as (image, label)."""
        return train_images[index], train_labels[index]

    def __len__(self):
        """Return the total number of samples in the dataset."""
        return self.num_samples
class TestDataSet(Dataset):
    """Test set exposed through the paddle.io.Dataset interface.

    Samples are read from the module-level ``test_images`` and
    ``test_labels`` arrays.
    """

    def __init__(self, num_samples):
        """Store the number of samples this dataset reports.

        Args:
            num_samples: Value returned by ``len()`` for this dataset.
        """
        super(TestDataSet, self).__init__()
        self.num_samples = num_samples

    def __getitem__(self, index):
        """Return the single sample at ``index`` as (image, label)."""
        return test_images[index], test_labels[index]

    def __len__(self):
        """Return the total number of samples in the dataset."""
        return self.num_samples
# Instantiate the datasets defined above, one for training and one for testing.
train_dataset = TrainDataSet(num_samples=num_train_samples)
test_dataset = TestDataSet(num_samples=num_test_samples)

# Define the network: flatten each image, then apply one hidden layer
# (784 -> 512) with ReLU and dropout, and a 10-way output layer.
mnist = paddle.nn.Sequential(
    paddle.nn.Flatten(),
    paddle.nn.Linear(in_features=784, out_features=512),
    paddle.nn.ReLU(),
    paddle.nn.Dropout(p=0.2),
    paddle.nn.Linear(in_features=512, out_features=10),
)

# Wrap the network in a paddle.Model instance so that it can be
# configured, trained and validated through the high-level API.
model = paddle.Model(mnist)

# Training configuration: optimizer, loss function and accuracy metric.
model.prepare(
    optimizer=paddle.optimizer.Adam(parameters=model.parameters()),
    loss=paddle.nn.CrossEntropyLoss(),
    metrics=paddle.metric.Accuracy(),
)

# Train the model on the training set.
model.fit(train_data=train_dataset, epochs=5, batch_size=100, verbose=1)

# Validate the trained model on the test set with evaluate().
eval_result = model.evaluate(eval_data=test_dataset, verbose=0)
print(eval_result)

# Run inference over the test set with predict().
test_result = model.predict(test_data=test_dataset)
# 展示预测结果
import matplotlib.pyplot as plt
def show_img(img, predict):
    """Display one image with its predicted value in the title.

    Args:
        img: Image data that can be reshaped to 28 x 28.
        predict: Predicted value, interpolated into the plot title.
    """
    caption = f'predict:{predict}'
    plt.title(caption)
    # Reshape to a 28 x 28 grid before handing the data to imshow.
    picture = img.reshape([28, 28])
    plt.imshow(picture)
    plt.show()
idx = 2  # index of the sample to inspect
# Show the test image together with the class that has the highest score.
show_img(img=test_dataset[idx][0], predict=np.argmax(test_result[0][idx]))
运行结果如下: