PyTorch入门-2
Datasets & DataLoaders 位置:torch.utils.data.DataLoader 和 torch.utils.data.Dataset
Dataset 存储样本及其对应的标签;DataLoader 则在 Dataset 外包装了一个可迭代对象,以方便访问样本。
# Loading the data
#
# Build the FashionMNIST training and test splits. Both are stored under the
# "data" directory, downloaded if requested files are missing (download=True),
# and every image is passed through ToTensor() when it is accessed.
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt

# train=True selects the training split.
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# train=False selects the test split.
test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)
# Iterating over and visualizing the dataset
#
# Relies on `torch`, `plt` and `training_data` defined by the data-loading
# snippet earlier in this file.

# Maps each integer class label to its human-readable name.
labels_map = {
    0: "T-Shirt",
    1: "Trouser",
    2: "Pullover",
    3: "Dress",
    4: "Coat",
    5: "Sandal",
    6: "Shirt",
    7: "Sneaker",
    8: "Bag",
    9: "Ankle Boot",
}

# Draw a 3x3 grid of randomly chosen training samples, each titled with its
# class name.
figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3
for i in range(1, cols * rows + 1):  # subplot positions are 1-based
    # Pick one random index into the training set.
    sample_idx = torch.randint(len(training_data), size=(1,)).item()
    img, label = training_data[sample_idx]
    figure.add_subplot(rows, cols, i)
    plt.title(labels_map[label])
    plt.axis("off")
    # squeeze() removes size-1 dimensions before the image is handed to imshow.
    plt.imshow(img.squeeze(), cmap="gray")
plt.show()
创建自定义数据集
自定义数据集类必须实现三个函数:__init__、__len__ 和 __getitem__。
在下面的实现中,FashionMNIST 图像存储在目录 img_dir 中,它们的标签则单独存储在 CSV 文件 annotations_file 中。
import os
import pandas as pd
from torchvision.io import read_image


class CustomImageDataset(Dataset):
    """Image dataset backed by a directory of images and a CSV of labels.

    The CSV is read without a header row; its first column is treated as the
    image file name (relative to ``img_dir``) and its second column as the
    label. ``Dataset`` is the ``torch.utils.data.Dataset`` base class imported
    earlier in this file.
    """

    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):
        """Read the annotations file and remember where the images live.

        Args:
            annotations_file: Path to the CSV file of (file name, label) rows.
            img_dir: Directory that contains the image files.
            transform: Optional callable applied to each loaded image.
            target_transform: Optional callable applied to each label.
        """
        # names=[...] supplies the column names, so the first CSV row is
        # treated as data rather than as a header.
        self.img_labels = pd.read_csv(annotations_file, names=['file_name', 'label'])
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of rows in the annotations table."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Load and return the ``(image, label)`` pair at position ``idx``.

        The image is read from disk with ``read_image``; the configured
        transforms, if any, are applied before returning.
        """
        # Column 0 holds the file name, column 1 the label.
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
        image = read_image(img_path)
        label = self.img_labels.iloc[idx, 1]
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label
# Preparing the data for training with DataLoader, and iterating over it
#
# Relies on `plt`, `training_data` and `test_data` defined by the data-loading
# snippet earlier in this file.
from torch.utils.data import DataLoader

# Wrap both splits in loaders that yield shuffled batches of 64 samples.
train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

# Pull a single batch from the training loader and display its first sample.
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")
img = train_features[0].squeeze()
label = train_labels[0]
plt.imshow(img, cmap="gray")
plt.show()
print(f"Label: {label}")

# Example output recorded with this snippet (the label depends on which
# sample the shuffled loader happens to return first):
#   Feature batch shape: torch.Size([64, 1, 28, 28])
#   Labels batch shape: torch.Size([64])
#   Label: 7
数据并不总是以训练机器学习算法所需的最终处理形式出现。我们使用转换(transform)对数据执行一些操作,使其适合训练。
transform 用于修改特征,target_transform 用于修改标签。
import torch
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda

# FashionMNIST training split with a transform on both sides of each sample:
# images go through ToTensor(), labels are turned into one-hot vectors.
ds = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
    # One-hot encode the label: start from a float zero vector of length 10
    # and write the value 1 at index y along dimension 0.
    target_transform=Lambda(
        lambda y: torch.zeros(10, dtype=torch.float).scatter_(0, torch.tensor(y), value=1)
    ),
)