This article shows how to build data loaders in PaddlePaddle for the VOC2012 dataset and for a custom VOC-style dataset. It first walks through the VOC2012 directory structure, then subclasses paddle.io.Dataset and implements __init__, __getitem__ and related methods to read the images and annotations, and tests the result. Finally, the code is adapted to a custom dataset and the loaded samples are visualized.

While reproducing the ResNet50+FPN version of Faster R-CNN, I found that the official PaddlePaddle documentation on data loading classes does not include a good end-to-end example for detection data.
So in this post I build a loader for the standard VOC2012 dataset, and afterwards I do the same with a dataset of my own to show how to write a custom reader.
Reference: the Paddle documentation on defining and loading datasets.
This tutorial uses the pascal-voc dataset shared by 笨笨 on AI Studio.
The pascal-voc dataset contains the VOC2007 and VOC2012 data and is mainly used for vision tasks such as object detection and semantic segmentation.
The Pascal VOC dataset directory is laid out as follows:
.
└── VOCdevkit                   # root directory
    └── VOC2012                 # one folder per year; only 2012 is downloaded here (2007 and other years also exist)
        ├── Annotations         # xml files, one per image in JPEGImages, describing the image contents
        ├── ImageSets           # txt files listing image names; the per-class files append 1/-1 to mark positive/negative samples, while train.txt and val.txt list just the image ids
        │   ├── Action
        │   ├── Layout
        │   ├── Main
        │   └── Segmentation
        ├── JPEGImages          # the source images
        ├── SegmentationClass   # images for semantic segmentation
        └── SegmentationObject  # images for instance segmentation

Here we will use the VOC2012 data. Since Faster R-CNN is an object-detection model, the parts we need are Annotations, JPEGImages, and train.txt and val.txt under ImageSets/Main:
Annotations — the directory of xml annotation files
JPEGImages — the directory of image files
train.txt — the list of training image names
val.txt — the list of validation image names
First, unpack the pascal-voc dataset:
!unzip -oq data/data4379/pascalvoc.zip
Since only VOC2012 is needed, move that folder to the working directory:
!mv pascalvoc/VOCdevkit/VOC2012 ./
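A quick sanity check (a minimal sketch, assuming the layout shown above) that the folders and split files used later are now in place:

import os

# the paths this notebook relies on, relative to the working directory
for p in ['VOC2012/Annotations',
          'VOC2012/JPEGImages',
          'VOC2012/ImageSets/Main/train.txt',
          'VOC2012/ImageSets/Main/val.txt']:
    print(p, 'exists:', os.path.exists(p))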
The PaddlePaddle documentation provides a very simple custom-dataset example:
import paddle
from paddle.io import Dataset

BATCH_SIZE = 64
BATCH_NUM = 20
IMAGE_SIZE = (28, 28)
CLASS_NUM = 10

class MyDataset(Dataset):
    """
    Step 1: inherit from paddle.io.Dataset
    """
    def __init__(self, num_samples):
        """
        Step 2: implement the constructor and define the dataset size
        """
        super(MyDataset, self).__init__()
        self.num_samples = num_samples

    def __getitem__(self, index):
        """
        Step 3: implement __getitem__, which defines how to fetch a sample for a given index and returns a single (data, label) pair
        """
        data = paddle.uniform(IMAGE_SIZE, dtype='float32')
        label = paddle.randint(0, CLASS_NUM - 1, dtype='int64')
        return data, label

    def __len__(self):
        """
        Step 4: implement __len__, which returns the total number of samples
        """
        return self.num_samples

# test the dataset
custom_dataset = MyDataset(BATCH_SIZE * BATCH_NUM)
print('=============custom dataset=============')
for data, label in custom_dataset:
    print(data.shape, label.shape)
    break

We can follow this template and implement our VOC reader step by step.
# define the dataset class, inheriting from paddle.io.Dataset
class VOCDataset(paddle.io.Dataset):
In __init__ we define the paths to the various VOC2012 folders and also load the VOC2012 class file.
I put the class file for VOC2012 in the working directory.
Path: pascal_voc_classes.json
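For reference, this file is just a class-name-to-index mapping; judging from the class_dict printed later in this notebook, it maps the 20 VOC classes to 1-20, i.e. roughly:

{
    "aeroplane": 1, "bicycle": 2, "bird": 3, "boat": 4, "bottle": 5,
    "bus": 6, "car": 7, "cat": 8, "chair": 9, "cow": 10,
    "diningtable": 11, "dog": 12, "horse": 13, "motorbike": 14, "person": 15,
    "pottedplant": 16, "sheep": 17, "sofa": 18, "train": 19, "tvmonitor": 20
}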
    def __init__(self, voc_root, year='2012', transforms=None, txt_name: str = 'train.txt'):
        assert year in ['2007', '2012'], "year must be in ['2007','2012']"
        self.root = os.path.join(voc_root, f"VOC{year}")
        self.img_root = os.path.join(self.root, 'JPEGImages')
        self.annotations_root = os.path.join(self.root, 'Annotations')

        txt_path = os.path.join(self.root, "ImageSets", 'Main', txt_name)
        assert os.path.exists(txt_path), 'not found {} file'.format(txt_name)

        with open(txt_path) as read:
            self.xml_list = [os.path.join(self.annotations_root, line.strip() + '.xml')
                             for line in read.readlines() if len(line.strip()) > 0]

        # check file
        assert len(self.xml_list) > 0, "in '{}' file does not find any information.".format(txt_path)
        for xml_path in self.xml_list:
            assert os.path.exists(xml_path), "not found '{}' file.".format(xml_path)

        # read class_indict
        json_file = './pascal_voc_classes.json'
        assert os.path.exists(json_file), "{} file not exist.".format(json_file)
        json_file = open(json_file, 'r')
        self.class_dict = json.load(json_file)
        json_file.close()

        self.transforms = transforms

In __getitem__ we parse the annotation xml for the requested index, open the corresponding image, and pack the boxes, labels and related fields into a target dictionary:

    def __getitem__(self, idx):
        # read xml
        xml_path = self.xml_list[idx]
        with open(xml_path) as fid:
            xml_str = fid.read()
        xml = etree.fromstring(xml_str)
        data = self.parse_xml_to_dict(xml)["annotation"]
        img_path = os.path.join(self.img_root, data["filename"])
        image = Image.open(img_path)
        if image.format != "JPEG":
            raise ValueError("Image '{}' format not JPEG".format(img_path))

        boxes = []
        labels = []
        iscrowd = []
        assert "object" in data, "{} lack of object information.".format(xml_path)
        for obj in data["object"]:
            xmin = float(obj["bndbox"]["xmin"])
            xmax = float(obj["bndbox"]["xmax"])
            ymin = float(obj["bndbox"]["ymin"])
            ymax = float(obj["bndbox"]["ymax"])

            # extra check: some annotations contain boxes with zero width or height,
            # which would make the regression loss NaN
            if xmax <= xmin or ymax <= ymin:
                print("Warning: in '{}' xml, there are some bbox w/h <=0".format(xml_path))
                continue

            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(self.class_dict[obj["name"]])
            if "difficult" in obj:
                iscrowd.append(int(obj["difficult"]))
            else:
                iscrowd.append(0)

        # convert everything into a paddle.Tensor
        boxes = paddle.to_tensor(boxes).astype('float32')
        labels = paddle.to_tensor(labels).astype('int32')
        iscrowd = paddle.to_tensor(iscrowd, dtype=paddle.int64)
        image_id = paddle.to_tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            image, target = self.transforms(image, target)

        return image, target
parse_xml_to_dict recursively converts an lxml element tree into nested Python dicts, collecting repeated object nodes into a list:

    def parse_xml_to_dict(self, xml):
        """
        Parse the xml tree into a dict, following TensorFlow's recursive_parse_xml_to_dict.
        Args:
            xml: xml tree obtained by parsing XML file contents using lxml.etree
        Returns:
            Python dictionary holding XML contents.
        """
        if len(xml) == 0:  # reached a leaf node; return its tag and text directly
            return {xml.tag: xml.text}

        result = {}
        for child in xml:
            child_result = self.parse_xml_to_dict(child)  # recurse into the child tags
            if child.tag != 'object':
                result[child.tag] = child_result[child.tag]
            else:
                if child.tag not in result:  # there can be several objects, so collect them in a list
                    result[child.tag] = []
                result[child.tag].append(child_result[child.tag])
        return {xml.tag: result}

Parsing one annotation (the one for 2010_001142.jpg) this way yields, for example:

{'filename': '2010_001142.jpg', 'folder': 'VOC2012', 'object': [{'name': 'bottle', 'bndbox': {'xmax': '282', 'xmin': '264', 'ymax': '244', 'ymin': '210'}, 'difficult': '0', 'occluded': '0', 'pose': 'Unspecified', 'truncated': '0'}, {'name': 'bottle', 'bndbox': {'xmax': '308', 'xmin': '295', 'ymax': '184', 'ymin': '162'}, 'difficult': '1', 'occluded': '0', 'pose': 'Unspecified', 'truncated': '0'}, {'name': 'bottle', 'bndbox': {'xmax': '270', 'xmin': '254', 'ymax': '224', 'ymin': '196'}, 'difficult': '1', 'occluded': '0', 'pose': 'Unspecified', 'truncated': '1'}, {'name': 'bottle', 'bndbox': {'xmax': '292', 'xmin': '281', 'ymax': '225', 'ymin': '204'}, 'difficult': '1', 'occluded': '0', 'pose': 'Unspecified', 'truncated': '1'}, {'name': 'bottle', 'bndbox': {'xmax': '221', 'xmin': '212', 'ymax': '227', 'ymin': '208'}, 'difficult': '1', 'occluded': '0', 'pose': 'Unspecified', 'truncated': '0'}, {'name': 'person', 'bndbox': {'xmax': '371', 'xmin': '315', 'ymax': '220', 'ymin': '103'}, 'difficult': '0', 'occluded': '1', 'pose': 'Frontal', 'truncated': '1'}, {'name': 'person', 'bndbox': {'xmax': '379', 'xmin': '283', 'ymax': '342', 'ymin': '171'}, 'difficult': '0', 'occluded': '0', 'pose': 'Left', 'truncated': '0'}, {'name': 'person', 'bndbox': {'xmax': '216', 'xmin': '156', 'ymax': '260', 'ymin': '180'}, 'difficult': '0', 'occluded': '1', 'pose': 'Right', 'truncated': '1'}, {'name': 'person', 'bndbox': {'xmax': '223', 'xmin': '205', 'ymax': '198', 'ymin': '172'}, 'difficult': '1', 'occluded': '1', 'pose': 'Frontal', 'truncated': '1'}, {'name': 'person', 'bndbox': {'xmax': '280', 'xmin': '218', 'ymax': '234', 'ymin': '155'}, 'difficult': '0', 'occluded': '1', 'pose': 'Right', 'truncated': '1'}, {'name': 'person', 'bndbox': {'xmax': '343', 'xmin': '292', 'ymax': '241', 'ymin': '185'}, 'difficult': '1', 'occluded': '1', 'pose': 'Left', 'truncated': '1'}], 'segmented': '0', 'size': {'depth': '3', 'height': '375', 'width': '500'}, 'source': {'annotation': 'PASCAL VOC2010', 'database': 'The VOC2010 Database', 'image': 'flickr'}}

Parsing the xml relies on lxml, so install it if it is missing:

!pip install lxml
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Requirement already satisfied: lxml in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (4.8.0)
Putting everything together (adding the imports and a __len__ method), the complete class is:

import paddle
import os
import json
from PIL import Image
from lxml import etree

# define the dataset class, inheriting from paddle.io.Dataset
class VOCDataset(paddle.io.Dataset):
    def __init__(self, voc_root, year='2012', transforms=None, txt_name: str = 'train.txt'):
        assert year in ['2007', '2012'], "year must be in ['2007','2012']"
        self.root = os.path.join(voc_root, f"VOC{year}")
        self.img_root = os.path.join(self.root, 'JPEGImages')
        self.annotations_root = os.path.join(self.root, 'Annotations')

        txt_path = os.path.join(self.root, "ImageSets", 'Main', txt_name)
        assert os.path.exists(txt_path), 'not found {} file'.format(txt_name)
        with open(txt_path) as read:
            self.xml_list = [os.path.join(self.annotations_root, line.strip() + '.xml')
                             for line in read.readlines() if len(line.strip()) > 0]

        # check file
        assert len(self.xml_list) > 0, "in '{}' file does not find any information.".format(txt_path)
        for xml_path in self.xml_list:
            assert os.path.exists(xml_path), "not found '{}' file.".format(xml_path)

        # read class_indict
        json_file = './pascal_voc_classes.json'
        assert os.path.exists(json_file), "{} file not exist.".format(json_file)
        json_file = open(json_file, 'r')
        self.class_dict = json.load(json_file)
        json_file.close()

        self.transforms = transforms

    def __len__(self):
        return len(self.xml_list)

    def __getitem__(self, idx):
        # read xml
        xml_path = self.xml_list[idx]
        with open(xml_path) as fid:
            xml_str = fid.read()
        xml = etree.fromstring(xml_str)
        data = self.parse_xml_to_dict(xml)["annotation"]
        img_path = os.path.join(self.img_root, data["filename"])
        image = Image.open(img_path)
        if image.format != "JPEG":
            raise ValueError("Image '{}' format not JPEG".format(img_path))

        boxes = []
        labels = []
        iscrowd = []
        assert "object" in data, "{} lack of object information.".format(xml_path)
        for obj in data["object"]:
            xmin = float(obj["bndbox"]["xmin"])
            xmax = float(obj["bndbox"]["xmax"])
            ymin = float(obj["bndbox"]["ymin"])
            ymax = float(obj["bndbox"]["ymax"])

            # extra check: some annotations contain boxes with zero width or height,
            # which would make the regression loss NaN
            if xmax <= xmin or ymax <= ymin:
                print("Warning: in '{}' xml, there are some bbox w/h <=0".format(xml_path))
                continue

            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(self.class_dict[obj["name"]])
            if "difficult" in obj:
                iscrowd.append(int(obj["difficult"]))
            else:
                iscrowd.append(0)

        # convert everything into a paddle.Tensor
        boxes = paddle.to_tensor(boxes).astype('float32')
        labels = paddle.to_tensor(labels).astype('int32')
        iscrowd = paddle.to_tensor(iscrowd, dtype=paddle.int64)
        image_id = paddle.to_tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            image, target = self.transforms(image, target)

        return image, target

    def parse_xml_to_dict(self, xml):
        """
        Parse the xml tree into a dict, following TensorFlow's recursive_parse_xml_to_dict.
        Args:
            xml: xml tree obtained by parsing XML file contents using lxml.etree
        Returns:
            Python dictionary holding XML contents.
        """
        if len(xml) == 0:  # reached a leaf node; return its tag and text directly
            return {xml.tag: xml.text}

        result = {}
        for child in xml:
            child_result = self.parse_xml_to_dict(child)  # recurse into the child tags
            if child.tag != 'object':
                result[child.tag] = child_result[child.tag]
            else:
                if child.tag not in result:  # there can be several objects, so collect them in a list
                    result[child.tag] = []
                result[child.tag].append(child_result[child.tag])
        return {xml.tag: result}
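As an aside, here is a minimal sketch of how an annotation dump like the one shown earlier can be produced; the annotation file name is inferred from the 'filename' field in that dump, so treat it as an assumption:

from lxml import etree

ds = VOCDataset('./', '2012')
# parse one annotation file and print the nested dict it becomes
with open('VOC2012/Annotations/2010_001142.xml') as fid:  # name inferred from the printed 'filename'
    xml = etree.fromstring(fid.read())
print(ds.parse_xml_to_dict(xml)['annotation'])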
def collate_fn(batch):
    return tuple(zip(*batch))

train_dataset = VOCDataset('./', "2012")
print(train_dataset.class_dict)
{'aeroplane': 1, 'bicycle': 2, 'bird': 3, 'boat': 4, 'bottle': 5, 'bus': 6, 'car': 7, 'cat': 8, 'chair': 9, 'cow': 10, 'diningtable': 11, 'dog': 12, 'horse': 13, 'motorbike': 14, 'person': 15, 'pottedplant': 16, 'sheep': 17, 'sofa': 18, 'train': 19, 'tvmonitor': 20}

Next, draw the ground-truth boxes on a few random samples to check that images, boxes and labels line up (draw_box comes from the draw_box_utils helper module):

import paddle.vision.transforms as transforms
from draw_box_utils import draw_box
from PIL import Image
import json
import matplotlib.pyplot as plt
import random

# read class_indict
category_index = {}
try:
    json_file = open('./pascal_voc_classes.json', 'r')
    class_dict = json.load(json_file)
    category_index = {v: k for k, v in class_dict.items()}
except Exception as e:
    print(e)
    exit(-1)

data_transform = {
    "train": transforms.Compose([transforms.ToTensor(),
                                 transforms.RandomHorizontalFlip(0.5)]),
    "val": transforms.Compose([transforms.ToTensor()])
}

# load train data set
train_data_set = VOCDataset('./', "2012")
print(len(train_data_set))
for index in random.sample(range(0, len(train_data_set)), k=5):
    img, target = train_data_set[index]
    draw_box(img,
             target["boxes"].numpy(),
             target["labels"].numpy(),
             [1 for i in range(len(target["labels"].numpy()))],
             category_index,
             thresh=0.5,
             line_thickness=5)
    plt.imshow(img)
    plt.show()

5717
[Output: five figures, each a randomly sampled training image shown with its ground-truth boxes drawn]
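The collate_fn helper defined above is not actually called in this notebook, but it is what you would typically pass to paddle.io.DataLoader through its collate_fn argument for detection data: images have different sizes and each target dict has a different number of boxes, so instead of stacking everything into one tensor it simply regroups the batch into a tuple of images and a tuple of targets. A minimal sketch of what it does:

# regroup two samples the way a DataLoader batch would be regrouped
batch = [train_dataset[0], train_dataset[1]]   # list of (image, target) pairs
images, targets = collate_fn(batch)

print(len(images))                 # 2 PIL images (no transforms were passed to VOCDataset)
print(targets[0]["boxes"].shape)   # each target keeps its own number of boxes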
Now let's adapt the loader to a custom, VOC-style dataset of my own. First unpack it:
!unzip -oq data/data106197/voc.zip
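Judging from the code below, this dataset keeps the VOC-style Annotations and JPEGImages folders but replaces ImageSets with two plain files in the dataset root: train_list.txt, whose lines each hold an image path and its annotation path separated by a space (relative to the dataset root), and labels.txt with one class name per line. The annotation xml also stores the image name in a frame tag without the .jpg extension. The file names below are illustrative assumptions:

voc/
├── Annotations/        # xml annotation files
├── JPEGImages/         # image files
├── labels.txt          # one class name per line: flv, gx, mbw
└── train_list.txt      # lines like "JPEGImages/0001.jpg Annotations/0001.xml" (hypothetical names)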
The only real changes relative to VOCDataset are how the image/xml path lists and the class names are read, and that the image name now comes from the xml's frame tag:

import paddle
import os
import json
from PIL import Image
from lxml import etree

# define the dataset class, inheriting from paddle.io.Dataset
class Selfataset(paddle.io.Dataset):
    def __init__(self, voc_root, transforms=None, txt_name: str = 'train.txt'):
        self.root = voc_root
        self.img_root = os.path.join(self.root, 'JPEGImages')
        self.annotations_root = os.path.join(self.root, 'Annotations')

        txt_path = os.path.join(self.root, txt_name)
        print(txt_path)
        assert os.path.exists(txt_path), 'not found {} file'.format(txt_name)

        # self.xml_list = [os.path.join(self.annotations_root, line.strip() + '.xml')
        #                  for line in read.readlines() if len(line.strip()) > 0]
        self.image_list = []
        self.xml_list = []
        with open(txt_path) as read:
            self.path_list = [line.strip() for line in read.readlines() if len(line.strip()) > 0]
        for path in self.path_list:
            self.image_list.append(os.path.join(self.root, path.split(' ')[0]))
            self.xml_list.append(os.path.join(self.root, path.split(' ')[1]))

        assert len(self.xml_list) > 0, "in '{}' file does not find any information.".format(txt_path)
        for xml_path in self.xml_list:
            assert os.path.exists(xml_path), "not found '{}' file.".format(xml_path)

        # read class
        self.class_dict = {}
        self.class_path = os.path.join(self.root, 'labels.txt')
        print(self.class_path)
        with open(self.class_path) as read:
            self.classes = [class_name.strip() for class_name in read.readlines()]
        print(self.classes)
        for number, class_name in enumerate(self.classes, 1):
            self.class_dict[class_name] = number

        self.transforms = transforms

    def __len__(self):
        return len(self.xml_list)

    def __getitem__(self, idx):
        # read xml
        xml_path = self.xml_list[idx]
        with open(xml_path) as fid:
            xml_str = fid.read()
        xml = etree.fromstring(xml_str)
        data = self.parse_xml_to_dict(xml)["annotation"]
        # print(data)
        img_path = os.path.join(self.img_root, data["frame"] + '.jpg')
        image = Image.open(img_path)
        # if image.format != "JPEG":
        #     raise ValueError("Image '{}' format not JPEG".format(img_path))

        boxes = []
        labels = []
        iscrowd = []
        assert "object" in data, "{} lack of object information.".format(xml_path)
        for obj in data["object"]:
            xmin = float(obj["bndbox"]["xmin"])
            xmax = float(obj["bndbox"]["xmax"])
            ymin = float(obj["bndbox"]["ymin"])
            ymax = float(obj["bndbox"]["ymax"])

            # extra check: some annotations contain boxes with zero width or height,
            # which would make the regression loss NaN
            if xmax <= xmin or ymax <= ymin:
                print("Warning: in '{}' xml, there are some bbox w/h <=0".format(xml_path))
                continue

            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(self.class_dict[obj["name"]])
            if "difficult" in obj:
                iscrowd.append(int(obj["difficult"]))
            else:
                iscrowd.append(0)

        # convert everything into a paddle.Tensor
        boxes = paddle.to_tensor(boxes).astype('float32')
        labels = paddle.to_tensor(labels).astype('int32')
        iscrowd = paddle.to_tensor(iscrowd, dtype=paddle.int64)
        image_id = paddle.to_tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            image, target = self.transforms(image, target)

        return image, target

    def parse_xml_to_dict(self, xml):
        """
        Parse the xml tree into a dict, following TensorFlow's recursive_parse_xml_to_dict.
        Args:
            xml: xml tree obtained by parsing XML file contents using lxml.etree
        Returns:
            Python dictionary holding XML contents.
        """
        if len(xml) == 0:  # reached a leaf node; return its tag and text directly
            return {xml.tag: xml.text}

        result = {}
        for child in xml:
            child_result = self.parse_xml_to_dict(child)  # recurse into the child tags
            if child.tag != 'object':
                result[child.tag] = child_result[child.tag]
            else:
                if child.tag not in result:  # there can be several objects, so collect them in a list
                    result[child.tag] = []
                result[child.tag].append(child_result[child.tag])
        return {xml.tag: result}


def collate_fn(batch):
    return tuple(zip(*batch))


a = Selfataset('voc', None, 'train_list.txt')

voc/train_list.txt
voc/labels.txt
['flv', 'gx', 'mbw']
a.class_dict

{'flv': 1, 'gx': 2, 'mbw': 3}

Finally, visualize a few random samples from the custom dataset in the same way. This time the index-to-name mapping is built from the dataset's own labels rather than from pascal_voc_classes.json:

import paddle.vision.transforms as transforms
from draw_box_utils import draw_box
from PIL import Image
import matplotlib.pyplot as plt
import random

data_transform = {
    "train": transforms.Compose([transforms.ToTensor(),
                                 transforms.RandomHorizontalFlip(0.5)]),
    "val": transforms.Compose([transforms.ToTensor()])
}

# load train data set
train_data_set = Selfataset('voc', None, 'train_list.txt')
print(len(train_data_set))

# build the index -> class-name mapping from the custom dataset's own labels
category_index = {v: k for k, v in train_data_set.class_dict.items()}

for index in random.sample(range(0, len(train_data_set)), k=5):
    img, target = train_data_set[index]
    draw_box(img,
             target["boxes"].numpy(),
             target["labels"].numpy(),
             [1 for i in range(len(target["labels"].numpy()))],
             category_index,
             thresh=0.6,
             line_thickness=5)
    plt.imshow(img)
    plt.show()

# targetn = []
# for index in range(0, len(train_data_set)):
#     try:
#         img, target = train_data_set[index]
#         targetn.append(target["labels"].numpy())
#     except:
#         pass

voc/train_list.txt
voc/labels.txt
['flv', 'gx', 'mbw']
1216
[Output: five figures, each a randomly sampled image from the custom dataset shown with its ground-truth boxes drawn]
That concludes this walkthrough of writing a custom Dataset and building your own VOC-style data reader.