COCO 读取器
此读取器算子读取 COCO 数据集或 COCO 的子集,它由一个注解文件和图像目录组成。
`DALI_EXTRA_PATH` 环境变量应指向从 DALI extra 仓库下载的数据所在的位置。
重要提示:请确保检出与已安装 DALI 版本对应的正确发布标签。
[1]:
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.fn as fn
import nvidia.dali.types as types
import numpy as np
import os.path
# Root of the DALI extra checkout; a missing DALI_EXTRA_PATH raises KeyError.
test_data_root = os.environ["DALI_EXTRA_PATH"]

# The image directory and the annotation file both live under <root>/db/coco.
file_root, annotations_file = (
    os.path.join(test_data_root, "db", "coco", leaf)
    for leaf in ("images", "instances.json")
)

# Number of samples per pipeline run.
batch_size = 16
[2]:
# Pipeline that reads COCO samples and decodes the encoded images.
pipe = Pipeline(batch_size=batch_size, num_threads=4, device_id=0)
with pipe:
    # The reader yields five outputs here: encoded images, boxes, labels and
    # the two mask outputs (polygons, vertices).
    # NOTE(review): polygon_masks=True is presumably what enables the two mask
    # outputs, and ratio=True presumably makes coordinates relative to the
    # image size (the plotting code scales them by width/height) - confirm
    # against the DALI operator documentation.
    reader_outputs = fn.readers.coco(
        file_root=file_root,
        annotations_file=annotations_file,
        polygon_masks=True,
        ratio=True,
    )
    jpegs, bboxes, labels, polygons, vertices = reader_outputs

    # Decode the encoded images to RGB with the "mixed" backend.
    images = fn.decoders.image(jpegs, device="mixed", output_type=types.RGB)

    pipe.set_outputs(images, bboxes, labels, polygons, vertices)
[3]:
# Build the pipeline and run it once to obtain a single batch.
pipe.build()
pipe_out = pipe.run()

# Outputs arrive in the order passed to pipe.set_outputs().
images_gpu, bboxes_cpu, labels_cpu, polygons_cpu, vertices_cpu = pipe_out

# Only the decoded images need an explicit copy to host memory before they
# can be indexed per sample with .at().
images_cpu = images_gpu.as_cpu()
该算子将边界框作为 `float` 类型的二维数组返回。默认情况下,内部维度包含 `[x, y, width, height]`。当 `ltrb` 参数设置为 `True` 时,内部维度包含 `[left, top, right, bottom]`。
[4]:
# Inspect the boxes and labels of the sample at index 4 of the batch.
bboxes = bboxes_cpu.at(4)
labels = labels_cpu.at(4)

# Each box row unpacks to (x, y, width, height).
for (x, y, width, height), label in zip(bboxes, labels):
    message = (
        f"Bounding box (x={x}, y={y}, width={width}, height={height}), "
        f"label={label}"
    )
    print(message)
Bounding box (x=0.125, y=0.1794569045305252, width=0.3226562440395355, height=0.46871310472488403), label=1
掩码多边形由两个输出定义:polygons 和 vertices。polygons 中的每个条目包含三个整数,分别表示多边形所属的掩码索引、起始顶点索引和多边形所属的最后一个顶点索引之后的一个索引。vertices 中的每个条目包含表示多边形顶点的坐标 (x, y)。
[5]:
# Mask data of the sample at index 4: polygon descriptors and their vertices.
polygons, vertices = polygons_cpu.at(4), vertices_cpu.at(4)
print(polygons.shape, vertices.shape)
(1, 3) (26, 2)
[6]:
# Print every polygon of the sample together with its own vertices.
# Each polygon row is (mask_idx, start_vertex, end_vertex), where end_vertex
# is one past the last vertex belonging to the polygon.
for mask_idx, start_vertex, end_vertex in polygons:
    nvertices = end_vertex - start_vertex
    print(
        f"Polygon belonging to mask index {mask_idx} containing {nvertices} "
        "vertices:"
    )
    # Slice out this polygon's vertices and iterate over the slice. Indexing
    # the full `vertices` array with the local vertex id would print the
    # vertices of the first polygon for every polygon.
    polygon_vertices = vertices[start_vertex:end_vertex]
    for vertex_id, (x, y) in enumerate(polygon_vertices):
        print(f"Vertex {vertex_id}: x={x}, y={y}")
Polygon belonging to mask index 0 containing 26 vertices:
Vertex 0: x=0.17783120274543762, y=0.23328186571598053
Vertex 1: x=0.1417735069990158, y=0.29483649134635925
Vertex 2: x=0.12574785947799683, y=0.38767293095588684
Vertex 3: x=0.13042201101779938, y=0.48959121108055115
Vertex 4: x=0.15646366775035858, y=0.572336733341217
Vertex 5: x=0.19853098690509796, y=0.6238003969192505
Vertex 6: x=0.23392093181610107, y=0.6490277647972107
Vertex 7: x=0.3073717951774597, y=0.6460005044937134
Vertex 8: x=0.36479702591896057, y=0.6449913382530212
Vertex 9: x=0.41554489731788635, y=0.6016004085540771
Vertex 10: x=0.4435897469520569, y=0.5097730755805969
Vertex 11: x=0.44826388359069824, y=0.4018002450466156
Vertex 12: x=0.43223825097084045, y=0.31199103593826294
Vertex 13: x=0.39684829115867615, y=0.24034550786018372
Vertex 14: x=0.35611647367477417, y=0.20502729713916779
Vertex 15: x=0.3213942348957062, y=0.20200002193450928
Vertex 16: x=0.2986912429332733, y=0.20300911366939545
Vertex 17: x=0.30069443583488464, y=0.18585455417633057
Vertex 18: x=0.2960202991962433, y=0.17980000376701355
Vertex 19: x=0.2873397469520569, y=0.17980000376701355
Vertex 20: x=0.28600427508354187, y=0.1848454624414444
Vertex 21: x=0.2873397469520569, y=0.19695456326007843
Vertex 22: x=0.2893429398536682, y=0.20401820540428162
Vertex 23: x=0.2893429398536682, y=0.20906366407871246
Vertex 24: x=0.22123396396636963, y=0.22621823847293854
Vertex 25: x=0.21589210629463196, y=0.22016368806362152
要在图像上可视化 ground truth 边界框和掩码多边形,请执行以下操作:
[7]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import random
# Seed the RNG so the random per-category colours drawn in plot_sample are
# reproducible from run to run.
random.seed(1231243)
def plot_sample(img_index, ax):
    """Draw one sample with its ground-truth boxes and mask polygons.

    Reads the sample from the module-level pipeline outputs ``images_cpu``,
    ``bboxes_cpu``, ``labels_cpu``, ``polygons_cpu`` and ``vertices_cpu``.

    Args:
        img_index: Index of the sample within the batch.
        ax: Matplotlib axes the image and the patches are drawn on.
    """
    img = images_cpu.at(img_index)
    height, width = img.shape[:2]
    ax.imshow(img)

    bboxes = bboxes_cpu.at(img_index)
    labels = labels_cpu.at(img_index)
    polygons = polygons_cpu.at(img_index)
    vertices = vertices_cpu.at(img_index)

    # One random RGB colour per distinct category present in this sample.
    category_id_to_color = {
        cat_id: [random.uniform(0, 1) for _ in range(3)]
        for cat_id in set(labels)
    }

    # Box and vertex coordinates are scaled by the image size before drawing.
    for (x, y, box_width, box_height), label in zip(bboxes, labels):
        rect = patches.Rectangle(
            (x * width, y * height),
            box_width * width,
            box_height * height,
            linewidth=1,
            edgecolor=category_id_to_color[label],
            facecolor="none",
        )
        ax.add_patch(rect)

    for mask_idx, start_vertex, end_vertex in polygons:
        polygon_vertices = vertices[start_vertex:end_vertex] * [width, height]
        # Colour the mask by the category of the object it belongs to rather
        # than by whatever label the box loop above happened to end on.
        # NOTE(review): assumes mask_idx indexes the objects of this sample in
        # the same order as bboxes/labels - confirm against the DALI docs.
        poly = patches.Polygon(
            polygon_vertices,
            closed=True,  # keyword-only in current Matplotlib releases
            facecolor=category_id_to_color[labels[mask_idx]],
            alpha=0.7,
        )
        ax.add_patch(poly)
# Show four samples of the batch on a 2x2 grid.
fig, ax = plt.subplots(2, 2, figsize=(12, 12))
fig.tight_layout()

# ax.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for sample_index, subplot in zip((2, 1, 4, 8), ax.flat):
    plot_sample(sample_index, subplot)

plt.show()
[ ]: