Object Detection
Bounding Boxes
A bounding box can be defined by 4 numbers:
(top-left x, top-left y, bottom-right x, bottom-right y) or (top-left x, top-left y, width, height)
Object Detection Datasets
Summary
Object detection identifies the categories and positions of multiple objects in an image
Positions are usually represented by bounding boxes
Code implementation of bounding-box operations
%matplotlib inline
import torch
from d2l import torch as d2l

d2l.set_figsize()
img = d2l.plt.imread('../img/catdog.jpg')
d2l.plt.imshow(img);
def box_corner_to_center(boxes):
    """Convert from (upper-left, lower-right) to (center, width, height)."""
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    cx = (x1 + x2) / 2
    cy = (y1 + y2) / 2
    w = x2 - x1
    h = y2 - y1
    boxes = torch.stack((cx, cy, w, h), axis=-1)
    return boxes

def box_center_to_corner(boxes):
    """Convert from (center, width, height) to (upper-left, lower-right)."""
    cx, cy, w, h = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    x1 = cx - 0.5 * w
    y1 = cy - 0.5 * h
    x2 = cx + 0.5 * w
    y2 = cy + 0.5 * h
    boxes = torch.stack((x1, y1, x2, y2), axis=-1)
    return boxes
# Bounding boxes of the dog and the cat, as (x1, y1, x2, y2) in pixels
dog_bbox, cat_bbox = [60.0, 45.0, 378.0, 516.0], [400.0, 112.0, 655.0, 493.0]
boxes = torch.tensor((dog_bbox, cat_bbox))
# A round-trip conversion should recover the original boxes
box_center_to_corner(box_corner_to_center(boxes)) == boxes
def bbox_to_rect(bbox, color):
    """Convert an (x1, y1, x2, y2) bounding box to a matplotlib Rectangle."""
    return d2l.plt.Rectangle(xy=(bbox[0], bbox[1]), width=bbox[2] - bbox[0],
                             height=bbox[3] - bbox[1], fill=False,
                             edgecolor=color, linewidth=2)

fig = d2l.plt.imshow(img)
fig.axes.add_patch(bbox_to_rect(dog_bbox, 'blue'))
fig.axes.add_patch(bbox_to_rect(cat_bbox, 'red'));
import os
import pandas as pd
import torchvision

def read_data_bananas(is_train=True):
    """Read the images and labels of the banana detection dataset."""
    data_dir = d2l.download_extract('banana-detection')
    csv_fname = os.path.join(data_dir,
                             'bananas_train' if is_train else 'bananas_val',
                             'label.csv')
    csv_data = pd.read_csv(csv_fname)
    csv_data = csv_data.set_index('img_name')
    images, targets = [], []
    for img_name, target in csv_data.iterrows():
        images.append(
            torchvision.io.read_image(
                os.path.join(data_dir,
                             'bananas_train' if is_train else 'bananas_val',
                             'images', f'{img_name}')))
        targets.append(list(target))
    # Each image contains one banana, so unsqueeze adds an object axis;
    # dividing by 256 (the image edge length) normalizes the coordinates
    return images, torch.tensor(targets).unsqueeze(1) / 256
class BananasDataset(torch.utils.data.Dataset):
    """A customized dataset for loading the banana detection dataset."""
    def __init__(self, is_train):
        self.features, self.labels = read_data_bananas(is_train)
        print('read ' + str(len(self.features)) + (
            ' training examples' if is_train else ' validation examples'))

    def __getitem__(self, idx):
        return (self.features[idx].float(), self.labels[idx])

    def __len__(self):
        return len(self.features)
def load_data_bananas(batch_size):
    """Load the banana detection dataset."""
    train_iter = torch.utils.data.DataLoader(BananasDataset(is_train=True),
                                             batch_size, shuffle=True)
    val_iter = torch.utils.data.DataLoader(BananasDataset(is_train=False),
                                           batch_size)
    return train_iter, val_iter

batch_size, edge_size = 32, 256
train_iter, _ = load_data_bananas(batch_size)
batch = next(iter(train_iter))
batch[0].shape, batch[1].shape

# Show ten images with their (denormalized) ground-truth boxes
imgs = (batch[0][0:10].permute(0, 2, 3, 1)) / 255
axes = d2l.show_images(imgs, 2, 5, scale=2)
for ax, label in zip(axes, batch[1][0:10]):
    d2l.show_bboxes(ax, [label[0][1:5] * edge_size], colors=['w'])
Anchor Boxes
Definition
One class of object detection algorithms is based on anchor boxes:
Propose multiple regions, called anchor boxes, as candidate bounding boxes
Predict whether each anchor box contains an object of interest
If it does, predict the offset from this anchor box to the ground-truth bounding box
IoU (Intersection over Union): from anchor boxes to actual predicted boxes
IoU was originally meant to measure the similarity of two boxes:
J(A, B)=\frac{|A \bigcap B|}{|A \bigcup B|}
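As a quick sanity check, here is a minimal, pure-Python sketch of this formula for corner-format boxes; the two boxes at the end are made up for illustration:

def iou(box_a, box_b):
    """IoU of two boxes given as (x1, y1, x2, y2)."""
    # Intersection rectangle: max of the upper-left corners,
    # min of the lower-right corners
    x1, y1 = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    x2, y2 = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter)

print(iou([0, 0, 2, 2], [1, 1, 3, 3]))  # intersection 1, union 7 -> ~0.143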
Assigning labels to anchor boxes
Each anchor box is a training example
Each anchor box is either labeled as background or associated with one ground-truth bounding box
We may generate a large number of anchor boxes
An intuitive view of anchor boxes
Anchor boxes are the training examples that we want to fit to the annotated bounding boxes
Because an anchor box is itself a training example, one image can define many anchor boxes and thus be trained on many times
Producing output with non-maximum suppression (NMS)
Each anchor box predicts one bounding box, and NMS can merge similar predictions:
Select the non-background prediction with the largest confidence value
Remove all other predictions whose IoU with it is greater than a threshold \theta
Repeat this process until every prediction is either selected or removed (sketched below)
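A tiny greedy sketch of this loop on made-up boxes and scores, reusing the iou helper from the sketch above (a vectorized nms appears in the code section later):

def nms_sketch(boxes, scores, iou_thresh=0.5):
    """Greedy NMS over corner-format boxes with confidence scores."""
    order = sorted(range(len(scores)), key=lambda i: -scores[i])
    keep = []
    while order:
        i = order.pop(0)  # the highest-confidence remaining prediction
        keep.append(i)
        # Drop every remaining prediction that overlaps box i too much
        order = [j for j in order if iou(boxes[i], boxes[j]) <= iou_thresh]
    return keep

boxes = [[0, 0, 2, 2], [0.1, 0.1, 2, 2], [3, 3, 5, 5]]
scores = [0.9, 0.8, 0.7]
print(nms_sketch(boxes, scores))  # box 1 overlaps box 0 heavily -> [0, 2]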
Summary
Anchor boxes are self-defined training examples, while bounding boxes are examples that have already been annotated
Full workflow
Compute the IoU between anchor boxes and ground-truth bounding boxes, making sure every ground-truth box is matched to at least one anchor box; anchors whose IoU does not reach the threshold are set to background
Among the selected boxes, pick the box with the largest confidence for each object class, compute its IoU against the other predictions of the same class, and filter out all whose IoU exceeds a threshold, which guarantees that only one box appears around each object
The official approach
First generate a large number of anchor boxes and label them; each anchor box serves as one training example
At prediction time, use NMS to remove redundant predictions
Code: anchor-box techniques
%matplotlib inline
import torch
from d2l import torch as d2l

torch.set_printoptions(2)
def multibox_prior(data, sizes, ratios):
    """Generate anchor boxes with different shapes centered on each pixel."""
    in_height, in_width = data.shape[-2:]
    device, num_sizes, num_ratios = data.device, len(sizes), len(ratios)
    boxes_per_pixel = (num_sizes + num_ratios - 1)
    size_tensor = torch.tensor(sizes, device=device)
    ratio_tensor = torch.tensor(ratios, device=device)
    # Offsets are required to move the anchor to the center of a pixel
    offset_h, offset_w = 0.5, 0.5
    steps_h = 1.0 / in_height  # Scaled steps in the y-axis
    steps_w = 1.0 / in_width  # Scaled steps in the x-axis
    # Generate all center points of the anchor boxes
    center_h = (torch.arange(in_height, device=device) + offset_h) * steps_h
    center_w = (torch.arange(in_width, device=device) + offset_w) * steps_w
    shift_y, shift_x = torch.meshgrid(center_h, center_w)
    shift_y, shift_x = shift_y.reshape(-1), shift_x.reshape(-1)
    # Generate `boxes_per_pixel` widths and heights that are later used to
    # create the anchor box corner coordinates (xmin, ymin, xmax, ymax)
    w = torch.cat((size_tensor * torch.sqrt(ratio_tensor[0]),
                   sizes[0] * torch.sqrt(ratio_tensor[1:]))) \
        * in_height / in_width  # Handle rectangular inputs
    h = torch.cat((size_tensor / torch.sqrt(ratio_tensor[0]),
                   sizes[0] / torch.sqrt(ratio_tensor[1:])))
    # Divide by 2 to get the half height and half width
    anchor_manipulations = torch.stack(
        (-w, -h, w, h)).T.repeat(in_height * in_width, 1) / 2
    # Each center point will have `boxes_per_pixel` anchor boxes
    out_grid = torch.stack([shift_x, shift_y, shift_x, shift_y],
                           dim=1).repeat_interleave(boxes_per_pixel, dim=0)
    output = out_grid + anchor_manipulations
    return output.unsqueeze(0)
img = d2l.plt.imread('../img/catdog.jpg')
h, w = img.shape[:2]
print(h, w)
X = torch.rand(size=(1, 3, h, w))
Y = multibox_prior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])
Y.shape

# Access all anchors centered on pixel (250, 250);
# there are 5 = num_sizes + num_ratios - 1 anchors per pixel
boxes = Y.reshape(h, w, 5, 4)
boxes[250, 250, 0, :]
def show_bboxes(axes, bboxes, labels=None, colors=None):
    """Show all bounding boxes."""
    def _make_list(obj, default_values=None):
        if obj is None:
            obj = default_values
        elif not isinstance(obj, (list, tuple)):
            obj = [obj]
        return obj

    labels = _make_list(labels)
    colors = _make_list(colors, ['b', 'g', 'r', 'm', 'c'])
    for i, bbox in enumerate(bboxes):
        color = colors[i % len(colors)]
        rect = d2l.bbox_to_rect(bbox.detach().numpy(), color)
        axes.add_patch(rect)
        if labels and len(labels) > i:
            text_color = 'k' if color == 'w' else 'w'
            axes.text(rect.xy[0], rect.xy[1], labels[i], va='center',
                      ha='center', fontsize=9, color=text_color,
                      bbox=dict(facecolor=color, lw=0))
d2l.set_figsize()
bbox_scale = torch.tensor((w, h, w, h))
fig = d2l.plt.imshow(img)
show_bboxes(fig.axes, boxes[250, 250, :, :] * bbox_scale, [
    's=0.75, r=1', 's=0.5, r=1', 's=0.25, r=1', 's=0.75, r=2', 's=0.75, r=0.5'
])
def box_iou(boxes1, boxes2):
    """Compute pairwise IoU across two lists of anchor or bounding boxes."""
    box_area = lambda boxes: ((boxes[:, 2] - boxes[:, 0]) *
                              (boxes[:, 3] - boxes[:, 1]))
    areas1 = box_area(boxes1)
    areas2 = box_area(boxes2)
    # Broadcasting gives the pairwise intersection corners,
    # shape: (no. of boxes1, no. of boxes2, 2)
    inter_upperlefts = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    inter_lowerrights = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])
    inters = (inter_lowerrights - inter_upperlefts).clamp(min=0)
    inter_areas = inters[:, :, 0] * inters[:, :, 1]
    union_areas = areas1[:, None] + areas2 - inter_areas
    return inter_areas / union_areas
def assign_anchor_to_bbox(ground_truth, anchors, device, iou_threshold=0.5):
    """Assign the closest ground-truth bounding box to each anchor box."""
    num_anchors, num_gt_boxes = anchors.shape[0], ground_truth.shape[0]
    # Element x_ij is the IoU of anchor i and ground-truth box j
    jaccard = box_iou(anchors, ground_truth)
    # Initially every anchor is unassigned (-1, i.e. background)
    anchors_bbox_map = torch.full((num_anchors,), -1, dtype=torch.long,
                                  device=device)
    # Assign a ground-truth box to each anchor whose best IoU passes the threshold
    max_ious, indices = torch.max(jaccard, dim=1)
    anc_i = torch.nonzero(max_ious >= iou_threshold).reshape(-1)
    box_j = indices[max_ious >= iou_threshold]
    anchors_bbox_map[anc_i] = box_j
    # Then make sure every ground-truth box gets at least one anchor:
    # repeatedly take the globally largest IoU and discard its row and column
    col_discard = torch.full((num_anchors,), -1)
    row_discard = torch.full((num_gt_boxes,), -1)
    for _ in range(num_gt_boxes):
        max_idx = torch.argmax(jaccard)  # index into the flattened matrix
        box_idx = (max_idx % num_gt_boxes).long()
        anc_idx = (max_idx / num_gt_boxes).long()
        anchors_bbox_map[anc_idx] = box_idx
        jaccard[:, box_idx] = col_discard
        jaccard[anc_idx, :] = row_discard
    return anchors_bbox_map
def offset_boxes(anchors, assigned_bb, eps=1e-6):
    """Transform anchor boxes into the offsets of their assigned boxes."""
    c_anc = d2l.box_corner_to_center(anchors)
    c_assigned_bb = d2l.box_corner_to_center(assigned_bb)
    offset_xy = 10 * (c_assigned_bb[:, :2] - c_anc[:, :2]) / c_anc[:, 2:]
    offset_wh = 5 * torch.log(eps + c_assigned_bb[:, 2:] / c_anc[:, 2:])
    offset = torch.cat([offset_xy, offset_wh], axis=1)
    return offset
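Written out, the transform above maps an anchor box (x_a, y_a, w_a, h_a) and its assigned ground-truth box (x_b, y_b, w_b, h_b), both in center format, to

\left( \frac{10(x_b - x_a)}{w_a},\ \frac{10(y_b - y_a)}{h_a},\ 5\log\frac{w_b}{w_a},\ 5\log\frac{h_b}{h_a} \right)

The eps term only guards the logarithm; the constants 10 and 5 rescale the offsets to a range that is easier to regress, and offset_inverse further below undoes exactly this transform.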
def multibox_target(anchors, labels):
    """Label anchor boxes using ground-truth bounding boxes."""
    batch_size, anchors = labels.shape[0], anchors.squeeze(0)
    batch_offset, batch_mask, batch_class_labels = [], [], []
    device, num_anchors = anchors.device, anchors.shape[0]
    for i in range(batch_size):
        label = labels[i, :, :]
        anchors_bbox_map = assign_anchor_to_bbox(label[:, 1:], anchors,
                                                 device)
        # Mask out the four offset coordinates of background anchors
        bbox_mask = ((anchors_bbox_map >= 0).float().unsqueeze(-1)).repeat(
            1, 4)
        class_labels = torch.zeros(num_anchors, dtype=torch.long,
                                   device=device)
        assigned_bb = torch.zeros((num_anchors, 4), dtype=torch.float32,
                                  device=device)
        # Label the classes of assigned anchors; index 0 is reserved for
        # background, so ground-truth classes are shifted by one
        indices_true = torch.nonzero(anchors_bbox_map >= 0)
        bb_idx = anchors_bbox_map[indices_true]
        class_labels[indices_true] = label[bb_idx, 0].long() + 1
        assigned_bb[indices_true] = label[bb_idx, 1:]
        # Offset transformation
        offset = offset_boxes(anchors, assigned_bb) * bbox_mask
        batch_offset.append(offset.reshape(-1))
        batch_mask.append(bbox_mask.reshape(-1))
        batch_class_labels.append(class_labels)
    bbox_offset = torch.stack(batch_offset)
    bbox_mask = torch.stack(batch_mask)
    class_labels = torch.stack(batch_class_labels)
    return (bbox_offset, bbox_mask, class_labels)
# Ground-truth boxes: the first element is the class (0 = dog, 1 = cat),
# the rest is (x1, y1, x2, y2) in coordinates relative to the image
ground_truth = torch.tensor([[0, 0.1, 0.08, 0.52, 0.92],
                             [1, 0.55, 0.2, 0.9, 0.88]])
anchors = torch.tensor([[0, 0.1, 0.2, 0.3], [0.15, 0.2, 0.4, 0.4],
                        [0.63, 0.05, 0.88, 0.98], [0.66, 0.45, 0.8, 0.8],
                        [0.57, 0.3, 0.92, 0.9]])

fig = d2l.plt.imshow(img)
show_bboxes(fig.axes, ground_truth[:, 1:] * bbox_scale, ['dog', 'cat'], 'k')
show_bboxes(fig.axes, anchors * bbox_scale, ['0', '1', '2', '3', '4']);

labels = multibox_target(anchors.unsqueeze(dim=0),
                         ground_truth.unsqueeze(dim=0))
labels[2]  # class labels
labels[1]  # mask (zeros for background anchors)
labels[0]  # offsets
def offset_inverse(anchors, offset_preds):
    """Predict bounding boxes from anchor boxes and predicted offsets."""
    anc = d2l.box_corner_to_center(anchors)
    pred_bbox_xy = (offset_preds[:, :2] * anc[:, 2:] / 10) + anc[:, :2]
    pred_bbox_wh = torch.exp(offset_preds[:, 2:] / 5) * anc[:, 2:]
    pred_bbox = torch.cat((pred_bbox_xy, pred_bbox_wh), axis=1)
    predicted_bbox = d2l.box_center_to_corner(pred_bbox)
    return predicted_bbox
def nms(boxes, scores, iou_threshold):
    """Sort predicted boxes by confidence and perform non-maximum suppression."""
    B = torch.argsort(scores, dim=-1, descending=True)
    keep = []  # Indices of the predicted bounding boxes that survive
    while B.numel() > 0:
        i = B[0]
        keep.append(i)
        if B.numel() == 1: break
        iou = box_iou(boxes[i, :].reshape(-1, 4),
                      boxes[B[1:], :].reshape(-1, 4)).reshape(-1)
        inds = torch.nonzero(iou <= iou_threshold).reshape(-1)
        B = B[inds + 1]
    return torch.tensor(keep, device=boxes.device)
def multibox_detection(cls_probs, offset_preds, anchors, nms_threshold=0.5,
                       pos_threshold=0.009999999):
    """Predict bounding boxes using non-maximum suppression."""
    device, batch_size = cls_probs.device, cls_probs.shape[0]
    anchors = anchors.squeeze(0)
    num_classes, num_anchors = cls_probs.shape[1], cls_probs.shape[2]
    out = []
    for i in range(batch_size):
        cls_prob, offset_pred = cls_probs[i], offset_preds[i].reshape(-1, 4)
        # Highest non-background class probability per anchor
        conf, class_id = torch.max(cls_prob[1:], 0)
        predicted_bb = offset_inverse(anchors, offset_pred)
        keep = nms(predicted_bb, conf, nms_threshold)
        # Find the indices that nms did not keep and set their class to
        # background (-1)
        all_idx = torch.arange(num_anchors, dtype=torch.long, device=device)
        combined = torch.cat((keep, all_idx))
        uniques, counts = combined.unique(return_counts=True)
        non_keep = uniques[counts == 1]
        all_id_sorted = torch.cat((keep, non_keep))
        class_id[non_keep] = -1
        class_id = class_id[all_id_sorted]
        conf, predicted_bb = conf[all_id_sorted], predicted_bb[all_id_sorted]
        # Also drop predictions whose confidence is below `pos_threshold`
        below_min_idx = (conf < pos_threshold)
        class_id[below_min_idx] = -1
        conf[below_min_idx] = 1 - conf[below_min_idx]
        pred_info = torch.cat(
            (class_id.unsqueeze(1), conf.unsqueeze(1), predicted_bb), dim=1)
        out.append(pred_info)
    return torch.stack(out)
anchors = torch.tensor([[0.1, 0.08, 0.52, 0.92], [0.08, 0.2, 0.56, 0.95],
                        [0.15, 0.3, 0.62, 0.91], [0.55, 0.2, 0.9, 0.88]])
offset_preds = torch.tensor([0] * anchors.numel())
cls_probs = torch.tensor([[0] * 4,  # Predicted background likelihood
                          [0.9, 0.8, 0.7, 0.1],  # Predicted dog likelihood
                          [0.1, 0.2, 0.3, 0.9]])  # Predicted cat likelihood

fig = d2l.plt.imshow(img)
show_bboxes(fig.axes, anchors * bbox_scale,
            ['dog=0.9', 'dog=0.8', 'dog=0.7', 'cat=0.9'])

output = multibox_detection(cls_probs.unsqueeze(dim=0),
                            offset_preds.unsqueeze(dim=0),
                            anchors.unsqueeze(dim=0), nms_threshold=0.5)
output
fig = d2l.plt.imshow(img)
for i in output[0].detach().numpy():
    if i[0] == -1:
        continue
    label = ('dog=', 'cat=')[int(i[0])] + str(i[1])
    show_bboxes(fig.axes, [torch.tensor(i[2:]) * bbox_scale], label)
Overview of object detection algorithms
Region-based CNNs (R-CNN)
R-CNN: run a CNN on each anchor box
Use a heuristic search algorithm to select anchor boxes
Use a pretrained model to extract features from each anchor box
Train an SVM to classify the category
Train a linear regression model to predict the bounding-box offset
RoI (region of interest) pooling
Given an anchor box, evenly divide it into an n×m grid and output the maximum value in each block
No matter how large the anchor box is, the output always has n×m values (see the sketch below)
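A minimal sketch of RoI pooling with torchvision's built-in operator; the toy feature map and region below are made up for illustration:

import torch
from torchvision.ops import roi_pool

# A toy feature map of shape (batch, channels, height, width)
X = torch.arange(16, dtype=torch.float32).reshape(1, 1, 4, 4)
# One region, given as (batch_index, x1, y1, x2, y2) in feature-map coordinates
rois = torch.tensor([[0.0, 0.0, 0.0, 3.0, 3.0]])
# No matter how large the region is, the output is a fixed 2x2 grid of maxima
print(roi_pool(X, rois, output_size=(2, 2), spatial_scale=1.0))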
Fast R-CNN: extract CNN features over the whole image once, then pool RoI features for each anchor box
Use a CNN to extract features from the whole image
Use an RoI pooling layer to produce a fixed-length feature for each anchor box
Faster R-CNN
Use a region proposal network (RPN) in place of heuristic search to obtain better anchor boxes
Pipeline: a CNN extracts features, the RPN solves a binary classification problem to judge whether each anchor box is a reasonable proposal, then RoI pooling follows (a usage sketch follows below)
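As a usage-level sketch (not the training recipe above), torchvision ships a ready-made Faster R-CNN; this assumes torchvision >= 0.13 for the weights argument:

import torch
import torchvision

# Faster R-CNN with a ResNet-50 FPN backbone, pretrained on COCO
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights='DEFAULT')
model.eval()

# The model takes a list of (C, H, W) tensors; this one is random noise
images = [torch.rand(3, 480, 640)]
with torch.no_grad():
    preds = model(images)
# Each prediction dict holds 'boxes' (x1, y1, x2, y2), 'labels' and 'scores'
print(preds[0]['boxes'].shape, preds[0]['scores'].shape)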
Mask R-CNN
Single Shot Multibox Detection (SSD)
Core idea: end-to-end prediction
Generating anchor boxes
For each pixel, generate multiple anchor boxes centered on it
Given n sizes s_1, s_2, \dots, s_n and m aspect ratios, generate n+m-1 anchor boxes per pixel, whose sizes and aspect ratios are
(s_1, r_1), (s_2, r_1), \dots, (s_n, r_1), \dots, (s_1, r_m)
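A tiny sketch of how these n+m-1 pairs are enumerated, matching what multibox_prior above does; the concrete sizes and ratios are arbitrary:

sizes, ratios = [0.75, 0.5, 0.25], [1, 2, 0.5]  # n = 3, m = 3
# All sizes paired with the first ratio, then the first size with the rest
pairs = [(s, ratios[0]) for s in sizes] + [(sizes[0], r) for r in ratios[1:]]
print(len(pairs), pairs)  # 5 pairs, i.e. n + m - 1 = 3 + 3 - 1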
Approach
A base network extracts features, then multiple convolutional blocks halve the height and width
Anchor boxes are generated at every stage (see the sketch below)
The category and the bounding box are predicted for each anchor box
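A minimal sketch of multi-scale anchor generation, reusing multibox_prior from the code section above; the feature-map sizes and the single anchor size per stage are made up:

# Feature maps shrink as the blocks halve the height and width
for fh, fw in [(32, 32), (16, 16), (8, 8)]:
    fmap = torch.zeros((1, 10, fh, fw))  # channel count does not matter here
    anchors = multibox_prior(fmap, sizes=[0.15], ratios=[1, 2, 0.5])
    print(f'{fh}x{fw} feature map -> {anchors.shape[1]} anchors')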
Summary
SSD detects objects with a single neural network
It generates multiple anchor boxes centered on every pixel
It performs multi-scale detection on the outputs of multiple stages
YOLO
In SSD the anchor boxes overlap heavily, because anchors are generated at every pixel, which wastes a lot of computation (see the comparison sketch below)
YOLO instead divides the image evenly into S×S grid cells
Each cell predicts B bounding boxes
Later versions (v2, v3, v4, \dots)
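A back-of-the-envelope comparison of the two strategies; the image size matches the catdog example above, while S and B follow the original YOLO paper's choice:

h, w = 561, 728          # image size used earlier in this section
boxes_per_pixel = 5      # n + m - 1, as in the multibox_prior example
print('SSD-style anchors at full resolution:', h * w * boxes_per_pixel)
S, B = 7, 2              # YOLO grid size and boxes per cell
print('YOLO boxes:', S * S * B)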