mirror of https://github.com/davisking/dlib.git
Added an option to do bounding box regression to the loss_mmod layer.
parent 16c96bc51f
commit 7b006f3769

 dlib/dnn/loss.h | 145
@@ -702,6 +702,8 @@ namespace dlib
         double truth_match_iou_threshold = 0.5;
         test_box_overlap overlaps_nms = test_box_overlap(0.4);
         test_box_overlap overlaps_ignore;
+        bool use_bounding_box_regression = false;
+        double bbr_lambda = 100;

         use_image_pyramid assume_image_pyramid = use_image_pyramid::yes;

@@ -937,7 +939,7 @@ namespace dlib

     inline void serialize(const mmod_options& item, std::ostream& out)
     {
-        int version = 3;
+        int version = 4;

         serialize(version, out);
         serialize(item.detector_windows, out);
@@ -947,13 +949,15 @@ namespace dlib
         serialize(item.overlaps_nms, out);
         serialize(item.overlaps_ignore, out);
         serialize(static_cast<uint8_t>(item.assume_image_pyramid), out);
+        serialize(item.use_bounding_box_regression, out);
+        serialize(item.bbr_lambda, out);
     }

     inline void deserialize(mmod_options& item, std::istream& in)
     {
         int version = 0;
         deserialize(version, in);
-        if (version != 3 && version != 2 && version != 1)
+        if (!(1 <= version && version <= 4))
             throw serialization_error("Unexpected version found while deserializing dlib::mmod_options");
         if (version == 1)
         {
@@ -979,6 +983,13 @@ namespace dlib
             deserialize(assume_image_pyramid, in);
             item.assume_image_pyramid = static_cast<use_image_pyramid>(assume_image_pyramid);
         }
+        item.use_bounding_box_regression = mmod_options().use_bounding_box_regression; // use default value since this wasn't provided
+        item.bbr_lambda = mmod_options().bbr_lambda; // use default value since this wasn't provided
+        if (version >= 4)
+        {
+            deserialize(item.use_bounding_box_regression, in);
+            deserialize(item.bbr_lambda, in);
+        }
     }

 // ----------------------------------------------------------------------------------------
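The version bump above keeps old serialized mmod_options readable: streams written before this change (versions 1 through 3) simply leave the two new fields at their defaults, while version 4 streams carry them explicitly. A minimal round-trip sketch, assuming only the dlib API shown in this diff (the function name and the chosen values are illustrative, not part of the commit):

#include <sstream>
#include <dlib/dnn.h>

void mmod_options_roundtrip_sketch()
{
    dlib::mmod_options opts;                  // default-constructed options
    opts.use_bounding_box_regression = true;  // field added in this commit
    opts.bbr_lambda = 50;

    std::ostringstream sout;
    serialize(opts, sout);                    // writes version 4 plus the two BBR fields

    dlib::mmod_options opts2;
    std::istringstream sin(sout.str());
    deserialize(opts2, sin);                  // the version >= 4 branch restores both fields

    // Streams produced by older dlib versions (version <= 3) still deserialize; the BBR
    // fields just fall back to their defaults, as in the deserialize() code above.
}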
@@ -991,20 +1002,31 @@ namespace dlib

             intermediate_detection(
                 rectangle rect_
-            ) : rect(rect_) {}
+            ) : rect(rect_), rect_bbr(rect_) {}

             intermediate_detection(
                 rectangle rect_,
                 double detection_confidence_,
                 size_t tensor_offset_,
                 long channel
-            ) : rect(rect_), detection_confidence(detection_confidence_), tensor_offset(tensor_offset_), tensor_channel(channel) {}
+            ) : rect(rect_), rect_bbr(rect_), detection_confidence(detection_confidence_), tensor_offset(tensor_offset_), tensor_channel(channel) {}

+            // rect is the rectangle you get without any bounding box regression.  So it's
+            // the basic sliding window box (aka, the "anchor box").
             rectangle rect;
             double detection_confidence = 0;
             size_t tensor_offset = 0;
             long tensor_channel = 0;

+            // rect_bbr is rect plus the bounding box regression adjustment, so it's more
+            // accurate.  If bounding box regression is disabled it's just a copy of rect.
+            // The important thing about rect_bbr is that it's the rectangle we use for NMS.
+            drectangle rect_bbr;
+            size_t tensor_offset_dx = 0;
+            size_t tensor_offset_dy = 0;
+            size_t tensor_offset_dw = 0;
+            size_t tensor_offset_dh = 0;

             bool operator<(const intermediate_detection& item) const { return detection_confidence < item.detection_confidence; }
         };

@@ -1032,7 +1054,14 @@ namespace dlib
         ) const
         {
             const tensor& output_tensor = sub.get_output();
-            DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
+            if (options.use_bounding_box_regression)
+            {
+                DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size()*5);
+            }
+            else
+            {
+                DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
+            }
             DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
             DLIB_CASSERT(sub.sample_expansion_factor() == 1, sub.sample_expansion_factor());

@@ -1046,10 +1075,10 @@ namespace dlib
                 final_dets.clear();
                 for (unsigned long i = 0; i < dets_accum.size(); ++i)
                 {
-                    if (overlaps_any_box_nms(final_dets, dets_accum[i].rect))
+                    if (overlaps_any_box_nms(final_dets, dets_accum[i].rect_bbr))
                         continue;

-                    final_dets.push_back(mmod_rect(dets_accum[i].rect,
+                    final_dets.push_back(mmod_rect(dets_accum[i].rect_bbr,
                                                    dets_accum[i].detection_confidence,
                                                    options.detector_windows[dets_accum[i].tensor_channel].label));
                 }
@@ -1075,13 +1104,19 @@ namespace dlib
             DLIB_CASSERT(sub.sample_expansion_factor() == 1);
             DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
             DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
-            DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
+            if (options.use_bounding_box_regression)
+            {
+                DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size()*5);
+            }
+            else
+            {
+                DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
+            }

             double det_thresh_speed_adjust = 0;


             // we will scale the loss so that it doesn't get really huge
-            const double scale = 1.0/output_tensor.size();
+            const double scale = 1.0/(output_tensor.nr()*output_tensor.nc()*output_tensor.num_samples()*options.detector_windows.size());
             double loss = 0;

             float* g = grad.host_write_only();
@@ -1230,6 +1265,59 @@ namespace dlib
                         hit_truth_table[hittruth.second] = true;
                         final_dets.push_back(dets[i]);
                         loss -= options.loss_per_missed_target;
+
+                        // Now account for the BBR loss and gradient, if appropriate.
+                        if (options.use_bounding_box_regression)
+                        {
+                            double dx = out_data[dets[i].tensor_offset_dx];
+                            double dy = out_data[dets[i].tensor_offset_dy];
+                            double dw = out_data[dets[i].tensor_offset_dw];
+                            double dh = out_data[dets[i].tensor_offset_dh];
+
+                            dpoint p = dcenter(dets[i].rect_bbr);
+                            double w = dets[i].rect_bbr.width()-1;
+                            double h = dets[i].rect_bbr.height()-1;
+                            drectangle truth_box = (*truth)[hittruth.second].rect;
+                            dpoint p_truth = dcenter(truth_box);
+
+                            DLIB_CASSERT(w > 0);
+                            DLIB_CASSERT(h > 0);
+
+                            double target_dx = (p_truth.x() - p.x())/w;
+                            double target_dy = (p_truth.y() - p.y())/h;
+                            double target_dw = std::log((truth_box.width()-1)/w);
+                            double target_dh = std::log((truth_box.height()-1)/h);
+
+
+                            // Compute the smoothed L1 loss on the BBR outputs.  This loss
+                            // behaves like the MSE loss when the error is small and like
+                            // the L1 loss when it is large.
+                            dx = dx-target_dx;
+                            dy = dy-target_dy;
+                            dw = dw-target_dw;
+                            dh = dh-target_dh;

+                            // use smoothed L1
+                            double ldx = std::abs(dx)<1 ? 0.5*dx*dx : std::abs(dx)-0.5;
+                            double ldy = std::abs(dy)<1 ? 0.5*dy*dy : std::abs(dy)-0.5;
+                            double ldw = std::abs(dw)<1 ? 0.5*dw*dw : std::abs(dw)-0.5;
+                            double ldh = std::abs(dh)<1 ? 0.5*dh*dh : std::abs(dh)-0.5;
+
+                            loss += options.bbr_lambda*(ldx + ldy + ldw + ldh);
+
+                            // Now compute the derivatives of the smoothed L1 loss.
+                            ldx = put_in_range(-1,1, dx);
+                            ldy = put_in_range(-1,1, dy);
+                            ldw = put_in_range(-1,1, dw);
+                            ldh = put_in_range(-1,1, dh);
+
+
+                            // The smoothed L1 gradient goes into the gradient output.
+                            g[dets[i].tensor_offset_dx] += scale*options.bbr_lambda*ldx;
+                            g[dets[i].tensor_offset_dy] += scale*options.bbr_lambda*ldy;
+                            g[dets[i].tensor_offset_dw] += scale*options.bbr_lambda*ldw;
+                            g[dets[i].tensor_offset_dh] += scale*options.bbr_lambda*ldh;
+                        }
                     }
                     else
                     {
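For reference, the regression targets above follow the usual anchor-box parameterization (center offsets normalized by the anchor size, log-ratios for width and height), and each component is penalized with the smoothed (Huber-style) L1 loss. A standalone sketch of that penalty and its derivative, matching the expressions in the diff (the helper names here are illustrative, not dlib functions):

#include <algorithm>
#include <cmath>

// 0.5*x^2 when |x| < 1 (quadratic near zero), |x| - 0.5 otherwise (linear tails).
double smooth_l1(double x)
{
    return std::abs(x) < 1 ? 0.5*x*x : std::abs(x) - 0.5;
}

// Its derivative: x when |x| < 1, sign(x) otherwise, i.e. x clamped to [-1, 1],
// which is what put_in_range(-1,1, x) computes in the code above.
double smooth_l1_derivative(double x)
{
    return std::max(-1.0, std::min(1.0, x));
}

// The BBR term added to the loss for one matched detection is then
//   bbr_lambda * (smooth_l1(dx - target_dx) + smooth_l1(dy - target_dy)
//               + smooth_l1(dw - target_dw) + smooth_l1(dh - target_dh)).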
@@ -1299,6 +1387,9 @@ namespace dlib
            out << ", loss per FA:" << opts.loss_per_false_alarm;
            out << ", loss per miss:" << opts.loss_per_missed_target;
            out << ", truth match IOU thresh:" << opts.truth_match_iou_threshold;
+            out << ", use_bounding_box_regression:" << opts.use_bounding_box_regression;
+            if (opts.use_bounding_box_regression)
+                out << ", bbr_lambda:" << opts.bbr_lambda;
            out << ", overlaps_nms:("<<opts.overlaps_nms.get_iou_thresh()<<","<<opts.overlaps_nms.get_percent_covered_thresh()<<")";
            out << ", overlaps_ignore:("<<opts.overlaps_ignore.get_iou_thresh()<<","<<opts.overlaps_ignore.get_percent_covered_thresh()<<")";

@@ -1325,11 +1416,19 @@ namespace dlib
        ) const
        {
            DLIB_CASSERT(net.sample_expansion_factor() == 1,net.sample_expansion_factor());
-            DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
+            if (options.use_bounding_box_regression)
+            {
+                DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size()*5);
+            }
+            else
+            {
+                DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
+            }

            const float* out_data = output_tensor.host() + output_tensor.k()*output_tensor.nr()*output_tensor.nc()*i;
            // scan the final layer and output the positive scoring locations
            dets_accum.clear();
-            for (long k = 0; k < output_tensor.k(); ++k)
+            for (long k = 0; k < options.detector_windows.size(); ++k)
            {
                for (long r = 0; r < output_tensor.nr(); ++r)
                {
@@ -1343,6 +1442,28 @@ namespace dlib
                        rect = input_layer(net).tensor_space_to_image_space(input_tensor,rect);

                        dets_accum.push_back(intermediate_detection(rect, score, (k*output_tensor.nr() + r)*output_tensor.nc() + c, k));
+
+                        if (options.use_bounding_box_regression)
+                        {
+                            const auto offset = options.detector_windows.size() + k*4;
+                            dets_accum.back().tensor_offset_dx = ((offset+0)*output_tensor.nr() + r)*output_tensor.nc() + c;
+                            dets_accum.back().tensor_offset_dy = ((offset+1)*output_tensor.nr() + r)*output_tensor.nc() + c;
+                            dets_accum.back().tensor_offset_dw = ((offset+2)*output_tensor.nr() + r)*output_tensor.nc() + c;
+                            dets_accum.back().tensor_offset_dh = ((offset+3)*output_tensor.nr() + r)*output_tensor.nc() + c;
+
+                            // apply BBR to dets_accum.back()
+                            double dx = out_data[dets_accum.back().tensor_offset_dx];
+                            double dy = out_data[dets_accum.back().tensor_offset_dy];
+                            double dw = out_data[dets_accum.back().tensor_offset_dw];
+                            double dh = out_data[dets_accum.back().tensor_offset_dh];
+                            dw = std::exp(dw);
+                            dh = std::exp(dh);
+                            double w = rect.width()-1;
+                            double h = rect.height()-1;
+                            rect = translate_rect(rect, dpoint(dx*w,dy*h));
+                            rect = centered_drect(rect, w*dw+1, h*dh+1);
+                            dets_accum.back().rect_bbr = rect;
+                        }
                    }
                }
            }
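At inference time the code above reads the four regression channels for detector window k (they sit after the detector_windows.size() score channels, at channels detector_windows.size() + k*4 through +3) and warps the anchor rectangle by them. A simplified sketch of that warp on a plain box, ignoring the width-1/height-1 pixel bookkeeping used in the diff (the struct and function here are illustrative, not dlib types):

#include <cmath>

struct box { double cx, cy, w, h; };   // center and size of an axis-aligned box

// Invert the training-time parameterization: shift the center by a fraction of the
// anchor size and rescale the width and height multiplicatively.
box apply_bbr(const box& anchor, double dx, double dy, double dw, double dh)
{
    box out;
    out.cx = anchor.cx + dx*anchor.w;
    out.cy = anchor.cy + dy*anchor.h;
    out.w  = anchor.w*std::exp(dw);
    out.h  = anchor.h*std::exp(dh);
    return out;
}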
@@ -652,6 +652,21 @@ namespace dlib
        // However, sometimes scale-invariance may not be desired.
        use_image_pyramid assume_image_pyramid = use_image_pyramid::yes;

+        // By default, the mmod loss doesn't train any bounding box regression model.  But
+        // if you set use_bounding_box_regression == true then it expects the network to
+        // output a tensor with detector_windows.size()*5 channels rather than just
+        // detector_windows.size() channels.  The 4 extra channels per window are trained
+        // to give a bounding box regression output that improves the positioning of the
+        // output detection box.
+        bool use_bounding_box_regression = false;
+        // When using bounding box regression, bbr_lambda determines how much you care
+        // about getting the bounding box shape correct vs. just getting the detector to
+        // find objects.  That is, the objective function being optimized is
+        // basic_mmod_loss + bbr_lambda*bounding_box_regression_loss.  So setting
+        // bbr_lambda to a larger value will cause the overall loss to care more about
+        // getting the bounding box shape correct.
+        double bbr_lambda = 100;
+
        mmod_options (
            const std::vector<std::vector<mmod_rect>>& boxes,
            const unsigned long target_size,
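Putting the new options together: enabling bounding box regression is a matter of flipping the flag and making sure the network's final layer produces detector_windows.size()*5 channels instead of detector_windows.size(). A hedged usage sketch (the training labels, the 40x40 target sizes, and the wrapper function are placeholders, not part of this commit):

#include <vector>
#include <dlib/dnn.h>

void configure_bbr_options(const std::vector<std::vector<dlib::mmod_rect>>& training_labels)
{
    // One of the existing mmod_options constructors shown above: detector windows are
    // derived from the labels and the given target/min target sizes (values are examples).
    dlib::mmod_options options(training_labels, 40, 40);

    options.use_bounding_box_regression = true;  // the final layer must now output
                                                 // options.detector_windows.size()*5 channels
    options.bbr_lambda = 100;                    // weight of the BBR term, i.e. the loss is
                                                 // basic_mmod_loss + bbr_lambda*bbr_loss

    // options would then be passed to loss_mmod_ when constructing the network, exactly as
    // before this commit; only the number of output channels changes.
}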