Added an option to do bounding box regression to the loss_mmod layer.

This commit is contained in:
Davis King 2018-09-10 08:26:36 -04:00
parent 16c96bc51f
commit 7b006f3769
2 changed files with 148 additions and 12 deletions

View File

@ -702,6 +702,8 @@ namespace dlib
double truth_match_iou_threshold = 0.5;
test_box_overlap overlaps_nms = test_box_overlap(0.4);
test_box_overlap overlaps_ignore;
bool use_bounding_box_regression = false;
double bbr_lambda = 100;
use_image_pyramid assume_image_pyramid = use_image_pyramid::yes;
@ -937,7 +939,7 @@ namespace dlib
inline void serialize(const mmod_options& item, std::ostream& out)
{
int version = 3;
int version = 4;
serialize(version, out);
serialize(item.detector_windows, out);
@ -947,13 +949,15 @@ namespace dlib
serialize(item.overlaps_nms, out);
serialize(item.overlaps_ignore, out);
serialize(static_cast<uint8_t>(item.assume_image_pyramid), out);
serialize(item.use_bounding_box_regression, out);
serialize(item.bbr_lambda, out);
}
inline void deserialize(mmod_options& item, std::istream& in)
{
int version = 0;
deserialize(version, in);
if (version != 3 && version != 2 && version != 1)
if (!(1 <= version && version <= 4))
throw serialization_error("Unexpected version found while deserializing dlib::mmod_options");
if (version == 1)
{
@ -979,6 +983,13 @@ namespace dlib
deserialize(assume_image_pyramid, in);
item.assume_image_pyramid = static_cast<use_image_pyramid>(assume_image_pyramid);
}
item.use_bounding_box_regression = mmod_options().use_bounding_box_regression; // use default value since this wasn't provided
item.bbr_lambda = mmod_options().bbr_lambda; // use default value since this wasn't provided
if (version >= 4)
{
deserialize(item.use_bounding_box_regression, in);
deserialize(item.bbr_lambda, in);
}
}
// ----------------------------------------------------------------------------------------
@ -991,20 +1002,31 @@ namespace dlib
intermediate_detection(
rectangle rect_
) : rect(rect_) {}
) : rect(rect_), rect_bbr(rect_) {}
intermediate_detection(
rectangle rect_,
double detection_confidence_,
size_t tensor_offset_,
long channel
) : rect(rect_), detection_confidence(detection_confidence_), tensor_offset(tensor_offset_), tensor_channel(channel) {}
) : rect(rect_), rect_bbr(rect_), detection_confidence(detection_confidence_), tensor_offset(tensor_offset_), tensor_channel(channel) {}
// rect is the rectangle you get without any bounding box regression. So it's
// the basic sliding window box (aka, the "anchor box").
rectangle rect;
double detection_confidence = 0;
size_t tensor_offset = 0;
long tensor_channel = 0;
// rect_bbr = rect + bounding box regression. So more accurate. Or if bbr is off then
// this is just rect. The important thing about rect_bbr is that it's the
// rectangle we use for doing NMS.
drectangle rect_bbr;
size_t tensor_offset_dx = 0;
size_t tensor_offset_dy = 0;
size_t tensor_offset_dw = 0;
size_t tensor_offset_dh = 0;
bool operator<(const intermediate_detection& item) const { return detection_confidence < item.detection_confidence; }
};
@ -1032,7 +1054,14 @@ namespace dlib
) const
{
const tensor& output_tensor = sub.get_output();
DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
if (options.use_bounding_box_regression)
{
DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size()*5);
}
else
{
DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
}
DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
DLIB_CASSERT(sub.sample_expansion_factor() == 1, sub.sample_expansion_factor());
@ -1046,10 +1075,10 @@ namespace dlib
final_dets.clear();
for (unsigned long i = 0; i < dets_accum.size(); ++i)
{
if (overlaps_any_box_nms(final_dets, dets_accum[i].rect))
if (overlaps_any_box_nms(final_dets, dets_accum[i].rect_bbr))
continue;
final_dets.push_back(mmod_rect(dets_accum[i].rect,
final_dets.push_back(mmod_rect(dets_accum[i].rect_bbr,
dets_accum[i].detection_confidence,
options.detector_windows[dets_accum[i].tensor_channel].label));
}
@ -1075,13 +1104,19 @@ namespace dlib
DLIB_CASSERT(sub.sample_expansion_factor() == 1);
DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
if (options.use_bounding_box_regression)
{
DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size()*5);
}
else
{
DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
}
double det_thresh_speed_adjust = 0;
// we will scale the loss so that it doesn't get really huge
const double scale = 1.0/output_tensor.size();
const double scale = 1.0/(output_tensor.nr()*output_tensor.nc()*output_tensor.num_samples()*options.detector_windows.size());
double loss = 0;
float* g = grad.host_write_only();
@ -1230,6 +1265,59 @@ namespace dlib
hit_truth_table[hittruth.second] = true;
final_dets.push_back(dets[i]);
loss -= options.loss_per_missed_target;
// Now account for BBR loss and gradient if appropriate.
if (options.use_bounding_box_regression)
{
double dx = out_data[dets[i].tensor_offset_dx];
double dy = out_data[dets[i].tensor_offset_dy];
double dw = out_data[dets[i].tensor_offset_dw];
double dh = out_data[dets[i].tensor_offset_dh];
dpoint p = dcenter(dets[i].rect_bbr);
double w = dets[i].rect_bbr.width()-1;
double h = dets[i].rect_bbr.height()-1;
drectangle truth_box = (*truth)[hittruth.second].rect;
dpoint p_truth = dcenter(truth_box);
DLIB_CASSERT(w > 0);
DLIB_CASSERT(h > 0);
double target_dx = (p_truth.x() - p.x())/w;
double target_dy = (p_truth.y() - p.y())/h;
double target_dw = std::log((truth_box.width()-1)/w);
double target_dh = std::log((truth_box.height()-1)/h);
// compute smoothed L1 loss on BBR outputs. This loss
// is just the MSE loss when the error is small and the L1
// loss when it is large.
dx = dx-target_dx;
dy = dy-target_dy;
dw = dw-target_dw;
dh = dh-target_dh;
// use smoothed L1
double ldx = std::abs(dx)<1 ? 0.5*dx*dx : std::abs(dx)-0.5;
double ldy = std::abs(dy)<1 ? 0.5*dy*dy : std::abs(dy)-0.5;
double ldw = std::abs(dw)<1 ? 0.5*dw*dw : std::abs(dw)-0.5;
double ldh = std::abs(dh)<1 ? 0.5*dh*dh : std::abs(dh)-0.5;
loss += options.bbr_lambda*(ldx + ldy + ldw + ldh);
// now compute the derivatives of the smoothed L1 loss
ldx = put_in_range(-1,1, dx);
ldy = put_in_range(-1,1, dy);
ldw = put_in_range(-1,1, dw);
ldh = put_in_range(-1,1, dh);
// also smoothed L1 gradient goes to gradient output
g[dets[i].tensor_offset_dx] += scale*options.bbr_lambda*ldx;
g[dets[i].tensor_offset_dy] += scale*options.bbr_lambda*ldy;
g[dets[i].tensor_offset_dw] += scale*options.bbr_lambda*ldw;
g[dets[i].tensor_offset_dh] += scale*options.bbr_lambda*ldh;
}
}
else
{
@ -1299,6 +1387,9 @@ namespace dlib
out << ", loss per FA:" << opts.loss_per_false_alarm;
out << ", loss per miss:" << opts.loss_per_missed_target;
out << ", truth match IOU thresh:" << opts.truth_match_iou_threshold;
out << ", use_bounding_box_regression:" << opts.use_bounding_box_regression;
if (opts.use_bounding_box_regression)
out << ", bbr_lambda:" << opts.bbr_lambda;
out << ", overlaps_nms:("<<opts.overlaps_nms.get_iou_thresh()<<","<<opts.overlaps_nms.get_percent_covered_thresh()<<")";
out << ", overlaps_ignore:("<<opts.overlaps_ignore.get_iou_thresh()<<","<<opts.overlaps_ignore.get_percent_covered_thresh()<<")";
@ -1325,11 +1416,19 @@ namespace dlib
) const
{
DLIB_CASSERT(net.sample_expansion_factor() == 1,net.sample_expansion_factor());
DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
if (options.use_bounding_box_regression)
{
DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size()*5);
}
else
{
DLIB_CASSERT(output_tensor.k() == (long)options.detector_windows.size());
}
const float* out_data = output_tensor.host() + output_tensor.k()*output_tensor.nr()*output_tensor.nc()*i;
// scan the final layer and output the positive scoring locations
dets_accum.clear();
for (long k = 0; k < output_tensor.k(); ++k)
for (long k = 0; k < options.detector_windows.size(); ++k)
{
for (long r = 0; r < output_tensor.nr(); ++r)
{
@ -1343,6 +1442,28 @@ namespace dlib
rect = input_layer(net).tensor_space_to_image_space(input_tensor,rect);
dets_accum.push_back(intermediate_detection(rect, score, (k*output_tensor.nr() + r)*output_tensor.nc() + c, k));
if (options.use_bounding_box_regression)
{
const auto offset = options.detector_windows.size() + k*4;
dets_accum.back().tensor_offset_dx = ((offset+0)*output_tensor.nr() + r)*output_tensor.nc() + c;
dets_accum.back().tensor_offset_dy = ((offset+1)*output_tensor.nr() + r)*output_tensor.nc() + c;
dets_accum.back().tensor_offset_dw = ((offset+2)*output_tensor.nr() + r)*output_tensor.nc() + c;
dets_accum.back().tensor_offset_dh = ((offset+3)*output_tensor.nr() + r)*output_tensor.nc() + c;
// apply BBR to dets_accum.back()
double dx = out_data[dets_accum.back().tensor_offset_dx];
double dy = out_data[dets_accum.back().tensor_offset_dy];
double dw = out_data[dets_accum.back().tensor_offset_dw];
double dh = out_data[dets_accum.back().tensor_offset_dh];
dw = std::exp(dw);
dh = std::exp(dh);
double w = rect.width()-1;
double h = rect.height()-1;
rect = translate_rect(rect, dpoint(dx*w,dy*h));
rect = centered_drect(rect, w*dw+1, h*dh+1);
dets_accum.back().rect_bbr = rect;
}
}
}
}

View File

@ -652,6 +652,21 @@ namespace dlib
// However, sometimes scale-invariance may not be desired.
use_image_pyramid assume_image_pyramid = use_image_pyramid::yes;
// By default, the mmod loss doesn't train any bounding box regression model. But
// if you set use_bounding_box_regression == true then it expects the network to
// output a tensor with detector_windows.size()*5 channels rather than just
// detector_windows.size() channels. The 4 extra channels per window are trained
// to give a bounding box regression output that improves the positioning of the
// output detection box.
bool use_bounding_box_regression = false;
// When using bounding box regression, bbr_lambda determines how much you care
// about getting the bounding box shape correct vs just getting the detector to
// find objects. That is, the objective function being optimized is
// basic_mmod_loss + bbr_lambda*bounding_box_regression_loss. So setting
// bbr_lambda to a larger value will cause the overall loss to care more about
// getting the bounding box shape correct.
double bbr_lambda = 100;
mmod_options (
const std::vector<std::vector<mmod_rect>>& boxes,
const unsigned long target_size,