From 19a952c3a4d983b456d497aa888942694d98a809 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A0=20Arrufat?=
 <1671644+arrufat@users.noreply.github.com>
Date: Fri, 8 Dec 2023 12:22:05 +0900
Subject: [PATCH] extend letterbox behavior (#2899)

* extend letterbox behavior

* simplify scale logic and update docs

* oops, forgot one line in the yolo example

* make dpoint const
---
 dlib/image_transforms/interpolation.h         | 34 +++++---------
 .../image_transforms/interpolation_abstract.h | 45 +++----------------
 dlib/test/image.cpp                           |  4 +-
 docs/docs/imaging.xml                         |  2 +-
 examples/dnn_yolo_train_ex.cpp                | 15 ++++---
 5 files changed, 27 insertions(+), 73 deletions(-)
diff --git a/dlib/image_transforms/interpolation.h b/dlib/image_transforms/interpolation.h
index 22fcea6e9..106022ada 100644
--- a/dlib/image_transforms/interpolation.h
+++ b/dlib/image_transforms/interpolation.h
@@ -977,28 +977,29 @@ namespace dlib
     point_transform_affine letterbox_image (
         const image_type1& img_in,
         image_type2& img_out,
-        long size,
         const interpolation_type& interp
     )
     {
-        DLIB_CASSERT(size > 0, "size must be bigger than zero, but was " << size);
         const_image_view<image_type1> vimg_in(img_in);
         image_view<image_type2> vimg_out(img_out);
-
-        const auto scale = size / std::max<double>(vimg_in.nr(), vimg_in.nc());
+        const long rows = vimg_out.nr();
+        const long cols = vimg_out.nc();
+        DLIB_CASSERT(vimg_out.size() > 0, "img_out size must be bigger than zero, but was " << rows << "x" << cols);
 
         // early return if the image has already the requested size and no padding is needed
-        if (scale == 1 && vimg_in.nr() == vimg_in.nc())
+        if (have_same_dimensions(vimg_in, vimg_out))
         {
             assign_image(vimg_out, vimg_in);
             return point_transform_affine();
         }
 
-        vimg_out.set_size(size, size);
+        const double rows_scale = rows / static_cast<double>(vimg_in.nr());
+        const double cols_scale = cols / static_cast<double>(vimg_in.nc());
+        const double scale = rows_scale * vimg_in.nc() > cols ? cols_scale : rows_scale; // fall back to cols_scale when the row-fitted width would overflow img_out
 
-        const long nr = std::round(scale * vimg_in.nr());
-        const long nc = std::round(scale * vimg_in.nc());
-        dpoint offset((size - nc) / 2.0, (size - nr) / 2.0);
+        const long nr = std::lround(scale * vimg_in.nr());
+        const long nc = std::lround(scale * vimg_in.nc());
+        const dpoint offset((cols - nc) / 2.0, (rows - nr) / 2.0);
         const auto r = rectangle(offset.x(), offset.y(), offset.x() + nc - 1, offset.y() + nr - 1);
         zero_border_pixels(vimg_out, r);
         auto si = sub_image(img_out, r);
@@ -1006,19 +1007,6 @@ namespace dlib
         return point_transform_affine(identity_matrix<double>(2) * scale, offset);
     }
 
-    template <
-        typename image_type1,
-        typename image_type2
-        >
-    point_transform_affine letterbox_image (
-        const image_type1& img_in,
-        image_type2& img_out,
-        long size
-    )
-    {
-        return letterbox_image(img_in, img_out, size, interpolate_bilinear());
-    }
-
     template <
         typename image_type1,
         typename image_type2
@@ -1028,7 +1016,7 @@ namespace dlib
         image_type2& img_out
     )
     {
-        return letterbox_image(img_in, img_out, std::max(num_rows(img_in), num_columns(img_in)), interpolate_bilinear());
+        return letterbox_image(img_in, img_out, interpolate_bilinear());
     }
 
 // ----------------------------------------------------------------------------------------
diff --git a/dlib/image_transforms/interpolation_abstract.h b/dlib/image_transforms/interpolation_abstract.h
index 8a88befd1..fad9a87c3 100644
--- a/dlib/image_transforms/interpolation_abstract.h
+++ b/dlib/image_transforms/interpolation_abstract.h
@@ -446,7 +446,6 @@ namespace dlib
     point_transform_affine letterbox_image (
         const image_type1& img_in,
         image_type2& img_out,
-        long size
         const interpolation_type interp
     );
     /*!
@@ -455,15 +454,12 @@ namespace dlib
               dlib/image_processing/generic_image.h
             - image_type2 == an image object that implements the interface defined in
               dlib/image_processing/generic_image.h
+            - img_out.size() > 0
             - interpolation_type == interpolate_nearest_neighbor, interpolate_bilinear,
               interpolate_quadratic, or a type with a compatible interface.
-            - size > 0
             - is_same_object(in_img, out_img) == false
         ensures
-            - Scales in_img so that it fits into a size * size square.
-              In particular, we will have:
-                - #img_out.nr() == size
-                - #img_out.nc() == size
+            - Scales in_img so that it fits into img_out.
             - Preserves the aspect ratio of in_img by 0-padding the shortest side.
             - Uses the supplied interpolation routine interp to perform the necessary
               pixel interpolation.
@@ -471,35 +467,6 @@ namespace dlib
               corresponding location in #out_img.
     !*/
 
-    template <
-        typename image_type1,
-        typename image_type2
-        >
-    point_transform_affine letterbox_image (
-        const image_type1& img_in,
-        image_type2& img_out,
-        long size
-    );
-    /*!
-        requires
-            - image_type1 == an image object that implements the interface defined in
-              dlib/image_processing/generic_image.h
-            - image_type2 == an image object that implements the interface defined in
-              dlib/image_processing/generic_image.h
-            - size > 0
-            - is_same_object(in_img, out_img) == false
-        ensures
-            - Scales in_img so that it fits into a size * size square.
-              In particular, we will have:
-                - #img_out.nr() == size
-                - #img_out.nc() == size
-            - Preserves the aspect ratio of in_img by 0-padding the shortest side.
-            - Uses the bilinear interpolation to perform the necessary pixel
-              interpolation.
-            - Returns a transformation object that maps points in in_img into their
-              corresponding location in #out_img.
-    !*/
-
     template <
         typename image_type1,
         typename image_type2
@@ -514,13 +481,11 @@ namespace dlib
               dlib/image_processing/generic_image.h
             - image_type2 == an image object that implements the interface defined in
               dlib/image_processing/generic_image.h
+            - img_out.size() > 0
             - is_same_object(in_img, out_img) == false
         ensures
-            - 0-pads in_img so that it fits into a square whose side is computed as
-              max(num_rows(in_img), num_columns(in_img)) and stores into #out_img.
-              In particular, we will have:
-                - #img_out.nr() == max(num_rows(in_img), num_columns(in_img))
-                - #img_out.nc() == max(num_rows(in_img), num_columns(in_img))
+            - Scales in_img so that it fits into img_out using bilinear interpolation.
+            - Preserves the aspect ratio of in_img by 0-padding the shortest side.
             - Returns a transformation object that maps points in in_img into their
               corresponding location in #out_img.
     !*/
diff --git a/dlib/test/image.cpp b/dlib/test/image.cpp
index 674bddcec..6c6c7bd12 100644
--- a/dlib/test/image.cpp
+++ b/dlib/test/image.cpp
@@ -2438,9 +2438,9 @@ namespace
         rgb_pixel black(0, 0, 0);
         rgb_pixel white(255, 255, 255);
         matrix<rgb_pixel> img_s(40, 60);
-        matrix<rgb_pixel> img_d;
+        matrix<rgb_pixel> img_d(30, 30);
         assign_all_pixels(img_s, white);
-        const auto tform = letterbox_image(img_s, img_d, 30, interpolate_nearest_neighbor());
+        const auto tform = letterbox_image(img_s, img_d, interpolate_nearest_neighbor());
         DLIB_TEST(tform.get_m() == identity_matrix<double>(2) * 0.5);
         DLIB_TEST(tform.get_b() == dpoint(0, 5));
 
diff --git a/docs/docs/imaging.xml b/docs/docs/imaging.xml
index 24fe8b88f..769ab8551 100644
--- a/docs/docs/imaging.xml
+++ b/docs/docs/imaging.xml
@@ -2008,7 +2008,7 @@
          <file>dlib/image_transforms.h</file>
          <spec_file link="true">dlib/image_transforms/interpolation_abstract.h</spec_file>
          <description>        
-            Scales an image so that it fits into a size * size square, while preserving the aspect
+            Scales an image so that it fits into the dimensions of the output image, while preserving the aspect
             ratio of the actual contents by appropriate 0 padding.
 
          <examples>
diff --git a/examples/dnn_yolo_train_ex.cpp b/examples/dnn_yolo_train_ex.cpp
index b4a62f442..2ffdd5ac2 100644
--- a/examples/dnn_yolo_train_ex.cpp
+++ b/examples/dnn_yolo_train_ex.cpp
@@ -131,9 +131,9 @@ namespace darknet
 }
 
 // In this example, YOLO expects square images, and we choose to transform them by letterboxing them.
-rectangle_transform preprocess_image(const matrix<rgb_pixel>& image, matrix<rgb_pixel>& output, const long image_size)
+rectangle_transform preprocess_image(const matrix<rgb_pixel>& image, matrix<rgb_pixel>& output)
 {
-    return rectangle_transform(inv(letterbox_image(image, output, image_size)));
+    return rectangle_transform(inv(letterbox_image(image, output)));
 }
 
 // YOLO outputs the bounding boxes in the coordinate system of the input (letterboxed) image, so we need to convert them
@@ -296,14 +296,14 @@ try
         }
         const double threshold = get_option(parser, "test", 0.01);
         image_window win;
-        matrix<rgb_pixel> image, resized;
+        matrix<rgb_pixel> image, resized(image_size, image_size);
         for (const auto& im : dataset.images)
         {
             win.clear_overlay();
             load_image(image, data_directory + "/" + im.filename);
             win.set_title(im.filename);
             win.set_image(image);
-            const auto tform = preprocess_image(image, resized, image_size);
+            const auto tform = preprocess_image(image, resized);
             auto detections = net.process(resized, threshold);
             postprocess_detections(tform, detections);
             cout << "# detections: " << detections.size() << endl;
@@ -329,7 +329,7 @@ try
             cout << "Could not find file " << sync_file_name << endl;
             return EXIT_FAILURE;
         }
-        matrix<rgb_pixel> image, resized;
+        matrix<rgb_pixel> image, resized(image_size, image_size);
         std::map<std::string, std::vector<std::pair<double, bool>>> hits;
         std::map<std::string, unsigned long> missing;
         for (const auto& label : options.labels)
@@ -342,7 +342,7 @@ try
         {
             const auto& im = dataset.images[i];
             load_image(image, data_directory + "/" + im.filename);
-            const auto tform = preprocess_image(image, resized, image_size);
+            const auto tform = preprocess_image(image, resized);
             auto dets = net.process(resized, 0.005);
             postprocess_detections(tform, dets);
             std::vector<bool> used(dets.size(), false);
@@ -395,6 +395,7 @@ try
         dlib::rand rnd(time(nullptr) + seed);
         matrix<rgb_pixel> image, rotated;
         std::pair<matrix<rgb_pixel>, std::vector<yolo_rect>> temp;
+        temp.first.set_size(image_size, image_size);
         random_cropper cropper;
         cropper.set_seed(time(nullptr) + seed);
         cropper.set_chip_dims(image_size, image_size);
@@ -423,7 +424,7 @@ try
                 for (auto& box : temp.second)
                     box.rect = tform(box.rect);
 
-                tform = letterbox_image(rotated, temp.first, image_size);
+                tform = letterbox_image(rotated, temp.first);
                 for (auto& box : temp.second)
                     box.rect = tform(box.rect);