From 19a952c3a4d983b456d497aa888942694d98a809 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A0=20Arrufat?= <1671644+arrufat@users.noreply.github.com> Date: Fri, 8 Dec 2023 12:22:05 +0900 Subject: [PATCH] extend letterbox behavior (#2899) * extend letterbox behavior * simplify scale logic and update docs * oops, forgot one line in the yolo example * make dpoint const --- dlib/image_transforms/interpolation.h | 34 +++++--------- .../image_transforms/interpolation_abstract.h | 45 +++---------------- dlib/test/image.cpp | 4 +- docs/docs/imaging.xml | 2 +- examples/dnn_yolo_train_ex.cpp | 15 ++++--- 5 files changed, 27 insertions(+), 73 deletions(-) diff --git a/dlib/image_transforms/interpolation.h b/dlib/image_transforms/interpolation.h index 22fcea6e9..106022ada 100644 --- a/dlib/image_transforms/interpolation.h +++ b/dlib/image_transforms/interpolation.h @@ -977,28 +977,29 @@ namespace dlib point_transform_affine letterbox_image ( const image_type1& img_in, image_type2& img_out, - long size, const interpolation_type& interp ) { - DLIB_CASSERT(size > 0, "size must be bigger than zero, but was " << size); const_image_view vimg_in(img_in); image_view vimg_out(img_out); - - const auto scale = size / std::max(vimg_in.nr(), vimg_in.nc()); + const long rows = vimg_out.nr(); + const long cols = vimg_out.nc(); + DLIB_CASSERT(vimg_out.size() > 0, "img_out size must be bigger than zero, but was " << rows << "x" << cols); // early return if the image has already the requested size and no padding is needed - if (scale == 1 && vimg_in.nr() == vimg_in.nc()) + if (have_same_dimensions(vimg_in, vimg_out)) { assign_image(vimg_out, vimg_in); return point_transform_affine(); } - vimg_out.set_size(size, size); + const double rows_scale = rows / static_cast(vimg_in.nr()); + const double cols_scale = cols / static_cast(vimg_in.nc()); + const double scale = rows_scale * vimg_in.nc() > cols ? 
cols_scale : rows_scale; - const long nr = std::round(scale * vimg_in.nr()); - const long nc = std::round(scale * vimg_in.nc()); - dpoint offset((size - nc) / 2.0, (size - nr) / 2.0); + const long nr = std::lround(scale * vimg_in.nr()); + const long nc = std::lround(scale * vimg_in.nc()); + const dpoint offset((cols - nc) / 2.0, (rows - nr) / 2.0); const auto r = rectangle(offset.x(), offset.y(), offset.x() + nc - 1, offset.y() + nr - 1); zero_border_pixels(vimg_out, r); auto si = sub_image(img_out, r); @@ -1006,19 +1007,6 @@ namespace dlib return point_transform_affine(identity_matrix(2) * scale, offset); } - template < - typename image_type1, - typename image_type2 - > - point_transform_affine letterbox_image ( - const image_type1& img_in, - image_type2& img_out, - long size - ) - { - return letterbox_image(img_in, img_out, size, interpolate_bilinear()); - } - template < typename image_type1, typename image_type2 @@ -1028,7 +1016,7 @@ namespace dlib image_type2& img_out ) { - return letterbox_image(img_in, img_out, std::max(num_rows(img_in), num_columns(img_in)), interpolate_bilinear()); + return letterbox_image(img_in, img_out, interpolate_bilinear()); } // ---------------------------------------------------------------------------------------- diff --git a/dlib/image_transforms/interpolation_abstract.h b/dlib/image_transforms/interpolation_abstract.h index 8a88befd1..fad9a87c3 100644 --- a/dlib/image_transforms/interpolation_abstract.h +++ b/dlib/image_transforms/interpolation_abstract.h @@ -446,7 +446,6 @@ namespace dlib point_transform_affine letterbox_image ( const image_type1& img_in, image_type2& img_out, - long size const interpolation_type interp ); /*! 
@@ -455,15 +454,12 @@ namespace dlib dlib/image_processing/generic_image.h - image_type2 == an image object that implements the interface defined in dlib/image_processing/generic_image.h + - img_out.size() > 0 - interpolation_type == interpolate_nearest_neighbor, interpolate_bilinear, interpolate_quadratic, or a type with a compatible interface. - - size > 0 - is_same_object(in_img, out_img) == false ensures - - Scales in_img so that it fits into a size * size square. - In particular, we will have: - - #img_out.nr() == size - - #img_out.nc() == size + - Scales in_img so that it fits into img_out. - Preserves the aspect ratio of in_img by 0-padding the shortest side. - Uses the supplied interpolation routine interp to perform the necessary pixel interpolation. @@ -471,35 +467,6 @@ namespace dlib corresponding location in #out_img. !*/ - template < - typename image_type1, - typename image_type2 - > - point_transform_affine letterbox_image ( - const image_type1& img_in, - image_type2& img_out, - long size - ); - /*! - requires - - image_type1 == an image object that implements the interface defined in - dlib/image_processing/generic_image.h - - image_type2 == an image object that implements the interface defined in - dlib/image_processing/generic_image.h - - size > 0 - - is_same_object(in_img, out_img) == false - ensures - - Scales in_img so that it fits into a size * size square. - In particular, we will have: - - #img_out.nr() == size - - #img_out.nc() == size - - Preserves the aspect ratio of in_img by 0-padding the shortest side. - - Uses the bilinear interpolation to perform the necessary pixel - interpolation. - - Returns a transformation object that maps points in in_img into their - corresponding location in #out_img. 
- !*/ - template < typename image_type1, typename image_type2 @@ -514,13 +481,11 @@ namespace dlib dlib/image_processing/generic_image.h - image_type2 == an image object that implements the interface defined in dlib/image_processing/generic_image.h + - img_out.size() > 0 - is_same_object(in_img, out_img) == false ensures - - 0-pads in_img so that it fits into a square whose side is computed as - max(num_rows(in_img), num_columns(in_img)) and stores into #out_img. - In particular, we will have: - - #img_out.nr() == max(num_rows(in_img), num_columns(in_img)) - - #img_out.nc() == max(num_rows(in_img), num_columns(in_img)) + - Scales in_img so that it fits into img_out using bilinear interpolation. + - Preserves the aspect ratio of in_img by 0-padding the shortest side. - Returns a transformation object that maps points in in_img into their corresponding location in #out_img. !*/ diff --git a/dlib/test/image.cpp b/dlib/test/image.cpp index 674bddcec..6c6c7bd12 100644 --- a/dlib/test/image.cpp +++ b/dlib/test/image.cpp @@ -2438,9 +2438,9 @@ namespace rgb_pixel black(0, 0, 0); rgb_pixel white(255, 255, 255); matrix img_s(40, 60); - matrix img_d; + matrix img_d(30, 30); assign_all_pixels(img_s, white); - const auto tform = letterbox_image(img_s, img_d, 30, interpolate_nearest_neighbor()); + const auto tform = letterbox_image(img_s, img_d, interpolate_nearest_neighbor()); DLIB_TEST(tform.get_m() == identity_matrix(2) * 0.5); DLIB_TEST(tform.get_b() == dpoint(0, 5)); diff --git a/docs/docs/imaging.xml b/docs/docs/imaging.xml index 24fe8b88f..769ab8551 100644 --- a/docs/docs/imaging.xml +++ b/docs/docs/imaging.xml @@ -2008,7 +2008,7 @@ dlib/image_transforms.h dlib/image_transforms/interpolation_abstract.h - Scales an image so that it fits into a size * size square, while preserving the aspect + Scales an image so that it fits into another size, while preserving the aspect ratio of the actual contents by appropriate 0 padding. 
diff --git a/examples/dnn_yolo_train_ex.cpp b/examples/dnn_yolo_train_ex.cpp index b4a62f442..2ffdd5ac2 100644 --- a/examples/dnn_yolo_train_ex.cpp +++ b/examples/dnn_yolo_train_ex.cpp @@ -131,9 +131,9 @@ namespace darknet } // In this example, YOLO expects square images, and we choose to transform them by letterboxing them. -rectangle_transform preprocess_image(const matrix& image, matrix& output, const long image_size) +rectangle_transform preprocess_image(const matrix& image, matrix& output) { - return rectangle_transform(inv(letterbox_image(image, output, image_size))); + return rectangle_transform(inv(letterbox_image(image, output))); } // YOLO outputs the bounding boxes in the coordinate system of the input (letterboxed) image, so we need to convert them @@ -296,14 +296,14 @@ try } const double threshold = get_option(parser, "test", 0.01); image_window win; - matrix image, resized; + matrix image, resized(image_size, image_size); for (const auto& im : dataset.images) { win.clear_overlay(); load_image(image, data_directory + "/" + im.filename); win.set_title(im.filename); win.set_image(image); - const auto tform = preprocess_image(image, resized, image_size); + const auto tform = preprocess_image(image, resized); auto detections = net.process(resized, threshold); postprocess_detections(tform, detections); cout << "# detections: " << detections.size() << endl; @@ -329,7 +329,7 @@ try cout << "Could not find file " << sync_file_name << endl; return EXIT_FAILURE; } - matrix image, resized; + matrix image, resized(image_size, image_size); std::map>> hits; std::map missing; for (const auto& label : options.labels) @@ -342,7 +342,7 @@ try { const auto& im = dataset.images[i]; load_image(image, data_directory + "/" + im.filename); - const auto tform = preprocess_image(image, resized, image_size); + const auto tform = preprocess_image(image, resized); auto dets = net.process(resized, 0.005); postprocess_detections(tform, dets); std::vector used(dets.size(), false); @@ 
-395,6 +395,7 @@ try dlib::rand rnd(time(nullptr) + seed); matrix image, rotated; std::pair, std::vector> temp; + temp.first.set_size(image_size, image_size); random_cropper cropper; cropper.set_seed(time(nullptr) + seed); cropper.set_chip_dims(image_size, image_size); @@ -423,7 +424,7 @@ try for (auto& box : temp.second) box.rect = tform(box.rect); - tform = letterbox_image(rotated, temp.first, image_size); + tform = letterbox_image(rotated, temp.first); for (auto& box : temp.second) box.rect = tform(box.rect);