Add input_grayscale_image_pyramid, issue #354 (#1761)

Add input_grayscale_image_pyramid
2024-11-01 10:14:53 +08:00 · 2019-05-25 18:18:01 -03:00 · 2019-05-25 18:18:01 -03:00 · 8001b924e6
commit 8001b924e6
parent 0ecb49b94e
2 changed files with 370 additions and 102 deletions
--- a/dlib/dnn/input.h
+++ b/dlib/dnn/input.h
@ -588,70 +588,135 @@ namespace dlib
        }
    };

+// ----------------------------------------------------------------------------------------
+
+    namespace detail {
+        // Shared implementation for the tiled image pyramid input layers.  It holds the
+        // two padding settings, the tensor<->image coordinate mappings, and the helper
+        // steps (protected section) that the derived layers' to_tensor() functions call.
+        template <typename PYRAMID_TYPE>
+        class input_image_pyramid
+        {
+        public:
+
+            // Pure virtual so this base cannot be instantiated by itself.  The
+            // definition is supplied out of class.
+            virtual ~input_image_pyramid() = 0;
+
+            typedef PYRAMID_TYPE pyramid_type;
+
+            // Padding value forwarded to impl::compute_tiled_image_pyramid_details().
+            unsigned long get_pyramid_padding() const { return pyramid_padding; }
+            void set_pyramid_padding(unsigned long value) { pyramid_padding = value; }
+
+            // Outer padding value forwarded to impl::compute_tiled_image_pyramid_details().
+            unsigned long get_pyramid_outer_padding() const { return pyramid_outer_padding; }
+            void set_pyramid_outer_padding(unsigned long value) { pyramid_outer_padding = value; }
+
+            // Returns true if p, offset by the top left corner of the first rectangle
+            // stored in data's annotation, lies inside that rectangle.
+            // Requires that the annotation holds at least one rectangle.
+            bool image_contained_point(
+                const tensor& data,
+                const point& p
+            ) const
+            {
+                auto&& rects = any_cast<std::vector<rectangle>>(data.annotation());
+                DLIB_CASSERT(rects.size() > 0);
+                return rects[0].contains(p + rects[0].tl_corner());
+            }
+
+            // Maps the rectangle r from tensor space back to image space by forwarding
+            // to tiled_pyramid_to_image() with the rectangles stored in data's annotation.
+            drectangle tensor_space_to_image_space(
+                const tensor& data,
+                drectangle r
+            ) const
+            {
+                auto&& rects = any_cast<std::vector<rectangle>>(data.annotation());
+                return tiled_pyramid_to_image<pyramid_type>(rects, r);
+            }
+
+            // Maps the rectangle r from image space into tensor space at the given
+            // scale by forwarding to image_to_tiled_pyramid() with the rectangles
+            // stored in data's annotation.  Requires 0 < scale <= 1.
+            drectangle image_space_to_tensor_space (
+                const tensor& data,
+                double scale,
+                drectangle r
+            ) const
+            {
+                DLIB_CASSERT(0 < scale && scale <= 1, "scale: " << scale);
+                auto&& rects = any_cast<std::vector<rectangle>>(data.annotation());
+                return image_to_tiled_pyramid<pyramid_type>(rects, scale, r);
+            }
+
+        protected:
+
+            // First step of a derived layer's to_tensor():
+            //   - asserts the range [ibegin, iend) is non-empty and that every image in
+            //     it has the same number of rows and columns,
+            //   - computes the tiled pyramid layout, storing its rectangles in
+            //     data.annotation(),
+            //   - sizes data to (number of images, k, NR, NC) and fills it with zeros.
+            // k is the number of channels the derived layer wants in the tensor.
+            template <typename forward_iterator>
+            void to_tensor_init (
+                forward_iterator ibegin,
+                forward_iterator iend,
+                resizable_tensor &data,
+                unsigned int k
+            ) const
+            {
+
+                DLIB_CASSERT(std::distance(ibegin, iend) > 0);
+                auto nr = ibegin->nr();
+                auto nc = ibegin->nc();
+                // make sure all the input matrices have the same dimensions
+                for (auto i = ibegin; i != iend; ++i)
+                {
+                    // This check is shared by every derived layer, so the message names
+                    // the base class rather than one particular derived layer.
+                    DLIB_CASSERT(i->nr() == nr && i->nc() == nc,
+                                 "\t input_image_pyramid::to_tensor()"
+                                         << "\n\t All matrices given to to_tensor() must have the same dimensions."
+                                         << "\n\t nr: " << nr
+                                         << "\n\t nc: " << nc
+                                         << "\n\t i->nr(): " << i->nr()
+                                         << "\n\t i->nc(): " << i->nc()
+                    );
+                }
+
+                long NR, NC;
+                pyramid_type pyr;
+                auto& rects = data.annotation().get<std::vector<rectangle>>();
+                impl::compute_tiled_image_pyramid_details(pyr, nr, nc, pyramid_padding, pyramid_outer_padding, rects,
+                                                          NR, NC);
+
+                // initialize data to the right size to contain the stuff in the iterator range.
+                data.set_size(std::distance(ibegin, iend), k, NR, NC);
+
+                // We need to zero the image before doing the pyramid, since the pyramid
+                // creation code doesn't write to all parts of the image.  We also take
+                // care to avoid triggering any device to hosts copies.
+                auto ptr = data.host_write_only();
+                for (size_t i = 0; i < data.size(); ++i)
+                    ptr[i] = 0;
+
+            }
+
+            // Builds the remaining pyramid levels inside data.  For each i >= 1 the
+            // region described by rects[i-1] is resized with tt::resize_bilinear() into
+            // the region described by rects[i].  This does the same thing as the
+            // standard create_tiled_pyramid(), except we use the GPU if one is available.
+            void create_tiled_pyramid (
+                const std::vector<rectangle>& rects,
+                resizable_tensor& data
+            ) const
+            {
+                for (size_t i = 1; i < rects.size(); ++i) {
+                    alias_tensor src(data.num_samples(), data.k(), rects[i - 1].height(), rects[i - 1].width());
+                    alias_tensor dest(data.num_samples(), data.k(), rects[i].height(), rects[i].width());
+
+                    // Offsets locate each rectangle's top left element within a channel plane.
+                    auto asrc = src(data, data.nc() * rects[i - 1].top() + rects[i - 1].left());
+                    auto adest = dest(data, data.nc() * rects[i].top() + rects[i].left());
+
+                    tt::resize_bilinear(adest, data.nc(), data.nr() * data.nc(),
+                                        asrc, data.nc(), data.nr() * data.nc());
+                }
+            }
+
+            unsigned long pyramid_padding = 10;
+            unsigned long pyramid_outer_padding = 11;
+        };
+
+        // Out-of-class definition of input_image_pyramid's pure virtual destructor.
+        // The body is empty.
+        template <typename PYRAMID_TYPE>
+        input_image_pyramid<PYRAMID_TYPE>::~input_image_pyramid() {}
+    }
+
 // ----------------------------------------------------------------------------------------

    template <typename PYRAMID_TYPE>
-    class input_rgb_image_pyramid
+    class input_grayscale_image_pyramid : public detail::input_image_pyramid<PYRAMID_TYPE>
    {
    public:
-        typedef matrix<rgb_pixel> input_type;
+        typedef matrix<unsigned char> input_type;
        typedef PYRAMID_TYPE pyramid_type;

-        input_rgb_image_pyramid (
-        ) : 
-            avg_red(122.782), 
-            avg_green(117.001),
-            avg_blue(104.298) 
-        {
-        }
-
-        input_rgb_image_pyramid (
-            float avg_red_,
-            float avg_green_,
-            float avg_blue_
-        ) : avg_red(avg_red_), avg_green(avg_green_), avg_blue(avg_blue_) 
-        {}
-
-        float get_avg_red()   const { return avg_red; }
-        float get_avg_green() const { return avg_green; }
-        float get_avg_blue()  const { return avg_blue; }
-
-        unsigned long get_pyramid_padding () const { return pyramid_padding; }
-        void set_pyramid_padding (unsigned long value) { pyramid_padding = value; }
-
-        unsigned long get_pyramid_outer_padding () const { return pyramid_outer_padding; }
-        void set_pyramid_outer_padding (unsigned long value) { pyramid_outer_padding = value; }
-
-        bool image_contained_point (
-            const tensor& data,
-            const point& p
-        ) const
-        {
-            auto&& rects = any_cast<std::vector<rectangle>>(data.annotation());
-            DLIB_CASSERT(rects.size() > 0);
-            return rects[0].contains(p+rects[0].tl_corner());
-        }
-
-        drectangle tensor_space_to_image_space (
-            const tensor& data,
-            drectangle r
-        ) const
-        {
-            auto&& rects = any_cast<std::vector<rectangle>>(data.annotation());
-            return tiled_pyramid_to_image<pyramid_type>(rects, r);
-        }
-
-        drectangle image_space_to_tensor_space (
-            const tensor& data,
-            double scale,
-            drectangle r 
-        ) const
-        {
-            DLIB_CASSERT(0 < scale && scale <= 1 , "scale: "<< scale);
-            auto&& rects = any_cast<std::vector<rectangle>>(data.annotation());
-            return image_to_tiled_pyramid<pyramid_type>(rects, scale, r);
-        }
-
        template <typename forward_iterator>
        void to_tensor (
            forward_iterator ibegin,
@ -659,42 +724,110 @@ namespace dlib
            resizable_tensor& data
        ) const
        {
-            DLIB_CASSERT(std::distance(ibegin,iend) > 0);
-            auto nr = ibegin->nr();
-            auto nc = ibegin->nc();
-            // make sure all the input matrices have the same dimensions
-            for (auto i = ibegin; i != iend; ++i)
-            {
-                DLIB_CASSERT(i->nr()==nr && i->nc()==nc,
-                    "\t input_rgb_image_pyramid::to_tensor()"
-                    << "\n\t All matrices given to to_tensor() must have the same dimensions."
-                    << "\n\t nr: " << nr
-                    << "\n\t nc: " << nc
-                    << "\n\t i->nr(): " << i->nr()
-                    << "\n\t i->nc(): " << i->nc()
-                );
-            }
-
-            long NR, NC;
-            pyramid_type pyr;
-            auto& rects = data.annotation().get<std::vector<rectangle>>();
-            impl::compute_tiled_image_pyramid_details(pyr, nr, nc, pyramid_padding, pyramid_outer_padding, rects, NR, NC);
-
-            // initialize data to the right size to contain the stuff in the iterator range.
-            data.set_size(std::distance(ibegin,iend), 3, NR, NC);
-
-            // We need to zero the image before doing the pyramid, since the pyramid
-            // creation code doesn't write to all parts of the image.  We also take
-            // care to avoid triggering any device to hosts copies.
-            auto ptr = data.host_write_only();
-            for (size_t i = 0; i < data.size(); ++i)
-                ptr[i] = 0;
+            this->to_tensor_init(ibegin, iend, data, 1);

+            const auto rects = data.annotation().get<std::vector<rectangle>>();
            if (rects.size() == 0)
                return;

            // copy the first raw image into the top part of the tiled pyramid.  We need to
            // do this for each of the input images/samples in the tensor.
+            auto ptr = data.host_write_only();
+            for (auto i = ibegin; i != iend; ++i)
+            {
+                auto& img = *i;
+                ptr += rects[0].top()*data.nc();
+                for (long r = 0; r < img.nr(); ++r)
+                {
+                    auto p = ptr+rects[0].left();
+                    for (long c = 0; c < img.nc(); ++c)
+                        p[c] = (img(r,c))/256.0;
+                    ptr += data.nc();
+                }
+                ptr += data.nc()*(data.nr()-rects[0].bottom()-1);
+            }
+
+            this->create_tiled_pyramid(rects, data);
+        }
+
+        // Writes item to out as: the tag string "input_grayscale_image_pyramid",
+        // then pyramid_padding, then pyramid_outer_padding.  deserialize() reads the
+        // fields back in this same order.
+        friend void serialize(const input_grayscale_image_pyramid& item, std::ostream& out)
+        {
+            serialize("input_grayscale_image_pyramid", out);
+            serialize(item.pyramid_padding, out);
+            serialize(item.pyramid_outer_padding, out);
+        }
+
+        // Restores item from in.  The stream must start with the tag string
+        // "input_grayscale_image_pyramid"; otherwise serialization_error is thrown
+        // and item is left unmodified.  The two padding values are read next.
+        friend void deserialize(input_grayscale_image_pyramid& item, std::istream& in)
+        {
+            std::string tag;
+            deserialize(tag, in);
+            const bool tag_matches = (tag == "input_grayscale_image_pyramid");
+            if (!tag_matches)
+            {
+                throw serialization_error("Unexpected version found while deserializing dlib::input_grayscale_image_pyramid.");
+            }
+
+            deserialize(item.pyramid_padding, in);
+            deserialize(item.pyramid_outer_padding, in);
+        }
+
+        // Prints a one line, human readable summary of the layer and its two padding
+        // settings, then returns out so calls can be chained.
+        friend std::ostream& operator<<(std::ostream& out, const input_grayscale_image_pyramid& item)
+        {
+            return out << "input_grayscale_image_pyramid()"
+                       << " pyramid_padding=" << item.pyramid_padding
+                       << " pyramid_outer_padding=" << item.pyramid_outer_padding;
+        }
+
+        // Writes item to out as a single self-closing XML element whose attributes
+        // are the two padding settings, e.g.
+        //   <input_grayscale_image_pyramid pyramid_padding='10' pyramid_outer_padding='11'/>
+        friend void to_xml(const input_grayscale_image_pyramid& item, std::ostream& out)
+        {
+            // The element name must be followed by whitespace, not a quote: there is
+            // no preceding attribute value to close before pyramid_padding.
+            out << "<input_grayscale_image_pyramid"
+                <<" pyramid_padding='"<<item.pyramid_padding
+                <<"' pyramid_outer_padding='"<<item.pyramid_outer_padding
+                <<"'/>";
+        }
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <typename PYRAMID_TYPE>
+    class input_rgb_image_pyramid : public detail::input_image_pyramid<PYRAMID_TYPE>
+    {
+    public:
+        typedef matrix<rgb_pixel> input_type;
+        typedef PYRAMID_TYPE pyramid_type;
+
+        // Default construction: delegates to the three-argument constructor with
+        // avg_red = 122.782, avg_green = 117.001 and avg_blue = 104.298.
+        input_rgb_image_pyramid (
+        ) : input_rgb_image_pyramid(122.782, 117.001, 104.298)
+        {
+        }
+
+        // Constructs the layer with caller supplied per-channel averages.  The values
+        // are stored as given and are later returned by get_avg_red(),
+        // get_avg_green() and get_avg_blue().
+        input_rgb_image_pyramid (
+            float avg_red_,
+            float avg_green_,
+            float avg_blue_
+        ) : avg_red(avg_red_), avg_green(avg_green_), avg_blue(avg_blue_)
+        {}
+
+        // Accessors for the per-channel averages supplied at construction.
+        float get_avg_red()   const { return avg_red; }
+        float get_avg_green() const { return avg_green; }
+        float get_avg_blue()  const { return avg_blue; }
+
+        template <typename forward_iterator>
+        void to_tensor (
+            forward_iterator ibegin,
+            forward_iterator iend,
+            resizable_tensor& data
+        ) const
+        {
+            this->to_tensor_init(ibegin, iend, data, 3);
+
+            const auto rects = data.annotation().get<std::vector<rectangle>>();
+            if (rects.size() == 0)
+                return;
+
+            // copy the first raw image into the top part of the tiled pyramid.  We need to
+            // do this for each of the input images/samples in the tensor.
+            auto ptr = data.host_write_only();
            for (auto i = ibegin; i != iend; ++i)
            {
                auto& img = *i;
@ -729,19 +862,7 @@ namespace dlib
                ptr += data.nc()*(data.nr()-rects[0].bottom()-1);
            }

-            // now build the image pyramid into data.  This does the same thing as
-            // create_tiled_pyramid(), except we use the GPU if one is available. 
-            for (size_t i = 1; i < rects.size(); ++i)
-            {
-                alias_tensor src(data.num_samples(),data.k(),rects[i-1].height(),rects[i-1].width());
-                alias_tensor dest(data.num_samples(),data.k(),rects[i].height(),rects[i].width());
-
-                auto asrc  = src(data, data.nc()*rects[i-1].top() + rects[i-1].left());
-                auto adest = dest(data, data.nc()*rects[i].top() + rects[i].left());
-
-                tt::resize_bilinear(adest, data.nc(), data.nr()*data.nc(), 
-                                    asrc, data.nc(), data.nr()*data.nc());
-            }
+            this->create_tiled_pyramid(rects, data);
        }

        friend void serialize(const input_rgb_image_pyramid& item, std::ostream& out)
@ -796,8 +917,6 @@ namespace dlib
        float avg_red;
        float avg_green;
        float avg_blue;
-        unsigned long pyramid_padding = 10;
-        unsigned long pyramid_outer_padding = 11;
    };

 // ----------------------------------------------------------------------------------------
--- a/dlib/dnn/input_abstract.h
+++ b/dlib/dnn/input_abstract.h
@ -271,6 +271,155 @@ namespace dlib

    };

+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename PYRAMID_TYPE
+        >
+    class input_grayscale_image_pyramid
+    {
+        /*!
+            REQUIREMENTS ON PYRAMID_TYPE
+                PYRAMID_TYPE must be an instance of the dlib::pyramid_down template.
+
+            WHAT THIS OBJECT REPRESENTS
+                This input layer works with gray scale images of type matrix<unsigned char>.
+                It is identical to the input layer except that it outputs a tensor containing a tiled
+                image pyramid of each input image rather than a simple copy of each image.
+                The tiled image pyramid is created using create_tiled_pyramid().
+        !*/
+
+    public:
+
+        typedef matrix<unsigned char> input_type;
+        typedef PYRAMID_TYPE pyramid_type;
+        input_grayscale_image_pyramid (
+        );
+        /*!
+            ensures
+                - #get_pyramid_padding() == 10
+                - #get_pyramid_outer_padding() == 11
+        !*/
+
+        unsigned long get_pyramid_padding (
+        ) const;
+        /*!
+            ensures
+                - When this object creates a pyramid it will call create_tiled_pyramid() and
+                  set create_tiled_pyramid's pyramid_padding parameter to get_pyramid_padding().
+        !*/
+
+        void set_pyramid_padding (
+            unsigned long value
+        );
+        /*!
+            ensures
+                - #get_pyramid_padding() == value
+        !*/
+
+        unsigned long get_pyramid_outer_padding (
+        ) const;
+        /*!
+            ensures
+                - When this object creates a pyramid it will call create_tiled_pyramid()
+                  and set create_tiled_pyramid's pyramid_outer_padding parameter to
+                  get_pyramid_outer_padding().
+        !*/
+
+        void set_pyramid_outer_padding (
+            unsigned long value
+        );
+        /*!
+            ensures
+                - #get_pyramid_outer_padding() == value
+        !*/
+
+        template <typename forward_iterator>
+        void to_tensor (
+            forward_iterator ibegin,
+            forward_iterator iend,
+            resizable_tensor& data
+        ) const;
+        /*!
+            requires
+                - [ibegin, iend) is an iterator range over input_type objects.
+                - std::distance(ibegin,iend) > 0
+                - The input range should contain images that all have the same
+                  dimensions.
+            ensures
+                - Converts the iterator range into a tensor and stores it into #data.  In
+                  particular, we will have:
+                    - #data.num_samples() == std::distance(ibegin,iend)
+                    - #data.k() == 1
+                    - Each sample in #data contains a tiled image pyramid of the
+                      corresponding input image.  The tiled pyramid is created by
+                      create_tiled_pyramid().
+                  Moreover, each pixel is normalized by dividing it by 256.0.
+        !*/
+
+        bool image_contained_point (
+            const tensor& data,
+            const point& p
+        ) const;
+        /*!
+            requires
+                - data is a tensor that was produced by this->to_tensor()
+            ensures
+                - Since data is a tensor that is built from a bunch of identically sized
+                  images, we can ask if those images were big enough to contain the point
+                  p.  This function returns the answer to that question.
+        !*/
+
+        drectangle image_space_to_tensor_space (
+            const tensor& data,
+            double scale,
+            drectangle r
+        ) const;
+        /*!
+            requires
+                - data is a tensor that was produced by this->to_tensor()
+                - 0 < scale <= 1
+            ensures
+                - This function maps from to_tensor()'s input image space to its output
+                  tensor space.  Therefore, given that data is a tensor produced by
+                  to_tensor(), image_space_to_tensor_space() allows you to ask for the
+                  rectangle in data that corresponds to a rectangle in the original image
+                  space.
+
+                  Note that since the output tensor contains an image pyramid, there are
+                  multiple points in the output tensor that correspond to any input
+                  location.  So you must also specify a scale so we know what level of the
+                  pyramid is needed.  So given a rectangle r in an input image, you can
+                  ask, what rectangle in data corresponds to r when things are scale times
+                  smaller?  That rectangle is returned by this function.
+                - A scale of 1 means we don't move anywhere in the pyramid scale space relative
+                  to the input image while smaller values of scale mean we move down the
+                  pyramid.
+        !*/
+
+        drectangle tensor_space_to_image_space (
+            const tensor& data,
+            drectangle r
+        ) const;
+        /*!
+            requires
+                - data is a tensor that was produced by this->to_tensor()
+            ensures
+                - This function maps from to_tensor()'s output tensor space to its input
+                  image space.  Therefore, given that data is a tensor produced by
+                  to_tensor(), tensor_space_to_image_space() allows you to ask for the
+                  rectangle in the input image that corresponds to a rectangle in data.
+                - It should be noted that this function isn't always an inverse of
+                  image_space_to_tensor_space().  This is because you can ask
+                  image_space_to_tensor_space() for the coordinates of points outside the input
+                  image and they will be mapped to somewhere that doesn't have an inverse.
+                  But for points actually inside the input image this function performs an
+                  approximate inverse mapping.  I.e. when image_contained_point(data,center(r))==true
+                  there is an approximate inverse.
+        !*/
+
+    };
+
 // ----------------------------------------------------------------------------------------

    template <