Simplified code slightly and filled out the spec

2024-11-01 10:14:53 +08:00 · 2013-03-24 15:18:12 -04:00 · 2013-03-24 15:18:12 -04:00 · 7847038b33
commit 7847038b33
parent 94b0344532
2 changed files with 379 additions and 6 deletions
--- a/dlib/image_processing/scan_image_boxes.h
+++ b/dlib/image_processing/scan_image_boxes.h
@ -29,10 +29,6 @@ namespace dlib
            rects.clear();
            find_candidate_object_locations(img, rects);
        }
-
-        void copy_configuration (
-            const default_box_generator&
-        ){}
    };

    inline void serialize(const default_box_generator&, std::ostream& ) {}
@ -341,7 +337,7 @@ namespace dlib
        const box_generator& bg 
    )
    {
-        detect_boxes.copy_configuration(bg);
+        detect_boxes = bg;
    }

 // ----------------------------------------------------------------------------------------
@ -356,7 +352,7 @@ namespace dlib
    )
    {
        feats.copy_configuration(item.feats);
-        detect_boxes.copy_configuration(item.detect_boxes);
+        detect_boxes = item.detect_boxes;
        num_spatial_pyramid_levels = item.num_spatial_pyramid_levels;
    }

--- a/dlib/image_processing/scan_image_boxes_abstract.h
+++ b/dlib/image_processing/scan_image_boxes_abstract.h
@ -1,2 +1,379 @@
+// Copyright (C) 2013  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_SCAN_IMAGE_bOXES_ABSTRACT_H__
+#ifdef DLIB_SCAN_IMAGE_bOXES_ABSTRACT_H__
+
+#include "../matrix.h"
+#include "../geometry.h"
+#include "../image_processing.h"
+#include "../array2d.h"
+#include "full_object_detection.h"
+#include "../image_transforms/segment_image_abstract.h"
+#include <vector>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class default_box_generator
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This is a function object that takes in an image and outputs a set of
+                candidate object locations.  It is also the default box generator used by
+                the scan_image_boxes object defined below.
+        !*/
+
+    public:
+
+        template <typename image_type>
+        void operator() (
+            const image_type& img,
+            std::vector<rectangle>& rects
+        ) const
+        /*!
+            ensures
+                - #rects == the set of candidate object locations which should be searched
+                  inside img.  That is, these are the rectangles which might contain
+                  objects of interest within the given image.
+        !*/
+        {
+            rects.clear();
+            find_candidate_object_locations(img, rects);
+        }
+    };
+
+    inline void serialize  (const default_box_generator&, std::ostream& ) {}
+    inline void deserialize(      default_box_generator&, std::istream& ) {}
+    /*!
+        ensures
+            - provides serialization support.  
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename Feature_extractor_type,
+        typename Box_generator = default_box_generator
+        >
+    class scan_image_boxes : noncopyable
+    {
+        /*!
+            REQUIREMENTS ON Feature_extractor_type
+                - must be an object with an interface compatible with the hashed_feature_image 
+                  object defined in dlib/image_keypoint/hashed_feature_image_abstract.h or 
+                  with the nearest_neighbor_feature_image object defined in 
+                  dlib/image_keypoint/nearest_neighbor_feature_image_abstract.h
+
+            REQUIREMENTS ON Box_generator
+                - must be an object with an interface compatible with the
+                  default_box_generator object defined at the top of this file.
+
+            INITIAL VALUE
+                - get_num_spatial_pyramid_levels() == 3
+                - is_loaded_with_image() == false
+
+            WHAT THIS OBJECT REPRESENTS
+                This object is a tool for running a classifier over an image with the goal
+                of localizing each object present.  The localization is in the form of the
+                bounding box around each object of interest.  
+
+                Unlike the scan_image_pyramid object which scans a fixed sized window over
+                an image pyramid, the scan_image_boxes tool allows you to define your own
+                list of "candidate object locations" which should be evaluated.  This is
+                simply a list of rectangle objects which might contain objects of interest.
+                The scan_image_boxes object will then evaluate the classifier at each of
+                these locations and return the subset of rectangles which appear to have
+                objects in them.  The candidate object location generation is provided by
+                the Box_generator that is passed in as a template argument.  
+
+                This object can also be understood as a general tool for implementing the
+                spatial pyramid models described in the paper:
+                    Beyond Bags of Features: Spatial Pyramid Matching for Recognizing 
+                    Natural Scene Categories by Svetlana Lazebnik, Cordelia Schmid, 
+                    and Jean Ponce


+                The classifiers used by this object have three parts: 
+                   1. The underlying feature extraction provided by Feature_extractor_type
+                      objects, which associate a vector with each location in an image.
+
+                   2. A rule for extracting a feature vector from a candidate object
+                      location.  In this object we use the spatial pyramid matching method.
+                      This means we cut an object's detection window into a set of "feature
+                      extraction regions" and extract a bag-of-words vector from each
+                      before finally concatenating them to form the final feature vector
+                      representing the entire object window.  The set of feature extraction
+                      regions can be configured by the user by calling
+                      set_num_spatial_pyramid_levels().  To be a little more precise, the
+                      feature vector for a candidate object window is defined as follows:
+                        - Let N denote the number of feature extraction zones.
+                        - Let M denote the dimensionality of the vectors output by
+                          Feature_extractor_type objects.
+                        - Let F(i) == the M dimensional vector which is the sum of all
+                          vectors given by our Feature_extractor_type object inside the
+                          i-th feature extraction zone.  So this is notionally a
+                          bag-of-words vector from the i-th zone.
+                        - Then the feature vector for an object window is an M*N
+                          dimensional vector [F(1) F(2) F(3) ... F(N)] (i.e. it is a
+                          concatenation of the N vectors).  This feature vector can be
+                          thought of as a collection of N bags-of-words, each bag coming
+                          from a spatial location determined by one of the feature
+                          extraction zones.
+                          
+                   3. A weight vector and a threshold value.  The dot product between the
+                      weight vector and the feature vector for a candidate object location
+                      gives the score of the location.  If this score is greater than the
+                      threshold value then the candidate object location is output as a
+                      detection.
+
+            THREAD SAFETY
+                Concurrent access to an instance of this object is not safe and should be
+                protected by a mutex lock except for the case where you are copying the
+                configuration (via copy_configuration()) of a scan_image_boxes object to
+                many other threads.  In this case, it is safe to copy the configuration of
+                a shared object so long as no other operations are performed on it.
+        !*/
+
+    public:
+
+        typedef matrix<double,0,1> feature_vector_type;
+
+        typedef Feature_extractor_type feature_extractor_type;
+        typedef Box_generator box_generator;
+
+        scan_image_boxes (
+        );  
+        /*!
+            ensures
+                - this object is properly initialized
+        !*/
+
+        template <
+            typename image_type
+            >
+        void load (
+            const image_type& img
+        );
+        /*!
+            requires
+                - image_type must be a type with the following properties:
+                    - image_type objects can be loaded into Feature_extractor_type
+                      objects via Feature_extractor_type::load().
+                    - image_type objects can be passed to the first argument of
+                      Box_generator::operator()
+            ensures
+                - #is_loaded_with_image() == true
+                - This object is ready to run a classifier over img to detect object
+                  locations.  Call detect() to do this.
+        !*/
+
+        bool is_loaded_with_image (
+        ) const;
+        /*!
+            ensures
+                - returns true if this object has been loaded with an image to process and
+                  false otherwise.
+        !*/
+
+        void copy_configuration(
+            const feature_extractor_type& fe
+        );
+        /*!
+            ensures
+                - Let BASE_FE denote the feature_extractor_type object used internally for
+                  local feature extraction.  Then this function performs
+                  BASE_FE.copy_configuration(fe) (i.e. this function allows you to
+                  configure the parameters of the underlying feature extractor used by a
+                  scan_image_boxes object)
+        !*/
+
+        void copy_configuration(
+            const box_generator& bg
+        );
+        /*!
+            ensures
+                - Let BASE_BG denote the box_generator object used internally for candidate
+                  box generation.  Then this function performs:
+                    BASE_BG = bg;
+                  (i.e. this function allows you to configure the parameters of the
+                  underlying box generator used by a scan_image_boxes object)
+        !*/
+
+        void copy_configuration (
+            const scan_image_boxes& item
+        );
+        /*!
+            ensures
+                - Copies all the state information of item into *this, except for state 
+                  information populated by load().  More precisely, given two scan_image_boxes 
+                  objects S1 and S2, the following sequence of instructions should always 
+                  result in both of them having the exact same state:
+                    S2.copy_configuration(S1);
+                    S1.load(img);
+                    S2.load(img);
+        !*/
+
+        long get_num_dimensions (
+        ) const;
+        /*!
+            ensures
+                - returns the number of dimensions in the feature vector for a candidate
+                  object location.  This value is the dimensionality of the underlying
+                  feature vectors produced by Feature_extractor_type times the number of
+                  feature extraction regions used.  Note that the number of feature
+                  extraction regions used is a function of
+                  get_num_spatial_pyramid_levels().
+        !*/
+
+        unsigned long get_num_spatial_pyramid_levels (
+        ) const;
+        /*!
+            ensures
+                - returns the number of layers in the spatial pyramid.  For example, if
+                  this function returns 1 then it means we use a simple bag-of-words
+                  representation over the whole object window.  If it returns 2 then it
+                  means the feature representation is the concatenation of 5 bag-of-words
+                  vectors, one from the entire object window and 4 others from 4 different
+                  parts of the object window.  If it returns 3 then there are 1+4+16
+                  bag-of-words vectors concatenated together in the feature representation,
+                  and so on.
+        !*/
+
+        void set_num_spatial_pyramid_levels (
+            unsigned long levels
+        );
+        /*!
+            requires
+                - levels > 0
+            ensures
+                - #get_num_spatial_pyramid_levels() == levels
+        !*/
+
+        void detect (
+            const feature_vector_type& w,
+            std::vector<std::pair<double, rectangle> >& dets,
+            const double thresh
+        ) const;
+        /*!
+            requires
+                - w.size() >= get_num_dimensions()
+                - is_loaded_with_image() == true
+            ensures
+                - Scans over all the candidate object locations as discussed in the WHAT
+                  THIS OBJECT REPRESENTS section and stores all detections into #dets.
+                - for all valid i:
+                    - #dets[i].second == The candidate object location which produced this
+                      detection.  This rectangle gives the location of the detection.  
+                    - #dets[i].first == The score for this detection.  This value is equal
+                      to dot(w, feature vector for this candidate object location).
+                    - #dets[i].first >= thresh
+                - #dets will be sorted in descending order of score.
+                  (i.e.  #dets[i].first >= #dets[j].first for all i, and j>i)
+                - Elements of w beyond index get_num_dimensions()-1 are ignored.  I.e. only
+                  the first get_num_dimensions() elements of w are used.
+                - Note that no form of non-max suppression is performed.  If a location
+                  has a score >= thresh then it is reported in #dets.
+        !*/
+
+        void get_feature_vector (
+            const full_object_detection& obj,
+            feature_vector_type& psi
+        ) const;
+        /*!
+            requires
+                - obj.num_parts() == 0 
+                - is_loaded_with_image() == true
+                - psi.size() >= get_num_dimensions()
+                  (i.e. psi must have preallocated its memory before this function is called)
+            ensures
+                - This function allows you to determine the feature vector used for a
+                  candidate object location output from detect().  Note that this vector is
+                  added to psi.  Note also that you must use get_full_object_detection() to
+                  convert a rectangle from detect() into the needed full_object_detection.
+                - Since scan_image_boxes only searches a limited set of object locations,
+                  not all possible rectangles can be output by detect().  So in the case
+                  where obj.get_rect() could not arise from a call to detect(), this
+                  function will map obj.get_rect() to the nearest possible rectangle and
+                  then add the feature vector for the mapped rectangle into #psi.
+                - get_best_matching_rect(obj.get_rect()) == the rectangle obj.get_rect()
+                  gets mapped to for feature extraction.
+        !*/
+
+        full_object_detection get_full_object_detection (
+            const rectangle& rect,
+            const feature_vector_type& w
+        ) const;
+        /*!
+            ensures
+                - returns full_object_detection(rect)
+                  (This function is here only for compatibility with the scan_image_pyramid
+                  object)
+        !*/
+
+        const rectangle get_best_matching_rect (
+            const rectangle& rect
+        ) const;
+        /*!
+            requires
+                - is_loaded_with_image() == true
+            ensures
+                - Since scan_image_boxes only searches a limited set of object locations,
+                  not all possible rectangles can be represented.  Therefore, this function
+                  allows you to supply a rectangle and obtain the nearest possible
+                  candidate object location rectangle.
+        !*/
+
+        unsigned long get_num_detection_templates (
+        ) const { return 1; }
+        /*!
+            ensures
+                - returns 1.  Note that this function is here only for compatibility with 
+                  the scan_image_pyramid object.  Notionally, its return value indicates 
+                  that a scan_image_boxes object is always ready to detect objects once
+                  an image has been loaded.
+        !*/
+
+        unsigned long get_num_movable_components_per_detection_template (
+        ) const { return 0; }
+        /*!
+            ensures
+                - returns 0.  Note that this function is here only for compatibility with
+                  the scan_image_pyramid object.  Its return value means that this object
+                  does not support using movable part models.
+        !*/
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename Feature_extractor_type,
+        typename Box_generator 
+        >
+    void serialize (
+        const scan_image_boxes<Feature_extractor_type,Box_generator>& item,
+        std::ostream& out
+    );
+    /*!
+        provides serialization support 
+    !*/
+
+    template <
+        typename Feature_extractor_type,
+        typename Box_generator 
+        >
+    void deserialize (
+        scan_image_boxes<Feature_extractor_type,Box_generator>& item,
+        std::istream& in 
+    );
+    /*!
+        provides deserialization support 
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_SCAN_IMAGE_bOXES_ABSTRACT_H__
+