diff --git a/dlib/image_processing/object_detector_abstract.h b/dlib/image_processing/object_detector_abstract.h index a4bc61f19..f3ba18d12 100644 --- a/dlib/image_processing/object_detector_abstract.h +++ b/dlib/image_processing/object_detector_abstract.h @@ -157,8 +157,8 @@ namespace dlib minus the threshold, therefore this is a value > 0. - #dets[i].second == the bounding box for the i-th detection. - #get_scanner() will have been loaded with img. Therefore, you can call - #get_scanner().get_feature_vector() to obtain the feature vectors for - the resulting object detection boxes. + #get_scanner().get_feature_vector() to obtain the feature vectors or + full_object_detections for the resulting object detection boxes. - The detection threshold is adjusted by having adjust_threshold added to it. Therefore, an adjust_threshold value > 0 makes detecting objects harder while a negative one makes it easier. diff --git a/dlib/image_processing/scan_image_pyramid.h b/dlib/image_processing/scan_image_pyramid.h index 981e37f99..8b315adec 100644 --- a/dlib/image_processing/scan_image_pyramid.h +++ b/dlib/image_processing/scan_image_pyramid.h @@ -9,6 +9,7 @@ #include "../image_processing.h" #include "../array2d.h" #include +#include "full_object_detection.h" namespace dlib { @@ -52,12 +53,24 @@ namespace dlib void add_detection_template ( const rectangle& object_box, - const std::vector& feature_extraction_regions + const std::vector& stationary_feature_extraction_regions, + const std::vector& movable_feature_extraction_regions + ); + + void add_detection_template ( + const rectangle& object_box, + const std::vector& stationary_feature_extraction_regions ); inline unsigned long get_num_detection_templates ( ) const; + inline unsigned long get_num_movable_components_per_detection_template ( + ) const; + + inline unsigned long get_num_stationary_components_per_detection_template ( + ) const; + inline unsigned long get_num_components_per_detection_template ( ) const; @@ -96,7 
+109,13 @@ namespace dlib ) const; void get_feature_vector ( + const full_object_detection& obj, + feature_vector_type& psi + ) const; + + full_object_detection get_feature_vector ( const rectangle& rect, + const feature_vector_type& w, feature_vector_type& psi ) const; @@ -129,6 +148,7 @@ namespace dlib { rectangle object_box; // always centered at (0,0) std::vector rects; // template with respect to (0,0) + std::vector movable_rects; }; friend void serialize(const detection_template& item, std::ostream& out) @@ -394,27 +414,61 @@ namespace dlib void scan_image_pyramid:: add_detection_template ( const rectangle& object_box, - const std::vector& feature_extraction_regions + const std::vector& stationary_feature_extraction_regions, + const std::vector& movable_feature_extraction_regions ) { +#ifdef ENABLE_ASSERTS // make sure requires clause is not broken DLIB_ASSERT((get_num_detection_templates() == 0 || - get_num_components_per_detection_template() == feature_extraction_regions.size()) && + (get_num_stationary_components_per_detection_template() == stationary_feature_extraction_regions.size() && + get_num_movable_components_per_detection_template() == movable_feature_extraction_regions.size())) && center(object_box) == point(0,0), "\t void scan_image_pyramid::add_detection_template()" << "\n\t The number of rects in this new detection template doesn't match " << "\n\t the number in previous detection templates." 
- << "\n\t get_num_components_per_detection_template(): " << get_num_components_per_detection_template() - << "\n\t feature_extraction_regions.size(): " << feature_extraction_regions.size() + << "\n\t get_num_stationary_components_per_detection_template(): " << get_num_stationary_components_per_detection_template() + << "\n\t stationary_feature_extraction_regions.size(): " << stationary_feature_extraction_regions.size() + << "\n\t get_num_movable_components_per_detection_template(): " << get_num_movable_components_per_detection_template() + << "\n\t movable_feature_extraction_regions.size(): " << movable_feature_extraction_regions.size() << "\n\t this: " << this ); + for (unsigned long i = 0; i < movable_feature_extraction_regions.size(); ++i) + { + DLIB_ASSERT(center(movable_feature_extraction_regions[i]) == point(0,0), + "Invalid inputs were given to this function." + << "\n\t center(movable_feature_extraction_regions["< + void scan_image_pyramid:: + add_detection_template ( + const rectangle& object_box, + const std::vector& stationary_feature_extraction_regions + ) + { + // an empty set of movable feature regions + const std::vector movable_feature_extraction_regions; + add_detection_template(object_box, stationary_feature_extraction_regions, + movable_feature_extraction_regions); + } + // ---------------------------------------------------------------------------------------- template < @@ -428,6 +482,48 @@ namespace dlib return det_templates.size(); } +// ---------------------------------------------------------------------------------------- + + template < + typename Pyramid_type, + typename Feature_extractor_type + > + unsigned long scan_image_pyramid:: + get_num_stationary_components_per_detection_template ( + ) const + { + // make sure requires clause is not broken + DLIB_ASSERT(get_num_detection_templates() > 0 , + "\t unsigned long scan_image_pyramid::get_num_stationary_components_per_detection_template()" + << "\n\t You need to give some detection 
templates before calling this function. " + << "\n\t get_num_detection_templates(): " << get_num_detection_templates() + << "\n\t this: " << this + ); + + return det_templates[0].rects.size(); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename Pyramid_type, + typename Feature_extractor_type + > + unsigned long scan_image_pyramid:: + get_num_movable_components_per_detection_template ( + ) const + { + // make sure requires clause is not broken + DLIB_ASSERT(get_num_detection_templates() > 0 , + "\t unsigned long scan_image_pyramid::get_num_movable_components_per_detection_template()" + << "\n\t You need to give some detection templates before calling this function. " + << "\n\t get_num_detection_templates(): " << get_num_detection_templates() + << "\n\t this: " << this + ); + + return det_templates[0].movable_rects.size(); + } + // ---------------------------------------------------------------------------------------- template < @@ -446,7 +542,8 @@ namespace dlib << "\n\t this: " << this ); - return det_templates[0].rects.size(); + return get_num_movable_components_per_detection_template() + + get_num_stationary_components_per_detection_template(); } // ---------------------------------------------------------------------------------------- @@ -697,25 +794,48 @@ namespace dlib typename Pyramid_type, typename Feature_extractor_type > - void scan_image_pyramid:: + full_object_detection scan_image_pyramid:: get_feature_vector ( const rectangle& rect, + const feature_vector_type&,// w, + feature_vector_type& psi + ) const + { + // TODO + get_feature_vector(full_object_detection(rect), psi); + return full_object_detection(rect); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename Pyramid_type, + typename Feature_extractor_type + > + void scan_image_pyramid:: + get_feature_vector ( + const full_object_detection& obj, 
feature_vector_type& psi ) const { // make sure requires clause is not broken DLIB_ASSERT(get_num_detection_templates() > 0 && is_loaded_with_image() && - psi.size() >= get_num_dimensions(), + psi.size() >= get_num_dimensions() && + obj.movable_parts.size() == get_num_movable_components_per_detection_template(), "\t void scan_image_pyramid::get_feature_vector()" << "\n\t Invalid inputs were given to this function " << "\n\t get_num_detection_templates(): " << get_num_detection_templates() << "\n\t is_loaded_with_image(): " << is_loaded_with_image() << "\n\t psi.size(): " << psi.size() << "\n\t get_num_dimensions(): " << get_num_dimensions() + << "\n\t get_num_movable_components_per_detection_template(): " << get_num_movable_components_per_detection_template() + << "\n\t obj.movable_parts.size(): " << obj.movable_parts.size() << "\n\t this: " << this ); + const rectangle rect = obj.rect; + pyramid_type pyr; rectangle mapped_rect; detection_template best_template; diff --git a/dlib/image_processing/scan_image_pyramid_abstract.h b/dlib/image_processing/scan_image_pyramid_abstract.h index da50e9ec8..1fe714611 100644 --- a/dlib/image_processing/scan_image_pyramid_abstract.h +++ b/dlib/image_processing/scan_image_pyramid_abstract.h @@ -8,6 +8,7 @@ #include "../image_processing.h" #include "../array2d.h" #include +#include "full_object_detection_abstract.h" namespace dlib { @@ -56,30 +57,39 @@ namespace dlib objects, which associate a vector with each location in an image. 2. A detection template. This is a rectangle which defines the shape of a - sliding window (the object_box), as well as a set of rectangles which - envelop it. This set of enveloping rectangles defines the spatial - structure of the overall feature extraction within a sliding window. - In particular, each location of a sliding window has a feature vector + sliding window (i.e. the object_box), as well as a set of rectangular feature + extraction regions inside it. 
This set of regions defines the spatial + structure of the overall feature extraction within a sliding window. In + particular, each location of a sliding window has a feature vector associated with it. This feature vector is defined as follows: - - Let N denote the number of enveloping rectangles. + - Let N denote the number of feature extraction zones. - Let M denote the dimensionality of the vectors output by Feature_extractor_type objects. - Let F(i) == the M dimensional vector which is the sum of all vectors - given by our Feature_extractor_type object inside the ith enveloping - rectangle. + given by our Feature_extractor_type object inside the ith feature extraction + zone. - Then the feature vector for a sliding window is an M*N dimensional vector [F(1) F(2) F(3) ... F(N)] (i.e. it is a concatenation of the N vectors). This feature vector can be thought of as a collection of N "bags of features", - each bag coming from a spatial location determined by one of the enveloping - rectangles. + each bag coming from a spatial location determined by one of the rectangular + feature extraction zones. 3. A weight vector and a threshold value. The dot product between the weight vector and the feature vector for a sliding window location gives the score of the window. If this score is greater than the threshold value then the window location is output as a detection. - Finally, the sliding window classifiers described above are applied to every level - of an image pyramid. + Finally, the sliding window classifiers described above are applied to every level of + an image pyramid. Moreover, some of the feature extraction zones are allowed to move + freely within the object box. This means that when we are sliding the classifier over + an image, some feature extraction zones are stationary (i.e. always in the same place + relative to the object box) while others are allowed to move anywhere within the object + box. 
In particular, the movable regions are placed at the locations that maximize the + score of the classifier. Note further that each of the movable feature extraction + zones must pass a threshold test for it to be included. That is, if the score that a + movable zone would contribute to the overall score for a sliding window location is not + positive then that zone is not included in the feature vector (i.e. its part of the + feature vector is set to zero, so the length of the feature vector stays constant). THREAD SAFETY Concurrent access to an instance of this object is not safe and should be protected @@ -164,30 +174,48 @@ namespace dlib void add_detection_template ( const rectangle& object_box, - const std::vector& feature_extraction_regions + const std::vector& stationary_feature_extraction_regions, + const std::vector& movable_feature_extraction_regions ); /*! requires - - center(object_box) == point(0,0), + - center(object_box) == point(0,0) + - for all valid i: + - center(movable_feature_extraction_regions[i]) == point(0,0) - if (get_num_detection_templates() > 0) then - - get_num_components_per_detection_template() == feature_extraction_regions.size() + - get_num_stationary_components_per_detection_template() == stationary_feature_extraction_regions.size() + - get_num_movable_components_per_detection_template() == movable_feature_extraction_regions.size() (i.e. if you already have detection templates in this object, then any new detection template must declare a consistent number of feature extraction regions) ensures - Adds another detection template to this object. In particular, object_box - defines the size and shape of a sliding window while feature_extraction_regions - defines the locations for feature extraction as discussed in the WHAT THIS - OBJECT REPRESENTS section above. Note also that the locations of the feature - extraction regions are relative to the object_box. 
+ defines the size and shape of a sliding window while stationary_feature_extraction_regions + and movable_feature_extraction_regions define the locations for feature extraction as + discussed in the WHAT THIS OBJECT REPRESENTS section above. Note also that the locations of + the stationary feature extraction regions are relative to the object_box. - #get_num_detection_templates() == get_num_detection_templates() + 1 - - The order of rectangles in feature_extraction_regions matters. Recall that - each rectangle gets its own set of features. So given two different templates, - their ith rectangles will both share the same part of the weight vector (w) - supplied to detect(). So there should be some reasonable correspondence + - The order of rectangles in stationary_feature_extraction_regions and + movable_feature_extraction_regions matters. Recall that each rectangle + gets its own set of features. So given two different templates, their + ith rectangles will both share the same part of the weight vector (i.e. the w + supplied to detect()). So there should be some reasonable correspondence between the rectangle ordering in different detection templates. For, - example, different detection templates should place corresponding - feature extraction regions in roughly the same part of the object_box. + example, different detection templates should place corresponding feature + extraction regions in roughly the same part of the object_box. + - #get_num_stationary_components_per_detection_template() == stationary_feature_extraction_regions.size() + - #get_num_movable_components_per_detection_template() == movable_feature_extraction_regions.size() + !*/ + + void add_detection_template ( + const rectangle& object_box, + const std::vector& stationary_feature_extraction_regions + ); + /*! + ensures + - calls add_detection_template(object_box, stationary_feature_extraction_regions, empty_list) + where empty_list is a vector of size 0. I.e. 
this function is just a convenience + routine for adding detection templates with no movable regions. !*/ unsigned long get_num_detection_templates ( @@ -197,16 +225,40 @@ namespace dlib - returns the number of detection templates in this object !*/ + unsigned long get_num_stationary_components_per_detection_template ( + ) const; + /*! + requires + - get_num_detection_templates() > 0 + ensures + - A detection template is a rectangle which defines the shape of a sliding + window (the object_box), as well as a set of rectangles which define + feature extraction zones. This function returns the number of stationary + feature extraction zones in the detection templates used by this object. + !*/ + + unsigned long get_num_movable_components_per_detection_template ( + ) const; + /*! + requires + - get_num_detection_templates() > 0 + ensures + - A detection template is a rectangle which defines the shape of a sliding + window (the object_box), as well as a set of rectangles which define + feature extraction zones. This function returns the number of movable + feature extraction zones in the detection templates used by this object. + !*/ + unsigned long get_num_components_per_detection_template ( ) const; /*! requires - get_num_detection_templates() > 0 ensures - - A detection template is a rectangle which defines the shape of a - sliding window (the object_box), as well as a set of rectangles which - envelop it. This function returns the number of enveloping rectangles - in the detection templates used by this object. + - returns the total number of feature extraction zones in the detection + templates used by this object. That is, returns the following: + - get_num_movable_components_per_detection_template() + + get_num_stationary_components_per_detection_template() !*/ long get_num_dimensions ( @@ -217,7 +269,8 @@ namespace dlib ensures - returns the number of dimensions in the feature vector for a sliding window location. 
This value is the dimensionality of the underlying feature vectors - produced by Feature_extractor_type times get_num_components_per_detection_template(). + produced by Feature_extractor_type times (get_num_stationary_components_per_detection_template() + + get_num_movable_components_per_detection_template()). !*/ unsigned long get_max_pyramid_levels ( @@ -339,21 +392,45 @@ namespace dlib !*/ void get_feature_vector ( - const rectangle& rect, + const full_object_detection& obj, feature_vector_type& psi ) const; /*! requires + - obj.movable_parts.size() == get_num_movable_components_per_detection_template() - is_loaded_with_image() == true - get_num_detection_templates() > 0 - psi.size() >= get_num_dimensions() + (i.e. psi must have preallocated its memory before this function is called) + ensures + - This function allows you to determine the feature vector used for a sliding window location. + Note that this vector is added to psi. + - Since scan_image_pyramid is a sliding window classifier system, not all possible rectangles can + be output by detect(). So in the case where obj.rect could not arise from a call to detect(), this + function will map obj.rect to the nearest possible object box and then add the feature vector for + the mapped rectangle into #psi. + - get_best_matching_rect(obj.rect) == the rectangle obj.rect gets mapped to for feature extraction. + !*/ + + full_object_detection get_feature_vector ( + const rectangle& rect, + const feature_vector_type& w, + feature_vector_type& psi + ) const; + /*! + requires + - w.size() >= get_num_dimensions() + - is_loaded_with_image() == true + - get_num_detection_templates() > 0 + - psi.size() >= get_num_dimensions() + (i.e. psi must have preallocated its memory before this function is called) ensures - This function allows you to determine the feature vector used for a sliding window location. Note that this vector is added to psi. - if (rect was produced by a call to detect(), i.e. 
rect contains an element of dets) then - #psi == psi + the feature vector corresponding to the sliding window location indicated by rect. - - Let w denote the w vector given to detect(), then if we assigned psi to 0 before calling + - If w is the w vector given to detect(), then if we assigned 0 to psi before calling get_feature_vector() then we have: - dot(w,#psi) == the score produced by detect() for rect. - get_best_matching_rect(rect) == rect @@ -363,6 +440,12 @@ namespace dlib function will map rect to the nearest possible object box and then add the feature vector for the mapped rectangle into #psi. - get_best_matching_rect(rect) == the rectangle rect gets mapped to for feature extraction. + - returns a full_object_detection OBJ such that calling get_feature_vector(OBJ,psi) + and get_feature_vector(OBJ.rect,w,psi) on a psi of 0 would both result in the same psi vector being output. + This means that: + - OBJ.rect == rect + - OBJ.movable_parts.size() == get_num_movable_components_per_detection_template() + - OBJ.movable_parts == the locations of the movable parts inside this detection. 
!*/ }; diff --git a/dlib/svm/structural_object_detection_trainer.h b/dlib/svm/structural_object_detection_trainer.h index e9dc63f41..2651f15d8 100644 --- a/dlib/svm/structural_object_detection_trainer.h +++ b/dlib/svm/structural_object_detection_trainer.h @@ -9,6 +9,7 @@ #include "structural_svm_object_detection_problem.h" #include "../image_processing/object_detector.h" #include "../image_processing/box_overlap_testing.h" +#include "../image_processing/full_object_detection.h" namespace dlib @@ -54,6 +55,12 @@ namespace dlib auto_overlap_tester = is_same_type::value; } + const image_scanner_type& get_scanner ( + ) const + { + return scanner; + } + bool auto_set_overlap_tester ( ) const { @@ -239,29 +246,45 @@ namespace dlib > const trained_function_type train ( const image_array_type& images, - const std::vector >& truth_rects + const std::vector >& truth_object_detections ) const { +#ifdef ENABLE_ASSERTS // make sure requires clause is not broken - DLIB_ASSERT(is_learning_problem(images,truth_rects) == true, - "\t trained_function_type structural_object_detection_trainer::train(x,y)" + DLIB_ASSERT(is_learning_problem(images,truth_object_detections) == true, + "\t trained_function_type structural_object_detection_trainer::train()" << "\n\t invalid inputs were given to this function" << "\n\t images.size(): " << images.size() - << "\n\t truth_rects.size(): " << truth_rects.size() - << "\n\t is_learning_problem(images,truth_rects): " << is_learning_problem(images,truth_rects) + << "\n\t truth_object_detections.size(): " << truth_object_detections.size() + << "\n\t is_learning_problem(images,truth_object_detections): " << is_learning_problem(images,truth_object_detections) ); + for (unsigned long i = 0; i < truth_object_detections.size(); ++i) + { + for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j) + { + DLIB_ASSERT(truth_object_detections[i][j].movable_parts.size() == get_scanner().get_num_movable_components_per_detection_template(), + "\t 
trained_function_type structural_object_detection_trainer::train()" + << "\n\t invalid inputs were given to this function" + << "\n\t truth_object_detections["< > mapped_rects(truth_rects.size()); - for (unsigned long i = 0; i < truth_rects.size(); ++i) + std::vector > mapped_rects(truth_object_detections.size()); + for (unsigned long i = 0; i < truth_object_detections.size(); ++i) { - mapped_rects[i].resize(truth_rects[i].size()); - for (unsigned long j = 0; j < truth_rects[i].size(); ++j) + mapped_rects[i].resize(truth_object_detections[i].size()); + for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j) { - mapped_rects[i][j] = scanner.get_best_matching_rect(truth_rects[i][j]); + mapped_rects[i][j] = scanner.get_best_matching_rect(truth_object_detections[i][j].rect); } } @@ -273,7 +296,7 @@ namespace dlib } structural_svm_object_detection_problem - svm_prob(scanner, local_overlap_tester, images, truth_rects, num_threads); + svm_prob(scanner, local_overlap_tester, images, truth_object_detections, num_threads); if (verbose) svm_prob.be_verbose(); @@ -293,6 +316,25 @@ namespace dlib return object_detector(scanner, local_overlap_tester, w); } + template < + typename image_array_type + > + const trained_function_type train ( + const image_array_type& images, + const std::vector >& truth_object_detections + ) const + { + std::vector > truth_dets(truth_object_detections.size()); + for (unsigned long i = 0; i < truth_object_detections.size(); ++i) + { + for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j) + { + truth_dets[i].push_back(full_object_detection(truth_object_detections[i][j])); + } + } + + return train(images, truth_dets); + } private: diff --git a/dlib/svm/structural_object_detection_trainer_abstract.h b/dlib/svm/structural_object_detection_trainer_abstract.h index 47969c39f..15db48c71 100644 --- a/dlib/svm/structural_object_detection_trainer_abstract.h +++ b/dlib/svm/structural_object_detection_trainer_abstract.h @@ -6,6 +6,7 
@@ #include "structural_svm_object_detection_problem_abstract.h" #include "../image_processing/object_detector_abstract.h" #include "../image_processing/box_overlap_testing_abstract.h" +#include "../image_processing/full_object_detection_abstract.h" namespace dlib @@ -60,12 +61,22 @@ namespace dlib - #get_loss_per_false_alarm() == 1 - This object will attempt to learn a model for the given scanner object when train() is called. + - #get_scanner() == scanner + (note that only the "configuration" of scanner is copied. + I.e. the copy is done using copy_configuration()) - if (overlap_tester_type == test_box_overlap) then - #auto_set_overlap_tester() == true - else - #auto_set_overlap_tester() == false !*/ + const image_scanner_type& get_scanner ( + ) const; + /*! + ensures + - returns the image scanner used by this object. + !*/ + bool auto_set_overlap_tester ( ) const; /*! @@ -74,7 +85,7 @@ namespace dlib state for the overlap tester used for non-max suppression.) then - returns true - In this case, it is determined using the find_tight_overlap_tester() - routine based on the truth_rects given to the + routine based on the truth_object_detections given to the structural_object_detection_trainer::train() method. - else - returns false @@ -276,20 +287,43 @@ namespace dlib > const trained_function_type train ( const image_array_type& images, - const std::vector >& truth_rects + const std::vector >& truth_object_detections ) const; /*! requires - - is_learning_problem(images, truth_rects) == true + - is_learning_problem(images, truth_object_detections) == true - it must be valid to pass images[0] into the image_scanner_type::load() method. 
(also, image_array_type must be an implementation of dlib/array/array_kernel_abstract.h) + - for all valid i, j: + - truth_object_detections[i][j].movable_parts.size() == get_scanner().get_num_movable_components_per_detection_template() ensures - Uses the structural_svm_object_detection_problem to train an object_detector - on the given images and truth_rects. + on the given images and truth_object_detections. - returns a function F with the following properties: - F(new_image) == A prediction of what objects are present in new_image. This is a set of rectangles indicating their positions. !*/ + + template < + typename image_array_type + > + const trained_function_type train ( + const image_array_type& images, + const std::vector >& truth_object_detections + ) const; + /*! + requires + - is_learning_problem(images, truth_object_detections) == true + - it must be valid to pass images[0] into the image_scanner_type::load() method. + (also, image_array_type must be an implementation of dlib/array/array_kernel_abstract.h) + - get_scanner().get_num_movable_components_per_detection_template() == 0 + ensures + - This function is identical to the above train(), except that it converts + each element of truth_object_detections into a full_object_detection by + passing it to full_object_detection's constructor taking only a rectangle. + Therefore, this version of train() is a convenience function for the + case where you don't have any movable components of the detection templates. 
+ !*/ }; // ---------------------------------------------------------------------------------------- diff --git a/dlib/svm/structural_svm_object_detection_problem.h b/dlib/svm/structural_svm_object_detection_problem.h index 21b0e2170..adcc21f83 100644 --- a/dlib/svm/structural_svm_object_detection_problem.h +++ b/dlib/svm/structural_svm_object_detection_problem.h @@ -9,6 +9,7 @@ #include #include "../string.h" #include "../array.h" +#include "../image_processing/full_object_detection.h" namespace dlib { @@ -37,35 +38,51 @@ namespace dlib const image_scanner_type& scanner, const overlap_tester_type& overlap_tester, const image_array_type& images_, - const std::vector >& truth_rects_, + const std::vector >& truth_object_detections_, unsigned long num_threads = 2 ) : structural_svm_problem_threaded >(num_threads), boxes_overlap(overlap_tester), images(images_), - truth_rects(truth_rects_), + truth_object_detections(truth_object_detections_), match_eps(0.5), loss_per_false_alarm(1), loss_per_missed_target(1) { +#ifdef ENABLE_ASSERTS // make sure requires clause is not broken - DLIB_ASSERT(is_learning_problem(images_, truth_rects_) && + DLIB_ASSERT(is_learning_problem(images_, truth_object_detections_) && scanner.get_num_detection_templates() > 0, "\t structural_svm_object_detection_problem::structural_svm_object_detection_problem()" << "\n\t Invalid inputs were given to this function " << "\n\t scanner.get_num_detection_templates(): " << scanner.get_num_detection_templates() - << "\n\t is_learning_problem(images_,truth_rects_): " << is_learning_problem(images_,truth_rects_) + << "\n\t is_learning_problem(images_,truth_object_detections_): " << is_learning_problem(images_,truth_object_detections_) << "\n\t this: " << this ); + for (unsigned long i = 0; i < truth_object_detections.size(); ++i) + { + for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j) + { + DLIB_ASSERT(truth_object_detections[i][j].movable_parts.size() == 
scanner.get_num_movable_components_per_detection_template(), + "\t trained_function_type structural_object_detection_trainer::train()" + << "\n\t invalid inputs were given to this function" + << "\n\t truth_object_detections["< max_num_dets) - max_num_dets = truth_rects[i].size(); + if (truth_object_detections[i].size() > max_num_dets) + max_num_dets = truth_object_detections[i].size(); scanners[i].copy_configuration(scanner); } @@ -160,12 +177,12 @@ namespace dlib std::vector mapped_rects; psi = 0; - for (unsigned long i = 0; i < truth_rects[idx].size(); ++i) + for (unsigned long i = 0; i < truth_object_detections[idx].size(); ++i) { - mapped_rects.push_back(scanner.get_best_matching_rect(truth_rects[idx][i])); - scanner.get_feature_vector(truth_rects[idx][i], psi); + mapped_rects.push_back(scanner.get_best_matching_rect(truth_object_detections[idx][i].rect)); + scanner.get_feature_vector(truth_object_detections[idx][i], psi); } - psi(scanner.get_num_dimensions()) = -1.0*truth_rects[idx].size(); + psi(scanner.get_num_dimensions()) = -1.0*truth_object_detections[idx].size(); // check if any of the boxes overlap. If they do then it is impossible for // us to learn to correctly classify this sample @@ -207,8 +224,8 @@ namespace dlib // truth rectangles. 
for (unsigned long i = 0; i < mapped_rects.size(); ++i) { - const double area = (truth_rects[idx][i].intersect(mapped_rects[i])).area(); - const double total_area = (truth_rects[idx][i] + mapped_rects[i]).area(); + const double area = (truth_object_detections[idx][i].rect.intersect(mapped_rects[i])).area(); + const double total_area = (truth_object_detections[idx][i].rect + mapped_rects[i]).area(); if (area/total_area <= match_eps) { using namespace std; @@ -231,9 +248,9 @@ namespace dlib sout << "image index "<< idx << endl; sout << "match_eps: "<< match_eps << endl; sout << "best possible match: "<< area/total_area << endl; - sout << "truth rect: "<< truth_rects[idx][i] << endl; - sout << "truth rect width/height: "<< truth_rects[idx][i].width()/(double)truth_rects[idx][i].height() << endl; - sout << "truth rect area: "<< truth_rects[idx][i].area() << endl; + sout << "truth rect: "<< truth_object_detections[idx][i].rect << endl; + sout << "truth rect width/height: "<< truth_object_detections[idx][i].rect.width()/(double)truth_object_detections[idx][i].rect.height() << endl; + sout << "truth rect area: "<< truth_object_detections[idx][i].rect.area() << endl; sout << "nearest detection template rect: "<< mapped_rects[i] << endl; sout << "nearest detection template rect width/height: "<< mapped_rects[i].width()/(double)mapped_rects[i].height() << endl; sout << "nearest detection template rect area: "<< mapped_rects[i].area() << endl; @@ -262,13 +279,13 @@ namespace dlib // The loss will measure the number of incorrect detections. A detection is // incorrect if it doesn't hit a truth rectangle or if it is a duplicate detection // on a truth rectangle. - loss = truth_rects[idx].size()*loss_per_missed_target; + loss = truth_object_detections[idx].size()*loss_per_missed_target; // Measure the loss augmented score for the detections which hit a truth rect. 
- std::vector truth_score_hits(truth_rects[idx].size(), 0); + std::vector truth_score_hits(truth_object_detections[idx].size(), 0); // keep track of which truth boxes we have hit so far. - std::vector hit_truth_table(truth_rects[idx].size(), false); + std::vector hit_truth_table(truth_object_detections[idx].size(), false); std::vector final_dets; // The point of this loop is to fill out the truth_score_hits array. @@ -277,7 +294,7 @@ namespace dlib if (overlaps_any_box(final_dets, dets[i].second)) continue; - const std::pair truth = find_best_match(truth_rects[idx], dets[i].second); + const std::pair truth = find_best_match(truth_object_detections[idx], dets[i].second); final_dets.push_back(dets[i].second); @@ -285,7 +302,7 @@ namespace dlib // if hit truth rect if (truth_match > match_eps) { - // if this is the first time we have seen a detect which hit truth_rects[truth.second] + // if this is the first time we have seen a detect which hit truth_object_detections[truth.second] const double score = dets[i].first - thresh; if (hit_truth_table[truth.second] == false) { @@ -311,7 +328,7 @@ namespace dlib if (overlaps_any_box(final_dets, dets[i].second)) continue; - const std::pair truth = find_best_match(truth_rects[idx], dets[i].second); + const std::pair truth = find_best_match(truth_object_detections[idx], dets[i].second); const double truth_match = truth.first; if (truth_match > match_eps) @@ -342,27 +359,27 @@ namespace dlib psi.set_size(get_num_dimensions()); psi = 0; for (unsigned long i = 0; i < final_dets.size(); ++i) - scanner.get_feature_vector(final_dets[i], psi); + scanner.get_feature_vector(final_dets[i], current_solution, psi); psi(scanner.get_num_dimensions()) = -1.0*final_dets.size(); } bool overlaps_any_box ( - const std::vector& truth_rects, + const std::vector& truth_object_detections, const dlib::rectangle& rect ) const { - for (unsigned long i = 0; i < truth_rects.size(); ++i) + for (unsigned long i = 0; i < truth_object_detections.size(); ++i) 
{ - if (boxes_overlap(truth_rects[i], rect)) + if (boxes_overlap(truth_object_detections[i], rect)) return true; } return false; } std::pair find_best_match( - const std::vector& boxes, + const std::vector& boxes, const rectangle rect ) const /*! @@ -381,10 +398,10 @@ namespace dlib for (unsigned long i = 0; i < boxes.size(); ++i) { - const unsigned long area = rect.intersect(boxes[i]).area(); + const unsigned long area = rect.intersect(boxes[i].rect).area(); if (area != 0) { - const double new_match = area / static_cast((rect + boxes[i]).area()); + const double new_match = area / static_cast((rect + boxes[i].rect).area()); if (new_match > match) { match = new_match; @@ -411,7 +428,7 @@ namespace dlib mutable array scanners; const image_array_type& images; - const std::vector >& truth_rects; + const std::vector >& truth_object_detections; unsigned long max_num_dets; double match_eps; diff --git a/dlib/svm/structural_svm_object_detection_problem_abstract.h b/dlib/svm/structural_svm_object_detection_problem_abstract.h index c6dc9ba95..50141fee4 100644 --- a/dlib/svm/structural_svm_object_detection_problem_abstract.h +++ b/dlib/svm/structural_svm_object_detection_problem_abstract.h @@ -6,6 +6,7 @@ #include "../matrix.h" #include "structural_svm_problem_threaded_abstract.h" #include +#include "../image_processing/full_object_detection_abstract.h" namespace dlib { @@ -81,23 +82,25 @@ namespace dlib const image_scanner_type& scanner, const overlap_tester_type& overlap_tester, const image_array_type& images, - const std::vector >& truth_rects, + const std::vector >& truth_object_detections, unsigned long num_threads = 2 ); /*! requires - - is_learning_problem(images, truth_rects) + - is_learning_problem(images, truth_object_detections) - scanner.get_num_detection_templates() > 0 - scanner.load(images[0]) must be a valid expression. 
+ - for all valid i, j: + - truth_object_detections[i][j].movable_rects.size() == scanner.get_num_movable_components_per_detection_template() ensures - This object attempts to learn a mapping from the given images to the - object locations given in truth_rects. In particular, it attempts to - learn to predict truth_rects[i] based on images[i]. + object locations given in truth_object_detections. In particular, it attempts to + learn to predict truth_object_detections[i] based on images[i]. Or in other words, this object can be used to learn a parameter vector, w, such that an object_detector declared as: object_detector detector(scanner,overlap_tester,w) results in a detector object which attempts to compute the following mapping: - truth_rects[i] == detector(images[i]) + truth_object_detections[i].rect == detector(images[i]) - #get_match_eps() == 0.5 - This object will use num_threads threads during the optimization procedure. You should set this parameter equal to the number of diff --git a/dlib/test/object_detector.cpp b/dlib/test/object_detector.cpp index 0712f3261..e01cd7973 100644 --- a/dlib/test/object_detector.cpp +++ b/dlib/test/object_detector.cpp @@ -57,6 +57,7 @@ namespace detector(images[i], dets2); matrix psi(detector.get_w().size()); + matrix psi2(detector.get_w().size()); const double thresh = detector.get_w()(detector.get_w().size()-1); DLIB_TEST(dets.size() == dets2.size()); @@ -65,10 +66,19 @@ namespace DLIB_TEST(dets[j] == dets2[j].second); psi = 0; - detector.get_scanner().get_feature_vector(dets[j], psi); + const full_object_detection fdet = detector.get_scanner().get_feature_vector(dets[j], detector.get_w(), psi); - const double check_score = dot(psi,detector.get_w()) - thresh; + double check_score = dot(psi,detector.get_w()) - thresh; DLIB_TEST(std::abs(check_score - dets2[j].first) < 1e-10); + + + // Make sure fdet works the way it is supposed to with get_feature_vector(). 
+ psi2 = 0; + detector.get_scanner().get_feature_vector(fdet, psi2); + + check_score = dot(psi2,detector.get_w()) - thresh; + DLIB_TEST(std::abs(check_score - dets2[j].first) < 1e-10); + DLIB_TEST(max(abs(psi-psi2)) < 1e-10); } }