Refactored the interfaces and objects related to object detection so that

they can support movable object part models.  Now all that needs to be
done is to implement the TODO inside the scan_image_pyramid object and
the movable part model support should be up and working.
This commit is contained in:
Davis King 2012-08-12 13:41:46 -04:00
parent 6f57d405e2
commit 838caffb8a
8 changed files with 403 additions and 94 deletions

View File

@ -157,8 +157,8 @@ namespace dlib
minus the threshold, therefore this is a value > 0.
- #dets[i].second == the bounding box for the i-th detection.
- #get_scanner() will have been loaded with img. Therefore, you can call
#get_scanner().get_feature_vector() to obtain the feature vectors for
the resulting object detection boxes.
#get_scanner().get_feature_vector() to obtain the feature vectors or
full_object_detections for the resulting object detection boxes.
- The detection threshold is adjusted by having adjust_threshold added
to it. Therefore, an adjust_threshold value > 0 makes detecting
objects harder while a negative one makes it easier.

View File

@ -9,6 +9,7 @@
#include "../image_processing.h"
#include "../array2d.h"
#include <vector>
#include "full_object_detection.h"
namespace dlib
{
@ -52,12 +53,24 @@ namespace dlib
void add_detection_template (
const rectangle& object_box,
const std::vector<rectangle>& feature_extraction_regions
const std::vector<rectangle>& stationary_feature_extraction_regions,
const std::vector<rectangle>& movable_feature_extraction_regions
);
void add_detection_template (
const rectangle& object_box,
const std::vector<rectangle>& stationary_feature_extraction_regions
);
inline unsigned long get_num_detection_templates (
) const;
inline unsigned long get_num_movable_components_per_detection_template (
) const;
inline unsigned long get_num_stationary_components_per_detection_template (
) const;
inline unsigned long get_num_components_per_detection_template (
) const;
@ -96,7 +109,13 @@ namespace dlib
) const;
void get_feature_vector (
const full_object_detection& obj,
feature_vector_type& psi
) const;
full_object_detection get_feature_vector (
const rectangle& rect,
const feature_vector_type& w,
feature_vector_type& psi
) const;
@ -129,6 +148,7 @@ namespace dlib
{
rectangle object_box; // always centered at (0,0)
std::vector<rectangle> rects; // template with respect to (0,0)
std::vector<rectangle> movable_rects;
};
friend void serialize(const detection_template& item, std::ostream& out)
@ -394,27 +414,61 @@ namespace dlib
void scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
add_detection_template (
const rectangle& object_box,
const std::vector<rectangle>& feature_extraction_regions
const std::vector<rectangle>& stationary_feature_extraction_regions,
const std::vector<rectangle>& movable_feature_extraction_regions
)
{
#ifdef ENABLE_ASSERTS
// make sure requires clause is not broken
DLIB_ASSERT((get_num_detection_templates() == 0 ||
get_num_components_per_detection_template() == feature_extraction_regions.size()) &&
(get_num_stationary_components_per_detection_template() == stationary_feature_extraction_regions.size() &&
get_num_movable_components_per_detection_template() == movable_feature_extraction_regions.size())) &&
center(object_box) == point(0,0),
"\t void scan_image_pyramid::add_detection_template()"
<< "\n\t The number of rects in this new detection template doesn't match "
<< "\n\t the number in previous detection templates."
<< "\n\t get_num_components_per_detection_template(): " << get_num_components_per_detection_template()
<< "\n\t feature_extraction_regions.size(): " << feature_extraction_regions.size()
<< "\n\t get_num_stationary_components_per_detection_template(): " << get_num_stationary_components_per_detection_template()
<< "\n\t stationary_feature_extraction_regions.size(): " << stationary_feature_extraction_regions.size()
<< "\n\t get_num_movable_components_per_detection_template(): " << get_num_movable_components_per_detection_template()
<< "\n\t movable_feature_extraction_regions.size(): " << movable_feature_extraction_regions.size()
<< "\n\t this: " << this
);
for (unsigned long i = 0; i < movable_feature_extraction_regions.size(); ++i)
{
DLIB_ASSERT(center(movable_feature_extraction_regions[i]) == point(0,0),
"Invalid inputs were given to this function."
<< "\n\t center(movable_feature_extraction_regions["<<i<<"]): " << center(movable_feature_extraction_regions[i])
<< "\n\t this: " << this
);
}
#endif
detection_template temp;
temp.object_box = object_box;
temp.rects = feature_extraction_regions;
temp.rects = stationary_feature_extraction_regions;
temp.movable_rects = movable_feature_extraction_regions;
det_templates.push_back(temp);
}
// ----------------------------------------------------------------------------------------
template <
typename Pyramid_type,
typename Feature_extractor_type
>
void scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
add_detection_template (
const rectangle& object_box,
const std::vector<rectangle>& stationary_feature_extraction_regions
)
{
// an empty set of movable feature regions
const std::vector<rectangle> movable_feature_extraction_regions;
add_detection_template(object_box, stationary_feature_extraction_regions,
movable_feature_extraction_regions);
}
// ----------------------------------------------------------------------------------------
template <
@ -428,6 +482,48 @@ namespace dlib
return det_templates.size();
}
// ----------------------------------------------------------------------------------------
template <
typename Pyramid_type,
typename Feature_extractor_type
>
unsigned long scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
get_num_stationary_components_per_detection_template (
) const
{
// make sure requires clause is not broken
DLIB_ASSERT(get_num_detection_templates() > 0 ,
"\t unsigned long scan_image_pyramid::get_num_stationary_components_per_detection_template()"
<< "\n\t You need to give some detection templates before calling this function. "
<< "\n\t get_num_detection_templates(): " << get_num_detection_templates()
<< "\n\t this: " << this
);
return det_templates[0].rects.size();
}
// ----------------------------------------------------------------------------------------
template <
typename Pyramid_type,
typename Feature_extractor_type
>
unsigned long scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
get_num_movable_components_per_detection_template (
) const
{
// make sure requires clause is not broken
DLIB_ASSERT(get_num_detection_templates() > 0 ,
"\t unsigned long scan_image_pyramid::get_num_movable_components_per_detection_template()"
<< "\n\t You need to give some detection templates before calling this function. "
<< "\n\t get_num_detection_templates(): " << get_num_detection_templates()
<< "\n\t this: " << this
);
return det_templates[0].movable_rects.size();
}
// ----------------------------------------------------------------------------------------
template <
@ -446,7 +542,8 @@ namespace dlib
<< "\n\t this: " << this
);
return det_templates[0].rects.size();
return get_num_movable_components_per_detection_template() +
get_num_stationary_components_per_detection_template();
}
// ----------------------------------------------------------------------------------------
@ -697,25 +794,48 @@ namespace dlib
typename Pyramid_type,
typename Feature_extractor_type
>
void scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
full_object_detection scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
get_feature_vector (
const rectangle& rect,
const feature_vector_type&,// w,
feature_vector_type& psi
) const
{
// TODO
get_feature_vector(full_object_detection(rect), psi);
return full_object_detection(rect);
}
// ----------------------------------------------------------------------------------------
template <
typename Pyramid_type,
typename Feature_extractor_type
>
void scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
get_feature_vector (
const full_object_detection& obj,
feature_vector_type& psi
) const
{
// make sure requires clause is not broken
DLIB_ASSERT(get_num_detection_templates() > 0 &&
is_loaded_with_image() &&
psi.size() >= get_num_dimensions(),
psi.size() >= get_num_dimensions() &&
obj.movable_parts.size() == get_num_movable_components_per_detection_template(),
"\t void scan_image_pyramid::get_feature_vector()"
<< "\n\t Invalid inputs were given to this function "
<< "\n\t get_num_detection_templates(): " << get_num_detection_templates()
<< "\n\t is_loaded_with_image(): " << is_loaded_with_image()
<< "\n\t psi.size(): " << psi.size()
<< "\n\t get_num_dimensions(): " << get_num_dimensions()
<< "\n\t get_num_movable_components_per_detection_template(): " << get_num_movable_components_per_detection_template()
<< "\n\t obj.movable_parts.size(): " << obj.movable_parts.size()
<< "\n\t this: " << this
);
const rectangle rect = obj.rect;
pyramid_type pyr;
rectangle mapped_rect;
detection_template best_template;

View File

@ -8,6 +8,7 @@
#include "../image_processing.h"
#include "../array2d.h"
#include <vector>
#include "full_object_detection_abstract.h"
namespace dlib
{
@ -56,30 +57,39 @@ namespace dlib
objects, which associate a vector with each location in an image.
2. A detection template. This is a rectangle which defines the shape of a
sliding window (the object_box), as well as a set of rectangles which
envelop it. This set of enveloping rectangles defines the spatial
structure of the overall feature extraction within a sliding window.
In particular, each location of a sliding window has a feature vector
sliding window (i.e. the object_box), as well as a set of rectangular feature
extraction regions inside it. This set of regions defines the spatial
structure of the overall feature extraction within a sliding window. In
particular, each location of a sliding window has a feature vector
associated with it. This feature vector is defined as follows:
- Let N denote the number of enveloping rectangles.
- Let N denote the number of feature extraction zones.
- Let M denote the dimensionality of the vectors output by Feature_extractor_type
objects.
- Let F(i) == the M dimensional vector which is the sum of all vectors
given by our Feature_extractor_type object inside the ith enveloping
rectangle.
given by our Feature_extractor_type object inside the ith feature extraction
zone.
- Then the feature vector for a sliding window is an M*N dimensional vector
[F(1) F(2) F(3) ... F(N)] (i.e. it is a concatenation of the N vectors).
This feature vector can be thought of as a collection of N "bags of features",
each bag coming from a spatial location determined by one of the enveloping
rectangles.
each bag coming from a spatial location determined by one of the rectangular
feature extraction zones.
3. A weight vector and a threshold value. The dot product between the weight
vector and the feature vector for a sliding window location gives the score
of the window. If this score is greater than the threshold value then the
window location is output as a detection.
Finally, the sliding window classifiers described above are applied to every level
of an image pyramid.
Finally, the sliding window classifiers described above are applied to every level of
an image pyramid. Moreover, some of the feature extraction zones are allowed to move
freely within the object box. This means that when we are sliding the classifier over
an image, some feature extraction zones are stationary (i.e. always in the same place
relative to the object box) while others are allowed to move anywhere within the object
box. In particular, the movable regions are placed at the locations that maximize the
score of the classifier. Note further that each of the movable feature extraction
zones must pass a threshold test for it to be included. That is, if the score that a
movable zone would contribute to the overall score for a sliding window location is not
positive then that zone is not included in the feature vector (i.e. its part of the
feature vector is set to zero. This way the length of the feature vector stays constant).
THREAD SAFETY
Concurrent access to an instance of this object is not safe and should be protected
@ -164,30 +174,48 @@ namespace dlib
void add_detection_template (
const rectangle& object_box,
const std::vector<rectangle>& feature_extraction_regions
const std::vector<rectangle>& stationary_feature_extraction_regions,
const std::vector<rectangle>& movable_feature_extraction_regions
);
/*!
requires
- center(object_box) == point(0,0),
- center(object_box) == point(0,0)
- for all valid i:
- center(movable_feature_extraction_regions[i]) == point(0,0)
- if (get_num_detection_templates() > 0) then
- get_num_components_per_detection_template() == feature_extraction_regions.size()
- get_num_stationary_components_per_detection_template() == stationary_feature_extraction_regions.size()
- get_num_movable_components_per_detection_template() == movable_feature_extraction_regions.size()
(i.e. if you already have detection templates in this object, then
any new detection template must declare a consistent number of
feature extraction regions)
ensures
- Adds another detection template to this object. In particular, object_box
defines the size and shape of a sliding window while feature_extraction_regions
defines the locations for feature extraction as discussed in the WHAT THIS
OBJECT REPRESENTS section above. Note also that the locations of the feature
extraction regions are relative to the object_box.
defines the size and shape of a sliding window while stationary_feature_extraction_regions
and movable_feature_extraction_regions define the locations for feature extraction as
discussed in the WHAT THIS OBJECT REPRESENTS section above. Note also that the locations of
the stationary feature extraction regions are relative to the object_box.
- #get_num_detection_templates() == get_num_detection_templates() + 1
- The order of rectangles in feature_extraction_regions matters. Recall that
each rectangle gets its own set of features. So given two different templates,
their ith rectangles will both share the same part of the weight vector (w)
supplied to detect(). So there should be some reasonable correspondence
- The order of rectangles in stationary_feature_extraction_regions and
movable_feature_extraction_regions matters. Recall that each rectangle
gets its own set of features. So given two different templates, their
ith rectangles will both share the same part of the weight vector (i.e. the w
supplied to detect()). So there should be some reasonable correspondence
between the rectangle ordering in different detection templates. For
example, different detection templates should place corresponding feature
feature extraction regions in roughly the same part of the object_box.
example, different detection templates should place corresponding feature
extraction regions in roughly the same part of the object_box.
- #get_num_stationary_components_per_detection_template() == stationary_feature_extraction_regions.size()
- #get_num_movable_components_per_detection_template() == movable_feature_extraction_regions.size()
!*/
void add_detection_template (
const rectangle& object_box,
const std::vector<rectangle>& stationary_feature_extraction_regions
);
/*!
ensures
- calls add_detection_template(object_box, stationary_feature_extraction_regions, empty_list)
where empty_list is a vector of size 0. I.e. this function is just a convenience
routine for adding detection templates with no movable regions.
!*/
unsigned long get_num_detection_templates (
@ -197,16 +225,40 @@ namespace dlib
- returns the number of detection templates in this object
!*/
unsigned long get_num_stationary_components_per_detection_template (
) const;
/*!
requires
- get_num_detection_templates() > 0
ensures
- A detection template is a rectangle which defines the shape of a sliding
window (the object_box), as well as a set of rectangles which define
feature extraction zones. This function returns the number of stationary
feature extraction zones in the detection templates used by this object.
!*/
unsigned long get_num_movable_components_per_detection_template (
) const;
/*!
requires
- get_num_detection_templates() > 0
ensures
- A detection template is a rectangle which defines the shape of a sliding
window (the object_box), as well as a set of rectangles which define
feature extraction zones. This function returns the number of movable
feature extraction zones in the detection templates used by this object.
!*/
unsigned long get_num_components_per_detection_template (
) const;
/*!
requires
- get_num_detection_templates() > 0
ensures
- A detection template is a rectangle which defines the shape of a
sliding window (the object_box), as well as a set of rectangles which
envelop it. This function returns the number of enveloping rectangles
in the detection templates used by this object.
- returns the total number of feature extraction zones in the detection
templates used by this object. That is, returns the following:
- get_num_movable_components_per_detection_template() +
get_num_stationary_components_per_detection_template()
!*/
long get_num_dimensions (
@ -217,7 +269,8 @@ namespace dlib
ensures
- returns the number of dimensions in the feature vector for a sliding window
location. This value is the dimensionality of the underlying feature vectors
produced by Feature_extractor_type times get_num_components_per_detection_template().
produced by Feature_extractor_type times (get_num_stationary_components_per_detection_template() +
get_num_movable_components_per_detection_template()).
!*/
unsigned long get_max_pyramid_levels (
@ -339,21 +392,45 @@ namespace dlib
!*/
void get_feature_vector (
const rectangle& rect,
const full_object_detection& obj,
feature_vector_type& psi
) const;
/*!
requires
- obj.movable_parts.size() == get_num_movable_components_per_detection_template()
- is_loaded_with_image() == true
- get_num_detection_templates() > 0
- psi.size() >= get_num_dimensions()
(i.e. psi must have preallocated its memory before this function is called)
ensures
- This function allows you to determine the feature vector used for a sliding window location.
Note that this vector is added to psi.
- Since scan_image_pyramid is a sliding window classifier system, not all possible rectangles can
be output by detect(). So in the case where obj.rect could not arise from a call to detect(), this
function will map obj.rect to the nearest possible object box and then add the feature vector for
the mapped rectangle into #psi.
- get_best_matching_rect(obj.rect) == the rectangle obj.rect gets mapped to for feature extraction.
!*/
full_object_detection get_feature_vector (
const rectangle& rect,
const feature_vector_type& w,
feature_vector_type& psi
) const;
/*!
requires
- w.size() >= get_num_dimensions()
- is_loaded_with_image() == true
- get_num_detection_templates() > 0
- psi.size() >= get_num_dimensions()
(i.e. psi must have preallocated its memory before this function is called)
ensures
- This function allows you to determine the feature vector used for a sliding window location.
Note that this vector is added to psi.
- if (rect was produced by a call to detect(), i.e. rect contains an element of dets) then
- #psi == psi + the feature vector corresponding to the sliding window location indicated
by rect.
- Let w denote the w vector given to detect(), then if we assigned psi to 0 before calling
- If w is the w vector given to detect(), then if we assigned 0 to psi before calling
get_feature_vector() then we have:
- dot(w,#psi) == the score produced by detect() for rect.
- get_best_matching_rect(rect) == rect
@ -363,6 +440,12 @@ namespace dlib
function will map rect to the nearest possible object box and then add the feature vector for
the mapped rectangle into #psi.
- get_best_matching_rect(rect) == the rectangle rect gets mapped to for feature extraction.
- returns a full_object_detection OBJ such that calling get_feature_vector(OBJ,psi)
and get_feature_vector(OBJ.rect,w,psi) on a psi of 0 would both result in the same psi vector being output.
This means that:
- OBJ.rect == rect
- OBJ.movable_parts.size() == get_num_movable_components_per_detection_template()
- OBJ.movable_parts == the locations of the movable parts inside this detection.
!*/
};

View File

@ -9,6 +9,7 @@
#include "structural_svm_object_detection_problem.h"
#include "../image_processing/object_detector.h"
#include "../image_processing/box_overlap_testing.h"
#include "../image_processing/full_object_detection.h"
namespace dlib
@ -54,6 +55,12 @@ namespace dlib
auto_overlap_tester = is_same_type<overlap_tester_type,test_box_overlap>::value;
}
const image_scanner_type& get_scanner (
) const
{
return scanner;
}
bool auto_set_overlap_tester (
) const
{
@ -239,29 +246,45 @@ namespace dlib
>
const trained_function_type train (
const image_array_type& images,
const std::vector<std::vector<rectangle> >& truth_rects
const std::vector<std::vector<full_object_detection> >& truth_object_detections
) const
{
#ifdef ENABLE_ASSERTS
// make sure requires clause is not broken
DLIB_ASSERT(is_learning_problem(images,truth_rects) == true,
"\t trained_function_type structural_object_detection_trainer::train(x,y)"
DLIB_ASSERT(is_learning_problem(images,truth_object_detections) == true,
"\t trained_function_type structural_object_detection_trainer::train()"
<< "\n\t invalid inputs were given to this function"
<< "\n\t images.size(): " << images.size()
<< "\n\t truth_rects.size(): " << truth_rects.size()
<< "\n\t is_learning_problem(images,truth_rects): " << is_learning_problem(images,truth_rects)
<< "\n\t truth_object_detections.size(): " << truth_object_detections.size()
<< "\n\t is_learning_problem(images,truth_object_detections): " << is_learning_problem(images,truth_object_detections)
);
for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
{
for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j)
{
DLIB_ASSERT(truth_object_detections[i][j].movable_parts.size() == get_scanner().get_num_movable_components_per_detection_template(),
"\t trained_function_type structural_object_detection_trainer::train()"
<< "\n\t invalid inputs were given to this function"
<< "\n\t truth_object_detections["<<i<<"]["<<j<<"].movable_parts.size(): " <<
truth_object_detections[i][j].movable_parts.size()
<< "\n\t get_scanner().get_num_movable_components_per_detection_template(): " <<
get_scanner().get_num_movable_components_per_detection_template()
);
}
}
#endif
overlap_tester_type local_overlap_tester;
if (auto_overlap_tester)
{
std::vector<std::vector<rectangle> > mapped_rects(truth_rects.size());
for (unsigned long i = 0; i < truth_rects.size(); ++i)
std::vector<std::vector<rectangle> > mapped_rects(truth_object_detections.size());
for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
{
mapped_rects[i].resize(truth_rects[i].size());
for (unsigned long j = 0; j < truth_rects[i].size(); ++j)
mapped_rects[i].resize(truth_object_detections[i].size());
for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j)
{
mapped_rects[i][j] = scanner.get_best_matching_rect(truth_rects[i][j]);
mapped_rects[i][j] = scanner.get_best_matching_rect(truth_object_detections[i][j].rect);
}
}
@ -273,7 +296,7 @@ namespace dlib
}
structural_svm_object_detection_problem<image_scanner_type,overlap_tester_type,image_array_type >
svm_prob(scanner, local_overlap_tester, images, truth_rects, num_threads);
svm_prob(scanner, local_overlap_tester, images, truth_object_detections, num_threads);
if (verbose)
svm_prob.be_verbose();
@ -293,6 +316,25 @@ namespace dlib
return object_detector<image_scanner_type,overlap_tester_type>(scanner, local_overlap_tester, w);
}
template <
typename image_array_type
>
const trained_function_type train (
const image_array_type& images,
const std::vector<std::vector<rectangle> >& truth_object_detections
) const
{
std::vector<std::vector<full_object_detection> > truth_dets(truth_object_detections.size());
for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
{
for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j)
{
truth_dets[i].push_back(full_object_detection(truth_object_detections[i][j]));
}
}
return train(images, truth_dets);
}
private:

View File

@ -6,6 +6,7 @@
#include "structural_svm_object_detection_problem_abstract.h"
#include "../image_processing/object_detector_abstract.h"
#include "../image_processing/box_overlap_testing_abstract.h"
#include "../image_processing/full_object_detection_abstract.h"
namespace dlib
@ -60,12 +61,22 @@ namespace dlib
- #get_loss_per_false_alarm() == 1
- This object will attempt to learn a model for the given
scanner object when train() is called.
- #get_scanner() == scanner
(note that only the "configuration" of scanner is copied.
I.e. the copy is done using copy_configuration())
- if (overlap_tester_type == test_box_overlap) then
- #auto_set_overlap_tester() == true
- else
- #auto_set_overlap_tester() == false
!*/
const image_scanner_type& get_scanner (
) const;
/*!
ensures
- returns the image scanner used by this object.
!*/
bool auto_set_overlap_tester (
) const;
/*!
@ -74,7 +85,7 @@ namespace dlib
state for the overlap tester used for non-max suppression.) then
- returns true
- In this case, it is determined using the find_tight_overlap_tester()
routine based on the truth_rects given to the
routine based on the truth_object_detections given to the
structural_object_detection_trainer::train() method.
- else
- returns false
@ -276,20 +287,43 @@ namespace dlib
>
const trained_function_type train (
const image_array_type& images,
const std::vector<std::vector<rectangle> >& truth_rects
const std::vector<std::vector<full_object_detection> >& truth_object_detections
) const;
/*!
requires
- is_learning_problem(images, truth_rects) == true
- is_learning_problem(images, truth_object_detections) == true
- it must be valid to pass images[0] into the image_scanner_type::load() method.
(also, image_array_type must be an implementation of dlib/array/array_kernel_abstract.h)
- for all valid i, j:
- truth_object_detections[i][j].movable_parts.size() == get_scanner().get_num_movable_components_per_detection_template()
ensures
- Uses the structural_svm_object_detection_problem to train an object_detector
on the given images and truth_rects.
on the given images and truth_object_detections.
- returns a function F with the following properties:
- F(new_image) == A prediction of what objects are present in new_image. This
is a set of rectangles indicating their positions.
!*/
template <
typename image_array_type
>
const trained_function_type train (
const image_array_type& images,
const std::vector<std::vector<rectangle> >& truth_object_detections
) const;
/*!
requires
- is_learning_problem(images, truth_object_detections) == true
- it must be valid to pass images[0] into the image_scanner_type::load() method.
(also, image_array_type must be an implementation of dlib/array/array_kernel_abstract.h)
- get_scanner().get_num_movable_components_per_detection_template() == 0
ensures
- This function is identical to the above train(), except that it converts
each element of truth_object_detections into a full_object_detection by
passing it to full_object_detection's constructor taking only a rectangle.
Therefore, this version of train() is a convenience function for the
case where you don't have any movable components of the detection templates.
!*/
};
// ----------------------------------------------------------------------------------------

View File

@ -9,6 +9,7 @@
#include <sstream>
#include "../string.h"
#include "../array.h"
#include "../image_processing/full_object_detection.h"
namespace dlib
{
@ -37,35 +38,51 @@ namespace dlib
const image_scanner_type& scanner,
const overlap_tester_type& overlap_tester,
const image_array_type& images_,
const std::vector<std::vector<rectangle> >& truth_rects_,
const std::vector<std::vector<full_object_detection> >& truth_object_detections_,
unsigned long num_threads = 2
) :
structural_svm_problem_threaded<matrix<double,0,1> >(num_threads),
boxes_overlap(overlap_tester),
images(images_),
truth_rects(truth_rects_),
truth_object_detections(truth_object_detections_),
match_eps(0.5),
loss_per_false_alarm(1),
loss_per_missed_target(1)
{
#ifdef ENABLE_ASSERTS
// make sure requires clause is not broken
DLIB_ASSERT(is_learning_problem(images_, truth_rects_) &&
DLIB_ASSERT(is_learning_problem(images_, truth_object_detections_) &&
scanner.get_num_detection_templates() > 0,
"\t structural_svm_object_detection_problem::structural_svm_object_detection_problem()"
<< "\n\t Invalid inputs were given to this function "
<< "\n\t scanner.get_num_detection_templates(): " << scanner.get_num_detection_templates()
<< "\n\t is_learning_problem(images_,truth_rects_): " << is_learning_problem(images_,truth_rects_)
<< "\n\t is_learning_problem(images_,truth_object_detections_): " << is_learning_problem(images_,truth_object_detections_)
<< "\n\t this: " << this
);
for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
{
for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j)
{
DLIB_ASSERT(truth_object_detections[i][j].movable_parts.size() == scanner.get_num_movable_components_per_detection_template(),
"\t trained_function_type structural_object_detection_trainer::train()"
<< "\n\t invalid inputs were given to this function"
<< "\n\t truth_object_detections["<<i<<"]["<<j<<"].movable_parts.size(): " <<
truth_object_detections[i][j].movable_parts.size()
<< "\n\t scanner.get_num_movable_components_per_detection_template(): " <<
scanner.get_num_movable_components_per_detection_template()
);
}
}
#endif
scanners.set_max_size(images.size());
scanners.set_size(images.size());
max_num_dets = 0;
for (unsigned long i = 0; i < truth_rects.size(); ++i)
for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
{
if (truth_rects[i].size() > max_num_dets)
max_num_dets = truth_rects[i].size();
if (truth_object_detections[i].size() > max_num_dets)
max_num_dets = truth_object_detections[i].size();
scanners[i].copy_configuration(scanner);
}
@ -160,12 +177,12 @@ namespace dlib
std::vector<rectangle> mapped_rects;
psi = 0;
for (unsigned long i = 0; i < truth_rects[idx].size(); ++i)
for (unsigned long i = 0; i < truth_object_detections[idx].size(); ++i)
{
mapped_rects.push_back(scanner.get_best_matching_rect(truth_rects[idx][i]));
scanner.get_feature_vector(truth_rects[idx][i], psi);
mapped_rects.push_back(scanner.get_best_matching_rect(truth_object_detections[idx][i].rect));
scanner.get_feature_vector(truth_object_detections[idx][i], psi);
}
psi(scanner.get_num_dimensions()) = -1.0*truth_rects[idx].size();
psi(scanner.get_num_dimensions()) = -1.0*truth_object_detections[idx].size();
// check if any of the boxes overlap. If they do then it is impossible for
// us to learn to correctly classify this sample
@ -207,8 +224,8 @@ namespace dlib
// truth rectangles.
for (unsigned long i = 0; i < mapped_rects.size(); ++i)
{
const double area = (truth_rects[idx][i].intersect(mapped_rects[i])).area();
const double total_area = (truth_rects[idx][i] + mapped_rects[i]).area();
const double area = (truth_object_detections[idx][i].rect.intersect(mapped_rects[i])).area();
const double total_area = (truth_object_detections[idx][i].rect + mapped_rects[i]).area();
if (area/total_area <= match_eps)
{
using namespace std;
@ -231,9 +248,9 @@ namespace dlib
sout << "image index "<< idx << endl;
sout << "match_eps: "<< match_eps << endl;
sout << "best possible match: "<< area/total_area << endl;
sout << "truth rect: "<< truth_rects[idx][i] << endl;
sout << "truth rect width/height: "<< truth_rects[idx][i].width()/(double)truth_rects[idx][i].height() << endl;
sout << "truth rect area: "<< truth_rects[idx][i].area() << endl;
sout << "truth rect: "<< truth_object_detections[idx][i].rect << endl;
sout << "truth rect width/height: "<< truth_object_detections[idx][i].rect.width()/(double)truth_object_detections[idx][i].rect.height() << endl;
sout << "truth rect area: "<< truth_object_detections[idx][i].rect.area() << endl;
sout << "nearest detection template rect: "<< mapped_rects[i] << endl;
sout << "nearest detection template rect width/height: "<< mapped_rects[i].width()/(double)mapped_rects[i].height() << endl;
sout << "nearest detection template rect area: "<< mapped_rects[i].area() << endl;
@ -262,13 +279,13 @@ namespace dlib
// The loss will measure the number of incorrect detections. A detection is
// incorrect if it doesn't hit a truth rectangle or if it is a duplicate detection
// on a truth rectangle.
loss = truth_rects[idx].size()*loss_per_missed_target;
loss = truth_object_detections[idx].size()*loss_per_missed_target;
// Measure the loss augmented score for the detections which hit a truth rect.
std::vector<double> truth_score_hits(truth_rects[idx].size(), 0);
std::vector<double> truth_score_hits(truth_object_detections[idx].size(), 0);
// keep track of which truth boxes we have hit so far.
std::vector<bool> hit_truth_table(truth_rects[idx].size(), false);
std::vector<bool> hit_truth_table(truth_object_detections[idx].size(), false);
std::vector<rectangle> final_dets;
// The point of this loop is to fill out the truth_score_hits array.
@ -277,7 +294,7 @@ namespace dlib
if (overlaps_any_box(final_dets, dets[i].second))
continue;
const std::pair<double,unsigned int> truth = find_best_match(truth_rects[idx], dets[i].second);
const std::pair<double,unsigned int> truth = find_best_match(truth_object_detections[idx], dets[i].second);
final_dets.push_back(dets[i].second);
@ -285,7 +302,7 @@ namespace dlib
// if hit truth rect
if (truth_match > match_eps)
{
// if this is the first time we have seen a detect which hit truth_rects[truth.second]
// if this is the first time we have seen a detect which hit truth_object_detections[truth.second]
const double score = dets[i].first - thresh;
if (hit_truth_table[truth.second] == false)
{
@ -311,7 +328,7 @@ namespace dlib
if (overlaps_any_box(final_dets, dets[i].second))
continue;
const std::pair<double,unsigned int> truth = find_best_match(truth_rects[idx], dets[i].second);
const std::pair<double,unsigned int> truth = find_best_match(truth_object_detections[idx], dets[i].second);
const double truth_match = truth.first;
if (truth_match > match_eps)
@ -342,27 +359,27 @@ namespace dlib
psi.set_size(get_num_dimensions());
psi = 0;
for (unsigned long i = 0; i < final_dets.size(); ++i)
scanner.get_feature_vector(final_dets[i], psi);
scanner.get_feature_vector(final_dets[i], current_solution, psi);
psi(scanner.get_num_dimensions()) = -1.0*final_dets.size();
}
bool overlaps_any_box (
const std::vector<rectangle>& truth_rects,
const std::vector<rectangle>& truth_object_detections,
const dlib::rectangle& rect
) const
{
for (unsigned long i = 0; i < truth_rects.size(); ++i)
for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
{
if (boxes_overlap(truth_rects[i], rect))
if (boxes_overlap(truth_object_detections[i], rect))
return true;
}
return false;
}
std::pair<double,unsigned int> find_best_match(
const std::vector<rectangle>& boxes,
const std::vector<full_object_detection>& boxes,
const rectangle rect
) const
/*!
@ -381,10 +398,10 @@ namespace dlib
for (unsigned long i = 0; i < boxes.size(); ++i)
{
const unsigned long area = rect.intersect(boxes[i]).area();
const unsigned long area = rect.intersect(boxes[i].rect).area();
if (area != 0)
{
const double new_match = area / static_cast<double>((rect + boxes[i]).area());
const double new_match = area / static_cast<double>((rect + boxes[i].rect).area());
if (new_match > match)
{
match = new_match;
@ -411,7 +428,7 @@ namespace dlib
mutable array<image_scanner_type> scanners;
const image_array_type& images;
const std::vector<std::vector<rectangle> >& truth_rects;
const std::vector<std::vector<full_object_detection> >& truth_object_detections;
unsigned long max_num_dets;
double match_eps;

View File

@ -6,6 +6,7 @@
#include "../matrix.h"
#include "structural_svm_problem_threaded_abstract.h"
#include <sstream>
#include "../image_processing/full_object_detection_abstract.h"
namespace dlib
{
@ -81,23 +82,25 @@ namespace dlib
const image_scanner_type& scanner,
const overlap_tester_type& overlap_tester,
const image_array_type& images,
const std::vector<std::vector<rectangle> >& truth_rects,
const std::vector<std::vector<full_object_detection> >& truth_object_detections,
unsigned long num_threads = 2
);
/*!
requires
- is_learning_problem(images, truth_rects)
- is_learning_problem(images, truth_object_detections)
- scanner.get_num_detection_templates() > 0
- scanner.load(images[0]) must be a valid expression.
- for all valid i, j:
- truth_object_detections[i][j].movable_rects.size() == scanner.get_num_movable_components_per_detection_template()
ensures
- This object attempts to learn a mapping from the given images to the
object locations given in truth_rects. In particular, it attempts to
learn to predict truth_rects[i] based on images[i].
object locations given in truth_object_detections. In particular, it attempts to
learn to predict truth_object_detections[i] based on images[i].
Or in other words, this object can be used to learn a parameter vector, w, such that
an object_detector declared as:
object_detector<image_scanner_type,overlap_tester_type> detector(scanner,overlap_tester,w)
results in a detector object which attempts to compute the following mapping:
truth_rects[i] == detector(images[i])
truth_object_detections[i].rect == detector(images[i])
- #get_match_eps() == 0.5
- This object will use num_threads threads during the optimization
procedure. You should set this parameter equal to the number of

View File

@ -57,6 +57,7 @@ namespace
detector(images[i], dets2);
matrix<double,0,1> psi(detector.get_w().size());
matrix<double,0,1> psi2(detector.get_w().size());
const double thresh = detector.get_w()(detector.get_w().size()-1);
DLIB_TEST(dets.size() == dets2.size());
@ -65,10 +66,19 @@ namespace
DLIB_TEST(dets[j] == dets2[j].second);
psi = 0;
detector.get_scanner().get_feature_vector(dets[j], psi);
const full_object_detection fdet = detector.get_scanner().get_feature_vector(dets[j], detector.get_w(), psi);
const double check_score = dot(psi,detector.get_w()) - thresh;
double check_score = dot(psi,detector.get_w()) - thresh;
DLIB_TEST(std::abs(check_score - dets2[j].first) < 1e-10);
// Make sure fdet works the way it is supposed to with get_feature_vector().
psi2 = 0;
detector.get_scanner().get_feature_vector(fdet, psi2);
check_score = dot(psi2,detector.get_w()) - thresh;
DLIB_TEST(std::abs(check_score - dets2[j].first) < 1e-10);
DLIB_TEST(max(abs(psi-psi2)) < 1e-10);
}
}