Gave the shape_predictor_trainer the ability to learn from datasets where some landmarks are missing.
This commit is contained in:
Davis King 2015-06-12 09:00:21 -04:00
parent 3286a0a342
commit 2d57a7546e
2 changed files with 107 additions and 21 deletions

View File

@ -563,6 +563,7 @@ namespace dlib
// make sure the objects agree on the number of parts and that there is at
// least one full_object_detection.
unsigned long num_parts = 0;
std::vector<int> part_present;
for (unsigned long i = 0; i < objects.size(); ++i)
{
for (unsigned long j = 0; j < objects[i].size(); ++j)
@ -574,6 +575,7 @@ namespace dlib
"\t shape_predictor shape_predictor_trainer::train()"
<< "\n\t You can't give objects that don't have any parts to the trainer."
);
part_present.resize(num_parts);
}
else
{
@ -584,12 +586,22 @@ namespace dlib
<< "\n\t num_parts: " << num_parts
);
}
for (unsigned long p = 0; p < objects[i][j].num_parts(); ++p)
{
if (objects[i][j].part(p) != OBJECT_PART_NOT_PRESENT)
part_present[p] = 1;
}
}
}
DLIB_CASSERT(num_parts != 0,
"\t shape_predictor shape_predictor_trainer::train()"
<< "\n\t You must give at least one full_object_detection if you want to train a shape model and it must have parts."
);
DLIB_CASSERT(sum(mat(part_present)) == (long)num_parts,
"\t shape_predictor shape_predictor_trainer::train()"
<< "\n\t Each part must appear at least once in this training data. That is, "
<< "\n\t you can't have a part that is always set to OBJECT_PART_NOT_PRESENT."
);
@ -646,19 +658,33 @@ namespace dlib
private:
static matrix<float,0,1> object_to_shape (
const full_object_detection& obj
static void object_to_shape (
const full_object_detection& obj,
matrix<float,0,1>& shape,
matrix<float,0,1>& present // a mask telling which elements of #shape are present.
)
{
matrix<float,0,1> shape(obj.num_parts()*2);
shape.set_size(obj.num_parts()*2);
present.set_size(obj.num_parts()*2);
const point_transform_affine tform_from_img = impl::normalizing_tform(obj.get_rect());
for (unsigned long i = 0; i < obj.num_parts(); ++i)
{
vector<float,2> p = tform_from_img(obj.part(i));
shape(2*i) = p.x();
shape(2*i+1) = p.y();
if (obj.part(i) != OBJECT_PART_NOT_PRESENT)
{
vector<float,2> p = tform_from_img(obj.part(i));
shape(2*i) = p.x();
shape(2*i+1) = p.y();
present(2*i) = 1;
present(2*i+1) = 1;
}
else
{
shape(2*i) = 0;
shape(2*i+1) = 0;
present(2*i) = 0;
present(2*i+1) = 0;
}
}
return shape;
}
struct training_sample
@ -671,7 +697,9 @@ namespace dlib
pixel when you look it up relative to the shape in current_shape.
- target_shape == The truth shape. Stays constant during the whole
training process.
training process (except for the parts that are not present; those are
always kept equal to the current_shape values).
- present == 0/1 mask saying which parts of target_shape are present.
- rect == the position of the object in the image_idx-th image. All shape
coordinates are coded relative to this rectangle.
!*/
@ -679,6 +707,7 @@ namespace dlib
unsigned long image_idx;
rectangle rect;
matrix<float,0,1> target_shape;
matrix<float,0,1> present;
matrix<float,0,1> current_shape;
std::vector<float> feature_pixel_values;
@ -688,6 +717,7 @@ namespace dlib
std::swap(image_idx, item.image_idx);
std::swap(rect, item.rect);
target_shape.swap(item.target_shape);
present.swap(item.present);
current_shape.swap(item.current_shape);
feature_pixel_values.swap(item.feature_pixel_values);
}
@ -727,17 +757,38 @@ namespace dlib
// Now all the parts contain the ranges for the leaves so we can use them to
// compute the average leaf values.
matrix<float,0,1> present_counts(samples[0].target_shape.size());
tree.leaf_values.resize(parts.size());
for (unsigned long i = 0; i < parts.size(); ++i)
{
// Get the present counts for each dimension so we can divide each
// dimension by the number of observations we have on it to find the mean
// displacement in each leaf.
present_counts = 0;
for (unsigned long j = parts[i].first; j < parts[i].second; ++j)
present_counts += samples[j].present;
present_counts = dlib::reciprocal(present_counts);
if (parts[i].second != parts[i].first)
tree.leaf_values[i] = sums[num_split_nodes+i]*get_nu()/(parts[i].second - parts[i].first);
tree.leaf_values[i] = pointwise_multiply(present_counts,sums[num_split_nodes+i]*get_nu());
else
tree.leaf_values[i] = zeros_matrix(samples[0].target_shape);
// now adjust the current shape based on these predictions
for (unsigned long j = parts[i].first; j < parts[i].second; ++j)
{
samples[j].current_shape += tree.leaf_values[i];
// For parts that aren't present in the training data, we just make
// sure that the target shape always matches and therefore gives zero
// error. So this makes the algorithm simply ignore non-present
// landmarks.
for (long k = 0; k < samples[j].present.size(); ++k)
{
// if this part is not present
if (samples[j].present(k) == 0)
samples[j].target_shape(k) = samples[j].current_shape(k);
}
}
}
return tree;
@ -867,7 +918,7 @@ namespace dlib
{
samples.clear();
matrix<float,0,1> mean_shape;
long count = 0;
matrix<float,0,1> count;
// first fill out the target shapes
for (unsigned long i = 0; i < objects.size(); ++i)
{
@ -876,15 +927,15 @@ namespace dlib
training_sample sample;
sample.image_idx = i;
sample.rect = objects[i][j].get_rect();
sample.target_shape = object_to_shape(objects[i][j]);
object_to_shape(objects[i][j], sample.target_shape, sample.present);
for (unsigned long itr = 0; itr < get_oversampling_amount(); ++itr)
samples.push_back(sample);
mean_shape += sample.target_shape;
++count;
count += sample.present;
}
}
mean_shape /= count;
mean_shape = pointwise_multiply(mean_shape,reciprocal(count));
// now go pick random initial shapes
for (unsigned long i = 0; i < samples.size(); ++i)
@ -897,12 +948,35 @@ namespace dlib
}
else
{
// Pick a random convex combination of two of the target shapes and use
// that as the initial shape for this sample.
const unsigned long rand_idx = rnd.get_random_32bit_number()%samples.size();
const unsigned long rand_idx2 = rnd.get_random_32bit_number()%samples.size();
const double alpha = rnd.get_random_double();
samples[i].current_shape = alpha*samples[rand_idx].target_shape + (1-alpha)*samples[rand_idx2].target_shape;
samples[i].current_shape.set_size(0);
matrix<float,0,1> hits(mean_shape.size());
hits = 0;
int iter = 0;
// Pick a few samples at random and randomly average them together to
// make the initial shape. Note that we make sure we get at least one
// observation (i.e. non-OBJECT_PART_NOT_PRESENT) on each part
// location.
while(min(hits) == 0 || iter < 2)
{
++iter;
const unsigned long rand_idx = rnd.get_random_32bit_number()%samples.size();
const double alpha = rnd.get_random_double()+0.1;
samples[i].current_shape += alpha*samples[rand_idx].target_shape;
hits += alpha*samples[rand_idx].present;
}
samples[i].current_shape = pointwise_multiply(samples[i].current_shape, reciprocal(hits));
}
}
for (unsigned long i = 0; i < samples.size(); ++i)
{
for (long k = 0; k < samples[i].present.size(); ++k)
{
// if this part is not present
if (samples[i].present(k) == 0)
samples[i].target_shape(k) = samples[i].current_shape(k);
}
}
@ -1029,8 +1103,11 @@ namespace dlib
for (unsigned long k = 0; k < det.num_parts(); ++k)
{
double score = length(det.part(k) - objects[i][j].part(k))/scale;
rs.add(score);
if (objects[i][j].part(k) != OBJECT_PART_NOT_PRESENT)
{
double score = length(det.part(k) - objects[i][j].part(k))/scale;
rs.add(score);
}
}
}
}

View File

@ -359,6 +359,9 @@ namespace dlib
- images.size() > 0
- for some i: objects[i].size() != 0
(i.e. there has to be at least one full_object_detection in the training set)
- for all valid p, there must exist i and j such that:
objects[i][j].part(p) != OBJECT_PART_NOT_PRESENT.
(i.e. You can't define a part that is always set to OBJECT_PART_NOT_PRESENT.)
- for all valid i,j,k,l:
- objects[i][j].num_parts() == objects[k][l].num_parts()
(i.e. all objects must agree on the number of parts)
@ -370,6 +373,10 @@ namespace dlib
shape_predictor, SP, such that:
SP(images[i], objects[i][j].get_rect()) == objects[i][j]
This learned SP object is then returned.
- Not all parts are required to be observed for all objects. So if you
have training instances with missing parts then set the part positions
equal to OBJECT_PART_NOT_PRESENT and this algorithm will basically ignore
those missing parts.
!*/
};
@ -408,6 +415,8 @@ namespace dlib
and compare the result with the truth part positions in objects[i][j]. We
then return the average distance (measured in pixels) between a predicted
part location and its true position.
- Note that any parts in objects that are set to OBJECT_PART_NOT_PRESENT are
simply ignored.
- if (scales.size() != 0) then
- Each time we compute the distance between a predicted part location and
its true location in objects[i][j] we divide the distance by