Changed the behavior of imglab's --flip option. It will now attempt to adjust any object part labels so that the flipped dataset has the same average part layout as the source dataset. I added a --flip-basic option that behaves like the old --flip. However, most people flipping a dataset with part annotations will want to use --flip.
2024-11-01 10:14:53 +08:00 · 2018-01-14 09:16:50 -05:00 · 2018-01-14 09:16:50 -05:00 · c68bb4e785
commit c68bb4e785
parent c13ca8ebe7
2 changed files with 201 additions and 9 deletions
--- a/tools/imglab/src/flip_dataset.cpp
+++ b/tools/imglab/src/flip_dataset.cpp
@ -7,17 +7,195 @@
 #include <string>
 #include "common.h"
 #include <dlib/image_transforms.h>
+#include <dlib/optimization.h>
+#include <dlib/image_processing.h>

 using namespace dlib;
 using namespace std;

 // ----------------------------------------------------------------------------------------

+std::vector<long> align_points(
+    const std::vector<dpoint>& from,
+    const std::vector<dpoint>& to,
+    double min_angle = -90*pi/180.0,
+    double max_angle = 90*pi/180.0,
+    long num_angles = 180
+)
+/*!
+    requires
+        - from.size() == to.size()
+    ensures
+        - Figures out how to align the points in from with the points in to.  Returns an
+          assignment array A that indicates that from[i] matches with to[A[i]].
+        - If from is empty then an empty assignment is returned.
+        - The search rotates the from points by num_angles candidate angles taken from
+          linspace(min_angle, max_angle, num_angles) and keeps the assignment with the
+          smallest total squared distance.
+        - Prints a warning to cout if, using the best assignment found, a similarity
+          transform still leaves the points more than 0.05 apart on average.
+
+          We use the Hungarian algorithm with a search over reasonable angles.  This method
+          works because we just need to account for a translation and a mild rotation and
+          nothing else.  If there is any other more complex mapping then you probably don't
+          have landmarks that make sense to flip.
+!*/
+{
+    DLIB_CASSERT(from.size() == to.size());
+
+    // Nothing to align, e.g. the dataset has no part annotations.  The code below
+    // assumes there is at least one point, so stop here.
+    if (from.empty())
+        return std::vector<long>();
+
+    std::vector<long> best_assignment;
+    double best_assignment_cost = std::numeric_limits<double>::infinity();
+
+    matrix<double> dists(from.size(), to.size());
+    matrix<long long> idists;
+
+    for (auto angle : linspace(min_angle, max_angle, num_angles))
+    {
+        // Rotate the from points by the candidate angle before measuring distances.
+        // Without this every iteration would evaluate the exact same cost matrix.
+        const auto rot = rotation_matrix(angle);
+        for (long r = 0; r < dists.nr(); ++r)
+        {
+            for (long c = 0; c < dists.nc(); ++c)
+            {
+                dists(r,c) = length_squared(rot*from[r]-to[c]);
+            }
+        }
+
+        // max_cost_assignment() wants integer costs and maximizes them, so rescale the
+        // distances to the long long range and negate.  If every distance is zero then
+        // any assignment is optimal and we must avoid dividing by zero.
+        const double max_dist = max(dists);
+        if (max_dist > 0)
+            idists = matrix_cast<long long>(-round(std::numeric_limits<long long>::max()*(dists/max_dist)));
+        else
+            idists = zeros_matrix<long long>(dists.nr(), dists.nc());
+
+        auto assignment = max_cost_assignment(idists);
+        auto cost = assignment_cost(dists, assignment);
+        if (cost < best_assignment_cost)
+        {
+            best_assignment_cost = cost;
+            best_assignment = std::move(assignment);
+        }
+    }
+
+
+    // Now compute the alignment error in terms of average distance moved by each part.  We
+    // do this so we can give the user a warning if it's impossible to make a good
+    // alignment.
+    running_stats<double> rs;
+    std::vector<dpoint> tmp(to.size());
+    // Reorder to so that tmp[i] is the point matched with from[i], i.e. to[A[i]].
+    for (size_t i = 0; i < to.size(); ++i)
+        tmp[i] = to[best_assignment[i]];
+    auto tform = find_similarity_transform(from, tmp);
+    for (size_t i = 0; i < from.size(); ++i)
+        rs.add(length(tform(from[i])-tmp[i]));
+    if (rs.mean() > 0.05)
+    {
+        cout << "WARNING, your dataset has object part annotations and you asked imglab to " << endl;
+        cout << "flip the data.  Imglab tried to adjust the part labels so that the average" << endl;
+        cout << "part layout in the flipped dataset is the same as the source dataset.  " << endl;
+        cout << "However, the part annotation scheme doesn't seem to be left-right symmetric." << endl;
+        cout << "You should manually review the output to make sure the part annotations are " << endl;
+        cout << "labeled as you expect." << endl;
+    }
+
+
+    return best_assignment;
+}
+
+// ----------------------------------------------------------------------------------------
+
+std::map<string,dpoint> normalized_parts (
+    const image_dataset_metadata::box& b
+)
+/*!
+    ensures
+        - returns a map with the same labels as b.parts where each part location has
+          been passed through dlib::impl::normalizing_tform(b.rect).
+!*/
+{
+    auto to_normalized = dlib::impl::normalizing_tform(b.rect);
+    std::map<string,dpoint> result;
+    for (auto& part : b.parts)
+    {
+        const string& label = part.first;
+        result[label] = to_normalized(part.second);
+    }
+    return result;
+}
+
+// ----------------------------------------------------------------------------------------
+
+std::map<string,dpoint> average_parts (
+    const image_dataset_metadata::dataset& data
+)
+/*!
+    ensures
+        - returns, for each part label that appears in data, the mean of that part's
+          normalized location (as given by normalized_parts()) over every box in every
+          image that has a part with that label.
+!*/
+{
+    // layout first accumulates per-label sums and is turned into means at the end.
+    std::map<string,dpoint> layout;
+    std::map<string,double> num_seen;
+
+    for (auto& img : data.images)
+    {
+        for (auto& obj : img.boxes)
+        {
+            const auto parts = normalized_parts(obj);
+            for (auto& part : parts)
+            {
+                const string& label = part.first;
+                layout[label] += part.second;
+                num_seen[label] += 1;
+            }
+        }
+    }
+
+    // divide each sum by the number of times its label was seen
+    for (auto& entry : layout)
+        entry.second /= num_seen[entry.first];
+
+    return layout;
+}
+
+// ----------------------------------------------------------------------------------------
+
+void make_part_labeling_match_target_dataset (
+    const image_dataset_metadata::dataset& target,
+    image_dataset_metadata::dataset& data 
+)
+/*!
+    Renames the part labels in data so that data's average part layout agrees with
+    target's average part layout.  Part positions are never moved, only the string
+    labels attached to them change.  The intended use is a left-right flipped dataset of
+    symmetric objects: after flipping, the labels are remapped so the flipped data
+    follows the same annotation scheme as the unflipped data.
+!*/
+{
+    // Flatten each average layout into parallel arrays of labels and points, since
+    // align_points() works on plain point vectors and we need the labels afterwards.
+    std::vector<dpoint> from, to;
+    std::vector<string> from_labels, to_labels;
+    for (auto& entry : average_parts(target))
+    {
+        from_labels.push_back(entry.first);
+        from.push_back(entry.second);
+    }
+    for (auto& entry : average_parts(data))
+    {
+        to_labels.push_back(entry.first);
+        to.push_back(entry.second);
+    }
+
+    // assignment[i] tells us that from_labels[i] should replace to_labels[assignment[i]]
+    const std::vector<long> assignment = align_points(from, to);
+    std::map<string,string> new_label_for;
+    for (size_t i = 0; i < assignment.size(); ++i)
+    {
+        const string& old_label = to_labels[assignment[i]];
+        new_label_for[old_label] = from_labels[i];
+    }
+
+    // rebuild every box's part map under the new labels
+    for (auto& img : data.images)
+    {
+        for (auto& obj : img.boxes)
+        {
+            std::map<string,point> relabeled;
+            for (auto& part : obj.parts)
+                relabeled[new_label_for[part.first]] = part.second;
+            obj.parts = std::move(relabeled);
+        }
+    }
+}
+
+// ----------------------------------------------------------------------------------------
+
 void flip_dataset(const command_line_parser& parser)
 {
-    image_dataset_metadata::dataset metadata;
-    const string datasource = parser.option("flip").argument();
+    image_dataset_metadata::dataset metadata, orig_metadata;
+    string datasource;
+    if (parser.option("flip"))
+        datasource = parser.option("flip").argument();
+    else
+        datasource = parser.option("flip-basic").argument();
    load_image_dataset_metadata(metadata,datasource);
+    orig_metadata = metadata;

    // Set the current directory to be the one that contains the
    // metadata file. We do this because the file might contain
@ -51,16 +229,18 @@ void flip_dataset(const command_line_parser& parser)
            metadata.images[i].boxes[j].rect = impl::flip_rect_left_right(metadata.images[i].boxes[j].rect, get_rect(img));

            // flip all the object parts
-            std::map<std::string,point>::iterator k;
-            for (k = metadata.images[i].boxes[j].parts.begin(); k != metadata.images[i].boxes[j].parts.end(); ++k)
+            for (auto& part : metadata.images[i].boxes[j].parts)
            {
-                k->second = impl::flip_rect_left_right(rectangle(k->second,k->second), get_rect(img)).tl_corner();
+                part.second = impl::flip_rect_left_right(rectangle(part.second,part.second), get_rect(img)).tl_corner();
            }
        }

        metadata.images[i].filename = filename;
    }

+    if (!parser.option("flip-basic"))
+        make_part_labeling_match_target_dataset(orig_metadata, metadata);
+
    save_image_dataset_metadata(metadata, metadata_filename);
 }

--- a/tools/imglab/src/main.cpp
+++ b/tools/imglab/src/main.cpp
@ -21,7 +21,7 @@
 #include <dlib/dir_nav.h>


-const char* VERSION = "1.12";
+const char* VERSION = "1.13";



@ -550,7 +550,11 @@ int main(int argc, char** argv)
                                 "image tags from <arg1>.  The results are saved into merged.xml and neither <arg1> or "
                                 "<arg2> files are modified.",2);
        parser.add_option("flip", "Read an XML image dataset from the <arg> XML file and output a left-right flipped "
-                                  "version of the dataset and an accompanying flipped XML file named flipped_<arg>.",1);
+                                  "version of the dataset and an accompanying flipped XML file named flipped_<arg>. " 
+                                  "We also adjust object part labels after flipping so that the new flipped dataset "
+                                  "has the same average part layout as the source dataset." ,1);
+        parser.add_option("flip-basic", "This option is just like --flip, except we don't adjust any object part labels after flipping. "
+                                        "The parts are instead simply mirrored to the flipped dataset.", 1);
        parser.add_option("rotate", "Read an XML image dataset and output a copy that is rotated counter clockwise by <arg> degrees. "
                                  "The output is saved to an XML file prefixed with rotated_<arg>.",1);
        parser.add_option("cluster", "Cluster all the objects in an XML file into <arg> different clusters and save "
@ -575,7 +579,7 @@ int main(int argc, char** argv)
        parser.parse(argc, argv);

        const char* singles[] = {"h","c","r","l","files","convert","parts","rmdiff", "rmtrunc", "rmdupes", "seed", "shuffle", "split", "add", 
-                                 "flip", "rotate", "tile", "size", "cluster", "resample", "min-object-size", "rmempty",
+                                 "flip-basic", "flip", "rotate", "tile", "size", "cluster", "resample", "min-object-size", "rmempty",
                                 "crop-size", "cropped-object-size", "rmlabel", "rm-other-labels", "rm-if-overlaps", "sort-num-objects", 
                                 "one-object-per-image", "jpg", "rmignore", "sort"};
        parser.check_one_time_options(singles);
@ -598,6 +602,8 @@ int main(int argc, char** argv)
        parser.check_incompatible_options("c", "rmtrunc");
        parser.check_incompatible_options("c", "add");
        parser.check_incompatible_options("c", "flip");
+        parser.check_incompatible_options("c", "flip-basic");
+        parser.check_incompatible_options("flip", "flip-basic");
        parser.check_incompatible_options("c", "rotate");
        parser.check_incompatible_options("c", "rename");
        parser.check_incompatible_options("c", "ignore");
@ -610,24 +616,30 @@ int main(int argc, char** argv)
        parser.check_incompatible_options("l", "add");
        parser.check_incompatible_options("l", "parts");
        parser.check_incompatible_options("l", "flip");
+        parser.check_incompatible_options("l", "flip-basic");
        parser.check_incompatible_options("l", "rotate");
        parser.check_incompatible_options("files", "rename");
        parser.check_incompatible_options("files", "ignore");
        parser.check_incompatible_options("files", "add");
        parser.check_incompatible_options("files", "parts");
        parser.check_incompatible_options("files", "flip");
+        parser.check_incompatible_options("files", "flip-basic");
        parser.check_incompatible_options("files", "rotate");
        parser.check_incompatible_options("add", "flip");
+        parser.check_incompatible_options("add", "flip-basic");
        parser.check_incompatible_options("add", "rotate");
        parser.check_incompatible_options("add", "tile");
        parser.check_incompatible_options("flip", "tile");
+        parser.check_incompatible_options("flip-basic", "tile");
        parser.check_incompatible_options("rotate", "tile");
        parser.check_incompatible_options("cluster", "tile");
        parser.check_incompatible_options("resample", "tile");
        parser.check_incompatible_options("flip", "cluster");
+        parser.check_incompatible_options("flip-basic", "cluster");
        parser.check_incompatible_options("rotate", "cluster");
        parser.check_incompatible_options("add", "cluster");
        parser.check_incompatible_options("flip", "resample");
+        parser.check_incompatible_options("flip-basic", "resample");
        parser.check_incompatible_options("rotate", "resample");
        parser.check_incompatible_options("add", "resample");
        parser.check_incompatible_options("shuffle", "tile");
@ -679,7 +691,7 @@ int main(int argc, char** argv)
            return EXIT_SUCCESS;
        }

-        if (parser.option("flip"))
+        if (parser.option("flip") || parser.option("flip-basic"))
        {
            flip_dataset(parser);
            return EXIT_SUCCESS;