Added two vehicle detection examples.

This commit is contained in:
Davis King 2017-08-26 17:13:47 -04:00
parent 0a7a75a245
commit 51eae2ba07
4 changed files with 525 additions and 0 deletions

examples/CMakeLists.txt

@@ -121,6 +121,8 @@ if (NOT USING_OLD_VISUAL_STUDIO_COMPILER)
add_gui_example(random_cropper_ex)
add_gui_example(dnn_mmod_dog_hipsterizer)
add_gui_example(dnn_imagenet_ex)
add_gui_example(dnn_mmod_find_cars_ex)
add_example(dnn_mmod_train_find_cars_ex)
if (NOT MSVC)
# Don't try to compile this program using Visual Studio since it causes the
# compiler to run out of RAM and to crash. Maybe someday Visual Studio

examples/dnn_mmod_find_cars_ex.cpp

@@ -0,0 +1,175 @@
#include <iostream>
#include <dlib/dnn.h>
#include <dlib/data_io.h>
#include <dlib/gui_widgets.h>
#include <dlib/dir_nav.h>
#include <dlib/time_this.h>
#include <dlib/image_processing.h>
using namespace std;
using namespace dlib;
// The DNN rear-view vehicle detector network
template <long num_filters, typename SUBNET> using con5d = con<num_filters,5,5,2,2,SUBNET>;
template <long num_filters, typename SUBNET> using con5 = con<num_filters,5,5,1,1,SUBNET>;
template <typename SUBNET> using downsampler = relu<affine<con5d<32, relu<affine<con5d<32, relu<affine<con5d<16,SUBNET>>>>>>>>>;
template <typename SUBNET> using rcon5 = relu<affine<con5<55,SUBNET>>>;
using net_type = loss_mmod<con<1,9,9,1,1,rcon5<rcon5<rcon5<downsampler<input_rgb_image_pyramid<pyramid_down<6>>>>>>>>;
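// Note: this is the same architecture as the one defined in
// dnn_mmod_train_find_cars_ex.cpp, except that the bn_con (batch
// normalization) layers have been replaced with affine layers. dlib lets you
// deserialize a network trained with bn_con into an affine version, which is
// the usual thing to do at inference time since affine layers are faster and
// don't need batch statistics. Also note that the three con5d layers each
// downsample by 2x, so the network maps an 8x8 block of input pixels to 1
// output pixel.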
// ----------------------------------------------------------------------------------------
int main() try
{
net_type net;
shape_predictor sp;
// You can get this file from http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2
// This network was produced by the dnn_mmod_train_find_cars_ex.cpp example program.
// As you can see, it also includes a shape_predictor. To see a generic example of how
// to train those refer to train_shape_predictor_ex.cpp.
deserialize("mmod_rear_end_vehicle_detector.dat") >> net >> sp;
matrix<rgb_pixel> img;
load_image(img, "../mmod_cars_test_image.jpg");
image_window win;
win.set_image(img);
// Run the detector on the image and show us the output.
for (auto&& d : net(img))
{
// We use a shape_predictor to refine the exact shape and location of the detection
// box. This shape_predictor is trained to simply output the 4 corner points. So
// all we do is make a rectangle that tightly contains those 4 points and that
// rectangle is our refined detection position.
auto fd = sp(img,d);
rectangle rect;
for (unsigned long j = 0; j < fd.num_parts(); ++j)
rect += fd.part(j);
win.add_overlay(rect, rgb_pixel(255,0,0));
}
cout << "Hit enter to view the intermediate processing steps" << endl;
cin.get();
// Create a tiled image pyramid and display it on the screen.
std::vector<rectangle> rects;
matrix<rgb_pixel> tiled_img;
create_tiled_pyramid<std::remove_reference<decltype(input_layer(net))>::type::pyramid_type>(img,
tiled_img, rects, input_layer(net).get_pyramid_padding(),
input_layer(net).get_pyramid_outer_padding());
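// The tiled pyramid packs every scale of the image pyramid into one big
// image. This is what input_rgb_image_pyramid does internally, and it's why
// the network can scan all scales with a single forward pass. The rects
// output records where each pyramid layer ended up inside tiled_img.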
image_window winpyr(tiled_img, "Tiled image pyramid");
cout << "Number of channels in final tensor image: " << net.subnet().get_output().k() << endl;
matrix<float> network_output = image_plane(net.subnet().get_output(),0,0);
for (long k = 1; k < net.subnet().get_output().k(); ++k)
network_output = max_pointwise(network_output, image_plane(net.subnet().get_output(),0,k));
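// The final tensor has one channel per sliding window detector (the count was
// printed above). Each channel is the detection score map for one window
// shape, so taking the pointwise max over channels collapses them into a
// single overall saliency map.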
const double v0_scale = img.nc()/(double)network_output.nc();
resize_image(v0_scale, network_output);
const float lower = -2.5;// min(network_output);
const float upper = 0.0;// max(network_output);
cout << "jet color mapping range: lower="<< lower << " upper="<< upper << endl;
// Display the final layer as a color image
image_window win_output(jet(network_output, upper, lower), "Output tensor from the network");
// Overlay network_output on top of the tiled image pyramid and display it.
matrix<rgb_pixel> tiled_img_sal = tiled_img;
for (long r = 0; r < tiled_img_sal.nr(); ++r)
{
for (long c = 0; c < tiled_img_sal.nc(); ++c)
{
dpoint tmp(c,r);
tmp = input_tensor_to_output_tensor(net, tmp);
tmp = point(v0_scale*tmp);
if (get_rect(network_output).contains(tmp))
{
float val = network_output(tmp.y(),tmp.x());
rgb_alpha_pixel p;
assign_pixel(p, colormap_jet(val,lower,upper));
p.alpha = 120;
assign_pixel(tiled_img_sal(r,c), p);
}
}
}
image_window win_pyr_sal(tiled_img_sal, "Saliency on image pyramid");
// Now collapse the pyramid scales into the original image
matrix<float> collapsed_saliency(img.nr(), img.nc());
resizable_tensor input_tensor;
input_layer(net).to_tensor(&img, &img+1, input_tensor);
for (long r = 0; r < collapsed_saliency.nr(); ++r)
{
for (long c = 0; c < collapsed_saliency.nc(); ++c)
{
// Loop over a bunch of scale values and look up what part of network_output corresponds to
// the point(c,r) in the original image, then take the max saliency value over
// all the scales and save it at pixel point(c,r).
float max_sal = -1e30;
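// The 5/6 scale step below mirrors the pyramid_down<6> used by the
// network's input layer, so each iteration lines up with one pyramid level.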
for (double scale = 1; scale > 0.2; scale *= 5.0/6.0)
{
// map from input image coordinates to tiled pyramid and then to output
// tensor coordinates.
dpoint tmp = center(input_layer(net).image_space_to_tensor_space(input_tensor,scale, drectangle(dpoint(c,r))));
tmp = point(v0_scale*input_tensor_to_output_tensor(net, tmp));
if (get_rect(network_output).contains(tmp))
{
float val = network_output(tmp.y(),tmp.x());
if (val > max_sal)
max_sal = val;
}
}
collapsed_saliency(r,c) = max_sal;
// Also blend the saliency into the original input image so we can view it as
// an overlay on the cars.
rgb_alpha_pixel p;
assign_pixel(p, colormap_jet(max_sal,lower,upper));
p.alpha = 120;
assign_pixel(img(r,c), p);
}
}
image_window win_collapsed(jet(collapsed_saliency, upper, lower), "collapsed saliency map");
image_window win_img_and_sal(img);
cout << "Hit enter to end program" << endl;
cin.get();
}
catch(image_load_error& e)
{
cout << e.what() << endl;
cout << "The test image is located in the examples folder. So you should run this program from a sub folder so that the relative path is correct." << endl;
}
catch(serialization_error& e)
{
cout << e.what() << endl;
cout << "The model file can be obtained from: http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2 Don't forget to unzip the file." << endl;
}
catch(std::exception& e)
{
cout << e.what() << endl;
}

examples/dnn_mmod_train_find_cars_ex.cpp

@@ -0,0 +1,348 @@
// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
This example shows how to train a CNN based object detector using dlib's
loss_mmod loss layer. This loss layer implements the Max-Margin Object
Detection loss as described in the paper:
Max-Margin Object Detection by Davis E. King (http://arxiv.org/abs/1502.00046).
This is the same loss used by the popular SVM+HOG object detector in dlib
(see fhog_object_detector_ex.cpp) except here we replace the HOG features
with a CNN and train the entire detector end-to-end. This allows us to make
much more powerful detectors.
It would be a good idea to become familiar with dlib's DNN tooling before
reading this example. So you should read dnn_introduction_ex.cpp and
dnn_introduction2_ex.cpp, as well as the introductory DNN+MMOD example,
dnn_mmod_ex.cpp, before reading this program.
This example is essentially a more complex version of dnn_mmod_ex.cpp. In it we train
a detector that finds the rear ends of motor vehicles. I will also discuss some
aspects of data preparation useful when training this kind of detector.
*/
#include <iostream>
#include <dlib/dnn.h>
#include <dlib/data_io.h>
#include <dlib/dir_nav.h>
#include <dlib/time_this.h>
using namespace std;
using namespace dlib;
// The DNN vehicle detector network
template <long num_filters, typename SUBNET> using con5d = con<num_filters,5,5,2,2,SUBNET>;
template <long num_filters, typename SUBNET> using con5 = con<num_filters,5,5,1,1,SUBNET>;
template <typename SUBNET> using downsampler = relu<bn_con<con5d<32, relu<bn_con<con5d<32, relu<bn_con<con5d<16,SUBNET>>>>>>>>>;
template <typename SUBNET> using rcon5 = relu<bn_con<con5<55,SUBNET>>>;
using net_type = loss_mmod<con<1,9,9,1,1,rcon5<rcon5<rcon5<downsampler<input_rgb_image_pyramid<pyramid_down<6>>>>>>>>;
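// Note that we use bn_con (batch normalization) here since we are training.
// The inference-only version of this network in dnn_mmod_find_cars_ex.cpp
// uses affine layers in its place.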
// ----------------------------------------------------------------------------------------
int ignore_overlapped_boxes(
std::vector<mmod_rect>& boxes,
const test_box_overlap& overlaps
)
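/*!
    ensures
        - Whenever two non-ignored rectangles in boxes overlap, according to
          overlaps(), we set the smaller of the two to ignore.
        - returns the number of boxes newly marked as ignore.
!*/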
{
int num_ignored = 0;
for (size_t i = 0; i < boxes.size(); ++i)
{
if (boxes[i].ignore)
continue;
for (size_t j = i+1; j < boxes.size(); ++j)
{
if (boxes[j].ignore)
continue;
if (overlaps(boxes[i], boxes[j]))
{
++num_ignored;
if(boxes[i].rect.area() < boxes[j].rect.area())
boxes[i].ignore = true;
else
boxes[j].ignore = true;
}
}
}
return num_ignored;
}
// ----------------------------------------------------------------------------------------
template <
typename pyramid_type,
typename image_array_type
>
void upsample_image_dataset_limit (
image_array_type& images,
std::vector<std::vector<mmod_rect>>& objects
)
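/*!
    ensures
        - Upsamples each image in images by one level of pyramid_type, along
          with its boxes, but only if the image contains fewer than 1800*1800
          pixels. The cap keeps already large images from being upsampled
          into something too big to fit on the GPU.
!*/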
{
// make sure requires clause is not broken
DLIB_ASSERT( images.size() == objects.size(),
"\t void upsample_image_dataset_limit()"
<< "\n\t Invalid inputs were given to this function."
<< "\n\t images.size(): " << images.size()
<< "\n\t objects.size(): " << objects.size()
);
typename image_array_type::value_type temp;
pyramid_type pyr;
for (unsigned long i = 0; i < images.size(); ++i)
{
if (images[i].size() < 1800*1800)
{
pyramid_up(images[i], temp, pyr);
swap(temp, images[i]);
for (unsigned long j = 0; j < objects[i].size(); ++j)
{
objects[i][j].rect = pyr.rect_up(objects[i][j].rect);
}
}
}
}
// ----------------------------------------------------------------------------------------
int main(int argc, char** argv) try
{
if (argc != 2)
{
cout << "Give the path to a folder containing training.xml and testing.xml files." << endl;
cout << "This example program is specifically designed to run on the dlib vehicle " << endl;
cout << "detection dataset, which is available at this URL: " << endl;
cout << " http://dlib.net/files/data/dlib_rear_end_vehicles_v1.tar" << endl;
cout << endl;
cout << "So download that dataset, extract it somewhere, and then run this program" << endl;
cout << "with the dlib_rear_end_vehicles folder as an argument. E.g. if you extract" << endl;
cout << "the dataset to the current folder then you should run this example program" << endl;
cout << "by typing: " << endl;
cout << " ./dnn_mmod_train_find_cars_ex dlib_rear_end_vehicles" << endl;
cout << endl;
return 0;
}
const std::string data_directory = argv[1];
std::vector<matrix<rgb_pixel>> images_train, images_test;
std::vector<std::vector<mmod_rect>> boxes_train, boxes_test;
load_image_dataset(images_train, boxes_train, data_directory+"/training.xml");
load_image_dataset(images_test, boxes_test, data_directory+"/testing.xml");
int num_overlapped_ignored_test = 0;
for (auto& v : boxes_test)
num_overlapped_ignored_test += ignore_overlapped_boxes(v, test_box_overlap(0.50, 0.99));
int num_overlapped_ignored = 0;
int num_additional_ignored = 0;
for (auto& v : boxes_train)
{
num_overlapped_ignored += ignore_overlapped_boxes(v, test_box_overlap(0.50, 0.99));
for (auto& bb : v)
{
if (bb.rect.width() < 35 && bb.rect.height() < 35)
{
if (!bb.ignore)
{
bb.ignore = true;
++num_additional_ignored;
}
}
// The dlib vehicle detection dataset doesn't contain any detections with
// really extreme aspect ratios. However, some datasets do, often because of
// bad labeling. So it's a good idea to check for that and either eliminate
// those boxes or set them to ignore, although this depends on your
// application.
//
// For instance, if your dataset has boxes with an aspect ratio
// of 10 then you should think about what that means for the network
// architecture. Does the receptive field even cover the entirety of the box
// in those cases? Do you care about these boxes? Are they labeling errors?
// I find that many people will download some dataset from the internet, run
// it through some training algorithm, and treat the data as unchallengeable
// truth. But many datasets are full of
// labeling errors. There are also a lot of datasets that aren't full of
// errors, but are annotated in a sloppy and inconsistent way. Fixing those
// errors and inconsistencies can often greatly improve models trained from
// such data. It's almost always worth the time to try and improve your
// training dataset.
}
}
cout << "num_overlapped_ignored: "<< num_overlapped_ignored << endl;
cout << "num_additional_ignored: "<< num_additional_ignored << endl;
cout << "num_overlapped_ignored_test: "<< num_overlapped_ignored_test << endl;
cout << "num training images: " << images_train.size() << endl;
cout << "num testing images: " << images_test.size() << endl;
// Our vehicle detection dataset has basically 3 different types of boxes. Square
// boxes, tall and skinny boxes (e.g. semi trucks), and short and wide boxes (e.g.
// sedans). Here we are telling the MMOD algorithm that a vehicle is recognizable as
// long as the longest box side is at least 70 pixels long and the shortest box side is
// at least 30 pixels long. It will use these parameters to decide how large each of
// the sliding windows need to be so as to be able to detect all the vehicles. Since
// our dataset has basically only these 3 different aspect ratios, it will decide to
// use 3 different sliding windows at the end of the network.
mmod_options options(boxes_train, 70, 30);
// This setting is very important and dataset specific. The vehicle detection dataset
// contains boxes that are marked as "ignore", as we discussed above. Some of them are
// ignored because we set ignore to true on them in the above code. However, the xml
// files already contained a lot of ignore boxes. Some of them are large boxes that
// encompass large parts of an image and the intention is to have everything inside
// those boxes be ignored. Therefore, we need to tell the MMOD algorithm to do that,
// which we do by setting options.overlaps_ignore appropriately.
//
// But first, we need to understand exactly what this option does. The MMOD loss
// is essentially counting the number of false alarms + missed detections produced by
// the detector for each image. During training, the code runs the detector on
// each image in a mini-batch, looks at its output, and counts the number of
// mistakes. The optimizer tries to find parameter settings that minimize the number
// of detector mistakes.
//
// This overlaps_ignore option allows you to tell the loss that some outputs from the
// detector should be totally ignored, as if they never happened. In particular, if a
// detection overlaps a box in the training data with ignore==true then that detection
// is ignored. This overlap is determined by calling
// options.overlaps_ignore(the_detection, the_ignored_training_box). If it returns
// true then that detection is ignored.
//
// You should read the documentation for test_box_overlap, the class type for
// overlaps_ignore for full details. However, the gist is that the default behavior is
// to only consider boxes as overlapping if their intersection over union is > 0.5.
// However, the dlib vehicle detection dataset contains large boxes that are meant to
// mask out large areas of an image. So intersection over union isn't an appropriate
// way to measure "overlaps with box" in this case. We want any box that is contained
// inside one of these big regions to be ignored, even if the detection box is really
// small. So we set overlaps_ignore to behave that way with this line.
options.overlaps_ignore = test_box_overlap(0.5, 0.95);
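// For reference, test_box_overlap(iou_thresh, percent_covered_thresh)
// reports two boxes as overlapping if their intersection over union is
// greater than iou_thresh, or if either box has more than
// percent_covered_thresh of its area inside the other. The 0.95 second
// argument is what makes small detections sitting inside the big "ignore"
// regions count as overlapping them.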
net_type net(options);
// The final layer of the network must be a con_ layer that contains
// options.detector_windows.size() filters. This is because these final filters are
// what perform the final "sliding window" detection in the network.
net.subnet().layer_details().set_num_filters(options.detector_windows.size());
dnn_trainer<net_type> trainer(net,sgd(0.0001,0.9));
trainer.set_learning_rate(0.1);
trainer.be_verbose();
trainer.set_iterations_without_progress_threshold(50000);
trainer.set_test_iterations_without_progress_threshold(1000);
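// These thresholds control when the trainer drops the learning rate: if the
// training (or testing) loss hasn't improved within this many steps, the
// learning rate is shrunk (by a factor of 10 by default). The testing
// threshold is much smaller since test steps only happen once every 30
// training steps in the loop below.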
const string sync_filename = "mmod_cars_sync";
trainer.set_synchronization_file(sync_filename, std::chrono::minutes(5));
std::vector<matrix<rgb_pixel>> mini_batch_samples;
std::vector<std::vector<mmod_rect>> mini_batch_labels;
random_cropper cropper;
cropper.set_seed(1);
cropper.set_chip_dims(350, 350);
cropper.set_min_object_size(0.20);
cropper.set_max_rotation_degrees(2);
dlib::rand rnd;
cout << trainer << cropper << endl;
int cnt = 1;
// Run the trainer until the learning rate gets small.
while(trainer.get_learning_rate() >= 1e-4)
{
if (cnt%30 != 0 || images_test.size() == 0)
{
cropper(87, images_train, boxes_train, mini_batch_samples, mini_batch_labels);
// We can also randomly jitter the colors and that often helps a detector
// generalize better to new images.
for (auto&& img : mini_batch_samples)
disturb_colors(img, rnd);
// It's a good idea to, at least once, put code here that displays the images
// and boxes the random cropper is generating. You should look at them and
// think about if the output makes sense for your problem. Most of the time
// it will be fine, but sometimes you will realize that the pattern of cropping
// isn't really appropriate for your problem and you will need to make some
// change to how the mini-batches are being generated. Maybe you will tweak
// some of the cropper's settings, or write your own entirely separate code to
// create mini-batches. But either way, if you don't look you will never know.
// An easy way to do this is to create a dlib::image_window to display the
// images and boxes.
trainer.train_one_step(mini_batch_samples, mini_batch_labels);
}
else
{
cropper(87, images_test, boxes_test, mini_batch_samples, mini_batch_labels);
// We can also randomly jitter the colors and that often helps a detector
// generalize better to new images.
for (auto&& img : mini_batch_samples)
disturb_colors(img, rnd);
trainer.test_one_step(mini_batch_samples, mini_batch_labels);
}
++cnt;
}
// wait for training threads to stop
trainer.get_net();
cout << "done training" << endl;
// Save the network to disk
net.clean();
serialize("mmod_rear_end_vehicle_detector.dat") << net;
// It's a really good idea to print the training parameters. This is because you will
// invariably be running multiple rounds of training and should be logging the output
// to a log file. This print statement will include many of the training parameters in
// your log.
cout << trainer << cropper << endl;
cout << "\nsync_filename: " << sync_filename << endl;
cout << "num training images: "<< images_train.size() << endl;
cout << "training results: " << test_object_detection_function(net, images_train, boxes_train, test_box_overlap(), 0, options.overlaps_ignore);
upsample_image_dataset_limit<pyramid_down<2>>(images_train, boxes_train);
cout << "training upsampled results: " << test_object_detection_function(net, images_train, boxes_train, test_box_overlap(), 0, options.overlaps_ignore);
cout << "num testing images: "<< images_test.size() << endl;
cout << "testing results: " << test_object_detection_function(net, images_test, boxes_test, test_box_overlap(), 0, options.overlaps_ignore);
upsample_image_dataset_limit<pyramid_down<2>>(images_test, boxes_test);
cout << "testing upsampled results: " << test_object_detection_function(net, images_test, boxes_test, test_box_overlap(), 0, options.overlaps_ignore);
/*
This program takes many hours to execute on a high-end GPU. It took about a day to
train on an NVIDIA 1080ti. The resulting model file is available at
http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2
It should be noted that this file on dlib.net has a dlib::shape_predictor appended
onto the end of it (see dnn_mmod_find_cars_ex.cpp for an example of its use). This
explains why the model file on dlib.net is larger than the
mmod_rear_end_vehicle_detector.dat output by this program.
Also, the training and testing accuracies were:
num training images: 2217
training results: 0.990738 0.736431 0.736073
training upsampled results: 0.986837 0.937694 0.936912
num testing images: 135
testing results: 0.988827 0.471372 0.470806
testing upsampled results: 0.987879 0.651132 0.650399
*/
return 0;
}
catch(std::exception& e)
{
cout << e.what() << endl;
}

examples/mmod_cars_test_image.jpg (binary image file added, 98 KiB, not shown)