Added random_cropper and DNN MMOD example programs.
commit d5dc371ff8 (parent 9e42f86cd3)

examples/CMakeLists.txt
@@ -36,6 +36,8 @@ if (COMPILER_CAN_DO_CPP_11)
   add_example(dnn_inception_ex)
   add_example(dnn_imagenet_ex)
   add_example(dnn_imagenet_train_ex)
+  add_example(dnn_mmod_ex)
+  add_example(random_cropper_ex)
 endif()

 #here we apply our macros

examples/dnn_mmod_ex.cpp (new file)
@@ -0,0 +1,210 @@
// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
    This example shows how to train a CNN based object detector using dlib's
    loss_mmod loss layer.  This loss layer implements the Max-Margin Object
    Detection loss as described in the paper:
        Max-Margin Object Detection by Davis E. King (http://arxiv.org/abs/1502.00046).
    This is the same loss used by the popular SVM+HOG object detector in dlib
    (see fhog_object_detector_ex.cpp) except here we replace the HOG features
    with a CNN and train the entire detector end-to-end.  This allows us to make
    much more powerful detectors.

    It would be a good idea to become familiar with dlib's DNN tooling before
    reading this example.  So you should read dnn_introduction_ex.cpp and
    dnn_introduction2_ex.cpp before reading this example program.

    Just like in the fhog_object_detector_ex.cpp example, we are going to train
    a simple face detector based on the very small training dataset in the
    examples/faces folder.  As we will see, even with this small dataset the
    MMOD method is able to make a working face detector.  However, for real
    applications you should train with more data for an even better result.
*/

#include <iostream>
#include <dlib/dnn.h>
#include <dlib/data_io.h>
#include <dlib/gui_widgets.h>

using namespace std;
using namespace dlib;

// The first thing we do is define our CNN.  The CNN is going to be evaluated
// convolutionally over an entire image pyramid.  Think of it like a normal
// sliding window classifier.  This means you need to define a CNN that can look
// at some part of an image and decide if it is an object of interest.  In this
// example I've defined a CNN with a receptive field of a little over 50x50
// pixels.  This is reasonable for face detection since you can clearly tell if
// a 50x50 image contains a face.  Other applications may benefit from CNNs with
// different architectures.
//
// In this example our CNN begins with 3 downsampling layers.  These layers will
// reduce the size of the image by 8x and output a feature map with 32
// dimensions.  Then we will pass that through 4 more convolutional layers to
// get the final output of the network.  The last layer has only 1 channel and
// the values in that last channel are large when the network thinks it has
// found an object at a particular location.

// Let's begin the network definition by creating some network blocks.

// A 5x5 conv layer that does 2x downsampling
template <long num_filters, typename SUBNET> using con5d = con<num_filters,5,5,2,2,SUBNET>;
// A 3x3 conv layer that doesn't do any downsampling
template <long num_filters, typename SUBNET> using con3  = con<num_filters,3,3,1,1,SUBNET>;

// Now we can define the 8x downsampling block in terms of con5d blocks.  We
// also use relu and batch normalization in the standard way.
template <unsigned long N, typename SUBNET> using downsampler = relu<bn_con<con5d<N, relu<bn_con<con5d<N, relu<bn_con<con5d<N,SUBNET>>>>>>>>>;

// The rest of the network will be 3x3 conv layers with batch normalization and
// relu.  So we define the 3x3 block we will use here.
template <unsigned long N, typename SUBNET> using rcon3 = relu<bn_con<con3<N,SUBNET>>>;

// Finally, we define the entire network.  The special input_rgb_image_pyramid
// layer causes the network to operate over a spatial pyramid, making the detector
// scale invariant.
using net_type = loss_mmod<con<1,6,6,1,1,rcon3<32,rcon3<32,rcon3<32,downsampler<32,input_rgb_image_pyramid<pyramid_down<6>>>>>>>>;
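// (pyramid_down<6> makes each pyramid level 5/6ths the size of the level below
// it, so the network is scanned over a fairly fine range of scales.)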

// ----------------------------------------------------------------------------------------

int main(int argc, char** argv) try
{
    // In this example we are going to train a face detector based on the
    // small faces dataset in the examples/faces directory.  So the first
    // thing we do is load that dataset.  This means you need to supply the
    // path to this faces folder as a command line argument so we will know
    // where it is.
    if (argc != 2)
    {
        cout << "Give the path to the examples/faces directory as the argument to this" << endl;
        cout << "program.  For example, if you are in the examples folder then execute " << endl;
        cout << "this program by running: " << endl;
        cout << "   ./dnn_mmod_ex faces" << endl;
        cout << endl;
        return 0;
    }
    const std::string faces_directory = argv[1];
    // The faces directory contains a training dataset and a separate
    // testing dataset.  The training data consists of 4 images, each
    // annotated with rectangles that bound each human face.  The idea is
    // to use this training data to learn to identify human faces in new
    // images.
    //
    // Once you have trained an object detector it is always important to
    // test it on data it wasn't trained on.  Therefore, we will also load
    // a separate testing set of 5 images.  Once we have a face detector
    // created from the training data we will see how well it works by
    // running it on the testing images.
    //
    // So here we create the variables that will hold our dataset.
    // images_train will hold the 4 training images and face_boxes_train
    // holds the locations of the faces in the training images.  So for
    // example, the image images_train[0] has the faces given by the
    // rectangles in face_boxes_train[0].
    std::vector<matrix<rgb_pixel>> images_train, images_test;
    std::vector<std::vector<mmod_rect>> face_boxes_train, face_boxes_test;

    // Now we load the data.  These XML files list the images in each dataset
    // and also contain the positions of the face boxes.  Obviously you can use
    // any kind of input format you like so long as you store the data into
    // images_train and face_boxes_train.  But for convenience dlib comes with
    // tools for creating and loading XML image datasets.  Here you see how to
    // load the data.  To create the XML files you can use the imglab tool which
    // can be found in the tools/imglab folder.  It is a simple graphical tool
    // for labeling objects in images with boxes.  To see how to use it read the
    // tools/imglab/README.txt file.
    load_image_dataset(images_train, face_boxes_train, faces_directory+"/training.xml");
    load_image_dataset(images_test, face_boxes_test, faces_directory+"/testing.xml");

    cout << "num training images: " << images_train.size() << endl;
    cout << "num testing images:  " << images_test.size() << endl;

    // The MMOD algorithm has some options you can set to control its behavior.  However,
    // you can also call the constructor with your training annotations and a "target
    // object size" and it will automatically configure itself in a reasonable way for your
    // problem.  Here we are saying that faces are still recognizably faces when they are
    // 40x40 pixels in size.  You should generally pick the smallest size where this is
    // true.  Based on this information the mmod_options constructor will automatically
    // pick a good sliding window width and height.  It will also automatically set the
    // non-max-suppression parameters to something reasonable.  For further details see the
    // mmod_options documentation.
    mmod_options options(face_boxes_train, 40*40);
    cout << "detection window width,height:      " << options.detector_width << "," << options.detector_height << endl;
    cout << "overlap NMS IOU thresh:             " << options.overlaps_nms.get_iou_thresh() << endl;
    cout << "overlap NMS percent covered thresh: " << options.overlaps_nms.get_percent_covered_thresh() << endl;

    // Now we are ready to create our network and trainer.
    net_type net(options);
    dnn_trainer<net_type> trainer(net);
    trainer.set_learning_rate(0.1);
    trainer.be_verbose();
    trainer.set_synchronization_file("mmod_sync", std::chrono::minutes(5));
    trainer.set_iterations_without_progress_threshold(300);
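    // (The synchronization file lets a long training run resume where it left
    // off if the program is restarted, and the progress threshold is how many
    // mini-batches the trainer will tolerate without measurable progress
    // before it lowers the learning rate.)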

    // Now let's train the network.  We are going to use mini-batches of 150
    // images.  The images are random crops from our training set (see
    // random_cropper_ex.cpp for a discussion of the random_cropper).
    std::vector<matrix<rgb_pixel>> mini_batch_samples;
    std::vector<std::vector<mmod_rect>> mini_batch_labels;
    random_cropper cropper;
    cropper.set_chip_dims(200, 200);
    cropper.set_min_object_height(0.2);
    dlib::rand rnd;
    // Run the trainer until the learning rate gets small.
    while(trainer.get_learning_rate() >= 1e-4)
    {
        cropper(150, images_train, face_boxes_train, mini_batch_samples, mini_batch_labels);
        // We can also randomly jitter the colors and that often helps a detector
        // generalize better to new images.
        for (auto&& img : mini_batch_samples)
            disturb_colors(img, rnd);

        trainer.train_one_step(mini_batch_samples, mini_batch_labels);
    }
    // Wait for training threads to stop.
    trainer.get_net();
    cout << "done training" << endl;

    // Save the network to disk.
    net.clean();
    serialize("mmod_network.dat") << net;
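
    // (Not part of the original example, but worth noting: the saved detector
    // can be reloaded later like this, mirroring the serialize call above.)
    net_type net2;
    deserialize("mmod_network.dat") >> net2;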

    // Now that we have a face detector we can test it.  The first statement tests it
    // on the training data.  It will print the precision, recall, and then average precision.
    // This statement should indicate that the network works perfectly on the
    // training data.
    cout << "training results: " << test_object_detection_function(net, images_train, face_boxes_train) << endl;
    // However, to get an idea if it really worked without overfitting we need to run
    // it on images it wasn't trained on.  The next line does this.  Happily,
    // this statement indicates that the detector finds most of the faces in the
    // testing data.
    cout << "testing results:  " << test_object_detection_function(net, images_test, face_boxes_test) << endl;

    // Now let's run the detector on the testing images and look at the outputs.
    image_window win;
    for (auto&& img : images_test)
    {
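        // Upsampling the image helps the detector find smaller faces, at the
        // cost of using more RAM and compute.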
        pyramid_up(img);
        auto dets = net(img);
        win.clear_overlay();
        win.set_image(img);
        for (auto&& d : dets)
            win.add_overlay(d);
        cin.get();
    }
    return 0;
}
catch(std::exception& e)
{
    cout << e.what() << endl;
}

examples/random_cropper_ex.cpp (new file)
@@ -0,0 +1,95 @@
// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
    When you are training a convolutional neural network using the loss_mmod loss
    layer, you need to generate a bunch of identically sized training images.  The
    random_cropper is a convenient tool to help you crop out a bunch of
    identically sized images from a training dataset.

    This example shows you what it does exactly and talks about some of its options.
*/

#include <iostream>
#include <dlib/data_io.h>
#include <dlib/gui_widgets.h>
#include <dlib/image_transforms.h>

using namespace std;
using namespace dlib;

// ----------------------------------------------------------------------------------------

int main(int argc, char** argv) try
{
    if (argc != 2)
    {
        cout << "Give an image dataset XML file to run this program." << endl;
        cout << "For example, if you are running from the examples folder then run this program by typing" << endl;
        cout << "   ./random_cropper_ex faces/training.xml" << endl;
        cout << endl;
        return 0;
    }

    // First let's load a dataset.
    std::vector<matrix<rgb_pixel>> images;
    std::vector<std::vector<mmod_rect>> boxes;
    load_image_dataset(images, boxes, argv[1]);

    // Here we make our random_cropper.  It has a number of options.
    random_cropper cropper;
    // We can tell it how big we want the cropped images to be.
    cropper.set_chip_dims(400,400);
    // Also, when doing cropping, it will map the object annotations from the
    // dataset to the cropped image as well as perform random scale jittering.
    // You can tell it how much scale jittering you would like by saying "please
    // make the objects in the crops have a min and max size of such and such".
    // You do that by calling these two functions.  Here we are saying we want the
    // objects in our crops to be between 0.2*400 and 0.8*400 pixels in height.
    cropper.set_min_object_height(0.2);
    cropper.set_max_object_height(0.8);
    // The cropper can also randomly mirror and rotate crops, which we ask it to
    // perform as well.
    cropper.set_randomly_flip(true);
    cropper.set_max_rotation_degrees(50);
    // This fraction of crops are taken from random parts of images, rather than
    // being centered on some object.
    cropper.set_background_crops_fraction(0.2);

    // Now ask the cropper to generate a bunch of crops.  The output is stored
    // in crops and crop_boxes.
    std::vector<matrix<rgb_pixel>> crops;
    std::vector<std::vector<mmod_rect>> crop_boxes;
    // Make 1000 crops.
    cropper(1000, images, boxes, crops, crop_boxes);
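    // (The same call works inside a training loop to build each mini-batch;
    // dnn_mmod_ex.cpp uses the cropper this way with 150-image batches.)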

    // Finally, let's look at the results.
    image_window win;
    for (size_t i = 0; i < crops.size(); ++i)
    {
        win.clear_overlay();
        win.set_image(crops[i]);
        for (auto b : crop_boxes[i])
        {
            // Note that mmod_rect has an ignore field.  If an object was labeled
            // ignore in boxes then it will still be labeled as ignore in
            // crop_boxes.  Moreover, objects that are not well contained within
            // the crop are also set to ignore.
            if (b.ignore)
                win.add_overlay(b.rect, rgb_pixel(255,255,0)); // draw ignored boxes as yellow
            else
                win.add_overlay(b.rect, rgb_pixel(255,0,0));   // draw other boxes as red
        }
        cout << "Hit enter to view the next random crop.";
        cin.get();
    }

}
catch(std::exception& e)
{
    cout << e.what() << endl;
}