// Mirror of https://github.com/davisking/dlib.git, synced 2024-11-01 10:14:53 +08:00 (328 lines, 13 KiB, C++).
// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
    This is an example illustrating the use of the deep learning tools from the
    dlib C++ Library.  In it, we will show how to use the loss_metric layer to do
    metric learning on images.

    The main reason you might want to use this kind of algorithm is because you
    would like to use a k-nearest neighbor classifier or similar algorithm, but
    you don't know a good way to calculate the distance between two things.  A
    popular example would be face recognition.  There are a whole lot of papers
    that train some kind of deep metric learning algorithm that embeds face
    images in some vector space where images of the same person are close to each
    other and images of different people are far apart.  Then in that vector
    space it's very easy to do face recognition with some kind of k-nearest
    neighbor classifier.

    In this example we will use the ResNet-34 network from the dnn_imagenet_ex.cpp
    example to learn to map images into some vector space where pictures of
    the same person are close and pictures of different people are far apart.

    You might want to read the simpler introduction to the deep metric learning
    API, dnn_metric_learning_ex.cpp, before reading this example.  You should
    also have read the examples that introduce the dlib DNN API before
    continuing.  These are dnn_introduction_ex.cpp and dnn_introduction2_ex.cpp.
*/
|
|
|
|
#include <dlib/dnn.h>
|
|
#include <dlib/image_io.h>
|
|
#include <dlib/misc_api.h>
|
|
|
|
using namespace dlib;
|
|
using namespace std;
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
// We will need to create some functions for loading data.  This program will
// expect to be given a directory structured as follows:
//    top_level_directory/
//        person1/
//            image1.jpg
//            image2.jpg
//            image3.jpg
//        person2/
//            image4.jpg
//            image5.jpg
//            image6.jpg
//        person3/
//            image7.jpg
//            image8.jpg
//            image9.jpg
//
// The specific folder and image names don't matter, nor does the number of folders or
// images.  What does matter is that there is a top level folder, which contains
// subfolders, and each subfolder contains images of a single person.
|
|
|
|
// This function spiders the top level directory and obtains a list of all the
|
|
// image files.
|
|
std::vector<std::vector<string>> load_objects_list (
|
|
const string& dir
|
|
)
|
|
{
|
|
std::vector<std::vector<string>> objects;
|
|
for (auto subdir : directory(dir).get_dirs())
|
|
{
|
|
std::vector<string> imgs;
|
|
for (auto img : subdir.get_files())
|
|
imgs.push_back(img);
|
|
|
|
objects.push_back(imgs);
|
|
}
|
|
return objects;
|
|
}
|
|
|
|
// This function takes the output of load_objects_list() as input and randomly
|
|
// selects images for training. It should also be pointed out that it's really
|
|
// important that each mini-batch contain multiple images of each person. This
|
|
// is because the metric learning algorithm needs to consider pairs of images
|
|
// that should be close (i.e. images of the same person) as well as pairs of
|
|
// images that should be far apart (i.e. images of different people) during each
|
|
// training step.
|
|
void load_mini_batch (
|
|
const size_t num_people, // how many different people to include
|
|
const size_t samples_per_id, // how many images per person to select.
|
|
dlib::rand& rnd,
|
|
const std::vector<std::vector<string>>& objs,
|
|
std::vector<matrix<rgb_pixel>>& images,
|
|
std::vector<unsigned long>& labels
|
|
)
|
|
{
|
|
images.clear();
|
|
labels.clear();
|
|
DLIB_CASSERT(num_people <= objs.size(), "The dataset doesn't have that many people in it.");
|
|
|
|
std::vector<bool> already_selected(objs.size(), false);
|
|
matrix<rgb_pixel> image;
|
|
for (size_t i = 0; i < num_people; ++i)
|
|
{
|
|
size_t id = rnd.get_random_32bit_number()%objs.size();
|
|
// don't pick a person we already added to the mini-batch
|
|
while(already_selected[id])
|
|
id = rnd.get_random_32bit_number()%objs.size();
|
|
already_selected[id] = true;
|
|
|
|
for (size_t j = 0; j < samples_per_id; ++j)
|
|
{
|
|
const auto& obj = objs[id][rnd.get_random_32bit_number()%objs[id].size()];
|
|
load_image(image, obj);
|
|
images.push_back(std::move(image));
|
|
labels.push_back(id);
|
|
}
|
|
}
|
|
|
|
// You might want to do some data augmentation at this point. Here we so some simple
|
|
// color augmentation.
|
|
for (auto&& crop : images)
|
|
disturb_colors(crop,rnd);
|
|
|
|
|
|
// All the images going into a mini-batch have to be the same size. And really, all
|
|
// the images in your entire training dataset should be the same size for what we are
|
|
// doing to make the most sense.
|
|
DLIB_CASSERT(images.size() > 0);
|
|
for (auto&& img : images)
|
|
{
|
|
DLIB_CASSERT(img.nr() == images[0].nr() && img.nc() == images[0].nc(),
|
|
"All the images in a single mini-batch must be the same size.");
|
|
}
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
// The next page of code defines the ResNet-34 network. It's basically copied
|
|
// and pasted from the dnn_imagenet_ex.cpp example, except we replaced the loss
|
|
// layer with loss_metric.
|
|
|
|
template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
|
|
using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>;
|
|
|
|
template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
|
|
using residual_down = add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<N,BN,2,tag1<SUBNET>>>>>>;
|
|
|
|
template <int N, template <typename> class BN, int stride, typename SUBNET>
|
|
using block = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>;
|
|
|
|
|
|
// Concrete residual units, each followed by a relu.  The plain names normalize with
// bn_con; the "a"-prefixed names use affine in its place.  The *_down variants are
// built on residual_down instead of residual.
template <int num_filters, typename SUBNET>
using res = relu<residual<block, num_filters, bn_con, SUBNET>>;

template <int num_filters, typename SUBNET>
using ares = relu<residual<block, num_filters, affine, SUBNET>>;

template <int num_filters, typename SUBNET>
using res_down = relu<residual_down<block, num_filters, bn_con, SUBNET>>;

template <int num_filters, typename SUBNET>
using ares_down = relu<residual_down<block, num_filters, affine, SUBNET>>;
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <typename SUBNET> using level1 = res<512,res<512,res_down<512,SUBNET>>>;
|
|
template <typename SUBNET> using level2 = res<256,res<256,res<256,res<256,res<256,res_down<256,SUBNET>>>>>>;
|
|
template <typename SUBNET> using level3 = res<128,res<128,res<128,res_down<128,SUBNET>>>>;
|
|
template <typename SUBNET> using level4 = res<64,res<64,res<64,SUBNET>>>;
|
|
|
|
template <typename SUBNET> using alevel1 = ares<512,ares<512,ares_down<512,SUBNET>>>;
|
|
template <typename SUBNET> using alevel2 = ares<256,ares<256,ares<256,ares<256,ares<256,ares_down<256,SUBNET>>>>>>;
|
|
template <typename SUBNET> using alevel3 = ares<128,ares<128,ares<128,ares_down<128,SUBNET>>>>;
|
|
template <typename SUBNET> using alevel4 = ares<64,ares<64,ares<64,SUBNET>>>;
|
|
|
|
template <typename SUBNET> using final_pooling = avg_pool_everything<SUBNET>;
|
|
template <typename SUBNET> using afinal_pooling = avg_pool_everything<SUBNET>;
|
|
|
|
// training network type
|
|
using net_type = loss_metric<fc_no_bias<128,final_pooling<
|
|
level1<
|
|
level2<
|
|
level3<
|
|
level4<
|
|
max_pool<3,3,2,2,relu<bn_con<con<64,7,7,2,2,
|
|
input_rgb_image
|
|
>>>>>>>>>>>;
|
|
|
|
// testing network type (replaced batch normalization with fixed affine transforms)
|
|
using anet_type = loss_metric<fc_no_bias<128,afinal_pooling<
|
|
alevel1<
|
|
alevel2<
|
|
alevel3<
|
|
alevel4<
|
|
max_pool<3,3,2,2,relu<affine<con<64,7,7,2,2,
|
|
input_rgb_image
|
|
>>>>>>>>>>>;
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
int main(int argc, char** argv)
|
|
{
|
|
if (argc != 2)
|
|
{
|
|
cout << "Give a folder as input. It should contain sub-folders of images and we will " << endl;
|
|
cout << "learn to distinguish between these sub-folders with metric learning. " << endl;
|
|
cout << "For example, you can run this program on the very small examples/johns dataset" << endl;
|
|
cout << "that comes with dlib by running this command:" << endl;
|
|
cout << " ./dnn_metric_learning_on_images_ex johns" << endl;
|
|
return 1;
|
|
}
|
|
|
|
auto objs = load_objects_list(argv[1]);
|
|
|
|
cout << "objs.size(): "<< objs.size() << endl;
|
|
|
|
std::vector<matrix<rgb_pixel>> images;
|
|
std::vector<unsigned long> labels;
|
|
|
|
|
|
net_type net;
|
|
|
|
dnn_trainer<net_type> trainer(net, sgd(0.0005, 0.9));
|
|
trainer.set_learning_rate(0.1);
|
|
trainer.be_verbose();
|
|
trainer.set_synchronization_file("face_metric_sync", std::chrono::minutes(5));
|
|
// I've set this to something really small to make the example terminate
|
|
// sooner. But when you really want to train a good model you should set
|
|
// this to something like 8000 so training doesn't terminate too early.
|
|
trainer.set_iterations_without_progress_threshold(300);
|
|
|
|
// If you have a lot of data then it might not be reasonable to load it all
|
|
// into RAM. So you will need to be sure you are decompressing your images
|
|
// and loading them fast enough to keep the GPU occupied. I like to do this
|
|
// using the following coding pattern: create a bunch of threads that dump
|
|
// mini-batches into dlib::pipes.
|
|
dlib::pipe<std::vector<matrix<rgb_pixel>>> qimages(4);
|
|
dlib::pipe<std::vector<unsigned long>> qlabels(4);
|
|
auto data_loader = [&qimages, &qlabels, &objs](time_t seed)
|
|
{
|
|
dlib::rand rnd(time(0)+seed);
|
|
std::vector<matrix<rgb_pixel>> images;
|
|
std::vector<unsigned long> labels;
|
|
while(qimages.is_enabled())
|
|
{
|
|
try
|
|
{
|
|
load_mini_batch(5, 5, rnd, objs, images, labels);
|
|
qimages.enqueue(images);
|
|
qlabels.enqueue(labels);
|
|
}
|
|
catch(std::exception& e)
|
|
{
|
|
cout << "EXCEPTION IN LOADING DATA" << endl;
|
|
cout << e.what() << endl;
|
|
}
|
|
}
|
|
};
|
|
// Run the data_loader from 5 threads. You should set the number of threads
|
|
// relative to the number of CPU cores you have.
|
|
std::thread data_loader1([data_loader](){ data_loader(1); });
|
|
std::thread data_loader2([data_loader](){ data_loader(2); });
|
|
std::thread data_loader3([data_loader](){ data_loader(3); });
|
|
std::thread data_loader4([data_loader](){ data_loader(4); });
|
|
std::thread data_loader5([data_loader](){ data_loader(5); });
|
|
|
|
|
|
// Here we do the training. We keep passing mini-batches to the trainer until the
|
|
// learning rate has dropped low enough.
|
|
while(trainer.get_learning_rate() >= 1e-4)
|
|
{
|
|
qimages.dequeue(images);
|
|
qlabels.dequeue(labels);
|
|
trainer.train_one_step(images, labels);
|
|
}
|
|
|
|
// Wait for training threads to stop
|
|
trainer.get_net();
|
|
cout << "done training" << endl;
|
|
|
|
// Save the network to disk
|
|
net.clean();
|
|
serialize("metric_network_renset.dat") << net;
|
|
|
|
// stop all the data loading threads and wait for them to terminate.
|
|
qimages.disable();
|
|
qlabels.disable();
|
|
data_loader1.join();
|
|
data_loader2.join();
|
|
data_loader3.join();
|
|
data_loader4.join();
|
|
data_loader5.join();
|
|
|
|
|
|
|
|
|
|
|
|
// Now, just to show an example of how you would use the network, let's check how well
|
|
// it performs on the training data.
|
|
dlib::rand rnd(time(0));
|
|
load_mini_batch(5, 5, rnd, objs, images, labels);
|
|
|
|
// Run all the images through the network to get their vector embeddings.
|
|
std::vector<matrix<float,0,1>> embedded = net(images);
|
|
|
|
// Now, check if the embedding puts images with the same labels near each other and
|
|
// images with different labels far apart.
|
|
int num_right = 0;
|
|
int num_wrong = 0;
|
|
for (size_t i = 0; i < embedded.size(); ++i)
|
|
{
|
|
for (size_t j = i+1; j < embedded.size(); ++j)
|
|
{
|
|
if (labels[i] == labels[j])
|
|
{
|
|
// The loss_metric layer will cause images with the same label to be less
|
|
// than net.loss_details().get_distance_threshold() distance from each
|
|
// other. So we can use that distance value as our testing threshold.
|
|
if (length(embedded[i]-embedded[j]) < net.loss_details().get_distance_threshold())
|
|
++num_right;
|
|
else
|
|
++num_wrong;
|
|
}
|
|
else
|
|
{
|
|
if (length(embedded[i]-embedded[j]) >= net.loss_details().get_distance_threshold())
|
|
++num_right;
|
|
else
|
|
++num_wrong;
|
|
}
|
|
}
|
|
}
|
|
|
|
cout << "num_right: "<< num_right << endl;
|
|
cout << "num_wrong: "<< num_wrong << endl;
|
|
|
|
}
|
|
|
|
|