// dlib/examples/dnn_mmod_find_cars_ex.cpp
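
/*
    This example shows how to run a CNN based vehicle detector with dlib and, more
    interestingly, how to look inside the network while it runs.  It loads the rear end
    vehicle detector produced by dnn_mmod_train_find_cars_ex.cpp, draws the refined
    detection boxes it finds, and then visualizes the intermediate processing steps:
    the tiled image pyramid the network scans, the network's output tensor rendered as
    a jet heat map, and that saliency map collapsed back onto the original image.
*/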

#include <iostream>
#include <dlib/dnn.h>
#include <dlib/image_io.h>
#include <dlib/image_transforms.h>
#include <dlib/gui_widgets.h>
#include <dlib/image_processing.h>

using namespace std;
using namespace dlib;

// The rear view vehicle detector network, as defined by the
// dnn_mmod_train_find_cars_ex.cpp training program.
template <long num_filters, typename SUBNET> using con5d = con<num_filters,5,5,2,2,SUBNET>;
template <long num_filters, typename SUBNET> using con5 = con<num_filters,5,5,1,1,SUBNET>;
template <typename SUBNET> using downsampler = relu<affine<con5d<32, relu<affine<con5d<32, relu<affine<con5d<16,SUBNET>>>>>>>>>;
template <typename SUBNET> using rcon5 = relu<affine<con5<55,SUBNET>>>;
using net_type = loss_mmod<con<1,9,9,1,1,rcon5<rcon5<rcon5<downsampler<input_rgb_image_pyramid<pyramid_down<6>>>>>>>>;
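
// Some notes on the definition above:
//   - downsampler is three strided 5x5 convolutions (con5d), each downsampling by 2x,
//     so the network's output tensor is 8x smaller than its input.
//   - rcon5 is an ordinary 5x5 convolution with 55 filters.
//   - The affine layers take the place of the bn_con (batch normalization) layers used
//     during training; when the trained network is deserialized, the affine layers are
//     initialized from the learned batch normalization statistics.
//   - The final con<1,9,9,1,1> layer outputs a detection score map with one channel per
//     object label (just one here), and loss_mmod converts peaks in that map into
//     rectangle detections.
//   - input_rgb_image_pyramid feeds the network a tiled image pyramid, so a single
//     forward pass scans the image at all scales.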
// ----------------------------------------------------------------------------------------

int main() try
{
    net_type net;
    shape_predictor sp;
    // You can get this file from http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2
    // This network was produced by the dnn_mmod_train_find_cars_ex.cpp example program.
    // As you can see, it also includes a shape_predictor.  To see a generic example of
    // how to train one, refer to train_shape_predictor_ex.cpp.
    deserialize("mmod_rear_end_vehicle_detector.dat") >> net >> sp;

    matrix<rgb_pixel> img;
    load_image(img, "../mmod_cars_test_image.jpg");

    image_window win;
    win.set_image(img);
    // Run the detector on the image and show us the output.
    for (auto&& d : net(img))
    {
        // We use a shape_predictor to refine the exact shape and location of the
        // detection box.  This shape_predictor is trained to simply output the 4 corner
        // points of the box.  So all we do is make a rectangle that tightly contains
        // those 4 points and that rectangle is our refined detection position.
        auto fd = sp(img, d);
        rectangle rect;
        for (unsigned long j = 0; j < fd.num_parts(); ++j)
            rect += fd.part(j);
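
        // Each d is an mmod_rect, so d.rect holds the unrefined detector box and
        // d.detection_confidence its score.  If you wanted to label each box with its
        // score you could do something like:
        //   win.add_overlay(image_window::overlay_rect(rect, rgb_pixel(255,0,0),
        //                       std::to_string(d.detection_confidence)));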

        win.add_overlay(rect, rgb_pixel(255,0,0));
    }
cout << "Hit enter to view the intermediate processing steps" << endl;
cin.get();

    // Create the same tiled image pyramid the network's input layer uses and display it
    // on the screen.  Running the network over this one big image is how a single
    // forward pass can find cars at many scales.
    std::vector<rectangle> rects;
    matrix<rgb_pixel> tiled_img;
    create_tiled_pyramid<std::remove_reference<decltype(input_layer(net))>::type::pyramid_type>(
        img, tiled_img, rects, input_layer(net).get_pyramid_padding(),
        input_layer(net).get_pyramid_outer_padding());
    image_window winpyr(tiled_img, "Tiled image pyramid");
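    // rects now records where each pyramid level was placed inside tiled_img; the first
    // entry is the area holding the full resolution copy of img and later entries hold
    // progressively downsampled copies.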
cout << "Number of channels in final tensor image: " << net.subnet().get_output().k() << endl;
matrix<float> network_output = image_plane(net.subnet().get_output(),0,0);
for (long k = 1; k < net.subnet().get_output().k(); ++k)
network_output = max_pointwise(network_output, image_plane(net.subnet().get_output(),0,k));

    // The network's output tensor is lower resolution than its input, so upsample the
    // saliency map back to image resolution (pyramid level 0 is the original image).
    const double v0_scale = img.nc()/(double)network_output.nc();
    resize_image(v0_scale, network_output);

    // Use fixed endpoints for the jet color mapping rather than min()/max() of the
    // output so the different visualizations below are directly comparable.
    const float lower = -2.5;  // min(network_output);
    const float upper = 0.0;   // max(network_output);
    cout << "jet color mapping range: lower=" << lower << "  upper=" << upper << endl;

    // Display the final layer as a color image.
    image_window win_output(jet(network_output, upper, lower), "Output tensor from the network");

    // Overlay network_output on top of the tiled image pyramid and display it.
    matrix<rgb_pixel> tiled_img_sal = tiled_img;
    for (long r = 0; r < tiled_img_sal.nr(); ++r)
    {
        for (long c = 0; c < tiled_img_sal.nc(); ++c)
        {
            // Map from the pyramid image coordinate (c,r) to the corresponding location
            // in the (upsampled) network output.
            dpoint tmp(c,r);
            tmp = input_tensor_to_output_tensor(net, tmp);
            tmp = point(v0_scale*tmp);
            if (get_rect(network_output).contains(tmp))
            {
                // Alpha blend the jet colormapped saliency value onto the pyramid image.
                float val = network_output(tmp.y(),tmp.x());
                rgb_alpha_pixel p;
                assign_pixel(p, colormap_jet(val,lower,upper));
                p.alpha = 120;
                assign_pixel(tiled_img_sal(r,c), p);
            }
        }
    }
    image_window win_pyr_sal(tiled_img_sal, "Saliency on image pyramid");
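    // In this window, the red and yellow regions are where the detector's output is
    // largest, i.e. where it thinks car rear ends are.  Note that each car lights up
    // most strongly at the pyramid level whose scale best matches the car's size.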

    // Now collapse the pyramid scales into the original image.
    matrix<float> collapsed_saliency(img.nr(), img.nc());
    // We need the tensor the input layer produced so we can ask it to map image space
    // coordinates into tensor space coordinates.
    resizable_tensor input_tensor;
    input_layer(net).to_tensor(&img, &img+1, input_tensor);
    for (long r = 0; r < collapsed_saliency.nr(); ++r)
    {
        for (long c = 0; c < collapsed_saliency.nc(); ++c)
        {
            // Loop over a bunch of scale values and look up what part of network_output
            // corresponds to the point(c,r) in the original image, then take the max
            // saliency value over all the scales and save it at pixel point(c,r).
            float max_sal = -1e30;
            for (double scale = 1; scale > 0.2; scale *= 5.0/6.0)
            {
                // Map from input image coordinates to tiled pyramid coordinates and then
                // to output tensor coordinates.
                dpoint tmp = center(input_layer(net).image_space_to_tensor_space(input_tensor, scale, drectangle(dpoint(c,r))));
                tmp = point(v0_scale*input_tensor_to_output_tensor(net, tmp));
                if (get_rect(network_output).contains(tmp))
                {
                    float val = network_output(tmp.y(),tmp.x());
                    if (val > max_sal)
                        max_sal = val;
                }
            }
            collapsed_saliency(r,c) = max_sal;

            // Also blend the saliency into the original input image so we can view it as
            // an overlay on the cars.
            rgb_alpha_pixel p;
            assign_pixel(p, colormap_jet(max_sal,lower,upper));
            p.alpha = 120;
            assign_pixel(img(r,c), p);
        }
    }

    image_window win_collapsed(jet(collapsed_saliency, upper, lower), "Collapsed saliency map");
    image_window win_img_and_sal(img, "Saliency on the original image");
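
    // If you want to keep any of these visualizations you can write them to disk, e.g.:
    //   save_png(img, "saliency_overlay.png");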
cout << "Hit enter to end program" << endl;
cin.get();
}
catch(image_load_error& e)
{
    cout << e.what() << endl;
    cout << "The test image is located in the examples folder.  So you should run this program from a sub folder of examples (e.g. the build folder) so that the relative path ../mmod_cars_test_image.jpg is correct." << endl;
}
catch(serialization_error& e)
{
    cout << e.what() << endl;
    cout << "The model file can be obtained from: http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2  Don't forget to decompress it (e.g. with bunzip2) before running this program." << endl;
}
catch(std::exception& e)
{
    cout << e.what() << endl;
}