commit a362305e1b (parent 5e9f094d5a)
Author: Davis King
Date:   2017-08-27 10:46:49 -04:00


@@ -2,9 +2,9 @@
/*
This example shows how to run a CNN based vehicle detector using dlib. The
example loads a pretrained model and uses it to find the rear ends of cars in
-images. We will also visualize some of the detector's processing steps by
+an image. We will also visualize some of the detector's processing steps by
plotting various intermediate images on the screen. Viewing these can help
-understand how the detector works.
+you understand how the detector works.
The model used by this example was trained by the dnn_mmod_train_find_cars_ex.cpp
example. Also, since this is a CNN, you really should use a GPU to get the
@@ -34,7 +34,7 @@ using namespace dlib;
-// the dnn rear view vehicle detector network
+// The rear view vehicle detector network
template <long num_filters, typename SUBNET> using con5d = con<num_filters,5,5,2,2,SUBNET>;
template <long num_filters, typename SUBNET> using con5 = con<num_filters,5,5,1,1,SUBNET>;
template <typename SUBNET> using downsampler = relu<affine<con5d<32, relu<affine<con5d<32, relu<affine<con5d<16,SUBNET>>>>>>>>>;
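// For reference, a sketch of how these building blocks compose into the full
// detector type. This mirrors the companion dnn_mmod_train_find_cars_ex.cpp
// example; the exact definition sits in an unmodified part of this file, so
// treat the details below as an assumption rather than committed code:
template <typename SUBNET> using rcon5 = relu<affine<con5<55,SUBNET>>>;
using net_type = loss_mmod<con<1,9,9,1,1,rcon5<rcon5<rcon5<downsampler<input_rgb_image_pyramid<pyramid_down<6>>>>>>>>;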
@@ -63,9 +63,9 @@ int main() try
for (auto&& d : net(img))
{
// We use a shape_predictor to refine the exact shape and location of the detection
-// box. This shape_predictor is trained to simply output the 4 corner points. So
-// all we do is make a rectangle that tightly contains those 4 points and that
-// rectangle is our refined detection position.
+// box. This shape_predictor is trained to simply output the 4 corner points of
+// the box. So all we do is make a rectangle that tightly contains those 4 points
+// and that rectangle is our refined detection position.
auto fd = sp(img,d);
rectangle rect;
for (unsigned long j = 0; j < fd.num_parts(); ++j)
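// The loop body is elided by this hunk. In the example it simply grows rect so it
// tightly contains every predicted corner point, along the lines of this sketch
// (assuming the example's image_window is named win):
//     rect += fd.part(j);
// win.add_overlay(rect, rgb_pixel(255,0,0));  // draw the refined box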
@@ -79,18 +79,18 @@ int main() try
cin.get();
-// Now let's look at how the detector works. The top level processing steps look like:
+// Now let's look at how the detector works. The high level processing steps look like:
// 1. Create an image pyramid and pack the pyramid into one big image. We call this
// the "tiled pyramid image".
// 2. Run the tiled pyramid image through the CNN. The CNN outputs a new image where
// bright pixels in the output image indicate the presence of cars.
-// 3. Find pixels in the CNN output image with a value > 0. Those locations are your
+// 3. Find pixels in the CNN's output image with a value > 0. Those locations are your
// preliminary car detections.
// 4. Perform non-maximum suppression on the preliminary detections to produce the
// final output.
//
// We will be plotting the images from steps 1 and 2 so you can visualize what's
-// happening. For the CNN output image, we will use the jet colormap so that "bright"
+// happening. For the CNN's output image, we will use the jet colormap so that "bright"
// outputs, i.e. pixels with big values, appear in red and "dim" outputs appear as a
// cold blue color. To do this we pick a range of CNN output values for the color
// mapping. The specific values don't matter. They are just selected to give a nice
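// As a concrete sketch, a mapping range along these lines works for this detector.
// The exact numbers are tuned by eye and are an assumption here, not part of this
// diff; they feed the colormap_jet() calls used below:
const float lower = -2.5;
const float upper = 0.0;
cout << "jet color mapping range:  lower=" << lower << "  upper=" << upper << endl;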
@@ -104,24 +104,28 @@ int main() try
// Create a tiled pyramid image and display it on the screen.
std::vector<rectangle> rects;
matrix<rgb_pixel> tiled_img;
-create_tiled_pyramid<std::remove_reference<decltype(input_layer(net))>::type::pyramid_type>(img,
-tiled_img, rects, input_layer(net).get_pyramid_padding(),
-input_layer(net).get_pyramid_outer_padding());
+// Get the type of pyramid the CNN used
+using pyramid_type = std::remove_reference<decltype(input_layer(net))>::type::pyramid_type;
+// And tell create_tiled_pyramid to create the pyramid using that pyramid type.
+create_tiled_pyramid<pyramid_type>(img, tiled_img, rects,
+input_layer(net).get_pyramid_padding(),
+input_layer(net).get_pyramid_outer_padding());
image_window winpyr(tiled_img, "Tiled pyramid image");
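// A quick sketch of how to inspect the pyramid layout: rects holds one rectangle
// per pyramid level, giving that level's location inside tiled_img.
cout << "Pyramid levels packed into the tiled image: " << rects.size() << endl;
for (auto& r : rects)
    cout << "  level size: " << r.width() << "x" << r.height() << endl;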
-// This CNN detector represents a sliding window detector with 3 sliding windows, one
-// for each aspect ratio of vehicle box. The aspect ratio of a detection is determined
-// by which channel in the output image triggers the detection. Here we are just going
-// to max pool the channels together to get one final image for our display. In this
-// image, a pixel will be bright if any of the sliding window detectors thinks there is
-// a car at that location.
+// This CNN detector represents a sliding window detector with 3 sliding windows. Each
+// of the 3 windows has a different aspect ratio, allowing it to find vehicles which
+// are either tall and skinny, squarish, or short and wide. The aspect ratio of a
+// detection is determined by which channel in the output image triggers the detection.
+// Here we are just going to max pool the channels together to get one final image for
+// our display. In this image, a pixel will be bright if any of the sliding window
+// detectors thinks there is a car at that location.
cout << "Number of channels in final tensor image: " << net.subnet().get_output().k() << endl;
matrix<float> network_output = image_plane(net.subnet().get_output(),0,0);
for (long k = 1; k < net.subnet().get_output().k(); ++k)
network_output = max_pointwise(network_output, image_plane(net.subnet().get_output(),0,k));
-// We will also upsample the CNN output image. The CNN we defined has an 8x
+// We will also upsample the CNN's output image. The CNN we defined has an 8x
// downsampling layer at the beginning. In the code below we are going to overlay this
// CNN output image on top of the raw input image. To make that look nice it helps to
// upsample the CNN output image back to the same resolution as the input image, which
@@ -135,10 +139,9 @@ int main() try
// Also, overlay network_output on top of the tiled image pyramid and display it.
matrix<rgb_pixel> tiled_img_sal = tiled_img;
-for (long r = 0; r < tiled_img_sal.nr(); ++r)
+for (long r = 0; r < tiled_img.nr(); ++r)
{
-for (long c = 0; c < tiled_img_sal.nc(); ++c)
+for (long c = 0; c < tiled_img.nc(); ++c)
{
dpoint tmp(c,r);
tmp = input_tensor_to_output_tensor(net, tmp);
@@ -151,16 +154,16 @@ int main() try
rgb_alpha_pixel p;
assign_pixel(p , colormap_jet(val,lower,upper));
p.alpha = 120;
-assign_pixel(tiled_img_sal(r,c), p);
+assign_pixel(tiled_img(r,c), p);
}
}
}
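// The mapping used above also runs in reverse. Starting from a bright pixel in
// network_output you can ask where it sits in the tiled pyramid, along the lines
// of this sketch (out_c and out_r are hypothetical output-tensor coordinates, not
// variables defined in this example):
//     dpoint pyr_point = output_tensor_to_input_tensor(net, dpoint(out_c, out_r));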
-// If you look at this image you can see that the vehicles get bright red blobs on
-// them. That's the CNN saying "there is a car here!". You will also notice that
-// there is a certain scale it finds cars at. They have to be not too big or too
-// small, which is why we have an image pyramid. The pyramid allows us to find cars of
-// all scales.
-image_window win_pyr_sal(tiled_img_sal, "Saliency on image pyramid");
+// If you look at this image you can see that the vehicles have bright red blobs on
+// them. That's the CNN saying "there is a car here!". You will also notice there is
+// a certain scale at which it finds cars. They have to be not too big or too small,
+// which is why we have an image pyramid. The pyramid allows us to find cars of all
+// scales.
+image_window win_pyr_overlay(tiled_img, "Detection scores on image pyramid");
@@ -169,17 +172,17 @@ int main() try
// actually do this step, since it's enough to threshold the tiled pyramid image to get
// the detections. However, it makes a nice visualization and clearly indicates that
// the detector is firing for all the cars.
-matrix<float> collapsed_saliency(img.nr(), img.nc());
+matrix<float> collapsed(img.nr(), img.nc());
resizable_tensor input_tensor;
input_layer(net).to_tensor(&img, &img+1, input_tensor);
-for (long r = 0; r < collapsed_saliency.nr(); ++r)
+for (long r = 0; r < collapsed.nr(); ++r)
{
-for (long c = 0; c < collapsed_saliency.nc(); ++c)
+for (long c = 0; c < collapsed.nc(); ++c)
{
-// Loop over a bunch of scale values and look up what part of network_output corresponds to
-// the point(c,r) in the original image, then take the max saliency value over
-// all the scales and save it at pixel point(c,r).
-float max_sal = -1e30;
+// Loop over a bunch of scale values and look up what part of network_output
+// corresponds to the point(c,r) in the original image, then take the max
+// detection score over all the scales and save it at pixel point(c,r).
+float max_score = -1e30;
for (double scale = 1; scale > 0.2; scale *= 5.0/6.0)
{
// Map from input image coordinates to tiled pyramid coordinates.
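// The mapping calls themselves are elided by this hunk. Presumably they chain the
// two coordinate transforms seen earlier, along the lines of this sketch:
//     tmp = image_to_tiled_pyramid<pyramid_type>(rects, scale, tmp);
//     tmp = input_tensor_to_output_tensor(net, tmp);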
@@ -190,24 +193,24 @@ int main() try
if (get_rect(network_output).contains(tmp))
{
float val = network_output(tmp.y(),tmp.x());
-if (val > max_sal)
-max_sal = val;
+if (val > max_score)
+max_score = val;
}
}
-collapsed_saliency(r,c) = max_sal;
+collapsed(r,c) = max_score;
-// Also blend the saliency into the original input image so we can view it as
+// Also blend the scores into the original input image so we can view it as
// an overlay on the cars.
rgb_alpha_pixel p;
-assign_pixel(p , colormap_jet(max_sal,lower,upper));
+assign_pixel(p , colormap_jet(max_score,lower,upper));
p.alpha = 120;
assign_pixel(img(r,c), p);
}
}
-image_window win_collapsed(jet(collapsed_saliency, upper, lower), "collapsed saliency map");
-image_window win_img_and_sal(img);
+image_window win_collapsed(jet(collapsed, upper, lower), "Collapsed output tensor from the network");
+image_window win_img_and_sal(img, "Collapsed detection scores on raw image");
cout << "Hit enter to end program" << endl;