// dlib/examples/dnn_mmod_find_cars_ex.cpp
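
/*
    This example shows how to run a CNN based vehicle detector with dlib and, more
    interestingly, how to look inside the network while it runs.  It loads the rear end
    vehicle detector produced by dnn_mmod_train_find_cars_ex.cpp, draws the refined
    detection boxes it finds, and then visualizes the intermediate processing steps:
    the tiled image pyramid the network scans, the network's output tensor rendered as
    a jet heat map, and that saliency map collapsed back onto the original image.
*/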

#include <iostream>
#include <dlib/dnn.h>
#include <dlib/image_io.h>
#include <dlib/image_transforms.h>
#include <dlib/gui_widgets.h>
#include <dlib/image_processing.h>

using namespace std;
using namespace dlib;

// The rear view vehicle detector network, as defined by the
// dnn_mmod_train_find_cars_ex.cpp training program.
template <long num_filters, typename SUBNET> using con5d = con<num_filters,5,5,2,2,SUBNET>;
template <long num_filters, typename SUBNET> using con5 = con<num_filters,5,5,1,1,SUBNET>;
template <typename SUBNET> using downsampler = relu<affine<con5d<32, relu<affine<con5d<32, relu<affine<con5d<16,SUBNET>>>>>>>>>;
template <typename SUBNET> using rcon5 = relu<affine<con5<55,SUBNET>>>;
using net_type = loss_mmod<con<1,9,9,1,1,rcon5<rcon5<rcon5<downsampler<input_rgb_image_pyramid<pyramid_down<6>>>>>>>>;
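
// Some notes on the definition above:
//   - downsampler is three strided 5x5 convolutions (con5d), each downsampling by 2x,
//     so the network's output tensor is 8x smaller than its input.
//   - rcon5 is an ordinary 5x5 convolution with 55 filters.
//   - The affine layers take the place of the bn_con (batch normalization) layers used
//     during training; when the trained network is deserialized, the affine layers are
//     initialized from the learned batch normalization statistics.
//   - The final con<1,9,9,1,1> layer outputs a detection score map with one channel per
//     object label (just one here), and loss_mmod converts peaks in that map into
//     rectangle detections.
//   - input_rgb_image_pyramid feeds the network a tiled image pyramid, so a single
//     forward pass scans the image at all scales.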
// ----------------------------------------------------------------------------------------

int main() try
{
    net_type net;
    shape_predictor sp;
    // You can get this file from http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2
    // This network was produced by the dnn_mmod_train_find_cars_ex.cpp example program.
    // As you can see, it also includes a shape_predictor.  To see a generic example of
    // how to train one, refer to train_shape_predictor_ex.cpp.
    deserialize("mmod_rear_end_vehicle_detector.dat") >> net >> sp;

    matrix<rgb_pixel> img;
    load_image(img, "../mmod_cars_test_image.jpg");

    image_window win;
    win.set_image(img);
    // Run the detector on the image and show us the output.
    for (auto&& d : net(img))
    {
        // We use a shape_predictor to refine the exact shape and location of the
        // detection box.  This shape_predictor is trained to simply output the 4 corner
        // points of the box.  So all we do is make a rectangle that tightly contains
        // those 4 points and that rectangle is our refined detection position.
        auto fd = sp(img, d);
        rectangle rect;
        for (unsigned long j = 0; j < fd.num_parts(); ++j)
            rect += fd.part(j);
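
        // Each d is an mmod_rect, so d.rect holds the unrefined detector box and
        // d.detection_confidence its score.  If you wanted to label each box with its
        // score you could do something like:
        //   win.add_overlay(image_window::overlay_rect(rect, rgb_pixel(255,0,0),
        //                       std::to_string(d.detection_confidence)));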

        win.add_overlay(rect, rgb_pixel(255,0,0));
    }
cout << "Hit enter to view the intermediate processing steps" << endl;
cin.get();

    // Create the same tiled image pyramid the network's input layer uses and display it
    // on the screen.  Running the network over this one big image is how a single
    // forward pass can find cars at many scales.
    std::vector<rectangle> rects;
    matrix<rgb_pixel> tiled_img;
    create_tiled_pyramid<std::remove_reference<decltype(input_layer(net))>::type::pyramid_type>(
        img, tiled_img, rects, input_layer(net).get_pyramid_padding(),
        input_layer(net).get_pyramid_outer_padding());
    image_window winpyr(tiled_img, "Tiled image pyramid");
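    // rects now records where each pyramid level was placed inside tiled_img; the first
    // entry is the area holding the full resolution copy of img and later entries hold
    // progressively downsampled copies.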
cout << "Number of channels in final tensor image: " << net.subnet().get_output().k() << endl;
matrix<float> network_output = image_plane(net.subnet().get_output(),0,0);
for (long k = 1; k < net.subnet().get_output().k(); ++k)
network_output = max_pointwise(network_output, image_plane(net.subnet().get_output(),0,k));

    // The network's output tensor is lower resolution than its input, so upsample the
    // saliency map back to image resolution (pyramid level 0 is the original image).
    const double v0_scale = img.nc()/(double)network_output.nc();
    resize_image(v0_scale, network_output);

    // Use fixed endpoints for the jet color mapping rather than min()/max() of the
    // output so the different visualizations below are directly comparable.
    const float lower = -2.5;  // min(network_output);
    const float upper = 0.0;   // max(network_output);
    cout << "jet color mapping range: lower=" << lower << "  upper=" << upper << endl;

    // Display the final layer as a color image.
    image_window win_output(jet(network_output, upper, lower), "Output tensor from the network");

    // Overlay network_output on top of the tiled image pyramid and display it.
    matrix<rgb_pixel> tiled_img_sal = tiled_img;
    for (long r = 0; r < tiled_img_sal.nr(); ++r)
    {
        for (long c = 0; c < tiled_img_sal.nc(); ++c)
        {
            // Map from the pyramid image coordinate (c,r) to the corresponding location
            // in the (upsampled) network output.
            dpoint tmp(c,r);
            tmp = input_tensor_to_output_tensor(net, tmp);
            tmp = point(v0_scale*tmp);
            if (get_rect(network_output).contains(tmp))
            {
                // Alpha blend the jet colormapped saliency value onto the pyramid image.
                float val = network_output(tmp.y(),tmp.x());
                rgb_alpha_pixel p;
                assign_pixel(p, colormap_jet(val,lower,upper));
                p.alpha = 120;
                assign_pixel(tiled_img_sal(r,c), p);
            }
        }
    }
    image_window win_pyr_sal(tiled_img_sal, "Saliency on image pyramid");
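    // In this window, the red and yellow regions are where the detector's output is
    // largest, i.e. where it thinks car rear ends are.  Note that each car lights up
    // most strongly at the pyramid level whose scale best matches the car's size.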

    // Now collapse the pyramid scales into the original image.
    matrix<float> collapsed_saliency(img.nr(), img.nc());
    // We need the tensor the input layer produced so we can ask it to map image space
    // coordinates into tensor space coordinates.
    resizable_tensor input_tensor;
    input_layer(net).to_tensor(&img, &img+1, input_tensor);
    for (long r = 0; r < collapsed_saliency.nr(); ++r)
    {
        for (long c = 0; c < collapsed_saliency.nc(); ++c)
        {
            // Loop over a bunch of scale values and look up what part of network_output
            // corresponds to the point(c,r) in the original image, then take the max
            // saliency value over all the scales and save it at pixel point(c,r).
            float max_sal = -1e30;
            for (double scale = 1; scale > 0.2; scale *= 5.0/6.0)
            {
                // Map from input image coordinates to tiled pyramid coordinates and then
                // to output tensor coordinates.
                dpoint tmp = center(input_layer(net).image_space_to_tensor_space(input_tensor, scale, drectangle(dpoint(c,r))));
                tmp = point(v0_scale*input_tensor_to_output_tensor(net, tmp));
                if (get_rect(network_output).contains(tmp))
                {
                    float val = network_output(tmp.y(),tmp.x());
                    if (val > max_sal)
                        max_sal = val;
                }
            }
            collapsed_saliency(r,c) = max_sal;

            // Also blend the saliency into the original input image so we can view it as
            // an overlay on the cars.
            rgb_alpha_pixel p;
            assign_pixel(p, colormap_jet(max_sal,lower,upper));
            p.alpha = 120;
            assign_pixel(img(r,c), p);
        }
    }

    image_window win_collapsed(jet(collapsed_saliency, upper, lower), "Collapsed saliency map");
    image_window win_img_and_sal(img, "Saliency on the original image");
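
    // If you want to keep any of these visualizations you can write them to disk, e.g.:
    //   save_png(img, "saliency_overlay.png");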
cout << "Hit enter to end program" << endl;
cin.get();
}
catch(image_load_error& e)
{
    cout << e.what() << endl;
    cout << "The test image is located in the examples folder.  So you should run this program from a sub folder of examples (e.g. the build folder) so that the relative path ../mmod_cars_test_image.jpg is correct." << endl;
}
catch(serialization_error& e)
{
    cout << e.what() << endl;
    cout << "The model file can be obtained from: http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2  Don't forget to decompress it (e.g. with bunzip2) before running this program." << endl;
}
catch(std::exception& e)
{
    cout << e.what() << endl;
}