#include <dlib/xml_parser.h>
#include <dlib/matrix.h>
#include <fstream>
#include <vector>
#include <stack>
#include <set>
#include <map>
#include <dlib/string.h>

using namespace std;
using namespace dlib;

// ----------------------------------------------------------------------------------------

// Only these computational layers have parameters
const std::set<string> comp_tags_with_params = {"fc", "fc_no_bias", "con", "affine_con", "affine_fc", "affine", "prelu"};

struct layer
{
    string type; // comp, loss, or input
    int idx;

    matrix<long,4,1> output_tensor_shape; // (N,K,NR,NC)

    string detail_name; // The name of the tag inside the layer tag. e.g. fc, con, max_pool, input_rgb_image.
    std::map<string,double> attributes;
    matrix<double> params;
    long tag_id = -1;  // If this isn't -1 then it means this layer was tagged, e.g. wrapped with tag2<> giving tag_id==2.
    long skip_id = -1; // If this isn't -1 then it means this layer draws its inputs from
                       // the most recent layer with tag_id==skip_id rather than from its immediate predecessor.
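
    // For example, a layer wrapped in dlib's tag2<> gets tag_id==2, and a layer
    // sitting on top of a skip2<> layer gets skip_id==2, so it reads its input from
    // that tag2 layer rather than from the layer immediately before it.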

    double attribute (const string& key) const
    {
        auto i = attributes.find(key);
        if (i != attributes.end())
            return i->second;
        else
            throw dlib::error("Layer doesn't have the requested attribute '" + key + "'.");
    }

    string caffe_layer_name () const
    {
        if (type == "input")
            return "data";
        else
            return detail_name + to_string(idx);
    }
};
// ----------------------------------------------------------------------------------------

std::vector<layer> parse_dlib_xml (
    const matrix<long,4,1>& input_tensor_shape,
    const string& xml_filename
);
// ----------------------------------------------------------------------------------------

template <typename iterator>
const layer& find_layer (
    iterator i,
    long tag_id
)
/*!
    requires
        - i is a reverse iterator pointing to a layer in the list of layers produced by
          parse_dlib_xml().
        - i is not an input layer.
    ensures
        - if (tag_id == -1) then
            - returns the layer previous to layer i (i.e. closer to the input).
        - else
            - returns the layer previous to layer i (i.e. closer to the input) that has
              the given tag_id.
!*/
{
    if (tag_id == -1)
    {
        return *(i-1);
    }
    else
    {
        while(true)
        {
            i--;
            if (i->tag_id == tag_id)
                return *i;

            // If we hit the input end of the network before finding the requested tag
            // then the network definition is bad.
            if (i->type == "input")
                throw dlib::error("Network definition is bad, a layer wanted to skip back to a non-existing layer.");
        }
    }
}

template <typename iterator>
const layer& find_input_layer (iterator i) { return find_layer(i, i->skip_id); }

template <typename iterator>
string find_layer_caffe_name (
    iterator i,
    long tag_id
)
{
    return find_layer(i, tag_id).caffe_layer_name();
}

template <typename iterator>
string find_input_layer_caffe_name (iterator i) { return find_input_layer(i).caffe_layer_name(); }

// ----------------------------------------------------------------------------------------

template <typename EXP>
void print_as_np_array (std::ostream& out, const matrix_exp<EXP>& m)
{
    out << "np.array([";
    for (auto x : m)
        out << x << ",";
    out << "], dtype='float32')";
}
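// Note that the matrix is written out flat; the generated python then reshapes it
// with "p.shape = net.params[...].data.shape" before assigning it into a caffe blob.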
// ----------------------------------------------------------------------------------------

void convert_dlib_xml_to_caffe_python_code (
    const string& xml_filename,
    const long N,
    const long K,
    const long NR,
    const long NC
)
{
    const string out_filename = left_substr(xml_filename,".") + "_dlib_to_caffe_model.py";
    cout << "Writing model to " << out_filename << endl;
    ofstream fout(out_filename);
    fout.precision(9);

    const auto layers = parse_dlib_xml({N,K,NR,NC}, xml_filename);

    fout << "#\n";
    fout << "# !!! This file was automatically generated by dlib's tools/convert_dlib_nets_to_caffe utility. !!!\n";
    fout << "# !!! It contains all the information from a dlib DNN network and lets you save it as a caffe model. !!!\n";
    fout << "#\n";
    fout << "import caffe" << endl;
    fout << "from caffe import layers as L, params as P" << endl;
    fout << "import numpy as np" << endl;

    // dlib nets don't commit to a batch size, so use the batch size N given on the command line.
    fout << "\n# Input tensor dimensions" << endl;
    fout << "input_batch_size = " << N << ";" << endl;
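
    // dlib's net_to_xml() lists layers from the output layer down to the input layer,
    // so layers.back() is the network's input layer.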
    if (layers.back().detail_name == "input_rgb_image")
    {
        fout << "input_num_channels = 3;" << endl;
        fout << "input_num_rows = " << NR << ";" << endl;
        fout << "input_num_cols = " << NC << ";" << endl;
        if (K != 3)
            throw dlib::error("The dlib model requires input tensors with NUM_CHANNELS==3, but the dtoc command line specified NUM_CHANNELS==" + to_string(K));
    }
    else if (layers.back().detail_name == "input_rgb_image_sized")
    {
        fout << "input_num_channels = 3;" << endl;
        fout << "input_num_rows = " << layers.back().attribute("nr") << ";" << endl;
        fout << "input_num_cols = " << layers.back().attribute("nc") << ";" << endl;
        if (NR != layers.back().attribute("nr"))
            throw dlib::error("The dlib model requires input tensors with NUM_ROWS==" + to_string((long)layers.back().attribute("nr")) + ", but the dtoc command line specified NUM_ROWS==" + to_string(NR));
        if (NC != layers.back().attribute("nc"))
            throw dlib::error("The dlib model requires input tensors with NUM_COLUMNS==" + to_string((long)layers.back().attribute("nc")) + ", but the dtoc command line specified NUM_COLUMNS==" + to_string(NC));
        if (K != 3)
            throw dlib::error("The dlib model requires input tensors with NUM_CHANNELS==3, but the dtoc command line specified NUM_CHANNELS==" + to_string(K));
    }
    else if (layers.back().detail_name == "input")
    {
        fout << "input_num_channels = 1;" << endl;
        fout << "input_num_rows = " << NR << ";" << endl;
        fout << "input_num_cols = " << NC << ";" << endl;
        if (K != 1)
            throw dlib::error("The dlib model requires input tensors with NUM_CHANNELS==1, but the dtoc command line specified NUM_CHANNELS==" + to_string(K));
    }
    else
    {
        throw dlib::error("No known transformation from dlib's " + layers.back().detail_name + " layer to caffe.");
    }
    fout << endl;

    fout << "# Call this function to write the dlib DNN model out to file as a pair of caffe\n";
    fout << "# definition and weight files.  You can then use the network by loading it with\n";
    fout << "# this statement:\n";
    fout << "#    net = caffe.Net(def_file, weights_file, caffe.TEST);\n";
    fout << "#\n";
    fout << "def save_as_caffe_model(def_file, weights_file):\n";
    fout << "    with open(def_file, 'w') as f: f.write(str(make_netspec()));\n";
    fout << "    net = caffe.Net(def_file, caffe.TEST);\n";
    fout << "    set_network_weights(net);\n";
    fout << "    net.save(weights_file);\n\n";
    fout << "###############################################################################\n";
    fout << "#          EVERYTHING BELOW HERE DEFINES THE DLIB MODEL PARAMETERS           #\n";
    fout << "###############################################################################\n\n\n";

    // -----------------------------------------------------------------------------------
    //  The next block of code outputs python code that defines the network architecture.
    // -----------------------------------------------------------------------------------

    fout << "def make_netspec():" << endl;
    fout << "    # For reference, the only \"documentation\" about caffe layer parameters seems to be this page:\n";
    fout << "    # https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto\n" << endl;
    fout << "    n = caffe.NetSpec();" << endl;
    fout << "    n.data,n.label = L.MemoryData(batch_size=input_batch_size, channels=input_num_channels, height=input_num_rows, width=input_num_cols, ntop=2)" << endl;
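
    // Each computational layer becomes one python statement appended to the NetSpec.
    // For instance, a hypothetical con layer with idx 5 would come out of the loop
    // below as something like:
    //     n.con5 = L.Convolution(n.data, num_output=32, kernel_w=5, kernel_h=5, ...);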

    // iterate the layers starting with the input layer
    for (auto i = layers.rbegin(); i != layers.rend(); ++i)
    {
        // skip input and loss layers
        if (i->type == "loss" || i->type == "input")
            continue;

        if (i->detail_name == "con")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.Convolution(n." << find_input_layer_caffe_name(i);
            fout << ", num_output=" << i->attribute("num_filters");
            fout << ", kernel_w=" << i->attribute("nc");
            fout << ", kernel_h=" << i->attribute("nr");
            fout << ", stride_w=" << i->attribute("stride_x");
            fout << ", stride_h=" << i->attribute("stride_y");
            fout << ", pad_w=" << i->attribute("padding_x");
            fout << ", pad_h=" << i->attribute("padding_y");
            fout << ");\n";
        }
        else if (i->detail_name == "relu")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.ReLU(n." << find_input_layer_caffe_name(i);
            fout << ");\n";
        }
        else if (i->detail_name == "sig")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.Sigmoid(n." << find_input_layer_caffe_name(i);
            fout << ");\n";
        }
        else if (i->detail_name == "prelu")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.PReLU(n." << find_input_layer_caffe_name(i);
            fout << ", channel_shared=True";
            fout << ");\n";
        }
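        // Note for the two pooling branches below: dlib encodes "pool over the entire
        // input" as a filter size of 0, which maps to caffe's global_pooling=True.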
        else if (i->detail_name == "max_pool")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.Pooling(n." << find_input_layer_caffe_name(i);
            fout << ", pool=P.Pooling.MAX";
            if (i->attribute("nc") == 0)
            {
                fout << ", global_pooling=True";
            }
            else
            {
                fout << ", kernel_w=" << i->attribute("nc");
                fout << ", kernel_h=" << i->attribute("nr");
            }

            fout << ", stride_w=" << i->attribute("stride_x");
            fout << ", stride_h=" << i->attribute("stride_y");
            fout << ", pad_w=" << i->attribute("padding_x");
            fout << ", pad_h=" << i->attribute("padding_y");
            fout << ");\n";
        }
        else if (i->detail_name == "avg_pool")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.Pooling(n." << find_input_layer_caffe_name(i);
            fout << ", pool=P.Pooling.AVE";
            if (i->attribute("nc") == 0)
            {
                fout << ", global_pooling=True";
            }
            else
            {
                fout << ", kernel_w=" << i->attribute("nc");
                fout << ", kernel_h=" << i->attribute("nr");
            }

            if (i->attribute("padding_x") != 0 || i->attribute("padding_y") != 0)
            {
                throw dlib::error("dlib and caffe implement pooling with non-zero padding differently, so you can't convert a "
                    "network with such pooling layers.");
            }

            fout << ", stride_w=" << i->attribute("stride_x");
            fout << ", stride_h=" << i->attribute("stride_y");
            fout << ", pad_w=" << i->attribute("padding_x");
            fout << ", pad_h=" << i->attribute("padding_y");
            fout << ");\n";
        }
        else if (i->detail_name == "fc")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.InnerProduct(n." << find_input_layer_caffe_name(i);
            fout << ", num_output=" << i->attribute("num_outputs");
            fout << ", bias_term=True";
            fout << ");\n";
        }
        else if (i->detail_name == "fc_no_bias")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.InnerProduct(n." << find_input_layer_caffe_name(i);
            fout << ", num_output=" << i->attribute("num_outputs");
            fout << ", bias_term=False";
            fout << ");\n";
        }
        else if (i->detail_name == "bn_con" || i->detail_name == "bn_fc")
        {
            throw dlib::error("Conversion from dlib's batch norm layers to caffe's isn't supported.  Instead, "
                "you should put your dlib network into 'test mode' by switching batch norm layers to affine layers.  "
                "Then you can convert that 'test mode' network to caffe.");
        }
        else if (i->detail_name == "affine_con")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.Scale(n." << find_input_layer_caffe_name(i);
            fout << ", bias_term=True";
            fout << ");\n";
        }
        else if (i->detail_name == "affine_fc")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.Scale(n." << find_input_layer_caffe_name(i);
            fout << ", bias_term=True";
            fout << ");\n";
        }
        else if (i->detail_name == "add_prev")
        {
            auto in_shape1 = find_input_layer(i).output_tensor_shape;
            auto in_shape2 = find_layer(i, i->attribute("tag")).output_tensor_shape;
            if (in_shape1 != in_shape2)
            {
                // If only the number of channels differs then we will use a dummy layer
                // to pad with zeros.  But otherwise we will throw an error.
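                // For example, assuming shapes (1,64,28,28) and (1,32,28,28): we emit a
                // DummyData layer of shape (1,32,28,28) full of zeros, Concat it onto
                // the 32-channel tensor to bring it up to 64 channels, and then
                // Eltwise-SUM the two resulting 64-channel tensors.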
                if (in_shape1(0) == in_shape2(0) &&
                    in_shape1(2) == in_shape2(2) &&
                    in_shape1(3) == in_shape2(3))
                {
                    fout << "    n." << i->caffe_layer_name() << "_zeropad = L.DummyData(num=" << in_shape1(0);
                    fout << ", channels=" << std::abs(in_shape1(1) - in_shape2(1));
                    fout << ", height=" << in_shape1(2);
                    fout << ", width=" << in_shape1(3);
                    fout << ");\n";

                    string smaller_layer = find_input_layer_caffe_name(i);
                    string bigger_layer  = find_layer_caffe_name(i, i->attribute("tag"));
                    if (in_shape1(1) > in_shape2(1))
                        swap(smaller_layer, bigger_layer);

                    fout << "    n." << i->caffe_layer_name() << "_concat = L.Concat(n." << smaller_layer;
                    fout << ", n." << i->caffe_layer_name() << "_zeropad";
                    fout << ");\n";

                    fout << "    n." << i->caffe_layer_name() << " = L.Eltwise(n." << i->caffe_layer_name() << "_concat";
                    fout << ", n." << bigger_layer;
                    fout << ", operation=P.Eltwise.SUM";
                    fout << ");\n";
                }
                else
                {
                    std::ostringstream sout;
                    sout << "The dlib network contained an add_prev layer (layer idx " << i->idx << ") that adds two previous ";
                    sout << "layers with different output tensor dimensions.  Caffe's equivalent layer, Eltwise, doesn't support ";
                    sout << "adding layers together with different dimensions.  In the special case where the only difference is ";
                    sout << "in the number of channels, this converter program will add a dummy layer that outputs a tensor full of zeros ";
                    sout << "and concat it appropriately so this will work.  However, the network you are converting has tensor dimensions ";
                    sout << "differing in values other than the number of channels.  In particular, here are the two tensor shapes (batch size, channels, rows, cols): ";
                    std::ostringstream sout2;
                    sout2 << wrap_string(sout.str()) << endl;
                    sout2 << trans(in_shape1);
                    sout2 << trans(in_shape2);
                    throw dlib::error(sout2.str());
                }
            }
            else
            {
                fout << "    n." << i->caffe_layer_name() << " = L.Eltwise(n." << find_input_layer_caffe_name(i);
                fout << ", n." << find_layer_caffe_name(i, i->attribute("tag"));
                fout << ", operation=P.Eltwise.SUM";
                fout << ");\n";
            }
        }
        else
        {
            throw dlib::error("No known transformation from dlib's " + i->detail_name + " layer to caffe.");
        }
    }

    fout << "    return n.to_proto();\n\n" << endl;

    // -----------------------------------------------------------------------------------
    //  The next block of code outputs python code that populates all the filter weights.
    // -----------------------------------------------------------------------------------

    fout << "def set_network_weights(net):\n";
    fout << "    # populate network parameters\n";
2017-05-07 03:02:38 +08:00
// iterate the layers starting with the input layer
for ( auto i = layers . rbegin ( ) ; i ! = layers . rend ( ) ; + + i )
{
// skip input and loss layers
if ( i - > type = = " loss " | | i - > type = = " input " )
continue ;
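        // dlib serializes a con layer's parameters as one long vector: all the filter
        // weights first, followed by num_filters bias values.  The rowm()/range()
        // calls in the branch below split that vector back into those two parts.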
        if (i->detail_name == "con")
        {
            const long num_filters = i->attribute("num_filters");
            matrix<double> weights = trans(rowm(i->params, range(0, i->params.size()-num_filters-1)));
            matrix<double> biases  = trans(rowm(i->params, range(i->params.size()-num_filters, i->params.size()-1)));
            // main filter weights
            fout << "    p = "; print_as_np_array(fout, weights); fout << ";\n";
            fout << "    p.shape = net.params['" << i->caffe_layer_name() << "'][0].data.shape;\n";
            fout << "    net.params['" << i->caffe_layer_name() << "'][0].data[:] = p;\n";
            // biases
            fout << "    p = "; print_as_np_array(fout, biases); fout << ";\n";
            fout << "    p.shape = net.params['" << i->caffe_layer_name() << "'][1].data.shape;\n";
            fout << "    net.params['" << i->caffe_layer_name() << "'][1].data[:] = p;\n";
        }
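        // An fc layer's params form a (num_inputs+1) x num_outputs matrix whose last
        // row holds the biases.  Caffe's InnerProduct stores its weight blob as
        // num_outputs x num_inputs, hence the trans() below.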
        else if (i->detail_name == "fc")
        {
            matrix<double> weights = trans(rowm(i->params, range(0, i->params.nr()-2)));
            matrix<double> biases  = rowm(i->params, i->params.nr()-1);
            // main filter weights
            fout << "    p = "; print_as_np_array(fout, weights); fout << ";\n";
            fout << "    p.shape = net.params['" << i->caffe_layer_name() << "'][0].data.shape;\n";
            fout << "    net.params['" << i->caffe_layer_name() << "'][0].data[:] = p;\n";
            // biases
            fout << "    p = "; print_as_np_array(fout, biases); fout << ";\n";
            fout << "    p.shape = net.params['" << i->caffe_layer_name() << "'][1].data.shape;\n";
            fout << "    net.params['" << i->caffe_layer_name() << "'][1].data[:] = p;\n";
        }
        else if (i->detail_name == "fc_no_bias")
        {
            matrix<double> weights = trans(i->params);
            // main filter weights
            fout << "    p = "; print_as_np_array(fout, weights); fout << ";\n";
            fout << "    p.shape = net.params['" << i->caffe_layer_name() << "'][0].data.shape;\n";
            fout << "    net.params['" << i->caffe_layer_name() << "'][0].data[:] = p;\n";
        }
        else if (i->detail_name == "affine_con" || i->detail_name == "affine_fc")
        {
            const long dims = i->params.size()/2;
            matrix<double> gamma = trans(rowm(i->params, range(0, dims-1)));
            matrix<double> beta  = trans(rowm(i->params, range(dims, 2*dims-1)));
            // set gamma weights
            fout << "    p = "; print_as_np_array(fout, gamma); fout << ";\n";
            fout << "    p.shape = net.params['" << i->caffe_layer_name() << "'][0].data.shape;\n";
            fout << "    net.params['" << i->caffe_layer_name() << "'][0].data[:] = p;\n";
            // set beta weights
            fout << "    p = "; print_as_np_array(fout, beta); fout << ";\n";
            fout << "    p.shape = net.params['" << i->caffe_layer_name() << "'][1].data.shape;\n";
            fout << "    net.params['" << i->caffe_layer_name() << "'][1].data[:] = p;\n";
        }
        else if (i->detail_name == "prelu")
        {
            const double param = i->params(0);
            // set the single shared PReLU parameter
            fout << "    tmp = net.params['" << i->caffe_layer_name() << "'][0].data.view();\n";
            fout << "    tmp.shape = 1;\n";
            fout << "    tmp[0] = " << param << ";\n";
        }
    }
}
// ----------------------------------------------------------------------------------------
int main(int argc, char** argv) try
{
    if (argc != 6)
    {
        cout << "To use this program, give it an xml file generated by dlib::net_to_xml()" << endl;
        cout << "and then 4 numbers that indicate the input tensor size.  It will convert" << endl;
        cout << "the xml file into a python file that outputs a caffe model containing the dlib model." << endl;
        cout << "For example, you might run this program like this:" << endl;
        cout << "   ./dtoc lenet.xml 1 1 28 28" << endl;
        cout << "would convert the lenet.xml model into a caffe model with an input tensor of shape (1,1,28,28)" << endl;
        cout << "where the shape values are (num samples in batch, num channels, num rows, num columns)." << endl;
        return 0;
    }
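
    // dlib's global string_assign object, sa, converts the command line strings into
    // longs and throws if they aren't valid numbers.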
    const long N  = sa = argv[2];
    const long K  = sa = argv[3];
    const long NR = sa = argv[4];
    const long NC = sa = argv[5];
    convert_dlib_xml_to_caffe_python_code(argv[1], N, K, NR, NC);

    return 0;
}
catch (std::exception& e)
{
    cout << "\n\n*************** ERROR CONVERTING TO CAFFE ***************\n" << e.what() << endl;
    return 1;
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
class doc_handler : public document_handler
{
public:
    std::vector<layer> layers;
    bool seen_first_tag = false;

    layer next_layer;
    std::stack<string> current_tag;
    long tag_id = -1;

    virtual void start_document (
    )
    {
        layers.clear();
        seen_first_tag = false;
        tag_id = -1;
    }

    virtual void end_document (
    ) { }

    virtual void start_element (
        const unsigned long line_number,
        const std::string& name,
        const dlib::attribute_list& atts
    )
    {
        if (!seen_first_tag)
        {
            if (name != "net")
                throw dlib::error("The top level XML tag must be a 'net' tag.");
            seen_first_tag = true;
        }

        if (name == "layer")
        {
            next_layer = layer();
            if (atts["type"] == "skip")
            {
                // Don't make a new layer, just apply the skip id to the previous layer.
                if (layers.size() == 0)
                    throw dlib::error("A skip layer was found as the first layer, but the first layer should be an input layer.");

                layers.back().skip_id = sa = atts["id"];

                // We intentionally leave next_layer empty so the end_element() callback
                // doesn't add it as another layer when called.
            }
            else if (atts["type"] == "tag")
            {
                // Don't make a new layer, just remember the tag id so we can apply it
                // to the next layer.
                tag_id = sa = atts["id"];

                // We intentionally leave next_layer empty so the end_element() callback
                // doesn't add it as another layer when called.
            }
            else
            {
                next_layer.idx = sa = atts["idx"];
                next_layer.type = atts["type"];
                if (tag_id != -1)
                {
                    next_layer.tag_id = tag_id;
                    tag_id = -1;
                }
            }
        }
        else if (current_tag.size() != 0 && current_tag.top() == "layer")
        {
            next_layer.detail_name = name;
            // copy all the XML tag's attributes into the layer struct
            atts.reset();
            while (atts.move_next())
                next_layer.attributes[atts.element().key()] = sa = atts.element().value();
        }

        current_tag.push(name);
    }

    virtual void end_element (
        const unsigned long line_number,
        const std::string& name
    )
    {
        current_tag.pop();
        if (name == "layer" && next_layer.type.size() != 0)
            layers.push_back(next_layer);
    }

    virtual void characters (
        const std::string& data
    )
    {
        if (current_tag.size() == 0)
            return;

        if (comp_tags_with_params.count(current_tag.top()) != 0)
        {
            istringstream sin(data);
            sin >> next_layer.params;
        }
    }

    virtual void processing_instruction (
        const unsigned long line_number,
        const std::string& target,
        const std::string& data
    )
    {
    }
};
// ----------------------------------------------------------------------------------------

void compute_output_tensor_shapes (const matrix<long,4,1>& input_tensor_shape, std::vector<layer>& layers)
{
    DLIB_CASSERT(layers.back().type == "input");
    layers.back().output_tensor_shape = input_tensor_shape;
    for (auto i = ++layers.rbegin(); i != layers.rend(); ++i)
    {
        const auto input_shape = find_input_layer(i).output_tensor_shape;
        if (i->type == "comp")
        {
            if (i->detail_name == "fc" || i->detail_name == "fc_no_bias")
            {
                long num_outputs = i->attribute("num_outputs");
                i->output_tensor_shape = {input_shape(0), num_outputs, 1, 1};
            }
            else if (i->detail_name == "con")
            {
                long num_filters = i->attribute("num_filters");
                long filter_nc   = i->attribute("nc");
                long filter_nr   = i->attribute("nr");
                long stride_x    = i->attribute("stride_x");
                long stride_y    = i->attribute("stride_y");
                long padding_x   = i->attribute("padding_x");
                long padding_y   = i->attribute("padding_y");
                long nr = 1 + (input_shape(2) + 2*padding_y - filter_nr)/stride_y;
                long nc = 1 + (input_shape(3) + 2*padding_x - filter_nc)/stride_x;
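                // e.g. a 28x28 input run through a 5x5 filter with stride 1 and no
                // padding gives nr = nc = 1 + (28 + 0 - 5)/1 = 24, i.e. a 24x24 output.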
                i->output_tensor_shape = {input_shape(0), num_filters, nr, nc};
            }
            else if (i->detail_name == "max_pool" || i->detail_name == "avg_pool")
            {
                long filter_nc = i->attribute("nc");
                long filter_nr = i->attribute("nr");
                long stride_x  = i->attribute("stride_x");
                long stride_y  = i->attribute("stride_y");
                long padding_x = i->attribute("padding_x");
                long padding_y = i->attribute("padding_y");
                long nr = 1 + (input_shape(2) + 2*padding_y - filter_nr)/stride_y;
                long nc = 1 + (input_shape(3) + 2*padding_x - filter_nc)/stride_x;
                i->output_tensor_shape = {input_shape(0), input_shape(1), nr, nc};
            }
            else if (i->detail_name == "add_prev")
            {
                auto aux_shape = find_layer(i, i->attribute("tag")).output_tensor_shape;
                for (long j = 0; j < input_shape.size(); ++j)
                    i->output_tensor_shape(j) = std::max(input_shape(j), aux_shape(j));
            }
            else
            {
                i->output_tensor_shape = input_shape;
            }
        }
        else
        {
            i->output_tensor_shape = input_shape;
        }
    }
}
// ----------------------------------------------------------------------------------------

std::vector<layer> parse_dlib_xml (
    const matrix<long,4,1>& input_tensor_shape,
    const string& xml_filename
)
{
    doc_handler dh;
    parse_xml(xml_filename, dh);
    if (dh.layers.size() == 0)
        throw dlib::error("No layers found in XML file!");
    if (dh.layers.back().type != "input")
        throw dlib::error("The network in the XML file is missing an input layer!");

    compute_output_tensor_shapes(input_tensor_shape, dh.layers);

    return dh.layers;
}
// ----------------------------------------------------------------------------------------