Made this object properly warm-startable

Davis King 2012-12-16 22:56:30 -05:00
parent 34a9e4f671
commit 52e35c31fb


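As context for the diff, here is a minimal warm-start sketch. It is illustrative only and not part of the commit: it uses the train(x, y, state) overload introduced below, set_c() as dlib's usual C parameter setter, and made-up sample values.

    #include <dlib/svm.h>
    #include <vector>

    int main()
    {
        typedef dlib::matrix<double,2,1> sample_type;
        typedef dlib::linear_kernel<sample_type> kernel_type;

        dlib::svm_c_linear_dcd_trainer<kernel_type> trainer;
        trainer.set_c(10);

        std::vector<sample_type> x;
        std::vector<double> y;
        sample_type samp;
        samp = 1, 1;    x.push_back(samp);  y.push_back(+1);
        samp = -1, -1;  x.push_back(samp);  y.push_back(-1);

        // The state object carries alpha, w, Q, and the active index set between calls.
        dlib::svm_c_linear_dcd_trainer<kernel_type>::optimizer_state state;
        dlib::decision_function<kernel_type> df = trainer.train(x, y, state);

        // Warm start: append new samples and retrain, reusing the previous solution
        // instead of restarting the dual coordinate descent from alpha == 0.
        samp = 2, 0.5;  x.push_back(samp);  y.push_back(+1);
        df = trainer.train(x, y, state);
    }

Note that optimizer_state::init() (added below) asserts that have_bias and last_weight_1 match what the state was first created with, so the trainer's settings must stay consistent across warm-started calls.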
@@ -207,6 +207,128 @@ namespace dlib
Cneg = C;
}
class optimizer_state
{
friend class svm_c_linear_dcd_trainer;
public:
optimizer_state() : did_init(false) {}
private:
template <
typename in_sample_vector_type
>
void init(
const in_sample_vector_type& x,
bool have_bias_,
bool last_weight_1_
)
{
const long new_dims = max_index_plus_one(x);
long new_idx = 0;
if (did_init)
{
DLIB_CASSERT(have_bias_ == have_bias &&
last_weight_1_ == last_weight_1, "");
DLIB_CASSERT( new_dims >= dims,"");
DLIB_CASSERT( x.size() >= static_cast<long>(alpha.size()),"");
// make sure we amortize the cost of growing the alpha vector.
if (alpha.capacity() < static_cast<unsigned long>(x.size()))
alpha.reserve(x.size()*2);
new_idx = alpha.size();
// Make sure alpha has the same length as x. So pad with extra zeros if
// necessary to make this happen.
alpha.resize(x.size(),0);
if (new_dims != dims)
{
// The only valid way the dimensions can be different here is if
// you are using a sparse vector type. This is because we might
// have had training samples which just happened to not include all
// the features previously. Therefore, max_index_plus_one() would
// have given too low of a result. But for dense vectors it is
// definitely a user error if the dimensions don't match.
DLIB_CASSERT(is_matrix<sample_type>::value == false, "");
// extend w by the right number of elements
if (have_bias)
{
// Splice some zeros into the w vector so it will have the
// right length. Here we are being careful to move the bias
// weight to the end of the resulting vector.
w = join_cols(join_cols(
colm(w,0,dims),
zeros_matrix<scalar_type>(1, new_dims-dims)),
uniform_matrix<scalar_type>(1,1,w(dims))
);
}
else
{
// Just concatenate the right number of zeros.
w = join_cols(w, zeros_matrix<scalar_type>(1, new_dims-dims));
}
dims = new_dims;
}
}
else
{
did_init = true;
have_bias = have_bias_;
last_weight_1 = last_weight_1_;
dims = new_dims;
alpha.resize(x.size());
index.reserve(x.size());
Q.reserve(x.size());
if (have_bias)
w.set_size(dims+1);
else
w.set_size(dims);
w = 0;
}
for (long i = new_idx; i < x.size(); ++i)
{
Q.push_back(dlib::dot(x(i),x(i)));
if (have_bias)
{
index.push_back(i);
Q.back() += 1;
}
else if (Q.back() != 0)
{
index.push_back(i);
}
}
if (last_weight_1)
w(dims-1) = 1;
}
bool did_init;
bool have_bias;
bool last_weight_1;
std::vector<scalar_type> alpha;
scalar_vector_type w;
std::vector<scalar_type> Q;
std::vector<long> index;
long dims;
dlib::rand rnd;
};
template <
typename in_sample_vector_type,
typename in_scalar_vector_type
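As the comments in init() above explain, with sparse samples max_index_plus_one() can report a larger dimensionality once later batches touch features that earlier batches never used, which is why init() pads w with zeros (keeping the bias weight at the end). A small stand-alone illustration of that growth, assuming dlib's documented std::map<unsigned long,double> sparse sample type and made-up values:

    #include <dlib/svm.h>
    #include <map>
    #include <vector>
    #include <iostream>

    int main()
    {
        typedef std::map<unsigned long, double> sparse_sample;
        std::vector<sparse_sample> x;

        sparse_sample s1;  s1[0] = 1.0;  s1[3] = 2.0;
        x.push_back(s1);
        // Feature 7 has not appeared yet, so the reported dimensionality is 4.
        std::cout << dlib::max_index_plus_one(dlib::vector_to_matrix(x)) << std::endl;

        sparse_sample s2;  s2[7] = 1.5;
        x.push_back(s2);
        // Now it is 8, so a warm-started init() must extend w accordingly.
        std::cout << dlib::max_index_plus_one(dlib::vector_to_matrix(x)) << std::endl;
    }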
@@ -216,9 +338,8 @@ namespace dlib
const in_scalar_vector_type& y
) const
{
scalar_vector_type alpha(x.size());
alpha = 0;
return do_train(vector_to_matrix(x), vector_to_matrix(y), alpha);
optimizer_state state;
return do_train(vector_to_matrix(x), vector_to_matrix(y), state);
}
template <
@@ -228,24 +349,10 @@ namespace dlib
const decision_function<kernel_type> train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y,
scalar_vector_type& alpha
optimizer_state& state
) const
{
DLIB_CASSERT (static_cast<long>(x.size()) >= alpha.size(),
"\t decision_function svm_c_linear_dcd_trainer::train(x,y,alpha)"
<< "\n\t invalid inputs were given to this function"
<< "\n\t x.size(): " << x.size()
<< "\n\t alpha.size(): " << alpha.size()
);
if (static_cast<long>(x.size()) > alpha.size())
{
// Make sure alpha has the same length as x. So pad with extra zeros if
// necessary to make this happen.
alpha = join_cols(alpha, zeros_matrix<scalar_type>(1,x.size()-alpha.size()));
}
return do_train(vector_to_matrix(x), vector_to_matrix(y), alpha);
return do_train(vector_to_matrix(x), vector_to_matrix(y), state);
}
private:
@@ -259,12 +366,9 @@ namespace dlib
const decision_function<kernel_type> do_train (
const in_sample_vector_type& x,
const in_scalar_vector_type& y,
scalar_vector_type& alpha
optimizer_state& state
) const
{
// TODO, requires labels are all +1 or -1. But we don't have to see both
// types.
// make sure requires clause is not broken
DLIB_ASSERT(is_learning_problem(x,y) == true,
"\t decision_function svm_c_linear_dcd_trainer::train(x,y)"
@@ -273,50 +377,25 @@ namespace dlib
<< "\n\t y.size(): " << y.size()
<< "\n\t is_learning_problem(x,y): " << is_learning_problem(x,y)
);
const long dims = max_index_plus_one(x);
// TODO, return an opaque object instead of alpha. Also, the object
// needs to verify that the trainer has the same settings from one
// call to the next.
std::vector<long> index(x.size());
scalar_vector_type Q(x.size());
scalar_vector_type w;
if (have_bias)
w.set_size(dims+1);
else
w.set_size(dims);
w = 0;
if (last_weight_1)
w(dims-1) = 1;
long ii = 0;
for (long i = 0; i < alpha.size(); ++i)
#ifdef ENABLE_ASSERTS
for (long i = 0; i < x.size(); ++i)
{
index[ii] = i;
Q(ii) = dlib::dot(x(i),x(i));
if (have_bias)
{
Q(ii) += 1;
++ii;
}
else if (Q(ii) != 0)
{
++ii;
}
DLIB_ASSERT(y(i) == +1 || y(i) == -1,
"\t decision_function svm_c_linear_dcd_trainer::train(x,y)"
<< "\n\t invalid inputs were given to this function"
<< "\n\t y("<<i<<"): " << y(i)
);
}
#endif
// What we are doing here is ignoring x elements that have 0 norm. We
// do this because they are impossible to classify and this also avoids
// a division by zero problem later on in the code.
const long max_possible_active = ii;
state.init(x,have_bias,last_weight_1);
dlib::rand rnd;
long active_size = max_possible_active;
std::vector<scalar_type>& alpha = state.alpha;
scalar_vector_type& w = state.w;
std::vector<long>& index = state.index;
const long dims = state.dims;
unsigned long active_size = index.size();
scalar_type PG_max_prev = std::numeric_limits<scalar_type>::infinity();
scalar_type PG_min_prev = -std::numeric_limits<scalar_type>::infinity();
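The zero-norm screening described in the comment above now lives inside optimizer_state::init(): a sample with dot(x, x) == 0 gets no entry in index unless the bias term gives it a nonzero Q, so it can neither move w nor cause a divide by zero in the G/Q step. A tiny stand-alone check, with made-up values:

    #include <dlib/svm.h>
    #include <iostream>

    int main()
    {
        dlib::matrix<double,3,1> z, s;
        z = 0, 0, 0;   // zero-norm sample: dot(z, z) == 0
        s = 1, 0, 2;   // ordinary sample:  dot(s, s) == 5
        std::cout << dlib::dot(z, z) << " " << dlib::dot(s, s) << std::endl;
    }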
@@ -328,15 +407,15 @@ namespace dlib
scalar_type PG_min = std::numeric_limits<scalar_type>::infinity();
// randomly shuffle the indices
for (long i = 0; i < active_size; ++i)
for (unsigned long i = 0; i < active_size; ++i)
{
// pick a random index >= i
const long j = i + rnd.get_random_32bit_number()%(active_size-i);
const long j = i + state.rnd.get_random_32bit_number()%(active_size-i);
std::swap(index[i], index[j]);
}
// for all the active training samples
for (long ii = 0; ii < active_size; ++ii)
for (unsigned long ii = 0; ii < active_size; ++ii)
{
const long i = index[ii];
@@ -344,7 +423,7 @@ namespace dlib
const scalar_type C = (y(i) > 0) ? Cpos : Cneg;
scalar_type PG = 0;
if (alpha(i) == 0)
if (alpha[i] == 0)
{
if (G > PG_max_prev)
{
@@ -358,7 +437,7 @@ namespace dlib
if (G < 0)
PG = G;
}
else if (alpha(i) == C)
else if (alpha[i] == C)
{
if (G < PG_min_prev)
{
@@ -385,9 +464,9 @@ namespace dlib
// if PG != 0
if (std::abs(PG) > 1e-12)
{
const scalar_type alpha_old = alpha(i);
alpha(i) = std::min(std::max(alpha(i) - G/Q(i), (scalar_type)0.0), C);
const scalar_type delta = (alpha(i)-alpha_old)*y(i);
const scalar_type alpha_old = alpha[i];
alpha[i] = std::min(std::max(alpha[i] - G/state.Q[i], (scalar_type)0.0), C);
const scalar_type delta = (alpha[i]-alpha_old)*y(i);
add_to(w, x(i), delta);
if (have_bias)
w(w.size()-1) -= delta;
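For reference (standard background, not something added by this commit): the clamped step above is the usual dual coordinate descent update for the C-SVM dual,

    alpha[i]  <-  min( max( alpha[i] - G/Q[i], 0 ), C )

where G is the gradient of the dual objective with respect to alpha[i] and Q[i] = dot(x(i), x(i)), plus 1 when the bias term is folded into w, exactly as optimizer_state::init() computes it above. The 1e-12 test merely skips coordinates whose projected gradient PG is numerically zero.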
@@ -411,12 +490,12 @@ namespace dlib
{
// stop if we are within eps tolerance and the last iteration
// was over all the samples
if (active_size == max_possible_active)
if (active_size == index.size())
break;
// Turn off shrinking on the next iteration. We will stop if the
// tolerance is still <= eps when shrinking is off.
active_size = max_possible_active;
active_size = index.size();
PG_max_prev = std::numeric_limits<scalar_type>::infinity();
PG_min_prev = -std::numeric_limits<scalar_type>::infinity();
}
@@ -429,7 +508,11 @@ namespace dlib
if (PG_min_prev >= 0)
PG_min_prev = -std::numeric_limits<scalar_type>::infinity();
}
}
} // end of main optimization loop
// put the solution into a decision function and then return it
decision_function<kernel_type> df;
@@ -439,10 +522,9 @@ namespace dlib
df.b = 0;
df.basis_vectors.set_size(1);
// Copy the plane normal into the output basis vector. The output vector might be a
// sparse vector container so we need to use this special kind of copy to handle that case.
// As an aside, the reason for using max_index_plus_one() and not just w.size()-1 is because
// doing it this way avoids an inane warning from gcc that can occur in some cases.
// Copy the plane normal into the output basis vector. The output vector might
// be a sparse vector container so we need to use this special kind of copy to
// handle that case.
assign(df.basis_vectors(0), colm(w, 0, dims));
df.alpha.set_size(1);
df.alpha(0) = 1;