mirror of https://github.com/davisking/dlib.git
synced 2024-11-01 10:14:53 +08:00

Made this object properly warm-startable

This commit is contained in:
parent 34a9e4f671
commit 52e35c31fb
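The commit replaces the exposed alpha vector in the warm-start train() overload with an opaque optimizer_state object that carries alpha, the weight vector w, the per-sample Q values, and the active index set, and that checks the trainer settings have not changed between calls. Below is a minimal usage sketch of the intended warm-start pattern; it is not part of the commit and assumes the usual dlib linear_kernel typedefs and the train(x, y, state) overload added in the hunks that follow:

    #include <dlib/svm.h>
    #include <vector>
    using namespace dlib;

    int main()
    {
        typedef matrix<double,0,1> sample_type;
        typedef linear_kernel<sample_type> kernel_type;

        std::vector<sample_type> samples;
        std::vector<double> labels;

        // an initial batch of +1/-1 labeled data
        sample_type samp(2);
        samp = 1, 2;   samples.push_back(samp); labels.push_back(+1);
        samp = -1, -2; samples.push_back(samp); labels.push_back(-1);

        svm_c_linear_dcd_trainer<kernel_type> trainer;
        svm_c_linear_dcd_trainer<kernel_type>::optimizer_state state;

        // First call solves from scratch and records alpha, w, Q, etc. in state.
        decision_function<kernel_type> df = trainer.train(samples, labels, state);

        // Later, more training data arrives.
        samp = 2, 1;   samples.push_back(samp); labels.push_back(+1);

        // Second call warm-starts from the previous solution instead of
        // re-optimizing from zero.
        df = trainer.train(samples, labels, state);
    }

Because state keeps the previous alphas and w, the second call only has to account for the newly appended samples rather than re-solving the whole problem from scratch.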
@@ -207,6 +207,128 @@ namespace dlib
            Cneg = C;
        }

+        class optimizer_state
+        {
+            friend class svm_c_linear_dcd_trainer;

+        public:
+            optimizer_state() : did_init(false) {}

+        private:

+            template <
+                typename in_sample_vector_type
+                >
+            void init(
+                const in_sample_vector_type& x,
+                bool have_bias_,
+                bool last_weight_1_
+            )
+            {
+                const long new_dims = max_index_plus_one(x);
+                long new_idx = 0;

+                if (did_init)
+                {
+                    DLIB_CASSERT(have_bias_ == have_bias &&
+                                 last_weight_1_ == last_weight_1, "");

+                    DLIB_CASSERT( new_dims >= dims,"");
+                    DLIB_CASSERT( x.size() >= static_cast<long>(alpha.size()),"");

+                    // make sure we amortize the cost of growing the alpha vector.
+                    if (alpha.capacity() < static_cast<unsigned long>(x.size()))
+                        alpha.reserve(x.size()*2);

+                    new_idx = alpha.size();

+                    // Make sure alpha has the same length as x. So pad with extra zeros if
+                    // necessary to make this happen.
+                    alpha.resize(x.size(),0);


+                    if (new_dims != dims)
+                    {
+                        // The only valid way the dimensions can be different here is if
+                        // you are using a sparse vector type. This is because we might
+                        // have had training samples which just happened to not include all
+                        // the features previously. Therefore, max_index_plus_one() would
+                        // have given too low of a result. But for dense vectors it is
+                        // definitely a user error if the dimensions don't match.

+                        DLIB_CASSERT(is_matrix<sample_type>::value == false, "");

+                        // extend w by the right number of elements
+                        if (have_bias)
+                        {
+                            // Splice some zeros into the w vector so it will have the
+                            // right length. Here we are being careful to move the bias
+                            // weight to the end of the resulting vector.
+                            w = join_cols(join_cols(
+                                colm(w,0,dims),
+                                zeros_matrix<scalar_type>(1, new_dims-dims)),
+                                uniform_matrix<scalar_type>(1,1,w(dims))
+                                );
+                        }
+                        else
+                        {
+                            // Just concatenate the right number of zeros.
+                            w = join_cols(w, zeros_matrix<scalar_type>(1, new_dims-dims));
+                        }
+                        dims = new_dims;
+                    }

+                }
+                else
+                {
+                    did_init = true;
+                    have_bias = have_bias_;
+                    last_weight_1 = last_weight_1_;
+                    dims = new_dims;

+                    alpha.resize(x.size());

+                    index.reserve(x.size());
+                    Q.reserve(x.size());

+                    if (have_bias)
+                        w.set_size(dims+1);
+                    else
+                        w.set_size(dims);

+                    w = 0;
+                }

+                for (long i = new_idx; i < x.size(); ++i)
+                {
+                    Q.push_back(dlib::dot(x(i),x(i)));

+                    if (have_bias)
+                    {
+                        index.push_back(i);
+                        Q.back() += 1;
+                    }
+                    else if (Q.back() != 0)
+                    {
+                        index.push_back(i);
+                    }
+                }

+                if (last_weight_1)
+                    w(dims-1) = 1;
+            }

+            bool did_init;
+            bool have_bias;
+            bool last_weight_1;
+            std::vector<scalar_type> alpha;
+            scalar_vector_type w;
+            std::vector<scalar_type> Q;
+            std::vector<long> index;
+            long dims;
+            dlib::rand rnd;
+        };

        template <
            typename in_sample_vector_type,
            typename in_scalar_vector_type
@@ -216,9 +338,8 @@ namespace dlib
            const in_scalar_vector_type& y
        ) const
        {
-            scalar_vector_type alpha(x.size());
-            alpha = 0;
-            return do_train(vector_to_matrix(x), vector_to_matrix(y), alpha);
+            optimizer_state state;
+            return do_train(vector_to_matrix(x), vector_to_matrix(y), state);
        }

        template <
@@ -228,24 +349,10 @@ namespace dlib
        const decision_function<kernel_type> train (
            const in_sample_vector_type& x,
            const in_scalar_vector_type& y,
-            scalar_vector_type& alpha
+            optimizer_state& state
        ) const
        {
-            DLIB_CASSERT (static_cast<long>(x.size()) >= alpha.size(),
-                "\t decision_function svm_c_linear_dcd_trainer::train(x,y,alpha)"
-                << "\n\t invalid inputs were given to this function"
-                << "\n\t x.size(): " << x.size()
-                << "\n\t alpha.size(): " << alpha.size()
-                );

-            if (static_cast<long>(x.size()) > alpha.size())
-            {
-                // Make sure alpha has the same length as x. So pad with extra zeros if
-                // necessary to make this happen.
-                alpha = join_cols(alpha, zeros_matrix<scalar_type>(1,x.size()-alpha.size()));
-            }

-            return do_train(vector_to_matrix(x), vector_to_matrix(y), alpha);
+            return do_train(vector_to_matrix(x), vector_to_matrix(y), state);
        }

    private:
@@ -259,12 +366,9 @@ namespace dlib
        const decision_function<kernel_type> do_train (
            const in_sample_vector_type& x,
            const in_scalar_vector_type& y,
-            scalar_vector_type& alpha
+            optimizer_state& state
        ) const
        {
-            // TODO, requires labels are all +1 or -1. But we don't have to see both
-            // types.

            // make sure requires clause is not broken
            DLIB_ASSERT(is_learning_problem(x,y) == true,
                "\t decision_function svm_c_linear_dcd_trainer::train(x,y)"
@@ -273,50 +377,25 @@ namespace dlib
                << "\n\t y.size(): " << y.size()
                << "\n\t is_learning_problem(x,y): " << is_learning_problem(x,y)
                );

-            const long dims = max_index_plus_one(x);

-            // TODO, return an opaque object instead of alpha. Also, the object
-            // needs to verify that the trainer has the same settings from one
-            // call to the next.

-            std::vector<long> index(x.size());
-            scalar_vector_type Q(x.size());

-            scalar_vector_type w;
-            if (have_bias)
-                w.set_size(dims+1);
-            else
-                w.set_size(dims);

-            w = 0;
-            if (last_weight_1)
-                w(dims-1) = 1;

-            long ii = 0;
-            for (long i = 0; i < alpha.size(); ++i)
+#if ENABLE_ASSERTS
+            for (long i = 0; i < x.size(); ++i)
            {
-                index[ii] = i;
-                Q(ii) = dlib::dot(x(i),x(i));

-                if (have_bias)
-                {
-                    Q(ii) += 1;
-                    ++ii;
-                }
-                else if (Q(ii) != 0)
-                {
-                    ++ii;
-                }
+                DLIB_ASSERT(y(i) == +1 || y(i) == -1,
+                    "\t decision_function svm_c_linear_dcd_trainer::train(x,y)"
+                    << "\n\t invalid inputs were given to this function"
+                    << "\n\t y("<<i<<"): " << y(i)
+                    );
            }
+#endif

-            // What we are doing here is ignoring x elements that have 0 norm. We
-            // Do this because they are impossible to classify and this also avoids
-            // a division by zero problem later on in the code.
-            const long max_possible_active = ii;
+            state.init(x,have_bias,last_weight_1);

-            dlib::rand rnd;
-            long active_size = max_possible_active;
+            std::vector<scalar_type>& alpha = state.alpha;
+            scalar_vector_type& w = state.w;
+            std::vector<long>& index = state.index;
+            const long dims = state.dims;

+            unsigned long active_size = index.size();

            scalar_type PG_max_prev = std::numeric_limits<scalar_type>::infinity();
            scalar_type PG_min_prev = -std::numeric_limits<scalar_type>::infinity();
@@ -328,15 +407,15 @@ namespace dlib
                scalar_type PG_min = std::numeric_limits<scalar_type>::infinity();

                // randomly shuffle the indices
-                for (long i = 0; i < active_size; ++i)
+                for (unsigned long i = 0; i < active_size; ++i)
                {
                    // pick a random index >= i
-                    const long j = i + rnd.get_random_32bit_number()%(active_size-i);
+                    const long j = i + state.rnd.get_random_32bit_number()%(active_size-i);
                    std::swap(index[i], index[j]);
                }

                // for all the active training samples
-                for (long ii = 0; ii < active_size; ++ii)
+                for (unsigned long ii = 0; ii < active_size; ++ii)
                {
                    const long i = index[ii];

@@ -344,7 +423,7 @@ namespace dlib
                    const scalar_type C = (y(i) > 0) ? Cpos : Cneg;

                    scalar_type PG = 0;
-                    if (alpha(i) == 0)
+                    if (alpha[i] == 0)
                    {
                        if (G > PG_max_prev)
                        {
@@ -358,7 +437,7 @@ namespace dlib
                        if (G < 0)
                            PG = G;
                    }
-                    else if (alpha(i) == C)
+                    else if (alpha[i] == C)
                    {
                        if (G < PG_min_prev)
                        {
@@ -385,9 +464,9 @@ namespace dlib
                    // if PG != 0
                    if (std::abs(PG) > 1e-12)
                    {
-                        const scalar_type alpha_old = alpha(i);
-                        alpha(i) = std::min(std::max(alpha(i) - G/Q(i), (scalar_type)0.0), C);
-                        const scalar_type delta = (alpha(i)-alpha_old)*y(i);
+                        const scalar_type alpha_old = alpha[i];
+                        alpha[i] = std::min(std::max(alpha[i] - G/state.Q[i], (scalar_type)0.0), C);
+                        const scalar_type delta = (alpha[i]-alpha_old)*y(i);
                        add_to(w, x(i), delta);
                        if (have_bias)
                            w(w.size()-1) -= delta;
@@ -411,12 +490,12 @@ namespace dlib
                    {
                        // stop if we are within eps tolerance and the last iteration
                        // was over all the samples
-                        if (active_size == max_possible_active)
+                        if (active_size == index.size())
                            break;

                        // Turn of shrinking on the next iteration. We will stop if the
                        // tolerance is still <= eps when shrinking is off.
-                        active_size = max_possible_active;
+                        active_size = index.size();
                        PG_max_prev = std::numeric_limits<scalar_type>::infinity();
                        PG_min_prev = -std::numeric_limits<scalar_type>::infinity();
                    }
@@ -429,7 +508,11 @@ namespace dlib
                        if (PG_min_prev >= 0)
                            PG_min_prev = -std::numeric_limits<scalar_type>::infinity();
                    }
                }

            } // end of main optimization loop


            // put the solution into a decision function and then return it
            decision_function<kernel_type> df;
@@ -439,10 +522,9 @@ namespace dlib
            df.b = 0;

            df.basis_vectors.set_size(1);
-            // Copy the plane normal into the output basis vector. The output vector might be a
-            // sparse vector container so we need to use this special kind of copy to handle that case.
-            // As an aside, the reason for using max_index_plus_one() and not just w.size()-1 is because
-            // doing it this way avoids an inane warning from gcc that can occur in some cases.
+            // Copy the plane normal into the output basis vector. The output vector might
+            // be a sparse vector container so we need to use this special kind of copy to
+            // handle that case.
            assign(df.basis_vectors(0), colm(w, 0, dims));
            df.alpha.set_size(1);
            df.alpha(0) = 1;