Made this object properly warm-startable
This commit is contained in:
parent
34a9e4f671
commit
52e35c31fb
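"Warm-startable" here means a caller can keep the new optimizer_state object alive between calls to train(): the dual variables alpha, the weight vector w, the cached per-sample diagonal terms Q, and the active index set are carried over, so retraining after appending samples resumes from the previous solution instead of restarting from alpha == 0. A minimal usage sketch of the API this commit introduces (the random sample data and the 5-dimensional samples are purely illustrative):

    #include <dlib/svm.h>
    #include <vector>

    using namespace dlib;

    int main()
    {
        typedef matrix<double,0,1> sample_type;
        typedef linear_kernel<sample_type> kernel_type;

        std::vector<sample_type> samples;
        std::vector<double> labels;
        samples.push_back(randm(5,1));  labels.push_back(+1);
        samples.push_back(-randm(5,1)); labels.push_back(-1);

        svm_c_linear_dcd_trainer<kernel_type> trainer;

        // The state object introduced by this commit; it carries alpha, w, Q,
        // and the active index set from one call to the next.
        svm_c_linear_dcd_trainer<kernel_type>::optimizer_state state;

        decision_function<kernel_type> df = trainer.train(samples, labels, state);

        // Append data and retrain: the solver warm-starts from the cached state.
        samples.push_back(randm(5,1)); labels.push_back(+1);
        df = trainer.train(samples, labels, state);
        return 0;
    }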
@@ -207,6 +207,128 @@ namespace dlib
             Cneg = C;
         }

+        class optimizer_state
+        {
+            friend class svm_c_linear_dcd_trainer;
+
+        public:
+            optimizer_state() : did_init(false) {}
+
+        private:
+
+            template <
+                typename in_sample_vector_type
+                >
+            void init(
+                const in_sample_vector_type& x,
+                bool have_bias_,
+                bool last_weight_1_
+            )
+            {
+                const long new_dims = max_index_plus_one(x);
+                long new_idx = 0;
+
+                if (did_init)
+                {
+                    DLIB_CASSERT(have_bias_ == have_bias &&
+                                 last_weight_1_ == last_weight_1, "");
+
+                    DLIB_CASSERT( new_dims >= dims,"");
+                    DLIB_CASSERT( x.size() >= static_cast<long>(alpha.size()),"");
+
+                    // make sure we amortize the cost of growing the alpha vector.
+                    if (alpha.capacity() < static_cast<unsigned long>(x.size()))
+                        alpha.reserve(x.size()*2);
+
+                    new_idx = alpha.size();
+
+                    // Make sure alpha has the same length as x. So pad with extra zeros if
+                    // necessary to make this happen.
+                    alpha.resize(x.size(),0);
+
+                    if (new_dims != dims)
+                    {
+                        // The only valid way the dimensions can be different here is if
+                        // you are using a sparse vector type. This is because we might
+                        // have had training samples which just happened to not include all
+                        // the features previously. Therefore, max_index_plus_one() would
+                        // have given too low of a result. But for dense vectors it is
+                        // definitely a user error if the dimensions don't match.
+
+                        DLIB_CASSERT(is_matrix<sample_type>::value == false, "");
+
+                        // extend w by the right number of elements
+                        if (have_bias)
+                        {
+                            // Splice some zeros into the w vector so it will have the
+                            // right length. Here we are being careful to move the bias
+                            // weight to the end of the resulting vector.
+                            w = join_cols(join_cols(
+                                colm(w,0,dims),
+                                zeros_matrix<scalar_type>(1, new_dims-dims)),
+                                uniform_matrix<scalar_type>(1,1,w(dims))
+                            );
+                        }
+                        else
+                        {
+                            // Just concatenate the right number of zeros.
+                            w = join_cols(w, zeros_matrix<scalar_type>(1, new_dims-dims));
+                        }
+                        dims = new_dims;
+                    }
+
+                }
+                else
+                {
+                    did_init = true;
+                    have_bias = have_bias_;
+                    last_weight_1 = last_weight_1_;
+                    dims = new_dims;
+
+                    alpha.resize(x.size());
+
+                    index.reserve(x.size());
+                    Q.reserve(x.size());
+
+                    if (have_bias)
+                        w.set_size(dims+1);
+                    else
+                        w.set_size(dims);
+
+                    w = 0;
+                }
+
+                for (long i = new_idx; i < x.size(); ++i)
+                {
+                    Q.push_back(dlib::dot(x(i),x(i)));
+
+                    if (have_bias)
+                    {
+                        index.push_back(i);
+                        Q.back() += 1;
+                    }
+                    else if (Q.back() != 0)
+                    {
+                        index.push_back(i);
+                    }
+                }
+
+                if (last_weight_1)
+                    w(dims-1) = 1;
+            }
+
+            bool did_init;
+            bool have_bias;
+            bool last_weight_1;
+            std::vector<scalar_type> alpha;
+            scalar_vector_type w;
+            std::vector<scalar_type> Q;
+            std::vector<long> index;
+            long dims;
+            dlib::rand rnd;
+        };
+
         template <
             typename in_sample_vector_type,
             typename in_scalar_vector_type
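Note the contract init() places on warm starts: the training set may only grow between calls (x.size() >= alpha.size()), the feature dimension may only grow and only for sparse sample types, and the bias/last-weight settings must match the first call exactly. A hedged sketch of that contract, assuming the trainer exposes these options through include_bias() and force_last_weight_to_1() setters (setter names inferred from the have_bias and last_weight_1 members; they do not appear in this diff):

    #include <dlib/svm.h>
    #include <vector>

    using namespace dlib;

    typedef matrix<double,0,1> sample_type;
    typedef linear_kernel<sample_type> kernel_type;
    typedef svm_c_linear_dcd_trainer<kernel_type> trainer_type;

    decision_function<kernel_type> retrain (
        trainer_type::optimizer_state& state,
        const std::vector<sample_type>& samples,  // must not shrink between calls
        const std::vector<double>& labels         // every label must be +1 or -1
    )
    {
        trainer_type trainer;
        trainer.include_bias(true);             // must equal have_bias of the first call
        trainer.force_last_weight_to_1(false);  // must equal last_weight_1 of the first call

        // If either option changed since 'state' was first used, the
        // DLIB_CASSERTs at the top of optimizer_state::init() fire.
        return trainer.train(samples, labels, state);
    }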
@@ -216,9 +338,8 @@ namespace dlib
             const in_scalar_vector_type& y
         ) const
         {
-            scalar_vector_type alpha(x.size());
-            alpha = 0;
-            return do_train(vector_to_matrix(x), vector_to_matrix(y), alpha);
+            optimizer_state state;
+            return do_train(vector_to_matrix(x), vector_to_matrix(y), state);
         }

         template <
@@ -228,24 +349,10 @@ namespace dlib
         const decision_function<kernel_type> train (
             const in_sample_vector_type& x,
             const in_scalar_vector_type& y,
-            scalar_vector_type& alpha
+            optimizer_state& state
         ) const
         {
-            DLIB_CASSERT (static_cast<long>(x.size()) >= alpha.size(),
-                "\t decision_function svm_c_linear_dcd_trainer::train(x,y,alpha)"
-                << "\n\t invalid inputs were given to this function"
-                << "\n\t x.size(): " << x.size()
-                << "\n\t alpha.size(): " << alpha.size()
-                );
-
-            if (static_cast<long>(x.size()) > alpha.size())
-            {
-                // Make sure alpha has the same length as x. So pad with extra zeros if
-                // necessary to make this happen.
-                alpha = join_cols(alpha, zeros_matrix<scalar_type>(1,x.size()-alpha.size()));
-            }
-
-            return do_train(vector_to_matrix(x), vector_to_matrix(y), alpha);
+            return do_train(vector_to_matrix(x), vector_to_matrix(y), state);
         }

         private:
@@ -259,12 +366,9 @@ namespace dlib
         const decision_function<kernel_type> do_train (
             const in_sample_vector_type& x,
             const in_scalar_vector_type& y,
-            scalar_vector_type& alpha
+            optimizer_state& state
         ) const
         {
-            // TODO, requires labels are all +1 or -1. But we don't have to see both
-            // types.
-
             // make sure requires clause is not broken
             DLIB_ASSERT(is_learning_problem(x,y) == true,
                 "\t decision_function svm_c_linear_dcd_trainer::train(x,y)"
@@ -273,50 +377,25 @@ namespace dlib
                 << "\n\t y.size(): " << y.size()
                 << "\n\t is_learning_problem(x,y): " << is_learning_problem(x,y)
                 );
-
-            const long dims = max_index_plus_one(x);
-
-            // TODO, return an opaque object instead of alpha. Also, the object
-            // needs to verify that the trainer has the same settings from one
-            // call to the next.
-
-            std::vector<long> index(x.size());
-            scalar_vector_type Q(x.size());
-
-            scalar_vector_type w;
-            if (have_bias)
-                w.set_size(dims+1);
-            else
-                w.set_size(dims);
-
-            w = 0;
-            if (last_weight_1)
-                w(dims-1) = 1;
-
-            long ii = 0;
-            for (long i = 0; i < alpha.size(); ++i)
-            {
-                index[ii] = i;
-                Q(ii) = dlib::dot(x(i),x(i));
-
-                if (have_bias)
-                {
-                    Q(ii) += 1;
-                    ++ii;
-                }
-                else if (Q(ii) != 0)
-                {
-                    ++ii;
-                }
-            }
-
-            // What we are doing here is ignoring x elements that have 0 norm. We
-            // Do this because they are impossible to classify and this also avoids
-            // a division by zero problem later on in the code.
-            const long max_possible_active = ii;
-
-            dlib::rand rnd;
-            long active_size = max_possible_active;
+#if ENABLE_ASSERTS
+            for (long i = 0; i < x.size(); ++i)
+            {
+                DLIB_ASSERT(y(i) == +1 || y(i) == -1,
+                    "\t decision_function svm_c_linear_dcd_trainer::train(x,y)"
+                    << "\n\t invalid inputs were given to this function"
+                    << "\n\t y("<<i<<"): " << y(i)
+                );
+            }
+#endif
+
+            state.init(x,have_bias,last_weight_1);
+
+            std::vector<scalar_type>& alpha = state.alpha;
+            scalar_vector_type& w = state.w;
+            std::vector<long>& index = state.index;
+            const long dims = state.dims;
+
+            unsigned long active_size = index.size();

             scalar_type PG_max_prev = std::numeric_limits<scalar_type>::infinity();
             scalar_type PG_min_prev = -std::numeric_limits<scalar_type>::infinity();
@@ -328,15 +407,15 @@ namespace dlib
                 scalar_type PG_min = std::numeric_limits<scalar_type>::infinity();

                 // randomly shuffle the indices
-                for (long i = 0; i < active_size; ++i)
+                for (unsigned long i = 0; i < active_size; ++i)
                 {
                     // pick a random index >= i
-                    const long j = i + rnd.get_random_32bit_number()%(active_size-i);
+                    const long j = i + state.rnd.get_random_32bit_number()%(active_size-i);
                     std::swap(index[i], index[j]);
                 }

                 // for all the active training samples
-                for (long ii = 0; ii < active_size; ++ii)
+                for (unsigned long ii = 0; ii < active_size; ++ii)
                 {
                     const long i = index[ii];

@@ -344,7 +423,7 @@ namespace dlib
                     const scalar_type C = (y(i) > 0) ? Cpos : Cneg;

                     scalar_type PG = 0;
-                    if (alpha(i) == 0)
+                    if (alpha[i] == 0)
                     {
                         if (G > PG_max_prev)
                         {
@@ -358,7 +437,7 @@ namespace dlib
                         if (G < 0)
                             PG = G;
                     }
-                    else if (alpha(i) == C)
+                    else if (alpha[i] == C)
                     {
                         if (G < PG_min_prev)
                         {
@@ -385,9 +464,9 @@ namespace dlib
                     // if PG != 0
                     if (std::abs(PG) > 1e-12)
                     {
-                        const scalar_type alpha_old = alpha(i);
-                        alpha(i) = std::min(std::max(alpha(i) - G/Q(i), (scalar_type)0.0), C);
-                        const scalar_type delta = (alpha(i)-alpha_old)*y(i);
+                        const scalar_type alpha_old = alpha[i];
+                        alpha[i] = std::min(std::max(alpha[i] - G/state.Q[i], (scalar_type)0.0), C);
+                        const scalar_type delta = (alpha[i]-alpha_old)*y(i);
                         add_to(w, x(i), delta);
                         if (have_bias)
                             w(w.size()-1) -= delta;
@@ -411,12 +490,12 @@ namespace dlib
                 {
                     // stop if we are within eps tolerance and the last iteration
                     // was over all the samples
-                    if (active_size == max_possible_active)
+                    if (active_size == index.size())
                         break;

                     // Turn of shrinking on the next iteration. We will stop if the
                     // tolerance is still <= eps when shrinking is off.
-                    active_size = max_possible_active;
+                    active_size = index.size();
                     PG_max_prev = std::numeric_limits<scalar_type>::infinity();
                     PG_min_prev = -std::numeric_limits<scalar_type>::infinity();
                 }
@@ -429,7 +508,11 @@ namespace dlib
                     if (PG_min_prev >= 0)
                         PG_min_prev = -std::numeric_limits<scalar_type>::infinity();
                 }
-            }
+            } // end of main optimization loop
+
+
+
+

             // put the solution into a decision function and then return it
             decision_function<kernel_type> df;
@@ -439,10 +522,9 @@ namespace dlib
             df.b = 0;

             df.basis_vectors.set_size(1);
-            // Copy the plane normal into the output basis vector. The output vector might be a
-            // sparse vector container so we need to use this special kind of copy to handle that case.
-            // As an aside, the reason for using max_index_plus_one() and not just w.size()-1 is because
-            // doing it this way avoids an inane warning from gcc that can occur in some cases.
+            // Copy the plane normal into the output basis vector. The output vector might
+            // be a sparse vector container so we need to use this special kind of copy to
+            // handle that case.
             assign(df.basis_vectors(0), colm(w, 0, dims));
             df.alpha.set_size(1);
             df.alpha(0) = 1;
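For completeness, the decision_function assembled above is used like any other dlib decision function; a short hedged sketch (the classify wrapper is illustrative, not part of this commit):

    #include <dlib/svm.h>

    using namespace dlib;

    typedef matrix<double,0,1> sample_type;
    typedef linear_kernel<sample_type> kernel_type;

    // df evaluates the learned linear function at s; the sign of the returned
    // score is the predicted class (> 0 => +1, otherwise -1).
    double classify (const decision_function<kernel_type>& df, const sample_type& s)
    {
        return df(s);
    }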