Added solve_qp_box_constrained_blockdiag()
parent 096ab3c847
commit daa2adbdcc
@@ -5,6 +5,8 @@

#include "optimization_solve_qp_using_smo_abstract.h"
#include "../matrix.h"
#include <map>
#include "../unordered_pair.h"

namespace dlib
{
@@ -548,6 +550,254 @@ namespace dlib
        return iter+1;
    }

// ----------------------------------------------------------------------------------------

    template <
        typename T, long NR, long NC, typename MM, typename L
        >
    unsigned long solve_qp_box_constrained_blockdiag (
        const std::vector<matrix<T,NR,NR,MM,L>>& Q_blocks,
        const std::vector<matrix<T,NR,NC,MM,L>>& bs,
        const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag,
        std::vector<matrix<T,NR,NC,MM,L>>& alphas,
        const std::vector<matrix<T,NR,NC,MM,L>>& lowers,
        const std::vector<matrix<T,NR,NC,MM,L>>& uppers,
        T eps,
        unsigned long max_iter
    )
    {
        // make sure requires clause is not broken
        DLIB_CASSERT(Q_blocks.size() > 0);
        DLIB_CASSERT(Q_blocks.size() == bs.size() &&
                     Q_blocks.size() == alphas.size() &&
                     Q_blocks.size() == lowers.size() &&
                     Q_blocks.size() == uppers.size(),
                     "Q_blocks.size(): "<< Q_blocks.size() << "\n" <<
                     "bs.size(): "<< bs.size() << "\n" <<
                     "alphas.size(): "<< alphas.size() << "\n" <<
                     "lowers.size(): "<< lowers.size() << "\n" <<
                     "uppers.size(): "<< uppers.size() << "\n"
                     );
        for (auto& Q : Q_blocks)
        {
            DLIB_CASSERT(Q.nr() == Q.nc(), "All the matrices in Q_blocks must be square.");
            DLIB_CASSERT(Q.nr() == Q_blocks[0].nr() && Q.nc() == Q_blocks[0].nc(), "All the matrices in Q_blocks must have the same dimensions.");
        }
#ifdef ENABLE_ASSERTS
        for (size_t i = 0; i < alphas.size(); ++i)
        {
            DLIB_CASSERT(is_col_vector(bs[i]) && bs[i].size() == Q_blocks[0].nr(),
                         "is_col_vector(bs["<<i<<"]): " << is_col_vector(bs[i]) << "\n" <<
                         "bs["<<i<<"].size(): " << bs[i].size() << "\n" <<
                         "Q_blocks[0].nr(): " << Q_blocks[0].nr());

            for (auto& Qoffdiag : Q_offdiag)
            {
                auto& Q_offdiag_element = Qoffdiag.second;
                long r = Qoffdiag.first.first;
                long c = Qoffdiag.first.second;
                DLIB_CASSERT(is_col_vector(Q_offdiag_element) && Q_offdiag_element.size() == Q_blocks[0].nr(),
                             "is_col_vector(Q_offdiag["<<r<<","<<c<<"]): " << is_col_vector(Q_offdiag_element) << "\n" <<
                             "Q_offdiag["<<r<<","<<c<<"].size(): " << Q_offdiag_element.size() << "\n" <<
                             "Q_blocks[0].nr(): " << Q_blocks[0].nr());
            }

            DLIB_CASSERT(is_col_vector(alphas[i]) && alphas[i].size() == Q_blocks[0].nr(),
                         "is_col_vector(alphas["<<i<<"]): " << is_col_vector(alphas[i]) << "\n" <<
                         "alphas["<<i<<"].size(): " << alphas[i].size() << "\n" <<
                         "Q_blocks[0].nr(): " << Q_blocks[0].nr());

            DLIB_CASSERT(is_col_vector(lowers[i]) && lowers[i].size() == Q_blocks[0].nr(),
                         "is_col_vector(lowers["<<i<<"]): " << is_col_vector(lowers[i]) << "\n" <<
                         "lowers["<<i<<"].size(): " << lowers[i].size() << "\n" <<
                         "Q_blocks[0].nr(): " << Q_blocks[0].nr());

            DLIB_CASSERT(is_col_vector(uppers[i]) && uppers[i].size() == Q_blocks[0].nr(),
                         "is_col_vector(uppers["<<i<<"]): " << is_col_vector(uppers[i]) << "\n" <<
                         "uppers["<<i<<"].size(): " << uppers[i].size() << "\n" <<
                         "Q_blocks[0].nr(): " << Q_blocks[0].nr());

            DLIB_CASSERT(0 <= min(alphas[i]-lowers[i]), "min(alphas["<<i<<"]-lowers["<<i<<"]): " << min(alphas[i]-lowers[i]));
            DLIB_CASSERT(0 <= max(uppers[i]-alphas[i]), "max(uppers["<<i<<"]-alphas["<<i<<"]): " << max(uppers[i]-alphas[i]));
        }
        DLIB_CASSERT(eps > 0 && max_iter > 0, "eps: " << eps << "\nmax_iter: "<< max_iter);
#endif // ENABLE_ASSERTS


        // Compute f'(alpha) (i.e. the gradient of f(alpha)) for the current alpha.
        std::vector<matrix<T,NR,NC,MM,L>> df; // = Q*alpha + b;
        auto compute_df = [&]()
        {
            df.resize(Q_blocks.size());
            for (size_t i = 0; i < df.size(); ++i)
                df[i] = Q_blocks[i]*alphas[i] + bs[i];
            // Don't forget to include the Q_offdiag terms in the computation.
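            // (Each Q_offdiag entry stores only the diagonal of its off-diagonal block, i.e.
            // the block is diagm(p.second), so multiplying that block by a vector reduces to
            // an elementwise product.)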
            for (auto& p : Q_offdiag)
            {
                long r = p.first.first;
                long c = p.first.second;
                df[r] += pointwise_multiply(p.second, alphas[c]);
                if (r != c)
                    df[c] += pointwise_multiply(p.second, alphas[r]);
            }
        };
        compute_df();

        std::vector<matrix<T,NR,NC,MM,L>> Q_diag, Q_ggd;
        std::vector<matrix<T,NR,NC,MM,L>> QQ; // = reciprocal_max(diag(Q));
        QQ.resize(Q_blocks.size());
        Q_diag.resize(Q_blocks.size());
        Q_ggd.resize(Q_blocks.size());

        // We need to get an upper bound on the Lipschitz constant for this QP. Since that
        // is just the max eigenvalue of Q we can do it using Gershgorin disks.
        //const T lipschitz_bound = max(diag(Q) + (sum_cols(abs(Q)) - abs(diag(Q))));
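        // (Gershgorin, spelled out: every eigenvalue of Q lies in a disk centered at Q(i,i)
        // with radius sum over j != i of |Q(i,j)|, so max_i [ Q(i,i) + sum_{j!=i} |Q(i,j)| ]
        // upper bounds the largest eigenvalue. The loops below accumulate exactly that per-row
        // quantity, block by block, without ever forming the full Q.)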
        for (size_t i = 0; i < QQ.size(); ++i)
        {
            auto f = Q_offdiag.find(make_unordered_pair(i,i));
            if (f != Q_offdiag.end())
                Q_diag[i] = diag(Q_blocks[i]) + f->second;
            else
                Q_diag[i] = diag(Q_blocks[i]);
            QQ[i] = reciprocal_max(Q_diag[i]);

            Q_ggd[i] = Q_diag[i] + (sum_cols(abs(Q_blocks[i]))-abs(diag(Q_blocks[i])));
        }
        for (auto& p : Q_offdiag)
        {
            long r = p.first.first;
            long c = p.first.second;
            if (r != c)
            {
                Q_ggd[r] += abs(p.second);
                Q_ggd[c] += abs(p.second);
            }
        }
        T lipschitz_bound = -std::numeric_limits<T>::infinity();
        for (auto& x : Q_ggd)
            lipschitz_bound = std::max(lipschitz_bound, max(x));


        const long num_variables = alphas.size()*alphas[0].size();

        // First we use a coordinate descent method to initialize alpha.
        double max_df = 0;
        for (long iter = 0; iter < num_variables*2; ++iter)
        {
            max_df = 0;
            long best_r = 0;
            size_t best_r2 = 0;
            // find the best alpha to optimize.
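            // (i.e. pick the coordinate with the largest gradient magnitude that is not
            // already pinned against its box constraint, since that coordinate violates its
            // optimality condition the most.)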
            for (size_t r2 = 0; r2 < alphas.size(); ++r2)
            {
                auto& alpha = alphas[r2];
                auto& df_ = df[r2];
                auto& lower = lowers[r2];
                auto& upper = uppers[r2];
                for (long r = 0; r < alpha.nr(); ++r)
                {
                    if (alpha(r) <= lower(r) && df_(r) > 0)
                        ;//alpha(r) = lower(r);
                    else if (alpha(r) >= upper(r) && df_(r) < 0)
                        ;//alpha(r) = upper(r);
                    else if (std::abs(df_(r)) > max_df)
                    {
                        best_r = r;
                        best_r2 = r2;
                        max_df = std::abs(df_(r));
                    }
                }
            }

            // now optimize alphas[best_r2](best_r)
            const long r = best_r;
            auto& alpha = alphas[best_r2];
            auto& lower = lowers[best_r2];
            auto& upper = uppers[best_r2];
            auto& df_ = df[best_r2];
            const T old_alpha = alpha(r);
            alpha(r) = -(df_(r)-Q_diag[best_r2](r)*alpha(r))*QQ[best_r2](r);
            if (alpha(r) < lower(r))
                alpha(r) = lower(r);
            else if (alpha(r) > upper(r))
                alpha(r) = upper(r);

            const T delta = old_alpha-alpha(r);

            // Now update the gradient. We will perform the equivalent of: df = Q*alpha + b;
            // except we only need to compute one column of the matrix multiply because
            // only one element of alpha changed.
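            // (Concretely: alpha(r) changed by -delta, so df changes by -delta times the
            // column of the full Q belonging to that variable. Within this block that is a
            // whole column of Q_blocks[best_r2]; for every coupled block the off-diagonal
            // block is diagonal, so only the r-th entry of that block's gradient moves.)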
            auto& Q = Q_blocks[best_r2];
            for (long k = 0; k < df_.nr(); ++k)
                df_(k) -= Q(r,k)*delta;
            for (size_t j = 0; j < Q_blocks.size(); ++j)
            {
                auto f = Q_offdiag.find(make_unordered_pair(best_r2, j));
                if (f != Q_offdiag.end())
                    df[j](r) -= f->second(r)*delta;
            }
        }



        std::vector<matrix<T,NR,NC,MM,L>> v(alphas), v_old(alphas.size());
        for (size_t i = 0; i < v_old.size(); ++i)
            v_old[i].set_size(alphas[i].size());
        double lambda = 0;
        unsigned long iter;
        // Now do the main iteration block of this solver. The coordinate descent method
        // we used above can improve the objective rapidly in the beginning. However,
        // Nesterov's method has more rapid convergence once it gets going, so that is what
        // we use for the main iteration.
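        // (Each pass below follows the standard Nesterov recurrence: the momentum term is
        // updated as lambda_{k+1} = (1 + sqrt(1 + 4*lambda_k^2))/2, a projected gradient step
        // with step size 1/lipschitz_bound produces v, and the new alpha mixes v with the
        // previous v using gamma = (1 - lambda_k)/lambda_{k+1}.)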
        for (iter = 0; iter < max_iter; ++iter)
        {
            const double next_lambda = (1 + std::sqrt(1+4*lambda*lambda))/2;
            const double gamma = (1-lambda)/next_lambda;
            lambda = next_lambda;

            v_old.swap(v);

            //df = Q*alpha + b;
            compute_df();

            // now take a projected gradient step using Nesterov's method.
            for (size_t j = 0; j < alphas.size(); ++j)
            {
                v[j] = clamp(alphas[j] - 1.0/lipschitz_bound * df[j], lowers[j], uppers[j]);
                alphas[j] = clamp((1-gamma)*v[j] + gamma*v_old[j], lowers[j], uppers[j]);
            }


            // check for convergence every 10 iterations
            if (iter%10 == 0)
            {
                max_df = 0;
                for (size_t r2 = 0; r2 < alphas.size(); ++r2)
                {
                    auto& alpha = alphas[r2];
                    auto& df_ = df[r2];
                    auto& lower = lowers[r2];
                    auto& upper = uppers[r2];
                    for (long r = 0; r < alpha.nr(); ++r)
                    {
                        if (alpha(r) <= lower(r) && df_(r) > 0)
                            ;//alpha(r) = lower(r);
                        else if (alpha(r) >= upper(r) && df_(r) < 0)
                            ;//alpha(r) = upper(r);
                        else if (std::abs(df_(r)) > max_df)
                            max_df = std::abs(df_(r));
                    }
                }
                if (max_df < eps)
                    break;
            }
        }

        return iter+1;
    }

// ----------------------------------------------------------------------------------------

    template <
@@ -4,6 +4,8 @@
#ifdef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_ABSTRACT_Hh_

#include "../matrix.h"
#include <map>
#include "../unordered_pair.h"

namespace dlib
{
@@ -162,6 +164,74 @@ namespace dlib
              converge to eps accuracy then the number returned will be max_iter+1.
    !*/

// ----------------------------------------------------------------------------------------

    template <
        typename T, long NR, long NC, typename MM, typename L
        >
    unsigned long solve_qp_box_constrained_blockdiag (
        const std::vector<matrix<T,NR,NR,MM,L>>& Q_blocks,
        const std::vector<matrix<T,NR,NC,MM,L>>& bs,
        const std::map<unordered_pair<size_t>, matrix<T,NR,NC,MM,L>>& Q_offdiag,
        std::vector<matrix<T,NR,NC,MM,L>>& alphas,
        const std::vector<matrix<T,NR,NC,MM,L>>& lowers,
        const std::vector<matrix<T,NR,NC,MM,L>>& uppers,
        T eps,
        unsigned long max_iter
    );
    /*!
        requires
            - Q_blocks.size() > 0
            - Q_blocks.size() == bs.size() == alphas.size() == lowers.size() == uppers.size()
            - All the matrices in Q_blocks have the same dimensions. Moreover, they are
              square matrices.
            - All the matrices in bs, Q_offdiag, alphas, lowers, and uppers have the same
              dimensions. Moreover, they are all column vectors.
            - Q_blocks[0].nr() == alphas[0].size()
              (i.e. the dimensionality of the column vectors in alphas must match the
              dimensionality of the square matrices in Q_blocks.)
            - for all valid i:
                - 0 <= min(alphas[i]-lowers[i])
                - 0 <= max(uppers[i]-alphas[i])
            - eps > 0
            - max_iter > 0
        ensures
            - This function solves the same QP as solve_qp_box_constrained(), except it is
              optimized for the case where the Q matrix has a certain sparsity structure.
              To be precise:
                - Let Q1 be a block diagonal matrix with the elements of Q_blocks placed
                  along its diagonal, and in the order contained in Q_blocks.
                - Let Q2 be a matrix with the same size as Q1, except instead of being block
                  diagonal, it is block structured into Q_blocks.size() by Q_blocks.size()
                  blocks. If we let (r,c) be the coordinate of each block then each block
                  contains the matrix diagm(Q_offdiag[make_unordered_pair(r,c)]), or the zero
                  matrix if Q_offdiag has no entry for the coordinate (r,c).
                - Let Q == Q1+Q2
                - Let b == the concatenation of all the vectors in bs into one big vector.
                - Let alpha == the concatenation of all the vectors in alphas into one big vector.
                - Let lower == the concatenation of all the vectors in lowers into one big vector.
                - Let upper == the concatenation of all the vectors in uppers into one big vector.
                - Then this function solves the following quadratic program:
                    Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha + trans(b)*alpha
                    subject to the following box constraints on alpha:
                        - 0 <= min(alpha-lower)
                        - 0 <= max(upper-alpha)
                    Where f is convex. This means that Q should be positive-semidefinite.
            - More specifically, this function is identical to
              solve_qp_box_constrained(Q, b, alpha, lower, upper, eps, max_iter),
              except that it runs faster since it avoids unnecessary computation by
              taking advantage of the sparsity structure in the QP.
            - The solution to the above QP will be stored in #alphas.
            - This function uses a combination of an SMO algorithm and Nesterov's method
              as the main iteration of the solver. It starts with the given alpha and works
              on the problem until the derivative of f(alpha) is smaller than eps for each
              element of alpha, or until that element of alpha is at a box constraint. So
              eps controls how accurate the solution is and smaller values result in better
              solutions.
            - At most max_iter iterations of optimization will be performed.
            - returns the number of iterations performed. If this method fails to
              converge to eps accuracy then the number returned will be max_iter+1.
    !*/
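As a usage illustration of the interface documented above, here is a minimal sketch (hypothetical sizes and values: two 3x3 identity blocks coupled through a single diagonal off-diagonal block):

    #include <dlib/optimization.h>
    #include <iostream>
    #include <map>
    #include <vector>

    int main()
    {
        using namespace dlib;

        // Two 3x3 diagonal blocks of Q.
        std::vector<matrix<double>> Q_blocks(2);
        Q_blocks[0] = identity_matrix<double>(3);
        Q_blocks[1] = identity_matrix<double>(3);

        std::vector<matrix<double,0,1>> bs(2), alphas(2), lowers(2), uppers(2);
        for (int i = 0; i < 2; ++i)
        {
            bs[i]     = ones_matrix<double>(3,1);
            alphas[i] = zeros_matrix<double>(3,1);      // starting point, inside the box
            lowers[i] = -10*ones_matrix<double>(3,1);
            uppers[i] =  10*ones_matrix<double>(3,1);
        }

        // Couple block 0 and block 1. The off-diagonal block of Q is diagm() of this vector.
        std::map<unordered_pair<size_t>, matrix<double,0,1>> Q_offdiag;
        Q_offdiag[make_unordered_pair(0,1)] = 0.5*ones_matrix<double>(3,1);

        const unsigned long iters = solve_qp_box_constrained_blockdiag(
            Q_blocks, bs, Q_offdiag, alphas, lowers, uppers, 1e-9, 10000);

        std::cout << "iterations: " << iters << "\n";
        std::cout << "block 0 solution: " << trans(alphas[0]);
        std::cout << "block 1 solution: " << trans(alphas[1]);
        return 0;
    }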
// ----------------------------------------------------------------------------------------

    template <
@@ -507,6 +507,81 @@ namespace
            DLIB_TEST(length(A*c1 - B*c2) < 4);
        }
    }

// ----------------------------------------------------------------------------------------

    void test_solve_qp_box_constrained_blockdiag()
    {
        dlib::rand rnd;
        for (int iter = 0; iter < 50; ++iter)
        {
            print_spinner();

            matrix<double> Q1, Q2;
            matrix<double,0,1> b1, b2;

            Q1 = randm(4,4,rnd); Q1 = Q1*trans(Q1);
            Q2 = randm(4,4,rnd); Q2 = Q2*trans(Q2);
            b1 = gaussian_randm(4,1, iter*2+0);
            b2 = gaussian_randm(4,1, iter*2+1);

            std::map<unordered_pair<size_t>, matrix<double,0,1>> offdiag;

            if (rnd.get_random_gaussian() > 0)
                offdiag[make_unordered_pair(0,0)] = randm(4,1,rnd);
            if (rnd.get_random_gaussian() > 0)
                offdiag[make_unordered_pair(1,0)] = randm(4,1,rnd);
            if (rnd.get_random_gaussian() > 0)
                offdiag[make_unordered_pair(1,1)] = randm(4,1,rnd);

            std::vector<matrix<double>> Q_blocks = {Q1, Q2};
            std::vector<matrix<double,0,1>> bs = {b1, b2};


            // make the single big Q and b
            matrix<double> Q = join_cols(join_rows(Q1, zeros_matrix(Q1)),
                                         join_rows(zeros_matrix(Q2),Q2));
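            // (This reproduces the Q1 part from the spec: the two blocks placed on the
            // diagonal of one dense matrix. The loop below adds the Q2 part by scattering
            // diagm() of each off-diagonal entry into the matching 4x4 block, so the dense
            // solver and the block-diagonal solver are handed exactly the same QP.)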
            matrix<double,0,1> b = join_cols(b1,b2);
            for (auto& p : offdiag)
            {
                long r = p.first.first;
                long c = p.first.second;
                set_subm(Q, 4*r,4*c, 4,4) += diagm(p.second);
                if (c != r)
                    set_subm(Q, 4*c,4*r, 4,4) += diagm(p.second);
            }


            matrix<double,0,1> alpha = zeros_matrix(b);
            matrix<double,0,1> lower = -10000*ones_matrix(b);
            matrix<double,0,1> upper = 10000*ones_matrix(b);

            auto iters = solve_qp_box_constrained(Q, b, alpha, lower, upper, 1e-9, 10000);
            dlog << LINFO << "iters: "<< iters;
            dlog << LINFO << "alpha: " << trans(alpha);

            dlog << LINFO;

            std::vector<matrix<double,0,1>> alphas(2);
            alphas[0] = zeros_matrix<double>(4,1); alphas[1] = zeros_matrix<double>(4,1);

            lower = -10000*ones_matrix(alphas[0]);
            upper = 10000*ones_matrix(alphas[0]);
            std::vector<matrix<double,0,1>> lowers = {lower,lower}, uppers = {upper, upper};
            auto iters2 = solve_qp_box_constrained_blockdiag(Q_blocks, bs, offdiag, alphas, lowers, uppers, 1e-9, 10000);
            dlog << LINFO << "iters2: "<< iters2;
            dlog << LINFO << "alpha: " << trans(join_cols(alphas[0],alphas[1]));

            dlog << LINFO << "obj1: "<< 0.5*trans(alpha)*Q*alpha + trans(b)*alpha;
            dlog << LINFO << "obj2: "<< 0.5*trans(join_cols(alphas[0],alphas[1]))*Q*join_cols(alphas[0],alphas[1]) + trans(b)*join_cols(alphas[0],alphas[1]);
            dlog << LINFO << "obj1-obj2: "<<(0.5*trans(alpha)*Q*alpha + trans(b)*alpha) - (0.5*trans(join_cols(alphas[0],alphas[1]))*Q*join_cols(alphas[0],alphas[1]) + trans(b)*join_cols(alphas[0],alphas[1]));

            DLIB_TEST_MSG(max(abs(alpha - join_cols(alphas[0], alphas[1]))) < 1e-6, max(abs(alpha - join_cols(alphas[0], alphas[1]))));

            DLIB_TEST(iters == iters2);

        }
    }

// ----------------------------------------------------------------------------------------

    class opt_qp_solver_tester : public tester
@@ -566,6 +641,7 @@ namespace


            test_find_gap_between_convex_hulls();
            test_solve_qp_box_constrained_blockdiag();
        }

        double do_the_test (