diff --git a/dlib/statistics.h b/dlib/statistics.h index c0170fe1b..68578a2bf 100644 --- a/dlib/statistics.h +++ b/dlib/statistics.h @@ -8,6 +8,7 @@ #include "statistics/random_subset_selector.h" #include "statistics/image_feature_sampling.h" #include "statistics/sammon.h" +#include "statistics/cca.h" #endif // DLIB_STATISTICs_H_ diff --git a/dlib/statistics/cca.h b/dlib/statistics/cca.h new file mode 100644 index 000000000..021c8d818 --- /dev/null +++ b/dlib/statistics/cca.h @@ -0,0 +1,122 @@ +// Copyright (C) 2013 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_CCA_h__ +#define DLIB_CCA_h__ + +#include "cca_abstract.h" +#include "../algs.h" +#include "../matrix.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename T + > + matrix compute_correlations ( + const matrix& L, + const matrix& R + ) + { + matrix A, B, C; + A = diag(trans(R)*L); + B = sqrt(diag(trans(L)*L)); + C = sqrt(diag(trans(R)*R)); + A = pointwise_multiply(A , reciprocal(pointwise_multiply(B,C))); + return A; + } + +// ---------------------------------------------------------------------------------------- + + template < + typename matrix_type, + typename T + > + matrix impl_cca ( + const matrix_type& L, + const matrix_type& R, + matrix& Ltrans, + matrix& Rtrans, + unsigned long num_correlations, + unsigned long extra_rank, + unsigned long q, + unsigned long num_output_correlations + ) + { + matrix Ul, Vl; + matrix Ur, Vr; + matrix U, V; + matrix Dr, Dl, D; + + + // Note that we add a few more singular vectors in because it helps improve the + // final results if we run this part with a little higher rank than the final SVD. + svd_fast(L, Ul, Dl, Vl, num_correlations+extra_rank, q); + svd_fast(R, Ur, Dr, Vr, num_correlations+extra_rank, q); + + // This matrix is really small so we can do a normal full SVD on it. + svd3(trans(Ul)*Ur, U, D, V); + // now throw away extra columns of the transformations. We do this in a way + // that keeps the directions that have the highest correlations. + matrix temp = D; + rsort_columns(U, temp); + rsort_columns(V, D); + U = colm(U, range(0, num_output_correlations-1)); + V = colm(V, range(0, num_output_correlations-1)); + D = rowm(D, range(0, num_output_correlations-1)); + + Ltrans = Vl*inv(diagm(Dl))*U; + Rtrans = Vr*inv(diagm(Dr))*V; + + // Note that the D matrix contains the correlation values for the transformed + // vectors. However, when the L and R matrices have rank higher than + // num_correlations+extra_rank then the values in D become only approximate. + return D; + } + +// ---------------------------------------------------------------------------------------- + + template + matrix cca ( + const matrix& L, + const matrix& R, + matrix& Ltrans, + matrix& Rtrans, + unsigned long num_correlations, + unsigned long extra_rank = 5, + unsigned long q = 2 + ) + { + using std::min; + const unsigned long n = min(num_correlations, (unsigned long)min(R.nr(),min(L.nc(), R.nc()))); + return impl_cca(L,R,Ltrans, Rtrans, num_correlations, extra_rank, q, n); + } + +// ---------------------------------------------------------------------------------------- + + template + matrix cca ( + const std::vector& L, + const std::vector& R, + matrix& Ltrans, + matrix& Rtrans, + unsigned long num_correlations, + unsigned long extra_rank = 5, + unsigned long q = 2 + ) + { + using std::min; + const unsigned long n = min(max_index_plus_one(L), max_index_plus_one(R)); + const unsigned long num_output_correlations = min(num_correlations, min(R.size(),n)); + return impl_cca(L,R,Ltrans, Rtrans, num_correlations, extra_rank, q, num_output_correlations); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_CCA_h__ + + diff --git a/dlib/statistics/cca_abstract.h b/dlib/statistics/cca_abstract.h new file mode 100644 index 000000000..326a7ebb3 --- /dev/null +++ b/dlib/statistics/cca_abstract.h @@ -0,0 +1,136 @@ +// Copyright (C) 2013 Davis E. King (davis@dlib.net) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_CCA_AbSTRACT_H__ +#ifdef DLIB_CCA_AbSTRACT_H__ + +#include "../matrix/matrix_la_abstract.h" + +namespace dlib +{ + +// ---------------------------------------------------------------------------------------- + + template < + typename T + > + matrix compute_correlations ( + const matrix& L, + const matrix& R + ); + /*! + requires + - L.size() > 0 + - R.size() > 0 + - L.nr() == R.nr() + ensures + - This function treats L and R as sequences of paired row vectors. It + then computes the correlation values between the elements of these + row vectors. In particular, we return a matrix COR such that: + - COR.size() == L.nc() + - for all valid i: + - COR(i) == the correlation coefficient between the following + sequence of paired numbers: (L(k,i), R(k,i)) for k: 0 <= k < L.nr(). + Therefore, COR(i) is a value between -1 and 1 inclusive where 1 + indicates perfect correlation and -1 perfect anti-correlation. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename T + > + matrix cca ( + const matrix& L, + const matrix& R, + matrix& Ltrans, + matrix& Rtrans, + unsigned long num_correlations, + unsigned long extra_rank = 5, + unsigned long q = 2 + ); + /*! + requires + - num_correlations > 0 + - L.size() > 0 + - R.size() > 0 + - L.nr() == R.nr() + ensures + - This function performs a canonical correlation analysis between the row + vectors in L and R. That is, it finds two transformation matrices, Ltrans + and Rtrans, such that row vectors in the transformed matrices L*Ltrans and + R*Rtrans are as correlated as possible. That is, we try to find two transforms + such that the correlation values returned by compute_correlations(L*Ltrans, R*Rtrans) + would be maximized. + - Let N == min(num_correlations, min(R.nr(),min(L.nc(),R.nc()))) + (This is the actual number of elements in the transformed vectors. + Therefore, note that you can't get more outputs than there are rows or + columns in the input matrices.) + - #Ltrans.nr() == L.nc() + - #Ltrans.nc() == N + - #Rtrans.nr() == R.nc() + - #Rtrans.nc() == N + - No centering is applied to the L and R matrices. Therefore, if you want a + CCA relative to the centered vectors then you must apply centering yourself + before calling cca(). + - This function works with reduced rank approximations of the L and R matrices. + This makes it fast when working with large matrices. In particular, we use + the svd_fast() routine to find reduced rank representations of the input + matrices by calling it as follows: svd_fast(L, U,D,V, num_correlations+extra_rank, q) + and similarly for R. This means that you can use the extra_rank and q + arguments to cca() to influence the accuracy of the reduced rank + approximation. However, the default values should work fine for most + problems. + - returns an estimate of compute_correlations(L*#Ltrans, R*#Rtrans). The + returned vector should exactly match the output of compute_correlations() + when the reduced rank approximation to L and R is accurate. However, when L + and/or R are higher rank than num_correlations+extra_rank the return value of + this function will deviate from compute_correlations(L*#Ltrans, R*#Rtrans). + This deviation can be used to check if the reduced rank approximation is + working or you need to increase extra_rank. + - A good discussion of CCA can be found in the paper "Canonical Correlation + Analysis" by David Weenink. In particular, this function is implemented + using equations 29 and 30 from his paper. We also use the idea of doing CCA + on a reduced rank approximation of L and R as suggested by Paramveer S. + Dhillon in his paper "Two Step CCA: A new spectral method for estimating + vector models of words". + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename sparse_vector_type, + typename T + > + matrix cca ( + const std::vector& L, + const std::vector& R, + matrix& Ltrans, + matrix& Rtrans, + unsigned long num_correlations, + unsigned long extra_rank = 5, + unsigned long q = 2 + ); + /*! + requires + - num_correlations > 0 + - L.size() == R.size() + - max_index_plus_one(L) > 0 && max_index_plus_one(R) > 0 + (i.e. L and R can't be a empty matrices) + ensures + - This is just an overload of the cca() function defined above. Except in this + case we take a sparse representation of the input L and R matrices rather than + dense matrices. Therefore, in this case, we interpret L and R as matrices + with L.size() rows, where each row is defined by a sparse vector. So this + function does exactly the same thing as the above cca(). + - Note that you can apply the output transforms to a sparse vector with the + following code: + sparse_matrix_vector_multiply(trans(Ltrans), your_sparse_vector) + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_CCA_AbSTRACT_H__ + +