diff --git a/dlib/lsh.h b/dlib/lsh.h
index ca980f0e5..28f4b9bc4 100644
--- a/dlib/lsh.h
+++ b/dlib/lsh.h
@@ -6,6 +6,7 @@
 
 #include "lsh/projection_hash.h"
 #include "lsh/create_random_projection_hash.h"
+#include "lsh/hashes.h"
 
 #endif // DLIB_LSh_
 
diff --git a/dlib/lsh/hashes.h b/dlib/lsh/hashes.h
new file mode 100644
index 000000000..d8c4e6073
--- /dev/null
+++ b/dlib/lsh/hashes.h
@@ -0,0 +1,234 @@
+// Copyright (C) 2013 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_LSH_HAShES_H__
+#define DLIB_LSH_HAShES_H__
+
+#include "hashes_abstract.h"
+#include "../hash.h"
+#include "../matrix.h"
+#include <utility>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class hash_similar_angles_64
+    {
+        // Locality sensitive hash built from 64 random planes through the origin.
+        // Each bit of the result records which side of one plane the input is on.
+        // The full contract is documented in hashes_abstract.h.
+    public:
+        hash_similar_angles_64 (
+        ) : seed(0) {}
+
+        hash_similar_angles_64 (
+            const uint64 seed_
+        ) : seed(seed_) {}
+
+        uint64 get_seed (
+        ) const { return seed; }
+
+        typedef uint64 result_type;
+
+        // Overload for sparse vectors (containers of index/value pairs).  It is
+        // disabled for dlib matrix types, which use the overload further below.
+        template <
+            typename sparse_vector_type
+            >
+        typename disable_if<is_matrix<sparse_vector_type>,uint64>::type operator() (
+            const sparse_vector_type& v
+        ) const
+        {
+            typedef typename sparse_vector_type::value_type::second_type scalar_type;
+
+            uint64 temp = 0;
+            for (int i = 0; i < 64; ++i)
+            {
+                // Make room for the next bit BEFORE setting it.  Shifting after the
+                // OR would push the bit for plane 0 out of the word and leave bit 0
+                // permanently clear, so only 63 planes would contribute.
+                temp <<= 1;
+
+                // compute the dot product between v and a Gaussian random vector.
+                scalar_type val = 0;
+                for (typename sparse_vector_type::const_iterator j = v.begin(); j != v.end(); ++j)
+                    val += j->second*gaussian_random_hash(j->first, i, seed);
+
+                if (val > 0)
+                    temp |= 1;
+            }
+            return temp;
+        }
+
+        // Overload for dlib matrix expressions.
+        template <typename EXP>
+        uint64 operator() (
+            const matrix_exp<EXP>& v
+        ) const
+        {
+            uint64 temp = 0;
+            for (unsigned long i = 0; i < 64; ++i)
+            {
+                // Shift first so that all 64 plane tests end up in the result.
+                temp <<= 1;
+
+                if (dot(gaussian_randm(v.size(),1,i+seed*64), v) > 0)
+                    temp |= 1;
+            }
+            return temp;
+        }
+
+        // Number of bits that differ between the two hashes.
+        unsigned int distance (
+            const result_type& a,
+            const result_type& b
+        ) const
+        {
+            return hamming_distance(a,b);
+        }
+
+    private:
+        const uint64 seed;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class hash_similar_angles_128
+    {
+        // Two differently seeded 64 bit hashers concatenated into a 128 bit hash.
+    public:
+        hash_similar_angles_128 (
+        ) : seed(0),hasher1(0), hasher2(1) {}
+
+        hash_similar_angles_128 (
+            const uint64 seed_
+        ) : seed(seed_),hasher1(2*seed),hasher2(2*seed+1) {}
+
+        uint64 get_seed (
+        ) const { return seed; }
+
+        typedef std::pair<uint64,uint64> result_type;
+
+        template <
+            typename vector_type
+            >
+        result_type operator() (
+            const vector_type& v
+        ) const
+        {
+            return std::make_pair(hasher1(v), hasher2(v));
+        }
+
+        unsigned int distance (
+            const result_type& a,
+            const result_type& b
+        ) const
+        {
+            return hamming_distance(a.first,b.first) +
+                   hamming_distance(a.second,b.second);
+        }
+
+    private:
+        // seed must stay declared before the hashers: they are initialized from it.
+        const uint64 seed;
+        hash_similar_angles_64 hasher1;
+        hash_similar_angles_64 hasher2;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class hash_similar_angles_256
+    {
+        // Two differently seeded 128 bit hashers concatenated into a 256 bit hash.
+    public:
+        hash_similar_angles_256 (
+        ) : seed(0), hasher1(0), hasher2(1) {}
+
+        hash_similar_angles_256 (
+            const uint64 seed_
+        ) : seed(seed_),hasher1(2*seed),hasher2(2*seed+1) {}
+
+        uint64 get_seed (
+        ) const { return seed; }
+
+        typedef std::pair<uint64,uint64> hash128_type;
+        typedef std::pair<hash128_type,hash128_type> result_type;
+
+        template <
+            typename vector_type
+            >
+        result_type operator() (
+            const vector_type& v
+        ) const
+        {
+            return std::make_pair(hasher1(v), hasher2(v));
+        }
+
+        unsigned int distance (
+            const result_type& a,
+            const result_type& b
+        ) const
+        {
+            return hasher1.distance(a.first,b.first) +
+                   hasher2.distance(a.second,b.second);
+        }
+
+    private:
+        // seed must stay declared before the hashers: they are initialized from it.
+        const uint64 seed;
+        hash_similar_angles_128 hasher1;
+        hash_similar_angles_128 hasher2;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class hash_similar_angles_512
+    {
+        // Two differently seeded 256 bit hashers concatenated into a 512 bit hash.
+    public:
+        hash_similar_angles_512 (
+        ) : seed(0), hasher1(0), hasher2(1) {}
+
+        hash_similar_angles_512 (
+            const uint64 seed_
+        ) : seed(seed_),hasher1(2*seed),hasher2(2*seed+1) {}
+
+        uint64 get_seed (
+        ) const { return seed; }
+
+        typedef hash_similar_angles_256::result_type hash256_type;
+        typedef std::pair<hash256_type,hash256_type> result_type;
+
+        template <
+            typename vector_type
+            >
+        result_type operator() (
+            const vector_type& v
+        ) const
+        {
+            return std::make_pair(hasher1(v), hasher2(v));
+        }
+
+        unsigned int distance (
+            const result_type& a,
+            const result_type& b
+        ) const
+        {
+            return hasher1.distance(a.first,b.first) +
+                   hasher2.distance(a.second,b.second);
+        }
+
+    private:
+        // seed must stay declared before the hashers: they are initialized from it.
+        const uint64 seed;
+        hash_similar_angles_256 hasher1;
+        hash_similar_angles_256 hasher2;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_LSH_HAShES_H__
+
diff --git a/dlib/lsh/hashes_abstract.h b/dlib/lsh/hashes_abstract.h
new file mode 100644
index 000000000..df952529f
--- /dev/null
+++ b/dlib/lsh/hashes_abstract.h
@@ -0,0 +1,278 @@
+// Copyright (C) 2013 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_LSH_HAShES_ABSTRACT_H__
+#ifdef DLIB_LSH_HAShES_ABSTRACT_H__
+
+#include "../matrix.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class hash_similar_angles_64
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object is a tool for computing locality sensitive hashes that give
+                vectors with similar angles between each other similar hash values. In
+                particular, this object creates 64 random planes which pass through the
+                origin and uses them to create a 64bit hash. To compute the hash for a new
+                vector, this object checks which side of each plane the vector falls on and
+                records this information into a 64bit integer.
+        !*/
+
+    public:
+
+        hash_similar_angles_64 (
+        );
+        /*!
+            ensures
+                - #get_seed() == 0
+        !*/
+
+        hash_similar_angles_64 (
+            const uint64 seed
+        );
+        /*!
+            ensures
+                - #get_seed() == seed
+        !*/
+
+        uint64 get_seed (
+        ) const;
+        /*!
+            ensures
+                - returns the random seed used to generate the random planes used for
+                  hashing.
+        !*/
+
+        typedef uint64 result_type;
+
+        template <typename vector_type>
+        result_type operator() (
+            const vector_type& v
+        ) const;
+        /*!
+            requires
+                - v is an unsorted sparse vector or a dlib matrix representing either a
+                  column or row vector.
+            ensures
+                - returns a 64 bit hash of the input vector v. The bits in the hash record
+                  which side of each random plane v falls on.
+        !*/
+
+        unsigned int distance (
+            const result_type& a,
+            const result_type& b
+        ) const;
+        /*!
+            ensures
+                - returns the Hamming distance between the two hashes given to this
+                  function. That is, we return the number of bits in a and b which differ.
+        !*/
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class hash_similar_angles_128
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object is a tool for computing locality sensitive hashes that give
+                vectors with similar angles between each other similar hash values. In
+                particular, this object creates 128 random planes which pass through the
+                origin and uses them to create a 128bit hash. To compute the hash for a new
+                vector, this object checks which side of each plane the vector falls on and
+                records this information into a 128bit integer.
+        !*/
+
+    public:
+
+        hash_similar_angles_128 (
+        );
+        /*!
+            ensures
+                - #get_seed() == 0
+        !*/
+
+        hash_similar_angles_128 (
+            const uint64 seed
+        );
+        /*!
+            ensures
+                - #get_seed() == seed
+        !*/
+
+        uint64 get_seed (
+        ) const;
+        /*!
+            ensures
+                - returns the random seed used to generate the random planes used for
+                  hashing.
+        !*/
+
+        typedef std::pair<uint64,uint64> result_type;
+
+        template <typename vector_type>
+        result_type operator() (
+            const vector_type& v
+        ) const;
+        /*!
+            requires
+                - v is an unsorted sparse vector or a dlib matrix representing either a
+                  column or row vector.
+            ensures
+                - returns a 128 bit hash of the input vector v. The bits in the hash record
+                  which side of each random plane v falls on.
+        !*/
+
+        unsigned int distance (
+            const result_type& a,
+            const result_type& b
+        ) const;
+        /*!
+            ensures
+                - returns the Hamming distance between the two hashes given to this
+                  function. That is, we return the number of bits in a and b which differ.
+        !*/
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class hash_similar_angles_256
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object is a tool for computing locality sensitive hashes that give
+                vectors with similar angles between each other similar hash values. In
+                particular, this object creates 256 random planes which pass through the
+                origin and uses them to create a 256bit hash. To compute the hash for a new
+                vector, this object checks which side of each plane the vector falls on and
+                records this information into a 256bit integer.
+        !*/
+
+    public:
+
+        hash_similar_angles_256 (
+        );
+        /*!
+            ensures
+                - #get_seed() == 0
+        !*/
+
+        hash_similar_angles_256 (
+            const uint64 seed
+        );
+        /*!
+            ensures
+                - #get_seed() == seed
+        !*/
+
+        uint64 get_seed (
+        ) const;
+        /*!
+            ensures
+                - returns the random seed used to generate the random planes used for
+                  hashing.
+        !*/
+
+        typedef std::pair<uint64,uint64> hash128_type;
+        typedef std::pair<hash128_type,hash128_type> result_type;
+
+        template <typename vector_type>
+        result_type operator() (
+            const vector_type& v
+        ) const;
+        /*!
+            requires
+                - v is an unsorted sparse vector or a dlib matrix representing either a
+                  column or row vector.
+            ensures
+                - returns a 256 bit hash of the input vector v. The bits in the hash record
+                  which side of each random plane v falls on.
+        !*/
+
+        unsigned int distance (
+            const result_type& a,
+            const result_type& b
+        ) const;
+        /*!
+            ensures
+                - returns the Hamming distance between the two hashes given to this
+                  function. That is, we return the number of bits in a and b which differ.
+        !*/
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class hash_similar_angles_512
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object is a tool for computing locality sensitive hashes that give
+                vectors with similar angles between each other similar hash values. In
+                particular, this object creates 512 random planes which pass through the
+                origin and uses them to create a 512bit hash. To compute the hash for a new
+                vector, this object checks which side of each plane the vector falls on and
+                records this information into a 512bit integer.
+        !*/
+
+    public:
+
+        hash_similar_angles_512 (
+        );
+        /*!
+            ensures
+                - #get_seed() == 0
+        !*/
+
+        hash_similar_angles_512 (
+            const uint64 seed
+        );
+        /*!
+            ensures
+                - #get_seed() == seed
+        !*/
+
+        uint64 get_seed (
+        ) const;
+        /*!
+            ensures
+                - returns the random seed used to generate the random planes used for
+                  hashing.
+        !*/
+
+        typedef hash_similar_angles_256::result_type hash256_type;
+        typedef std::pair<hash256_type,hash256_type> result_type;
+
+        template <typename vector_type>
+        result_type operator() (
+            const vector_type& v
+        ) const;
+        /*!
+            requires
+                - v is an unsorted sparse vector or a dlib matrix representing either a
+                  column or row vector.
+            ensures
+                - returns a 512 bit hash of the input vector v. The bits in the hash record
+                  which side of each random plane v falls on.
+        !*/
+
+        unsigned int distance (
+            const result_type& a,
+            const result_type& b
+        ) const;
+        /*!
+            ensures
+                - returns the Hamming distance between the two hashes given to this
+                  function. That is, we return the number of bits in a and b which differ.
+        !*/
+    };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_LSH_HAShES_ABSTRACT_H__
+