Added a set of new LSH based hashing functions meant for use

with larger vectors and higher bit sizes than the current LSH tool.
These are the new hash_similar_angles_xxx objects.
This commit is contained in:
Davis King 2013-03-14 20:01:30 -04:00
parent 6f5ef4c089
commit 4e96485601
3 changed files with 505 additions and 0 deletions

View File

@ -6,6 +6,7 @@
#include "lsh/projection_hash.h"
#include "lsh/create_random_projection_hash.h"
#include "lsh/hashes.h"
#endif // DLIB_LSh_

218
dlib/lsh/hashes.h Normal file
View File

@ -0,0 +1,218 @@
// Copyright (C) 2013 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_LSH_HAShES_H__
#define DLIB_LSH_HAShES_H__
#include "hashes_abstract.h"
#include "../hash.h"
#include "../matrix.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class hash_similar_angles_64
{
    /*!
        Implementation notes
            Each of the 64 output bits records the sign of the dot product between
            the input vector and one pseudo-random Gaussian vector.  The random
            vectors are never stored; their elements are regenerated on demand
            from the seed.

            Bits are packed most significant first: the result of plane 0 ends up
            in bit 63 and the result of plane 63 ends up in bit 0.  The shift must
            happen BEFORE the new bit is OR-ed in.  Shifting afterwards pushes the
            plane 0 bit out of the word and leaves bit 0 permanently zero, which
            silently reduces the hash to 63 useful bits.
    !*/
public:

    // Default construct with a seed of 0.
    hash_similar_angles_64 (
    ) : seed(0) {}

    // Construct a hasher whose random planes are derived from seed_.
    hash_similar_angles_64 (
        const uint64 seed_
    ) : seed(seed_) {}

    // Returns the seed given at construction time.
    uint64 get_seed (
    ) const { return seed; }

    typedef uint64 result_type;

    // Overload for sparse vectors, i.e. anything that is not a dlib matrix.  v is
    // iterated as a range of (index, value) pairs via ->first and ->second.
    template <
        typename sparse_vector_type
        >
    typename disable_if<is_matrix<sparse_vector_type>,uint64>::type operator() (
        const sparse_vector_type& v
    ) const
    {
        typedef typename sparse_vector_type::value_type::second_type scalar_type;

        uint64 temp = 0;
        for (int i = 0; i < 64; ++i)
        {
            // compute the dot product between v and a Gaussian random vector.
            scalar_type val = 0;
            for (typename sparse_vector_type::const_iterator j = v.begin(); j != v.end(); ++j)
                val += j->second*gaussian_random_hash(j->first, i, seed);

            // Make room for this plane's bit first so that all 64 bits survive.
            temp <<= 1;
            if (val > 0)
                temp |= 1;
        }
        return temp;
    }

    // Overload for dlib matrix expressions.
    template <typename EXP>
    uint64 operator() (
        const matrix_exp<EXP>& v
    ) const
    {
        uint64 temp = 0;
        for (unsigned long i = 0; i < 64; ++i)
        {
            // Make room for this plane's bit first so that all 64 bits survive.
            temp <<= 1;
            // Plane i is generated from its own seed, i+seed*64.
            if (dot(gaussian_randm(v.size(),1,i+seed*64), v) > 0)
                temp |= 1;
        }
        return temp;
    }

    // Returns hamming_distance(a,b) for two hashes produced by this object.
    unsigned int distance (
        const result_type& a,
        const result_type& b
    ) const
    {
        return hamming_distance(a,b);
    }

private:
    const uint64 seed;
};
// ----------------------------------------------------------------------------------------
class hash_similar_angles_128
{
    /*!
        Implementation notes
            A 128bit hash is the pair of outputs of two hash_similar_angles_64
            objects.  They are seeded with 2*seed and 2*seed+1 so that they do not
            share a seed with each other.
    !*/
public:

    typedef std::pair<uint64,uint64> result_type;

    // Default construct with a seed of 0 (sub-hashers get seeds 0 and 1).
    hash_similar_angles_128 (
    ) : seed(0), hasher1(0), hasher2(1) {}

    // Construct from an explicit seed.
    hash_similar_angles_128 (
        const uint64 seed_
    ) : seed(seed_), hasher1(2*seed_), hasher2(2*seed_+1) {}

    // Returns the seed given at construction time.
    uint64 get_seed (
    ) const { return seed; }

    // Hash v with both 64bit hashers and return the two results as a pair.
    template <typename vector_type>
    result_type operator() (
        const vector_type& v
    ) const
    {
        return result_type(hasher1(v), hasher2(v));
    }

    // Sum of the Hamming distances of the two 64bit halves.
    unsigned int distance (
        const result_type& a,
        const result_type& b
    ) const
    {
        return hamming_distance(a.second,b.second) +
               hamming_distance(a.first,b.first);
    }

private:
    // seed is declared first, matching the order of the initializer lists.
    const uint64 seed;
    hash_similar_angles_64 hasher1;
    hash_similar_angles_64 hasher2;
};
// ----------------------------------------------------------------------------------------
class hash_similar_angles_256
{
public:
hash_similar_angles_256 (
) : seed(0), hasher1(0), hasher2(1) {}
hash_similar_angles_256 (
const uint64 seed_
) : seed(seed_),hasher1(2*seed),hasher2(2*seed+1) {}
uint64 get_seed (
) const { return seed; }
typedef std::pair<uint64,uint64> hash128_type;
typedef std::pair<hash128_type,hash128_type> result_type;
template <
typename vector_type
>
result_type operator() (
const vector_type& v
) const
{
return std::make_pair(hasher1(v), hasher2(v));
}
unsigned int distance (
const result_type& a,
const result_type& b
) const
{
return hasher1.distance(a.first,b.first) +
hasher1.distance(a.second,b.second);
}
private:
const uint64 seed;
hash_similar_angles_128 hasher1;
hash_similar_angles_128 hasher2;
};
// ----------------------------------------------------------------------------------------
class hash_similar_angles_512
{
public:
hash_similar_angles_512 (
) : seed(0), hasher1(0), hasher2(1) {}
hash_similar_angles_512 (
const uint64 seed_
) : seed(seed_),hasher1(2*seed),hasher2(2*seed+1) {}
uint64 get_seed (
) const { return seed; }
typedef hash_similar_angles_256::result_type hash256_type;
typedef std::pair<hash256_type,hash256_type> result_type;
template <
typename vector_type
>
result_type operator() (
const vector_type& v
) const
{
return std::make_pair(hasher1(v), hasher2(v));
}
unsigned int distance (
const result_type& a,
const result_type& b
) const
{
return hasher1.distance(a.first,b.first) +
hasher1.distance(a.second,b.second);
}
private:
const uint64 seed;
hash_similar_angles_256 hasher1;
hash_similar_angles_256 hasher2;
};
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_LSH_HAShES_H__

286
dlib/lsh/hashes_abstract.h Normal file
View File

@ -0,0 +1,286 @@
// Copyright (C) 2013 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_LSH_HAShES_ABSTRACT_H__
#ifdef DLIB_LSH_HAShES_ABSTRACT_H__
#include "../matrix.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class hash_similar_angles_64
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This object is a tool for computing locality sensitive hashes that give
            vectors with similar angles between each other similar hash values.  In
            particular, this object creates 64 random planes which pass through the
            origin and uses them to create a 64bit hash.  To compute the hash for a new
            vector, this object checks which side of each plane the vector falls on and
            records this information into a 64bit integer.
    !*/

public:

    hash_similar_angles_64 (
    );
    /*!
        ensures
            - #get_seed() == 0
    !*/

    hash_similar_angles_64 (
        const uint64 seed
    );
    /*!
        ensures
            - #get_seed() == seed
    !*/

    uint64 get_seed (
    ) const;
    /*!
        ensures
            - returns the random seed used to generate the random planes used for
              hashing.
    !*/

    typedef uint64 result_type;

    template <typename vector_type>
    result_type operator() (
        const vector_type& v
    ) const;
    /*!
        requires
            - v is an unsorted sparse vector or a dlib matrix representing either a
              column or row vector.
        ensures
            - returns a 64 bit hash of the input vector v.  The bits in the hash record
              which side of each random plane v falls on.
    !*/

    unsigned int distance (
        const result_type& a,
        const result_type& b
    ) const;
    /*!
        ensures
            - returns the Hamming distance between the two hashes given to this
              function.  That is, we return the number of bits in a and b which differ.
    !*/
};
// ----------------------------------------------------------------------------------------
class hash_similar_angles_128
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This object is a tool for computing locality sensitive hashes that give
            vectors with similar angles between each other similar hash values.  In
            particular, this object creates 128 random planes which pass through the
            origin and uses them to create a 128bit hash.  To compute the hash for a new
            vector, this object checks which side of each plane the vector falls on and
            records this information into a 128bit integer.
    !*/

public:

    hash_similar_angles_128 (
    );
    /*!
        ensures
            - #get_seed() == 0
    !*/

    hash_similar_angles_128 (
        const uint64 seed
    );
    /*!
        ensures
            - #get_seed() == seed
    !*/

    uint64 get_seed (
    ) const;
    /*!
        ensures
            - returns the random seed used to generate the random planes used for
              hashing.
    !*/

    typedef std::pair<uint64,uint64> result_type;

    template <typename vector_type>
    result_type operator() (
        const vector_type& v
    ) const;
    /*!
        requires
            - v is an unsorted sparse vector or a dlib matrix representing either a
              column or row vector.
        ensures
            - returns a 128 bit hash of the input vector v.  The bits in the hash record
              which side of each random plane v falls on.
    !*/

    unsigned int distance (
        const result_type& a,
        const result_type& b
    ) const;
    /*!
        ensures
            - returns the Hamming distance between the two hashes given to this
              function.  That is, we return the number of bits in a and b which differ.
    !*/
};
// ----------------------------------------------------------------------------------------
class hash_similar_angles_256
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This object is a tool for computing locality sensitive hashes that give
            vectors with similar angles between each other similar hash values.  In
            particular, this object creates 256 random planes which pass through the
            origin and uses them to create a 256bit hash.  To compute the hash for a new
            vector, this object checks which side of each plane the vector falls on and
            records this information into a 256bit integer.
    !*/

public:

    hash_similar_angles_256 (
    );
    /*!
        ensures
            - #get_seed() == 0
    !*/

    hash_similar_angles_256 (
        const uint64 seed
    );
    /*!
        ensures
            - #get_seed() == seed
    !*/

    uint64 get_seed (
    ) const;
    /*!
        ensures
            - returns the random seed used to generate the random planes used for
              hashing.
    !*/

    typedef std::pair<uint64,uint64> hash128_type;
    typedef std::pair<hash128_type,hash128_type> result_type;

    template <typename vector_type>
    result_type operator() (
        const vector_type& v
    ) const;
    /*!
        requires
            - v is an unsorted sparse vector or a dlib matrix representing either a
              column or row vector.
        ensures
            - returns a 256 bit hash of the input vector v.  The bits in the hash record
              which side of each random plane v falls on.
    !*/

    unsigned int distance (
        const result_type& a,
        const result_type& b
    ) const;
    /*!
        ensures
            - returns the Hamming distance between the two hashes given to this
              function.  That is, we return the number of bits in a and b which differ.
    !*/
};
// ----------------------------------------------------------------------------------------
class hash_similar_angles_512
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This object is a tool for computing locality sensitive hashes that give
            vectors with similar angles between each other similar hash values.  In
            particular, this object creates 512 random planes which pass through the
            origin and uses them to create a 512bit hash.  To compute the hash for a new
            vector, this object checks which side of each plane the vector falls on and
            records this information into a 512bit integer.
    !*/

public:

    hash_similar_angles_512 (
    );
    /*!
        ensures
            - #get_seed() == 0
    !*/

    hash_similar_angles_512 (
        const uint64 seed
    );
    /*!
        ensures
            - #get_seed() == seed
    !*/

    uint64 get_seed (
    ) const;
    /*!
        ensures
            - returns the random seed used to generate the random planes used for
              hashing.
    !*/

    typedef hash_similar_angles_256::result_type hash256_type;
    typedef std::pair<hash256_type,hash256_type> result_type;

    template <typename vector_type>
    result_type operator() (
        const vector_type& v
    ) const;
    /*!
        requires
            - v is an unsorted sparse vector or a dlib matrix representing either a
              column or row vector.
        ensures
            - returns a 512 bit hash of the input vector v.  The bits in the hash record
              which side of each random plane v falls on.
    !*/

    unsigned int distance (
        const result_type& a,
        const result_type& b
    ) const;
    /*!
        ensures
            - returns the Hamming distance between the two hashes given to this
              function.  That is, we return the number of bits in a and b which differ.
    !*/
};
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_LSH_HAShES_ABSTRACT_H__