mirror of
https://github.com/davisking/dlib.git
synced 2024-11-01 10:14:53 +08:00
Added a set of new LSH based hashing functions meant for use
with larger vectors and higher bit sizes than the current LSH tool. These are the new hash_similar_angles_xxx objects.
This commit is contained in:
parent
6f5ef4c089
commit
4e96485601
@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
#include "lsh/projection_hash.h"
|
#include "lsh/projection_hash.h"
|
||||||
#include "lsh/create_random_projection_hash.h"
|
#include "lsh/create_random_projection_hash.h"
|
||||||
|
#include "lsh/hashes.h"
|
||||||
|
|
||||||
|
|
||||||
#endif // DLIB_LSh_
|
#endif // DLIB_LSh_
|
||||||
|
218
dlib/lsh/hashes.h
Normal file
218
dlib/lsh/hashes.h
Normal file
@ -0,0 +1,218 @@
|
|||||||
|
// Copyright (C) 2013 Davis E. King (davis@dlib.net)
|
||||||
|
// License: Boost Software License See LICENSE.txt for the full license.
|
||||||
|
#ifndef DLIB_LSH_HAShES_H__
|
||||||
|
#define DLIB_LSH_HAShES_H__
|
||||||
|
|
||||||
|
#include "hashes_abstract.h"
|
||||||
|
#include "../hash.h"
|
||||||
|
#include "../matrix.h"
|
||||||
|
|
||||||
|
namespace dlib
|
||||||
|
{
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class hash_similar_angles_64
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
hash_similar_angles_64 (
|
||||||
|
) : seed(0) {}
|
||||||
|
|
||||||
|
hash_similar_angles_64 (
|
||||||
|
const uint64 seed_
|
||||||
|
) : seed(seed_) {}
|
||||||
|
|
||||||
|
uint64 get_seed (
|
||||||
|
) const { return seed; }
|
||||||
|
|
||||||
|
|
||||||
|
typedef uint64 result_type;
|
||||||
|
|
||||||
|
template <
|
||||||
|
typename sparse_vector_type
|
||||||
|
>
|
||||||
|
typename disable_if<is_matrix<sparse_vector_type>,uint64>::type operator() (
|
||||||
|
const sparse_vector_type& v
|
||||||
|
) const
|
||||||
|
{
|
||||||
|
typedef typename sparse_vector_type::value_type::second_type scalar_type;
|
||||||
|
|
||||||
|
uint64 temp = 0;
|
||||||
|
for (int i = 0; i < 64; ++i)
|
||||||
|
{
|
||||||
|
// compute the dot product between v and a Gaussian random vector.
|
||||||
|
scalar_type val = 0;
|
||||||
|
for (typename sparse_vector_type::const_iterator j = v.begin(); j != v.end(); ++j)
|
||||||
|
val += j->second*gaussian_random_hash(j->first, i, seed);
|
||||||
|
|
||||||
|
if (val > 0)
|
||||||
|
temp |= 1;
|
||||||
|
temp <<= 1;
|
||||||
|
}
|
||||||
|
return temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename EXP>
|
||||||
|
uint64 operator() (
|
||||||
|
const matrix_exp<EXP>& v
|
||||||
|
) const
|
||||||
|
{
|
||||||
|
uint64 temp = 0;
|
||||||
|
for (unsigned long i = 0; i < 64; ++i)
|
||||||
|
{
|
||||||
|
if (dot(gaussian_randm(v.size(),1,i+seed*64), v) > 0)
|
||||||
|
temp |= 1;
|
||||||
|
temp <<= 1;
|
||||||
|
}
|
||||||
|
return temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int distance (
|
||||||
|
const result_type& a,
|
||||||
|
const result_type& b
|
||||||
|
) const
|
||||||
|
{
|
||||||
|
return hamming_distance(a,b);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
const uint64 seed;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class hash_similar_angles_128
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
hash_similar_angles_128 (
|
||||||
|
) : seed(0),hasher1(0), hasher2(1) {}
|
||||||
|
|
||||||
|
hash_similar_angles_128 (
|
||||||
|
const uint64 seed_
|
||||||
|
) : seed(seed_),hasher1(2*seed),hasher2(2*seed+1) {}
|
||||||
|
|
||||||
|
uint64 get_seed (
|
||||||
|
) const { return seed; }
|
||||||
|
|
||||||
|
typedef std::pair<uint64,uint64> result_type;
|
||||||
|
|
||||||
|
template <
|
||||||
|
typename vector_type
|
||||||
|
>
|
||||||
|
result_type operator() (
|
||||||
|
const vector_type& v
|
||||||
|
) const
|
||||||
|
{
|
||||||
|
return std::make_pair(hasher1(v), hasher2(v));
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int distance (
|
||||||
|
const result_type& a,
|
||||||
|
const result_type& b
|
||||||
|
) const
|
||||||
|
{
|
||||||
|
return hamming_distance(a.first,b.first) +
|
||||||
|
hamming_distance(a.second,b.second);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
const uint64 seed;
|
||||||
|
hash_similar_angles_64 hasher1;
|
||||||
|
hash_similar_angles_64 hasher2;
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class hash_similar_angles_256
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
hash_similar_angles_256 (
|
||||||
|
) : seed(0), hasher1(0), hasher2(1) {}
|
||||||
|
|
||||||
|
hash_similar_angles_256 (
|
||||||
|
const uint64 seed_
|
||||||
|
) : seed(seed_),hasher1(2*seed),hasher2(2*seed+1) {}
|
||||||
|
|
||||||
|
uint64 get_seed (
|
||||||
|
) const { return seed; }
|
||||||
|
|
||||||
|
typedef std::pair<uint64,uint64> hash128_type;
|
||||||
|
typedef std::pair<hash128_type,hash128_type> result_type;
|
||||||
|
|
||||||
|
template <
|
||||||
|
typename vector_type
|
||||||
|
>
|
||||||
|
result_type operator() (
|
||||||
|
const vector_type& v
|
||||||
|
) const
|
||||||
|
{
|
||||||
|
return std::make_pair(hasher1(v), hasher2(v));
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int distance (
|
||||||
|
const result_type& a,
|
||||||
|
const result_type& b
|
||||||
|
) const
|
||||||
|
{
|
||||||
|
return hasher1.distance(a.first,b.first) +
|
||||||
|
hasher1.distance(a.second,b.second);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
const uint64 seed;
|
||||||
|
hash_similar_angles_128 hasher1;
|
||||||
|
hash_similar_angles_128 hasher2;
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class hash_similar_angles_512
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
hash_similar_angles_512 (
|
||||||
|
) : seed(0), hasher1(0), hasher2(1) {}
|
||||||
|
|
||||||
|
hash_similar_angles_512 (
|
||||||
|
const uint64 seed_
|
||||||
|
) : seed(seed_),hasher1(2*seed),hasher2(2*seed+1) {}
|
||||||
|
|
||||||
|
uint64 get_seed (
|
||||||
|
) const { return seed; }
|
||||||
|
|
||||||
|
|
||||||
|
typedef hash_similar_angles_256::result_type hash256_type;
|
||||||
|
typedef std::pair<hash256_type,hash256_type> result_type;
|
||||||
|
|
||||||
|
template <
|
||||||
|
typename vector_type
|
||||||
|
>
|
||||||
|
result_type operator() (
|
||||||
|
const vector_type& v
|
||||||
|
) const
|
||||||
|
{
|
||||||
|
return std::make_pair(hasher1(v), hasher2(v));
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int distance (
|
||||||
|
const result_type& a,
|
||||||
|
const result_type& b
|
||||||
|
) const
|
||||||
|
{
|
||||||
|
return hasher1.distance(a.first,b.first) +
|
||||||
|
hasher1.distance(a.second,b.second);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
const uint64 seed;
|
||||||
|
hash_similar_angles_256 hasher1;
|
||||||
|
hash_similar_angles_256 hasher2;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // DLIB_LSH_HAShES_H__
|
||||||
|
|
286
dlib/lsh/hashes_abstract.h
Normal file
286
dlib/lsh/hashes_abstract.h
Normal file
@ -0,0 +1,286 @@
|
|||||||
|
// Copyright (C) 2013 Davis E. King (davis@dlib.net)
|
||||||
|
// License: Boost Software License See LICENSE.txt for the full license.
|
||||||
|
#undef DLIB_LSH_HAShES_ABSTRACT_H__
|
||||||
|
#ifdef DLIB_LSH_HAShES_ABSTRACT_H__
|
||||||
|
|
||||||
|
#include "../matrix.h"
|
||||||
|
|
||||||
|
namespace dlib
|
||||||
|
{
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class hash_similar_angles_64
|
||||||
|
{
|
||||||
|
/*!
|
||||||
|
WHAT THIS OBJECT REPRESENTS
|
||||||
|
This object is a tool for computing locality sensitive hashes that give
|
||||||
|
vectors with similar angles between each other similar hash values. In
|
||||||
|
particular, this object creates 64 random planes which pass though the
|
||||||
|
origin and uses them to create a 64bit hash. To compute the hash for a new
|
||||||
|
vector, this object checks which side of each plane the vector falls on and
|
||||||
|
records this information into a 64bit integer.
|
||||||
|
!*/
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
hash_similar_angles_64 (
|
||||||
|
);
|
||||||
|
/*!
|
||||||
|
ensures
|
||||||
|
- #get_seed() == 0
|
||||||
|
!*/
|
||||||
|
|
||||||
|
hash_similar_angles_64 (
|
||||||
|
const uint64 seed
|
||||||
|
);
|
||||||
|
/*!
|
||||||
|
ensures
|
||||||
|
- #get_seed() == seed
|
||||||
|
!*/
|
||||||
|
|
||||||
|
uint64 get_seed (
|
||||||
|
) const;
|
||||||
|
/*!
|
||||||
|
ensures
|
||||||
|
- returns the random seed used to generate the random planes used for
|
||||||
|
hashing.
|
||||||
|
!*/
|
||||||
|
|
||||||
|
typedef uint64 result_type;
|
||||||
|
|
||||||
|
template <typename vector_type>
|
||||||
|
result_type perator() (
|
||||||
|
const vector_type& v
|
||||||
|
) const;
|
||||||
|
/*!
|
||||||
|
requires
|
||||||
|
- v is an unsorted sparse vector or a dlib matrix representing either a
|
||||||
|
column or row vector.
|
||||||
|
ensures
|
||||||
|
- returns a 64 bit hash of the input vector v. The bits in the hash record
|
||||||
|
which side of each random plane v falls on.
|
||||||
|
|
||||||
|
!*/
|
||||||
|
|
||||||
|
unsigned int distance (
|
||||||
|
const result_type& a,
|
||||||
|
const result_type& b
|
||||||
|
) const;
|
||||||
|
/*!
|
||||||
|
ensures
|
||||||
|
- returns the Hamming distance between the two hashes given to this
|
||||||
|
function. That is, we return the number of bits in a and b which differ.
|
||||||
|
!*/
|
||||||
|
};
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
struct hash_similar_angles_128
|
||||||
|
{
|
||||||
|
/*!
|
||||||
|
WHAT THIS OBJECT REPRESENTS
|
||||||
|
This object is a tool for computing locality sensitive hashes that give
|
||||||
|
vectors with similar angles between each other similar hash values. In
|
||||||
|
particular, this object creates 128 random planes which pass though the
|
||||||
|
origin and uses them to create a 128bit hash. To compute the hash for a new
|
||||||
|
vector, this object checks which side of each plane the vector falls on and
|
||||||
|
records this information into a 128bit integer.
|
||||||
|
!*/
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
hash_similar_angles_128 (
|
||||||
|
);
|
||||||
|
/*!
|
||||||
|
ensures
|
||||||
|
- #get_seed() == 0
|
||||||
|
!*/
|
||||||
|
|
||||||
|
hash_similar_angles_128 (
|
||||||
|
const uint64 seed
|
||||||
|
);
|
||||||
|
/*!
|
||||||
|
ensures
|
||||||
|
- #get_seed() == seed
|
||||||
|
!*/
|
||||||
|
|
||||||
|
uint64 get_seed (
|
||||||
|
) const;
|
||||||
|
/*!
|
||||||
|
ensures
|
||||||
|
- returns the random seed used to generate the random planes used for
|
||||||
|
hashing.
|
||||||
|
!*/
|
||||||
|
|
||||||
|
typedef std::pair<uint64,uint64> result_type;
|
||||||
|
|
||||||
|
template <typename vector_type>
|
||||||
|
result_type perator() (
|
||||||
|
const vector_type& v
|
||||||
|
) const;
|
||||||
|
/*!
|
||||||
|
requires
|
||||||
|
- v is an unsorted sparse vector or a dlib matrix representing either a
|
||||||
|
column or row vector.
|
||||||
|
ensures
|
||||||
|
- returns a 128 bit hash of the input vector v. The bits in the hash record
|
||||||
|
which side of each random plane v falls on.
|
||||||
|
|
||||||
|
!*/
|
||||||
|
|
||||||
|
unsigned int distance (
|
||||||
|
const result_type& a,
|
||||||
|
const result_type& b
|
||||||
|
) const;
|
||||||
|
/*!
|
||||||
|
ensures
|
||||||
|
- returns the Hamming distance between the two hashes given to this
|
||||||
|
function. That is, we return the number of bits in a and b which differ.
|
||||||
|
!*/
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
struct hash_similar_angles_256
|
||||||
|
{
|
||||||
|
/*!
|
||||||
|
WHAT THIS OBJECT REPRESENTS
|
||||||
|
This object is a tool for computing locality sensitive hashes that give
|
||||||
|
vectors with similar angles between each other similar hash values. In
|
||||||
|
particular, this object creates 256 random planes which pass though the
|
||||||
|
origin and uses them to create a 256bit hash. To compute the hash for a new
|
||||||
|
vector, this object checks which side of each plane the vector falls on and
|
||||||
|
records this information into a 256bit integer.
|
||||||
|
!*/
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
hash_similar_angles_256 (
|
||||||
|
);
|
||||||
|
/*!
|
||||||
|
ensures
|
||||||
|
- #get_seed() == 0
|
||||||
|
!*/
|
||||||
|
|
||||||
|
hash_similar_angles_256 (
|
||||||
|
const uint64 seed
|
||||||
|
);
|
||||||
|
/*!
|
||||||
|
ensures
|
||||||
|
- #get_seed() == seed
|
||||||
|
!*/
|
||||||
|
|
||||||
|
uint64 get_seed (
|
||||||
|
) const;
|
||||||
|
/*!
|
||||||
|
ensures
|
||||||
|
- returns the random seed used to generate the random planes used for
|
||||||
|
hashing.
|
||||||
|
!*/
|
||||||
|
|
||||||
|
typedef std::pair<uint64,uint64> hash128_type;
|
||||||
|
typedef std::pair<hash128_type,hash128_type> result_type;
|
||||||
|
|
||||||
|
template <typename vector_type>
|
||||||
|
result_type perator() (
|
||||||
|
const vector_type& v
|
||||||
|
) const;
|
||||||
|
/*!
|
||||||
|
requires
|
||||||
|
- v is an unsorted sparse vector or a dlib matrix representing either a
|
||||||
|
column or row vector.
|
||||||
|
ensures
|
||||||
|
- returns a 256 bit hash of the input vector v. The bits in the hash record
|
||||||
|
which side of each random plane v falls on.
|
||||||
|
|
||||||
|
!*/
|
||||||
|
|
||||||
|
unsigned int distance (
|
||||||
|
const result_type& a,
|
||||||
|
const result_type& b
|
||||||
|
) const;
|
||||||
|
/*!
|
||||||
|
ensures
|
||||||
|
- returns the Hamming distance between the two hashes given to this
|
||||||
|
function. That is, we return the number of bits in a and b which differ.
|
||||||
|
!*/
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
struct hash_similar_angles_512
|
||||||
|
{
|
||||||
|
/*!
|
||||||
|
WHAT THIS OBJECT REPRESENTS
|
||||||
|
This object is a tool for computing locality sensitive hashes that give
|
||||||
|
vectors with similar angles between each other similar hash values. In
|
||||||
|
particular, this object creates 512 random planes which pass though the
|
||||||
|
origin and uses them to create a 512bit hash. To compute the hash for a new
|
||||||
|
vector, this object checks which side of each plane the vector falls on and
|
||||||
|
records this information into a 512bit integer.
|
||||||
|
!*/
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
hash_similar_angles_512 (
|
||||||
|
);
|
||||||
|
/*!
|
||||||
|
ensures
|
||||||
|
- #get_seed() == 0
|
||||||
|
!*/
|
||||||
|
|
||||||
|
hash_similar_angles_512 (
|
||||||
|
const uint64 seed
|
||||||
|
);
|
||||||
|
/*!
|
||||||
|
ensures
|
||||||
|
- #get_seed() == seed
|
||||||
|
!*/
|
||||||
|
|
||||||
|
uint64 get_seed (
|
||||||
|
) const;
|
||||||
|
/*!
|
||||||
|
ensures
|
||||||
|
- returns the random seed used to generate the random planes used for
|
||||||
|
hashing.
|
||||||
|
!*/
|
||||||
|
|
||||||
|
typedef hash_similar_angles_256::result_type hash256_type;
|
||||||
|
typedef std::pair<hash256_type,hash256_type> result_type;
|
||||||
|
|
||||||
|
template <typename vector_type>
|
||||||
|
result_type perator() (
|
||||||
|
const vector_type& v
|
||||||
|
) const;
|
||||||
|
/*!
|
||||||
|
requires
|
||||||
|
- v is an unsorted sparse vector or a dlib matrix representing either a
|
||||||
|
column or row vector.
|
||||||
|
ensures
|
||||||
|
- returns a 512 bit hash of the input vector v. The bits in the hash record
|
||||||
|
which side of each random plane v falls on.
|
||||||
|
|
||||||
|
!*/
|
||||||
|
|
||||||
|
unsigned int distance (
|
||||||
|
const result_type& a,
|
||||||
|
const result_type& b
|
||||||
|
) const;
|
||||||
|
/*!
|
||||||
|
ensures
|
||||||
|
- returns the Hamming distance between the two hashes given to this
|
||||||
|
function. That is, we return the number of bits in a and b which differ.
|
||||||
|
!*/
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // DLIB_LSH_HAShES_ABSTRACT_H__
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user