mirror of
https://github.com/davisking/dlib.git
synced 2024-11-01 10:14:53 +08:00
Added the random_subset_selector object.
--HG-- extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403413
This commit is contained in:
parent
c88d8b1262
commit
6cad6741f6
@ -5,6 +5,7 @@
|
||||
|
||||
#include "statistics/statistics.h"
|
||||
#include "statistics/dpca.h"
|
||||
#include "statistics/random_subset_selector.h"
|
||||
|
||||
#endif // DLIB_STATISTICs_H_
|
||||
|
||||
|
173
dlib/statistics/random_subset_selector.h
Normal file
173
dlib/statistics/random_subset_selector.h
Normal file
@ -0,0 +1,173 @@
|
||||
// Copyright (C) 2010 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#ifndef DLIB_RANDOM_SUBSeT_SELECTOR_H_
|
||||
#define DLIB_RANDOM_SUBSeT_SELECTOR_H_
|
||||
|
||||
#include "random_subset_selector_abstract.h"
|
||||
#include "../rand.h"
|
||||
#include <vector>
|
||||
#include "../algs.h"
|
||||
#include "../memory_manager.h"
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
template <
|
||||
typename T,
|
||||
typename Rand_type = dlib::rand::kernel_1a
|
||||
>
|
||||
class random_subset_selector
|
||||
{
|
||||
/*!
|
||||
INITIAL VALUE
|
||||
- _max_size == 0
|
||||
- items.size() == 0
|
||||
- count == 0
|
||||
|
||||
CONVENTION
|
||||
- count == the number of times add() has been called since the last
|
||||
time this object was empty.
|
||||
- items.size() == size()
|
||||
- max_size() == _max_size
|
||||
!*/
|
||||
public:
|
||||
typedef T type;
|
||||
typedef memory_manager<char>::kernel_1a mem_manager_type;
|
||||
typedef Rand_type rand_type;
|
||||
|
||||
typedef typename std::vector<T>::iterator iterator;
|
||||
typedef typename std::vector<T>::const_iterator const_iterator;
|
||||
|
||||
|
||||
random_subset_selector (
|
||||
)
|
||||
{
|
||||
_max_size = 0;
|
||||
make_empty();
|
||||
}
|
||||
|
||||
void set_seed(const std::string& value)
|
||||
{
|
||||
rnd.set_seed(value);
|
||||
}
|
||||
|
||||
void make_empty (
|
||||
)
|
||||
{
|
||||
items.resize(0);
|
||||
count = 0;
|
||||
}
|
||||
|
||||
unsigned long size (
|
||||
) const
|
||||
{
|
||||
return items.size();
|
||||
}
|
||||
|
||||
void set_max_size (
|
||||
unsigned long new_max_size
|
||||
)
|
||||
{
|
||||
items.reserve(new_max_size);
|
||||
make_empty();
|
||||
_max_size = new_max_size;
|
||||
}
|
||||
|
||||
unsigned long max_size (
|
||||
) const
|
||||
{
|
||||
return _max_size;
|
||||
}
|
||||
|
||||
T& operator[] (
|
||||
unsigned long idx
|
||||
)
|
||||
{
|
||||
return items[idx];
|
||||
}
|
||||
|
||||
const T& operator[] (
|
||||
unsigned long idx
|
||||
) const
|
||||
{
|
||||
return items[idx];
|
||||
}
|
||||
|
||||
iterator begin() { return items.begin(); }
|
||||
const_iterator begin() const { return items.begin(); }
|
||||
iterator end() { return items.end(); }
|
||||
const_iterator end() const { return items.end(); }
|
||||
|
||||
void add (
|
||||
const T& new_item
|
||||
)
|
||||
{
|
||||
if (items.size() < _max_size)
|
||||
{
|
||||
items.push_back(new_item);
|
||||
// swap into a random place
|
||||
exchange(items[rnd.get_random_32bit_number()%items.size()], items.back());
|
||||
}
|
||||
else
|
||||
{
|
||||
// At this point each element of items has had an equal chance of being in this object.
|
||||
// In particular, the probability that each arrived here is currently items.size()/count.
|
||||
// We need to be able to say that, after this function ends, the probability of any
|
||||
// particular object ending up in items is items.size()/(count+1). So this means that
|
||||
// we should decide to add new_item into items with this probability. If we do so then
|
||||
// we pick one of the current items and replace it at random with new_item.
|
||||
|
||||
// Make me a random 64 bit number. This might seem excessive but I want this object
|
||||
// to be able to handle an effectively infinite number of calls to add(). So count
|
||||
// might get very large and we need to deal with that properly.
|
||||
const unsigned long num1 = rnd.get_random_32bit_number();
|
||||
const unsigned long num2 = rnd.get_random_32bit_number();
|
||||
uint64 num = num1;
|
||||
num <<= 32;
|
||||
num |= num2;
|
||||
|
||||
num %= (count+1);
|
||||
|
||||
if (num < items.size())
|
||||
{
|
||||
// pick a random element of items and replace it.
|
||||
items[rnd.get_random_32bit_number()%items.size()] = new_item;
|
||||
}
|
||||
}
|
||||
|
||||
++count;
|
||||
}
|
||||
|
||||
void swap (
|
||||
random_subset_selector& a
|
||||
)
|
||||
{
|
||||
a.swap(a.items);
|
||||
std::swap(_max_size, a._max_size);
|
||||
std::swap(count, a.count);
|
||||
rnd.swap(a.rnd);
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
std::vector<T> items;
|
||||
unsigned long _max_size;
|
||||
uint64 count;
|
||||
|
||||
rand_type rnd;
|
||||
|
||||
};
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename rand_type
|
||||
>
|
||||
void swap (
|
||||
random_subset_selector<T,rand_type>& a,
|
||||
random_subset_selector<T,rand_type>& b
|
||||
) { a.swap(b); }
|
||||
|
||||
}
|
||||
|
||||
#endif // DLIB_RANDOM_SUBSeT_SELECTOR_H_
|
||||
|
||||
|
198
dlib/statistics/random_subset_selector_abstract.h
Normal file
198
dlib/statistics/random_subset_selector_abstract.h
Normal file
@ -0,0 +1,198 @@
|
||||
// Copyright (C) 2010 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#undef DLIB_RANDOM_SUBSeT_SELECTOR_ABSTRACT_H_
|
||||
#ifdef DLIB_RANDOM_SUBSeT_SELECTOR_ABSTRACT_H_
|
||||
|
||||
#include <vector>
|
||||
#include "../rand.h"
|
||||
#include "../memory_manager.h"
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
template <
|
||||
typename T,
|
||||
typename Rand_type = dlib::rand::kernel_1a
|
||||
>
|
||||
class random_subset_selector
|
||||
{
|
||||
/*!
|
||||
REQUIREMENTS ON T
|
||||
T must be a copyable type
|
||||
|
||||
REQUIREMENTS ON Rand_type
|
||||
must be an implementation of dlib/rand/rand_kernel_abstract.h
|
||||
|
||||
INITIAL VALUE
|
||||
- size() == 0
|
||||
- max_size() == 0
|
||||
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This object is a tool to help you select a random subset of a large body of data.
|
||||
In particular, it is useful when the body of data is too large to fit into memory.
|
||||
|
||||
So for example, suppose you have 1000000 data samples and you want to select a
|
||||
random subset of size 1000. Then you could do that as follows:
|
||||
|
||||
random_subset_selector<sample_type> rand_subset;
|
||||
rand_subset.set_max_size(1000)
|
||||
for (int i = 0; i < 1000000; ++i)
|
||||
rand_subset.add( get_next_data_sample());
|
||||
|
||||
|
||||
At the end of the for loop you will have your random subset of 1000 samples. And by
|
||||
random I mean that each of the 1000000 data samples has an equal change of ending
|
||||
up in the rand_subset object.
|
||||
|
||||
!*/
|
||||
public:
|
||||
typedef T type;
|
||||
typedef memory_manager<char>::kernel_1a mem_manager_type;
|
||||
typedef Rand_type rand_type;
|
||||
|
||||
typedef typename std::vector<T>::iterator iterator;
|
||||
typedef typename std::vector<T>::const_iterator const_iterator;
|
||||
|
||||
random_subset_selector (
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- this object is properly initialized
|
||||
!*/
|
||||
|
||||
void set_seed(
|
||||
const std::string& value
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- sets the seed of the random number generator that is embedded in
|
||||
this object to the given value.
|
||||
!*/
|
||||
|
||||
void make_empty (
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- #size() == 0
|
||||
!*/
|
||||
|
||||
unsigned long size (
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- returns the number of items of type T currently contained in this object
|
||||
!*/
|
||||
|
||||
void set_max_size (
|
||||
unsigned long new_max_size
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- #max_size() == new_max_size
|
||||
- #size() == 0
|
||||
!*/
|
||||
|
||||
unsigned long max_size (
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- returns the maximum allowable size for this object
|
||||
!*/
|
||||
|
||||
T& operator[] (
|
||||
unsigned long idx
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- idx < size()
|
||||
ensures
|
||||
- returns a non-const reference to the idx'th element of this object
|
||||
!*/
|
||||
|
||||
const T& operator[] (
|
||||
unsigned long idx
|
||||
) const;
|
||||
/*!
|
||||
requires
|
||||
- idx < size()
|
||||
ensures
|
||||
- returns a const reference to the idx'th element of this object
|
||||
!*/
|
||||
|
||||
void add (
|
||||
const T& new_item
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- if (size() < max_size()) then
|
||||
- #size() == size() + 1
|
||||
- places new_item into *this object at a random location
|
||||
- else
|
||||
- randomly does one of the following:
|
||||
- ignores new_item and makes no change
|
||||
- replaces a random element of *this with new_item
|
||||
!*/
|
||||
|
||||
iterator begin(
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- if (size() > 0) then
|
||||
- returns an iterator referring to the first element in
|
||||
this container.
|
||||
- else
|
||||
- returns end()
|
||||
!*/
|
||||
|
||||
const_iterator begin(
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- if (size() > 0) then
|
||||
- returns a const_iterator referring to the first element in
|
||||
this container.
|
||||
- else
|
||||
- returns end()
|
||||
!*/
|
||||
|
||||
iterator end(
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- returns an iterator that represents one past the end of
|
||||
this container
|
||||
!*/
|
||||
|
||||
const_iterator end(
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- returns an iterator that represents one past the end of
|
||||
this container
|
||||
!*/
|
||||
|
||||
void swap (
|
||||
random_subset_selector& item
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- swaps *this and item
|
||||
!*/
|
||||
|
||||
};
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename rand_type
|
||||
>
|
||||
void swap (
|
||||
random_subset_selector<T,rand_type>& a,
|
||||
random_subset_selector<T,rand_type>& b
|
||||
) { a.swap(b); }
|
||||
/*!
|
||||
provides global swap support
|
||||
!*/
|
||||
|
||||
}
|
||||
|
||||
#endif // DLIB_RANDOM_SUBSeT_SELECTOR_ABSTRACT_H_
|
||||
|
@ -74,6 +74,7 @@ set (tests
|
||||
stack.cpp
|
||||
static_map.cpp
|
||||
static_set.cpp
|
||||
statistics.cpp
|
||||
std_vector_c.cpp
|
||||
string.cpp
|
||||
svm.cpp
|
||||
|
@ -84,6 +84,7 @@ SRC += sockstreambuf.cpp
|
||||
SRC += stack.cpp
|
||||
SRC += static_map.cpp
|
||||
SRC += static_set.cpp
|
||||
SRC += statistics.cpp
|
||||
SRC += std_vector_c.cpp
|
||||
SRC += string.cpp
|
||||
SRC += svm.cpp
|
||||
|
72
dlib/test/statistics.cpp
Normal file
72
dlib/test/statistics.cpp
Normal file
@ -0,0 +1,72 @@
|
||||
// Copyright (C) 2010 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
|
||||
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <cstdlib>
|
||||
#include <ctime>
|
||||
#include <dlib/statistics.h>
|
||||
|
||||
#include "tester.h"
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
using namespace test;
|
||||
using namespace dlib;
|
||||
using namespace std;
|
||||
|
||||
logger dlog("test.statistics");
|
||||
|
||||
|
||||
|
||||
class statistics_tester : public tester
|
||||
{
|
||||
public:
|
||||
statistics_tester (
|
||||
) :
|
||||
tester ("test_statistics",
|
||||
"Runs tests on the statistics component.")
|
||||
{}
|
||||
|
||||
void test_random_subset_selector ()
|
||||
{
|
||||
random_subset_selector<double> rand_set;
|
||||
|
||||
for (int j = 0; j < 30; ++j)
|
||||
{
|
||||
print_spinner();
|
||||
|
||||
running_stats<double> rs, rs2;
|
||||
|
||||
rand_set.set_max_size(1000);
|
||||
|
||||
for (double i = 0; i < 100000; ++i)
|
||||
{
|
||||
rs.add(i);
|
||||
rand_set.add(i);
|
||||
}
|
||||
|
||||
|
||||
for (unsigned long i = 0; i < rand_set.size(); ++i)
|
||||
rs2.add(rand_set[i]);
|
||||
|
||||
|
||||
dlog << LDEBUG << "true mean: " << rs.mean();
|
||||
dlog << LDEBUG << "true sampled: " << rs2.mean();
|
||||
double ratio = rs.mean()/rs2.mean();
|
||||
DLIB_TEST_MSG(0.96 < ratio && ratio < 1.04, " ratio: " << ratio);
|
||||
}
|
||||
}
|
||||
|
||||
void perform_test (
|
||||
)
|
||||
{
|
||||
test_random_subset_selector();
|
||||
}
|
||||
} a;
|
||||
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user