[ VIGRA Homepage | Function Index | Class Index | Namespaces | File List | Main Page ]

sampling.hxx VIGRA

1 /************************************************************************/
2 /* */
3 /* Copyright 2008-2009 by Ullrich Koethe and Rahul Nair */
4 /* */
5 /* This file is part of the VIGRA computer vision library. */
6 /* The VIGRA Website is */
7 /* http://hci.iwr.uni-heidelberg.de/vigra/ */
8 /* Please direct questions, bug reports, and contributions to */
9 /* ullrich.koethe@iwr.uni-heidelberg.de or */
10 /* vigra@informatik.uni-hamburg.de */
11 /* */
12 /* Permission is hereby granted, free of charge, to any person */
13 /* obtaining a copy of this software and associated documentation */
14 /* files (the "Software"), to deal in the Software without */
15 /* restriction, including without limitation the rights to use, */
16 /* copy, modify, merge, publish, distribute, sublicense, and/or */
17 /* sell copies of the Software, and to permit persons to whom the */
18 /* Software is furnished to do so, subject to the following */
19 /* conditions: */
20 /* */
21 /* The above copyright notice and this permission notice shall be */
22 /* included in all copies or substantial portions of the */
23 /* Software. */
24 /* */
25 /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND */
26 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES */
27 /* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */
28 /* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT */
29 /* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, */
30 /* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING */
31 /* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR */
32 /* OTHER DEALINGS IN THE SOFTWARE. */
33 /* */
34 /************************************************************************/
35 
36 #ifndef VIGRA_SAMPLING_HXX
37 #define VIGRA_SAMPLING_HXX
38 
39 #include "array_vector.hxx"
40 #include "random.hxx"
41 #include <map>
42 #include <memory>
43 #include <cmath>
44 
45 namespace vigra
46 {
47 
48 /**\brief Options object for the Sampler class.
49 
50  \ingroup MachineLearning
51 
52  <b>Usage:</b>
53 
54  \code
55  SamplerOptions opt = SamplerOptions()
56  .withReplacement()
57  .sampleProportion(0.5);
58  \endcode
59 
60  Note that the return value of all methods is <tt>*this</tt> which makes
61  concatenating of options as above possible.
62 */
64 {
65  public:
66 
67  double sample_proportion;
68  unsigned int sample_size;
69  bool sample_with_replacement;
70  bool stratified_sampling;
71 
73  : sample_proportion(1.0),
74  sample_size(0),
75  sample_with_replacement(true),
76  stratified_sampling(false)
77  {}
78 
79  /**\brief Sample from training population with replacement.
80  *
81  * <br> Default: true
82  */
84  {
85  sample_with_replacement = in;
86  return *this;
87  }
88 
89  /**\brief Sample from training population without replacement.
90  *
91  * <br> Default (if you don't call this function): false
92  */
94  {
95  sample_with_replacement = !in;
96  return *this;
97  }
98 
99  /**\brief Draw the given number of samples.
100  * If stratifiedSampling is true, the \a size is equally distributed
101  * across all strata (e.g. <tt>size / strataCount</tt> samples are taken
102  * from each stratum, subject to rounding).
103  *
104  * <br> Default: 0 (i.e. determine the count by means of sampleProportion())
105  */
106  SamplerOptions& sampleSize(unsigned int size)
107  {
108  sample_size = size;
109  return *this;
110  }
111 
112 
113  /**\brief Determine the number of samples to draw as a proportion of the total
114  * number. That is, we draw <tt>count = totalCount * proportion</tt> samples.
115  * This option is overridden when an absolute count is specified by sampleSize().
116  *
117  * If stratifiedSampling is true, the count is equally distributed
118  * across all strata (e.g. <tt>totalCount * proportion / strataCount</tt> samples are taken
119  * from each stratum).
120  *
121  * <br> Default: 1.0
122  */
123  SamplerOptions& sampleProportion(double proportion)
124  {
125  vigra_precondition(proportion >= 0.0,
126  "SamplerOptions::sampleProportion(): argument must not be negative.");
127  sample_proportion = proportion;
128  return *this;
129  }
130 
131  /**\brief Draw equally many samples from each "stratum".
132  * A stratum is a group of like entities, e.g. pixels belonging
133  * to the same object class. This is useful to create balanced samples
134  * when the class probabilities are very unbalanced (e.g.
135  * when there are many background and few foreground pixels).
136  * Stratified sampling thus avoids that a trained classifier is biased
137  * towards the majority class.
138  *
139  * <br> Default (if you don't call this function): false
140  */
141  SamplerOptions& stratified(bool in = true)
142  {
143  stratified_sampling = in;
144  return *this;
145  }
146 };
147 
148 /************************************************************/
149 /* */
150 /* Sampler */
151 /* */
152 /************************************************************/
153 
154 /** \brief Create random samples from a sequence of indices.
155 
156  \ingroup MachineLearning
157 
158  Selecting data items at random is a basic task of machine learning,
159  for example in boostrapping, RandomForest training, and cross validation.
160  This class implements various ways to select random samples via their indices.
161  Indices are assumed to be consecutive in
162  the range <tt>0 &lt;= index &lt; total_sample_count</tt>.
163 
164  The class always contains a current sample which can be accessed by
165  the index operator or by the function sampledIndices(). The indices
166  that are not in the current sample (out-of-bag indices) can be accessed
167  via the function oobIndices().
168 
169  The sampling method (with/without replacement, stratified or not) and the
170  number of samples to draw are determined by the option object
171  SamplerOptions.
172 
173  <b>Usage:</b>
174 
175  <b>\#include</b> <vigra/sampling.hxx><br>
176  Namespace: vigra
177 
178  Create a Sampler with default options, i.e. sample as many indices as there
179  are data elements, with replacement. On average, the sample will contain
180  <tt>0.63*totalCount</tt> distinct indices.
181 
182  \code
183  int totalCount = 10000; // total number of data elements
184  int numberOfSamples = 20; // repeat experiment 20 times
185  Sampler<> sampler(totalCount);
186  for(int k=0; k<numberOfSamples; ++k)
187  {
188  // process current sample
189  for(int i=0; i<sampler.sampleSize(); ++i)
190  {
191  int currentIndex = sampler[i];
192  processData(data[currentIndex]);
193  }
194  // create next sample
195  sampler.sample();
196  }
197  \endcode
198 
199  Create a Sampler for stratified sampling, without replacement.
200 
201  \code
202  // prepare the strata (i.e. specify which stratum each element belongs to)
203  int stratumSize1 = 2000, stratumSize2 = 8000,
204  totalCount = stratumSize1 + stratumSize2;
205  ArrayVerctor<int> strata(totalCount);
206  for(int i=0; i<stratumSize1; ++i)
207  strata[i] = 1;
208  for(int i=stratumSize1; i<stratumSize2; ++i)
209  strata[i] = 2;
210 
211  int sampleSize = 200; // i.e. sample 100 elements from each of the two strata
212  int numberOfSamples = 20; // repeat experiment 20 times
213  Sampler<> stratifiedSampler(strata.begin(), strata.end(),
214  SamplerOptions().withoutReplacement().stratified().sampleSize(sampleSize));
215  // create first sample
216  sampler.sample();
217 
218  for(int k=0; k<numberOfSamples; ++k)
219  {
220  // process current sample
221  for(int i=0; i<sampler.sampleSize(); ++i)
222  {
223  int currentIndex = sampler[i];
224  processData(data[currentIndex]);
225  }
226  // create next sample
227  sampler.sample();
228  }
229  \endcode
230 */
231 template<class Random = MersenneTwister >
232 class Sampler
233 {
234  public:
235  /** Internal type of the indices.
236  Currently, 64-bit indices are not supported because this
237  requires extension of the random number generator classes.
238  */
239  typedef Int32 IndexType;
240 
242 
243  /** Type of the array view object that is returned by
244  sampledIndices() and oobIndices().
245  */
247 
248  private:
249  typedef std::map<IndexType, IndexArrayType> StrataIndicesType;
250  typedef std::map<IndexType, int> StrataSizesType;
253 
254  static const int oobInvalid = -1;
255 
256  int total_count_, sample_size_;
257  mutable int current_oob_count_;
258  StrataIndicesType strata_indices_;
259  StrataSizesType strata_sample_size_;
260  IndexArrayType current_sample_;
261  mutable IndexArrayType current_oob_sample_;
262  IsUsedArrayType is_used_;
263  Random default_random_;
264  Random const & random_;
265  SamplerOptions options_;
266 
267  void initStrataCount()
268  {
269  // compute how many samples to take from each stratum
270  // (may be unequal if sample_size_ is not a multiple of strataCount())
271  int strata_sample_size = static_cast<int>(std::ceil(double(sample_size_) / strataCount()));
272  int strata_total_count = strata_sample_size * strataCount();
273 
274  for(StrataIndicesType::iterator i = strata_indices_.begin();
275  i != strata_indices_.end(); ++i)
276  {
277  if(strata_total_count > sample_size_)
278  {
279  strata_sample_size_[i->first] = strata_sample_size - 1;
280  --strata_total_count;
281  }
282  else
283  {
284  strata_sample_size_[i->first] = strata_sample_size;
285  }
286  }
287  }
288 
289  public:
290 
291  /** Create a sampler for \a totalCount data objects.
292 
293  In each invocation of <tt>sample()</tt> below, it will sample
294  indices according to the options passed. If no options are given,
295  <tt>totalCount</tt> indices will be drawn with replacement.
296  */
298  Random const * rnd = 0)
299  : total_count_(totalCount),
300  sample_size_(opt.sample_size == 0
301  ? static_cast<int>((std::ceil(total_count_ * opt.sample_proportion)))
302  : opt.sample_size),
303  current_oob_count_(oobInvalid),
304  current_sample_(sample_size_),
305  current_oob_sample_(total_count_),
306  is_used_(total_count_),
307  default_random_(RandomSeed),
308  random_(rnd ? *rnd : default_random_),
309  options_(opt)
310  {
311  vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_,
312  "Sampler(): Cannot draw without replacement when data size is smaller than sample count.");
313 
314  vigra_precondition(!opt.stratified_sampling,
315  "Sampler(): Stratified sampling requested, but no strata given.");
316 
317  // initialize a single stratum containing all data
318  strata_indices_[0].resize(total_count_);
319  for(int i=0; i<total_count_; ++i)
320  strata_indices_[0][i] = i;
321 
322  initStrataCount();
323  //this is screwing up the random forest tests.
324  //sample();
325  }
326 
327  /** Create a sampler for stratified sampling.
328 
329  <tt>strataBegin</tt> and <tt>strataEnd</tt> must refer to a sequence
330  which specifies for each sample the stratum it belongs to. The
331  total number of data objects will be set to <tt>strataEnd - strataBegin</tt>.
332  Equally many samples (subject to rounding) will be drawn from each stratum,
333  unless the option object explicitly requests unstratified sampling,
334  in which case the strata are ignored.
335  */
336  template <class Iterator>
337  Sampler(Iterator strataBegin, Iterator strataEnd, SamplerOptions const & opt = SamplerOptions(),
338  Random const * rnd = 0)
339  : total_count_(strataEnd - strataBegin),
340  sample_size_(opt.sample_size == 0
341  ? static_cast<int>((std::ceil(total_count_ * opt.sample_proportion)))
342  : opt.sample_size),
343  current_oob_count_(oobInvalid),
344  current_sample_(sample_size_),
345  current_oob_sample_(total_count_),
346  is_used_(total_count_),
347  default_random_(RandomSeed),
348  random_(rnd ? *rnd : default_random_),
349  options_(opt)
350  {
351  vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_,
352  "Sampler(): Cannot draw without replacement when data size is smaller than sample count.");
353 
354  // copy the strata indices
355  if(opt.stratified_sampling)
356  {
357  for(int i = 0; strataBegin != strataEnd; ++i, ++strataBegin)
358  {
359  strata_indices_[*strataBegin].push_back(i);
360  }
361  }
362  else
363  {
364  strata_indices_[0].resize(total_count_);
365  for(int i=0; i<total_count_; ++i)
366  strata_indices_[0][i] = i;
367  }
368 
369  vigra_precondition(sample_size_ >= static_cast<int>(strata_indices_.size()),
370  "Sampler(): Requested sample count must be at least as large as the number of strata.");
371 
372  initStrataCount();
373  //this is screwing up the random forest tests.
374  //sample();
375  }
376 
377  /** Return the k-th index in the current sample.
378  */
379  IndexType operator[](int k) const
380  {
381  return current_sample_[k];
382  }
383 
384  /** Create a new sample.
385  */
386  void sample();
387 
388  /** The total number of data elements.
389  */
390  int totalCount() const
391  {
392  return total_count_;
393  }
394 
395  /** The number of data elements that have been sampled.
396  */
397  int sampleSize() const
398  {
399  return sample_size_;
400  }
401 
402  /** Same as sampleSize().
403  */
404  int size() const
405  {
406  return sample_size_;
407  }
408 
409  /** The number of strata to be used.
410  Will be 1 if no strata are given. Will be ignored when
411  stratifiedSampling() is false.
412  */
413  int strataCount() const
414  {
415  return strata_indices_.size();
416  }
417 
418  /** Whether to use stratified sampling.
419  (If this is false, strata will be ignored even if present.)
420  */
421  bool stratifiedSampling() const
422  {
423  return options_.stratified_sampling;
424  }
425 
426  /** Whether sampling should be performed with replacement.
427  */
428  bool withReplacement() const
429  {
430  return options_.sample_with_replacement;
431  }
432 
433  /** Return an array view containing the indices in the current sample.
434  */
436  {
437  return current_sample_;
438  }
439 
440  /** Return an array view containing the out-of-bag indices.
441  (i.e. the indices that are not in the current sample)
442  */
444  {
445  if(current_oob_count_ == oobInvalid)
446  {
447  current_oob_count_ = 0;
448  for(int i = 0; i<total_count_; ++i)
449  {
450  if(!is_used_[i])
451  {
452  current_oob_sample_[current_oob_count_] = i;
453  ++current_oob_count_;
454  }
455  }
456  }
457  return current_oob_sample_.subarray(0, current_oob_count_);
458  }
459  IsUsedArrayType const & is_used() const
460  {
461  return is_used_;
462  }
463 };
464 
465 
466 template<class Random>
468 {
469  current_oob_count_ = oobInvalid;
470  is_used_.init(false);
471 
472  if(options_.sample_with_replacement)
473  {
474  //Go thru all strata
475  int j = 0;
476  StrataIndicesType::iterator iter;
477  for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter)
478  {
479  // do sampling with replacement in each strata and copy data.
480  int stratum_size = iter->second.size();
481  for(int i = 0; i < static_cast<int>(strata_sample_size_[iter->first]); ++i, ++j)
482  {
483  current_sample_[j] = iter->second[random_.uniformInt(stratum_size)];
484  is_used_[current_sample_[j]] = true;
485  }
486  }
487  }
488  else
489  {
490  //Go thru all strata
491  int j = 0;
492  StrataIndicesType::iterator iter;
493  for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter)
494  {
495  // do sampling without replacement in each strata and copy data.
496  int stratum_size = iter->second.size();
497  for(int i = 0; i < static_cast<int>(strata_sample_size_[iter->first]); ++i, ++j)
498  {
499  std::swap(iter->second[i], iter->second[i+ random_.uniformInt(stratum_size - i)]);
500  current_sample_[j] = iter->second[i];
501  is_used_[current_sample_[j]] = true;
502  }
503  }
504  }
505 }
506 
507 template<class Random =RandomTT800 >
508 class PoissonSampler
509 {
510 public:
511  Random randfloat;
512  typedef Int32 IndexType;
513  typedef vigra::ArrayVector <IndexType> IndexArrayType;
514  IndexArrayType used_indices_;
515  double lambda;
516  int minIndex;
517  int maxIndex;
518 
519  PoissonSampler(double lambda,IndexType minIndex,IndexType maxIndex)
520  : lambda(lambda),
521  minIndex(minIndex),
522  maxIndex(maxIndex)
523  {}
524 
525  void sample( )
526  {
527  used_indices_.clear();
528  IndexType i;
529  for(i=minIndex;i<maxIndex;++i)
530  {
531  //from http://en.wikipedia.org/wiki/Poisson_distribution
532  int k=0;
533  double p=1;
534  double L=exp(-lambda);
535  do
536  {
537  ++k;
538  p*=randfloat.uniform53();
539 
540  }while(p>L);
541  --k;
542  //Insert i this many time
543  while(k>0)
544  {
545  used_indices_.push_back(i);
546  --k;
547  }
548  }
549  }
550 
551  IndexType const & operator[](int in) const
552  {
553  return used_indices_[in];
554  }
555 
556  int numOfSamples() const
557  {
558  return used_indices_.size();
559  }
560 };
561 
562 } // namespace vigra
563 
564 #endif /*VIGRA_SAMPLING_HXX*/
ArrayVectorView< IndexType > IndexArrayViewType
Definition: sampling.hxx:246
IndexType operator[](int k) const
Definition: sampling.hxx:379
int strataCount() const
Definition: sampling.hxx:413
Sampler(UInt32 totalCount, SamplerOptions const &opt=SamplerOptions(), Random const *rnd=0)
Definition: sampling.hxx:297
Create random samples from a sequence of indices.
Definition: sampling.hxx:232
SamplerOptions & sampleProportion(double proportion)
Determine the number of samples to draw as a proportion of the total number. That is...
Definition: sampling.hxx:123
void sample()
Definition: sampling.hxx:467
linalg::TemporaryMatrix< T > exp(MultiArrayView< 2, T, C > const &v)
bool withReplacement() const
Definition: sampling.hxx:428
Sampler(Iterator strataBegin, Iterator strataEnd, SamplerOptions const &opt=SamplerOptions(), Random const *rnd=0)
Definition: sampling.hxx:337
SamplerOptions & sampleSize(unsigned int size)
Draw the given number of samples. If stratifiedSampling is true, the size is equally distributed acro...
Definition: sampling.hxx:106
int sampleSize() const
Definition: sampling.hxx:397
int size() const
Definition: sampling.hxx:404
bool stratifiedSampling() const
Definition: sampling.hxx:421
detail::SelectIntegerType< 32, detail::SignedIntTypes >::type Int32
32-bit signed int
Definition: sized_int.hxx:175
IndexArrayViewType sampledIndices() const
Definition: sampling.hxx:435
int totalCount() const
Definition: sampling.hxx:390
this_type subarray(size_type begin, size_type end) const
Definition: array_vector.hxx:200
Int32 IndexType
Definition: sampling.hxx:239
SamplerOptions & stratified(bool in=true)
Draw equally many samples from each "stratum". A stratum is a group of like entities, e.g. pixels belonging to the same object class. This is useful to create balanced samples when the class probabilities are very unbalanced (e.g. when there are many background and few foreground pixels). Stratified sampling thus avoids that a trained classifier is biased towards the majority class.
Definition: sampling.hxx:141
SamplerOptions & withReplacement(bool in=true)
Sample from training population with replacement.
Definition: sampling.hxx:83
detail::SelectIntegerType< 32, detail::UnsignedIntTypes >::type UInt32
32-bit unsigned int
Definition: sized_int.hxx:183
int ceil(FixedPoint< IntBits, FracBits > v)
rounding up.
Definition: fixedpoint.hxx:675
Options object for the Sampler class.
Definition: sampling.hxx:63
IndexArrayViewType oobIndices() const
Definition: sampling.hxx:443
SamplerOptions & withoutReplacement(bool in=true)
Sample from training population without replacement.
Definition: sampling.hxx:93

© Ullrich Köthe (ullrich.koethe@iwr.uni-heidelberg.de)
Heidelberg Collaboratory for Image Processing, University of Heidelberg, Germany

html generated using doxygen and Python
vigra 1.11.1 (Fri May 19 2017)