Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 124 additions & 45 deletions thrill/api/bernoulli_sample.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
*
* Part of Project Thrill - http://project-thrill.org
*
* Copyright (C) 2016 Lorenz Hübschle-Schneider <[email protected]>
* Copyright (C) 2016-2017 Lorenz Hübschle-Schneider <[email protected]>
*
* All rights reserved. Published under the BSD-2 license in the LICENSE file.
******************************************************************************/
Expand All @@ -13,9 +13,12 @@
#define THRILL_API_BERNOULLI_SAMPLE_HEADER

#include <thrill/api/dia.hpp>
#include <thrill/api/dop_node.hpp>
#include <thrill/common/functional.hpp>
#include <thrill/data/serialization.hpp>

#include <random>
#include <vector>

namespace thrill {
namespace api {
Expand All @@ -24,85 +27,161 @@ namespace api {
* \ingroup api_layer
*/
template <typename ValueType>
class BernoulliSampleNode
class BernoulliSampleNode final : public DOpNode<ValueType>
{
static const bool debug = false;

using SkipDistValueType = int;

using Super = DOpNode<ValueType>;
using Super::context_;
public:
explicit BernoulliSampleNode(double p)
: p_(p), use_skip_(p < 0.1) { // use skip values if p < 0.1
template <typename ParentDIA>
BernoulliSampleNode(const ParentDIA& parent, double p)
: Super(parent.ctx(), "BernoulliSample", { parent.id() }, {parent.node() }),
engine_(std::random_device { } ()),
skip_dist_(p),
p_(p)
{
assert(p >= 0.0 && p <= 1.0);

if (use_skip_) {
skip_dist_ = std::geometric_distribution<SkipDistValueType>(p);
skip_remaining_ = skip_dist_(engine_);
skip_remaining_ = skip_dist_(engine_);
LOG << "Skip value initialised with " << skip_remaining_ << ", p=" << p;

// Hook PreOp(s)
auto pre_op_fn = [this](const ValueType& input) {
PreOp(input);
};

auto lop_chain = parent.stack().push(pre_op_fn).fold();
parent.node()->AddChild(this, lop_chain);
}

LOG << "Skip value initialised with " << skip_remaining_;
//! Returns samples_.size() * sizeof(ValueType), i.e. the bytes taken up by the
//! elements currently stored in samples_.
//! NOTE(review): samples_ is only filled by PreOp/OnPreOpFile, so this yields
//! 0 whenever it is queried before any item has been sampled -- confirm that
//! the caller does not expect an upfront estimate here.
DIAMemUse PreOpMemUse() final {
return samples_.size() * sizeof(ValueType);
}

void PreOp (const ValueType& item) {
// use geometric distribution and skip values
if (skip_remaining_ == 0) {
// sample element
LOG << "sampled item " << item;
samples_.push_back(item);
skip_remaining_ = skip_dist_(engine_);
}
else {
simple_dist_ = std::bernoulli_distribution(p);
--skip_remaining_;
}
}

template <typename Emitter>
inline void operator () (const ValueType& item, Emitter&& emit) {
if (use_skip_) {
// use geometric distribution and skip values
if (skip_remaining_ == 0) {
// sample element
LOG << "sampled item " << item;
emit(item);
//! Receive a whole data::File of ValueType, but only if our stack is empty.
bool OnPreOpFile(const data::File& file, size_t /* parent_index */) final {
LOG << "Sampling file of size " << file.num_items() << " with p=" << p_;

if (p_ == 0.0) { return true; }

auto reader = file.GetKeepReader();
const size_t file_size = file.num_items();
size_t pos = 0;

if (p_ == 1.0) {
// degenerate case, sample entire file
LOG << "Degenerate: p = 1";
samples_.reserve(file_size);
for (; pos < file_size; ++pos) {
assert(reader.HasNext());
samples_.push_back(reader.Next<ValueType>());
}
return true;
}

// Use reader.Skip if its data has a constant size
if (data::Serialization<decltype(reader), ValueType>::is_fixed_size) {
LOG << "Sampler using FAST path (reader.Skip)";
// fetch a Block to get typecode_verify flag
reader.HasNext();
const size_t bytes_per_item =
(reader.typecode_verify() ? sizeof(size_t) : 0)
+ data::Serialization<decltype(reader), ValueType>::fixed_size;

pos = skip_remaining_;
while (pos < file_size) {
reader.Skip(skip_remaining_, skip_remaining_ * bytes_per_item);
assert(reader.HasNext());
samples_.push_back(reader.Next<ValueType>());
skip_remaining_ = skip_dist_(engine_);
pos += skip_remaining_ + 1; // +1 because we just read one item
}
else {
--skip_remaining_;

} else {
LOG << "Sampler using SLOW path (advance reader one-by-one)";
assert(skip_remaining_ >= 0);
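// Skip items at the beginning of the file.
// NOTE(review): if skip_remaining_ >= file_size this loop stops at
// pos == file_size, and the later `skip_remaining_ = pos - file_size`
// then yields 0 instead of the skip that should carry over to the next
// file -- verify.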
for( ; pos < static_cast<size_t>(skip_remaining_) &&
pos < file_size; ++pos) {
reader.Next<ValueType>();
}
}
else {
// use bernoulli distribution
if (simple_dist_(engine_)) {
LOG << "sampled item " << item;
emit(item);
while (pos < file_size) {
assert(reader.HasNext());
samples_.push_back(reader.Next<ValueType>());
pos++;
auto next = pos + skip_dist_(engine_);
if (next >= file_size) {
LOG << "Aborting: next=" << next << " pos=" << pos
<< " file_size=" << file_size;
pos = next; // ensure skip_remaining_ is set correctly below
break;
}
for (; pos < next; ++pos) {
assert(reader.HasNext());
reader.Next<ValueType>();
}
}
skip_remaining_ = pos - file_size;
}
return true;
}

bool use_skip() const {
return use_skip_;
//! Logs how many elements are currently held in samples_. No further work
//! happens here: samples_ is filled earlier, in PreOp / OnPreOpFile.
void Execute() final {
LOG << "Sampled " << samples_.size() << " elements!";
// nothing to do
}

void PushData(bool consume) final {
for (const ValueType& v : samples_) {
this->PushItem(v);
}
if (consume) {
std::vector<ValueType>().swap(samples_);
}
}

//! Drops all stored samples by swapping samples_ with an empty vector, which
//! is destroyed (together with the old contents) on return.
void Dispose() final {
    std::vector<ValueType> released;
    samples_.swap(released);
}

private:
// Sampling rate
const double p_;
// Whether to generate skip values with a geometric distribution or to use
// the naive method
const bool use_skip_;
std::vector<ValueType> samples_;
// Random generator
std::default_random_engine engine_ { std::random_device { } () };
std::bernoulli_distribution simple_dist_;
std::default_random_engine engine_;
// Skip Value Distribution
std::geometric_distribution<SkipDistValueType> skip_dist_;
// Sampling rate
const double p_;
// Remaining skip value
SkipDistValueType skip_remaining_ = -1;
};

template <typename ValueType, typename Stack>
auto DIA<ValueType, Stack>::BernoulliSample(const double p) const {
assert(IsValid());

size_t new_id = context().next_dia_id();
using BernoulliSampleNode = api::BernoulliSampleNode<ValueType>;

node_->context().logger_
<< "id" << new_id
<< "label" << "BernoulliSample"
<< "class" << "DIA"
<< "event" << "create"
<< "type" << "LOp"
<< "parents" << (common::Array<size_t>{ id_ });
auto node = common::MakeCounting<BernoulliSampleNode>(
*this, p);

auto new_stack = stack_.push(BernoulliSampleNode<ValueType>(p));
return DIA<ValueType, decltype(new_stack)>(
node_, new_stack, new_id, "BernoulliSample");
return DIA<ValueType>(node);
}

} // namespace api
Expand Down
22 changes: 17 additions & 5 deletions thrill/api/sample.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class SampleNode final : public DOpNode<ValueType>
template <typename ParentDIA>
SampleNode(const ParentDIA& parent, size_t sample_size)
: Super(parent.ctx(), "Sample", { parent.id() }, { parent.node() }),
sample_size_(sample_size)
sample_size_(sample_size), count_(0)
{
samples_.reserve(sample_size);

Expand All @@ -54,12 +54,17 @@ class SampleNode final : public DOpNode<ValueType>
return sample_size_ * sizeof(ValueType);
}

// This implements J. Vitter's Algorithm R for reservoir sampling
void PreOp(const ValueType& input) {
++count_;
if (samples_.size() < sample_size_) {
samples_.emplace_back(input);
samples_.push_back(input);
}
else {
samples_[rng_() % samples_.size()] = input;
size_t pos = rng_() % count_;
if (pos < sample_size_) {
samples_[pos] = input;
}
}
}

Expand All @@ -81,11 +86,14 @@ class SampleNode final : public DOpNode<ValueType>
// globally select random samples among samples_
typename std::vector<ValueType>::iterator it = samples_.begin();

// XXX: This does not produce a random sample! Sampling the local samples
// is not a correct sampling algorithm. This needs to use hypergeometric
// deviates to determine the boundaries (also dependent on count_).
for (size_t i = 0; i < sample_size_; ++i) {
size_t r = rng_() % sample_size_;
size_t r = rng_() % global_size;
if (r < local_rank || r >= local_rank + local_size) continue;

// swap selected item to front.
// swap selected item to front. FIXME: this selection scheme is wrong.
using std::swap;
if (it < samples_.end()) {
swap(*it, samples_[r - local_rank]);
Expand Down Expand Up @@ -119,8 +127,12 @@ class SampleNode final : public DOpNode<ValueType>
}

private:
//! Size of the sample
size_t sample_size_;

//! Number of values seen so far
size_t count_;

//! local samples
std::vector<ValueType> samples_;

Expand Down