Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit ae152f6

Browse files
authored
[Embedding] Add experimental libpmem based PMEM allocator. (DeepRec-AI#49)
1 parent 499d92f commit ae152f6

21 files changed

+920
-25
lines changed

modelzoo/features/pmem/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,15 @@ pandas 1.1.5
2626
--num_mock_cols=30
2727
--batch_size=12800 \
2828
--dim_size=256 \
29-
--ev_storage=pmem \
29+
--ev_storage=pmem_libpmem \
3030

3131
## use criteo data
3232
./launch.sh \
3333
--use_mock_data=False \
3434
--data_dir=${path_to_criteo} \
3535
--batch_size=12800 \
3636
--dim_size=256 \
37-
--ev_storage=pmem \
37+
--ev_storage=pmem_libpmem \
3838
```
3939

4040
- Here `num_mock_cols` is the number of columns in mock training data, `mock_vocabulary_size` is
@@ -44,5 +44,5 @@ would increase the `vocabulary_size` of each column (variable) by a factor of 2.
4444
- The log files are located at `./bench-ps.log` and `./bench-worker.log` for ps and worker process, respectively.
4545
- The default embedding variable type is hash table-based Embedding Variable, which does not require a specification on `mock_vocabulary_size` and `vocabulary_amplify_factor`
4646
- By specifying `--use_ev_var=False`, the benchmark falls back to TF's native variables for embeddings; in that case users must provide `mock_vocabulary_size` in mock data scenarios.
47-
- '--ev_storage=pmem/dram' to select EmbeddingVariable StorageType, Default is 'dram'.
47+
- '--ev_storage=pmem_libpmem/pmem_memkind/dram' selects the EmbeddingVariable StorageType; the default is 'dram'. When using pmem_libpmem, also set the PMEM path and size with '--ev_storage_path=<pmem_path>' (default: /mnt/pmem0/pmem_allocator/) and '--ev_storage_size_gb=<pmem_size>' (default: 512).
4848

modelzoo/features/pmem/benchmark.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@
2929
tf.app.flags.DEFINE_boolean("use_xdl_var", False, "")
3030
tf.app.flags.DEFINE_boolean("trace_timeline", False, "")
3131
tf.app.flags.DEFINE_string("ev_storage", 'dram', "")
32+
tf.app.flags.DEFINE_string("ev_storage_path",
33+
'/mnt/pmem0/pmem_allocator/', "")
34+
tf.app.flags.DEFINE_integer("ev_storage_size_gb", '512', "")
3235

3336
def main(_):
3437
cluster_dict = {}
@@ -96,8 +99,13 @@ def main(_):
9699
if FLAGS.use_ev_var:
97100
if FLAGS.ev_storage == "dram":
98101
ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.DRAM))
99-
elif FLAGS.ev_storage == "pmem":
100-
ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.PMEM))
102+
elif FLAGS.ev_storage == "pmem_memkind":
103+
ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.PMEM_MEMKIND))
104+
elif FLAGS.ev_storage == "pmem_libpmem":
105+
ev_option = variables.EmbeddingVariableOption(storage_option=variables.StorageOption(
106+
storage_type=config_pb2.StorageType.PMEM_LIBPMEM,
107+
storage_path=FLAGS.ev_storage_path,
108+
storage_size=FLAGS.ev_storage_size_gb * 1024 * 1024 * 1024))
101109
fm_w = tf.get_embedding_variable(
102110
name='fm_w{}'.format(sidx),
103111
embedding_dim=1,
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#!/bin/bash
2+
3+
apt update
4+
apt install -y libpmem-dev libmemkind-dev

tensorflow/core/BUILD

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,7 +1014,11 @@ cc_library(
10141014
"framework/type_traits.h",
10151015
] + select({
10161016
"//tensorflow:with_pmem_support": [
1017-
"framework/pmem_allocator.cc"
1017+
"framework/pmem_allocator.cc",
1018+
"framework/experimental_pmem_allocator.cc",
1019+
"framework/experimental_pmem_allocator_utils.cc",
1020+
"framework/experimental_pmem_allocator.h",
1021+
"framework/experimental_pmem_allocator_utils.h",
10181022
],
10191023
"//conditions:default": [],
10201024
}),
@@ -1027,7 +1031,8 @@ cc_library(
10271031
linkopts = select({
10281032
"//tensorflow:with_pmem_support": [
10291033
"-L/usr/local/lib",
1030-
"-lmemkind"
1034+
"-lmemkind",
1035+
"-lpmem",
10311036
],
10321037
"//conditions:default": [],
10331038
}),
@@ -2958,6 +2963,8 @@ tf_cuda_library(
29582963
"framework/allocator.cc",
29592964
"framework/cpu_allocator_impl.cc",
29602965
"framework/pmem_allocator.cc",
2966+
"framework/experimental_pmem_allocator.*",
2967+
"framework/experimental_pmem_allocator_utils.*",
29612968
"framework/allocator_registry.cc",
29622969
"framework/tracking_allocator.cc",
29632970
"example/example_parser_configuration.*",

tensorflow/core/framework/allocator.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,17 @@ Allocator* pmem_allocator() {
9393
return pmem_alloc;
9494
}
9595

96+
// Returns the experimental PMEM allocator obtained from
// AllocatorFactoryRegistry::GetExperimentalPMEMAllocator(pmem_path,
// allocator_size).
//
// The result is cached in a function-local static, so the registry is only
// queried on the first call; `pmem_path` and `allocator_size` passed on later
// calls are not used by this function.
Allocator* experimental_pmem_allocator(const std::string& pmem_path, size_t allocator_size) {
97+
static Allocator* experimental_pmem_allocator =
98+
AllocatorFactoryRegistry::singleton()->GetExperimentalPMEMAllocator(pmem_path, allocator_size);
99+
// When full stats collection is enabled and the cached allocator does not
// track allocation sizes itself, replace the cached pointer with a
// TrackingAllocator wrapping it.
// NOTE(review): this reassignment of the static is not guarded by a lock
// here -- confirm callers cannot race on it.
if (experimental_pmem_allocator && cpu_allocator_collect_full_stats &&
100+
!experimental_pmem_allocator->TracksAllocationSizes()) {
101+
experimental_pmem_allocator =
102+
new TrackingAllocator(experimental_pmem_allocator, true);
103+
}
104+
return experimental_pmem_allocator;
105+
}
106+
96107
Allocator* ev_allocator() {
97108
static Allocator* ev_alloc =
98109
AllocatorFactoryRegistry::singleton()->GetEVAllocator();

tensorflow/core/framework/allocator.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -337,11 +337,14 @@ Allocator* cpu_allocator_base();
337337
// call it directly.
338338
Allocator* cpu_allocator(int numa_node = port::kNUMANoAffinity);
339339

340-
//If use PMEM as allocator, please call this function
340+
// To use the memkind-based PMEM allocator, call this function.
341341
Allocator* pmem_allocator();
342342

343343
Allocator* ev_allocator();
344344

345+
// To use the experimental libpmem-based PMEM allocator, call this function.
346+
Allocator* experimental_pmem_allocator(const std::string& pmem_path, size_t allocator_size);
347+
345348
// If 'enable' is true, the default CPU allocator implementation will collect
346349
// AllocatorStats. By default, it's disabled.
347350
void EnableCPUAllocatorStats(bool enable);

tensorflow/core/framework/allocator_registry.cc

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717

1818
#include "tensorflow/core/framework/allocator_registry.h"
1919
#include "tensorflow/core/platform/logging.h"
20+
#include "experimental_pmem_allocator.h"
2021

2122
namespace tensorflow {
2223

@@ -87,6 +88,29 @@ Allocator* AllocatorFactoryRegistry::GetAllocator() {
8788
}
8889
}
8990

91+
// Returns the allocator owned by the factory entry registered under the name
// "ExperimentalPMEMAllocator", creating it on first use.
//
// On first use the factory is initialized with `pmem_path` and `pmem_size`
// before CreateAllocator() is called. Once the entry already holds an
// allocator it is returned as-is and the arguments are ignored.
// Logs FATAL if no factory with that name is registered.
Allocator* AllocatorFactoryRegistry::GetExperimentalPMEMAllocator(const std::string& pmem_path, size_t pmem_size) {
92+
// mu_ is held for the whole lookup and (possible) creation below.
mutex_lock l(mu_);
93+
first_alloc_made_ = true;
94+
FactoryEntry* best_entry = nullptr;
95+
// The entry is selected purely by name; the first match wins.
for (auto& entry : factories_) {
96+
if (entry.name == "ExperimentalPMEMAllocator") {
97+
best_entry = &entry;
98+
break;
99+
}
100+
}
101+
102+
if (best_entry) {
103+
if (!best_entry->allocator) {
104+
// Unchecked downcast: assumes the factory registered under this name is
// an ExperimentalPMEMAllocatorFactory.
static_cast<ExperimentalPMEMAllocatorFactory*>(best_entry->factory.get())->Init(pmem_path, pmem_size);
105+
best_entry->allocator.reset(best_entry->factory->CreateAllocator());
106+
}
107+
return best_entry->allocator.get();
108+
} else {
109+
LOG(FATAL) << "No registered Experimental PMEM AllocatorFactory";
110+
// Presumably only present to satisfy the return type after LOG(FATAL).
return nullptr;
111+
}
112+
}
113+
90114
Allocator* AllocatorFactoryRegistry::GetPMEMAllocator() {
91115
mutex_lock l(mu_);
92116
first_alloc_made_ = true;

tensorflow/core/framework/allocator_registry.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ class AllocatorFactoryRegistry {
8080
//If use PMEMallocator, then factory pick this one
8181
Allocator* GetPMEMAllocator();
8282

83+
Allocator* GetExperimentalPMEMAllocator(const std::string& pmem_path, size_t pmem_size);
84+
8385
Allocator* GetEVAllocator();
8486

8587
// Returns 'best fit' SubAllocator. First look for the highest priority

tensorflow/core/framework/embedding/config.proto

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@ enum StorageType {
88

99
// one level
1010
DRAM = 1;
11-
PMEM = 2;
12-
SSD = 3;
11+
PMEM_MEMKIND = 2;
12+
PMEM_LIBPMEM = 3;
13+
SSD = 4;
1314

1415
LEVELDB = 14;
1516
/*

tensorflow/core/framework/embedding/embedding_config.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ struct EmbeddingConfig {
2121
DataType counter_type;
2222
embedding::StorageType storage_type;
2323
std::string storage_path;
24+
int64 storage_size;
2425
int64 default_value_dim;
2526
int normal_fix_flag;
2627

@@ -31,7 +32,7 @@ struct EmbeddingConfig {
3132
float l2_weight_threshold = -1.0, const std::string& layout = "normal",
3233
int64 max_element_size = 0, float false_positive_probability = -1.0,
3334
DataType counter_type = DT_UINT64, embedding::StorageType storage_type = embedding::DRAM,
34-
const std::string& storage_path = "",
35+
const std::string& storage_path = "", int64 storage_size = 0,
3536
int64 default_value_dim = 4096):
3637
emb_index(emb_index),
3738
primary_emb_index(primary_emb_index),
@@ -45,6 +46,7 @@ struct EmbeddingConfig {
4546
counter_type(counter_type),
4647
storage_type(storage_type),
4748
storage_path(storage_path),
49+
storage_size(storage_size),
4850
default_value_dim(default_value_dim),
4951
normal_fix_flag(0) {
5052
if ("normal" == layout) {
@@ -106,6 +108,10 @@ struct EmbeddingConfig {
106108
return storage_path;
107109
}
108110

111+
// Returns the configured storage size (the `storage_size` member) unchanged.
// Declared const because it only reads the member, so it can also be called
// on a const EmbeddingConfig.
int64 get_storage_size() const {
112+
return storage_size;
113+
}
114+
109115
std::string DebugString() const {
110116
return strings::StrCat("opname: ", name,
111117
" emb_index: ", emb_index,
@@ -118,7 +124,8 @@ struct EmbeddingConfig {
118124
" max_freq: ", max_freq,
119125
" l2_weight_threshold: ", l2_weight_threshold,
120126
" storage_type: ", storage_type,
121-
" storage_path: ", storage_path);
127+
" storage_path: ", storage_path,
128+
" storage_size: ", storage_size);
122129
}
123130
};
124131

0 commit comments

Comments
 (0)