From da46b8abd28bbf73176c4aa0d10b795ad328f29b Mon Sep 17 00:00:00 2001 From: zxgx Date: Fri, 12 Aug 2022 15:17:51 +0800 Subject: [PATCH 01/19] Update api --- recsys/models/dlrm.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/recsys/models/dlrm.py b/recsys/models/dlrm.py index 1fd73aa..d210c42 100644 --- a/recsys/models/dlrm.py +++ b/recsys/models/dlrm.py @@ -48,22 +48,26 @@ def __init__(self, is_dist_dataloader=True): super(FusedSparseModules, self).__init__() if use_cache: - self.embed = ParallelFreqAwareEmbeddingBag(sum(num_embeddings_per_feature), - embedding_dim, - sparse=True, - mode=reduction_mode, - include_last_offset=True) - self.embed.preprocess(cache_sets, id_freq_map, warmup_ratio, buffer_size=buffer_size) - else: - raise NotImplementedError() - self.embed = FusedHybridParallelEmbeddingBag( + self.embed = ParallelFreqAwareEmbeddingBag( sum(num_embeddings_per_feature), embedding_dim, - fused_op=fused_op, - mode=reduction_mode, sparse=sparse, + mode=reduction_mode, include_last_offset=True, - output_device_type=output_device_type) + cuda_row_num=cache_sets, + ids_freq_mapping=id_freq_map, + warmup_ratio=warmup_ratio, + buffer_size=buffer_size, + ) + else: + raise NotImplementedError() + self.embed = FusedHybridParallelEmbeddingBag(sum(num_embeddings_per_feature), + embedding_dim, + fused_op=fused_op, + mode=reduction_mode, + sparse=sparse, + include_last_offset=True, + output_device_type=output_device_type) if is_dist_dataloader: self.kjt_collector = KJTAllToAll(gpc.get_group(ParallelMode.GLOBAL)) From e8ba34243a370e6ecd3680b4317dc747dae85743 Mon Sep 17 00:00:00 2001 From: zxgx Date: Mon, 15 Aug 2022 17:52:38 +0800 Subject: [PATCH 02/19] add nvtabular test scripts --- baselines/models/dlrm.py | 16 ++-- hvd_wrapper.sh | 14 +++ recsys/datasets/criteo.py | 82 +++++++++++++---- recsys/datasets/feature_counter.py | 56 ++++++++++++ recsys/datasets/utils.py | 47 ++++++++++ test.py | 136 +++++++++++++++++++++++++++++ 6 files changed, 328 insertions(+), 23 deletions(-) create mode 100644 hvd_wrapper.sh create mode 100644 test.py diff --git a/baselines/models/dlrm.py b/baselines/models/dlrm.py index 62c399a..5a20026 100644 --- a/baselines/models/dlrm.py +++ b/baselines/models/dlrm.py @@ -68,10 +68,10 @@ class SparseArch(nn.Module): def __init__(self, embedding_bag_collection: EmbeddingBagCollection) -> None: super().__init__() self.embedding_bag_collection: EmbeddingBagCollection = embedding_bag_collection - assert self.embedding_bag_collection.embedding_bag_configs(), "Embedding bag collection cannot be empty!" - self.D: int = self.embedding_bag_collection.embedding_bag_configs()[0].embedding_dim + assert self.embedding_bag_collection.embedding_bag_configs, "Embedding bag collection cannot be empty!" 
+ self.D: int = self.embedding_bag_collection.embedding_bag_configs[0].embedding_dim self._sparse_feature_names: List[str] = [ - name for conf in embedding_bag_collection.embedding_bag_configs() for name in conf.feature_names + name for conf in embedding_bag_collection.embedding_bag_configs for name in conf.feature_names ] self.F: int = len(self._sparse_feature_names) @@ -335,13 +335,13 @@ def __init__( dense_device: Optional[torch.device] = None, ) -> None: super().__init__() - assert (len(embedding_bag_collection.embedding_bag_configs()) > 0), "At least one embedding bag is required" - for i in range(1, len(embedding_bag_collection.embedding_bag_configs())): - conf_prev = embedding_bag_collection.embedding_bag_configs()[i - 1] - conf = embedding_bag_collection.embedding_bag_configs()[i] + assert (len(embedding_bag_collection.embedding_bag_configs) > 0), "At least one embedding bag is required" + for i in range(1, len(embedding_bag_collection.embedding_bag_configs)): + conf_prev = embedding_bag_collection.embedding_bag_configs[i - 1] + conf = embedding_bag_collection.embedding_bag_configs[i] assert ( conf_prev.embedding_dim == conf.embedding_dim), "All EmbeddingBagConfigs must have the same dimension" - embedding_dim: int = embedding_bag_collection.embedding_bag_configs()[0].embedding_dim + embedding_dim: int = embedding_bag_collection.embedding_bag_configs[0].embedding_dim if dense_arch_layer_sizes[-1] != embedding_dim: raise ValueError(f"embedding_bag_collection dimension ({embedding_dim}) and final dense " "arch layer size ({dense_arch_layer_sizes[-1]}) must match.") diff --git a/hvd_wrapper.sh b/hvd_wrapper.sh new file mode 100644 index 0000000..919064b --- /dev/null +++ b/hvd_wrapper.sh @@ -0,0 +1,14 @@ + +#!/bin/bash + +# Get local process ID from OpenMPI or alternatively from SLURM +if [ -z "${CUDA_VISIBLE_DEVICES:-}" ]; then + if [ -n "${OMPI_COMM_WORLD_LOCAL_RANK:-}" ]; then + LOCAL_RANK="${OMPI_COMM_WORLD_LOCAL_RANK}" + elif [ -n "${SLURM_LOCALID:-}" ]; then + LOCAL_RANK="${SLURM_LOCALID}" + fi + export CUDA_VISIBLE_DEVICES=${LOCAL_RANK} +fi + +exec "$@" diff --git a/recsys/datasets/criteo.py b/recsys/datasets/criteo.py index c345294..87d21c6 100644 --- a/recsys/datasets/criteo.py +++ b/recsys/datasets/criteo.py @@ -8,6 +8,7 @@ import os from typing import Dict, Iterator, List, Optional import numpy as np +import glob from torchrec.datasets.criteo import (CAT_FEATURE_COUNT, DEFAULT_CAT_NAMES, DEFAULT_INT_NAMES, DAYS, BinaryCriteoUtils) from torchrec.datasets.utils import PATH_MANAGER_KEY, Batch @@ -16,16 +17,25 @@ from pyre_extensions import none_throws import torch from torch.utils.data import DataLoader, IterableDataset +try: + # pyre-ignore[21] + import nvtabular as nvt + from nvtabular.loader.torch import TorchAsyncItr +except ImportError: + print("Unable to import NVTabular, which indicates that you cannot load criteo 1TB dataset with our solution") -from .feature_counter import CriteoSparseProcessor, GlobalFeatureCounter +from .feature_counter import CriteoSparseProcessor, GlobalFeatureCounter, NVTabularFeatureCounter +from .utils import KJTTransform STAGES = ["train", "val", "test"] -NUM_EMBEDDINGS_PER_FEATURE = None - +# 177,944,275 in total +NUM_EMBEDDINGS_PER_FEATURE = "45833188,36746,17245,7413,20243,3,7114,1441,62,29275261,1572176,345138,10,2209,11267," \ + "128,4,974,14,48937457,11316796,40094537,452104,12606,104,35" +# 33,762,577 in total KAGGLE_NUM_EMBEDDINGS_PER_FEATURE = '1460,583,10131227,2202608,305,24,12517,633,3,93145,5683,8351593,3194,' \ 
'27,14992,5461306,10,5652,2173,4,7046547,18,15,286181,105,142572' -KAGGLE_TOTAL_TRAINING_SAMPLES = 39291954 # 0-6 days for criteo kaggle, 45840617 samples in total +KAGGLE_TOTAL_TRAINING_SAMPLES = 39_291_954 # 0-6 days for criteo kaggle, 45,840,617 samples in total class InMemoryBinaryCriteoIterDataPipe(IterableDataset): @@ -221,11 +231,7 @@ def __len__(self) -> int: return self.num_batches -def get_dataloader(args, stage, rank, world_size): - stage = stage.lower() - if stage not in STAGES: - raise ValueError(f"Supplied stage was {stage}. Must be one of {STAGES}.") - +def _get_kaggle_dataloader(args, stage, rank, world_size): files = os.listdir(args.dataset_dir) def is_final_day(s: str) -> bool: @@ -263,15 +269,61 @@ def is_final_day(s: str) -> bool: return dataloader +def _get_terabyte_dataloader(args, stage, rank, world_size): + # TODO: replace the data_split with stage + if stage == "train": + data_split = "train" + elif stage == "val": + data_split = "validation" + else: + data_split = "test" + + if world_size > 1: + raise NotImplementedError("NVTabular can not support distributed dataloader") + + files = glob.glob(os.path.join(args.dataset_dir, data_split, "*.parquet")) + + nv_iter = TorchAsyncItr( + nvt.Dataset(files, engine="parquet", part_mem_fraction=0.02), + batch_size=args.batch_size, + cats=DEFAULT_CAT_NAMES, + conts=DEFAULT_INT_NAMES, + labels=["label"], + global_rank=rank, + global_size=world_size, + drop_last=True, + ) + + dataloader = DataLoader(nv_iter, + batch_size=None, + pin_memory=False, + collate_fn=KJTTransform(nv_iter).transform, + num_worker=0) + return dataloader + + +def get_dataloader(args, stage, rank, world_size): + stage = stage.lower() + if stage not in STAGES: + raise ValueError(f"Supplied stage was {stage}. Must be one of {STAGES}.") + + if "kaggle" in args.dataset_dir: + return _get_kaggle_dataloader(args, stage, rank, world_size) + else: + return _get_terabyte_dataloader(args, stage, rank, world_size) + + def get_id_freq_map(path): if 'kaggle' not in path: - raise NotImplementedError() + files = glob.glob(os.path.join(path, "train", "*.parquet")) + feature_count = NVTabularFeatureCounter(files, list(map(int, NUM_EMBEDDINGS_PER_FEATURE.split(','))), 8192) - files = os.listdir(path) - sparse_files = list(filter(lambda s: 'sparse' in s, files)) - sparse_files = [os.path.join(path, _f) for _f in sparse_files] + else: + files = os.listdir(path) + sparse_files = list(filter(lambda s: 'sparse' in s, files)) + sparse_files = [os.path.join(path, _f) for _f in sparse_files] - file_processor = CriteoSparseProcessor(list(map(int, KAGGLE_NUM_EMBEDDINGS_PER_FEATURE.split(',')))) - feature_count = GlobalFeatureCounter(sparse_files, file_processor) + file_processor = CriteoSparseProcessor(list(map(int, KAGGLE_NUM_EMBEDDINGS_PER_FEATURE.split(',')))) + feature_count = GlobalFeatureCounter(sparse_files, file_processor) return feature_count.id_freq_map diff --git a/recsys/datasets/feature_counter.py b/recsys/datasets/feature_counter.py index 3681c91..7bf7cea 100644 --- a/recsys/datasets/feature_counter.py +++ b/recsys/datasets/feature_counter.py @@ -1,5 +1,17 @@ import abc import numpy as np +from contexttimer import Timer +import torch +from torch.utils.data import DataLoader +try: + # pyre-ignore[21] + import nvtabular as nvt + from nvtabular.loader.torch import TorchAsyncItr +except ImportError: + print("Unable to import NVTabular, which indicates that you cannot load criteo 1TB dataset with our solution") + +from .criteo import DEFAULT_CAT_NAMES, DEFAULT_INT_NAMES 
+from .utils import KJTTransform class CriteoSparseProcessor: @@ -49,3 +61,47 @@ def _collect_statistics(self): self._id_freq_map = self.file_processor(_f) else: self._id_freq_map += self.file_processor(_f) + + +class NVTabularFeatureCounter: + + def __init__(self, datafiles, hashes, batch_size, sample_fraction=0.05): + self.datafiles = datafiles + self._id_freq_map = torch.zeros(sum(hashes), dtype=torch.long, device=torch.cuda.current_device()) + self.batch_size = batch_size + self.pre_ones = torch.ones(batch_size * len(DEFAULT_CAT_NAMES), + dtype=torch.long, + device=torch.cuda.current_device()) + self.sample_fraction = sample_fraction + self._collect_statistics() + + def _collect_statistics(self): + nv_iter = TorchAsyncItr( + nvt.Dataset(self.datafiles[:np.ceil(len(self.datafiles) * self.sample_fraction)], + engine="parquet", + part_mem_fraction=0.02), + batch_size=self.batch_size, + cats=DEFAULT_CAT_NAMES, + conts=DEFAULT_INT_NAMES, + labels=["label"], + global_rank=0, + global_size=1, + drop_last=True, + ) + + dataloader = DataLoader(nv_iter, + batch_size=None, + pin_memory=False, + collate_fn=KJTTransform(nv_iter).transform, + num_workers=0) + + with Timer() as timer: + for batch in dataloader: + sparse = batch.sparse_features.values() + ones = self.pre_ones.narrow(0, start=0, length=sparse.shape[0]) + self._id_freq_map.index_add_(dim=0, index=batch.sparse_features.values(), source=ones) + print(f"statistic costs: {timer.elapsed:.2f}s") + + @property + def id_freq_map(self): + return self._id_freq_map diff --git a/recsys/datasets/utils.py b/recsys/datasets/utils.py index 950e814..8e3daf4 100644 --- a/recsys/datasets/utils.py +++ b/recsys/datasets/utils.py @@ -1,6 +1,8 @@ +import numpy as np import torch import torch.distributed as dist from torchrec.sparse.jagged_tensor import KeyedJaggedTensor +from torchrec.datasets.utils import Batch class KJTAllToAll: @@ -50,3 +52,48 @@ def all_to_all(self, kjt): values=all_values, lengths=all_lengths, ) + + +class KJTTransform: + + def __init__(self, dataloader, hashes=None): + self.batch_size = dataloader.batch_size + self.cats = dataloader.cat_names + self.conts = dataloader.cont_names + self.labels = dataloader.label_names + self.sparse_offset = torch.tensor( + [0, *np.cumsum(hashes)[:-1]], dtype=torch.long, device=torch.cuda.current_device()).view(1, -1) \ + if hashes is not None else None + + _num_ids_in_batch = len(self.cats) * self.batch_size + self.lengths = torch.ones((_num_ids_in_batch,), dtype=torch.int32) + self.offsets = torch.arange(0, _num_ids_in_batch + 1, dtype=torch.int32) + self.length_per_key = len(self.cats) * [self.batch_size] + self.offset_per_key = [self.batch_size * i for i in range(len(self.cats) + 1)] + self.index_per_key = {key: i for (i, key) in enumerate(self.cats)} + + def transform(self, batch): + sparse, dense = [], [] + for col in self.cats: + sparse.append(batch[0][col]) + sparse = torch.cat(sparse, dim=1) + if self.sparse_offset is not None: + sparse += self.sparse_offset + for col in self.conts: + dense.append(batch[0][col]) + dense = torch.cat(dense, dim=1) + + return Batch( + dense_features=dense, + sparse_features=KeyedJaggedTensor( + keys=self.cats, + values=sparse.transpose(1, 0).contiguous().view(-1), + lengths=self.lengths, + offsets=self.offsets, + stride=self.batch_size, + length_per_key=self.length_per_key, + offset_per_key=self.offset_per_key, + index_per_key=self.index_per_key, + ), + labels=batch[1], + ) diff --git a/test.py b/test.py new file mode 100644 index 0000000..059af96 --- /dev/null +++ 
b/test.py @@ -0,0 +1,136 @@ +import os +import time + +import torch +import torch.distributed as dist +from torch.utils.data import DataLoader +import nvtabular as nvt +from nvtabular.loader.torch import TorchAsyncItr # , DLDataLoader +import cupy +from torchrec.sparse.jagged_tensor import KeyedJaggedTensor +from torchrec.datasets.utils import Batch + +from recsys.datasets.criteo import get_id_freq_map + +INPUT_DATA_DIR = "/data/criteo_preproc/test/" +BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 16384)) +PARTS_PER_CHUNK = int(os.environ.get("PARTS_PER_CHUNK", 2)) +CONTINUOUS_COLUMNS = ["int_" + str(x) for x in range(0, 13)] +CATEGORICAL_COLUMNS = ["cat_" + str(x) for x in range(0, 26)] +LABEL_COLUMNS = ["label"] + + +class KJTTransform: + + def __init__(self, dataloader): + self.batch_size = dataloader.batch_size + self.cats = dataloader.cat_names + self.conts = dataloader.cont_names + self.labels = dataloader.label_names + + _num_ids_in_batch = len(self.cats) * self.batch_size + self.lengths = torch.ones((_num_ids_in_batch,), dtype=torch.int32) + self.offsets = torch.arange(0, _num_ids_in_batch + 1, dtype=torch.int32) + self.length_per_key = len(self.cats) * [self.batch_size] + self.offset_per_key = [self.batch_size * i for i in range(len(self.cats) + 1)] + self.index_per_key = {key: i for (i, key) in enumerate(self.cats)} + + def transform(self, batch): + sparse, dense = [], [] + for col in self.cats: + sparse.append(batch[0][col]) + sparse = torch.cat(sparse, dim=1) + for col in self.conts: + dense.append(batch[0][col]) + dense = torch.cat(dense, dim=1) + + return Batch( + dense_features=dense, + sparse_features=KeyedJaggedTensor( + keys=self.cats, + values=sparse.transpose(1, 0).reshape(-1), + lengths=self.lengths, + offsets=self.offsets, + stride=self.batch_size, + length_per_key=self.length_per_key, + offset_per_key=self.offset_per_key, + index_per_key=self.index_per_key, + ), + labels=batch[1], + ) + + +def seed_fn(): + """ + Generate consistent dataloader shuffle seeds across workers + Reseeds each worker's dataloader each epoch to get fresh a shuffle + that's consistent across workers. 
+ """ + + max_rand = torch.iinfo(torch.int).max // world_size + + # Generate a seed fragment + seed_fragment = cupy.random.randint(0, max_rand) + + # Aggregate seed fragments from all workers + seed_tensor = torch.tensor(seed_fragment) # pylint: disable=not-callable + dist.all_reduce(seed_tensor, op=dist.ReduceOp.SUM) + return seed_tensor % max_rand + + +def run(rank, world_size): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + + # initialize the process group + dist.init_process_group("gloo", rank=rank, world_size=world_size) + torch.cuda.set_device(rank) + + fname = "part_{}.parquet" + train_paths = [os.path.join(INPUT_DATA_DIR, fname.format(i)) for i in range(64)] + # print(train_paths) + + print(f"{dist.get_rank()}/{dist.get_world_size()}: device: {torch.cuda.current_device()}") + + start = time.time() + train_data = nvt.Dataset(train_paths, engine="parquet", part_mem_fraction=0.04 / PARTS_PER_CHUNK) + print(f"nvdtaset: {time.time() - start}") + start = time.time() + train_data_idrs = TorchAsyncItr(train_data, + batch_size=BATCH_SIZE, + cats=CATEGORICAL_COLUMNS, + conts=CONTINUOUS_COLUMNS, + labels=LABEL_COLUMNS, + global_rank=rank, + global_size=world_size, + drop_last=False, + parts_per_chunk=PARTS_PER_CHUNK, + shuffle=True, + seed_fn=lambda: 1) + print(f"TorchAsyncItr: {time.time() - start}, len: {len(train_data_idrs)}") + + start = time.time() + train_dataloader = DataLoader(train_data_idrs, + collate_fn=KJTTransform(train_data_idrs).transform, + batch_size=None, + pin_memory=False, + num_workers=0) + print(f"dataloader: {time.time() - start}, len: {len(train_dataloader)}") + + data_iter = iter(train_dataloader) + for idx, batch in enumerate(data_iter): + print(f"rank: {rank}, it: {idx}, batch: {batch.dense_features}") + + if idx == 30: + break + print(f"allocate: {torch.cuda.memory_allocated()/1024**3:.2f} GB, " + f"reserved: {torch.cuda.memory_reserved()/1024**3:.2f} GB") + + # id_freq_map = get_id_freq_map("/data/criteo_preproc") + # print(id_freq_map.shape, id_freq_map.max(), id_freq_map.min()) + + +if __name__ == "__main__": + world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) + world_rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) + run(world_rank, world_size) From 77721496f98f6aae47ccdcc4e5831d9e35752a29 Mon Sep 17 00:00:00 2001 From: zxgx Date: Tue, 16 Aug 2022 10:09:40 +0800 Subject: [PATCH 03/19] fix nvtabular test --- test.py | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/test.py b/test.py index 059af96..9cbc9bc 100644 --- a/test.py +++ b/test.py @@ -83,8 +83,15 @@ def run(rank, world_size): os.environ["MASTER_PORT"] = "12355" # initialize the process group - dist.init_process_group("gloo", rank=rank, world_size=world_size) - torch.cuda.set_device(rank) + dist.init_process_group("nccl", rank=rank, world_size=world_size) + print(f"init rank: {rank}") + torch.cuda.set_device(0) + + # data = torch.rand(1, 2) + # print(f"rank: {rank}, data: {data}") + # data_list = [data if _r == rank else torch.empty_like(data) for _r in range(world_size)] + # dist.all_gather(data_list, data) + # print(data_list) fname = "part_{}.parquet" train_paths = [os.path.join(INPUT_DATA_DIR, fname.format(i)) for i in range(64)] @@ -96,17 +103,17 @@ def run(rank, world_size): train_data = nvt.Dataset(train_paths, engine="parquet", part_mem_fraction=0.04 / PARTS_PER_CHUNK) print(f"nvdtaset: {time.time() - start}") start = time.time() - train_data_idrs = TorchAsyncItr(train_data, - batch_size=BATCH_SIZE, - 
cats=CATEGORICAL_COLUMNS, - conts=CONTINUOUS_COLUMNS, - labels=LABEL_COLUMNS, - global_rank=rank, - global_size=world_size, - drop_last=False, - parts_per_chunk=PARTS_PER_CHUNK, - shuffle=True, - seed_fn=lambda: 1) + train_data_idrs = TorchAsyncItr( + train_data, + batch_size=BATCH_SIZE, + cats=CATEGORICAL_COLUMNS, + conts=CONTINUOUS_COLUMNS, + labels=LABEL_COLUMNS, + global_rank=0, + global_size=1, + drop_last=False, + parts_per_chunk=PARTS_PER_CHUNK, + ) print(f"TorchAsyncItr: {time.time() - start}, len: {len(train_data_idrs)}") start = time.time() @@ -121,11 +128,11 @@ def run(rank, world_size): for idx, batch in enumerate(data_iter): print(f"rank: {rank}, it: {idx}, batch: {batch.dense_features}") - if idx == 30: + if idx == 3: break print(f"allocate: {torch.cuda.memory_allocated()/1024**3:.2f} GB, " f"reserved: {torch.cuda.memory_reserved()/1024**3:.2f} GB") - + torch.cuda.synchronize() # id_freq_map = get_id_freq_map("/data/criteo_preproc") # print(id_freq_map.shape, id_freq_map.max(), id_freq_map.min()) From abc8415ee951a9da8aa828be8caf891c15ec1cb8 Mon Sep 17 00:00:00 2001 From: Jiatong Han Date: Tue, 16 Aug 2022 11:35:49 +0800 Subject: [PATCH 04/19] Changes to stream dataloader --- recsys/modules/embeddings/load_balance_mgr.py | 32 ++++++++++++++++++- .../parallel_mix_vocab_embedding.py | 3 ++ .../dataloader/cuda_stream_dataloader.py | 2 -- 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/recsys/modules/embeddings/load_balance_mgr.py b/recsys/modules/embeddings/load_balance_mgr.py index cf65dcb..360c603 100644 --- a/recsys/modules/embeddings/load_balance_mgr.py +++ b/recsys/modules/embeddings/load_balance_mgr.py @@ -7,7 +7,16 @@ def minimize_groupwise_diff(lst: List[int], num_grp: int) -> List[List[int]]: - """This function computes optimized grouping for features""" + """Compute a greedy solution to group numbers into a desired number of groups + and minimize the maximum difference between groups. + + Args: + lst (List[int]): list of ungrouped numbers + num_grp (int): number of groups to be formed into + + Returns: + List[List[int]]: greedily computed grouping of numbers + """ if num_grp == 1: return [lst] indices = list(np.argsort(lst)[::-1]) @@ -47,8 +56,21 @@ def minimize_groupwise_diff(lst: List[int], num_grp: int) -> List[List[int]]: return groups[:num_grp] class LoadBalanceManager(object): + """A load manager that divides training loads evenly across tensor parallel + embedding ranks. + """ def __init__(self, embeddings_per_feat: List[int], num_groups=4, base_emb_dim=128, \ do_fair=True, device=None, disable_random_behavior=False): + """initiate the manager with raw feature embeddings that have yet to be sharded. + + Args: + embeddings_per_feat (List[int]): number of embeddings per sparse feature. + num_groups (int, optional): number of groups to shard into. Usually world size. Defaults to 4. + base_emb_dim (int, optional): desired embedding dimension for features. Defaults to 128. + device (_type_, optional): device where load manager is put. Defaults to None. + disable_random_behavior (bool, optional): set to `True` to disable feature + random shuffling, only applied in table-wise sharding scenario. Defaults to False. 
+ """ assert len(embeddings_per_feat) >= num_groups, \ f"number of input fields {len(embeddings_per_feat)} must be larger than the world size {num_groups}" self.embeddings_per_feat = embeddings_per_feat @@ -56,6 +78,7 @@ def __init__(self, embeddings_per_feat: List[int], num_groups=4, base_emb_dim=12 self.base_emb_dim = base_emb_dim self.do_fair = do_fair self.device = device + # compute the offsets for all set of features self.all_feat_offsets = torch.cumsum(torch.tensor([0]+self.embeddings_per_feat, device=self.device),dim=0) if not self.do_fair: @@ -64,6 +87,8 @@ def __init__(self, embeddings_per_feat: List[int], num_groups=4, base_emb_dim=12 self._fair_initialize() def _fair_initialize(self) -> None: + """shards the features with ... + """ self.num_embeddings_per_rank = sum(self.embeddings_per_feat) // self.num_groups dim_indices = np.array(range(len(self.embeddings_per_feat))) self.groups = [] @@ -112,6 +137,11 @@ def _fair_initialize(self) -> None: self.qr_bucket_size = math.ceil(math.sqrt(self.num_embeddings_per_rank)) def _shuffle_initialize(self, disable_random_behavior=False) -> None: + """_summary_ + + Args: + disable_random_behavior (bool, optional): _description_. Defaults to False. + """ if disable_random_behavior: self.groups = minimize_groupwise_diff(self.embeddings_per_feat, self.num_groups) else: diff --git a/recsys/modules/embeddings/parallel_mix_vocab_embedding.py b/recsys/modules/embeddings/parallel_mix_vocab_embedding.py index 54064d9..63316bc 100644 --- a/recsys/modules/embeddings/parallel_mix_vocab_embedding.py +++ b/recsys/modules/embeddings/parallel_mix_vocab_embedding.py @@ -18,6 +18,9 @@ class QREmbeddingBag(nn.Module): + """Implementation of [paper](https://arxiv.org/pdf/1909.02107.pdf). + + """ def __init__(self, num_embeddings: int, qr_bucket_size: int, diff --git a/recsys/utils/dataloader/cuda_stream_dataloader.py b/recsys/utils/dataloader/cuda_stream_dataloader.py index 2680254..4b20f80 100644 --- a/recsys/utils/dataloader/cuda_stream_dataloader.py +++ b/recsys/utils/dataloader/cuda_stream_dataloader.py @@ -157,8 +157,6 @@ def seed_worker(worker_id): class CudaStreamDataIter(BaseStreamDataIter): """ A data iterator that supports batch prefetching with the help of cuda stream. - Be aware that it now only supports batch loading on GPU. 
- Also, it can only support dataset in the format of (input, target/label) """ def __init__(self, loader: DataLoader): From 97eb6de2ac35461d107416ea8ef5f641ad345ee4 Mon Sep 17 00:00:00 2001 From: zxgx Date: Tue, 16 Aug 2022 19:27:07 +0800 Subject: [PATCH 05/19] pass feature counter --- recsys/datasets/criteo.py | 14 ++++-- recsys/datasets/feature_counter.py | 31 ++++++++------ test.py | 69 +++++------------------------- 3 files changed, 39 insertions(+), 75 deletions(-) diff --git a/recsys/datasets/criteo.py b/recsys/datasets/criteo.py index 87d21c6..9db6c3f 100644 --- a/recsys/datasets/criteo.py +++ b/recsys/datasets/criteo.py @@ -279,9 +279,10 @@ def _get_terabyte_dataloader(args, stage, rank, world_size): data_split = "test" if world_size > 1: - raise NotImplementedError("NVTabular can not support distributed dataloader") + raise NotImplementedError("We do not support distributed dataloader currently.") - files = glob.glob(os.path.join(args.dataset_dir, data_split, "*.parquet")) + file_num = len(glob.glob(os.path.join(args.dataset_dir, data_split, "*.parquet"))) + files = [os.path.join(args.dataset_dir, data_split, f"part_{i}.parquet") for i in range(file_num)] nv_iter = TorchAsyncItr( nvt.Dataset(files, engine="parquet", part_mem_fraction=0.02), @@ -315,8 +316,13 @@ def get_dataloader(args, stage, rank, world_size): def get_id_freq_map(path): if 'kaggle' not in path: - files = glob.glob(os.path.join(path, "train", "*.parquet")) - feature_count = NVTabularFeatureCounter(files, list(map(int, NUM_EMBEDDINGS_PER_FEATURE.split(','))), 8192) + file_num = len(glob.glob(os.path.join(path, "train", "*.parquet"))) + files = [os.path.join(path, "train", f"part_{i}.parquet") for i in range(file_num)] + + feature_count = NVTabularFeatureCounter(files, + list(map(int, NUM_EMBEDDINGS_PER_FEATURE.split(','))), + 16384, + sample_fraction=0.1) else: files = os.listdir(path) diff --git a/recsys/datasets/feature_counter.py b/recsys/datasets/feature_counter.py index 7bf7cea..006c46a 100644 --- a/recsys/datasets/feature_counter.py +++ b/recsys/datasets/feature_counter.py @@ -1,4 +1,7 @@ import abc +import itertools +from tqdm import tqdm + import numpy as np from contexttimer import Timer import torch @@ -67,19 +70,16 @@ class NVTabularFeatureCounter: def __init__(self, datafiles, hashes, batch_size, sample_fraction=0.05): self.datafiles = datafiles - self._id_freq_map = torch.zeros(sum(hashes), dtype=torch.long, device=torch.cuda.current_device()) + self._id_freq_map = torch.zeros(sum(hashes), dtype=torch.long) self.batch_size = batch_size - self.pre_ones = torch.ones(batch_size * len(DEFAULT_CAT_NAMES), - dtype=torch.long, - device=torch.cuda.current_device()) + self.pre_ones = torch.ones(batch_size * len(DEFAULT_CAT_NAMES), dtype=torch.long) self.sample_fraction = sample_fraction self._collect_statistics() def _collect_statistics(self): + data_files = sorted(self.datafiles[:int(np.ceil(len(self.datafiles) * self.sample_fraction))]) nv_iter = TorchAsyncItr( - nvt.Dataset(self.datafiles[:np.ceil(len(self.datafiles) * self.sample_fraction)], - engine="parquet", - part_mem_fraction=0.02), + nvt.Dataset(data_files, engine="parquet", part_mem_fraction=0.02), batch_size=self.batch_size, cats=DEFAULT_CAT_NAMES, conts=DEFAULT_INT_NAMES, @@ -87,6 +87,7 @@ def _collect_statistics(self): global_rank=0, global_size=1, drop_last=True, + device='cpu', ) dataloader = DataLoader(nv_iter, @@ -94,13 +95,17 @@ def _collect_statistics(self): pin_memory=False, collate_fn=KJTTransform(nv_iter).transform, num_workers=0) - + 
data_iter = iter(dataloader) with Timer() as timer: - for batch in dataloader: - sparse = batch.sparse_features.values() - ones = self.pre_ones.narrow(0, start=0, length=sparse.shape[0]) - self._id_freq_map.index_add_(dim=0, index=batch.sparse_features.values(), source=ones) - print(f"statistic costs: {timer.elapsed:.2f}s") + for it in tqdm(itertools.count()): + try: + sparse = next(data_iter).sparse_features.values() + ones = self.pre_ones.narrow(0, start=0, length=sparse.shape[0]) + self._id_freq_map.index_add_(dim=0, index=sparse, source=ones) + except StopIteration: + break + print(f"collect statistics over files: {data_files} num batch: {len(dataloader)}, batch size: {self.batch_size}" + f", average time cost: {len(dataloader) / timer.elapsed:.2f} batch/s") @property def id_freq_map(self): diff --git a/test.py b/test.py index 9cbc9bc..2d0cbe4 100644 --- a/test.py +++ b/test.py @@ -1,18 +1,14 @@ import os -import time import torch import torch.distributed as dist -from torch.utils.data import DataLoader -import nvtabular as nvt -from nvtabular.loader.torch import TorchAsyncItr # , DLDataLoader import cupy from torchrec.sparse.jagged_tensor import KeyedJaggedTensor from torchrec.datasets.utils import Batch from recsys.datasets.criteo import get_id_freq_map -INPUT_DATA_DIR = "/data/criteo_preproc/test/" +INPUT_DATA_DIR = "/data/criteo_preproc/train/" BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 16384)) PARTS_PER_CHUNK = int(os.environ.get("PARTS_PER_CHUNK", 2)) CONTINUOUS_COLUMNS = ["int_" + str(x) for x in range(0, 13)] @@ -85,59 +81,16 @@ def run(rank, world_size): # initialize the process group dist.init_process_group("nccl", rank=rank, world_size=world_size) print(f"init rank: {rank}") - torch.cuda.set_device(0) - - # data = torch.rand(1, 2) - # print(f"rank: {rank}, data: {data}") - # data_list = [data if _r == rank else torch.empty_like(data) for _r in range(world_size)] - # dist.all_gather(data_list, data) - # print(data_list) - - fname = "part_{}.parquet" - train_paths = [os.path.join(INPUT_DATA_DIR, fname.format(i)) for i in range(64)] - # print(train_paths) - - print(f"{dist.get_rank()}/{dist.get_world_size()}: device: {torch.cuda.current_device()}") - - start = time.time() - train_data = nvt.Dataset(train_paths, engine="parquet", part_mem_fraction=0.04 / PARTS_PER_CHUNK) - print(f"nvdtaset: {time.time() - start}") - start = time.time() - train_data_idrs = TorchAsyncItr( - train_data, - batch_size=BATCH_SIZE, - cats=CATEGORICAL_COLUMNS, - conts=CONTINUOUS_COLUMNS, - labels=LABEL_COLUMNS, - global_rank=0, - global_size=1, - drop_last=False, - parts_per_chunk=PARTS_PER_CHUNK, - ) - print(f"TorchAsyncItr: {time.time() - start}, len: {len(train_data_idrs)}") - - start = time.time() - train_dataloader = DataLoader(train_data_idrs, - collate_fn=KJTTransform(train_data_idrs).transform, - batch_size=None, - pin_memory=False, - num_workers=0) - print(f"dataloader: {time.time() - start}, len: {len(train_dataloader)}") - - data_iter = iter(train_dataloader) - for idx, batch in enumerate(data_iter): - print(f"rank: {rank}, it: {idx}, batch: {batch.dense_features}") - - if idx == 3: - break - print(f"allocate: {torch.cuda.memory_allocated()/1024**3:.2f} GB, " - f"reserved: {torch.cuda.memory_reserved()/1024**3:.2f} GB") - torch.cuda.synchronize() - # id_freq_map = get_id_freq_map("/data/criteo_preproc") - # print(id_freq_map.shape, id_freq_map.max(), id_freq_map.min()) + torch.cuda.set_device(rank) + + id_freq_map = get_id_freq_map("/data/criteo_preproc") + print( + f"rank: {rank}, shape: 
{id_freq_map.shape}, max: {id_freq_map.max().item()}, min: {id_freq_map.min().item()}, " + f"top 10: {id_freq_map[:10].tolist()}") if __name__ == "__main__": - world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) - world_rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) - run(world_rank, world_size) + # world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) + # world_rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) + os.environ["LIBCUDF_CUFILE_POLICY"] = "ALWAYS" + run(0, 1) From 39b21436cd3b105d33bbc000c5ef1c827962d6ae Mon Sep 17 00:00:00 2001 From: zxgx Date: Wed, 17 Aug 2022 10:35:06 +0800 Subject: [PATCH 06/19] single process DLRM for criteo terabyte --- recsys/datasets/criteo.py | 2 +- recsys/dlrm_main.py | 8 +++++++- run.sh | 14 ++++++++++---- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/recsys/datasets/criteo.py b/recsys/datasets/criteo.py index 9db6c3f..062dc4a 100644 --- a/recsys/datasets/criteo.py +++ b/recsys/datasets/criteo.py @@ -299,7 +299,7 @@ def _get_terabyte_dataloader(args, stage, rank, world_size): batch_size=None, pin_memory=False, collate_fn=KJTTransform(nv_iter).transform, - num_worker=0) + num_workers=0) return dataloader diff --git a/recsys/dlrm_main.py b/recsys/dlrm_main.py index c3257fe..645da2f 100644 --- a/recsys/dlrm_main.py +++ b/recsys/dlrm_main.py @@ -1,3 +1,4 @@ +import os from dataclasses import dataclass, field from typing import List, Optional from tqdm import tqdm @@ -207,7 +208,7 @@ def _train(model, model.train() rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() - + if use_overlap: data_iter = FiniteDataIter(data_loader) else: @@ -313,6 +314,7 @@ def main(): rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() + os.environ["CUDA_VISIBLE_DEVICES"] = str(rank) if args.memory_fraction is not None: torch.cuda.set_per_process_memory_fraction(args.memory_fraction) @@ -345,6 +347,10 @@ def main(): id_freq_map = None if args.use_freq: id_freq_map = data_module.get_id_freq_map(args.dataset_dir) + if not isinstance(id_freq_map, torch.Tensor): + id_freq_map = torch.from_numpy(id_freq_map).cuda(non_blocking=True) + else: + id_freq_map = id_freq_map.cuda(non_blocking=True) device = torch.device('cuda', torch.cuda.current_device()) sparse_device = torch.device('cpu') if args.use_cpu else device diff --git a/run.sh b/run.sh index c0ea734..e434fd3 100644 --- a/run.sh +++ b/run.sh @@ -8,13 +8,19 @@ # For Colossalai enabled recsys # criteo kaggle -torchx run -s local_cwd -cfg log_dir=tmp dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ - --dataset_dir criteo_kaggle_data --pin_memory --shuffle_batches \ - --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap +#torchx run -s local_cwd -cfg log_dir=tmp dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ +# --dataset_dir criteo_kaggle_data --pin_memory --shuffle_batches \ +# --learning_rate 1. 
--batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ +# --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap # avazu #torchx run -s local_cwd -cfg log_dir=tmp dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ # --dataset_dir avazu_sample --pin_memory --shuffle_batches \ # --learning_rate 5e-2 --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ # --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap + +# criteo terabyte +torchx run -s local_cwd -cfg log_dir=tmp dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_preproc \ + --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap \ No newline at end of file From ae2803d96f4169be3c3c2c88f70ef553e6f6c693 Mon Sep 17 00:00:00 2001 From: zxgx Date: Wed, 17 Aug 2022 14:25:15 +0800 Subject: [PATCH 07/19] multiprocess DLRM for criteo terabyte --- dist_wrapper.sh | 29 +++++++++ hvd_wrapper.sh | 14 ----- recsys/datasets/avazu.py | 3 +- recsys/datasets/criteo.py | 15 ++++- recsys/dlrm_main.py | 30 ++++++--- run.sh | 3 +- test.py | 124 +++++++++++++++++++++++++++++--------- 7 files changed, 163 insertions(+), 55 deletions(-) create mode 100644 dist_wrapper.sh delete mode 100644 hvd_wrapper.sh diff --git a/dist_wrapper.sh b/dist_wrapper.sh new file mode 100644 index 0000000..ba0436c --- /dev/null +++ b/dist_wrapper.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Due to the distributed GPU settings customized by NVTabular, +# we need some awkward wrapper to initialize the distributed settings in PyTorch code. +# +# Basically, we need to assign each process with a single & different device id to enable NVTabular, +# To cope with the visible device by pytorch, +# we force the OMPI_COMM_WORLD_LOCAL_RANK to 0 (Please refer to dlrm_main.py), +# and thus set each visible device in each process to cuda:0 (this part is done by colossalai under the hood). + +# Usage: +# mpirun -np bash dist_wrapper.sh python [training args] +# +# hovorodrun might also work since it invokes mpirun. 
+ +# Get local process ID from OpenMPI or Slurm +if [ -n "${OMPI_COMM_WORLD_LOCAL_RANK:-}" ]; then + LOCAL_RANK="${OMPI_COMM_WORLD_LOCAL_RANK}" +elif [ -n "${SLURM_LOCALID:-}" ]; then + LOCAL_RANK="${SLURM_LOCALID}" +fi + +if [ -z "${CUDA_VISIBLE_DEVICES:-}" ]; then + export CUDA_VISIBLE_DEVICES=${LOCAL_RANK} +else + device_list=(${CUDA_VISIBLE_DEVICES//","/ }) + export CUDA_VISIBLE_DEVICES=${device_list[$LOCAL_RANK]} +fi + +exec "$@" diff --git a/hvd_wrapper.sh b/hvd_wrapper.sh deleted file mode 100644 index 919064b..0000000 --- a/hvd_wrapper.sh +++ /dev/null @@ -1,14 +0,0 @@ - -#!/bin/bash - -# Get local process ID from OpenMPI or alternatively from SLURM -if [ -z "${CUDA_VISIBLE_DEVICES:-}" ]; then - if [ -n "${OMPI_COMM_WORLD_LOCAL_RANK:-}" ]; then - LOCAL_RANK="${OMPI_COMM_WORLD_LOCAL_RANK}" - elif [ -n "${SLURM_LOCALID:-}" ]; then - LOCAL_RANK="${SLURM_LOCALID}" - fi - export CUDA_VISIBLE_DEVICES=${LOCAL_RANK} -fi - -exec "$@" diff --git a/recsys/datasets/avazu.py b/recsys/datasets/avazu.py index 72284e3..f9b7715 100644 --- a/recsys/datasets/avazu.py +++ b/recsys/datasets/avazu.py @@ -247,4 +247,5 @@ def get_id_freq_map(path): file_processor = CriteoSparseProcessor(list(map(int, NUM_EMBEDDINGS_PER_FEATURE.split(',')))) feature_count = GlobalFeatureCounter(files, file_processor) - return feature_count.id_freq_map + id_freq_map = torch.from_numpy(feature_count.id_freq_map) + return id_freq_map diff --git a/recsys/datasets/criteo.py b/recsys/datasets/criteo.py index 062dc4a..3b87fef 100644 --- a/recsys/datasets/criteo.py +++ b/recsys/datasets/criteo.py @@ -293,6 +293,8 @@ def _get_terabyte_dataloader(args, stage, rank, world_size): global_rank=rank, global_size=world_size, drop_last=True, + shuffle=stage == "train", + seed_fn=lambda: args.seed, ) dataloader = DataLoader(nv_iter, @@ -315,6 +317,11 @@ def get_dataloader(args, stage, rank, world_size): def get_id_freq_map(path): + checkpoint_path = os.path.join(path, "id_freq_map.pt") + if os.path.exists(checkpoint_path): + id_freq_map = torch.load(checkpoint_path) + return id_freq_map + if 'kaggle' not in path: file_num = len(glob.glob(os.path.join(path, "train", "*.parquet"))) files = [os.path.join(path, "train", f"part_{i}.parquet") for i in range(file_num)] @@ -323,7 +330,7 @@ def get_id_freq_map(path): list(map(int, NUM_EMBEDDINGS_PER_FEATURE.split(','))), 16384, sample_fraction=0.1) - + id_freq_map = feature_count.id_freq_map else: files = os.listdir(path) sparse_files = list(filter(lambda s: 'sparse' in s, files)) @@ -331,5 +338,9 @@ def get_id_freq_map(path): file_processor = CriteoSparseProcessor(list(map(int, KAGGLE_NUM_EMBEDDINGS_PER_FEATURE.split(',')))) feature_count = GlobalFeatureCounter(sparse_files, file_processor) + id_freq_map = torch.from_numpy(feature_count.id_freq_map) + + if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: + torch.save(id_freq_map, checkpoint_path) - return feature_count.id_freq_map + return id_freq_map diff --git a/recsys/dlrm_main.py b/recsys/dlrm_main.py index 645da2f..cd330a3 100644 --- a/recsys/dlrm_main.py +++ b/recsys/dlrm_main.py @@ -259,6 +259,7 @@ def _evaluate(model, data_loader, stage, use_overlap, use_distributed_dataloader logits = model(dense, sparse).squeeze() preds = torch.sigmoid(logits) # dist_logger.info(f"pred: {preds.max(), preds.min()}") + labels = labels.int() auroc(preds, labels) accuracy(preds, labels) except StopIteration: @@ -305,17 +306,32 @@ def train_val_test( return train_val_test_results +def dist_config(args): + 
colossalai.logging.disable_existing_loggers() + + mpi_world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", None)) + if mpi_world_size is not None: + # below is just a trick for integrating NVTabular dataloader for criteo terabyte dataset + os.environ["OMPI_COMM_WORLD_LOCAL_RANK"] = "0" + colossalai.launch_from_openmpi( + config={}, + host=os.environ.get("MASTER_ADDR", "localhost"), + port=os.environ.get("MASTER_PORT", "12355"), + seed=args.seed, + verbose=False, + ) + else: + colossalai.launch_from_torch(config={}, seed=args.seed, verbose=False) + + def main(): args = parse_args() - colossalai.logging.disable_existing_loggers() - colossalai.launch_from_torch(config={}, seed=args.seed, verbose=False) + dist_config(args) rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() - os.environ["CUDA_VISIBLE_DEVICES"] = str(rank) - if args.memory_fraction is not None: torch.cuda.set_per_process_memory_fraction(args.memory_fraction) @@ -346,11 +362,7 @@ def main(): id_freq_map = None if args.use_freq: - id_freq_map = data_module.get_id_freq_map(args.dataset_dir) - if not isinstance(id_freq_map, torch.Tensor): - id_freq_map = torch.from_numpy(id_freq_map).cuda(non_blocking=True) - else: - id_freq_map = id_freq_map.cuda(non_blocking=True) + id_freq_map = data_module.get_id_freq_map(args.dataset_dir).cuda(non_blocking=True) device = torch.device('cuda', torch.cuda.current_device()) sparse_device = torch.device('cpu') if args.use_cpu else device diff --git a/run.sh b/run.sh index e434fd3..5eb558c 100644 --- a/run.sh +++ b/run.sh @@ -20,7 +20,6 @@ # --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap # criteo terabyte -torchx run -s local_cwd -cfg log_dir=tmp dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ - --dataset_dir /data/criteo_preproc \ +mpirun --allow-run-as-root -np 2 bash dist_wrapper.sh python recsys/dlrm_main.py --dataset_dir /data/criteo_preproc \ --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap \ No newline at end of file diff --git a/test.py b/test.py index 2d0cbe4..f69e0d8 100644 --- a/test.py +++ b/test.py @@ -1,16 +1,26 @@ import os +import time +import os +import shutil +from dask.distributed import Client +from dask_cuda import LocalCUDACluster +from nvtabular.utils import device_mem_size import torch import torch.distributed as dist +from torch.utils.data import DataLoader +import nvtabular as nvt +from nvtabular.loader.torch import TorchAsyncItr # , DLDataLoader import cupy from torchrec.sparse.jagged_tensor import KeyedJaggedTensor from torchrec.datasets.utils import Batch +import colossalai from recsys.datasets.criteo import get_id_freq_map +from recsys.utils import get_mem_info -INPUT_DATA_DIR = "/data/criteo_preproc/train/" +INPUT_DATA_DIR = "/data/criteo_preproc/test/" BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 16384)) -PARTS_PER_CHUNK = int(os.environ.get("PARTS_PER_CHUNK", 2)) CONTINUOUS_COLUMNS = ["int_" + str(x) for x in range(0, 13)] CATEGORICAL_COLUMNS = ["cat_" + str(x) for x in range(0, 26)] LABEL_COLUMNS = ["label"] @@ -56,41 +66,101 @@ def transform(self, batch): ) -def seed_fn(): - """ - Generate consistent dataloader shuffle seeds across workers - Reseeds each worker's dataloader each epoch to get fresh a shuffle - that's consistent across workers. 
- """ +def setup_dask(dask_workdir): + if os.path.exists(dask_workdir): + shutil.rmtree(dask_workdir) + os.makedirs(dask_workdir) + + device_limit_frac = 0.05 # Spill GPU-Worker memory to host at this limit. + device_pool_frac = 0.04 - max_rand = torch.iinfo(torch.int).max // world_size + # Use total device size to calculate device limit and pool_size + device_size = device_mem_size(kind="total") + device_limit = int(device_limit_frac * device_size) + device_pool_size = int(device_pool_frac * device_size) - # Generate a seed fragment - seed_fragment = cupy.random.randint(0, max_rand) + cluster = LocalCUDACluster( + protocol="tcp", + n_workers=1, + CUDA_VISIBLE_DEVICES=os.environ["CUDA_VISIBLE_DEVICES"], + device_memory_limit=device_limit, + local_directory=dask_workdir, + rmm_pool_size=(device_pool_size // 256) * 256, + ) - # Aggregate seed fragments from all workers - seed_tensor = torch.tensor(seed_fragment) # pylint: disable=not-callable - dist.all_reduce(seed_tensor, op=dist.ReduceOp.SUM) - return seed_tensor % max_rand + return Client(cluster) -def run(rank, world_size): +# def run(rank, world_size): +def run(): os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "12355" - - # initialize the process group - dist.init_process_group("nccl", rank=rank, world_size=world_size) - print(f"init rank: {rank}") - torch.cuda.set_device(rank) - - id_freq_map = get_id_freq_map("/data/criteo_preproc") - print( - f"rank: {rank}, shape: {id_freq_map.shape}, max: {id_freq_map.max().item()}, min: {id_freq_map.min().item()}, " - f"top 10: {id_freq_map[:10].tolist()}") + os.environ["OMPI_COMM_WORLD_LOCAL_RANK"] = '0' + + colossalai.logging.disable_existing_loggers() + colossalai.launch_from_openmpi(config={}, + host=os.environ["MASTER_ADDR"], + port=os.environ["MASTER_PORT"], + verbose=False) + + fname = "part_{}.parquet" + train_paths = [os.path.join(INPUT_DATA_DIR, fname.format(i)) for i in range(64)] + + print(f"{dist.get_rank()}/{dist.get_world_size()}: device: {torch.cuda.current_device()}") + + start = time.time() + train_data = nvt.Dataset(train_paths, engine="parquet", part_mem_fraction=0.02) + print(f"nvdtaset: {time.time() - start}") + start = time.time() + train_data_idrs = TorchAsyncItr( + train_data, + batch_size=BATCH_SIZE, + cats=CATEGORICAL_COLUMNS, + conts=CONTINUOUS_COLUMNS, + labels=LABEL_COLUMNS, + global_rank=0, + global_size=1, + drop_last=True, + shuffle=True, + seed_fn=lambda: 1, + ) + print(f"TorchAsyncItr: {time.time() - start}, len: {len(train_data_idrs)}") + + start = time.time() + train_dataloader = DataLoader(train_data_idrs, + collate_fn=KJTTransform(train_data_idrs).transform, + batch_size=None, + pin_memory=False, + num_workers=0) + print(f"dataloader: {time.time() - start}, len: {len(train_dataloader)}") + + data_iter = iter(train_dataloader) + for idx, batch in enumerate(data_iter): + print(f"rank: {dist.get_rank()}, it: {idx}, batch: {batch.dense_features}") + + if idx == 3: + break + print(f"allocate: {torch.cuda.memory_allocated()/1024**3:.2f} GB, " + f"reserved: {torch.cuda.memory_reserved()/1024**3:.2f} GB") + torch.cuda.synchronize() + # id_freq_map = get_id_freq_map("/data/criteo_preproc") + # print(id_freq_map.shape, id_freq_map.max(), id_freq_map.min()) if __name__ == "__main__": # world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) # world_rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) + + # rank = int(os.environ['RANK']) + # local_rank = int(os.environ['LOCAL_RANK']) + # world_size = int(os.environ['WORLD_SIZE']) + # host = 
os.environ['MASTER_ADDR'] + # port = int(os.environ['MASTER_PORT']) + # print(f"rank: {rank}/{world_size}, local rank: {local_rank}, host: {host}, port: {port}") + # os.environ["CUDA_VISIBLE_DEVICES"] = str(rank) + os.environ["LIBCUDF_CUFILE_POLICY"] = "ALWAYS" - run(0, 1) + + # setup_dask("dask_dir") + # run(world_rank, world_size) + run() From d563ead465167c179cc90bcaff1dede52fb84507 Mon Sep 17 00:00:00 2001 From: zxgx Date: Wed, 17 Aug 2022 15:30:41 +0800 Subject: [PATCH 08/19] fix distributed training for DLRM --- recsys/datasets/feature_counter.py | 2 +- recsys/models/dlrm.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/recsys/datasets/feature_counter.py b/recsys/datasets/feature_counter.py index 006c46a..b710723 100644 --- a/recsys/datasets/feature_counter.py +++ b/recsys/datasets/feature_counter.py @@ -86,7 +86,7 @@ def _collect_statistics(self): labels=["label"], global_rank=0, global_size=1, - drop_last=True, + drop_last=False, device='cpu', ) diff --git a/recsys/models/dlrm.py b/recsys/models/dlrm.py index d210c42..a048143 100644 --- a/recsys/models/dlrm.py +++ b/recsys/models/dlrm.py @@ -7,7 +7,7 @@ # # The infrastructures of DLRM are mainly inspired by TorchRec: # https://github.com/pytorch/torchrec/blob/main/torchrec/models/dlrm.py - +import os from contextlib import nullcontext import torch.nn as nn from torch.nn.parallel import DistributedDataParallel as DDP @@ -160,14 +160,14 @@ def __init__(self, warmup_ratio=warmup_ratio, buffer_size=buffer_size, is_dist_dataloader=is_dist_dataloader).to(sparse_device) - self.dense_modules = DDP(module=FusedDenseModules(embedding_dim, num_sparse_features, dense_in_features, - dense_arch_layer_sizes, - over_arch_layer_sizes).to(dense_device), - device_ids=[gpc.get_global_rank()], - process_group=gpc.get_group(ParallelMode.GLOBAL), - gradient_as_bucket_view=True, - broadcast_buffers=False, - static_graph=True) + self.dense_modules = DDP( + module=FusedDenseModules(embedding_dim, num_sparse_features, dense_in_features, dense_arch_layer_sizes, + over_arch_layer_sizes).to(dense_device), + device_ids=[0 if os.environ.get("OMPI_COMM_WORLD_SIZE", None) else gpc.get_global_rank()], + process_group=gpc.get_group(ParallelMode.GLOBAL), + gradient_as_bucket_view=True, + broadcast_buffers=False, + static_graph=True) # precompute for parallelized embedding param_amount = sum(num_embeddings_per_feature) * embedding_dim From 946f144afb00bb38b8b41ba7a12635c478749de7 Mon Sep 17 00:00:00 2001 From: zxgx Date: Wed, 17 Aug 2022 18:00:30 +0800 Subject: [PATCH 09/19] add trainig meter --- recsys/dlrm_main.py | 11 ++++++++++- run.sh | 2 +- test.py | 19 ++----------------- 3 files changed, 13 insertions(+), 19 deletions(-) diff --git a/recsys/dlrm_main.py b/recsys/dlrm_main.py index cd330a3..74a69b4 100644 --- a/recsys/dlrm_main.py +++ b/recsys/dlrm_main.py @@ -214,7 +214,9 @@ def _train(model, else: data_iter = iter(data_loader) - for it in tqdm(itertools.count(), desc=f"Epoch {epoch}"): + total = len(data_loader) if hasattr(data_loader, "__len__") else None + meter = tqdm(itertools.count(), desc=f"Epoch {epoch}", ncols=0, total=total) + for _ in meter: try: dense, sparse, labels = put_data_in_device(next(data_iter), model.dense_device, model.sparse_device, use_distributed_dataloader, rank, world_size) @@ -234,6 +236,13 @@ def _train(model, if prof: prof.step() + postfix_str = f"loss={loss.item:.2f}" + if hasattr(model.module.sparse_modules.embed, "num_miss_history"): + hit_rate = 
model.module.sparse_modules.embed.num_hits_history[-1] / ( + model.module.sparse_modules.embed.num_hits_history[-1] + + model.module.sparse_modules.embed.num_miss_history[-1]) + postfix_str += f" hit rate={hit_rate:.2f}" + meter.set_postfix_str(postfix_str) except StopIteration: dist_logger.info(f"{get_mem_info('Training: ')}") break diff --git a/run.sh b/run.sh index 5eb558c..99cbc23 100644 --- a/run.sh +++ b/run.sh @@ -20,6 +20,6 @@ # --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap # criteo terabyte -mpirun --allow-run-as-root -np 2 bash dist_wrapper.sh python recsys/dlrm_main.py --dataset_dir /data/criteo_preproc \ +mpirun -x LD_PRELOAD=libmpi.so --allow-run-as-root -np 1 bash dist_wrapper.sh python recsys/dlrm_main.py --dataset_dir /data/criteo_preproc \ --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap \ No newline at end of file diff --git a/test.py b/test.py index f69e0d8..40a0c6e 100644 --- a/test.py +++ b/test.py @@ -91,7 +91,6 @@ def setup_dask(dask_workdir): return Client(cluster) -# def run(rank, world_size): def run(): os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "12355" @@ -138,29 +137,15 @@ def run(): for idx, batch in enumerate(data_iter): print(f"rank: {dist.get_rank()}, it: {idx}, batch: {batch.dense_features}") - if idx == 3: + if idx == 30: break - print(f"allocate: {torch.cuda.memory_allocated()/1024**3:.2f} GB, " - f"reserved: {torch.cuda.memory_reserved()/1024**3:.2f} GB") + print(get_mem_info()) torch.cuda.synchronize() # id_freq_map = get_id_freq_map("/data/criteo_preproc") # print(id_freq_map.shape, id_freq_map.max(), id_freq_map.min()) if __name__ == "__main__": - # world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) - # world_rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) - - # rank = int(os.environ['RANK']) - # local_rank = int(os.environ['LOCAL_RANK']) - # world_size = int(os.environ['WORLD_SIZE']) - # host = os.environ['MASTER_ADDR'] - # port = int(os.environ['MASTER_PORT']) - # print(f"rank: {rank}/{world_size}, local rank: {local_rank}, host: {host}, port: {port}") - # os.environ["CUDA_VISIBLE_DEVICES"] = str(rank) - os.environ["LIBCUDF_CUFILE_POLICY"] = "ALWAYS" - # setup_dask("dask_dir") - # run(world_rank, world_size) run() From 8e106f3a0e4f348b41b754750eac2fae080b6236 Mon Sep 17 00:00:00 2001 From: zxgx Date: Wed, 17 Aug 2022 19:44:26 +0800 Subject: [PATCH 10/19] add option for torchrun launch --- dist_wrapper.sh | 4 +++- recsys/dlrm_main.py | 19 +++++++++++-------- recsys/models/dlrm.py | 16 ++++++++-------- run.sh | 11 +++++++++-- test.py | 11 ++++------- 5 files changed, 35 insertions(+), 26 deletions(-) diff --git a/dist_wrapper.sh b/dist_wrapper.sh index ba0436c..c59021a 100644 --- a/dist_wrapper.sh +++ b/dist_wrapper.sh @@ -9,6 +9,8 @@ # Usage: # mpirun -np bash dist_wrapper.sh python [training args] +# torchrun --nnode=1 --nproc_per_node= --no_python bash dist_wrapper.sh python \ +# [training args] # # hovorodrun might also work since it invokes mpirun. 
@@ -25,5 +27,5 @@ else device_list=(${CUDA_VISIBLE_DEVICES//","/ }) export CUDA_VISIBLE_DEVICES=${device_list[$LOCAL_RANK]} fi - +export NVT_TAG=1 exec "$@" diff --git a/recsys/dlrm_main.py b/recsys/dlrm_main.py index 74a69b4..03d49d2 100644 --- a/recsys/dlrm_main.py +++ b/recsys/dlrm_main.py @@ -236,12 +236,12 @@ def _train(model, if prof: prof.step() - postfix_str = f"loss={loss.item:.2f}" - if hasattr(model.module.sparse_modules.embed, "num_miss_history"): - hit_rate = model.module.sparse_modules.embed.num_hits_history[-1] / ( - model.module.sparse_modules.embed.num_hits_history[-1] + - model.module.sparse_modules.embed.num_miss_history[-1]) - postfix_str += f" hit rate={hit_rate:.2f}" + postfix_str = f"loss={loss.item():.4f}" + # if hasattr(model.sparse_modules.embed, "num_miss_history"): + # hit_rate = model.sparse_modules.embed.num_hits_history[-1] / ( + # model.sparse_modules.embed.num_hits_history[-1] + + # model.sparse_modules.embed.num_miss_history[-1]) + # postfix_str += f" hit rate={hit_rate*100:.2f}%" meter.set_postfix_str(postfix_str) except StopIteration: dist_logger.info(f"{get_mem_info('Training: ')}") @@ -318,10 +318,11 @@ def train_val_test( def dist_config(args): colossalai.logging.disable_existing_loggers() - mpi_world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", None)) + mpi_world_size = os.environ.get("OMPI_COMM_WORLD_SIZE", None) if mpi_world_size is not None: # below is just a trick for integrating NVTabular dataloader for criteo terabyte dataset - os.environ["OMPI_COMM_WORLD_LOCAL_RANK"] = "0" + if os.environ.get("NVT_TAG", None): + os.environ["OMPI_COMM_WORLD_LOCAL_RANK"] = "0" colossalai.launch_from_openmpi( config={}, host=os.environ.get("MASTER_ADDR", "localhost"), @@ -330,6 +331,8 @@ def dist_config(args): verbose=False, ) else: + if os.environ.get("NVT_TAG", None): + os.environ["LOCAL_RANK"] = "0" colossalai.launch_from_torch(config={}, seed=args.seed, verbose=False) diff --git a/recsys/models/dlrm.py b/recsys/models/dlrm.py index a048143..f41c7e7 100644 --- a/recsys/models/dlrm.py +++ b/recsys/models/dlrm.py @@ -160,14 +160,14 @@ def __init__(self, warmup_ratio=warmup_ratio, buffer_size=buffer_size, is_dist_dataloader=is_dist_dataloader).to(sparse_device) - self.dense_modules = DDP( - module=FusedDenseModules(embedding_dim, num_sparse_features, dense_in_features, dense_arch_layer_sizes, - over_arch_layer_sizes).to(dense_device), - device_ids=[0 if os.environ.get("OMPI_COMM_WORLD_SIZE", None) else gpc.get_global_rank()], - process_group=gpc.get_group(ParallelMode.GLOBAL), - gradient_as_bucket_view=True, - broadcast_buffers=False, - static_graph=True) + self.dense_modules = DDP(module=FusedDenseModules(embedding_dim, num_sparse_features, dense_in_features, + dense_arch_layer_sizes, + over_arch_layer_sizes).to(dense_device), + device_ids=[0 if os.environ.get("NVT_TAG", None) else gpc.get_global_rank()], + process_group=gpc.get_group(ParallelMode.GLOBAL), + gradient_as_bucket_view=True, + broadcast_buffers=False, + static_graph=True) # precompute for parallelized embedding param_amount = sum(num_embeddings_per_feature) * embedding_dim diff --git a/run.sh b/run.sh index 99cbc23..1c9f800 100644 --- a/run.sh +++ b/run.sh @@ -20,6 +20,13 @@ # --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap # criteo terabyte -mpirun -x LD_PRELOAD=libmpi.so --allow-run-as-root -np 1 bash dist_wrapper.sh python recsys/dlrm_main.py --dataset_dir /data/criteo_preproc \ +#mpirun -x LD_PRELOAD=libmpi.so --allow-run-as-root -np 2 bash dist_wrapper.sh python 
recsys/dlrm_main.py \ +# --dataset_dir /data/criteo_preproc \ +# --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ +# --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap + +# torchrun seems better than mpirun +torchrun --nnode=1 --nproc_per_node=2 --no_python bash dist_wrapper.sh python recsys/dlrm_main.py \ + --dataset_dir /data/criteo_preproc \ --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap \ No newline at end of file + --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap diff --git a/test.py b/test.py index 40a0c6e..965f6cf 100644 --- a/test.py +++ b/test.py @@ -92,15 +92,12 @@ def setup_dask(dask_workdir): def run(): - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "12355" - os.environ["OMPI_COMM_WORLD_LOCAL_RANK"] = '0' + # os.environ["MASTER_ADDR"] = "localhost" + # os.environ["MASTER_PORT"] = "12355" + os.environ["LOCAL_RANK"] = '0' colossalai.logging.disable_existing_loggers() - colossalai.launch_from_openmpi(config={}, - host=os.environ["MASTER_ADDR"], - port=os.environ["MASTER_PORT"], - verbose=False) + colossalai.launch_from_torch(config={}, verbose=False) fname = "part_{}.parquet" train_paths = [os.path.join(INPUT_DATA_DIR, fname.format(i)) for i in range(64)] From 7891441c2c02add57ca019dfea68a35e4eda59ba Mon Sep 17 00:00:00 2001 From: zxgx Date: Fri, 19 Aug 2022 15:55:17 +0800 Subject: [PATCH 11/19] update logger --- recsys/datasets/criteo.py | 2 +- recsys/datasets/feature_counter.py | 2 +- recsys/dlrm_main.py | 10 ++++-- run.sh | 5 +++ test.py | 55 ++++++++++++++++++++++++------ 5 files changed, 59 insertions(+), 15 deletions(-) diff --git a/recsys/datasets/criteo.py b/recsys/datasets/criteo.py index 3b87fef..818fb77 100644 --- a/recsys/datasets/criteo.py +++ b/recsys/datasets/criteo.py @@ -285,7 +285,7 @@ def _get_terabyte_dataloader(args, stage, rank, world_size): files = [os.path.join(args.dataset_dir, data_split, f"part_{i}.parquet") for i in range(file_num)] nv_iter = TorchAsyncItr( - nvt.Dataset(files, engine="parquet", part_mem_fraction=0.02), + nvt.Dataset(files, engine="parquet", part_size="256MB"), batch_size=args.batch_size, cats=DEFAULT_CAT_NAMES, conts=DEFAULT_INT_NAMES, diff --git a/recsys/datasets/feature_counter.py b/recsys/datasets/feature_counter.py index b710723..65048a3 100644 --- a/recsys/datasets/feature_counter.py +++ b/recsys/datasets/feature_counter.py @@ -79,7 +79,7 @@ def __init__(self, datafiles, hashes, batch_size, sample_fraction=0.05): def _collect_statistics(self): data_files = sorted(self.datafiles[:int(np.ceil(len(self.datafiles) * self.sample_fraction))]) nv_iter = TorchAsyncItr( - nvt.Dataset(data_files, engine="parquet", part_mem_fraction=0.02), + nvt.Dataset(data_files, engine="parquet", part_size="256MB"), batch_size=self.batch_size, cats=DEFAULT_CAT_NAMES, conts=DEFAULT_INT_NAMES, diff --git a/recsys/dlrm_main.py b/recsys/dlrm_main.py index 03d49d2..9cbcf78 100644 --- a/recsys/dlrm_main.py +++ b/recsys/dlrm_main.py @@ -236,13 +236,14 @@ def _train(model, if prof: prof.step() - postfix_str = f"loss={loss.item():.4f}" + # Below will introduce additional overhead + # postfix_str = f"loss={loss.item():.4f}" # if hasattr(model.sparse_modules.embed, "num_miss_history"): # hit_rate = model.sparse_modules.embed.num_hits_history[-1] / ( # model.sparse_modules.embed.num_hits_history[-1] + # model.sparse_modules.embed.num_miss_history[-1]) # 
postfix_str += f" hit rate={hit_rate*100:.2f}%" - meter.set_postfix_str(postfix_str) + # meter.set_postfix_str(postfix_str) except StopIteration: dist_logger.info(f"{get_mem_info('Training: ')}") break @@ -261,7 +262,10 @@ def _evaluate(model, data_loader, stage, use_overlap, use_distributed_dataloader data_iter = iter(data_loader) with torch.no_grad(): - for _ in tqdm(iter(int, 1), desc=f"Evaluating {stage} set"): + for _ in tqdm(itertools.count(), + desc=f"Evaluating {stage} set", + ncols=0, + total=len(data_loader) if hasattr(data_loader, "__len__") else None): try: dense, sparse, labels = put_data_in_device(next(data_iter), model.dense_device, model.sparse_device, use_distributed_dataloader, rank, world_size) diff --git a/run.sh b/run.sh index 1c9f800..e82005a 100644 --- a/run.sh +++ b/run.sh @@ -1,5 +1,10 @@ #!/bin/bash +export OMP_NUM_THREADS=1 +export MKL_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 +export NUMBA_NUM_THREADS=1 + # For TorchRec baseline #torchx run -s local_cwd -cfg log_dir=tmp dist.ddp -j 1x2 --script baselines/dlrm_main.py -- \ # --kaggle --in_memory_binary_criteo_path criteo_kaggle_data --embedding_dim 128 --pin_memory \ diff --git a/test.py b/test.py index 965f6cf..4e65b67 100644 --- a/test.py +++ b/test.py @@ -2,6 +2,8 @@ import time import os import shutil +from tqdm import tqdm +import itertools from dask.distributed import Client from dask_cuda import LocalCUDACluster @@ -18,8 +20,10 @@ import colossalai from recsys.datasets.criteo import get_id_freq_map from recsys.utils import get_mem_info +from fsspec.core import get_fs_token_paths +from merlin.core.utils import global_dask_client, _merlin_dask_client -INPUT_DATA_DIR = "/data/criteo_preproc/test/" +INPUT_DATA_DIR = "/data/criteo_preproc/train/" BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 16384)) CONTINUOUS_COLUMNS = ["int_" + str(x) for x in range(0, 13)] CATEGORICAL_COLUMNS = ["cat_" + str(x) for x in range(0, 26)] @@ -92,8 +96,6 @@ def setup_dask(dask_workdir): def run(): - # os.environ["MASTER_ADDR"] = "localhost" - # os.environ["MASTER_PORT"] = "12355" os.environ["LOCAL_RANK"] = '0' colossalai.logging.disable_existing_loggers() @@ -104,9 +106,35 @@ def run(): print(f"{dist.get_rank()}/{dist.get_world_size()}: device: {torch.cuda.current_device()}") + # fs, fs_token, paths2 = get_fs_token_paths(train_paths, mode="rb", storage_options={}) + # print(fs) + # print(fs_token) + # print(paths2) + # start = time.time() - train_data = nvt.Dataset(train_paths, engine="parquet", part_mem_fraction=0.02) - print(f"nvdtaset: {time.time() - start}") + train_data = nvt.Dataset(train_paths, engine="parquet", part_size="128MB") + print(f"nvdtaset: {time.time() - start}, is cpu: {train_data.cpu}") + print(f"Client: {global_dask_client()}, {_merlin_dask_client.get()}") + # + # # import pyarrow.dataset as pa_ds + # # dataset = pa_ds.dataset(train_paths, filesystem=fs) + # # print(f"frag path: {next(dataset.get_fragments()).path}") + # + # import cudf + # _df = cudf.io.read_parquet(train_paths[0], row_groups=1) + # print(f"df: {_df.shape}") + # print(f"take 1: {_df.take([1])}") + # print(f"memory usage: {_df.memory_usage(deep=True).sum()}") + # + # from pathlib import Path + # from merlin.schema.io.tensorflow_metadata import TensorflowMetadata + # schema_path = Path(train_paths[0]).parent + # print(f"Schema: {TensorflowMetadata.from_proto_text_file(schema_path).to_merlin_schema()}") + # + # ddf = train_data.engine.to_ddf() + # print(f"ddf: {ddf}") + # print(f"Npartition: {ddf.npartitions}, dataset partitions: 
{train_data.npartitions}") + start = time.time() train_data_idrs = TorchAsyncItr( train_data, @@ -122,6 +150,10 @@ def run(): ) print(f"TorchAsyncItr: {time.time() - start}, len: {len(train_data_idrs)}") + # import threading + # event = threading.Event() + # print(f"stop: {event.is_set()}") + start = time.time() train_dataloader = DataLoader(train_data_idrs, collate_fn=KJTTransform(train_data_idrs).transform, @@ -131,10 +163,13 @@ def run(): print(f"dataloader: {time.time() - start}, len: {len(train_dataloader)}") data_iter = iter(train_dataloader) - for idx, batch in enumerate(data_iter): - print(f"rank: {dist.get_rank()}, it: {idx}, batch: {batch.dense_features}") - - if idx == 30: + for idx in tqdm(itertools.count(), + desc=f"Rank {dist.get_rank()}", + ncols=0, + total=len(train_dataloader) if hasattr(train_dataloader, "__len__") else None): + batch = next(data_iter) + print(f"rank: {dist.get_rank()}, ix: {idx}, dense: {batch.dense_features}") + if idx == 5: break print(get_mem_info()) torch.cuda.synchronize() @@ -144,5 +179,5 @@ def run(): if __name__ == "__main__": os.environ["LIBCUDF_CUFILE_POLICY"] = "ALWAYS" - + # torchrun --nnode=1 --nproc_per_node=2 --no_python bash dist_wrapper.sh python run() From 19a8f004eb3b9664d8da00a1aaa50ec50b523fb7 Mon Sep 17 00:00:00 2001 From: zxgx Date: Sat, 20 Aug 2022 20:29:10 +0800 Subject: [PATCH 12/19] temporary timer solution for NVT --- recsys/dlrm_main.py | 12 +++++++++--- run.sh | 9 ++------- test.py | 20 ++++++++++---------- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/recsys/dlrm_main.py b/recsys/dlrm_main.py index 9cbcf78..3dfde7a 100644 --- a/recsys/dlrm_main.py +++ b/recsys/dlrm_main.py @@ -216,9 +216,15 @@ def _train(model, total = len(data_loader) if hasattr(data_loader, "__len__") else None meter = tqdm(itertools.count(), desc=f"Epoch {epoch}", ncols=0, total=total) + timer = colossalai.utils.Timer() for _ in meter: try: - dense, sparse, labels = put_data_in_device(next(data_iter), model.dense_device, model.sparse_device, + # We introduce a timer as a temporary solution to exclude interference + # due to the bugs exists in NVTabular dataloader, please see my discussion: + # https://github.com/dask/dask/discussions/9405. 
+ batch = next(data_iter) + timer.start() + dense, sparse, labels = put_data_in_device(batch, model.dense_device, model.sparse_device, use_distributed_dataloader, rank, world_size) with record_function("(zhg)forward pass"): logits = model(dense, sparse).squeeze() @@ -232,7 +238,7 @@ def _train(model, with record_function("(zhg)optimization"): optimizer.step() - + timer.stop(keep_in_history=True) if prof: prof.step() @@ -245,7 +251,7 @@ def _train(model, # postfix_str += f" hit rate={hit_rate*100:.2f}%" # meter.set_postfix_str(postfix_str) except StopIteration: - dist_logger.info(f"{get_mem_info('Training: ')}") + dist_logger.info(f"{get_mem_info('Training: ')}, average throughput: {timer.get_history_mean():.2f} it/s") break diff --git a/run.sh b/run.sh index e82005a..c9c9fb6 100644 --- a/run.sh +++ b/run.sh @@ -1,10 +1,5 @@ #!/bin/bash -export OMP_NUM_THREADS=1 -export MKL_NUM_THREADS=1 -export OPENBLAS_NUM_THREADS=1 -export NUMBA_NUM_THREADS=1 - # For TorchRec baseline #torchx run -s local_cwd -cfg log_dir=tmp dist.ddp -j 1x2 --script baselines/dlrm_main.py -- \ # --kaggle --in_memory_binary_criteo_path criteo_kaggle_data --embedding_dim 128 --pin_memory \ @@ -31,7 +26,7 @@ export NUMBA_NUM_THREADS=1 # --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap # torchrun seems better than mpirun -torchrun --nnode=1 --nproc_per_node=2 --no_python bash dist_wrapper.sh python recsys/dlrm_main.py \ - --dataset_dir /data/criteo_preproc \ +torchrun --nnode=1 --nproc_per_node=2 --log_dir=tmp -t 3 --no_python bash dist_wrapper.sh \ + python recsys/dlrm_main.py --dataset_dir /data/criteo_preproc \ --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap diff --git a/test.py b/test.py index 4e65b67..3d0e56e 100644 --- a/test.py +++ b/test.py @@ -1,4 +1,3 @@ -import os import time import os import shutil @@ -13,14 +12,11 @@ from torch.utils.data import DataLoader import nvtabular as nvt from nvtabular.loader.torch import TorchAsyncItr # , DLDataLoader -import cupy from torchrec.sparse.jagged_tensor import KeyedJaggedTensor from torchrec.datasets.utils import Batch import colossalai -from recsys.datasets.criteo import get_id_freq_map from recsys.utils import get_mem_info -from fsspec.core import get_fs_token_paths from merlin.core.utils import global_dask_client, _merlin_dask_client INPUT_DATA_DIR = "/data/criteo_preproc/train/" @@ -87,9 +83,11 @@ def setup_dask(dask_workdir): protocol="tcp", n_workers=1, CUDA_VISIBLE_DEVICES=os.environ["CUDA_VISIBLE_DEVICES"], - device_memory_limit=device_limit, + device_memory_limit="1GB", local_directory=dask_workdir, - rmm_pool_size=(device_pool_size // 256) * 256, + shared_filesystem=True, + memory_limit="100GB", + rmm_pool_size=None # (device_pool_size // 256) * 256, ) return Client(cluster) @@ -112,7 +110,7 @@ def run(): # print(paths2) # start = time.time() - train_data = nvt.Dataset(train_paths, engine="parquet", part_size="128MB") + train_data = nvt.Dataset(train_paths, engine="parquet", part_size="256MB") print(f"nvdtaset: {time.time() - start}, is cpu: {train_data.cpu}") print(f"Client: {global_dask_client()}, {_merlin_dask_client.get()}") # @@ -168,9 +166,9 @@ def run(): ncols=0, total=len(train_dataloader) if hasattr(train_dataloader, "__len__") else None): batch = next(data_iter) - print(f"rank: {dist.get_rank()}, ix: {idx}, dense: {batch.dense_features}") - if idx == 5: - break + # print(f"rank: {dist.get_rank()}, ix: {idx}, dense: 
{batch.dense_features}") + # if idx == 5: + # break print(get_mem_info()) torch.cuda.synchronize() # id_freq_map = get_id_freq_map("/data/criteo_preproc") @@ -179,5 +177,7 @@ def run(): if __name__ == "__main__": os.environ["LIBCUDF_CUFILE_POLICY"] = "ALWAYS" + client = setup_dask("dask_dir") + print(client.dashboard_link) # torchrun --nnode=1 --nproc_per_node=2 --no_python bash dist_wrapper.sh python run() From bfcb30e75884b59685aec91cd5bc508fef6e2f76 Mon Sep 17 00:00:00 2001 From: zxgx Date: Sat, 20 Aug 2022 21:24:06 +0800 Subject: [PATCH 13/19] polish timer --- recsys/dlrm_main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/recsys/dlrm_main.py b/recsys/dlrm_main.py index 3dfde7a..fb66b4b 100644 --- a/recsys/dlrm_main.py +++ b/recsys/dlrm_main.py @@ -251,8 +251,10 @@ def _train(model, # postfix_str += f" hit rate={hit_rate*100:.2f}%" # meter.set_postfix_str(postfix_str) except StopIteration: - dist_logger.info(f"{get_mem_info('Training: ')}, average throughput: {timer.get_history_mean():.2f} it/s") + dist_logger.info(f"{get_mem_info('Training: ')}") break + if hasattr(data_loader, "__len__"): + dist_logger.info(f"average throughput: {len(data_loader) / timer.get_history_sum():.2f} it/s") def _evaluate(model, data_loader, stage, use_overlap, use_distributed_dataloader): From a6b5d23d3aee4382e0b2090867e75a4b993bc390 Mon Sep 17 00:00:00 2001 From: zxgx Date: Tue, 23 Aug 2022 16:00:50 +0800 Subject: [PATCH 14/19] change nvtabular dataloader to petastorm dataloader --- dist_wrapper.sh | 31 ----- recsys/datasets/avazu.py | 7 +- recsys/datasets/criteo.py | 170 +++++++++++++++++++++------ recsys/datasets/feature_counter.py | 132 ++++++--------------- recsys/dlrm_main.py | 35 ++---- run.sh | 10 +- test.py | 183 ----------------------------- test_petastorm.py | 38 ++++++ 8 files changed, 223 insertions(+), 383 deletions(-) delete mode 100644 dist_wrapper.sh delete mode 100644 test.py create mode 100644 test_petastorm.py diff --git a/dist_wrapper.sh b/dist_wrapper.sh deleted file mode 100644 index c59021a..0000000 --- a/dist_wrapper.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -# Due to the distributed GPU settings customized by NVTabular, -# we need some awkward wrapper to initialize the distributed settings in PyTorch code. -# -# Basically, we need to assign each process with a single & different device id to enable NVTabular, -# To cope with the visible device by pytorch, -# we force the OMPI_COMM_WORLD_LOCAL_RANK to 0 (Please refer to dlrm_main.py), -# and thus set each visible device in each process to cuda:0 (this part is done by colossalai under the hood). - -# Usage: -# mpirun -np bash dist_wrapper.sh python [training args] -# torchrun --nnode=1 --nproc_per_node= --no_python bash dist_wrapper.sh python \ -# [training args] -# -# hovorodrun might also work since it invokes mpirun. 
- -# Get local process ID from OpenMPI or Slurm -if [ -n "${OMPI_COMM_WORLD_LOCAL_RANK:-}" ]; then - LOCAL_RANK="${OMPI_COMM_WORLD_LOCAL_RANK}" -elif [ -n "${SLURM_LOCALID:-}" ]; then - LOCAL_RANK="${SLURM_LOCALID}" -fi - -if [ -z "${CUDA_VISIBLE_DEVICES:-}" ]; then - export CUDA_VISIBLE_DEVICES=${LOCAL_RANK} -else - device_list=(${CUDA_VISIBLE_DEVICES//","/ }) - export CUDA_VISIBLE_DEVICES=${device_list[$LOCAL_RANK]} -fi -export NVT_TAG=1 -exec "$@" diff --git a/recsys/datasets/avazu.py b/recsys/datasets/avazu.py index f9b7715..95a543f 100644 --- a/recsys/datasets/avazu.py +++ b/recsys/datasets/avazu.py @@ -9,7 +9,7 @@ from torchrec.datasets.utils import LoadFiles, ReadLinesFromCSV, PATH_MANAGER_KEY, Batch from torchrec.datasets.criteo import BinaryCriteoUtils -from .feature_counter import CriteoSparseProcessor, GlobalFeatureCounter +from .feature_counter import GlobalFeatureCounter CAT_FEATURE_COUNT = 13 INT_FEATURE_COUNT = 8 @@ -245,7 +245,6 @@ def get_id_freq_map(path): files = list(filter(lambda s: "sparse" in s, files)) files = [os.path.join(path, _f) for _f in files] - file_processor = CriteoSparseProcessor(list(map(int, NUM_EMBEDDINGS_PER_FEATURE.split(',')))) - feature_count = GlobalFeatureCounter(files, file_processor) - id_freq_map = torch.from_numpy(feature_count.id_freq_map) + feature_count = GlobalFeatureCounter(files, list(map(int, NUM_EMBEDDINGS_PER_FEATURE.split(',')))) + id_freq_map = torch.from_numpy(feature_count.compute()) return id_freq_map diff --git a/recsys/datasets/criteo.py b/recsys/datasets/criteo.py index 818fb77..93f6e66 100644 --- a/recsys/datasets/criteo.py +++ b/recsys/datasets/criteo.py @@ -6,26 +6,23 @@ # LICENSE file in the root directory of this source tree. import os +import random from typing import Dict, Iterator, List, Optional import numpy as np import glob -from torchrec.datasets.criteo import (CAT_FEATURE_COUNT, DEFAULT_CAT_NAMES, DEFAULT_INT_NAMES, DAYS, BinaryCriteoUtils) +from torchrec.datasets.criteo import (CAT_FEATURE_COUNT, DEFAULT_CAT_NAMES, DEFAULT_INT_NAMES, DEFAULT_LABEL_NAME, DAYS, + BinaryCriteoUtils) from torchrec.datasets.utils import PATH_MANAGER_KEY, Batch from torchrec.sparse.jagged_tensor import KeyedJaggedTensor from iopath.common.file_io import PathManager, PathManagerFactory from pyre_extensions import none_throws import torch from torch.utils.data import DataLoader, IterableDataset -try: - # pyre-ignore[21] - import nvtabular as nvt - from nvtabular.loader.torch import TorchAsyncItr -except ImportError: - print("Unable to import NVTabular, which indicates that you cannot load criteo 1TB dataset with our solution") +from petastorm import make_batch_reader +from pyarrow.parquet import ParquetDataset -from .feature_counter import CriteoSparseProcessor, GlobalFeatureCounter, NVTabularFeatureCounter -from .utils import KJTTransform +from .feature_counter import GlobalFeatureCounter, PetastormCounter STAGES = ["train", "val", "test"] @@ -231,6 +228,117 @@ def __len__(self) -> int: return self.num_batches +class PetastormDataReader(IterableDataset): + + def __init__(self, + paths, + batch_size, + rank=None, + world_size=None, + shuffle_batches=False, + hashes=None, + seed=1024, + drop_last=False): + self.dataset = ParquetDataset(paths, use_legacy_dataset=False) + self.batch_size = batch_size + self.rank = rank + self.world_size = world_size + self.shuffle_batches = shuffle_batches + self.hashes = np.array(hashes).reshape((1, CAT_FEATURE_COUNT)) if hashes is not None else None + self.sparse_offsets = np.array([0, 
*np.cumsum(hashes)[:-1]], dtype=np.int64).reshape(-1, 1) \ + if hashes is not None else None + + self._num_ids_in_batch: int = CAT_FEATURE_COUNT * batch_size + self.keys: List[str] = DEFAULT_CAT_NAMES + self.lengths: torch.Tensor = torch.ones((self._num_ids_in_batch,), dtype=torch.int32) + self.offsets: torch.Tensor = torch.arange(0, self._num_ids_in_batch + 1, dtype=torch.int32) + self.stride = batch_size + self.length_per_key: List[int] = CAT_FEATURE_COUNT * [batch_size] + self.offset_per_key: List[int] = [batch_size * i for i in range(CAT_FEATURE_COUNT + 1)] + self.index_per_key: Dict[str, int] = {key: i for (i, key) in enumerate(self.keys)} + self.seed = seed + self.epoch = 0 + + self.drop_last = drop_last + if drop_last: + self.num_batches = sum([fragment.metadata.num_rows for fragment in self.dataset.fragments + ]) // self.batch_size + else: + self.num_batches = (sum([fragment.metadata.num_rows + for fragment in self.dataset.fragments]) + self.batch_size - 1) // self.batch_size + + def __iter__(self): + buffer: Optional[List[np.ndarray]] = None + + def append_to_buffer(_dense: np.ndarray, _sparse: np.ndarray, _labels: np.ndarray) -> None: + nonlocal buffer + if buffer is None: + buffer = [_dense, _sparse, _labels] + else: + buffer[0] = np.concatenate([buffer[0], _dense], axis=0) + buffer[1] = np.concatenate([buffer[1], _sparse], axis=1) + buffer[2] = np.concatenate([buffer[2], _labels], axis=0) + + random.seed(self.seed + self.epoch) # for sync RNG inside the petastorm reader + self.epoch += 1 + with make_batch_reader( + list(map(lambda x: "file://" + x, self.dataset.files)), + num_epochs=1, + workers_count=1, # for reproducibility + ) as reader: + # note that `batch` here is just a bunch of samples read by petastorm instead of `batch` consumed by models + for batch in reader: + labels = getattr(batch, DEFAULT_LABEL_NAME) + sparse = np.concatenate([getattr(batch, col_name).reshape(1, -1) for col_name in DEFAULT_CAT_NAMES], + axis=0) + if self.sparse_offsets is not None: + sparse = sparse + self.sparse_offsets + dense = np.concatenate([getattr(batch, col_name).reshape(-1, 1) for col_name in DEFAULT_INT_NAMES], + axis=1) + start_idx = 0 + while start_idx < dense.shape[0]: + buffer_size = 0 if buffer is None else buffer[0].shape[0] + if buffer_size == self.batch_size: + _num_batch += 1 + yield self._batch_ndarray(*buffer) + buffer = None + else: + rows_to_get = min(self.batch_size - buffer_size, dense.shape[0] - start_idx) + label_chunk = labels[start_idx:start_idx + rows_to_get] + sparse_chunk = sparse[:, start_idx:start_idx + rows_to_get] + dense_chunk = dense[start_idx:start_idx + rows_to_get, :] + append_to_buffer(dense_chunk, sparse_chunk, label_chunk) + start_idx += rows_to_get + if buffer is not None and not self.drop_last: + yield self._batch_ndarray(*buffer) + + def _batch_ndarray(self, dense: np.ndarray, sparse: np.ndarray, labels: np.ndarray): + if self.shuffle_batches: + # Shuffle all 3 in unison + shuffler = np.random.permutation(len(dense)) + dense = dense[shuffler] + sparse = sparse[shuffler] + labels = labels[shuffler] + + return Batch( + dense_features=torch.from_numpy(dense), + sparse_features=KeyedJaggedTensor( + keys=self.keys, + values=torch.from_numpy(sparse.reshape(-1)), + lengths=self.lengths, + offsets=self.offsets, + stride=self.stride, + length_per_key=self.length_per_key, + offset_per_key=self.offset_per_key, + index_per_key=self.index_per_key, + ), + labels=torch.from_numpy(labels.reshape(-1)), + ) + + def __len__(self): + return self.num_batches + + def 
_get_kaggle_dataloader(args, stage, rank, world_size): files = os.listdir(args.dataset_dir) @@ -278,30 +386,24 @@ def _get_terabyte_dataloader(args, stage, rank, world_size): else: data_split = "test" - if world_size > 1: - raise NotImplementedError("We do not support distributed dataloader currently.") + if world_size > 1 or rank != 0: + raise RuntimeError("We do not support distributed dataloader currently.") file_num = len(glob.glob(os.path.join(args.dataset_dir, data_split, "*.parquet"))) files = [os.path.join(args.dataset_dir, data_split, f"part_{i}.parquet") for i in range(file_num)] - nv_iter = TorchAsyncItr( - nvt.Dataset(files, engine="parquet", part_size="256MB"), - batch_size=args.batch_size, - cats=DEFAULT_CAT_NAMES, - conts=DEFAULT_INT_NAMES, - labels=["label"], - global_rank=rank, - global_size=world_size, - drop_last=True, - shuffle=stage == "train", - seed_fn=lambda: args.seed, - ) - - dataloader = DataLoader(nv_iter, + dataloader = DataLoader(PetastormDataReader(files, + args.batch_size, + rank=None, + world_size=None, + shuffle_batches=stage == "train", + hashes=args.num_embeddings_per_feature, + seed=args.seed), batch_size=None, pin_memory=False, - collate_fn=KJTTransform(nv_iter).transform, + collate_fn=lambda x: x, num_workers=0) + return dataloader @@ -325,21 +427,19 @@ def get_id_freq_map(path): if 'kaggle' not in path: file_num = len(glob.glob(os.path.join(path, "train", "*.parquet"))) files = [os.path.join(path, "train", f"part_{i}.parquet") for i in range(file_num)] - - feature_count = NVTabularFeatureCounter(files, - list(map(int, NUM_EMBEDDINGS_PER_FEATURE.split(','))), - 16384, - sample_fraction=0.1) - id_freq_map = feature_count.id_freq_map + feature_count = PetastormCounter(files, + list(map(int, NUM_EMBEDDINGS_PER_FEATURE.split(','))), + subsample_fraction=0.1) + id_freq_map = feature_count.compute() else: files = os.listdir(path) sparse_files = list(filter(lambda s: 'sparse' in s, files)) sparse_files = [os.path.join(path, _f) for _f in sparse_files] - file_processor = CriteoSparseProcessor(list(map(int, KAGGLE_NUM_EMBEDDINGS_PER_FEATURE.split(',')))) - feature_count = GlobalFeatureCounter(sparse_files, file_processor) - id_freq_map = torch.from_numpy(feature_count.id_freq_map) + feature_count = GlobalFeatureCounter(sparse_files, list(map(int, KAGGLE_NUM_EMBEDDINGS_PER_FEATURE.split(',')))) + id_freq_map = feature_count.compute() + id_freq_map = torch.from_numpy(id_freq_map) if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: torch.save(id_freq_map, checkpoint_path) diff --git a/recsys/datasets/feature_counter.py b/recsys/datasets/feature_counter.py index 65048a3..adc1952 100644 --- a/recsys/datasets/feature_counter.py +++ b/recsys/datasets/feature_counter.py @@ -1,112 +1,54 @@ import abc -import itertools +import random from tqdm import tqdm import numpy as np -from contexttimer import Timer -import torch -from torch.utils.data import DataLoader -try: - # pyre-ignore[21] - import nvtabular as nvt - from nvtabular.loader.torch import TorchAsyncItr -except ImportError: - print("Unable to import NVTabular, which indicates that you cannot load criteo 1TB dataset with our solution") +from .criteo import DEFAULT_CAT_NAMES +from petastorm import make_batch_reader -from .criteo import DEFAULT_CAT_NAMES, DEFAULT_INT_NAMES -from .utils import KJTTransform - -class CriteoSparseProcessor: - - def __init__(self, hash_sizes): - self.hash_sizes = np.array(hash_sizes).reshape(1, -1) - self.offsets = np.array([0, 
*np.cumsum(hash_sizes)[:-1]]).reshape(1, -1) - - def __call__(self, _f): - arr = np.load(_f) - arr %= self.hash_sizes - arr += self.offsets - flattened = arr.reshape(-1) - bins = np.bincount(flattened, minlength=self.hash_sizes.sum()) - return bins - - -class BaseFeatureCounter(abc.ABC): - - def __init__(self, datafiles): - self.datafiles = datafiles - self._id_freq_map = None - self._collect_statistics() - - @abc.abstractmethod - def _collect_statistics(self): - pass - - @property - def id_freq_map(self): - return self._id_freq_map - - -class GlobalFeatureCounter(BaseFeatureCounter): +class GlobalFeatureCounter: """ compute the global statistics of the whole training set """ - def __init__(self, datafiles, file_callback): - self.file_processor = file_callback - - super(GlobalFeatureCounter, self).__init__(datafiles) + def __init__(self, datafiles, hash_sizes): + self.datafiles = datafiles + self.hash_sizes = np.array(hash_sizes).reshape(1, -1) + self.offsets = np.array([0, *np.cumsum(hash_sizes)[:-1]]).reshape(1, -1) - def _collect_statistics(self): + def compute(self): + id_freq_map = np.zeros(self.hash_sizes.sum(), dtype=np.int64) for _f in self.datafiles: - if self._id_freq_map is None: - self._id_freq_map = self.file_processor(_f) - else: - self._id_freq_map += self.file_processor(_f) + arr = np.load(_f) + arr %= self.hash_sizes + arr += self.offsets + flattened = arr.reshape(-1) + id_freq_map += np.bincount(flattened, minlength=self.hash_sizes.sum()) + return id_freq_map -class NVTabularFeatureCounter: +class PetastormCounter: - def __init__(self, datafiles, hashes, batch_size, sample_fraction=0.05): + def __init__(self, datafiles, hash_sizes, subsample_fraction=0.2, seed=1024): self.datafiles = datafiles - self._id_freq_map = torch.zeros(sum(hashes), dtype=torch.long) - self.batch_size = batch_size - self.pre_ones = torch.ones(batch_size * len(DEFAULT_CAT_NAMES), dtype=torch.long) - self.sample_fraction = sample_fraction - self._collect_statistics() - - def _collect_statistics(self): - data_files = sorted(self.datafiles[:int(np.ceil(len(self.datafiles) * self.sample_fraction))]) - nv_iter = TorchAsyncItr( - nvt.Dataset(data_files, engine="parquet", part_size="256MB"), - batch_size=self.batch_size, - cats=DEFAULT_CAT_NAMES, - conts=DEFAULT_INT_NAMES, - labels=["label"], - global_rank=0, - global_size=1, - drop_last=False, - device='cpu', - ) + self.total_features = sum(hash_sizes) - dataloader = DataLoader(nv_iter, - batch_size=None, - pin_memory=False, - collate_fn=KJTTransform(nv_iter).transform, - num_workers=0) - data_iter = iter(dataloader) - with Timer() as timer: - for it in tqdm(itertools.count()): - try: - sparse = next(data_iter).sparse_features.values() - ones = self.pre_ones.narrow(0, start=0, length=sparse.shape[0]) - self._id_freq_map.index_add_(dim=0, index=sparse, source=ones) - except StopIteration: - break - print(f"collect statistics over files: {data_files} num batch: {len(dataloader)}, batch size: {self.batch_size}" - f", average time cost: {len(dataloader) / timer.elapsed:.2f} batch/s") - - @property - def id_freq_map(self): - return self._id_freq_map + self.offsets = np.array([0, *np.cumsum(hash_sizes)[:-1]]).reshape(1, -1) + self.subsample_fraction = subsample_fraction + self.seed = seed + + def compute(self): + _id_freq_map = np.zeros(self.total_features, dtype=np.int64) + random.seed(self.seed) + files = list(map(lambda x: "file://" + x, self.datafiles)) + random.shuffle(files) + if 0. 
< self.subsample_fraction < 1.: + files = files[:int(np.ceil(len(files)) * self.subsample_fraction)] + with make_batch_reader(files, num_epochs=1) as reader: + for batch in tqdm(reader, ncols=0, desc="Collecting id-freq map"): + sparse = np.concatenate([getattr(batch, col_name).reshape(-1, 1) for col_name in DEFAULT_CAT_NAMES], + axis=1) + sparse = (sparse + self.offsets).reshape(-1) + _id_freq_map += np.bincount(sparse, minlength=self.total_features) + return _id_freq_map diff --git a/recsys/dlrm_main.py b/recsys/dlrm_main.py index fb66b4b..3a0dd69 100644 --- a/recsys/dlrm_main.py +++ b/recsys/dlrm_main.py @@ -1,4 +1,5 @@ import os +import time from dataclasses import dataclass, field from typing import List, Optional from tqdm import tqdm @@ -216,14 +217,14 @@ def _train(model, total = len(data_loader) if hasattr(data_loader, "__len__") else None meter = tqdm(itertools.count(), desc=f"Epoch {epoch}", ncols=0, total=total) - timer = colossalai.utils.Timer() + time_elapse = 0. for _ in meter: try: # We introduce a timer as a temporary solution to exclude interference # due to the bugs exists in NVTabular dataloader, please see my discussion: # https://github.com/dask/dask/discussions/9405. batch = next(data_iter) - timer.start() + start = time.time() dense, sparse, labels = put_data_in_device(batch, model.dense_device, model.sparse_device, use_distributed_dataloader, rank, world_size) with record_function("(zhg)forward pass"): @@ -238,7 +239,7 @@ def _train(model, with record_function("(zhg)optimization"): optimizer.step() - timer.stop(keep_in_history=True) + time_elapse += time.time() - start if prof: prof.step() @@ -254,7 +255,7 @@ def _train(model, dist_logger.info(f"{get_mem_info('Training: ')}") break if hasattr(data_loader, "__len__"): - dist_logger.info(f"average throughput: {len(data_loader) / timer.get_history_sum():.2f} it/s") + dist_logger.info(f"average throughput: {len(data_loader) / time_elapse:.2f} it/s") def _evaluate(model, data_loader, stage, use_overlap, use_distributed_dataloader): @@ -327,31 +328,11 @@ def train_val_test( return train_val_test_results -def dist_config(args): - colossalai.logging.disable_existing_loggers() - - mpi_world_size = os.environ.get("OMPI_COMM_WORLD_SIZE", None) - if mpi_world_size is not None: - # below is just a trick for integrating NVTabular dataloader for criteo terabyte dataset - if os.environ.get("NVT_TAG", None): - os.environ["OMPI_COMM_WORLD_LOCAL_RANK"] = "0" - colossalai.launch_from_openmpi( - config={}, - host=os.environ.get("MASTER_ADDR", "localhost"), - port=os.environ.get("MASTER_PORT", "12355"), - seed=args.seed, - verbose=False, - ) - else: - if os.environ.get("NVT_TAG", None): - os.environ["LOCAL_RANK"] = "0" - colossalai.launch_from_torch(config={}, seed=args.seed, verbose=False) - - def main(): args = parse_args() - dist_config(args) + colossalai.logging.disable_existing_loggers() + colossalai.launch_from_torch(config={}, seed=args.seed, verbose=False) rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() @@ -378,7 +359,7 @@ def main(): val_dataloader = data_module.get_dataloader(args, "val", **dataloader_factory) test_dataloader = data_module.get_dataloader(args, "test", **dataloader_factory) - if args.dataset_dir is not None: + if args.dataset_dir is not None and hasattr(train_dataloader, "__len__"): dist_logger.info( f"training batches: {len(train_dataloader)}, val batches: {len(val_dataloader)}, " f"test batches: {len(test_dataloader)}", diff --git a/run.sh b/run.sh index c9c9fb6..7975210 
100644 --- a/run.sh +++ b/run.sh @@ -20,13 +20,7 @@ # --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap # criteo terabyte -#mpirun -x LD_PRELOAD=libmpi.so --allow-run-as-root -np 2 bash dist_wrapper.sh python recsys/dlrm_main.py \ -# --dataset_dir /data/criteo_preproc \ -# --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ -# --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap - -# torchrun seems better than mpirun -torchrun --nnode=1 --nproc_per_node=2 --log_dir=tmp -t 3 --no_python bash dist_wrapper.sh \ - python recsys/dlrm_main.py --dataset_dir /data/criteo_preproc \ +torchx run -s local_cwd -cfg log_dir=tmp dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/scratch/criteo_terabyte/criteo_preproc/ \ --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap diff --git a/test.py b/test.py deleted file mode 100644 index 3d0e56e..0000000 --- a/test.py +++ /dev/null @@ -1,183 +0,0 @@ -import time -import os -import shutil -from tqdm import tqdm -import itertools - -from dask.distributed import Client -from dask_cuda import LocalCUDACluster -from nvtabular.utils import device_mem_size -import torch -import torch.distributed as dist -from torch.utils.data import DataLoader -import nvtabular as nvt -from nvtabular.loader.torch import TorchAsyncItr # , DLDataLoader -from torchrec.sparse.jagged_tensor import KeyedJaggedTensor -from torchrec.datasets.utils import Batch - -import colossalai -from recsys.utils import get_mem_info -from merlin.core.utils import global_dask_client, _merlin_dask_client - -INPUT_DATA_DIR = "/data/criteo_preproc/train/" -BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 16384)) -CONTINUOUS_COLUMNS = ["int_" + str(x) for x in range(0, 13)] -CATEGORICAL_COLUMNS = ["cat_" + str(x) for x in range(0, 26)] -LABEL_COLUMNS = ["label"] - - -class KJTTransform: - - def __init__(self, dataloader): - self.batch_size = dataloader.batch_size - self.cats = dataloader.cat_names - self.conts = dataloader.cont_names - self.labels = dataloader.label_names - - _num_ids_in_batch = len(self.cats) * self.batch_size - self.lengths = torch.ones((_num_ids_in_batch,), dtype=torch.int32) - self.offsets = torch.arange(0, _num_ids_in_batch + 1, dtype=torch.int32) - self.length_per_key = len(self.cats) * [self.batch_size] - self.offset_per_key = [self.batch_size * i for i in range(len(self.cats) + 1)] - self.index_per_key = {key: i for (i, key) in enumerate(self.cats)} - - def transform(self, batch): - sparse, dense = [], [] - for col in self.cats: - sparse.append(batch[0][col]) - sparse = torch.cat(sparse, dim=1) - for col in self.conts: - dense.append(batch[0][col]) - dense = torch.cat(dense, dim=1) - - return Batch( - dense_features=dense, - sparse_features=KeyedJaggedTensor( - keys=self.cats, - values=sparse.transpose(1, 0).reshape(-1), - lengths=self.lengths, - offsets=self.offsets, - stride=self.batch_size, - length_per_key=self.length_per_key, - offset_per_key=self.offset_per_key, - index_per_key=self.index_per_key, - ), - labels=batch[1], - ) - - -def setup_dask(dask_workdir): - if os.path.exists(dask_workdir): - shutil.rmtree(dask_workdir) - os.makedirs(dask_workdir) - - device_limit_frac = 0.05 # Spill GPU-Worker memory to host at this limit. 
- device_pool_frac = 0.04 - - # Use total device size to calculate device limit and pool_size - device_size = device_mem_size(kind="total") - device_limit = int(device_limit_frac * device_size) - device_pool_size = int(device_pool_frac * device_size) - - cluster = LocalCUDACluster( - protocol="tcp", - n_workers=1, - CUDA_VISIBLE_DEVICES=os.environ["CUDA_VISIBLE_DEVICES"], - device_memory_limit="1GB", - local_directory=dask_workdir, - shared_filesystem=True, - memory_limit="100GB", - rmm_pool_size=None # (device_pool_size // 256) * 256, - ) - - return Client(cluster) - - -def run(): - os.environ["LOCAL_RANK"] = '0' - - colossalai.logging.disable_existing_loggers() - colossalai.launch_from_torch(config={}, verbose=False) - - fname = "part_{}.parquet" - train_paths = [os.path.join(INPUT_DATA_DIR, fname.format(i)) for i in range(64)] - - print(f"{dist.get_rank()}/{dist.get_world_size()}: device: {torch.cuda.current_device()}") - - # fs, fs_token, paths2 = get_fs_token_paths(train_paths, mode="rb", storage_options={}) - # print(fs) - # print(fs_token) - # print(paths2) - # - start = time.time() - train_data = nvt.Dataset(train_paths, engine="parquet", part_size="256MB") - print(f"nvdtaset: {time.time() - start}, is cpu: {train_data.cpu}") - print(f"Client: {global_dask_client()}, {_merlin_dask_client.get()}") - # - # # import pyarrow.dataset as pa_ds - # # dataset = pa_ds.dataset(train_paths, filesystem=fs) - # # print(f"frag path: {next(dataset.get_fragments()).path}") - # - # import cudf - # _df = cudf.io.read_parquet(train_paths[0], row_groups=1) - # print(f"df: {_df.shape}") - # print(f"take 1: {_df.take([1])}") - # print(f"memory usage: {_df.memory_usage(deep=True).sum()}") - # - # from pathlib import Path - # from merlin.schema.io.tensorflow_metadata import TensorflowMetadata - # schema_path = Path(train_paths[0]).parent - # print(f"Schema: {TensorflowMetadata.from_proto_text_file(schema_path).to_merlin_schema()}") - # - # ddf = train_data.engine.to_ddf() - # print(f"ddf: {ddf}") - # print(f"Npartition: {ddf.npartitions}, dataset partitions: {train_data.npartitions}") - - start = time.time() - train_data_idrs = TorchAsyncItr( - train_data, - batch_size=BATCH_SIZE, - cats=CATEGORICAL_COLUMNS, - conts=CONTINUOUS_COLUMNS, - labels=LABEL_COLUMNS, - global_rank=0, - global_size=1, - drop_last=True, - shuffle=True, - seed_fn=lambda: 1, - ) - print(f"TorchAsyncItr: {time.time() - start}, len: {len(train_data_idrs)}") - - # import threading - # event = threading.Event() - # print(f"stop: {event.is_set()}") - - start = time.time() - train_dataloader = DataLoader(train_data_idrs, - collate_fn=KJTTransform(train_data_idrs).transform, - batch_size=None, - pin_memory=False, - num_workers=0) - print(f"dataloader: {time.time() - start}, len: {len(train_dataloader)}") - - data_iter = iter(train_dataloader) - for idx in tqdm(itertools.count(), - desc=f"Rank {dist.get_rank()}", - ncols=0, - total=len(train_dataloader) if hasattr(train_dataloader, "__len__") else None): - batch = next(data_iter) - # print(f"rank: {dist.get_rank()}, ix: {idx}, dense: {batch.dense_features}") - # if idx == 5: - # break - print(get_mem_info()) - torch.cuda.synchronize() - # id_freq_map = get_id_freq_map("/data/criteo_preproc") - # print(id_freq_map.shape, id_freq_map.max(), id_freq_map.min()) - - -if __name__ == "__main__": - os.environ["LIBCUDF_CUFILE_POLICY"] = "ALWAYS" - client = setup_dask("dask_dir") - print(client.dashboard_link) - # torchrun --nnode=1 --nproc_per_node=2 --no_python bash dist_wrapper.sh python - run() 
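
# Editorial aside (not part of the patch): the feature counters introduced in this
# patch (GlobalFeatureCounter and PetastormCounter) both reduce to the same
# shift-and-bincount idea -- each categorical column is offset by the cumulative hash
# sizes of the preceding columns so every id lands in a single global id space, and
# frequencies are accumulated with np.bincount. The standalone sketch below uses toy
# hash sizes and made-up ids purely for illustration; the real sizes come from
# NUM_EMBEDDINGS_PER_FEATURE and the result is what get_id_freq_map returns for the
# frequency-aware embedding cache.
import numpy as np

# Toy hash sizes for three sparse features (illustrative only).
hash_sizes = np.array([3, 4, 2])
offsets = np.array([0, *np.cumsum(hash_sizes)[:-1]]).reshape(1, -1)   # [[0, 3, 7]]

# A fake chunk of already-hashed ids, shape (num_samples, num_features),
# with each column bounded by its own hash size.
sparse = np.array([[0, 1, 1],
                   [2, 3, 0],
                   [0, 1, 1]])

# Shift every column into the global id space and count occurrences.
global_ids = (sparse + offsets).reshape(-1)
id_freq_map = np.bincount(global_ids, minlength=hash_sizes.sum())
print(id_freq_map)   # length 9 == 3 + 4 + 2, one count per embedding row
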
diff --git a/test_petastorm.py b/test_petastorm.py new file mode 100644 index 0000000..188ddfb --- /dev/null +++ b/test_petastorm.py @@ -0,0 +1,38 @@ +import os +from tqdm import tqdm +import itertools +import random + +import torch +import torch.distributed as dist +from recsys.datasets.criteo import PetastormDataReader, get_id_freq_map +from recsys.utils import get_mem_info + +import pyarrow.parquet as pq + + +def iterate_data(): + dist.init_process_group(backend='nccl') + + dataset_dir = "/data/scratch/criteo_terabyte/criteo_preproc/train/" + fname = "part_{}.parquet" + train_paths = [dataset_dir + fname.format(i) for i in range(64)] + + random.seed(0) + dataloader = PetastormDataReader(train_paths, batch_size=16384, rank=None, world_size=None, shuffle_batches=False) + + data_iter = iter(dataloader) + for idx in tqdm(itertools.count(), ncols=0, total=len(dataloader) if hasattr(dataloader, "__len__") else None): + batch = next(data_iter) + # print(f"rank: {dist.get_rank()}, it {idx}, dense: {batch.dense_features[:5, :5]}") + # if idx == 2: + # break + + # dataset_dir = "/data/scratch/criteo_terabyte/criteo_preproc/" + # id_freq_map = get_id_freq_map(dataset_dir) + # + # print(f"rank: {dist.get_rank()}, first 10: {id_freq_map[:10]}") + + +if __name__ == "__main__": + iterate_data() From 12b54231766d49b2c3bf92f7955f25af8bf6bdcc Mon Sep 17 00:00:00 2001 From: zxgx Date: Tue, 23 Aug 2022 17:44:44 +0800 Subject: [PATCH 15/19] fix bugs --- recsys/datasets/criteo.py | 3 +-- recsys/dlrm_main.py | 2 +- recsys/models/dlrm.py | 7 ------- run.sh | 4 ++-- 4 files changed, 4 insertions(+), 12 deletions(-) diff --git a/recsys/datasets/criteo.py b/recsys/datasets/criteo.py index 93f6e66..ff72f69 100644 --- a/recsys/datasets/criteo.py +++ b/recsys/datasets/criteo.py @@ -299,7 +299,6 @@ def append_to_buffer(_dense: np.ndarray, _sparse: np.ndarray, _labels: np.ndarra while start_idx < dense.shape[0]: buffer_size = 0 if buffer is None else buffer[0].shape[0] if buffer_size == self.batch_size: - _num_batch += 1 yield self._batch_ndarray(*buffer) buffer = None else: @@ -317,7 +316,7 @@ def _batch_ndarray(self, dense: np.ndarray, sparse: np.ndarray, labels: np.ndarr # Shuffle all 3 in unison shuffler = np.random.permutation(len(dense)) dense = dense[shuffler] - sparse = sparse[shuffler] + sparse = sparse[:, shuffler] labels = labels[shuffler] return Batch( diff --git a/recsys/dlrm_main.py b/recsys/dlrm_main.py index 3a0dd69..fed5df4 100644 --- a/recsys/dlrm_main.py +++ b/recsys/dlrm_main.py @@ -367,7 +367,7 @@ def main(): id_freq_map = None if args.use_freq: - id_freq_map = data_module.get_id_freq_map(args.dataset_dir).cuda(non_blocking=True) + id_freq_map = data_module.get_id_freq_map(args.dataset_dir) device = torch.device('cuda', torch.cuda.current_device()) sparse_device = torch.device('cpu') if args.use_cpu else device diff --git a/recsys/models/dlrm.py b/recsys/models/dlrm.py index f41c7e7..f925f25 100644 --- a/recsys/models/dlrm.py +++ b/recsys/models/dlrm.py @@ -1,10 +1,3 @@ -#!/usr/bin/env python3 -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
-# # The infrastructures of DLRM are mainly inspired by TorchRec: # https://github.com/pytorch/torchrec/blob/main/torchrec/models/dlrm.py import os diff --git a/run.sh b/run.sh index 7975210..901adda 100644 --- a/run.sh +++ b/run.sh @@ -20,7 +20,7 @@ # --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap # criteo terabyte -torchx run -s local_cwd -cfg log_dir=tmp dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ +torchx run -s local_cwd -cfg log_dir=tmp/w2_0_5 dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ --dataset_dir /data/scratch/criteo_terabyte/criteo_preproc/ \ --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap + --profile_dir "tensorboard_log/w2_0_5" --buffer_size 0 --use_overlap From 82e066800cbd884b8bce80098b8d9ec0b03dc0bf Mon Sep 17 00:00:00 2001 From: zxgx Date: Wed, 24 Aug 2022 14:18:11 +0800 Subject: [PATCH 16/19] minor changes --- baselines/dlrm_main.py | 10 +++++----- recsys/datasets/criteo.py | 2 +- run.sh | 14 +++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/baselines/dlrm_main.py b/baselines/dlrm_main.py index 33438fe..0588798 100644 --- a/baselines/dlrm_main.py +++ b/baselines/dlrm_main.py @@ -488,7 +488,7 @@ def main(argv: List[str]) -> None: if args.memory_fraction is not None: torch.cuda.set_per_process_memory_fraction(args.memory_fraction) - print(f"set memory to {int(args.memory_fraction * 80)} GB") + print(f"set memory to {int(args.memory_fraction * 10)} GB") if args.num_embeddings_per_feature is not None: args.num_embeddings_per_feature = list(map(int, args.num_embeddings_per_feature.split(","))) args.num_embeddings = None @@ -542,21 +542,21 @@ def main(argv: List[str]) -> None: print(count_parameters(train_model, "DLRM")) # Torchrec Planner - hbm_cap = int(args.memory_fraction * 80) if args.memory_fraction else 70 + hbm_cap = int(args.memory_fraction * 10) if args.memory_fraction else 10 env = ShardingEnv.from_process_group(dist.GroupMember.WORLD) topology = Topology( world_size=env.world_size, compute_device="cuda", hbm_cap=hbm_cap * 1024**3, # GPU mem - ddr_cap=300 * 1024 * 3, # CPU mem - intra_host_bw=1000 * 1024**3 / 1000, + ddr_cap=100 * 1024**3, # CPU mem + # intra_host_bw=1000 * 1024**3 / 1000, ) # Device to Device bandwidth # inter_host_bw=CROSS_NODE_BANDWIDTH, # Not used yet # batch_size=args.batch_size) # constraints = { # f"t_{feature_name}": # ParameterConstraints(compute_kernels=[EmbeddingComputeKernel.BATCHED_FUSED_UVM.value]) - # for num_embeddings, feature_name in zip(args.num_embeddings_per_feature, DEFAULT_CAT_NAMES) + # for num_embeddings, feature_name in zip(args.num_embeddings_per_feature, data_module.DEFAULT_CAT_NAMES) # } planner = EmbeddingShardingPlanner(topology=topology, # constraints=constraints, diff --git a/recsys/datasets/criteo.py b/recsys/datasets/criteo.py index ff72f69..cb51cfa 100644 --- a/recsys/datasets/criteo.py +++ b/recsys/datasets/criteo.py @@ -238,7 +238,7 @@ def __init__(self, shuffle_batches=False, hashes=None, seed=1024, - drop_last=False): + drop_last=True): self.dataset = ParquetDataset(paths, use_legacy_dataset=False) self.batch_size = batch_size self.rank = rank diff --git a/run.sh b/run.sh index 901adda..ecd3231 100644 --- a/run.sh +++ b/run.sh @@ -2,14 +2,14 @@ # For TorchRec baseline #torchx run -s local_cwd -cfg log_dir=tmp dist.ddp -j 1x2 --script baselines/dlrm_main.py -- \ -# --kaggle --in_memory_binary_criteo_path criteo_kaggle_data --embedding_dim 128 
--pin_memory \ +# --kaggle --in_memory_binary_criteo_path /data/scratch/criteo_kaggle_data --embedding_dim 128 --pin_memory \ # --over_arch_layer_sizes "1024,1024,512,256,1" --dense_arch_layer_sizes "512,256,128" --shuffle_batches \ -# --learning_rate 1. --batch_size 8192 +# --learning_rate 1. --batch_size 8192 --memory_fraction 0.8 # For Colossalai enabled recsys # criteo kaggle #torchx run -s local_cwd -cfg log_dir=tmp dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ -# --dataset_dir criteo_kaggle_data --pin_memory --shuffle_batches \ +# --dataset_dir /data/scratch/criteo_kaggle_data --pin_memory --shuffle_batches \ # --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ # --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap @@ -20,7 +20,7 @@ # --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap # criteo terabyte -torchx run -s local_cwd -cfg log_dir=tmp/w2_0_5 dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ - --dataset_dir /data/scratch/criteo_terabyte/criteo_preproc/ \ - --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/w2_0_5" --buffer_size 0 --use_overlap +#torchx run -s local_cwd -cfg log_dir=tmp/w2_0_5 dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ +# --dataset_dir /data/scratch/criteo_terabyte/criteo_preproc/ \ +# --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ +# --profile_dir "tensorboard_log/w2_0_5" --buffer_size 0 --use_overlap From 00eb0e45b72fcb32e628c7eb4f5a9f6681ae0f45 Mon Sep 17 00:00:00 2001 From: zxgx Date: Thu, 25 Aug 2022 18:40:20 +0800 Subject: [PATCH 17/19] criteo 1TB code for torchrec baseline --- baselines/data/dlrm_dataloader.py | 161 +++++++++++++++++++++++++++++- baselines/dlrm_main.py | 22 ++-- run.sh | 2 +- 3 files changed, 173 insertions(+), 12 deletions(-) diff --git a/baselines/data/dlrm_dataloader.py b/baselines/data/dlrm_dataloader.py index 0621695..85799a1 100644 --- a/baselines/data/dlrm_dataloader.py +++ b/baselines/data/dlrm_dataloader.py @@ -7,23 +7,34 @@ import argparse import os -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Dict +import glob +import numpy as np +import torch from torch import distributed as dist -from torch.utils.data import DataLoader +from torch.utils.data import DataLoader, IterableDataset from torchrec.datasets.criteo import ( CAT_FEATURE_COUNT, DEFAULT_CAT_NAMES, DEFAULT_INT_NAMES, + DEFAULT_LABEL_NAME, DAYS, InMemoryBinaryCriteoIterDataPipe, ) from torchrec.datasets.random import RandomRecDataset +from torchrec.sparse.jagged_tensor import KeyedJaggedTensor +from torchrec.datasets.utils import Batch +from petastorm import make_batch_reader +from pyarrow.parquet import ParquetDataset from .avazu import AvazuIterDataPipe STAGES = ["train", "val", "test"] KAGGLE_NUM_EMBEDDINGS_PER_FEATURE = '1460,583,10131227,2202608,305,24,12517,633,3,93145,5683,8351593,3194,27,14992,' \ '5461306,10,5652,2173,4,7046547,18,15,286181,105,142572' # For criteo kaggle +KAGGLE_TOTAL_TRAINING_SAMPLES = 39_291_954 # 0-6 days for criteo kaggle, 45,840,617 samples in total +TERABYTE_NUM_EMBEDDINGS_PER_FEATURE = "45833188,36746,17245,7413,20243,3,7114,1441,62,29275261,1572176,345138,10," \ + "2209,11267,128,4,974,14,48937457,11316796,40094537,452104,12606,104,35" def _get_random_dataloader(args: argparse.Namespace,) -> DataLoader: @@ -126,6 +137,145 @@ def get_avazu_data_loader(args, stage): return dataloader +class 
PetastormDataReader(IterableDataset): + """ + This is a compromise solution for the criteo terabyte dataset, + please see the solution 3 in: https://github.com/uber/petastorm/issues/508 + + Basically, the dataloader in each rank extracts random samples from the whole dataset in the training stage + in which the batches in each rank are not guaranteed to be unique. + In the validation stage, all the samples are evaluated in each rank, + so that each rank contains the correct result + """ + + def __init__(self, + paths, + batch_size, + rank=None, + world_size=None, + shuffle_batches=False, + hashes=None, + seed=1024, + drop_last=True): + self.dataset = ParquetDataset(paths, use_legacy_dataset=False) + self.batch_size = batch_size + self.rank = rank + self.world_size = world_size + self.shuffle_batches = shuffle_batches + self.hashes = np.array(hashes).reshape((1, CAT_FEATURE_COUNT)) if hashes is not None else None + + self._num_ids_in_batch: int = CAT_FEATURE_COUNT * batch_size + self.keys: List[str] = DEFAULT_CAT_NAMES + self.lengths: torch.Tensor = torch.ones((self._num_ids_in_batch,), dtype=torch.int32) + self.offsets: torch.Tensor = torch.arange(0, self._num_ids_in_batch + 1, dtype=torch.int32) + self.stride = batch_size + self.length_per_key: List[int] = CAT_FEATURE_COUNT * [batch_size] + self.offset_per_key: List[int] = [batch_size * i for i in range(CAT_FEATURE_COUNT + 1)] + self.index_per_key: Dict[str, int] = {key: i for (i, key) in enumerate(self.keys)} + self.seed = seed + + self.drop_last = drop_last + if drop_last: + self.num_batches = sum([fragment.metadata.num_rows for fragment in self.dataset.fragments + ]) // self.batch_size + else: + self.num_batches = (sum([fragment.metadata.num_rows + for fragment in self.dataset.fragments]) + self.batch_size - 1) // self.batch_size + if self.world_size is not None: + self.num_batches = self.num_batches // world_size + + def __iter__(self): + buffer: Optional[List[np.ndarray]] = None + + def append_to_buffer(_dense: np.ndarray, _sparse: np.ndarray, _labels: np.ndarray) -> None: + nonlocal buffer + if buffer is None: + buffer = [_dense, _sparse, _labels] + else: + buffer[0] = np.concatenate([buffer[0], _dense], axis=0) + buffer[1] = np.concatenate([buffer[1], _sparse], axis=1) + buffer[2] = np.concatenate([buffer[2], _labels], axis=0) + + with make_batch_reader( + list(map(lambda x: "file://" + x, self.dataset.files)), + num_epochs=1, + workers_count=1, # for reproducibility + ) as reader: + # note that `batch` here is just a bunch of samples read by petastorm instead of `batch` consumed by models + for _ in range(self.num_batches): + batch = next(reader) + labels = getattr(batch, DEFAULT_LABEL_NAME) + sparse = np.concatenate([getattr(batch, col_name).reshape(1, -1) for col_name in DEFAULT_CAT_NAMES], + axis=0) + dense = np.concatenate([getattr(batch, col_name).reshape(-1, 1) for col_name in DEFAULT_INT_NAMES], + axis=1) + start_idx = 0 + while start_idx < dense.shape[0]: + buffer_size = 0 if buffer is None else buffer[0].shape[0] + if buffer_size == self.batch_size: + yield self._batch_ndarray(*buffer) + buffer = None + else: + rows_to_get = min(self.batch_size - buffer_size, dense.shape[0] - start_idx) + label_chunk = labels[start_idx:start_idx + rows_to_get] + sparse_chunk = sparse[:, start_idx:start_idx + rows_to_get] + dense_chunk = dense[start_idx:start_idx + rows_to_get, :] + append_to_buffer(dense_chunk, sparse_chunk, label_chunk) + start_idx += rows_to_get + if buffer is not None and not self.drop_last: + yield 
self._batch_ndarray(*buffer) + + def _batch_ndarray(self, dense: np.ndarray, sparse: np.ndarray, labels: np.ndarray): + if self.shuffle_batches: + # Shuffle all 3 in unison + shuffler = np.random.permutation(len(dense)) + dense = dense[shuffler] + sparse = sparse[:, shuffler] + labels = labels[shuffler] + + return Batch( + dense_features=torch.from_numpy(dense), + sparse_features=KeyedJaggedTensor( + keys=self.keys, + values=torch.from_numpy(sparse.reshape(-1)), + lengths=self.lengths, + offsets=self.offsets, + stride=self.stride, + length_per_key=self.length_per_key, + offset_per_key=self.offset_per_key, + index_per_key=self.index_per_key, + ), + labels=torch.from_numpy(labels.reshape(-1)), + ) + + def __len__(self): + return self.num_batches + + +def _get_petastorm_dataloader(args, stage): + if stage == "train": + data_split = "train" + elif stage == "val": + data_split = "validation" + else: + data_split = "test" + + file_num = len(glob.glob(os.path.join(args.in_memory_binary_criteo_path, data_split, "*.parquet"))) + files = [os.path.join(args.in_memory_binary_criteo_path, data_split, f"part_{i}.parquet") for i in range(file_num)] + + dataloader = DataLoader(PetastormDataReader(files, + args.batch_size, + rank=dist.get_rank() if stage == "train" else None, + world_size=dist.get_world_size() if stage == "train" else None, + hashes=args.num_embeddings_per_feature), + batch_size=None, + pin_memory=False, + collate_fn=lambda x: x, + num_workers=0) + + return dataloader + + def get_dataloader(args: argparse.Namespace, backend: str, stage: str) -> DataLoader: """ Gets desired dataloader from dlrm_main command line options. Currently, this @@ -151,13 +301,16 @@ def get_dataloader(args: argparse.Namespace, backend: str, stage: str) -> DataLo if (not hasattr(args, "in_memory_binary_criteo_path") or args.in_memory_binary_criteo_path is None): return _get_random_dataloader(args) elif "criteo" in args.in_memory_binary_criteo_path: - return _get_in_memory_dataloader(args, stage) + if args.kaggle: + return _get_in_memory_dataloader(args, stage) + else: + return _get_petastorm_dataloader(args, stage) elif "avazu" in args.in_memory_binary_criteo_path: return get_avazu_data_loader(args, stage) # ============== Customize for Persia =================== -import numpy as np + try: from persia.embedding.data import IDTypeFeatureWithSingleID, NonIDTypeFeature, Label except ImportError: diff --git a/baselines/dlrm_main.py b/baselines/dlrm_main.py index 0588798..c2b83d3 100644 --- a/baselines/dlrm_main.py +++ b/baselines/dlrm_main.py @@ -38,7 +38,8 @@ try: # pyre-ignore[21] # @manual=//pytorch/benchmark/torchrec_dlrm/data:dlrm_dataloader - from data.dlrm_dataloader import get_dataloader, STAGES, KAGGLE_NUM_EMBEDDINGS_PER_FEATURE + from data.dlrm_dataloader import get_dataloader, STAGES, KAGGLE_NUM_EMBEDDINGS_PER_FEATURE, \ + TERABYTE_NUM_EMBEDDINGS_PER_FEATURE, KAGGLE_TOTAL_TRAINING_SAMPLES from data import avazu # pyre-ignore[21] # @manual=//pytorch/benchmark/torchrec_dlrm/modules:dlrm_train @@ -49,7 +50,8 @@ # internal import try: from .data.dlrm_dataloader import ( # noqa F811 - get_dataloader, STAGES, KAGGLE_NUM_EMBEDDINGS_PER_FEATURE) + get_dataloader, STAGES, KAGGLE_NUM_EMBEDDINGS_PER_FEATURE, TERABYTE_NUM_EMBEDDINGS_PER_FEATURE, # noqa F811 + KAGGLE_TOTAL_TRAINING_SAMPLES) # noqa F811 from .data import avazu from .modules.dlrm_train import DLRMTrain # noqa F811 except ImportError: @@ -59,13 +61,16 @@ from recsys.utils import get_mem_info TRAIN_PIPELINE_STAGES = 3 # Number of stages in 
TrainPipelineSparseDist. +TOTAL_TRAINING_SAMPLES = None def parse_args(argv: List[str]) -> argparse.Namespace: parser = argparse.ArgumentParser(description="torchrec dlrm example trainer") + parser.add_argument("--kaggle", action='store_true') parser.add_argument("--profile_dir", default="tensorboard_log/torchrec", type=str) parser.add_argument("--memory_fraction", default=None, type=float) + parser.add_argument("--epochs", type=int, default=1, help="number of epochs to train") parser.add_argument("--batch_size", type=int, default=32, help="batch size to use for training") parser.add_argument( @@ -462,10 +467,11 @@ def main(argv: List[str]) -> None: global TOTAL_TRAINING_SAMPLES if 'criteo' in args.in_memory_binary_criteo_path: if args.kaggle: - TOTAL_TRAINING_SAMPLES = 39291954 # 0-6 for criteo kaggle + TOTAL_TRAINING_SAMPLES = KAGGLE_TOTAL_TRAINING_SAMPLES setattr(args, 'num_embeddings_per_feature', KAGGLE_NUM_EMBEDDINGS_PER_FEATURE) else: - raise NotImplementedError("The criteo 1TB dataset is building") + TOTAL_TRAINING_SAMPLES = criteo.TOTAL_TRAINING_SAMPLES + setattr(args, 'num_embeddings_per_feature', TERABYTE_NUM_EMBEDDINGS_PER_FEATURE) data_module = criteo elif 'avazu' in args.in_memory_binary_criteo_path: TOTAL_TRAINING_SAMPLES = avazu.TOTAL_TRAINING_SAMPLES @@ -500,8 +506,9 @@ def main(argv: List[str]) -> None: if dist.get_rank() == 0: print(args) - # print(f"training batches: {len(train_dataloader)}, val batches: {len(val_dataloader)}, " - # f"test batches: {len(test_dataloader)}") + if getattr(train_dataloader, "__len__", None): + print(f"training batches: {len(train_dataloader)}, val batches: {len(val_dataloader)}, " + f"test batches: {len(test_dataloader)}") # Sets default limits for random dataloader iterations when left unspecified. if args.in_memory_binary_criteo_path is None: for stage in STAGES: @@ -555,7 +562,8 @@ def main(argv: List[str]) -> None: # batch_size=args.batch_size) # constraints = { # f"t_{feature_name}": - # ParameterConstraints(compute_kernels=[EmbeddingComputeKernel.BATCHED_FUSED_UVM.value]) + # ParameterConstraints(compute_kernels=[EmbeddingComputeKernel.BATCHED_FUSED_UVM_CACHING.value], + # caching_ratio=0.01 if num_embeddings > 100 else None) # for num_embeddings, feature_name in zip(args.num_embeddings_per_feature, data_module.DEFAULT_CAT_NAMES) # } planner = EmbeddingShardingPlanner(topology=topology, diff --git a/run.sh b/run.sh index ecd3231..b293752 100644 --- a/run.sh +++ b/run.sh @@ -2,7 +2,7 @@ # For TorchRec baseline #torchx run -s local_cwd -cfg log_dir=tmp dist.ddp -j 1x2 --script baselines/dlrm_main.py -- \ -# --kaggle --in_memory_binary_criteo_path /data/scratch/criteo_kaggle_data --embedding_dim 128 --pin_memory \ +# --in_memory_binary_criteo_path /data/scratch/criteo_terabyte/criteo_preproc/ --embedding_dim 128 --pin_memory \ # --over_arch_layer_sizes "1024,1024,512,256,1" --dense_arch_layer_sizes "512,256,128" --shuffle_batches \ # --learning_rate 1. 
--batch_size 8192 --memory_fraction 0.8 From 963b6538247d9034f582c0aaf1239f92d1ad1ef7 Mon Sep 17 00:00:00 2001 From: Geng Zhang <34452939+zxgx@users.noreply.github.com> Date: Sun, 28 Aug 2022 17:47:04 +0800 Subject: [PATCH 18/19] experiment code (#114) --- Dockerfile | 19 ++++++++ Dockerfile.bak | 22 ++++++++++ avazu.sh | 68 +++++++++++++++++++++++++++++ baselines/dlrm_main.py | 12 ++++-- kaggle.sh | 68 +++++++++++++++++++++++++++++ recsys/datasets/feature_counter.py | 15 +++++-- run.sh | 31 +++++--------- terabyte.sh | 69 ++++++++++++++++++++++++++++++ test_petastorm.py | 31 ++++++++------ torchrec_avazu.sh | 17 ++++++++ torchrec_kaggle.sh | 17 ++++++++ torchrec_terabyte.sh | 17 ++++++++ 12 files changed, 345 insertions(+), 41 deletions(-) create mode 100644 Dockerfile create mode 100644 Dockerfile.bak create mode 100644 avazu.sh create mode 100644 kaggle.sh create mode 100644 terabyte.sh create mode 100644 torchrec_avazu.sh create mode 100644 torchrec_kaggle.sh create mode 100644 torchrec_terabyte.sh diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..fb2efa9 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,19 @@ +FROM hpcaitech/pytorch-cuda:1.11.0-11.3.0 + +#install fbgemm_gpu +RUN python3 -m pip install --no-cache-dir fbgemm_gpu==0.1.1 + +#install torchrec +RUN wget https://download.pytorch.org/whl/torchrec-0.1.1-py39-none-any.whl && \ + python3 -m pip install --no-cache-dir torchrec-0.1.1-py39-none-any.whl && \ + rm torchrec-0.1.1-py39-none-any.whl + +# install colossalai +RUN git clone https://github.com/hpcaitech/ColossalAI.git && \ + cd ColossalAI/ && \ + python3 -m pip install --no-cache-dir -r requirements/requirements.txt && \ + python3 -m pip install --no-cache-dir . && \ + cd .. && \ + yes | rm -r ColossalAI/ + +RUN pip install --no-cache-dir petastorm[torch] diff --git a/Dockerfile.bak b/Dockerfile.bak new file mode 100644 index 0000000..57019ad --- /dev/null +++ b/Dockerfile.bak @@ -0,0 +1,22 @@ +# Domestic cloud servers often have network issues with pip, +# so we need to pip install from tsinghua mirror + +FROM hpcaitech/pytorch-cuda:1.11.0-11.3.0 + +#install fbgemm_gpu +RUN python3 -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --no-cache-dir fbgemm_gpu==0.1.1 + +#install torchrec +RUN wget https://download.pytorch.org/whl/torchrec-0.1.1-py39-none-any.whl && \ + python3 -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --no-cache-dir torchrec-0.1.1-py39-none-any.whl && \ + rm torchrec-0.1.1-py39-none-any.whl + +# install colossalai +RUN git clone https://github.com/hpcaitech/ColossalAI.git && \ + cd ColossalAI/ && \ + python3 -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --no-cache-dir -r requirements/requirements.txt && \ + python3 -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --no-cache-dir . && \ + cd .. 
&& \ + yes | rm -r ColossalAI/ + +RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --no-cache-dir petastorm[torch] torch_tb_profiler diff --git a/avazu.sh b/avazu.sh new file mode 100644 index 0000000..894a8ee --- /dev/null +++ b/avazu.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# For Colossalai enabled recsys +# avazu +torchx run -s local_cwd -cfg log_dir=log/avazu/w1_p1_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/avazu_sample --pin_memory --shuffle_batches \ + --learning_rate 5e-2 --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/avazu/w1_p1_16k" --buffer_size 0 --use_overlap --cache_sets 94458 + +torchx run -s local_cwd -cfg log_dir=log/avazu/w2_p1_16k dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/avazu_sample --pin_memory --shuffle_batches \ + --learning_rate 5e-2 --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/avazu/w2_p1_16k" --buffer_size 0 --use_overlap --cache_sets 94458 + +torchx run -s local_cwd -cfg log_dir=log/avazu/w4_p1_16k dist.ddp -j 1x4 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/avazu_sample --pin_memory --shuffle_batches \ + --learning_rate 5e-2 --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/avazu/w4_p1_16k" --buffer_size 0 --use_overlap --cache_sets 94458 + +torchx run -s local_cwd -cfg log_dir=log/avazu/w1_p1_32k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/avazu_sample --pin_memory --shuffle_batches \ + --learning_rate 5e-2 --batch_size 32768 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/avazu/w1_p1_32k" --buffer_size 0 --use_overlap --cache_sets 94458 + +torchx run -s local_cwd -cfg log_dir=log/avazu/w1_p1_8k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/avazu_sample --pin_memory --shuffle_batches \ + --learning_rate 5e-2 --batch_size 8192 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/avazu/w1_p1_8k" --buffer_size 0 --use_overlap --cache_sets 94458 + +torchx run -s local_cwd -cfg log_dir=log/avazu/w1_p1_4k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/avazu_sample --pin_memory --shuffle_batches \ + --learning_rate 5e-2 --batch_size 4096 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/avazu/w1_p1_4k" --buffer_size 0 --use_overlap --cache_sets 94458 + +torchx run -s local_cwd -cfg log_dir=log/avazu/w1_p1_2k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/avazu_sample --pin_memory --shuffle_batches \ + --learning_rate 5e-2 --batch_size 2048 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/avazu/w1_p1_2k" --buffer_size 0 --use_overlap --cache_sets 94458 + +torchx run -s local_cwd -cfg log_dir=log/avazu/w1_p1_1k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/avazu_sample --pin_memory --shuffle_batches \ + --learning_rate 5e-2 --batch_size 1024 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/avazu/w1_p1_1k" --buffer_size 0 --use_overlap --cache_sets 94458 + +torchx run -s local_cwd -cfg log_dir=log/avazu/w1_p10_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/avazu_sample --pin_memory --shuffle_batches \ + --learning_rate 5e-2 --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir 
"tensorboard_log/avazu/w1_p10_16k" --buffer_size 0 --use_overlap --cache_sets 944582 + +torchx run -s local_cwd -cfg log_dir=log/avazu/w1_p5_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/avazu_sample --pin_memory --shuffle_batches \ + --learning_rate 5e-2 --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/avazu/w1_p5_16k" --buffer_size 0 --use_overlap --cache_sets 472291 + +torchx run -s local_cwd -cfg log_dir=log/avazu/w1_p2_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/avazu_sample --pin_memory --shuffle_batches \ + --learning_rate 5e-2 --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/avazu/w1_p2_16k" --buffer_size 0 --use_overlap --cache_sets 188916 + +torchx run -s local_cwd -cfg log_dir=log/avazu/w1_p0_5_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/avazu_sample --pin_memory --shuffle_batches \ + --learning_rate 5e-2 --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/avazu/w1_p0_5_16k" --buffer_size 0 --use_overlap --cache_sets 47229 + +torchx run -s local_cwd -cfg log_dir=log/avazu/w1_p0_1_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/avazu_sample --pin_memory --shuffle_batches \ + --learning_rate 5e-2 --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/avazu/w1_p0_1_16k" --buffer_size 0 --use_overlap --cache_sets 9445 diff --git a/baselines/dlrm_main.py b/baselines/dlrm_main.py index c2b83d3..9c044e7 100644 --- a/baselines/dlrm_main.py +++ b/baselines/dlrm_main.py @@ -268,10 +268,13 @@ def _evaluate( try: _loss, logits, labels = train_pipeline.progress(combined_iterator) preds = torch.sigmoid(logits) + labels = labels.int() auroc(preds, labels) accuracy(preds, labels) except StopIteration: break + except RuntimeError: # petastorm dataloader StopIteration will raise RuntimeError in train_pipeline + break auroc_result = auroc.compute().item() accuracy_result = accuracy.compute().item() if dist.get_rank() == 0: @@ -377,6 +380,9 @@ def _train( except StopIteration: print(f"{get_mem_info('Training: ')}") break + except RuntimeError: # petastorm dataloader StopIteration will raise RuntimeError in train_pipeline + print(f"{get_mem_info('Training: ')}") + break def train_val_test( @@ -494,7 +500,7 @@ def main(argv: List[str]) -> None: if args.memory_fraction is not None: torch.cuda.set_per_process_memory_fraction(args.memory_fraction) - print(f"set memory to {int(args.memory_fraction * 10)} GB") + print(f"set memory to {int(args.memory_fraction * 80)} GB") if args.num_embeddings_per_feature is not None: args.num_embeddings_per_feature = list(map(int, args.num_embeddings_per_feature.split(","))) args.num_embeddings = None @@ -549,13 +555,13 @@ def main(argv: List[str]) -> None: print(count_parameters(train_model, "DLRM")) # Torchrec Planner - hbm_cap = int(args.memory_fraction * 10) if args.memory_fraction else 10 + hbm_cap = int(args.memory_fraction * 80) if args.memory_fraction else 80 env = ShardingEnv.from_process_group(dist.GroupMember.WORLD) topology = Topology( world_size=env.world_size, compute_device="cuda", hbm_cap=hbm_cap * 1024**3, # GPU mem - ddr_cap=100 * 1024**3, # CPU mem + ddr_cap=1000 * 1024**3, # CPU mem # intra_host_bw=1000 * 1024**3 / 1000, ) # Device to Device bandwidth # inter_host_bw=CROSS_NODE_BANDWIDTH, # Not used yet diff --git a/kaggle.sh 
b/kaggle.sh new file mode 100644 index 0000000..a392040 --- /dev/null +++ b/kaggle.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# For Colossalai enabled recsys +# criteo kaggle +torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ + --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/kaggle/w1_p1_16k" --buffer_size 0 --use_overlap --cache_sets 337625 + +torchx run -s local_cwd -cfg log_dir=log/kaggle/w2_p1_16k dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ + --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/kaggle/w2_p1_16k" --buffer_size 0 --use_overlap --cache_sets 337625 + +torchx run -s local_cwd -cfg log_dir=log/kaggle/w4_p1_16k dist.ddp -j 1x4 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ + --learning_rate 5e-1 --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/kaggle/w4_p1_16k" --buffer_size 0 --use_overlap --cache_sets 337625 + +torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_32k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ + --learning_rate 1. --batch_size 32768 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/kaggle/w1_p1_32k" --buffer_size 0 --use_overlap --cache_sets 337625 + +torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_8k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ + --learning_rate 1. --batch_size 8192 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/kaggle/w1_p1_8k" --buffer_size 0 --use_overlap --cache_sets 337625 + +torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_4k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ + --learning_rate 1. --batch_size 4096 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/kaggle/w1_p1_4k" --buffer_size 0 --use_overlap --cache_sets 337625 + +torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_2k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ + --learning_rate 1. --batch_size 2048 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/kaggle/w1_p1_2k" --buffer_size 0 --use_overlap --cache_sets 337625 + +torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_1k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ + --learning_rate 1. --batch_size 1024 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/kaggle/w1_p1_1k" --buffer_size 0 --use_overlap --cache_sets 337625 + +torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p10_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ + --learning_rate 1. 
--batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/kaggle/w1_p10_16k" --buffer_size 0 --use_overlap --cache_sets 3376257 + +torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p5_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ + --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/kaggle/w1_p5_16k" --buffer_size 0 --use_overlap --cache_sets 1688128 + +torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p2_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ + --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/kaggle/w1_p2_16k" --buffer_size 0 --use_overlap --cache_sets 675251 + +torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p0_5_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ + --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/kaggle/w1_p0_5_16k" --buffer_size 0 --use_overlap --cache_sets 168812 + +torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p0_1_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ + --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/kaggle/w1_p0_1_16k" --buffer_size 0 --use_overlap --cache_sets 33762 diff --git a/recsys/datasets/feature_counter.py b/recsys/datasets/feature_counter.py index adc1952..d76cece 100644 --- a/recsys/datasets/feature_counter.py +++ b/recsys/datasets/feature_counter.py @@ -5,6 +5,7 @@ import numpy as np from .criteo import DEFAULT_CAT_NAMES from petastorm import make_batch_reader +from pyarrow.parquet import ParquetDataset class GlobalFeatureCounter: @@ -40,13 +41,19 @@ def __init__(self, datafiles, hash_sizes, subsample_fraction=0.2, seed=1024): def compute(self): _id_freq_map = np.zeros(self.total_features, dtype=np.int64) + + files = self.datafiles random.seed(self.seed) - files = list(map(lambda x: "file://" + x, self.datafiles)) random.shuffle(files) if 0. 
< self.subsample_fraction < 1.: - files = files[:int(np.ceil(len(files)) * self.subsample_fraction)] - with make_batch_reader(files, num_epochs=1) as reader: - for batch in tqdm(reader, ncols=0, desc="Collecting id-freq map"): + files = files[:int(np.ceil(len(files) * self.subsample_fraction))] + + dataset = ParquetDataset(files, use_legacy_dataset=False) + with make_batch_reader(list(map(lambda x: "file://" + x, dataset.files)), num_epochs=1) as reader: + for batch in tqdm(reader, + ncols=0, + desc="Collecting id-freq map", + total=sum([fragment.metadata.num_row_groups for fragment in dataset.fragments])): sparse = np.concatenate([getattr(batch, col_name).reshape(-1, 1) for col_name in DEFAULT_CAT_NAMES], axis=1) sparse = (sparse + self.offsets).reshape(-1) diff --git a/run.sh b/run.sh index b293752..b6f78c3 100644 --- a/run.sh +++ b/run.sh @@ -1,26 +1,15 @@ #!/bin/bash -# For TorchRec baseline -#torchx run -s local_cwd -cfg log_dir=tmp dist.ddp -j 1x2 --script baselines/dlrm_main.py -- \ -# --in_memory_binary_criteo_path /data/scratch/criteo_terabyte/criteo_preproc/ --embedding_dim 128 --pin_memory \ -# --over_arch_layer_sizes "1024,1024,512,256,1" --dense_arch_layer_sizes "512,256,128" --shuffle_batches \ -# --learning_rate 1. --batch_size 8192 --memory_fraction 0.8 - # For Colossalai enabled recsys -# criteo kaggle -#torchx run -s local_cwd -cfg log_dir=tmp dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ -# --dataset_dir /data/scratch/criteo_kaggle_data --pin_memory --shuffle_batches \ -# --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ -# --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap +bash kaggle.sh + +bash avazu.sh + +bash terabyte.sh + +# For TorchRec baseline +bash torchrec_kaggle.sh -# avazu -#torchx run -s local_cwd -cfg log_dir=tmp dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ -# --dataset_dir avazu_sample --pin_memory --shuffle_batches \ -# --learning_rate 5e-2 --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ -# --profile_dir "tensorboard_log/cache" --buffer_size 0 --use_overlap +bash torchrec_avazu.sh -# criteo terabyte -#torchx run -s local_cwd -cfg log_dir=tmp/w2_0_5 dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ -# --dataset_dir /data/scratch/criteo_terabyte/criteo_preproc/ \ -# --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ -# --profile_dir "tensorboard_log/w2_0_5" --buffer_size 0 --use_overlap +bash torchrec_terabyte.sh diff --git a/terabyte.sh b/terabyte.sh new file mode 100644 index 0000000..6eb2c5e --- /dev/null +++ b/terabyte.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# For Colossalai enabled recsys + +# criteo terabyte +torchx run -s local_cwd -cfg log_dir=log/terabyte/w1_p1_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_preproc/ \ + --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/terabyte/w1_p1_16k" --buffer_size 0 --use_overlap --cache_sets 1779442 + +torchx run -s local_cwd -cfg log_dir=log/terabyte/w2_p1_16k dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_preproc/ \ + --learning_rate 1. 
--batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/terabyte/w2_p1_16k" --buffer_size 0 --use_overlap --cache_sets 1779442 + +torchx run -s local_cwd -cfg log_dir=log/terabyte/w4_p1_16k dist.ddp -j 1x4 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_preproc/ \ + --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/terabyte/w4_p1_16k" --buffer_size 0 --use_overlap --cache_sets 1779442 + +torchx run -s local_cwd -cfg log_dir=log/terabyte/w1_p1_32k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_preproc/ \ + --learning_rate 1. --batch_size 32768 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/terabyte/w1_p1_32k" --buffer_size 0 --use_overlap --cache_sets 1779442 + +torchx run -s local_cwd -cfg log_dir=log/terabyte/w1_p1_8k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_preproc/ \ + --learning_rate 1. --batch_size 8192 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/terabyte/w1_p1_8k" --buffer_size 0 --use_overlap --cache_sets 1779442 + +torchx run -s local_cwd -cfg log_dir=log/terabyte/w1_p1_4k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_preproc/ \ + --learning_rate 1. --batch_size 4096 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/terabyte/w1_p1_4k" --buffer_size 0 --use_overlap --cache_sets 1779442 + +torchx run -s local_cwd -cfg log_dir=log/terabyte/w1_p1_2k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_preproc/ \ + --learning_rate 1. --batch_size 2048 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/terabyte/w1_p1_2k" --buffer_size 0 --use_overlap --cache_sets 1779442 + +torchx run -s local_cwd -cfg log_dir=log/terabyte/w1_p1_1k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_preproc/ \ + --learning_rate 1. --batch_size 1024 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/terabyte/w1_p1_1k" --buffer_size 0 --use_overlap --cache_sets 1779442 + +torchx run -s local_cwd -cfg log_dir=log/terabyte/w1_p10_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_preproc/ \ + --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/terabyte/w1_p10_16k" --buffer_size 0 --use_overlap --cache_sets 17794427 + +torchx run -s local_cwd -cfg log_dir=log/terabyte/w1_p5_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_preproc/ \ + --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/terabyte/w1_p5_16k" --buffer_size 0 --use_overlap --cache_sets 8897213 + +torchx run -s local_cwd -cfg log_dir=log/terabyte/w1_p2_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_preproc/ \ + --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/terabyte/w1_p2_16k" --buffer_size 0 --use_overlap --cache_sets 3558885 + +torchx run -s local_cwd -cfg log_dir=log/terabyte/w1_p0_1_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_preproc/ \ + --learning_rate 1. 
--batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/terabyte/w1_p0_1_16k" --buffer_size 0 --use_overlap --cache_sets 177944 + +torchx run -s local_cwd -cfg log_dir=log/terabyte/w1_p0_5_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ + --dataset_dir /data/criteo_preproc/ \ + --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ + --profile_dir "tensorboard_log/terabyte/w1_p0_5_16k" --buffer_size 0 --use_overlap --cache_sets 889721 + diff --git a/test_petastorm.py b/test_petastorm.py index 188ddfb..5427bd3 100644 --- a/test_petastorm.py +++ b/test_petastorm.py @@ -7,28 +7,33 @@ import torch.distributed as dist from recsys.datasets.criteo import PetastormDataReader, get_id_freq_map from recsys.utils import get_mem_info - +from baselines.data.dlrm_dataloader import PetastormDataReader as Reader import pyarrow.parquet as pq def iterate_data(): - dist.init_process_group(backend='nccl') + # dist.init_process_group(backend='nccl') - dataset_dir = "/data/scratch/criteo_terabyte/criteo_preproc/train/" + dataset_dir = "/data/criteo_preproc/validation/" fname = "part_{}.parquet" train_paths = [dataset_dir + fname.format(i) for i in range(64)] - random.seed(0) - dataloader = PetastormDataReader(train_paths, batch_size=16384, rank=None, world_size=None, shuffle_batches=False) - - data_iter = iter(dataloader) - for idx in tqdm(itertools.count(), ncols=0, total=len(dataloader) if hasattr(dataloader, "__len__") else None): - batch = next(data_iter) - # print(f"rank: {dist.get_rank()}, it {idx}, dense: {batch.dense_features[:5, :5]}") - # if idx == 2: - # break + reader = Reader(train_paths, batch_size=16384) + for batch in reader: + print(batch) + break - # dataset_dir = "/data/scratch/criteo_terabyte/criteo_preproc/" + # random.seed(0) + # dataloader = PetastormDataReader(train_paths, batch_size=16384, rank=None, world_size=None, shuffle_batches=False) + # + # data_iter = iter(dataloader) + # for idx in tqdm(itertools.count(), ncols=0, total=len(dataloader) if hasattr(dataloader, "__len__") else None): + # batch = next(data_iter) + # # print(f"rank: {dist.get_rank()}, it {idx}, dense: {batch.dense_features[:5, :5]}") + # # if idx == 2: + # # break + + # dataset_dir = "/data/criteo_preproc/" # id_freq_map = get_id_freq_map(dataset_dir) # # print(f"rank: {dist.get_rank()}, first 10: {id_freq_map[:10]}") diff --git a/torchrec_avazu.sh b/torchrec_avazu.sh new file mode 100644 index 0000000..401029e --- /dev/null +++ b/torchrec_avazu.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# For TorchRec baseline +torchx run -s local_cwd -cfg log_dir=log/torchrec_avazu/w1_16k dist.ddp -j 1x1 --script baselines/dlrm_main.py -- \ + --in_memory_binary_criteo_path /data/avazu_sample --embedding_dim 128 --pin_memory \ + --over_arch_layer_sizes "1024,1024,512,256,1" --dense_arch_layer_sizes "512,256,128" --shuffle_batches \ + --learning_rate 1. --batch_size 16384 --profile_dir "tensorboard_log/torchrec_avazu/w1_16k" + +torchx run -s local_cwd -cfg log_dir=log/torchrec_avazu/w2_16k dist.ddp -j 1x2 --script baselines/dlrm_main.py -- \ + --in_memory_binary_criteo_path /data/avazu_sample --embedding_dim 128 --pin_memory \ + --over_arch_layer_sizes "1024,1024,512,256,1" --dense_arch_layer_sizes "512,256,128" --shuffle_batches \ + --learning_rate 1. 
--batch_size 8192 --profile_dir "tensorboard_log/torchrec_avazu/w2_16k" + +torchx run -s local_cwd -cfg log_dir=log/torchrec_avazu/w4_16k dist.ddp -j 1x4 --script baselines/dlrm_main.py -- \ + --in_memory_binary_criteo_path /data/avazu_sample --embedding_dim 128 --pin_memory \ + --over_arch_layer_sizes "1024,1024,512,256,1" --dense_arch_layer_sizes "512,256,128" --shuffle_batches \ + --learning_rate 1. --batch_size 4096 --profile_dir "tensorboard_log/torchrec_avazu/w4_16k" diff --git a/torchrec_kaggle.sh b/torchrec_kaggle.sh new file mode 100644 index 0000000..0541480 --- /dev/null +++ b/torchrec_kaggle.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# For TorchRec baseline +torchx run -s local_cwd -cfg log_dir=log/torchrec_kaggle/w1_16k dist.ddp -j 1x1 --script baselines/dlrm_main.py -- \ + --in_memory_binary_criteo_path /data/criteo_kaggle_data --kaggle --embedding_dim 128 --pin_memory \ + --over_arch_layer_sizes "1024,1024,512,256,1" --dense_arch_layer_sizes "512,256,128" --shuffle_batches \ + --learning_rate 1. --batch_size 16384 --profile_dir "tensorboard_log/torchrec_kaggle/w1_16k" + +torchx run -s local_cwd -cfg log_dir=log/torchrec_kaggle/w2_16k dist.ddp -j 1x2 --script baselines/dlrm_main.py -- \ + --in_memory_binary_criteo_path /data/criteo_kaggle_data --kaggle --embedding_dim 128 --pin_memory \ + --over_arch_layer_sizes "1024,1024,512,256,1" --dense_arch_layer_sizes "512,256,128" --shuffle_batches \ + --learning_rate 1. --batch_size 8192 --profile_dir "tensorboard_log/torchrec_kaggle/w2_16k" + +torchx run -s local_cwd -cfg log_dir=log/torchrec_kaggle/w4_16k dist.ddp -j 1x4 --script baselines/dlrm_main.py -- \ + --in_memory_binary_criteo_path /data/criteo_kaggle_data --kaggle --embedding_dim 128 --pin_memory \ + --over_arch_layer_sizes "1024,1024,512,256,1" --dense_arch_layer_sizes "512,256,128" --shuffle_batches \ + --learning_rate 1. --batch_size 4096 --profile_dir "tensorboard_log/torchrec_kaggle/w4_16k" diff --git a/torchrec_terabyte.sh b/torchrec_terabyte.sh new file mode 100644 index 0000000..b15b9e4 --- /dev/null +++ b/torchrec_terabyte.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# For TorchRec baseline +torchx run -s local_cwd -cfg log_dir=log/torchrec_terabyte/w1_16k dist.ddp -j 1x1 --script baselines/dlrm_main.py -- \ + --in_memory_binary_criteo_path /data/criteo_preproc/ --embedding_dim 128 --pin_memory \ + --over_arch_layer_sizes "1024,1024,512,256,1" --dense_arch_layer_sizes "512,256,128" --shuffle_batches \ + --learning_rate 1. --batch_size 16384 --profile_dir "tensorboard_log/torchrec_terabyte/w1_16k" + +torchx run -s local_cwd -cfg log_dir=log/torchrec_terabyte/w2_16k dist.ddp -j 1x2 --script baselines/dlrm_main.py -- \ + --in_memory_binary_criteo_path /data/criteo_preproc/ --embedding_dim 128 --pin_memory \ + --over_arch_layer_sizes "1024,1024,512,256,1" --dense_arch_layer_sizes "512,256,128" --shuffle_batches \ + --learning_rate 1. --batch_size 8192 --profile_dir "tensorboard_log/torchrec_terabyte/w2_16k" + +torchx run -s local_cwd -cfg log_dir=log/torchrec_terabyte/w4_16k dist.ddp -j 1x4 --script baselines/dlrm_main.py -- \ + --in_memory_binary_criteo_path /data/criteo_preproc/ --embedding_dim 128 --pin_memory \ + --over_arch_layer_sizes "1024,1024,512,256,1" --dense_arch_layer_sizes "512,256,128" --shuffle_batches \ + --learning_rate 1. 
--batch_size 4096 --profile_dir "tensorboard_log/torchrec_terabyte/w4_16k" From b011cdd99c3d3f4e71d4ee397b62db1725a32b6e Mon Sep 17 00:00:00 2001 From: Jiarui Fang Date: Mon, 29 Aug 2022 16:51:53 +0800 Subject: [PATCH 19/19] [benchmark] add LFU in benchmark (#116) --- benchmark/benchmark_cache.py | 7 +- kaggle.sh | 137 +++++++++++++++++++---------------- recsys/dlrm_main.py | 7 +- recsys/models/dlrm.py | 12 ++- 4 files changed, 90 insertions(+), 73 deletions(-) diff --git a/benchmark/benchmark_cache.py b/benchmark/benchmark_cache.py index 91bbfe1..c1e210a 100644 --- a/benchmark/benchmark_cache.py +++ b/benchmark/benchmark_cache.py @@ -13,7 +13,7 @@ import torch from torch.profiler import profile, ProfilerActivity, schedule, tensorboard_trace_handler -from colossalai.nn.parallel.layers import FreqAwareEmbeddingBag +from colossalai.nn.parallel.layers import FreqAwareEmbeddingBag, EvictionStrategy from recsys.datasets.criteo import get_id_freq_map from data_utils import get_dataloader, NUM_EMBED, CRITEO_PATH @@ -23,7 +23,8 @@ def benchmark_cache_embedding(batch_size, cache_ratio, id_freq_map=None, warmup_ratio=0., - use_limit_buf=True): + use_limit_buf=True, + use_lfu=False): dataloader = get_dataloader('train', batch_size) cuda_row_num = int(cache_ratio * NUM_EMBED) print(f"batch size: {batch_size}, " @@ -38,7 +39,7 @@ def benchmark_cache_embedding(batch_size, torch.cuda.reset_peak_memory_stats() device = torch.device('cuda:0') with Timer() as timer: - model = FreqAwareEmbeddingBag(NUM_EMBED, embedding_dim, sparse=True, include_last_offset=True).to(device) + model = FreqAwareEmbeddingBag(NUM_EMBED, embedding_dim, sparse=True, include_last_offset=True, evict_strategy=EvictionStrategy.LFU if use_lfu else EvictionStrategy.DATASET).to(device) print(f"model init: {timer.elapsed:.2f}s") with Timer() as timer: model.preprocess(cuda_row_num, id_freq_map, warmup_ratio=warmup_ratio, buffer_size=buf_size) diff --git a/kaggle.sh b/kaggle.sh index a392040..5e8495e 100644 --- a/kaggle.sh +++ b/kaggle.sh @@ -1,68 +1,79 @@ #!/bin/bash +export LFU=1 +export DATA_PATH=/data/scratch/RecSys + +if [[ ${LFU} == 1 ]]; then +LFU_FLAG="--use_freq" +else +export LFU_FLAG="" +fi + +mkdir -p logs + # For Colossalai enabled recsys # criteo kaggle torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ - --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ - --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/kaggle/w1_p1_16k" --buffer_size 0 --use_overlap --cache_sets 337625 - -torchx run -s local_cwd -cfg log_dir=log/kaggle/w2_p1_16k dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ - --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ - --learning_rate 1. 
--batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/kaggle/w2_p1_16k" --buffer_size 0 --use_overlap --cache_sets 337625 - -torchx run -s local_cwd -cfg log_dir=log/kaggle/w4_p1_16k dist.ddp -j 1x4 --script recsys/dlrm_main.py -- \ - --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ - --learning_rate 5e-1 --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/kaggle/w4_p1_16k" --buffer_size 0 --use_overlap --cache_sets 337625 - -torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_32k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ - --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ - --learning_rate 1. --batch_size 32768 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/kaggle/w1_p1_32k" --buffer_size 0 --use_overlap --cache_sets 337625 - -torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_8k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ - --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ - --learning_rate 1. --batch_size 8192 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/kaggle/w1_p1_8k" --buffer_size 0 --use_overlap --cache_sets 337625 - -torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_4k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ - --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ - --learning_rate 1. --batch_size 4096 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/kaggle/w1_p1_4k" --buffer_size 0 --use_overlap --cache_sets 337625 - -torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_2k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ - --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ - --learning_rate 1. --batch_size 2048 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/kaggle/w1_p1_2k" --buffer_size 0 --use_overlap --cache_sets 337625 - -torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_1k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ - --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ - --learning_rate 1. --batch_size 1024 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/kaggle/w1_p1_1k" --buffer_size 0 --use_overlap --cache_sets 337625 - -torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p10_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ - --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ - --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/kaggle/w1_p10_16k" --buffer_size 0 --use_overlap --cache_sets 3376257 - -torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p5_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ - --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ - --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/kaggle/w1_p5_16k" --buffer_size 0 --use_overlap --cache_sets 1688128 - -torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p2_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ - --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ - --learning_rate 1. 
--batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/kaggle/w1_p2_16k" --buffer_size 0 --use_overlap --cache_sets 675251 - -torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p0_5_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ - --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ - --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/kaggle/w1_p0_5_16k" --buffer_size 0 --use_overlap --cache_sets 168812 - -torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p0_1_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ - --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ - --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ - --profile_dir "tensorboard_log/kaggle/w1_p0_1_16k" --buffer_size 0 --use_overlap --cache_sets 33762 + --dataset_dir ${DATA_PATH}/criteo_kaggle_data --pin_memory --shuffle_batches \ + --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache ${LFU_FLAG} \ + --profile_dir "tensorboard_log/kaggle/w1_p1_16k" --buffer_size 0 --use_overlap --cache_sets 337625 2>&1 | tee logs/w1_p1_16k_lfu_${LFU}.txt + +# torchx run -s local_cwd -cfg log_dir=log/kaggle/w2_p1_16k dist.ddp -j 1x2 --script recsys/dlrm_main.py -- \ +# --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ +# --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ +# --profile_dir "tensorboard_log/kaggle/w2_p1_16k" --buffer_size 0 --use_overlap --cache_sets 337625 + +# torchx run -s local_cwd -cfg log_dir=log/kaggle/w4_p1_16k dist.ddp -j 1x4 --script recsys/dlrm_main.py -- \ +# --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ +# --learning_rate 5e-1 --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \ +# --profile_dir "tensorboard_log/kaggle/w4_p1_16k" --buffer_size 0 --use_overlap --cache_sets 337625 + +# torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_32k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ +# --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ +# --learning_rate 1. --batch_size 32768 --use_sparse_embed_grad --use_cache --use_freq \ +# --profile_dir "tensorboard_log/kaggle/w1_p1_32k" --buffer_size 0 --use_overlap --cache_sets 337625 + +# torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_8k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ +# --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ +# --learning_rate 1. --batch_size 8192 --use_sparse_embed_grad --use_cache --use_freq \ +# --profile_dir "tensorboard_log/kaggle/w1_p1_8k" --buffer_size 0 --use_overlap --cache_sets 337625 + +# torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_4k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ +# --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ +# --learning_rate 1. --batch_size 4096 --use_sparse_embed_grad --use_cache --use_freq \ +# --profile_dir "tensorboard_log/kaggle/w1_p1_4k" --buffer_size 0 --use_overlap --cache_sets 337625 + +# torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_2k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \ +# --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \ +# --learning_rate 1. 
--batch_size 2048 --use_sparse_embed_grad --use_cache --use_freq \
+# --profile_dir "tensorboard_log/kaggle/w1_p1_2k" --buffer_size 0 --use_overlap --cache_sets 337625
+
+# torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p1_1k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \
+# --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \
+# --learning_rate 1. --batch_size 1024 --use_sparse_embed_grad --use_cache --use_freq \
+# --profile_dir "tensorboard_log/kaggle/w1_p1_1k" --buffer_size 0 --use_overlap --cache_sets 337625
+
+# torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p10_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \
+# --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \
+# --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \
+# --profile_dir "tensorboard_log/kaggle/w1_p10_16k" --buffer_size 0 --use_overlap --cache_sets 3376257
+
+# torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p5_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \
+# --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \
+# --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \
+# --profile_dir "tensorboard_log/kaggle/w1_p5_16k" --buffer_size 0 --use_overlap --cache_sets 1688128
+
+# torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p2_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \
+# --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \
+# --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \
+# --profile_dir "tensorboard_log/kaggle/w1_p2_16k" --buffer_size 0 --use_overlap --cache_sets 675251
+
+# torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p0_5_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \
+# --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \
+# --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \
+# --profile_dir "tensorboard_log/kaggle/w1_p0_5_16k" --buffer_size 0 --use_overlap --cache_sets 168812
+
+# torchx run -s local_cwd -cfg log_dir=log/kaggle/w1_p0_1_16k dist.ddp -j 1x1 --script recsys/dlrm_main.py -- \
+# --dataset_dir /data/criteo_kaggle_data --pin_memory --shuffle_batches \
+# --learning_rate 1. --batch_size 16384 --use_sparse_embed_grad --use_cache --use_freq \
+# --profile_dir "tensorboard_log/kaggle/w1_p0_1_16k" --buffer_size 0 --use_overlap --cache_sets 33762
diff --git a/recsys/dlrm_main.py b/recsys/dlrm_main.py
index fed5df4..b86db7c 100644
--- a/recsys/dlrm_main.py
+++ b/recsys/dlrm_main.py
@@ -129,9 +129,10 @@ def parse_args():
         default=1,
         help="Number of cache lines in each cache set. Similar to the N-way set associate mechanism in cache."
         "Not implemented yet. Increasing this would scale up the cache capacity")
-    parser.add_argument("--use_freq", action='store_true')
-    parser.add_argument("--warmup_ratio", type=float, default=0.7)
-    parser.add_argument("--buffer_size", type=int, default=50_000)
+    parser.add_argument("--use_freq", action='store_true', help="use the dataset frequency information to initialize the software cache")
+    parser.add_argument("--use_lfu", action='store_true', help="use LFU as the cache eviction strategy; if not set, use the dataset-aware (DATASET) strategy")
+    parser.add_argument("--warmup_ratio", type=float, default=0.7, help="warmup ratio of the software cache")
+    parser.add_argument("--buffer_size", type=int, default=0, help="limit buffer size; if buffer_size=1, the buffer is not used")

     # Training
     parser.add_argument(
diff --git a/recsys/models/dlrm.py b/recsys/models/dlrm.py
index f925f25..d8962c7 100644
--- a/recsys/models/dlrm.py
+++ b/recsys/models/dlrm.py
@@ -12,7 +12,7 @@
 from ..datasets.utils import KJTAllToAll

 import colossalai
-from colossalai.nn.parallel.layers import ParallelFreqAwareEmbeddingBag
+from colossalai.nn.parallel.layers import ParallelFreqAwareEmbeddingBag, EvictionStrategy
 from colossalai.core import global_context as gpc
 from colossalai.context.parallel_mode import ParallelMode

@@ -38,7 +38,8 @@ def __init__(self,
                  id_freq_map=None,
                  warmup_ratio=0.7,
                  buffer_size=50_000,
-                 is_dist_dataloader=True):
+                 is_dist_dataloader=True,
+                 use_lfu_eviction=False):
         super(FusedSparseModules, self).__init__()
         if use_cache:
             self.embed = ParallelFreqAwareEmbeddingBag(
@@ -51,6 +52,7 @@ def __init__(self,
                 ids_freq_mapping=id_freq_map,
                 warmup_ratio=warmup_ratio,
                 buffer_size=buffer_size,
+                evict_strategy=EvictionStrategy.LFU if use_lfu_eviction else EvictionStrategy.DATASET
             )
         else:
             raise NotImplementedError()
@@ -131,7 +133,8 @@ def __init__(self,
                  id_freq_map=None,
                  warmup_ratio=0.7,
                  buffer_size=50_000,
-                 is_dist_dataloader=True):
+                 is_dist_dataloader=True,
+                 use_lfu_eviction=False):
         super(HybridParallelDLRM, self).__init__()

         if use_cache and sparse_device.type != dense_device.type:
@@ -152,7 +155,8 @@ def __init__(self,
                                                 id_freq_map=id_freq_map,
                                                 warmup_ratio=warmup_ratio,
                                                 buffer_size=buffer_size,
-                                                is_dist_dataloader=is_dist_dataloader).to(sparse_device)
+                                                is_dist_dataloader=is_dist_dataloader,
+                                                use_lfu_eviction=use_lfu_eviction).to(sparse_device)
         self.dense_modules = DDP(module=FusedDenseModules(embedding_dim, num_sparse_features, dense_in_features,
                                                           dense_arch_layer_sizes,
                                                           over_arch_layer_sizes).to(dense_device),
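
The final patch above threads an eviction-strategy switch (--use_lfu) from the CLI through HybridParallelDLRM and FusedSparseModules into the ColossalAI software cache. A minimal sketch of how that cached embedding is built, assuming only the FreqAwareEmbeddingBag / EvictionStrategy API already used in benchmark/benchmark_cache.py; the function name build_cached_embedding and the cache_ratio argument are illustrative and not part of the patches:

import torch
from colossalai.nn.parallel.layers import FreqAwareEmbeddingBag, EvictionStrategy

def build_cached_embedding(num_embed, embedding_dim, id_freq_map,
                           cache_ratio=0.01, warmup_ratio=0.7,
                           buffer_size=0, use_lfu=False):
    # Pick LFU eviction when requested, otherwise the dataset-frequency-aware policy.
    strategy = EvictionStrategy.LFU if use_lfu else EvictionStrategy.DATASET
    model = FreqAwareEmbeddingBag(num_embed, embedding_dim, sparse=True,
                                  include_last_offset=True,
                                  evict_strategy=strategy).to(torch.device('cuda:0'))
    # Keep cache_ratio of all embedding rows resident on the GPU and optionally
    # warm the cache up with the id-frequency statistics collected from the dataset.
    cuda_row_num = int(cache_ratio * num_embed)
    model.preprocess(cuda_row_num, id_freq_map,
                     warmup_ratio=warmup_ratio, buffer_size=buffer_size)
    return model

This mirrors the call sequence in benchmark_cache.py; the same LFU-versus-DATASET choice is what the new use_lfu_eviction flag selects inside FusedSparseModules.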