upgrade hash op to support Tensor and LoDTensor input (PaddlePaddle#17998)

zhoukunsheng · luotao1 · commit 71af72b1c2fe · 2019-07-03T10:46:13.000+08:00
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
@@ -238,7 +238,7 @@ paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], va
 paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '65c8362e48810b8226e311c5d046db51'))
 paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', '9f303c67538e468a36c5904a0a3aa110'))
 paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '18ec2e3afeb90e70c8b73d2b71c40fdb'))
-paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'da621ba1363e8f5fe7b702526bbae18f'))
+paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'a0b73c21be618cec0281e7903039e5e3'))
 paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5d16663e096d7f04954c70ce1cc5e195'))
 paddle.fluid.layers.log_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)), ('document', 'e3993a477c94729526040ff65d95728e'))
 paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e399f9436fed5f7ff480d8532e42c937'))
diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -46,11 +46,10 @@ class HashOp : public framework::OperatorWithKernel {
 class HashOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X", "(Tensor) Input tensor of scale operator.");
-    AddOutput("Out", "(Tensor) Output tensor of scale operator.");
+    AddInput("X", "(Tensor) Input tensor of hash operator.");
+    AddOutput("Out", "(Tensor) Output tensor of hash operator.");
     AddComment(R"DOC(
-**Hash Operator**
-$$Out = scale * X$$
+        Execute the xxHash algorithm `num_hash` times on all elements along the second dimension of the input.
 )DOC");
     AddAttr<int>("num_hash", "").SetDefault(1);
     AddAttr<int>("mod_by", "").SetDefault(100000);
diff --git a/paddle/fluid/operators/hash_op.h b/paddle/fluid/operators/hash_op.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -47,10 +47,6 @@ class HashKernel : public framework::OpKernel<T> {
     int num_hash = context.Attr<int>("num_hash");
 
     auto in_dims = in_t->dims();
-    auto in_lod = in_t->lod();
-    PADDLE_ENFORCE_EQ(
-        static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
-        "The actual input data's size mismatched with LoD information.");
 
     std::vector<int64_t> out_dims;
     HashOutputSize(in_dims, out_dims, num_hash);
@@ -67,6 +63,7 @@ class HashKernel : public framework::OpKernel<T> {
       }
       input += last_dim;
     }
+
     out_t->set_lod(in_t->lod());
   }
 };
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
@@ -10810,12 +10810,9 @@ def hash(input, hash_size, num_hash=1, name=None):
         Given:
 
         # shape [2, 2]
-        input.data = [
+        input.data = 
             [[1, 2],
-             [3, 4]],
-        ]
-
-        input.lod = [[0, 2]]
+             [3, 4]]
 
         hash_size = 10000
 
@@ -10833,40 +10830,32 @@ def hash(input, hash_size, num_hash=1, name=None):
              [8310, 1327, 1654, 4567]],
         ]
 
-        output.lod = [[0, 2]]
-
     Args:
         input (Variable): The input variable which is a one-hot word. The
-            dimensions of the input variable must be 2.
+            dimensions of the input variable must be 2. Both Tensor and LoDTensor are supported.
         hash_size (int): The space size for hash algorithm. The output value
             will keep in the range:math:`[0, hash_size - 1]`.
         num_hash (int): The times of hash, default 1.
         name (str, default None): The name of this layer.
 
     Returns:
-       Variable: The hash result variable which is a LoDTensor.
+       Variable: The hash result variable, which the same variable type as `input`.
 
     Examples:
        .. code-block:: python
 
             import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
-            import numpy as np
-
-            titles = fluid.layers.data(name='titles', shape=[1], dtype='int32', lod_level=1)
-            hash_r = fluid.layers.hash(name='hash_x', input=titles, num_hash=1, hash_size=1000)
 
-            place = fluid.core.CPUPlace()
-            exece = fluid.Executor(place)
-            exece.run(fluid.default_startup_program()) 
+            # titles has shape [batch, 1]
+            titles = fluid.layers.data(name='titles', shape=[1], dtype='int32', lod_level=0)
+            # hash_r has shape [batch, 2]
+            hash_r = fluid.layers.hash(name='hash_x', input=titles, num_hash=2, hash_size=1000)
 
-            # Init Tensor
-            tensor = fluid.core.LoDTensor() 
-            tensor.set(np.random.randint(0, 10, (3, 1)).astype("int32"), place)
-            # Set LoD
-            tensor.set_recursive_sequence_lengths([[1, 1, 1]])
 
-            out = exece.run(feed={'titles': tensor}, fetch_list=[hash_r], return_numpy=False)
+            # titles has shape [batch, 1] and lod information
+            titles = fluid.layers.data(name='titles', shape=[1], dtype='int32', lod_level=1)
+            # hash_r has shape [batch, 2] and inherits lod information from titles
+            hash_r = fluid.layers.hash(name='hash_x', input=titles, num_hash=2, hash_size=1000)
     """
     helper = LayerHelper('hash', **locals())
     out = helper.create_variable_for_type_inference(
diff --git a/python/paddle/fluid/tests/unittests/test_hash_op.py b/python/paddle/fluid/tests/unittests/test_hash_op.py
@@ -1,4 +1,4 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,36 +17,41 @@
 from op_test import OpTest
 
 
-class TestScaleOp(OpTest):
+class TestHashOp(OpTest):
     def setUp(self):
         self.op_type = "hash"
         self.init_test_case()
         self.inputs = {'X': (self.in_seq, self.lod)}
-        self.attrs = {'num_hash': 4, 'mod_by': 10000}
+        self.attrs = {'num_hash': 2, 'mod_by': 10000}
         self.outputs = {'Out': (self.out_seq, self.lod)}
 
     def init_test_case(self):
-        np.random.seed = 1
-        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
-        self.lod = [[9, 4, 11, 6]]
-        #  self.out_seq = np.ones([30, 4, 1], dtype=np.int32)
-        self.out_seq = [
-            [[9662], [9217], [1129], [8487]], [[9662], [9217], [1129], [8487]],
-            [[8310], [1327], [1654], [4567]], [[6897], [3218], [2013], [1241]],
-            [[9407], [6715], [6949], [8094]], [[8473], [694], [5142], [2479]],
-            [[8310], [1327], [1654], [4567]], [[6897], [3218], [2013], [1241]],
-            [[4372], [9456], [8204], [6695]], [[6897], [3218], [2013], [1241]],
-            [[8473], [694], [5142], [2479]], [[4372], [9456], [8204], [6695]],
-            [[4372], [9456], [8204], [6695]], [[8473], [694], [5142], [2479]],
-            [[9407], [6715], [6949], [8094]], [[9369], [4525], [8935], [9210]],
-            [[4372], [9456], [8204], [6695]], [[4372], [9456], [8204], [6695]],
-            [[9369], [4525], [8935], [9210]], [[6897], [3218], [2013], [1241]],
-            [[9038], [7951], [5953], [8657]], [[9407], [6715], [6949], [8094]],
-            [[9662], [9217], [1129], [8487]], [[9369], [4525], [8935], [9210]],
-            [[9038], [7951], [5953], [8657]], [[9662], [9217], [1129], [8487]],
-            [[9369], [4525], [8935], [9210]], [[1719], [5986], [9919], [3421]],
-            [[4372], [9456], [8204], [6695]], [[9038], [7951], [5953], [8657]]
-        ]
+        np.random.seed(1)
+        self.in_seq = np.random.randint(0, 10, (8, 1)).astype("int32")
+        self.lod = [[2, 6]]
+        self.out_seq = [[[3481], [7475]], [[1719], [5986]], [[8473], [694]],
+                        [[3481], [7475]], [[4372], [9456]], [[4372], [9456]],
+                        [[6897], [3218]], [[9038], [7951]]]
+        self.out_seq = np.array(self.out_seq)
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestHashNotLoDOp(TestHashOp):
+    def setUp(self):
+        self.op_type = "hash"
+        self.init_test_case()
+        self.inputs = {'X': self.in_seq}
+        self.attrs = {'num_hash': 2, 'mod_by': 10000}
+        self.outputs = {'Out': self.out_seq}
+
+    def init_test_case(self):
+        np.random.seed(1)
+        self.in_seq = np.random.randint(0, 10, (8, 1)).astype("int32")
+        self.out_seq = [[[3481], [7475]], [[1719], [5986]], [[8473], [694]],
+                        [[3481], [7475]], [[4372], [9456]], [[4372], [9456]],
+                        [[6897], [3218]], [[9038], [7951]]]
         self.out_seq = np.array(self.out_seq)
 
     def test_check_output(self):