Merge pull request tensorflow#4084 from XinyueZ/optimized/cookbook/regression/make_dataset

MarkDaoust · web-flow · commit de9f3584f76f · 2018-04-25T16:30:00.000-07:00
Fixed tensorflow#4083 and made two optimizations
diff --git a/samples/cookbook/regression/automobile_data.py b/samples/cookbook/regression/automobile_data.py
@@ -109,19 +109,19 @@ def load_data(y_name="price", train_fraction=0.7, seed=None):
 
   return (x_train, y_train), (x_test, y_test)
 
-def make_dataset(x, y=None):
-    """Create a slice Dataset from a pandas DataFrame and labels"""
-    # TODO(markdaooust): simplify this after the 1.4 cut.
-    # Convert the DataFrame to a dict
-    x = dict(x)
-
-    # Convert the pd.Series to np.arrays
-    for key in x:
-        x[key] = np.array(x[key])
 
-    items = [x]
-    if y is not None:
-        items.append(np.array(y, dtype=np.float32))
+def make_dataset(batch_sz, x, y=None, shuffle=False, shuffle_buffer_size=1000):
+    """Return an input_fn that yields batches sliced from a pandas DataFrame and optional labels"""
 
-    # Create a Dataset of slices
-    return tf.data.Dataset.from_tensor_slices(tuple(items))
+    def input_fn():
+        if y is not None:
+            dataset = tf.data.Dataset.from_tensor_slices((dict(x), y))
+        else:
+            dataset = tf.data.Dataset.from_tensor_slices(dict(x))
+        if shuffle:
+            dataset = dataset.shuffle(shuffle_buffer_size).batch(batch_sz).repeat()
+        else:
+            dataset = dataset.batch(batch_sz)
+        return dataset.make_one_shot_iterator().get_next()
+
+    return input_fn
diff --git a/samples/cookbook/regression/custom_regression.py b/samples/cookbook/regression/custom_regression.py
@@ -31,11 +31,6 @@
 parser.add_argument('--price_norm_factor', default=1000., type=float,
                     help='price normalization factor')
 
-
-def from_dataset(ds):
-    return lambda: ds.make_one_shot_iterator().get_next()
-
-
 def my_dnn_regression_fn(features, labels, mode, params):
   """A model function implementing DNN regression for a custom Estimator."""
 
@@ -81,6 +76,10 @@ def my_dnn_regression_fn(features, labels, mode, params):
   # Calculate root mean squared error
   print(labels)
   print(predictions)
+
+  # Cast predictions to float64 before computing the RMSE metric (fix for #4083).
+  predictions = tf.cast(predictions, tf.float64)
+
   rmse = tf.metrics.root_mean_squared_error(labels, predictions)
 
   # Add the rmse to the collection of evaluation metrics.
@@ -102,17 +101,11 @@ def main(argv):
   train_y /= args.price_norm_factor
   test_y /= args.price_norm_factor
 
-  # Build the training dataset.
-  train = (
-      automobile_data.make_dataset(train_x, train_y)
-      # Shuffling with a buffer larger than the data set ensures
-      # that the examples are well mixed.
-      .shuffle(1000).batch(args.batch_size)
-      # Repeat forever
-      .repeat())
+  # Provide the training input dataset.
+  train_input_fn = automobile_data.make_dataset(args.batch_size, train_x, train_y, True, 1000)
 
   # Build the validation dataset.
-  test = automobile_data.make_dataset(test_x, test_y).batch(args.batch_size)
+  test_input_fn = automobile_data.make_dataset(args.batch_size, test_x, test_y)
 
   # The first way assigns a unique weight to each category. To do this you must
   # specify the category's vocabulary (values outside this specification will
@@ -151,10 +144,10 @@ def main(argv):
       })
 
   # Train the model.
-  model.train(input_fn=from_dataset(train), steps=args.train_steps)
+  model.train(input_fn=train_input_fn, steps=args.train_steps)
 
   # Evaluate how the model performs on data it has not yet seen.
-  eval_result = model.evaluate(input_fn=from_dataset(test))
+  eval_result = model.evaluate(input_fn=test_input_fn)
 
   # Print the Root Mean Square Error (RMSE).
   print("\n" + 80 * "*")
diff --git a/samples/cookbook/regression/dnn_regression.py b/samples/cookbook/regression/dnn_regression.py
@@ -32,10 +32,6 @@
                     help='price normalization factor')
 
 
-def from_dataset(ds):
-    return lambda: ds.make_one_shot_iterator().get_next()
-
-
 def main(argv):
   """Builds, trains, and evaluates the model."""
   args = parser.parse_args(argv[1:])
@@ -45,17 +41,11 @@ def main(argv):
   train_y /= args.price_norm_factor
   test_y /= args.price_norm_factor
 
-  # Build the training dataset.
-  train = (
-      automobile_data.make_dataset(train_x, train_y)
-      # Shuffling with a buffer larger than the data set ensures
-      # that the examples are well mixed.
-      .shuffle(1000).batch(args.batch_size)
-      # Repeat forever
-      .repeat())
+  # Provide the training input dataset.
+  train_input_fn = automobile_data.make_dataset(args.batch_size, train_x, train_y, True, 1000)
 
-  # Build the validation dataset.
-  test = automobile_data.make_dataset(test_x, test_y).batch(args.batch_size)
+  # Provide the validation input dataset.
+  test_input_fn = automobile_data.make_dataset(args.batch_size, test_x, test_y)
 
   # Use the same categorical columns as in `linear_regression_categorical`
   body_style_vocab = ["hardtop", "wagon", "sedan", "hatchback", "convertible"]
@@ -84,10 +74,10 @@ def main(argv):
 
   # Train the model.
   # By default, the Estimators log output every 100 steps.
-  model.train(input_fn=from_dataset(train), steps=args.train_steps)
+  model.train(input_fn=train_input_fn, steps=args.train_steps)
 
   # Evaluate how the model performs on data it has not yet seen.
-  eval_result = model.evaluate(input_fn=from_dataset(test))
+  eval_result = model.evaluate(input_fn=test_input_fn)
 
   # The evaluation returns a Python dictionary. The "average_loss" key holds the
   # Mean Squared Error (MSE).
diff --git a/samples/cookbook/regression/linear_regression.py b/samples/cookbook/regression/linear_regression.py
@@ -33,10 +33,6 @@
                     help='price normalization factor')
 
 
-def from_dataset(ds):
-    return lambda: ds.make_one_shot_iterator().get_next()
-
-
 def main(argv):
   """Builds, trains, and evaluates the model."""
   args = parser.parse_args(argv[1:])
@@ -46,17 +42,11 @@ def main(argv):
   train_y /= args.price_norm_factor
   test_y /= args.price_norm_factor
 
-  # Build the training dataset.
-  train = (
-      automobile_data.make_dataset(train_x, train_y)
-      # Shuffling with a buffer larger than the data set ensures
-      # that the examples are well mixed.
-      .shuffle(1000).batch(args.batch_size)
-      # Repeat forever
-      .repeat())
+  # Provide the training input dataset.
+  train_input_fn = automobile_data.make_dataset(args.batch_size, train_x, train_y, True, 1000)
 
-  # Build the validation dataset.
-  test = automobile_data.make_dataset(test_x, test_y).batch(args.batch_size)
+  # Provide the validation input dataset.
+  test_input_fn = automobile_data.make_dataset(args.batch_size, test_x, test_y)
 
   feature_columns = [
       # "curb-weight" and "highway-mpg" are numeric columns.
@@ -69,10 +59,10 @@ def main(argv):
 
   # Train the model.
   # By default, the Estimators log output every 100 steps.
-  model.train(input_fn=from_dataset(train), steps=args.train_steps)
+  model.train(input_fn=train_input_fn, steps=args.train_steps)
 
   # Evaluate how the model performs on data it has not yet seen.
-  eval_result = model.evaluate(input_fn=from_dataset(test))
+  eval_result = model.evaluate(input_fn=test_input_fn)
 
   # The evaluation returns a Python dictionary. The "average_loss" key holds the
   # Mean Squared Error (MSE).
@@ -88,8 +78,10 @@ def main(argv):
       "curb-weight": np.array([2000, 3000]),
       "highway-mpg": np.array([30, 40])
   }
-  predict = automobile_data.make_dataset(input_dict).batch(1)
-  predict_results = model.predict(input_fn=from_dataset(predict))
+
+  # Provide the predict input dataset.
+  predict_input_fn = automobile_data.make_dataset(1, input_dict)
+  predict_results = model.predict(input_fn=predict_input_fn)
 
   # Print the prediction results.
   print("\nPrediction results:")
diff --git a/samples/cookbook/regression/linear_regression_categorical.py b/samples/cookbook/regression/linear_regression_categorical.py
@@ -32,10 +32,6 @@
                     help='price normalization factor')
 
 
-def from_dataset(ds):
-    return lambda: ds.make_one_shot_iterator().get_next()
-
-
 def main(argv):
   """Builds, trains, and evaluates the model."""
   args = parser.parse_args(argv[1:])
@@ -45,17 +41,11 @@ def main(argv):
   train_y /= args.price_norm_factor
   test_y /= args.price_norm_factor
 
-  # Build the training dataset.
-  train = (
-      automobile_data.make_dataset(train_x, train_y)
-      # Shuffling with a buffer larger than the data set ensures
-      # that the examples are well mixed.
-      .shuffle(1000).batch(args.batch_size)
-      # Repeat forever
-      .repeat())
+  # Provide the training input dataset.
+  train_input_fn = automobile_data.make_dataset(args.batch_size, train_x, train_y, True, 1000)
 
-  # Build the validation dataset.
-  test = automobile_data.make_dataset(test_x, test_y).batch(args.batch_size)
+  # Provide the validation input dataset.
+  test_input_fn = automobile_data.make_dataset(args.batch_size, test_x, test_y)
 
   # The following code demonstrates two of the ways that `feature_columns` can
   # be used to build a model with categorical inputs.
@@ -93,10 +83,10 @@ def main(argv):
 
   # Train the model.
   # By default, the Estimators log output every 100 steps.
-  model.train(input_fn=from_dataset(train), steps=args.train_steps)
+  model.train(input_fn=train_input_fn, steps=args.train_steps)
 
   # Evaluate how the model performs on data it has not yet seen.
-  eval_result = model.evaluate(input_fn=from_dataset(test))
+  eval_result = model.evaluate(input_fn=test_input_fn)
 
   # The evaluation returns a Python dictionary. The "average_loss" key holds the
   # Mean Squared Error (MSE).