DavidLanz
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
Lines changed: 3 additions & 1 deletion
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
Lines changed: 3 additions & 1 deletion
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
Lines changed: 17 additions & 17 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
Lines changed: 18 additions & 56 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
Lines changed: 4 additions & 12 deletions
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
Lines changed: 4 additions & 1 deletion
diff --git a/tensorflow/core/kernels/cholesky_op.cc b/tensorflow/core/kernels/cholesky_op.cc
Lines changed: 4 additions & 3 deletions
diff --git a/tensorflow/core/kernels/determinant_op.cc b/tensorflow/core/kernels/determinant_op.cc
Lines changed: 1 addition & 1 deletion
diff --git a/tensorflow/core/kernels/matrix_inverse_op.cc b/tensorflow/core/kernels/matrix_inverse_op.cc
Lines changed: 1 addition & 1 deletion
diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc
Lines changed: 4 additions & 1 deletion
@@ -2,7 +2,9 @@
 # TensorFlow is a computational framework, primarily for use in machine
 # learning applications.
 
-package(default_visibility = ["//tensorflow:internal"])
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
 
 licenses(["notice"])  # Apache 2.0
 
 
@@ -2,7 +2,9 @@
 # TensorFlow is a computational framework, primarily for use in machine
 # learning applications.
 
-package(default_visibility = ["//tensorflow:internal"])
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
 
 package_group(name = "friends")
 
 
@@ -40,13 +40,13 @@ EventMgr::~EventMgr() {
     delete e;
   }
   while (!used_events_.empty()) {
-    InUse* ue = &used_events_[0];
-    delete ue->event;
-    delete ue->mem;
-    if (ue->bufrec.buf) {
-      ue->bufrec.alloc->DeallocateRaw(ue->bufrec.buf);
+    delete used_events_[0].event;
+    delete used_events_[0].mem;
+    if (used_events_[0].bufrec.buf) {
+      used_events_[0].bufrec.alloc->DeallocateRaw(used_events_[0].bufrec.buf);
     }
-    if (ue->func != nullptr) threadpool_.Schedule(ue->func);
+    if (used_events_[0].func != nullptr)
+      threadpool_.Schedule(used_events_[0].func);
     used_events_.pop_front();
   }
 }
@@ -60,17 +60,15 @@ EventMgr::~EventMgr() {
 void EventMgr::PollLoop() {
   while (!stop_polling_.HasBeenNotified()) {
     Env::Default()->SleepForMicroseconds(1 * 1000);
-    ToFreeVector to_free;
     {
       mutex_lock l(mu_);
-      PollEvents(true, &to_free);
+      PollEvents(true);
     }
-    FreeMemory(to_free);
   }
   polling_stopped_.Notify();
 }
 
-void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu, gpu::Event** e) {
+void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu) {
   VLOG(2) << "QueueInUse  free_events_ " << free_events_.size()
           << " used_events_ " << used_events_.size();
   // Events are created on demand, and repeatedly reused.  There is no
@@ -79,9 +77,10 @@ void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu, gpu::Event** e) {
     free_events_.push_back(new gpu::Event(exec_));
     free_events_.back()->Init();
   }
-  *e = free_events_.back();
+  gpu::Event* e = free_events_.back();
   free_events_.pop_back();
-  iu.event = *e;
+  stream->ThenRecordEvent(e);
+  iu.event = e;
   used_events_.push_back(iu);
 }
 
@@ -104,8 +103,7 @@ void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu, gpu::Event** e) {
 // GPU memory use to spike needlessly.  An alternative strategy would
 // be to throttle new Op execution until the pending event queue
 // clears.
-void EventMgr::PollEvents(bool is_dedicated_poller,
-                          gtl::InlinedVector<InUse, 4>* to_free) {
+void EventMgr::PollEvents(bool is_dedicated_poller) {
   VLOG(2) << "PollEvents  free_events_ " << free_events_.size()
           << " used_events_ " << used_events_.size();
   // Sweep the remaining events in order.  If this is the dedicated
@@ -125,9 +123,11 @@ void EventMgr::PollEvents(bool is_dedicated_poller,
         if (!is_dedicated_poller) return;  // quit processing queue
         break;
       case gpu::Event::Status::kComplete:
-        // Make a copy of the InUse record so we can free it after releasing
-        // the lock
-        to_free->push_back(iu);
+        delete iu.mem;
+        if (iu.bufrec.buf) iu.bufrec.alloc->DeallocateRaw(iu.bufrec.buf);
+        // The function must be called in another thread, outside of
+        // the mutex held here.
+        if (iu.func != nullptr) threadpool_.Schedule(iu.func);
         free_events_.push_back(iu.event);
         // Mark this InUse record as completed.
         iu.event = nullptr;
 
@@ -18,10 +18,8 @@ limitations under the License.
 
 #include <deque>
 #include <vector>
-#include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/port.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/public/tensor.h"
@@ -49,15 +47,9 @@ class EventMgr {
   // currently enqueued on *stream have completed.
   inline void ThenDeleteTensors(perftools::gputools::Stream* stream,
                                 std::vector<Tensor>* tensors) {
-    ToFreeVector to_free;
-    ::perftools::gputools::Event* e;
-    {
-      mutex_lock l(mu_);
-      QueueTensors(stream, tensors, &e);
-      PollEvents(false, &to_free);
-    }
-    stream->ThenRecordEvent(e);
-    FreeMemory(to_free);
+    mutex_lock l(mu_);
+    QueueTensors(stream, tensors);
+    PollEvents(false);
   }
 
   struct BufRec {
@@ -69,28 +61,16 @@ class EventMgr {
   // on it as soon as all events currently enqueued on *stream have completed.
   inline void ThenDeleteBuffer(perftools::gputools::Stream* stream,
                                BufRec bufrec) {
-    ToFreeVector to_free;
-    ::perftools::gputools::Event* e;
-    {
-      mutex_lock l(mu_);
-      QueueBuffer(stream, bufrec, &e);
-      PollEvents(false, &to_free);
-    }
-    stream->ThenRecordEvent(e);
-    FreeMemory(to_free);
+    mutex_lock l(mu_);
+    QueueBuffer(stream, bufrec);
+    PollEvents(false);
   }
 
   inline void ThenExecute(perftools::gputools::Stream* stream,
                           std::function<void()> func) {
-    ToFreeVector to_free;
-    ::perftools::gputools::Event* e;
-    {
-      mutex_lock l(mu_);
-      QueueFunc(stream, func, &e);
-      PollEvents(false, &to_free);
-    }
-    stream->ThenRecordEvent(e);
-    FreeMemory(to_free);
+    mutex_lock l(mu_);
+    QueueFunc(stream, func);
+    PollEvents(false);
   }
 
  private:
@@ -105,50 +85,32 @@ class EventMgr {
     std::function<void()> func;
   };
 
-  typedef gtl::InlinedVector<InUse, 4> ToFreeVector;
-
-  void FreeMemory(const ToFreeVector& to_free) {
-    for (const auto& iu : to_free) {
-      delete iu.mem;
-      if (iu.bufrec.buf) iu.bufrec.alloc->DeallocateRaw(iu.bufrec.buf);
-      // The function must be called in another thread.
-      if (iu.func != nullptr) threadpool_.Schedule(iu.func);
-    }
-  }
-
   // Stream-enqueue an unused Event and save with it a collection of
   // Tensors and/or a BufRec to be deleted only after the Event
   // records.
-  void QueueInUse(perftools::gputools::Stream* stream, InUse in_use,
-                  ::perftools::gputools::Event** e)
+  void QueueInUse(perftools::gputools::Stream* stream, InUse in_use)
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   void QueueTensors(perftools::gputools::Stream* stream,
-                    std::vector<Tensor>* tensors,
-                    ::perftools::gputools::Event** e)
+                    std::vector<Tensor>* tensors)
       EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr}, e);
+    QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr});
   }
 
-  void QueueBuffer(perftools::gputools::Stream* stream, BufRec bufrec,
-                   ::perftools::gputools::Event** e)
+  void QueueBuffer(perftools::gputools::Stream* stream, BufRec bufrec)
       EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr}, e);
+    QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr});
   }
 
   void QueueFunc(perftools::gputools::Stream* stream,
-                 std::function<void()> func, ::perftools::gputools::Event** e)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    QueueInUse(stream, {nullptr, nullptr, BufRec(), func}, e);
+                 std::function<void()> func) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    QueueInUse(stream, {nullptr, nullptr, BufRec(), func});
   }
 
   // This function should be called at roughly the same tempo as
   // QueueTensors() to check whether pending events have recorded,
-  // and then retire them.  It appends InUse elements that need cleanup
-  // to "*to_free".  The caller should call FreeMemory(to_free)
-  // when this returns.
-  void PollEvents(bool is_dedicated_poller, ToFreeVector* to_free)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_);
+  // and then retire them.
+  void PollEvents(bool is_dedicated_poller) EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   // An internal polling loop that runs at a low frequency to clear
   // straggler Events.
 
@@ -42,21 +42,13 @@ class TEST_EventMgrHelper {
 
   void QueueTensors(perftools::gputools::Stream* stream,
                     std::vector<Tensor>* tensors) {
-    ::perftools::gputools::Event* e;
-    {
-      mutex_lock l(em_->mu_);
-      em_->QueueTensors(stream, tensors, &e);
-    }
-    stream->ThenRecordEvent(e);
+    mutex_lock l(em_->mu_);
+    em_->QueueTensors(stream, tensors);
   }
 
   void PollEvents(bool is_dedicated_poller) {
-    EventMgr::ToFreeVector to_free;
-    {
-      mutex_lock l(em_->mu_);
-      em_->PollEvents(is_dedicated_poller, &to_free);
-    }
-    em_->FreeMemory(to_free);
+    mutex_lock l(em_->mu_);
+    em_->PollEvents(is_dedicated_poller);
   }
 
  private:
 
@@ -79,7 +79,10 @@ Status AttrSlice::Find(const string& attr_name,
     return Status::OK();
   }
   Status s = errors::NotFound("No attr named '", attr_name, "' in NodeDef:");
-  if (ndef_) {
+  // Skip AttachDef for internal attrs since it is a little bit
+  // expensive and it is common for them to correctly not be included
+  // in a NodeDef.
+  if (!StringPiece(attr_name).starts_with("_") && ndef_) {
     s = AttachDef(s, *ndef_);
   }
   return s;
 
@@ -46,7 +46,7 @@ class CholeskyOp
     const int64 rows = input_matrix_shape.dim_size(0);
     if (rows > (1LL << 20)) {
       // A big number to cap the cost in case overflow.
-      return kint32max;
+      return kint64max;
     } else {
       return rows * rows * rows;
     }
@@ -69,8 +69,9 @@ class CholeskyOp
     // Perform the actual LL^T Cholesky decomposition. This will only use
     // the lower triangular part of data_in by default. The upper triangular
     // part of the matrix will not be read.
-    Eigen::LLT<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic,
-                             Eigen::RowMajor>> llt_decomposition(input);
+    Eigen::LLT<
+        Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
+        llt_decomposition(input);
 
     // Output the lower triangular in a dense form.
     *output = llt_decomposition.matrixL();
 
@@ -44,7 +44,7 @@ class DeterminantOp
     const int64 rows = input_matrix_shape.dim_size(0);
     if (rows > (1LL << 20)) {
       // A big number to cap the cost in case overflow.
-      return kint32max;
+      return kint64max;
     } else {
       return rows * rows * rows;
     }
 
@@ -45,7 +45,7 @@ class MatrixInverseOp
     const int64 rows = input_matrix_shape.dim_size(0);
     if (rows > (1LL << 20)) {
       // A big number to cap the cost in case overflow.
-      return kint32max;
+      return kint64max;
     } else {
       return rows * rows * rows;
     }
 
@@ -44,7 +44,10 @@ REGISTER_GPU_KERNELS(float);
 #undef REGISTER_GPU_KERNELS
 
 REGISTER_KERNEL_BUILDER(
-    Name("Sum").Device(DEVICE_GPU).TypeConstraint<complex64>("T"),
+    Name("Sum")
+        .Device(DEVICE_GPU)
+        .TypeConstraint<complex64>("T")
+        .HostMemory("reduction_indices"),
     ReductionOp<GPUDevice, complex64, Eigen::internal::SumReducer<complex64>>);
 
 #endif
Original file line number	Diff line number	Diff line change
`@@ -79,7 +79,10 @@ Status AttrSlice::Find(const string& attr_name,`
`79`	`79`	`return Status::OK();`
`80`	`80`	`}`
`81`	`81`	`Status s = errors::NotFound("No attr named '", attr_name, "' in NodeDef:");`
`82`		`- if (ndef_) {`
	`82`	`+ // Skip AttachDef for internal attrs since it is a little bit`
	`83`	`+ // expensive and it is common for them to correctly not be included`
	`84`	`+ // in a NodeDef.`
	`85`	`+ if (!StringPiece(attr_name).starts_with("_") && ndef_) {`
`83`	`86`	`s = AttachDef(s, *ndef_);`
`84`	`87`	`}`
`85`	`88`	`return s;`
Original file line number	Diff line number	Diff line change
`@@ -44,7 +44,7 @@ class DeterminantOp`
`44`	`44`	`const int64 rows = input_matrix_shape.dim_size(0);`
`45`	`45`	`if (rows > (1LL << 20)) {`
`46`	`46`	`// A big number to cap the cost in case overflow.`
`47`		`- return kint32max;`
	`47`	`+ return kint64max;`
`48`	`48`	`} else {`
`49`	`49`	`return rows * rows * rows;`
`50`	`50`	`}`
Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,7 @@ class MatrixInverseOp`
`45`	`45`	`const int64 rows = input_matrix_shape.dim_size(0);`
`46`	`46`	`if (rows > (1LL << 20)) {`
`47`	`47`	`// A big number to cap the cost in case overflow.`
`48`		`- return kint32max;`
	`48`	`+ return kint64max;`
`49`	`49`	`} else {`
`50`	`50`	`return rows * rows * rows;`
`51`	`51`	`}`