Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit a4806a3

Browse files
author
Vijay Vasudevan
committed
TensorFlow: upstream changes to git.
Change 109321497: Move all images to images directory to make docs versioning easier - adjust all paths in the docs to point to the new locations - remove some now-redundant section-order tags added for the old website. Change 109317807: Added a kernel op to compute the eigendecomposition of a self-adjoint matrix. Added a new kernel op called self_adjoint_eig (and a batch_self_adjoint_eig) that computes the eigendecomposition of a self-adjoint matrix. The return value is the concatenation of the eigenvalues as a row vector, and the eigenvectors. Change 109310773: Change `_read32()` in the MNIST input example to return an int. Currently we return a 1-D numpy array with 1 element. NumPy has recently deprecated the ability to treat this as a scalar, and as a result this tutorial fails. The fix returns the 0th element of the array instead. Change 109301269: Re-arrange TensorBoard demo files. Change 109273589: Add ci_build for ci.tensorflow.org. Change 109260293: Speed up NodeDef -> OpKernel process by not spending time generating an error message for missing "_kernel" attr that will be thrown away. Change 109257179: TensorFlow: make event_file_loader_test hermetic by using tempfile instead of fixed filenames. Without this change, running event_file_loader_test twice in the same client (locally) causes it to fail, because it writes into the same file and appends another event, instead of starting from scratch. Change 109256464: Minor cleanup in TensorBoard server code. Change 109255382: Change to reduce critical section times in gpu_event_mgr.h: (1) Call stream->ThenRecordEvent outside the EventMgr critical section; (2) Do memory deallocation outside the critical section. Speeds up one configuration of ptb_word_lm from 2924 words per second (wps) to 3278 wps on my desktop machine with a Titan X. Change 109254843: Fix use of uninitialized memory in test. Change 109250995: python_config.sh needs a license header. Otherwise the license test fails.
Change 109249914: Add ci_build for ci.tensorflow.org. Change 109249397: Fixes segfaults in reduce_sum (complex) on GPU. Fixes tensorflow#357. Change 109245652: Add ci_build for ci.tensorflow.org. Base CL: 109321563
1 parent bb7a7a8 commit a4806a3

64 files changed

Lines changed: 1016 additions & 298 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

tensorflow/cc/BUILD

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
# TensorFlow is a computational framework, primarily for use in machine
33
# learning applications.
44

5-
package(default_visibility = ["//tensorflow:internal"])
5+
package(
6+
default_visibility = ["//tensorflow:internal"],
7+
)
68

79
licenses(["notice"]) # Apache 2.0
810

tensorflow/core/BUILD

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
# TensorFlow is a computational framework, primarily for use in machine
33
# learning applications.
44

5-
package(default_visibility = ["//tensorflow:internal"])
5+
package(
6+
default_visibility = ["//tensorflow:internal"],
7+
)
68

79
package_group(name = "friends")
810

tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,13 @@ EventMgr::~EventMgr() {
4040
delete e;
4141
}
4242
while (!used_events_.empty()) {
43-
InUse* ue = &used_events_[0];
44-
delete ue->event;
45-
delete ue->mem;
46-
if (ue->bufrec.buf) {
47-
ue->bufrec.alloc->DeallocateRaw(ue->bufrec.buf);
43+
delete used_events_[0].event;
44+
delete used_events_[0].mem;
45+
if (used_events_[0].bufrec.buf) {
46+
used_events_[0].bufrec.alloc->DeallocateRaw(used_events_[0].bufrec.buf);
4847
}
49-
if (ue->func != nullptr) threadpool_.Schedule(ue->func);
48+
if (used_events_[0].func != nullptr)
49+
threadpool_.Schedule(used_events_[0].func);
5050
used_events_.pop_front();
5151
}
5252
}
@@ -60,17 +60,15 @@ EventMgr::~EventMgr() {
6060
void EventMgr::PollLoop() {
6161
while (!stop_polling_.HasBeenNotified()) {
6262
Env::Default()->SleepForMicroseconds(1 * 1000);
63-
ToFreeVector to_free;
6463
{
6564
mutex_lock l(mu_);
66-
PollEvents(true, &to_free);
65+
PollEvents(true);
6766
}
68-
FreeMemory(to_free);
6967
}
7068
polling_stopped_.Notify();
7169
}
7270

73-
void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu, gpu::Event** e) {
71+
void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu) {
7472
VLOG(2) << "QueueInUse free_events_ " << free_events_.size()
7573
<< " used_events_ " << used_events_.size();
7674
// Events are created on demand, and repeatedly reused. There is no
@@ -79,9 +77,10 @@ void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu, gpu::Event** e) {
7977
free_events_.push_back(new gpu::Event(exec_));
8078
free_events_.back()->Init();
8179
}
82-
*e = free_events_.back();
80+
gpu::Event* e = free_events_.back();
8381
free_events_.pop_back();
84-
iu.event = *e;
82+
stream->ThenRecordEvent(e);
83+
iu.event = e;
8584
used_events_.push_back(iu);
8685
}
8786

@@ -104,8 +103,7 @@ void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu, gpu::Event** e) {
104103
// GPU memory use to spike needlessly. An alternative strategy would
105104
// be to throttle new Op execution until the pending event queue
106105
// clears.
107-
void EventMgr::PollEvents(bool is_dedicated_poller,
108-
gtl::InlinedVector<InUse, 4>* to_free) {
106+
void EventMgr::PollEvents(bool is_dedicated_poller) {
109107
VLOG(2) << "PollEvents free_events_ " << free_events_.size()
110108
<< " used_events_ " << used_events_.size();
111109
// Sweep the remaining events in order. If this is the dedicated
@@ -125,9 +123,11 @@ void EventMgr::PollEvents(bool is_dedicated_poller,
125123
if (!is_dedicated_poller) return; // quit processing queue
126124
break;
127125
case gpu::Event::Status::kComplete:
128-
// Make a copy of the InUse record so we can free it after releasing
129-
// the lock
130-
to_free->push_back(iu);
126+
delete iu.mem;
127+
if (iu.bufrec.buf) iu.bufrec.alloc->DeallocateRaw(iu.bufrec.buf);
128+
// The function must be called in another thread, outside of
129+
// the mutex held here.
130+
if (iu.func != nullptr) threadpool_.Schedule(iu.func);
131131
free_events_.push_back(iu.event);
132132
// Mark this InUse record as completed.
133133
iu.event = nullptr;

tensorflow/core/common_runtime/gpu/gpu_event_mgr.h

Lines changed: 18 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,8 @@ limitations under the License.
1818

1919
#include <deque>
2020
#include <vector>
21-
#include "tensorflow/stream_executor/stream.h"
2221
#include "tensorflow/core/lib/core/notification.h"
2322
#include "tensorflow/core/lib/core/threadpool.h"
24-
#include "tensorflow/core/lib/gtl/inlined_vector.h"
2523
#include "tensorflow/core/platform/port.h"
2624
#include "tensorflow/core/platform/thread_annotations.h"
2725
#include "tensorflow/core/public/tensor.h"
@@ -49,15 +47,9 @@ class EventMgr {
4947
// currently enqueued on *stream have completed.
5048
inline void ThenDeleteTensors(perftools::gputools::Stream* stream,
5149
std::vector<Tensor>* tensors) {
52-
ToFreeVector to_free;
53-
::perftools::gputools::Event* e;
54-
{
55-
mutex_lock l(mu_);
56-
QueueTensors(stream, tensors, &e);
57-
PollEvents(false, &to_free);
58-
}
59-
stream->ThenRecordEvent(e);
60-
FreeMemory(to_free);
50+
mutex_lock l(mu_);
51+
QueueTensors(stream, tensors);
52+
PollEvents(false);
6153
}
6254

6355
struct BufRec {
@@ -69,28 +61,16 @@ class EventMgr {
6961
// on it as soon as all events currently enqueued on *stream have completed.
7062
inline void ThenDeleteBuffer(perftools::gputools::Stream* stream,
7163
BufRec bufrec) {
72-
ToFreeVector to_free;
73-
::perftools::gputools::Event* e;
74-
{
75-
mutex_lock l(mu_);
76-
QueueBuffer(stream, bufrec, &e);
77-
PollEvents(false, &to_free);
78-
}
79-
stream->ThenRecordEvent(e);
80-
FreeMemory(to_free);
64+
mutex_lock l(mu_);
65+
QueueBuffer(stream, bufrec);
66+
PollEvents(false);
8167
}
8268

8369
inline void ThenExecute(perftools::gputools::Stream* stream,
8470
std::function<void()> func) {
85-
ToFreeVector to_free;
86-
::perftools::gputools::Event* e;
87-
{
88-
mutex_lock l(mu_);
89-
QueueFunc(stream, func, &e);
90-
PollEvents(false, &to_free);
91-
}
92-
stream->ThenRecordEvent(e);
93-
FreeMemory(to_free);
71+
mutex_lock l(mu_);
72+
QueueFunc(stream, func);
73+
PollEvents(false);
9474
}
9575

9676
private:
@@ -105,50 +85,32 @@ class EventMgr {
10585
std::function<void()> func;
10686
};
10787

108-
typedef gtl::InlinedVector<InUse, 4> ToFreeVector;
109-
110-
void FreeMemory(const ToFreeVector& to_free) {
111-
for (const auto& iu : to_free) {
112-
delete iu.mem;
113-
if (iu.bufrec.buf) iu.bufrec.alloc->DeallocateRaw(iu.bufrec.buf);
114-
// The function must be called in another thread.
115-
if (iu.func != nullptr) threadpool_.Schedule(iu.func);
116-
}
117-
}
118-
11988
// Stream-enqueue an unused Event and save with it a collection of
12089
// Tensors and/or a BufRec to be deleted only after the Event
12190
// records.
122-
void QueueInUse(perftools::gputools::Stream* stream, InUse in_use,
123-
::perftools::gputools::Event** e)
91+
void QueueInUse(perftools::gputools::Stream* stream, InUse in_use)
12492
EXCLUSIVE_LOCKS_REQUIRED(mu_);
12593

12694
void QueueTensors(perftools::gputools::Stream* stream,
127-
std::vector<Tensor>* tensors,
128-
::perftools::gputools::Event** e)
95+
std::vector<Tensor>* tensors)
12996
EXCLUSIVE_LOCKS_REQUIRED(mu_) {
130-
QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr}, e);
97+
QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr});
13198
}
13299

133-
void QueueBuffer(perftools::gputools::Stream* stream, BufRec bufrec,
134-
::perftools::gputools::Event** e)
100+
void QueueBuffer(perftools::gputools::Stream* stream, BufRec bufrec)
135101
EXCLUSIVE_LOCKS_REQUIRED(mu_) {
136-
QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr}, e);
102+
QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr});
137103
}
138104

139105
void QueueFunc(perftools::gputools::Stream* stream,
140-
std::function<void()> func, ::perftools::gputools::Event** e)
141-
EXCLUSIVE_LOCKS_REQUIRED(mu_) {
142-
QueueInUse(stream, {nullptr, nullptr, BufRec(), func}, e);
106+
std::function<void()> func) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
107+
QueueInUse(stream, {nullptr, nullptr, BufRec(), func});
143108
}
144109

145110
// This function should be called at roughly the same tempo as
146111
// QueueTensors() to check whether pending events have recorded,
147-
// and then retire them. It appends InUse elements that need cleanup
148-
// to "*to_free". The caller should call FreeMemory(to_free)
149-
// when this returns.
150-
void PollEvents(bool is_dedicated_poller, ToFreeVector* to_free)
151-
EXCLUSIVE_LOCKS_REQUIRED(mu_);
112+
// and then retire them.
113+
void PollEvents(bool is_dedicated_poller) EXCLUSIVE_LOCKS_REQUIRED(mu_);
152114

153115
// An internal polling loop that runs at a low frequency to clear
154116
// straggler Events.

tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -42,21 +42,13 @@ class TEST_EventMgrHelper {
4242

4343
void QueueTensors(perftools::gputools::Stream* stream,
4444
std::vector<Tensor>* tensors) {
45-
::perftools::gputools::Event* e;
46-
{
47-
mutex_lock l(em_->mu_);
48-
em_->QueueTensors(stream, tensors, &e);
49-
}
50-
stream->ThenRecordEvent(e);
45+
mutex_lock l(em_->mu_);
46+
em_->QueueTensors(stream, tensors);
5147
}
5248

5349
void PollEvents(bool is_dedicated_poller) {
54-
EventMgr::ToFreeVector to_free;
55-
{
56-
mutex_lock l(em_->mu_);
57-
em_->PollEvents(is_dedicated_poller, &to_free);
58-
}
59-
em_->FreeMemory(to_free);
50+
mutex_lock l(em_->mu_);
51+
em_->PollEvents(is_dedicated_poller);
6052
}
6153

6254
private:

tensorflow/core/framework/node_def_util.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,10 @@ Status AttrSlice::Find(const string& attr_name,
7979
return Status::OK();
8080
}
8181
Status s = errors::NotFound("No attr named '", attr_name, "' in NodeDef:");
82-
if (ndef_) {
82+
// Skip AttachDef for internal attrs since it is a little bit
83+
// expensive and it is common for them to correctly not be included
84+
// in a NodeDef.
85+
if (!StringPiece(attr_name).starts_with("_") && ndef_) {
8386
s = AttachDef(s, *ndef_);
8487
}
8588
return s;

tensorflow/core/kernels/cholesky_op.cc

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ class CholeskyOp
4646
const int64 rows = input_matrix_shape.dim_size(0);
4747
if (rows > (1LL << 20)) {
4848
// A big number to cap the cost in case overflow.
49-
return kint32max;
49+
return kint64max;
5050
} else {
5151
return rows * rows * rows;
5252
}
@@ -69,8 +69,9 @@ class CholeskyOp
6969
// Perform the actual LL^T Cholesky decomposition. This will only use
7070
// the lower triangular part of data_in by default. The upper triangular
7171
// part of the matrix will not be read.
72-
Eigen::LLT<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic,
73-
Eigen::RowMajor>> llt_decomposition(input);
72+
Eigen::LLT<
73+
Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
74+
llt_decomposition(input);
7475

7576
// Output the lower triangular in a dense form.
7677
*output = llt_decomposition.matrixL();

tensorflow/core/kernels/determinant_op.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ class DeterminantOp
4444
const int64 rows = input_matrix_shape.dim_size(0);
4545
if (rows > (1LL << 20)) {
4646
// A big number to cap the cost in case overflow.
47-
return kint32max;
47+
return kint64max;
4848
} else {
4949
return rows * rows * rows;
5050
}

tensorflow/core/kernels/matrix_inverse_op.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ class MatrixInverseOp
4545
const int64 rows = input_matrix_shape.dim_size(0);
4646
if (rows > (1LL << 20)) {
4747
// A big number to cap the cost in case overflow.
48-
return kint32max;
48+
return kint64max;
4949
} else {
5050
return rows * rows * rows;
5151
}

tensorflow/core/kernels/reduction_ops_sum.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,10 @@ REGISTER_GPU_KERNELS(float);
4444
#undef REGISTER_GPU_KERNELS
4545

4646
REGISTER_KERNEL_BUILDER(
47-
Name("Sum").Device(DEVICE_GPU).TypeConstraint<complex64>("T"),
47+
Name("Sum")
48+
.Device(DEVICE_GPU)
49+
.TypeConstraint<complex64>("T")
50+
.HostMemory("reduction_indices"),
4851
ReductionOp<GPUDevice, complex64, Eigen::internal::SumReducer<complex64>>);
4952

5053
#endif

0 commit comments

Comments
 (0)