
Commit ab34d55

Authored and committed by Vijay Vasudevan
TensorFlow: more features, performance improvements, and doc fixes.
Changes:
- Add Split()/Concat() methods to TensorUtil (meant for convenience, not speed), by Chris.
- Changes to the linear algebra ops interface, by Rasmus.
- Tests for TensorBoard, by Daniel.
- Fix a bug in the histogram calculation, by Cassandra.
- Added a tool for backwards compatibility of OpDefs: it checks in a history of OpDefs and their changes, and checks for backwards-incompatible changes. All done by @josh11b.
- Fix some protobuf example proto docs, by Oliver.
- Add the derivative of MatrixDeterminant, by @yaroslavvb.
- Add a priority queue, by @ebrevdo.
- Doc and typo fixes, by Aurelien and @dave-andersen.
- Speed improvements to ConvBackwardFilter, by @AndyDavis.
- Improve the speed of AlexNet on Titan X, by @zheng-xq.
- Add some host memory annotations to some GPU kernels, by Yuan.
- Add support for doubles in the histogram summary, by @jmchen-g.

Base CL: 108158338
1 parent 9eb88d5 commit ab34d55

111 files changed, 11229 insertions(+), 2753 deletions(-)


tensorflow/core/example/example.proto

Lines changed: 12 additions & 12 deletions
@@ -11,39 +11,39 @@ package tensorflow;
 // features {
 //   feature {
 //     key: "age"
-//     float_list {
+//     value { float_list {
 //       value: 29.0
-//     }
+//     }}
 //   }
 //   feature {
 //     key: "movie"
-//     bytes_list {
+//     value { bytes_list {
 //       value: "The Shawshank Redemption"
 //       value: "Fight Club"
-//     }
+//     }}
 //   }
 //   feature {
 //     key: "movie_ratings"
-//     float_list {
+//     value { float_list {
 //       value: 9.0
 //       value: 9.7
-//     }
+//     }}
 //   }
 //   feature {
 //     key: "suggestion"
-//     bytes_list {
+//     value { bytes_list {
 //       value: "Inception"
-//     }
+//     }}
 //   }
 //   # Note that this feature exists to be used as a label in training.
 //   # E.g., if training a logistic regression model to predict purchase
 //   # probability in our learning tool we would set the label feature to
 //   # "suggestion_purchased".
 //   feature {
 //     key: "suggestion_purchased"
-//     float_list {
+//     value { float_list {
 //       value: 1.0
-//     }
+//     }}
 //   }
 //   # Similar to "suggestion_purchased" above this feature exists to be used
 //   # as a label in training.
@@ -52,9 +52,9 @@ package tensorflow;
 //   # "purchase_price".
 //   feature {
 //     key: "purchase_price"
-//     float_list {
+//     value { float_list {
 //       value: 9.99
-//     }
+//     }}
 //   }
 // }
 //

tensorflow/core/example/feature.proto

Lines changed: 12 additions & 12 deletions
@@ -14,41 +14,41 @@
 // Example Features for a movie recommendation application:
 //   feature {
 //     key: "age"
-//     float_list {
+//     value { float_list {
 //       value: 29.0
-//     }
+//     }}
 //   }
 //   feature {
 //     key: "movie"
-//     bytes_list {
+//     value { bytes_list {
 //       value: "The Shawshank Redemption"
 //       value: "Fight Club"
-//     }
+//     }}
 //   }
 //   feature {
 //     key: "movie_ratings"
-//     float_list {
+//     value { float_list {
 //       value: 9.0
 //       value: 9.7
-//     }
+//     }}
 //   }
 //   feature {
 //     key: "suggestion"
-//     bytes_list {
+//     value { bytes_list {
 //       value: "Inception"
-//     }
+//     }}
 //   }
 //   feature {
 //     key: "suggestion_purchased"
-//     int64_list {
+//     value { int64_list {
 //       value: 1
-//     }
+//     }}
 //   }
 //   feature {
 //     key: "purchase_price"
-//     float_list {
+//     value { float_list {
 //       value: 9.99
-//     }
+//     }}
 //   }
 
 syntax = "proto3";
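
Note on the doc fix above: the corrected comments follow the proto3 text format for map-style feature entries, where each entry is printed as a key/value pair and the Feature message sits inside "value { ... }". For orientation only, a minimal C++ sketch of building an Example that prints in that form (this assumes the generated classes from example.proto and feature.proto and the standard proto3 C++ API; it is not part of this commit):

    // Sketch only: assumes the tensorflow::Example / tensorflow::Feature
    // classes generated from the protos above.
    #include "tensorflow/core/example/example.pb.h"

    tensorflow::Example MakeExample() {
      tensorflow::Example example;
      auto* feature_map = example.mutable_features()->mutable_feature();

      // Prints as: feature { key: "age" value { float_list { value: 29.0 } } }
      tensorflow::Feature age;
      age.mutable_float_list()->add_value(29.0f);
      (*feature_map)["age"] = age;

      // Prints as: feature { key: "movie" value { bytes_list { ... } } }
      tensorflow::Feature movie;
      movie.mutable_bytes_list()->add_value("The Shawshank Redemption");
      movie.mutable_bytes_list()->add_value("Fight Club");
      (*feature_map)["movie"] = movie;

      return example;
    }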

tensorflow/core/framework/tensor_util.cc

Lines changed: 102 additions & 0 deletions
@@ -24,5 +24,107 @@ Tensor DeepCopy(const Tensor& other) {
   return tmp;
 }
 
+Tensor Concat(const gtl::ArraySlice<Tensor>& tensors) {
+  CHECK_GT(tensors.size(), 0);
+  int64 total_dim0_size = 0;
+  for (const Tensor& tensor : tensors) {
+    CHECK_GT(tensor.dims(), 0);
+    total_dim0_size += tensor.dim_size(0);
+  }
+  TensorShape shape = tensors[0].shape();
+  shape.set_dim(0, total_dim0_size);
+  Tensor result = Tensor(tensors[0].dtype(), shape);
+
+  // We use StringPiece as a convenient map over the tensor buffer,
+  // but we cast the type to get to the underlying buffer to do the
+  // copy.
+  StringPiece to_data = result.tensor_data();
+
+  if (DataTypeCanUseMemcpy(result.dtype())) {
+    int64 offset = 0;
+    for (const Tensor& tensor : tensors) {
+      StringPiece from_data = tensor.tensor_data();
+      CHECK_LE(offset + from_data.size(), to_data.size());
+      memcpy(const_cast<char*>(to_data.data()) + offset, from_data.data(),
+             from_data.size());
+
+      offset += from_data.size();
+    }
+  } else {
+    CHECK_EQ(DT_STRING, result.dtype());
+    string* to_strings =
+        reinterpret_cast<string*>(const_cast<char*>(to_data.data()));
+
+    int64 offset = 0;
+    for (const Tensor& tensor : tensors) {
+      auto from_strings = tensor.flat<string>();
+      CHECK_LE(offset + tensor.NumElements(), result.NumElements());
+      for (int i = 0; i < tensor.NumElements(); ++i) {
+        to_strings[offset + i] = from_strings(i);
+      }
+
+      offset += tensor.NumElements();
+    }
+  }
+
+  return result;
+}
+
+std::vector<Tensor> Split(const Tensor& tensor,
+                          const gtl::ArraySlice<int64>& sizes) {
+  CHECK_GT(tensor.dims(), 0);
+  int64 total_size = 0;
+  for (int64 size : sizes) {
+    total_size += size;
+  }
+  CHECK_EQ(total_size, tensor.dim_size(0));
+
+  std::vector<Tensor> result;
+
+  StringPiece from_data = tensor.tensor_data();
+
+  if (DataTypeCanUseMemcpy(tensor.dtype())) {
+    int64 offset = 0;
+    for (int64 size : sizes) {
+      TensorShape shape = tensor.shape();
+      shape.set_dim(0, size);
+      result.emplace_back(tensor.dtype(), shape);
+      Tensor* split = &result[result.size() - 1];
+
+      // We use StringPiece as a convenient map over the tensor buffer,
+      // but we cast the type to get to the underlying buffer to do the
+      // copy.
+      StringPiece to_data = split->tensor_data();
+      CHECK_LE(offset + to_data.size(), from_data.size());
+      memcpy(const_cast<char*>(to_data.data()), from_data.data() + offset,
+             to_data.size());
+
+      offset += to_data.size();
+    }
+  } else {
+    CHECK_EQ(DT_STRING, tensor.dtype());
+    auto from_strings = tensor.flat<string>();
+
+    int64 offset = 0;
+    for (int64 size : sizes) {
+      TensorShape shape = tensor.shape();
+      shape.set_dim(0, size);
+      result.emplace_back(tensor.dtype(), shape);
+      Tensor& split = result[result.size() - 1];
+      string* to_strings = reinterpret_cast<string*>(
+          const_cast<char*>(split.tensor_data().data()));
+
+      CHECK_LE(offset + split.NumElements(), tensor.NumElements());
+      for (int i = 0; i < split.NumElements(); ++i) {
+        to_strings[i] = from_strings(offset + i);
+      }
+
+      offset += split.NumElements();
+    }
+  }
+
+  return result;
+}
+
 } // namespace tensor
 } // namespace tensorflow

tensorflow/core/framework/tensor_util.h

Lines changed: 22 additions & 0 deletions
@@ -15,6 +15,28 @@ namespace tensor {
 // 'other' is not appropriately memory-aligned.
 Tensor DeepCopy(const Tensor& other);
 
+// Concatenates 'tensors' into a single tensor, along their 0th dimension.
+//
+// REQUIRES: All members of 'tensors' must have the same data type parameter.
+// REQUIRES: Each member of 'tensors' must have at least one dimension.
+// REQUIRES: Each member of 'tensors' must point to data stored in CPU memory.
+// REQUIRES: Each member of 'tensors' must be a Tensor of a copy-able type if it
+// is not appropriately memory-aligned.
+Tensor Concat(const gtl::ArraySlice<Tensor>& tensors);
+
+// Splits 'tensor' into 'sizes.size()' individual tensors, along the 0th
+// dimension. The ith output tensor has 0th-dimension size 'sizes[i]'.
+//
+// REQUIRES: 'tensor' must have at least one dimension.
+// REQUIRES: 'tensor.dim_size(0)' must equal the sum of the elements of 'sizes'.
+// REQUIRES: 'tensor' must point to data stored in CPU memory.
+// REQUIRES: 'tensor' must be a Tensor of a copy-able type if it is not
+// appropriately memory-aligned.
+//
+// Split() and Concat() are inverse operations.
+std::vector<Tensor> Split(const Tensor& tensor,
+                          const gtl::ArraySlice<int64>& sizes);
+
 } // namespace tensor
 } // namespace tensorflow
 
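
As a quick orientation for the new helpers, here is a minimal usage sketch (it mirrors the unit tests below and assumes only the declarations above; it is not part of this commit):

    #include <vector>
    #include "tensorflow/core/framework/tensor.h"
    #include "tensorflow/core/framework/tensor_util.h"

    namespace tensorflow {

    void RoundTripExample() {
      // A [10, 2] tensor split into row blocks of 1, 4, and 5 rows.
      Tensor t(DT_FLOAT, TensorShape({10, 2}));
      std::vector<Tensor> parts = tensor::Split(t, {1, 4, 5});

      // Concat is the inverse: 'whole' has shape [10, 2] again, and the data
      // is copied rather than aliased (the ConcatSplitStrings test below
      // checks exactly this).
      Tensor whole = tensor::Concat(parts);
    }

    }  // namespace tensorflow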

tensorflow/core/framework/tensor_util_test.cc

Lines changed: 76 additions & 0 deletions
@@ -120,5 +120,81 @@ TEST(TensorUtil, DeepCopySlice) {
   }
 }
 
+TEST(TensorUtil, Concat) {
+  std::vector<int64> sizes = {1, 4, 5};
+  std::vector<Tensor> to_concat;
+  int64 total_size = 0;
+  int offset = 0;
+  for (int entry = 0; entry < sizes.size(); ++entry) {
+    const int64 size = sizes[entry];
+    Tensor tensor(DT_INT32, TensorShape({size, 2}));
+    for (int i = offset; i < offset + size; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        tensor.matrix<int32>()(i - offset, j) = 2 * i + j;
+      }
+    }
+    to_concat.push_back(tensor);
+    total_size += size;
+    offset += size;
+  }
+
+  Tensor concated = tensor::Concat(to_concat);
+  ASSERT_EQ(TensorShape({total_size, 2}), concated.shape());
+  for (int i = 0; i < total_size; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      EXPECT_EQ(2 * i + j, concated.matrix<int32>()(i, j));
+    }
+  }
+}
+
+TEST(TensorUtil, Split) {
+  Tensor to_split(DT_INT64, TensorShape({10, 2}));
+  for (int i = 0; i < 10; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      to_split.matrix<int64>()(i, j) = 2 * i + j;
+    }
+  }
+
+  std::vector<int64> sizes = {1, 4, 5};
+  std::vector<Tensor> splits = tensor::Split(to_split, sizes);
+  ASSERT_EQ(sizes.size(), splits.size());
+
+  int offset = 0;
+  for (int entry = 0; entry < splits.size(); ++entry) {
+    const int64 size = sizes[entry];
+    const Tensor& split = splits[entry];
+
+    ASSERT_EQ(TensorShape({size, 2}), split.shape());
+    for (int i = offset; i < offset + size; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        EXPECT_EQ(2 * i + j, split.matrix<int64>()(i - offset, j));
+      }
+    }
+
+    offset += size;
+  }
+}
+
+TEST(TensorUtil, ConcatSplitStrings) {
+  Tensor x(DT_STRING, TensorShape({4, 3}));
+  for (int i = 0; i < 4 * 3; ++i) {
+    x.flat<string>()(i) = strings::StrCat("foo_", i);
+  }
+
+  Tensor x_round_tripped = tensor::Concat(tensor::Split(x, {2, 1, 1}));
+  ASSERT_EQ(x.shape(), x_round_tripped.shape());
+  for (int i = 0; i < 4 * 3; ++i) {
+    EXPECT_EQ(x.flat<string>()(i), x_round_tripped.flat<string>()(i));
+  }
+
+  // Ensure that no memory is being shared between 'x' and 'x_round_tripped'.
+  for (int i = 0; i < 4 * 3; ++i) {
+    x_round_tripped.flat<string>()(i) = strings::StrCat("bar_", i);
+  }
+  for (int i = 0; i < 4 * 3; ++i) {
+    EXPECT_NE(x.flat<string>()(i), x_round_tripped.flat<string>()(i));
+  }
+}
+
 } // namespace
 } // namespace tensorflow

tensorflow/core/framework/types.h

Lines changed: 5 additions & 0 deletions
@@ -5,7 +5,12 @@
 #include <set>
 #include <string>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+// Disable clang-format to prevent 'FixedPoint' header from being included
+// before 'Tensor' header on which it depends.
+// clang-format off
 #include "third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint"
+// clang-format on
 #include "tensorflow/core/framework/bfloat16.h"
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/framework/types.pb.h"

tensorflow/core/kernels/bias_op_gpu.cu.cc

Lines changed: 35 additions & 0 deletions
@@ -2,14 +2,49 @@
 
 #define EIGEN_USE_GPU
 
+#include <algorithm>
+
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/bias_op.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
 // Definition of the GPU implementations declared in bias_op.cc.
+
+namespace functor {
+
+template <typename T>
+__global__ void BiasOpCustomKernel(int nthreads, const T* input, const T* bias,
+                                   int bias_size, int replicate_count,
+                                   T* output) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int bias_offset = index % bias_size;
+    output[index] = input[index] + bias[bias_offset];
+  }
+}
+
+template <typename T, int Dims>
+struct Bias<GPUDevice, T, Dims> {
+  typedef GPUDevice Device;
+  // Add "bias" to "input", broadcasting it on all dimensions but the last one.
+  void operator()(const Device& d, typename TTypes<T, Dims>::ConstTensor input,
+                  typename TTypes<T>::ConstVec bias,
+                  typename TTypes<T, Dims>::Tensor output) {
+    const int bias_size = bias.dimension(0);
+    const int rest_size = input.size() / bias_size;
+    CudaLaunchConfig config = GetCudaLaunchConfig(output.size(), d);
+    BiasOpCustomKernel<<<config.block_count, config.thread_per_block, 0,
+                         d.stream()>>>(config.virtual_thread_count,
+                                       input.data(), bias.data(), bias_size,
+                                       rest_size, output.data());
+  }
+};
+
+} // namespace functor
+
 #define DEFINE_GPU_SPECS(T) \
   template struct functor::Bias<GPUDevice, T, 2>; \
   template struct functor::Bias<GPUDevice, T, 3>; \