Codestin Search App

173 lines (147 loc) · 7.7 KB
#include "module.h"
#include <ctranslate2/encoder.h>
#include "replica_pool.h"
namespace ctranslate2 {
  namespace python {
    // Python-facing wrapper around a pool of Encoder replicas.
    // Constructors are inherited unchanged from ReplicaPoolHelper<Encoder>.
    class EncoderWrapper : public ReplicaPoolHelper<Encoder> {
    public:
      using ReplicaPoolHelper::ReplicaPoolHelper;

      // Forwards one batch through the encoder pool and blocks until the
      // result is available.
      //
      // inputs:         one of three alternatives, dispatched on the variant index:
      //                   0 -> BatchTokens, 1 -> BatchIds, 2 -> StorageView (dense input).
      // lengths:        only read for the StorageView alternative, where it is mandatory.
      // token_type_ids: optional; an empty batch is passed to the pool when absent.
      //
      // Returns the EncoderForwardOutput produced by the pool.
      // Throws std::invalid_argument when a StorageView input comes without lengths.
      EncoderForwardOutput
      forward_batch(const std::variant<BatchTokens, BatchIds, StorageView>& inputs,
                    const std::optional<StorageView>& lengths,
                    const std::optional<BatchIds>& token_type_ids) {
        std::future<EncoderForwardOutput> future;

        // The shared lock on _mutex stays held until the function returns, i.e.
        // also while waiting on the future below.
        // NOTE(review): presumably this keeps load/unload from running during a
        // forward pass -- confirm against ReplicaPoolHelper.
        std::shared_lock lock(_mutex);
        assert_model_is_ready();

        switch (inputs.index()) {
        case 0:
          future = _pool->forward_batch_async(
            std::get<BatchTokens>(inputs),
            token_type_ids.value_or(std::vector<std::vector<size_t>>()));
          break;
        case 1:
          future = _pool->forward_batch_async(
            std::get<BatchIds>(inputs),
            token_type_ids.value_or(std::vector<std::vector<size_t>>()));
          break;
        case 2:
          // A dense input carries no per-sequence length information of its own.
          if (!lengths)
            throw std::invalid_argument("lengths vector is required when passing a dense input");
          future = _pool->forward_batch_async(
            std::get<StorageView>(inputs),
            lengths.value(),
            token_type_ids.value_or(std::vector<std::vector<size_t>>()));
          break;
        }

        return future.get();
      }
    };
    // Registers the encoder bindings on the Python module `m`:
    //   - "EncoderForwardOutput": read-only view of EncoderForwardOutput plus a __repr__.
    //   - "Encoder": the EncoderWrapper class with its constructor, read-only
    //     properties, and the forward_batch / unload_model / load_model methods.
    void register_encoder(py::module& m) {
      py::class_<EncoderForwardOutput>(m, "EncoderForwardOutput",
                                       "Forward output of an encoder model.")
        .def_readonly("last_hidden_state", &EncoderForwardOutput::last_hidden_state,
                      "Output of the last layer.")
        .def_readonly("pooler_output", &EncoderForwardOutput::pooler_output,
                      "Output of the pooling layer.")
        // The repr is assembled from the Python repr of each field.
        .def("__repr__", [](const EncoderForwardOutput& output) {
          return "EncoderForwardOutput(last_hidden_state="
            + std::string(py::repr(py::cast(output.last_hidden_state)))
            + ", pooler_output=" + std::string(py::repr(py::cast(output.pooler_output)))
            + ")";
        })
        ;

      py::class_<EncoderWrapper>(
        m, "Encoder",
        R"pbdoc(
            A text encoder.
            Example:
                >>> encoder = ctranslate2.Encoder("model/", device="cpu")
                >>> encoder.forward_batch([["▁Hello", "▁world", "!"]])
        )pbdoc")
        // Everything after `device` is keyword-only (py::kw_only()).
        .def(py::init<const std::string&, const std::string&, const std::variant<int, std::vector<int>>&, const StringOrMap&, size_t, size_t, long, bool, bool, py::object>(),
             py::arg("model_path"),
             py::arg("device")="cpu",
             py::kw_only(),
             py::arg("device_index")=0,
             py::arg("compute_type")="default",
             py::arg("inter_threads")=1,
             py::arg("intra_threads")=0,
             py::arg("max_queued_batches")=0,
             py::arg("flash_attention")=false,
             py::arg("tensor_parallel")=false,
             py::arg("files")=py::none(),
             R"pbdoc(
                 Initializes the encoder.
                 Arguments:
                   model_path: Path to the CTranslate2 model directory.
                   device: Device to use (possible values are: cpu, cuda, auto).
                   device_index: Device IDs where to place this encoder on.
                   compute_type: Model computation type or a dictionary mapping a device name
                     to the computation type (possible values are: default, auto, int8, int8_float32,
                     int8_float16, int8_bfloat16, int16, float16, bfloat16, float32).
                   inter_threads: Maximum number of parallel generations.
                   intra_threads: Number of OpenMP threads per encoder (0 to use a default value).
                   max_queued_batches: Maximum numbers of batches in the queue (-1 for unlimited,
                     0 for an automatic value). When the queue is full, future requests will block
                     until a free slot is available.
                   flash_attention: run model with flash attention 2 for self-attention layer
                   tensor_parallel: run model with tensor parallel mode
                   files: Load model files from the memory. This argument is a dictionary mapping
                     file names to file contents as file-like or bytes objects. If this is set,
                     :obj:`model_path` acts as an identifier for this model.
             )pbdoc")

        // Read-only introspection properties.
        .def_property_readonly("device", &EncoderWrapper::device,
                               "Device this encoder is running on.")
        .def_property_readonly("device_index", &EncoderWrapper::device_index,
                               "List of device IDs where this encoder is running on.")
        .def_property_readonly("compute_type", &EncoderWrapper::compute_type,
                               "Computation type used by the model.")
        .def_property_readonly("num_encoders", &EncoderWrapper::num_replicas,
                               "Number of encoders backing this instance.")
        .def_property_readonly("num_queued_batches", &EncoderWrapper::num_queued_batches,
                               "Number of batches waiting to be processed.")
        .def_property_readonly("tensor_parallel", &EncoderWrapper::tensor_parallel,
                               "Run model with tensor parallel mode.")
        .def_property_readonly("num_active_batches", &EncoderWrapper::num_active_batches,
                               "Number of batches waiting to be processed or currently processed.")

        // The methods below are bound with a gil_scoped_release call guard, so
        // the GIL is released for the duration of each call.
        .def("forward_batch", &EncoderWrapper::forward_batch,
             py::arg("inputs"),
             py::arg("lengths")=py::none(),
             py::arg("token_type_ids")=py::none(),
             py::call_guard<py::gil_scoped_release>(),
             R"pbdoc(
                 Forwards a batch of sequences in the encoder.
                 Arguments:
                   inputs: A batch of sequences either as string tokens or token IDs.
                     This argument can also be a dense int32 array with shape
                     ``[batch_size, max_length]`` (e.g. created from a Numpy array or PyTorch tensor).
                   lengths: The length of each sequence as a int32 array with shape
                     ``[batch_size]``. Required when :obj:`inputs` is a dense array.
                   token_type_ids: A batch of token type IDs of same shape as :obj:`inputs`.
                     ``[batch_size, max_length]``.
                 Returns:
                   The encoder model output.
             )pbdoc")
        .def("unload_model", &EncoderWrapper::unload_model,
             py::arg("to_cpu")=false,
             py::call_guard<py::gil_scoped_release>(),
             R"pbdoc(
                 Unloads the model attached to this encoder but keep enough runtime context
                 to quickly resume encoder on the initial device.
                 Arguments:
                   to_cpu: If ``True``, the model is moved to the CPU memory and not fully unloaded.
             )pbdoc")
        .def("load_model", &EncoderWrapper::load_model,
             py::arg("keep_cache")=false,
             py::call_guard<py::gil_scoped_release>(),
             R"pbdoc(
                 Loads the model back to the initial device.
                 Arguments:
                   keep_cache: If ``True``, the model cache in the CPU memory is not deleted if it exists.
             )pbdoc")
        .def_property_readonly("model_is_loaded", &EncoderWrapper::model_is_loaded,
                               "Whether the model is loaded on the initial device and ready to be used.")
        ;
    }
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

encoder.cc

Latest commit

History

encoder.cc

File metadata and controls