Codestin Search App

History

2089 lines (1919 loc) · 74.3 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

// Redistribution and use in source and binary forms, with or without

// modification, are permitted provided that the following conditions

// are met:

// * Redistributions of source code must retain the above copyright

// notice, this list of conditions and the following disclaimer.

// * Redistributions in binary form must reproduce the above copyright

// notice, this list of conditions and the following disclaimer in the

// documentation and/or other materials provided with the distribution.

// * Neither the name of NVIDIA CORPORATION nor the names of its

// contributors may be used to endorse or promote products derived

// from this software without specific prior written permission.

// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY

// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR

// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR

// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY

// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifdef _WIN32

#define NOMINMAX

#define WIN32_LEAN_AND_MEAN

#include <windows.h>

#include <winsock2.h>

#include <ws2tcpip.h>

#pragma comment(lib, "ws2_32.lib")

#endif

#ifndef _WIN32

#include <getopt.h>

#include <unistd.h>

#endif

#include <stdint.h>

#include <algorithm>

#include <cctype>

#include <iomanip>

#include <iostream>

#include <list>

#include <set>

#include <sstream>

#include <thread>

#include "triton_signal.h"

#ifdef TRITON_ENABLE_ASAN

#include <sanitizer/lsan_interface.h>

#endif // TRITON_ENABLE_ASAN

#include "common.h"

#include "shared_memory_manager.h"

#include "tracer.h"

#include "triton/common/logging.h"

#include "triton/core/tritonserver.h"

#if defined(TRITON_ENABLE_HTTP) || defined(TRITON_ENABLE_METRICS)

#include "http_server.h"

#endif // TRITON_ENABLE_HTTP|| TRITON_ENABLE_METRICS

#ifdef TRITON_ENABLE_SAGEMAKER

#include "sagemaker_server.h"

#endif // TRITON_ENABLE_SAGEMAKER

#ifdef TRITON_ENABLE_VERTEX_AI

#include "vertex_ai_server.h"

#endif // TRITON_ENABLE_VERTEX_AI

#ifdef TRITON_ENABLE_GRPC

#include "grpc_server.h"

#endif // TRITON_ENABLE_GRPC

#ifdef TRITON_ENABLE_GPU

// Compile-time sanity check (GPU builds only): reject a build whose

// TRITON_MIN_COMPUTE_CAPABILITY is below 1.0. The macro is not defined in

// this file; presumably it is supplied by the build system -- TODO confirm.

static_assert(

TRITON_MIN_COMPUTE_CAPABILITY >= 1.0,

"Invalid TRITON_MIN_COMPUTE_CAPABILITY specified");

#endif // TRITON_ENABLE_GPU

namespace {

// Interval, in seconds, when the model repository is polled for

// changes.

int32_t repository_poll_secs_ = 15;

// The HTTP, GRPC and metrics service/s and ports. Initialized to

// default values and modified based on command-line args.

#ifdef TRITON_ENABLE_HTTP

// HTTP front-end state. allow_http_, http_address_ and http_port_ are the

// values CheckPortCollision() registers for the "HTTP" service.

std::unique_ptr<triton::server::HTTPServer> http_service_;

bool allow_http_ = true;

int32_t http_port_ = 8000;

bool reuse_http_port_ = false;

std::string http_address_ = "0.0.0.0";

#endif // TRITON_ENABLE_HTTP

#ifdef TRITON_ENABLE_SAGEMAKER

// SageMaker front-end state. Disabled unless explicitly allowed.

std::unique_ptr<triton::server::HTTPServer> sagemaker_service_;

bool allow_sagemaker_ = false;

int32_t sagemaker_port_ = 8080;

// Triton uses "0.0.0.0" as default address for SageMaker.

std::string sagemaker_address_ = "0.0.0.0";

// When sagemaker_safe_range_set_ is true, CheckPortCollision() requires the

// ports of the other services on the same address to lie within

// [sagemaker_safe_range_.first, sagemaker_safe_range_.second]. The {-1, -1}

// initial value is a placeholder used while no range has been set.

bool sagemaker_safe_range_set_ = false;

std::pair<int32_t, int32_t> sagemaker_safe_range_ = {-1, -1};

// The number of threads to initialize for the SageMaker HTTP front-end.

int sagemaker_thread_cnt_ = 8;

#endif // TRITON_ENABLE_SAGEMAKER

#ifdef TRITON_ENABLE_VERTEX_AI

// Vertex AI front-end state.

std::unique_ptr<triton::server::HTTPServer> vertex_ai_service_;

// Triton uses "0.0.0.0" as default address for Vertex AI.

std::string vertex_ai_address_ = "0.0.0.0";

// NOTE(review): the --allow-vertex-ai / --vertex-ai-port help text says the

// effective defaults depend on AIP_MODE / AIP_HTTP_PORT; that logic is not in

// this section, these are only the static initial values.

bool allow_vertex_ai_ = false;

int32_t vertex_ai_port_ = 8080;

// The number of threads to initialize for the Vertex AI HTTP front-end.

int vertex_ai_thread_cnt_ = 8;

// Empty by default; see the --vertex-ai-default-model option.

std::string vertex_ai_default_model_;

#endif // TRITON_ENABLE_VERTEX_AI

#ifdef TRITON_ENABLE_GRPC

// GRPC front-end state. allow_grpc_, grpc_address_ and grpc_port_ are the

// values CheckPortCollision() registers for the "GRPC" service.

std::unique_ptr<triton::server::GRPCServer> grpc_service_;

bool allow_grpc_ = true;

int32_t grpc_port_ = 8001;

bool reuse_grpc_port_ = false;

std::string grpc_address_ = "0.0.0.0";

// SSL is off by default; grpc_ssl_options_ is default-constructed here.

bool grpc_use_ssl_ = false;

triton::server::SslOptions grpc_ssl_options_;

// Response compression is off (NONE) by default.

grpc_compression_level grpc_response_compression_level_ =

GRPC_COMPRESS_LEVEL_NONE;

// KeepAlive defaults: https://grpc.github.io/grpc/cpp/md_doc_keepalive.html

triton::server::KeepAliveOptions grpc_keepalive_options_;

#endif // TRITON_ENABLE_GRPC

#ifdef TRITON_ENABLE_METRICS

// Metrics endpoint state. CheckPortCollision() registers the "metrics"

// service with http_address_ and metrics_port_.

std::unique_ptr<triton::server::HTTPServer> metrics_service_;

bool allow_metrics_ = true;

int32_t metrics_port_ = 8002;

// Initial value 2000, matching the "Default is 2000 milliseconds" statement

// in the --metrics-interval-ms help text.

float metrics_interval_ms_ = 2000;

#ifndef TRITON_ENABLE_HTTP

// Triton uses the same address for http and metrics services.

// Need to set http address for metrics when http service is disabled.

std::string http_address_ = "0.0.0.0";

#endif // NOT TRITON_ENABLE_HTTP

#endif // TRITON_ENABLE_METRICS

#ifdef TRITON_ENABLE_TRACING

// Tracing settings. Initial values match the defaults stated in the

// --trace-* help text (level OFF, rate 1000, count -1, log frequency 0).

std::string trace_filepath_;

TRITONSERVER_InferenceTraceLevel trace_level_ =

TRITONSERVER_TRACE_LEVEL_DISABLED;

int32_t trace_rate_ = 1000;

int32_t trace_count_ = -1;

int32_t trace_log_frequency_ = 0;

#endif // TRITON_ENABLE_TRACING

#if defined(TRITON_ENABLE_GRPC)

// The maximum number of inference request/response objects that

// remain allocated for reuse. As long as the number of in-flight

// requests doesn't exceed this value there will be no

// allocation/deallocation of request/response objects.

int grpc_infer_allocation_pool_size_ = 8;

#endif // TRITON_ENABLE_GRPC

#if defined(TRITON_ENABLE_HTTP)

// The number of threads to initialize for the HTTP front-end.

int http_thread_cnt_ = 8;

#endif // TRITON_ENABLE_HTTP

#ifdef _WIN32

// Minimal implementation of <getopt.h> for Windows

// Values stored in option::has_arg_. NOTE(review): no_argument is 2 here,

// which differs from the 0 used by POSIX <getopt.h>; code in this file only

// compares against these macros, so the exact numbers are local to the shim.

#define required_argument 1

#define no_argument 2

// Index into argv of the next argument getopt_long() will examine. Starts at

// 1 so that argv[0] is skipped.

int optind = 1;

// Set by getopt_long() to the value of the most recently parsed option that

// requires an argument; points into argv.

const char* optarg = nullptr;

// Windows-only stand-in for the `struct option` record of <getopt.h>.
// Holds one long-option description consumed by getopt_long() below.
struct option {
  // Stores the four fields verbatim; no validation is performed and the
  // pointers are not copied or owned.
  option(const char* long_name, int arg_mode, int* flag_ptr, int value)
      : name_{long_name}, has_arg_{arg_mode}, flag_{flag_ptr}, val_{value}
  {
  }

  // Option name without the leading "--".
  const char* name_;
  // Either required_argument or no_argument.
  int has_arg_;
  // Stored but never dereferenced by the shim's getopt_long().
  int* flag_;
  // Value getopt_long() returns when this option is matched.
  int val_;
};

// Returns true when `longopts` points at the terminating entry of a
// long-option array, i.e. an entry whose four fields are all zero/null.
bool
end_of_long_opts(const struct option* longopts)
{
  // Any non-zero field means this is a real option entry.
  if (longopts->name_ != nullptr) {
    return false;
  }
  if (longopts->has_arg_ != 0) {
    return false;
  }
  if (longopts->flag_ != nullptr) {
    return false;
  }
  return longopts->val_ == 0;
}

// Minimal Windows replacement for getopt_long().
//
// Only long options of the form "--name", "--name value" and "--name=value"
// are recognised. `optstring` is ignored (no short options) and `longindex`
// must be NULL.
//
// Parameters:
//   argc, argv  The program arguments; argv[optind] is examined.
//   optstring   Unused.
//   longopts    Array of options terminated by an all-zero entry
//               (see end_of_long_opts()).
//   longindex   Must be NULL; a non-NULL value makes the call return -1.
//
// Returns:
//   The matched option's val_, advancing `optind` past the option (and past
//   its argument, with `optarg` pointing at the argument text);
//   '?' if an option that requires an argument is the last element of argv;
//   -1 if there is nothing left to parse, or argv[optind] is not a known
//   "--" option (optind is left pointing at the offending argument).
int
getopt_long(
    int argc, char* const argv[], const char* optstring,
    const struct option* longopts, int* longindex)
{
  (void)optstring;  // short options are not supported by this shim

  if ((longindex != NULL) || (optind >= argc)) {
    return -1;
  }

  const std::string argv_str = argv[optind];

  // Anything that does not start with "--" is not a long option. This also
  // guards the substr(2, ...) below, which would throw std::out_of_range for
  // arguments shorter than two characters (e.g. "-").
  if (argv_str.compare(0, 2, "--") != 0) {
    return -1;
  }

  // Split "--key=value" into key and (optional) inline value.
  const size_t found = argv_str.find_first_of("=");
  const std::string key = argv_str.substr(
      2, (found == std::string::npos) ? std::string::npos : (found - 2));

  for (const struct option* curr_longopt = longopts;
       !end_of_long_opts(curr_longopt); ++curr_longopt) {
    if (key != curr_longopt->name_) {
      continue;
    }

    if (curr_longopt->has_arg_ == required_argument) {
      if (found == std::string::npos) {
        // "--key value": the value is the next element of argv.
        optind++;
        if (optind >= argc) {
          std::cerr << argv[0] << ": option '" << argv_str
                    << "' requires an argument" << std::endl;
          return '?';
        }
        optarg = argv[optind];
      } else {
        // "--key=value": the value follows the '=' in the same element.
        optarg = (argv[optind] + found + 1);
      }
    }

    // Consume the option (or its separate value) for both argument kinds.
    optind++;
    return curr_longopt->val_;
  }

  return -1;
}

#endif

// Command-line options

// Identifiers for every command-line option. Numbering starts at 1000 and
// continues sequentially, so the numeric value of an enumerator depends on
// which TRITON_ENABLE_* features are compiled in. Do not reorder: each
// enumerator's value is its position in this list.
enum OptionId {
  OPTION_HELP = 1000,

#if defined(TRITON_ENABLE_LOGGING)
  // Logging
  OPTION_LOG_VERBOSE,
  OPTION_LOG_INFO,
  OPTION_LOG_WARNING,
  OPTION_LOG_ERROR,
  OPTION_LOG_FORMAT,
  OPTION_LOG_FILE,
#endif  // TRITON_ENABLE_LOGGING

  // Server identity, model repository and startup behavior
  OPTION_ID,
  OPTION_MODEL_REPOSITORY,
  OPTION_EXIT_ON_ERROR,
  OPTION_DISABLE_AUTO_COMPLETE_CONFIG,
  OPTION_STRICT_MODEL_CONFIG,
  OPTION_STRICT_READINESS,

#if defined(TRITON_ENABLE_HTTP)
  // HTTP front-end
  OPTION_ALLOW_HTTP,
  OPTION_HTTP_PORT,
  OPTION_REUSE_HTTP_PORT,
  OPTION_HTTP_ADDRESS,
  OPTION_HTTP_THREAD_COUNT,
#endif  // TRITON_ENABLE_HTTP

#if defined(TRITON_ENABLE_GRPC)
  // GRPC front-end
  OPTION_ALLOW_GRPC,
  OPTION_GRPC_PORT,
  OPTION_REUSE_GRPC_PORT,
  OPTION_GRPC_ADDRESS,
  OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE,
  OPTION_GRPC_USE_SSL,
  OPTION_GRPC_USE_SSL_MUTUAL,
  OPTION_GRPC_SERVER_CERT,
  OPTION_GRPC_SERVER_KEY,
  OPTION_GRPC_ROOT_CERT,
  OPTION_GRPC_RESPONSE_COMPRESSION_LEVEL,
  OPTION_GRPC_ARG_KEEPALIVE_TIME_MS,
  OPTION_GRPC_ARG_KEEPALIVE_TIMEOUT_MS,
  OPTION_GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS,
  OPTION_GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA,
  OPTION_GRPC_ARG_HTTP2_MIN_RECV_PING_INTERVAL_WITHOUT_DATA_MS,
  OPTION_GRPC_ARG_HTTP2_MAX_PING_STRIKES,
#endif  // TRITON_ENABLE_GRPC

#if defined(TRITON_ENABLE_SAGEMAKER)
  // SageMaker front-end
  OPTION_ALLOW_SAGEMAKER,
  OPTION_SAGEMAKER_PORT,
  OPTION_SAGEMAKER_SAFE_PORT_RANGE,
  OPTION_SAGEMAKER_THREAD_COUNT,
#endif  // TRITON_ENABLE_SAGEMAKER

#if defined(TRITON_ENABLE_VERTEX_AI)
  // Vertex AI front-end
  OPTION_ALLOW_VERTEX_AI,
  OPTION_VERTEX_AI_PORT,
  OPTION_VERTEX_AI_THREAD_COUNT,
  OPTION_VERTEX_AI_DEFAULT_MODEL,
#endif  // TRITON_ENABLE_VERTEX_AI

#if defined(TRITON_ENABLE_METRICS)
  // Metrics endpoint
  OPTION_ALLOW_METRICS,
  OPTION_ALLOW_GPU_METRICS,
  OPTION_ALLOW_CPU_METRICS,
  OPTION_METRICS_PORT,
  OPTION_METRICS_INTERVAL_MS,
#endif  // TRITON_ENABLE_METRICS

#if defined(TRITON_ENABLE_TRACING)
  // Tracing
  OPTION_TRACE_FILEPATH,
  OPTION_TRACE_LEVEL,
  OPTION_TRACE_RATE,
  OPTION_TRACE_COUNT,
  OPTION_TRACE_LOG_FREQUENCY,
#endif  // TRITON_ENABLE_TRACING

  // Model management, rate limiting, memory pools and miscellaneous
  OPTION_MODEL_CONTROL_MODE,
  OPTION_POLL_REPO_SECS,
  OPTION_STARTUP_MODEL,
  OPTION_RATE_LIMIT,
  OPTION_RATE_LIMIT_RESOURCE,
  OPTION_PINNED_MEMORY_POOL_BYTE_SIZE,
  OPTION_CUDA_MEMORY_POOL_BYTE_SIZE,
  OPTION_RESPONSE_CACHE_BYTE_SIZE,
  OPTION_MIN_SUPPORTED_COMPUTE_CAPABILITY,
  OPTION_EXIT_TIMEOUT_SECS,
  OPTION_BACKEND_DIR,
  OPTION_REPOAGENT_DIR,
  OPTION_BUFFER_MANAGER_THREAD_COUNT,
  OPTION_MODEL_LOAD_THREAD_COUNT,
  OPTION_BACKEND_CONFIG,
  OPTION_HOST_POLICY,
  OPTION_MODEL_LOAD_GPU_LIMIT
};

struct Option {

static constexpr const char* ArgNone = "";

static constexpr const char* ArgBool = "boolean";

static constexpr const char* ArgFloat = "float";

static constexpr const char* ArgInt = "integer";

static constexpr const char* ArgStr = "string";

Option(OptionId id, std::string flag, std::string arg_desc, std::string desc)

: id_(id), flag_(flag), arg_desc_(arg_desc), desc_(desc)

{

}

struct option GetLongOption() const

{

struct option lo {

flag_.c_str(), (!arg_desc_.empty()) ? required_argument : no_argument,

nullptr, id_

};

return lo;

}

const OptionId id_;

const std::string flag_;

const std::string arg_desc_;

const std::string desc_;

};

std::vector<Option> options_

{

{OPTION_HELP, "help", Option::ArgNone, "Print usage"},

#ifdef TRITON_ENABLE_LOGGING

{OPTION_LOG_VERBOSE, "log-verbose", Option::ArgInt,

"Set verbose logging level. Zero (0) disables verbose logging and "

"values >= 1 enable verbose logging."},

{OPTION_LOG_INFO, "log-info", Option::ArgBool,

"Enable/disable info-level logging."},

{OPTION_LOG_WARNING, "log-warning", Option::ArgBool,

"Enable/disable warning-level logging."},

{OPTION_LOG_ERROR, "log-error", Option::ArgBool,

"Enable/disable error-level logging."},

{OPTION_LOG_FORMAT, "log-format", Option::ArgStr,

"Set the logging format. Options are \"default\" and \"ISO8601\". "

"The default is \"default\". For \"default\", the log severity (L) and "

"timestamp will be logged as \"LMMDD hh:mm:ss.ssssss\". "

"For \"ISO8601\", the log format will be \"YYYY-MM-DDThh:mm:ssZ L\"."},

{OPTION_LOG_FILE, "log-file", Option::ArgStr,

"Set the name of the log output file. If specified, log outputs will be "

"saved to this file. If not specified, log outputs will stream to the "

"console."},

#endif // TRITON_ENABLE_LOGGING

{OPTION_ID, "id", Option::ArgStr, "Identifier for this server."},

{OPTION_MODEL_REPOSITORY, "model-store", Option::ArgStr,

"Equivalent to --model-repository."},

{OPTION_MODEL_REPOSITORY, "model-repository", Option::ArgStr,

"Path to model repository directory. It may be specified multiple times "

"to add multiple model repositories. Note that if a model is not unique "

"across all model repositories at any time, the model will not be "

"available."},

{OPTION_EXIT_ON_ERROR, "exit-on-error", Option::ArgBool,

"Exit the inference server if an error occurs during initialization."},

{OPTION_DISABLE_AUTO_COMPLETE_CONFIG, "disable-auto-complete-config",

Option::ArgNone,

"If set, disables the triton and backends from auto completing model "

"configuration files. Model configuration files must be provided and "

"all required "

"configuration settings must be specified."},

{OPTION_STRICT_MODEL_CONFIG, "strict-model-config", Option::ArgBool,

"DEPRECATED: If true model configuration files must be provided and all "

"required "

"configuration settings must be specified. If false the model "

"configuration may be absent or only partially specified and the "

"server will attempt to derive the missing required configuration."},

{OPTION_STRICT_READINESS, "strict-readiness", Option::ArgBool,

"If true /v2/health/ready endpoint indicates ready if the server "

"is responsive and all models are available. If false "

"/v2/health/ready endpoint indicates ready if server is responsive "

"even if some/all models are unavailable."},

#if defined(TRITON_ENABLE_HTTP)

{OPTION_ALLOW_HTTP, "allow-http", Option::ArgBool,

"Allow the server to listen for HTTP requests."},

{OPTION_HTTP_PORT, "http-port", Option::ArgInt,

"The port for the server to listen on for HTTP requests."},

{OPTION_REUSE_HTTP_PORT, "reuse-http-port", Option::ArgBool,

"Allow multiple servers to listen on the same HTTP port when every "

"server has this option set."},

{OPTION_HTTP_ADDRESS, "http-address", Option::ArgStr,

"The address for the http server to binds to."},

{OPTION_HTTP_THREAD_COUNT, "http-thread-count", Option::ArgInt,

"Number of threads handling HTTP requests."},

#endif // TRITON_ENABLE_HTTP

#if defined(TRITON_ENABLE_GRPC)

{OPTION_ALLOW_GRPC, "allow-grpc", Option::ArgBool,

"Allow the server to listen for GRPC requests."},

{OPTION_GRPC_PORT, "grpc-port", Option::ArgInt,

"The port for the server to listen on for GRPC requests."},

{OPTION_REUSE_GRPC_PORT, "reuse-grpc-port", Option::ArgBool,

"Allow multiple servers to listen on the same GRPC port when every "

"server has this option set."},

{OPTION_GRPC_ADDRESS, "grpc-address", Option::ArgStr,

"The address for the grpc server to binds to."},

{OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE,

"grpc-infer-allocation-pool-size", Option::ArgInt,

"The maximum number of inference request/response objects that remain "

"allocated for reuse. As long as the number of in-flight requests "

"doesn't exceed this value there will be no allocation/deallocation of "

"request/response objects."},

{OPTION_GRPC_USE_SSL, "grpc-use-ssl", Option::ArgBool,

"Use SSL authentication for GRPC requests. Default is false."},

{OPTION_GRPC_USE_SSL_MUTUAL, "grpc-use-ssl-mutual", Option::ArgBool,

"Use mututal SSL authentication for GRPC requests. Default is false."},

{OPTION_GRPC_SERVER_CERT, "grpc-server-cert", Option::ArgStr,

"File holding PEM-encoded server certificate. Ignored unless "

"--grpc-use-ssl is true."},

{OPTION_GRPC_SERVER_KEY, "grpc-server-key", Option::ArgStr,

"File holding PEM-encoded server key. Ignored unless "

"--grpc-use-ssl is true."},

{OPTION_GRPC_ROOT_CERT, "grpc-root-cert", Option::ArgStr,

"File holding PEM-encoded root certificate. Ignore unless "

"--grpc-use-ssl is false."},

{OPTION_GRPC_RESPONSE_COMPRESSION_LEVEL,

"grpc-infer-response-compression-level", Option::ArgStr,

"The compression level to be used while returning the infer response to "

"the peer. Allowed values are none, low, medium and high. By default, "

"compression level is selected as none."},

{OPTION_GRPC_ARG_KEEPALIVE_TIME_MS, "grpc-keepalive-time", Option::ArgInt,

"The period (in milliseconds) after which a keepalive ping is sent on "

"the transport. Default is 7200000 (2 hours)."},

{OPTION_GRPC_ARG_KEEPALIVE_TIMEOUT_MS, "grpc-keepalive-timeout",

Option::ArgInt,

"The period (in milliseconds) the sender of the keepalive ping waits "

"for an acknowledgement. If it does not receive an acknowledgment "

"within this time, it will close the connection. "

"Default is 20000 (20 seconds)."},

{OPTION_GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS,

"grpc-keepalive-permit-without-calls", Option::ArgBool,

"Allows keepalive pings to be sent even if there are no calls in flight "

"(0 : false; 1 : true). Default is 0 (false)."},

{OPTION_GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA,

"grpc-http2-max-pings-without-data", Option::ArgInt,

"The maximum number of pings that can be sent when there is no "

"data/header frame to be sent. gRPC Core will not continue sending "

"pings if we run over the limit. Setting it to 0 allows sending pings "

"without such a restriction. Default is 2."},

{OPTION_GRPC_ARG_HTTP2_MIN_RECV_PING_INTERVAL_WITHOUT_DATA_MS,

"grpc-http2-min-recv-ping-interval-without-data", Option::ArgInt,

"If there are no data/header frames being sent on the transport, this "

"channel argument on the server side controls the minimum time "

"(in milliseconds) that gRPC Core would expect between receiving "

"successive pings. If the time between successive pings is less than "

"this time, then the ping will be considered a bad ping from the peer. "

"Such a ping counts as a ‘ping strike’. Default is 300000 (5 minutes)."},

{OPTION_GRPC_ARG_HTTP2_MAX_PING_STRIKES, "grpc-http2-max-ping-strikes",

Option::ArgInt,

"Maximum number of bad pings that the server will tolerate before "

"sending an HTTP2 GOAWAY frame and closing the transport. Setting it to "

"0 allows the server to accept any number of bad pings. Default is 2."},

#endif // TRITON_ENABLE_GRPC

#if defined(TRITON_ENABLE_SAGEMAKER)

{OPTION_ALLOW_SAGEMAKER, "allow-sagemaker", Option::ArgBool,

"Allow the server to listen for Sagemaker requests. Default is false."},

{OPTION_SAGEMAKER_PORT, "sagemaker-port", Option::ArgInt,

"The port for the server to listen on for Sagemaker requests. Default "

"is 8080."},

{OPTION_SAGEMAKER_SAFE_PORT_RANGE, "sagemaker-safe-port-range",

"<integer>-<integer>",

"Set the allowed port range for endpoints other than the SageMaker "

"endpoints."},

{OPTION_SAGEMAKER_THREAD_COUNT, "sagemaker-thread-count", Option::ArgInt,

"Number of threads handling Sagemaker requests. Default is 8."},

#endif // TRITON_ENABLE_SAGEMAKER

#if defined(TRITON_ENABLE_VERTEX_AI)

{OPTION_ALLOW_VERTEX_AI, "allow-vertex-ai", Option::ArgBool,

"Allow the server to listen for Vertex AI requests. Default is true if "

"AIP_MODE=PREDICTION, false otherwise."},

{OPTION_VERTEX_AI_PORT, "vertex-ai-port", Option::ArgInt,

"The port for the server to listen on for Vertex AI requests. Default "

"is AIP_HTTP_PORT if set, 8080 otherwise."},

{OPTION_VERTEX_AI_THREAD_COUNT, "vertex-ai-thread-count", Option::ArgInt,

"Number of threads handling Vertex AI requests. Default is 8."},

{OPTION_VERTEX_AI_DEFAULT_MODEL, "vertex-ai-default-model",

Option::ArgStr,

"The name of the model to use for single-model inference requests."},

#endif // TRITON_ENABLE_VERTEX_AI

#ifdef TRITON_ENABLE_METRICS

{OPTION_ALLOW_METRICS, "allow-metrics", Option::ArgBool,

"Allow the server to provide prometheus metrics."},

{OPTION_ALLOW_GPU_METRICS, "allow-gpu-metrics", Option::ArgBool,

"Allow the server to provide GPU metrics. Ignored unless "

"--allow-metrics is true."},

{OPTION_ALLOW_CPU_METRICS, "allow-cpu-metrics", Option::ArgBool,

"Allow the server to provide CPU metrics. Ignored unless "

"--allow-metrics is true."},

{OPTION_METRICS_PORT, "metrics-port", Option::ArgInt,

"The port reporting prometheus metrics."},

{OPTION_METRICS_INTERVAL_MS, "metrics-interval-ms", Option::ArgFloat,

"Metrics will be collected once every <metrics-interval-ms> "

"milliseconds. Default is 2000 milliseconds."},

#endif // TRITON_ENABLE_METRICS

#ifdef TRITON_ENABLE_TRACING

{OPTION_TRACE_FILEPATH, "trace-file", Option::ArgStr,

"Set the file where trace output will be saved. If --trace-log-frequency"

" is also specified, this argument value will be the prefix of the files"

" to save the trace output. See --trace-log-frequency for detail."},

{OPTION_TRACE_LEVEL, "trace-level", Option::ArgStr,

"Specify a trace level. OFF to disable tracing, TIMESTAMPS to "

"trace timestamps, TENSORS to trace tensors. It may be specified "

"multiple times to trace multiple informations. Default is OFF."},

{OPTION_TRACE_RATE, "trace-rate", Option::ArgInt,

"Set the trace sampling rate. Default is 1000."},

{OPTION_TRACE_COUNT, "trace-count", Option::ArgInt,

"Set the number of traces to be sampled. If the value is -1, the number "

"of traces to be sampled will not be limited. Default is -1."},

{OPTION_TRACE_LOG_FREQUENCY, "trace-log-frequency", Option::ArgInt,

"Set the trace log frequency. If the value is 0, Triton will only log "

"the trace output to <trace-file> when shutting down. Otherwise, Triton "

"will log the trace output to <trace-file>.<idx> when it collects the "

"specified number of traces. For example, if the log frequency is 100, "

"when Triton collects the 100-th trace, it logs the traces to file "

"<trace-file>.0, and when it collects the 200-th trace, it logs the "

"101-th to the 200-th traces to file <trace-file>.1. Default is 0."},

#endif // TRITON_ENABLE_TRACING

{OPTION_MODEL_CONTROL_MODE, "model-control-mode", Option::ArgStr,

"Specify the mode for model management. Options are \"none\", \"poll\" "

"and \"explicit\". The default is \"none\". "

"For \"none\", the server will load all models in the model "

"repository(s) at startup and will not make any changes to the load "

"models after that. For \"poll\", the server will poll the model "

"repository(s) to detect changes and will load/unload models based on "

"those changes. The poll rate is controlled by 'repository-poll-secs'. "

"For \"explicit\", model load and unload is initiated by using the "

"model control APIs, and only models specified with --load-model will "

"be loaded at startup."},

{OPTION_POLL_REPO_SECS, "repository-poll-secs", Option::ArgInt,

"Interval in seconds between each poll of the model repository to check "

"for changes. Valid only when --model-control-mode=poll is "

"specified."},

{OPTION_STARTUP_MODEL, "load-model", Option::ArgStr,

"Name of the model to be loaded on server startup. It may be specified "

"multiple times to add multiple models. To load ALL models at startup, "

"specify '*' as the model name with --load-model=* as the ONLY "

"--load-model argument, this does not imply any pattern matching. "

"Specifying --load-model=* in conjunction with another --load-model "

"argument will result in error. Note that this option will only take "

"effect if --model-control-mode=explicit is true."},

// FIXME: fix the default to execution_count once RL logic is complete.

{OPTION_RATE_LIMIT, "rate-limit", Option::ArgStr,

"Specify the mode for rate limiting. Options are \"execution_count\" "

"and \"off\". The default is \"off\". For "

"\"execution_count\", the server will determine the instance using "

"configured priority and the number of time the instance has been "

"used to run inference. The inference will finally be executed once "

"the required resources are available. For \"off\", the server will "

"ignore any rate limiter config and run inference as soon as an "

"instance is ready."},

{OPTION_RATE_LIMIT_RESOURCE, "rate-limit-resource",

"<string>:<integer>:<integer>",

"The number of resources available to the server. The format of this "

"flag is --rate-limit-resource=<resource_name>:<count>:<device>. The "

"<device> is optional and if not listed will be applied to every "

"device. If the resource is specified as \"GLOBAL\" in the model "

"configuration the resource is considered shared among all the devices "

"in the system. The <device> property is ignored for such resources. "

"This flag can be specified multiple times to specify each resources "

"and their availability. By default, the max across all instances that "

"list the resource is selected as its availability. The values for this "

"flag is case-insensitive."},

{OPTION_PINNED_MEMORY_POOL_BYTE_SIZE, "pinned-memory-pool-byte-size",

Option::ArgInt,

"The total byte size that can be allocated as pinned system memory. "

"If GPU support is enabled, the server will allocate pinned system "

"memory to accelerate data transfer between host and devices until it "

"exceeds the specified byte size. If 'numa-node' is configured via "

"--host-policy, the pinned system memory of the pool size will be "

"allocated on each numa node. This option will not affect the "

"allocation conducted by the backend frameworks. Default is 256 MB."},

{OPTION_CUDA_MEMORY_POOL_BYTE_SIZE, "cuda-memory-pool-byte-size",

"<integer>:<integer>",

"The total byte size that can be allocated as CUDA memory for the GPU "

"device. If GPU support is enabled, the server will allocate CUDA "

"memory to minimize data transfer between host and devices until it "

"exceeds the specified byte size. This option will not affect the "

"allocation conducted by the backend frameworks. The argument should be "

"2 integers separated by colons in the format "

"<GPU device ID>:<pool byte size>. This option can be used multiple "

"times, but only once per GPU device. Subsequent uses will overwrite "

"previous uses for the same GPU device. Default is 64 MB."},

{OPTION_RESPONSE_CACHE_BYTE_SIZE, "response-cache-byte-size",

Option::ArgInt,

"The size in bytes to allocate for a request/response cache. When "

"non-zero, Triton allocates the requested size in CPU memory and "

"shares the cache across all inference requests and across all models. "

"For a given model to use request caching, the model must enable "

"request caching in the model configuration. By default, no model uses "

"request caching even if the request cache is enabled with the "

"--response-cache-byte-size flag. Default is 0."},

{OPTION_MIN_SUPPORTED_COMPUTE_CAPABILITY,

"min-supported-compute-capability", Option::ArgFloat,

"The minimum supported CUDA compute capability. GPUs that don't support "

"this compute capability will not be used by the server."},

{OPTION_EXIT_TIMEOUT_SECS, "exit-timeout-secs", Option::ArgInt,

"Timeout (in seconds) when exiting to wait for in-flight inferences to "

"finish. After the timeout expires the server exits even if inferences "

"are still in flight."},

{OPTION_BACKEND_DIR, "backend-directory", Option::ArgStr,

"The global directory searched for backend shared libraries. Default is "

"'/opt/tritonserver/backends'."},

{OPTION_REPOAGENT_DIR, "repoagent-directory", Option::ArgStr,

"The global directory searched for repository agent shared libraries. "

"Default is '/opt/tritonserver/repoagents'."},

{OPTION_BUFFER_MANAGER_THREAD_COUNT, "buffer-manager-thread-count",

Option::ArgInt,

"The number of threads used to accelerate copies and other operations "

"required to manage input and output tensor contents. Default is 0."},

{OPTION_MODEL_LOAD_THREAD_COUNT, "model-load-thread-count",

Option::ArgInt,

"The number of threads used to concurrently load models in "

"model repositories. Default is 2*<num_cpu_cores>."},

{OPTION_BACKEND_CONFIG, "backend-config", "<string>,<string>=<string>",

"Specify a backend-specific configuration setting. The format of this "

"flag is --backend-config=<backend_name>,<setting>=<value>. Where "

"<backend_name> is the name of the backend, such as 'tensorrt'."},

{OPTION_HOST_POLICY, "host-policy", "<string>,<string>=<string>",

"Specify a host policy setting associated with a policy name. The "

"format of this flag is --host-policy=<policy_name>,<setting>=<value>. "

"Currently supported settings are 'numa-node', 'cpu-cores'. Note that "

"'numa-node' setting will affect pinned memory pool behavior, see "

"--pinned-memory-pool for more detail."},

{

OPTION_MODEL_LOAD_GPU_LIMIT, "model-load-gpu-limit",

"<device_id>:<fraction>",

"Specify the limit on GPU memory usage as a fraction. If model loading "

"on the device is requested and the current memory usage exceeds the "

"limit, the load will be rejected. If not specified, the limit will "

"not be set."

}

};

// Checks the enabled endpoints for conflicting listen addresses / ports.
//
// Every endpoint that is compiled in and enabled contributes one entry:
//   <0> service name (used in diagnostics)
//   <1> listen address
//   <2> listen port
//   <3> whether the service restricts the ports other services may use
//   <4>, <5> inclusive lower / upper bound of that allowed port range
//
// Two entries are only compared when they share the same address. A conflict
// is reported when
//   * a service that sets an allowed range shares its address with a service
//     whose port lies outside that range, or
//   * two services use the same address and the same port.
//
// Returns true if a conflict was found (a message is written to std::cerr),
// false if no conflict was found.
bool
CheckPortCollision()
{
  // List of enabled services and their constraints
  std::vector<
      std::tuple<std::string, std::string, int32_t, bool, int32_t, int32_t>>
      ports;
#ifdef TRITON_ENABLE_HTTP
  if (allow_http_) {
    ports.emplace_back("HTTP", http_address_, http_port_, false, -1, -1);
  }
#endif  // TRITON_ENABLE_HTTP
#ifdef TRITON_ENABLE_GRPC
  if (allow_grpc_) {
    ports.emplace_back("GRPC", grpc_address_, grpc_port_, false, -1, -1);
  }
#endif  // TRITON_ENABLE_GRPC
#ifdef TRITON_ENABLE_METRICS
  if (allow_metrics_) {
    // The metrics endpoint is registered with the HTTP address.
    ports.emplace_back("metrics", http_address_, metrics_port_, false, -1, -1);
  }
#endif  // TRITON_ENABLE_METRICS
#ifdef TRITON_ENABLE_SAGEMAKER
  if (allow_sagemaker_) {
    ports.emplace_back(
        "SageMaker", sagemaker_address_, sagemaker_port_,
        sagemaker_safe_range_set_, sagemaker_safe_range_.first,
        sagemaker_safe_range_.second);
  }
#endif  // TRITON_ENABLE_SAGEMAKER
#ifdef TRITON_ENABLE_VERTEX_AI
  if (allow_vertex_ai_) {
    ports.emplace_back(
        "Vertex AI", vertex_ai_address_, vertex_ai_port_, false, -1, -1);
  }
#endif  // TRITON_ENABLE_VERTEX_AI

  for (auto curr_it = ports.begin(); curr_it != ports.end(); ++curr_it) {
    const auto& curr = *curr_it;
    const bool curr_has_range = std::get<3>(curr);

    // If the current service doesn't specify the allow port range for other
    // services, then we don't need to revisit the checked services
    auto comparing_it = curr_has_range ? ports.begin() : (curr_it + 1);
    for (; comparing_it != ports.end(); ++comparing_it) {
      if (comparing_it == curr_it) {
        continue;
      }
      const auto& other = *comparing_it;

      // Services bound to different addresses cannot collide.
      if (std::get<1>(curr) != std::get<1>(other)) {
        continue;
      }

      // Set range and comparing service port is out of range
      if (curr_has_range && ((std::get<2>(other) < std::get<4>(curr)) ||
                             (std::get<2>(other) > std::get<5>(curr)))) {
        std::cerr << "The server cannot listen to " << std::get<0>(other)
                  << " requests at port " << std::get<2>(other)
                  << ", allowed port range is [" << std::get<4>(curr) << ", "
                  << std::get<5>(curr) << "]" << std::endl;
        return true;
      }

      if (std::get<2>(curr) == std::get<2>(other)) {
        std::cerr << "The server cannot listen to " << std::get<0>(curr)
                  << " requests "
                  << "and " << std::get<0>(other)
                  << " requests at the same address and port "
                  << std::get<1>(curr) << ":" << std::get<2>(curr)
                  << std::endl;
        return true;
      }
    }
  }

  // Every pair was examined without finding a conflict.
  return false;
}

#ifdef TRITON_ENABLE_GRPC

// Creates the GRPC endpoint from the file-level GRPC settings and starts it.
//
// service       [out] receives the created GRPC server; reset if creation or
//               start-up fails.
// server        server instance handed to the endpoint.
// trace_manager trace manager handed to the endpoint.
// shm_manager   shared-memory manager handed to the endpoint.
//
// Returns nullptr when both creation and start succeed, otherwise the error
// produced by the failing step.
TRITONSERVER_Error*
StartGrpcService(
    std::unique_ptr<triton::server::GRPCServer>* service,
    const std::shared_ptr<TRITONSERVER_Server>& server,
    triton::server::TraceManager* trace_manager,
    const std::shared_ptr<triton::server::SharedMemoryManager>& shm_manager)
{
  TRITONSERVER_Error* status = triton::server::GRPCServer::Create(
      server, trace_manager, shm_manager, grpc_port_, reuse_grpc_port_,
      grpc_address_, grpc_use_ssl_, grpc_ssl_options_,
      grpc_infer_allocation_pool_size_, grpc_response_compression_level_,
      grpc_keepalive_options_, service);
  if (status != nullptr) {
    // Creation failed: make sure the caller is not left with a service.
    service->reset();
    return status;
  }

  status = (*service)->Start();
  if (status != nullptr) {
    // Created but could not be started: discard it.
    service->reset();
  }
  return status;
}

#endif // TRITON_ENABLE_GRPC

#ifdef TRITON_ENABLE_HTTP

// Creates the HTTP API endpoint from the file-level HTTP settings and starts
// it.
//
// service       [out] receives the created HTTP server; reset if creation or
//               start-up fails.
// server        server instance handed to the endpoint.
// trace_manager trace manager handed to the endpoint.
// shm_manager   shared-memory manager handed to the endpoint.
//
// Returns nullptr when both creation and start succeed, otherwise the error
// produced by the failing step.
TRITONSERVER_Error*
StartHttpService(
    std::unique_ptr<triton::server::HTTPServer>* service,
    const std::shared_ptr<TRITONSERVER_Server>& server,
    triton::server::TraceManager* trace_manager,
    const std::shared_ptr<triton::server::SharedMemoryManager>& shm_manager)
{
  TRITONSERVER_Error* status = triton::server::HTTPAPIServer::Create(
      server, trace_manager, shm_manager, http_port_, reuse_http_port_,
      http_address_, http_thread_cnt_, service);
  if (status != nullptr) {
    // Creation failed: make sure the caller is not left with a service.
    service->reset();
    return status;
  }

  status = (*service)->Start();
  if (status != nullptr) {
    // Created but could not be started: discard it.
    service->reset();
  }
  return status;
}

#endif // TRITON_ENABLE_HTTP

#ifdef TRITON_ENABLE_METRICS

// Creates the metrics endpoint and starts it. The endpoint is created with
// metrics_port_, the HTTP address, and a single HTTP thread.
//
// service [out] receives the created metrics server; reset if creation or
//         start-up fails.
// server  server instance handed to the endpoint.
//
// Returns nullptr when both creation and start succeed, otherwise the error
// produced by the failing step.
TRITONSERVER_Error*
StartMetricsService(
    std::unique_ptr<triton::server::HTTPServer>* service,
    const std::shared_ptr<TRITONSERVER_Server>& server)
{
  TRITONSERVER_Error* status = triton::server::HTTPMetricsServer::Create(
      server, metrics_port_, http_address_, 1 /* HTTP thread count */, service);
  if (status != nullptr) {
    // Creation failed: make sure the caller is not left with a service.
    service->reset();
    return status;
  }

  status = (*service)->Start();
  if (status != nullptr) {
    // Created but could not be started: discard it.
    service->reset();
  }
  return status;
}

#endif // TRITON_ENABLE_METRICS

#ifdef TRITON_ENABLE_SAGEMAKER

// Creates the SageMaker endpoint from the file-level SageMaker settings and
// starts it.
//
// service       [out] receives the created server; reset if creation or
//               start-up fails.
// server        server instance handed to the endpoint.
// trace_manager trace manager handed to the endpoint.
// shm_manager   shared-memory manager handed to the endpoint.
//
// Returns nullptr when both creation and start succeed, otherwise the error
// produced by the failing step.
TRITONSERVER_Error*
StartSagemakerService(
    std::unique_ptr<triton::server::HTTPServer>* service,
    const std::shared_ptr<TRITONSERVER_Server>& server,
    triton::server::TraceManager* trace_manager,
    const std::shared_ptr<triton::server::SharedMemoryManager>& shm_manager)
{
  TRITONSERVER_Error* status = triton::server::SagemakerAPIServer::Create(
      server, trace_manager, shm_manager, sagemaker_port_, sagemaker_address_,
      sagemaker_thread_cnt_, service);
  if (status != nullptr) {
    // Creation failed: make sure the caller is not left with a service.
    service->reset();
    return status;
  }

  status = (*service)->Start();
  if (status != nullptr) {
    // Created but could not be started: discard it.
    service->reset();
  }
  return status;
}

#endif // TRITON_ENABLE_SAGEMAKER

#ifdef TRITON_ENABLE_VERTEX_AI

// Creates the Vertex AI endpoint from the file-level Vertex AI settings
// (port, address, thread count, default model) and starts it.
//
// service       [out] receives the created server; reset if creation or
//               start-up fails.
// server        server instance handed to the endpoint.
// trace_manager trace manager handed to the endpoint.
// shm_manager   shared-memory manager handed to the endpoint.
//
// Returns nullptr when both creation and start succeed, otherwise the error
// produced by the failing step.
TRITONSERVER_Error*
StartVertexAiService(
    std::unique_ptr<triton::server::HTTPServer>* service,
    const std::shared_ptr<TRITONSERVER_Server>& server,
    triton::server::TraceManager* trace_manager,
    const std::shared_ptr<triton::server::SharedMemoryManager>& shm_manager)
{
  TRITONSERVER_Error* status = triton::server::VertexAiAPIServer::Create(
      server, trace_manager, shm_manager, vertex_ai_port_, vertex_ai_address_,
      vertex_ai_thread_cnt_, vertex_ai_default_model_, service);
  if (status != nullptr) {
    // Creation failed: make sure the caller is not left with a service.
    service->reset();
    return status;
  }

  status = (*service)->Start();
  if (status != nullptr) {
    // Created but could not be started: discard it.
    service->reset();
  }
  return status;
}

#endif // TRITON_ENABLE_VERTEX_AI

// Starts every endpoint that is both compiled in and enabled at runtime.
//
// On Windows, Winsock is initialized first. The endpoints are then started in
// this order: GRPC, HTTP, SageMaker, Vertex AI, metrics. Each enabled endpoint
// is started independently of the others.
//
// server        server instance handed to each endpoint.
// trace_manager trace manager handed to the inference endpoints (the metrics
//               endpoint does not take it).
// shm_manager   shared-memory manager handed to the inference endpoints (the
//               metrics endpoint does not take it).
//
// Returns true if every enabled endpoint started. On the first failure the
// error is logged and false is returned immediately; this function does not
// stop endpoints that were already started.
bool
StartEndpoints(
    const std::shared_ptr<TRITONSERVER_Server>& server,
    triton::server::TraceManager* trace_manager,
    const std::shared_ptr<triton::server::SharedMemoryManager>& shm_manager)
{
#ifdef _WIN32
  WSADATA wsaData;
  int wsa_ret = WSAStartup(MAKEWORD(2, 2), &wsaData);

  if (wsa_ret != 0) {
    LOG_ERROR << "Error in WSAStartup " << wsa_ret;
    return false;
  }
#endif

#ifdef TRITON_ENABLE_GRPC
  // Enable GRPC endpoints if requested...
  if (allow_grpc_) {
    TRITONSERVER_Error* err =
        StartGrpcService(&grpc_service_, server, trace_manager, shm_manager);
    if (err != nullptr) {
      LOG_TRITONSERVER_ERROR(err, "failed to start GRPC service");
      return false;
    }
  }
#endif  // TRITON_ENABLE_GRPC

#ifdef TRITON_ENABLE_HTTP
  // Enable HTTP endpoints if requested...
  if (allow_http_) {
    TRITONSERVER_Error* err =
        StartHttpService(&http_service_, server, trace_manager, shm_manager);
    if (err != nullptr) {
      LOG_TRITONSERVER_ERROR(err, "failed to start HTTP service");
      return false;
    }
  }
#endif  // TRITON_ENABLE_HTTP

#ifdef TRITON_ENABLE_SAGEMAKER
  // Enable Sagemaker endpoints if requested...
  if (allow_sagemaker_) {
    TRITONSERVER_Error* err = StartSagemakerService(
        &sagemaker_service_, server, trace_manager, shm_manager);
    if (err != nullptr) {
      LOG_TRITONSERVER_ERROR(err, "failed to start Sagemaker service");
      return false;
    }
  }
#endif  // TRITON_ENABLE_SAGEMAKER

#ifdef TRITON_ENABLE_VERTEX_AI
  // Enable Vertex AI endpoints if requested...
  if (allow_vertex_ai_) {
    TRITONSERVER_Error* err = StartVertexAiService(
        &vertex_ai_service_, server, trace_manager, shm_manager);
    if (err != nullptr) {
      LOG_TRITONSERVER_ERROR(err, "failed to start Vertex AI service");
      return false;
    }
  }
#endif  // TRITON_ENABLE_VERTEX_AI

#ifdef TRITON_ENABLE_METRICS
  // Enable metrics endpoint if requested...
  if (allow_metrics_) {
    TRITONSERVER_Error* err = StartMetricsService(&metrics_service_, server);
    if (err != nullptr) {
      LOG_TRITONSERVER_ERROR(err, "failed to start Metrics service");
      return false;
    }
  }
#endif  // TRITON_ENABLE_METRICS

  return true;
}

// Stops and releases every endpoint that is currently held.
//
// The endpoints are processed in this order: HTTP, GRPC, metrics, SageMaker,
// Vertex AI. A failure to stop one endpoint is logged and remembered, but the
// endpoint is still released and the remaining endpoints are still processed.
// On Windows, WSACleanup is called at the end.
//
// Returns true only if every Stop() call (and, on Windows, WSACleanup)
// succeeded.
bool
StopEndpoints()
{
  bool all_stopped = true;

#ifdef TRITON_ENABLE_HTTP
  if (http_service_) {
    TRITONSERVER_Error* stop_err = http_service_->Stop();
    if (stop_err != nullptr) {
      LOG_TRITONSERVER_ERROR(stop_err, "failed to stop HTTP service");
      all_stopped = false;
    }

    // Release the endpoint whether or not it stopped cleanly.
    http_service_.reset();
  }
#endif  // TRITON_ENABLE_HTTP

#ifdef TRITON_ENABLE_GRPC
  if (grpc_service_) {
    TRITONSERVER_Error* stop_err = grpc_service_->Stop();
    if (stop_err != nullptr) {
      LOG_TRITONSERVER_ERROR(stop_err, "failed to stop GRPC service");
      all_stopped = false;
    }

    grpc_service_.reset();
  }
#endif  // TRITON_ENABLE_GRPC

#ifdef TRITON_ENABLE_METRICS
  if (metrics_service_) {
    TRITONSERVER_Error* stop_err = metrics_service_->Stop();
    if (stop_err != nullptr) {
      LOG_TRITONSERVER_ERROR(stop_err, "failed to stop Metrics service");
      all_stopped = false;
    }

    metrics_service_.reset();
  }
#endif  // TRITON_ENABLE_METRICS

#ifdef TRITON_ENABLE_SAGEMAKER
  if (sagemaker_service_) {
    TRITONSERVER_Error* stop_err = sagemaker_service_->Stop();
    if (stop_err != nullptr) {
      LOG_TRITONSERVER_ERROR(stop_err, "failed to stop Sagemaker service");
      all_stopped = false;
    }

    sagemaker_service_.reset();
  }
#endif  // TRITON_ENABLE_SAGEMAKER

#ifdef TRITON_ENABLE_VERTEX_AI
  if (vertex_ai_service_) {
    TRITONSERVER_Error* stop_err = vertex_ai_service_->Stop();
    if (stop_err != nullptr) {
      LOG_TRITONSERVER_ERROR(stop_err, "failed to stop Vertex AI service");
      all_stopped = false;
    }

    vertex_ai_service_.reset();
  }
#endif  // TRITON_ENABLE_VERTEX_AI

#ifdef _WIN32
  const int cleanup_ret = WSACleanup();
  if (cleanup_ret != 0) {
    LOG_ERROR << "Error in WSACleanup " << cleanup_ret;
    all_stopped = false;
  }
#endif

  return all_stopped;
}

bool

View remainder of file in raw view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

main.cc

Latest commit

History

main.cc

File metadata and controls