forked from triton-inference-server/server
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.cc
More file actions
2089 lines (1919 loc) · 74.3 KB
/
main.cc
File metadata and controls
2089 lines (1919 loc) · 74.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef _WIN32
#define NOMINMAX
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <winsock2.h>
#include <ws2tcpip.h>
#pragma comment(lib, "ws2_32.lib")
#endif
#ifndef _WIN32
#include <getopt.h>
#include <unistd.h>
#endif
#include <stdint.h>
#include <algorithm>
#include <cctype>
#include <iomanip>
#include <iostream>
#include <list>
#include <set>
#include <sstream>
#include <thread>
#include "triton_signal.h"
#ifdef TRITON_ENABLE_ASAN
#include <sanitizer/lsan_interface.h>
#endif // TRITON_ENABLE_ASAN
#include "common.h"
#include "shared_memory_manager.h"
#include "tracer.h"
#include "triton/common/logging.h"
#include "triton/core/tritonserver.h"
#if defined(TRITON_ENABLE_HTTP) || defined(TRITON_ENABLE_METRICS)
#include "http_server.h"
#endif // TRITON_ENABLE_HTTP|| TRITON_ENABLE_METRICS
#ifdef TRITON_ENABLE_SAGEMAKER
#include "sagemaker_server.h"
#endif // TRITON_ENABLE_SAGEMAKER
#ifdef TRITON_ENABLE_VERTEX_AI
#include "vertex_ai_server.h"
#endif // TRITON_ENABLE_VERTEX_AI
#ifdef TRITON_ENABLE_GRPC
#include "grpc_server.h"
#endif // TRITON_ENABLE_GRPC
#ifdef TRITON_ENABLE_GPU
static_assert(
TRITON_MIN_COMPUTE_CAPABILITY >= 1.0,
"Invalid TRITON_MIN_COMPUTE_CAPABILITY specified");
#endif // TRITON_ENABLE_GPU
namespace {
// Interval, in seconds, at which the model repository is polled for
// changes.
int32_t repository_poll_secs_ = 15;
// The HTTP, GRPC and metrics services and their ports. Initialized to
// default values and modified based on command-line args.
#ifdef TRITON_ENABLE_HTTP
// HTTP front-end: allowed by default, port 8000, address "0.0.0.0",
// port reuse off.
std::unique_ptr<triton::server::HTTPServer> http_service_;
bool allow_http_ = true;
int32_t http_port_ = 8000;
bool reuse_http_port_ = false;
std::string http_address_ = "0.0.0.0";
#endif // TRITON_ENABLE_HTTP
#ifdef TRITON_ENABLE_SAGEMAKER
// SageMaker front-end: not allowed by default, port 8080.
std::unique_ptr<triton::server::HTTPServer> sagemaker_service_;
bool allow_sagemaker_ = false;
int32_t sagemaker_port_ = 8080;
// Triton uses "0.0.0.0" as default address for SageMaker.
std::string sagemaker_address_ = "0.0.0.0";
// When sagemaker_safe_range_set_ is true, CheckPortCollision() requires the
// ports of the other services on the same address to lie within
// [sagemaker_safe_range_.first, sagemaker_safe_range_.second]. No range is
// set by default.
bool sagemaker_safe_range_set_ = false;
std::pair<int32_t, int32_t> sagemaker_safe_range_ = {-1, -1};
// The number of threads to initialize for the SageMaker HTTP front-end.
int sagemaker_thread_cnt_ = 8;
#endif // TRITON_ENABLE_SAGEMAKER
#ifdef TRITON_ENABLE_VERTEX_AI
// Vertex AI front-end: not allowed by default here, port 8080.
std::unique_ptr<triton::server::HTTPServer> vertex_ai_service_;
// Triton uses "0.0.0.0" as default address for Vertex AI.
std::string vertex_ai_address_ = "0.0.0.0";
bool allow_vertex_ai_ = false;
int32_t vertex_ai_port_ = 8080;
// The number of threads to initialize for the Vertex AI HTTP front-end.
int vertex_ai_thread_cnt_ = 8;
// Empty by default.
std::string vertex_ai_default_model_;
#endif // TRITON_ENABLE_VERTEX_AI
#ifdef TRITON_ENABLE_GRPC
// GRPC front-end: allowed by default, port 8001, address "0.0.0.0",
// port reuse off, SSL off, no response compression.
std::unique_ptr<triton::server::GRPCServer> grpc_service_;
bool allow_grpc_ = true;
int32_t grpc_port_ = 8001;
bool reuse_grpc_port_ = false;
std::string grpc_address_ = "0.0.0.0";
bool grpc_use_ssl_ = false;
triton::server::SslOptions grpc_ssl_options_;
grpc_compression_level grpc_response_compression_level_ =
GRPC_COMPRESS_LEVEL_NONE;
// KeepAlive defaults: https://grpc.github.io/grpc/cpp/md_doc_keepalive.html
triton::server::KeepAliveOptions grpc_keepalive_options_;
#endif // TRITON_ENABLE_GRPC
#ifdef TRITON_ENABLE_METRICS
// Metrics service: allowed by default, port 8002. The interval default
// (2000) matches the default stated in the --metrics-interval-ms help text.
std::unique_ptr<triton::server::HTTPServer> metrics_service_;
bool allow_metrics_ = true;
int32_t metrics_port_ = 8002;
float metrics_interval_ms_ = 2000;
#ifndef TRITON_ENABLE_HTTP
// Triton uses the same address for http and metrics services.
// The http address must still be defined for metrics when the http
// service is disabled.
std::string http_address_ = "0.0.0.0";
#endif // NOT TRITON_ENABLE_HTTP
#endif // TRITON_ENABLE_METRICS
#ifdef TRITON_ENABLE_TRACING
// Tracing settings. The initial values match the defaults stated in the
// --trace-* help text (level OFF, rate 1000, count -1, log frequency 0).
std::string trace_filepath_;
TRITONSERVER_InferenceTraceLevel trace_level_ =
TRITONSERVER_TRACE_LEVEL_DISABLED;
int32_t trace_rate_ = 1000;
int32_t trace_count_ = -1;
int32_t trace_log_frequency_ = 0;
#endif // TRITON_ENABLE_TRACING
#if defined(TRITON_ENABLE_GRPC)
// The maximum number of inference request/response objects that
// remain allocated for reuse. As long as the number of in-flight
// requests doesn't exceed this value there will be no
// allocation/deallocation of request/response objects.
int grpc_infer_allocation_pool_size_ = 8;
#endif // TRITON_ENABLE_GRPC
#if defined(TRITON_ENABLE_HTTP)
// The number of threads to initialize for the HTTP front-end.
int http_thread_cnt_ = 8;
#endif // TRITON_ENABLE_HTTP
#ifdef _WIN32
// Minimum implementation of <getopt.h> for Windows
//
// Values stored in option::has_arg_ to say whether a long option takes an
// argument.
// NOTE(review): no_argument is 2 here rather than 0 -- presumably so that a
// real entry never has has_arg_ == 0, which end_of_long_opts() tests for;
// confirm before changing.
#define required_argument 1
#define no_argument 2
// Index into argv of the next token getopt_long() will examine. Starts at 1;
// argv[0] is only used as the program name in error messages.
int optind = 1;
// Set by getopt_long() to the argument text of the most recently matched
// option that was declared with required_argument.
const char* optarg = nullptr;
// Windows stand-in for the `struct option` of <getopt.h>: describes one long
// command-line option for getopt_long().
struct option {
  // Option name, compared against the text that follows "--".
  const char* name_;
  // Either required_argument or no_argument.
  int has_arg_;
  // Stored as given; in the visible code only end_of_long_opts() inspects it.
  int* flag_;
  // Value getopt_long() returns when this option is matched.
  int val_;

  option(const char* name, int has_arg, int* flag, int val)
      : name_(name), has_arg_(has_arg), flag_(flag), val_(val)
  {
  }
};
// Returns true when 'longopts' points at the terminating entry of a
// long-option array, i.e. an entry whose four fields are all null / zero.
bool
end_of_long_opts(const struct option* longopts)
{
  // Any non-null pointer field means this is a real option entry.
  if ((longopts->name_ != nullptr) || (longopts->flag_ != nullptr)) {
    return false;
  }
  return (longopts->has_arg_ == 0) && (longopts->val_ == 0);
}
// Minimal Windows replacement for getopt_long(). Only long options are
// understood, written as "--name", "--name=value" or "--name value".
//
// Parameters:
//   argc, argv  The command line to parse.
//   optstring   Unused; short options are not supported.
//   longopts    Array of long options terminated by an all-zero entry (see
//               end_of_long_opts()).
//   longindex   Must be NULL; reporting the matched index is not supported.
//
// Returns:
//   The matched option's val_. 'optind' is advanced past the option (and
//   past its argument when that was given as a separate token) and, for
//   options declared with required_argument, 'optarg' points at the
//   argument text.
//   '?' when an option that requires an argument is the last token.
//   -1 when 'longindex' is non-NULL, when no tokens remain, or when
//   argv[optind] is not a recognized "--name" option. In the last case
//   'optind' is left pointing at the offending token.
int
getopt_long(
    int argc, char* const argv[], const char* optstring,
    const struct option* longopts, int* longindex)
{
  if ((longindex != NULL) || (optind >= argc)) {
    return -1;
  }

  const std::string argv_str = argv[optind];

  // Only tokens that begin with "--" can name a long option. Rejecting
  // everything else up front also keeps substr(2, ...) below in range: for a
  // token shorter than two characters it would throw std::out_of_range.
  if (argv_str.compare(0, 2, "--") != 0) {
    return -1;
  }

  // Split "--key=value" at the first '='. Because the token starts with
  // "--", 'found' (when set) is at least 2 and 'found - 2' cannot wrap.
  const size_t found = argv_str.find_first_of("=");
  const std::string key = argv_str.substr(
      2, (found == std::string::npos) ? std::string::npos : (found - 2));

  const struct option* curr_longopt = longopts;
  while (!end_of_long_opts(curr_longopt)) {
    if (key == curr_longopt->name_) {
      if (curr_longopt->has_arg_ == required_argument) {
        if (found == std::string::npos) {
          // "--key value": the argument is the next token.
          optind++;
          if (optind >= argc) {
            std::cerr << argv[0] << ": option '" << argv_str
                      << "' requires an argument" << std::endl;
            return '?';
          }
          optarg = argv[optind];
        } else {
          // "--key=value": the argument follows the '=' in the same token.
          optarg = (argv[optind] + found + 1);
        }
      }
      optind++;
      return curr_longopt->val_;
    }
    curr_longopt++;
  }

  // Unknown option name.
  return -1;
}
#endif
// Command-line options
//
// Identifier of every command-line option declared in this file.
// Option::GetLongOption() stores the identifier as the value of the
// corresponding getopt long option. Numbering starts at 1000 (OPTION_HELP)
// and each following enumerator takes the next value, so the numeric value
// of an enumerator depends on which TRITON_ENABLE_* sections are compiled
// in. Do not rely on specific numbers other than OPTION_HELP.
enum OptionId {
OPTION_HELP = 1000,
#ifdef TRITON_ENABLE_LOGGING
OPTION_LOG_VERBOSE,
OPTION_LOG_INFO,
OPTION_LOG_WARNING,
OPTION_LOG_ERROR,
OPTION_LOG_FORMAT,
OPTION_LOG_FILE,
#endif // TRITON_ENABLE_LOGGING
OPTION_ID,
// Shared by both the "model-store" and "model-repository" flags.
OPTION_MODEL_REPOSITORY,
OPTION_EXIT_ON_ERROR,
OPTION_DISABLE_AUTO_COMPLETE_CONFIG,
OPTION_STRICT_MODEL_CONFIG,
OPTION_STRICT_READINESS,
#if defined(TRITON_ENABLE_HTTP)
OPTION_ALLOW_HTTP,
OPTION_HTTP_PORT,
OPTION_REUSE_HTTP_PORT,
OPTION_HTTP_ADDRESS,
OPTION_HTTP_THREAD_COUNT,
#endif // TRITON_ENABLE_HTTP
#if defined(TRITON_ENABLE_GRPC)
OPTION_ALLOW_GRPC,
OPTION_GRPC_PORT,
OPTION_REUSE_GRPC_PORT,
OPTION_GRPC_ADDRESS,
OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE,
OPTION_GRPC_USE_SSL,
OPTION_GRPC_USE_SSL_MUTUAL,
OPTION_GRPC_SERVER_CERT,
OPTION_GRPC_SERVER_KEY,
OPTION_GRPC_ROOT_CERT,
OPTION_GRPC_RESPONSE_COMPRESSION_LEVEL,
OPTION_GRPC_ARG_KEEPALIVE_TIME_MS,
OPTION_GRPC_ARG_KEEPALIVE_TIMEOUT_MS,
OPTION_GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS,
OPTION_GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA,
OPTION_GRPC_ARG_HTTP2_MIN_RECV_PING_INTERVAL_WITHOUT_DATA_MS,
OPTION_GRPC_ARG_HTTP2_MAX_PING_STRIKES,
#endif // TRITON_ENABLE_GRPC
#if defined(TRITON_ENABLE_SAGEMAKER)
OPTION_ALLOW_SAGEMAKER,
OPTION_SAGEMAKER_PORT,
OPTION_SAGEMAKER_SAFE_PORT_RANGE,
OPTION_SAGEMAKER_THREAD_COUNT,
#endif // TRITON_ENABLE_SAGEMAKER
#if defined(TRITON_ENABLE_VERTEX_AI)
OPTION_ALLOW_VERTEX_AI,
OPTION_VERTEX_AI_PORT,
OPTION_VERTEX_AI_THREAD_COUNT,
OPTION_VERTEX_AI_DEFAULT_MODEL,
#endif // TRITON_ENABLE_VERTEX_AI
#ifdef TRITON_ENABLE_METRICS
OPTION_ALLOW_METRICS,
OPTION_ALLOW_GPU_METRICS,
OPTION_ALLOW_CPU_METRICS,
OPTION_METRICS_PORT,
OPTION_METRICS_INTERVAL_MS,
#endif // TRITON_ENABLE_METRICS
#ifdef TRITON_ENABLE_TRACING
OPTION_TRACE_FILEPATH,
OPTION_TRACE_LEVEL,
OPTION_TRACE_RATE,
OPTION_TRACE_COUNT,
OPTION_TRACE_LOG_FREQUENCY,
#endif // TRITON_ENABLE_TRACING
// Options below are always available, independent of the build
// configuration.
OPTION_MODEL_CONTROL_MODE,
OPTION_POLL_REPO_SECS,
OPTION_STARTUP_MODEL,
OPTION_RATE_LIMIT,
OPTION_RATE_LIMIT_RESOURCE,
OPTION_PINNED_MEMORY_POOL_BYTE_SIZE,
OPTION_CUDA_MEMORY_POOL_BYTE_SIZE,
OPTION_RESPONSE_CACHE_BYTE_SIZE,
OPTION_MIN_SUPPORTED_COMPUTE_CAPABILITY,
OPTION_EXIT_TIMEOUT_SECS,
OPTION_BACKEND_DIR,
OPTION_REPOAGENT_DIR,
OPTION_BUFFER_MANAGER_THREAD_COUNT,
OPTION_MODEL_LOAD_THREAD_COUNT,
OPTION_BACKEND_CONFIG,
OPTION_HOST_POLICY,
OPTION_MODEL_LOAD_GPU_LIMIT
};
struct Option {
static constexpr const char* ArgNone = "";
static constexpr const char* ArgBool = "boolean";
static constexpr const char* ArgFloat = "float";
static constexpr const char* ArgInt = "integer";
static constexpr const char* ArgStr = "string";
Option(OptionId id, std::string flag, std::string arg_desc, std::string desc)
: id_(id), flag_(flag), arg_desc_(arg_desc), desc_(desc)
{
}
struct option GetLongOption() const
{
struct option lo {
flag_.c_str(), (!arg_desc_.empty()) ? required_argument : no_argument,
nullptr, id_
};
return lo;
}
const OptionId id_;
const std::string flag_;
const std::string arg_desc_;
const std::string desc_;
};
std::vector<Option> options_
{
{OPTION_HELP, "help", Option::ArgNone, "Print usage"},
#ifdef TRITON_ENABLE_LOGGING
{OPTION_LOG_VERBOSE, "log-verbose", Option::ArgInt,
"Set verbose logging level. Zero (0) disables verbose logging and "
"values >= 1 enable verbose logging."},
{OPTION_LOG_INFO, "log-info", Option::ArgBool,
"Enable/disable info-level logging."},
{OPTION_LOG_WARNING, "log-warning", Option::ArgBool,
"Enable/disable warning-level logging."},
{OPTION_LOG_ERROR, "log-error", Option::ArgBool,
"Enable/disable error-level logging."},
{OPTION_LOG_FORMAT, "log-format", Option::ArgStr,
"Set the logging format. Options are \"default\" and \"ISO8601\". "
"The default is \"default\". For \"default\", the log severity (L) and "
"timestamp will be logged as \"LMMDD hh:mm:ss.ssssss\". "
"For \"ISO8601\", the log format will be \"YYYY-MM-DDThh:mm:ssZ L\"."},
{OPTION_LOG_FILE, "log-file", Option::ArgStr,
"Set the name of the log output file. If specified, log outputs will be "
"saved to this file. If not specified, log outputs will stream to the "
"console."},
#endif // TRITON_ENABLE_LOGGING
{OPTION_ID, "id", Option::ArgStr, "Identifier for this server."},
{OPTION_MODEL_REPOSITORY, "model-store", Option::ArgStr,
"Equivalent to --model-repository."},
{OPTION_MODEL_REPOSITORY, "model-repository", Option::ArgStr,
"Path to model repository directory. It may be specified multiple times "
"to add multiple model repositories. Note that if a model is not unique "
"across all model repositories at any time, the model will not be "
"available."},
{OPTION_EXIT_ON_ERROR, "exit-on-error", Option::ArgBool,
"Exit the inference server if an error occurs during initialization."},
{OPTION_DISABLE_AUTO_COMPLETE_CONFIG, "disable-auto-complete-config",
Option::ArgNone,
"If set, disables the triton and backends from auto completing model "
"configuration files. Model configuration files must be provided and "
"all required "
"configuration settings must be specified."},
{OPTION_STRICT_MODEL_CONFIG, "strict-model-config", Option::ArgBool,
"DEPRECATED: If true model configuration files must be provided and all "
"required "
"configuration settings must be specified. If false the model "
"configuration may be absent or only partially specified and the "
"server will attempt to derive the missing required configuration."},
{OPTION_STRICT_READINESS, "strict-readiness", Option::ArgBool,
"If true /v2/health/ready endpoint indicates ready if the server "
"is responsive and all models are available. If false "
"/v2/health/ready endpoint indicates ready if server is responsive "
"even if some/all models are unavailable."},
#if defined(TRITON_ENABLE_HTTP)
{OPTION_ALLOW_HTTP, "allow-http", Option::ArgBool,
"Allow the server to listen for HTTP requests."},
{OPTION_HTTP_PORT, "http-port", Option::ArgInt,
"The port for the server to listen on for HTTP requests."},
{OPTION_REUSE_HTTP_PORT, "reuse-http-port", Option::ArgBool,
"Allow multiple servers to listen on the same HTTP port when every "
"server has this option set."},
{OPTION_HTTP_ADDRESS, "http-address", Option::ArgStr,
"The address for the http server to binds to."},
{OPTION_HTTP_THREAD_COUNT, "http-thread-count", Option::ArgInt,
"Number of threads handling HTTP requests."},
#endif // TRITON_ENABLE_HTTP
#if defined(TRITON_ENABLE_GRPC)
{OPTION_ALLOW_GRPC, "allow-grpc", Option::ArgBool,
"Allow the server to listen for GRPC requests."},
{OPTION_GRPC_PORT, "grpc-port", Option::ArgInt,
"The port for the server to listen on for GRPC requests."},
{OPTION_REUSE_GRPC_PORT, "reuse-grpc-port", Option::ArgBool,
"Allow multiple servers to listen on the same GRPC port when every "
"server has this option set."},
{OPTION_GRPC_ADDRESS, "grpc-address", Option::ArgStr,
"The address for the grpc server to binds to."},
{OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE,
"grpc-infer-allocation-pool-size", Option::ArgInt,
"The maximum number of inference request/response objects that remain "
"allocated for reuse. As long as the number of in-flight requests "
"doesn't exceed this value there will be no allocation/deallocation of "
"request/response objects."},
{OPTION_GRPC_USE_SSL, "grpc-use-ssl", Option::ArgBool,
"Use SSL authentication for GRPC requests. Default is false."},
{OPTION_GRPC_USE_SSL_MUTUAL, "grpc-use-ssl-mutual", Option::ArgBool,
"Use mututal SSL authentication for GRPC requests. Default is false."},
{OPTION_GRPC_SERVER_CERT, "grpc-server-cert", Option::ArgStr,
"File holding PEM-encoded server certificate. Ignored unless "
"--grpc-use-ssl is true."},
{OPTION_GRPC_SERVER_KEY, "grpc-server-key", Option::ArgStr,
"File holding PEM-encoded server key. Ignored unless "
"--grpc-use-ssl is true."},
{OPTION_GRPC_ROOT_CERT, "grpc-root-cert", Option::ArgStr,
"File holding PEM-encoded root certificate. Ignore unless "
"--grpc-use-ssl is false."},
{OPTION_GRPC_RESPONSE_COMPRESSION_LEVEL,
"grpc-infer-response-compression-level", Option::ArgStr,
"The compression level to be used while returning the infer response to "
"the peer. Allowed values are none, low, medium and high. By default, "
"compression level is selected as none."},
{OPTION_GRPC_ARG_KEEPALIVE_TIME_MS, "grpc-keepalive-time", Option::ArgInt,
"The period (in milliseconds) after which a keepalive ping is sent on "
"the transport. Default is 7200000 (2 hours)."},
{OPTION_GRPC_ARG_KEEPALIVE_TIMEOUT_MS, "grpc-keepalive-timeout",
Option::ArgInt,
"The period (in milliseconds) the sender of the keepalive ping waits "
"for an acknowledgement. If it does not receive an acknowledgment "
"within this time, it will close the connection. "
"Default is 20000 (20 seconds)."},
{OPTION_GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS,
"grpc-keepalive-permit-without-calls", Option::ArgBool,
"Allows keepalive pings to be sent even if there are no calls in flight "
"(0 : false; 1 : true). Default is 0 (false)."},
{OPTION_GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA,
"grpc-http2-max-pings-without-data", Option::ArgInt,
"The maximum number of pings that can be sent when there is no "
"data/header frame to be sent. gRPC Core will not continue sending "
"pings if we run over the limit. Setting it to 0 allows sending pings "
"without such a restriction. Default is 2."},
{OPTION_GRPC_ARG_HTTP2_MIN_RECV_PING_INTERVAL_WITHOUT_DATA_MS,
"grpc-http2-min-recv-ping-interval-without-data", Option::ArgInt,
"If there are no data/header frames being sent on the transport, this "
"channel argument on the server side controls the minimum time "
"(in milliseconds) that gRPC Core would expect between receiving "
"successive pings. If the time between successive pings is less than "
"this time, then the ping will be considered a bad ping from the peer. "
"Such a ping counts as a ‘ping strike’. Default is 300000 (5 minutes)."},
{OPTION_GRPC_ARG_HTTP2_MAX_PING_STRIKES, "grpc-http2-max-ping-strikes",
Option::ArgInt,
"Maximum number of bad pings that the server will tolerate before "
"sending an HTTP2 GOAWAY frame and closing the transport. Setting it to "
"0 allows the server to accept any number of bad pings. Default is 2."},
#endif // TRITON_ENABLE_GRPC
#if defined(TRITON_ENABLE_SAGEMAKER)
{OPTION_ALLOW_SAGEMAKER, "allow-sagemaker", Option::ArgBool,
"Allow the server to listen for Sagemaker requests. Default is false."},
{OPTION_SAGEMAKER_PORT, "sagemaker-port", Option::ArgInt,
"The port for the server to listen on for Sagemaker requests. Default "
"is 8080."},
{OPTION_SAGEMAKER_SAFE_PORT_RANGE, "sagemaker-safe-port-range",
"<integer>-<integer>",
"Set the allowed port range for endpoints other than the SageMaker "
"endpoints."},
{OPTION_SAGEMAKER_THREAD_COUNT, "sagemaker-thread-count", Option::ArgInt,
"Number of threads handling Sagemaker requests. Default is 8."},
#endif // TRITON_ENABLE_SAGEMAKER
#if defined(TRITON_ENABLE_VERTEX_AI)
{OPTION_ALLOW_VERTEX_AI, "allow-vertex-ai", Option::ArgBool,
"Allow the server to listen for Vertex AI requests. Default is true if "
"AIP_MODE=PREDICTION, false otherwise."},
{OPTION_VERTEX_AI_PORT, "vertex-ai-port", Option::ArgInt,
"The port for the server to listen on for Vertex AI requests. Default "
"is AIP_HTTP_PORT if set, 8080 otherwise."},
{OPTION_VERTEX_AI_THREAD_COUNT, "vertex-ai-thread-count", Option::ArgInt,
"Number of threads handling Vertex AI requests. Default is 8."},
{OPTION_VERTEX_AI_DEFAULT_MODEL, "vertex-ai-default-model",
Option::ArgStr,
"The name of the model to use for single-model inference requests."},
#endif // TRITON_ENABLE_VERTEX_AI
#ifdef TRITON_ENABLE_METRICS
{OPTION_ALLOW_METRICS, "allow-metrics", Option::ArgBool,
"Allow the server to provide prometheus metrics."},
{OPTION_ALLOW_GPU_METRICS, "allow-gpu-metrics", Option::ArgBool,
"Allow the server to provide GPU metrics. Ignored unless "
"--allow-metrics is true."},
{OPTION_ALLOW_CPU_METRICS, "allow-cpu-metrics", Option::ArgBool,
"Allow the server to provide CPU metrics. Ignored unless "
"--allow-metrics is true."},
{OPTION_METRICS_PORT, "metrics-port", Option::ArgInt,
"The port reporting prometheus metrics."},
{OPTION_METRICS_INTERVAL_MS, "metrics-interval-ms", Option::ArgFloat,
"Metrics will be collected once every <metrics-interval-ms> "
"milliseconds. Default is 2000 milliseconds."},
#endif // TRITON_ENABLE_METRICS
#ifdef TRITON_ENABLE_TRACING
{OPTION_TRACE_FILEPATH, "trace-file", Option::ArgStr,
"Set the file where trace output will be saved. If --trace-log-frequency"
" is also specified, this argument value will be the prefix of the files"
" to save the trace output. See --trace-log-frequency for detail."},
{OPTION_TRACE_LEVEL, "trace-level", Option::ArgStr,
"Specify a trace level. OFF to disable tracing, TIMESTAMPS to "
"trace timestamps, TENSORS to trace tensors. It may be specified "
"multiple times to trace multiple informations. Default is OFF."},
{OPTION_TRACE_RATE, "trace-rate", Option::ArgInt,
"Set the trace sampling rate. Default is 1000."},
{OPTION_TRACE_COUNT, "trace-count", Option::ArgInt,
"Set the number of traces to be sampled. If the value is -1, the number "
"of traces to be sampled will not be limited. Default is -1."},
{OPTION_TRACE_LOG_FREQUENCY, "trace-log-frequency", Option::ArgInt,
"Set the trace log frequency. If the value is 0, Triton will only log "
"the trace output to <trace-file> when shutting down. Otherwise, Triton "
"will log the trace output to <trace-file>.<idx> when it collects the "
"specified number of traces. For example, if the log frequency is 100, "
"when Triton collects the 100-th trace, it logs the traces to file "
"<trace-file>.0, and when it collects the 200-th trace, it logs the "
"101-th to the 200-th traces to file <trace-file>.1. Default is 0."},
#endif // TRITON_ENABLE_TRACING
{OPTION_MODEL_CONTROL_MODE, "model-control-mode", Option::ArgStr,
"Specify the mode for model management. Options are \"none\", \"poll\" "
"and \"explicit\". The default is \"none\". "
"For \"none\", the server will load all models in the model "
"repository(s) at startup and will not make any changes to the load "
"models after that. For \"poll\", the server will poll the model "
"repository(s) to detect changes and will load/unload models based on "
"those changes. The poll rate is controlled by 'repository-poll-secs'. "
"For \"explicit\", model load and unload is initiated by using the "
"model control APIs, and only models specified with --load-model will "
"be loaded at startup."},
{OPTION_POLL_REPO_SECS, "repository-poll-secs", Option::ArgInt,
"Interval in seconds between each poll of the model repository to check "
"for changes. Valid only when --model-control-mode=poll is "
"specified."},
{OPTION_STARTUP_MODEL, "load-model", Option::ArgStr,
"Name of the model to be loaded on server startup. It may be specified "
"multiple times to add multiple models. To load ALL models at startup, "
"specify '*' as the model name with --load-model=* as the ONLY "
"--load-model argument, this does not imply any pattern matching. "
"Specifying --load-model=* in conjunction with another --load-model "
"argument will result in error. Note that this option will only take "
"effect if --model-control-mode=explicit is true."},
// FIXME: fix the default to execution_count once RL logic is complete.
{OPTION_RATE_LIMIT, "rate-limit", Option::ArgStr,
"Specify the mode for rate limiting. Options are \"execution_count\" "
"and \"off\". The default is \"off\". For "
"\"execution_count\", the server will determine the instance using "
"configured priority and the number of time the instance has been "
"used to run inference. The inference will finally be executed once "
"the required resources are available. For \"off\", the server will "
"ignore any rate limiter config and run inference as soon as an "
"instance is ready."},
{OPTION_RATE_LIMIT_RESOURCE, "rate-limit-resource",
"<string>:<integer>:<integer>",
"The number of resources available to the server. The format of this "
"flag is --rate-limit-resource=<resource_name>:<count>:<device>. The "
"<device> is optional and if not listed will be applied to every "
"device. If the resource is specified as \"GLOBAL\" in the model "
"configuration the resource is considered shared among all the devices "
"in the system. The <device> property is ignored for such resources. "
"This flag can be specified multiple times to specify each resources "
"and their availability. By default, the max across all instances that "
"list the resource is selected as its availability. The values for this "
"flag is case-insensitive."},
{OPTION_PINNED_MEMORY_POOL_BYTE_SIZE, "pinned-memory-pool-byte-size",
Option::ArgInt,
"The total byte size that can be allocated as pinned system memory. "
"If GPU support is enabled, the server will allocate pinned system "
"memory to accelerate data transfer between host and devices until it "
"exceeds the specified byte size. If 'numa-node' is configured via "
"--host-policy, the pinned system memory of the pool size will be "
"allocated on each numa node. This option will not affect the "
"allocation conducted by the backend frameworks. Default is 256 MB."},
{OPTION_CUDA_MEMORY_POOL_BYTE_SIZE, "cuda-memory-pool-byte-size",
"<integer>:<integer>",
"The total byte size that can be allocated as CUDA memory for the GPU "
"device. If GPU support is enabled, the server will allocate CUDA "
"memory to minimize data transfer between host and devices until it "
"exceeds the specified byte size. This option will not affect the "
"allocation conducted by the backend frameworks. The argument should be "
"2 integers separated by colons in the format "
"<GPU device ID>:<pool byte size>. This option can be used multiple "
"times, but only once per GPU device. Subsequent uses will overwrite "
"previous uses for the same GPU device. Default is 64 MB."},
{OPTION_RESPONSE_CACHE_BYTE_SIZE, "response-cache-byte-size",
Option::ArgInt,
"The size in bytes to allocate for a request/response cache. When "
"non-zero, Triton allocates the requested size in CPU memory and "
"shares the cache across all inference requests and across all models. "
"For a given model to use request caching, the model must enable "
"request caching in the model configuration. By default, no model uses "
"request caching even if the request cache is enabled with the "
"--response-cache-byte-size flag. Default is 0."},
{OPTION_MIN_SUPPORTED_COMPUTE_CAPABILITY,
"min-supported-compute-capability", Option::ArgFloat,
"The minimum supported CUDA compute capability. GPUs that don't support "
"this compute capability will not be used by the server."},
{OPTION_EXIT_TIMEOUT_SECS, "exit-timeout-secs", Option::ArgInt,
"Timeout (in seconds) when exiting to wait for in-flight inferences to "
"finish. After the timeout expires the server exits even if inferences "
"are still in flight."},
{OPTION_BACKEND_DIR, "backend-directory", Option::ArgStr,
"The global directory searched for backend shared libraries. Default is "
"'/opt/tritonserver/backends'."},
{OPTION_REPOAGENT_DIR, "repoagent-directory", Option::ArgStr,
"The global directory searched for repository agent shared libraries. "
"Default is '/opt/tritonserver/repoagents'."},
{OPTION_BUFFER_MANAGER_THREAD_COUNT, "buffer-manager-thread-count",
Option::ArgInt,
"The number of threads used to accelerate copies and other operations "
"required to manage input and output tensor contents. Default is 0."},
{OPTION_MODEL_LOAD_THREAD_COUNT, "model-load-thread-count",
Option::ArgInt,
"The number of threads used to concurrently load models in "
"model repositories. Default is 2*<num_cpu_cores>."},
{OPTION_BACKEND_CONFIG, "backend-config", "<string>,<string>=<string>",
"Specify a backend-specific configuration setting. The format of this "
"flag is --backend-config=<backend_name>,<setting>=<value>. Where "
"<backend_name> is the name of the backend, such as 'tensorrt'."},
{OPTION_HOST_POLICY, "host-policy", "<string>,<string>=<string>",
"Specify a host policy setting associated with a policy name. The "
"format of this flag is --host-policy=<policy_name>,<setting>=<value>. "
"Currently supported settings are 'numa-node', 'cpu-cores'. Note that "
"'numa-node' setting will affect pinned memory pool behavior, see "
"--pinned-memory-pool for more detail."},
{
OPTION_MODEL_LOAD_GPU_LIMIT, "model-load-gpu-limit",
"<device_id>:<fraction>",
"Specify the limit on GPU memory usage as a fraction. If model loading "
"on the device is requested and the current memory usage exceeds the "
"limit, the load will be rejected. If not specified, the limit will "
"not be set."
}
};
// Verify that the enabled endpoints do not conflict with each other.
//
// Returns true (after printing a diagnostic to stderr) when two enabled
// services on the same address share a port, or when a service that declares
// an allowed port range finds another service on its address outside that
// range. Returns false when no conflict is found.
bool
CheckPortCollision()
{
  // One record per enabled endpoint:
  //   <0> service name, <1> bind address, <2> port,
  //   <3> whether this service restricts the ports of the other services,
  //   <4>/<5> inclusive lower/upper bound of that allowed range.
  using ServiceEntry =
      std::tuple<std::string, std::string, int32_t, bool, int32_t, int32_t>;
  std::vector<ServiceEntry> services;
#ifdef TRITON_ENABLE_HTTP
  if (allow_http_) {
    services.emplace_back("HTTP", http_address_, http_port_, false, -1, -1);
  }
#endif  // TRITON_ENABLE_HTTP
#ifdef TRITON_ENABLE_GRPC
  if (allow_grpc_) {
    services.emplace_back("GRPC", grpc_address_, grpc_port_, false, -1, -1);
  }
#endif  // TRITON_ENABLE_GRPC
#ifdef TRITON_ENABLE_METRICS
  if (allow_metrics_) {
    // The metrics endpoint is registered under the HTTP address.
    services.emplace_back(
        "metrics", http_address_, metrics_port_, false, -1, -1);
  }
#endif  // TRITON_ENABLE_METRICS
#ifdef TRITON_ENABLE_SAGEMAKER
  if (allow_sagemaker_) {
    services.emplace_back(
        "SageMaker", sagemaker_address_, sagemaker_port_,
        sagemaker_safe_range_set_, sagemaker_safe_range_.first,
        sagemaker_safe_range_.second);
  }
#endif  // TRITON_ENABLE_SAGEMAKER
#ifdef TRITON_ENABLE_VERTEX_AI
  if (allow_vertex_ai_) {
    services.emplace_back(
        "Vertex AI", vertex_ai_address_, vertex_ai_port_, false, -1, -1);
  }
#endif  // TRITON_ENABLE_VERTEX_AI

  const size_t service_cnt = services.size();
  for (size_t i = 0; i < service_cnt; ++i) {
    const ServiceEntry& current = services[i];
    const bool has_range = std::get<3>(current);

    // A service without a range was already compared against every earlier
    // entry, so only later entries need checking. A service with a range
    // must be checked against all the other entries.
    for (size_t j = (has_range ? 0 : i + 1); j < service_cnt; ++j) {
      if (j == i) {
        continue;
      }
      const ServiceEntry& other = services[j];

      // Only services bound to the same address are compared.
      if (std::get<1>(current) != std::get<1>(other)) {
        continue;
      }

      const int32_t other_port = std::get<2>(other);

      if (has_range) {
        const bool within_range = (other_port >= std::get<4>(current)) &&
                                  (other_port <= std::get<5>(current));
        if (!within_range) {
          std::cerr << "The server cannot listen to " << std::get<0>(other)
                    << " requests at port " << other_port
                    << ", allowed port range is [" << std::get<4>(current)
                    << ", " << std::get<5>(current) << "]" << std::endl;
          return true;
        }
      }

      if (std::get<2>(current) == other_port) {
        std::cerr << "The server cannot listen to " << std::get<0>(current)
                  << " requests "
                  << "and " << std::get<0>(other)
                  << " requests at the same address and port "
                  << std::get<1>(current) << ":" << std::get<2>(current)
                  << std::endl;
        return true;
      }
    }
  }

  return false;
}
#ifdef TRITON_ENABLE_GRPC
// Create the GRPC endpoint from the GRPC settings held in this file and
// start it.
//
// service      [out] receives the created server; reset if creation or
//              start fails.
// server, trace_manager, shm_manager  forwarded to GRPCServer::Create().
//
// Returns nullptr on success, otherwise the error from Create() or Start().
TRITONSERVER_Error*
StartGrpcService(
    std::unique_ptr<triton::server::GRPCServer>* service,
    const std::shared_ptr<TRITONSERVER_Server>& server,
    triton::server::TraceManager* trace_manager,
    const std::shared_ptr<triton::server::SharedMemoryManager>& shm_manager)
{
  TRITONSERVER_Error* status = triton::server::GRPCServer::Create(
      server, trace_manager, shm_manager, grpc_port_, reuse_grpc_port_,
      grpc_address_, grpc_use_ssl_, grpc_ssl_options_,
      grpc_infer_allocation_pool_size_, grpc_response_compression_level_,
      grpc_keepalive_options_, service);
  if (status != nullptr) {
    // Creation failed: make sure the caller is not left with a service.
    service->reset();
    return status;
  }

  status = (*service)->Start();
  if (status != nullptr) {
    // The server was created but could not be started; discard it.
    service->reset();
  }
  return status;
}
#endif // TRITON_ENABLE_GRPC
#ifdef TRITON_ENABLE_HTTP
// Create the HTTP API endpoint from the HTTP settings held in this file and
// start it.
//
// service      [out] receives the created server; reset if creation or
//              start fails.
// server, trace_manager, shm_manager  forwarded to HTTPAPIServer::Create().
//
// Returns nullptr on success, otherwise the error from Create() or Start().
TRITONSERVER_Error*
StartHttpService(
    std::unique_ptr<triton::server::HTTPServer>* service,
    const std::shared_ptr<TRITONSERVER_Server>& server,
    triton::server::TraceManager* trace_manager,
    const std::shared_ptr<triton::server::SharedMemoryManager>& shm_manager)
{
  TRITONSERVER_Error* status = triton::server::HTTPAPIServer::Create(
      server, trace_manager, shm_manager, http_port_, reuse_http_port_,
      http_address_, http_thread_cnt_, service);
  if (status != nullptr) {
    // Creation failed: make sure the caller is not left with a service.
    service->reset();
    return status;
  }

  status = (*service)->Start();
  if (status != nullptr) {
    // The server was created but could not be started; discard it.
    service->reset();
  }
  return status;
}
#endif // TRITON_ENABLE_HTTP
#ifdef TRITON_ENABLE_METRICS
// Create the metrics endpoint and start it. The endpoint is created on the
// HTTP address with the metrics port and a single HTTP thread.
//
// service  [out] receives the created server; reset if creation or start
//          fails.
// server   forwarded to HTTPMetricsServer::Create().
//
// Returns nullptr on success, otherwise the error from Create() or Start().
TRITONSERVER_Error*
StartMetricsService(
    std::unique_ptr<triton::server::HTTPServer>* service,
    const std::shared_ptr<TRITONSERVER_Server>& server)
{
  TRITONSERVER_Error* status = triton::server::HTTPMetricsServer::Create(
      server, metrics_port_, http_address_, 1 /* HTTP thread count */,
      service);
  if (status != nullptr) {
    // Creation failed: make sure the caller is not left with a service.
    service->reset();
    return status;
  }

  status = (*service)->Start();
  if (status != nullptr) {
    // The server was created but could not be started; discard it.
    service->reset();
  }
  return status;
}
#endif // TRITON_ENABLE_METRICS
#ifdef TRITON_ENABLE_SAGEMAKER
// Create the SageMaker endpoint from the SageMaker settings held in this
// file and start it.
//
// service      [out] receives the created server; reset if creation or
//              start fails.
// server, trace_manager, shm_manager  forwarded to
//              SagemakerAPIServer::Create().
//
// Returns nullptr on success, otherwise the error from Create() or Start().
TRITONSERVER_Error*
StartSagemakerService(
    std::unique_ptr<triton::server::HTTPServer>* service,
    const std::shared_ptr<TRITONSERVER_Server>& server,
    triton::server::TraceManager* trace_manager,
    const std::shared_ptr<triton::server::SharedMemoryManager>& shm_manager)
{
  TRITONSERVER_Error* status = triton::server::SagemakerAPIServer::Create(
      server, trace_manager, shm_manager, sagemaker_port_, sagemaker_address_,
      sagemaker_thread_cnt_, service);
  if (status != nullptr) {
    // Creation failed: make sure the caller is not left with a service.
    service->reset();
    return status;
  }

  status = (*service)->Start();
  if (status != nullptr) {
    // The server was created but could not be started; discard it.
    service->reset();
  }
  return status;
}
#endif // TRITON_ENABLE_SAGEMAKER
#ifdef TRITON_ENABLE_VERTEX_AI
// Create the Vertex AI endpoint from the Vertex AI settings held in this
// file and start it.
//
// service      [out] receives the created server; reset if creation or
//              start fails.
// server, trace_manager, shm_manager  forwarded to
//              VertexAiAPIServer::Create().
//
// Returns nullptr on success, otherwise the error from Create() or Start().
TRITONSERVER_Error*
StartVertexAiService(
    std::unique_ptr<triton::server::HTTPServer>* service,
    const std::shared_ptr<TRITONSERVER_Server>& server,
    triton::server::TraceManager* trace_manager,
    const std::shared_ptr<triton::server::SharedMemoryManager>& shm_manager)
{
  TRITONSERVER_Error* status = triton::server::VertexAiAPIServer::Create(
      server, trace_manager, shm_manager, vertex_ai_port_, vertex_ai_address_,
      vertex_ai_thread_cnt_, vertex_ai_default_model_, service);
  if (status != nullptr) {
    // Creation failed: make sure the caller is not left with a service.
    service->reset();
    return status;
  }

  status = (*service)->Start();
  if (status != nullptr) {
    // The server was created but could not be started; discard it.
    service->reset();
  }
  return status;
}
#endif // TRITON_ENABLE_VERTEX_AI
// Start every endpoint that is both compiled in and enabled, in the order
// GRPC, HTTP, SageMaker, Vertex AI, metrics.
//
// server, trace_manager, shm_manager  forwarded to the per-endpoint start
//     helpers (the metrics endpoint only receives 'server').
//
// Returns true when all requested endpoints started. On the first failure
// the error is logged and false is returned immediately; endpoints started
// before the failure are not stopped here.
bool
StartEndpoints(
    const std::shared_ptr<TRITONSERVER_Server>& server,
    triton::server::TraceManager* trace_manager,
    const std::shared_ptr<triton::server::SharedMemoryManager>& shm_manager)
{
#ifdef _WIN32
  // Initialize Winsock (version 2.2 requested) before any endpoint starts.
  WSADATA winsock_data;
  int winsock_status = WSAStartup(MAKEWORD(2, 2), &winsock_data);
  if (winsock_status != 0) {
    LOG_ERROR << "Error in WSAStartup " << winsock_status;
    return false;
  }
#endif

#ifdef TRITON_ENABLE_GRPC
  // GRPC endpoint, when requested.
  if (allow_grpc_) {
    TRITONSERVER_Error* grpc_err =
        StartGrpcService(&grpc_service_, server, trace_manager, shm_manager);
    if (grpc_err != nullptr) {
      LOG_TRITONSERVER_ERROR(grpc_err, "failed to start GRPC service");
      return false;
    }
  }
#endif  // TRITON_ENABLE_GRPC

#ifdef TRITON_ENABLE_HTTP
  // HTTP endpoint, when requested.
  if (allow_http_) {
    TRITONSERVER_Error* http_err =
        StartHttpService(&http_service_, server, trace_manager, shm_manager);
    if (http_err != nullptr) {
      LOG_TRITONSERVER_ERROR(http_err, "failed to start HTTP service");
      return false;
    }
  }
#endif  // TRITON_ENABLE_HTTP

#ifdef TRITON_ENABLE_SAGEMAKER
  // SageMaker endpoint, when requested.
  if (allow_sagemaker_) {
    TRITONSERVER_Error* sagemaker_err = StartSagemakerService(
        &sagemaker_service_, server, trace_manager, shm_manager);
    if (sagemaker_err != nullptr) {
      LOG_TRITONSERVER_ERROR(
          sagemaker_err, "failed to start Sagemaker service");
      return false;
    }
  }
#endif  // TRITON_ENABLE_SAGEMAKER

#ifdef TRITON_ENABLE_VERTEX_AI
  // Vertex AI endpoint, when requested.
  if (allow_vertex_ai_) {
    TRITONSERVER_Error* vertex_ai_err = StartVertexAiService(
        &vertex_ai_service_, server, trace_manager, shm_manager);
    if (vertex_ai_err != nullptr) {
      LOG_TRITONSERVER_ERROR(
          vertex_ai_err, "failed to start Vertex AI service");
      return false;
    }
  }
#endif  // TRITON_ENABLE_VERTEX_AI

#ifdef TRITON_ENABLE_METRICS
  // Metrics endpoint, when requested.
  if (allow_metrics_) {
    TRITONSERVER_Error* metrics_err =
        StartMetricsService(&metrics_service_, server);
    if (metrics_err != nullptr) {
      LOG_TRITONSERVER_ERROR(metrics_err, "failed to start Metrics service");
      return false;
    }
  }
#endif  // TRITON_ENABLE_METRICS

  return true;
}
// Stop and release every endpoint that currently exists, in the order HTTP,
// GRPC, metrics, SageMaker, Vertex AI.
//
// A failure to stop one endpoint is logged and remembered, but does not
// prevent the remaining endpoints from being stopped; each service pointer
// is reset whether or not its Stop() succeeded.
//
// Returns true only if every step (including WSACleanup on Windows)
// succeeded.
bool
StopEndpoints()
{
  bool all_stopped = true;

#ifdef TRITON_ENABLE_HTTP
  if (http_service_) {
    TRITONSERVER_Error* stop_err = http_service_->Stop();
    if (stop_err != nullptr) {
      LOG_TRITONSERVER_ERROR(stop_err, "failed to stop HTTP service");
      all_stopped = false;
    }
    http_service_.reset();
  }
#endif  // TRITON_ENABLE_HTTP

#ifdef TRITON_ENABLE_GRPC
  if (grpc_service_) {
    TRITONSERVER_Error* stop_err = grpc_service_->Stop();
    if (stop_err != nullptr) {
      LOG_TRITONSERVER_ERROR(stop_err, "failed to stop GRPC service");
      all_stopped = false;
    }
    grpc_service_.reset();
  }
#endif  // TRITON_ENABLE_GRPC

#ifdef TRITON_ENABLE_METRICS
  if (metrics_service_) {
    TRITONSERVER_Error* stop_err = metrics_service_->Stop();
    if (stop_err != nullptr) {
      LOG_TRITONSERVER_ERROR(stop_err, "failed to stop Metrics service");
      all_stopped = false;
    }
    metrics_service_.reset();
  }
#endif  // TRITON_ENABLE_METRICS

#ifdef TRITON_ENABLE_SAGEMAKER
  if (sagemaker_service_) {
    TRITONSERVER_Error* stop_err = sagemaker_service_->Stop();
    if (stop_err != nullptr) {
      LOG_TRITONSERVER_ERROR(stop_err, "failed to stop Sagemaker service");
      all_stopped = false;
    }
    sagemaker_service_.reset();
  }
#endif  // TRITON_ENABLE_SAGEMAKER

#ifdef TRITON_ENABLE_VERTEX_AI
  if (vertex_ai_service_) {
    TRITONSERVER_Error* stop_err = vertex_ai_service_->Stop();
    if (stop_err != nullptr) {
      LOG_TRITONSERVER_ERROR(stop_err, "failed to stop Vertex AI service");
      all_stopped = false;
    }
    vertex_ai_service_.reset();
  }
#endif  // TRITON_ENABLE_VERTEX_AI

#ifdef _WIN32
  // Release Winsock once the endpoints above have been handled.
  int winsock_status = WSACleanup();
  if (winsock_status != 0) {
    LOG_ERROR << "Error in WSACleanup " << winsock_status;
    all_stopped = false;
  }
#endif

  return all_stopped;
}
bool