```
# HELP nv_inference_request_success Number of successful inference requests, all batch sizes
# TYPE nv_inference_request_success counter
nv_inference_request_success{model="summary",version="1"} 2355
nv_inference_request_success{model="tensorrt_llm",version="1"} 2355
nv_inference_request_success{model="preprocessing",version="1"} 2356
nv_inference_request_success{model="postprocessing",version="1"} 13427
# HELP nv_inference_request_failure Number of failed inference requests, all batch sizes
# TYPE nv_inference_request_failure counter
nv_inference_request_failure{model="summary",version="1"} 0
nv_inference_request_failure{model="tensorrt_llm",version="1"} 0
nv_inference_request_failure{model="preprocessing",version="1"} 0
nv_inference_request_failure{model="postprocessing",version="1"} 0
# HELP nv_inference_count Number of inferences performed (does not include cached requests)
# TYPE nv_inference_count counter
nv_inference_count{model="summary",version="1"} 2355
nv_inference_count{model="tensorrt_llm",version="1"} 2355
nv_inference_count{model="preprocessing",version="1"} 2356
nv_inference_count{model="postprocessing",version="1"} 13427
# HELP nv_inference_exec_count Number of model executions performed (does not include cached requests)
# TYPE nv_inference_exec_count counter
nv_inference_exec_count{model="summary",version="1"} 2355
nv_inference_exec_count{model="tensorrt_llm",version="1"} 2355
nv_inference_exec_count{model="preprocessing",version="1"} 2356
nv_inference_exec_count{model="postprocessing",version="1"} 13427
# HELP nv_inference_request_duration_us Cumulative inference request duration in microseconds (includes cached requests)
# TYPE nv_inference_request_duration_us counter
nv_inference_request_duration_us{model="summary",version="1"} 33235960598
nv_inference_request_duration_us{model="tensorrt_llm",version="1"} 33191093214
nv_inference_request_duration_us{model="preprocessing",version="1"} 33614085
nv_inference_request_duration_us{model="postprocessing",version="1"} 30493621
# HELP nv_inference_queue_duration_us Cumulative inference queuing duration in microseconds (includes cached requests)
# TYPE nv_inference_queue_duration_us counter
nv_inference_queue_duration_us{model="summary",version="1"} 7990
nv_inference_queue_duration_us{model="tensorrt_llm",version="1"} 614015
nv_inference_queue_duration_us{model="preprocessing",version="1"} 278954
nv_inference_queue_duration_us{model="postprocessing",version="1"} 1195897
# HELP nv_inference_compute_input_duration_us Cumulative compute input duration in microseconds (does not include cached requests)
# TYPE nv_inference_compute_input_duration_us counter
nv_inference_compute_input_duration_us{model="summary",version="1"} 2539167
nv_inference_compute_input_duration_us{model="tensorrt_llm",version="1"} 985401
nv_inference_compute_input_duration_us{model="preprocessing",version="1"} 265443
nv_inference_compute_input_duration_us{model="postprocessing",version="1"} 1280574
# HELP nv_inference_compute_infer_duration_us Cumulative compute inference duration in microseconds (does not include cached requests)
# TYPE nv_inference_compute_infer_duration_us counter
nv_inference_compute_infer_duration_us{model="summary",version="1"} 33249358059
nv_inference_compute_infer_duration_us{model="tensorrt_llm",version="1"} 33188449847
nv_inference_compute_infer_duration_us{model="preprocessing",version="1"} 33039845
nv_inference_compute_infer_duration_us{model="postprocessing",version="1"} 27890478
# HELP nv_inference_compute_output_duration_us Cumulative inference compute output duration in microseconds (does not include cached requests)
# TYPE nv_inference_compute_output_duration_us counter
nv_inference_compute_output_duration_us{model="summary",version="1"} 1144944
nv_inference_compute_output_duration_us{model="tensorrt_llm",version="1"} 1030758
nv_inference_compute_output_duration_us{model="preprocessing",version="1"} 21282
nv_inference_compute_output_duration_us{model="postprocessing",version="1"} 85403
# HELP nv_energy_consumption GPU energy consumption in joules since the Triton Server started
# TYPE nv_energy_consumption counter
nv_energy_consumption{gpu_uuid="GPU-617c6832-5b55-47b8-2e7a-9ff8adcbff34"} 3087619.42200001
nv_energy_consumption{gpu_uuid="GPU-9c4a3fa3-02b6-5a62-6c82-4d3d65861d4c"} 3122454.891000009
nv_energy_consumption{gpu_uuid="GPU-f97345b9-1d30-9a3a-ee2e-dfbc1df9f922"} 3092813.577000006
nv_energy_consumption{gpu_uuid="GPU-754ba16b-d6de-7944-9271-b1b78c4535e6"} 3148481.908999999
nv_energy_consumption{gpu_uuid="GPU-8046ed8f-f18d-30d3-189f-8c270aaed5bc"} 3163774.31600001
nv_energy_consumption{gpu_uuid="GPU-6b156c3d-ab98-5e3d-44e6-97e92a92edbc"} 3109817.071999999
nv_energy_consumption{gpu_uuid="GPU-6db5b1e9-678b-b785-b06b-54f6a2a210dd"} 3114188.709999997
nv_energy_consumption{gpu_uuid="GPU-f653b45e-4bcb-5643-4db1-ff9103bf7fda"} 3092847.051999995
# HELP nv_inference_pending_request_count Instantaneous number of pending requests awaiting execution per-model.
# TYPE nv_inference_pending_request_count gauge
nv_inference_pending_request_count{model="summary",version="1"} 0
nv_inference_pending_request_count{model="tensorrt_llm",version="1"} 0
nv_inference_pending_request_count{model="preprocessing",version="1"} 0
nv_inference_pending_request_count{model="postprocessing",version="1"} 0
# HELP nv_pinned_memory_pool_total_bytes Pinned memory pool total memory size, in bytes
# TYPE nv_pinned_memory_pool_total_bytes gauge
nv_pinned_memory_pool_total_bytes 268435456
# HELP nv_pinned_memory_pool_used_bytes Pinned memory pool used memory size, in bytes
# TYPE nv_pinned_memory_pool_used_bytes gauge
nv_pinned_memory_pool_used_bytes 38916
# HELP nv_gpu_utilization GPU utilization rate [0.0 - 1.0)
# TYPE nv_gpu_utilization gauge
nv_gpu_utilization{gpu_uuid="GPU-617c6832-5b55-47b8-2e7a-9ff8adcbff34"} 0.99
nv_gpu_utilization{gpu_uuid="GPU-9c4a3fa3-02b6-5a62-6c82-4d3d65861d4c"} 0.99
nv_gpu_utilization{gpu_uuid="GPU-f97345b9-1d30-9a3a-ee2e-dfbc1df9f922"} 0.99
nv_gpu_utilization{gpu_uuid="GPU-754ba16b-d6de-7944-9271-b1b78c4535e6"} 0.99
nv_gpu_utilization{gpu_uuid="GPU-8046ed8f-f18d-30d3-189f-8c270aaed5bc"} 0.99
nv_gpu_utilization{gpu_uuid="GPU-6b156c3d-ab98-5e3d-44e6-97e92a92edbc"} 0.99
nv_gpu_utilization{gpu_uuid="GPU-6db5b1e9-678b-b785-b06b-54f6a2a210dd"} 0.99
nv_gpu_utilization{gpu_uuid="GPU-f653b45e-4bcb-5643-4db1-ff9103bf7fda"} 0.99
# HELP nv_gpu_memory_total_bytes GPU total memory, in bytes
# TYPE nv_gpu_memory_total_bytes gauge
nv_gpu_memory_total_bytes{gpu_uuid="GPU-617c6832-5b55-47b8-2e7a-9ff8adcbff34"} 85520809984
nv_gpu_memory_total_bytes{gpu_uuid="GPU-9c4a3fa3-02b6-5a62-6c82-4d3d65861d4c"} 85520809984
nv_gpu_memory_total_bytes{gpu_uuid="GPU-f97345b9-1d30-9a3a-ee2e-dfbc1df9f922"} 85520809984
nv_gpu_memory_total_bytes{gpu_uuid="GPU-754ba16b-d6de-7944-9271-b1b78c4535e6"} 85520809984
nv_gpu_memory_total_bytes{gpu_uuid="GPU-8046ed8f-f18d-30d3-189f-8c270aaed5bc"} 85520809984
nv_gpu_memory_total_bytes{gpu_uuid="GPU-6b156c3d-ab98-5e3d-44e6-97e92a92edbc"} 85520809984
nv_gpu_memory_total_bytes{gpu_uuid="GPU-6db5b1e9-678b-b785-b06b-54f6a2a210dd"} 85520809984 nv_gpu_memory_total_bytes{gpu_uuid="GPU-f653b45e-4bcb-5643-4db1-ff9103bf7fda"} 85520809984 # HELP nv_gpu_memory_used_bytes GPU used memory, in bytes # TYPE nv_gpu_memory_used_bytes gauge nv_gpu_memory_used_bytes{gpu_uuid="GPU-617c6832-5b55-47b8-2e7a-9ff8adcbff34"} 84619034624 nv_gpu_memory_used_bytes{gpu_uuid="GPU-9c4a3fa3-02b6-5a62-6c82-4d3d65861d4c"} 84870692864 nv_gpu_memory_used_bytes{gpu_uuid="GPU-f97345b9-1d30-9a3a-ee2e-dfbc1df9f922"} 84868595712 nv_gpu_memory_used_bytes{gpu_uuid="GPU-754ba16b-d6de-7944-9271-b1b78c4535e6"} 84868595712 nv_gpu_memory_used_bytes{gpu_uuid="GPU-8046ed8f-f18d-30d3-189f-8c270aaed5bc"} 84874887168 nv_gpu_memory_used_bytes{gpu_uuid="GPU-6b156c3d-ab98-5e3d-44e6-97e92a92edbc"} 84870692864 nv_gpu_memory_used_bytes{gpu_uuid="GPU-6db5b1e9-678b-b785-b06b-54f6a2a210dd"} 84868595712 nv_gpu_memory_used_bytes{gpu_uuid="GPU-f653b45e-4bcb-5643-4db1-ff9103bf7fda"} 84820361216 # HELP nv_gpu_power_usage GPU power usage in watts # TYPE nv_gpu_power_usage gauge nv_gpu_power_usage{gpu_uuid="GPU-617c6832-5b55-47b8-2e7a-9ff8adcbff34"} 262.727 nv_gpu_power_usage{gpu_uuid="GPU-9c4a3fa3-02b6-5a62-6c82-4d3d65861d4c"} 263.187 nv_gpu_power_usage{gpu_uuid="GPU-f97345b9-1d30-9a3a-ee2e-dfbc1df9f922"} 263.036 nv_gpu_power_usage{gpu_uuid="GPU-754ba16b-d6de-7944-9271-b1b78c4535e6"} 264.093 nv_gpu_power_usage{gpu_uuid="GPU-8046ed8f-f18d-30d3-189f-8c270aaed5bc"} 267.209 nv_gpu_power_usage{gpu_uuid="GPU-6b156c3d-ab98-5e3d-44e6-97e92a92edbc"} 263.736 nv_gpu_power_usage{gpu_uuid="GPU-6db5b1e9-678b-b785-b06b-54f6a2a210dd"} 264.944 nv_gpu_power_usage{gpu_uuid="GPU-f653b45e-4bcb-5643-4db1-ff9103bf7fda"} 259.225 # HELP nv_gpu_power_limit GPU power management limit in watts # TYPE nv_gpu_power_limit gauge nv_gpu_power_limit{gpu_uuid="GPU-617c6832-5b55-47b8-2e7a-9ff8adcbff34"} 700 nv_gpu_power_limit{gpu_uuid="GPU-9c4a3fa3-02b6-5a62-6c82-4d3d65861d4c"} 700 nv_gpu_power_limit{gpu_uuid="GPU-f97345b9-1d30-9a3a-ee2e-dfbc1df9f922"} 700 nv_gpu_power_limit{gpu_uuid="GPU-754ba16b-d6de-7944-9271-b1b78c4535e6"} 700 nv_gpu_power_limit{gpu_uuid="GPU-8046ed8f-f18d-30d3-189f-8c270aaed5bc"} 700 nv_gpu_power_limit{gpu_uuid="GPU-6b156c3d-ab98-5e3d-44e6-97e92a92edbc"} 700 nv_gpu_power_limit{gpu_uuid="GPU-6db5b1e9-678b-b785-b06b-54f6a2a210dd"} 700 nv_gpu_power_limit{gpu_uuid="GPU-f653b45e-4bcb-5643-4db1-ff9103bf7fda"} 700 # HELP nv_cpu_utilization CPU utilization rate [0.0 - 1.0] # TYPE nv_cpu_utilization gauge nv_cpu_utilization 0.04245283018867924 # HELP nv_cpu_memory_total_bytes CPU total memory (RAM), in bytes # TYPE nv_cpu_memory_total_bytes gauge nv_cpu_memory_total_bytes 2164146749440 # HELP nv_cpu_memory_used_bytes CPU used memory (RAM), in bytes # TYPE nv_cpu_memory_used_bytes gauge nv_cpu_memory_used_bytes 79430180864 # HELP nv_trt_llm_request_metrics TRT LLM request metrics # TYPE nv_trt_llm_request_metrics gauge nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="context",version="1"} 1 nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="scheduled",version="1"} 1 nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="max",version="1"} 8 nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="active",version="1"} 1 # HELP nv_trt_llm_runtime_memory_metrics TRT LLM runtime memory metrics # TYPE nv_trt_llm_runtime_memory_metrics gauge nv_trt_llm_runtime_memory_metrics{memory_type="pinned",model="tensorrt_llm",version="1"} 1077115308 
nv_trt_llm_runtime_memory_metrics{memory_type="gpu",model="tensorrt_llm",version="1"} 40057073160 nv_trt_llm_runtime_memory_metrics{memory_type="cpu",model="tensorrt_llm",version="1"} 35572 # HELP nv_trt_llm_kv_cache_block_metrics TRT LLM KV cache block metrics # TYPE nv_trt_llm_kv_cache_block_metrics gauge nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="tokens_per",model="tensorrt_llm",version="1"} 64 nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="used",model="tensorrt_llm",version="1"} 158 nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="free",model="tensorrt_llm",version="1"} 2271 nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="max",model="tensorrt_llm",version="1"} 2429 # HELP nv_trt_llm_inflight_batcher_metrics TRT LLM inflight_batcher-specific metrics # TYPE nv_trt_llm_inflight_batcher_metrics gauge nv_trt_llm_inflight_batcher_metrics{inflight_batcher_specific_metric="paused_requests",model="tensorrt_llm",version="1"} 0 nv_trt_llm_inflight_batcher_metrics{inflight_batcher_specific_metric="micro_batch_id",model="tensorrt_llm",version="1"} 0 nv_trt_llm_inflight_batcher_metrics{inflight_batcher_specific_metric="generation_requests",model="tensorrt_llm",version="1"} 0 nv_trt_llm_inflight_batcher_metrics{inflight_batcher_specific_metric="total_context_tokens",model="tensorrt_llm",version="1"} 10076 # HELP nv_trt_llm_general_metrics General TRT LLM metrics # TYPE nv_trt_llm_general_metrics gauge nv_trt_llm_general_metrics{general_type="iteration_counter",model="tensorrt_llm",version="1"} 188462 nv_trt_llm_general_metrics{general_type="timestamp",model="tensorrt_llm",version="1"} 1722397063
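The dump is easiest to read as ratios rather than raw counters: average end-to-end latency is `nv_inference_request_duration_us` over `nv_inference_request_success`, average batch size is `nv_inference_count` over `nv_inference_exec_count`, and KV cache pressure is the `used` block gauge over `max`. Below is a minimal Python sketch of those derivations, with values hand-copied from the scrape above; the variable names are illustrative only, and a real pipeline would diff two scrapes (or use PromQL's `rate()`) rather than lifetime totals.

```python
# Minimal sketch: turning the cumulative counters and gauges above into
# per-request averages and utilization ratios. Values are hand-copied from
# the sample scrape; variable names are illustrative, not any Triton API.

# "summary" model: average end-to-end latency per successful request
avg_latency_s = 33_235_960_598 / 2_355 / 1e6   # ~14.1 s per request

# Average batch size at the Triton level = inferences / executions.
# 1.0 here is consistent with batching happening inside the TRT-LLM
# engine (in-flight batching) rather than in Triton's dynamic batcher.
avg_batch_size = 2_355 / 2_355                 # 1.0

# "tensorrt_llm" model: average time spent queued per request
avg_queue_ms = 614_015 / 2_355 / 1e3           # ~0.26 ms, negligible vs. compute

# KV cache: block utilization and total token capacity
kv_block_util = 158 / 2_429                    # ~6.5% of blocks in use (used / max)
kv_token_capacity = 2_429 * 64                 # 155,456 tokens = max blocks * tokens per block

# Request slots: active requests vs. the max the batcher will schedule
slot_util = 1 / 8                              # 12.5%

print(f"latency {avg_latency_s:.1f}s | batch {avg_batch_size:.1f} | "
      f"queue {avg_queue_ms:.2f}ms | kv {kv_block_util:.1%} | slots {slot_util:.0%}")
```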
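Read this way, the scrape describes a server that is compute-bound rather than scheduling-bound: queueing adds fractions of a millisecond against roughly 14 seconds of generation per request, only one of eight request slots is active, and about 6.5% of KV cache blocks are in use, while `nv_gpu_utilization` sits at 0.99 on all eight GPUs.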