I set up a multi-model endpoint in SageMaker using NVIDIA Triton. Sample code below.
model.py
import json
import os
import sys
from pathlib import Path

import numpy as np
import torch
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def initialize(self, args):
        # Model loading elided in the original.
        ....

    def execute(self, requests):
        responses = []
        for request in requests:
            # The tensor name must match the "name" declared in config.pbtxt.
            input_ids = pb_utils.get_input_tensor_by_name(request, "INPUT")
            input_ids = input_ids.as_numpy()
            input_ids = torch.as_tensor(input_ids).long().cuda()
            inputs = {'input_ids': input_ids}
            translation = self.model.generate(**inputs, num_beams=1)
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[
                    pb_utils.Tensor(
                        "OUTPUT",  # must match the output name in config.pbtxt
                        ....
                    )
                ]
            )
            responses.append(inference_response)
        return responses
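
For reference, the output packing elided above typically looks like the sketch below for a TYPE_STRING output. Triton's Python backend represents string tensors as numpy arrays of dtype object holding bytes. Note that self.tokenizer is an assumption here (loaded in initialize alongside the model), and the array shape must line up with the dims declared in config.pbtxt.

# Hedged sketch of the elided output packing; self.tokenizer is hypothetical.
decoded = self.tokenizer.batch_decode(translation, skip_special_tokens=True)
# TYPE_STRING tensors are numpy object arrays of UTF-8 encoded bytes;
# reshape to [batch, 1] so each batch item carries one string.
output_array = np.array([t.encode("utf-8") for t in decoded], dtype=object).reshape(-1, 1)
output_tensor = pb_utils.Tensor("OUTPUT", output_array)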
config.pbtxt
name: "somename"
backend: "python"
max_batch_size: 16

input [
  {
    name: "INPUT"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]

output [
  {
    name: "OUTPUT"
    data_type: TYPE_STRING
    ...
  }
]
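
To round out the example, here is a hedged sketch of invoking one model on the multi-model endpoint from a client. The endpoint name and model archive name are placeholders; the body follows the KServe v2 inference JSON that Triton accepts, and the exact ContentType can vary with the container version.

import json

import boto3

runtime = boto3.client("sagemaker-runtime")

# KServe v2 style request body; names, datatypes, and shapes
# must match the input declared in config.pbtxt.
payload = {
    "inputs": [
        {
            "name": "INPUT",
            "shape": [1, 6],
            "datatype": "INT32",
            "data": [101, 2023, 2003, 1037, 3231, 102],
        }
    ]
}

response = runtime.invoke_endpoint(
    EndpointName="my-triton-mme-endpoint",  # placeholder endpoint name
    ContentType="application/octet-stream",
    TargetModel="somename.tar.gz",          # selects the model archive on the MME
    Body=json.dumps(payload),
)
print(json.loads(response["Body"].read()))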