Problem description:
The aim is to create an "end-to-end ONNX model with integrated tokenization and decoding".
This involves combining all components (the tokenizer, the language model, e.g. GPT-2, and the decoder) into a single ONNX file.
Goal: a single .onnx file that takes raw text as input and produces readable text as output.
Here is what I have:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import onnxruntime as rt
from skl2onnx import convert_sklearn
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from skl2onnx.common.data_types import StringTensorType
from skl2onnx import to_onnx
model_name = "spital/gpt2-small-czech-cs"
file_name = "results/v3/model/spital-gpt2-small-czech-cs.onnx"
def Tokenizer(input_text):
    # Loading the tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # Setting the maximum sequence length
    tokenizer.model_max_length = 1024
    # PyTorch tensor text tokenization
    output = tokenizer.encode(input_text, return_tensors='pt')
    return output
tokenizer_transformer = FunctionTransformer(Tokenizer)
# Testing the input tokenizer
input_text = "Téma: Umělá inteligence v moderní společnosti."
tokenized_output = tokenizer_transformer.transform(input_text)
print(tokenized_output)
def Model(tokenized_input):
    # Loading the tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # Loading the model
    model = GPT2LMHeadModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)
    # Prediction
    output = model.generate(tokenized_input, max_length=150, num_return_sequences=1, no_repeat_ngram_size=2, early_stopping=True)
    return output
model_transformer = FunctionTransformer(Model)
# Testing the model prediction
predicted_output = model_transformer.transform(tokenized_output)
print(predicted_output)
def Decoder(predicted_input):
    # Loading the tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # Setting the maximum sequence length
    tokenizer.model_max_length = 1024
    # Decoding tokens into readable text
    output = tokenizer.decode(predicted_input[0], skip_special_tokens=True)
    return output
decoder_transformer = FunctionTransformer(Decoder)
# Decoding testing
decoded_output = decoder_transformer.transform(predicted_output)
print(decoded_output)
pipeline = Pipeline([
    ('tokenizer', tokenizer_transformer),
    ('model', model_transformer),
    ('decoder', decoder_transformer)
])
# Pipeline testing
input_text = "Téma: Umělá inteligence v moderní společnosti."
output_text = pipeline.transform(input_text)
print(output_text)
# Defining input and output shapes
initial_type = [("input", StringTensorType([1]))]
final_type = [("output", StringTensorType([1]))]
# Pipeline conversion to ONNX format
onnx_model = convert_sklearn(pipeline, initial_types=initial_type, final_types=final_type, target_opset=18, options={'zipmap': False})
The problem is that sklearn.preprocessing.FunctionTransformer does not support converting the wrapped PyTorch model to ONNX. It only supports running the PyTorch transformers as a plain Python script, which is what you can see and test above.
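To get past that, the model itself has to be exported with a PyTorch-aware tool instead of skl2onnx. A minimal sketch, assuming the optimum package is installed (roughly the Python counterpart of the optimum-cli call used further down; the exact export options differ):

from optimum.onnxruntime import ORTModelForCausalLM

# Export the PyTorch checkpoint to ONNX on the fly and save it next to its config
# (the exact output file names depend on the optimum version)
ort_model = ORTModelForCausalLM.from_pretrained("spital/gpt2-small-czech-cs", export=True)
ort_model.save_pretrained("results/v5/model/")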
I have made some progress:
import torch
import onnxruntime_extensions
import onnx
import onnxruntime as ort
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import subprocess
model_name = "spital/gpt2-small-czech-cs"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
input_text = "Téma: Umělá inteligence v moderní společnosti."
# Export the tokenizers to ONNX using gen_processing_models
onnx_tokenizer_coder_path = "results/v5/model/tokenizer_coder.onnx"
onnx_tokenizer_decoder_path = "results/v5/model/tokenizer_decoder.onnx"
# Generate the tokenizers ONNX model
gen_tokenizer_coder_onnx_model = onnxruntime_extensions.gen_processing_models(tokenizer, pre_kwargs={})[0]
gen_tokenizer_decoder_onnx_model = onnxruntime_extensions.gen_processing_models(tokenizer, post_kwargs={})[1]
# Save the tokenizers ONNX model
with open(onnx_tokenizer_coder_path, "wb") as f:
    f.write(gen_tokenizer_coder_onnx_model.SerializeToString())
with open(onnx_tokenizer_decoder_path, "wb") as f:
    f.write(gen_tokenizer_decoder_onnx_model.SerializeToString())
# Export the Huggingface model to ONNX
onnx_model_path = "results/v5/model/"
# Export the model to ONNX
command = [
    "optimum-cli", "export", "onnx",
    "-m", model_name,
    "--opset", "18",
    "--monolith",
    "--task", "text-generation",
    onnx_model_path
]
subprocess.run(command, check=True)
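# optimum-cli writes the exported network together with its config and tokenizer files
# into onnx_model_path; the exported graph is loaded below as model.onnx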
# Adding position_ids for tokenizer coder for model
add_tokenizer_coder_onnx_model = onnx.load(onnx_tokenizer_coder_path)
shape_node = onnx.helper.make_node(
    "Shape",
    inputs=["input_ids"],
    outputs=["input_shape"]
)
gather_node = onnx.helper.make_node(
    "Gather",
    inputs=["input_shape", "one"],
    outputs=["sequence_length"],
    axis=0
)
cast_node = onnx.helper.make_node(
    "Cast",
    inputs=["sequence_length"],
    outputs=["sequence_length_int"],
    to=onnx.TensorProto.INT64
)
# Creating position_ids node for tokenizer coder for model
position_ids_node = onnx.helper.make_node(
    "Range",
    inputs=["zero", "sequence_length_int", "one"],
    outputs=["shorter_position_ids"]
)
zero_const = onnx.helper.make_tensor("zero", onnx.TensorProto.INT64, [1], [0])
one_const = onnx.helper.make_tensor("one", onnx.TensorProto.INT64, [1], [1])
position_ids_output = onnx.helper.make_tensor_value_info(
    "position_ids",
    onnx.TensorProto.INT64,
    ["sequence_length"]
)
unsqueeze_axes = onnx.helper.make_tensor(
    "unsqueeze_axes",
    onnx.TensorProto.INT64,
    dims=[1],
    vals=[0]
)
expand_node = onnx.helper.make_node(
    "Unsqueeze",
    inputs=["shorter_position_ids", "unsqueeze_axes"],
    outputs=["position_ids"]
)
expanded_position_ids_output = onnx.helper.make_tensor_value_info(
    "position_ids",
    onnx.TensorProto.INT64,
    ["batch_size", "sequence_length"]
)
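# Net effect of the nodes above: Shape reads the (batch_size, sequence_length) shape of input_ids,
# Gather/Cast extract the sequence length, Range builds [0, 1, ..., sequence_length - 1],
# and Unsqueeze adds a leading batch axis, e.g. position_ids == [[0, 1, 2, 3, 4]] for five tokens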
# Adding position_ids to outputs of tokenizer coder for model
add_tokenizer_coder_onnx_model.graph.node.extend([shape_node, gather_node, cast_node, position_ids_node, expand_node])
add_tokenizer_coder_onnx_model.graph.output.append(expanded_position_ids_output)
add_tokenizer_coder_onnx_model.graph.initializer.extend([zero_const, one_const, unsqueeze_axes])
# Export tokenizer coder with position_ids for model
onnx.save(add_tokenizer_coder_onnx_model, onnx_tokenizer_coder_path)
# Adding operation ArgMax node to transfer logits -> ids
onnx_argmax_model_path = "results/v5/model/argmax.onnx"
ArgMax_node = onnx.helper.make_node(
    "ArgMax",
    inputs=["logits"],
    outputs=["ids"],
    axis=-1,
    keepdims=0
)
# Creating ArgMax graph
ArgMax_graph = onnx.helper.make_graph(
    [ArgMax_node],
    "ArgMaxGraph",
    [onnx.helper.make_tensor_value_info("logits", onnx.TensorProto.FLOAT, ["batch_size", "sequence_length", "vocab_size"])],
    [onnx.helper.make_tensor_value_info("ids", onnx.TensorProto.INT64, ["batch_size", "sequence_length"])]
)
# Creating ArgMax ONNX model
gen_ArgMax_onnx_model = onnx.helper.make_model(ArgMax_graph)
# Exporting ArgMax ONNX model
onnx.save(gen_ArgMax_onnx_model, onnx_argmax_model_path)
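# Quick standalone sanity check of the ArgMax graph (illustrative sketch; the vocabulary
# size below is just a placeholder that matches the dynamic "vocab_size" dimension)
argmax_check = ort.InferenceSession(onnx_argmax_model_path, providers=["CPUExecutionProvider"])
dummy_logits = np.random.rand(1, 4, 1000).astype(np.float32)
assert np.array_equal(argmax_check.run(None, {"logits": dummy_logits})[0], np.argmax(dummy_logits, axis=-1))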
# Adding shape for Tokenizer decoder outputs (Assuming shape with batch_size and sequence_length)
add_tokenizer_decoder_onnx_model = onnx.load(onnx_tokenizer_decoder_path)
expanded_shape = onnx.helper.make_tensor_value_info(
    "str",
    onnx.TensorProto.STRING,
    ["batch_size", "sequence_length"]
)
# Adding shape to Tokenizer decoder outputs
output_tensor = add_tokenizer_decoder_onnx_model.graph.output[0]
output_tensor.type.tensor_type.shape.dim.clear()
output_tensor.type.tensor_type.shape.dim.extend(expanded_shape.type.tensor_type.shape.dim)
# Exporting Tokenizer decoder with shape ONNX model
onnx.save(add_tokenizer_decoder_onnx_model, onnx_tokenizer_decoder_path)
# Test Tokenizer coder, Model, ArgMax, Tokenizer decoder using an Inference session with ONNX Runtime Extensions before merging
# Test the tokenizers ONNX model
# Initialize ONNX Runtime SessionOptions and load custom ops library
sess_options = ort.SessionOptions()
sess_options.register_custom_ops_library(onnxruntime_extensions.get_library_path())
# Initialize ONNX Runtime Inference session with Extensions
coder = ort.InferenceSession(onnx_tokenizer_coder_path, sess_options=sess_options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
model = ort.InferenceSession(onnx_model_path + "model.onnx", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
ArgMax = ort.InferenceSession(onnx_argmax_model_path, sess_options=sess_options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
decoder = ort.InferenceSession(onnx_tokenizer_decoder_path, sess_options=sess_options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
# Prepare dummy input text
input_feed = {"input_text": np.asarray([input_text])} # Assuming "input_text" is the input expected by the tokenizers
# Run the tokenizer coder
tokenized = coder.run(None, input_feed)
print("Tokenized:", tokenized)
# Run the model
model_output = model.run(None, {"input_ids": tokenized[0], "attention_mask": tokenized[1], "position_ids": tokenized[2]})
print("Model output (logits):", model_output[0])
# Run the ArgMax
argmax_output = ArgMax.run(None, {"logits": model_output[0]})
print("ArgMax output (token ids):", argmax_output[0])
# Run the tokenizer decoder
detokenized = decoder.run(None, input_feed={"ids": argmax_output[0]})
print("Detokenized:", detokenized)
# Merge the tokenizer and model ONNX files into one
onnx_combined_model_path = "results/v5/model/combined_model_tokenizer.onnx"
# Load the tokenizers and model ONNX files
tokenizer_coder_onnx_model = onnx.load(onnx_tokenizer_coder_path)
model_onnx_model = onnx.load(onnx_model_path + "model.onnx")
ArgMax_onnx_model = onnx.load(onnx_argmax_model_path)
tokenizer_decoder_onnx_model = onnx.load(onnx_tokenizer_decoder_path)
# Inspect the ONNX models to find the correct input/output names
print("\nTokenizer coder Model Inputs:", [node.name for node in tokenizer_coder_onnx_model.graph.input])
print("Tokenizer coder Model Outputs:", [node.name for node in tokenizer_coder_onnx_model.graph.output])
print("Tokenizer coder Model Shape:", [node.type.tensor_type.shape for node in tokenizer_coder_onnx_model.graph.output])
print("Tokenizer coder Model Type:", [node.type.tensor_type.elem_type for node in tokenizer_coder_onnx_model.graph.output])
print("\nModel Inputs:", [node.name for node in model_onnx_model.graph.input])
print("Model Outputs:", [node.name for node in model_onnx_model.graph.output])
print("Model Shape:", [node.type.tensor_type.shape for node in model_onnx_model.graph.output])
print("Model Type:", [node.type.tensor_type.elem_type for node in model_onnx_model.graph.output])
print("\nArgMax Inputs:", [node.name for node in ArgMax_onnx_model.graph.input])
print("ArgMax Outputs:", [node.name for node in ArgMax_onnx_model.graph.output])
print("ArgMax Shape:", [node.type.tensor_type.shape for node in ArgMax_onnx_model.graph.output])
print("ArgMax Type:", [node.type.tensor_type.elem_type for node in ArgMax_onnx_model.graph.output])
print("\nTokenizer decoder Model Inputs:", [node.name for node in tokenizer_decoder_onnx_model.graph.input])
print("Tokenizer decoder Model Outputs:", [node.name for node in tokenizer_decoder_onnx_model.graph.output])
print("Tokenizer decoder Model Shape:", [node.type.tensor_type.shape for node in tokenizer_decoder_onnx_model.graph.output])
print("Tokenizer decoder Model Type:", [node.type.tensor_type.elem_type for node in tokenizer_decoder_onnx_model.graph.output])
# Merge the tokenizer coder and model ONNX files
combined_model = onnx.compose.merge_models(
    tokenizer_coder_onnx_model,
    model_onnx_model,
    io_map=[('input_ids', 'input_ids'), ('attention_mask', 'attention_mask'), ('position_ids', 'position_ids')]
)
# Merge the model and ArgMax ONNX files
combined_model = onnx.compose.merge_models(
    combined_model,
    ArgMax_onnx_model,
    io_map=[('logits', 'logits')]
)
# Merge the ArgMax and tokenizer decoder ONNX files
combined_model = onnx.compose.merge_models(
    combined_model,
    tokenizer_decoder_onnx_model,
    io_map=[('ids', 'ids')]
)
# Check combined ONNX model
inferred_model = onnx.shape_inference.infer_shapes(combined_model)
onnx.checker.check_model(inferred_model)
# Save the combined model
onnx.save(combined_model, onnx_combined_model_path)
# Test the combined ONNX model using an Inference session with ONNX Runtime Extensions
# Initialize ONNX Runtime SessionOptions and load custom ops library
sess_options = ort.SessionOptions()
sess_options.register_custom_ops_library(onnxruntime_extensions.get_library_path())
# Initialize ONNX Runtime Inference session with Extensions
session = ort.InferenceSession(onnx_combined_model_path, sess_options=sess_options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
# Prepare dummy input text
input_feed = {"input_text": np.asarray([input_text])} # Assuming "input_text" is the input expected by the tokenizer
# Run the model
outputs = session.run(None, input_feed)
# Print the outputs
print("logits:", outputs)
This is an example of a GPT-2 ONNX model, together with the tests.