I load the tokenizer and BERT model from Hugging Face Transformers and export the BERT model to CoreML:

```python
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")
# Load the model
model = AutoModelForTokenClassification.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")
# Example usage
text = "Hugging Face is creating a tool that democratizes AI."
inputs = tokenizer(text, return_tensors="pt")
```

Requirements:

```
pip install transformers torch
```

How can I export the tokenizer from Hugging Face Transformers to CoreML?
Here is the BERT tokenizer I use, and it works well. Most of it comes from Zach Nagengast and Julien Chaumond. Hope it helps! All you need is the vocab.txt file with the tokenizer's vocabulary, which you can find here - https://huggingface.co/google-bert/bert-base-cased/blob/main/vocab.txt
```swift
import Foundation

enum TokenizerError: Error {
    case tooLong(String)
}

class BertTokenizer {
    private let basicTokenizer = BasicTokenizer()
    private let wordpieceTokenizer: WordpieceTokenizer
    private let maxLen = 512

    private let vocab: [String: Int]
    private let ids_to_tokens: [Int: String]

    init() {
        // Load the bundled vocab.txt: one token per line, the line index is the token id.
        let url = Bundle.main.url(forResource: "vocab", withExtension: "txt")!
        let vocabTxt = try! String(contentsOf: url)
        let tokens = vocabTxt.split(separator: "\n").map { String($0) }
        var vocab: [String: Int] = [:]
        var ids_to_tokens: [Int: String] = [:]
        for (i, token) in tokens.enumerated() {
            vocab[token] = i
            ids_to_tokens[i] = token
        }
        self.vocab = vocab
        self.ids_to_tokens = ids_to_tokens
        self.wordpieceTokenizer = WordpieceTokenizer(vocab: self.vocab)
    }
    /// Basic (whitespace/punctuation) tokenization followed by WordPiece.
    func tokenize(text: String) -> [String] {
        var tokens: [String] = []
        for token in basicTokenizer.tokenize(text: text) {
            for subToken in wordpieceTokenizer.tokenize(word: token) {
                tokens.append(subToken)
            }
        }
        return tokens
    }

    /// 1 for real tokens, 0 for [PAD] positions (token id 0).
    private func createAttentionMask(tokenIds: [Int]) -> [Int] {
        return tokenIds.map { $0 != 0 ? 1 : 0 }
    }

    private func convertTokensToIds(tokens: [String]) -> [Int] {
        if tokens.count > maxLen {
            let truncatedTokens = Array(tokens.prefix(maxLen))
            return truncatedTokens.map { vocab[$0]! }
        } else {
            return tokens.map { vocab[$0]! }
        }
    }

    /// Pads (or truncates) a sequence to exactly `length` entries.
    private func padSequence(_ sequence: [Int], toLength length: Int, paddingValue: Int = 0) -> [Int] {
        if sequence.count >= length {
            return Array(sequence.prefix(length))
        } else {
            return sequence + Array(repeating: paddingValue, count: length - sequence.count)
        }
    }

    /// Main entry point: adds [CLS]/[SEP], converts to ids, pads to `maxLength`
    /// and builds the matching attention mask.
    func tokenizeToIds(text: String, maxLength: Int = 512) -> (tokenIds: [Int], attentionMask: [Int]) {
        let tokens = ["[CLS]"] + tokenize(text: text) + ["[SEP]"]
        var tokenIds = convertTokensToIds(tokens: tokens)
        tokenIds = padSequence(tokenIds, toLength: maxLength)
        let attentionMask = createAttentionMask(tokenIds: tokenIds)
        return (tokenIds, attentionMask)
    }
    func tokenToId(token: String) -> Int {
        return vocab[token]!
    }

    /// Un-tokenization: get tokens from tokenIds
    func unTokenize(tokens: [Int]) -> [String] {
        return tokens.map { ids_to_tokens[$0]! }
    }

    /// Un-tokenization: merges "##" continuation pieces back into whole words.
    func convertWordpieceToBasicTokenList(_ wordpieceTokenList: [String]) -> String {
        var tokenList: [String] = []
        var individualToken: String = ""
        for token in wordpieceTokenList {
            if token.starts(with: "##") {
                individualToken += String(token.suffix(token.count - 2))
            } else {
                if individualToken.count > 0 {
                    tokenList.append(individualToken)
                }
                individualToken = token
            }
        }
        tokenList.append(individualToken)
        return tokenList.joined(separator: " ")
    }
}
class BasicTokenizer {
    let neverSplit = [
        "[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"
    ]

    /// Lowercases, strips diacritics, splits on whitespace, and then splits every
    /// non-alphanumeric character into its own token.
    func tokenize(text: String) -> [String] {
        let splitTokens = text.folding(options: .diacriticInsensitive, locale: nil)
            .components(separatedBy: NSCharacterSet.whitespaces)
        let tokens = splitTokens.flatMap({ (token: String) -> [String] in
            if neverSplit.contains(token) {
                return [token]
            }
            var toks: [String] = []
            var currentTok = ""
            for c in token.lowercased() {
                if c.isLetter || c.isNumber || c == "°" {
                    currentTok += String(c)
                } else if currentTok.count > 0 {
                    toks.append(currentTok)
                    toks.append(String(c))
                    currentTok = ""
                } else {
                    toks.append(String(c))
                }
            }
            if currentTok.count > 0 {
                toks.append(currentTok)
            }
            return toks
        })
        return tokens
    }
}
class WordpieceTokenizer {
    private let unkToken = "[UNK]"
    private let maxInputCharsPerWord = 100
    private let vocab: [String: Int]

    init(vocab: [String: Int]) {
        self.vocab = vocab
    }

    /// Returns the substring of `s` covered by the integer range `r`,
    /// or nil if the range falls outside the string.
    func substr(_ s: String, _ r: Range<Int>) -> String? {
        let stringCount = s.count
        if stringCount < r.upperBound || stringCount < r.lowerBound {
            return nil
        }
        let startIndex = s.index(s.startIndex, offsetBy: r.lowerBound)
        let endIndex = s.index(startIndex, offsetBy: r.upperBound - r.lowerBound)
        return String(s[startIndex..<endIndex])
    }

    /// Greedy longest-match-first WordPiece: repeatedly takes the longest prefix
    /// found in the vocabulary, prefixing continuation pieces with "##".
    func tokenize(word: String) -> [String] {
        if word.count > maxInputCharsPerWord {
            return [unkToken]
        }
        var outputTokens: [String] = []
        var isBad = false
        var start = 0
        var subTokens: [String] = []
        while start < word.count {
            var end = word.count
            var cur_substr: String? = nil
            while start < end {
                var substr = substr(word, start..<end)!
                if start > 0 {
                    substr = "##\(substr)"
                }
                if vocab[substr] != nil {
                    cur_substr = substr
                    break
                }
                end -= 1
            }
            if cur_substr == nil {
                // No prefix of the remaining characters is in the vocabulary.
                isBad = true
                break
            }
            subTokens.append(cur_substr!)
            start = end
        }
        if isBad {
            outputTokens.append(unkToken)
        } else {
            outputTokens.append(contentsOf: subTokens)
        }
        return outputTokens
    }
}
```
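If it helps, here is a rough sketch of how you might feed the tokenizer output into the exported Core ML model from Swift. The input names (`input_ids`, `attention_mask`), the 1x512 Int32 shape, and the helper names below are assumptions, not part of the tokenizer above - match them to whatever you declared when you converted the model:

```swift
import CoreML

/// Packs an [Int] into a 1 x N Int32 MLMultiArray (shape assumed by this sketch).
func makeInputArray(_ values: [Int]) throws -> MLMultiArray {
    let array = try MLMultiArray(shape: [1, NSNumber(value: values.count)], dataType: .int32)
    for (i, value) in values.enumerated() {
        array[[0, NSNumber(value: i)]] = NSNumber(value: value)
    }
    return array
}

/// Builds a feature provider; the dictionary keys must match the input names of your converted model.
func buildModelInputs(for text: String, tokenizer: BertTokenizer) throws -> MLFeatureProvider {
    let (tokenIds, attentionMask) = tokenizer.tokenizeToIds(text: text, maxLength: 512)
    return try MLDictionaryFeatureProvider(dictionary: [
        "input_ids": MLFeatureValue(multiArray: makeInputArray(tokenIds)),
        "attention_mask": MLFeatureValue(multiArray: makeInputArray(attentionMask))
    ])
}

// Usage:
// let tokenizer = BertTokenizer()
// let inputs = try buildModelInputs(for: "Hugging Face is creating a tool that democratizes AI.",
//                                   tokenizer: tokenizer)
// let output = try yourConvertedModel.prediction(from: inputs)   // yourConvertedModel: MLModel
```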