Skip to content

Commit

Permalink
Capitalization-agnostic tokenizer lookup
Browse files (browse the repository at this point in the history)
  • Loading branch information
ashvardanian committed Apr 11, 2024
1 parent 4060e8f commit 9ef46a5
Showing 1 changed file with 12 additions and 3 deletions.
15 changes: 12 additions & 3 deletions Sources/Tokenizers/Tokenizer.swift
Original file line number | Diff line number | Diff line change
Expand Up @@ -86,11 +86,20 @@ struct TokenizerModel {
if tokenizerName.hasSuffix("Tokenizer") {
tokenizerName = String(tokenizerName.dropLast("Tokenizer".count))
}
guard let tokenizerClass = TokenizerModel.knownTokenizers[tokenizerName] else {
throw TokenizerError.unsupportedTokenizer(tokenizerName)

// Try to perform a direct case-sensitive lookup first
if let tokenizerClass = TokenizerModel.knownTokenizers[tokenizerName] {
return try tokenizerClass.init(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens)
} else {
// If the direct lookup fails, perform a case-insensitive scan over the keys
if let key = TokenizerModel.knownTokenizers.keys.first(where: { $0.lowercased() == tokenizerName.lowercased() }) {
if let tokenizerClass = TokenizerModel.knownTokenizers[key] {
return try tokenizerClass.init(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens)
}
}
}

return try tokenizerClass.init(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens)
throw TokenizerError.unsupportedTokenizer(tokenizerName)
}
}

Expand Down

0 comments on commit 9ef46a5

Please sign in to comment.