[WIP] Create embedding module #86

Draft · wants to merge 6 commits into main
6 changes: 4 additions & 2 deletions Package.swift
@@ -7,7 +7,7 @@ let package = Package(
     name: "swift-transformers",
     platforms: [.iOS(.v16), .macOS(.v13)],
     products: [
-        .library(name: "Transformers", targets: ["Tokenizers", "Generation", "Models"]),
+        .library(name: "Transformers", targets: ["Tokenizers", "Generation", "Models", "Embedding"]),
         .executable(name: "transformers", targets: ["TransformersCLI"]),
         .executable(name: "hub-cli", targets: ["HubCLI"]),
     ],
@@ -26,11 +26,13 @@ let package = Package(
         .target(name: "TensorUtils"),
         .target(name: "Generation", dependencies: ["Tokenizers", "TensorUtils"]),
         .target(name: "Models", dependencies: ["Tokenizers", "Generation", "TensorUtils"]),
+        .target(name: "Embedding", dependencies: ["Hub", "Tokenizers"]),
         .testTarget(name: "TokenizersTests", dependencies: ["Tokenizers", "Models", "Hub"], resources: [.process("Resources"), .process("Vocabs")]),
         .testTarget(name: "HubTests", dependencies: ["Hub"]),
         .testTarget(name: "PreTokenizerTests", dependencies: ["Tokenizers", "Hub"]),
         .testTarget(name: "TensorUtilsTests", dependencies: ["TensorUtils"]),
         .testTarget(name: "NormalizerTests", dependencies: ["Tokenizers", "Hub"]),
-        .testTarget(name: "PostProcessorTests", dependencies: ["Tokenizers", "Hub"])
+        .testTarget(name: "PostProcessorTests", dependencies: ["Tokenizers", "Hub"]),
+        .testTarget(name: "EmbeddingTests", dependencies: ["Embedding", "Tokenizers", "Hub", "TensorUtils"], resources: [.process("Resources"), .process("Vocabs")])
     ]
 )
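
Reviewer note: with this change, downstream packages that depend on the Transformers library product pick up the new Embedding target automatically. A minimal consumer manifest sketch (package name and branch spec are illustrative):

    // swift-tools-version:5.9 -- consumer Package.swift (sketch)
    import PackageDescription

    let package = Package(
        name: "MyApp",
        platforms: [.iOS(.v16), .macOS(.v13)],
        dependencies: [
            .package(url: "https://github.com/huggingface/swift-transformers", branch: "main"),
        ],
        targets: [
            // The "Transformers" product now re-exports the Embedding target.
            .executableTarget(name: "MyApp", dependencies: [
                .product(name: "Transformers", package: "swift-transformers"),
            ]),
        ]
    )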
154 changes: 154 additions & 0 deletions Sources/Embedding/Embedding.swift
@@ -0,0 +1,154 @@
import Hub
import Tokenizers
import CoreML
import Accelerate

/// A BERT-style input embedding built on BNNS: the sum of word, position, and
/// token-type embeddings. The normalization and dropout layers are constructed
/// but not yet applied in `callAsFunction` (this PR is a WIP).
class BERTEmbedding {

    typealias Weights = [String: MLMultiArray]

    var shape: [NSNumber] {[
        NSNumber(value: maxPositionEmbeddings),
        NSNumber(value: hiddenSize),
    ]}

    private let weights: Weights

    private let positionEmbeddingType: String
    private let hiddenSize: Int
    private let vocabSize: Int
    private let maxPositionEmbeddings: Int
    private let typeVocabSize: Int
    private let padTokenID: Int
    private let normalizationEpsilon: Float
    private let dropoutRate: Float = 1e-1
    private let hiddenActivation: BNNS.ActivationFunction = .geluApproximation2(alpha: 1e-1, beta: 1e-1)

    private var allocations: [BNNSNDArrayDescriptor] = []

    // BNNS embedding dictionaries are column-major: (hiddenSize, entryCount).
    private lazy var wordEmbedding: BNNS.EmbeddingLayer = {
        let input = BNNSNDArrayDescriptor.allocateUninitialized(scalarType: Int64.self, shape: .vector(maxPositionEmbeddings))
        allocations.append(input)
        let dictData: [Float32] = weights["bert.embeddings.word_embeddings.weight"]!.toArray()
        let dict = BNNSNDArrayDescriptor.allocate(initializingFrom: dictData, shape: .matrixColumnMajor(hiddenSize, vocabSize))
        allocations.append(dict)
        let output = BNNSNDArrayDescriptor.allocateUninitialized(scalarType: Float32.self, shape: .matrixColumnMajor(hiddenSize, maxPositionEmbeddings))
        allocations.append(output)

        return BNNS.EmbeddingLayer(input: input, output: output, dictionary: dict, paddingIndex: 0, maximumNorm: 0, normType: .l2, scalesGradientByFrequency: false)!
    }()

    private lazy var positionEmbedding: BNNS.EmbeddingLayer = {
        let input = BNNSNDArrayDescriptor.allocateUninitialized(scalarType: Int64.self, shape: .vector(maxPositionEmbeddings))
        allocations.append(input)
        let dictData: [Float32] = weights["bert.embeddings.position_embeddings.weight"]!.toArray()
        let dict = BNNSNDArrayDescriptor.allocate(initializingFrom: dictData, shape: .matrixColumnMajor(hiddenSize, maxPositionEmbeddings))
        allocations.append(dict)
        let output = BNNSNDArrayDescriptor.allocateUninitialized(scalarType: Float32.self, shape: .matrixColumnMajor(hiddenSize, maxPositionEmbeddings))
        allocations.append(output)

        return BNNS.EmbeddingLayer(input: input, output: output, dictionary: dict, paddingIndex: -1, maximumNorm: 0, normType: .l2, scalesGradientByFrequency: true)!
    }()

    private lazy var tokenTypeEmbedding: BNNS.EmbeddingLayer = {
        let input = BNNSNDArrayDescriptor.allocateUninitialized(scalarType: Int64.self, shape: .vector(maxPositionEmbeddings))
        allocations.append(input)
        let dictData: [Float32] = weights["bert.embeddings.token_type_embeddings.weight"]!.toArray()
        let dict = BNNSNDArrayDescriptor.allocate(initializingFrom: dictData, shape: .matrixColumnMajor(hiddenSize, typeVocabSize))
        allocations.append(dict)
        let output = BNNSNDArrayDescriptor.allocateUninitialized(scalarType: Float32.self, shape: .matrixColumnMajor(hiddenSize, maxPositionEmbeddings))
        allocations.append(output)

        return BNNS.EmbeddingLayer(input: input, output: output, dictionary: dict, paddingIndex: -1, maximumNorm: 0, normType: .l2, scalesGradientByFrequency: true)!
    }()

    // Accepts both checkpoint naming schemes: LayerNorm.beta/gamma (TF-style)
    // and LayerNorm.bias/weight (PyTorch-style).
    private lazy var normalization: BNNS.NormalizationLayer = {
        let input = BNNSNDArrayDescriptor.allocateUninitialized(scalarType: Float32.self, shape: .matrixRowMajor(maxPositionEmbeddings, hiddenSize))
        allocations.append(input)
        let output = BNNSNDArrayDescriptor.allocateUninitialized(scalarType: Float32.self, shape: .matrixRowMajor(maxPositionEmbeddings, hiddenSize))
        allocations.append(output)

        let betaWA: MLMultiArray! = weights["bert.embeddings.LayerNorm.beta"] ?? weights["bert.embeddings.LayerNorm.bias"]
        let beta = BNNSNDArrayDescriptor.allocate(initializingFrom: betaWA.toArray() as [Float32], shape: .matrixColumnMajor(hiddenSize, maxPositionEmbeddings))
        allocations.append(beta)

        let gammaWA: MLMultiArray! = weights["bert.embeddings.LayerNorm.gamma"] ?? weights["bert.embeddings.LayerNorm.weight"]
        let gamma = BNNSNDArrayDescriptor.allocate(initializingFrom: gammaWA.toArray() as [Float32], shape: .matrixColumnMajor(hiddenSize, maxPositionEmbeddings))
        allocations.append(gamma)

        return BNNS.NormalizationLayer(type: .batch(movingMean: nil, movingVariance: nil), input: input, output: output, beta: beta, gamma: gamma, epsilon: normalizationEpsilon, activation: hiddenActivation)!
    }()

    private lazy var dropout: BNNS.DropoutLayer = {
        let input = BNNSNDArrayDescriptor.allocateUninitialized(scalarType: Float32.self, shape: .matrixColumnMajor(hiddenSize, maxPositionEmbeddings))
        allocations.append(input)
        let output = BNNSNDArrayDescriptor.allocateUninitialized(scalarType: Float32.self, shape: .matrixColumnMajor(hiddenSize, maxPositionEmbeddings))
        allocations.append(output)

        return BNNS.DropoutLayer(input: input, output: output, rate: dropoutRate, seed: 0, control: 0)!
    }()

    deinit {
        allocations.forEach({ $0.deallocate() })
    }

    init(config: Config, weights: Weights = [:]) {
        assert(config.model_type!.stringValue == "bert")
        for key in [
            "bert.embeddings.word_embeddings.weight",
            "bert.embeddings.position_embeddings.weight",
            "bert.embeddings.token_type_embeddings.weight",
        ] { assert(weights.keys.contains(where: { $0 == key })) }
        assert(weights.keys.contains(where: { $0 == "bert.embeddings.LayerNorm.beta" || $0 == "bert.embeddings.LayerNorm.bias" }))
        assert(weights.keys.contains(where: { $0 == "bert.embeddings.LayerNorm.gamma" || $0 == "bert.embeddings.LayerNorm.weight" }))
        assert(config.hidden_act!.stringValue == "gelu")
        assert("absolute" == config.position_embedding_type!.stringValue!)
        self.positionEmbeddingType = config.position_embedding_type!.stringValue!
        self.hiddenSize = config.hidden_size!.intValue!
        self.vocabSize = config.vocab_size!.intValue!
        self.maxPositionEmbeddings = config.max_position_embeddings!.intValue!
        self.typeVocabSize = config.type_vocab_size!.intValue!
        self.padTokenID = config.pad_token_id!.intValue!
        self.normalizationEpsilon = Float(config.layer_norm_eps!.doubleValue!)
        self.weights = weights
    }

    /// Embeds `inputIDs` (zero-padded internally to `maxPositionEmbeddings`)
    /// and returns the element-wise sum of the three embeddings, with shape
    /// [maxPositionEmbeddings, hiddenSize].
    public func callAsFunction(inputIDs: [Int64],
                               tokenTypeIDs: [Int64]? = nil,
                               positionIDs: [Int64]? = nil) -> MLMultiArray {
        let inputLength = inputIDs.count
        let inputIDs: [Int64] = inputIDs.padded(length: maxPositionEmbeddings)
        let wordInput = BNNSNDArrayDescriptor.allocate(initializingFrom: inputIDs, shape: .vector(inputIDs.count))
        let wordOutput = BNNSNDArrayDescriptor.allocateUninitialized(scalarType: Float32.self, shape: .matrixColumnMajor(hiddenSize, inputIDs.count))
        defer {
            wordInput.deallocate()
            wordOutput.deallocate()
        }
        try! wordEmbedding.apply(batchSize: 1, input: wordInput, output: wordOutput)

        // Position IDs default to 0..<inputLength, then are zero-padded.
        let positionIDs = positionIDs ?? Array<Int64>(stride(from: 0, through: Int64(inputLength - 1), by: 1))
        let positionInput = BNNSNDArrayDescriptor.allocate(initializingFrom: positionIDs.padded(length: maxPositionEmbeddings), shape: .vector(maxPositionEmbeddings))
        let positionOutput = BNNSNDArrayDescriptor.allocateUninitialized(scalarType: Float32.self, shape: .matrixColumnMajor(hiddenSize, maxPositionEmbeddings))
        defer {
            positionInput.deallocate()
            positionOutput.deallocate()
        }
        try! self.positionEmbedding.apply(batchSize: 1, input: positionInput, output: positionOutput)

        // Token-type IDs default to all zeros (single-segment input).
        let tokenTypeIDs: [Int64] = tokenTypeIDs ?? Array(repeating: 0, count: maxPositionEmbeddings)
        let typeInput = BNNSNDArrayDescriptor.allocate(initializingFrom: tokenTypeIDs, shape: .vector(maxPositionEmbeddings))
        let typeOutput = BNNSNDArrayDescriptor.allocateUninitialized(scalarType: Float32.self, shape: .matrixColumnMajor(hiddenSize, maxPositionEmbeddings))
        defer {
            typeInput.deallocate()
            typeOutput.deallocate()
        }
        try! self.tokenTypeEmbedding.apply(batchSize: 1, input: typeInput, output: typeOutput)

        let multiWord = try! wordOutput.makeMultiArray(of: Float32.self, shape: shape)
        let multiPosition = try! positionOutput.makeMultiArray(of: Float32.self, shape: shape)
        let multiType = try! typeOutput.makeMultiArray(of: Float32.self, shape: shape)

        return multiWord + multiPosition + multiType
    }
}
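
Reviewer note: since BERTEmbedding is currently internal, a caller inside the module (or a test via @testable import) would drive it roughly as below. This is a sketch only: it assumes the Tokenizers encode(text:) API, and that the tokenizer and embedding were already built from a converted BERT checkpoint's config and weights.

    import CoreML
    import Tokenizers

    // Sketch: pairing the new embedding with a tokenizer.
    func embed(_ text: String, tokenizer: Tokenizer, embedding: BERTEmbedding) -> MLMultiArray {
        // encode(text:) returns Int token IDs (special tokens added by the tokenizer).
        let ids = tokenizer.encode(text: text).map(Int64.init)
        return embedding(inputIDs: ids)  // [maxPositionEmbeddings, hiddenSize]
    }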
1 change: 1 addition & 0 deletions Sources/Hub/Hub.swift
@@ -90,6 +90,7 @@ public struct Config {
     }
 
     public var intValue: Int? { value as? Int }
+    public var doubleValue: Double? { value as? Double }
     public var boolValue: Bool? { value as? Bool }
     public var stringValue: String? { value as? String }
 
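
Reviewer note: the new doubleValue accessor follows the existing intValue/boolValue/stringValue pattern, and is what lets the Embedding target read floating-point config entries. A minimal sketch of the usage from Embedding.swift:

    import Hub

    // layer_norm_eps is stored as a Double in config.json, so it needs the
    // new doubleValue accessor rather than intValue.
    func normalizationEpsilon(from config: Config) -> Float {
        Float(config.layer_norm_eps!.doubleValue!)  // e.g. 1e-12 for BERT
    }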
8 changes: 8 additions & 0 deletions Sources/TensorUtils/Array+Utils.swift
@@ -0,0 +1,8 @@
import Foundation

public extension Array where Element: Numeric {
    /// Right-pads with zeros up to `maxLength`; arrays that are already at
    /// least `maxLength` long are returned unchanged (never truncated).
    func padded(length maxLength: Int) -> Array<Element> {
        self + Array(repeating: 0, count: Swift.max(maxLength - count, 0))
    }
}
14 changes: 14 additions & 0 deletions Sources/TensorUtils/BNNS+Utils.swift
@@ -0,0 +1,14 @@
import Accelerate
import CoreML.MLMultiArray

public extension BNNSNDArrayDescriptor {
    /// Wraps the descriptor's buffer in an MLMultiArray of the given shape,
    /// computing the row-major strides that match that shape.
    func makeMultiArray<T: Numeric>(of numericType: T.Type, shape: [NSNumber]) throws -> MLMultiArray {
        assert(numericType == Float32.self)
        let strides = shape.dropFirst().reversed().reduce(into: [1]) { acc, a in
            acc.insert(acc[0].intValue * a.intValue as NSNumber, at: 0)
        }

        return try MLMultiArray(dataPointer: self.data!, shape: shape, dataType: .float32, strides: strides)
    }
}
52 changes: 52 additions & 0 deletions Sources/TensorUtils/MLMultiArray+Utils.swift
@@ -8,6 +8,7 @@
 
 import Foundation
 import CoreML
+import Accelerate
 
 public extension MLMultiArray {
     /// All values will be stored in the last dimension of the MLMultiArray (default is dims=1)
@@ -198,3 +199,54 @@ extension MLMultiArray {
         return s + "]"
     }
 }

public extension MLMultiArray {
    /// Copies the underlying buffer into a Swift array of the given element type.
    func toArray<T: Numeric>() -> Array<T> {
        let stride = MemoryLayout<T>.stride
        let allocated = UnsafeMutableRawBufferPointer.allocate(byteCount: self.count * stride, alignment: MemoryLayout<T>.alignment)
        defer { allocated.deallocate() }  // the Array below owns its own copy
        return self.withUnsafeBytes { ptr in
            memcpy(allocated.baseAddress!, ptr.baseAddress!, self.count * stride)
            let start = allocated.bindMemory(to: T.self).baseAddress!
            return Array<T>(UnsafeBufferPointer(start: start, count: self.count))
        }
    }
}

public extension MLMultiArray {
    /// Element-wise addition of two float32 arrays, broadcasting a [1, n]
    /// operand over the rows of an [m, n] operand.
    static func +(lhs: MLMultiArray, rhs: MLMultiArray) -> MLMultiArray {
        assert(lhs.dataType == rhs.dataType && lhs.dataType == .float32)
        assert(lhs.shape.count == rhs.shape.count && lhs.shape[1].intValue == rhs.shape[1].intValue)

        let outShape: [NSNumber]
        let outLength: Int
        var ptr0: UnsafeMutablePointer<Float32>
        var ptr1: UnsafeMutablePointer<Float32>
        if lhs.shape[0].intValue >= rhs.shape[0].intValue {
            assert(rhs.shape[0].intValue == 1 || lhs.shape == rhs.shape) // A[m, n], B[1, n] || B[m, n]
            outShape = lhs.shape
            outLength = lhs.count
            ptr0 = UnsafeMutablePointer<Float32>(OpaquePointer(lhs.withUnsafeMutableBytes({ ptr, _ in ptr.baseAddress! })))
            ptr1 = UnsafeMutablePointer<Float32>(OpaquePointer(rhs.withUnsafeMutableBytes({ ptr, _ in ptr.baseAddress! })))
        } else {
            assert(lhs.shape[0].intValue == 1) // Swap when A[1, n], B[m, n]
            outShape = rhs.shape
            outLength = rhs.count
            ptr0 = UnsafeMutablePointer<Float32>(OpaquePointer(rhs.withUnsafeMutableBytes({ ptr, _ in ptr.baseAddress! })))
            ptr1 = UnsafeMutablePointer<Float32>(OpaquePointer(lhs.withUnsafeMutableBytes({ ptr, _ in ptr.baseAddress! })))
        }

        let output = try! MLMultiArray(shape: outShape, dataType: .float32)
        var ptrOutput = UnsafeMutablePointer<Float32>(OpaquePointer(output.withUnsafeMutableBytes({ ptr, _ in ptr.baseAddress! })))
        let rowLength = outShape[1].intValue
        if lhs.shape[0].intValue == rhs.shape[0].intValue {
            // Same shape: one pass over the whole buffer.
            vDSP_vadd(ptr0, 1, ptr1, 1, ptrOutput, 1, vDSP_Length(outLength))
        } else {
            // Broadcast: add the single row of ptr1 to each row of ptr0, one
            // row at a time, so we never read past the end of ptr1's buffer.
            vDSP_vadd(ptr0, 1, ptr1, 1, ptrOutput, 1, vDSP_Length(rowLength))
            for _ in 1..<outShape[0].intValue {
                ptr0 = ptr0.advanced(by: rowLength)
                ptrOutput = ptrOutput.advanced(by: rowLength)
                vDSP_vadd(ptr0, 1, ptr1, 1, ptrOutput, 1, vDSP_Length(rowLength))
            }
        }

        return output
    }
}
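
Reviewer note: a quick sketch of what the broadcasting + computes (toy values), which also shows toArray:

    import CoreML
    import TensorUtils

    // a: [[0, 1, 2], [3, 4, 5]]   b: [[0, 10, 20]]
    let a = try! MLMultiArray(shape: [2, 3], dataType: .float32)
    let b = try! MLMultiArray(shape: [1, 3], dataType: .float32)
    for i in 0..<6 { a[i] = NSNumber(value: Float(i)) }
    for i in 0..<3 { b[i] = NSNumber(value: Float(10 * i)) }

    // b's single row is added to every row of a.
    let sum = a + b                        // [[0, 11, 22], [3, 14, 25]]
    let values: [Float32] = sum.toArray()  // flat row-major copy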
7 changes: 7 additions & 0 deletions Tests/EmbeddingTests/EmbeddingTests.swift
@@ -0,0 +1,7 @@
import XCTest
@testable import Tokenizers
@testable import Hub
@testable import TensorUtils
@testable import Embedding

class EmbeddingTests: XCTestCase { }
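
Reviewer note: the test class is still empty. A first case could build a tiny config plus random weights and check the output shape. A sketch only: all values are hypothetical, and it assumes Config can be constructed from a dictionary literal.

    import CoreML

    extension EmbeddingTests {
        func testOutputShapeMatchesConfig() throws {
            // Tiny hypothetical BERT config; real checkpoints use e.g. 768/30522/512/2.
            let config = Config([
                "model_type": "bert",
                "hidden_act": "gelu",
                "position_embedding_type": "absolute",
                "hidden_size": 4,
                "vocab_size": 10,
                "max_position_embeddings": 8,
                "type_vocab_size": 2,
                "pad_token_id": 0,
                "layer_norm_eps": 1e-12,
            ])

            func random(_ rows: Int, _ cols: Int) throws -> MLMultiArray {
                let array = try MLMultiArray(shape: [NSNumber(value: rows), NSNumber(value: cols)], dataType: .float32)
                for i in 0..<array.count { array[i] = NSNumber(value: Float.random(in: -1...1)) }
                return array
            }

            let embedding = BERTEmbedding(config: config, weights: [
                "bert.embeddings.word_embeddings.weight": try random(10, 4),
                "bert.embeddings.position_embeddings.weight": try random(8, 4),
                "bert.embeddings.token_type_embeddings.weight": try random(2, 4),
                "bert.embeddings.LayerNorm.bias": try random(1, 4),
                "bert.embeddings.LayerNorm.weight": try random(1, 4),
            ])

            let output = embedding(inputIDs: [1, 2, 3])
            XCTAssertEqual(output.shape, [8, 4])  // [maxPositionEmbeddings, hiddenSize]
        }
    }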
23 changes: 23 additions & 0 deletions Tests/TensorUtilsTests/ArrayUtilsTests.swift
@@ -0,0 +1,23 @@
import XCTest
@testable import TensorUtils

class ArrayUtilsTests: XCTestCase {

    func testPaddedArrayWhenNeedPadding() {
        let array = [1, 2, 3, 4]
        let paddedArray = array.padded(length: 7)
        XCTAssertEqual(paddedArray, [1, 2, 3, 4, 0, 0, 0])
    }

    func testNoPaddingForTheSamePaddingLength() {
        let array = [1, 2, 3, 4]
        let paddedArray = array.padded(length: 4)
        XCTAssertEqual(paddedArray, [1, 2, 3, 4])
    }

    func testNoPaddingForShorterPaddingLength() {
        let array = [1, 2, 3, 4]
        let paddedArray = array.padded(length: 2)
        XCTAssertEqual(paddedArray, [1, 2, 3, 4])
    }
}
26 changes: 26 additions & 0 deletions Tests/TensorUtilsTests/BNNSUtilTests.swift
@@ -0,0 +1,26 @@
import XCTest
import Accelerate
@testable import TensorUtils

class BNNSUtilsTests: XCTestCase {

    func testMakeMultiArrayFromDescriptor() throws {
        let rowCount = 4
        let dimSize = 6
        let dictData: [Float32] = [
            1, 2, 3, 4, 5, 6,
            7, 8, 9, 10, 11, 12,
            13, 14, 15, 16, 17, 18,
            19, 20, 21, 22, 23, 24,
        ]
        let dict = BNNSNDArrayDescriptor.allocate(initializingFrom: dictData, shape: .matrixColumnMajor(dimSize, rowCount))
        let shape: [NSNumber] = [
            NSNumber(value: rowCount),
            NSNumber(value: dimSize),
        ]
        let multiArray = try dict.makeMultiArray(of: Float32.self, shape: shape)
        XCTAssertEqual(multiArray.toArray(), dictData)
        XCTAssertEqual(multiArray.floats!, dictData)
    }

}