From c8449dba12b157364d40ed429fe7210f8226a9d7 Mon Sep 17 00:00:00 2001
From: Pierre-Antoine Bannier
Date: Tue, 16 Apr 2024 23:35:54 +0200
Subject: [PATCH] Exposed n_gpu_layers for Metal backend + build works

---
 README.md              | 11 ++++++++++-
 bark.cpp               |  7 +++++++
 examples/common.cpp    | 17 ++++++++++-------
 examples/common.h      |  3 +++
 examples/main/main.cpp |  1 +
 5 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 2ff2448..6f7ccaf 100644
--- a/README.md
+++ b/README.md
@@ -116,7 +116,7 @@ mv ./vocab.txt ./models/
 python3 convert.py --dir-model ./models --out-dir ./ggml_weights/ --vocab-path ./models
 
 # run the inference
-./build/examples/main/main -m ./ggml_weights/ -p "this is an audio"
+./build/examples/main/main -m ./ggml_weights/ -t 4 -p "this is an audio"
 ```
 
 ### (Optional) Quantize weights
@@ -129,6 +129,15 @@ Note that to preserve audio quality, we do not quantize the codec model. The bul
 ./build/examples/quantize/quantize ./ggml_weights.bin ./ggml_weights_q4.bin q4_0
 ```
 
+### Using Metal
+
+To build Bark with support for the Metal backend, run
+
+```bash
+cmake -DGGML_METAL=ON ..
+./build/examples/main/main -m ./ggml_weights/ -ngl 100 -t 8 -p "this is an audio"
+```
+
 ### Seminal papers
 
 - Bark
diff --git a/bark.cpp b/bark.cpp
index 04ea0a7..5efc05e 100644
--- a/bark.cpp
+++ b/bark.cpp
@@ -73,6 +73,13 @@ static void write_safe(std::ofstream& fout, T& dest) {
     fout.write((char*)&dest, sizeof(T));
 }
 
+static void ggml_log_callback_default(ggml_log_level level, const char* text, void* user_data) {
+    (void)level;
+    (void)user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
+
 static void bark_print_statistics(gpt_model* model) {
     printf("\n\n");
     printf("%s: sample time = %8.2f ms / %lld tokens\n", __func__, model->t_sample_us / 1000.0f, model->n_sample);
diff --git a/examples/common.cpp b/examples/common.cpp
index 6fc9b6e..4619432 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -28,17 +28,18 @@ void bark_print_usage(char** argv, const bark_params& params) {
     std::cout << "usage: " << argv[0] << " [options]\n"
               << "\n"
               << "options:\n"
-              << "  -h, --help            show this help message and exit\n"
-              << "  -t N, --threads N     number of threads to use during computation (default: " << params.n_threads << ")\n"
-              << "  -s N, --seed N        seed for random number generator (default: " << params.seed << ")\n"
+              << "  -h, --help                show this help message and exit\n"
+              << "  -t N, --threads N         number of threads to use during computation (default: " << params.n_threads << ")\n"
+              << "  -s N, --seed N            seed for random number generator (default: " << params.seed << ")\n"
+              << "  -ngl N, --n_gpu_layers N  number of GPU layers (default: " << params.n_gpu_layers << ")\n"
               << "  -p PROMPT, --prompt PROMPT\n"
-              << "                        prompt to start generation with (default: random)\n"
+              << "                            prompt to start generation with (default: random)\n"
               << "  -m FNAME, --model FNAME\n"
-              << "                        model path (default: " << params.model_path << ")\n"
+              << "                            model path (default: " << params.model_path << ")\n"
               << "  -em FNAME, --encodec_model_path FNAME\n"
-              << "                        Encodec model path (default: " << params.encodec_model_path << ")\n"
+              << "                            Encodec model path (default: " << params.encodec_model_path << ")\n"
               << "  -o FNAME, --outwav FNAME\n"
-              << "                        output generated wav (default: " << params.dest_wav_path << ")\n"
+              << "                            output generated wav (default: " << params.dest_wav_path << ")\n"
               << "\n";
 }
 
@@ -54,6 +55,8 @@ int bark_params_parse(int argc, char** argv, bark_params& params) {
             params.model_path = argv[++i];
         } else if (arg == "-em" || arg == "--encodec_model_path") {
             params.encodec_model_path = argv[++i];
+        } else if (arg == "-ngl" || arg == "--n_gpu_layers") {
+            params.n_gpu_layers = std::stoi(argv[++i]);
         } else if (arg == "-s" || arg == "--seed") {
             params.seed = std::stoi(argv[++i]);
         } else if (arg == "-o" || arg == "--outwav") {
diff --git a/examples/common.h b/examples/common.h
index 7e415a7..b4e512f 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -6,6 +6,9 @@ struct bark_params {
     // Number of threads used for audio generation.
     int32_t n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency());
 
+    // Number of GPU layers. Used for cuBLAS and Metal backends.
+    int32_t n_gpu_layers = 0;
+
     // User prompt.
     std::string prompt = "This is an audio generated by bark.cpp";
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index ced6e0e..25399df 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -37,6 +37,7 @@ int main(int argc, char **argv) {
         exit(1);
     }
 
+    bctx->n_gpu_layers = params.n_gpu_layers;
     bctx->encodec_model_path = params.encodec_model_path;
 
     // generate audio
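The patch only routes `params.n_gpu_layers` into the `bark_context`; the offloading logic itself sits outside this diff. For orientation, here is a minimal sketch of how a loader typically turns that value into a backend choice under the usual ggml-backend pattern. The helper name `init_backend` and the exact wiring inside bark.cpp are assumptions, not part of this patch; `ggml_backend_metal_init` and `ggml_backend_cpu_init` are standard ggml-backend entry points.

```cpp
// Sketch only -- not part of the patch. Assumes the vendored ggml ships
// ggml-backend with Metal support; init_backend is a hypothetical helper.
#include "ggml-backend.h"
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

static ggml_backend_t init_backend(int n_gpu_layers) {
    ggml_backend_t backend = nullptr;
#ifdef GGML_USE_METAL
    if (n_gpu_layers > 0) {
        // Any positive value requests GPU offload via Metal.
        backend = ggml_backend_metal_init();
    }
#endif
    if (backend == nullptr) {
        // CPU fallback when Metal is unavailable or -ngl is 0.
        backend = ggml_backend_cpu_init();
    }
    return backend;
}
```

With this pattern, `-ngl 0` keeps everything on the CPU, while any positive value (such as the `-ngl 100` used in the README example) enables the Metal path.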