From c8449dba12b157364d40ed429fe7210f8226a9d7 Mon Sep 17 00:00:00 2001
From: Pierre-Antoine Bannier
Date: Tue, 16 Apr 2024 23:35:54 +0200
Subject: [PATCH] Exposed n_gpu_layers for Metal backend + build works

---
 README.md              | 11 ++++++++++-
 bark.cpp               |  7 +++++++
 examples/common.cpp    | 17 ++++++++++-------
 examples/common.h      |  3 +++
 examples/main/main.cpp |  1 +
 5 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 2ff2448..6f7ccaf 100644
--- a/README.md
+++ b/README.md
@@ -116,7 +116,7 @@ mv ./vocab.txt ./models/
 python3 convert.py --dir-model ./models --out-dir ./ggml_weights/ --vocab-path ./models
 
 # run the inference
-./build/examples/main/main -m ./ggml_weights/ -p "this is an audio"
+./build/examples/main/main -m ./ggml_weights/ -t 4 -p "this is an audio"
 ```
 
 ### (Optional) Quantize weights
@@ -129,6 +129,15 @@ Note that to preserve audio quality, we do not quantize the codec model. The bul
 ./build/examples/quantize/quantize ./ggml_weights.bin ./ggml_weights_q4.bin q4_0
 ```
 
+### Using Metal
+
+To build Bark with support for the Metal backend, run
+
+```bash
+cmake -DGGML_METAL=ON ..
+./build/examples/main/main -m ./ggml_weights/ -ngl 100 -t 8 -p "this is an audio"
+```
+
 ### Seminal papers
 
 - Bark
diff --git a/bark.cpp b/bark.cpp
index 04ea0a7..5efc05e 100644
--- a/bark.cpp
+++ b/bark.cpp
@@ -73,6 +73,13 @@ static void write_safe(std::ofstream& fout, T& dest) {
     fout.write((char*)&dest, sizeof(T));
 }
 
+static void ggml_log_callback_default(ggml_log_level level, const char* text, void* user_data) {
+    (void)level;
+    (void)user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
+
 static void bark_print_statistics(gpt_model* model) {
     printf("\n\n");
     printf("%s: sample time = %8.2f ms / %lld tokens\n", __func__, model->t_sample_us / 1000.0f, model->n_sample);
diff --git a/examples/common.cpp b/examples/common.cpp
index 6fc9b6e..4619432 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -28,17 +28,18 @@ void bark_print_usage(char** argv, const bark_params& params) {
     std::cout << "usage: " << argv[0] << " [options]\n"
               << "\n"
               << "options:\n"
-              << "  -h, --help            show this help message and exit\n"
-              << "  -t N, --threads N     number of threads to use during computation (default: " << params.n_threads << ")\n"
-              << "  -s N, --seed N        seed for random number generator (default: " << params.seed << ")\n"
+              << "  -h, --help                show this help message and exit\n"
+              << "  -t N, --threads N         number of threads to use during computation (default: " << params.n_threads << ")\n"
+              << "  -s N, --seed N            seed for random number generator (default: " << params.seed << ")\n"
+              << "  -ngl N, --n_gpu_layers N  number of GPU layers (default: " << params.n_gpu_layers << ")\n"
               << "  -p PROMPT, --prompt PROMPT\n"
-              << "                        prompt to start generation with (default: random)\n"
+              << "                            prompt to start generation with (default: random)\n"
               << "  -m FNAME, --model FNAME\n"
-              << "                        model path (default: " << params.model_path << ")\n"
+              << "                            model path (default: " << params.model_path << ")\n"
               << "  -em FNAME, --encodec_model_path FNAME\n"
-              << "                        Encodec model path (default: " << params.encodec_model_path << ")\n"
+              << "                            Encodec model path (default: " << params.encodec_model_path << ")\n"
               << "  -o FNAME, --outwav FNAME\n"
-              << "                        output generated wav (default: " << params.dest_wav_path << ")\n"
+              << "                            output generated wav (default: " << params.dest_wav_path << ")\n"
               << "\n";
 }
 
@@ -54,6 +55,8 @@ int bark_params_parse(int argc, char** argv, bark_params& params) {
             params.model_path = argv[++i];
         } else if (arg == "-em" || arg == "--encodec_model_path") {
             params.encodec_model_path = argv[++i];
+        } else if (arg == "-ngl" || arg == "--n_gpu_layers") {
+            params.n_gpu_layers = std::stoi(argv[++i]);
         } else if (arg == "-s" || arg == "--seed") {
             params.seed = std::stoi(argv[++i]);
         } else if (arg == "-o" || arg == "--outwav") {
diff --git a/examples/common.h b/examples/common.h
index 7e415a7..b4e512f 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -6,6 +6,9 @@ struct bark_params {
     // Number of threads used for audio generation.
     int32_t n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency());
 
+    // Number of GPU layers. Used for cuBLAS and Metal backends.
+    int32_t n_gpu_layers = 0;
+
     // User prompt.
     std::string prompt = "This is an audio generated by bark.cpp";
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index ced6e0e..25399df 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -37,6 +37,7 @@ int main(int argc, char **argv) {
         exit(1);
     }
 
+    bctx->n_gpu_layers = params.n_gpu_layers;
     bctx->encodec_model_path = params.encodec_model_path;
 
     // generate audio
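The patch only routes `params.n_gpu_layers` into the `bark_context`; the offloading logic itself sits outside this diff. For orientation, here is a minimal sketch of how a loader typically turns that value into a backend choice under the usual ggml-backend pattern. The helper name `init_backend` and the exact wiring inside bark.cpp are assumptions, not part of this patch; `ggml_backend_metal_init` and `ggml_backend_cpu_init` are standard ggml-backend entry points.

```cpp
// Sketch only -- not part of the patch. Assumes the vendored ggml ships
// ggml-backend with Metal support; init_backend is a hypothetical helper.
#include "ggml-backend.h"
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

static ggml_backend_t init_backend(int n_gpu_layers) {
    ggml_backend_t backend = nullptr;
#ifdef GGML_USE_METAL
    if (n_gpu_layers > 0) {
        // Any positive value requests GPU offload via Metal.
        backend = ggml_backend_metal_init();
    }
#endif
    if (backend == nullptr) {
        // CPU fallback when Metal is unavailable or -ngl is 0.
        backend = ggml_backend_cpu_init();
    }
    return backend;
}
```

With this pattern, `-ngl 0` keeps everything on the CPU, while any positive value (such as the `-ngl 100` used in the README example) enables the Metal path.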