llama : refactor src/llama.cpp #10902

Draft · wants to merge 19 commits into base: master
28 changes: 13 additions & 15 deletions .github/workflows/build.yml
@@ -60,8 +60,7 @@ jobs:
-DLLAMA_CURL=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DGGML_RPC=ON \
-DBUILD_SHARED_LIBS=OFF
-DGGML_RPC=ON
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

- name: Test
@@ -123,8 +122,7 @@ jobs:
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON \
-DBUILD_SHARED_LIBS=OFF
-DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

- name: Test
@@ -181,7 +179,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
cmake --build . --config Release -j $(nproc)

- name: Test
@@ -651,23 +649,23 @@ jobs:
matrix:
include:
- build: 'noavx-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
- build: 'avx2-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON'
- build: 'avx-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF'
- build: 'avx512-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON'
- build: 'openblas-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
- build: 'vulkan-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON'
- build: 'llvm-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
- build: 'msvc-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=O'
- build: 'llvm-arm64-opencl-adreno'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'

@@ -914,7 +912,7 @@ jobs:
shell: cmd
run: |
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DGGML_RPC=ON
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
10 changes: 5 additions & 5 deletions common/common.cpp
@@ -922,14 +922,14 @@ struct common_init_result common_init_from_params(common_params & params) {
common_lora_adapter_container loaded_la;
loaded_la.path = la.path;
loaded_la.scale = la.scale;
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
loaded_la.adapter.reset(llama_lora_adapter_init(model, la.path.c_str()));
if (loaded_la.adapter == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx);
llama_free_model(model);
return iparams;
}
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
iparams.lora_adapters.emplace_back(std::move(loaded_la)); // copy to list of loaded adapters
}
if (!params.lora_init_without_apply) {
common_lora_adapters_apply(lctx, iparams.lora_adapters);
@@ -993,8 +993,8 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_perf_context_reset(lctx);
}

iparams.model = model;
iparams.context = lctx;
iparams.model.reset(model);
iparams.context.reset(lctx);

return iparams;
}
@@ -1003,7 +1003,7 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_l
llama_lora_adapter_clear(ctx);
for (auto & la : lora_adapters) {
if (la.scale != 0.0f) {
llama_lora_adapter_set(ctx, la.adapter, la.scale);
llama_lora_adapter_set(ctx, la.adapter.get(), la.scale);
}
}
}
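
Note on the `emplace_back(std::move(loaded_la))` change above: once `common_lora_adapter_container::adapter` holds a smart pointer (see the `common.h` hunk below), the container becomes move-only and can no longer be copied into `iparams.lora_adapters`. A minimal sketch of that ownership behaviour, using hypothetical stand-in types rather than the real llama.cpp declarations:

```cpp
#include <memory>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-ins for illustration; the real types live in common.h / llama-cpp.h.
struct llama_lora_adapter;
struct llama_lora_adapter_deleter {
    void operator()(llama_lora_adapter *) const { /* would call llama_lora_adapter_free() */ }
};
using llama_lora_adapter_ptr = std::unique_ptr<llama_lora_adapter, llama_lora_adapter_deleter>;

struct common_lora_adapter_container {
    std::string path;
    float scale = 0.0f;
    llama_lora_adapter_ptr adapter; // owning member: the struct is now move-only
};

int main() {
    std::vector<common_lora_adapter_container> adapters;

    common_lora_adapter_container la;
    la.path  = "adapter.gguf";
    la.scale = 1.0f;
    // la.adapter.reset(llama_lora_adapter_init(model, la.path.c_str())); // real code

    // adapters.push_back(la);            // would not compile: copies a unique_ptr
    adapters.emplace_back(std::move(la)); // ok: ownership moves into the vector
    return 0;
}
```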
19 changes: 12 additions & 7 deletions common/common.h
@@ -2,7 +2,7 @@

#pragma once

#include "llama.h"
#include "llama-cpp.h"

#include <string>
#include <vector>
@@ -30,7 +30,7 @@ struct common_lora_adapter_info {
};

struct common_lora_adapter_container : common_lora_adapter_info {
struct llama_lora_adapter * adapter;
llama_lora_adapter_ptr adapter;
};

using llama_tokens = std::vector<llama_token>;
@@ -479,8 +479,9 @@ std::string fs_get_cache_file(const std::string & filename);
//

struct common_init_result {
struct llama_model * model = nullptr;
struct llama_context * context = nullptr;
llama_model_ptr model;
llama_context_ptr context;

std::vector<common_lora_adapter_container> lora_adapters;
};

@@ -637,6 +638,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
// Split utils
//

static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
namespace {

const char * const LLM_KV_SPLIT_NO = "split.no";
const char * const LLM_KV_SPLIT_COUNT = "split.count";
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

}
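
The switch from `#include "llama.h"` to `#include "llama-cpp.h"` together with the new `llama_lora_adapter_ptr`, `llama_model_ptr` and `llama_context_ptr` members suggests a small C++ wrapper header whose smart-pointer aliases free the underlying C handles automatically. The header itself is not part of this diff, so the following is only a plausible sketch; the deleter names are assumptions, and only the `*_ptr` aliases are confirmed by the hunks above:

```cpp
// Sketch of what a wrapper header such as llama-cpp.h could contain.
#pragma once

#include <memory>

#include "llama.h"

struct llama_model_deleter {
    void operator()(llama_model * model) { llama_free_model(model); }
};

struct llama_context_deleter {
    void operator()(llama_context * context) { llama_free(context); }
};

struct llama_lora_adapter_deleter {
    void operator()(llama_lora_adapter * adapter) { llama_lora_adapter_free(adapter); }
};

typedef std::unique_ptr<llama_model, llama_model_deleter>               llama_model_ptr;
typedef std::unique_ptr<llama_context, llama_context_deleter>           llama_context_ptr;
typedef std::unique_ptr<llama_lora_adapter, llama_lora_adapter_deleter> llama_lora_adapter_ptr;
```

Under that assumption, the explicit `llama_free(ctx)` / `llama_free_model(model)` calls deleted from the examples below are no longer necessary: the model and context are released when the owning `common_init_result` goes out of scope.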
10 changes: 5 additions & 5 deletions examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -434,12 +434,12 @@ static void print_matrix(struct ggml_tensor * probs) {
}
}

struct llama_file {
struct my_llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;

llama_file(const char * fname, const char * mode) {
my_llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
size = 0;
@@ -500,15 +500,15 @@ struct llama_file {
return std::string(chars.data(), len);
}

~llama_file() {
~my_llama_file() {
if (fp) {
std::fclose(fp);
}
}
};

static bool is_ggml_file(const char * filename) {
llama_file file(filename, "rb");
my_llama_file file(filename, "rb");
if (file.size < 4) {
return false;
}
@@ -576,7 +576,7 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
} else {
// assume llama2.c vocabulary
LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
llama_file file(filename, "rb");
my_llama_file file(filename, "rb");
if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename);
}
7 changes: 3 additions & 4 deletions examples/cvector-generator/cvector-generator.cpp
@@ -415,12 +415,13 @@ int main(int argc, char ** argv) {
// load the model to get hparams
common_init_result llama_init = common_init_from_params(params);

llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get();

// int n_ctx = llama_n_ctx(ctx);
int n_layers = llama_n_layer(model);
int n_embd = llama_n_embd(model);

// get model hint param (a.k.a model arch name)
char model_hint[128];
llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
@@ -474,8 +475,6 @@ int main(int argc, char ** argv) {

// done with the model, we can now free it to make gain some memory
printf("Done evaluate prompts, unload model...\n");
llama_free(ctx);
llama_free_model(model);

bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;

7 changes: 3 additions & 4 deletions examples/embedding/embedding.cpp
@@ -97,8 +97,9 @@ int main(int argc, char ** argv) {
// load the model
common_init_result llama_init = common_init_from_params(params);

llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get();

if (model == NULL) {
LOG_ERR("%s: unable to load model\n", __func__);
return 1;
@@ -316,8 +317,6 @@ int main(int argc, char ** argv) {

// clean up
llama_batch_free(batch);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();

return 0;
8 changes: 3 additions & 5 deletions examples/eval-callback/eval-callback.cpp
@@ -162,8 +162,9 @@ int main(int argc, char ** argv) {
// init
common_init_result llama_init = common_init_from_params(params);

llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get();

if (model == nullptr || ctx == nullptr) {
LOG_ERR("%s : failed to init\n", __func__);
return 1;
@@ -184,9 +185,6 @@
LOG("\n");
llama_perf_context_print(ctx);

llama_free(ctx);
llama_free_model(model);

llama_backend_free();

return 0;
7 changes: 3 additions & 4 deletions examples/gguf-split/gguf-split.cpp
@@ -2,15 +2,14 @@
#include "common.h"

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <string>
#include <vector>

#include <stdio.h>
#include <string.h>
#include <climits>

#include <cstdio>
#include <cstring>
#include <stdexcept>

#if defined(_WIN32)
11 changes: 5 additions & 6 deletions examples/imatrix/imatrix.cpp
@@ -430,9 +430,10 @@ static void process_logits(

static bool compute_imatrix(llama_context * ctx, const common_params & params) {
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
const int n_ctx = llama_n_ctx(ctx);

GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));

auto tim1 = std::chrono::high_resolution_clock::now();
LOG_INF("%s: tokenizing the input ..\n", __func__);

@@ -618,8 +619,9 @@ int main(int argc, char ** argv) {
// init
common_init_result llama_init = common_init_from_params(params);

llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get();

if (model == nullptr || ctx == nullptr) {
LOG_ERR("%s : failed to init\n", __func__);
return 1;
@@ -655,9 +657,6 @@ int main(int argc, char ** argv) {
LOG("\n");
llama_perf_context_print(ctx);

llama_free(ctx);
llama_free_model(model);

llama_backend_free();

return 0;
7 changes: 2 additions & 5 deletions examples/infill/infill.cpp
@@ -131,8 +131,8 @@ int main(int argc, char ** argv) {
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
common_init_result llama_init = common_init_from_params(params);

model = llama_init.model;
ctx = llama_init.context;
model = llama_init.model.get();
ctx = llama_init.context.get();

if (model == NULL) {
LOG_ERR("%s: unable to load model\n", __func__);
@@ -581,9 +581,6 @@ int main(int argc, char ** argv) {
LOG("\n");
common_perf_print(ctx, smpl);

llama_free(ctx);
llama_free_model(model);

common_sampler_free(smpl);
llama_backend_free();

7 changes: 2 additions & 5 deletions examples/lookahead/lookahead.cpp
@@ -58,8 +58,8 @@ int main(int argc, char ** argv) {
// load the target model
common_init_result llama_init = common_init_from_params(params);

llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get();

// Tokenize the prompt
std::vector<llama_token> inp;
@@ -474,9 +474,6 @@ int main(int argc, char ** argv) {

llama_batch_free(batch);

llama_free(ctx);
llama_free_model(model);

llama_backend_free();

LOG("\n\n");
13 changes: 4 additions & 9 deletions examples/lookup/lookup-create.cpp
@@ -1,14 +1,9 @@
#include "arg.h"
#include "common.h"
#include "ngram-cache.h"
#include "ggml.h"
#include "llama.h"

#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main(int argc, char ** argv){
@@ -25,16 +20,16 @@ int main(int argc, char ** argv){
// load the model
common_init_result llama_init = common_init_from_params(params);

llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
llama_model_ptr & model = llama_init.model;
llama_context_ptr & ctx = llama_init.context;

GGML_ASSERT(model != nullptr);

// tokenize the prompt
std::vector<llama_token> inp;
inp = common_tokenize(ctx, params.prompt, true, true);
inp = common_tokenize(ctx.get(), params.prompt, true, true);
fprintf(stderr, "%s: tokenization done\n", __func__);


common_ngram_cache ngram_cache;
common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
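
The `lookup-create.cpp` hunk above also shows the two calling styles the refactored examples appear to use: either keep references to the owning pointers and call `.get()` only at the C API boundary, or grab non-owning raw pointers once, as in the other examples earlier in this diff. A small self-contained sketch, reusing the assumed aliases from the earlier `llama-cpp.h` sketch rather than the verified header:

```cpp
#include <memory>

// Opaque stand-ins for the C handles; the deleters here are no-ops for
// illustration (the real ones would call llama_free_model() and llama_free()).
struct llama_model;
struct llama_context;
struct llama_model_deleter   { void operator()(llama_model *)   const {} };
struct llama_context_deleter { void operator()(llama_context *) const {} };

using llama_model_ptr   = std::unique_ptr<llama_model, llama_model_deleter>;
using llama_context_ptr = std::unique_ptr<llama_context, llama_context_deleter>;

struct common_init_result {
    llama_model_ptr   model;
    llama_context_ptr context;
};

void run(common_init_result & llama_init) {
    // Style used in lookup-create.cpp: bind references to the owning pointers,
    // then pass raw handles to C-style helpers, e.g. common_tokenize(ctx.get(), ...).
    llama_model_ptr &   model = llama_init.model;
    llama_context_ptr & ctx   = llama_init.context;

    // Style used in most other examples: fetch non-owning raw pointers up front.
    llama_model *   model_raw = llama_init.model.get();
    llama_context * ctx_raw   = llama_init.context.get();

    (void) model; (void) ctx; (void) model_raw; (void) ctx_raw;

    // No llama_free()/llama_free_model() here: cleanup happens when the owning
    // common_init_result is destroyed.
}

int main() {
    common_init_result llama_init; // empty handles, just to exercise the sketch
    run(llama_init);
    return 0;
}
```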