-
Notifications
You must be signed in to change notification settings - Fork 10.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ggml-ci
- Loading branch information
Showing
7 changed files
with
382 additions
and
329 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,318 @@ | ||
#include "llama-adapter.h" | ||
|
||
#include "llama-model.h" | ||
|
||
#include <algorithm> | ||
#include <map> | ||
#include <cassert> | ||
|
||
// vec | ||
|
||
struct ggml_tensor * llama_control_vector::tensor_for(int il) const { | ||
if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) { | ||
return nullptr; | ||
} | ||
|
||
return tensors[il]; | ||
} | ||
|
||
struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const { | ||
ggml_tensor * layer_dir = tensor_for(il); | ||
if (layer_dir != nullptr) { | ||
cur = ggml_add(ctx, cur, layer_dir); | ||
} | ||
|
||
return cur; | ||
} | ||
|
||
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) { | ||
const auto & hparams = model.hparams; | ||
|
||
GGML_ASSERT(cvec.tensors.empty()); | ||
GGML_ASSERT(cvec.ctxs.empty()); | ||
GGML_ASSERT(cvec.bufs.empty()); | ||
|
||
// create a context for each buffer type | ||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map; | ||
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { | ||
auto it = ctx_map.find(buft); | ||
if (it == ctx_map.end()) { | ||
struct ggml_init_params params = { | ||
/*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(), | ||
/*.mem_buffer =*/ NULL, | ||
/*.no_alloc =*/ true, | ||
}; | ||
|
||
ggml_context * ctx = ggml_init(params); | ||
if (!ctx) { | ||
return nullptr; | ||
} | ||
|
||
ctx_map[buft] = ctx; | ||
cvec.ctxs.emplace_back(ctx); | ||
|
||
return ctx; | ||
} | ||
|
||
return it->second; | ||
}; | ||
|
||
// make tensors | ||
cvec.tensors.reserve(hparams.n_layer); | ||
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0 | ||
for (size_t il = 1; il < hparams.n_layer; il++) { | ||
ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il); | ||
ggml_context * ctx = ctx_for_buft(buft); | ||
if (!ctx) { | ||
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__); | ||
return false; | ||
} | ||
ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd); | ||
cvec.tensors.push_back(tensor); | ||
} | ||
|
||
// allocate tensors / buffers and zero | ||
cvec.bufs.reserve(ctx_map.size()); | ||
for (auto it : ctx_map) { | ||
ggml_backend_buffer_type_t buft = it.first; | ||
ggml_context * ctx = it.second; | ||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); | ||
if (!buf) { | ||
LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__); | ||
return false; | ||
} | ||
ggml_backend_buffer_clear(buf, 0); | ||
cvec.bufs.emplace_back(buf); | ||
} | ||
|
||
return true; | ||
} | ||
|
||
int32_t llama_control_vector_apply( | ||
struct llama_control_vector & cvec, | ||
const llama_model & model, | ||
const float * data, | ||
size_t len, | ||
int32_t n_embd, | ||
int32_t il_start, | ||
int32_t il_end) { | ||
const auto & hparams = model.hparams; | ||
|
||
if (data == nullptr) { | ||
// disable the current control vector (but leave allocated for later) | ||
cvec.layer_start = -1; | ||
cvec.layer_end = -1; | ||
return 0; | ||
} | ||
|
||
if (n_embd != (int) hparams.n_embd) { | ||
LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__); | ||
return 1; | ||
} | ||
|
||
if (cvec.tensors.empty()) { | ||
if (!llama_control_vector_init(cvec, model)) { | ||
return 1; | ||
} | ||
} | ||
|
||
cvec.layer_start = il_start; | ||
cvec.layer_end = il_end; | ||
|
||
for (size_t il = 1; il < hparams.n_layer; il++) { | ||
assert(cvec.tensors[il] != nullptr); | ||
|
||
const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present | ||
if (off + n_embd <= len) { | ||
ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il])); | ||
} | ||
} | ||
|
||
return 0; | ||
} | ||
|
||
// lora | ||
|
||
llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) { | ||
const std::string name(w->name); | ||
|
||
const auto pos = ab_map.find(name); | ||
if (pos != ab_map.end()) { | ||
return &pos->second; | ||
} | ||
|
||
return nullptr; | ||
} | ||
|
||
void llama_lora_adapter_free(struct llama_lora_adapter * adapter) { | ||
delete adapter; | ||
} | ||
|
||
void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) { | ||
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); | ||
|
||
ggml_context * ctx_init; | ||
struct gguf_init_params meta_gguf_params = { | ||
/* .no_alloc = */ true, | ||
/* .ctx = */ &ctx_init, | ||
}; | ||
|
||
gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) }; | ||
if (!ctx_gguf) { | ||
throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora)); | ||
} | ||
|
||
ggml_context_ptr ctx { ctx_init }; | ||
|
||
// check metadata | ||
{ | ||
auto get_kv_str = [&](const std::string & key) -> std::string { | ||
int id = gguf_find_key(ctx_gguf.get(), key.c_str()); | ||
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id)); | ||
}; | ||
auto get_kv_f32 = [&](const std::string & key) -> float { | ||
int id = gguf_find_key(ctx_gguf.get(), key.c_str()); | ||
return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id); | ||
}; | ||
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); | ||
|
||
auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE)); | ||
if (general_type != "adapter") { | ||
throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); | ||
} | ||
|
||
auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); | ||
auto general_arch = llm_arch_from_string(general_arch_str); | ||
if (general_arch != model.arch) { | ||
throw std::runtime_error("model arch and LoRA arch mismatch"); | ||
} | ||
|
||
auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE)); | ||
if (adapter_type != "lora") { | ||
throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type); | ||
} | ||
|
||
adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA)); | ||
} | ||
|
||
int n_tensors = gguf_get_n_tensors(ctx_gguf.get()); | ||
|
||
// contexts for each buffer type | ||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map; | ||
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { | ||
auto it = ctx_map.find(buft); | ||
if (it == ctx_map.end()) { | ||
// add a new context | ||
struct ggml_init_params params = { | ||
/*.mem_size =*/ n_tensors*ggml_tensor_overhead(), | ||
/*.mem_buffer =*/ NULL, | ||
/*.no_alloc =*/ true, | ||
}; | ||
ggml_context * buft_ctx = ggml_init(params); | ||
if (!buft_ctx) { | ||
return nullptr; | ||
} | ||
ctx_map[buft] = buft_ctx; | ||
adapter.ctxs.emplace_back(buft_ctx); | ||
return buft_ctx; | ||
}; | ||
return it->second; | ||
}; | ||
|
||
// bundle lora_a and lora_b into pairs | ||
std::map<std::string, llama_lora_weight> ab_map; | ||
auto str_endswith = [](const std::string & str, const std::string & suffix) { | ||
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; | ||
}; | ||
|
||
for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) { | ||
std::string name(cur->name); | ||
if (str_endswith(name, ".lora_a")) { | ||
replace_all(name, ".lora_a", ""); | ||
if (ab_map.find(name) == ab_map.end()) { | ||
ab_map[name] = llama_lora_weight(cur, nullptr); | ||
} else { | ||
ab_map[name].a = cur; | ||
} | ||
} else if (str_endswith(name, ".lora_b")) { | ||
replace_all(name, ".lora_b", ""); | ||
if (ab_map.find(name) == ab_map.end()) { | ||
ab_map[name] = llama_lora_weight(nullptr, cur); | ||
} else { | ||
ab_map[name].b = cur; | ||
} | ||
} else { | ||
throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix"); | ||
} | ||
} | ||
|
||
// add tensors | ||
for (auto & it : ab_map) { | ||
const std::string & name = it.first; | ||
llama_lora_weight & w = it.second; | ||
|
||
if (!w.a || !w.b) { | ||
throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component"); | ||
} | ||
|
||
// device buft and device ctx | ||
auto * model_tensor = llama_model_get_tensor(model, name.c_str()); | ||
if (!model_tensor) { | ||
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model"); | ||
} | ||
|
||
struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer)); | ||
// validate tensor shape | ||
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { | ||
throw std::runtime_error("tensor '" + name + "' has incorrect shape"); | ||
} | ||
if (w.a->ne[1] != w.b->ne[0]) { | ||
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)"); | ||
} | ||
|
||
// save tensor to adapter | ||
struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); | ||
struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); | ||
ggml_set_name(tensor_a, w.a->name); | ||
ggml_set_name(tensor_b, w.b->name); | ||
adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b); | ||
} | ||
|
||
// allocate tensors / buffers and zero | ||
{ | ||
adapter.ctxs.reserve(ctx_map.size()); | ||
adapter.bufs.reserve(ctx_map.size()); | ||
for (auto & it : ctx_map) { | ||
ggml_backend_buffer_type_t buft = it.first; | ||
ggml_context * ctx_dev = it.second; | ||
ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) }; | ||
if (!buf) { | ||
throw std::runtime_error("failed to allocate buffer for lora adapter\n"); | ||
} | ||
LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0); | ||
adapter.bufs.emplace_back(std::move(buf)); | ||
} | ||
} | ||
|
||
// set tensor data | ||
{ | ||
llama_file gguf_file(path_lora, "rb"); | ||
std::vector<uint8_t> read_buf; | ||
auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) { | ||
size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name)); | ||
size_t size = ggml_nbytes(orig); | ||
read_buf.resize(size); | ||
gguf_file.seek(offs, SEEK_SET); | ||
gguf_file.read_raw(read_buf.data(), size); | ||
ggml_backend_tensor_set(dev, read_buf.data(), 0, size); | ||
}; | ||
for (auto & it : adapter.ab_map) { | ||
auto orig = ab_map[it.first]; | ||
auto dev = it.second; | ||
set_tensor(orig.a, dev.a); | ||
set_tensor(orig.b, dev.b); | ||
} | ||
} | ||
|
||
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2); | ||
} |
Oops, something went wrong.