diff --git a/README.md b/README.md index 33fd56b..a2594de 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,12 @@ -# bloomz.cpp +# clone of bloomz.cpp modified to quantize and run inference on the bloom-176B model + +You can quantize and run inference on bloom-176b + +- 4-bit quantized Bloom model file is ~112GB +- this code/model takes about 105GB of RAM to run inference (though peak allocation is 111GB due to some shortcomings of the code that are not yet fixed) +- performance-wise, this is not fast at all: about 17 seconds per token (on 96 threads), which is very close to full-size Bloom on the same CPU + +## bloomz.cpp Inference of HuggingFace's [BLOOM-like](https://huggingface.co/docs/transformers/model_doc/bloom) models in pure C/C++. diff --git a/ggml.c b/ggml.c index 43f5a25..145a150 100644 --- a/ggml.c +++ b/ggml.c @@ -2229,10 +2229,10 @@ void ggml_print_objects(const struct ggml_context * ctx) { GGML_PRINT("%s: --- end ---\n", __func__); } -int ggml_nelements(const struct ggml_tensor * tensor) { +size_t ggml_nelements(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; + return (size_t) tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } int ggml_nrows(const struct ggml_tensor * tensor) { diff --git a/ggml.h b/ggml.h index 6ae3404..d77dab5 100644 --- a/ggml.h +++ b/ggml.h @@ -327,7 +327,7 @@ int64_t ggml_cycles_per_ms(void); void ggml_print_object (const struct ggml_object * obj); void ggml_print_objects(const struct ggml_context * ctx); -int ggml_nelements(const struct ggml_tensor * tensor); +size_t ggml_nelements(const struct ggml_tensor * tensor); size_t ggml_nbytes (const struct ggml_tensor * tensor); int ggml_blck_size (enum ggml_type type); diff --git a/main.cpp b/main.cpp index 2552981..b7e1621 100644 --- a/main.cpp +++ b/main.cpp @@ -216,6 +216,9 @@ bool bloom_model_load(const std::string & fname, bloom_model & 
model, gpt_vocab printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); } + // FIXME: increase the context size as it seems above calculations are off... + ctx_size += 5000000000; // + 5GB, seems to be the least amount of GB that works + // create the ggml context { struct ggml_init_params params = { @@ -355,7 +358,7 @@ bool bloom_model_load(const std::string & fname, bloom_model & model, gpt_vocab break; } - int32_t nelements = 1; + int64_t nelements = 1; int32_t ne[2] = { 1, 1 }; for (int i = 0; i < n_dims; ++i) { fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i])); @@ -551,7 +554,8 @@ bool bloom_eval( const int d_key = n_embd/n_head; - static size_t buf_size = 512u*1024*1024; + //static size_t buf_size = 512u*1024*1024; + static size_t buf_size = 1024u*1024*1024; static void * buf = malloc(buf_size); if (mem_per_token > 0 && mem_per_token*N > buf_size) { diff --git a/quantize.cpp b/quantize.cpp index e150ae6..3833191 100644 --- a/quantize.cpp +++ b/quantize.cpp @@ -150,7 +150,7 @@ bool bloom_model_quantize(const std::string & fname_inp, const std::string & fna break; } - int32_t nelements = 1; + int64_t nelements = 1; int32_t ne[2] = { 1, 1 }; for (int i = 0; i < n_dims; ++i) { finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i])); diff --git a/utils.cpp b/utils.cpp index 3a0a574..9e533ed 100644 --- a/utils.cpp +++ b/utils.cpp @@ -486,7 +486,7 @@ gpt_vocab::id bloom_sample_top_p( } -size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) { +size_t ggml_quantize_q4_0(float * src, void * dst, int64_t n, int k, int qk, int64_t * hist) { const int nb = k / qk; const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2); const size_t row_size = nb*bs; @@ -498,7 +498,7 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t char * pdst = (char *) dst; - for (int j = 0; j < n; j += k) { + for (int64_t j = 0; j < n; j += k) { uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); uint8_t * 
pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float)); @@ -542,7 +542,7 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t return (n/k)*row_size; } -size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) { +size_t ggml_quantize_q4_1(float * src, void * dst, int64_t n, int k, int qk, int64_t * hist) { const int nb = k / qk; const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2); @@ -553,7 +553,7 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t char * pdst = (char *) dst; - for (int j = 0; j < n; j += k) { + for (int64_t j = 0; j < n; j += k) { float * pm = (float *) (pdst + (j/k)*row_size); float * pd = (float *) (pm + nb); uint8_t * pb = (uint8_t *) (pd + nb); diff --git a/utils.h b/utils.h index f69f433..e434974 100644 --- a/utils.h +++ b/utils.h @@ -101,5 +101,5 @@ gpt_vocab::id bloom_sample_top_p( // Quantization // -size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist); -size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist); +size_t ggml_quantize_q4_0(float * src, void * dst, int64_t n, int k, int qk, int64_t * hist); +size_t ggml_quantize_q4_1(float * src, void * dst, int64_t n, int k, int qk, int64_t * hist);