Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 17 additions & 17 deletions common/jinja/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,13 @@ value binary_expression::execute_impl(context & ctx) {
return false;
};

auto test_is_in = [&]() -> bool {
func_args args(ctx);
args.push_back(left_val);
args.push_back(right_val);
return global_builtins().at("test_is_in")(args)->as_bool();
};

// Handle undefined and null values
if (is_val<value_undefined>(left_val) || is_val<value_undefined>(right_val)) {
if (is_val<value_undefined>(right_val) && (op.value == "in" || op.value == "not in")) {
Expand Down Expand Up @@ -223,19 +230,11 @@ value binary_expression::execute_impl(context & ctx) {
return result;
}
} else if (is_val<value_array>(right_val)) {
auto & arr = right_val->as_array();
bool member = false;
for (const auto & item : arr) {
if (*left_val == *item) {
member = true;
break;
}
}
// case: 1 in [0, 1, 2]
bool member = test_is_in();
if (op.value == "in") {
JJ_DEBUG("Checking membership: %s in Array is %d", left_val->type().c_str(), member);
return mk_val<value_bool>(member);
} else if (op.value == "not in") {
JJ_DEBUG("Checking non-membership: %s not in Array is %d", left_val->type().c_str(), !member);
return mk_val<value_bool>(!member);
}
}
Expand All @@ -252,22 +251,23 @@ value binary_expression::execute_impl(context & ctx) {

// String membership
if (is_val<value_string>(left_val) && is_val<value_string>(right_val)) {
auto left_str = left_val->as_string().str();
auto right_str = right_val->as_string().str();
// case: "a" in "abc"
bool member = test_is_in();
if (op.value == "in") {
return mk_val<value_bool>(right_str.find(left_str) != std::string::npos);
return mk_val<value_bool>(member);
} else if (op.value == "not in") {
return mk_val<value_bool>(right_str.find(left_str) == std::string::npos);
return mk_val<value_bool>(!member);
}
}

// Value key in object
if (is_val<value_object>(right_val)) {
bool has_key = right_val->has_key(left_val);
// case: key in {key: value}
bool member = test_is_in();
if (op.value == "in") {
return mk_val<value_bool>(has_key);
return mk_val<value_bool>(member);
} else if (op.value == "not in") {
return mk_val<value_bool>(!has_key);
return mk_val<value_bool>(!member);
}
}

Expand Down
27 changes: 27 additions & 0 deletions common/jinja/value.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,33 @@ const func_builtins & global_builtins() {
{"test_is_lt", test_compare_fn<value_compare_op::lt>},
{"test_is_lessthan", test_compare_fn<value_compare_op::lt>},
{"test_is_ne", test_compare_fn<value_compare_op::ne>},
{"test_is_in", [](const func_args & args) -> value {
args.ensure_count(2);
auto needle = args.get_pos(0);
auto haystack = args.get_pos(1);
if (is_val<value_undefined>(haystack)) {
return mk_val<value_bool>(false);
}
if (is_val<value_array>(haystack)) {
for (const auto & item : haystack->as_array()) {
if (*needle == *item) {
return mk_val<value_bool>(true);
}
}
return mk_val<value_bool>(false);
}
if (is_val<value_string>(haystack)) {
if (!is_val<value_string>(needle)) {
throw raised_exception("'in' test expects args[1] as string when args[0] is string, got args[1] as " + needle->type());
}
return mk_val<value_bool>(
haystack->as_string().str().find(needle->as_string().str()) != std::string::npos);
}
if (is_val<value_object>(haystack)) {
return mk_val<value_bool>(haystack->has_key(needle));
}
throw raised_exception("'in' test expects iterable as first argument, got " + haystack->type());
}},
{"test_is_test", [](const func_args & args) -> value {
args.ensure_vals<value_string>();
auto & builtins = global_builtins();
Expand Down
5 changes: 5 additions & 0 deletions ggml/include/ggml-cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ extern "C" {
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
void * abort_callback_data;

// use only reference implementations
bool use_ref;
};

// numa strategies
Expand Down Expand Up @@ -132,6 +135,8 @@ extern "C" {
GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

GGML_BACKEND_API void ggml_backend_cpu_set_use_ref(ggml_backend_t backend_cpu, bool use_ref);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
Expand Down
3 changes: 3 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ struct ggml_compute_params {
void * wdata;

struct ggml_threadpool * threadpool;

// use reference implementation
bool use_ref;
};


Expand Down
22 changes: 15 additions & 7 deletions ggml/src/ggml-cpu/ggml-cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
#include "ggml-backend.h"
#include "traits.h"
#include "ggml-cpu-impl.h"
#include "ggml-cpu.h"
#include "ggml-impl.h"
#include "quants.h"
#include "ggml-threading.h"
Expand Down Expand Up @@ -2867,12 +2866,20 @@ struct ggml_cplan ggml_graph_plan(
} break;
case GGML_OP_FLASH_ATTN_EXT:
{
const int64_t neq2 = node->src[0]->ne[2]; // number of query heads
const int64_t DK = node->src[1]->ne[0];
const int64_t DV = node->src[2]->ne[0];

// Tiled flash attention scratch (tile sizes defined in common.h)
// Per-thread: Q_q + KQ + mask + VKQ32 + V32 + padding
cur = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV)*n_tasks;
size_t prefill = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV)*n_tasks;

// Decode path: n_kv_chunks = n_tasks (one chunk per thread)
// Per-thread: VKQ accmulator (DV), partial M, partial S + intra-thread scratch for V, Q and VKQ
size_t n_chunks = n_tasks;
size_t decode = sizeof(float)*(neq2*n_chunks*(2+DV) + n_tasks*(DK + 2*DV));

cur += MAX(prefill, decode);
} break;
case GGML_OP_FLASH_ATTN_BACK:
{
Expand Down Expand Up @@ -2929,11 +2936,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
set_numa_thread_affinity(state->ith);

struct ggml_compute_params params = {
/*.ith =*/ state->ith,
/*.nth =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
/*.wsize =*/ cplan->work_size,
/*.wdata =*/ cplan->work_data,
/*.threadpool=*/ tp,
/*.ith =*/ state->ith,
/*.nth =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
/*.wsize =*/ cplan->work_size,
/*.wdata =*/ cplan->work_data,
/*.threadpool =*/ tp,
/*.use_ref =*/ cplan->use_ref,
};

GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
Expand Down
15 changes: 15 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ struct ggml_backend_cpu_context {

ggml_abort_callback abort_callback;
void * abort_callback_data;

bool use_ref; // use reference implementation
};

static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
Expand Down Expand Up @@ -143,6 +145,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend

cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
cpu_plan->cplan.use_ref = cpu_ctx->use_ref;

return cpu_plan;
}
Expand Down Expand Up @@ -182,6 +185,7 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s

cplan.abort_callback = cpu_ctx->abort_callback;
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
cplan.use_ref = cpu_ctx->use_ref;

return ggml_graph_compute(cgraph, &cplan);
}
Expand Down Expand Up @@ -223,6 +227,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
ctx->work_size = 0;
ctx->abort_callback = NULL;
ctx->abort_callback_data = NULL;
ctx->use_ref = false;

ggml_backend_t cpu_backend = new ggml_backend {
/* .guid = */ ggml_backend_cpu_guid(),
Expand Down Expand Up @@ -270,6 +275,13 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_
ctx->abort_callback_data = abort_callback_data;
}

void ggml_backend_cpu_set_use_ref(ggml_backend_t backend_cpu, bool use_ref) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
ctx->use_ref = use_ref;
}

// CPU backend - device

struct ggml_backend_cpu_device_context {
Expand Down Expand Up @@ -646,6 +658,9 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
return (void *)ggml_is_numa;
}
if (strcmp(name, "ggml_backend_cpu_set_use_ref") == 0) {
return (void *)ggml_backend_cpu_set_use_ref;
}

// threadpool - TODO: move to ggml-base
if (strcmp(name, "ggml_threadpool_new") == 0) {
Expand Down
Loading
Loading