Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions common/speculative.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -805,6 +805,42 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
return it->second;
}

// Probe whether the target context can be used for speculative decoding.
//
// The probe decodes two dummy tokens and then tries to remove the last tokens
// again. If the decode fails or the partial removal is rejected, the context
// is reported as incompatible.
//
// Returns false right away when the context has no memory object. Otherwise
// the context memory is cleared both before and after the probe.
bool common_speculative_is_compat(llama_context * ctx_tgt) {
    auto * mem = llama_get_memory(ctx_tgt);
    if (mem == nullptr) {
        return false;
    }

    // start the probe from a clean state
    llama_memory_clear(mem, true);

    // eval 2 tokens to check if the context is compatible
    std::vector<llama_token> probe = { 0, 0 };

    bool compatible = false;

    const int status = llama_decode(ctx_tgt, llama_batch_get_one(probe.data(), probe.size()));
    if (status != 0) {
        LOG_ERR("%s: llama_decode() failed: %d\n", __func__, status);
    } else if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
        // try to remove the last tokens - not every memory type allows this
        LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
    } else {
        compatible = true;
    }

    // common cleanup for every outcome of the probe
    llama_memory_clear(mem, true);
    llama_synchronize(ctx_tgt);

    return compatible;
}

// initialization of the speculative decoding system
//
common_speculative * common_speculative_init(
Expand Down
4 changes: 4 additions & 0 deletions common/speculative.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
// convert type to string
std::string common_speculative_type_to_str(enum common_speculative_type type);

// check if the llama_context is compatible for speculative decoding
// the check decodes 2 dummy tokens and then tries to remove the last tokens again
// returns false if the context has no memory, if the decode fails,
// or if the context does not support partial sequence removal
// note: clears the memory of the context
bool common_speculative_is_compat(llama_context * ctx_tgt);

common_speculative * common_speculative_init(
common_params_speculative & params,
llama_context * ctx_tgt);
Expand Down
131 changes: 130 additions & 1 deletion convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -920,7 +920,7 @@ def set_gguf_parameters(self):
self.gguf_writer.add_expert_group_used_count(n_group_used)
logger.info(f"gguf: expert groups used count = {n_group_used}")

if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation_func"], optional=True)) is not None:
if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None:
if score_func == "sigmoid":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
elif score_func == "softmax":
Expand Down Expand Up @@ -7912,6 +7912,135 @@ def prepare_tensors(self):
raise ValueError(f"Unprocessed experts: {experts}")


@ModelBase.register("Step3p5ForCausalLM")
class Step35Model(TextModel):
    """HF -> GGUF converter for `Step3p5ForCausalLM` checkpoints (GGUF arch STEP35).

    Writes per-layer head counts, the sliding-window pattern, MoE metadata and
    optional SwiGLU clamps; drops the layers past `num_hidden_layers` (MTP layers);
    and emits llama3-style rope factors when the config asks for them.
    """
    model_arch = gguf.MODEL_ARCH.STEP35

    def set_gguf_parameters(self):
        """Write the Step35-specific hyper-parameters to the GGUF writer.

        Raises:
            ValueError: if `partial_rotary_factors` does not match the per-layer
                pattern implied by `layer_types` (1.0 for sliding-attention layers,
                0.5 for all other layers).
            KeyError: if a required hparam (head counts, sliding window, MoE sizes) is missing.
        """
        # rope_theta may be a list: entry 0 becomes the base value, entry 1 the
        # value used for the sliding-attention layers.
        rope_theta = self.hparams.get("rope_theta")
        if isinstance(rope_theta, list):
            self.hparams["rope_theta"] = float(rope_theta[0])
            self.hparams["local_rope_theta"] = float(rope_theta[1])
            self.rope_parameters["rope_theta"] = self.hparams["rope_theta"]
            self.rope_parameters["sliding_attention"] = {"rope_theta": self.hparams["local_rope_theta"]}

        super().set_gguf_parameters()

        layer_types = self.hparams.get("layer_types") or []
        partial_rotary_factors = self.hparams.get("partial_rotary_factors") or []
        attn_other = self.hparams.get("attention_other_setting") or {}

        n_head_base = self.hparams["num_attention_heads"]
        n_kv_base = self.hparams["num_attention_groups"]

        # sliding-attention layers may override the head counts; fall back to the base values
        n_head_swa = attn_other.get("num_attention_heads", n_head_base)
        n_kv_swa = attn_other.get("num_attention_groups", n_kv_base)

        # the config lists can be longer than the number of exported blocks
        layer_types = layer_types[: self.block_count]
        partial_rotary_factors = partial_rotary_factors[: self.block_count]

        # Validate with an explicit exception instead of `assert`: asserts are stripped
        # under `python -O`, which would let an unsupported config convert silently.
        expected_factors = [1.0 if lt == "sliding_attention" else 0.5 for lt in layer_types]
        if expected_factors != partial_rotary_factors:
            raise ValueError(
                f"Step35: unsupported partial_rotary_factors {partial_rotary_factors}, "
                f"expected {expected_factors} for layer_types {layer_types}"
            )

        head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types]
        kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types]
        swa_pat = [lt == "sliding_attention" for lt in layer_types]

        self.gguf_writer.add_head_count(head_arr)
        self.gguf_writer.add_head_count_kv(kv_arr)

        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
        self.gguf_writer.add_sliding_window_pattern(swa_pat)

        self.gguf_writer.add_value_length(self.hparams["head_dim"])

        # MoE params
        self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
        self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
        self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["share_expert_dim"])

        if (moe_router_scaling_factor := self.hparams.get("moe_router_scaling_factor")) is not None:
            self.gguf_writer.add_expert_weights_scale(moe_router_scaling_factor)
        if (norm_expert_weight := self.hparams.get("norm_expert_weight")) is not None:
            self.gguf_writer.add_expert_weights_norm(norm_expert_weight)

        # leading dense blocks: the smallest index listed in the comma-separated
        # `moe_layers_enum` string is the number of dense blocks before the first MoE block
        leading_dense = 0
        moe_layers_enum = self.hparams.get("moe_layers_enum")
        if isinstance(moe_layers_enum, str) and moe_layers_enum.strip():
            # skip empty pieces (e.g. a trailing comma) instead of crashing in int("")
            moe_layers = sorted(int(i) for i in moe_layers_enum.split(",") if i.strip())
            if moe_layers:
                leading_dense = max(0, moe_layers[0])
        self.gguf_writer.add_leading_dense_block_count(leading_dense)
        self.gguf_writer.add_moe_every_n_layers(int(self.hparams.get("moe_every_n_layer", 1)))

        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5))

        # Optional per-layer SwiGLU clamps; a missing (None) entry is written as 0.0.
        if (limits := self.hparams.get("swiglu_limits")) is not None:
            limits_f = [0.0 if v is None else float(v) for v in limits[: self.block_count]]
            self.gguf_writer.add_swiglu_clamp_exp(limits_f)
        if (limits_shared := self.hparams.get("swiglu_limits_shared")) is not None:
            limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self.block_count]]
            self.gguf_writer.add_swiglu_clamp_shexp(limits_shared_f)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Adjust one checkpoint tensor and forward it to the base-class mapping.

        Yields nothing for tensors that belong to layers at or beyond
        `num_hidden_layers`; otherwise yields whatever the base class produces.
        """
        # remove mtp layers
        if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None:
            il = int(m.group(1))
            n_main = int(self.hparams.get("num_hidden_layers", self.block_count))
            if il >= n_main:
                return
        if name.endswith("norm.weight"):
            # norm weights are exported shifted by +1; use an out-of-place add so the
            # tensor object handed in by the caller is not mutated
            data_torch = data_torch + 1.0
        # Map router bias (expert selection bias) to a GGUF bias tensor
        if name.endswith(".moe.router_bias"):
            name += ".bias"

        # drop size-1 dimensions from these projection weights
        if name.endswith((".self_attn.g_proj.weight", ".moe.gate.weight", ".moe.up_proj.weight", ".moe.gate_proj.weight", ".moe.down_proj.weight")):
            data_torch = data_torch.squeeze().contiguous()

        yield from super().modify_tensors(data_torch, name, bid)

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        """Yield the `rope_freqs` tensor when the config uses llama3-style RoPE scaling.

        Yields nothing for any other rope type.
        """
        # Step35 can optionally use Llama-3 style RoPE scaling (HF: rope_scaling.rope_type == "llama3").
        # llama.cpp represents this via a single extra tensor: "rope_freqs.weight" (aka MODEL_TENSOR.ROPE_FREQS).
        rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
        rope_type = rope_params.get("rope_type") or ""
        if rope_type.lower() != "llama3":
            return

        # Step35 configs can carry per-layer rope_theta as a list; for llama3 rope factors we use the base value.
        rope_theta = self.hparams.get("rope_theta", 10000.0)
        if isinstance(rope_theta, list):
            rope_theta = rope_theta[0]
        base = float(rope_theta)
        if (dim := self.hparams.get("head_dim")) is None:
            dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
        dim = int(dim)

        # one base frequency per pair of dimensions
        freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

        factor = float(rope_params.get("factor", 8.0))
        low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
        high_freq_factor = float(rope_params.get("high_freq_factor", 4.0))
        old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192)))

        low_freq_wavelen = old_context_len / low_freq_factor
        high_freq_wavelen = old_context_len / high_freq_factor

        rope_factors: list[float] = []
        for freq in freqs:
            wavelen = 2 * math.pi / float(freq)
            if wavelen < high_freq_wavelen:
                # short wavelengths are left unscaled
                rope_factors.append(1.0)
            elif wavelen > low_freq_wavelen:
                # long wavelengths get the full scaling factor
                rope_factors.append(factor)
            else:
                # in between: interpolate smoothly between the two regimes
                smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                rope_factors.append(1.0 / ((1.0 - smooth) / factor + smooth))

        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))


@ModelBase.register("PanguEmbeddedForCausalLM")
class PanguEmbeddedModel(TextModel):
model_arch = gguf.MODEL_ARCH.PANGU_EMBED
Expand Down
2 changes: 1 addition & 1 deletion docs/ops.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Legend:
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
Expand Down
8 changes: 4 additions & 4 deletions docs/ops/SYCL.csv
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@
"SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
Expand Down Expand Up @@ -161,8 +161,8 @@
"SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","FLOOR","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","CEIL","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","CEIL","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","CEIL","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","CEIL","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","ROUND","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","ROUND","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","TRUNC","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
Expand Down
13 changes: 3 additions & 10 deletions ggml/src/ggml-sycl/element_wise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -836,16 +836,9 @@ static inline void ggml_sycl_op_floor(ggml_backend_sycl_context & ctx, ggml_tens
}

// CEIL for the SYCL backend.
// Delegates to the generic ggml_sycl_op_unary helper, passing op_ceil as the
// functor applied to each value. The former hand-rolled parallel_for launch of
// unary_op_ceil_kernel is dropped so that the op is dispatched only once.
static inline void ggml_sycl_op_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_ceil(x);
    });
}

static inline void ggml_sycl_op_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-sycl/ggml-sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4591,9 +4591,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_UNARY_OP_EXP:
case GGML_UNARY_OP_SOFTPLUS:
case GGML_UNARY_OP_ELU:
case GGML_UNARY_OP_CEIL:
return true;
case GGML_UNARY_OP_FLOOR:
case GGML_UNARY_OP_CEIL:
case GGML_UNARY_OP_ROUND:
case GGML_UNARY_OP_TRUNC:
#if defined (GGML_SYCL_F16)
Expand Down
69 changes: 69 additions & 0 deletions ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -465,4 +465,73 @@ inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_unary_shader(
return result;
}

/** Binary **/

// Identifies one compiled variant of the binary-op shader.
// Two keys are equal only when all four fields match.
struct ggml_webgpu_binary_pipeline_key {
    int  type;     // tensor element type (compared against GGML_TYPE_* when the shader is built)
    int  op;       // operation id (converted to ggml_op when the shader is built)
    bool inplace;  // selects the INPLACE shader variant
    bool overlap;  // selects the OVERLAP shader variant

    bool operator==(const ggml_webgpu_binary_pipeline_key & other) const {
        if (type != other.type || op != other.op) {
            return false;
        }
        return inplace == other.inplace && overlap == other.overlap;
    }
};

// Hash functor for ggml_webgpu_binary_pipeline_key.
// Every field that takes part in operator== is folded into the hash, in
// declaration order.
struct ggml_webgpu_binary_pipeline_key_hash {
    size_t operator()(const ggml_webgpu_binary_pipeline_key & key) const {
        size_t h = 0;

        ggml_webgpu_hash_combine(h, key.type);
        ggml_webgpu_hash_combine(h, key.op);
        ggml_webgpu_hash_combine(h, key.inplace);
        ggml_webgpu_hash_combine(h, key.overlap);

        return h;
    }
};

// Inputs for ggml_webgpu_preprocess_binary_shader().
struct ggml_webgpu_binary_shader_lib_context {
// selects the op, element type and the inplace/overlap shader variant
ggml_webgpu_binary_pipeline_key key;
// emitted as the WG_SIZE define and stored as the wg_size decision of the result
uint32_t max_wg_size;
};

// Build the WGSL source for one variant of the binary-op shader.
//
// The preprocessor defines are derived from the pipeline key:
//   OP_<op name>, TYPE_F32 or TYPE_F16, optionally INPLACE or OVERLAP,
//   and WG_SIZE=<max_wg_size>.
// The variant name is assembled from the same pieces, e.g. "<op>_f32_inplace".
// Aborts for any element type other than F32/F16.
//
// The returned decisions object is allocated with new and handed over to the
// result.
inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_binary_shader(
    pre_wgsl::Preprocessor &                      preprocessor,
    const char *                                  shader_src,
    const ggml_webgpu_binary_shader_lib_context & context) {
    const ggml_webgpu_binary_pipeline_key & key = context.key;

    const std::string        op_name = ggml_op_name(static_cast<ggml_op>(key.op));
    std::string              variant = op_name;
    std::vector<std::string> defines;

    defines.push_back("OP_" + op_name);

    if (key.type == GGML_TYPE_F32) {
        defines.push_back("TYPE_F32");
        variant += "_f32";
    } else if (key.type == GGML_TYPE_F16) {
        defines.push_back("TYPE_F16");
        variant += "_f16";
    } else {
        GGML_ABORT("Unsupported type for binary shader");
    }

    // inplace wins when both flags are set
    if (key.inplace) {
        defines.push_back("INPLACE");
        variant += "_inplace";
    } else if (key.overlap) {
        defines.push_back("OVERLAP");
        variant += "_overlap";
    }

    defines.push_back("WG_SIZE=" + std::to_string(context.max_wg_size));

    ggml_webgpu_processed_shader result;
    result.wgsl    = preprocessor.preprocess(shader_src, defines);
    result.variant = variant;

    auto * decisions   = new ggml_webgpu_generic_shader_decisions();
    decisions->wg_size = context.max_wg_size;
    result.decisions   = decisions;

    return result;
}
#endif // GGML_WEBGPU_SHADER_LIB_HPP
Loading
Loading