From 91e84fed64329cd96202d68220724a1d92f5ec1f Mon Sep 17 00:00:00 2001 From: Sid Shaytay <2595088+SidShaytay@users.noreply.github.com> Date: Fri, 15 May 2026 00:03:24 -0700 Subject: [PATCH 001/309] Support for Codex CLI by skipping unsupported Responses tools (#23041) * Support for Codex CLI by skipping unsupported Responses tools * Warn on skipped Responses tools and preserve gpt-oss apply_patch rejection * Revert gpt-oss apply_patch special handling --- tests/test-chat.cpp | 77 ++++++++++++++++++++++++++++++++++++ tools/server/server-chat.cpp | 11 ++++-- 2 files changed, 85 insertions(+), 3 deletions(-) diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index ea9d87ebed2..05c60d29743 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -1664,6 +1664,83 @@ static void test_convert_responses_to_chatcmpl() { assert_equals(false, result.contains("max_output_tokens")); assert_equals(100, result.at("max_tokens").get()); } + + // Test mixed Responses tools: convert only function tools + { + json input = json::parse(R"({ + "input": "Hello", + "model": "test-model", + "tools": [ + { + "type": "web_search" + }, + { + "type": "function", + "name": "get_weather", + "description": "Get weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string" + } + }, + "required": ["location"] + } + }, + { + "type": "image_generation" + }, + { + "type": "mcp", + "server_label": "test-server" + }, + { + "type": "namespace", + "name": "browser" + } + ] + })"); + + json result = server_chat_convert_responses_to_chatcmpl(input); + + assert_equals(true, result.contains("tools")); + assert_equals(true, result.at("tools").is_array()); + assert_equals((size_t)1, result.at("tools").size()); + + const auto & tool = result.at("tools")[0]; + assert_equals(std::string("function"), tool.at("type").get()); + assert_equals(std::string("get_weather"), tool.at("function").at("name").get()); + assert_equals(true, tool.at("function").at("strict").get()); + } + + // Test non-function Responses tools are ignored + { + json input = json::parse(R"({ + "input": "Hello", + "model": "test-model", + "tools": [ + { + "type": "web_search" + }, + { + "type": "image_generation" + }, + { + "type": "mcp", + "server_label": "test-server" + }, + { + "type": "namespace", + "name": "browser" + } + ] + })"); + + json result = server_chat_convert_responses_to_chatcmpl(input); + + assert_equals(false, result.contains("tools")); + } } static void test_template_output_peg_parsers(bool detailed_debug) { diff --git a/tools/server/server-chat.cpp b/tools/server/server-chat.cpp index f276f8da58f..02858a2a028 100644 --- a/tools/server/server-chat.cpp +++ b/tools/server/server-chat.cpp @@ -257,8 +257,11 @@ json server_chat_convert_responses_to_chatcmpl(const json & response_body) { for (json resp_tool : response_body.at("tools")) { json chatcmpl_tool; - if (json_value(resp_tool, "type", std::string()) != "function") { - throw std::invalid_argument("'type' of tool must be 'function'"); + const std::string type = json_value(resp_tool, "type", std::string()); + if (type != "function") { + // Non-function Responses tools have no Chat Completions equivalent. + SRV_WRN("unsupported Responses tool type '%s' skipped\n", type.c_str()); + continue; } resp_tool.erase("type"); chatcmpl_tool["type"] = "function"; @@ -270,7 +273,9 @@ json server_chat_convert_responses_to_chatcmpl(const json & response_body) { chatcmpl_tools.push_back(chatcmpl_tool); } chatcmpl_body.erase("tools"); - chatcmpl_body["tools"] = chatcmpl_tools; + if (!chatcmpl_tools.empty()) { + chatcmpl_body["tools"] = chatcmpl_tools; + } } if (response_body.contains("max_output_tokens")) { From d5284445802b5af6bff3240329f2558f9c35b5f5 Mon Sep 17 00:00:00 2001 From: Pascal Date: Fri, 15 May 2026 11:18:11 +0200 Subject: [PATCH 002/309] webui: preserve partial response on streaming error (#23090) --- .../server/webui/src/lib/constants/agentic.ts | 3 --- .../webui/src/lib/stores/agentic.svelte.ts | 12 +++------- .../webui/src/lib/stores/chat.svelte.ts | 23 +++++++++++-------- 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/tools/server/webui/src/lib/constants/agentic.ts b/tools/server/webui/src/lib/constants/agentic.ts index 4fe6da61c38..c0575163efc 100644 --- a/tools/server/webui/src/lib/constants/agentic.ts +++ b/tools/server/webui/src/lib/constants/agentic.ts @@ -4,9 +4,6 @@ export const ATTACHMENT_SAVED_REGEX = /\[Attachment saved: ([^\]]+)\]/; export const NEWLINE_SEPARATOR = '\n'; -export const LLM_ERROR_BLOCK_START = '\n\n```\nUpstream LLM error:\n'; -export const LLM_ERROR_BLOCK_END = '\n```\n'; - export const DEFAULT_AGENTIC_CONFIG: AgenticConfig = { enabled: true, maxTurns: 100, diff --git a/tools/server/webui/src/lib/stores/agentic.svelte.ts b/tools/server/webui/src/lib/stores/agentic.svelte.ts index 3334f6c111a..1f1f05c4539 100644 --- a/tools/server/webui/src/lib/stores/agentic.svelte.ts +++ b/tools/server/webui/src/lib/stores/agentic.svelte.ts @@ -30,12 +30,7 @@ import { ToolSource, ToolPermissionDecision } from '$lib/enums'; import { SvelteMap } from 'svelte/reactivity'; import { ToolsService } from '$lib/services/tools.service'; import { isAbortError } from '$lib/utils'; -import { - DEFAULT_AGENTIC_CONFIG, - NEWLINE_SEPARATOR, - LLM_ERROR_BLOCK_START, - LLM_ERROR_BLOCK_END -} from '$lib/constants'; +import { DEFAULT_AGENTIC_CONFIG, NEWLINE_SEPARATOR } from '$lib/constants'; import { IMAGE_MIME_TO_EXTENSION, DATA_URI_BASE64_REGEX, @@ -640,10 +635,9 @@ class AgenticStore { return; } const normalizedError = error instanceof Error ? error : new Error('LLM stream error'); - // Save error as content in the current turn - onChunk?.(`${LLM_ERROR_BLOCK_START}${normalizedError.message}${LLM_ERROR_BLOCK_END}`); + // preserve partial output as is, the outer error dialog informs the user separately await onAssistantTurnComplete?.( - turnContent + `${LLM_ERROR_BLOCK_START}${normalizedError.message}${LLM_ERROR_BLOCK_END}`, + turnContent, turnReasoningContent || undefined, this.buildFinalTimings(capturedTimings, agenticTimings), undefined diff --git a/tools/server/webui/src/lib/stores/chat.svelte.ts b/tools/server/webui/src/lib/stores/chat.svelte.ts index 7c34579ca56..04a735eec91 100644 --- a/tools/server/webui/src/lib/stores/chat.svelte.ts +++ b/tools/server/webui/src/lib/stores/chat.svelte.ts @@ -814,7 +814,7 @@ class ChatStore { ); } }, - onError: (error: Error) => { + onError: async (error: Error) => { this.setStreamingActive(false); if (isAbortError(error)) { cleanupStreamingState(); @@ -826,13 +826,10 @@ class ChatStore { return; } console.error('Streaming error:', error); + // keep whatever was streamed so far, the message stays in memory and in DB + await this.savePartialResponseIfNeeded(convId); cleanupStreamingState(); this.clearPendingMessage(convId); - const idx = conversationsStore.findMessageIndex(assistantMessage.id); - if (idx !== -1) { - const failedMessage = conversationsStore.removeMessageAtIndex(idx); - if (failedMessage) DatabaseService.deleteMessage(failedMessage.id).catch(console.error); - } const contextInfo = ( error as Error & { contextInfo?: { n_prompt_tokens: number; n_ctx: number } } ).contextInfo; @@ -1389,9 +1386,17 @@ class ChatStore { } console.error('Continue generation error:', error); - conversationsStore.updateMessageAtIndex(idx, { content: originalContent }); - - await DatabaseService.updateMessage(msg.id, { content: originalContent }); + // keep whatever was appended so far, the message stays in memory and in DB + await DatabaseService.updateMessage(msg.id, { + content: originalContent + appendedContent, + reasoningContent: originalReasoning + appendedReasoning || undefined, + timestamp: Date.now() + }); + conversationsStore.updateMessageAtIndex(idx, { + content: originalContent + appendedContent, + reasoningContent: originalReasoning + appendedReasoning || undefined, + timestamp: Date.now() + }); this.setChatLoading(msg.convId, false); this.clearChatStreaming(msg.convId); From ac33f032ac41748ffa9aaefca07cfc16e732f73f Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Fri, 15 May 2026 17:59:07 +0800 Subject: [PATCH 003/309] reasoning-budget: clone should do a deep-copy (#23095) --- common/reasoning-budget.cpp | 21 +++++------ tests/test-reasoning-budget.cpp | 65 ++++++++++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 12 deletions(-) diff --git a/common/reasoning-budget.cpp b/common/reasoning-budget.cpp index c6e1f86c91e..958c9cacf51 100644 --- a/common/reasoning-budget.cpp +++ b/common/reasoning-budget.cpp @@ -171,22 +171,12 @@ static void common_reasoning_budget_reset(struct llama_sampler * smpl) { ctx->force_pos = 0; } -// forward declaration for use in clone static struct llama_sampler * common_reasoning_budget_init_state( const struct llama_vocab * vocab, const std::vector & start_tokens, const std::vector & end_tokens, const std::vector & forced_tokens, int32_t budget, common_reasoning_budget_state initial_state); -static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx; - return common_reasoning_budget_init_state( - ctx->vocab, - ctx->start_matcher.tokens, - ctx->end_matcher.tokens, - ctx->forced_tokens, - ctx->budget, - ctx->state); -} +static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl); static void common_reasoning_budget_free(struct llama_sampler * smpl) { delete (common_reasoning_budget_ctx *) smpl->ctx; @@ -205,6 +195,15 @@ static struct llama_sampler_i common_reasoning_budget_i = { /* .backend_set_input = */ nullptr, }; +static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx; + + return llama_sampler_init( + /* .iface = */ &common_reasoning_budget_i, + /* .ctx = */ new common_reasoning_budget_ctx(*ctx) + ); +} + static struct llama_sampler * common_reasoning_budget_init_state( const struct llama_vocab * vocab, const std::vector & start_tokens, diff --git a/tests/test-reasoning-budget.cpp b/tests/test-reasoning-budget.cpp index f7a60178996..10d0ac21da8 100644 --- a/tests/test-reasoning-budget.cpp +++ b/tests/test-reasoning-budget.cpp @@ -124,6 +124,66 @@ static void test_reasoning_budget( (void)sequence; } +static llama_token get_forced_token(struct llama_sampler * sampler, llama_token max_token) { + std::vector cur; + const size_t n_vocab = (size_t) max_token + 1; + for (size_t i = 0; i < n_vocab; i++) { + cur.emplace_back(llama_token_data{(llama_token) i, logf((float) (i + 1)), 0.0f}); + } + + llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + llama_sampler_apply(sampler, &cur_p); + + size_t finite_count = 0; + llama_token finite_token = LLAMA_TOKEN_NULL; + for (size_t i = 0; i < cur.size(); i++) { + if (std::isfinite(cur[i].logit)) { + finite_count++; + finite_token = cur[i].id; + } + } + + GGML_ASSERT(finite_count == 1 && "sampler is not forcing exactly one token"); + return finite_token; +} + +static void test_reasoning_budget_clone_mid_counting() { + const std::vector start = {100}; + const std::vector end = {101}; + const std::vector forced = {102, 101}; + + auto * sampler = common_reasoning_budget_init(nullptr, start, end, forced, 2, REASONING_BUDGET_IDLE); + + llama_sampler_accept(sampler, 100); // COUNTING, remaining=2 + llama_sampler_accept(sampler, 50); // COUNTING, remaining=1 + + auto * clone = llama_sampler_clone(sampler); + llama_sampler_accept(clone, 51); // should exhaust the cloned remaining budget + + GGML_ASSERT(get_forced_token(clone, 102) == 102 && "cloned counting state lost remaining budget"); + + llama_sampler_free(clone); + llama_sampler_free(sampler); +} + +static void test_reasoning_budget_clone_mid_forcing() { + const std::vector start = {100}; + const std::vector end = {101}; + const std::vector forced = {102, 101}; + + auto * sampler = common_reasoning_budget_init(nullptr, start, end, forced, 0, REASONING_BUDGET_FORCING); + + GGML_ASSERT(get_forced_token(sampler, 102) == 102); + llama_sampler_accept(sampler, 102); // advance to the second forced token + + auto * clone = llama_sampler_clone(sampler); + + GGML_ASSERT(get_forced_token(clone, 102) == 101 && "cloned forcing state lost force position"); + + llama_sampler_free(clone); + llama_sampler_free(sampler); +} + // UTF-8 boundary detection unit test // Tests common_utf8_is_complete() from reasoning-budget.h static void test_utf8_boundary_detection() { @@ -250,7 +310,10 @@ int main(void) { 7); // forcing continues through i=7 } - printf("OK (6 tests passed)\n"); + test_reasoning_budget_clone_mid_counting(); + test_reasoning_budget_clone_mid_forcing(); + + printf("OK (8 tests passed)\n"); printf("Testing UTF-8 boundary detection... "); test_utf8_boundary_detection(); From d5dc2e0a0275bf51a43074920f03b08c0fa906de Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 15 May 2026 13:58:30 +0300 Subject: [PATCH 004/309] llama-eval : add AIME 2026 dataset support (#23058) Add Aime2026Dataset class loading from MathArena/aime_2026 on HuggingFace. 30 problems (two sets of 15), single config/split. Usage: --dataset aime2026 Assisted-by: llama.cpp:local pi --- examples/llama-eval/llama-eval.py | 57 ++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py index a2948273512..e833070eee9 100755 --- a/examples/llama-eval/llama-eval.py +++ b/examples/llama-eval/llama-eval.py @@ -44,6 +44,7 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f GRADER_PATTERNS = { "aime": r'\boxed{(\d+)}|\b(\d+)\b', "aime2025": r'\boxed{(\d+)}|\b(\d+)\b', + "aime2026": r'\boxed{(\d+)}|\b(\d+)\b', "gsm8k": r'\b(\d+)\b', } @@ -58,6 +59,11 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f "-123", "999" ], + "aime2026": [ + "42", + "-123", + "999" + ], "gsm8k": [ "42", "-123", @@ -81,6 +87,12 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f {question} +Remember to put your answer inside \\boxed{{}}. +""", + "aime2026": """Solve the following math problem step by step. Put your answer inside \\boxed{{}}. + +{question} + Remember to put your answer inside \\boxed{{}}. """, "gsm8k": """{question} @@ -166,6 +178,8 @@ def load_dataset(self, seed: int = 1234): self.dataset = AimeDataset() elif self.dataset_type == "aime2025": self.dataset = Aime2025Dataset() + elif self.dataset_type == "aime2026": + self.dataset = Aime2026Dataset() elif self.dataset_type == "gsm8k": self.dataset = Gsm8kDataset() elif self.dataset_type == "gpqa": @@ -679,6 +693,47 @@ def get_prompt(self, question: Dict) -> str: question=self.get_question_text(question), ) +class Aime2026Dataset(BaseDataset): + def __init__(self): + self.questions = [] + self._load_dataset() + + def _load_dataset(self): + print(f"Loading AIME2026 dataset...") + from datasets import load_dataset + + cache_path = cache_dir / "MathArena___aime_2026" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = load_dataset("MathArena/aime_2026", "default", split="train", cache_dir=str(cache_path)) + else: + ds = load_dataset("MathArena/aime_2026", "default", split="train") + + self.questions = [] + for row in ds: + question = dict(row) + question["dataset_type"] = "aime2026" + self.questions.append(question) + + print(f"AIME2026 dataset loaded: {len(self.questions)} questions") + + def get_question(self, index: int) -> Dict: + """Get question by index""" + return self.questions[index] + + def get_question_text(self, question: Dict) -> str: + """Get question string""" + return question["problem"] + + def get_answer(self, question: Dict) -> str: + return str(question["answer"]) + + def get_prompt(self, question: Dict) -> str: + """Get formatted prompt for the question""" + return TEMPLATE_REGISTRY["aime2026"].format( + question=self.get_question_text(question), + ) + class Gsm8kDataset(BaseDataset): def __init__(self, split: str = "test"): self.split = split @@ -1188,7 +1243,7 @@ def main(): "--dataset", type=str, default="aime", - choices=["aime", "aime2025", "gsm8k", "gpqa"], + choices=["aime", "aime2025", "aime2026", "gsm8k", "gpqa"], help="Dataset type (default: aime)" ) parser.add_argument( From 769cc93a43b51bf6013986180c73ee60cf24cede Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 15 May 2026 13:13:16 +0200 Subject: [PATCH 005/309] ci : fix transform of top . entry in release archive (#23080) * fix transform of top . entry in release archive * simplify --- .github/workflows/release.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8b01798ae78..218d24296be 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -104,7 +104,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz -s ",^.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -182,7 +182,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,^.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -259,7 +259,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz --transform "s,^.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -337,7 +337,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz --transform "s,^.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -426,7 +426,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/ReleaseOV/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/ReleaseOV/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^.,llama-${{ steps.tag.outputs.name }}," -C ./build/ReleaseOV/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -867,7 +867,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -979,7 +979,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz --transform "s,^.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -1240,7 +1240,7 @@ jobs: - name: Pack artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,^.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 From cc7200bf12eac4f5c9ec5377c16ae75b332f8e0c Mon Sep 17 00:00:00 2001 From: "Piotr Wilkin (ilintar)" Date: Fri, 15 May 2026 15:18:12 +0200 Subject: [PATCH 006/309] Refactor: convert_hf_to_gguf.py (#17114) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * move conversion code to a dedicated conversion directory and split the files akin to the src/models architecture --------- Co-authored-by: Sigbjørn Skjæret --- conversion/__init__.py | 333 + conversion/afmoe.py | 79 + conversion/arctic.py | 162 + conversion/baichuan.py | 59 + conversion/bailingmoe.py | 216 + conversion/base.py | 2468 ++++++ conversion/bert.py | 616 ++ conversion/bitnet.py | 49 + conversion/bloom.py | 67 + conversion/chameleon.py | 58 + conversion/chatglm.py | 167 + conversion/codeshell.py | 21 + conversion/cogvlm.py | 33 + conversion/command_r.py | 57 + conversion/dbrx.py | 75 + conversion/deci.py | 184 + conversion/deepseek.py | 388 + conversion/dots1.py | 32 + conversion/dotsocr.py | 48 + conversion/dream.py | 72 + conversion/ernie.py | 200 + conversion/exaone.py | 210 + conversion/falcon.py | 58 + conversion/falcon_h1.py | 118 + conversion/gemma.py | 840 ++ conversion/glm.py | 259 + conversion/gpt2.py | 78 + conversion/gpt_oss.py | 130 + conversion/gptneox.py | 63 + conversion/granite.py | 328 + conversion/grok.py | 116 + conversion/grovemoe.py | 108 + conversion/hunyuan.py | 407 + conversion/internlm.py | 232 + conversion/internvl.py | 98 + conversion/jais.py | 104 + conversion/jamba.py | 119 + conversion/januspro.py | 116 + conversion/kimi_linear.py | 223 + conversion/kimivl.py | 154 + conversion/lfm2.py | 256 + conversion/lighton_ocr.py | 29 + conversion/llada.py | 172 + conversion/llama.py | 312 + conversion/llama4.py | 38 + conversion/llava.py | 129 + conversion/maincoder.py | 14 + conversion/mamba.py | 199 + conversion/mimo.py | 295 + conversion/minicpm.py | 184 + conversion/minimax.py | 54 + conversion/mistral.py | 201 + conversion/mistral3.py | 67 + conversion/mpt.py | 49 + conversion/nemotron.py | 384 + conversion/olmo.py | 120 + conversion/openelm.py | 83 + conversion/orion.py | 37 + conversion/pangu.py | 46 + conversion/phi.py | 390 + conversion/pixtral.py | 41 + conversion/plamo.py | 195 + conversion/plm.py | 23 + conversion/qwen.py | 544 ++ conversion/qwen3vl.py | 357 + conversion/qwenvl.py | 200 + conversion/refact.py | 68 + conversion/rwkv.py | 302 + conversion/sarashina2.py | 32 + conversion/smallthinker.py | 82 + conversion/smolvlm.py | 47 + conversion/stablelm.py | 98 + conversion/starcoder.py | 23 + conversion/step3.py | 231 + conversion/t5.py | 286 + conversion/ultravox.py | 203 + conversion/wavtokenizer.py | 45 + conversion/xverse.py | 90 + conversion/youtuvl.py | 64 + convert_hf_to_gguf.py | 14054 +-------------------------------- convert_hf_to_gguf_update.py | 4 +- convert_lora_to_gguf.py | 9 +- 82 files changed, 15179 insertions(+), 14023 deletions(-) create mode 100644 conversion/__init__.py create mode 100644 conversion/afmoe.py create mode 100644 conversion/arctic.py create mode 100644 conversion/baichuan.py create mode 100644 conversion/bailingmoe.py create mode 100644 conversion/base.py create mode 100644 conversion/bert.py create mode 100644 conversion/bitnet.py create mode 100644 conversion/bloom.py create mode 100644 conversion/chameleon.py create mode 100644 conversion/chatglm.py create mode 100644 conversion/codeshell.py create mode 100644 conversion/cogvlm.py create mode 100644 conversion/command_r.py create mode 100644 conversion/dbrx.py create mode 100644 conversion/deci.py create mode 100644 conversion/deepseek.py create mode 100644 conversion/dots1.py create mode 100644 conversion/dotsocr.py create mode 100644 conversion/dream.py create mode 100644 conversion/ernie.py create mode 100644 conversion/exaone.py create mode 100644 conversion/falcon.py create mode 100644 conversion/falcon_h1.py create mode 100644 conversion/gemma.py create mode 100644 conversion/glm.py create mode 100644 conversion/gpt2.py create mode 100644 conversion/gpt_oss.py create mode 100644 conversion/gptneox.py create mode 100644 conversion/granite.py create mode 100644 conversion/grok.py create mode 100644 conversion/grovemoe.py create mode 100644 conversion/hunyuan.py create mode 100644 conversion/internlm.py create mode 100644 conversion/internvl.py create mode 100644 conversion/jais.py create mode 100644 conversion/jamba.py create mode 100644 conversion/januspro.py create mode 100644 conversion/kimi_linear.py create mode 100644 conversion/kimivl.py create mode 100644 conversion/lfm2.py create mode 100644 conversion/lighton_ocr.py create mode 100644 conversion/llada.py create mode 100644 conversion/llama.py create mode 100644 conversion/llama4.py create mode 100644 conversion/llava.py create mode 100644 conversion/maincoder.py create mode 100644 conversion/mamba.py create mode 100644 conversion/mimo.py create mode 100644 conversion/minicpm.py create mode 100644 conversion/minimax.py create mode 100644 conversion/mistral.py create mode 100644 conversion/mistral3.py create mode 100644 conversion/mpt.py create mode 100644 conversion/nemotron.py create mode 100644 conversion/olmo.py create mode 100644 conversion/openelm.py create mode 100644 conversion/orion.py create mode 100644 conversion/pangu.py create mode 100644 conversion/phi.py create mode 100644 conversion/pixtral.py create mode 100644 conversion/plamo.py create mode 100644 conversion/plm.py create mode 100644 conversion/qwen.py create mode 100644 conversion/qwen3vl.py create mode 100644 conversion/qwenvl.py create mode 100644 conversion/refact.py create mode 100644 conversion/rwkv.py create mode 100644 conversion/sarashina2.py create mode 100644 conversion/smallthinker.py create mode 100644 conversion/smolvlm.py create mode 100644 conversion/stablelm.py create mode 100644 conversion/starcoder.py create mode 100644 conversion/step3.py create mode 100644 conversion/t5.py create mode 100644 conversion/ultravox.py create mode 100644 conversion/wavtokenizer.py create mode 100644 conversion/xverse.py create mode 100644 conversion/youtuvl.py diff --git a/conversion/__init__.py b/conversion/__init__.py new file mode 100644 index 00000000000..2c38123dff8 --- /dev/null +++ b/conversion/__init__.py @@ -0,0 +1,333 @@ +from __future__ import annotations + +from .base import ( + ModelBase, TextModel, MmprojModel, ModelType, SentencePieceTokenTypes, + logger, _mistral_common_installed, _mistral_import_error_msg, + get_model_architecture, LazyTorchTensor, +) +from typing import Type + + +__all__ = [ + "ModelBase", "TextModel", "MmprojModel", "ModelType", "SentencePieceTokenTypes", + "get_model_architecture", "LazyTorchTensor", "logger", + "_mistral_common_installed", "_mistral_import_error_msg", + "get_model_class", "print_registered_models", "load_all_models", +] + + +TEXT_MODEL_MAP: dict[str, str] = { + "AfmoeForCausalLM": "afmoe", + "ApertusForCausalLM": "llama", + "ArceeForCausalLM": "llama", + "ArcticForCausalLM": "arctic", + "AudioFlamingo3ForConditionalGeneration": "qwen", + "BaiChuanForCausalLM": "baichuan", + "BaichuanForCausalLM": "baichuan", + "BailingMoeForCausalLM": "bailingmoe", + "BailingMoeV2ForCausalLM": "bailingmoe", + "BambaForCausalLM": "granite", + "BertForMaskedLM": "bert", + "BertForSequenceClassification": "bert", + "BertModel": "bert", + "BitnetForCausalLM": "bitnet", + "BloomForCausalLM": "bloom", + "BloomModel": "bloom", + "CamembertModel": "bert", + "ChameleonForCausalLM": "chameleon", + "ChameleonForConditionalGeneration": "chameleon", + "ChatGLMForConditionalGeneration": "chatglm", + "ChatGLMModel": "chatglm", + "CodeShellForCausalLM": "codeshell", + "CogVLMForCausalLM": "cogvlm", + "Cohere2ForCausalLM": "command_r", + "CohereForCausalLM": "command_r", + "DbrxForCausalLM": "dbrx", + "DeciLMForCausalLM": "deci", + "DeepseekForCausalLM": "deepseek", + "DeepseekV2ForCausalLM": "deepseek", + "DeepseekV3ForCausalLM": "deepseek", + "DistilBertForMaskedLM": "bert", + "DistilBertForSequenceClassification": "bert", + "DistilBertModel": "bert", + "Dots1ForCausalLM": "dots1", + "DotsOCRForCausalLM": "qwen", + "DreamModel": "dream", + "Ernie4_5ForCausalLM": "ernie", + "Ernie4_5_ForCausalLM": "ernie", + "Ernie4_5_MoeForCausalLM": "ernie", + "EuroBertModel": "bert", + "Exaone4ForCausalLM": "exaone", + "ExaoneForCausalLM": "exaone", + "ExaoneMoEForCausalLM": "exaone", + "FalconForCausalLM": "falcon", + "FalconH1ForCausalLM": "falcon_h1", + "FalconMambaForCausalLM": "mamba", + "GPT2LMHeadModel": "gpt2", + "GPTBigCodeForCausalLM": "starcoder", + "GPTNeoXForCausalLM": "gptneox", + "GPTRefactForCausalLM": "refact", + "Gemma2ForCausalLM": "gemma", + "Gemma3ForCausalLM": "gemma", + "Gemma3ForConditionalGeneration": "gemma", + "Gemma3TextModel": "gemma", + "Gemma3nForCausalLM": "gemma", + "Gemma3nForConditionalGeneration": "gemma", + "Gemma4ForConditionalGeneration": "gemma", + "GemmaForCausalLM": "gemma", + "Glm4ForCausalLM": "glm", + "Glm4MoeForCausalLM": "glm", + "Glm4MoeLiteForCausalLM": "glm", + "Glm4vForConditionalGeneration": "glm", + "Glm4vMoeForConditionalGeneration": "glm", + "GlmForCausalLM": "chatglm", + "GlmMoeDsaForCausalLM": "glm", + "GlmOcrForConditionalGeneration": "glm", + "GptOssForCausalLM": "gpt_oss", + "GraniteForCausalLM": "granite", + "GraniteMoeForCausalLM": "granite", + "GraniteMoeHybridForCausalLM": "granite", + "GraniteMoeSharedForCausalLM": "granite", + "GraniteSpeechForConditionalGeneration": "granite", + "Grok1ForCausalLM": "grok", + "GrokForCausalLM": "grok", + "GroveMoeForCausalLM": "grovemoe", + "HunYuanDenseV1ForCausalLM": "hunyuan", + "HunYuanMoEV1ForCausalLM": "hunyuan", + "HunYuanVLForConditionalGeneration": "hunyuan", + "IQuestCoderForCausalLM": "llama", + "InternLM2ForCausalLM": "internlm", + "InternLM3ForCausalLM": "internlm", + "JAISLMHeadModel": "jais", + "Jais2ForCausalLM": "jais", + "JambaForCausalLM": "jamba", + "JanusForConditionalGeneration": "januspro", + "JinaBertForMaskedLM": "bert", + "JinaBertModel": "bert", + "JinaEmbeddingsV5Model": "bert", + "KORMoForCausalLM": "qwen", + "KimiK25ForConditionalGeneration": "deepseek", + "KimiLinearForCausalLM": "kimi_linear", + "KimiLinearModel": "kimi_linear", + "KimiVLForConditionalGeneration": "deepseek", + "LFM2ForCausalLM": "lfm2", + "LLaDAMoEModel": "llada", + "LLaDAMoEModelLM": "llada", + "LLaDAModelLM": "llada", + "LLaMAForCausalLM": "llama", + "Lfm25AudioTokenizer": "lfm2", + "Lfm2ForCausalLM": "lfm2", + "Lfm2Model": "lfm2", + "Lfm2MoeForCausalLM": "lfm2", + "Llama4ForCausalLM": "llama", + "Llama4ForConditionalGeneration": "llama", + "LlamaBidirectionalModel": "llama", + "LlamaForCausalLM": "llama", + "LlamaModel": "llama", + "LlavaForConditionalGeneration": "llama", + "LlavaStableLMEpochForCausalLM": "stablelm", + "MPTForCausalLM": "mpt", + "MT5ForConditionalGeneration": "t5", + "MaincoderForCausalLM": "maincoder", + "Mamba2ForCausalLM": "mamba", + "MambaForCausalLM": "mamba", + "MambaLMHeadModel": "mamba", + "MiMoV2FlashForCausalLM": "mimo", + "MiMoV2ForCausalLM": "mimo", + "MiniCPM3ForCausalLM": "minicpm", + "MiniCPMForCausalLM": "minicpm", + "MiniCPMV4_6ForConditionalGeneration": "minicpm", + "MiniMaxM2ForCausalLM": "minimax", + "Ministral3ForCausalLM": "mistral3", + "Mistral3ForConditionalGeneration": "mistral3", + "MistralForCausalLM": "llama", + "MixtralForCausalLM": "llama", + "ModernBertForMaskedLM": "bert", + "ModernBertForSequenceClassification": "bert", + "ModernBertModel": "bert", + "NemotronForCausalLM": "nemotron", + "NemotronHForCausalLM": "nemotron", + "NeoBERT": "bert", + "NeoBERTForSequenceClassification": "bert", + "NeoBERTLMHead": "bert", + "NomicBertModel": "bert", + "OLMoForCausalLM": "olmo", + "Olmo2ForCausalLM": "olmo", + "Olmo3ForCausalLM": "olmo", + "OlmoForCausalLM": "olmo", + "OlmoeForCausalLM": "olmo", + "OpenELMForCausalLM": "openelm", + "OrionForCausalLM": "orion", + "PLMForCausalLM": "plm", + "PLaMo2ForCausalLM": "plamo", + "PLaMo3ForCausalLM": "plamo", + "PaddleOCRVLForConditionalGeneration": "ernie", + "PanguEmbeddedForCausalLM": "pangu", + "Phi3ForCausalLM": "phi", + "Phi4ForCausalLMV": "phi", + "PhiForCausalLM": "phi", + "PhiMoEForCausalLM": "phi", + "Plamo2ForCausalLM": "plamo", + "Plamo3ForCausalLM": "plamo", + "PlamoForCausalLM": "plamo", + "QWenLMHeadModel": "qwen", + "Qwen2AudioForConditionalGeneration": "qwen", + "Qwen2ForCausalLM": "qwen", + "Qwen2Model": "qwen", + "Qwen2MoeForCausalLM": "qwen", + "Qwen2VLForConditionalGeneration": "qwenvl", + "Qwen2VLModel": "qwenvl", + "Qwen2_5OmniModel": "qwenvl", + "Qwen2_5_VLForConditionalGeneration": "qwenvl", + "Qwen3ASRForConditionalGeneration": "qwen3vl", + "Qwen3ForCausalLM": "qwen", + "Qwen3Model": "qwen", + "Qwen3MoeForCausalLM": "qwen", + "Qwen3NextForCausalLM": "qwen", + "Qwen3OmniMoeForConditionalGeneration": "qwen3vl", + "Qwen3VLForConditionalGeneration": "qwen3vl", + "Qwen3VLMoeForConditionalGeneration": "qwen3vl", + "Qwen3_5ForCausalLM": "qwen", + "Qwen3_5ForConditionalGeneration": "qwen", + "Qwen3_5MoeForCausalLM": "qwen", + "Qwen3_5MoeForConditionalGeneration": "qwen", + "RND1": "qwen", + "RWForCausalLM": "falcon", + "RWKV6Qwen2ForCausalLM": "rwkv", + "RWKV7ForCausalLM": "rwkv", + "RobertaForSequenceClassification": "bert", + "RobertaModel": "bert", + "RuGPT3XLForCausalLM": "gpt2", + "Rwkv6ForCausalLM": "rwkv", + "Rwkv7ForCausalLM": "rwkv", + "RwkvHybridForCausalLM": "rwkv", + "Sarashina2VisionForCausalLM": "sarashina2", + "SarvamMoEForCausalLM": "bailingmoe", + "SeedOssForCausalLM": "olmo", + "SmallThinkerForCausalLM": "smallthinker", + "SmolLM3ForCausalLM": "llama", + "SolarOpenForCausalLM": "glm", + "StableLMEpochForCausalLM": "stablelm", + "StableLmForCausalLM": "stablelm", + "Starcoder2ForCausalLM": "starcoder", + "Step3p5ForCausalLM": "step3", + "StepVLForConditionalGeneration": "step3", + "T5EncoderModel": "t5", + "T5ForConditionalGeneration": "t5", + "T5WithLMHeadModel": "t5", + "UMT5ForConditionalGeneration": "t5", + "UMT5Model": "t5", + "UltravoxModel": "ultravox", + "VLlama3ForCausalLM": "llama", + "VoxtralForConditionalGeneration": "llama", + "WavTokenizerDec": "wavtokenizer", + "XLMRobertaForSequenceClassification": "bert", + "XLMRobertaModel": "bert", + "XverseForCausalLM": "xverse", + "YoutuForCausalLM": "deepseek", + "YoutuVLForConditionalGeneration": "deepseek", + "modeling_grove_moe.GroveMoeForCausalLM": "grovemoe", + "modeling_sarvam_moe.SarvamMoEForCausalLM": "bailingmoe", +} + + +MMPROJ_MODEL_MAP: dict[str, str] = { + "AudioFlamingo3ForConditionalGeneration": "ultravox", + "CogVLMForCausalLM": "cogvlm", + "DeepseekOCRForCausalLM": "deepseek", + "DotsOCRForCausalLM": "dotsocr", + "Gemma3ForConditionalGeneration": "gemma", + "Gemma3nForConditionalGeneration": "gemma", + "Gemma4ForConditionalGeneration": "gemma", + "Glm4vForConditionalGeneration": "qwen3vl", + "Glm4vMoeForConditionalGeneration": "qwen3vl", + "GlmOcrForConditionalGeneration": "qwen3vl", + "GlmasrModel": "ultravox", + "GraniteSpeechForConditionalGeneration": "granite", + "HunYuanVLForConditionalGeneration": "hunyuan", + "Idefics3ForConditionalGeneration": "smolvlm", + "InternVisionModel": "internvl", + "JanusForConditionalGeneration": "januspro", + "KimiK25ForConditionalGeneration": "kimivl", + "KimiVLForConditionalGeneration": "kimivl", + "Lfm2AudioForConditionalGeneration": "lfm2", + "Lfm2VlForConditionalGeneration": "lfm2", + "LightOnOCRForConditionalGeneration": "lighton_ocr", + "Llama4ForConditionalGeneration": "llama4", + "LlavaForConditionalGeneration": "llava", + "MERaLiON2ForConditionalGeneration": "ultravox", + "MiMoV2ForCausalLM": "mimo", + "MiniCPMV4_6ForConditionalGeneration": "minicpm", + "Mistral3ForConditionalGeneration": "llava", + "NemotronH_Nano_VL_V2": "nemotron", + "PaddleOCRVisionModel": "ernie", + "Phi4ForCausalLMV": "phi", + "Qwen2AudioForConditionalGeneration": "ultravox", + "Qwen2VLForConditionalGeneration": "qwenvl", + "Qwen2VLModel": "qwenvl", + "Qwen2_5OmniModel": "qwenvl", + "Qwen2_5_VLForConditionalGeneration": "qwenvl", + "Qwen3ASRForConditionalGeneration": "qwen3vl", + "Qwen3OmniMoeForConditionalGeneration": "qwen3vl", + "Qwen3VLForConditionalGeneration": "qwen3vl", + "Qwen3VLMoeForConditionalGeneration": "qwen3vl", + "Qwen3_5ForConditionalGeneration": "qwen3vl", + "Qwen3_5MoeForConditionalGeneration": "qwen3vl", + "RADIOModel": "nemotron", + "Sarashina2VisionForCausalLM": "sarashina2", + "SmolVLMForConditionalGeneration": "smolvlm", + "StepVLForConditionalGeneration": "step3", + "UltravoxModel": "ultravox", + "VoxtralForConditionalGeneration": "ultravox", + "YoutuVLForConditionalGeneration": "youtuvl", +} + + +_TEXT_MODEL_MODULES = sorted(set(TEXT_MODEL_MAP.values())) +_MMPROJ_MODEL_MODULES = sorted(set(MMPROJ_MODEL_MAP.values())) + + +_loaded_text_modules: set[str] = set() +_loaded_mmproj_modules: set[str] = set() + + +def load_all_models() -> None: + """Import all model modules to trigger @ModelBase.register() decorators.""" + if len(_loaded_text_modules) != len(_TEXT_MODEL_MODULES): + for module_name in _TEXT_MODEL_MODULES: + if module_name not in _loaded_text_modules: + try: + __import__(f"conversion.{module_name}") + _loaded_text_modules.add(module_name) + except Exception as e: + logger.warning(f"Failed to load model module {module_name}: {e}") + + if len(_loaded_mmproj_modules) != len(_MMPROJ_MODEL_MODULES): + for module_name in _MMPROJ_MODEL_MODULES: + if module_name not in _loaded_mmproj_modules: + try: + __import__(f"conversion.{module_name}") + _loaded_mmproj_modules.add(module_name) + except Exception as e: + logger.warning(f"Failed to load model module {module_name}: {e}") + + +def get_model_class(name: str, mmproj: bool = False) -> Type[ModelBase]: + """Dynamically import and return a model class by its HuggingFace architecture name.""" + relevant_map = MMPROJ_MODEL_MAP if mmproj else TEXT_MODEL_MAP + if name not in relevant_map: + raise NotImplementedError(f"Architecture {name!r} not supported!") + module_name = relevant_map[name] + __import__(f"conversion.{module_name}") + model_type = ModelType.MMPROJ if mmproj else ModelType.TEXT + return ModelBase._model_classes[model_type][name] + + +def print_registered_models() -> None: + load_all_models() + logger.error("TEXT models:") + for name in sorted(TEXT_MODEL_MAP.keys()): + logger.error(f" - {name}") + logger.error("MMPROJ models:") + for name in sorted(MMPROJ_MODEL_MAP.keys()): + logger.error(f" - {name}") diff --git a/conversion/afmoe.py b/conversion/afmoe.py new file mode 100644 index 00000000000..5e66a51da61 --- /dev/null +++ b/conversion/afmoe.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, gguf + +from .llama import LlamaModel + + +@ModelBase.register("AfmoeForCausalLM") +class AfmoeModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.AFMOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # MoE parameters + if (n_shared_experts := self.hparams.get("num_shared_experts")) is not None: + self.gguf_writer.add_expert_shared_count(n_shared_experts) + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + if (n_dense_layers := self.hparams.get("num_dense_layers")) is not None: + self.gguf_writer.add_leading_dense_block_count(n_dense_layers) + + # Route normalization and scaling + if (route_norm := self.hparams.get("route_norm")) is not None: + self.gguf_writer.add_expert_weights_norm(route_norm) + if (route_scale := self.hparams.get("route_scale")) is not None: + self.gguf_writer.add_expert_weights_scale(route_scale) + + # Sliding window attention + if (sliding_window := self.hparams.get("sliding_window")) is not None: + self.gguf_writer.add_sliding_window(sliding_window) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.endswith(".expert_bias"): + name = name.replace(".expert_bias", ".expert_bias.bias") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Handle expert weights - they're already merged in the HF format + # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["gate_proj", "up_proj", "down_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename_to_retrieve]) + del self._experts[bid][ename_to_retrieve] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + yield from ModelBase.modify_tensors(self, data_torch, merged_name, bid) + + return + else: + return + + yield from ModelBase.modify_tensors(self, data_torch, name, bid) diff --git a/conversion/arctic.py b/conversion/arctic.py new file mode 100644 index 00000000000..775cacaab9f --- /dev/null +++ b/conversion/arctic.py @@ -0,0 +1,162 @@ +from __future__ import annotations + +import json +import sys + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, SentencePieceTokenTypes, TextModel, gguf, logger + +from .llama import LlamaModel + + +@ModelBase.register("ArcticForCausalLM") +class ArcticModel(TextModel): + model_arch = gguf.MODEL_ARCH.ARCTIC + + def set_vocab(self): + # The reason for using a custom implementation here is that the + # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from + # tokenizer.model and used them as BOS and EOS instead of adding new tokens. + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + if not tokenizer_path.is_file(): + logger.error(f'Error: Missing {tokenizer_path}') + sys.exit(1) + + # Read the whole vocabulary from the tokenizer.model file + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + # Use the added_tokens_decoder field from tokeniser_config.json as the source + # of information about added/redefined tokens and modify them accordingly. + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + + if "added_tokens_decoder" in tokenizer_config_json: + added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"] + for token_id, token_json in added_tokens_decoder.items(): + token_id = int(token_id) + if token_id >= vocab_size: + logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + token_content = token_json["content"] + token_type = SentencePieceTokenTypes.USER_DEFINED + token_score = -10000.0 + + # Map unk_token to UNKNOWN, other special tokens to CONTROL + # Set the score to 0.0 as in the original tokenizer.model + if ("special" in token_json) and token_json["special"]: + if token_content == tokenizer_config_json["unk_token"]: + token_type = SentencePieceTokenTypes.UNKNOWN + else: + token_type = SentencePieceTokenTypes.CONTROL + token_score = 0.0 + + logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})") + tokens[token_id] = token_content.encode("utf-8") + toktypes[token_id] = token_type + scores[token_id] = token_score + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith("q_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith("k_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for wid in ["w1", "w2", "w3"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/baichuan.py b/conversion/baichuan.py new file mode 100644 index 00000000000..4cf34057cd9 --- /dev/null +++ b/conversion/baichuan.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("BaichuanForCausalLM", "BaiChuanForCausalLM") +class BaichuanModel(TextModel): + model_arch = gguf.MODEL_ARCH.BAICHUAN + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight": + logger.info(f"Unpacking and permuting layer {bid}") + yield from [ + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), + self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), + self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), + self._reverse_hf_part(data_torch, 2)), + ] + else: + yield from self.modify_tensors(data_torch, self.map_tensor_name(name), bid) + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def _reverse_hf_permute_part( + self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, + ) -> Tensor: + r = weights.shape[0] // 3 + return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) + + def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: + r = weights.shape[0] // 3 + return weights[r * n_part:r * n_part + r, ...] diff --git a/conversion/bailingmoe.py b/conversion/bailingmoe.py new file mode 100644 index 00000000000..319ff6dabee --- /dev/null +++ b/conversion/bailingmoe.py @@ -0,0 +1,216 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("BailingMoeForCausalLM") +class BailingMoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.BAILINGMOE + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + + self.gguf_writer.add_rope_dimension_count(rope_dim) + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_weights_scale(1.0) + self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"]) + self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) + + _experts: list[dict[str, Tensor]] | None = None + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + n_embd = self.hparams["hidden_size"] + if (head_dim := self.hparams.get("head_dim")) is None: + head_dim = n_embd // n_head + + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + + if name.endswith("attention.dense.weight"): + yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), bid) + return + elif name.endswith("query_key_value.weight"): + q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2) + + yield from super().modify_tensors(BailingMoeModel.permute(q, n_head, n_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) + yield from super().modify_tensors(BailingMoeModel.permute(k, n_head, n_kv_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) + yield from super().modify_tensors(v,self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) + return + elif name.find("mlp.experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + yield from super().modify_tensors(data_torch, new_name, bid) + + return + + new_name = self.map_tensor_name(name) + + if new_name == output_name and self.hparams.get("norm_head"): + data_torch = data_torch.float() + data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7 + + yield from super().modify_tensors(data_torch, new_name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("BailingMoeV2ForCausalLM") +class BailingMoeV2Model(TextModel): + model_arch = gguf.MODEL_ARCH.BAILINGMOE2 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if nextn_layers := self.hparams.get("num_nextn_predict_layers", 0): + self.block_count = self.hparams["num_hidden_layers"] + nextn_layers + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_shared_feed_forward_length(hparams.get("moe_shared_expert_intermediate_size", hparams["moe_intermediate_size"] * hparams["num_shared_experts"])) + self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"]) + self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) + + if (nextn_layers := self.hparams.get("num_nextn_predict_layers")) is not None: + self.gguf_writer.add_nextn_predict_layers(nextn_layers) + + _experts: list[dict[str, Tensor]] | None = None + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.endswith(".expert_bias"): + name = name.replace(".expert_bias", ".expert_bias.bias") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "mlp.experts" in name: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("SarvamMoEForCausalLM", "modeling_sarvam_moe.SarvamMoEForCausalLM") +class SarvamMoEModel(BailingMoeV2Model): + model_arch = gguf.MODEL_ARCH.BAILINGMOE2 + # Sarvam-MoE shares the BailingMoeV2 architecture; only differences: + # - full rotary (no partial_rotary_factor) + # - expert bias is zero-mean normalized at load time + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + # Override the partial-rotary value written by BailingMoeV2 with the full rotary dim + self.gguf_writer.add_rope_dimension_count(rope_dim) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + if name.endswith(".expert_bias"): + # Sarvam normalizes expert bias to zero mean + inner = gen + + def gen(): + t = inner() + return t - t.mean() + return super().filter_tensors((name, gen)) diff --git a/conversion/base.py b/conversion/base.py new file mode 100644 index 00000000000..d89d32fe150 --- /dev/null +++ b/conversion/base.py @@ -0,0 +1,2468 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import ast +import logging +import contextlib +import json +import os +import re +import sys +from enum import IntEnum +from pathlib import Path +from hashlib import sha256 +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast +from itertools import chain +from transformers import AutoConfig + +import numpy as np +import torch + +if TYPE_CHECKING: + from torch import Tensor + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py')) +import gguf +from gguf.vocab import MistralTokenizerType, MistralVocab + +try: + from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found, ty:unresolved-import] + from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found, ty:unresolved-import] + from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import] + from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found, ty:unresolved-import] + SentencePieceTokenizer, + ) + + _mistral_common_installed = True + _mistral_import_error_msg = "" +except ImportError: + _MISTRAL_COMMON_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) + _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) + + _mistral_common_installed = False + TokenizerVersion: Any = None + Tekkenizer: Any = None + SentencePieceTokenizer: Any = None + _mistral_import_error_msg = ( + "Mistral format requires `mistral-common` to be installed. Please run " + "`pip install mistral-common[image,audio]` to install it." + ) + + +logger = logging.getLogger("hf-to-gguf") + + +AnyModel = TypeVar("AnyModel", bound="type[ModelBase]") + + +class SentencePieceTokenTypes(IntEnum): + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 + USER_DEFINED = 4 + UNUSED = 5 + BYTE = 6 + + +class ModelType(IntEnum): + TEXT = 1 + MMPROJ = 2 + + +class ModelBase: + _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = { + ModelType.TEXT: {}, + ModelType.MMPROJ: {}, + } + + dir_model: Path + ftype: gguf.LlamaFileType + fname_out: Path + is_big_endian: bool + endianess: gguf.GGUFEndian + use_temp_file: bool + lazy: bool + dry_run: bool + hparams: dict[str, Any] + model_tensors: dict[str, Callable[[], Tensor]] + gguf_writer: gguf.GGUFWriter + model_name: str | None + metadata_override: Path | None + dir_model_card: Path + remote_hf_model_id: str | None + + # subclasses should define this! + model_arch: gguf.MODEL_ARCH + + # subclasses should initialize this! + block_count: int + tensor_map: gguf.TensorNameMap + + # Mistral format specifics + is_mistral_format: bool = False + disable_mistral_community_chat_template: bool = False + sentence_transformers_dense_modules: bool = False + + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False, + use_temp_file: bool = False, eager: bool = False, + metadata_override: Path | None = None, model_name: str | None = None, + split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, + small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, + disable_mistral_community_chat_template: bool = False, + sentence_transformers_dense_modules: bool = False, + fuse_gate_up_exps: bool = False): + if type(self) is ModelBase or \ + type(self) is TextModel or \ + type(self) is MmprojModel: + raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") + + if self.is_mistral_format and not _mistral_common_installed: + raise ImportError(_mistral_import_error_msg) + + self.dir_model = dir_model + self.ftype = ftype + self.fname_out = fname_out + self.is_big_endian = is_big_endian + self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + self.use_temp_file = use_temp_file + self.lazy = not eager or (remote_hf_model_id is not None) + self.dry_run = dry_run + self.remote_hf_model_id = remote_hf_model_id + self.sentence_transformers_dense_modules = sentence_transformers_dense_modules + self.fuse_gate_up_exps = fuse_gate_up_exps + self._gate_exp_buffer: dict[int, Tensor] = {} + self._up_exp_buffer: dict[int, Tensor] = {} + self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams + self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id) + self.metadata_override = metadata_override + self.model_name = model_name + self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py + self._is_nvfp4 = False + self._is_mxfp4 = False + + # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype + # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. + if self.ftype == gguf.LlamaFileType.GUESSED: + for _, tensor in self.get_tensors(): + if tensor.dim() < 2: + continue + + if tensor.dtype == torch.bfloat16: + self.ftype = gguf.LlamaFileType.MOSTLY_BF16 + logger.info("heuristics detected bfloat16 tensor dtype, setting --outtype bf16") + break + elif tensor.dtype == torch.float16: + self.ftype = gguf.LlamaFileType.MOSTLY_F16 + logger.info("heuristics detected float16 tensor dtype, setting --outtype f16") + break + else: + self.ftype = gguf.LlamaFileType.MOSTLY_F16 + logger.info("heuristics unable to detect tensor dtype, defaulting to --outtype f16") + + # Configure GGUF Writer + self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, + split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) + + # Mistral specific + self.disable_mistral_community_chat_template = disable_mistral_community_chat_template + + @classmethod + def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path: + stem, suffix = path.stem, path.suffix + new_name = f"{prefix}{stem}{suffix}" + return path.with_name(new_name) + + def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: + key = next((k for k in keys if k in self.hparams), None) + if key is not None: + return self.hparams[key] + if optional: + return None + raise KeyError(f"could not find any of: {keys}") + + def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]: + tensors: dict[str, Callable[[], Tensor]] = {} + + if remote_hf_model_id is not None: + is_safetensors = True + + logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}") + remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id) + for name, remote_tensor in remote_tensors.items(): + data_gen = lambda r=remote_tensor: LazyTorchTensor.from_remote_tensor(r) # noqa: E731 + if titem := self.filter_tensors((name, data_gen)): + tname, tgen = titem + tensors[tname] = tgen + + return tensors + + prefix = "model" if not self.is_mistral_format else "consolidated" + part_names: list[str] = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors") + is_safetensors: bool = len(part_names) > 0 + if not is_safetensors: + part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") + + tensor_names_from_index: set[str] = set() + tensor_names_from_parts: set[str] = set() + + if not self.is_mistral_format: + index_name = "model.safetensors" if is_safetensors else "pytorch_model.bin" + index_name += ".index.json" + index_file = self.dir_model / index_name + + if index_file.is_file(): + logger.info(f"gguf: loading model weight map from '{index_name}'") + with open(index_file, "r", encoding="utf-8") as f: + index: dict[str, Any] = json.load(f) + weight_map = index.get("weight_map") + if weight_map is None or not isinstance(weight_map, dict): + raise ValueError(f"Can't load 'weight_map' from {index_name!r}") + tensor_names_from_index.update(weight_map.keys()) + part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) # ty: ignore[invalid-assignment] + part_names = sorted(part_dict.keys()) + else: + weight_map = {} + else: + weight_map = {} + + for part_name in part_names: + logger.info(f"gguf: indexing model part '{part_name}'") + ctx: ContextManager[Any] + if is_safetensors: + ctx = cast(ContextManager[Any], gguf.utility.SafetensorsLocal(self.dir_model / part_name)) + else: + ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) + + with ctx as model_part: + assert model_part is not None + + for name in model_part.keys(): + tensor_names_from_parts.add(name) + if is_safetensors: + data: gguf.utility.LocalTensor = model_part[name] + if self.lazy: + data_gen = lambda data=data: LazyTorchTensor.from_local_tensor(data) # noqa: E731 + else: + dtype = LazyTorchTensor._dtype_str_map[data.dtype] + data_gen = lambda data=data, dtype=dtype: torch.from_numpy(data.mmap_bytes()).view(dtype).reshape(data.shape) # noqa: E731 + else: + data_torch: Tensor = model_part[name] + if self.lazy: + data_gen = lambda data=data_torch: LazyTorchTensor.from_eager(data) # noqa: E731 + else: + data_gen = lambda data=data_torch: data # noqa: E731 + if titem := self.filter_tensors((name, data_gen)): + tname, tgen = titem + tensors[tname] = tgen + + # verify tensor name presence and identify potentially missing files + if len(tensor_names_from_index) > 0: + if len(tensor_names_from_parts.symmetric_difference(tensor_names_from_index)) > 0: + missing = sorted(tensor_names_from_index.difference(tensor_names_from_parts)) + extra = sorted(tensor_names_from_parts.difference(tensor_names_from_index)) + missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map)) + if len(extra) == 0 and len(missing_files) > 0: + raise ValueError(f"Missing or incomplete model files: {missing_files}\n" + f"Missing tensors: {missing}") + else: + raise ValueError("Mismatch between weight map and model parts for tensor names:\n" + f"Missing tensors: {missing}\n" + f"Extra tensors: {extra}") + + return tensors + + @staticmethod + def _scale_is_trivial(scale: Tensor) -> bool: + return scale.numel() <= 1 and abs(float(scale.float().sum()) - 1.0) < 1e-6 + + def _write_scale_tensor(self, scale_name: str, scale: Tensor): + if not self._scale_is_trivial(scale): + scale_f32 = scale.float().numpy().flatten() + logger.info(f" + {scale_name} (per-tensor scale, shape [{scale_f32.size}])") + self.gguf_writer.add_tensor(scale_name, scale_f32) + + def _write_scales_tensor(self, scale_name: str, scales: list[float]): + if not np.allclose(scales, 1.0, atol=1e-6): + scale_vals = np.array(scales, dtype=np.float32) + logger.info(f" + {scale_name} (per-expert scale, shape [{len(scales)}])") + self.gguf_writer.add_tensor(scale_name, scale_vals) + + def dequant_model(self): + # If all quantized tensors were already handled (e.g. pure NVFP4), skip + if self._is_nvfp4 and not any(k.endswith((".weight_scale", ".weight_scale_inv")) for k in self.model_tensors): + return + + tensors_to_remove: list[str] = [] + new_tensors: dict[str, Callable[[], Tensor]] = {} + + if (quant_config := self.hparams.get("quantization_config")) and isinstance(quant_config, dict): + quant_method = quant_config.get("quant_method") + + def dequant_bitnet(weight: Tensor, scale: Tensor) -> Tensor: + weight = weight.view(torch.uint8) + orig_shape = weight.shape + + shift = torch.tensor([0, 2, 4, 6], dtype=torch.uint8).reshape((4, *(1 for _ in range(len(orig_shape))))) + data = weight.unsqueeze(0).expand((4, *orig_shape)) >> shift + data = data & 3 + data = (data.float() - 1).reshape((orig_shape[0] * 4, *orig_shape[1:])) + + # The scale is inverted + return data / scale.float() + + def dequant_simple(weight: Tensor, scale: Tensor, block_size: Sequence[int] | None = None) -> Tensor: + scale = scale.float() + + if block_size is not None: + dim_offset = scale.ndim - len(block_size) + for i, size in enumerate(block_size): + scale = scale.repeat_interleave(size, dim_offset + i) + # unpad the scale (e.g. when the tensor size isn't a multiple of the block size) + scale = scale[tuple(slice(0, size) for size in weight.shape)] + + # align scale dims to weight for correct broadcasting (e.g. [128] -> [128, 1, 1]) + while scale.ndim < weight.ndim: + scale = scale.unsqueeze(-1) + + return weight.float() * scale + + # ref: https://github.com/ModelCloud/GPTQModel/blob/037c5c0f6c9e33c500d975b038d02e7ca437546d/gptqmodel/nn_modules/qlinear/__init__.py#L437-L476 + def dequant_gptq(g_idx: Tensor, qweight: Tensor, qzeros: Tensor, scales: Tensor) -> Tensor: + bits = quant_config["bits"] + assert bits in (2, 3, 4, 8) + assert qweight.dtype == qzeros.dtype + maxq = (2 ** bits) - 1 + weight = None + zeros = None + pack_dtype_bits = qweight.dtype.itemsize * 8 + + if bits in [2, 4, 8]: + pack_factor = pack_dtype_bits // bits + wf = torch.tensor(list(range(0, pack_dtype_bits, bits)), dtype=torch.int32).unsqueeze(0) + if self.lazy: + wf = LazyTorchTensor.from_eager(wf) + + zeros = torch.bitwise_right_shift( + qzeros.unsqueeze(2).expand(-1, -1, pack_factor), + wf.unsqueeze(0) + ).to(torch.int16 if bits == 8 else torch.int8) + zeros = torch.bitwise_and(zeros, maxq).reshape(scales.shape) + + weight = torch.bitwise_and( + torch.bitwise_right_shift( + qweight.unsqueeze(1).expand(-1, pack_factor, -1), + wf.unsqueeze(-1) + ).to(torch.int16 if bits == 8 else torch.int8), + maxq + ) + elif bits == 3: + raise NotImplementedError("3-bit gptq dequantization is not yet implemented") + + assert weight is not None + assert zeros is not None + + weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2]) + + # gptq_v2 doesn't need to offset zeros + if quant_config.get("checkpoint_format", "gptq") == "gptq": + zeros += 1 + + return (scales[g_idx].float() * (weight - zeros[g_idx]).float()).T + + def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: Tensor | None, num_bits: int, group_size: int): + assert w.dtype == torch.int32 + shape = tuple(shape_tensor.tolist()) + assert len(shape) == 2 + mask = (1 << num_bits) - 1 + + shifts = torch.arange(0, 32 - (num_bits - 1), num_bits, dtype=torch.int32) + if self.lazy: + shifts = LazyTorchTensor.from_eager(shifts) + + if zero_point is None: + offset = 1 << (num_bits - 1) + else: + assert len(zero_point.shape) == 2 + offset = (zero_point.unsqueeze(1) >> shifts.reshape(1, -1, 1)) & mask + offset = offset.reshape(-1, zero_point.shape[1]) + # trim padding, and prepare for broadcast + # NOTE: the zero-point is packed along dim 0 + offset = offset[:shape[0], :].unsqueeze(-1) + + # extract values + # NOTE: the weights are packed along dim 1 + unpacked = (w.unsqueeze(-1) >> shifts.reshape(1, 1, -1)) & mask + unpacked = unpacked.reshape(shape[0], -1) + + # trim padding + unpacked = unpacked[:, :shape[1]] + + # prepare for broadcast of the scale + unpacked = unpacked.reshape(shape[0], (unpacked.shape[-1] + group_size - 1) // group_size, group_size) + unpacked = unpacked - offset + + return (unpacked * scale.unsqueeze(-1).float()).reshape(shape) + + if quant_method == "bitnet": + for name in self.model_tensors.keys(): + if name.endswith(".weight_scale"): + weight_name = name.removesuffix("_scale") + w = self.model_tensors[weight_name] + s = self.model_tensors[name] + self.model_tensors[weight_name] = lambda w=w, s=s: dequant_bitnet(w(), s()) + tensors_to_remove.append(name) + elif quant_method == "fp8": + block_size = quant_config.get("weight_block_size") + for name in self.model_tensors.keys(): + if name.endswith("_scale_inv"): + weight_name = name.removesuffix("_scale_inv") + w = self.model_tensors[weight_name] + s = self.model_tensors[name] + self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs) + tensors_to_remove.append(name) + if name.endswith(".activation_scale"): # unused + tensors_to_remove.append(name) + if name.endswith("_activation_scale"): # Mistral-Small-4-119B-2602, unused + tensors_to_remove.append(name) + # mistral format + if name.endswith(".qscale_weight"): + weight_name = name.removesuffix("qscale_weight") + "weight" + w = self.model_tensors[weight_name] + s = self.model_tensors[name] + self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs) + tensors_to_remove.append(name) + if name.endswith(".qscale_act"): + tensors_to_remove.append(name) + elif quant_method == "gptq": + for name in self.model_tensors.keys(): + if name.endswith(".qweight"): + base_name = name.removesuffix(".qweight") + g_idx = self.model_tensors[base_name + ".g_idx"] + qweight = self.model_tensors[base_name + ".qweight"] + qzeros = self.model_tensors[base_name + ".qzeros"] + scales = self.model_tensors[base_name + ".scales"] + new_tensors[base_name + ".weight"] = ( + lambda g=g_idx, z=qzeros, w=qweight, s=scales: dequant_gptq( + g(), w(), z(), s() + ) + ) + tensors_to_remove += [ + base_name + n + for n in ( + ".g_idx", + ".qzeros", + ".qweight", + ".scales", + ) + ] + elif quant_method == "compressed-tensors": + quant_format = quant_config["format"] + groups = quant_config["config_groups"] + if len(groups) > 1: + raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet") + weight_config = tuple(groups.values())[0]["weights"] + + if quant_format == "float-quantized" or quant_format == "int-quantized" or quant_format == "naive-quantized": + block_size = weight_config.get("block_structure", None) + strategy = weight_config.get("strategy") + assert strategy == "channel" or strategy == "block" + assert weight_config.get("group_size") is None # didn't find a model using this yet + for name in self.model_tensors.keys(): + if name.endswith(".weight_scale"): + weight_name = name.removesuffix("_scale") + w = self.model_tensors[weight_name] + s = self.model_tensors[name] + self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), block_size) + tensors_to_remove.append(name) + elif quant_format == "pack-quantized": + assert weight_config.get("strategy") == "group" + assert weight_config.get("type", "int") == "int" + num_bits = weight_config.get("num_bits") + group_size = weight_config.get("group_size") + assert isinstance(num_bits, int) + assert isinstance(group_size, int) + for name in self.model_tensors.keys(): + if name.endswith(".weight_packed"): + base_name = name.removesuffix("_packed") + w = self.model_tensors[name] + scale = self.model_tensors[base_name + "_scale"] + shape = self.model_tensors[base_name + "_shape"] + zero_point = self.model_tensors.get(base_name + "_zero_point", lambda: None) + new_tensors[base_name] = ( + lambda w=w, scale=scale, shape=shape, zero_point=zero_point: dequant_packed( + w(), scale(), shape(), zero_point(), num_bits, group_size, + ) + ) + tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")] + if (base_name + "_zero_point") in self.model_tensors: + tensors_to_remove.append(base_name + "_zero_point") + else: + raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported") + elif quant_method == "modelopt": + # Mixed-precision ModelOpt models: NVFP4 tensors are handled by + # _generate_nvfp4_tensors; FP8 tensors have 1D weight_scale and + # are dequantized here. k/v scale tensors are unused. + for name in self.model_tensors.keys(): + if name.endswith(".weight_scale"): + weight_name = name.removesuffix("_scale") + w = self.model_tensors[weight_name] + s = self.model_tensors[name] + self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None) + tensors_to_remove.append(name) + if name.endswith((".input_scale", ".k_scale", ".v_scale")): + tensors_to_remove.append(name) + elif quant_method is not None: + raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}") + + for name in tensors_to_remove: + if name in self.model_tensors: + del self.model_tensors[name] + + for name, value in new_tensors.items(): + self.model_tensors[name] = value + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.endswith("e_score_correction_bias"): + name = name.replace("e_score_correction_bias", "e_score_correction.bias") + + if "language_model." in name: + name = name.replace("language_model.", "") + + return name, gen + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for name, gen in self.model_tensors.items(): + yield name, gen() + + def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: + if key not in gguf.MODEL_TENSORS[self.model_arch]: + raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}") + name: str = gguf.TENSOR_NAMES[key] + if "{bid}" in name: + assert bid is not None + name = name.format(bid=bid) + return name + suffix + + def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool: + if key not in gguf.MODEL_TENSORS[self.model_arch]: + return False + key_name: str = gguf.TENSOR_NAMES[key] + if "{bid}" in key_name: + if bid is None: + return False + key_name = key_name.format(bid=bid) + else: + if bid is not None: + return False + return name == (key_name + suffix) + + def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: + new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) + if new_name is None: + raise ValueError(f"Can not map tensor {name!r}") + return new_name + + def set_gguf_parameters(self): + raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + new_name = self.map_tensor_name(name) + + # Handle gate/up expert tensor fusion if enabled + if self.fuse_gate_up_exps and bid is not None: + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid): + self._gate_exp_buffer[bid] = data_torch + elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid): + self._up_exp_buffer[bid] = data_torch + + # Check if both gate and up are buffered for this layer + if bid in self._gate_exp_buffer and bid in self._up_exp_buffer: + gate_data = self._gate_exp_buffer.pop(bid) + up_data = self._up_exp_buffer.pop(bid) + # gate/up shape: (n_expert, n_ff, n_embd), concatenate to (n_expert, n_ff*2, n_embd) + fused_data = torch.cat([gate_data, up_data], dim=1) + fused_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, bid) + logger.info(f"Fused gate_exps and up_exps for layer {bid}") + return [(fused_name, fused_data)] + + # If we buffered a gate/up tensor, wait for the other + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid) or \ + self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid): + return [] + + return [(new_name, data_torch)] + + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: + del name, new_name, bid, n_dims # unused + + return False + + # some models need extra generated tensors (like rope_freqs) + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + return () + + @staticmethod + def _nvfp4_pack(weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]: + """Repack NVFP4 ModelOpt tensors into ggml super-block layout. + Preserves original E4M3 scale bits as UE4M3 (strip sign bit). + The per-tensor scale2 factor is stored as a separate tensor and applied at inference time via ggml_mul(). + Returns (raw_data, logical_shape).""" + + out_features = weight.shape[0] + n_blocks = scale.shape[1] + + # Unpack ModelOpt nibble-packed weights + w = weight.reshape(out_features, n_blocks, 8) + vals = torch.stack([w & 0x0F, w >> 4], dim=-1).reshape(out_features, n_blocks, 16) + + # Preserve original E4M3 scale bits as UE4M3 (strip sign bit) + d_ue = scale.view(torch.uint8).numpy().reshape(out_features, n_blocks) & 0x7F + qs = (vals[:, :, :8] | (vals[:, :, 8:] << 4)).to(torch.uint8).numpy() + + # Pack into super-blocks: [4 UE4M3 scales, 32 qs bytes] = 36 bytes per 64 elements + n_super = n_blocks // 4 + d_grouped = d_ue.reshape(out_features, n_super, 4) + qs_grouped = qs.reshape(out_features, n_super, 4, 8).reshape(out_features, n_super, 32) + raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36) + return raw, [out_features, n_super * 64] + + def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor): + new_name = self.map_tensor_name(name) + + raw, shape = self._nvfp4_pack(weight, scale) + logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4") + self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4) + + self._write_scale_tensor(new_name.replace(".weight", ".scale"), scale2) + self._write_scale_tensor(new_name.replace(".weight", ".input_scale"), input_scale) + + def _generate_nvfp4_tensors(self): + # Per-layer expert merging to avoid holding all experts in memory + expert_blocks: dict[tuple[int, str], list[tuple[int, np.ndarray]]] = {} + expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {} + expert_input_scales: dict[tuple[int, str], list[tuple[int, float]]] = {} + expert_shapes: dict[tuple[int, str], list[int]] = {} + n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0 + consumed: list[str] = [] + + for name in self.model_tensors.keys(): + if not name.endswith(".weight"): + continue + scale_name = name.replace(".weight", ".weight_scale") + scale2_name = name.replace(".weight", ".weight_scale_2") + input_scale_name = name.replace(".weight", ".input_scale") + if scale_name not in self.model_tensors: + continue + # Force eager materialization of lazy tensors + weight = LazyTorchTensor.to_eager(self.model_tensors[name]()) + scale = LazyTorchTensor.to_eager(self.model_tensors[scale_name]()) + + # Skip non-NVFP4 tensors (e.g. FP8 with per-channel 1D scales) + if scale.ndim < 2: + continue + + scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))()) + input_scale = LazyTorchTensor.to_eager(self.model_tensors.get(input_scale_name, lambda: torch.tensor(1.0))()) + + # Mark tensors for removal from model_tensors (already written to gguf) + consumed.extend([name, scale_name]) + if scale2_name in self.model_tensors: + consumed.append(scale2_name) + if input_scale_name in self.model_tensors: + consumed.append(input_scale_name) + + # Check if this is a per-expert tensor + m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name) + if m: + expert_id = int(m.group(1)) + proj_type = m.group(2) + bid_m = re.search(r'\.layers\.(\d+)\.', name) + bid = int(bid_m.group(1)) if bid_m else 0 + key = (bid, proj_type) + + raw, shape = self._nvfp4_pack(weight, scale) + + if key not in expert_blocks: + expert_blocks[key] = [] + expert_scales[key] = [] + expert_input_scales[key] = [] + expert_shapes[key] = shape + expert_blocks[key].append((expert_id, raw.copy())) + # Collect per-expert scale2 (scalar per expert) + expert_scales[key].append((expert_id, float(scale2.float().sum()))) + # Collect per-expert input_scale (scalar per expert) + expert_input_scales[key].append((expert_id, float(input_scale.float().sum()))) + + # Flush when all experts for this (layer, proj) are collected + if n_experts > 0 and len(expert_blocks[key]) >= n_experts: + self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type) + else: + self._repack_nvfp4(name, weight, scale, scale2, input_scale) + + # Flush any remaining experts (fallback if n_experts was unknown) + for bid, proj_type in list(expert_blocks.keys()): + self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type) + + # Remove consumed tensors so get_tensors/modify_tensors won't see them + for name in consumed: + self.model_tensors.pop(name, None) + + # Remove any remaining unused auxiliary tensors + for name in list(self.model_tensors.keys()): + if name.endswith((".k_scale", ".v_scale")): + del self.model_tensors[name] + + def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type): + experts = expert_blocks.pop(key) + scales = expert_scales.pop(key) + input_scales = expert_input_scales.pop(key) + shape = expert_shapes.pop(key) + + experts.sort(key=lambda x: x[0]) + merged = np.stack([e[1] for e in experts], axis=0) + merged_name = f"model.layers.{bid}.mlp.experts.{proj_type}.weight" + new_name = self.map_tensor_name(merged_name) + logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4") + self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4) + + scales.sort(key=lambda x: x[0]) + self._write_scales_tensor(new_name.replace(".weight", ".scale"), [s[1] for s in scales]) + + input_scales.sort(key=lambda x: x[0]) + self._write_scales_tensor(new_name.replace(".weight", ".input_scale"), [s[1] for s in input_scales]) + + del experts, merged + + def prepare_tensors(self): + # detect NVFP4 quantization (ModelOpt format) + quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo") + quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method") + quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {} + quant_config_file = self.dir_model / "hf_quant_config.json" + + if (not quant_algo or not quant_layers) and quant_config_file.is_file(): + with open(quant_config_file, "r", encoding="utf-8") as f: + hf_quant_config = json.load(f) + quant_config = hf_quant_config.get("quantization") or {} + producer = hf_quant_config.get("producer") or {} + producer_name = (producer.get("name") or "").lower() + if quant_method is None: + self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name + quant_algo = quant_config.get("quant_algo", quant_algo) + quant_layers = quant_config.get("quantized_layers", quant_layers) or {} + + # Some models use per-tensor quant_algo (e.g. "MIXED_PRECISION" with + # per-layer NVFP4/FP8) instead of a single global "NVFP4" value. + if quant_algo != "NVFP4": + if any(v.get("quant_algo") == "NVFP4" for v in quant_layers.values() if isinstance(v, dict)): + quant_algo = "NVFP4" + + self._is_nvfp4 = quant_algo == "NVFP4" + self._is_mxfp4 = quant_method == "mxfp4" + + # NVFP4 weights are repacked and written directly to gguf_writer. + # This must run before dequant_model so NVFP4 tensors are removed + # from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant. + if self._is_nvfp4: + self._generate_nvfp4_tensors() + + self.dequant_model() + + # Handle empty tensor_map for models with block_count=0 (like MobileNetV5) + if self.tensor_map.mapping: + max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") + else: + max_name_len = len("vision_encoder.weight,") # Default reasonable length + + for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # use the first number-like part of the tensor name as the block id + bid = None + for part in name.split("."): + if part.isdecimal(): + bid = int(part) + break + + for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): + # TODO: why do we squeeze here? + # data = data_torch.squeeze().numpy() + data = data_torch.numpy() + + n_dims = len(data.shape) + data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims) + + # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors + if n_dims <= 1 or new_name.endswith("_norm.weight"): + data_qtype = gguf.GGMLQuantizationType.F32 + + # Conditions should closely match those in llama_model_quantize_internal in llama.cpp + # Some tensor types are always in float32 + if data_qtype is False and ( + any( + self.match_model_tensor_name(new_name, key, bid) + for key in ( + gguf.MODEL_TENSOR.FFN_GATE_INP, + gguf.MODEL_TENSOR.FFN_GATE_INP_SHEXP, + gguf.MODEL_TENSOR.POS_EMBD, + gguf.MODEL_TENSOR.TOKEN_TYPES, + gguf.MODEL_TENSOR.SSM_CONV1D, + gguf.MODEL_TENSOR.SHORTCONV_CONV, + gguf.MODEL_TENSOR.TIME_MIX_FIRST, + gguf.MODEL_TENSOR.TIME_MIX_W1, + gguf.MODEL_TENSOR.TIME_MIX_W2, + gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, + gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, + gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED, + gguf.MODEL_TENSOR.POSNET_NORM1, + gguf.MODEL_TENSOR.POSNET_NORM2, + gguf.MODEL_TENSOR.V_ENC_EMBD_POS, + gguf.MODEL_TENSOR.A_ENC_EMBD_POS, + gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF, + gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF, + # Kimi KDA conv weights should be F32 + gguf.MODEL_TENSOR.SSM_CONV1D_Q, + gguf.MODEL_TENSOR.SSM_CONV1D_K, + gguf.MODEL_TENSOR.SSM_CONV1D_V, + ) + ) + or new_name[-7:] not in (".weight", ".lora_a", ".lora_b") + ): + data_qtype = gguf.GGMLQuantizationType.F32 + + if data_qtype is False and any( + self.match_model_tensor_name(new_name, key, bid) + for key in ( + gguf.MODEL_TENSOR.TOKEN_EMBD, + gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD, + gguf.MODEL_TENSOR.OUTPUT, + gguf.MODEL_TENSOR.ALTUP_ROUTER, + gguf.MODEL_TENSOR.LAUREL_L, + gguf.MODEL_TENSOR.LAUREL_R, + ) + ): + if self.ftype in ( + gguf.LlamaFileType.MOSTLY_TQ1_0, + gguf.LlamaFileType.MOSTLY_TQ2_0, + ): + # TODO: use Q4_K and Q6_K + data_qtype = gguf.GGMLQuantizationType.F16 + + # No override (data_qtype is False), or wants to be quantized (data_qtype is True) + if isinstance(data_qtype, bool): + if self.ftype == gguf.LlamaFileType.ALL_F32: + data_qtype = gguf.GGMLQuantizationType.F32 + elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: + data_qtype = gguf.GGMLQuantizationType.F16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data_qtype = gguf.GGMLQuantizationType.BF16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: + data_qtype = gguf.GGMLQuantizationType.Q8_0 + elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0: + data_qtype = gguf.GGMLQuantizationType.TQ1_0 + elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0: + data_qtype = gguf.GGMLQuantizationType.TQ2_0 + else: + raise ValueError(f"Unknown file type: {self.ftype.name}") + + try: + data = gguf.quants.quantize(data, data_qtype) + except gguf.QuantError as e: + logger.warning("%s, %s", e, "falling back to F16") + data_qtype = gguf.GGMLQuantizationType.F16 + data = gguf.quants.quantize(data, data_qtype) + + shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape + + # reverse shape to make it similar to the internal ggml dimension order + shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" + + # n_dims is implicit in the shape + logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") + + self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.MODEL) + + def prepare_metadata(self, vocab_only: bool): + + total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count() + + self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params) + + # If we are using HF model id, set the metadata name to the model id + if self.remote_hf_model_id: + self.metadata.name = self.remote_hf_model_id + + # Fallback to model directory name if metadata name is still missing + if self.metadata.name is None: + self.metadata.name = self.dir_model.name + + if self.ftype in (gguf.LlamaFileType.ALL_F32, gguf.LlamaFileType.MOSTLY_F16, gguf.LlamaFileType.MOSTLY_BF16): + if self._is_nvfp4: + self.ftype = gguf.LlamaFileType.MOSTLY_NVFP4 + elif self._is_mxfp4: + self.ftype = gguf.LlamaFileType.MOSTLY_MXFP4_MOE + + # Generate parameter weight class (useful for leader boards) if not yet determined + if self.metadata.size_label is None and total_params > 0: + self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) + + self.set_type() + + logger.info("Set meta model") + self.metadata.set_gguf_meta_model(self.gguf_writer) + + logger.info("Set model parameters") + self.set_gguf_parameters() + + logger.info("Set model quantization version") + self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + + def write_vocab(self): + raise NotImplementedError("write_vocab() must be implemented in subclasses") + + def write(self): + self.prepare_tensors() + self.prepare_metadata(vocab_only=False) + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file(progress=True) + self.gguf_writer.close() + + @staticmethod + def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: + part_names: list[str] = [] + for filename in os.listdir(dir_model): + if filename.startswith(prefix) and filename.endswith(suffix): + part_names.append(filename) + + part_names.sort() + + return part_names + + @staticmethod + def load_hparams(dir_model: Path, is_mistral_format: bool): + if is_mistral_format: + with open(dir_model / "params.json", "r", encoding="utf-8") as f: + config = json.load(f) + return config + + try: + # for security reason, we don't allow loading remote code by default + # if a model need remote code, we will fallback to config.json + config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict() + except Exception as e: + logger.warning(f"Failed to load model config from {dir_model}: {e}") + logger.warning("Trying to load config.json instead") + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + config = json.load(f) + if "llm_config" in config: + # rename for InternVL + config["text_config"] = config["llm_config"] + if "lm_config" in config: + # rename for GlmASR + config["text_config"] = config["lm_config"] + if "thinker_config" in config: + # rename for Qwen2.5-Omni + config["text_config"] = config["thinker_config"]["text_config"] + if "language_config" in config: + # rename for DeepSeekOCR + config["text_config"] = config["language_config"] + if "lfm" in config: + # rename for LFM2-Audio + config["text_config"] = config["lfm"] + return config + + @classmethod + def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: + assert names + + def func(modelcls: AnyModel) -> AnyModel: + model_type = ModelType.MMPROJ if modelcls.model_arch == gguf.MODEL_ARCH.MMPROJ else ModelType.TEXT + for name in names: + cls._model_classes[model_type][name] = modelcls + return modelcls + return func + + @classmethod + def print_registered_models(cls): + for model_type, model_classes in cls._model_classes.items(): + logger.error(f"{model_type.name} models:") + for name in sorted(model_classes.keys()): + logger.error(f" - {name}") + + @classmethod + def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type[ModelBase]: + try: + return cls._model_classes[model_type][arch] + except KeyError: + raise NotImplementedError(f'Architecture {arch!r} not supported!') from None + + +class TextModel(ModelBase): + model_type = ModelType.TEXT + hf_arch: str + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if not self.is_mistral_format: + self.hf_arch = get_model_architecture(self.hparams, self.model_type) + else: + self.hf_arch = "" + + if "text_config" in self.hparams: + # move the text_config to the root level + self.hparams = {**self.hparams, **self.hparams["text_config"]} + + self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {} + + rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True) + local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True) + + # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters + if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters: + if local_rope_theta is not None: + self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta} + if "rope_theta" not in self.rope_parameters and rope_theta is not None: + self.rope_parameters["rope_theta"] = rope_theta + if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None: + self.rope_parameters["rope_type"] = rope_type + + @classmethod + def __init_subclass__(cls): + # can't use an abstract property, because overriding it without type errors + # would require using decorated functions instead of simply defining the property + if "model_arch" not in cls.__dict__: + raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Skip multimodal tensors + if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \ + or "visual." in name or "vision." in name or "audio." in name or "talker." in name \ + or "vision_" in name or "audio_" in name or "sam_model" in name \ + or "token2wav." in name or "code2wav." in name \ + or "projector." in name or "pre_mm_projector_norm" in name \ + or "image_newline" in name or "view_seperator" in name \ + or "patch_embed" in name or "patch_embedding" in name \ + or "patch_merger." in name or "model.connector." in name: + return None + + return super().filter_tensors(item) + + def set_vocab(self): + self._set_vocab_gpt2() + + def prepare_metadata(self, vocab_only: bool): + super().prepare_metadata(vocab_only=vocab_only) + + total_params = self.gguf_writer.get_total_parameter_count()[0] + # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' + output_type: str = self.ftype.name.partition("_")[2] + + # Filename Output + if self.fname_out.is_dir(): + # Generate default filename based on model specification and available metadata + if not vocab_only: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) + else: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") + + # Use the default filename + self.fname_out = self.fname_out / f"{fname_default}.gguf" + else: + # Output path is a custom defined templated filename + # Note: `not is_dir()` is used because `.is_file()` will not detect + # file template strings as it doesn't actually exist as a file + + # Process templated file name with the output ftype, useful with the "auto" ftype + self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) + + logger.info("Set model tokenizer") + self.set_vocab() + + def set_gguf_parameters(self): + self.gguf_writer.add_block_count(self.block_count) + + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length", "max_sequence_length", "model_max_length"], optional=True)) is not None: + self.gguf_writer.add_context_length(n_ctx) + logger.info(f"gguf: context length = {n_ctx}") + + if (n_embd := self.find_hparam(["hidden_size", "n_embd", "dim"], optional=True)) is not None: + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") + + if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None: + self.gguf_writer.add_feed_forward_length(n_ff) + logger.info(f"gguf: feed forward length = {n_ff}") + + if (n_head := self.find_hparam(["num_attention_heads", "n_head", "n_heads"], optional=True)) is not None: + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") + + if (n_head_kv := self.find_hparam(["num_key_value_heads", "n_kv_heads"], optional=True)) is not None: + self.gguf_writer.add_head_count_kv(n_head_kv) + logger.info(f"gguf: key-value head count = {n_head_kv}") + + if self.hparams.get("is_causal") is False: + self.gguf_writer.add_causal_attention(False) + logger.info("gguf: causal attention = False") + + # TODO: Handle "sliding_attention" similarly when models start implementing it + rope_params = self.rope_parameters.get("full_attention", self.rope_parameters) + if (rope_type := rope_params.get("rope_type")) is not None: + rope_factor = rope_params.get("factor") + rope_gguf_type = gguf.RopeScalingType.NONE + if rope_type == "linear" and rope_factor is not None: + rope_gguf_type = gguf.RopeScalingType.LINEAR + self.gguf_writer.add_rope_scaling_type(rope_gguf_type) + self.gguf_writer.add_rope_scaling_factor(rope_factor) + elif rope_type == "yarn" and rope_factor is not None: + rope_gguf_type = gguf.RopeScalingType.YARN + self.gguf_writer.add_rope_scaling_type(rope_gguf_type) + self.gguf_writer.add_rope_scaling_factor(rope_factor) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"]) + if (yarn_ext_factor := rope_params.get("extrapolation_factor")) is not None: + self.gguf_writer.add_rope_scaling_yarn_ext_factor(yarn_ext_factor) + if (yarn_attn_factor := rope_params.get("attention_factor", rope_params.get("attn_factor"))) is not None: + self.gguf_writer.add_rope_scaling_yarn_attn_factor(yarn_attn_factor) + if (yarn_beta_fast := rope_params.get("beta_fast")) is not None: + self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_beta_fast) + if (yarn_beta_slow := rope_params.get("beta_slow")) is not None: + self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_beta_slow) + # self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"]) + elif rope_type == "su" or rope_type == "longrope": + rope_gguf_type = gguf.RopeScalingType.LONGROPE + self.gguf_writer.add_rope_scaling_type(rope_gguf_type) + elif rope_type == "dynamic": + # HunYuan, handled in model class + pass + elif rope_type.lower() == "llama3": + # Handled in generate_extra_tensors + pass + else: + logger.warning(f"Unknown RoPE type: {rope_type}") + logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}") + + if "mrope_section" in self.rope_parameters: + mrope_section = self.rope_parameters["mrope_section"] + # Pad to 4 dimensions [time, height, width, extra] + while len(mrope_section) < 4: + mrope_section.append(0) + self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) + logger.info(f"gguf: mrope sections: {mrope_section[:4]}") + + if (rope_theta := rope_params.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + logger.info(f"gguf: rope theta = {rope_theta}") + if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base_swa(local_rope_theta) + logger.info(f"gguf: rope theta swa = {local_rope_theta}") + if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None: + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") + if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: + self.gguf_writer.add_layer_norm_eps(f_norm_eps) + logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") + if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None: + self.gguf_writer.add_expert_count(n_experts) + logger.info(f"gguf: expert count = {n_experts}") + if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token", "top_k_experts"], optional=True)) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + logger.info(f"gguf: experts used count = {n_experts_used}") + if (n_expert_groups := self.hparams.get("n_group")) is not None: + self.gguf_writer.add_expert_group_count(n_expert_groups) + logger.info(f"gguf: expert groups count = {n_expert_groups}") + if (n_group_used := self.hparams.get("topk_group")) is not None: + self.gguf_writer.add_expert_group_used_count(n_group_used) + logger.info(f"gguf: expert groups used count = {n_group_used}") + + if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None: + if score_func == "sigmoid": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + elif score_func == "softmax": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) + else: + raise ValueError(f"Unsupported expert score gating function value: {score_func}") + logger.info(f"gguf: expert score gating function = {score_func}") + + if (head_dim := self.hparams.get("head_dim")) is not None: + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + + def write_vocab(self): + if len(self.gguf_writer.tensors) != 1: + raise ValueError('Splitting the vocabulary is not supported') + + self.prepare_metadata(vocab_only=True) + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.close() + + def does_token_look_special(self, token: str | bytes) -> bool: + if isinstance(token, (bytes, bytearray)): + token_text = token.decode(encoding="utf-8") + elif isinstance(token, memoryview): + token_text = token.tobytes().decode(encoding="utf-8") + else: + token_text = token + + # Some models mark some added tokens which ought to be control tokens as not special. + # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2}) + seems_special = token_text in ( + "", # deepseek-coder + "", "<2mass>", "[@BOS@]", # gemma{,-2} + ) + + seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>")) + seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>")) # deepseek-coder + + # TODO: should these be marked as UNUSED instead? (maybe not) + seems_special = seems_special or (token_text.startswith("")) # gemma{,-2} + + return seems_special + + # used for GPT-2 BPE and WordPiece vocabs + def get_vocab_base(self) -> tuple[list[str], list[int], str]: + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute] + assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute] + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + + added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token: str = reverse_vocab[i] + if token in added_vocab: + # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. + # To avoid unexpected issues - we make sure to normalize non-normalized tokens + if not added_tokens_decoder[i].normalized: + previous_token = token + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment] + if previous_token != token: + logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") + + if added_tokens_decoder[i].special or self.does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + # NOTE: this was added for Gemma. + # Encoding and decoding the tokens above isn't sufficient for this case. + token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + tokens.append(token) + + return tokens, toktypes, tokpre + + # NOTE: this function is generated by convert_hf_to_gguf_update.py + # do not modify it manually! + # ref: https://github.com/ggml-org/llama.cpp/pull/6920 + # Marker: Start get_vocab_base_pre + def get_vocab_base_pre(self, tokenizer) -> str: + # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that + # is specific for the BPE pre-tokenizer used by the model + # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can + # use in llama.cpp to implement the same pre-tokenizer + + chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' + + chktok = tokenizer.encode(chktxt) + chkhsh = sha256(str(chktok).encode()).hexdigest() + + logger.debug(f"chktok: {chktok}") + logger.debug(f"chkhsh: {chkhsh}") + + res = None + + # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script + # or pull the latest version of the model from Huggingface + # don't edit the hashes manually! + if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b": + # ref: https://huggingface.co/THUDM/glm-4-9b-chat + res = "chatglm-bpe" + if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516": + # ref: https://huggingface.co/THUDM/glm-4-9b-chat + res = "chatglm-bpe" + if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2": + # ref: https://huggingface.co/THUDM/glm-4-9b-hf + res = "glm4" + if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902": + # ref: https://huggingface.co/zai-org/GLM-4.5-Air + res = "glm4" + if chkhsh == "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267": + # ref: https://huggingface.co/zai-org/GLM-4.7-Flash + res = "glm4" + if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": + # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 + res = "minerva-7b" + if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664": + # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct + res = "hunyuan" + if chkhsh == "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6": + # ref: https://huggingface.co/tencent/Hunyuan-4B-Instruct + res = "hunyuan-dense" + if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6": + # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base + res = "falcon-h1" + if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86": + # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base + res = "falcon-h1" + if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896": + # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base + res = "falcon-h1" + if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b": + # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base + res = "falcon-h1" + if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890": + # ref: https://huggingface.co/moonshotai/Kimi-K2-Base + res = "kimi-k2" + if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c": + # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B + res = "qwen2" + if chkhsh == "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f": + # ref: https://huggingface.co/openbmb/MiniCPM-V-4_6 + res = "qwen35" + if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273": + # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer + res = "grok-2" + if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df": + # ref: https://huggingface.co/aari1995/German_Semantic_V3 + res = "jina-v2-de" + if chkhsh == "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4": + # ref: https://huggingface.co/evilfreelancer/ruGPT3XL + res = "gpt-2" + if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": + # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B + res = "llama-bpe" + if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": + # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base + res = "deepseek-llm" + if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": + # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base + res = "deepseek-coder" + if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": + # ref: https://huggingface.co/tiiuae/falcon-7b + res = "falcon" + if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": + # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 + res = "bert-bge" + if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": + # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base + res = "falcon3" + if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7": + # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5 + res = "bert-bge-large" + if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": + # ref: https://huggingface.co/mosaicml/mpt-7b + res = "mpt" + if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34": + # ref: https://huggingface.co/bigcode/starcoder2-3b + res = "starcoder" + if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": + # ref: https://huggingface.co/openai-community/gpt2 + res = "gpt-2" + if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3": + # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b + res = "stablelm2" + if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": + # ref: https://huggingface.co/smallcloudai/Refact-1_6-base + res = "refact" + if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": + # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 + res = "command-r" + if chkhsh == "d772b220ace2baec124bed8cfafce0ead7d6c38a4b65ef11261cf9d5d62246d1": + # ref: https://huggingface.co/CohereLabs/tiny-aya-base + res = "tiny_aya" + if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": + # ref: https://huggingface.co/Qwen/Qwen1.5-7B + res = "qwen2" + if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": + # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf + res = "olmo" + if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": + # ref: https://huggingface.co/databricks/dbrx-base + res = "dbrx" + if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448": + # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en + res = "jina-v1-en" + if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en + res = "jina-v2-en" + if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643": + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es + res = "jina-v2-es" + if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de + res = "jina-v2-de" + if chkhsh == "a023e9fdc5a11f034d3ef515b92350e56fb2af1f66c6b6811a4444ea9bf8763d": + # ref: https://huggingface.co/jinaai/jina-embeddings-v5-text-nano + res = "jina-v5-nano" + if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": + # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct + res = "smaug-bpe" + if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360": + # ref: https://huggingface.co/LumiOpen/Poro-34B-chat + res = "poro-chat" + if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code + res = "jina-v2-code" + if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": + # ref: https://huggingface.co/LumiOpen/Viking-7B + res = "viking" + if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": + # ref: https://huggingface.co/core42/jais-13b + res = "jais" + if chkhsh == "bc5108ee1eb6a3d600cadd065f63190fbd0554dbc9e4bbd6a0d977970afc8d2a": + # ref: https://huggingface.co/inceptionai/Jais-2-8B-Chat + res = "jais-2" + if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f": + # ref: https://huggingface.co/WisdomShell/CodeShell-7B + res = "codeshell" + if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e": + # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407 + res = "tekken" + if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249": + # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M + res = "smollm" + if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7": + # ref: https://huggingface.co/bigscience/bloom + res = "bloom" + if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21": + # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small + res = "gpt3-finnish" + if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae": + # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct + res = "exaone" + if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085": + # ref: https://huggingface.co/microsoft/phi-2 + res = "phi-2" + if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": + # ref: https://huggingface.co/facebook/chameleon-7b + res = "chameleon" + if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65": + # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base + res = "roberta-bpe" + if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb": + # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct + res = "gigachat" + if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1": + # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct + res = "megrez" + if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": + # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3 + res = "deepseek-v3" + if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5": + # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B + res = "deepseek-r1-qwen" + if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e": + # ref: https://huggingface.co/Xenova/gpt-4o + res = "gpt-4o" + if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f": + # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k + res = "superbpe" + if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15": + # ref: https://huggingface.co/trillionlabs/Trillion-7B-preview + res = "trillion" + if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224": + # ref: https://huggingface.co/inclusionAI/Ling-lite + res = "bailingmoe" + if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406": + # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct + res = "llama4" + if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3": + # ref: https://huggingface.co/mistral-community/pixtral-12b + res = "pixtral" + if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec": + # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base + res = "seed-coder" + if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf": + # ref: https://huggingface.co/skt/A.X-4.0 + res = "a.x-4.0" + if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4": + # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct + res = "midm-2.0" + if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51": + # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer + res = "lfm2" + if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb": + # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B + res = "exaone4" + if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756": + # ref: https://huggingface.co/JetBrains/Mellum-4b-base + res = "mellum" + if chkhsh == "a0b64b4385f123663873756336c085744376d015ff328bb1d901598f63c44152": + # ref: https://huggingface.co/answerdotai/ModernBERT-base + res = "modern-bert" + if chkhsh == "49fc0303c9e0d2c2c565c510f64b2d9b271276acdcdadff733249eda9f7d59df": + # ref: https://huggingface.co/arcee-ai/Trinity-Tokenizer + res = "afmoe" + if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206": + # ref: https://huggingface.co/inclusionAI/Ling-mini-base-2.0 + res = "bailingmoe2" + if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e": + # ref: https://huggingface.co/ibm-granite/granite-docling-258M + res = "granite-docling" + if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95": + # ref: https://huggingface.co/MiniMaxAI/MiniMax-M2 + res = "minimax-m2" + if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665": + # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer + res = "kormo" + if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1": + # ref: https://huggingface.co/tencent/Youtu-LLM-2B + res = "youtu" + if chkhsh == "16389f0a1f51ee53e562ffd51c371dc508639ab0e4261502071836e50e223e91": + # ref: https://huggingface.co/upstage/Solar-Open-100B + res = "solar-open" + if chkhsh == "6c81ce329e0802883b22eabab0d3fa48357337ef1ecb45443828bf1f6254833f": + # ref: https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B + res = "exaone-moe" + if chkhsh == "d30d75d9059f1aa2c19359de71047b3ae408c70875e8a3ccf8c5fba56c9d8af4": + # ref: https://huggingface.co/Qwen/Qwen3.5-9B-Instruct + res = "qwen35" + if chkhsh == "b4b8ca1f9769494fbd956ebc4c249de6131fb277a4a3345a7a92c7dd7a55808d": + # ref: https://huggingface.co/jdopensource/JoyAI-LLM-Flash + res = "joyai-llm" + if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869": + # ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601 + res = "kanana2" + if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015": + # ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B + res = "f2llmv2" + if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57": + # ref: https://huggingface.co/sarvamai/sarvam-30b + res = "sarvam-moe" + + if res is None: + logger.warning("\n") + logger.warning("**************************************************************************************") + logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") + logger.warning("** There are 2 possible reasons for this:") + logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") + logger.warning("** - the pre-tokenization config has changed upstream") + logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") + logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920") + logger.warning("**") + logger.warning(f"** chkhsh: {chkhsh}") + logger.warning("**************************************************************************************") + logger.warning("\n") + raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") + + logger.debug(f"tokenizer.ggml.pre: {repr(res)}") + logger.debug(f"chkhsh: {chkhsh}") + + return res + # Marker: End get_vocab_base_pre + + def _set_vocab_none(self) -> None: + self.gguf_writer.add_tokenizer_model("none") + + def _set_vocab_gpt2(self) -> None: + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_qwen(self): + from .qwen import QwenModel + + dir_model = self.dir_model + hparams = self.hparams + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + vocab_size = hparams["vocab_size"] + assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute] + + tokpre = self.get_vocab_base_pre(tokenizer) + + merges = [] + vocab = {} + mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute] + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + assert len(merged) == 2 + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined + added_vocab = tokenizer.special_tokens # ty: ignore[unresolved-attribute] + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) + special_vocab.merges = merges + # only add special tokens when they were not already loaded from config.json + if len(special_vocab.special_token_ids) == 0: + special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + # this one is usually not in config.json anyway + special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_sentencepiece(self, add_to_gguf=True): + tokens, scores, toktypes = self._create_vocab_sentencepiece() + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def _create_vocab_sentencepiece(self): + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.find_hparam([ + "vocab_size_per_layer_input", # gemma3n + "vocab_size", + ], optional=True) or tokenizer.vocab_size() + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + if token_id >= vocab_size: + logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}') + break + + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + for key in added_tokens_json: + token_id = added_tokens_json[key] + if token_id >= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) + for token_id, token_data in added_tokens_decoder.items(): + token_id = int(token_id) + token: str = token_data["content"] + if token_id >= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: + if tokens[token_id] != token.encode("utf-8"): + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}') + if token_data.get("special") or self.does_token_look_special(token): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + else: + token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + scores[token_id] = -1000.0 + tokens[token_id] = token.encode("utf-8") + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + return tokens, scores, toktypes + + def _set_vocab_llama_hf(self): + vocab = gguf.LlamaHfVocab(self.dir_model) + tokens = [] + scores = [] + toktypes = [] + + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_rwkv_world(self): + assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() + vocab_size = self.hparams.get("vocab_size", 65536) + + tokens: list[bytes] = [''.encode("utf-8")] + toktypes: list[int] = [gguf.TokenType.CONTROL] + + with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f: + lines = f.readlines() + for line in lines: + parts = line.split(' ') + assert len(parts) >= 3 + token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) + token = token.encode("utf-8") if isinstance(token, str) else token + assert isinstance(token, bytes) + assert len(token) == token_len + token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" + tokens.append(token_text.encode("utf-8")) + toktypes.append(gguf.TokenType.NORMAL) + remainder = vocab_size - len(tokens) + assert remainder >= 0 + for i in range(len(tokens), vocab_size): + tokens.append(f"[PAD{i}]".encode("utf-8")) + toktypes.append(gguf.TokenType.UNUSED) + + self.gguf_writer.add_tokenizer_model("rwkv") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + if special_vocab.chat_template is None: + template_path = Path(__file__).parent.parent / "models" / "templates" / "llama-cpp-rwkv-world.jinja" + if template_path.is_file(): + with open(template_path, "r", encoding="utf-8") as f: + template = f.read() + else: + template = "rwkv-world" + special_vocab.chat_template = template + # hack: Add '\n\n' as the EOT token to make it chat normally + special_vocab._set_special_token("eot", 261) + # hack: Override these as they have already been set (incorrectly) + special_vocab.special_token_ids["bos"] = 0 + special_vocab.special_token_ids["eos"] = 0 + + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int): + tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf" + logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") + vocab_reader = gguf.GGUFReader(tokenizer_path, "r") + + default_pre = "mpt" if model_name == "gpt-neox" else "default" + + field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL) + assert field # tokenizer model + self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8")) + + field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE) + self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre) + + field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST) + assert field # token list + self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) + + if model_name == "llama-spm": + field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES) + assert field # token scores + self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + + field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) + assert field # token types + self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + + if model_name != "llama-spm": + field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES) + assert field # token merges + self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) + + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None: + self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None: + self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None: + self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None: + self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None: + self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None: + self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) + + def _try_set_pooling_type(self) -> None: + # get pooling path + pooling_path = None + module_path = self.dir_model / "modules.json" + if module_path.is_file(): + with open(module_path, encoding="utf-8") as f: + modules = json.load(f) + for mod in modules: + if mod["type"].endswith("Pooling"): + pooling_path = mod["path"] + break + + mode_mapping = { + "mean": gguf.PoolingType.MEAN, + "cls": gguf.PoolingType.CLS, + "lasttoken": gguf.PoolingType.LAST, + } + + # get pooling type + if pooling_path is not None: + with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: + pooling = json.load(f) + if pooling.get("pooling_mode_mean_tokens"): + pooling_type = gguf.PoolingType.MEAN + elif pooling.get("pooling_mode_cls_token"): + pooling_type = gguf.PoolingType.CLS + elif pooling.get("pooling_mode_lasttoken"): + pooling_type = gguf.PoolingType.LAST + elif (pooling_mode := pooling.get("pooling_mode")) in mode_mapping: + pooling_type = mode_mapping[pooling_mode] + else: + raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported") + self.gguf_writer.add_pooling_type(pooling_type) + + def _set_vocab_glmedge(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_glm(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + # Special tokens + # Note: Using <|endoftext|> (151329) for eot causes endless generation + special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # ty: ignore[unresolved-attribute] # 151331 + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] # 151336 + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] # 151329 + special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # ty: ignore[unresolved-attribute] # 151338 + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_interns1(self): + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab()) # ty: ignore[unresolved-attribute] + vocab_size = self.hparams.get("vocab_size", len(vocab)) + assert max(vocab.values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()} + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + + added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token: str = reverse_vocab[i] + if token in added_vocab: + # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. + # To avoid unexpected issues - we make sure to normalize non-normalized tokens + if not added_tokens_decoder[i].normalized: + previous_token = token + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment] + if previous_token != token: + logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") + + if added_tokens_decoder[i].special or self.does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + tokens.append(token) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab._set_special_token("bos", 151643) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_mistral(self): + from .mistral import MistralModel + + if not _mistral_common_installed: + raise ImportError(_mistral_import_error_msg) + + vocab = MistralVocab(self.dir_model) + logger.info( + f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}." + ) + + self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model) + + tokens = [] + scores = [] + toktypes = [] + + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size, ( + f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})" + ) + + if vocab.tokenizer_type == MistralTokenizerType.tekken: + self.gguf_writer.add_tokenizer_pre("tekken") + self.gguf_writer.add_token_merges( + vocab.extract_vocab_merges_from_model() + ) + + logger.info( + f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}." + ) + + self.gguf_writer.add_bos_token_id(vocab.bos_id) + self.gguf_writer.add_eos_token_id(vocab.eos_id) + self.gguf_writer.add_unk_token_id(vocab.unk_id) + self.gguf_writer.add_pad_token_id(vocab.pad_id) + + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_vocab_size(vocab.vocab_size) + + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(False) + + local_template_file_path = self.dir_model / "chat_template.jinja" + + if self.is_mistral_format and local_template_file_path.is_file(): + # Ministral-3 and other new Mistral models come with chat templates. + # ref: https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512/tree/main + logger.info("Using an existing Mistral local chat template.") + + with open(local_template_file_path, "r", encoding="utf-8") as f: + template = f.read() + elif not self.is_mistral_format or not self.disable_mistral_community_chat_template: + template_dir = Path(__file__).parent.parent / "models/templates/" + + # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`. + if self.is_mistral_format: + logger.info( + "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. " + "Mistral recommends to use `mistral-common` to perform tokenization and detokenization." + ) + template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format) + else: + logger.info("Not using a Mistral local or community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.") + template = None + + if template is not None: + self.gguf_writer.add_chat_template(template) + + def _set_vocab_plamo(self): + # PLaMo models use a custom tokenizer with a .jsonl file + tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl" + tokenizer_config_path = self.dir_model / "tokenizer_config.json" + + if not tokenizer_jsonl_path.is_file(): + raise FileNotFoundError(f"PLaMo tokenizer file not found: {tokenizer_jsonl_path}") + + # Load tokenizer config + with open(tokenizer_config_path, "r", encoding="utf-8") as f: + tokenizer_config = json.load(f) + + # Load tokens from JSONL file (actually a list format) + tokens = [] + scores = [] + toktypes = [] + + with open(tokenizer_jsonl_path, "r", encoding="utf-8") as f: + for line_num, line in enumerate(f): + if line.strip(): + token_data = json.loads(line) + # Format: [token, score, type, ?, ?, ?, ?] + token = token_data[0].encode("utf-8") + score = float(token_data[1]) + token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL" + + tokens.append(token) + scores.append(score) + + if token_type_str == "UNKNOWN": + toktypes.append(gguf.TokenType.UNKNOWN) + elif token_type_str == "CONTROL": + toktypes.append(gguf.TokenType.CONTROL) + elif token_type_str == "BYTE": + toktypes.append(gguf.TokenType.BYTE) + else: + token_str = token_data[0] + if token_str.startswith("<|plamo:") and token_str.endswith("|>"): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + vocab_size = self.hparams["vocab_size"] + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(gguf.TokenType.UNUSED) + + self.gguf_writer.add_tokenizer_model("plamo2") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None: + token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8")) + self.gguf_writer.add_bos_token_id(token_id) + if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None: + token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8")) + self.gguf_writer.add_eos_token_id(token_id) + if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None: + token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8")) + self.gguf_writer.add_pad_token_id(token_id) + if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None: + token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8")) + self.gguf_writer.add_sep_token_id(token_id) + if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None: + token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8")) + self.gguf_writer.add_unk_token_id(token_id) + + # Add <|plamo:op|> as EOT to ensure appropriate end of generation + self.gguf_writer.add_eot_token_id(4) + + self.gguf_writer.add_add_space_prefix(False) + + +class MmprojModel(ModelBase): + model_type = ModelType.MMPROJ + model_arch = gguf.MODEL_ARCH.MMPROJ + preprocessor_config: dict[str, Any] + global_config: dict[str, Any] + + n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "layers", "encoder_layers", "vt_num_hidden_layers"] + + has_vision_encoder: bool = True # by default + has_audio_encoder: bool = False + + # for models having multiple encoders, we need to separate their hparams + hparams_vision: dict[str, Any] | None = None + hparams_audio: dict[str, Any] | None = None + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if self.model_arch != gguf.MODEL_ARCH.MMPROJ: + raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ") + + # get n_embd of the text model + if not self.is_mistral_format: + if "text_config" not in self.hparams: + self.hparams["text_config"] = {} + if "audio_config" not in self.hparams: + self.hparams["audio_config"] = {} + text_config = {**self.hparams, **self.hparams["text_config"]} + self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0)) + else: + text_config = { + k: v for k, v in self.hparams.items() if k not in ["vision_encoder", "audio_encoder"] + } + # mistral native params.json: "dim" is the text hidden size ("hidden_dim" is the FFN intermediate size) + self.n_embd_text = text_config.get("dim", 0) + + assert self.n_embd_text > 0, "n_embd not found in hparams" + + # move vision config to the top level, while preserving the original hparams in global_config + import copy + self.global_config = copy.deepcopy(self.hparams) + self.hparams_vision = self.get_vision_config() + self.hparams_audio = self.get_audio_config() + + if self.hparams_vision is None and self.hparams_audio is None: + raise ValueError("vision_config / audio_config not found in hparams") + + # for compat with vision-only models + self.hparams = self.hparams_vision or self.hparams_audio or self.hparams + + # TODO @ngxson : this is a hack to support both vision and audio encoders + have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder + self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True) + self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count) + + # load preprocessor config + self.preprocessor_config = {} + + # prefer preprocessor_config.json if possible + preprocessor_config_path = self.dir_model / "preprocessor_config.json" + if preprocessor_config_path.is_file(): + with open(preprocessor_config_path, "r", encoding="utf-8") as f: + cfg = json.load(f) + # move media_proc_cfg to root level for compat + if "media_proc_cfg" in cfg: + cfg = { + **cfg, + **cfg["media_proc_cfg"], + } + # merge configs + self.preprocessor_config = {**self.preprocessor_config, **cfg} + + # prefer processor_config.json if possible + processor_config_path = self.dir_model / "processor_config.json" + if processor_config_path.is_file(): + with open(processor_config_path, "r", encoding="utf-8") as f: + cfg = json.load(f) + # move image_processor to root level for compat + if "image_processor" in cfg: + cfg = { + **cfg, + **cfg["image_processor"], + } + # merge configs + self.preprocessor_config = {**self.preprocessor_config, **cfg} + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Skip non-multimodal tensors + if "language_model." in name: + return None + + return super().filter_tensors(item) + + def get_vision_config(self) -> dict[str, Any] | None: + config_name = "vision_config" if not self.is_mistral_format else "vision_encoder" + return self.global_config.get(config_name) + + def get_audio_config(self) -> dict[str, Any] | None: + mm_config_key = "whisper_config" if "whisper_config" in self.hparams else "audio_config" + return self.global_config.get(mm_config_key) + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.MMPROJ) + + def prepare_metadata(self, vocab_only: bool): + super().prepare_metadata(vocab_only=vocab_only) + + output_type: str = self.ftype.name.partition("_")[2] + + if self.fname_out.is_dir(): + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=output_type, model_type=None) + self.fname_out = self.fname_out / f"mmproj-{fname_default}.gguf" + else: + self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) + + def set_gguf_parameters(self): + self.gguf_writer.add_file_type(self.ftype) + + if self.has_vision_encoder: + self.gguf_writer.add_clip_has_vision_encoder(True) + self.gguf_writer.add_vision_projection_dim(self.n_embd_text) + + # vision config + self.image_size = self.find_vparam(["image_size"]) + self.gguf_writer.add_vision_image_size(self.image_size) + self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"])) + self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "width", "vt_hidden_size"])) + self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"])) + self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys)) + self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "heads", "vt_num_attention_heads"])) + + # preprocessor config + image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"] + image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"] + + self.gguf_writer.add_vision_image_mean(image_mean) + self.gguf_writer.add_vision_image_std(image_std) + + if self.has_audio_encoder: + self.gguf_writer.add_clip_has_audio_encoder(True) + self.gguf_writer.add_audio_projection_dim(self.n_embd_text) + + # audio config + self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"])) + self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"])) + self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys)) + self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"])) + + if not self.has_vision_encoder and not self.has_audio_encoder: + raise ValueError("MmprojModel must have either vision or audio encoder") + + def write_vocab(self): + raise ValueError("MmprojModel does not support vocab writing") + + def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any: + assert self.hparams_vision is not None + return self._find_param(self.hparams_vision, keys, optional) + + def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any: + assert self.hparams_audio is not None + return self._find_param(self.hparams_audio, keys, optional) + + def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any: + key = next((k for k in keys if k in obj), None) + if key is not None: + return obj[key] + if optional: + return None + raise KeyError(f"could not find any of: {keys}") + + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, name, n_dims # unused + if ".patch_embd.weight" in new_name or ".patch_merger.weight" in new_name: + return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 + return False + + +class LazyTorchTensor(gguf.LazyBase): + _tensor_type = torch.Tensor + # to keep the type-checker happy + dtype: torch.dtype + shape: torch.Size + + # only used when converting a torch.Tensor to a np.ndarray + _dtype_map: dict[torch.dtype, type] = { + torch.float16: np.float16, + torch.float32: np.float32, + torch.uint8: np.uint8, + } + + # only used when byteswapping data. Only correct size is needed + # TODO: uncomment uint64, uint32, and uint16, ref: https://github.com/pytorch/pytorch/issues/58734 + _dtype_byteswap_map: dict[torch.dtype, type] = { + torch.float64: np.float64, + torch.float32: np.float32, + torch.bfloat16: np.float16, + torch.float16: np.float16, + torch.int64: np.int64, + # torch.uint64: np.uint64, + torch.int32: np.int32, + # torch.uint32: np.uint32, + torch.int16: np.int16, + # torch.uint16: np.uint16, + torch.int8: np.int8, + torch.uint8: np.uint8, + torch.bool: np.uint8, + torch.float8_e4m3fn: np.uint8, + torch.float8_e5m2: np.uint8, + } + + # used for safetensors slices + # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 + # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 + _dtype_str_map: dict[str, torch.dtype] = { + "F64": torch.float64, + "F32": torch.float32, + "BF16": torch.bfloat16, + "F16": torch.float16, + # "U64": torch.uint64, + "I64": torch.int64, + # "U32": torch.uint32, + "I32": torch.int32, + # "U16": torch.uint16, + "I16": torch.int16, + "U8": torch.uint8, + "I8": torch.int8, + "BOOL": torch.bool, + "F8_E4M3": torch.float8_e4m3fn, + "F8_E5M2": torch.float8_e5m2, + } + + def numpy(self) -> gguf.LazyNumpyTensor: + dtype = self._dtype_map[self.dtype] + return gguf.LazyNumpyTensor( + meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), + args=(self,), + func=(lambda s: s.numpy()) + ) + + @classmethod + def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor: + return torch.empty(size=shape, dtype=dtype, device="meta") + + @classmethod + def from_safetensors_slice(cls, st_slice: Any) -> Tensor: + dtype = cls._dtype_str_map[st_slice.get_dtype()] + shape: tuple[int, ...] = tuple(st_slice.get_shape()) + lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[...] if len(s.get_shape()) == 0 else s[:]) + return cast(torch.Tensor, lazy) + + @classmethod + def from_local_tensor(cls, t: gguf.utility.LocalTensor) -> Tensor: + def load_tensor(tensor: gguf.utility.LocalTensor) -> Tensor: + def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray: + if sys.byteorder == 'big': + # switch data back to big endian + tensor = tensor.view(dtype).byteswap(inplace=False) + return tensor + dtype = cls._dtype_str_map[tensor.dtype] + numpy_dtype = cls._dtype_byteswap_map[dtype] + return torch.from_numpy(byteswap_tensor(tensor.mmap_bytes(), numpy_dtype)).view(dtype).reshape(tensor.shape) + dtype = cls._dtype_str_map[t.dtype] + shape = t.shape + lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(t,), func=lambda r: load_tensor(r)) + return cast(torch.Tensor, lazy) + + @classmethod + def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor): + def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray: + if sys.byteorder == 'big': + # switch data back to big endian + tensor = tensor.view(dtype).byteswap(inplace=False) + return tensor + dtype = cls._dtype_str_map[remote_tensor.dtype] + numpy_dtype = cls._dtype_byteswap_map[dtype] + shape = remote_tensor.shape + meta = cls.meta_with_dtype_and_shape(dtype, shape) + lazy = cls(meta=meta, args=(remote_tensor,), func=lambda r: torch.from_numpy(byteswap_tensor(np.frombuffer(r.data(), dtype=numpy_dtype), numpy_dtype)).view(dtype).reshape(shape)) + return cast(torch.Tensor, lazy) + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + del types # unused + + if kwargs is None: + kwargs = {} + + if func is torch.Tensor.numpy: + assert len(args) + return args[0].numpy() + + return cls._wrap_fn(func)(*args, **kwargs) + + +def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str: + # TODO @ngxson : this won't work correctly if the model has both audio & vision encoders + # maybe we should fallback to text model's arch in that case, since not many models have both + text_config = hparams.get("text_config", {}) + vision_config = hparams.get("vision_config", {}) + arch = None + if (arches := hparams.get("architectures")) is not None and len(arches) > 0: + arch = arches[0] + elif "ssm_cfg" in hparams: + # For non-hf Mamba and Mamba2 models + arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM" + + # Step3-VL keeps text config under text_config but uses a custom top-level architecture. + # For text conversion we route to a dedicated text-only class. + # TODO: refactor this later to avoid adding exception here + if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM"): + return arch + + # if "architectures" is found in the sub-config, use that instead + if model_type == ModelType.TEXT and text_config.get("architectures") is not None: + arch = text_config["architectures"][0] + elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None: + arch = vision_config["architectures"][0] + if arch is None: + raise ValueError("Failed to detect model architecture") + return arch diff --git a/conversion/bert.py b/conversion/bert.py new file mode 100644 index 00000000000..8af6c534d07 --- /dev/null +++ b/conversion/bert.py @@ -0,0 +1,616 @@ +from __future__ import annotations + +import json +import os + +from pathlib import Path +from typing import Any, Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, SentencePieceTokenTypes, TextModel, gguf, logger + + +@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification") +class BertModel(TextModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.vocab_size = None + + if cls_out_labels := self.hparams.get("id2label"): + if len(cls_out_labels) == 2 and cls_out_labels[0] == "LABEL_0": + # Remove dummy labels added by AutoConfig + cls_out_labels = None + self.cls_out_labels = cls_out_labels + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_causal_attention(False) + self._try_set_pooling_type() + + if self.cls_out_labels: + self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())]) + + def set_vocab(self): + tokens, toktypes, tokpre = self.get_vocab_base() + self.vocab_size = len(tokens) + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + + # convert to phantom space vocab + def phantom(tok, toktype): + if toktype == gguf.TokenType.CONTROL: + return tok + if tok.startswith("##"): + return tok[2:] + return "\u2581" + tok + assert len(tokens) == len(toktypes) + tokens = list(map(phantom, tokens, toktypes)) + + # add vocab to gguf + self.gguf_writer.add_tokenizer_model("bert") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + # handle special tokens + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("bert."): + name = name[5:] + + if name.endswith(".gamma"): + name = name[:-6] + ".weight" + + if name.endswith(".beta"): + name = name[:-5] + ".bias" + + # we are only using BERT for embeddings so we don't need the pooling layer + if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): + return None + + if name.startswith("cls.predictions"): + return None + + if name.startswith("cls.seq_relationship"): + return None + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if self.cls_out_labels: + # For BertForSequenceClassification (direct projection layer) + if name == "classifier.weight": + name = "classifier.out_proj.weight" + + if name == "classifier.bias": + name = "classifier.out_proj.bias" + + yield from super().modify_tensors(data_torch, name, bid) + + def _xlmroberta_tokenizer_init(self) -> None: + # we need the pad_token_id to know how to chop down position_embd matrix + if (pad_token_id := self.hparams.get("pad_token_id")) is not None: + self._position_offset = 1 + pad_token_id + if "max_position_embeddings" in self.hparams: + self.hparams["max_position_embeddings"] -= self._position_offset + else: + self._position_offset = None + + def _xlmroberta_set_vocab(self) -> None: + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'sentencepiece.bpe.model' + + tokenizer_json = {} + tokenizer_config_json = {} + if not tokenizer_path.is_file(): + tokenizer_path = self.dir_model / 'tokenizer.json' + tokenizer_config_path = self.dir_model / 'tokenizer_config.json' + + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + from base64 import b64decode + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + + with open(tokenizer_path, "r", encoding="utf-8") as fp: + tokenizer_json = json.load(fp) + + if tokenizer_config_path.is_file(): + with open(tokenizer_config_path, "r", encoding="utf-8") as fp: + tokenizer_config_json = json.load(fp) + + add_prefix = tokenizer.add_prefix_space # ty: ignore[unresolved-attribute] + remove_whitespaces = tokenizer.clean_up_tokenization_spaces # ty: ignore[unresolved-attribute] + precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"]) + + vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size) # ty: ignore[unresolved-attribute] + else: + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces + precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + if isinstance(tokenizer, SentencePieceProcessor): + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + else: + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + unk_token = tokenizer_config_json.get("unk_token") + unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3)) # ty: ignore[no-matching-overload] + + for token_id in range(tokenizer.vocab_size): # ty: ignore[unresolved-attribute] + piece = tokenizer._convert_id_to_token(token_id) # ty: ignore[unresolved-attribute] + if (piece := tokenizer._convert_id_to_token(token_id)) is not None: # ty: ignore[unresolved-attribute] + text = piece.encode("utf-8") + score = tokenizer_json["model"]["vocab"][token_id][1] + + toktype = SentencePieceTokenTypes.NORMAL + if token_id == unk_token_id: + toktype = SentencePieceTokenTypes.UNKNOWN + elif token_id in tokenizer.all_special_ids: # ty: ignore[unresolved-attribute] + toktype = SentencePieceTokenTypes.CONTROL + elif token_id in added_vocab.values(): + toktype = SentencePieceTokenTypes.USER_DEFINED + # No reliable way to detect this, but jina doesn't have any + # elif tokenizer.IsByte(token_id): + # toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + if isinstance(tokenizer, SentencePieceProcessor): + # realign tokens (see HF tokenizer code) + tokens = [b'', b'', b'', b''] + tokens[3:-1] + scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1] + toktypes = [ + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.UNKNOWN, + ] + toktypes[3:-1] + + if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE: + # Add mask token missing from sentencepiece.bpe.model + tokens[250001] = b'' + scores[250001] = 0.0 + toktypes[250001] = SentencePieceTokenTypes.CONTROL + + self.gguf_writer.add_tokenizer_model("t5") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) + if precompiled_charsmap: + self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + +@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification") +class DistilBertModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def set_gguf_parameters(self): + self.gguf_writer.add_layer_norm_eps(1e-12) + logger.info("gguf: layer norm epsilon = 1e-12") + super().set_gguf_parameters() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("distilbert."): + name = name[11:] + + # These layers act as MLM head, so we don't need them + if name.startswith("vocab_"): + return None + + return super().filter_tensors((name, gen)) + + +@ModelBase.register("RobertaModel", "RobertaForSequenceClassification") +class RobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # we need the pad_token_id to know how to chop down position_embd matrix + if (pad_token_id := self.hparams.get("pad_token_id")) is not None: + self._position_offset = 1 + pad_token_id + if "max_position_embeddings" in self.hparams: + self.hparams["max_position_embeddings"] -= self._position_offset + else: + self._position_offset = None + + def set_vocab(self): + """Support BPE tokenizers for roberta models""" + bpe_tok_path = self.dir_model / "tokenizer.json" + if bpe_tok_path.exists(): + self._set_vocab_gpt2() + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + + else: + return super().set_vocab() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # if name starts with "roberta.", remove the prefix + # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor + if name == "embeddings.position_embeddings.weight": + if self._position_offset is not None: + data_torch = data_torch[self._position_offset:,:] + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("NomicBertModel") +class NomicBertModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): + hparams = kwargs.pop("hparams", None) + if hparams is None: + hparams = ModelBase.load_hparams(dir_model, False) + + self.is_moe = bool(hparams.get("moe_every_n_layers")) + self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT + + super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) + + self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta() + if self._tokenizer_is_xlmroberta: + self._xlmroberta_tokenizer_init() + + npos, mtp = self.hparams["n_positions"], self.hparams.get("max_trained_positions", 2048) + if npos == 8192 and mtp == 2048: + self.hparams["n_positions"] = 2048 # nomic-embed-text v1 and v1.5 are trained for 2048 tokens. + elif npos == 2048 and mtp == 2048: + self.hparams["n_positions"] = 512 # nomic-embed-text-v2-moe is trained for 512 tokens. + else: + raise ValueError(f"unrecognized parameters: n_positions={npos}, max_trained_positions={mtp}") + + assert self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu" + + # this doesn't do anything in the HF version + assert self.hparams["causal"] is False + # no bias tensors unless MoE + assert self.hparams["qkv_proj_bias"] == self.is_moe + assert self.hparams["mlp_fc1_bias"] == self.is_moe + assert self.hparams["mlp_fc2_bias"] == self.is_moe + + # norm at end of layer + assert self.hparams["prenorm"] is False + # standard RoPE + assert self.hparams["rotary_emb_fraction"] == 1.0 + assert self.hparams["rotary_emb_interleaved"] is False + assert self.hparams["rotary_emb_scale_base"] is None + + def set_vocab(self) -> None: + if self._tokenizer_is_xlmroberta: + return self._xlmroberta_set_vocab() + return super().set_vocab() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # If the tensor is an experts bias tensor, skip it. + if "mlp.experts.bias" in name: + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + if "mlp.experts.mlp.w1" in name: + data_torch = data_torch.view(n_experts, self.hparams["n_inner"], self.hparams["n_embd"]) + name += ".weight" + + if "mlp.experts.mlp.w2" in name: + data_torch = data_torch.view(n_experts, self.hparams["n_inner"], self.hparams["n_embd"]) + data_torch = data_torch.transpose(1, 2) + name += ".weight" + + yield from super().modify_tensors(data_torch, name, bid) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if self.is_moe: + self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"]) + self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"]) + + def _is_tokenizer_xlmroberta(self) -> bool: + with open(self.dir_model / "tokenizer.json") as f: + tokenizer_json = json.load(f) + toktyp = tokenizer_json["model"]["type"] + if toktyp == "Unigram": + return True + if toktyp == "WordPiece": + return False + raise ValueError(f"unknown tokenizer: {toktyp}") + + +@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification") +class NeoBert(BertModel): + model_arch = gguf.MODEL_ARCH.NEO_BERT + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # NeoBERT uses 2/3 of the intermediate size as feed forward length + self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3)) + self.gguf_writer.add_rope_freq_base(10000.0) # default value for NeoBERT + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + f_rms_eps = self.hparams.get("norm_eps", 1e-6) # default value for NeoBERT + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") + + self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("decoder."): + return None + + if name.startswith("model."): + name = name[6:] + + return super().filter_tensors((name, gen)) + + +@ModelBase.register("EuroBertModel", "JinaEmbeddingsV5Model") +class EuroBertModel(TextModel): + model_arch = gguf.MODEL_ARCH.EUROBERT + + def set_vocab(self): + self.gguf_writer.add_add_bos_token(False) + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # EuroBert is bidirectional (encoder) + self.gguf_writer.add_causal_attention(False) + + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + self._try_set_pooling_type() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("model."): + name = name[6:] + + return super().filter_tensors((name, gen)) + + +@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") +class XLMRobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + _lora_files = {} + _lora_names = [] + + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): + hparams = kwargs.pop("hparams", None) + if hparams is None: + hparams = ModelBase.load_hparams(dir_model, False) + + if lora_names := hparams.get("lora_adaptations"): + self._lora_names = lora_names + self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3 + + super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) + self._xlmroberta_tokenizer_init() + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if self._lora_names: + for name in self._lora_names: + fname = self.add_prefix_to_filename(self.fname_out, f"lora-{name}-") + self._lora_files[name] = gguf.GGUFWriter(fname, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, dry_run=self.dry_run) + + return super().generate_extra_tensors() + + def set_type(self): + for lora_writer in self._lora_files.values(): + lora_writer.add_type(gguf.GGUFType.ADAPTER) + lora_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") + super().set_type() + + def set_vocab(self): + self._xlmroberta_set_vocab() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # if name starts with "roberta.", remove the prefix + # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + + # jina-embeddings-v3 + if ".parametrizations." in name: + name = name.replace(".parametrizations.", ".") + if name.endswith(".original"): + name = name[:-9] + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor + if name == "embeddings.position_embeddings.weight": + if self._position_offset is not None: + data_torch = data_torch[self._position_offset:,:] + + if name.endswith(".0.lora_A") or name.endswith(".0.lora_B"): + if name.startswith("pooler.dense"): + return + + num_loras = data_torch.size(0) + assert num_loras == len(self._lora_names) + + # Split out each LoRA in their own GGUF + for i, lora_writer in enumerate(self._lora_files.values()): + new_name = self.map_tensor_name(name[:-9]) + name[-7:].lower() + data = data_torch[i, :, :] + # Transpose/flip token_embd/types into correct shape + if new_name == "token_embd.weight.lora_b": + data = data.T + elif new_name.startswith("token_types.weight."): + new_name = new_name[:-1] + ("a" if new_name[-1:] == "b" else "b") + lora_writer.add_tensor(new_name, data.float().numpy(), raw_dtype=gguf.GGMLQuantizationType.F32) + + return + + yield from super().modify_tensors(data_torch, name, bid) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # jina-embeddings-v3 + lora_alpha = self.hparams.get("lora_alpha") + if lora_prompt_prefixes := self.hparams.get("task_instructions"): + assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys()) + for lora_name, lora_writer in self._lora_files.items(): + lora_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, lora_alpha if lora_alpha is not None else 1.0) + lora_writer.add_string(gguf.Keys.Adapter.LORA_TASK_NAME, lora_name) + if lora_prompt_prefixes: + lora_writer.add_string(gguf.Keys.Adapter.LORA_PROMPT_PREFIX, lora_prompt_prefixes[lora_name]) + + def write(self): + super().write() + for lora_writer in self._lora_files.values(): + lora_writer.write_header_to_file() + lora_writer.write_kv_data_to_file() + lora_writer.write_tensors_to_file(progress=True) + lora_writer.close() + + +@ModelBase.register("JinaBertModel", "JinaBertForMaskedLM") +class JinaBertV2Model(BertModel): + model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 + + def set_vocab(self): + tokenizer_class = 'BertTokenizer' + with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: + tokenizer_class = json.load(f)['tokenizer_class'] + + if tokenizer_class == 'BertTokenizer': + super().set_vocab() + elif tokenizer_class == 'RobertaTokenizer': + self._set_vocab_gpt2() + self.gguf_writer.add_token_type_count(2) + else: + raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') + + +@ModelBase.register("ModernBertModel", "ModernBertForMaskedLM", "ModernBertForSequenceClassification") +class ModernBertModel(BertModel): + model_arch = gguf.MODEL_ARCH.MODERN_BERT + + def set_vocab(self): + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + self.gguf_writer.add_add_sep_token(True) + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_sliding_window(self.hparams["local_attention"]) + if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None: + self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("model."): + name = name[6:] + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if self.cls_out_labels: + # For BertForSequenceClassification (direct projection layer) + if name == "classifier.weight": + name = "classifier.out_proj.weight" + + if name == "classifier.bias": + name = "classifier.out_proj.bias" + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/bitnet.py b/conversion/bitnet.py new file mode 100644 index 00000000000..a66446abee2 --- /dev/null +++ b/conversion/bitnet.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("BitnetForCausalLM") +class BitnetModel(TextModel): + model_arch = gguf.MODEL_ARCH.BITNET + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(1.0) + + def weight_quant(self, weight: Tensor) -> Tensor: + dtype = weight.dtype + weight = weight.float() + scale = weight.abs().mean().clamp(min=1e-5) + iscale = 1 / scale + # TODO: multiply by the scale directly instead of inverting it twice + # (this is also unnecessarily doubly inverted upstream) + # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10 + result = (weight * iscale).round().clamp(-1, 1) / iscale + return result.type(dtype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + new_name = self.map_tensor_name(name) + + if any(self.match_model_tensor_name(new_name, key, bid) for key in [ + gguf.MODEL_TENSOR.ATTN_Q, + gguf.MODEL_TENSOR.ATTN_K, + gguf.MODEL_TENSOR.ATTN_V, + gguf.MODEL_TENSOR.ATTN_OUT, + gguf.MODEL_TENSOR.FFN_UP, + gguf.MODEL_TENSOR.FFN_DOWN, + gguf.MODEL_TENSOR.FFN_GATE, + ]): + # transform weight into 1/0/-1 (in fp32) + data_torch = self.weight_quant(data_torch) + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/bloom.py b/conversion/bloom.py new file mode 100644 index 00000000000..d98edf6d500 --- /dev/null +++ b/conversion/bloom.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +import re + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("BloomForCausalLM", "BloomModel") +class BloomModel(TextModel): + model_arch = gguf.MODEL_ARCH.BLOOM + + def set_gguf_parameters(self): + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + assert n_head is not None + assert n_embed is not None + self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) + self.gguf_writer.add_embedding_length(n_embed) + self.gguf_writer.add_feed_forward_length(4 * n_embed) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + assert n_head is not None + assert n_embed is not None + + name = re.sub(r'transformer\.', '', name) + + if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) + data_torch = torch.cat( + ( + qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.weight") + elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): + qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) + data_torch = torch.cat( + ( + qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.bias") + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/chameleon.py b/conversion/chameleon.py new file mode 100644 index 00000000000..a996bfa53cf --- /dev/null +++ b/conversion/chameleon.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + +from .llama import LlamaModel + + +@ModelBase.register("ChameleonForConditionalGeneration") +@ModelBase.register("ChameleonForCausalLM") # obsolete +class ChameleonModel(TextModel): + model_arch = gguf.MODEL_ARCH.CHAMELEON + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False)) + + def set_vocab(self): + self._set_vocab_gpt2() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # ignore image tokenizer for now + # TODO: image support for Chameleon + if name.startswith("model.vqmodel"): + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + hidden_dim = self.hparams.get("hidden_size") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + if name.endswith(("q_norm.weight", "q_norm.bias")): + data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim) + if name.endswith(("k_norm.weight", "k_norm.bias")): + data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim) + + yield from super().modify_tensors(data_torch, name, bid) + + # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203 + @staticmethod + def _reverse_hf_permute(data_torch, n_heads, hidden_dim): + head_dim = hidden_dim // n_heads + data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1) + data_torch = data_torch.repeat_interleave(n_heads, 0) + return data_torch diff --git a/conversion/chatglm.py b/conversion/chatglm.py new file mode 100644 index 00000000000..7e323b89004 --- /dev/null +++ b/conversion/chatglm.py @@ -0,0 +1,167 @@ +from __future__ import annotations + +from typing import Callable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, SentencePieceTokenTypes, TextModel, gguf + + +@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") +class ChatGLMModel(TextModel): + model_arch = gguf.MODEL_ARCH.CHATGLM + + def set_vocab_chatglm3(self): + dir_model = self.dir_model + hparams = self.hparams + tokens: list[bytes] = [] + toktypes: list[int] = [] + scores: list[float] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab())) # ty: ignore[unresolved-attribute] + assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute] + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + for token_id in range(vocab_size): + piece = tokenizer._convert_id_to_token(token_id) # ty: ignore[unresolved-attribute] + if token_id == 0: + piece = "" + elif token_id == 1: + piece = "" + elif token_id == 2: + piece = "" + + text = piece.encode("utf-8") # ty: ignore[unresolved-attribute] + score = 0.0 + # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py), + # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size() + if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size(): # ty: ignore[unresolved-attribute, invalid-argument-type] + score = tokenizer.tokenizer.sp_model.get_score(token_id) # ty: ignore[unresolved-attribute] + + if token_id >= tokenizer.tokenizer.sp_model.vocab_size(): # ty: ignore[unresolved-attribute] + if piece in special_tokens: + toktype = SentencePieceTokenTypes.CONTROL + elif len(piece) == 0: # ty: ignore[invalid-argument-type] + text = f"[PAD{token_id}]".encode("utf-8") + toktype = SentencePieceTokenTypes.UNUSED + else: + toktype = SentencePieceTokenTypes.USER_DEFINED + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + continue + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.tokenizer.sp_model.is_unknown(token_id): # ty: ignore[unresolved-attribute] + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.tokenizer.sp_model.is_control(token_id): # ty: ignore[unresolved-attribute] + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.tokenizer.sp_model.is_unused(token_id): # ty: ignore[unresolved-attribute] + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.tokenizer.sp_model.is_byte(token_id): # ty: ignore[unresolved-attribute] + toktype = SentencePieceTokenTypes.BYTE + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + self.gguf_writer.add_tokenizer_model("llama") + # glm3 needs prefix and suffix formatted as: + # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>" + self.gguf_writer.add_tokenizer_pre("chatglm-spm") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + @staticmethod + def token_bytes_to_string(b): + from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode # ty: ignore[unresolved-import] + byte_encoder = bytes_to_unicode() + return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) + + @staticmethod + def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: + parts = [bytes([b]) for b in token] + while True: + min_idx = None + min_rank = None + for i, pair in enumerate(zip(parts[:-1], parts[1:])): + rank = mergeable_ranks.get(pair[0] + pair[1]) + if rank is not None and (min_rank is None or rank < min_rank): + min_idx = i + min_rank = rank + if min_rank is None or (max_rank is not None and min_rank >= max_rank): + break + assert min_idx is not None + parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] + return parts + + def set_vocab(self): + if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""): + self.set_vocab_chatglm3() + return + + dir_model = self.dir_model + hparams = self.hparams + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"]) + assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute] + + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + # only add special tokens when they were not already loaded from config.json + special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] + # this one is usually not in config.json anyway + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + assert n_embed is not None + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + assert n_head is not None + n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head)) + self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) + self.gguf_writer.add_embedding_length(n_embed) + self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed))) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5)) + self.gguf_writer.add_file_type(self.ftype) + if "attention_dim" in self.hparams: + rope_dim = self.hparams["attention_dim"] + else: + rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) + self.gguf_writer.add_add_bos_token(False) + rope_freq = 10000 + if "rope_ratio" in self.hparams: + rope_freq = rope_freq * self.hparams["rope_ratio"] + self.gguf_writer.add_rope_freq_base(rope_freq) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.endswith(".rotary_pos_emb.inv_freq"): + return None + + name = name.removeprefix("transformer.") + + return super().filter_tensors((name, gen)) diff --git a/conversion/codeshell.py b/conversion/codeshell.py new file mode 100644 index 00000000000..8bfc3178d46 --- /dev/null +++ b/conversion/codeshell.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("CodeShellForCausalLM") +class CodeShellModel(TextModel): + model_arch = gguf.MODEL_ARCH.CODESHELL + + def set_gguf_parameters(self): + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_rope_freq_base(10000.0) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(1.0) diff --git a/conversion/cogvlm.py b/conversion/cogvlm.py new file mode 100644 index 00000000000..d92df55d46b --- /dev/null +++ b/conversion/cogvlm.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +from typing import Callable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf + +from .llama import LlamaModel + + +@ModelBase.register("CogVLMForCausalLM") +class CogVLMVisionModel(MmprojModel): + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if not name.startswith("model.vision."): + return None + + return super().filter_tensors(item) + + +@ModelBase.register("CogVLMForCausalLM") +class CogVLMModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.COGVLM diff --git a/conversion/command_r.py b/conversion/command_r.py new file mode 100644 index 00000000000..603288d165c --- /dev/null +++ b/conversion/command_r.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("CohereForCausalLM") +class CommandR2Model(TextModel): + model_arch = gguf.MODEL_ARCH.COMMAND_R + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # max_position_embeddings = 8192 in config.json but model was actually + # trained on 128k context length + # aya-23 models don't have model_max_length specified + self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"]) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + +@ModelBase.register("Cohere2ForCausalLM") +class Cohere2Model(TextModel): + model_arch = gguf.MODEL_ARCH.COHERE2 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + rotary_pct = self.hparams["rotary_pct"] + hidden_size = self.hparams["hidden_size"] + num_attention_heads = self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads))) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Cohere2 runtime in llama.cpp expects no bias tensors; + # the actual weight only contains 0-value tensors as bias, we can skip them + if name.endswith(".bias"): + if torch.any(data_torch != 0): + raise ValueError(f"Bias tensor {name!r} is not zero.") + logger.debug(f"Skipping bias tensor {name!r} for Cohere2 conversion.") + return + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/dbrx.py b/conversion/dbrx.py new file mode 100644 index 00000000000..207ebcb8931 --- /dev/null +++ b/conversion/dbrx.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("DbrxForCausalLM") +class DbrxModel(TextModel): + model_arch = gguf.MODEL_ARCH.DBRX + + def set_gguf_parameters(self): + ffn_config = self.hparams["ffn_config"] + attn_config = self.hparams["attn_config"] + self.gguf_writer.add_block_count(self.block_count) + + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"]) + + self.gguf_writer.add_head_count(self.hparams["n_heads"]) + self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"]) + + self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"]) + + self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"]) + + self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) + self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"]) + + self.gguf_writer.add_layer_norm_eps(1e-5) + + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_expert = self.hparams["ffn_config"]["moe_num_experts"] + n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] + n_embd = self.hparams["d_model"] + + # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose + # original implementation expects (n_expert, n_ff, n_embd) for all experts weights + # But llama.cpp moe graph works differently + # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions + # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor + exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} + "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + experts = False + + for exp_tensor_name in exp_tensor_names.keys(): + if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: + experts = True + data_torch = data_torch.view(n_expert, n_ff, n_embd) + if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None: + data_torch = data_torch.permute(*permute_tensor) + break + + # map tensor names + # In MoE models the ffn tensors are typically most of the model weights, + # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. + # Every other model has the weight names ending in .weight, + # let's assume that is the convention which is not the case for dbrx: + # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 + new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) + + yield from super().modify_tensors(data_torch, new_name, bid) + + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: + del name, new_name, bid # unused + + return n_dims > 1 diff --git a/conversion/deci.py b/conversion/deci.py new file mode 100644 index 00000000000..46d8568c5a4 --- /dev/null +++ b/conversion/deci.py @@ -0,0 +1,184 @@ +from __future__ import annotations + +import math + +from typing import Any, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("DeciLMForCausalLM") +class DeciModel(TextModel): + model_arch = gguf.MODEL_ARCH.DECI + + @staticmethod + def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int: + # DeciLM-specific code + intermediate_size = int(2 * ffn_mult * n_embd / 3) + return DeciModel._find_multiple(intermediate_size, 256) + + @staticmethod + def _find_multiple(n: int, k: int) -> int: + # DeciLM-specific code + if n % k == 0: + return n + return n + k - (n % k) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + _block_configs: list[dict[str,Any]] = self.hparams["block_configs"] + assert self.block_count == len(_block_configs) + self._num_kv_heads = list() + self._num_heads = list() + _ffn_multipliers = list() + # ***linear attention layer*** + # if n_heads_in_group is None and replace_with_linear is True + # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads + # ***attention-free layer*** + # if n_heads_in_group is None and replace_with_linear is False + # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 + # ***normal attention-layer*** + # if n_heads_in_group is not None, then + # _num_kv_heads[il] is num_attention_head // n_heads_in_group and + # _num_heads[il] is num_attention_head + # ***dummy layer*** for nemotron 253B + # if n_heads_in_group is None and ffn_mult is None + # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 and _ffn_dims is 0 + for il in range(len(_block_configs)): + if _block_configs[il]["attention"]["n_heads_in_group"] is None: + if _block_configs[il]["attention"]["replace_with_linear"] is True: + self._num_kv_heads.append(0) + self._num_heads.append(self.hparams["num_attention_heads"]) + else: + self._num_kv_heads.append(0) + self._num_heads.append(0) + else: + self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"]) + self._num_heads.append(self.hparams["num_attention_heads"]) + if _block_configs[il]["ffn"]["ffn_mult"] is None: # dummy layer + _ffn_multipliers.append(0.0) + else: + _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"]) + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_heads) + assert self.block_count == len(_ffn_multipliers) + assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) + assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int) + assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float) + self._ffn_dims: list[int] = [ + DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"]) + for multiplier in _ffn_multipliers + ] + + def set_vocab(self): + # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's + # eos_token from '|eot_id|' to '|end_of_text|' + if self.hparams.get("vocab_size", 128256) == 128256: + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + else: + # DeciLM-7B + self._set_vocab_llama_hf() + + def set_gguf_parameters(self): + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_heads) + assert self.block_count == len(self._ffn_dims) + if (rope_theta := self.rope_parameters.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + self.gguf_writer.add_head_count(self._num_heads) + self.gguf_writer.add_feed_forward_length(self._ffn_dims) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_file_type(self.ftype) + else: # DeciLM-7B + super().set_gguf_parameters() + if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B + self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"] + assert self.block_count == len(self._num_kv_heads) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + if bid is not None: + if "num_key_value_heads_per_layer" in self.hparams: + n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid] + elif "block_configs" in self.hparams: + n_kv_head = self._num_kv_heads[bid] + n_head = self._num_heads[bid] + else: + n_kv_head = self.hparams.get("num_key_value_heads") + else: + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = DeciModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = DeciModel.permute(data_torch, n_head, n_kv_head) + yield from super().modify_tensors(data_torch, name, bid) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): + if rope_params.get("rope_type", '').lower() == "llama3": + base = rope_params.get("rope_theta", 10000.0) + if (dim := self.hparams.get("head_dim")) is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_params.get("factor", 8.0) + low_freq_factor = rope_params.get("low_freq_factor", 1.0) + high_freq_factor = rope_params.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + + def prepare_tensors(self): + super().prepare_tensors() diff --git a/conversion/deepseek.py b/conversion/deepseek.py new file mode 100644 index 00000000000..e149fcbf752 --- /dev/null +++ b/conversion/deepseek.py @@ -0,0 +1,388 @@ +from __future__ import annotations + +import re + +from typing import Any, Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf, logger + +from .qwen import QwenModel + + +@ModelBase.register("DeepseekOCRForCausalLM") +class DeepseekOCRVisionModel(MmprojModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR) + # default values below are taken from HF tranformers code + self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_vision_use_gelu(True) + # calculate proj_scale_factor (used by tinygemma3 test model) + image_seq_length = self.preprocessor_config.get("image_seq_length", 256) + n_per_side = int(image_seq_length ** 0.5) + image_size = self.hparams["image_size"] + patch_size = self.hparams["patch_size"] + proj_scale_factor = (image_size // patch_size) // n_per_side + if proj_scale_factor > 0 and proj_scale_factor != 4: + # we only need to write this if it's not the default value + # in this case, we are converting a test model + self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor) + # @bluebread: there's no window_size in config but just add it here anyway + self.gguf_writer.add_vision_window_size(self.hparams.get("window_size", 14)) + + # SAM configuration + sam_hparams = hparams['sam'] + self.gguf_writer.add_vision_sam_layers_count(sam_hparams['layers']) + self.gguf_writer.add_vision_sam_embedding_length(sam_hparams['width']) + self.gguf_writer.add_vision_sam_head_count(sam_hparams['heads']) + + def get_vision_config(self) -> dict[str, Any]: + vision_config: dict[str, Any] | None = self.global_config.get("vision_config") + + if not vision_config: + raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found") + + vision_config['sam'] = vision_config['width']['sam_vit_b'] + vision_config.update(vision_config['width']['clip-l-14-224']) + vision_config['hidden_size'] = vision_config['width'] + vision_config['num_heads'] = vision_config['heads'] + vision_config['intermediate_size'] = vision_config['heads'] * 4 + + return vision_config + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".embeddings." in name or 'pos_embed' in name: + return gguf.GGMLQuantizationType.F32 + if ".rel_pos_h" in name or '.rel_pos_w' in name: + return gguf.GGMLQuantizationType.F32 + if ".neck." in name or ".net_" in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Only process vision-related tensors, skip language model tensors + # Vision components: sam_model, vision_model, projector, image_newline, view_seperator + # Language model components to skip: lm_head, embed_tokens, layers, norm + if name.startswith(("lm_head.", "model.embed_tokens.", "model.layers.", "model.norm.")): + return None + + if name.endswith("pos_embed") or name.endswith("rel_pos_h") or name.endswith("rel_pos_w"): + name += ".weight" + + return super().filter_tensors((name, gen)) + + +@ModelBase.register("DeepseekForCausalLM") +class DeepseekModel(TextModel): + model_arch = gguf.MODEL_ARCH.DEEPSEEK + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + + self.gguf_writer.add_rope_dimension_count(rope_dim) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_weights_scale(1.0) + self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) + self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) + + _experts: list[dict[str, Tensor]] | None = None + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = DeepseekModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register( + "DeepseekV2ForCausalLM", + "DeepseekV3ForCausalLM", + "KimiVLForConditionalGeneration", + "KimiK25ForConditionalGeneration", + "YoutuForCausalLM", + "YoutuVLForConditionalGeneration", +) +class DeepseekV2Model(TextModel): + model_arch = gguf.MODEL_ARCH.DEEPSEEK2 + + # TODO @ngxson : remove this when we support MTP for deepseek models + skip_mtp = True + + merge_expert = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + hparams: dict = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) + self.origin_hf_arch = hparams.get('architectures', [None])[0] + + # special handling for Deepseek OCR + if self.origin_hf_arch == "DeepseekOCRForCausalLM": + self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR + self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] + self.gguf_writer.add_architecture() + # default jinja template + self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}") + + def set_vocab(self): + try: + self._set_vocab_gpt2() + return + except Exception: + pass + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + tokpre = self.get_vocab_base_pre(tokenizer) + + if tokpre == "kimi-k2": + # Build merges list using the approach similar to HunYuanMoE + merges = [] + vocab = {} + mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute] + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + if len(merged) == 2: + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # Build token list + vocab_size = self.hparams["vocab_size"] + special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute] + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} + tokens: list[str] = [] + toktypes: list[int] = [] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token = reverse_vocab[i] + tokens.append(token) + if i in special_tokens.values(): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_token_merges(merges) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) + else: + raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") + + def set_gguf_parameters(self): + is_ocr = (self.model_arch == gguf.MODEL_ARCH.DEEPSEEK2OCR) + + if is_ocr: + self.hparams['rope_theta'] = self.hparams.get('rope_theta', 10000.0) + else: + # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group) + self.hparams["num_key_value_heads"] = 1 + + self.hparams['rms_norm_eps'] = self.hparams.get('rms_norm_eps', 1e-6) + + super().set_gguf_parameters() + hparams = self.hparams + + # first_k_dense_replace: number of leading layers using dense FFN instead of MoE + # For non-MoE models (like Youtu), set to n_layer to use dense FFN for all layers + # For MoE models (like DeepSeek-V2), this is the number of leading non-MoE layers + has_moe = hparams.get("n_routed_experts") is not None + first_k_dense_replace = hparams.get("first_k_dense_replace") + if first_k_dense_replace is None: + # Default: if no MoE, all layers are dense; if MoE, none are dense + first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0 + self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) + kv_lora_rank = hparams.get("kv_lora_rank", 512) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: + self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) + + # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA + if not is_ocr: + self.gguf_writer.add_kv_lora_rank(kv_lora_rank) + self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(kv_lora_rank) + self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length_mla(hparams["v_head_dim"]) + + # MoE parameters (required by C++ code for DEEPSEEK2 arch) + # For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length + moe_intermediate_size = self.find_hparam(["moe_intermediate_size", "intermediate_size"], optional=False) + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + + if (n_routed_experts := hparams.get("n_routed_experts")) is not None: + self.gguf_writer.add_expert_count(n_routed_experts) + + # expert_shared_count is required by C++ code, default to 0 for non-MoE models + n_shared_experts = hparams.get("n_shared_experts", 0) + self.gguf_writer.add_expert_shared_count(n_shared_experts) + + # When not set, C++ code will use scale_w = false to skip the no-op scaling + if (routed_scaling_factor := hparams.get("routed_scaling_factor")) is not None: + self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) + + if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob: + self.gguf_writer.add_expert_weights_norm(norm_topk_prob) + + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None: + # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul + # ref https://github.com/ggml-org/llama.cpp/pull/17945 + self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # skip lm_head.weight if tie_word_embeddings is True + if self.hparams.get("tie_word_embeddings", False): + if name == "lm_head.weight" or name == "model.lm_head.weight": + logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)") + return + + # skip Multi-Token Prediction (MTP) layers + if self.skip_mtp: + block_count = self.hparams["num_hidden_layers"] + match = re.match(r"model.layers.(\d+)", name) + if match and int(match.group(1)) >= block_count: + return + + # process the experts separately + if self.merge_expert and name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed + if name.endswith("kv_b_proj.weight"): + name_kb = name.replace("kv_b_proj", "k_b_proj") + name_vb = name.replace("kv_b_proj", "v_b_proj") + + n_head_kv = self.hparams["num_key_value_heads"] + v_head_dim = self.hparams["v_head_dim"] + qk_nope_head_dim = self.hparams["qk_nope_head_dim"] + + assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) + + kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) + k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) + k_b = k_b.transpose(1, 2) + + yield from super().modify_tensors(k_b, name_kb, bid) + yield from super().modify_tensors(v_b, name_vb, bid) + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/dots1.py b/conversion/dots1.py new file mode 100644 index 00000000000..7ac299a6e65 --- /dev/null +++ b/conversion/dots1.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, gguf + +from .qwen import Qwen2MoeModel + + +@ModelBase.register("Dots1ForCausalLM") +class Dots1Model(Qwen2MoeModel): + model_arch = gguf.MODEL_ARCH.DOTS1 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.hparams["num_experts"] = self.hparams["n_routed_experts"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"]) + self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"]) + self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + if "shared_experts" in name: + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + else: + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/dotsocr.py b/conversion/dotsocr.py new file mode 100644 index 00000000000..f87f62abde9 --- /dev/null +++ b/conversion/dotsocr.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf + + +@ModelBase.register("DotsOCRForCausalLM") +class DotsOCRVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["image_size"] = 0 # dynamic resolution + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DOTSOCR) + self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"]) + self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"]) + self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["rms_norm_eps"])) + self.gguf_writer.add_vision_projector_scale_factor(self.find_vparam(["spatial_merge_size"])) + self.gguf_writer.add_vision_use_silu(True) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if not name.startswith("vision_tower."): + return None + + if "vision_tower.blocks." in name and ".mlp." in name: + # note: to avoid naming conflicts in tensor_mapping.py, we need to handle FFN renaming here + # x = F.silu(self.fc1(x)) * self.fc3(x) + # x = self.fc2(x) + # fc1 -> gate, fc2 -> down, fc3 -> up + # mapping original names to Qwen2.5 naming scheme + name = name.replace("vision_tower.blocks.", "visual.blocks.") + name = name.replace(".fc1", ".gate_proj") + name = name.replace(".fc2", ".down_proj") + name = name.replace(".fc3", ".up_proj") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/dream.py b/conversion/dream.py new file mode 100644 index 00000000000..459e8d46afb --- /dev/null +++ b/conversion/dream.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("DreamModel") +class DreamModel(TextModel): + model_arch = gguf.MODEL_ARCH.DREAM + + def get_vocab_base(self) -> tuple[list[str], list[int], str]: + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + + vocab_dict = tokenizer.get_vocab() # ty: ignore[unresolved-attribute] + vocab_size = self.hparams.get("vocab_size", len(vocab_dict)) + assert max(vocab_dict.values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()} + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + # Check if it's a special token - treat special tokens as CONTROL tokens + if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder: + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|> + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + return tokens, toktypes, tokpre + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self._try_set_pooling_type() + + # Dream models use non-causal attention for diffusion + self.gguf_writer.add_causal_attention(False) + + # Add Dream-specific parameters + mask_token_id = self.hparams.get("mask_token_id") + if mask_token_id is not None: + self.gguf_writer.add_mask_token_id(mask_token_id) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Dream model tensors should be mapped directly since it's the base model + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/ernie.py b/conversion/ernie.py new file mode 100644 index 00000000000..aa8a3bc8ee5 --- /dev/null +++ b/conversion/ernie.py @@ -0,0 +1,200 @@ +from __future__ import annotations + +import json +import math +import re + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf + + +@ModelBase.register("Ernie4_5_ForCausalLM", "Ernie4_5ForCausalLM") +class Ernie4_5Model(TextModel): + model_arch = gguf.MODEL_ARCH.ERNIE4_5 + + def set_vocab(self): + self._set_vocab_sentencepiece() + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if "ernie." in name: + name = name.replace("ernie.", "model.") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + num_heads = self.hparams["num_attention_heads"] + num_kv_heads = self.hparams["num_key_value_heads"] + if (head_dim := self.hparams.get("head_dim")) is None: + head_dim = self.hparams["hidden_size"] // num_heads + + # split the qkv weights + # qkv_proj shape: [(num_heads + 2 * num_kv_heads) * head_dim, hidden_size] + if "qkv_proj" in name: + name_q = name.replace("qkv_proj.weight", "q_proj.weight") + name_k = name.replace("qkv_proj.weight", "k_proj.weight") + name_v = name.replace("qkv_proj.weight", "v_proj.weight") + total_q_dim = num_heads * head_dim + total_k_dim = num_kv_heads * head_dim + total_v_dim = num_kv_heads * head_dim + q_proj_weight, k_proj_weight, v_proj_weight = data_torch.split([total_q_dim, total_k_dim, total_v_dim], dim=0) + yield from super().modify_tensors(q_proj_weight, name_q, bid) + yield from super().modify_tensors(k_proj_weight, name_k, bid) + yield from super().modify_tensors(v_proj_weight, name_v, bid) + # split the up_gate_proj into gate and up + # up_gate_proj shape: [2 * intermediate_size, hidden_size] + elif "up_gate_proj" in name: + name_up = name.replace("up_gate_proj.weight", "up_proj.weight") + name_gate = name.replace("up_gate_proj.weight", "gate_proj.weight") + dim_half = data_torch.shape[0] // 2 + gate_proj_weight, up_proj_weight = data_torch.split(dim_half, dim=0) + yield from super().modify_tensors(gate_proj_weight, name_gate, bid) + yield from super().modify_tensors(up_proj_weight, name_up, bid) + else: + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Ernie4_5_MoeForCausalLM") +class Ernie4_5MoeModel(Ernie4_5Model): + model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE + _experts: list[dict[str, Tensor]] | None = None + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._experts = [{} for _ in range(self.block_count)] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"]) + self.gguf_writer.add_expert_used_count(self.hparams["moe_k"]) + self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"]) + self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"]) + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + if (shared_expert_count := self.hparams.get('moe_num_shared_experts')) is not None: + self.gguf_writer.add_expert_shared_count(shared_expert_count) + if shared_expert_count > 0 and (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None: + self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # skip Multi-Token Prediction (MTP) layers (again, same as DeepseekV2) + match = re.match(r"model.mtp_block.(\d+)", name) + if match: + return None + + # skip all other MTP tensors for now + match = re.match(r"model.mtp_emb_norm.(\d+)", name) + if match: + return None + + match = re.match(r"model.mtp_hidden_norm.(\d+)", name) + if match: + return None + + match = re.match(r"model.mtp_linear_proj.(\d+)", name) + if match: + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.hparams["moe_num_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["gate_proj", "up_proj", "down_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename_to_retrieve]) + del self._experts[bid][ename_to_retrieve] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + yield from super().modify_tensors(data_torch, merged_name, bid) + else: + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("PaddleOCRVLForConditionalGeneration") +class PaddleOCRModel(Ernie4_5Model): + model_arch = gguf.MODEL_ARCH.PADDLEOCR + + +@ModelBase.register("PaddleOCRVisionModel") +class PaddleOCRVisionModel(MmprojModel): + # PaddleOCR-VL uses a modified version of Siglip + min_pixels: int = 0 + max_pixels: int = 0 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.min_pixels = self.preprocessor_config["min_pixels"] + self.max_pixels = self.preprocessor_config["max_pixels"] + self.hparams_vision["image_size"] = int(math.sqrt(self.max_pixels)) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + hparams = self.hparams_vision + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PADDLEOCR) + self.gguf_writer.add_vision_max_pixels(self.max_pixels) + self.gguf_writer.add_vision_min_pixels(self.min_pixels) + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-6)) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if "vision_model" not in name and "mlp_AR" not in name: + return None + name = name.replace("visual.", "model.") + if "packing_position_embedding" in name: + # unused + return None + if "vision_model.head" in name: + # we don't yet support image embeddings for this model + return None + + return super().filter_tensors((name, gen)) diff --git a/conversion/exaone.py b/conversion/exaone.py new file mode 100644 index 00000000000..aa1313e2f80 --- /dev/null +++ b/conversion/exaone.py @@ -0,0 +1,210 @@ +from __future__ import annotations + +import math + +from pathlib import Path +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("ExaoneForCausalLM") +class ExaoneModel(TextModel): + model_arch = gguf.MODEL_ARCH.EXAONE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + + assert (hparams["activation_function"] == "silu") + + rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True) + rotary_factor = rotary_factor if rotary_factor is not None else 1.0 + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): + if rope_params.get("rope_type", '').lower() == "llama3": + base = self.rope_parameters.get("rope_theta", 10000.0) + if (dim := self.hparams.get("head_dim")) is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_params.get("factor", 8.0) + low_freq_factor = rope_params.get("low_freq_factor", 1.0) + high_freq_factor = rope_params.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + + +@ModelBase.register("Exaone4ForCausalLM") +class Exaone4Model(TextModel): + model_arch = gguf.MODEL_ARCH.EXAONE4 + + def set_vocab(self): + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if hparams.get("sliding_window") is not None: + self.gguf_writer.add_sliding_window(hparams["sliding_window"]) + if "layer_types" in hparams: + self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]]) + elif "sliding_window_pattern" in hparams: + sliding_window_pattern = [] + if isinstance(hparams["sliding_window_pattern"], str): # e.g. LLLG + for i in range(hparams["num_hidden_layers"]): + sliding_window_pattern.append(hparams["sliding_window_pattern"][i % len(hparams["sliding_window_pattern"])] == "L") + if isinstance(hparams["sliding_window_pattern"], int): # e.g. 4 + for i in range(hparams["num_hidden_layers"]): + sliding_window_pattern.append((i + 1) % hparams["sliding_window_pattern"] != 0) + if len(sliding_window_pattern) == hparams["num_hidden_layers"]: + self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): + if rope_params.get("rope_type", '').lower() == "llama3": + base = rope_params.get("rope_theta", 10_000.0) + if (dim := self.hparams.get("head_dim")) is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_params.get("factor", 16.0) + low_freq_factor = rope_params.get("low_freq_factor", 1.0) + high_freq_factor = rope_params.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + + +@ModelBase.register("ExaoneMoEForCausalLM") +class ExaoneMoEModel(Exaone4Model): + model_arch = gguf.MODEL_ARCH.EXAONE_MOE + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + moe_intermediate_size = self.hparams["moe_intermediate_size"] + num_shared_experts = self.hparams["num_shared_experts"] + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + self.gguf_writer.add_expert_shared_count(num_shared_experts) + self.gguf_writer.add_expert_shared_feed_forward_length(moe_intermediate_size * num_shared_experts) + self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) + n_dense_layer = self.hparams.get("first_k_dense_replace", self.hparams.get("first_last_k_dense_replace", 0)) + self.gguf_writer.add_leading_dense_block_count(n_dense_layer) + self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 0)) + + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("mtp."): + if name.find("layers.") != -1: + # `mtp.layers.0.[module_name]` format + name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + self.hparams['num_hidden_layers']}") + else: + # mtp fc/norm weights + remapper = { + "mtp.fc": "model.layers.{bid}.eh_proj", + "mtp.pre_fc_norm_embedding": "model.layers.{bid}.enorm", + "mtp.pre_fc_norm_hidden": "model.layers.{bid}.hnorm", + "mtp.norm": "model.layers.{bid}.shared_head.norm", + } + _n = Path(name) + new_name = remapper[_n.stem] + _n.suffix + + # set shared weights for all NextN/MTP layers + for bid in range(self.hparams['num_hidden_layers'], self.block_count): + yield from super().modify_tensors(data_torch, new_name.format(bid=bid), bid) + return + + if name.find("mlp.experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + yield from super().modify_tensors(data_torch, new_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/falcon.py b/conversion/falcon.py new file mode 100644 index 00000000000..085fd4cd33f --- /dev/null +++ b/conversion/falcon.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("FalconForCausalLM", "RWForCausalLM") +class FalconModel(TextModel): + model_arch = gguf.MODEL_ARCH.FALCON + + def set_gguf_parameters(self): + n_head = self.hparams.get("num_attention_heads") + if n_head is None: + n_head = self.hparams["n_head"] # old name + + n_head_kv = self.hparams.get("num_kv_heads") + if n_head_kv is None: + n_head_kv = self.hparams.get("n_head_kv", 1) # old name + + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # QKV tensor transform + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key + # and one value weight (shared by all query heads in the kv group). + # This layout makes it a big pain to work with in GGML. + # So we rearrange them here,, so that we have n_head query weights + # followed by n_head_kv key weights followed by n_head_kv value weights, + # in contiguous fashion. + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py + + if "query_key_value" in name: + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1 + head_dim = self.hparams["hidden_size"] // n_head + + qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) + q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) + k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) + v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) + data_torch = torch.cat((q, k, v)).reshape_as(data_torch) + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/falcon_h1.py b/conversion/falcon_h1.py new file mode 100644 index 00000000000..a8bc880b2c4 --- /dev/null +++ b/conversion/falcon_h1.py @@ -0,0 +1,118 @@ +from __future__ import annotations + +from typing import Any, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, gguf + +from .llama import LlamaModel +from .mamba import Mamba2Model + + +@ModelBase.register("FalconH1ForCausalLM") +class FalconH1Model(Mamba2Model): + model_arch = gguf.MODEL_ARCH.FALCON_H1 + + def __init__(self, *args, **kwargs): + # Set the hparam prefixes for Falcon Mamba2 + self.hparam_prefixes = ["mamba"] + + # Initialize the base Mamba2Model + super().__init__(*args, **kwargs) + + # Use Llama conversion for attention + self._transformer_model_class = LlamaModel + + # n_group and d_inner are used during reshape_tensors for mamba2 + self.n_group = self.find_hparam(["n_groups"]) + self.d_inner = self.find_hparam(["mamba_d_ssm"]) + self.d_head = self.find_hparam(["d_head"]) + + # Initialize any Falcon Mamba2 specific attributes + self.has_attention = True # Falcon Mamba2 has attention components + + # Load Falcon-H1 multipliers from hyperparameters + self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True) + self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True) + self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True) + self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True) + self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True) + self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True) + self.intermediate_size = self.find_hparam(["intermediate_size"]) + self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True) + + def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any: + prefixed = [] + for pfx in self.hparam_prefixes: + prefixed.extend( + "_".join([pfx, k]) + for k in keys + ) + keys = list(keys) + prefixed + return super().find_hparam(keys, *args, **kwargs) + + def set_vocab(self): + self._set_vocab_gpt2() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + tensors = list(super().modify_tensors(data_torch, name, bid)) + tensor = tensors[0][1] + + if "down_proj" in name: + tensor = tensor * self.mlp_multipliers[1] + elif "gate_proj" in name: + tensor = tensor * self.mlp_multipliers[0] + elif "k_proj" in name: + tensor = tensor * self.key_multiplier * self.attention_in_multiplier + elif "q_proj" in name: + tensor = tensor * self.attention_in_multiplier + elif "v_proj" in name: + tensor = tensor * self.attention_in_multiplier + elif "o_proj" in name: + tensor = tensor * self.attention_out_multiplier + elif "out_proj" in name: + tensor = tensor * self.ssm_out_multiplier + elif "in_proj" in name: + tensor = tensor * self.ssm_in_multiplier + zxbcdt_multipliers = self.hparams["ssm_multipliers"] + intermediate_size = self.hparams["mamba_d_ssm"] + groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"] + tensor[:intermediate_size, :] *= zxbcdt_multipliers[0] + tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1] + tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2] + tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3] + tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4] + elif "lm_head" in name: + tensor = tensor * self.hparams["lm_head_multiplier"] + elif "embed_tokens" in name: + tensor = tensor * self.hparams["embedding_multiplier"] + elif "mamba.norm" in name: + tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group) + + tensors = [(tensors[0][0], tensor)] + return tensors + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + ## General Params ## + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + # Override some Mamba2 defaults + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0)) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + + ## Attention params ## + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) # Override value 0 from Mamba2 + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + self.gguf_writer.add_key_length(self.hparams["head_dim"]) + self.gguf_writer.add_value_length(self.hparams["head_dim"]) + + ## Validation ## + assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" + assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}" + + # Add any other Falcon Mamba2 specific configuration + self.gguf_writer.add_rope_freq_base(self.rope_parameters["rope_theta"]) diff --git a/conversion/gemma.py b/conversion/gemma.py new file mode 100644 index 00000000000..a6e14fbcb98 --- /dev/null +++ b/conversion/gemma.py @@ -0,0 +1,840 @@ +from __future__ import annotations + +import json +import re + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf, logger + + +@ModelBase.register("GemmaForCausalLM") +class GemmaModel(TextModel): + model_arch = gguf.MODEL_ARCH.GEMMA + + def set_vocab(self): + self._set_vocab_sentencepiece() + + # TODO: these special tokens should be exported only for the CodeGemma family + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) + special_vocab._set_special_token("prefix", 67) + special_vocab._set_special_token("suffix", 69) + special_vocab._set_special_token("middle", 68) + special_vocab._set_special_token("fsep", 70) + special_vocab._set_special_token("eot", 107) + special_vocab.chat_template = None # do not add it twice + special_vocab.add_to_gguf(self.gguf_writer) + + self.gguf_writer.add_add_space_prefix(False) + + def set_gguf_parameters(self): + hparams = self.hparams + + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(hparams["head_dim"]) + self.gguf_writer.add_value_length(hparams["head_dim"]) + self.gguf_writer.add_file_type(self.ftype) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. + if name == "lm_head.weight": + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Gemma2ForCausalLM") +class Gemma2Model(TextModel): + model_arch = gguf.MODEL_ARCH.GEMMA2 + + def set_vocab(self): + self._set_vocab_sentencepiece() + + self.gguf_writer.add_add_space_prefix(False) + + def set_gguf_parameters(self): + hparams = self.hparams + + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(hparams["head_dim"]) + self.gguf_writer.add_value_length(hparams["head_dim"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_attn_logit_softcapping( + self.hparams["attn_logit_softcapping"] + ) + self.gguf_writer.add_final_logit_softcapping( + self.hparams["final_logit_softcapping"] + ) + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. + if name == "lm_head.weight": + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") +class Gemma3Model(TextModel): + model_arch = gguf.MODEL_ARCH.GEMMA3 + + def norm_shift(self, name: str) -> float: + return 1.0 if name.endswith("norm.weight") else 0.0 # Gemma3RMSNorm adds 1.0 to the norm value + + def set_vocab(self): + if (self.dir_model / "tokenizer.model").is_file(): + self._set_vocab_sentencepiece() + self.gguf_writer.add_add_space_prefix(False) + else: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + + # some default values are not specified in the hparams + self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072)) + self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8)) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6)) + self.gguf_writer.add_key_length(hparams.get("head_dim", 256)) + self.gguf_writer.add_value_length(hparams.get("head_dim", 256)) + self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters).get("rope_theta", 1_000_000.0)) # for global layers + # attn_logit_softcapping is removed in Gemma3 + assert hparams.get("attn_logit_softcapping") is None + if (final_logit_softcap := hparams.get("final_logit_softcapping")): + self.gguf_writer.add_final_logit_softcapping(final_logit_softcap) + if hparams.get("sliding_window_pattern") != 1: + self.gguf_writer.add_sliding_window(hparams["sliding_window"]) + self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # remove OOV (out-of-vocabulary) rows in token_embd + if "embed_tokens.weight" in name: + n_vocab_real = -1 + if (self.dir_model / "tokenizer.model").is_file(): + tokens = self._create_vocab_sentencepiece()[0] + n_vocab_real = len(tokens) + else: + with open(self.dir_model / "tokenizer.json", "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + n_vocab_real = len(tokenizer_json["model"]["vocab"]) + len(tokenizer_json["added_tokens"]) + data_torch = data_torch[:n_vocab_real] + + # ref code in Gemma3RMSNorm + # output = output * (1.0 + self.weight.float()) + # note: this is not the case on gemma3n + f_shift = self.norm_shift(name) + if f_shift != 0.0: + data_torch = data_torch + f_shift + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Gemma3TextModel") +class EmbeddingGemma(Gemma3Model): + model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING + module_paths = [] + dense_features_dims = {} + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.sentence_transformers_dense_modules: + # read modules.json to determine if model has Dense layers + modules_file = self.dir_model / "modules.json" + if modules_file.is_file(): + with open(modules_file, encoding="utf-8") as modules_json_file: + mods = json.load(modules_json_file) + for mod in mods: + if mod["type"].endswith("Dense"): + mod_path = mod["path"] + # check if model.safetensors file for Dense layer exists + model_tensors_file = self.dir_model / mod_path / "model.safetensors" + if model_tensors_file.is_file(): + self.module_paths.append(mod_path) + # read config.json of the Dense layer to get in/out features + mod_conf_file = self.dir_model / mod_path / "config.json" + if mod_conf_file.is_file(): + with open(mod_conf_file, encoding="utf-8") as mod_conf_json_file: + mod_conf = json.load(mod_conf_json_file) + # hparams dense_2_feat_out and dense_3_feat_in are required when loading model's dense weights + prefix = self._get_dense_prefix(mod_path) + if mod_conf["in_features"] is not None and mod_conf["out_features"] is not None: + self.dense_features_dims[prefix] = (mod_conf["in_features"], mod_conf["out_features"]) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + from safetensors.torch import load_file + module_paths = list(self.module_paths) + for i, module_path in enumerate(module_paths): + tensors_file = self.dir_model / module_path / "model.safetensors" + local_tensors = load_file(tensors_file) + tensor_name = self._get_dense_prefix(module_path) + for name, local_tensor in local_tensors.items(): + if not name.endswith(".weight"): + continue + orig_name = name.replace("linear", tensor_name) + name = self.map_tensor_name(orig_name) + yield name, local_tensor.clone() + + @staticmethod + def _get_dense_prefix(module_path) -> str: + """Get the tensor name prefix for the Dense layer from module path.""" + tensor_name = "dense_2" if module_path == "2_Dense" else "dense_3" + return tensor_name + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # Override the sliding window size as it gets adjusted by the Gemma3TextConfig + # constructor. We want to use the value from the original model's config.json. + # ref: https://github.com/huggingface/transformers/pull/40700 + with open(self.dir_model / "config.json", "r", encoding="utf-8") as f: + config = json.load(f) + orig_sliding_window = config.get("sliding_window") + if orig_sliding_window is None: + raise ValueError("sliding_window not found in model config - this is required for the model") + + logger.info(f"Using original sliding_window from config: {orig_sliding_window} " + f"instead of {self.hparams['sliding_window']}") + self.gguf_writer.add_sliding_window(orig_sliding_window) + if self.sentence_transformers_dense_modules: + for dense, dims in self.dense_features_dims.items(): + logger.info(f"Setting dense layer {dense} in/out features to {dims}") + self.gguf_writer.add_dense_features_dims(dense, dims[0], dims[1]) + + self._try_set_pooling_type() + + +@ModelBase.register("Gemma3ForConditionalGeneration") +class Gemma3VisionModel(MmprojModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3) + # default values below are taken from HF transformers code + self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_vision_use_gelu(True) + # calculate proj_scale_factor (used by tinygemma3 test model) + image_seq_length = self.preprocessor_config.get("image_seq_length", 256) + n_per_side = int(image_seq_length ** 0.5) + image_size = self.hparams["image_size"] + patch_size = self.hparams["patch_size"] + proj_scale_factor = (image_size // patch_size) // n_per_side + if proj_scale_factor > 0 and proj_scale_factor != 4: + # we only need to write this if it's not the default value + # in this case, we are converting a test model + self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + # related to https://github.com/ggml-org/llama.cpp/issues/13025 + if "input_projection" in name: + return gguf.GGMLQuantizationType.F16 + if ".embeddings." in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if "vision_model.head." in name: + # skip redundant tensors for tinygemma3 + return None + + if not name.startswith(("multi_modal_projector.", "vision_tower.", "multimodal_projector.", "vision_model.")): + return None + + name = name.replace("_weight", ".weight") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # correct norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector + # the other norm values are part of SigLIP model, and they are already correct + # ref code: Gemma3RMSNorm + if "soft_emb_norm.weight" in name: + logger.info(f"Correcting norm value for '{name}'") + data_torch = data_torch + 1 + + yield from super().modify_tensors(data_torch, name, bid) + + +class ConformerAudioModel(MmprojModel): + _batch_norm_tensors: list[dict[str, Tensor]] | None = None + + @staticmethod + def is_audio_tensor(name: str): + return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"]) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ConformerAudioModel.is_audio_tensor(name): + if ".conv" in name or "_conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # fold running_mean, running_var and eps into weight and bias for batch_norm + if "batch_norm" in name: + if self._batch_norm_tensors is None: + self._batch_norm_tensors = [{} for _ in range(self.block_count)] + assert bid is not None + self._batch_norm_tensors[bid][name] = data_torch + + if len(self._batch_norm_tensors[bid]) < 5: + return + + weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"] + bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"] + running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"] + running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"] + eps = 1e-5 # default value + + a = weight / torch.sqrt(running_var + eps) + b = bias - running_mean * a + yield from super().modify_tensors(a, f"conformer.layers.{bid}.conv.batch_norm.weight", bid) + yield from super().modify_tensors(b, f"conformer.layers.{bid}.conv.batch_norm.bias", bid) + return + + # reshape conv weights + if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"): + data_torch = data_torch[:, None, None] + if "conv.depthwise_conv" in name and name.endswith(".weight"): + assert data_torch.shape[1] == 1 + data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2]) + if "conv.pointwise_conv" in name and name.endswith(".weight"): + assert data_torch.shape[2] == 1 + data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1]) + + mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min")) + yield (mapped_name, data_torch) + + +@ModelBase.register("Gemma3nForConditionalGeneration") +class Gemma3nVisionAudioModel(ConformerAudioModel): + has_audio_encoder = True + has_vision_encoder = True + + # Double indexed mapping for MobileNetV5 blocks (not supported by tensor_mapping.py) + # This is the only known model having this, so we prefer implementing it outside of tensor_mapping.py + block_tensor_mapping = { + "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.blk.{bid}.{sid}.conv_exp.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.blk.{bid}.{sid}.bn1.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight": "v.blk.{bid}.{sid}.conv_pwl.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight": "v.blk.{bid}.{sid}.bn2.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight": "v.blk.{bid}.{sid}.dw_start.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight": "v.blk.{bid}.{sid}.dw_start.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight": "v.blk.{bid}.{sid}.dw_mid.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight": "v.blk.{bid}.{sid}.dw_mid.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight": "v.blk.{bid}.{sid}.pw_exp.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight": "v.blk.{bid}.{sid}.pw_exp.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight": "v.blk.{bid}.{sid}.pw_proj.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight": "v.blk.{bid}.{sid}.pw_proj.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma": "v.blk.{bid}.{sid}.layer_scale.gamma", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight": "v.blk.{bid}.{sid}.attn.query.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight": "v.blk.{bid}.{sid}.attn.key.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight": "v.blk.{bid}.{sid}.attn.value.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight": "v.blk.{bid}.{sid}.attn.output.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight": "v.blk.{bid}.{sid}.attn.key.down_conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight": "v.blk.{bid}.{sid}.attn.key.norm.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.blk.{bid}.{sid}.attn.value.down_conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight": "v.blk.{bid}.{sid}.attn.value.norm.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.blk.{bid}.{sid}.norm.weight", + } + + def __init__(self, *args, **kwargs): + # Parent init will call find_hparam which now returns 0 for empty keys + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["n_layers"] = 128 # fake value for audio encoder, vision encoder doesn't use it + self.hparams_vision["intermediate_size"] = self.hparams_vision.get("intermediate_size", 2048) * 4 + self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_attention_heads", 8) + + # MobileNetV5 does not use image_mean/std + self.preprocessor_config["image_mean"] = [0.0 ,0.0 , 0.0] + self.preprocessor_config["image_std"] = [1.0 ,1.0 ,1.0] + self.hparams_vision["image_size"] = self.preprocessor_config.get( + "size", {"height": 768, "width": 768} + )["height"] + + # Image sequence length (256 tokens = 16x16 for Gemma3n) + image_seq_length = self.preprocessor_config.get("image_seq_length", 256) + image_size = self.hparams_vision["image_size"] + self.hparams_vision["patch_size"] = image_size // image_seq_length + + # remap audio hparams + assert self.hparams_audio is not None + self.hparams_audio["n_layers"] = self.hparams_audio["conf_num_hidden_layers"] + self.hparams_audio["num_attention_heads"] = self.hparams_audio["conf_num_attention_heads"] + self.hparams_audio["feat_in"] = self.hparams_audio["input_feat_size"] + self.hparams_audio["intermediate_size"] = self.hparams_audio.get("intermediate_size", 6144) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # vision params + self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA3NV) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) + + # audio params + assert self.hparams_audio is not None + self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA3NA) + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) + self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + # Force quantization settings for specific tensor types + if "input_projection" in name or "input_proj" in name: + return gguf.GGMLQuantizationType.F16 + if ".embeddings." in name or "stem" in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def custom_map(self, name: str) -> str: + """Parses names like model.vision_tower.timm_model.blocks.1.2.suffix and applies template mapping.""" + parts = name.split(".") + # MobileNet blocks have at least 7 parts: model, vision_tower, timm_model, blocks, bid, sid, and suffix + if len(parts) >= 7: + bid, sid = parts[4], parts[5] + suffix = ".".join(parts[6:]) + template = f"model.vision_tower.timm_model.blocks.{{bid}}.{{sid}}.{suffix}" + if template in self.block_tensor_mapping: + return self.block_tensor_mapping[template].format(bid=bid, sid=sid) + + raise ValueError(f"Unknown name: {name}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if (ConformerAudioModel.is_audio_tensor(name)): + name = name.replace("model.audio_tower.conformer.", "conformer.layers.") + yield from super().modify_tensors(data_torch, name, bid) + + # Gemma3n uses + # - model.embed_vision.* for projection layers + # - model.vision_tower.* for vision encoder + # Skip non-vision tensors + if not (name.startswith("model.embed_vision.") or name.startswith("model.vision_tower.")): + return + + if name.startswith("model.vision_tower.timm_model.blocks."): + # Double-indexed block tensors through custom logic + yield (self.custom_map(name), data_torch) + return + else: + # Route non-repeating (conv_stem, msfa, embedding, etc.) and un-catched through tensor_mapping.py + new_name = self.map_tensor_name(name) + + if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"): + data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1] + + yield from ModelBase.modify_tensors(self, data_torch, new_name, bid) + + +@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration") +class Gemma3NModel(Gemma3Model): + model_arch = gguf.MODEL_ARCH.GEMMA3N + + _altup_proj: list[Tensor] = [] + _altup_unembd: list[Tensor] = [] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs" + self._altup_proj = [ + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + ] + self._altup_unembd = [ + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + ] + + def norm_shift(self, name: str) -> float: + del name + return 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code + + def set_vocab(self): + # For Gemma3n multimodal models, we need the FULL vocab_size (262400) + # which includes special tokens from 262144-262399 for vision/audio. + # The vocab_size_per_layer_input (262144) is only the embedding size per layer. + # Temporarily override the hparams lookup order to prioritize vocab_size. + + # Store original vocab_size_per_layer_input if it exists + vocab_size_per_layer_input = self.hparams.get("vocab_size_per_layer_input") + + # Temporarily remove vocab_size_per_layer_input to force using vocab_size + if vocab_size_per_layer_input is not None: + del self.hparams["vocab_size_per_layer_input"] + + # Call parent set_vocab which will now use vocab_size (262400) + super().set_vocab() + + # Restore vocab_size_per_layer_input for later use + if vocab_size_per_layer_input is not None: + self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"]) + self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"]) + self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"]) + self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"]) + + activation_sparsity_scale = [] + for s in self.hparams["activation_sparsity_pattern"]: + normal_dist = torch.distributions.normal.Normal(0, 1) + std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32)) + activation_sparsity_scale.append(std_multiplier.item()) + self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale) + + sliding_window_pattern = [] + for t in self.hparams["layer_types"]: + sliding_window_pattern.append(t == "sliding_attention") + self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) + + def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None: + has_all = all(m.numel() > 0 for m in matrices) + if not has_all: + return None + else: + return torch.stack(matrices, dim=0) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.endswith("_scale"): + name = name + ".weight" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # TODO: implement self.prediction_coefs.weight.clamp_(...) + + # Pad token embeddings for vision/audio special tokens (262144-262399) + if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name: + # Move to CPU to avoid meta device issues during padding + data_torch = data_torch.to(device="cpu") + + vocab_size = self.hparams.get("vocab_size", 262400) + current_size = data_torch.shape[0] # First dimension is vocab_size + + if current_size < vocab_size: + # Pad with zeros for vision/audio tokens (they get embeddings from vision tower) + padding_size = vocab_size - current_size + tensor_type = "per-layer embeddings" if "per_layer" in name else "token embeddings" + logger.info(f"Padding {tensor_type} shape {list(data_torch.shape)} from {current_size} to {vocab_size} (adding {padding_size} vision/audio token slots)") + + # Create padding with zeros (vision tokens won't use these embeddings) + padding = torch.zeros((padding_size, data_torch.shape[1]), dtype=data_torch.dtype, device=data_torch.device) + data_torch = torch.cat([data_torch, padding], dim=0) + + # Continue with normal processing + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + return + + if "altup_unembed_projections" in name: + data_torch = data_torch.to(device="cpu") + # altup_unembed matrices are [hidden_size, hidden_size], NOT vocab-based + # They should NOT be padded + if ".0." in name: + self._altup_unembd[0] = data_torch + elif ".1." in name: + self._altup_unembd[1] = data_torch + elif ".2." in name: + self._altup_unembd[2] = data_torch + else: + raise ValueError(f"Unknown name: {name}") + out = self._stack_matrices(self._altup_unembd) + if out is not None: + yield from ModelBase.modify_tensors(self, out, "model.altup_unembed_projections.weight", bid) + return + else: + return + + if "altup_projections" in name: + data_torch = data_torch.to(device="cpu") + if ".0." in name: + self._altup_proj[0] = data_torch + elif ".1." in name: + self._altup_proj[1] = data_torch + elif ".2." in name: + self._altup_proj[2] = data_torch + else: + raise ValueError(f"Unknown name: {name}") + out = self._stack_matrices(self._altup_proj) + if out is not None: + yield from ModelBase.modify_tensors(self, out, "model.altup_projections.weight", bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Gemma4ForConditionalGeneration") +class Gemma4Model(Gemma3Model): + model_arch = gguf.MODEL_ARCH.GEMMA4 + + def norm_shift(self, name: str) -> float: + del name # unused + return 0.0 + + def set_vocab(self): + vocab = gguf.LlamaHfVocab(self.dir_model) + tokens = [] + scores = [] + toktypes = [] + visible_tokens = {"<|channel>", "", "<|tool_call>", "", "<|tool_response>", "", "<|\"|>"} + + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + text_str = text.decode() + if text_str in visible_tokens: + # always render these tokens, so that the chat parser can read them + toktypes.append(gguf.TokenType.USER_DEFINED) + logger.info(f"Token '{text_str}' is set to USER_DEFINED") + else: + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size + + self.gguf_writer.add_tokenizer_model("gemma4") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + self.gguf_writer.add_add_space_prefix(False) + self.gguf_writer.add_add_bos_token(True) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + num_kv_shared_layers = self.hparams["num_kv_shared_layers"] + self.gguf_writer.add_shared_kv_layers(num_kv_shared_layers) + + # per-layer embedding is optional + n_pl_embd = self.hparams.get("hidden_size_per_layer_input") or 0 + self.gguf_writer.add_embedding_length_per_layer_input(n_pl_embd) + + swa_layers = [t == "sliding_attention" for t in self.hparams["layer_types"]] + self.gguf_writer.add_sliding_window_pattern(swa_layers) + + head_dim_full = self.hparams["global_head_dim"] + head_dim_swa = self.hparams["head_dim"] + # correct the head dim for global/swa layers + self.gguf_writer.add_key_length(head_dim_full) + self.gguf_writer.add_value_length(head_dim_full) + self.gguf_writer.add_key_length_swa(head_dim_swa) + self.gguf_writer.add_value_length_swa(head_dim_swa) + + expert_intermediate_size = self.find_hparam(["expert_intermediate_size", "moe_intermediate_size"]) + if expert_intermediate_size is not None: + self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size) + + # if use_double_wide_mlp is set, we need to adjust the value for kv shared layers + use_double_wide_mlp = self.hparams.get("use_double_wide_mlp", False) + first_kv_shared_layer_idx = self.block_count - num_kv_shared_layers + if use_double_wide_mlp: + n_ff = self.hparams["intermediate_size"] + n_ff_arr = [n_ff if il < first_kv_shared_layer_idx else n_ff * 2 for il in range(self.block_count)] + self.gguf_writer.add_feed_forward_length(n_ff_arr) + + # handle num_global_key_value_heads + num_key_value_heads_full = self.hparams.get("num_global_key_value_heads") + num_key_value_heads_swa = self.hparams.get("num_key_value_heads") + if num_key_value_heads_full is not None and num_key_value_heads_swa is not None: + value_arr = [num_key_value_heads_swa if is_swa else num_key_value_heads_full for is_swa in swa_layers] + self.gguf_writer.add_head_count_kv(value_arr) + + # handle n_rot differently for global vs swa layers + partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0) + n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors + n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa) + self.gguf_writer.add_rope_dimension_count(n_rot_full) + self.gguf_writer.add_rope_dimension_count_swa(n_rot_swa) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # full layer uses "proportional" rope with partial_rotary_factor=0.25 + # the expected ordering is cc000000ss000000 (c = cos, s = sin, 0 = unrotated), + # but ggml neox only supports ccss000000000000, and we cannot rearrange the head because that will break use_alternative_attention + # solution is to set specific freq_factors for the unrotated dims + + # IMPORTANT: this ROPE_FREQS tensor is ONLY used by the full_attention layers + rope_params_full = self.hparams["rope_parameters"]["full_attention"] + assert rope_params_full["rope_type"] == "proportional" + head_dim_full = (self.hparams["global_head_dim"]) + partial_rotary_factor_full = rope_params_full["partial_rotary_factor"] + n_rot_full = int(head_dim_full * partial_rotary_factor_full / 2) + n_unrot_full = int(head_dim_full / 2) - n_rot_full + values = [1.0] * n_rot_full + [1e30] * n_unrot_full + rope_freqs_full = torch.tensor(values, dtype=torch.float32) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), rope_freqs_full) + + def _generate_nvfp4_tensors(self): + # Gemma-4 stores a per-layer router.per_expert_scale ([n_expert]) that scales + # each expert's contribution. It's mathematically equivalent to a per-expert + # scalar on the down_proj output, which is exactly where ffn_down_exps_s is + # applied at inference. Fold it into each expert's NVFP4 weight_scale_2 so the + # existing NVFP4 path produces the right scales. + n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0 + for name in [n for n in self.model_tensors if n.endswith(".router.per_expert_scale")]: + bid_match = re.search(r"\.layers\.(\d+)\.", name) + if bid_match is None: + continue + bid = bid_match.group(1) + prefix = name[: name.index(f".layers.{bid}.") + len(f".layers.{bid}.")] + w2_targets = [f"{prefix}experts.{e}.down_proj.weight_scale_2" for e in range(n_experts)] + present = [w2 in self.model_tensors for w2 in w2_targets] + if not any(present): + continue + assert all(present), f"layer {bid}: partial NVFP4 quantization across experts" + r = self.model_tensors.pop(name) + for e, w2 in enumerate(w2_targets): + s = self.model_tensors[w2] + self.model_tensors[w2] = lambda s=s, r=r, i=e: s() * r()[i] + super()._generate_nvfp4_tensors() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.endswith("per_dim_scale") or name.endswith("layer_scalar"): + name = name + ".weight" + if ".experts." in name and not name.endswith((".weight", ".weight_scale", ".weight_scale_2", ".input_scale")): + name += ".weight" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith("router.scale"): + name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid, ".scale") + yield (name, data_torch) + return + if ".per_expert_scale" in name: + # convert per-expert scale to FFN down scale + name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN_EXP, bid, ".scale") + yield (name, data_torch) + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Gemma4ForConditionalGeneration") +class Gemma4VisionAudioModel(MmprojModel): + has_audio_encoder = True + has_vision_encoder = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["image_size"] = 224 # unused, but set to avoid error + + # remap audio hparams + if self.hparams_audio: + self.hparams_audio["feat_in"] = self.hparams_audio.get("input_feat_size", 128) + self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4 + else: + self.has_audio_encoder = False + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # vision params + self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4V) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) + + # audio params + if self.hparams_audio: + self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A) + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) + self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) + + def is_audio_tensor(self, name: str) -> bool: + return "audio_tower" in name or "embed_audio" in name + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if self.is_audio_tensor(name): + if ".conv" in name or "_conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F32 + if "position_embedding_table" in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if len(data_torch.shape) == 0: + # convert scalar tensors (input/output_mix/max) to 1D tensors + data_torch = data_torch.unsqueeze(0) + + if self.is_audio_tensor(name): + assert self.hparams_audio is not None + name = name.replace("model.audio_tower.", "conformer.") + name = name.replace(".linear.", ".") + if name.endswith("per_dim_key_scale") or name.endswith("per_dim_scale"): + name = name + ".weight" + data_torch = torch.nn.functional.softplus(data_torch) + if "lconv1d.depthwise_conv1d" in name and name.endswith(".weight"): + assert data_torch.shape[1] == 1 + data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2]) + mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min")) + yield (mapped_name, data_torch) + + else: + name = name.replace("model.vision_tower.encoder.", "vision_model.model.") + name = name.replace(".linear.weight", ".weight") + if name.endswith("layer_scalar") or name.endswith("position_embedding_table"): + name = name + ".weight" + if name.endswith("patch_embedder.input_proj.weight"): + n_embd, ksize_sq_c = data_torch.shape + patch_size = int((ksize_sq_c // 3) ** 0.5) + data_torch = data_torch.reshape(n_embd, patch_size, patch_size, 3) + data_torch = data_torch.permute(0, 3, 1, 2).contiguous() + mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min")) + yield (mapped_name, data_torch) diff --git a/conversion/glm.py b/conversion/glm.py new file mode 100644 index 00000000000..641937720d6 --- /dev/null +++ b/conversion/glm.py @@ -0,0 +1,259 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + +from .deepseek import DeepseekV2Model + + +@ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration") +class Glm4Model(TextModel): + model_arch = gguf.MODEL_ARCH.GLM4 + use_mrope = False + partial_rotary_factor = 0.5 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 0.5) + if "mrope_section" in self.rope_parameters: + self.use_mrope = True + logger.info("Q/K weight will need to be permuted for M-RoPE") + + def set_vocab(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (rope_dim := self.hparams.get("head_dim")) is None: + rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.partial_rotary_factor)) + + @staticmethod + def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, partial_rotary_factor: float) -> Tensor: + orig_shape = weights.shape + if len(orig_shape) == 1: + weights = weights.unsqueeze(1) # [out_dim, 1] + if len(weights.shape) != 2: + raise ValueError("Only 1D and 2D tensors are supported.") + n_effective_heads = weights.shape[0] // head_dim + if n_head_kv is not None and n_effective_heads != n_head: + if n_effective_heads != n_head_kv: + raise AssertionError(f"Mismatch in effective heads: computed {n_effective_heads}, expected {n_head} or {n_head_kv}") + rotary_dim = int(head_dim * partial_rotary_factor) + if rotary_dim % 2 != 0: + raise ValueError("rotary_dim must be even.") + reshaped = weights.reshape(n_effective_heads, head_dim, -1) + rot_part = reshaped[:, :rotary_dim, :] + non_rot_part = reshaped[:, rotary_dim:, :] + permuted_rot = torch.cat((rot_part[:, ::2, :], rot_part[:, 1::2, :]), dim=1) + combined = torch.cat((permuted_rot, non_rot_part), dim=1) + result = combined.reshape(weights.shape) + return result if len(orig_shape) != 1 else result.squeeze(1) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if self.use_mrope: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams["num_key_value_heads"] + n_embd = self.hparams["hidden_size"] + head_dim = self.hparams.get("head_dim", n_embd // n_head) + # because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_kv_head, head_dim, self.partial_rotary_factor) + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("GlmOcrForConditionalGeneration") +class GlmOCRModel(Glm4Model): + model_arch = gguf.MODEL_ARCH.GLM4 + use_mrope = False + partial_rotary_factor = 0.5 + + # Note: GLM-OCR is the same as GLM4, but with an extra NextN/MTP prediction layer + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # GLM-OCR has num_hidden_layers + 1 actual layers (including NextN layer) + self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + # NextN/MTP prediction layers + if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None: + self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) + + +@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration") +class Glm4MoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.GLM4_MOE + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # GLM4_MOE has num_hidden_layers + 1 actual layers (including NextN layer) + self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_vocab(self): + return self._set_vocab_glm() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (rope_dim := self.hparams.get("head_dim")) is None: + rope_dim = ( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) + self.gguf_writer.add_rope_dimension_count( + int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)) + ) + + # MoE parameters - Use only routed expert count (shared experts handled separately) + if (n_routed_experts := self.hparams.get("n_routed_experts")) is not None: + self.gguf_writer.add_expert_count(n_routed_experts) + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None: + self.gguf_writer.add_expert_shared_count(n_shared_experts) + if (first_k_dense_replace := self.hparams.get("first_k_dense_replace")) is not None: + self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) + + # Expert gating function (sigmoid for GLM4_MOE) + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + + # Routed scaling factor + if (routed_scaling_factor := self.hparams.get("routed_scaling_factor")) is not None: + self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) + + # Normalise topk probabilities + if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None: + self.gguf_writer.add_expert_weights_norm(norm_topk_prob) + + # NextN/MTP prediction layers + if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None: + self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) + + _experts: list[dict[str, Tensor]] | None = None + + # note: unlike GLM4V non-MoE, we don't need to permute Q/K here since GLM4V_MOE uses Neox ordering already + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Handle main token embedding (but not layer-specific NextN embeddings) + if name == "model.embed_tokens.weight" and ".layers." not in name: + yield from super().modify_tensors(data_torch, "token_embd.weight", bid) + return + + # Handle routed experts + if name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("Glm4MoeLiteForCausalLM") +class Glm4MoeLiteModel(DeepseekV2Model): + model_arch = gguf.MODEL_ARCH.DEEPSEEK2 + + def set_vocab(self): + return self._set_vocab_glm() + + +@ModelBase.register("GlmMoeDsaForCausalLM") +class GlmMoeDsaModel(DeepseekV2Model): + model_arch = gguf.MODEL_ARCH.GLM_DSA + skip_mtp = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_vocab(self): + return self._set_vocab_glm() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + rope_dim = self.hparams["qk_rope_head_dim"] + partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0) + self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor)) + + # NextN/MTP prediction layers + if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None: + self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) + + # DSA indexer parameters + self.gguf_writer.add_indexer_head_count(self.hparams["index_n_heads"]) + self.gguf_writer.add_indexer_key_length(self.hparams["index_head_dim"]) + self.gguf_writer.add_indexer_top_k(self.hparams["index_topk"]) + + +@ModelBase.register("SolarOpenForCausalLM") +class SolarOpenModel(Glm4MoeModel): + model_arch = gguf.MODEL_ARCH.GLM4_MOE + + def set_vocab(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()[""]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab.add_to_gguf(self.gguf_writer) diff --git a/conversion/gpt2.py b/conversion/gpt2.py new file mode 100644 index 00000000000..1cf06ae8b50 --- /dev/null +++ b/conversion/gpt2.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("GPT2LMHeadModel") +class GPT2Model(TextModel): + model_arch = gguf.MODEL_ARCH.GPT2 + + def set_gguf_parameters(self): + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams["n_ctx"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # we don't need these + if name.endswith((".attn.bias", ".attn.masked_bias")): + yield from super().modify_tensors(data_torch, name, bid) + return + + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): + data_torch = data_torch.transpose(1, 0) + + new_name = self.map_tensor_name(name) + + yield from super().modify_tensors(data_torch, new_name, bid) + + +@ModelBase.register("RuGPT3XLForCausalLM") +class RuGPT3XLModel(TextModel): + model_arch = gguf.MODEL_ARCH.GPT2 + + _qkv_parts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Fuse separate Q, K, V projections into a single QKV tensor + if ".self_attn.q_proj." in name or ".self_attn.k_proj." in name or ".self_attn.v_proj." in name: + suffix = "weight" if name.endswith(".weight") else "bias" + part = "q" if ".q_proj." in name else ("k" if ".k_proj." in name else "v") + key = f"{part}.{suffix}" + + assert bid is not None + if self._qkv_parts is None: + self._qkv_parts = [{} for _ in range(self.block_count)] + self._qkv_parts[bid][key] = data_torch + + q_key, k_key, v_key = f"q.{suffix}", f"k.{suffix}", f"v.{suffix}" + if all(k in self._qkv_parts[bid] for k in [q_key, k_key, v_key]): + q = self._qkv_parts[bid].pop(q_key) + k = self._qkv_parts[bid].pop(k_key) + v = self._qkv_parts[bid].pop(v_key) + data_torch = torch.cat([q, k, v], dim=0) + name = self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, f".{suffix}") + logger.debug(f"Fused Q/K/V {suffix} for layer {bid} -> {name}") + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._qkv_parts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + parts = [f"({i}){k}" for i, d in enumerate(self._qkv_parts) for k in d.keys()] + if len(parts) > 0: + raise ValueError(f"Unprocessed Q/K/V parts: {parts}") diff --git a/conversion/gpt_oss.py b/conversion/gpt_oss.py new file mode 100644 index 00000000000..d2c70c0bba5 --- /dev/null +++ b/conversion/gpt_oss.py @@ -0,0 +1,130 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("GptOssForCausalLM") +class GptOssModel(TextModel): + model_arch = gguf.MODEL_ARCH.GPT_OSS + + # TODO: remove once MXFP4 is supported more generally + def dequant_model(self): + if self._is_mxfp4: + return + return super().dequant_model() + + def transform_nibble_layout(self, tensor): + assert tensor.dtype == torch.uint8 + assert tensor.shape[-1] == 16 + # swap nibbles + t_lo = tensor & 0x0F + t_hi = tensor & 0xF0 + t_swapped = (t_lo << 4) | (t_hi >> 4) + tensor = t_swapped + # transform aaaa...bbbb... to abababab... + blk_a, blk_b = tensor.chunk(2, dim=-1) + # get a_ + blk_a0 = (blk_a & 0xF0).view(-1, 1) + blk_a1 = (blk_a << 4).view(-1, 1) + blk_a = torch.stack((blk_a0, blk_a1), dim=2).view(tensor.shape) + # get _b + blk_b0 = (blk_b >> 4).view(-1, 1) + blk_b1 = (blk_b & 0x0F).view(-1, 1) + blk_b = torch.stack((blk_b0, blk_b1), dim=2).view(tensor.shape) + # swap once more + out = blk_a | blk_b + out_h = out & 0xF0 + out_l = out & 0x0F + out = (out_h >> 4) | (out_l << 4) + return out + + def repack_mxfp4(self, new_name: str, blocks: Tensor, scales: Tensor): + assert blocks.dtype == torch.uint8 + assert scales.dtype == torch.uint8 + scales = scales.unsqueeze(-1) + assert len(blocks.shape) == 4 + assert len(scales.shape) == 4 + blocks = self.transform_nibble_layout(blocks) + new_data = torch.concat((scales, blocks), dim=-1) + new_shape = [new_data.shape[0], new_data.shape[1], new_data.shape[2] * 32] + logger.info(f"Repacked {new_name} with shape {new_shape} and quantization MXFP4") + # flatten last dim + new_data = new_data.view(new_data.shape[0], new_data.shape[1], new_data.shape[2] * new_data.shape[3]) + new_data = new_data.numpy() + self.gguf_writer.add_tensor(new_name, new_data, raw_dtype=gguf.GGMLQuantizationType.MXFP4) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + blocks0: Tensor = torch.zeros(1) + blocks1: Tensor = torch.zeros(1) + # we assume that tensors are loaded in the correct order + for name, data_torch in self.get_tensors(): + if "mlp.experts.down_proj_blocks" in name: + blocks0 = data_torch + elif "mlp.experts.down_proj_scales" in name: + new_name = self.map_tensor_name(name.replace("_scales", ".weight")) + self.repack_mxfp4(new_name, blocks0, data_torch) + elif "mlp.experts.gate_up_proj_blocks" in name: + blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :] + elif "mlp.experts.gate_up_proj_scales" in name: + scales0, scales1 = data_torch[:, ::2, :], data_torch[:, 1::2, :] + new_name_gate = self.map_tensor_name(name.replace("gate_up_proj_scales", "gate_proj.weight")) + new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight")) + self.repack_mxfp4(new_name_gate, blocks0, scales0) + self.repack_mxfp4(new_name_up, blocks1, scales1) + return [] + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if "sinks" in name: + name += ".weight" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # correct naming for down_proj + if "down_proj" in name: + if name.endswith("_bias"): + name = name.replace("down_proj_bias", "down_proj.bias") + elif "_blocks" not in name and "_scales" not in name: + logger.warning(f"{name} is not in MXFP4, performance may be degraded") + name = name.replace("down_proj", "down_proj.weight") + data_torch = data_torch.transpose(-1, -2) + else: + # otherwise, it should already be repacked to ggml MXFP4 format + return + + # split the gate_up into gate and up + if "gate_up_proj" in name: + if name.endswith("_bias"): + name_up = name.replace("gate_up_proj_bias", "up_proj.bias") + name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias") + gate_proj_bias, up_proj_bias = data_torch[..., ::2], data_torch[..., 1::2] + yield from super().modify_tensors(gate_proj_bias, name_gate, bid) + yield from super().modify_tensors(up_proj_bias, name_up, bid) + elif "_blocks" not in name and "_scales" not in name: + logger.warning(f"{name} is not in MXFP4, performance may be degraded") + name_up = name.replace("gate_up_proj", "up_proj.weight") + name_gate = name.replace("gate_up_proj", "gate_proj.weight") + data_torch = data_torch.transpose(-1, -2) + gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :] + yield from super().modify_tensors(gate_proj_weight, name_gate, bid) + yield from super().modify_tensors(up_proj_weight, name_up, bid) + else: + yield from super().modify_tensors(data_torch, name, bid) + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"]) diff --git a/conversion/gptneox.py b/conversion/gptneox.py new file mode 100644 index 00000000000..6a42b12b15a --- /dev/null +++ b/conversion/gptneox.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +import re + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("GPTNeoXForCausalLM") +class GPTNeoXModel(TextModel): + model_arch = gguf.MODEL_ARCH.GPTNEOX + + def set_gguf_parameters(self): + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count( + int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), + ) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + assert n_head is not None + assert n_embed is not None + + if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) + data_torch = torch.cat( + ( + qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.weight") + elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name): + qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) + data_torch = torch.cat( + ( + qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.bias") + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/granite.py b/conversion/granite.py new file mode 100644 index 00000000000..647269ba740 --- /dev/null +++ b/conversion/granite.py @@ -0,0 +1,328 @@ +from __future__ import annotations + +from typing import Any, Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf, logger + +from .llama import LlamaModel +from .mamba import Mamba2Model + + +@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration") +class GraniteModel(LlamaModel): + """Conversion for IBM's GraniteForCausalLM""" + model_arch = gguf.MODEL_ARCH.GRANITE + + def set_gguf_parameters(self): + """Granite uses standard llama parameters with the following differences: + + - No head_dim support + - New multiplier params: + - attention_scale + - embedding_scale + - residual_scale + - logits_scaling + """ + if head_dim := self.hparams.pop("head_dim", None): + logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim) + super().set_gguf_parameters() + # NOTE: Convert _multiplier params to _scale params for naming + # consistency + if attention_scale := self.hparams.get("attention_multiplier"): + self.gguf_writer.add_attention_scale(attention_scale) + logger.info("gguf: (granite) attention_scale = %s", attention_scale) + if embedding_scale := self.hparams.get("embedding_multiplier"): + self.gguf_writer.add_embedding_scale(embedding_scale) + logger.info("gguf: (granite) embedding_scale = %s", embedding_scale) + if residual_scale := self.hparams.get("residual_multiplier"): + self.gguf_writer.add_residual_scale(residual_scale) + logger.info("gguf: (granite) residual_scale = %s", residual_scale) + if logits_scale := self.hparams.get("logits_scaling"): + self.gguf_writer.add_logit_scale(logits_scale) + logger.info("gguf: (granite) logits_scale = %s", logits_scale) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + if name.startswith("encoder."): + return None + return super().filter_tensors(item) + + +@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM") +class GraniteMoeModel(GraniteModel): + """Conversion for IBM's GraniteMoeForCausalLM""" + model_arch = gguf.MODEL_ARCH.GRANITE_MOE + + def set_gguf_parameters(self): + """GraniteMoeShared uses GraniteMoe parameters plus the following: + - shared_intermediate_size + """ + super().set_gguf_parameters() + if shared_feed_forward_length := self.hparams.get("shared_intermediate_size"): + self.gguf_writer.add_expert_shared_feed_forward_length(shared_feed_forward_length) + logger.info("gguf: (granitemoeshared) shared_feed_forward_length = %s", shared_feed_forward_length) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + """In modeling_granitemoe, the JetMoe implementation of parallel experts + is used. This essentially merges w1 and w3 into a single tensor with 2x + the hidden size that is then split during forward. To keep compatibility + with existing mixtral support, we pull them apart here. + """ + + if name.endswith("block_sparse_moe.input_linear.weight"): + ffn_dim = self.hparams["intermediate_size"] + assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size" + gate, up = data_torch.split(ffn_dim, dim=-2) + yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid) + yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid) + return + + has_experts = bool(self.hparams.get('num_local_experts')) + + if name.endswith("shared_mlp.input_linear.weight"): + ffn_dim = self.hparams["shared_intermediate_size"] + assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size" + gate, up = data_torch.split(ffn_dim, dim=-2) + if has_experts: + yield from ModelBase.modify_tensors(self, gate,self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), bid) + yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), bid) + return + yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid) + yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid) + return + + if not has_experts and name.endswith("shared_mlp.output_linear.weight"): + yield from ModelBase.modify_tensors(self, data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), bid) + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM") +class GraniteHybridModel(Mamba2Model, GraniteMoeModel): + """GraniteHybrid is a hybrid SSM + Attention model that uses Mamba2 SSM + layers and optionally uses MoE w/ a shared expert""" + model_arch = gguf.MODEL_ARCH.GRANITE_HYBRID + undo_permute = True + + def __init__(self, *args, **kwargs): + + # Hybrid mamba models use a prefix for the mamba-specific params. + # TODO: Extend this if the prefix(es) need to be configurable + self.hparam_prefixes = ["mamba"] + + super().__init__(*args, **kwargs) + + # Lists of which layers use ssm vs attention + self._attn_layers = self.get_attn_layers() + self._ssm_layers = [ + i for i in range(self.block_count) + if i not in self._attn_layers + ] + + # There are some models in this family that are non-hybrid, but keep the + # same parent class by setting all layers to "attention." If this is the + # case, the model architecture needs to be updated to a standard + # "granite" or "granitemoe" model + if not self._ssm_layers: + has_experts = self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True) + new_arch = ( + gguf.MODEL_ARCH.GRANITE_MOE + if has_experts else + gguf.MODEL_ARCH.GRANITE + ) + self.model_arch = new_arch + self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch] + self.gguf_writer.add_architecture() + + # n_group and d_inner are used during reshape_tensors for mamba2 + # NOTE: Explicitly include hparam prefix prefix for d_model to + # disambiguate with top-level head_dim + # NOTE 2: If needed for future models, this can be isolated in a method + # to separate the prefix setting and the keys used + self.d_model = self.find_hparam([f"{self.hparam_prefixes[0]}_head_dim", "hidden_size", "d_model"]) + self.n_group = self.find_hparam(["n_groups", "num_groups"]) + self.d_inner = self.find_hparam(["expand", "num_heads"]) * self.d_model + + def get_attn_layers(self): + # Explicit list of layer type names + if layer_types := self.hparams.get("layer_types"): + return [ + i for i, typ in enumerate(layer_types) + if typ == "attention" + ] + + # Layer types indicated by index or period + attn_layers = self.hparams.get("attn_layer_indices", []) + if not attn_layers: + attn_period = self.hparams.get("attn_layer_period") + assert attn_period, "Didn't find attn_layer_indices or attn_layer_period" + attn_offset = self.hparams.get("attn_layer_offset") + assert attn_offset is not None, "No attention layer offset set with attn_layer_period" + attn_layers = [ + i for i in range(self.block_count) + if i % attn_period == attn_offset + ] + return attn_layers + + def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any: + prefixed = [] + for pfx in self.hparam_prefixes: + prefixed.extend( + "_".join([pfx, k]) + for k in keys + ) + keys = list(keys) + prefixed + return Mamba2Model.find_hparam(self, keys, *args, **kwargs) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if ( + name.endswith("block_sparse_moe.input_linear.weight") + or "shared_mlp" in name + ): + yield from GraniteMoeModel.modify_tensors(self, data_torch, name, bid) + return + + # Determine whether this is a mamba layer or an attention layer + if bid in self._ssm_layers: + yield from Mamba2Model.modify_tensors(self, data_torch, name, bid) + return + elif bid in self._attn_layers: + yield from GraniteMoeModel.modify_tensors(self, data_torch, name, bid) + return + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + + def set_gguf_parameters(self): + """This method merges params from both parents and some that are + specific to this model. The result is some duplication of how the params + get set. The following warnings are expected during conversion: + + WARNING:Duplicated key name 'granitehybrid.attention.head_count_kv' + WARNING:Duplicated key name 'granitehybrid.context_length' + """ + GraniteMoeModel.set_gguf_parameters(self) + + ## Mamba mixer params ## + self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"])) + self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state", "state_dim", "ssm_state_size"])) + self.gguf_writer.add_ssm_group_count(self.n_group) + self.gguf_writer.add_ssm_inner_size(self.d_inner) + # NOTE: The mamba_dt_rank is _not_ the right field for how this is used + # in llama.cpp + self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads", "num_heads"])) + + ## Attention params ## + head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) + head_count_kv_vec = [ + head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count) + ] + if rope_dim := self.hparams.get("attn_rotary_emb"): + self.gguf_writer.add_rope_dimension_count(rope_dim) + self.gguf_writer.add_head_count_kv(head_count_kv_vec) + + ## If Bamba or non-hybrid, use rope, otherwise don't + use_rope = ( + "BambaForCausalLM" in self.hparams["architectures"] + or not self._ssm_layers + ) + self.gguf_writer.add_rope_scaling_finetuned(use_rope) + if not use_rope: + self.gguf_writer.add_context_length(2**20) + + ## Validation ## + d_head = self.find_hparam(["d_head"], optional=True) or 64 + assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" + assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}" + + def set_vocab(self): + self.hparams["pad_vocab_size_multiple"] = 8 + Mamba2Model.set_vocab(self) + + +@ModelBase.register("GraniteSpeechForConditionalGeneration") +class GraniteSpeechMmprojModel(MmprojModel): + has_vision_encoder = False + has_audio_encoder = True + + _batch_norm_tensors: list[dict[str, Tensor]] | None = None + + def get_audio_config(self) -> dict[str, Any] | None: + return self.global_config.get("encoder_config") + + def set_gguf_parameters(self): + assert self.hparams_audio is not None + a = self.hparams_audio + a["hidden_size"] = a["hidden_dim"] + a["intermediate_size"] = a["hidden_dim"] * a["feedforward_mult"] + a["num_attention_heads"] = a["num_heads"] + a["num_hidden_layers"] = a["num_layers"] + + super().set_gguf_parameters() + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE_SPEECH) + self.gguf_writer.add_audio_num_mel_bins(a["input_dim"]) + self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) + self.gguf_writer.add_audio_chunk_size(a["context_size"]) + self.gguf_writer.add_audio_conv_kernel_size(a["conv_kernel_size"]) + self.gguf_writer.add_audio_max_pos_emb(a["max_pos_emb"]) + + p = self.global_config + self.gguf_writer.add_audio_projector_window_size(p["window_size"]) + self.gguf_writer.add_audio_projector_downsample_rate(p["downsample_rate"]) + self.gguf_writer.add_audio_projector_head_count(p["projector_config"]["num_attention_heads"]) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if "encoder" in name or "projector" in name: + if ".conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + if "attention_dists" in name or "num_batches_tracked" in name: + return None + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # fold running_mean, running_var and eps into weight and bias for batch_norm + if "batch_norm" in name and "encoder.layers." in name: + if self._batch_norm_tensors is None: + self._batch_norm_tensors = [{} for _ in range(self.block_count)] + assert bid is not None + self._batch_norm_tensors[bid][name] = data_torch + if len(self._batch_norm_tensors[bid]) < 4: + return + prefix = f"encoder.layers.{bid}.conv.batch_norm" + weight = self._batch_norm_tensors[bid][f"{prefix}.weight"] + bias = self._batch_norm_tensors[bid][f"{prefix}.bias"] + running_mean = self._batch_norm_tensors[bid][f"{prefix}.running_mean"] + running_var = self._batch_norm_tensors[bid][f"{prefix}.running_var"] + eps = 1e-5 + a = weight / torch.sqrt(running_var + eps) + b = bias - running_mean * a + yield from super().modify_tensors(a, f"encoder.layers.{bid}.conv.batch_norm.weight", bid) + yield from super().modify_tensors(b, f"encoder.layers.{bid}.conv.batch_norm.bias", bid) + return + + if ".attn.to_kv.weight" in name: + k_weight, v_weight = data_torch.chunk(2, dim=0) + yield from super().modify_tensors(k_weight, name.replace("to_kv", "to_k"), bid) + yield from super().modify_tensors(v_weight, name.replace("to_kv", "to_v"), bid) + return + + if ("up_conv" in name or "down_conv" in name) and name.endswith(".weight"): + if data_torch.ndim == 3 and data_torch.shape[2] == 1: + data_torch = data_torch.squeeze(2) + + if "depth_conv" in name and name.endswith(".weight"): + if data_torch.ndim == 3 and data_torch.shape[1] == 1: + data_torch = data_torch.squeeze(1) + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/grok.py b/conversion/grok.py new file mode 100644 index 00000000000..9098e514a3a --- /dev/null +++ b/conversion/grok.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +import sys + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM") +class GrokModel(TextModel): + model_arch = gguf.MODEL_ARCH.GROK + + def set_vocab(self): + if (self.dir_model / 'tokenizer.model').is_file(): + self._set_vocab_sentencepiece() + return + + if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file(): + logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer') + sys.exit(1) + + self._set_vocab_gpt2() + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_attn_logit_softcapping(self.hparams.get("attn_logit_softcapping", 30.0)) + self.gguf_writer.add_router_logit_softcapping(self.hparams.get("router_logit_softcapping", 30.0)) + if (final_logit_softcap := self.hparams.get("final_logit_softcapping")): + self.gguf_writer.add_final_logit_softcapping(final_logit_softcap) + + if (rope_dim := self.hparams.get("head_dim")) is None: + rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + + # Treat "original" as "yarn", seems to have been a mistake + if self.hparams.get("rope_type") in ("yarn", "original"): + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"]) + self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"]) + self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"]) + self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"]) + self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"]) + + if temp_len := self.hparams.get("attn_temperature_len"): + self.gguf_writer.add_attn_temperature_length(temp_len) + + self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5)) + self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"]) + self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"]) + + _experts: list[dict[str, list[Tensor]]] | None = None + _cur_expert = "" + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + deferred: list[tuple[Tensor, str, int | None]] = [] + is_expert = ".moe." in name or ".block_sparse_moe.experts." in name + + if not is_expert: + deferred.append((data_torch, name, bid)) + + # process the experts separately + if is_expert or self._cur_expert: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + # concatenate split tensors + if name in self._experts[bid]: + self._cur_expert = name + self._experts[bid][name].append(data_torch) + return + elif is_expert: + self._cur_expert = name + self._experts[bid][name] = [data_torch] + return + else: + self._cur_expert = "" + + for bid in range(self.block_count): + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for wid in [("linear", "w1", 0), ("linear_1", "w2", 1), ("linear_v", "w3", 0)]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight" + if ename not in self._experts[bid]: + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight" + tensor_list = self._experts[bid][ename] + datas.append(torch.cat(tensor_list, dim=wid[2]) if len(tensor_list) > 1 else tensor_list[0]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + + for t in deferred: + yield from super().modify_tensors(*t) diff --git a/conversion/grovemoe.py b/conversion/grovemoe.py new file mode 100644 index 00000000000..a8be931cb90 --- /dev/null +++ b/conversion/grovemoe.py @@ -0,0 +1,108 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM") +class GroveMoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.GROVEMOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") + # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299 + self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128) + # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298 + self.gguf_writer.add_experts_per_group(2) + # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376 + self.gguf_writer.add_expert_group_scale(0.05) + + _experts: list[dict[str, Tensor]] | None = None + _chunk_experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith(".expert_bias"): + # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303 + return + + # process the experts separately + if name.find("chunk_experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) // 2 # see add_experts_per_group + assert bid is not None + + if self._chunk_experts is None: + self._chunk_experts = [{} for _ in range(self.block_count)] + + self._chunk_experts[bid][name] = data_torch + + if len(self._chunk_experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.chunk_experts.{xid}.{w_name}.weight" + datas.append(self._chunk_experts[bid][ename]) + del self._chunk_experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + elif name.find("experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._chunk_experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + chunk_experts = [k for d in self._chunk_experts for k in d.keys()] + if len(chunk_experts) > 0: + raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}") + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/hunyuan.py b/conversion/hunyuan.py new file mode 100644 index 00000000000..be54f5810b0 --- /dev/null +++ b/conversion/hunyuan.py @@ -0,0 +1,407 @@ +from __future__ import annotations + +import json + +from pathlib import Path +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf, logger + +from .qwen import QwenModel + + +@ModelBase.register("HunYuanMoEV1ForCausalLM") +class HunYuanMoEModel(TextModel): + model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE + + def set_vocab(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + + # 1. Get the pre-tokenizer identifier hash + tokpre = self.get_vocab_base_pre(tokenizer) + + # 2. Reverse-engineer the merges list from mergeable_ranks + merges = [] + vocab = {} + mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute] + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + if len(merged) == 2: # todo this is an assert in Qwen, why? + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # 3. Generate the tokens and toktypes lists + vocab_size = self.hparams["vocab_size"] + assert tokenizer.vocab_size == vocab_size # ty: ignore[unresolved-attribute] + special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute] + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} + tokens: list[str] = [] + toktypes: list[int] = [] + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token = reverse_vocab[i] + tokens.append(token) + if i in special_tokens.values(): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + # 4. Write all vocab-related fields to the GGUF writer + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_token_merges(merges) + + # 5. Add special tokens and chat templates + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) + # FIX for BOS token: Overwrite incorrect id read from config.json + self.gguf_writer.add_bos_token_id(127959) # <|bos|> + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + + self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"]) + + moe_intermediate_size = hparams["moe_intermediate_size"] + assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size) + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0]) + + moe_topk = hparams["moe_topk"] + assert all(topk == moe_topk[0] for topk in moe_topk) + self.gguf_writer.add_expert_used_count(moe_topk[0]) + + moe_shared_expert = hparams["num_shared_expert"] + assert all(n == moe_shared_expert[0] for n in moe_shared_expert) + self.gguf_writer.add_expert_shared_count(moe_shared_expert[0]) + + # Rope + if self.rope_parameters.get("rope_type") == "dynamic": + # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ + # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) + alpha = self.rope_parameters.get("alpha", 1000) + base = self.rope_parameters.get("rope_theta", 10000.0) + dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128 + scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251 + self.gguf_writer.add_rope_freq_base(scaled_base) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_rope_scaling_factor(1) + # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k + self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length + self.gguf_writer.add_context_length(256 * 1024) # 256k context length + + # if any of our assumptions about the values are wrong, something has changed and this may need to be updated + assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \ + "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually" + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name == "lm_head.weight": + if self.hparams.get("tie_word_embeddings", False): + logger.info("Skipping tied output layer 'lm_head.weight'") + return + + if name.find("mlp.experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + if self._experts is not None: + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("HunYuanDenseV1ForCausalLM") +class HunYuanModel(TextModel): + model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE + + def _get_eod_token_id(self) -> int | None: + """Get the actual end-of-generation token from config (eod_token_id).""" + return self.hparams.get("eod_token_id") + + def _get_eot_token_id(self) -> int | None: + """Get the end-of-turn token from generation_config.json. + This is the first entry in eos_token_id when it's a list.""" + gen_cfg_path = self.dir_model / "generation_config.json" + if gen_cfg_path.is_file(): + with open(gen_cfg_path, encoding="utf-8") as f: + gen_cfg = json.load(f) + eos = gen_cfg.get("eos_token_id") + if isinstance(eos, list) and len(eos) >= 2: + return eos[0] + return None + + def _fix_special_tokens(self): + """Fix EOS/EOT tokens that are incorrect in upstream configs.""" + eod_id = self._get_eod_token_id() + if eod_id is not None: + self.gguf_writer.add_eos_token_id(eod_id) + eot_id = self._get_eot_token_id() + if eot_id is not None: + self.gguf_writer.add_eot_token_id(eot_id) + + def set_vocab(self): + if (self.dir_model / "tokenizer.json").is_file(): + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + # HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab + token_types = None + if (self.hparams.get("pad_token_id") or 0) < 0: + token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask') + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True, special_token_types=token_types) + special_vocab.add_to_gguf(self.gguf_writer) + self._fix_special_tokens() + else: + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + + # 1. Get the pre-tokenizer identifier hash + tokpre = self.get_vocab_base_pre(tokenizer) + + # 2. Reverse-engineer the merges list from mergeable_ranks + merges = [] + vocab = {} + mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute] + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + if len(merged) == 2: + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # 3. Generate the tokens and toktypes lists + vocab_size = self.hparams["vocab_size"] + assert tokenizer.vocab_size == vocab_size # ty: ignore[unresolved-attribute] + special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute] + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} + tokens: list[str] = [] + toktypes: list[int] = [] + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token = reverse_vocab[i] + tokens.append(token) + if i in special_tokens.values(): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + # 4. Write all vocab-related fields to the GGUF writer + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_token_merges(merges) + + # 5. Add special tokens and chat templates + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) + # FIX for BOS token: Overwrite incorrect id read from config.json + if self.hparams['hidden_size'] == 4096: + self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token + self._fix_special_tokens() + + def set_gguf_parameters(self): + # HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it + saved_num_experts = self.hparams.pop("num_experts", None) + super().set_gguf_parameters() + if saved_num_experts is not None and saved_num_experts > 1: + self.hparams["num_experts"] = saved_num_experts + hparams = self.hparams + + # Rope + if self.rope_parameters.get("rope_type") in ("dynamic", "xdrope"): + # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ + # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) + alpha = self.rope_parameters.get("alpha", 50) + base = self.rope_parameters.get("rope_theta", 10000.0) + dim = hparams["head_dim"] + scaled_base = base * (alpha ** (dim / (dim - 2))) + self.gguf_writer.add_rope_freq_base(scaled_base) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_rope_scaling_factor(1) + if self.rope_parameters.get("rope_type") == "dynamic": + # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k + self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length + self.gguf_writer.add_context_length(256 * 1024) # 256k context length + + # if any of our assumptions about the values are wrong, something has changed and this may need to be updated + assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \ + "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually" + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name == "lm_head.weight": + if self.hparams.get("tie_word_embeddings", False): + logger.info("Skipping tied output layer 'lm_head.weight'") + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("HunYuanVLForConditionalGeneration") +class HunyuanVLVisionModel(MmprojModel): + # Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name + # "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout. + # Each variant maps to a different projector type in clip.cpp so image + # preprocessing follows the correct code path. + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + # HunyuanOCR / HunyuanVL uses max_image_size instead of image_size + if "image_size" not in self.hparams_vision: + self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048) + + @staticmethod + def is_ocr_variant(hparams: dict) -> bool: + """Return True for HunyuanOCR, False for HunyuanVL. + + The projector's output dim must equal the text model's hidden_size by + construction (that's what "projector" means). HunyuanOCR pairs a 1B text + backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the + ViT -> LLM projection dim is a hard architectural signature, not a + magic number. + """ + vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) + return vision_out == 1024 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + vcfg = self.hparams_vision + + if self.is_ocr_variant(self.global_config): + # --- HunyuanOCR --- + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR) + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5)) + self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2)) + self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"]) + self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"]) + return + + # --- HunyuanVL --- + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL) + self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu") + self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"])) + self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"])) + self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"])) + self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"])) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if not name.startswith("vit."): + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # strip CLS token (row 0) from position embeddings so resize_position_embeddings works + if "position_embedding" in name: + data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd] + yield from super().modify_tensors(data_torch, name, bid) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal + # Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2. + if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"): + return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + +@ModelBase.register("HunYuanVLForConditionalGeneration") +class HunyuanVLTextModel(HunYuanModel): + # The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR + # and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE), + # while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from + # the config and pick the matching GGUF architecture. + model_arch = gguf.MODEL_ARCH.HUNYUAN_VL + + @staticmethod + def _is_ocr_config(hparams: dict) -> bool: + # OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that + # outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with + # HunyuanVLVisionModel.is_ocr_variant. + return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024 + + def __init__(self, dir_model: Path, *args, **kwargs): + raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False) + if self._is_ocr_config(raw_hparams): + self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE + else: + self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL + super().__init__(dir_model, *args, **kwargs) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses + # the HunYuan-Dense arch which already handles standard rope in super(). + if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL: + return + + if self.rope_parameters.get("rope_type") != "xdrope": + return + + # defaults for HunyuanVL. The C++ side later computes: + # freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2)) + self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"])) + self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"])) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_rope_scaling_factor(float(self.rope_parameters.get("factor", 1))) + + ctx_len = int(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(ctx_len) + self.gguf_writer.add_context_length(ctx_len) + + self.gguf_writer.add_rope_dimension_sections(list(self.rope_parameters["xdrope_section"])) diff --git a/conversion/internlm.py b/conversion/internlm.py new file mode 100644 index 00000000000..7e11aca3ce0 --- /dev/null +++ b/conversion/internlm.py @@ -0,0 +1,232 @@ +from __future__ import annotations + +import json +import sys + +from typing import Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, SentencePieceTokenTypes, TextModel, gguf, logger + +from .llama import LlamaModel + + +@ModelBase.register("InternLM2ForCausalLM") +class InternLM2Model(TextModel): + model_arch = gguf.MODEL_ARCH.INTERNLM2 + + def set_vocab(self): + # (TODO): Is there a better way? + # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character + # \x00 specially and convert it into an emoji character to prevent it from being mistakenly + # recognized as an empty string in C++. + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + logger.error(f'Error: Missing {tokenizer_path}') + sys.exit(1) + + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + for token_id in range(vocab_size): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + if text == b"\x00": + # (TODO): fixme + # Hack here and replace the \x00 characters. + logger.warning(f"InternLM2 convert token '{text}' to '🐉'!") + text = "🐉".encode("utf-8") + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + # take care of ununsed raw token + if piece.startswith('[UNUSED'): + toktype = SentencePieceTokenTypes.UNUSED + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + tokens.append(key.encode("utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + chat_eos_token = '<|im_end|>' + chat_eos_token_id = None + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) + for token_id, foken_data in added_tokens_decoder.items(): + token_id = int(token_id) + token = foken_data["content"] + if token == chat_eos_token: + chat_eos_token_id = token_id + token = token.encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') + tokens[token_id] = token + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + if foken_data.get("special"): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + + tokenizer_file = self.dir_model / 'tokenizer.json' + if tokenizer_file.is_file(): + with open(tokenizer_file, "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + added_tokens = tokenizer_json.get("added_tokens", []) + for foken_data in added_tokens: + token_id = int(foken_data["id"]) + token = foken_data["content"] + if token == chat_eos_token: + chat_eos_token_id = token_id + token = token.encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') + tokens[token_id] = token + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + if foken_data.get("special"): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + old_eos = special_vocab.special_token_ids["eos"] + if chat_eos_token_id is not None: + # For the chat model, we replace the eos with '<|im_end|>'. + # TODO: this is a hack, should be fixed + # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048 + special_vocab.special_token_ids["eos"] = chat_eos_token_id + logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" + " in chat mode so that the conversation can end normally.") + + special_vocab.add_to_gguf(self.gguf_writer) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + num_heads = self.hparams["num_attention_heads"] + num_kv_heads = self.hparams["num_key_value_heads"] + n_embd = self.hparams["hidden_size"] + q_per_kv = num_heads // num_kv_heads + head_dim = n_embd // num_heads + num_groups = num_heads // q_per_kv + + if bid is not None and f"model.layers.{bid}.attention.wqkv" in name: + qkv = data_torch + + qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd)) + q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1] + + # The model weights of q and k equire additional reshape. + q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) + k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) + v = v.reshape((-1, v.shape[-1])) + + yield from super().modify_tensors(q, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) + yield from super().modify_tensors(k, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) + yield from super().modify_tensors(v, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) + else: + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("InternLM3ForCausalLM") +class InternLM3Model(TextModel): + model_arch = gguf.MODEL_ARCH.LLAMA + + def set_vocab(self): + tokens, scores, toktypes = self._create_vocab_sentencepiece() + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + + if "added_tokens_decoder" in tokenizer_config_json: + for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items(): + if token_data.get("special"): + token_id = int(token_id) + token = token_data["content"] + special_vocab._set_special_token(token, token_id) + # update eos token + if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids: + special_vocab.special_token_ids["eos"] = token_id + + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith(("mlp", "vision_model")): + # skip visual tensors + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/internvl.py b/conversion/internvl.py new file mode 100644 index 00000000000..9a2a1e43df7 --- /dev/null +++ b/conversion/internvl.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf + + +@ModelBase.register("InternVisionModel") +class InternVisionModel(MmprojModel): + + min_dynamic_tiles: int = 0 + max_dynamic_tiles: int = 0 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.min_dynamic_tiles = self.global_config.get("min_dynamic_patch", 0) + self.max_dynamic_tiles = self.global_config.get("max_dynamic_patch", 0) + + def set_gguf_parameters(self): + assert self.hparams_vision is not None + if isinstance(self.hparams_vision['image_size'], list): + self.hparams_vision['image_size'] = self.hparams_vision['image_size'][0] + if isinstance(self.hparams_vision['patch_size'], list): + self.hparams_vision['patch_size'] = self.hparams_vision['patch_size'][0] + super().set_gguf_parameters() + + hparams = self.hparams + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL) + self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) + # hidden_act + if hparams["hidden_act"] == "silu": + self.gguf_writer.add_vision_use_silu(True) + elif hparams["hidden_act"] == "gelu": + self.gguf_writer.add_vision_use_gelu(True) + else: + raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}") + # downsample_ratio + downsample_ratio = self.global_config.get("downsample_ratio") + assert downsample_ratio is not None + self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio)) + # older models may not have min/max_dynamic_patch in config + if self.min_dynamic_tiles > 0: + self.gguf_writer.add_vision_preproc_min_tiles(self.min_dynamic_tiles) + if self.max_dynamic_tiles > 0: + self.gguf_writer.add_vision_preproc_max_tiles(self.max_dynamic_tiles) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".position_embd." in new_name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + vision_prefix = ['vision_model', 'mlp', 'model.vision_tower', 'model.multi_modal_projector'] + if not any([name.startswith(prefix) for prefix in vision_prefix]): + return None + # deal with intern-s1 special case + names_map = { + "model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias", + "model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight", + "model.multi_modal_projector.linear_1.bias": "mlp1.1.bias", + "model.multi_modal_projector.linear_1.weight": "mlp1.1.weight", + "model.multi_modal_projector.linear_2.bias": "mlp1.3.bias", + "model.multi_modal_projector.linear_2.weight": "mlp1.3.weight", + } + if name in names_map: + name = names_map[name] + # correct name + if name.startswith("vision_model"): + name = "vision_tower." + name + if (".ls" in name or ".lambda_" in name or "position_embedding" in name) and not name.endswith(".weight"): + name += ".weight" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # split QKV tensors if needed + if ".qkv." in name: + if data_torch.ndim == 2: # weight + c3, _ = data_torch.shape + else: # bias + c3 = data_torch.shape[0] + assert c3 % 3 == 0 + c = c3 // 3 + wq = data_torch[:c] + wk = data_torch[c: c * 2] + wv = data_torch[c * 2:] + yield from super().modify_tensors(wq, name.replace("attn.qkv", "self_attn.q_proj"), bid) + yield from super().modify_tensors(wk, name.replace("attn.qkv", "self_attn.k_proj"), bid) + yield from super().modify_tensors(wv, name.replace("attn.qkv", "self_attn.v_proj"), bid) + else: + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/jais.py b/conversion/jais.py new file mode 100644 index 00000000000..00add4c77fc --- /dev/null +++ b/conversion/jais.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +import math + +from typing import Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("Jais2ForCausalLM") +class Jais2Model(TextModel): + model_arch = gguf.MODEL_ARCH.JAIS2 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + head_dim = hparams.get("head_dim", hparams["hidden_size"] // hparams["num_attention_heads"]) + self.gguf_writer.add_rope_dimension_count(head_dim) + + +@ModelBase.register("JAISLMHeadModel") +class JaisModel(TextModel): + model_arch = gguf.MODEL_ARCH.JAIS + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # SwigLU activation + assert self.hparams["activation_function"] == "swiglu" + # ALiBi position embedding + assert self.hparams["position_embedding_type"] == "alibi" + + # Embeddings scale + self.embeddings_scale = 1.0 + if 'mup_embeddings_scale' in self.hparams: + self.embeddings_scale = self.hparams['mup_embeddings_scale'] + elif 'embeddings_scale' in self.hparams: + self.embeddings_scale = self.hparams['embeddings_scale'] + else: + assert False + + self.width_scale = 1.0 + if 'mup_output_alpha' in self.hparams: + assert 'mup_width_scale' in self.hparams + self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale'] + elif 'width_scale' in self.hparams: + self.width_scale = self.hparams['width_scale'] + else: + assert False + + self.max_alibi_bias = 8.0 + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"]) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # we don't need these + if name.endswith((".attn.bias")): + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith(("relative_pe.slopes")): + # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) + # Some other models has max_alibi_bias spelled out explicitly in the hyperparams, + # but Jais's PyTorch model simply precalculates the slope values and places them + # in relative_pes.slopes + n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"])) + first_val = float(data_torch[0].item()) + self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) + + return + + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): + data_torch = data_torch.transpose(1, 0) + + new_name = self.map_tensor_name(name) + + if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): + yield from super().modify_tensors(data_torch * self.embeddings_scale, new_name, bid) + elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): + yield from super().modify_tensors(data_torch * self.width_scale, new_name, bid) + else: + yield from super().modify_tensors(data_torch, new_name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) diff --git a/conversion/jamba.py b/conversion/jamba.py new file mode 100644 index 00000000000..da712ba5014 --- /dev/null +++ b/conversion/jamba.py @@ -0,0 +1,119 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("JambaForCausalLM") +class JambaModel(TextModel): + model_arch = gguf.MODEL_ARCH.JAMBA + + def set_vocab(self): + if (self.dir_model / "tokenizer.model").is_file(): + self._set_vocab_sentencepiece() + else: + self._set_vocab_llama_hf() + self.gguf_writer.add_add_space_prefix(False) + + def set_gguf_parameters(self): + d_model = self.find_hparam(["hidden_size", "mamba_d_model"]) + d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4 + d_inner = self.hparams["mamba_expand"] * d_model + d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16 + # ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 + dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16) + rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6 + n_kv_head = self.hparams["num_key_value_heads"] + attn_offset = self.hparams["attn_layer_offset"] + attn_period = self.hparams["attn_layer_period"] + n_kv_vec = [0 for _ in range(attn_offset)] + [ + n_kv_head if (i - attn_offset) % attn_period == 0 else 0 for i in range(attn_offset, self.block_count) + ] + + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"])) + self.gguf_writer.add_embedding_length(d_model) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(n_kv_vec) + self.gguf_writer.add_ssm_conv_kernel(d_conv) + self.gguf_writer.add_ssm_inner_size(d_inner) + self.gguf_writer.add_ssm_state_size(d_state) + self.gguf_writer.add_ssm_time_step_rank(dt_rank) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_expert_count(self.find_hparam(["num_local_experts", "num_experts"])) + self.gguf_writer.add_expert_used_count(self.find_hparam(["num_experts_per_tok", "num_experts_per_token"])) + self.gguf_writer.add_file_type(self.ftype) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + + # Mini-Jamba + name = name.replace(".moe.", ".feed_forward.") + if bid is not None: + moe_offset = self.hparams["expert_layer_offset"] + moe_period = self.hparams["expert_layer_period"] + + if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0): + name = name.replace(".experts.0.", ".") + + # process the experts separately + if ".feed_forward.experts." in name: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + + # merge the experts into a single 3d tensor + for wid in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + # using the same merged name as qwen2moe + merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight" + + new_name = self.map_tensor_name(merged_name) + + yield new_name, data_torch + return + + new_name = self.map_tensor_name(name) + + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid): + data_torch = data_torch.squeeze() + + if name.endswith(".A_log"): + logger.debug("A_log --> A ==> " + new_name) + data_torch = -torch.exp(data_torch) + + yield (new_name, data_torch) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/januspro.py b/conversion/januspro.py new file mode 100644 index 00000000000..b49691205cc --- /dev/null +++ b/conversion/januspro.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf + +from .llama import LlamaModel + + +@ModelBase.register("JanusForConditionalGeneration") +class JanusProModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA # reuse Llama arch + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Skip vision, aligner, and generation tensors + skip_prefixes = ( + 'model.vision_model.', + 'model.aligner.', + 'model.vqmodel.', + 'model.generation_embeddings.', + 'model.generation_aligner.', + 'model.generation_head.', + ) + if name.startswith(skip_prefixes): + return None + + return super().filter_tensors(item) + + +@ModelBase.register("JanusForConditionalGeneration") +class JanusProVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + if "intermediate_size" not in self.hparams_vision: + mlp_ratio = self.hparams_vision.get("mlp_ratio") + hidden_size = self.hparams_vision.get("hidden_size") + if mlp_ratio is not None and hidden_size is not None: + self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio)) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO) + + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6)) + + hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower() + if hidden_act == "gelu": + self.gguf_writer.add_vision_use_gelu(True) + elif hidden_act == "silu": + self.gguf_writer.add_vision_use_silu(True) + + def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]: + """Map aligner tensors to projector format""" + suffix = ".bias" if name.endswith(".bias") else ".weight" + + if name.startswith("model.aligner."): + local_name = name[len("model.aligner."):] + elif name.startswith("aligner."): + local_name = name[len("aligner."):] + else: + raise ValueError(f"Unsupported Janus aligner prefix: {name}") + + if local_name.startswith("fc1."): + mm_index = 0 + elif local_name.startswith("hidden_layers."): + parts = local_name.split(".", 2) + if len(parts) < 3: + raise ValueError(f"Unexpected Janus aligner tensor name: {name}") + mm_index = int(parts[1]) + 1 + else: + raise ValueError(f"Unsupported Janus aligner tensor: {name}") + + tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix) + return [(tensor_name, data_torch)] + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Skip generation-related components + skip_generation_prefixes = ( + 'model.vqmodel.', + 'vqmodel.', + 'model.generation_embeddings.', + 'generation_embeddings.', + 'model.generation_aligner.', + 'generation_aligner.', + 'model.generation_head.', + 'generation_head.', + ) + if name.startswith(skip_generation_prefixes): + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Handle aligner tensors + if name.startswith(('model.aligner.', 'aligner.')): + yield from self._map_aligner_tensor(data_torch, name) + return + + # Handle vision tensors + if name.startswith(('model.vision_model.', 'vision_model.')): + yield from super().modify_tensors(data_torch, name, bid) + return + + return diff --git a/conversion/kimi_linear.py b/conversion/kimi_linear.py new file mode 100644 index 00000000000..f2e6cda83c1 --- /dev/null +++ b/conversion/kimi_linear.py @@ -0,0 +1,223 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + +from .qwen import QwenModel + + +@ModelBase.register("KimiLinearModel", "KimiLinearForCausalLM") +class KimiLinearModel(TextModel): + """Kimi-Linear model with hybrid MLA+KDA architecture""" + model_arch = gguf.MODEL_ARCH.KIMI_LINEAR + + _experts: list[dict[str, Tensor]] | None = None + + def set_vocab(self): + try: + self._set_vocab_gpt2() + return + except Exception: + pass + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + tokpre = self.get_vocab_base_pre(tokenizer) + + if tokpre == "kimi-k2": + # Build merges list using the approach similar to HunYuanMoE + merges = [] + vocab = {} + mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute] + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + if len(merged) == 2: + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + # Build token list + vocab_size = self.hparams["vocab_size"] + special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute] + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} + tokens: list[str] = [] + toktypes: list[int] = [] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token = reverse_vocab[i] + tokens.append(token) + if i in special_tokens.values(): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_token_merges(merges) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) + # override eos id in config.json with tiktoken eos id + self.gguf_writer.add_eos_token_id(tokenizer.eos_id) # ty: ignore[unresolved-attribute] + else: + raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") + + def set_gguf_parameters(self): + # note: To enable MLA KV cache, attention needs to be converted into MQA (ie: GQA with 1 group) + self.hparams["num_key_value_heads"] = 1 + + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + # KDA & MLA params + # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv + linear_attn_config = self.hparams["linear_attn_config"] + # n_head == 0 for KDA layers, n_head > 0 for MLA layers + # full_attention_layers list will be used to distinguish layer type + _num_kv_heads = list() + _full_attn_layers = linear_attn_config["full_attn_layers"] + for il in range(self.hparams["num_hidden_layers"]): + if il + 1 in _full_attn_layers: + _num_kv_heads.append(self.hparams["num_key_value_heads"]) + else: + _num_kv_heads.append(0) + assert len(_num_kv_heads) == self.hparams["num_hidden_layers"] + self.gguf_writer.add_head_count_kv(_num_kv_heads) + + if (ssm_d_conv := linear_attn_config.get("short_conv_kernel_size")) is not None: + self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) + if (kda_head_dim := linear_attn_config.get("head_dim")) is not None: + self.gguf_writer.add_kda_head_dim(kda_head_dim) + + # MLA params - use add_* methods that handle arch substitution + # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv) + if (q_lora_rank := self.find_hparam(["q_lora_rank", "n_lora_q"], optional=True)) is not None: + self.gguf_writer.add_q_lora_rank(q_lora_rank) + # To enable MLA KV cache, MLA needs to be converted into MQA with larger heads, then decompresses to MHA + kv_lora_rank = self.find_hparam(["kv_lora_rank", "n_lora_kv"], optional=False) + self.gguf_writer.add_kv_lora_rank(kv_lora_rank) + + # MLA head dimensions + # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim + qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") + # Rotation - use qk_rope_head_dim for Kimi + qk_rope_head_dim = self.find_hparam(["qk_rope_head_dim", "n_rot"], optional=False) + self.gguf_writer.add_rope_dimension_count(qk_rope_head_dim) + self.gguf_writer.add_key_length(kv_lora_rank + qk_rope_head_dim) + v_head_dim = self.hparams.get("v_head_dim") + + # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim + if (n_embd_head_k_mla := self.find_hparam(["n_embd_head_k_mla"], optional=True)) is not None: + self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) + elif qk_nope_head_dim is not None: + n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim + self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) + + # n_embd_head_v_mla = v_head_dim + if (n_embd_head_v_mla := self.hparams.get("n_embd_head_v_mla")) is not None: + self.gguf_writer.add_value_length_mla(n_embd_head_v_mla) + elif v_head_dim is not None: + self.gguf_writer.add_value_length_mla(v_head_dim) + + # moe_intermediate_size (1024 for Kimi) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) + # num_shared_experts (1 for Kimi) + self.gguf_writer.add_expert_shared_count(self.hparams["num_shared_experts"]) + # first_k_dense_replace (1 for Kimi - first layer uses dense MLP) + self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"]) + # Routed scaling factor (expert_weights_scale = 2.446 for Kimi) + self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) + + def prepare_tensors(self): + super().prepare_tensors() + if self._experts is not None: + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + logger.info(f"Processing {name}: shape before = {tuple(data_torch.shape)}") + + # Handle KDA conv1d weights + # HuggingFace/vLLM stores as [d_inner, d_conv] (2D), memory layout: conv_step changes fastest + # llama.cpp expects ggml ne = [d_conv, 1, d_inner, 1], memory layout: ne[0]=d_conv changes fastest + # GGUF reverses numpy shape when writing, so numpy (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1] + # Memory layouts match: both have conv_step (d_conv) changing fastest + if name.endswith((".q_conv1d.weight", ".k_conv1d.weight", ".v_conv1d.weight")): + # HF shape: [d_inner, d_conv] e.g. [4096, 4] + # Target numpy shape: (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1] + if data_torch.ndim == 2: + d_inner, d_conv = data_torch.shape + # Reshape to (1, d_inner, 1, d_conv) - memory layout preserved (d_conv fastest) + data_torch = data_torch.reshape(1, d_inner, 1, d_conv) + logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") + elif data_torch.ndim == 3: + # Already 3D [d_inner, 1, d_conv] from unsqueeze + d_inner, _, d_conv = data_torch.shape + data_torch = data_torch.reshape(1, d_inner, 1, d_conv) + logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, 1, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") + + # Handle A_log: iHF stores as [1, 1, num_heads, 1] + # llama.cpp expects ggml ne = [1, num_heads, 1, 1] + # GGUF reverses numpy shape: numpy (1, 1, num_heads, 1) -> ggml ne = [1, num_heads, 1, 1] + if name.endswith(".A_log"): + data_torch = -torch.exp(data_torch) + if name.endswith(".dt_bias"): + name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" + logger.info("Changed dt_bias to dt_proj.bias") + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + # w1: gate, w2: down, w3: up + for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP), + ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP), + ("w3", gguf.MODEL_TENSOR.FFN_UP_EXP)]: + datas: list[Tensor] = [] + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + data_torch = torch.stack(datas, dim=0) + new_name = self.format_tensor_name(tname, bid) + yield from super().modify_tensors(data_torch, new_name, bid) + return + + # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed + if name.endswith("kv_b_proj.weight"): + name_kb = name.replace("kv_b_proj", "k_b_proj") + name_vb = name.replace("kv_b_proj", "v_b_proj") + n_head_kv = self.hparams["num_key_value_heads"] + v_head_dim = self.find_hparam(["n_embd_head_v_mla", "v_head_dim"], optional=False) + qk_nope_head_dim = self.hparams["qk_nope_head_dim"] + logger.info("Split kv_b n_head_kv %d\n" % n_head_kv) + assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) + kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) + k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) + k_b = k_b.transpose(1, 2) + yield from super().modify_tensors(k_b, name_kb, bid) + yield from super().modify_tensors(v_b, name_vb, bid) + return + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/kimivl.py b/conversion/kimivl.py new file mode 100644 index 00000000000..63b8a079b72 --- /dev/null +++ b/conversion/kimivl.py @@ -0,0 +1,154 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf + + +@ModelBase.register("KimiVLForConditionalGeneration") +class KimiVLModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["image_size"] = 64 * 14 # for compatibility + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIVL) + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_projector_scale_factor(2) + # eps is the same as pytorch's default value + assert self.hparams_vision is not None + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-5)) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name + + if not is_vision_tensor: + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "pos_emb.weight" in name: + data_torch = data_torch.view(data_torch.shape[0] * data_torch.shape[1], data_torch.shape[2]) + + if "wqkv" in name: + split_dim = 0 if "weight" in name else -1 + wq, wk, wv = data_torch.chunk(3, dim=split_dim) + yield from super().modify_tensors(wq, name.replace("wqkv", "wq"), bid) + yield from super().modify_tensors(wk, name.replace("wqkv", "wk"), bid) + yield from super().modify_tensors(wv, name.replace("wqkv", "wv"), bid) + else: + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("KimiK25ForConditionalGeneration") +class KimiK25Model(MmprojModel): + """Kimi-K2.5 with MoonViT3d vision encoder""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + assert self.hparams_vision is not None, "Kimi-K2.5 requires vision_config in model config" + + self.merge_kernel_size = tuple(self.hparams_vision.get("merge_kernel_size", [2, 2])) + self.patch_size = self.hparams_vision.get("patch_size", 14) + + # Set image_size for compatibility with base class + # Use position embedding dimensions as image_size reference + pos_emb_h = self.hparams_vision.get("init_pos_emb_height", 64) + self.hparams_vision["image_size"] = pos_emb_h * self.patch_size + + def set_gguf_parameters(self): + # Base class MmprojModel.set_gguf_parameters() already writes: + # - vision_block_count, vision_head_count, vision_embedding_length + # - vision_feed_forward_length, vision_patch_size, image_mean, image_std + # via find_vparam() which handles the vt_* prefixed keys in Kimi-K2.5's config + super().set_gguf_parameters() + assert self.hparams_vision is not None + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIK25) + + # Position embedding parameters (for interpolation) + self.gguf_writer.add_uint32("vision.pos_emb_height", self.hparams_vision.get("init_pos_emb_height", 64)) + self.gguf_writer.add_uint32("vision.pos_emb_width", self.hparams_vision.get("init_pos_emb_width", 64)) + self.gguf_writer.add_uint32("vision.pos_emb_time", self.hparams_vision.get("init_pos_emb_time", 4)) + + # Projector parameters + self.gguf_writer.add_vision_use_gelu(self.hparams_vision.get("projector_hidden_act", "gelu") == "gelu") + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("projector_ln_eps", 1e-5)) + self.gguf_writer.add_vision_projector_scale_factor(self.merge_kernel_size[0]) + + # Image size limits + # Note: in_patch_limit is for images, in_patch_limit_each_frame is for video (not supported yet) + in_patch_limit = self.preprocessor_config.get("in_patch_limit", 16384) + min_patches = 8 # reasonable minimum + pixels_per_patch = self.patch_size ** 2 + self.gguf_writer.add_vision_min_pixels(min_patches * pixels_per_patch) + self.gguf_writer.add_vision_max_pixels(in_patch_limit * pixels_per_patch) + + @staticmethod + def permute(weights: Tensor, n_head: int) -> Tensor: + out_dim, in_dim = weights.shape + head_dim = out_dim // n_head + w = weights.reshape(n_head, head_dim // 4, 2, 2, in_dim) + w = w.permute(0, 2, 1, 3, 4) + return w.reshape(out_dim, in_dim) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Only process vision and projector tensors + is_vision = any(x in name for x in ["vision_tower", "mm_projector"]) + + if not is_vision: + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + assert self.hparams_vision is not None + n_head = self.hparams_vision.get("num_attention_heads", 16) + + # Permute Q/K weights/biases from interleaved to split RoPE format + # This allows using build_rope_2d at runtime without post-permutation. + if "wqkv" in name: + out_dim = data_torch.shape[0] + qkv_dim = out_dim // 3 + head_dim = qkv_dim // n_head + + if "weight" in name: + wq, wk, wv = data_torch[:qkv_dim, :], data_torch[qkv_dim:2 * qkv_dim, :], data_torch[2 * qkv_dim:, :] + wq = self.permute(wq, n_head) + wk = self.permute(wk, n_head) + data_torch = torch.cat([wq, wk, wv], dim=0) + elif "bias" in name: + bq, bk, bv = data_torch[:qkv_dim], data_torch[qkv_dim:2 * qkv_dim], data_torch[2 * qkv_dim:] + bq = bq.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1) + bk = bk.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1) + data_torch = torch.cat([bq, bk, bv], dim=0) + + # Temporal embeddings: (T, 1, C) → (T, C) + if "pos_emb.time_weight" in name: + T, _, C = data_torch.shape + data_torch = data_torch.reshape(T, C) + + # PatchMergerMLP tensor name mapping + # proj.0.weight → proj.linear_1.weight + # proj.2.weight → proj.linear_2.weight + if "mm_projector.proj.0." in name: + name = name.replace(".proj.0.", ".proj.linear_1.") + elif "mm_projector.proj.2." in name: + name = name.replace(".proj.2.", ".proj.linear_2.") + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/lfm2.py b/conversion/lfm2.py new file mode 100644 index 00000000000..f28fccf10f2 --- /dev/null +++ b/conversion/lfm2.py @@ -0,0 +1,256 @@ +from __future__ import annotations + +from typing import Any, Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf + +from .gemma import ConformerAudioModel + + +@ModelBase.register("Lfm2ForCausalLM", "LFM2ForCausalLM") +class LFM2Model(TextModel): + model_arch = gguf.MODEL_ARCH.LFM2 + + def _add_feed_forward_length(self): + ff_dim = self.find_hparam(["block_ff_dim", "intermediate_size"]) + auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"] + ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"] + multiple_of = self.hparams["block_multiple_of"] + + if auto_adjust_ff_dim: + ff_dim = int(2 * ff_dim / 3) + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + ff_dim = int(ffn_dim_multiplier * ff_dim) + ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of) + + self.gguf_writer.add_feed_forward_length(ff_dim) + + def set_gguf_parameters(self): + # set num_key_value_heads only for attention layers + self.hparams["num_key_value_heads"] = [ + self.hparams["num_key_value_heads"] if layer_type != "conv" else 0 + for layer_type in self.hparams["layer_types"] + ] + + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"]) + self._add_feed_forward_length() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if ConformerAudioModel.is_audio_tensor(name): + # skip multimodal tensors + return None + + name = name.replace("lfm.", "model.") # audio + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # conv op requires 2d tensor + if 'conv.conv' in name: + data_torch = data_torch.squeeze(1) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Lfm2Model") +class LFM2ColBertModel(LFM2Model): + model_arch = gguf.MODEL_ARCH.LFM2 + dense_tensor_name = "dense_2" + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if not name.startswith(self.dense_tensor_name): + name = "model." + name + + yield from super().modify_tensors(data_torch, name, bid) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # dense tensor is stored in a separate safetensors file + from safetensors.torch import load_file + tensors_file = self.dir_model / "1_Dense" / "model.safetensors" + assert tensors_file.is_file() + tensor = load_file(tensors_file)["linear.weight"] + self.gguf_writer.add_embedding_length_out(tensor.shape[0]) + yield f"{self.dense_tensor_name}.weight", tensor.clone() + + +@ModelBase.register("Lfm2MoeForCausalLM") +class LFM2MoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.LFM2MOE + + def set_gguf_parameters(self): + # set num_key_value_heads only for attention layers + self.hparams["num_key_value_heads"] = [ + self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0 + for layer_type in self.hparams["layer_types"] + ] + + super().set_gguf_parameters() + + self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) + self.gguf_writer.add_leading_dense_block_count(self.hparams["num_dense_layers"]) + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"]) + + # cache for experts weights for merging + _experts_cache: dict[int, dict[str, Tensor]] = {} + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.endswith(".expert_bias"): + name = name.replace(".expert_bias", ".expert_bias.bias") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # conv op requires 2d tensor + if 'conv.conv' in name: + data_torch = data_torch.squeeze(1) + + # merge expert weights + if 'experts' in name: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + expert_cache = self._experts_cache.setdefault(bid, {}) + expert_cache[name] = data_torch + expert_weights = ["w1", "w2", "w3"] + + # not enough expert weights to merge + if len(expert_cache) < n_experts * len(expert_weights): + return + + for w_name in expert_weights: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{w_name}.weight" + datas.append(expert_cache[ename]) + del expert_cache[ename] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"layers.{bid}.feed_forward.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + + del self._experts_cache[bid] + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + assert not self._experts_cache + + +@ModelBase.register("Lfm2VlForConditionalGeneration") +class LFM2VLModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + # TODO(tarek): for dynamic resolution image_size is not specified, setting here for compatibility + self.hparams_vision["image_size"] = 256 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2) + self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["layer_norm_eps"])) + self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("downsample_factor", 2)) + self.gguf_writer.add_vision_use_gelu(True) + # python notation, e.g. for vision_feature_layer == -1, we pick last layer -> vision_feature_layers_to_drop = 0 + vision_feature_layers_to_drop = -(self.global_config.get("vision_feature_layer", -1) + 1) + self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys) - vision_feature_layers_to_drop) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + name = name.replace("model.vision_tower.", "vision_tower.") + name = name.replace("model.multi_modal_projector.", "multi_modal_projector.") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "patch_embedding.weight" in name: + data_torch = data_torch.view(data_torch.shape[0], 16, 16, 3).permute(0, 3, 1, 2) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Lfm2AudioForConditionalGeneration") +class LFM2AudioModel(ConformerAudioModel): + has_vision_encoder = False + has_audio_encoder = True + model_name = "Lfm2AudioEncoder" + + def get_audio_config(self) -> dict[str, Any] | None: + return self.global_config.get("encoder") + + def set_gguf_parameters(self): + assert self.hparams_audio is not None + self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"] + self.hparams_audio["intermediate_size"] = self.hparams_audio["d_model"] + self.hparams_audio["num_attention_heads"] = self.hparams_audio["n_heads"] + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2A) + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) + self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # skip language model tensors + if name.startswith("lfm."): + return None + + # for training only + if any(p in name for p in ["audio_loss_weight"]): + return None + + # for audio output + if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]): + return None + + return super().filter_tensors(item) + + +@ModelBase.register("Lfm25AudioTokenizer") +class LFM25AudioTokenizer(LFM2Model): + model_arch = gguf.MODEL_ARCH.LFM2 + + def set_vocab(self): + self._set_vocab_none() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_embedding_length_out(self.hparams["output_size"]) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # skip language model tensors + if name == "istft.window" or name.startswith("emb.emb"): + return None + + if name.startswith("lin"): + name = name.replace("lin", "dense_2_out") + + return super().filter_tensors((name, gen)) diff --git a/conversion/lighton_ocr.py b/conversion/lighton_ocr.py new file mode 100644 index 00000000000..ead3200ac18 --- /dev/null +++ b/conversion/lighton_ocr.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from typing import Callable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, gguf + +from .llava import LlavaVisionModel + + +@ModelBase.register("LightOnOCRForConditionalGeneration") +class LightOnOCRVisionModel(LlavaVisionModel): + is_mistral_format = False + use_break_tok = False + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LIGHTONOCR) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + name = name.replace("model.vision_encoder.", "vision_tower.") + name = name.replace("model.vision_projection.", "multi_modal_projector.") + + return super().filter_tensors((name, gen)) diff --git a/conversion/llada.py b/conversion/llada.py new file mode 100644 index 00000000000..98dc9de95b3 --- /dev/null +++ b/conversion/llada.py @@ -0,0 +1,172 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("LLaDAModelLM") +class LLaDAModel(TextModel): + model_arch = gguf.MODEL_ARCH.LLADA + undo_permute = True + + def get_vocab_base(self) -> tuple[list[str], list[int], str]: + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + + vocab_dict = tokenizer.get_vocab() # ty: ignore[unresolved-attribute] + vocab_size = self.hparams.get("vocab_size", len(vocab_dict)) + assert max(vocab_dict.values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()} + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + # Check if it's a special token - treat special tokens as CONTROL tokens + if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder: + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|> + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + return tokens, toktypes, tokpre + + def set_vocab(self): + self._set_vocab_gpt2() + + # LLaDA specific parameters + self.gguf_writer.add_add_bos_token(True) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self._try_set_pooling_type() + + # Add parameters similar to LlamaModel + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if (rope_dim := hparams.get("head_dim")) is None: + n_heads = hparams.get("num_attention_heads", hparams.get("n_heads")) + assert n_heads is not None + rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads + self.gguf_writer.add_rope_dimension_count(rope_dim) + + # Set context length for LLaDA + context_length = self.hparams.get("max_sequence_length", 4096) + self.gguf_writer.add_context_length(context_length) + + # Set embedding length (dimension size) + embedding_length = self.hparams.get("d_model", 4096) + self.gguf_writer.add_embedding_length(embedding_length) + + # Set feed forward length (MLP hidden size) + feed_forward_length = self.hparams.get("mlp_hidden_size", 12288) + self.gguf_writer.add_feed_forward_length(feed_forward_length) + + # LLaDA models use non-causal attention for diffusion, similar to Dream + self.gguf_writer.add_causal_attention(False) + + # LLaDA models don't shift their logits + self.gguf_writer.add_diffusion_shift_logits(False) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads")) + assert n_head is not None + n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads")) + + if self.undo_permute: + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LLaDAModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LLaDAModel.permute(data_torch, n_head, n_kv_head) + + # LLaDA model tensors should be mapped directly since it's the base model + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM") +class LLaDAMoEModel(TextModel): + model_arch = gguf.MODEL_ARCH.LLADA_MOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size) + + self.gguf_writer.add_mask_token_id(156895) + self.gguf_writer.add_causal_attention(False) + self.gguf_writer.add_diffusion_shift_logits(False) + + _experts: list[dict[str, Tensor]] | None = None + + # Copied from: Qwen2MoeModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + # Copied from: Qwen2MoeModel + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/llama.py b/conversion/llama.py new file mode 100644 index 00000000000..41fde5143f8 --- /dev/null +++ b/conversion/llama.py @@ -0,0 +1,312 @@ +from __future__ import annotations + +import json +import math + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register( + "LLaMAForCausalLM", + "LlamaForCausalLM", + "MistralForCausalLM", + "MixtralForCausalLM", + "VLlama3ForCausalLM", + "LlavaForConditionalGeneration", + "VoxtralForConditionalGeneration", + "IQuestCoderForCausalLM", + "LlamaModel") +class LlamaModel(TextModel): + model_arch = gguf.MODEL_ARCH.LLAMA + undo_permute = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # fix for SmolVLM2, missing `num_attention_heads` in config.json + if self.hf_arch == "VLlama3ForCausalLM": + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) + # Mistral consolidated format has no config.json; origin_hf_arch is HF-only. + if self.is_mistral_format: + self.origin_hf_arch = None + else: + hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) + self.origin_hf_arch = hparams.get('architectures', [None])[0] + + def set_vocab(self): + if self.origin_hf_arch == "GlmasrModel": + return self._set_vocab_glmedge() + + if self.is_mistral_format: + return self._set_vocab_mistral() + + path_tekken_json = self.dir_model / "tekken.json" + path_tokenizer_json = self.dir_model / "tokenizer.json" + if path_tekken_json.is_file() and not path_tokenizer_json.is_file(): + self._set_vocab_mistral() + + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + try: + self._set_vocab_llama_hf() + except (FileNotFoundError, TypeError): + # Llama 3 + self._set_vocab_gpt2() + + # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) + if self.hparams.get("vocab_size", 32000) == 32016: + special_vocab = gguf.SpecialVocab( + self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'eot'] + ) + special_vocab._set_special_token("prefix", 32007) + special_vocab._set_special_token("suffix", 32008) + special_vocab._set_special_token("middle", 32009) + special_vocab._set_special_token("eot", 32010) + special_vocab.add_to_gguf(self.gguf_writer) + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + + # Apply to granite small models only + if self.hparams.get("vocab_size", 32000) == 49152: + self.gguf_writer.add_add_bos_token(False) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + + if not self.is_mistral_format: + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor): + # Mirror the BF16 Q/K RoPE permutation site in modify_tensors; the NVFP4 path bypasses it. + if self.undo_permute: + n_head = self.find_hparam(["n_heads", "num_attention_heads"], optional=True) + n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"], optional=True) + if n_head is not None: + if name.endswith("q_proj.weight"): + weight = LlamaModel.permute(weight, n_head, n_head) + scale = LlamaModel.permute(scale, n_head, n_head) + elif name.endswith("k_proj.weight"): + weight = LlamaModel.permute(weight, n_head, n_kv_head) + scale = LlamaModel.permute(scale, n_head, n_kv_head) + super()._repack_nvfp4(name, weight, scale, scale2, input_scale) + + _experts: list[dict[str, Tensor]] | None = None + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if "text_model." in name: + name = name.replace("text_model.", "") # for SmolVLM + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.find_hparam(["n_heads", "num_attention_heads"]) + n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"]) + + if self.hf_arch == "LlamaModel": + name = "model." + name + + if self.undo_permute: + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for wid in ["w1", "w2", "w3"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): + if rope_params.get("rope_type", '').lower() == "llama3": + base = rope_params.get("rope_theta", 10000.0) + if (dim := self.hparams.get("head_dim")) is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_params.get("factor", 8.0) + low_freq_factor = rope_params.get("low_freq_factor", 1.0) + high_freq_factor = rope_params.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + # assert low_freq_wavelen != high_freq_wavelen # Errors for Llama4 + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("ArceeForCausalLM") +class ArceeModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.ARCEE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self._try_set_pooling_type() + + +@ModelBase.register( + "Llama4ForConditionalGeneration", + "Llama4ForCausalLM", +) +class Llama4Model(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA4 + undo_permute = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this + self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"] + self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"] + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"]) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"]) + if "layer_types" in self.hparams: + if all(lt == "full_attention" for lt in self.hparams["layer_types"]): + # all layers are full attention (for MobileLLM), disable swa + self.gguf_writer.add_sliding_window(0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + # split the gate_up into gate and up + if "gate_up_proj" in name: + name_up = name.replace("gate_up_proj", "up_proj.weight") + name_gate = name.replace("gate_up_proj", "gate_proj.weight") + dim_half = data_torch.shape[-1] // 2 + gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2) + yield from super().modify_tensors(gate_proj_weight, name_gate, bid) + yield from super().modify_tensors(up_proj_weight, name_up, bid) + return + + if name.endswith("down_proj"): + name += ".weight" + data_torch = data_torch.transpose(-1, -2) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("LlamaBidirectionalModel") +class LlamaEmbedNemotronModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA_EMBED + + +@ModelBase.register("SmolLM3ForCausalLM") +class SmolLM3Model(LlamaModel): + model_arch = gguf.MODEL_ARCH.SMOLLM3 + + +@ModelBase.register("ApertusForCausalLM") +class ApertusModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.APERTUS + undo_permute = False + + _alpha_n = {} + _alpha_p = {} + _beta = {} + _eps = {} + + def modify_tensors(self, data_torch, name, bid): + # Handle xIELU activation parameters + n_layers = self.hparams["num_hidden_layers"] + if name.endswith(".act_fn.alpha_n"): + self._alpha_n[bid] = data_torch.to("cpu").float().item() + if (len(self._alpha_n) == n_layers): + self.gguf_writer.add_xielu_alpha_n([self._alpha_n[k] for k in sorted(self._alpha_n)]) + return + if name.endswith(".act_fn.alpha_p"): + self._alpha_p[bid] = data_torch.to("cpu").float().item() + if (len(self._alpha_p) == n_layers): + self.gguf_writer.add_xielu_alpha_p([self._alpha_p[k] for k in sorted(self._alpha_p)]) + return + if name.endswith(".act_fn.beta"): + self._beta[bid] = data_torch.to("cpu").float().item() + if (len(self._beta) == n_layers): + self.gguf_writer.add_xielu_beta([self._beta[k] for k in sorted(self._beta)]) + return + if name.endswith(".act_fn.eps"): + self._eps[bid] = data_torch.to("cpu").float().item() + if (len(self._eps) == n_layers): + self.gguf_writer.add_xielu_eps([self._eps[k] for k in sorted(self._eps)]) + return + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/llama4.py b/conversion/llama4.py new file mode 100644 index 00000000000..f84c7629619 --- /dev/null +++ b/conversion/llama4.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf + + +@ModelBase.register("Llama4ForConditionalGeneration") +class Llama4VisionModel(MmprojModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LLAMA4) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"]) + self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"])) + assert self.hparams["hidden_act"] == "gelu" + self.gguf_writer.add_vision_use_gelu(True) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if "multi_modal_projector" not in name and "vision_model" not in name: + return None + + if "positional_embedding_vlm" in name and ".weight" not in name: + name += ".weight" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "multi_modal_projector.linear_1" in name: + # despite the name with number postfix, this is a single fully connected layer + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch) + else: + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/llava.py b/conversion/llava.py new file mode 100644 index 00000000000..31d6e2ad80e --- /dev/null +++ b/conversion/llava.py @@ -0,0 +1,129 @@ +from __future__ import annotations + +import json + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf, logger + +from .llama import LlamaModel + + +@ModelBase.register( + "LlavaForConditionalGeneration", # pixtral + "Mistral3ForConditionalGeneration", # mistral small 3.1 +) +class LlavaVisionModel(MmprojModel): + img_break_tok_id = -1 + use_break_tok = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.hparams.get("model_type") == "pixtral": + # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py + self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5) + if self.use_break_tok: + self.img_break_tok_id = self.get_token_id("[IMG_BREAK]") + elif self.is_mistral_format: + # hparams is already vision config here so norm_eps is only defined in global_config. + self.hparams["norm_eps"] = self.global_config.get("norm_eps", None) + assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json" + if self.use_break_tok: + self.img_break_tok_id = self.find_vparam(["image_break_token_id"]) + + # params.json may ship -1 placeholders (Mistral Medium 3.5) + # resolve the real id from the bundled tokenizer in that case + if self.img_break_tok_id < 0: + self.img_break_tok_id = self.get_mistral_token_id("[IMG_BREAK]") + else: + raise ValueError(f"Unsupported model type: {self.hparams['model_type']}") + logger.info(f"Image break token id: {self.img_break_tok_id}") + + def get_token_id(self, token: str) -> int: + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + added_tokens_decoder = json.load(f).get('added_tokens_decoder') or {} + for id_, token_data in added_tokens_decoder.items(): + if token_data.get("content") == token: + return int(id_) + # fallthrough to tokenizer.json + with open(self.dir_model / "tokenizer.json", "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + for token_data in tokenizer_json["added_tokens"]: + if token_data["content"] == token: + return int(token_data["id"]) + raise ValueError(f"Token '{token}' not found in tokenizer config.") + + def get_mistral_token_id(self, token: str) -> int: + # mistral native format ships tekken.json or a versioned spm tokenizer + tekken_file = self.dir_model / "tekken.json" + if tekken_file.is_file(): + with open(tekken_file, "r", encoding="utf-8") as f: + data = json.load(f) + for entry in data.get("special_tokens", []): + if entry.get("token_str") == token: + return int(entry["rank"]) + tokenizer_json_file = self.dir_model / "tokenizer.json" + if tokenizer_json_file.is_file(): + with open(tokenizer_json_file, "r", encoding="utf-8") as f: + data = json.load(f) + for entry in data.get("added_tokens", []): + if entry.get("content") == token: + return int(entry["id"]) + raise ValueError(f"Token '{token}' not found in mistral tokenizer files.") + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if hparams.get("model_type") == "pixtral": + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL) + self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) + + # hidden_act + if hparams["hidden_act"] == "silu": + self.gguf_writer.add_vision_use_silu(True) + elif hparams["hidden_act"] == "gelu": + self.gguf_writer.add_vision_use_gelu(True) + else: + raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}") + + # spatial_merge_size + if "spatial_merge_size" in self.global_config: + self.gguf_writer.add_vision_spatial_merge_size(self.global_config["spatial_merge_size"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = ( + self.hparams["num_attention_heads"] if not self.is_mistral_format else self.find_vparam(["num_attention_heads"]) + ) + n_kv_head = n_head + + valid_prefixes = ( + "multi_modal_projector.", + "vision_tower.", + "vision_encoder.", + "vision_language_adapter.", + "patch_merger.", + "pre_mm_projector_norm", + ) + + if any(name.startswith(prefix) for prefix in valid_prefixes): + # process vision tensors + if name.endswith(("q_proj.weight", "q_proj.bias")) and not self.is_mistral_format: + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")) and not self.is_mistral_format: + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + yield from super().modify_tensors(data_torch, name, bid) + return + + embed_key = "embed_tokens.weight" if not self.is_mistral_format else "tok_embeddings.weight" + if self.img_break_tok_id > 0 and embed_key in name: + logger.info(f"Extracting [IMG_BREAK] token embedding from {name}") + # for pixtral model, we need to extract the [IMG_BREAK] token embedding + img_break_embd = data_torch[self.img_break_tok_id] + name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK] + yield from super().modify_tensors(img_break_embd, name, bid) + + return # skip other tensors diff --git a/conversion/maincoder.py b/conversion/maincoder.py new file mode 100644 index 00000000000..18b625b08fb --- /dev/null +++ b/conversion/maincoder.py @@ -0,0 +1,14 @@ +from __future__ import annotations + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("MaincoderForCausalLM") +class MaincoderModel(TextModel): + model_arch = gguf.MODEL_ARCH.MAINCODER + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + if (head_dim := self.hparams.get("head_dim")) is not None: + self.gguf_writer.add_rope_dimension_count(head_dim) diff --git a/conversion/mamba.py b/conversion/mamba.py new file mode 100644 index 00000000000..be0e36a29ba --- /dev/null +++ b/conversion/mamba.py @@ -0,0 +1,199 @@ +from __future__ import annotations + +import json + +from pathlib import Path +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") +class MambaModel(TextModel): + model_arch = gguf.MODEL_ARCH.MAMBA + + def __init__(self, dir_model: Path, *args, **kwargs): + # Avoid using AutoConfig for hparams + hparams = kwargs.pop("hparams", None) + if hparams is None: + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + super().__init__(dir_model, *args, hparams=hparams, **kwargs) + + def set_vocab(self): + vocab_size = self.hparams["vocab_size"] + # Round vocab size to next multiple of 8 + pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) + # pad using ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + vocab_size = -(vocab_size // -pad_vocab) * pad_vocab + self.hparams["vocab_size"] = vocab_size + + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + elif (self.dir_model / "tokenizer.model").is_file(): + self._set_vocab_sentencepiece() + else: + # Use the GPT-NeoX tokenizer when no tokenizer files are present + self._set_vocab_builtin("gpt-neox", vocab_size) + + def set_gguf_parameters(self): + d_model = self.find_hparam(["hidden_size", "d_model"]) + d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 + d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model + d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 + # ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 + dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) + rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 + use_dt_b_c_norm = False + # For falconmamba we do apply RMS norm on B / DT and C layers + if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",): + use_dt_b_c_norm = True + # Fail early for models which don't have a block expansion factor of 2 + assert d_inner == 2 * d_model + + self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default + self.gguf_writer.add_embedding_length(d_model) + self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading + self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_ssm_conv_kernel(d_conv) + self.gguf_writer.add_ssm_inner_size(d_inner) + self.gguf_writer.add_ssm_state_size(d_state) + self.gguf_writer.add_ssm_time_step_rank(dt_rank) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers + self.gguf_writer.add_file_type(self.ftype) + + _tok_embd = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) + + new_name = self.map_tensor_name(name) + + if name.endswith(".A_log"): + logger.debug("A_log --> A ==> " + new_name) + data_torch = -torch.exp(data_torch) + + # [4 1 8192 1] -> [4 8192 1 1] + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid): + data_torch = data_torch.squeeze() + + # assuming token_embd.weight is seen before output.weight + if self._tok_embd is not None and new_name == output_name: + if torch.equal(self._tok_embd, data_torch): + logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") + return + elif new_name == tok_embd_name: + self._tok_embd = data_torch + + yield from super().modify_tensors(data_torch, new_name, bid) + + +@ModelBase.register("Mamba2ForCausalLM") +class Mamba2Model(TextModel): + model_arch = gguf.MODEL_ARCH.MAMBA2 + + def __init__(self, dir_model: Path, *args, **kwargs): + # Avoid using AutoConfig for hparams + # It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1 + hparams = kwargs.pop("hparams", None) + if hparams is None: + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + if "llm_config" in hparams: + hparams["text_config"] = hparams["llm_config"] + super().__init__(dir_model, *args, hparams=hparams, **kwargs) + self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"]) + self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model + self.n_group = self.find_hparam(["n_groups"], optional=True) or 1 + + def set_vocab(self): + vocab_size = self.hparams["vocab_size"] + # Round vocab size to next multiple of 16 + pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16) + # pad using ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + vocab_size = -(vocab_size // -pad_vocab) * pad_vocab + self.hparams["vocab_size"] = vocab_size + + if (self.dir_model / "tokenizer.model").is_file(): + self._set_vocab_sentencepiece() + elif (self.dir_model / "tokenizer.model.v3").is_file(): + # mamba-codestral + raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}") + elif (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + else: + # Use the GPT-NeoX tokenizer when no tokenizer files are present + self._set_vocab_builtin("gpt-neox", vocab_size) + + def set_gguf_parameters(self): + d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 + d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128 + head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64 + + rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 + + # Fail early for models which don't have a block expansion factor of 2 + # TODO: does this really matter? + # skip the assertion for FalconH1 Model + if self.model_arch != gguf.MODEL_ARCH.FALCON_H1: + assert self.d_inner == 2 * self.d_model + assert self.d_inner % head_dim == 0 + + self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default + self.gguf_writer.add_embedding_length(self.d_model) + self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading + self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_ssm_conv_kernel(d_conv) + self.gguf_writer.add_ssm_inner_size(self.d_inner) + self.gguf_writer.add_ssm_state_size(d_state) + self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim) + self.gguf_writer.add_ssm_group_count(self.n_group) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_file_type(self.ftype) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith(("model.backbone", "model.lm_head")): + # map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2 + name = name.removeprefix("model.") + + if name.endswith(".dt_bias"): + name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + new_name = self.map_tensor_name(name) + + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid): + data_torch = data_torch.squeeze() + elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [ + gguf.MODEL_TENSOR.SSM_A, + gguf.MODEL_TENSOR.SSM_D, + ]): + # unsqueeze A to use similar shape semantics as Mamba-1 + # (D is also unsqueezed, but for more straightforward broadcast internally) + data_torch = data_torch.reshape((*data_torch.shape, 1)) + elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid): + data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group)) + + if name.endswith(".A_log"): + logger.debug("A_log --> A ==> " + new_name) + data_torch = -torch.exp(data_torch) + + yield (new_name, data_torch) diff --git a/conversion/mimo.py b/conversion/mimo.py new file mode 100644 index 00000000000..d4067aab4b6 --- /dev/null +++ b/conversion/mimo.py @@ -0,0 +1,295 @@ +from __future__ import annotations + +import re + +from typing import Callable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf + + +@ModelBase.register("MiMoV2FlashForCausalLM", "MiMoV2ForCausalLM") +class MimoV2Model(TextModel): + model_arch = gguf.MODEL_ARCH.MIMO2 + + # MiMo V2-Flash, V2.5 and V2.5-Pro all ship 3 trained MTP layers under model.mtp.layers.{0,1,2}. + # The HF config does not expose the count, so it's hardcoded to match the count found in the safetensors. + _n_nextn = 3 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.block_count = self.hparams["num_hidden_layers"] + self._n_nextn + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + @staticmethod + def _tp_aware_qkv_dequant(weight: Tensor, scale_inv: Tensor, + n_q: int, n_kv: int, hd: int, vhd: int, + bs: int = 128) -> Tensor: + # MiMo-V2.5 (TP=4) and V2.5-Pro (TP=8) ship qkv_proj sharded across TP + # ranks; per rank, rows are stacked as [Q_per | K_per | V_per]. + # weight_scale_inv has ceil(rows_per_rank/bs) block-rows per rank (last + # may extend past rows_per_rank with phantom rows not in the weight). + # Naive repeat_interleave aligns rank 0 only and mis-applies scales to + # later ranks once rows_per_rank isn't a multiple of bs. + # Re-group the per-rank [Q_per|K_per|V_per] rows into a single fused + # [Q | K | V] tensor matching the un-sharded original layout. + q_size = n_q * hd + k_size = n_kv * hd + v_size = n_kv * vhd + total_rows = q_size + k_size + v_size + if weight.shape[0] != total_rows: + raise ValueError(f"qkv_proj weight rows {weight.shape[0]} != q+k+v {total_rows}") + + # detect TP from scale_inv block count, descending order so larger matches first + tp = None + for cand in (8, 4): + if total_rows % cand != 0: + continue + rpr = total_rows // cand + bpr = (rpr + bs - 1) // bs + if scale_inv.shape[0] == cand * bpr: + tp = cand + break + if tp is None: + raise ValueError( + f"qkv_proj: cannot detect TP - scale_inv rows {scale_inv.shape[0]}, " + f"q+k+v {total_rows}") + + q_per = q_size // tp + k_per = k_size // tp + v_per = v_size // tp + rows_per_rank = q_per + k_per + v_per + blocks_per_rank = (rows_per_rank + bs - 1) // bs + + scale_inv = scale_inv.float() + # per-row scale-row index: rank * blocks_per_rank + (rr_in_rank // bs) + row_idx = torch.arange(total_rows) + rr = row_idx % rows_per_rank + rank = row_idx // rows_per_rank + scale_row_idx = rank * blocks_per_rank + (rr // bs) + # gather: (total_rows, n_col_blocks) + scale_per_row_block = scale_inv[scale_row_idx] + # expand col-blocks -> cols: each block-col covers `bs` weight cols + scale_full = scale_per_row_block.repeat_interleave(bs, dim=1) + # crop to weight col count (in case last col-block isn't full) + scale_full = scale_full[:, : weight.shape[1]] + dequant = weight.float() * scale_full + + if tp == 1: + return dequant + + # Re-group per-rank [Q_per|K_per|V_per] rows into unified [Q | K | V] + qs, ks, vs = [], [], [] + for r in range(tp): + base = r * rows_per_rank + qs.append(dequant[base : base + q_per]) + ks.append(dequant[base + q_per : base + q_per + k_per]) + vs.append(dequant[base + q_per + k_per : base + rows_per_rank]) + return torch.cat(qs + ks + vs, dim=0) + + def dequant_model(self): + # Capture raw FP8 (weight, scale_inv) lambdas for qkv_proj BEFORE super + # rewrites them with the existing dequant. Replace super's lambda after + # it runs so scale_inv removal still happens via the standard path. + qkv_overrides: dict[str, tuple[Callable, Callable, int]] = {} + qc = self.hparams.get("quantization_config") + if isinstance(qc, dict) and qc.get("quant_method") == "fp8": + pat = re.compile(r"^model\.layers\.(\d+)\.self_attn\.qkv_proj\.weight_scale_inv$") + for name in list(self.model_tensors.keys()): + m = pat.match(name) + if not m: + continue + weight_name = name.removesuffix("_scale_inv") + if weight_name not in self.model_tensors: + continue + qkv_overrides[weight_name] = ( + self.model_tensors[weight_name], + self.model_tensors[name], + int(m.group(1)), + ) + + super().dequant_model() + + if not qkv_overrides: + return + + n_q = self.hparams["num_attention_heads"] + hd = self.hparams["head_dim"] + vhd = self.hparams["v_head_dim"] + hybrid = self.hparams["hybrid_layer_pattern"] + n_layer_text = self.hparams["num_hidden_layers"] + for weight_name, (w_fn, s_fn, bid) in qkv_overrides.items(): + # MTP layers (bid >= n_layer_text) use SWA-style attention dims + is_swa = True if bid >= n_layer_text else hybrid[bid] == 1 + n_kv = self.hparams["swa_num_key_value_heads" if is_swa else "num_key_value_heads"] + self.model_tensors[weight_name] = ( + lambda w_fn=w_fn, s_fn=s_fn, n_q=n_q, n_kv=n_kv, hd=hd, vhd=vhd: + MimoV2Model._tp_aware_qkv_dequant(w_fn(), s_fn(), n_q, n_kv, hd, vhd) + ) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + assert self.hparams["swa_head_dim"] == self.hparams["head_dim"] + assert self.hparams["swa_num_attention_heads"] == self.hparams["num_attention_heads"] + assert self.hparams["swa_v_head_dim"] == self.hparams["v_head_dim"] + assert self.hparams["topk_method"] == "noaux_tc" + + n_head_kv = self.hparams["num_key_value_heads"] + n_head_kv_swa = self.hparams["swa_num_key_value_heads"] + # Extend the per-layer pattern with SWA entries for the MTP blocks so the + # runtime arrays (sized to extended block_count) are fully populated. + hybrid = list(self.hparams["hybrid_layer_pattern"]) + [1] * self._n_nextn + n_head_kv_arr = [n_head_kv_swa if use_swa == 1 else n_head_kv for use_swa in hybrid] + self.gguf_writer.add_head_count_kv(n_head_kv_arr) + + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_sliding_window_pattern(hybrid) + self.gguf_writer.add_value_length(self.hparams["v_head_dim"]) + self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"]) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) + + rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"]) + self.gguf_writer.add_rope_dimension_count(rope_dim) + + self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5)) + + v_scale = self.hparams.get("attention_value_scale") + if v_scale is not None: + self.gguf_writer.add_attn_value_scale(float(v_scale)) + + self.gguf_writer.add_nextn_predict_layers(self._n_nextn) + + _experts: list[dict[str, Tensor]] | None = None + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if "attention_sink" in name and not name.endswith(".weight"): + name += ".weight" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch, name, bid): + # Remap MTP/NextN tensors to additional layer slots so the standard tensor map handles them. + # HF: model.mtp.layers.{i}.foo -> model.layers.{n_layer_text + i}.foo + m = re.match(r"^model\.mtp\.layers\.(\d+)\.(.*)$", name) + if m is not None: + mtp_idx = int(m.group(1)) + assert mtp_idx < self._n_nextn, f"MTP layer index {mtp_idx} >= _n_nextn ({self._n_nextn})" + rest = m.group(2) + n_layer_text = self.hparams["num_hidden_layers"] + new_bid = n_layer_text + mtp_idx + name = f"model.layers.{new_bid}.{rest}" + bid = new_bid + + # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["gate_proj", "up_proj", "down_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename_to_retrieve]) + del self._experts[bid][ename_to_retrieve] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("MiMoV2ForCausalLM") +class MiMoV2VisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + hp = self.hparams_vision + + hp["image_size"] = hp.get("image_size", 560) + hp["num_attention_heads"] = hp.get("num_heads", 32) + hp["num_hidden_layers"] = hp.get("depth", 28) + + self.n_q_heads = int(hp["num_heads"]) + self.num_kv_heads = int(hp.get("num_key_value_heads", 8)) + self.head_dim = int(hp.get("qk_channels", 64)) + self.spatial_merge_size = int(hp["spatial_merge_size"]) + # MiMoV2 vision RMSNorm: HF uses getattr(config, "rms_norm_eps", 1e-6) and the + # field is absent from MiMo-V2.5's vision_config + self.rms_norm_eps = float(hp.get("rms_norm_eps", 1e-6)) + + # fullatt_block_indexes are also reflected in vit_window_attn_types as -1 + self.fullatt_block_indexes = list(hp.get("fullatt_block_indexes") or []) + self.vit_window_attn_types = list(hp.get("vit_window_attn_types") or []) + self.visual_token_window_size = int(hp.get("visual_token_window_size", -1)) + self.use_sink = bool(hp.get("use_sink", False)) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MIMOVL) + self.gguf_writer.add_vision_use_silu(True) + self.gguf_writer.add_vision_head_count_kv(self.num_kv_heads) + self.gguf_writer.add_vision_spatial_merge_size(self.spatial_merge_size) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.WINDOW_SIZE, self.visual_token_window_size) + self.gguf_writer.add_vision_wa_pattern_mode(self.vit_window_attn_types) + self.gguf_writer.add_vision_attention_layernorm_eps(self.rms_norm_eps) + self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"])) + self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"])) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + # Sinks must be F32: any sink-style softmax/mask add in ggml requires + # F32, and we fold sinks into a host-built F32 mask at encode time. + if new_name.endswith(".attn_sinks"): + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, _ = item + if not name.startswith("visual."): + return None + return super().filter_tensors(item) + + def modify_tensors(self, data_torch, name, bid): + # Conv3D patch embed: split along the temporal axis (kt=2) into two Conv2D + # weights that the existing qwen2vl-style two-Conv2D path consumes. + if name == "visual.patch_embed.proj.weight": + _, _, kt, _, _ = data_torch.shape + if kt != 2: + raise ValueError(f"unexpected temporal_patch_size: {kt}") + embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + yield (embd_name + ".weight", data_torch[:, :, 0, ...]) + yield (embd_name + ".weight.1", data_torch[:, :, 1, ...]) + return + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/minicpm.py b/conversion/minicpm.py new file mode 100644 index 00000000000..e9a4c4a74dc --- /dev/null +++ b/conversion/minicpm.py @@ -0,0 +1,184 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf, logger + +from .llama import LlamaModel +from .qwen import Qwen3_5TextModel + + +@ModelBase.register("MiniCPMForCausalLM") +class MiniCPMModel(TextModel): + model_arch = gguf.MODEL_ARCH.MINICPM + + def set_gguf_parameters(self): + super().set_gguf_parameters() + embedding_scale = float(self.hparams["scale_emb"]) + self.gguf_writer.add_embedding_scale(embedding_scale) + logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}") + residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 + self.gguf_writer.add_residual_scale(residual_scale) + logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}") + logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] + self.gguf_writer.add_logit_scale(logit_scale) + logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + + rope_scaling = self.find_hparam(['rope_scaling'], True) + if rope_scaling is not None: + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + # HF models permute some of the tensors, so we need to undo that + if name.endswith(("q_proj.weight")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("MiniCPM3ForCausalLM") +class MiniCPM3Model(TextModel): + model_arch = gguf.MODEL_ARCH.MINICPM3 + + def set_gguf_parameters(self): + hparams = self.hparams + + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: + self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) + self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + rope_scaling = self.find_hparam(['rope_scaling'], True) + if rope_scaling is not None: + rope_dims = self.hparams["qk_rope_head_dim"] + + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + +# MiniCPM-V 4.6: text tower is Qwen3.5 (linear+full hybrid attention) wrapped under +# `model.language_model.*`; vision tower is SigLIP + a window-attention ViT merger +# + a final DownsampleMLP merger. The same HF arch is registered twice below: once as +# the LM (text mode) and once as the mmproj (vision mode), mirroring the Qwen3-VL setup. + +@ModelBase.register("MiniCPMV4_6ForConditionalGeneration") +class MiniCPMV4_6TextModel(Qwen3_5TextModel): + model_arch = gguf.MODEL_ARCH.QWEN35 + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("model.merger."): + return None + # MTP tensors are not used at inference yet; align with Qwen3Next behaviour + if name.startswith("mtp"): + return None + + return super().filter_tensors(item) + + +@ModelBase.register("MiniCPMV4_6ForConditionalGeneration") +class MiniCPMV4_6VisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.hparams_vision is not None: + # In MiniCPM-V 4.6 `vision_config.image_size` (980) describes the SigLIP + # positional embedding bucket grid (70 x 70), while the per-slice processing + # resolution is the preprocessor's `scale_resolution` (typically 448). + # The CLIP loader in tools/mtmd/clip.cpp consumes `clip.vision.image_size` + # as the slice size and warmup resolution, so report `scale_resolution` there + # to match the upstream MiniCPMV4_6ImageProcessorPil slicing rules. + scale_resolution = self.preprocessor_config.get("scale_resolution") + if scale_resolution is not None: + self.hparams_vision["image_size"] = int(scale_resolution) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + + # projector type string is consumed by clip_projector_type_from_string() in clip.cpp + # (mapped to PROJECTOR_TYPE_MINICPMV4_6). + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MINICPMV4_6) + + # ViT merger 2x2 + final merger 2x2 = 4x spatial merge per dimension; used for slice alignment + self.gguf_writer.add_vision_projector_scale_factor(4) + + # borrow wa_layer_indexes for vit_merger insertion point + insert_layer_id = int(self.global_config.get( + "insert_layer_id", self.hparams_vision.get("insert_layer_id", 6))) + self.gguf_writer.add_vision_wa_layer_indexes([insert_layer_id]) + + # SigLIP vision body uses gelu_pytorch_tanh, which matches ggml_gelu (tanh approx). + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_attention_layernorm_eps( + self.hparams_vision.get("layer_norm_eps", 1e-6)) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # lm_head / MTP -> belong to the LM file + if name.startswith(("lm_head.", "mtp")): + return None + + return super().filter_tensors(item) diff --git a/conversion/minimax.py b/conversion/minimax.py new file mode 100644 index 00000000000..4857775cbfb --- /dev/null +++ b/conversion/minimax.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("MiniMaxM2ForCausalLM") +class MiniMaxM2Model(TextModel): + model_arch = gguf.MODEL_ARCH.MINIMAXM2 + _experts_cache: dict[int, dict[str, Tensor]] = {} + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_expert_feed_forward_length(self.find_hparam(["intermediate_size"])) + self.gguf_writer.add_rope_dimension_count(self.find_hparam(["rotary_dim"])) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + # merge expert weights + if 'experts' in name: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + expert_cache = self._experts_cache.setdefault(bid, {}) + expert_cache[name] = data_torch + expert_weights = ["w1", "w2", "w3"] + + # not enough expert weights to merge + if len(expert_cache) < n_experts * len(expert_weights): + return + + for w_name in expert_weights: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" + datas.append(expert_cache[ename]) + del expert_cache[ename] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" + new_name = self.map_tensor_name(merged_name) + yield from super().modify_tensors(data_torch, new_name, bid) + + del self._experts_cache[bid] + return + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/mistral.py b/conversion/mistral.py new file mode 100644 index 00000000000..7a7d6e03933 --- /dev/null +++ b/conversion/mistral.py @@ -0,0 +1,201 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Callable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MistralTokenizerType, MistralVocab, _mistral_common_installed, _mistral_import_error_msg, gguf, logger + +from .deepseek import DeepseekV2Model +from .llama import LlamaModel + +if _mistral_common_installed: + from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found, ty:unresolved-import] + from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import] + from mistral_common.tokens.tokenizers.sentencepiece import SentencePieceTokenizer # type: ignore[import-not-found, ty:unresolved-import] +else: + TokenizerVersion = None # type: ignore[assignment] + Tekkenizer = None # type: ignore[assignment] + SentencePieceTokenizer = None # type: ignore[assignment] + + +class MistralModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.MISTRAL3 + model_name = "Mistral" + hf_arch = "" + is_mistral_format = True + undo_permute = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # for compatibility, we use LLAMA arch for older models + # TODO: remove this once everyone migrates to newer version of llama.cpp + if "llama_4_scaling" not in self.hparams: + self.model_arch = gguf.MODEL_ARCH.LLAMA + self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] + self.gguf_writer.add_architecture() + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def dequant_model(self): + # transform quantization config into HF format + quant_config = self.hparams.get("quantization") + if quant_config is not None: + assert quant_config["qformat_weight"] == "fp8_e4m3" + self.hparams["quantization_config"] = { + "activation_scheme": "static", + "quant_method": "fp8", + "weight_block_size": None, + } + return super().dequant_model() + + @staticmethod + def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool): + assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg + assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), ( + f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}" + ) + + if vocab.tokenizer.version == TokenizerVersion.v1: + return "mistral-v1" + elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.spm: + return "mistral-v3" + elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.tekken: + return "mistral-v3-tekken" + elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.spm: + return "mistral-v7" + elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.tekken: + return "mistral-v7-tekken" + elif vocab.tokenizer.version == TokenizerVersion.v11: + template_file = "Mistral-Small-3.2-24B-Instruct-2506.jinja" + elif vocab.tokenizer.version == TokenizerVersion.v13: + template_file = "unsloth-mistral-Devstral-Small-2507.jinja" + else: + err_message = f"Unknown tokenizer type: {vocab.tokenizer_type} and version {vocab.tokenizer.version}" + if is_mistral_format: + err_message += ( + " . Please pass --disable-mistral-community-chat-template argument to the CLI " + "if you want to skip this error and use the Mistral official `mistral-common` pre-processing library." + ) + raise ValueError(err_message) + + template_path = templates_dir / template_file + if not template_path.exists(): + raise FileNotFoundError(f"Template file not found: {template_path}") + + with open(template_path, "r", encoding="utf-8") as f: + template = f.read() + + return template + + def set_gguf_parameters(self): + super().set_gguf_parameters() + MistralModel.set_mistral_config(self.gguf_writer, self.hparams) + + @staticmethod + def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict): + if "yarn" in hparams: + yarn_params = hparams["yarn"] + mscale_all_dim = 1.0 if not yarn_params["apply_scale"] else 0.0 + gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + gguf_writer.add_rope_scaling_factor(yarn_params["factor"]) + gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"]) + gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"]) + gguf_writer.add_rope_scaling_yarn_log_mul(mscale_all_dim) + gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"]) + + if "llama_4_scaling" in hparams: + gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"]) + + +class MistralMoeModel(DeepseekV2Model): + model_arch = gguf.MODEL_ARCH.DEEPSEEK2 + model_name = "Mistral" + hf_arch = "" + is_mistral_format = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + logger.info("Using MistralMoeModel") + # remap hparams from Mistral MoE format to DeepseekV2 format + # we do this way to be able to reuse DeepseekV2Model set_gguf_parameters logic + # ref: https://github.com/vllm-project/vllm/blob/b294e28db2c5dee61bc25157664edcada8b90b31/vllm/transformers_utils/configs/mistral.py + config = self.hparams + # Mistral key -> HF key + config_mapping = { + "dim": "hidden_size", + "norm_eps": "rms_norm_eps", + "n_kv_heads": "num_key_value_heads", + "n_layers": "num_hidden_layers", + "n_heads": "num_attention_heads", + "hidden_dim": "intermediate_size", + } + # HF key -> (Mistral key, default value) + top_level_mapping_with_default = { + "model_type": ("model_type", "transformer"), + "hidden_act": ("activation", "silu"), + "tie_word_embeddings": ("tied_embeddings", False), + "max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)), + "max_position_embeddings": ("max_position_embeddings", 128_000), + } + # mapping top-level keys + for key, new_key in config_mapping.items(): + if key in config: + config[new_key] = config[key] + for new_key, (key, default_value) in top_level_mapping_with_default.items(): + config[new_key] = config.get(key, default_value) + # mapping MoE-specific keys + moe_config_map = { + "route_every_n": "moe_layer_freq", + "first_k_dense_replace": "first_k_dense_replace", + "num_experts_per_tok": "num_experts_per_tok", + "num_experts": "n_routed_experts", + "expert_hidden_dim": "moe_intermediate_size", + "routed_scale": "routed_scaling_factor", + "num_shared_experts": "n_shared_experts", + "num_expert_groups": "n_group", + "num_expert_groups_per_tok": "topk_group", + } + moe = config["moe"] + for key, new_key in moe_config_map.items(): + if key in moe: + config[new_key] = moe[key] + # provide missing values + config["topk_method"] = None + config["norm_topk_prob"] = True + config["scoring_func"] = "softmax" + + def set_vocab(self): + self._set_vocab_mistral() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + MistralModel.set_mistral_config(self.gguf_writer, self.hparams) + yarn_params = self.hparams["yarn"] + self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"]) + + # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul + # ref https://github.com/ggml-org/llama.cpp/pull/17945 + self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1 + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # rename certain tensors so that we can reuse DeepseekV2Model modify_tensors logic + if name.endswith(".qscale_act"): + name = name.replace(".qscale_act", ".input_scale") + if name.endswith(".qscale_weight"): + name = name.replace(".qscale_weight", ".weight_scale") + if ".wkv_b." in name: + name = name.replace(".wkv_b.", ".kv_b_proj.") + if ".experts." in name: + name = name.replace(".experts.", ".mlp.experts.") + name = name.replace(".w1.", ".gate_proj.") + name = name.replace(".w2.", ".down_proj.") + name = name.replace(".w3.", ".up_proj.") + name = "model." + name + + return super().filter_tensors((name, gen)) diff --git a/conversion/mistral3.py b/conversion/mistral3.py new file mode 100644 index 00000000000..af9438ae705 --- /dev/null +++ b/conversion/mistral3.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + +from .deepseek import DeepseekV2Model +from .llama import LlamaModel + + +@ModelBase.register( + "Mistral3ForConditionalGeneration", + "Ministral3ForCausalLM", +) +class Mistral3Model(TextModel): + class Ministral3Model(LlamaModel): + model_arch = gguf.MODEL_ARCH.MISTRAL3 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + rope_params = self.rope_parameters + if self.hparams.get("model_type") == "ministral3": + assert rope_params, "ministral3 must have 'rope_parameters' config" + assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'" + self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"]) + self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"]) + + class Mistral4Model(DeepseekV2Model): + model_arch = gguf.MODEL_ARCH.MISTRAL4 + skip_mtp = False # model contains no MTP layers, so no need to skip + merge_expert = False # experts are already stacked as 3D + + def modify_tensors(self, data_torch, name, bid): + if name.endswith(".down_proj") or name.endswith(".gate_up_proj"): + name = name + ".weight" + yield from super().modify_tensors(data_torch, name, bid) + + model_arch = gguf.MODEL_ARCH.MISTRAL3 # unused + impl: TextModel + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.hparams.get("model_type") == "mistral4": + self.impl = Mistral3Model.Mistral4Model(*args, **kwargs) + else: + self.impl = Mistral3Model.Ministral3Model(*args, **kwargs) + + def set_vocab(self): + self.impl.set_vocab() + + def set_gguf_parameters(self): + self.impl.set_gguf_parameters() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + yield from self.impl.modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + self.impl.prepare_tensors() + + def write_vocab(self): + self.impl.write_vocab() + + def write(self): + self.impl.write() diff --git a/conversion/mpt.py b/conversion/mpt.py new file mode 100644 index 00000000000..9557ab7fa64 --- /dev/null +++ b/conversion/mpt.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("MPTForCausalLM") +class MPTModel(TextModel): + model_arch = gguf.MODEL_ARCH.MPT + + def set_vocab(self): + try: + self._set_vocab_gpt2() + except Exception: + # Fallback for SEA-LION model + self._set_vocab_sentencepiece() + self.gguf_writer.add_add_bos_token(False) + self.gguf_writer.add_pad_token_id(3) + self.gguf_writer.add_eos_token_id(1) + self.gguf_writer.add_unk_token_id(0) + + def set_gguf_parameters(self): + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) + self.gguf_writer.add_head_count(self.hparams["n_heads"]) + if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): + self.gguf_writer.add_head_count_kv(kv_n_heads) + self.gguf_writer.add_layer_norm_eps(1e-5) + if self.hparams["attn_config"]["clip_qkv"] is not None: + self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) + if self.hparams["attn_config"]["alibi"]: + self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + else: + self.gguf_writer.add_max_alibi_bias(0.0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "scales" in name: + new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales")) + new_name = new_name.replace("scales", "act.scales") + else: + new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias")) + + yield from super().modify_tensors(data_torch, new_name, bid) diff --git a/conversion/nemotron.py b/conversion/nemotron.py new file mode 100644 index 00000000000..dfeeb978582 --- /dev/null +++ b/conversion/nemotron.py @@ -0,0 +1,384 @@ +from __future__ import annotations + +from typing import Any, Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf, logger + +from .granite import GraniteHybridModel + + +@ModelBase.register( + "NemotronH_Nano_VL_V2", + "RADIOModel", +) +class NemotronNanoV2VLModel(MmprojModel): + # ViT-Huge architecture parameters for RADIO v2.5-h + _vit_hidden_size = 1280 + _vit_intermediate_size = 5120 + _vit_num_layers = 32 + _vit_num_heads = 16 + + def get_vision_config(self) -> dict[str, Any] | None: + # RADIO config doesn't have standard ViT parameters, so they need to be constructed manually + vision_config = self.global_config.get("vision_config") + if vision_config is None: + return None + # Add ViT-H parameters + vision_config = { + **vision_config, + "hidden_size": self._vit_hidden_size, + "intermediate_size": self._vit_intermediate_size, + "num_hidden_layers": self._vit_num_layers, + "num_attention_heads": self._vit_num_heads, + "image_size": self.global_config.get("force_image_size", 512), + } + return vision_config + + def set_gguf_parameters(self): + if "image_mean" not in self.preprocessor_config: + self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406] + if "image_std" not in self.preprocessor_config: + self.preprocessor_config["image_std"] = [0.229, 0.224, 0.225] + + super().set_gguf_parameters() + hparams = self.global_config + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL) + self.gguf_writer.add_vision_attention_layernorm_eps(1e-6) + self.gguf_writer.add_vision_use_gelu(True) + downsample_ratio = hparams.get("downsample_ratio", 0.5) + self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio)) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".position_embd." in new_name or "pos_embed" in new_name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if "input_conditioner" in name: + return None + + # mtmd does not support video yet so skip tensors related to video. + if "radio_model.model.patch_generator.video_embedder" in name: + return None + + if not name.startswith("vision_model.radio_model.model.") and not name.startswith("mlp1."): + return None + + if "patch_generator.pos_embed" in name: + if not name.endswith(".weight"): + name += ".weight" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # RADIO's pos_embed doesn't have .weight suffix, but clip.cpp expects it + if "patch_generator.pos_embed" in name: + # Downsample position embeddings for fixed 512x512 image size + import torch.nn.functional as F + n_embd = self.hparams["hidden_size"] + image_size = self.global_config.get("force_image_size", 512) + patch_size = self.hparams["patch_size"] + target_patches_per_side = image_size // patch_size # 32 + max_patches_per_side = int((data_torch.shape[1]) ** 0.5) # 128 + if target_patches_per_side != max_patches_per_side: + # Reshape to grid, interpolate, flatten back + data_torch = data_torch.reshape(1, max_patches_per_side, max_patches_per_side, n_embd) + data_torch = data_torch.permute(0, 3, 1, 2).float() # [1, n_embd, 128, 128] + data_torch = F.interpolate(data_torch, size=(target_patches_per_side, target_patches_per_side), + mode='bilinear', align_corners=True) + data_torch = data_torch.permute(0, 2, 3, 1) # [1, 32, 32, n_embd] + data_torch = data_torch.reshape(1, target_patches_per_side * target_patches_per_side, n_embd) + + # Reshape linear patch embedding to conv2d format for ggml_conv_2d + # From [n_embd, patch_size*patch_size*3] to [n_embd, 3, patch_size, patch_size] + if "patch_generator.embedder" in name: + patch_size = self.hparams["patch_size"] + n_embd = self.hparams["hidden_size"] + data_torch = data_torch.reshape(n_embd, 3, patch_size, patch_size) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("NemotronForCausalLM") +class NemotronModel(TextModel): + model_arch = gguf.MODEL_ARCH.NEMOTRON + + def set_vocab(self): + self._set_vocab_sentencepiece() + self.gguf_writer.add_pad_token_id(0) + self.gguf_writer.add_unk_token_id(1) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"]) + self.gguf_writer.add_layer_norm_eps(f_norm_eps) + + # * Partial RoPE + rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"]) + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + + # * RopeScaling for Nemotron + if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + else: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side + # model.layers.{l}.input_layernorm.weight + # model.layers.{l}.post_attention_layernorm.weight + # model.norm.weight + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("NemotronHForCausalLM") +class NemotronHModel(GraniteHybridModel): + """Hybrid mamba2/attention model from NVIDIA""" + model_arch = gguf.MODEL_ARCH.NEMOTRON_H + is_moe: bool = False + + def __init__(self, *args, **kwargs): + # We have to determine the correct model architecture (MoE vs non-MoE) before + # calling the parent __init__. This is because the parent constructor + # uses self.model_arch to build the tensor name map, and all MoE-specific + # mappings would be missed if it were called with the default non-MoE arch. + hparams = ModelBase.load_hparams(args[0], self.is_mistral_format) + has_moe_params = ( + "num_experts_per_tok" in hparams + or (isinstance(hparams.get("llm_config"), dict) and "num_experts_per_tok" in hparams["llm_config"]) + ) + if has_moe_params: + self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE + self.is_moe = True + + super().__init__(*args, **kwargs) + + # Save the top-level head_dim for later + self.head_dim = self.hparams.get("head_dim", self.hparams.get("attention_head_dim")) + assert self.head_dim is not None, "Could not find the attention head dim in config" + + # Don't use expand to calculate d_inner + self.d_inner = self.find_hparam(["num_heads"]) * self.d_model + + # Update the ssm / attn / mlp layers + # M: Mamba2, *: Attention, -: MLP + # MoE: + # M: Mamba2, *: Attention, E: Expert + pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type") + if pattern is None: + self._ssm_layers = [] + self._mlp_layers = [] + elif isinstance(pattern, str): + self._ssm_layers = [i for i, val in enumerate(pattern) if val == "M"] + self._mlp_layers = [i for i, val in enumerate(pattern) if val == ("E" if self.is_moe else "-")] + else: + self._ssm_layers = [i for i, val in enumerate(pattern) if val == "mamba"] + self._mlp_layers = [i for i, val in enumerate(pattern) if val == "moe"] + + def get_attn_layers(self): + pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type") + if pattern is None: + return [] + assert len(pattern) == self.block_count, f"Mismatch between pattern ({len(pattern)}) and block_count ({self.block_count})!" + if isinstance(pattern, str): + return [i for i, val in enumerate(pattern) if val == "*"] + + return [i for i, val in enumerate(pattern) if val == "attention"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + head_dim = self.head_dim + if head_dim is None: + raise ValueError("Could not find the attention head dim in config") + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + + # Set feed_forward_length + # NOTE: This will trigger an override warning. This is preferable to + # duplicating all the parent logic + if not self.is_moe: + n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"]) + self.gguf_writer.add_feed_forward_length([ + n_ff if i in self._mlp_layers else 0 for i in range(self.block_count) + ]) + else: + moe_intermediate_size = self.hparams["moe_intermediate_size"] + self.gguf_writer.add_feed_forward_length([ + moe_intermediate_size if i in self._mlp_layers else 0 for i in range(self.block_count) + ]) + self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"]) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["moe_shared_expert_intermediate_size"]) + self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"]) + self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"]) + self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) + self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_group_count(self.hparams["n_group"]) + + # number of experts used per token (top-k) + if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + + if (latent_size := self.hparams.get("moe_latent_size")) is not None: + self.gguf_writer.add_moe_latent_size(latent_size) + + def set_vocab(self): + # The NemotronH config uses pattern characters (e.g. '-') that may not + # be supported by the installed transformers version. AutoTokenizer + # internally calls AutoConfig which triggers this parsing failure. + # Using trust_remote_code=True to load the model's own config class. + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + + # Pad vocab size (from Mamba2Model/GraniteHybridModel) + self.hparams["pad_vocab_size_multiple"] = 8 # Setting this here since GraniteHybridModel.set_vocab() isn't being invoked now. + # From Mamba2Model.set_vocab(): + vocab_size = self.hparams["vocab_size"] + pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16) + # ref: https://stackoverflow.com/a/17511341/22827863 + vocab_size = -(vocab_size // -pad_vocab) * pad_vocab + self.hparams["vocab_size"] = vocab_size + + assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute] + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + + added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token: str = reverse_vocab[i] + if token in added_vocab: + if not added_tokens_decoder[i].normalized: + previous_token = token + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment] + if previous_token != token: + logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") + + if added_tokens_decoder[i].special or self.does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + tokens.append(token) + + # From TextModel.set_vocab_gpt2(): + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + + # The tokenizer _does_ add a BOS token (via post_processor type + # TemplateProcessing) but does not set add_bos_token to true in the + # config, so we need to explicitly override it here. + if not self.is_moe: + self.gguf_writer.add_add_bos_token(True) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if self.is_moe and bid is not None: + # Skip Multi-Token Prediction (MTP) tensors. These are used for + # for speculative decoding but we don't include them in this model + # conversion. See https://github.com/ggml-org/llama.cpp/pull/18886 + if name.startswith("mtp."): + logger.info(f"gguf: Skipping MTP (Speculative) layer: {name}") + return + + if name.endswith("mixer.gate.e_score_correction.bias"): + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + return + + if name.endswith("mixer.dt_bias"): + new_name = name.replace("dt_bias", "dt.bias") + yield from ModelBase.modify_tensors(self, data_torch, new_name, bid) + return + + if name.endswith("mixer.conv1d.weight"): + squeezed_data = data_torch.squeeze() + yield from ModelBase.modify_tensors(self, squeezed_data, name, bid) + return + + if name.endswith("mixer.A_log"): + transformed_data = -torch.exp(data_torch) + reshaped_data = transformed_data.squeeze().reshape(-1, 1) + yield from ModelBase.modify_tensors(self, reshaped_data, name, bid) + return + + if name.endswith("mixer.D"): + reshaped_data = data_torch.squeeze().reshape(-1, 1) + yield from ModelBase.modify_tensors(self, reshaped_data, name, bid) + return + + if name.endswith("mixer.norm.weight"): + reshaped_data = data_torch.reshape(self.n_group, -1) + yield from ModelBase.modify_tensors(self, reshaped_data, name, bid) + return + + if name.find("mixer.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 2: + # merge the experts into a single tensor + for w_name in ["down_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"backbone.layers.{bid}.mixer.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from ModelBase.modify_tensors(self, data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/olmo.py b/conversion/olmo.py new file mode 100644 index 00000000000..1664c30e402 --- /dev/null +++ b/conversion/olmo.py @@ -0,0 +1,120 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + +from .llama import LlamaModel + + +@ModelBase.register("OlmoForCausalLM") +@ModelBase.register("OLMoForCausalLM") +class OlmoModel(TextModel): + model_arch = gguf.MODEL_ARCH.OLMO + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_layer_norm_eps(1e-5) + clip_qkv = self.hparams.get("clip_qkv") + if clip_qkv is not None: + self.gguf_writer.add_clamp_kqv(clip_qkv) + + # Same as super class, but permuting q_proj, k_proj + # Copied from: LlamaModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith("q_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith("k_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("SeedOssForCausalLM") +class SeedOssModel(TextModel): + model_arch = gguf.MODEL_ARCH.SEED_OSS + + +@ModelBase.register("Olmo2ForCausalLM") +@ModelBase.register("Olmo3ForCausalLM") +class Olmo2Model(TextModel): + model_arch = gguf.MODEL_ARCH.OLMO2 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + if "sliding_window" in self.hparams: + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + + sliding_window_pattern = [] + if "layer_types" in self.hparams: + sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]] + else: + # Olmo2 does not use sliding window attention. + # Olmo3 defaults to using sliding window for all layers except every 4th. + for i in range(self.hparams["num_hidden_layers"]): + sliding_window_pattern.append((i + 1) % 4 != 0) + + self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) + + +@ModelBase.register("OlmoeForCausalLM") +class OlmoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.OLMOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_layer_norm_rms_eps(1e-5) + + _experts: list[dict[str, Tensor]] | None = None + + # Copied from: Qwen2MoeModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + # Copied from: Qwen2MoeModel + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/openelm.py b/conversion/openelm.py new file mode 100644 index 00000000000..ecc746dc348 --- /dev/null +++ b/conversion/openelm.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +from typing import Any, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("OpenELMForCausalLM") +class OpenELMModel(TextModel): + model_arch = gguf.MODEL_ARCH.OPENELM + + @staticmethod + def _make_divisible(v: float | int, divisor: int) -> int: + # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38 + new_v = max(divisor, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + ffn_multipliers: list[float] = self.hparams["ffn_multipliers"] + ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"] + self._n_embd: int = self.hparams["model_dim"] + self._num_kv_heads: list[int] = self.hparams["num_kv_heads"] + self._num_query_heads: list[int] = self.hparams["num_query_heads"] + self._ffn_dims: list[int] = [ + OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor) + for multiplier in ffn_multipliers + ] + assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) + assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int) + + # Uses the tokenizer from meta-llama/Llama-2-7b-hf + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"]) + + def set_gguf_parameters(self): + n_embd = self._n_embd + head_dim = self.hparams["head_dim"] + rot_pct = 1.0 + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_query_heads) + assert self.block_count == len(self._ffn_dims) + + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams["max_context_length"]) + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(self._ffn_dims) + self.gguf_writer.add_head_count(self._num_query_heads) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"]) + # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30 + self.gguf_writer.add_layer_norm_rms_eps(1e-6) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim)) + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + self.gguf_writer.add_file_type(self.ftype) + + def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: + if "n_layers" in keys: + return self.hparams["num_transformer_layers"] + + return super().find_hparam(keys, optional) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + + # split ff + if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight": + ff_dim = self._ffn_dims[bid] + yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim]) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:]) + return + + yield (self.map_tensor_name(name), data_torch) diff --git a/conversion/orion.py b/conversion/orion.py new file mode 100644 index 00000000000..8dfceeed1f7 --- /dev/null +++ b/conversion/orion.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("OrionForCausalLM") +class OrionModel(TextModel): + model_arch = gguf.MODEL_ARCH.ORION + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + raise ValueError("gguf: can not find ctx length parameter.") + + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + # note: config provides rms norm but it is actually layer norm + # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 + self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) diff --git a/conversion/pangu.py b/conversion/pangu.py new file mode 100644 index 00000000000..42016ba0286 --- /dev/null +++ b/conversion/pangu.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +import json + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("PanguEmbeddedForCausalLM") +class PanguEmbeddedModel(TextModel): + model_arch = gguf.MODEL_ARCH.PANGU_EMBED + + def set_vocab(self): + self._set_vocab_sentencepiece() + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + # PanguEmbedded's hparam loaded from config.json without head_dim + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + if hparams.get("head_dim") is None: + self.gguf_writer.add_key_length(rope_dim) + self.gguf_writer.add_value_length(rope_dim) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name == "lm_head.weight": + if self.hparams.get("tie_word_embeddings", False): + logger.info("Skipping tied output layer 'lm_head.weight'") + return + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/phi.py b/conversion/phi.py new file mode 100644 index 00000000000..5e0d72847aa --- /dev/null +++ b/conversion/phi.py @@ -0,0 +1,390 @@ +from __future__ import annotations + +import json +import math + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, SentencePieceTokenTypes, TextModel, gguf, logger + + +@ModelBase.register("PhiForCausalLM") +class Phi2Model(TextModel): + model_arch = gguf.MODEL_ARCH.PHI2 + + def set_gguf_parameters(self): + rot_pct = self.find_hparam(["partial_rotary_factor"]) + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + + self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) + + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(4 * n_embd) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_add_bos_token(False) + + +@ModelBase.register("Phi3ForCausalLM", "Phi4ForCausalLMV") +class Phi3MiniModel(TextModel): + model_arch = gguf.MODEL_ARCH.PHI3 + + def set_vocab(self): + # Phi-4 model uses GPT2Tokenizer + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + tokenizer_class = tokenizer_config_json['tokenizer_class'] + if tokenizer_class == 'GPT2Tokenizer': + return self._set_vocab_gpt2() + + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + if not tokenizer_path.is_file(): + raise ValueError(f'Error: Missing {tokenizer_path}') + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + token_id = added_tokens_json[key] + if token_id >= vocab_size: + logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) + for token_id, foken_data in added_tokens_decoder.items(): + token_id = int(token_id) + token = foken_data["content"].encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') + tokens[token_id] = token + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + if foken_data.get("special"): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + + tokenizer_file = self.dir_model / 'tokenizer.json' + if tokenizer_file.is_file(): + with open(tokenizer_file, "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + added_tokens = tokenizer_json.get("added_tokens", []) + for foken_data in added_tokens: + token_id = int(foken_data["id"]) + token = foken_data["content"].encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') + tokens[token_id] = token + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + if foken_data.get("special"): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) + rms_eps = self.find_hparam(["rms_norm_eps"]) + max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) + orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) + rot_pct = self.hparams.get("partial_rotary_factor", 1.0) + rope_dims = int(rot_pct * n_embd) // n_head + + self.gguf_writer.add_context_length(max_pos_embds) + self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"])) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_rms_eps(rms_eps) + self.gguf_writer.add_rope_dimension_count(rope_dims) + self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters)["rope_theta"]) + self.gguf_writer.add_file_type(self.ftype) + sliding_window = self.hparams.get("sliding_window") + # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models + if sliding_window is None: + sliding_window = 0 + self.gguf_writer.add_sliding_window(sliding_window) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) + orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) + rot_pct = self.hparams.get("partial_rotary_factor", 1.0) + rope_dims = int(rot_pct * n_embd) // n_head + + # write rope scaling for long context (128k) model + rope_scaling = self.find_hparam(['rope_scaling'], True) + if rope_scaling is None: + return + + scale = max_pos_embds / orig_max_pos_embds + + rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower() + if len(rope_scaling_type) == 0: + raise KeyError('Missing the required key rope_scaling.type') + + if rope_scaling_type == 'su' or rope_scaling_type == 'longrope': + attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 + elif rope_scaling_type == 'yarn': + attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 + else: + raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet') + + self.gguf_writer.add_rope_scaling_attn_factors(attn_factor) + + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.') + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) + + +@ModelBase.register("Phi4ForCausalLMV") +class Phi4VisionMmprojModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + + self.vision_total_layers = int(self.find_vparam(self.n_block_keys)) + if self.vision_total_layers < 2: + raise ValueError( + f"Phi-4 vision mmproj conversion requires at least 2 vision layers, got {self.vision_total_layers}" + ) + + # Phi-4 uses SigLIP2 hidden_states[-2], so export one fewer encoder block and + # drop post-layernorm/head weights. This makes the GGUF runtime output match + # the feature map consumed by the patched siglip.cpp Phi-4 projector path. + self.vision_export_layers = self.vision_total_layers - 1 + self.vision_last_layer_idx = self.vision_total_layers - 1 + + for key in self.n_block_keys: + if key in self.hparams_vision: + self.hparams_vision[key] = self.vision_export_layers + break + + self.block_count = self.vision_export_layers + self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count) + + patch_size = self.preprocessor_config.get("patch_size") + if patch_size is None: + raise KeyError("Phi-4 vision mmproj conversion requires patch_size in preprocessor_config.json") + + self.hparams_vision["patch_size"] = patch_size + + pos_emb_name = next( + ( + name for name in self.model_tensors + if name.endswith("vision_model.embeddings.position_embedding.weight") + ), + None, + ) + if pos_emb_name is None: + raise KeyError("Phi-4 vision mmproj conversion could not find position_embedding.weight") + + pos_emb_shape = self.model_tensors[pos_emb_name]().shape + base_grid_tokens = int(pos_emb_shape[0]) + grid_side = math.isqrt(base_grid_tokens) + if grid_side * grid_side != base_grid_tokens: + raise ValueError(f"Unexpected Phi-4 position embedding shape: {tuple(pos_emb_shape)}") + + self.hparams_vision["image_size"] = grid_side * patch_size + + min_num_patches = self.preprocessor_config.get("min_num_patches", self.global_config.get("min_num_patches")) + max_num_patches = self.preprocessor_config.get("max_num_patches", self.global_config.get("max_num_patches")) + if min_num_patches is None or max_num_patches is None: + raise KeyError("Phi-4 vision mmproj conversion requires min_num_patches and max_num_patches") + + self.min_pixels = int(min_num_patches) * patch_size * patch_size + self.max_pixels = int(max_num_patches) * patch_size * patch_size + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PHI4) + self.gguf_writer.add_vision_min_pixels(self.min_pixels) + self.gguf_writer.add_vision_max_pixels(self.max_pixels) + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6)) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + name = name.replace("model.vision_tower.vision_tower.", "vision_tower.") + + if not name.startswith(("vision_tower.", "model.mm_projector.", "mm_projector.")): + return None + + if ".vision_model.head." in name: + return None + + if ".vision_model.post_layernorm." in name: + return None + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("vision_tower."): + if bid is not None and bid == self.vision_last_layer_idx: + return + + if name.endswith("vision_model.embeddings.patch_embedding.weight"): + assert self.hparams_vision is not None + if data_torch.ndim != 2: + raise ValueError(f"Unexpected Phi-4 patch embedding shape: {tuple(data_torch.shape)}") + + patch_area = self.hparams_vision["patch_size"] ** 2 + in_features = data_torch.shape[1] + if in_features % patch_area != 0: + raise ValueError( + f"Phi-4 patch embedding input dim {in_features} is not divisible by patch area {patch_area}" + ) + + num_channels = in_features // patch_area + patch_size = self.hparams_vision["patch_size"] + data_torch = data_torch.view(data_torch.shape[0], patch_size, patch_size, num_channels) + data_torch = data_torch.permute(0, 3, 1, 2) + + yield from super().modify_tensors(data_torch, name, bid) + return + + if name.startswith(("model.mm_projector.", "mm_projector.")): + local_name = name + local_name = local_name.replace("model.mm_projector.", "") + local_name = local_name.replace("mm_projector.", "") + + if not (local_name.startswith("0.") or local_name.startswith("2.")): + return + + suffix = ".bias" if local_name.endswith(".bias") else ".weight" + mm_idx = int(local_name.split(".", maxsplit=1)[0]) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_idx, suffix=suffix), data_torch) + return + + return + + +@ModelBase.register("PhiMoEForCausalLM") +class PhiMoeModel(Phi3MiniModel): + model_arch = gguf.MODEL_ARCH.PHIMOE + + _experts: list[dict[str, Tensor]] | None = None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_expert_used_count(self.find_hparam(["num_experts_per_tok", "num_experts_per_token"])) + self.gguf_writer.add_expert_count(self.find_hparam(["num_local_experts", "num_experts"])) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["w1", "w2", "w3"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/pixtral.py b/conversion/pixtral.py new file mode 100644 index 00000000000..acd9ce1cf70 --- /dev/null +++ b/conversion/pixtral.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from typing import Sequence + +from .base import gguf + +from .llava import LlavaVisionModel + + +class PixtralModel(LlavaVisionModel): + model_name = "Pixtral" + hf_arch = "" + is_mistral_format = True + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL) + + self.gguf_writer.add_vision_attention_layernorm_eps( + self.find_hparam(["norm_eps"]) + ) + self.gguf_writer.add_rope_freq_base(self.find_vparam(["rope_theta"])) + + self.gguf_writer.add_vision_use_silu(True) + + # spatial_merge_size + if self.find_vparam(["mm_projector_id"], optional=True) == "patch_merge": + self.gguf_writer.add_vision_spatial_merge_size( + self.find_vparam(["spatial_merge_size"]) + ) + + def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: + if name == "vision_language_adapter.w_in.weight": + return "mm.1.weight" + elif name == "vision_language_adapter.w_in.bias": + return "mm.1.bias" + elif name == "vision_language_adapter.w_out.weight": + return "mm.2.weight" + elif name == "vision_language_adapter.w_out.bias": + return "mm.2.bias" + return super().map_tensor_name(name, try_suffixes) diff --git a/conversion/plamo.py b/conversion/plamo.py new file mode 100644 index 00000000000..c4bcbdf06bc --- /dev/null +++ b/conversion/plamo.py @@ -0,0 +1,195 @@ +from __future__ import annotations + +import json + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("PlamoForCausalLM") +class PlamoModel(TextModel): + model_arch = gguf.MODEL_ARCH.PLAMO + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + hparams = self.hparams + + self.gguf_writer.add_context_length(4096) # not in config.json + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong + self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + def shuffle_attn_q_weight(self, data_torch): + assert data_torch.size() == (5120, 5120) + data_torch = data_torch.reshape(8, 5, 128, 5120) + data_torch = torch.permute(data_torch, (1, 0, 2, 3)) + data_torch = torch.reshape(data_torch, (5120, 5120)) + return data_torch + + def shuffle_attn_output_weight(self, data_torch): + assert data_torch.size() == (5120, 5120) + data_torch = data_torch.reshape(5120, 8, 5, 128) + data_torch = torch.permute(data_torch, (0, 2, 1, 3)) + data_torch = torch.reshape(data_torch, (5120, 5120)) + return data_torch + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + new_name = self.map_tensor_name(name) + + # shuffle for broadcasting of gqa in ggml_mul_mat + if new_name.endswith("attn_q.weight"): + data_torch = self.shuffle_attn_q_weight(data_torch) + elif new_name.endswith("attn_output.weight"): + data_torch = self.shuffle_attn_output_weight(data_torch) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Plamo2ForCausalLM", "PLaMo2ForCausalLM") +class Plamo2Model(TextModel): + model_arch = gguf.MODEL_ARCH.PLAMO2 + + def set_vocab(self): + self._set_vocab_plamo() + + def set_gguf_parameters(self): + hparams = self.hparams + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + # Which layers are Mamba layers + # PLaMo 2 uses mamba_step to indicate the pattern (e.g., 2 means every other layer) + # This logic matches modeling_plamo.py's is_mamba function + mamba_step = hparams.get("mamba_step", 2) + mamba_enabled = hparams.get("mamba_enabled", True) + num_key_value_heads = [] + num_attention_heads = [] + + if mamba_enabled: + for i in range(self.block_count): + if self.block_count <= (mamba_step // 2): + # use attention in last layer + is_mamba = (i != self.block_count - 1) + else: + is_mamba = (i % mamba_step) != (mamba_step // 2) + if is_mamba: + num_key_value_heads.append(0) + num_attention_heads.append(0) + else: + num_key_value_heads.append(hparams.get("num_key_value_heads", 4)) + num_attention_heads.append(hparams.get("num_attention_heads", 32)) + + if num_key_value_heads and num_attention_heads: + self.gguf_writer.add_head_count_kv(num_key_value_heads) + self.gguf_writer.add_head_count(num_attention_heads) + + self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048)) + self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096)) + self.gguf_writer.add_key_length(hparams.get("hidden_size_per_head", 128)) + self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128)) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06)) + self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("rope_theta", 10000)) + + # Mamba parameters + self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64)) + self.gguf_writer.add_ssm_conv_kernel(hparams.get("mamba_d_conv", 4)) + self.gguf_writer.add_ssm_time_step_rank(hparams.get("mamba_num_heads", 64)) + intermediate_size = hparams.get("mamba_num_heads", 64) * hparams.get("hidden_size_per_head", 128) + self.gguf_writer.add_ssm_inner_size(intermediate_size) + self.gguf_writer.add_ssm_group_count(0) + + # MLP feed forward parameters (for attention layers) + self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312)) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith(".A_log"): + data_torch = -torch.exp(data_torch) + elif name.endswith(".dt_bias"): + name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" + elif name.endswith(".dt_norm_weight"): + name = name.rpartition(".dt_norm_weight")[0] + ".dt_norm.weight" + elif name.endswith(".B_norm_weight"): + name = name.rpartition(".B_norm_weight")[0] + ".B_norm.weight" + elif name.endswith(".C_norm_weight"): + name = name.rpartition(".C_norm_weight")[0] + ".C_norm.weight" + elif name.endswith(".k_weight"): + name = name.rpartition(".k_weight")[0] + ".k.weight" + elif name.endswith(".q_weight"): + name = name.rpartition(".q_weight")[0] + ".q.weight" + elif name.endswith(".conv1d.weight"): + data_torch = torch.squeeze(data_torch) # remove (, 1, ) + assert data_torch.ndim == 2 + elif name.endswith(".pre_mixer_norm.weight"): + data_torch += 1.0 + elif name.endswith(".post_mixer_norm.weight"): + data_torch += 1.0 / 5 + elif name.endswith(".pre_mlp_norm.weight"): + data_torch += 1.0 + elif name.endswith(".post_mlp_norm.weight"): + data_torch += 1.0 / (5**1.5) + elif name.endswith(".norm.weight"): + data_torch += 1.0 + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Plamo3ForCausalLM", "PLaMo3ForCausalLM") +class Plamo3Model(TextModel): + model_arch = gguf.MODEL_ARCH.PLAMO3 + + def set_vocab(self): + self._set_vocab_plamo() + + tokenizer_config_path = self.dir_model / "tokenizer_config.json" + tokenizer_config = {} + + if tokenizer_config_path.is_file(): + with open(tokenizer_config_path, encoding="utf-8") as f: + tokenizer_config = json.load(f) + + chat_template = tokenizer_config.get("chat_template") + chat_template_jinja = self.dir_model / "chat_template.jinja" + + if chat_template_jinja.is_file(): + with open(chat_template_jinja, encoding="utf-8") as f: + chat_template = f.read() + + if chat_template: + self.gguf_writer.add_chat_template(chat_template) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + if (sliding_window := self.find_hparam(["window_size", "sliding_window"], optional=True)) is not None: + self.gguf_writer.add_sliding_window(sliding_window) + self.gguf_writer.add_sliding_window_pattern(self.hparams["sliding_window_pattern"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + + if name.endswith(".pre_mixer_norm.weight"): + data_torch = data_torch + 1.0 + elif name.endswith(".post_mixer_norm.weight"): + data_torch = data_torch + 1.0 / 5 + elif name.endswith(".pre_mlp_norm.weight"): + data_torch = data_torch + 1.0 + elif name.endswith(".post_mlp_norm.weight"): + data_torch = data_torch + 1.0 / (5**1.5) + elif name.endswith((".mixer.q_norm.weight", ".mixer.k_norm.weight")): + data_torch = data_torch + 1.0 + elif name.endswith(".norm.weight"): + data_torch = data_torch + 1.0 + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/plm.py b/conversion/plm.py new file mode 100644 index 00000000000..3fde487085b --- /dev/null +++ b/conversion/plm.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("PLMForCausalLM") +class PLMModel(TextModel): + model_arch = gguf.MODEL_ARCH.PLM + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(hparams["v_head_dim"]) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + def prepare_tensors(self): + super().prepare_tensors() diff --git a/conversion/qwen.py b/conversion/qwen.py new file mode 100644 index 00000000000..919ecddcb91 --- /dev/null +++ b/conversion/qwen.py @@ -0,0 +1,544 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("QWenLMHeadModel") +class QwenModel(TextModel): + model_arch = gguf.MODEL_ARCH.QWEN + + @staticmethod + def token_bytes_to_string(b): + from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode # ty: ignore[unresolved-import] + byte_encoder = bytes_to_unicode() + return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) + + @staticmethod + def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: + parts = [bytes([b]) for b in token] + while True: + min_idx = None + min_rank = None + for i, pair in enumerate(zip(parts[:-1], parts[1:])): + rank = mergeable_ranks.get(pair[0] + pair[1]) + if rank is not None and (min_rank is None or rank < min_rank): + min_idx = i + min_rank = rank + if min_rank is None or (max_rank is not None and min_rank >= max_rank): + break + assert min_idx is not None + parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] + return parts + + def set_vocab(self): + self._set_vocab_qwen() + + +@ModelBase.register( + "Qwen2Model", + "Qwen2ForCausalLM", + "Qwen2AudioForConditionalGeneration", + "KORMoForCausalLM", + "AudioFlamingo3ForConditionalGeneration", + "DotsOCRForCausalLM", +) +class Qwen2Model(TextModel): + model_arch = gguf.MODEL_ARCH.QWEN2 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self._try_set_pooling_type() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if self.hf_arch == "Qwen2Model": + name = f"model.{name}" # map to Qwen2ForCausalLM tensors + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Qwen2MoeForCausalLM") +class Qwen2MoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.QWEN2MOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") + if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None: + self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size) + logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}") + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # handle aggregated expert tensors + # GGUF stores dimensions reversed from PyTorch, so: + # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A} + # Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp) + # Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down + if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"): + mapped = f"{name}.weight" if not name.endswith(".weight") else name + # HF: [n_expert, n_embd, n_ff] -> GGML: {n_ff, n_embd, n_expert} + yield from super().modify_tensors(data_torch, mapped, bid) + return + + if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"): + if data_torch.ndim < 3 or data_torch.shape[-2] % 2 != 0: + raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}") + # HF: [n_expert, 2*n_ff, n_embd] -> split on dim=-2 + n_ff = data_torch.shape[-2] // 2 + gate = data_torch[..., :n_ff, :].contiguous() + up = data_torch[..., n_ff:, :].contiguous() + # gate/up: [n_expert, n_ff, n_embd] -> GGML: {n_embd, n_ff, n_expert} + base_name = name.removesuffix(".weight").removesuffix(".gate_up_proj") + mapped_gate = f"{base_name}.gate_proj.weight" + mapped_up = f"{base_name}.up_proj.weight" + yield from super().modify_tensors(gate, mapped_gate, bid) + yield from super().modify_tensors(up, mapped_up, bid) + return + + if name.find("experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("Qwen3ForCausalLM", "Qwen3Model") +class Qwen3Model(Qwen2Model): + model_arch = gguf.MODEL_ARCH.QWEN3 + + # extra logic for rerank models + is_rerank: bool = False + is_tied_embeddings: bool = False + token_false_id: int | None = None + token_true_id: int | None = None + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # track for intern-s1-mini + hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) + self.origin_hf_arch = hparams.get('architectures', [None])[0] + + if self._is_qwen3_reranker(): + self._find_rerank_config() + + def _is_qwen3_reranker(self) -> bool: + readme_path = self.dir_model / "README.md" + readme_text = "" + if readme_path.exists(): + with readme_path.open("r", encoding="utf-8") as f: + readme_text = f.read() + + name_hints = [ + str(self.dir_model.name), + str(self.hparams.get("_name_or_path", "")), + str(self.hparams.get("model_type", "")), + str(self.origin_hf_arch or ""), + ] + name_hints = [hint.lower() for hint in name_hints if hint] + + if "# qwen3-reranker" in readme_text.lower() or "# qwen3-vl-reranker" in readme_text.lower(): + return True + + if any("qwen3-reranker" in hint or "qwen3-vl-reranker" in hint for hint in name_hints): + return True + + return "sequenceclassification" in (self.origin_hf_arch or "").lower() + + def set_vocab(self): + # deal with intern-s1-mini + if self.origin_hf_arch == 'InternS1ForConditionalGeneration': + self._set_vocab_interns1() + return + + super().set_vocab() + + def _find_rerank_config(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + + self.is_rerank = True + self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False) + self.token_false_id = tokenizer.convert_tokens_to_ids("no") # ty: ignore[unresolved-attribute, invalid-assignment] + self.token_true_id = tokenizer.convert_tokens_to_ids("yes") # ty: ignore[unresolved-attribute, invalid-assignment] + self.sep_token_id = tokenizer.convert_tokens_to_ids("|") # ty: ignore[unresolved-attribute] + + assert self.token_false_id is not None and self.token_true_id is not None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if self.is_rerank: + self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK) + self.gguf_writer.add_classifier_output_labels(["yes", "no"]) + self.gguf_writer.add_chat_template([{ + "name": "rerank", + "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n" + "<|im_start|>user\n: Given a web search query, retrieve relevant passages that answer the query\n: {query}\n: {document}<|im_end|>\n" + "<|im_start|>assistant\n\n\n\n\n" + }]) + + def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor: + # extract "yes" and "no" tokens from the output lm_head tensor + false_row = data_torch[self.token_false_id] + true_row = data_torch[self.token_true_id] + return torch.stack([true_row, false_row], dim=0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if self.is_rerank: + is_tied_head = self.is_tied_embeddings and "embed_tokens" in name + is_real_head = not self.is_tied_embeddings and "lm_head" in name + if is_tied_head or is_real_head: + cls_out_head = ( + gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight", + self._get_cls_out_tensor(data_torch), + ) + yield cls_out_head + if is_tied_head: + yield from super().modify_tensors(data_torch, name, bid) + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Qwen3MoeForCausalLM") +class Qwen3MoeModel(Qwen2MoeModel): + model_arch = gguf.MODEL_ARCH.QWEN3MOE + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + hparams = ModelBase.load_hparams(self.dir_model, False) + self.origin_hf_arch = hparams.get('architectures', [None])[0] + + def set_vocab(self): + # deal with intern-s1 + if self.origin_hf_arch == 'InternS1ForConditionalGeneration': + self._set_vocab_interns1() + return + + super().set_vocab() + + +@ModelBase.register("Qwen3NextForCausalLM") +class Qwen3NextModel(Qwen2MoeModel): + model_arch = gguf.MODEL_ARCH.QWEN3NEXT + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_ssm_conv_kernel(self.hparams["linear_conv_kernel_dim"]) + self.gguf_writer.add_ssm_state_size(self.hparams["linear_key_head_dim"]) + self.gguf_writer.add_ssm_group_count(self.hparams["linear_num_key_heads"]) + self.gguf_writer.add_ssm_time_step_rank(self.hparams["linear_num_value_heads"]) + self.gguf_writer.add_ssm_inner_size(self.hparams["linear_value_head_dim"] * self.hparams["linear_num_value_heads"]) + self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4)) + if (rope_dim := self.hparams.get("head_dim")) is None: + rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25))) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("mtp"): + # ignore MTP layers for now + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith(".A_log"): + data_torch = -torch.exp(data_torch) + elif name.endswith(".dt_bias"): + name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" + elif "conv1d" in name: + data_torch = data_torch.squeeze() + elif name.endswith("norm.weight") and not name.endswith("linear_attn.norm.weight"): + data_torch = data_torch + 1 + + if "in_proj_qkvz.weight" in name: + # original order: [q, k, v, z] * head_count + # corrected order: [q * head_count, k * head_count, v * head_count, z * head_count] + head_k_dim = self.hparams["linear_key_head_dim"] + head_v_dim = self.hparams["linear_value_head_dim"] + num_v_heads = self.hparams["linear_num_value_heads"] + num_k_heads = self.hparams["linear_num_key_heads"] + hidden_size = self.hparams["hidden_size"] + split_arg_list_qkvz = [ + head_k_dim, # q partition + head_k_dim, # k partition + (num_v_heads // num_k_heads * head_v_dim), # v partition + (num_v_heads // num_k_heads * head_v_dim), # z partition + ] + # view as (n_embd, head_count, [q+k+v+z]) + data_torch = data_torch.permute(1, 0).contiguous() + data_torch = data_torch.view(-1, num_k_heads, sum(split_arg_list_qkvz)) + # split into q, k, v, z + q, k, v, z = torch.split(data_torch, split_arg_list_qkvz, dim=-1) + # flatten dim + head_count + q = q.contiguous().view(hidden_size, -1) + k = k.contiguous().view(hidden_size, -1) + v = v.contiguous().view(hidden_size, -1) + z = z.contiguous().view(hidden_size, -1) + # stack back + qkv = torch.cat([q, k, v], dim=-1).permute(1, 0).contiguous() + z = z.permute(1, 0).contiguous() + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, ".weight"), qkv) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_GATE, bid, ".weight"), z) + else: + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("RND1") +class RND1Model(Qwen2MoeModel): + model_arch = gguf.MODEL_ARCH.RND1 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # RND1 specific parameters + # RND1 uses bidirectional attention + self.gguf_writer.add_causal_attention(False) + + if (mask_token_id := self.hparams.get("mask_token_id")) is not None: + self.gguf_writer.add_mask_token_id(mask_token_id) + + +class _LinearAttentionVReorderBase(Qwen3NextModel): + model_arch = gguf.MODEL_ARCH.QWEN3NEXT # overridden by subclasses + """reorders V heads from grouped to tiled order for ggml broadcast + + see https://github.com/ggml-org/llama.cpp/pull/19468#discussion_r2786394306 + + Linear attention may has num_k_heads < num_v_heads. The HF weights store + V heads grouped by K head: [G0_v0..v{r-1}, G1_v0..v{r-1}, ...]. + ggml binary ops use tiled broadcast: [K0, K1, ..., K0, K1, ...]. + We reorder V heads to tiled order so ggml_repeat can replace the expensive + interleaved repeat: [G0_v0, G1_v0, ..., G0_v1, G1_v1, ...]. + """ + + @staticmethod + def _reorder_v_heads(tensor: Tensor, dim: int, num_k_heads: int, num_v_per_k: int, head_dim: int) -> Tensor: + """Reorder V heads from grouped (by K head) to tiled order along the given dimension.""" + shape = list(tensor.shape) + if dim < 0: + dim += len(shape) + new_shape = shape[:dim] + [num_k_heads, num_v_per_k, head_dim] + shape[dim + 1:] + tensor = tensor.reshape(*new_shape) + perm = list(range(len(new_shape))) + perm[dim], perm[dim + 1] = perm[dim + 1], perm[dim] + return tensor.permute(*perm).contiguous().reshape(*shape) + + def _transform_nvfp4_weight(self, name: str, weight: Tensor, scale: Tensor) -> tuple[Tensor, Tensor]: + if not name.endswith(( + ".linear_attn.in_proj_qkv.weight", + ".linear_attn.in_proj_z.weight", + ".linear_attn.in_proj_a.weight", + ".linear_attn.in_proj_b.weight", + ".linear_attn.out_proj.weight", + )): + return weight, scale + + num_k_heads = self.hparams["linear_num_key_heads"] + num_v_heads = self.hparams["linear_num_value_heads"] + head_k_dim = self.hparams["linear_key_head_dim"] + head_v_dim = self.hparams["linear_value_head_dim"] + num_v_per_k = num_v_heads // num_k_heads + + def unpack_nibbles(qs: Tensor) -> Tensor: + lo = torch.bitwise_and(qs, 0x0F) + hi = torch.bitwise_right_shift(qs, 4) + return torch.stack((lo, hi), dim=-1).reshape(*qs.shape[:-1], qs.shape[-1] * 2) + + def pack_nibbles(codes: Tensor) -> Tensor: + codes = codes.reshape(*codes.shape[:-1], codes.shape[-1] // 2, 2) + lo = torch.bitwise_and(codes[..., 0], 0x0F) + hi = torch.bitwise_left_shift(torch.bitwise_and(codes[..., 1], 0x0F), 4) + return torch.bitwise_or(lo, hi).contiguous() + + def apply_col_perm(qs: Tensor, scales: Tensor, col_perm: Tensor) -> tuple[Tensor, Tensor]: + assert qs.ndim >= 2 + assert scales.ndim >= 2 + + k = qs.shape[-1] * 2 + assert col_perm.numel() == k + assert k % 16 == 0 + + group_cols = col_perm.reshape(-1, 16) + group_starts = group_cols[:, 0] + expected = group_starts.unsqueeze(1) + torch.arange(16, dtype=col_perm.dtype) + assert torch.equal(group_cols, expected) + assert torch.all(group_starts % 16 == 0) + + group_perm = (group_starts // 16).to(dtype=torch.long) + expected_groups = torch.arange(scales.shape[-1], dtype=torch.long) + assert group_perm.numel() == scales.shape[-1] + assert torch.equal(torch.sort(group_perm).values, expected_groups) + + codes = unpack_nibbles(qs) + codes = codes.index_select(-1, col_perm.to(device=qs.device, dtype=torch.long)) + qs = pack_nibbles(codes) + scales = scales.index_select(-1, group_perm.to(device=scales.device)) + return qs, scales + + def reorder_rows(qs: Tensor, scales: Tensor, head_dim: int) -> tuple[Tensor, Tensor]: + row_perm = self._reorder_v_heads( + torch.arange(num_v_heads * head_dim, dtype=torch.long).unsqueeze(-1), + 0, num_k_heads, num_v_per_k, head_dim, + ).squeeze(-1) + return ( + qs.index_select(0, row_perm.to(device=qs.device)), + scales.index_select(0, row_perm.to(device=scales.device)), + ) + + if name.endswith(".linear_attn.in_proj_qkv.weight"): + q_dim = head_k_dim * num_k_heads + k_dim = head_k_dim * num_k_heads + q = weight[:q_dim] + k = weight[q_dim:q_dim + k_dim] + v = weight[q_dim + k_dim:] + q_scale = scale[:q_dim] + k_scale = scale[q_dim:q_dim + k_dim] + v_scale = scale[q_dim + k_dim:] + v, v_scale = reorder_rows(v, v_scale, head_v_dim) + return torch.cat([q, k, v], dim=0), torch.cat([q_scale, k_scale, v_scale], dim=0) + + if name.endswith(".linear_attn.in_proj_z.weight"): + weight, scale = reorder_rows(weight, scale, head_v_dim) + elif name.endswith((".linear_attn.in_proj_a.weight", ".linear_attn.in_proj_b.weight")): + weight, scale = reorder_rows(weight, scale, 1) + elif name.endswith(".linear_attn.out_proj.weight"): + col_perm = self._reorder_v_heads( + torch.arange(num_v_heads * head_v_dim, dtype=torch.long).unsqueeze(0), + 1, num_k_heads, num_v_per_k, head_v_dim, + ).squeeze(0) + weight, scale = apply_col_perm(weight, scale, col_perm) + + return weight, scale + + def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor): + weight, scale = self._transform_nvfp4_weight(name, weight, scale) + super()._repack_nvfp4(name, weight, scale, scale2, input_scale) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + num_k_heads = self.hparams.get("linear_num_key_heads", 0) + num_v_heads = self.hparams.get("linear_num_value_heads", 0) + + if num_k_heads > 0 and num_v_heads > 0 and num_k_heads != num_v_heads and "linear_attn." in name: + head_k_dim = self.hparams["linear_key_head_dim"] + head_v_dim = self.hparams["linear_value_head_dim"] + num_v_per_k = num_v_heads // num_k_heads + + if ".in_proj_qkv." in name: + # QKV weight: reorder only the V rows + q_dim = head_k_dim * num_k_heads + k_dim = head_k_dim * num_k_heads + q = data_torch[:q_dim] + k = data_torch[q_dim:q_dim + k_dim] + v = data_torch[q_dim + k_dim:] + v = self._reorder_v_heads(v, 0, num_k_heads, num_v_per_k, head_v_dim) + data_torch = torch.cat([q, k, v], dim=0) + + elif ".in_proj_z." in name: + # Z gate weight: reorder rows (num_v_heads * head_v_dim) + data_torch = self._reorder_v_heads(data_torch, 0, num_k_heads, num_v_per_k, head_v_dim) + + elif ".in_proj_b." in name or ".in_proj_a." in name: + # Beta/Alpha weight: reorder rows (num_v_heads, head_dim=1) + data_torch = self._reorder_v_heads(data_torch, 0, num_k_heads, num_v_per_k, 1) + + elif ".A_log" in name or ".dt_bias" in name or ".dt_proj" in name: + # A_log / dt_bias: 1D parameters with num_v_heads elements + if data_torch.ndim == 1: + data_torch = self._reorder_v_heads( + data_torch.unsqueeze(-1), 0, num_k_heads, num_v_per_k, 1 + ).squeeze(-1) + else: + data_torch = self._reorder_v_heads(data_torch, -1, num_k_heads, num_v_per_k, 1) + + elif ".conv1d" in name: + # Conv1d kernel: reorder only the V channel portion + data = data_torch.squeeze() + qk_channels = head_k_dim * num_k_heads * 2 + qk_part = data[:qk_channels] + v_part = data[qk_channels:] + v_part = self._reorder_v_heads(v_part, 0, num_k_heads, num_v_per_k, head_v_dim) + data_torch = torch.cat([qk_part, v_part], dim=0) + + elif ".out_proj." in name: + # Out projection weight: reorder columns (input dimension) + data_torch = self._reorder_v_heads(data_torch, 1, num_k_heads, num_v_per_k, head_v_dim) + + yield from super().modify_tensors(data_torch, name, bid) + + +class _Qwen35MRopeMixin: + # Qwen3.5 always applies interleaved MRoPE (see Qwen3_5RotaryEmbedding in transformers); + # the upstream default mrope_section is [11, 11, 10] and llama.cpp's QWEN35 / QWEN35MOE + # loaders treat qwen35.rope.dimension_sections as required, so make sure it is always + # written even when a particular checkpoint omits the field in `rope_parameters`. + _QWEN35_DEFAULT_MROPE_SECTION = [11, 11, 10, 0] + + gguf_writer: gguf.GGUFWriter + rope_parameters: dict + + def set_gguf_parameters(self): + super().set_gguf_parameters() # ty: ignore[unresolved-attribute] + if "mrope_section" not in self.rope_parameters: + self.gguf_writer.add_rope_dimension_sections(self._QWEN35_DEFAULT_MROPE_SECTION) + + +@ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM") +class Qwen3_5TextModel(_Qwen35MRopeMixin, _LinearAttentionVReorderBase): + model_arch = gguf.MODEL_ARCH.QWEN35 + + +@ModelBase.register("Qwen3_5MoeForConditionalGeneration", "Qwen3_5MoeForCausalLM") +class Qwen3_5MoeTextModel(_Qwen35MRopeMixin, _LinearAttentionVReorderBase): + model_arch = gguf.MODEL_ARCH.QWEN35MOE diff --git a/conversion/qwen3vl.py b/conversion/qwen3vl.py new file mode 100644 index 00000000000..2d20b30dbde --- /dev/null +++ b/conversion/qwen3vl.py @@ -0,0 +1,357 @@ +from __future__ import annotations + +import json + +from typing import Any, Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf, logger + +from .qwen import Qwen3Model, Qwen3MoeModel +from .qwenvl import Qwen25AudioModel + + +@ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration", "Qwen3_5ForConditionalGeneration", "Qwen3_5MoeForConditionalGeneration") +class Qwen3VLVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.hparams_vision is None: + logger.info("No vision config found, skipping vision tensor processing") + return + + # Compute image_size if not present + if "image_size" not in self.hparams_vision: + # For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings + num_pos = self.hparams_vision.get("num_position_embeddings", 2304) + patch_size = self.hparams_vision.get("patch_size", 16) + # num_position_embeddings = (image_size / patch_size) ** 2 + # So image_size = sqrt(num_position_embeddings) * patch_size + image_size = int(num_pos**0.5 * patch_size) + self.hparams_vision["image_size"] = image_size + + # Rename config values for compatibility + self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads") + self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth") + + self.is_deepstack_layers = [False] * int(self.hparams_vision["num_hidden_layers"] or 0) + for idx in self.hparams_vision.get("deepstack_visual_indexes", []): + self.is_deepstack_layers[idx] = True + + def set_gguf_parameters(self): + super().set_gguf_parameters() + # in case mixed modalities, the arch will be handled by subclass + if not self.has_audio_encoder: + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL) + self.gguf_writer.add_vision_use_gelu(True) + + if self.hparams_vision is not None: + merge_size = self.hparams_vision.get("spatial_merge_size") + if merge_size is not None: + self.gguf_writer.add_vision_spatial_merge_size(int(merge_size)) + + # Use text config's rms_norm_eps for vision attention layernorm eps + rms_norm_eps = self.global_config.get("text_config", {}).get("rms_norm_eps", 1e-6) + self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps) + + if self.is_deepstack_layers: + self.gguf_writer.add_vision_is_deepstack_layers(self.is_deepstack_layers) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Skip text model tensors + if name.startswith("lm_head."): + return None + + # Skip MTP tensors + if name.startswith("mtp."): + return None + + if name.startswith("model.visual."): + name = name.replace("model.visual.", "visual.", 1) + + if not name.startswith("visual."): + return None + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + assert self.hparams_vision is not None + + if name.startswith("visual.deepstack_merger_list."): + prefix, rest = name.split(".", maxsplit=3)[2:] + # prefix is the layer index, convert to absolute clip layer index! + idx = self.hparams_vision.get("deepstack_visual_indexes", [])[int(prefix)] + target = rest + + tensor_type: gguf.MODEL_TENSOR + if target.startswith("norm."): + tensor_type = gguf.MODEL_TENSOR.V_DS_NORM + suffix = target.split(".", 1)[1] + elif target.startswith("linear_fc1."): + tensor_type = gguf.MODEL_TENSOR.V_DS_FC1 + suffix = target.split(".", 1)[1] + elif target.startswith("linear_fc2."): + tensor_type = gguf.MODEL_TENSOR.V_DS_FC2 + suffix = target.split(".", 1)[1] + else: + raise ValueError(f"Unexpected deepstack tensor: {name}") + + new_name = self.format_tensor_name(tensor_type, idx, suffix=f".{suffix}") + yield from super().modify_tensors(data_torch, new_name, bid) + return + + if name.startswith("visual.merger."): + suffix = name.split(".", 2)[2] + if suffix.startswith("linear_fc"): + fc_idx_str, tail = suffix.split(".", 1) + fc_num = int(fc_idx_str.replace("linear_fc", "")) + # Qwen3VL has linear_fc1 and linear_fc2 + # Map to indices 0 and 2 (matching Qwen2VL which uses indices 0 and 2) + if fc_num == 1: + fc_idx = 0 + elif fc_num == 2: + fc_idx = 2 + else: + raise ValueError(f"unexpected fc index {fc_num} in {name}") + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, fc_idx, suffix=f".{tail}") + elif suffix.startswith("norm."): + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{suffix.split('.', 1)[1]}") + else: + raise ValueError(f"Unexpected merger tensor: {name}") + yield (new_name, data_torch) + return + + if name == "visual.patch_embed.proj.weight": + # split Conv3D into Conv2Ds along temporal dimension + c1, c2, kt, _, _ = data_torch.shape + del c1, c2 + if kt != 2: + raise ValueError("Current implementation only supports temporal_patch_size of 2") + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]) + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]) + return + + if name == "visual.patch_embed.proj.bias": + # Include the bias - it's used by the C++ code + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".bias", data_torch) + return + + yield from MmprojModel.modify_tensors(self, data_torch, name, bid) + + +@ModelBase.register("Qwen3OmniMoeForConditionalGeneration") +class Qwen3OmniMmprojModel(Qwen3VLVisionModel, Qwen25AudioModel): + has_audio_encoder = True + has_vision_encoder = True + + def get_vision_config(self) -> dict[str, Any] | None: + if self.has_vision_encoder: + return self.global_config["thinker_config"].get("vision_config") + else: + return None + + def get_audio_config(self) -> dict[str, Any] | None: + if self.has_audio_encoder: + return self.global_config["thinker_config"].get("audio_config") + else: + return None + + def set_gguf_parameters(self): + if self.has_vision_encoder: + Qwen3VLVisionModel.set_gguf_parameters(self) + self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.QWEN3VL) + if self.has_audio_encoder: + Qwen25AudioModel.set_gguf_parameters(self) + self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.QWEN3A) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Skip text model tensors + if name.startswith("lm_head."): + return None + + # Skip MTP tensors + if name.startswith("mtp."): + return None + + if name.startswith("model.visual."): + name = name.replace("model.visual.", "visual.", 1) + + if "visual." not in name and "audio_tower." not in name: + return None + + return MmprojModel.filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "visual." in name: + if not self.has_vision_encoder: + raise ValueError(f"Model does not have vision encoder, but found tensor {name}") + # need to transform vision tensor naming, so that modify_tensors() logic can be used correctly + name = name.replace("thinker.visual.", "model.visual.") + if ".merger_list." in name: + name = name.replace(".merger_list.", ".deepstack_merger_list.") + name = name.replace(".ln_q", ".norm") + name = name.replace(".mlp.0", ".linear_fc1") + name = name.replace(".mlp.2", ".linear_fc2") + elif ".merger." in name: + name = name.replace(".ln_q", ".norm") + name = name.replace(".mlp.0", ".linear_fc1") + name = name.replace(".mlp.2", ".linear_fc2") + yield from Qwen3VLVisionModel.modify_tensors(self, data_torch, name, bid) + elif "audio_tower." in name: + if not self.has_audio_encoder: + raise ValueError(f"Model does not have audio encoder, but found tensor {name}") + if "conv2d" in name and name.endswith(".bias"): + # transform conv2d bias [n_embd] --> [1, 1, n_embd] + data_torch = data_torch.unsqueeze(-1).unsqueeze(-1) + yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid) + + +@ModelBase.register("Qwen3ASRForConditionalGeneration") +class Qwen3ASRMmprojModel(Qwen3OmniMmprojModel): + has_audio_encoder = True + has_vision_encoder = False + + +@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration", "GlmOcrForConditionalGeneration") +class Glm4VVisionModel(Qwen3VLVisionModel): + def set_gguf_parameters(self): + MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters + assert self.hparams_vision is not None + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) + + hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower() + if hidden_act == "gelu": + self.gguf_writer.add_vision_use_gelu(True) + elif hidden_act == "silu": + self.gguf_writer.add_vision_use_silu(True) + + rms_norm_eps = self.hparams_vision.get("rms_norm_eps", 1e-5) + self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("visual.merger."): + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + return + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Qwen3VLForConditionalGeneration") +class Qwen3VLTextModel(Qwen3Model): + model_arch = gguf.MODEL_ARCH.QWEN3VL + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if "thinker_config" in self.hparams: + vision_config = self.hparams["thinker_config"].get("vision_config", {}) + else: + vision_config = self.hparams.get("vision_config", {}) + deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) + self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + name = name.replace("thinker.", "") + + return super().filter_tensors((name, gen)) + + +@ModelBase.register("Qwen3VLMoeForConditionalGeneration") +class Qwen3VLMoeTextModel(Qwen3MoeModel): + model_arch = gguf.MODEL_ARCH.QWEN3VLMOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + vision_config = self.hparams.get("vision_config", {}) + deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) + self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + name = name.replace("thinker.", "") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Qwen3VL has transposed packed tensors, so we treat it differently from general Qwen2MoE packed tensors + if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"): + mapped = f"{name}.weight" if not name.endswith(".weight") else name + permuted = data_torch.permute(0, 2, 1).contiguous() + yield from ModelBase.modify_tensors(self, permuted, mapped, bid) + return + + if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"): + if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0: + raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}") + split_dim = data_torch.shape[-1] // 2 + gate = data_torch[..., :split_dim].contiguous() + up = data_torch[..., split_dim:].contiguous() + # Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768) + # Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128} + # Need PyTorch: (128, 768, 2048) [reversed of GGML] + # So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048) + base_name = name.removesuffix(".weight") + base = base_name.rsplit('.', 1)[0] + mapped_gate = f"{base}.gate_proj.weight" + mapped_up = f"{base}.up_proj.weight" + perm_gate = gate.permute(0, 2, 1).contiguous() + perm_up = up.permute(0, 2, 1).contiguous() + yield from ModelBase.modify_tensors(self, perm_gate, mapped_gate, bid) + yield from ModelBase.modify_tensors(self, perm_up, mapped_up, bid) + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Qwen3OmniMoeForConditionalGeneration") +class Qwen3OmniMoeTextModel(Qwen3VLMoeTextModel): + model_arch = gguf.MODEL_ARCH.QWEN3VLMOE + + def set_vocab(self): + super().set_vocab() + # correct BOS/EOS tokens + with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: + tokenizer_config = json.load(f) + added_tokens = tokenizer_config.get("added_tokens_decoder", {}) + for token_id, data in added_tokens.items(): + if data.get("content") == "<|im_end|>": + self.gguf_writer.add_bos_token_id(int(token_id)) + self.gguf_writer.add_eos_token_id(int(token_id)) + break + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_num_deepstack_layers(0) + + +@ModelBase.register("Qwen3ASRForConditionalGeneration") +class Qwen3ASRTextModel(Qwen3VLTextModel): + model_arch = gguf.MODEL_ARCH.QWEN3VL + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_num_deepstack_layers(0) + + def set_vocab(self): + super().set_vocab() + # fix chat template, use correct chatml format + self.gguf_writer.add_chat_template("{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}") + # correct BOS/EOS tokens + with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: + tokenizer_config = json.load(f) + added_tokens = tokenizer_config.get("added_tokens_decoder", {}) + for token_id, data in added_tokens.items(): + if data.get("content") == "<|im_end|>": + self.gguf_writer.add_bos_token_id(int(token_id)) + self.gguf_writer.add_eos_token_id(int(token_id)) + break diff --git a/conversion/qwenvl.py b/conversion/qwenvl.py new file mode 100644 index 00000000000..7befd0c8d81 --- /dev/null +++ b/conversion/qwenvl.py @@ -0,0 +1,200 @@ +from __future__ import annotations + +from typing import Any, Callable, Iterable, TYPE_CHECKING + +import numpy as np +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf + + +@ModelBase.register( + "Qwen2VLModel", + "Qwen2VLForConditionalGeneration", + "Qwen2_5_VLForConditionalGeneration", + "Qwen2_5OmniModel", +) +class Qwen2VLModel(TextModel): + model_arch = gguf.MODEL_ARCH.QWEN2VL + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("thinker."): + name = name.replace("thinker.", "") + + return super().filter_tensors((name, gen)) + + +@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") +class Qwen2VLVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560) + # rename config.json values + self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads") + self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth") + if "embed_dim" in self.hparams_vision: # qwen2vl + self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size") + self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim") + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + hparams = self.hparams_vision + model_type = self.global_config['model_type'] + if model_type == 'qwen2_vl': + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL) + elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni': + if model_type == 'qwen2_5_omni': + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O) + else: + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL) + self.gguf_writer.add_vision_use_silu(True) + # find n_wa_pattern (window attention pattern) + fullatt_block_indexes = hparams.get("fullatt_block_indexes") + assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl" + n_wa_pattern = fullatt_block_indexes[0] + 1 + # validate n_wa_pattern + for i in range(1, len(fullatt_block_indexes)): + if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern: + raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}") + self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern) + else: + raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}") + # default values below are taken from HF tranformers code + self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6)) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".position_embd." in new_name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if not name.startswith("visual."): + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # split QKV tensors if needed + if ".qkv." in name: + if data_torch.ndim == 2: # weight + c3, _ = data_torch.shape + else: # bias + c3 = data_torch.shape[0] + assert c3 % 3 == 0 + c = c3 // 3 + wq = data_torch[:c] + wk = data_torch[c: c * 2] + wv = data_torch[c * 2:] + yield from super().modify_tensors(wq, name.replace("qkv", "q"), bid) + yield from super().modify_tensors(wk, name.replace("qkv", "k"), bid) + yield from super().modify_tensors(wv, name.replace("qkv", "v"), bid) + elif 'patch_embed.proj.weight' in name: + # split Conv3D into Conv2Ds + c1, c2, kt, kh, kw = data_torch.shape + del c1, c2, kh, kw # unused + assert kt == 2, "Current implementation only support temporal_patch_size of 2" + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...]) + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]) + else: + yield from super().modify_tensors(data_torch, name, bid) + + +class Qwen25AudioModel(MmprojModel): + has_audio_encoder = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_audio is not None + self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"] + self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"] + self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_audio is not None + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"]) + self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5)) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # SinusoidsPositionEmbedding + assert self.hparams_audio is not None + max_timescale = 10000 + length = 1500 + channels = self.hparams_audio["hidden_size"] + log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1) + inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float()) + scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :] + pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32) + yield ("audio_tower.embed_positions.weight", pos_embd) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F16 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "conv1.bias" in name or "conv2.bias" in name: + # transpose conv1 and conv2 bias + data_torch = data_torch.unsqueeze(-1) + + yield from MmprojModel.modify_tensors(self, data_torch, name, bid) + + +@ModelBase.register("Qwen2_5OmniModel") +class Qwen25OmniModel(Qwen2VLVisionModel, Qwen25AudioModel): + has_audio_encoder = True + has_vision_encoder = True + + def get_vision_config(self) -> dict[str, Any] | None: + return self.global_config["thinker_config"].get("vision_config") + + def get_audio_config(self) -> dict[str, Any] | None: + return self.global_config["thinker_config"].get("audio_config") + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if not name.startswith("visual.") and not name.startswith("audio_tower."): + return None + + if name.startswith("thinker."): + name = name.replace("thinker.", "") + + if "audio_bos_eos_token" in name: + # this tensor is left unused in transformers code + # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809 + return None + + return MmprojModel.filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "visual." in name: + yield from Qwen2VLVisionModel.modify_tensors(self, data_torch, name, bid) + elif "audio_tower." in name: + yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid) + return # skip other tensors diff --git a/conversion/refact.py b/conversion/refact.py new file mode 100644 index 00000000000..1170cddeb2c --- /dev/null +++ b/conversion/refact.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("GPTRefactForCausalLM") +class RefactModel(TextModel): + model_arch = gguf.MODEL_ARCH.REFACT + + def set_vocab(self): + super().set_vocab() + + # TODO: how to determine special FIM tokens automatically? + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'eot']) + special_vocab._set_special_token("prefix", 1) + special_vocab._set_special_token("suffix", 3) + special_vocab._set_special_token("middle", 2) + special_vocab.chat_template = None # do not add it twice + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + # refact uses Alibi. So this is from config.json which might be used by training. + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + + self.gguf_writer.add_feed_forward_length(ff_dim) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + n_head = self.hparams["n_head"] + n_head_kv = 1 + head_dim = self.hparams["n_embd"] // n_head + + if bid is not None: + if name == f"transformer.h.{bid}.attn.kv.weight": + yield from super().modify_tensors(data_torch[:n_head_kv * head_dim], self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) + yield from super().modify_tensors(data_torch[n_head_kv * head_dim:], self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) + return + if name == f"transformer.h.{bid}.attn.q.weight": + yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) + return + if name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": + yield from super().modify_tensors(data_torch[:ff_dim], self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid) + yield from super().modify_tensors(data_torch[ff_dim:], self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid) + return + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/rwkv.py b/conversion/rwkv.py new file mode 100644 index 00000000000..2de0aa5346e --- /dev/null +++ b/conversion/rwkv.py @@ -0,0 +1,302 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("Rwkv6ForCausalLM") +class Rwkv6Model(TextModel): + model_arch = gguf.MODEL_ARCH.RWKV6 + + def set_vocab(self): + self._set_vocab_rwkv_world() + + def set_gguf_parameters(self): + head_size = self.hparams["head_size"] + hidden_size = self.hparams["hidden_size"] + layer_norm_eps = self.hparams["layer_norm_epsilon"] + rescale_every_n_layers = self.hparams["rescale_every"] + intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32) + time_mix_extra_dim = 64 if hidden_size == 4096 else 32 + time_decay_extra_dim = 128 if hidden_size == 4096 else 64 + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_layer_norm_eps(layer_norm_eps) + self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) + self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + lerp_weights: dict[int, dict[str, Tensor]] = {} + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + new_name = self.map_tensor_name(name) + + if not (new_name.endswith(".weight") or new_name.endswith(".bias")): + new_name += ".weight" + + if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"): + data_torch = data_torch.transpose(0, 1) + + if new_name.endswith("time_mix_w2.weight"): + data_torch = data_torch.permute(0, 2, 1) + + if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name: + data_torch = data_torch.squeeze() + + try: + rescale_every_n_layers = self.hparams["rescale_every"] + if rescale_every_n_layers > 0: + if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): + data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers)) + except KeyError: + pass + + # concat time_mix_lerp weights to reduce some cpu overhead + # also reduces the number of tensors in the model + if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name: + try: + self.lerp_weights[bid][new_name] = data_torch + except KeyError: + self.lerp_weights[bid] = {new_name: data_torch} + if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]): + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1) + yield (new_name, data) + return + + yield (new_name, data_torch) + + +@ModelBase.register("RWKV6Qwen2ForCausalLM") +class RWKV6Qwen2Model(Rwkv6Model): + model_arch = gguf.MODEL_ARCH.RWKV6QWEN2 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + num_attention_heads = self.hparams["num_attention_heads"] + num_key_value_heads = self.hparams["num_key_value_heads"] + hidden_size = self.hparams["hidden_size"] + head_size = hidden_size // num_attention_heads + rms_norm_eps = self.hparams["rms_norm_eps"] + intermediate_size = self.hparams["intermediate_size"] + time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32) + time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64) + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) + self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + # special parameters for time_mixing in RWKV6QWEN2 + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_token_shift_count(1) + # RWKV6QWEN2 use grouped key/value like GQA + self.gguf_writer.add_head_count_kv(num_key_value_heads) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + for new_name, data in super().modify_tensors(data_torch, name, bid): + if "time_mix_w1" in new_name or "time_mix_w2" in new_name: + data = data.view(5, -1, data.shape[-1]) + # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg + # permute them here to avoid code changes + data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1]) + if "w2" in new_name: + data = data.view(5, -1, data.shape[-1]) + yield (new_name, data) + continue + yield (new_name, data) + + +@ModelBase.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") +class Rwkv7Model(TextModel): + model_arch = gguf.MODEL_ARCH.RWKV7 + + def set_vocab(self): + self._set_vocab_rwkv_world() + + def calc_lora_rank(self, hidden_size, exponent, multiplier): + return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32 + + def set_gguf_parameters(self): + try: + head_size = self.hparams["head_size"] + layer_norm_eps = self.hparams["layer_norm_epsilon"] + except KeyError: + head_size = self.hparams["head_dim"] + layer_norm_eps = self.hparams["norm_eps"] + hidden_size = self.hparams["hidden_size"] + intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else (hidden_size * 4) + + # ICLR: In-Context-Learning-Rate + try: + lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3) + lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6) + except KeyError: + lora_rank_decay = self.hparams["decay_low_rank_dim"] if self.hparams["decay_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_iclr = self.hparams["a_low_rank_dim"] if self.hparams["a_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_value_residual_mix = self.hparams["v_low_rank_dim"] if self.hparams["v_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3) + lora_rank_gate = self.hparams["gate_low_rank_dim"] if self.hparams["gate_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6) + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_layer_norm_eps(layer_norm_eps) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_decay_lora_rank(lora_rank_decay) + self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) + self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) + self.gguf_writer.add_gate_lora_rank(lora_rank_gate) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + lerp_weights: dict[int, dict[str, Tensor]] = {} + lora_needs_transpose: bool = True + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # unify tensor names here to make life easier + name = name.replace("blocks", "layers").replace("ffn", "feed_forward") + name = name.replace("self_attn", "attention").replace("attn", "attention") + name = name.replace("time_mixer.", "") + + name = name.replace("feed_forward_norm", "ln2") + name = name.replace("g_norm", "ln_x") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # lora layer names in fla-hub's impl + if "_lora.lora" in name: + self.lora_needs_transpose = False + name = name.replace("_lora.lora.0.weight", "1.weight") + name = name.replace("_lora.lora.2.weight", "2.weight") + name = name.replace("_lora.lora.2.bias", "0.weight") + + if "attention.v" in name and "value" not in self.map_tensor_name(name) and bid == 0: + # some models have dummy v0/v1/v2 on first layer while others don't + # ignore them all since they are not used + return + + wkv_has_gate = self.hparams.get("wkv_has_gate", True) + lerp_list = ["r", "w", "k", "v", "a", "g"] if wkv_has_gate else ["r", "w", "k", "v", "a"] + + if bid is not None and "attention.x_" in name: + if "attention.x_x" in name: + # already concatenated + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = data_torch.reshape(len(lerp_list), 1, 1, -1) + yield (new_name, data) + else: + try: + self.lerp_weights[bid][name] = data_torch + except KeyError: + self.lerp_weights[bid] = {name: data_torch} + if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in lerp_list): + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"] for i in lerp_list], dim=0) + yield (new_name, data) + return + else: + data_torch = data_torch.squeeze() + new_name = self.map_tensor_name(name) + + if not (new_name.endswith(".weight") or new_name.endswith(".bias")): + new_name += ".weight" + + if self.lora_needs_transpose and any( + new_name.endswith(t) for t in [ + "time_mix_w1.weight", "time_mix_w2.weight", + "time_mix_a1.weight", "time_mix_a2.weight", + "time_mix_v1.weight", "time_mix_v2.weight", + "time_mix_g1.weight", "time_mix_g2.weight", + ] + ): + data_torch = data_torch.transpose(0, 1) + + if 'r_k' in new_name: + data_torch = data_torch.flatten() + + if bid == 0 and "time_mix_a" in new_name: + # dummy v0/v1/v2 on first layer + # easiest way to make llama happy + yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch) + + yield (new_name, data_torch) + + +@ModelBase.register("RwkvHybridForCausalLM") +class ARwkv7Model(Rwkv7Model): + model_arch = gguf.MODEL_ARCH.ARWKV7 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + hidden_size = self.hparams["hidden_size"] + head_size = self.hparams["head_size"] + rms_norm_eps = self.hparams["rms_norm_eps"] + intermediate_size = self.hparams["intermediate_size"] + wkv_has_gate = self.hparams["wkv_has_gate"] + assert self.hparams["wkv_version"] == 7 + + # ICLR: In-Context-Learning-Rate + lora_rank_decay = 64 + lora_rank_iclr = 64 + lora_rank_value_residual_mix = 32 + lora_rank_gate = 128 if wkv_has_gate else 0 + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_decay_lora_rank(lora_rank_decay) + self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) + self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) + self.gguf_writer.add_gate_lora_rank(lora_rank_gate) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_token_shift_count(1) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) diff --git a/conversion/sarashina2.py b/conversion/sarashina2.py new file mode 100644 index 00000000000..05448db812e --- /dev/null +++ b/conversion/sarashina2.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from typing import Callable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, gguf + +from .llama import LlamaModel +from .qwenvl import Qwen2VLVisionModel + + +@ModelBase.register("Sarashina2VisionForCausalLM") +class Sarashina2VLTextModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + if name.startswith("llm."): + name = name.replace("llm.", "", 1) + elif name.startswith("norm."): + return None + return super().filter_tensors((name, gen)) + + +@ModelBase.register("Sarashina2VisionForCausalLM") +class Sarashina2VLVisionModel(Qwen2VLVisionModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.global_config['model_type'] = "qwen2_vl" diff --git a/conversion/smallthinker.py b/conversion/smallthinker.py new file mode 100644 index 00000000000..1b0f79aa3ea --- /dev/null +++ b/conversion/smallthinker.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("SmallThinkerForCausalLM") +class SmallThinkerModel(TextModel): + model_arch = gguf.MODEL_ARCH.SMALLTHINKER + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (n_experts := self.hparams.get("moe_num_primary_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + if (n_experts_used := self.hparams.get("moe_num_active_primary_experts")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + if (moe_intermediate_size := self.hparams.get("moe_ffn_hidden_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + self.gguf_writer.add_feed_forward_length(moe_intermediate_size) + logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") + if (self.hparams.get('moe_primary_router_apply_softmax')): + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) + else: + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + + sliding_window_layout = self.hparams.get("sliding_window_layout") + if sliding_window_layout: + for i in sliding_window_layout: + if i != 0: + sliding_window = self.hparams.get("sliding_window_size") + if sliding_window: + self.gguf_writer.add_sliding_window(sliding_window) + break + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.hparams.get("moe_num_primary_experts") or self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down", "gate", "up"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/smolvlm.py b/conversion/smolvlm.py new file mode 100644 index 00000000000..30e9dca329b --- /dev/null +++ b/conversion/smolvlm.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from typing import Callable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf + + +@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration") +class SmolVLMModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.hparams["model_type"] == "smolvlm_vision": + # fix for SmolVLM2, missing some keys in config.json + # default values are taken from transformers code + self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152) + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16) + self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.IDEFICS3) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) + self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2)) + self.gguf_writer.add_vision_use_gelu(True) + + # Add the preprocessor longest edge size + preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size) + self.gguf_writer.add_vision_preproc_image_size(preproc_image_size) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".embeddings." in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name + + if not is_vision_tensor: + return None + + return super().filter_tensors(item) diff --git a/conversion/stablelm.py b/conversion/stablelm.py new file mode 100644 index 00000000000..ba5e9aa6ca9 --- /dev/null +++ b/conversion/stablelm.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") +class StableLMModel(TextModel): + model_arch = gguf.MODEL_ARCH.STABLELM + + def set_vocab(self): + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + else: + # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab + self._set_vocab_qwen() + + def set_gguf_parameters(self): + hparams = self.hparams + + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) + self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) + self.gguf_writer.add_file_type(self.ftype) + + _q_norms: list[dict[str, Tensor]] | None = None + _k_norms: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams["num_key_value_heads"] + + if name.find("q_layernorm.norms") != -1: + assert bid is not None + + if self._q_norms is None: + self._q_norms = [{} for _ in range(self.block_count)] + + self._q_norms[bid][name] = data_torch + + if len(self._q_norms[bid]) >= n_head: + return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm") + else: + return + + if name.find("k_layernorm.norms") != -1: + assert bid is not None + + if self._k_norms is None: + self._k_norms = [{} for _ in range(self.block_count)] + + self._k_norms[bid][name] = data_torch + + if len(self._k_norms[bid]) >= n_kv_head: + return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm") + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"): + datas: list[Tensor] = [] + # extract the norms in order + for xid in range(n_head): + ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" + datas.append(norms[ename]) + del norms[ename] + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._q_norms is not None or self._k_norms is not None: + # flatten two `list[dict[str, Tensor]]` into a single `list[str]` + norms = ( + [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else [] + ) + ( + [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else [] + ) + if len(norms) > 0: + raise ValueError(f"Unprocessed norms: {norms}") diff --git a/conversion/starcoder.py b/conversion/starcoder.py new file mode 100644 index 00000000000..0b4ffd84702 --- /dev/null +++ b/conversion/starcoder.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("GPTBigCodeForCausalLM") +class StarCoderModel(TextModel): + model_arch = gguf.MODEL_ARCH.STARCODER + + def set_gguf_parameters(self): + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + +@ModelBase.register("Starcoder2ForCausalLM") +class StarCoder2Model(TextModel): + model_arch = gguf.MODEL_ARCH.STARCODER2 diff --git a/conversion/step3.py b/conversion/step3.py new file mode 100644 index 00000000000..ba867fb831b --- /dev/null +++ b/conversion/step3.py @@ -0,0 +1,231 @@ +from __future__ import annotations + +import math +import re + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, _MISTRAL_COMMON_DATASET_MEAN, _MISTRAL_COMMON_DATASET_STD, gguf + +from .qwen import Qwen3Model + + +@ModelBase.register("StepVLForConditionalGeneration") +class Step3VLVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + + if not self.hparams_vision.get("intermediate_size"): + hidden_size = self.hparams_vision.get("hidden_size") or self.hparams_vision.get("width") or 0 + assert hidden_size > 0 + mlp_ratio = float(self.hparams_vision.get("mlp_ratio", 8960 / 1536)) + self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio)) + + self.preprocessor_config.setdefault("image_mean", list(_MISTRAL_COMMON_DATASET_MEAN)) + self.preprocessor_config.setdefault("image_std", list(_MISTRAL_COMMON_DATASET_STD)) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + + projector_stride = int(self.global_config.get("understand_projector_stride", -1)) + hidden_size = int(self.hparams_vision.get("hidden_size", self.hparams_vision.get("width", -1))) + num_layers = int(self.hparams_vision.get("num_hidden_layers", self.hparams_vision.get("layers", -1))) + assert (projector_stride, int(self.hparams_vision.get("image_size", -1)), hidden_size, num_layers) == (2, 728, 1536, 47), ( + "current Step3-VL conversion path is only validated for Step3-VL-10B" + ) + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.STEP3VL) + self.gguf_writer.add_vision_attention_layernorm_eps(float(self.hparams_vision.get("layer_norm_eps", 1e-5))) + self.gguf_writer.add_vision_projector_scale_factor(projector_stride ** 2) + # 3024 max resize comes from step3-vl-10b processing_step3.py. + self.gguf_writer.add_vision_preproc_image_size(3024) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".position_embd." in new_name: + return gguf.GGMLQuantizationType.F32 + if ("mm.0." in new_name or "mm.1." in new_name) and new_name.endswith(".weight"): + return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith(("model.", "lm_head.")): + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("vision_model.vit_downsampler"): + match = re.match(r"vision_model\.vit_downsampler(\d+)\.(weight|bias)", name) + if match is None: + raise ValueError(f"Unexpected Step3-VL projector tensor {name!r}") + + proj_id = int(match.group(1)) - 1 + suffix = f".{match.group(2)}" + yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, proj_id, suffix=suffix), data_torch) + return + + if name == "vit_large_projector.weight": + yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ_FC), data_torch) + return + + if name.startswith("vision_model."): + if name == "vision_model.positional_embedding": + name += ".weight" + elif name.endswith(".gamma") and ".ls_" in name: + name = name.removesuffix(".gamma") + ".weight" + + name = name.replace("attn.in_proj_weight", "attn.in_proj.weight") + name = name.replace("attn.in_proj_bias", "attn.in_proj.bias") + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("StepVLForConditionalGeneration") +class Step3VLTextModel(Qwen3Model): + model_arch = gguf.MODEL_ARCH.QWEN3 + + +@ModelBase.register("Step3p5ForCausalLM") +class Step35Model(TextModel): + model_arch = gguf.MODEL_ARCH.STEP35 + + def set_gguf_parameters(self): + rope_theta = self.hparams.get("rope_theta") + if isinstance(rope_theta, list): + self.hparams["rope_theta"] = float(rope_theta[0]) + self.hparams["local_rope_theta"] = float(rope_theta[1]) + self.rope_parameters["rope_theta"] = self.hparams["rope_theta"] + self.rope_parameters["sliding_attention"] = {"rope_theta": self.hparams["local_rope_theta"]} + + super().set_gguf_parameters() + + layer_types = self.hparams.get("layer_types") or [] + partial_rotary_factors = self.hparams.get("partial_rotary_factors") or [] + attn_other = self.hparams.get("attention_other_setting") or {} + + n_head_base = self.hparams["num_attention_heads"] + n_kv_base = self.hparams["num_attention_groups"] + + n_head_swa = attn_other.get("num_attention_heads", n_head_base) + n_kv_swa = attn_other.get("num_attention_groups", n_kv_base) + + layer_types = layer_types[: self.block_count] + partial_rotary_factors = partial_rotary_factors[: self.block_count] + assert [1.0 if lt == "sliding_attention" else 0.5 for lt in layer_types] == partial_rotary_factors + head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types] + kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types] + swa_pat = [lt == "sliding_attention" for lt in layer_types] + + self.gguf_writer.add_head_count(head_arr) + self.gguf_writer.add_head_count_kv(kv_arr) + + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_sliding_window_pattern(swa_pat) + + self.gguf_writer.add_value_length(self.hparams["head_dim"]) + + # MoE params + self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"]) + self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"]) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["share_expert_dim"]) + + if (moe_router_scaling_factor := self.hparams.get("moe_router_scaling_factor")) is not None: + self.gguf_writer.add_expert_weights_scale(moe_router_scaling_factor) + if (norm_expert_weight := self.hparams.get("norm_expert_weight")) is not None: + self.gguf_writer.add_expert_weights_norm(norm_expert_weight) + + # leading dense blocks + leading_dense = 0 + moe_layers_enum = self.hparams.get("moe_layers_enum") + if isinstance(moe_layers_enum, str) and moe_layers_enum.strip(): + moe_layers = sorted(int(i) for i in moe_layers_enum.strip().split(",")) + if moe_layers: + leading_dense = max(0, moe_layers[0]) + self.gguf_writer.add_leading_dense_block_count(leading_dense) + self.gguf_writer.add_moe_every_n_layers(int(self.hparams.get("moe_every_n_layer", 1))) + + self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5)) + + # Optional per-layer SwiGLU clamps. + if (limits := self.hparams.get("swiglu_limits")) is not None: + limits_f = [0.0 if v is None else float(v) for v in limits[: self.block_count]] + self.gguf_writer.add_swiglu_clamp_exp(limits_f) + if (limits_shared := self.hparams.get("swiglu_limits_shared")) is not None: + limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self.block_count]] + self.gguf_writer.add_swiglu_clamp_shexp(limits_shared_f) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Map router bias (expert selection bias) to a GGUF bias tensor + if name.endswith(".moe.router_bias"): + name += ".bias" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + # remove mtp layers + if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None: + il = int(m.group(1)) + n_main = int(self.hparams.get("num_hidden_layers", self.block_count)) + if il >= n_main: + return + if name.endswith("norm.weight"): + data_torch += 1.0 + + if name.endswith((".self_attn.g_proj.weight", ".moe.gate.weight", ".moe.up_proj.weight", ".moe.gate_proj.weight", ".moe.down_proj.weight")): + data_torch = data_torch.squeeze().contiguous() + + yield from super().modify_tensors(data_torch, name, bid) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # Step35 can optionally use Llama-3 style RoPE scaling (HF: rope_scaling.rope_type == "llama3"). + # llama.cpp represents this via a single extra tensor: "rope_freqs.weight" (aka MODEL_TENSOR.ROPE_FREQS). + rope_params = self.rope_parameters.get("full_attention", self.rope_parameters) + rope_type = rope_params.get("rope_type") or "" + if rope_type.lower() != "llama3": + return + + # Step35 configs can carry per-layer rope_theta as a list; for llama3 rope factors we use the base value. + rope_theta = self.hparams.get("rope_theta", 10000.0) + if isinstance(rope_theta, list): + rope_theta = rope_theta[0] + base = float(rope_theta) + if (dim := self.hparams.get("head_dim")) is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + dim = int(dim) + + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = float(rope_params.get("factor", 8.0)) + low_freq_factor = float(rope_params.get("low_freq_factor", 1.0)) + high_freq_factor = float(rope_params.get("high_freq_factor", 4.0)) + old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192))) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + + rope_factors: list[float] = [] + for freq in freqs: + wavelen = 2 * math.pi / float(freq) + if wavelen < high_freq_wavelen: + rope_factors.append(1.0) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1.0 / ((1.0 - smooth) / factor + smooth)) + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) diff --git a/conversion/t5.py b/conversion/t5.py new file mode 100644 index 00000000000..73dcfd1a2ce --- /dev/null +++ b/conversion/t5.py @@ -0,0 +1,286 @@ +from __future__ import annotations + +import json +import os + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, SentencePieceTokenTypes, TextModel, gguf, logger + + +@ModelBase.register("T5WithLMHeadModel") +@ModelBase.register("T5ForConditionalGeneration") +@ModelBase.register("MT5ForConditionalGeneration") +@ModelBase.register("UMT5ForConditionalGeneration") +@ModelBase.register("UMT5Model") +class T5Model(TextModel): + model_arch = gguf.MODEL_ARCH.T5 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.shared_token_embeddings_found = False + + def set_vocab(self): + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + # many older models use spiece.model tokenizer model filename + if not tokenizer_path.is_file(): + tokenizer_path = self.dir_model / 'spiece.model' + + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + + # some models like Pile-T5 family use BPE tokenizer instead of Unigram + if sentencepiece_model.trainer_spec.model_type == 2: # BPE + # assure the tokenizer model file name is correct + assert tokenizer_path.name == 'tokenizer.model' + return self._set_vocab_sentencepiece() + else: + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces + precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + for key in added_tokens_json: + token_id = added_tokens_json[key] + if token_id >= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + self.gguf_writer.add_tokenizer_model("t5") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) + if precompiled_charsmap: + self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: + logger.warning("Couldn't find context length in config.json, assuming default value of 512") + n_ctx = 512 + self.gguf_writer.add_context_length(n_ctx) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) + self.gguf_writer.add_block_count(self.block_count) + if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None: + self.gguf_writer.add_decoder_block_count(dec_n_layer) + self.gguf_writer.add_head_count(self.hparams["num_heads"]) + self.gguf_writer.add_key_length(self.hparams["d_kv"]) + self.gguf_writer.add_value_length(self.hparams["d_kv"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", + # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored + # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder + # and decoder and ignore the remaining ones. + if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: + if not self.shared_token_embeddings_found: + name = "shared.weight" + self.shared_token_embeddings_found = True + else: + logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("T5EncoderModel") +class T5EncoderModel(TextModel): + model_arch = gguf.MODEL_ARCH.T5ENCODER + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.shared_token_embeddings_found = False + + def set_vocab(self): + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + # many older models use spiece.model tokenizer model filename + if not tokenizer_path.is_file(): + tokenizer_path = self.dir_model / 'spiece.model' + + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + + # some models like Pile-T5 family use BPE tokenizer instead of Unigram + if sentencepiece_model.trainer_spec.model_type == 2: # BPE + # assure the tokenizer model file name is correct + assert tokenizer_path.name == 'tokenizer.model' + return self._set_vocab_sentencepiece() + else: + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces + precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + for key in added_tokens_json: + token_id = added_tokens_json[key] + if token_id >= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + self.gguf_writer.add_tokenizer_model("t5") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) + if precompiled_charsmap: + self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: + logger.warning("Couldn't find context length in config.json, assuming default value of 512") + n_ctx = 512 + self.gguf_writer.add_context_length(n_ctx) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(self.hparams["num_heads"]) + self.gguf_writer.add_key_length(self.hparams["d_kv"]) + self.gguf_writer.add_value_length(self.hparams["d_kv"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", + # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored + # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder + # and decoder and ignore the remaining ones. + if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: + if not self.shared_token_embeddings_found: + name = "shared.weight" + self.shared_token_embeddings_found = True + else: + logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") + return + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/ultravox.py b/conversion/ultravox.py new file mode 100644 index 00000000000..347188733a5 --- /dev/null +++ b/conversion/ultravox.py @@ -0,0 +1,203 @@ +from __future__ import annotations + +from typing import Any, Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf + + +@ModelBase.register("UltravoxModel") +class UltravoxModel(TextModel): + model_arch = gguf.MODEL_ARCH.LLAMA # dummy + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument") + + +@ModelBase.register("GlmasrModel") +class GlmASRWhisperEncoderModel(MmprojModel): + has_vision_encoder = False + has_audio_encoder = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams: + self.hparams["hidden_size"] = self.hparams["d_model"] + self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"] + self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLMA) + self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"]) + self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) + self.gguf_writer.add_audio_stack_factor(self.global_config["merge_factor"]) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F16 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith(("model.", "lm_head.")): + # skip language model tensors + return None + + if name.startswith("audio_encoder.whisper."): + name = name.replace("audio_encoder.whisper.","audio_tower.") + if "audio_encoder.layer_norm." in name or "audio_encoder.proj." in name: + name = name.replace("audio_encoder.", "audio_encoder.adapting.") + if name.startswith("audio_encoder.adapting."): + name = name.replace("audio_encoder.adapting.","audio.multi_modal_projector.") + if ".layer_norm." in name: + name = name.replace(".layer_norm.", ".ln_pre.") + if ".0." in name: + name = name.replace(".0.", ".linear_1.") + if ".2." in name: + name = name.replace(".2.", ".linear_2.") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("audio_encoder.audio_bos_eos_token."): + yield from super().modify_tensors(data_torch[0], "model.vision.boi", bid) + yield from super().modify_tensors(data_torch[1], "model.vision.eoi", bid) + return + + if name.startswith("audio_encoder.adapting."): + if ".proj." in name: + return + + if "conv1.bias" in name or "conv2.bias" in name: + # transpose conv1 and conv2 bias + data_torch = data_torch.unsqueeze(-1) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Qwen2AudioForConditionalGeneration") +class WhisperEncoderModel(MmprojModel): + has_vision_encoder = False # no vision encoder + has_audio_encoder = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams: + self.hparams["hidden_size"] = self.hparams["d_model"] + self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"] + self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2A) + self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"]) + self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F16 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # prevent clash naming with vision tensors + if name.startswith("multi_modal_projector"): + name = "audio." + name + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "conv1.bias" in name or "conv2.bias" in name: + # transpose conv1 and conv2 bias + data_torch = data_torch.unsqueeze(-1) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("UltravoxModel") +class UltravoxWhisperEncoderModel(WhisperEncoderModel): + has_vision_encoder = False # no vision encoder + has_audio_encoder = True + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX) + self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"]) + + +@ModelBase.register("MERaLiON2ForConditionalGeneration") +class MERaLiONWhisperEncoderModel(WhisperEncoderModel): + has_vision_encoder = False + has_audio_encoder = True + + def get_audio_config(self) -> dict[str, Any] | None: + return self.global_config.get("speech_config") + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MERALION) + self.gguf_writer.add_audio_stack_factor(self.global_config.get("speech_mlp_scale_factor", 15)) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("text_decoder."): + return None + + if name.startswith("speech_encoder."): + name = name.replace("speech_encoder.", "audio_tower.") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + suffix = "." + name.rsplit(".", 1)[-1] + + if name.startswith("ln_speech."): + yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MM_NORM_PRE, suffix=suffix), data_torch) + return + + if name.startswith("speech_audio_adapter."): + if ".mlp_adapter.0." in name: + yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 0, suffix=suffix), data_torch) + elif ".gate_proj." in name: + yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 1, suffix=suffix), data_torch) + elif ".pool_proj." in name: + yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 2, suffix=suffix), data_torch) + elif ".out_proj." in name: + yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 3, suffix=suffix), data_torch) + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("VoxtralForConditionalGeneration") +class VoxtralWhisperEncoderModel(WhisperEncoderModel): + has_vision_encoder = False # no vision encoder + has_audio_encoder = True + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.VOXTRAL) + self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size + + +@ModelBase.register("AudioFlamingo3ForConditionalGeneration") +class AudioFlamingo3WhisperEncoderModel(WhisperEncoderModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MUSIC_FLAMINGO) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".conv" in name and ".weight" in name: + # Was trained in BF16, being safe, avoiding quantizing to FP16 + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) diff --git a/conversion/wavtokenizer.py b/conversion/wavtokenizer.py new file mode 100644 index 00000000000..7d25447be88 --- /dev/null +++ b/conversion/wavtokenizer.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from typing import Callable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("WavTokenizerDec") +class WavTokenizerDecModel(TextModel): + model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if \ + name.endswith("codebook.cluster_size") or \ + name.endswith("codebook.embed_avg") or \ + name.endswith("codebook.inited"): + logger.debug(f"Skipping {name!r}") + return None + + return super().filter_tensors(item) + + def set_vocab(self): + self._set_vocab_none() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size (self.hparams["vocab_size"]) + self.gguf_writer.add_features_length (self.hparams["n_embd_features"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"]) + self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"]) + self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"]) + + self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"]) + self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"]) + + self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"]) + self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"]) + + self.gguf_writer.add_causal_attention(False) diff --git a/conversion/xverse.py b/conversion/xverse.py new file mode 100644 index 00000000000..fa8a31a133f --- /dev/null +++ b/conversion/xverse.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +import re + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("XverseForCausalLM") +class XverseModel(TextModel): + model_arch = gguf.MODEL_ARCH.XVERSE + + def set_vocab(self): + assert (self.dir_model / "tokenizer.json").is_file() + dir_model = self.dir_model + hparams = self.hparams + + tokens: list[bytes] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model) + vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute] + # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size, + # because vocab_size is the count of items, and indexes start at 0. + max_vocab_index = max(tokenizer.get_vocab().values()) # ty: ignore[unresolved-attribute] + if max_vocab_index >= vocab_size: + raise ValueError("Vocabulary size exceeds expected maximum size.") + + reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + + for token_id in range(vocab_size): + token_text = reverse_vocab[token_id].encode('utf-8') + # replace "\x00" to string with length > 0 + if token_text == b"\x00": + toktype = gguf.TokenType.BYTE # special + token_text = f"<{token_text}>".encode('utf-8') + elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): + toktype = gguf.TokenType.BYTE # special + elif reverse_vocab[token_id] in added_vocab: + if tokenizer.added_tokens_decoder[token_id].special: # ty: ignore[unresolved-attribute] + toktype = gguf.TokenType.CONTROL + else: + toktype = gguf.TokenType.USER_DEFINED + else: + toktype = gguf.TokenType.NORMAL + + tokens.append(token_text) + toktypes.append(toktype) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + # HF models permute some of the tensors, so we need to undo that + if name.endswith("q_proj.weight"): + data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) + if name.endswith("k_proj.weight"): + data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) + + yield from super().modify_tensors(data_torch, name, bid) + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) diff --git a/conversion/youtuvl.py b/conversion/youtuvl.py new file mode 100644 index 00000000000..cabc44445f3 --- /dev/null +++ b/conversion/youtuvl.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf, logger + + +@ModelBase.register("YoutuVLForConditionalGeneration") +class YoutuVLVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.YOUTUVL) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) + + # Handle activation function + hidden_act = str(self.hparams.get("hidden_act", "gelu_pytorch_tanh")).lower() + if hidden_act in ("gelu", "gelu_pytorch_tanh", "gelu_fast", "gelu_new", "gelu_accurate"): + self.gguf_writer.add_vision_use_gelu(True) + elif hidden_act == "silu": + self.gguf_writer.add_vision_use_silu(True) + else: + raise ValueError(f"Unsupported activation function for YOUTUVL: {hidden_act}") + + self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2)) + + window_size = self.hparams.get("window_size") + if window_size is not None: + self.gguf_writer.add_vision_window_size(window_size) + # fullatt_block_indexes contains explicit layer indices that use full attention + # e.g., [2, 5, 8, 11] means layers 2, 5, 8, 11 use full attention + # All other layers use window attention + fullatt_block_indexes = self.hparams.get("fullatt_block_indexes") + assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for youtuvl" + # Store the explicit layer indices for YoutuVL (irregular pattern approach) + self.gguf_writer.add_vision_wa_layer_indexes(layers=fullatt_block_indexes) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Skip language model tensors + skip_prefixes = ('lm_head.', 'model.layers.', 'model.embed_tokens.', 'model.norm.') + if name.startswith(skip_prefixes): + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Try to map the tensor using TensorNameMap (handles vision encoder and projector) + try: + yield from super().modify_tensors(data_torch, name, bid) + except ValueError: + # If mapping fails, log warning and skip + logger.warning(f"Cannot map tensor: {name}") + return diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 5cff8684856..7173f616009 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3,13981 +3,46 @@ from __future__ import annotations -import ast -import logging -import argparse -import contextlib -import json -import os -import re -import sys -from enum import IntEnum -from pathlib import Path -from hashlib import sha256 -from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast -from itertools import chain -from transformers import AutoConfig - -import math -import numpy as np -import torch - -if TYPE_CHECKING: - from torch import Tensor - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) -import gguf -from gguf.vocab import MistralTokenizerType, MistralVocab - -try: - from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found, ty:unresolved-import] - from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found, ty:unresolved-import] - from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import] - from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found, ty:unresolved-import] - SentencePieceTokenizer, - ) - - _mistral_common_installed = True - _mistral_import_error_msg = "" -except ImportError: - _MISTRAL_COMMON_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) - _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) - - _mistral_common_installed = False - TokenizerVersion: Any = None - Tekkenizer: Any = None - SentencePieceTokenizer: Any = None - _mistral_import_error_msg = ( - "Mistral format requires `mistral-common` to be installed. Please run " - "`pip install mistral-common[image,audio]` to install it." - ) - - -logger = logging.getLogger("hf-to-gguf") - - -###### MODEL DEFINITIONS ###### - -class SentencePieceTokenTypes(IntEnum): - NORMAL = 1 - UNKNOWN = 2 - CONTROL = 3 - USER_DEFINED = 4 - UNUSED = 5 - BYTE = 6 - - -class ModelType(IntEnum): - TEXT = 1 - MMPROJ = 2 - - -AnyModel = TypeVar("AnyModel", bound="type[ModelBase]") - - -class ModelBase: - _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = { - ModelType.TEXT: {}, - ModelType.MMPROJ: {}, - } - - dir_model: Path - ftype: gguf.LlamaFileType - fname_out: Path - is_big_endian: bool - endianess: gguf.GGUFEndian - use_temp_file: bool - lazy: bool - dry_run: bool - hparams: dict[str, Any] - model_tensors: dict[str, Callable[[], Tensor]] - gguf_writer: gguf.GGUFWriter - model_name: str | None - metadata_override: Path | None - dir_model_card: Path - remote_hf_model_id: str | None - - # subclasses should define this! - model_arch: gguf.MODEL_ARCH - - # subclasses should initialize this! - block_count: int - tensor_map: gguf.TensorNameMap - - # Mistral format specifics - is_mistral_format: bool = False - disable_mistral_community_chat_template: bool = False - sentence_transformers_dense_modules: bool = False - - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False, - use_temp_file: bool = False, eager: bool = False, - metadata_override: Path | None = None, model_name: str | None = None, - split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, - small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, - disable_mistral_community_chat_template: bool = False, - sentence_transformers_dense_modules: bool = False, - fuse_gate_up_exps: bool = False): - if type(self) is ModelBase or \ - type(self) is TextModel or \ - type(self) is MmprojModel: - raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") - - if self.is_mistral_format and not _mistral_common_installed: - raise ImportError(_mistral_import_error_msg) - - self.dir_model = dir_model - self.ftype = ftype - self.fname_out = fname_out - self.is_big_endian = is_big_endian - self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE - self.use_temp_file = use_temp_file - self.lazy = not eager or (remote_hf_model_id is not None) - self.dry_run = dry_run - self.remote_hf_model_id = remote_hf_model_id - self.sentence_transformers_dense_modules = sentence_transformers_dense_modules - self.fuse_gate_up_exps = fuse_gate_up_exps - self._gate_exp_buffer: dict[int, Tensor] = {} - self._up_exp_buffer: dict[int, Tensor] = {} - self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams - self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id) - self.metadata_override = metadata_override - self.model_name = model_name - self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py - self._is_nvfp4 = False - self._is_mxfp4 = False - - # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype - # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. - if self.ftype == gguf.LlamaFileType.GUESSED: - for _, tensor in self.get_tensors(): - if tensor.dim() < 2: - continue - - if tensor.dtype == torch.bfloat16: - self.ftype = gguf.LlamaFileType.MOSTLY_BF16 - logger.info("heuristics detected bfloat16 tensor dtype, setting --outtype bf16") - break - elif tensor.dtype == torch.float16: - self.ftype = gguf.LlamaFileType.MOSTLY_F16 - logger.info("heuristics detected float16 tensor dtype, setting --outtype f16") - break - else: - self.ftype = gguf.LlamaFileType.MOSTLY_F16 - logger.info("heuristics unable to detect tensor dtype, defaulting to --outtype f16") - - # Configure GGUF Writer - self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, - split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) - - # Mistral specific - self.disable_mistral_community_chat_template = disable_mistral_community_chat_template - - @classmethod - def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path: - stem, suffix = path.stem, path.suffix - new_name = f"{prefix}{stem}{suffix}" - return path.with_name(new_name) - - def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: - key = next((k for k in keys if k in self.hparams), None) - if key is not None: - return self.hparams[key] - if optional: - return None - raise KeyError(f"could not find any of: {keys}") - - def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]: - tensors: dict[str, Callable[[], Tensor]] = {} - - if remote_hf_model_id is not None: - is_safetensors = True - - logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}") - remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id) - for name, remote_tensor in remote_tensors.items(): - data_gen = lambda r=remote_tensor: LazyTorchTensor.from_remote_tensor(r) # noqa: E731 - if titem := self.filter_tensors((name, data_gen)): - tname, tgen = titem - tensors[tname] = tgen - - return tensors - - prefix = "model" if not self.is_mistral_format else "consolidated" - part_names: list[str] = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors") - is_safetensors: bool = len(part_names) > 0 - if not is_safetensors: - part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") - - tensor_names_from_index: set[str] = set() - tensor_names_from_parts: set[str] = set() - - if not self.is_mistral_format: - index_name = "model.safetensors" if is_safetensors else "pytorch_model.bin" - index_name += ".index.json" - index_file = self.dir_model / index_name - - if index_file.is_file(): - logger.info(f"gguf: loading model weight map from '{index_name}'") - with open(index_file, "r", encoding="utf-8") as f: - index: dict[str, Any] = json.load(f) - weight_map = index.get("weight_map") - if weight_map is None or not isinstance(weight_map, dict): - raise ValueError(f"Can't load 'weight_map' from {index_name!r}") - tensor_names_from_index.update(weight_map.keys()) - part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) # ty: ignore[invalid-assignment] - part_names = sorted(part_dict.keys()) - else: - weight_map = {} - else: - weight_map = {} - - for part_name in part_names: - logger.info(f"gguf: indexing model part '{part_name}'") - ctx: ContextManager[Any] - if is_safetensors: - ctx = cast(ContextManager[Any], gguf.utility.SafetensorsLocal(self.dir_model / part_name)) - else: - ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) - - with ctx as model_part: - assert model_part is not None - - for name in model_part.keys(): - tensor_names_from_parts.add(name) - if is_safetensors: - data: gguf.utility.LocalTensor = model_part[name] - if self.lazy: - data_gen = lambda data=data: LazyTorchTensor.from_local_tensor(data) # noqa: E731 - else: - dtype = LazyTorchTensor._dtype_str_map[data.dtype] - data_gen = lambda data=data, dtype=dtype: torch.from_numpy(data.mmap_bytes()).view(dtype).reshape(data.shape) # noqa: E731 - else: - data_torch: Tensor = model_part[name] - if self.lazy: - data_gen = lambda data=data_torch: LazyTorchTensor.from_eager(data) # noqa: E731 - else: - data_gen = lambda data=data_torch: data # noqa: E731 - if titem := self.filter_tensors((name, data_gen)): - tname, tgen = titem - tensors[tname] = tgen - - # verify tensor name presence and identify potentially missing files - if len(tensor_names_from_index) > 0: - if len(tensor_names_from_parts.symmetric_difference(tensor_names_from_index)) > 0: - missing = sorted(tensor_names_from_index.difference(tensor_names_from_parts)) - extra = sorted(tensor_names_from_parts.difference(tensor_names_from_index)) - missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map)) - if len(extra) == 0 and len(missing_files) > 0: - raise ValueError(f"Missing or incomplete model files: {missing_files}\n" - f"Missing tensors: {missing}") - else: - raise ValueError("Mismatch between weight map and model parts for tensor names:\n" - f"Missing tensors: {missing}\n" - f"Extra tensors: {extra}") - - return tensors - - @staticmethod - def _scale_is_trivial(scale: Tensor) -> bool: - return scale.numel() <= 1 and abs(float(scale.float().sum()) - 1.0) < 1e-6 - - def _write_scale_tensor(self, scale_name: str, scale: Tensor): - if not self._scale_is_trivial(scale): - scale_f32 = scale.float().numpy().flatten() - logger.info(f" + {scale_name} (per-tensor scale, shape [{scale_f32.size}])") - self.gguf_writer.add_tensor(scale_name, scale_f32) - - def _write_scales_tensor(self, scale_name: str, scales: list[float]): - if not np.allclose(scales, 1.0, atol=1e-6): - scale_vals = np.array(scales, dtype=np.float32) - logger.info(f" + {scale_name} (per-expert scale, shape [{len(scales)}])") - self.gguf_writer.add_tensor(scale_name, scale_vals) - - def dequant_model(self): - # If all quantized tensors were already handled (e.g. pure NVFP4), skip - if self._is_nvfp4 and not any(k.endswith((".weight_scale", ".weight_scale_inv")) for k in self.model_tensors): - return - - tensors_to_remove: list[str] = [] - new_tensors: dict[str, Callable[[], Tensor]] = {} - - if (quant_config := self.hparams.get("quantization_config")) and isinstance(quant_config, dict): - quant_method = quant_config.get("quant_method") - - def dequant_bitnet(weight: Tensor, scale: Tensor) -> Tensor: - weight = weight.view(torch.uint8) - orig_shape = weight.shape - - shift = torch.tensor([0, 2, 4, 6], dtype=torch.uint8).reshape((4, *(1 for _ in range(len(orig_shape))))) - data = weight.unsqueeze(0).expand((4, *orig_shape)) >> shift - data = data & 3 - data = (data.float() - 1).reshape((orig_shape[0] * 4, *orig_shape[1:])) - - # The scale is inverted - return data / scale.float() - - def dequant_simple(weight: Tensor, scale: Tensor, block_size: Sequence[int] | None = None) -> Tensor: - scale = scale.float() - - if block_size is not None: - dim_offset = scale.ndim - len(block_size) - for i, size in enumerate(block_size): - scale = scale.repeat_interleave(size, dim_offset + i) - # unpad the scale (e.g. when the tensor size isn't a multiple of the block size) - scale = scale[tuple(slice(0, size) for size in weight.shape)] - - # align scale dims to weight for correct broadcasting (e.g. [128] -> [128, 1, 1]) - while scale.ndim < weight.ndim: - scale = scale.unsqueeze(-1) - - return weight.float() * scale - - # ref: https://github.com/ModelCloud/GPTQModel/blob/037c5c0f6c9e33c500d975b038d02e7ca437546d/gptqmodel/nn_modules/qlinear/__init__.py#L437-L476 - def dequant_gptq(g_idx: Tensor, qweight: Tensor, qzeros: Tensor, scales: Tensor) -> Tensor: - bits = quant_config["bits"] - assert bits in (2, 3, 4, 8) - assert qweight.dtype == qzeros.dtype - maxq = (2 ** bits) - 1 - weight = None - zeros = None - pack_dtype_bits = qweight.dtype.itemsize * 8 - - if bits in [2, 4, 8]: - pack_factor = pack_dtype_bits // bits - wf = torch.tensor(list(range(0, pack_dtype_bits, bits)), dtype=torch.int32).unsqueeze(0) - if self.lazy: - wf = LazyTorchTensor.from_eager(wf) - - zeros = torch.bitwise_right_shift( - qzeros.unsqueeze(2).expand(-1, -1, pack_factor), - wf.unsqueeze(0) - ).to(torch.int16 if bits == 8 else torch.int8) - zeros = torch.bitwise_and(zeros, maxq).reshape(scales.shape) - - weight = torch.bitwise_and( - torch.bitwise_right_shift( - qweight.unsqueeze(1).expand(-1, pack_factor, -1), - wf.unsqueeze(-1) - ).to(torch.int16 if bits == 8 else torch.int8), - maxq - ) - elif bits == 3: - raise NotImplementedError("3-bit gptq dequantization is not yet implemented") - - assert weight is not None - assert zeros is not None - - weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2]) - - # gptq_v2 doesn't need to offset zeros - if quant_config.get("checkpoint_format", "gptq") == "gptq": - zeros += 1 - - return (scales[g_idx].float() * (weight - zeros[g_idx]).float()).T - - def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: Tensor | None, num_bits: int, group_size: int): - assert w.dtype == torch.int32 - shape = tuple(shape_tensor.tolist()) - assert len(shape) == 2 - mask = (1 << num_bits) - 1 - - shifts = torch.arange(0, 32 - (num_bits - 1), num_bits, dtype=torch.int32) - if self.lazy: - shifts = LazyTorchTensor.from_eager(shifts) - - if zero_point is None: - offset = 1 << (num_bits - 1) - else: - assert len(zero_point.shape) == 2 - offset = (zero_point.unsqueeze(1) >> shifts.reshape(1, -1, 1)) & mask - offset = offset.reshape(-1, zero_point.shape[1]) - # trim padding, and prepare for broadcast - # NOTE: the zero-point is packed along dim 0 - offset = offset[:shape[0], :].unsqueeze(-1) - - # extract values - # NOTE: the weights are packed along dim 1 - unpacked = (w.unsqueeze(-1) >> shifts.reshape(1, 1, -1)) & mask - unpacked = unpacked.reshape(shape[0], -1) - - # trim padding - unpacked = unpacked[:, :shape[1]] - - # prepare for broadcast of the scale - unpacked = unpacked.reshape(shape[0], (unpacked.shape[-1] + group_size - 1) // group_size, group_size) - unpacked = unpacked - offset - - return (unpacked * scale.unsqueeze(-1).float()).reshape(shape) - - if quant_method == "bitnet": - for name in self.model_tensors.keys(): - if name.endswith(".weight_scale"): - weight_name = name.removesuffix("_scale") - w = self.model_tensors[weight_name] - s = self.model_tensors[name] - self.model_tensors[weight_name] = lambda w=w, s=s: dequant_bitnet(w(), s()) - tensors_to_remove.append(name) - elif quant_method == "fp8": - block_size = quant_config.get("weight_block_size") - for name in self.model_tensors.keys(): - if name.endswith("_scale_inv"): - weight_name = name.removesuffix("_scale_inv") - w = self.model_tensors[weight_name] - s = self.model_tensors[name] - self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs) - tensors_to_remove.append(name) - if name.endswith(".activation_scale"): # unused - tensors_to_remove.append(name) - if name.endswith("_activation_scale"): # Mistral-Small-4-119B-2602, unused - tensors_to_remove.append(name) - # mistral format - if name.endswith(".qscale_weight"): - weight_name = name.removesuffix("qscale_weight") + "weight" - w = self.model_tensors[weight_name] - s = self.model_tensors[name] - self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs) - tensors_to_remove.append(name) - if name.endswith(".qscale_act"): - tensors_to_remove.append(name) - elif quant_method == "gptq": - for name in self.model_tensors.keys(): - if name.endswith(".qweight"): - base_name = name.removesuffix(".qweight") - g_idx = self.model_tensors[base_name + ".g_idx"] - qweight = self.model_tensors[base_name + ".qweight"] - qzeros = self.model_tensors[base_name + ".qzeros"] - scales = self.model_tensors[base_name + ".scales"] - new_tensors[base_name + ".weight"] = ( - lambda g=g_idx, z=qzeros, w=qweight, s=scales: dequant_gptq( - g(), w(), z(), s() - ) - ) - tensors_to_remove += [ - base_name + n - for n in ( - ".g_idx", - ".qzeros", - ".qweight", - ".scales", - ) - ] - elif quant_method == "compressed-tensors": - quant_format = quant_config["format"] - groups = quant_config["config_groups"] - if len(groups) > 1: - raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet") - weight_config = tuple(groups.values())[0]["weights"] - - if quant_format == "float-quantized" or quant_format == "int-quantized" or quant_format == "naive-quantized": - block_size = weight_config.get("block_structure", None) - strategy = weight_config.get("strategy") - assert strategy == "channel" or strategy == "block" - assert weight_config.get("group_size") is None # didn't find a model using this yet - for name in self.model_tensors.keys(): - if name.endswith(".weight_scale"): - weight_name = name.removesuffix("_scale") - w = self.model_tensors[weight_name] - s = self.model_tensors[name] - self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), block_size) - tensors_to_remove.append(name) - elif quant_format == "pack-quantized": - assert weight_config.get("strategy") == "group" - assert weight_config.get("type", "int") == "int" - num_bits = weight_config.get("num_bits") - group_size = weight_config.get("group_size") - assert isinstance(num_bits, int) - assert isinstance(group_size, int) - for name in self.model_tensors.keys(): - if name.endswith(".weight_packed"): - base_name = name.removesuffix("_packed") - w = self.model_tensors[name] - scale = self.model_tensors[base_name + "_scale"] - shape = self.model_tensors[base_name + "_shape"] - zero_point = self.model_tensors.get(base_name + "_zero_point", lambda: None) - new_tensors[base_name] = ( - lambda w=w, scale=scale, shape=shape, zero_point=zero_point: dequant_packed( - w(), scale(), shape(), zero_point(), num_bits, group_size, - ) - ) - tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")] - if (base_name + "_zero_point") in self.model_tensors: - tensors_to_remove.append(base_name + "_zero_point") - else: - raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported") - elif quant_method == "modelopt": - # Mixed-precision ModelOpt models: NVFP4 tensors are handled by - # _generate_nvfp4_tensors; FP8 tensors have 1D weight_scale and - # are dequantized here. k/v scale tensors are unused. - for name in self.model_tensors.keys(): - if name.endswith(".weight_scale"): - weight_name = name.removesuffix("_scale") - w = self.model_tensors[weight_name] - s = self.model_tensors[name] - self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None) - tensors_to_remove.append(name) - if name.endswith((".input_scale", ".k_scale", ".v_scale")): - tensors_to_remove.append(name) - elif quant_method is not None: - raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}") - - for name in tensors_to_remove: - if name in self.model_tensors: - del self.model_tensors[name] - - for name, value in new_tensors.items(): - self.model_tensors[name] = value - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.endswith("e_score_correction_bias"): - name = name.replace("e_score_correction_bias", "e_score_correction.bias") - - if "language_model." in name: - name = name.replace("language_model.", "") - - return name, gen - - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - for name, gen in self.model_tensors.items(): - yield name, gen() - - def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: - if key not in gguf.MODEL_TENSORS[self.model_arch]: - raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}") - name: str = gguf.TENSOR_NAMES[key] - if "{bid}" in name: - assert bid is not None - name = name.format(bid=bid) - return name + suffix - - def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool: - if key not in gguf.MODEL_TENSORS[self.model_arch]: - return False - key_name: str = gguf.TENSOR_NAMES[key] - if "{bid}" in key_name: - if bid is None: - return False - key_name = key_name.format(bid=bid) - else: - if bid is not None: - return False - return name == (key_name + suffix) - - def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: - new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) - if new_name is None: - raise ValueError(f"Can not map tensor {name!r}") - return new_name - - def set_gguf_parameters(self): - raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - new_name = self.map_tensor_name(name) - - # Handle gate/up expert tensor fusion if enabled - if self.fuse_gate_up_exps and bid is not None: - if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid): - self._gate_exp_buffer[bid] = data_torch - elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid): - self._up_exp_buffer[bid] = data_torch - - # Check if both gate and up are buffered for this layer - if bid in self._gate_exp_buffer and bid in self._up_exp_buffer: - gate_data = self._gate_exp_buffer.pop(bid) - up_data = self._up_exp_buffer.pop(bid) - # gate/up shape: (n_expert, n_ff, n_embd), concatenate to (n_expert, n_ff*2, n_embd) - fused_data = torch.cat([gate_data, up_data], dim=1) - fused_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, bid) - logger.info(f"Fused gate_exps and up_exps for layer {bid}") - return [(fused_name, fused_data)] - - # If we buffered a gate/up tensor, wait for the other - if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid) or \ - self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid): - return [] - - return [(new_name, data_torch)] - - def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: - del name, new_name, bid, n_dims # unused - - return False - - # some models need extra generated tensors (like rope_freqs) - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - return () - - @staticmethod - def _nvfp4_pack(weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]: - """Repack NVFP4 ModelOpt tensors into ggml super-block layout. - Preserves original E4M3 scale bits as UE4M3 (strip sign bit). - The per-tensor scale2 factor is stored as a separate tensor and applied at inference time via ggml_mul(). - Returns (raw_data, logical_shape).""" - - out_features = weight.shape[0] - n_blocks = scale.shape[1] - - # Unpack ModelOpt nibble-packed weights - w = weight.reshape(out_features, n_blocks, 8) - vals = torch.stack([w & 0x0F, w >> 4], dim=-1).reshape(out_features, n_blocks, 16) - - # Preserve original E4M3 scale bits as UE4M3 (strip sign bit) - d_ue = scale.view(torch.uint8).numpy().reshape(out_features, n_blocks) & 0x7F - qs = (vals[:, :, :8] | (vals[:, :, 8:] << 4)).to(torch.uint8).numpy() - - # Pack into super-blocks: [4 UE4M3 scales, 32 qs bytes] = 36 bytes per 64 elements - n_super = n_blocks // 4 - d_grouped = d_ue.reshape(out_features, n_super, 4) - qs_grouped = qs.reshape(out_features, n_super, 4, 8).reshape(out_features, n_super, 32) - raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36) - return raw, [out_features, n_super * 64] - - def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor): - new_name = self.map_tensor_name(name) - - raw, shape = self._nvfp4_pack(weight, scale) - logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4") - self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4) - - self._write_scale_tensor(new_name.replace(".weight", ".scale"), scale2) - self._write_scale_tensor(new_name.replace(".weight", ".input_scale"), input_scale) - - def _generate_nvfp4_tensors(self): - # Per-layer expert merging to avoid holding all experts in memory - expert_blocks: dict[tuple[int, str], list[tuple[int, np.ndarray]]] = {} - expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {} - expert_input_scales: dict[tuple[int, str], list[tuple[int, float]]] = {} - expert_shapes: dict[tuple[int, str], list[int]] = {} - n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0 - consumed: list[str] = [] - - for name in self.model_tensors.keys(): - if not name.endswith(".weight"): - continue - scale_name = name.replace(".weight", ".weight_scale") - scale2_name = name.replace(".weight", ".weight_scale_2") - input_scale_name = name.replace(".weight", ".input_scale") - if scale_name not in self.model_tensors: - continue - # Force eager materialization of lazy tensors - weight = LazyTorchTensor.to_eager(self.model_tensors[name]()) - scale = LazyTorchTensor.to_eager(self.model_tensors[scale_name]()) - - # Skip non-NVFP4 tensors (e.g. FP8 with per-channel 1D scales) - if scale.ndim < 2: - continue - - scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))()) - input_scale = LazyTorchTensor.to_eager(self.model_tensors.get(input_scale_name, lambda: torch.tensor(1.0))()) - - # Mark tensors for removal from model_tensors (already written to gguf) - consumed.extend([name, scale_name]) - if scale2_name in self.model_tensors: - consumed.append(scale2_name) - if input_scale_name in self.model_tensors: - consumed.append(input_scale_name) - - # Check if this is a per-expert tensor - m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name) - if m: - expert_id = int(m.group(1)) - proj_type = m.group(2) - bid_m = re.search(r'\.layers\.(\d+)\.', name) - bid = int(bid_m.group(1)) if bid_m else 0 - key = (bid, proj_type) - - raw, shape = self._nvfp4_pack(weight, scale) - - if key not in expert_blocks: - expert_blocks[key] = [] - expert_scales[key] = [] - expert_input_scales[key] = [] - expert_shapes[key] = shape - expert_blocks[key].append((expert_id, raw.copy())) - # Collect per-expert scale2 (scalar per expert) - expert_scales[key].append((expert_id, float(scale2.float().sum()))) - # Collect per-expert input_scale (scalar per expert) - expert_input_scales[key].append((expert_id, float(input_scale.float().sum()))) - - # Flush when all experts for this (layer, proj) are collected - if n_experts > 0 and len(expert_blocks[key]) >= n_experts: - self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type) - else: - self._repack_nvfp4(name, weight, scale, scale2, input_scale) - - # Flush any remaining experts (fallback if n_experts was unknown) - for bid, proj_type in list(expert_blocks.keys()): - self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type) - - # Remove consumed tensors so get_tensors/modify_tensors won't see them - for name in consumed: - self.model_tensors.pop(name, None) - - # Remove any remaining unused auxiliary tensors - for name in list(self.model_tensors.keys()): - if name.endswith((".k_scale", ".v_scale")): - del self.model_tensors[name] - - def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type): - experts = expert_blocks.pop(key) - scales = expert_scales.pop(key) - input_scales = expert_input_scales.pop(key) - shape = expert_shapes.pop(key) - - experts.sort(key=lambda x: x[0]) - merged = np.stack([e[1] for e in experts], axis=0) - merged_name = f"model.layers.{bid}.mlp.experts.{proj_type}.weight" - new_name = self.map_tensor_name(merged_name) - logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4") - self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4) - - scales.sort(key=lambda x: x[0]) - self._write_scales_tensor(new_name.replace(".weight", ".scale"), [s[1] for s in scales]) - - input_scales.sort(key=lambda x: x[0]) - self._write_scales_tensor(new_name.replace(".weight", ".input_scale"), [s[1] for s in input_scales]) - - del experts, merged - - def prepare_tensors(self): - # detect NVFP4 quantization (ModelOpt format) - quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo") - quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method") - quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {} - quant_config_file = self.dir_model / "hf_quant_config.json" - - if (not quant_algo or not quant_layers) and quant_config_file.is_file(): - with open(quant_config_file, "r", encoding="utf-8") as f: - hf_quant_config = json.load(f) - quant_config = hf_quant_config.get("quantization") or {} - producer = hf_quant_config.get("producer") or {} - producer_name = (producer.get("name") or "").lower() - if quant_method is None: - self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name - quant_algo = quant_config.get("quant_algo", quant_algo) - quant_layers = quant_config.get("quantized_layers", quant_layers) or {} - - # Some models use per-tensor quant_algo (e.g. "MIXED_PRECISION" with - # per-layer NVFP4/FP8) instead of a single global "NVFP4" value. - if quant_algo != "NVFP4": - if any(v.get("quant_algo") == "NVFP4" for v in quant_layers.values() if isinstance(v, dict)): - quant_algo = "NVFP4" - - self._is_nvfp4 = quant_algo == "NVFP4" - self._is_mxfp4 = quant_method == "mxfp4" - - # NVFP4 weights are repacked and written directly to gguf_writer. - # This must run before dequant_model so NVFP4 tensors are removed - # from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant. - if self._is_nvfp4: - self._generate_nvfp4_tensors() - - self.dequant_model() - - # Handle empty tensor_map for models with block_count=0 (like MobileNetV5) - if self.tensor_map.mapping: - max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") - else: - max_name_len = len("vision_encoder.weight,") # Default reasonable length - - for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - # use the first number-like part of the tensor name as the block id - bid = None - for part in name.split("."): - if part.isdecimal(): - bid = int(part) - break - - for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): - # TODO: why do we squeeze here? - # data = data_torch.squeeze().numpy() - data = data_torch.numpy() - - n_dims = len(data.shape) - data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims) - - # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors - if n_dims <= 1 or new_name.endswith("_norm.weight"): - data_qtype = gguf.GGMLQuantizationType.F32 - - # Conditions should closely match those in llama_model_quantize_internal in llama.cpp - # Some tensor types are always in float32 - if data_qtype is False and ( - any( - self.match_model_tensor_name(new_name, key, bid) - for key in ( - gguf.MODEL_TENSOR.FFN_GATE_INP, - gguf.MODEL_TENSOR.FFN_GATE_INP_SHEXP, - gguf.MODEL_TENSOR.POS_EMBD, - gguf.MODEL_TENSOR.TOKEN_TYPES, - gguf.MODEL_TENSOR.SSM_CONV1D, - gguf.MODEL_TENSOR.SHORTCONV_CONV, - gguf.MODEL_TENSOR.TIME_MIX_FIRST, - gguf.MODEL_TENSOR.TIME_MIX_W1, - gguf.MODEL_TENSOR.TIME_MIX_W2, - gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, - gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, - gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED, - gguf.MODEL_TENSOR.POSNET_NORM1, - gguf.MODEL_TENSOR.POSNET_NORM2, - gguf.MODEL_TENSOR.V_ENC_EMBD_POS, - gguf.MODEL_TENSOR.A_ENC_EMBD_POS, - gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF, - gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF, - # Kimi KDA conv weights should be F32 - gguf.MODEL_TENSOR.SSM_CONV1D_Q, - gguf.MODEL_TENSOR.SSM_CONV1D_K, - gguf.MODEL_TENSOR.SSM_CONV1D_V, - ) - ) - or new_name[-7:] not in (".weight", ".lora_a", ".lora_b") - ): - data_qtype = gguf.GGMLQuantizationType.F32 - - if data_qtype is False and any( - self.match_model_tensor_name(new_name, key, bid) - for key in ( - gguf.MODEL_TENSOR.TOKEN_EMBD, - gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD, - gguf.MODEL_TENSOR.OUTPUT, - gguf.MODEL_TENSOR.ALTUP_ROUTER, - gguf.MODEL_TENSOR.LAUREL_L, - gguf.MODEL_TENSOR.LAUREL_R, - ) - ): - if self.ftype in ( - gguf.LlamaFileType.MOSTLY_TQ1_0, - gguf.LlamaFileType.MOSTLY_TQ2_0, - ): - # TODO: use Q4_K and Q6_K - data_qtype = gguf.GGMLQuantizationType.F16 - - # No override (data_qtype is False), or wants to be quantized (data_qtype is True) - if isinstance(data_qtype, bool): - if self.ftype == gguf.LlamaFileType.ALL_F32: - data_qtype = gguf.GGMLQuantizationType.F32 - elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: - data_qtype = gguf.GGMLQuantizationType.F16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: - data_qtype = gguf.GGMLQuantizationType.BF16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: - data_qtype = gguf.GGMLQuantizationType.Q8_0 - elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0: - data_qtype = gguf.GGMLQuantizationType.TQ1_0 - elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0: - data_qtype = gguf.GGMLQuantizationType.TQ2_0 - else: - raise ValueError(f"Unknown file type: {self.ftype.name}") - - try: - data = gguf.quants.quantize(data, data_qtype) - except gguf.QuantError as e: - logger.warning("%s, %s", e, "falling back to F16") - data_qtype = gguf.GGMLQuantizationType.F16 - data = gguf.quants.quantize(data, data_qtype) - - shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape - - # reverse shape to make it similar to the internal ggml dimension order - shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" - - # n_dims is implicit in the shape - logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") - - self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) - - def set_type(self): - self.gguf_writer.add_type(gguf.GGUFType.MODEL) - - def prepare_metadata(self, vocab_only: bool): - - total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count() - - self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params) - - # If we are using HF model id, set the metadata name to the model id - if self.remote_hf_model_id: - self.metadata.name = self.remote_hf_model_id - - # Fallback to model directory name if metadata name is still missing - if self.metadata.name is None: - self.metadata.name = self.dir_model.name - - if self.ftype in (gguf.LlamaFileType.ALL_F32, gguf.LlamaFileType.MOSTLY_F16, gguf.LlamaFileType.MOSTLY_BF16): - if self._is_nvfp4: - self.ftype = gguf.LlamaFileType.MOSTLY_NVFP4 - elif self._is_mxfp4: - self.ftype = gguf.LlamaFileType.MOSTLY_MXFP4_MOE - - # Generate parameter weight class (useful for leader boards) if not yet determined - if self.metadata.size_label is None and total_params > 0: - self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) - - self.set_type() - - logger.info("Set meta model") - self.metadata.set_gguf_meta_model(self.gguf_writer) - - logger.info("Set model parameters") - self.set_gguf_parameters() - - logger.info("Set model quantization version") - self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - - def write_vocab(self): - raise NotImplementedError("write_vocab() must be implemented in subclasses") - - def write(self): - self.prepare_tensors() - self.prepare_metadata(vocab_only=False) - self.gguf_writer.write_header_to_file(path=self.fname_out) - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file(progress=True) - self.gguf_writer.close() - - @staticmethod - def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: - part_names: list[str] = [] - for filename in os.listdir(dir_model): - if filename.startswith(prefix) and filename.endswith(suffix): - part_names.append(filename) - - part_names.sort() - - return part_names - - @staticmethod - def load_hparams(dir_model: Path, is_mistral_format: bool): - if is_mistral_format: - with open(dir_model / "params.json", "r", encoding="utf-8") as f: - config = json.load(f) - return config - - try: - # for security reason, we don't allow loading remote code by default - # if a model need remote code, we will fallback to config.json - config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict() - except Exception as e: - logger.warning(f"Failed to load model config from {dir_model}: {e}") - logger.warning("Trying to load config.json instead") - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - config = json.load(f) - if "llm_config" in config: - # rename for InternVL - config["text_config"] = config["llm_config"] - if "lm_config" in config: - # rename for GlmASR - config["text_config"] = config["lm_config"] - if "thinker_config" in config: - # rename for Qwen2.5-Omni - config["text_config"] = config["thinker_config"]["text_config"] - if "language_config" in config: - # rename for DeepSeekOCR - config["text_config"] = config["language_config"] - if "lfm" in config: - # rename for LFM2-Audio - config["text_config"] = config["lfm"] - return config - - @classmethod - def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: - assert names - - def func(modelcls: AnyModel) -> AnyModel: - model_type = ModelType.MMPROJ if modelcls.model_arch == gguf.MODEL_ARCH.MMPROJ else ModelType.TEXT - for name in names: - cls._model_classes[model_type][name] = modelcls - return modelcls - return func - - @classmethod - def print_registered_models(cls): - for model_type, model_classes in cls._model_classes.items(): - logger.error(f"{model_type.name} models:") - for name in sorted(model_classes.keys()): - logger.error(f" - {name}") - - @classmethod - def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type[ModelBase]: - try: - return cls._model_classes[model_type][arch] - except KeyError: - raise NotImplementedError(f'Architecture {arch!r} not supported!') from None - - -class TextModel(ModelBase): - model_type = ModelType.TEXT - hf_arch: str - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if not self.is_mistral_format: - self.hf_arch = get_model_architecture(self.hparams, self.model_type) - else: - self.hf_arch = "" - - if "text_config" in self.hparams: - # move the text_config to the root level - self.hparams = {**self.hparams, **self.hparams["text_config"]} - - self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {} - - rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True) - local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True) - - # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters - if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters: - if local_rope_theta is not None: - self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta} - if "rope_theta" not in self.rope_parameters and rope_theta is not None: - self.rope_parameters["rope_theta"] = rope_theta - if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None: - self.rope_parameters["rope_type"] = rope_type - - @classmethod - def __init_subclass__(cls): - # can't use an abstract property, because overriding it without type errors - # would require using decorated functions instead of simply defining the property - if "model_arch" not in cls.__dict__: - raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Skip multimodal tensors - if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \ - or "visual." in name or "vision." in name or "audio." in name or "talker." in name \ - or "vision_" in name or "audio_" in name or "sam_model" in name \ - or "token2wav." in name or "code2wav." in name \ - or "projector." in name or "pre_mm_projector_norm" in name \ - or "image_newline" in name or "view_seperator" in name \ - or "patch_embed" in name or "patch_embedding" in name \ - or "patch_merger." in name or "model.connector." in name: - return None - - return super().filter_tensors(item) - - def set_vocab(self): - self._set_vocab_gpt2() - - def prepare_metadata(self, vocab_only: bool): - super().prepare_metadata(vocab_only=vocab_only) - - total_params = self.gguf_writer.get_total_parameter_count()[0] - # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' - output_type: str = self.ftype.name.partition("_")[2] - - # Filename Output - if self.fname_out.is_dir(): - # Generate default filename based on model specification and available metadata - if not vocab_only: - fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) - else: - fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") - - # Use the default filename - self.fname_out = self.fname_out / f"{fname_default}.gguf" - else: - # Output path is a custom defined templated filename - # Note: `not is_dir()` is used because `.is_file()` will not detect - # file template strings as it doesn't actually exist as a file - - # Process templated file name with the output ftype, useful with the "auto" ftype - self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) - - logger.info("Set model tokenizer") - self.set_vocab() - - def set_gguf_parameters(self): - self.gguf_writer.add_block_count(self.block_count) - - if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length", "max_sequence_length", "model_max_length"], optional=True)) is not None: - self.gguf_writer.add_context_length(n_ctx) - logger.info(f"gguf: context length = {n_ctx}") - - if (n_embd := self.find_hparam(["hidden_size", "n_embd", "dim"], optional=True)) is not None: - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") - - if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None: - self.gguf_writer.add_feed_forward_length(n_ff) - logger.info(f"gguf: feed forward length = {n_ff}") - - if (n_head := self.find_hparam(["num_attention_heads", "n_head", "n_heads"], optional=True)) is not None: - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") - - if (n_head_kv := self.find_hparam(["num_key_value_heads", "n_kv_heads"], optional=True)) is not None: - self.gguf_writer.add_head_count_kv(n_head_kv) - logger.info(f"gguf: key-value head count = {n_head_kv}") - - if self.hparams.get("is_causal") is False: - self.gguf_writer.add_causal_attention(False) - logger.info("gguf: causal attention = False") - - # TODO: Handle "sliding_attention" similarly when models start implementing it - rope_params = self.rope_parameters.get("full_attention", self.rope_parameters) - if (rope_type := rope_params.get("rope_type")) is not None: - rope_factor = rope_params.get("factor") - rope_gguf_type = gguf.RopeScalingType.NONE - if rope_type == "linear" and rope_factor is not None: - rope_gguf_type = gguf.RopeScalingType.LINEAR - self.gguf_writer.add_rope_scaling_type(rope_gguf_type) - self.gguf_writer.add_rope_scaling_factor(rope_factor) - elif rope_type == "yarn" and rope_factor is not None: - rope_gguf_type = gguf.RopeScalingType.YARN - self.gguf_writer.add_rope_scaling_type(rope_gguf_type) - self.gguf_writer.add_rope_scaling_factor(rope_factor) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"]) - if (yarn_ext_factor := rope_params.get("extrapolation_factor")) is not None: - self.gguf_writer.add_rope_scaling_yarn_ext_factor(yarn_ext_factor) - if (yarn_attn_factor := rope_params.get("attention_factor", rope_params.get("attn_factor"))) is not None: - self.gguf_writer.add_rope_scaling_yarn_attn_factor(yarn_attn_factor) - if (yarn_beta_fast := rope_params.get("beta_fast")) is not None: - self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_beta_fast) - if (yarn_beta_slow := rope_params.get("beta_slow")) is not None: - self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_beta_slow) - # self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"]) - elif rope_type == "su" or rope_type == "longrope": - rope_gguf_type = gguf.RopeScalingType.LONGROPE - self.gguf_writer.add_rope_scaling_type(rope_gguf_type) - elif rope_type == "dynamic": - # HunYuan, handled in model class - pass - elif rope_type.lower() == "llama3": - # Handled in generate_extra_tensors - pass - else: - logger.warning(f"Unknown RoPE type: {rope_type}") - logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}") - - if "mrope_section" in self.rope_parameters: - mrope_section = self.rope_parameters["mrope_section"] - # Pad to 4 dimensions [time, height, width, extra] - while len(mrope_section) < 4: - mrope_section.append(0) - self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) - logger.info(f"gguf: mrope sections: {mrope_section[:4]}") - - if (rope_theta := rope_params.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) - logger.info(f"gguf: rope theta = {rope_theta}") - if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base_swa(local_rope_theta) - logger.info(f"gguf: rope theta swa = {local_rope_theta}") - if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None: - self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) - logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") - if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: - self.gguf_writer.add_layer_norm_eps(f_norm_eps) - logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") - if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None: - self.gguf_writer.add_expert_count(n_experts) - logger.info(f"gguf: expert count = {n_experts}") - if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token", "top_k_experts"], optional=True)) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - logger.info(f"gguf: experts used count = {n_experts_used}") - if (n_expert_groups := self.hparams.get("n_group")) is not None: - self.gguf_writer.add_expert_group_count(n_expert_groups) - logger.info(f"gguf: expert groups count = {n_expert_groups}") - if (n_group_used := self.hparams.get("topk_group")) is not None: - self.gguf_writer.add_expert_group_used_count(n_group_used) - logger.info(f"gguf: expert groups used count = {n_group_used}") - - if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None: - if score_func == "sigmoid": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - elif score_func == "softmax": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) - else: - raise ValueError(f"Unsupported expert score gating function value: {score_func}") - logger.info(f"gguf: expert score gating function = {score_func}") - - if (head_dim := self.hparams.get("head_dim")) is not None: - self.gguf_writer.add_key_length(head_dim) - self.gguf_writer.add_value_length(head_dim) - - self.gguf_writer.add_file_type(self.ftype) - logger.info(f"gguf: file type = {self.ftype}") - - def write_vocab(self): - if len(self.gguf_writer.tensors) != 1: - raise ValueError('Splitting the vocabulary is not supported') - - self.prepare_metadata(vocab_only=True) - self.gguf_writer.write_header_to_file(path=self.fname_out) - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.close() - - def does_token_look_special(self, token: str | bytes) -> bool: - if isinstance(token, (bytes, bytearray)): - token_text = token.decode(encoding="utf-8") - elif isinstance(token, memoryview): - token_text = token.tobytes().decode(encoding="utf-8") - else: - token_text = token - - # Some models mark some added tokens which ought to be control tokens as not special. - # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2}) - seems_special = token_text in ( - "", # deepseek-coder - "", "<2mass>", "[@BOS@]", # gemma{,-2} - ) - - seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>")) - seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>")) # deepseek-coder - - # TODO: should these be marked as UNUSED instead? (maybe not) - seems_special = seems_special or (token_text.startswith("")) # gemma{,-2} - - return seems_special - - # used for GPT-2 BPE and WordPiece vocabs - def get_vocab_base(self) -> tuple[list[str], list[int], str]: - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute] - assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute] - - tokpre = self.get_vocab_base_pre(tokenizer) - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] - added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] - - added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token: str = reverse_vocab[i] - if token in added_vocab: - # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. - # To avoid unexpected issues - we make sure to normalize non-normalized tokens - if not added_tokens_decoder[i].normalized: - previous_token = token - token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment] - if previous_token != token: - logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") - - if added_tokens_decoder[i].special or self.does_token_look_special(token): - toktypes.append(gguf.TokenType.CONTROL) - else: - # NOTE: this was added for Gemma. - # Encoding and decoding the tokens above isn't sufficient for this case. - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - toktypes.append(gguf.TokenType.NORMAL) - tokens.append(token) - - return tokens, toktypes, tokpre - - # NOTE: this function is generated by convert_hf_to_gguf_update.py - # do not modify it manually! - # ref: https://github.com/ggml-org/llama.cpp/pull/6920 - # Marker: Start get_vocab_base_pre - def get_vocab_base_pre(self, tokenizer) -> str: - # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that - # is specific for the BPE pre-tokenizer used by the model - # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can - # use in llama.cpp to implement the same pre-tokenizer - - chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' - - chktok = tokenizer.encode(chktxt) - chkhsh = sha256(str(chktok).encode()).hexdigest() - - logger.debug(f"chktok: {chktok}") - logger.debug(f"chkhsh: {chkhsh}") - - res = None - - # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script - # or pull the latest version of the model from Huggingface - # don't edit the hashes manually! - if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b": - # ref: https://huggingface.co/THUDM/glm-4-9b-chat - res = "chatglm-bpe" - if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516": - # ref: https://huggingface.co/THUDM/glm-4-9b-chat - res = "chatglm-bpe" - if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2": - # ref: https://huggingface.co/THUDM/glm-4-9b-hf - res = "glm4" - if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902": - # ref: https://huggingface.co/zai-org/GLM-4.5-Air - res = "glm4" - if chkhsh == "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267": - # ref: https://huggingface.co/zai-org/GLM-4.7-Flash - res = "glm4" - if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": - # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 - res = "minerva-7b" - if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664": - # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct - res = "hunyuan" - if chkhsh == "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6": - # ref: https://huggingface.co/tencent/Hunyuan-4B-Instruct - res = "hunyuan-dense" - if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6": - # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base - res = "falcon-h1" - if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86": - # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base - res = "falcon-h1" - if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896": - # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base - res = "falcon-h1" - if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b": - # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base - res = "falcon-h1" - if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890": - # ref: https://huggingface.co/moonshotai/Kimi-K2-Base - res = "kimi-k2" - if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c": - # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B - res = "qwen2" - if chkhsh == "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f": - # ref: https://huggingface.co/openbmb/MiniCPM-V-4_6 - res = "qwen35" - if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273": - # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer - res = "grok-2" - if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df": - # ref: https://huggingface.co/aari1995/German_Semantic_V3 - res = "jina-v2-de" - if chkhsh == "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4": - # ref: https://huggingface.co/evilfreelancer/ruGPT3XL - res = "gpt-2" - if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": - # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B - res = "llama-bpe" - if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": - # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base - res = "deepseek-llm" - if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": - # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base - res = "deepseek-coder" - if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": - # ref: https://huggingface.co/tiiuae/falcon-7b - res = "falcon" - if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 - res = "bert-bge" - if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": - # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base - res = "falcon3" - if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7": - # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5 - res = "bert-bge-large" - if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - # ref: https://huggingface.co/mosaicml/mpt-7b - res = "mpt" - if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34": - # ref: https://huggingface.co/bigcode/starcoder2-3b - res = "starcoder" - if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": - # ref: https://huggingface.co/openai-community/gpt2 - res = "gpt-2" - if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3": - # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b - res = "stablelm2" - if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": - # ref: https://huggingface.co/smallcloudai/Refact-1_6-base - res = "refact" - if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": - # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 - res = "command-r" - if chkhsh == "d772b220ace2baec124bed8cfafce0ead7d6c38a4b65ef11261cf9d5d62246d1": - # ref: https://huggingface.co/CohereLabs/tiny-aya-base - res = "tiny_aya" - if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": - # ref: https://huggingface.co/Qwen/Qwen1.5-7B - res = "qwen2" - if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf - res = "olmo" - if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": - # ref: https://huggingface.co/databricks/dbrx-base - res = "dbrx" - if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448": - # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en - res = "jina-v1-en" - if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en - res = "jina-v2-en" - if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es - res = "jina-v2-es" - if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de - res = "jina-v2-de" - if chkhsh == "a023e9fdc5a11f034d3ef515b92350e56fb2af1f66c6b6811a4444ea9bf8763d": - # ref: https://huggingface.co/jinaai/jina-embeddings-v5-text-nano - res = "jina-v5-nano" - if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": - # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct - res = "smaug-bpe" - if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360": - # ref: https://huggingface.co/LumiOpen/Poro-34B-chat - res = "poro-chat" - if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code - res = "jina-v2-code" - if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": - # ref: https://huggingface.co/LumiOpen/Viking-7B - res = "viking" - if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": - # ref: https://huggingface.co/core42/jais-13b - res = "jais" - if chkhsh == "bc5108ee1eb6a3d600cadd065f63190fbd0554dbc9e4bbd6a0d977970afc8d2a": - # ref: https://huggingface.co/inceptionai/Jais-2-8B-Chat - res = "jais-2" - if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f": - # ref: https://huggingface.co/WisdomShell/CodeShell-7B - res = "codeshell" - if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e": - # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407 - res = "tekken" - if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249": - # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M - res = "smollm" - if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7": - # ref: https://huggingface.co/bigscience/bloom - res = "bloom" - if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21": - # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small - res = "gpt3-finnish" - if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae": - # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct - res = "exaone" - if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085": - # ref: https://huggingface.co/microsoft/phi-2 - res = "phi-2" - if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": - # ref: https://huggingface.co/facebook/chameleon-7b - res = "chameleon" - if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65": - # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base - res = "roberta-bpe" - if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb": - # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct - res = "gigachat" - if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1": - # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct - res = "megrez" - if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": - # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3 - res = "deepseek-v3" - if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5": - # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B - res = "deepseek-r1-qwen" - if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e": - # ref: https://huggingface.co/Xenova/gpt-4o - res = "gpt-4o" - if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f": - # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k - res = "superbpe" - if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15": - # ref: https://huggingface.co/trillionlabs/Trillion-7B-preview - res = "trillion" - if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224": - # ref: https://huggingface.co/inclusionAI/Ling-lite - res = "bailingmoe" - if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406": - # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct - res = "llama4" - if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3": - # ref: https://huggingface.co/mistral-community/pixtral-12b - res = "pixtral" - if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec": - # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base - res = "seed-coder" - if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf": - # ref: https://huggingface.co/skt/A.X-4.0 - res = "a.x-4.0" - if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4": - # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct - res = "midm-2.0" - if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51": - # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer - res = "lfm2" - if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb": - # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B - res = "exaone4" - if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756": - # ref: https://huggingface.co/JetBrains/Mellum-4b-base - res = "mellum" - if chkhsh == "a0b64b4385f123663873756336c085744376d015ff328bb1d901598f63c44152": - # ref: https://huggingface.co/answerdotai/ModernBERT-base - res = "modern-bert" - if chkhsh == "49fc0303c9e0d2c2c565c510f64b2d9b271276acdcdadff733249eda9f7d59df": - # ref: https://huggingface.co/arcee-ai/Trinity-Tokenizer - res = "afmoe" - if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206": - # ref: https://huggingface.co/inclusionAI/Ling-mini-base-2.0 - res = "bailingmoe2" - if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e": - # ref: https://huggingface.co/ibm-granite/granite-docling-258M - res = "granite-docling" - if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95": - # ref: https://huggingface.co/MiniMaxAI/MiniMax-M2 - res = "minimax-m2" - if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665": - # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer - res = "kormo" - if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1": - # ref: https://huggingface.co/tencent/Youtu-LLM-2B - res = "youtu" - if chkhsh == "16389f0a1f51ee53e562ffd51c371dc508639ab0e4261502071836e50e223e91": - # ref: https://huggingface.co/upstage/Solar-Open-100B - res = "solar-open" - if chkhsh == "6c81ce329e0802883b22eabab0d3fa48357337ef1ecb45443828bf1f6254833f": - # ref: https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B - res = "exaone-moe" - if chkhsh == "d30d75d9059f1aa2c19359de71047b3ae408c70875e8a3ccf8c5fba56c9d8af4": - # ref: https://huggingface.co/Qwen/Qwen3.5-9B-Instruct - res = "qwen35" - if chkhsh == "b4b8ca1f9769494fbd956ebc4c249de6131fb277a4a3345a7a92c7dd7a55808d": - # ref: https://huggingface.co/jdopensource/JoyAI-LLM-Flash - res = "joyai-llm" - if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869": - # ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601 - res = "kanana2" - if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015": - # ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B - res = "f2llmv2" - if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57": - # ref: https://huggingface.co/sarvamai/sarvam-30b - res = "sarvam-moe" - - if res is None: - logger.warning("\n") - logger.warning("**************************************************************************************") - logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") - logger.warning("** There are 2 possible reasons for this:") - logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") - logger.warning("** - the pre-tokenization config has changed upstream") - logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") - logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920") - logger.warning("**") - logger.warning(f"** chkhsh: {chkhsh}") - logger.warning("**************************************************************************************") - logger.warning("\n") - raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") - - logger.debug(f"tokenizer.ggml.pre: {repr(res)}") - logger.debug(f"chkhsh: {chkhsh}") - - return res - # Marker: End get_vocab_base_pre - - def _set_vocab_none(self) -> None: - self.gguf_writer.add_tokenizer_model("none") - - def _set_vocab_gpt2(self) -> None: - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_qwen(self): - dir_model = self.dir_model - hparams = self.hparams - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams["vocab_size"] - assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute] - - tokpre = self.get_vocab_base_pre(tokenizer) - - merges = [] - vocab = {} - mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute] - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - assert len(merged) == 2 - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - - # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined - added_vocab = tokenizer.special_tokens # ty: ignore[unresolved-attribute] - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.CONTROL) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) - special_vocab.merges = merges - # only add special tokens when they were not already loaded from config.json - if len(special_vocab.special_token_ids) == 0: - special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - # this one is usually not in config.json anyway - special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_sentencepiece(self, add_to_gguf=True): - tokens, scores, toktypes = self._create_vocab_sentencepiece() - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def _create_vocab_sentencepiece(self): - from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - if not tokenizer_path.is_file(): - raise FileNotFoundError(f"File not found: {tokenizer_path}") - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.find_hparam([ - "vocab_size_per_layer_input", # gemma3n - "vocab_size", - ], optional=True) or tokenizer.vocab_size() - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - if token_id >= vocab_size: - logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}') - break - - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - for key in added_tokens_json: - token_id = added_tokens_json[key] - if token_id >= vocab_size: - logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) - for token_id, token_data in added_tokens_decoder.items(): - token_id = int(token_id) - token: str = token_data["content"] - if token_id >= vocab_size: - logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - if tokens[token_id] != token.encode("utf-8"): - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}') - if token_data.get("special") or self.does_token_look_special(token): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - else: - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - scores[token_id] = -1000.0 - tokens[token_id] = token.encode("utf-8") - - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.UNUSED) - - return tokens, scores, toktypes - - def _set_vocab_llama_hf(self): - vocab = gguf.LlamaHfVocab(self.dir_model) - tokens = [] - scores = [] - toktypes = [] - - for text, score, toktype in vocab.all_tokens(): - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - assert len(tokens) == vocab.vocab_size - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_rwkv_world(self): - assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() - vocab_size = self.hparams.get("vocab_size", 65536) - - tokens: list[bytes] = [''.encode("utf-8")] - toktypes: list[int] = [gguf.TokenType.CONTROL] - - with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f: - lines = f.readlines() - for line in lines: - parts = line.split(' ') - assert len(parts) >= 3 - token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) - token = token.encode("utf-8") if isinstance(token, str) else token - assert isinstance(token, bytes) - assert len(token) == token_len - token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" - tokens.append(token_text.encode("utf-8")) - toktypes.append(gguf.TokenType.NORMAL) - remainder = vocab_size - len(tokens) - assert remainder >= 0 - for i in range(len(tokens), vocab_size): - tokens.append(f"[PAD{i}]".encode("utf-8")) - toktypes.append(gguf.TokenType.UNUSED) - - self.gguf_writer.add_tokenizer_model("rwkv") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - if special_vocab.chat_template is None: - template_path = Path(__file__).parent / "models" / "templates" / "llama-cpp-rwkv-world.jinja" - if template_path.is_file(): - with open(template_path, "r", encoding="utf-8") as f: - template = f.read() - else: - template = "rwkv-world" - special_vocab.chat_template = template - # hack: Add '\n\n' as the EOT token to make it chat normally - special_vocab._set_special_token("eot", 261) - # hack: Override these as they have already been set (incorrectly) - special_vocab.special_token_ids["bos"] = 0 - special_vocab.special_token_ids["eos"] = 0 - - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int): - tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf" - logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") - vocab_reader = gguf.GGUFReader(tokenizer_path, "r") - - default_pre = "mpt" if model_name == "gpt-neox" else "default" - - field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL) - assert field # tokenizer model - self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8")) - - field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE) - self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre) - - field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST) - assert field # token list - self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) - - if model_name == "llama-spm": - field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES) - assert field # token scores - self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) - - field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) - assert field # token types - self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) - - if model_name != "llama-spm": - field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES) - assert field # token merges - self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) - - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None: - self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None: - self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0]) - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None: - self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0]) - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None: - self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0]) - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None: - self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0]) - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None: - self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) - - def _try_set_pooling_type(self) -> None: - # get pooling path - pooling_path = None - module_path = self.dir_model / "modules.json" - if module_path.is_file(): - with open(module_path, encoding="utf-8") as f: - modules = json.load(f) - for mod in modules: - if mod["type"].endswith("Pooling"): - pooling_path = mod["path"] - break - - mode_mapping = { - "mean": gguf.PoolingType.MEAN, - "cls": gguf.PoolingType.CLS, - "lasttoken": gguf.PoolingType.LAST, - } - - # get pooling type - if pooling_path is not None: - with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: - pooling = json.load(f) - if pooling.get("pooling_mode_mean_tokens"): - pooling_type = gguf.PoolingType.MEAN - elif pooling.get("pooling_mode_cls_token"): - pooling_type = gguf.PoolingType.CLS - elif pooling.get("pooling_mode_lasttoken"): - pooling_type = gguf.PoolingType.LAST - elif (pooling_mode := pooling.get("pooling_mode")) in mode_mapping: - pooling_type = mode_mapping[pooling_mode] - else: - raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported") - self.gguf_writer.add_pooling_type(pooling_type) - - def _set_vocab_glmedge(self): - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_glm(self): - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - # Special tokens - # Note: Using <|endoftext|> (151329) for eot causes endless generation - special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # ty: ignore[unresolved-attribute] # 151331 - special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] # 151336 - special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] # 151329 - special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # ty: ignore[unresolved-attribute] # 151338 - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_interns1(self): - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab()) # ty: ignore[unresolved-attribute] - vocab_size = self.hparams.get("vocab_size", len(vocab)) - assert max(vocab.values()) < vocab_size - - tokpre = self.get_vocab_base_pre(tokenizer) - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()} - added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] - - added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token: str = reverse_vocab[i] - if token in added_vocab: - # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. - # To avoid unexpected issues - we make sure to normalize non-normalized tokens - if not added_tokens_decoder[i].normalized: - previous_token = token - token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment] - if previous_token != token: - logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") - - if added_tokens_decoder[i].special or self.does_token_look_special(token): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - toktypes.append(gguf.TokenType.NORMAL) - tokens.append(token) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab._set_special_token("bos", 151643) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_mistral(self): - if not _mistral_common_installed: - raise ImportError(_mistral_import_error_msg) - - vocab = MistralVocab(self.dir_model) - logger.info( - f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}." - ) - - self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model) - - tokens = [] - scores = [] - toktypes = [] - - for text, score, toktype in vocab.all_tokens(): - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - assert len(tokens) == vocab.vocab_size, ( - f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})" - ) - - if vocab.tokenizer_type == MistralTokenizerType.tekken: - self.gguf_writer.add_tokenizer_pre("tekken") - self.gguf_writer.add_token_merges( - vocab.extract_vocab_merges_from_model() - ) - - logger.info( - f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}." - ) - - self.gguf_writer.add_bos_token_id(vocab.bos_id) - self.gguf_writer.add_eos_token_id(vocab.eos_id) - self.gguf_writer.add_unk_token_id(vocab.unk_id) - self.gguf_writer.add_pad_token_id(vocab.pad_id) - - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_vocab_size(vocab.vocab_size) - - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(False) - - local_template_file_path = self.dir_model / "chat_template.jinja" - - if self.is_mistral_format and local_template_file_path.is_file(): - # Ministral-3 and other new Mistral models come with chat templates. - # ref: https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512/tree/main - logger.info("Using an existing Mistral local chat template.") - - with open(local_template_file_path, "r", encoding="utf-8") as f: - template = f.read() - elif not self.is_mistral_format or not self.disable_mistral_community_chat_template: - template_dir = Path(__file__).parent / "models/templates/" - - # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`. - if self.is_mistral_format: - logger.info( - "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. " - "Mistral recommends to use `mistral-common` to perform tokenization and detokenization." - ) - template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format) - else: - logger.info("Not using a Mistral local or community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.") - template = None - - if template is not None: - self.gguf_writer.add_chat_template(template) - - def _set_vocab_plamo(self): - # PLaMo models use a custom tokenizer with a .jsonl file - tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl" - tokenizer_config_path = self.dir_model / "tokenizer_config.json" - - if not tokenizer_jsonl_path.is_file(): - raise FileNotFoundError(f"PLaMo tokenizer file not found: {tokenizer_jsonl_path}") - - # Load tokenizer config - with open(tokenizer_config_path, "r", encoding="utf-8") as f: - tokenizer_config = json.load(f) - - # Load tokens from JSONL file (actually a list format) - tokens = [] - scores = [] - toktypes = [] - - with open(tokenizer_jsonl_path, "r", encoding="utf-8") as f: - for line_num, line in enumerate(f): - if line.strip(): - token_data = json.loads(line) - # Format: [token, score, type, ?, ?, ?, ?] - token = token_data[0].encode("utf-8") - score = float(token_data[1]) - token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL" - - tokens.append(token) - scores.append(score) - - if token_type_str == "UNKNOWN": - toktypes.append(gguf.TokenType.UNKNOWN) - elif token_type_str == "CONTROL": - toktypes.append(gguf.TokenType.CONTROL) - elif token_type_str == "BYTE": - toktypes.append(gguf.TokenType.BYTE) - else: - token_str = token_data[0] - if token_str.startswith("<|plamo:") and token_str.endswith("|>"): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.NORMAL) - - vocab_size = self.hparams["vocab_size"] - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(gguf.TokenType.UNUSED) - - self.gguf_writer.add_tokenizer_model("plamo2") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None: - token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8")) - self.gguf_writer.add_bos_token_id(token_id) - if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None: - token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8")) - self.gguf_writer.add_eos_token_id(token_id) - if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None: - token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8")) - self.gguf_writer.add_pad_token_id(token_id) - if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None: - token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8")) - self.gguf_writer.add_sep_token_id(token_id) - if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None: - token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8")) - self.gguf_writer.add_unk_token_id(token_id) - - # Add <|plamo:op|> as EOT to ensure appropriate end of generation - self.gguf_writer.add_eot_token_id(4) - - self.gguf_writer.add_add_space_prefix(False) - - -class MmprojModel(ModelBase): - model_type = ModelType.MMPROJ - model_arch = gguf.MODEL_ARCH.MMPROJ - preprocessor_config: dict[str, Any] - global_config: dict[str, Any] - - n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "layers", "encoder_layers", "vt_num_hidden_layers"] - - has_vision_encoder: bool = True # by default - has_audio_encoder: bool = False - - # for models having multiple encoders, we need to separate their hparams - hparams_vision: dict[str, Any] | None = None - hparams_audio: dict[str, Any] | None = None - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - if self.model_arch != gguf.MODEL_ARCH.MMPROJ: - raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ") - - # get n_embd of the text model - if not self.is_mistral_format: - if "text_config" not in self.hparams: - self.hparams["text_config"] = {} - if "audio_config" not in self.hparams: - self.hparams["audio_config"] = {} - text_config = {**self.hparams, **self.hparams["text_config"]} - self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0)) - else: - text_config = { - k: v for k, v in self.hparams.items() if k not in ["vision_encoder", "audio_encoder"] - } - # mistral native params.json: "dim" is the text hidden size ("hidden_dim" is the FFN intermediate size) - self.n_embd_text = text_config.get("dim", 0) - - assert self.n_embd_text > 0, "n_embd not found in hparams" - - # move vision config to the top level, while preserving the original hparams in global_config - import copy - self.global_config = copy.deepcopy(self.hparams) - self.hparams_vision = self.get_vision_config() - self.hparams_audio = self.get_audio_config() - - if self.hparams_vision is None and self.hparams_audio is None: - raise ValueError("vision_config / audio_config not found in hparams") - - # for compat with vision-only models - self.hparams = self.hparams_vision or self.hparams_audio or self.hparams - - # TODO @ngxson : this is a hack to support both vision and audio encoders - have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder - self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True) - self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count) - - # load preprocessor config - self.preprocessor_config = {} - - # prefer preprocessor_config.json if possible - preprocessor_config_path = self.dir_model / "preprocessor_config.json" - if preprocessor_config_path.is_file(): - with open(preprocessor_config_path, "r", encoding="utf-8") as f: - cfg = json.load(f) - # move media_proc_cfg to root level for compat - if "media_proc_cfg" in cfg: - cfg = { - **cfg, - **cfg["media_proc_cfg"], - } - # merge configs - self.preprocessor_config = {**self.preprocessor_config, **cfg} - - # prefer processor_config.json if possible - processor_config_path = self.dir_model / "processor_config.json" - if processor_config_path.is_file(): - with open(processor_config_path, "r", encoding="utf-8") as f: - cfg = json.load(f) - # move image_processor to root level for compat - if "image_processor" in cfg: - cfg = { - **cfg, - **cfg["image_processor"], - } - # merge configs - self.preprocessor_config = {**self.preprocessor_config, **cfg} - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Skip non-multimodal tensors - if "language_model." in name: - return None - - return super().filter_tensors(item) - - def get_vision_config(self) -> dict[str, Any] | None: - config_name = "vision_config" if not self.is_mistral_format else "vision_encoder" - return self.global_config.get(config_name) - - def get_audio_config(self) -> dict[str, Any] | None: - mm_config_key = "whisper_config" if "whisper_config" in self.hparams else "audio_config" - return self.global_config.get(mm_config_key) - - def set_type(self): - self.gguf_writer.add_type(gguf.GGUFType.MMPROJ) - - def prepare_metadata(self, vocab_only: bool): - super().prepare_metadata(vocab_only=vocab_only) - - output_type: str = self.ftype.name.partition("_")[2] - - if self.fname_out.is_dir(): - fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=output_type, model_type=None) - self.fname_out = self.fname_out / f"mmproj-{fname_default}.gguf" - else: - self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) - - def set_gguf_parameters(self): - self.gguf_writer.add_file_type(self.ftype) - - if self.has_vision_encoder: - self.gguf_writer.add_clip_has_vision_encoder(True) - self.gguf_writer.add_vision_projection_dim(self.n_embd_text) - - # vision config - self.image_size = self.find_vparam(["image_size"]) - self.gguf_writer.add_vision_image_size(self.image_size) - self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"])) - self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "width", "vt_hidden_size"])) - self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"])) - self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys)) - self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "heads", "vt_num_attention_heads"])) - - # preprocessor config - image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"] - image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"] - - self.gguf_writer.add_vision_image_mean(image_mean) - self.gguf_writer.add_vision_image_std(image_std) - - if self.has_audio_encoder: - self.gguf_writer.add_clip_has_audio_encoder(True) - self.gguf_writer.add_audio_projection_dim(self.n_embd_text) - - # audio config - self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"])) - self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"])) - self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys)) - self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"])) - - if not self.has_vision_encoder and not self.has_audio_encoder: - raise ValueError("MmprojModel must have either vision or audio encoder") - - def write_vocab(self): - raise ValueError("MmprojModel does not support vocab writing") - - def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any: - assert self.hparams_vision is not None - return self._find_param(self.hparams_vision, keys, optional) - - def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any: - assert self.hparams_audio is not None - return self._find_param(self.hparams_audio, keys, optional) - - def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any: - key = next((k for k in keys if k in obj), None) - if key is not None: - return obj[key] - if optional: - return None - raise KeyError(f"could not find any of: {keys}") - - def tensor_force_quant(self, name, new_name, bid, n_dims): - del bid, name, n_dims # unused - if ".patch_embd.weight" in new_name or ".patch_merger.weight" in new_name: - return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 - return False - - -@ModelBase.register("GPTNeoXForCausalLM") -class GPTNeoXModel(TextModel): - model_arch = gguf.MODEL_ARCH.GPTNEOX - - def set_gguf_parameters(self): - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count( - int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), - ) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - assert n_head is not None - assert n_embed is not None - - if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) - data_torch = torch.cat( - ( - qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.weight") - elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name): - qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) - data_torch = torch.cat( - ( - qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.bias") - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("BloomForCausalLM", "BloomModel") -class BloomModel(TextModel): - model_arch = gguf.MODEL_ARCH.BLOOM - - def set_gguf_parameters(self): - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - assert n_head is not None - assert n_embed is not None - self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) - self.gguf_writer.add_embedding_length(n_embed) - self.gguf_writer.add_feed_forward_length(4 * n_embed) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - assert n_head is not None - assert n_embed is not None - - name = re.sub(r'transformer\.', '', name) - - if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) - data_torch = torch.cat( - ( - qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.weight") - elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): - qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) - data_torch = torch.cat( - ( - qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.bias") - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("MPTForCausalLM") -class MPTModel(TextModel): - model_arch = gguf.MODEL_ARCH.MPT - - def set_vocab(self): - try: - self._set_vocab_gpt2() - except Exception: - # Fallback for SEA-LION model - self._set_vocab_sentencepiece() - self.gguf_writer.add_add_bos_token(False) - self.gguf_writer.add_pad_token_id(3) - self.gguf_writer.add_eos_token_id(1) - self.gguf_writer.add_unk_token_id(0) - - def set_gguf_parameters(self): - self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) - self.gguf_writer.add_head_count(self.hparams["n_heads"]) - if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): - self.gguf_writer.add_head_count_kv(kv_n_heads) - self.gguf_writer.add_layer_norm_eps(1e-5) - if self.hparams["attn_config"]["clip_qkv"] is not None: - self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) - if self.hparams["attn_config"]["alibi"]: - self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) - else: - self.gguf_writer.add_max_alibi_bias(0.0) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "scales" in name: - new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales")) - new_name = new_name.replace("scales", "act.scales") - else: - new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias")) - - yield from super().modify_tensors(data_torch, new_name, bid) - - -@ModelBase.register("OrionForCausalLM") -class OrionModel(TextModel): - model_arch = gguf.MODEL_ARCH.ORION - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - raise ValueError("gguf: can not find ctx length parameter.") - - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - # note: config provides rms norm but it is actually layer norm - # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 - self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) - - -@ModelBase.register("BaichuanForCausalLM", "BaiChuanForCausalLM") -class BaichuanModel(TextModel): - model_arch = gguf.MODEL_ARCH.BAICHUAN - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight": - logger.info(f"Unpacking and permuting layer {bid}") - yield from [ - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), - self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), - self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), - self._reverse_hf_part(data_torch, 2)), - ] - else: - yield from self.modify_tensors(data_torch, self.map_tensor_name(name), bid) - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - def _reverse_hf_permute_part( - self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, - ) -> Tensor: - r = weights.shape[0] // 3 - return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) - - def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: - r = weights.shape[0] // 3 - return weights[r * n_part:r * n_part + r, ...] - - -@ModelBase.register("XverseForCausalLM") -class XverseModel(TextModel): - model_arch = gguf.MODEL_ARCH.XVERSE - - def set_vocab(self): - assert (self.dir_model / "tokenizer.json").is_file() - dir_model = self.dir_model - hparams = self.hparams - - tokens: list[bytes] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) - vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute] - # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size, - # because vocab_size is the count of items, and indexes start at 0. - max_vocab_index = max(tokenizer.get_vocab().values()) # ty: ignore[unresolved-attribute] - if max_vocab_index >= vocab_size: - raise ValueError("Vocabulary size exceeds expected maximum size.") - - reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] - added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] - - for token_id in range(vocab_size): - token_text = reverse_vocab[token_id].encode('utf-8') - # replace "\x00" to string with length > 0 - if token_text == b"\x00": - toktype = gguf.TokenType.BYTE # special - token_text = f"<{token_text}>".encode('utf-8') - elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): - toktype = gguf.TokenType.BYTE # special - elif reverse_vocab[token_id] in added_vocab: - if tokenizer.added_tokens_decoder[token_id].special: # ty: ignore[unresolved-attribute] - toktype = gguf.TokenType.CONTROL - else: - toktype = gguf.TokenType.USER_DEFINED - else: - toktype = gguf.TokenType.NORMAL - - tokens.append(token_text) - toktypes.append(toktype) - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - # HF models permute some of the tensors, so we need to undo that - if name.endswith("q_proj.weight"): - data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) - if name.endswith("k_proj.weight"): - data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) - - yield from super().modify_tensors(data_torch, name, bid) - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - -@ModelBase.register("FalconForCausalLM", "RWForCausalLM") -class FalconModel(TextModel): - model_arch = gguf.MODEL_ARCH.FALCON - - def set_gguf_parameters(self): - n_head = self.hparams.get("num_attention_heads") - if n_head is None: - n_head = self.hparams["n_head"] # old name - - n_head_kv = self.hparams.get("num_kv_heads") - if n_head_kv is None: - n_head_kv = self.hparams.get("n_head_kv", 1) # old name - - self.gguf_writer.add_context_length(2048) # not in config.json - self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # QKV tensor transform - # The original query_key_value tensor contains n_head_kv "kv groups", - # each consisting of n_head/n_head_kv query weights followed by one key - # and one value weight (shared by all query heads in the kv group). - # This layout makes it a big pain to work with in GGML. - # So we rearrange them here,, so that we have n_head query weights - # followed by n_head_kv key weights followed by n_head_kv value weights, - # in contiguous fashion. - # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py - - if "query_key_value" in name: - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1 - head_dim = self.hparams["hidden_size"] // n_head - - qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) - q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) - k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) - v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) - data_torch = torch.cat((q, k, v)).reshape_as(data_torch) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("GPTBigCodeForCausalLM") -class StarCoderModel(TextModel): - model_arch = gguf.MODEL_ARCH.STARCODER - - def set_gguf_parameters(self): - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(1) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - -@ModelBase.register("GPTRefactForCausalLM") -class RefactModel(TextModel): - model_arch = gguf.MODEL_ARCH.REFACT - - def set_vocab(self): - super().set_vocab() - - # TODO: how to determine special FIM tokens automatically? - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'eot']) - special_vocab._set_special_token("prefix", 1) - special_vocab._set_special_token("suffix", 3) - special_vocab._set_special_token("middle", 2) - special_vocab.chat_template = None # do not add it twice - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - hidden_dim = self.hparams["n_embd"] - inner_dim = 4 * hidden_dim - hidden_dim = int(2 * inner_dim / 3) - multiple_of = 256 - ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - - # refact uses Alibi. So this is from config.json which might be used by training. - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - - self.gguf_writer.add_feed_forward_length(ff_dim) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(1) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - hidden_dim = self.hparams["n_embd"] - inner_dim = 4 * hidden_dim - hidden_dim = int(2 * inner_dim / 3) - multiple_of = 256 - ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - n_head = self.hparams["n_head"] - n_head_kv = 1 - head_dim = self.hparams["n_embd"] // n_head - - if bid is not None: - if name == f"transformer.h.{bid}.attn.kv.weight": - yield from super().modify_tensors(data_torch[:n_head_kv * head_dim], self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) - yield from super().modify_tensors(data_torch[n_head_kv * head_dim:], self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) - return - if name == f"transformer.h.{bid}.attn.q.weight": - yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) - return - if name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": - yield from super().modify_tensors(data_torch[:ff_dim], self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid) - yield from super().modify_tensors(data_torch[ff_dim:], self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") -class StableLMModel(TextModel): - model_arch = gguf.MODEL_ARCH.STABLELM - - def set_vocab(self): - if (self.dir_model / "tokenizer.json").is_file(): - self._set_vocab_gpt2() - else: - # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab - self._set_vocab_qwen() - - def set_gguf_parameters(self): - hparams = self.hparams - - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) - self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) - self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) - self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) - self.gguf_writer.add_file_type(self.ftype) - - _q_norms: list[dict[str, Tensor]] | None = None - _k_norms: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams["num_key_value_heads"] - - if name.find("q_layernorm.norms") != -1: - assert bid is not None - - if self._q_norms is None: - self._q_norms = [{} for _ in range(self.block_count)] - - self._q_norms[bid][name] = data_torch - - if len(self._q_norms[bid]) >= n_head: - return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm") - else: - return - - if name.find("k_layernorm.norms") != -1: - assert bid is not None - - if self._k_norms is None: - self._k_norms = [{} for _ in range(self.block_count)] - - self._k_norms[bid][name] = data_torch - - if len(self._k_norms[bid]) >= n_kv_head: - return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm") - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"): - datas: list[Tensor] = [] - # extract the norms in order - for xid in range(n_head): - ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" - datas.append(norms[ename]) - del norms[ename] - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._q_norms is not None or self._k_norms is not None: - # flatten two `list[dict[str, Tensor]]` into a single `list[str]` - norms = ( - [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else [] - ) + ( - [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else [] - ) - if len(norms) > 0: - raise ValueError(f"Unprocessed norms: {norms}") - - -@ModelBase.register( - "LLaMAForCausalLM", - "LlamaForCausalLM", - "MistralForCausalLM", - "MixtralForCausalLM", - "VLlama3ForCausalLM", - "LlavaForConditionalGeneration", - "VoxtralForConditionalGeneration", - "IQuestCoderForCausalLM", - "LlamaModel") -class LlamaModel(TextModel): - model_arch = gguf.MODEL_ARCH.LLAMA - undo_permute = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # fix for SmolVLM2, missing `num_attention_heads` in config.json - if self.hf_arch == "VLlama3ForCausalLM": - self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) - # Mistral consolidated format has no config.json; origin_hf_arch is HF-only. - if self.is_mistral_format: - self.origin_hf_arch = None - else: - hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) - self.origin_hf_arch = hparams.get('architectures', [None])[0] - - def set_vocab(self): - if self.origin_hf_arch == "GlmasrModel": - return self._set_vocab_glmedge() - - if self.is_mistral_format: - return self._set_vocab_mistral() - - path_tekken_json = self.dir_model / "tekken.json" - path_tokenizer_json = self.dir_model / "tokenizer.json" - if path_tekken_json.is_file() and not path_tokenizer_json.is_file(): - self._set_vocab_mistral() - - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - try: - self._set_vocab_llama_hf() - except (FileNotFoundError, TypeError): - # Llama 3 - self._set_vocab_gpt2() - - # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) - if self.hparams.get("vocab_size", 32000) == 32016: - special_vocab = gguf.SpecialVocab( - self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'eot'] - ) - special_vocab._set_special_token("prefix", 32007) - special_vocab._set_special_token("suffix", 32008) - special_vocab._set_special_token("middle", 32009) - special_vocab._set_special_token("eot", 32010) - special_vocab.add_to_gguf(self.gguf_writer) - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - # Apply to granite small models only - if self.hparams.get("vocab_size", 32000) == 49152: - self.gguf_writer.add_add_bos_token(False) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - - if not self.is_mistral_format: - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - if (rope_dim := hparams.get("head_dim")) is None: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(rope_dim) - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor): - # Mirror the BF16 Q/K RoPE permutation site in modify_tensors; the NVFP4 path bypasses it. - if self.undo_permute: - n_head = self.find_hparam(["n_heads", "num_attention_heads"], optional=True) - n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"], optional=True) - if n_head is not None: - if name.endswith("q_proj.weight"): - weight = LlamaModel.permute(weight, n_head, n_head) - scale = LlamaModel.permute(scale, n_head, n_head) - elif name.endswith("k_proj.weight"): - weight = LlamaModel.permute(weight, n_head, n_kv_head) - scale = LlamaModel.permute(scale, n_head, n_kv_head) - super()._repack_nvfp4(name, weight, scale, scale2, input_scale) - - _experts: list[dict[str, Tensor]] | None = None - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if "text_model." in name: - name = name.replace("text_model.", "") # for SmolVLM - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.find_hparam(["n_heads", "num_attention_heads"]) - n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"]) - - if self.hf_arch == "LlamaModel": - name = "model." + name - - if self.undo_permute: - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - # process the experts separately - if name.find("block_sparse_moe.experts") != -1: - n_experts = self.hparams["num_local_experts"] - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for wid in ["w1", "w2", "w3"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): - if rope_params.get("rope_type", '').lower() == "llama3": - base = rope_params.get("rope_theta", 10000.0) - if (dim := self.hparams.get("head_dim")) is None: - dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - - factor = rope_params.get("factor", 8.0) - low_freq_factor = rope_params.get("low_freq_factor", 1.0) - high_freq_factor = rope_params.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) - - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor - # assert low_freq_wavelen != high_freq_wavelen # Errors for Llama4 - - rope_factors = [] - for freq in freqs: - wavelen = 2 * math.pi / freq - if wavelen < high_freq_wavelen: - rope_factors.append(1) - elif wavelen > low_freq_wavelen: - rope_factors.append(factor) - else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("ArceeForCausalLM") -class ArceeModel(LlamaModel): - model_arch = gguf.MODEL_ARCH.ARCEE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self._try_set_pooling_type() - - -@ModelBase.register("AfmoeForCausalLM") -class AfmoeModel(LlamaModel): - model_arch = gguf.MODEL_ARCH.AFMOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # MoE parameters - if (n_shared_experts := self.hparams.get("num_shared_experts")) is not None: - self.gguf_writer.add_expert_shared_count(n_shared_experts) - if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - if (n_dense_layers := self.hparams.get("num_dense_layers")) is not None: - self.gguf_writer.add_leading_dense_block_count(n_dense_layers) - - # Route normalization and scaling - if (route_norm := self.hparams.get("route_norm")) is not None: - self.gguf_writer.add_expert_weights_norm(route_norm) - if (route_scale := self.hparams.get("route_scale")) is not None: - self.gguf_writer.add_expert_weights_scale(route_scale) - - # Sliding window attention - if (sliding_window := self.hparams.get("sliding_window")) is not None: - self.gguf_writer.add_sliding_window(sliding_window) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.endswith(".expert_bias"): - name = name.replace(".expert_bias", ".expert_bias.bias") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Handle expert weights - they're already merged in the HF format - # process the experts separately - if name.find("mlp.experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["gate_proj", "up_proj", "down_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename_to_retrieve]) - del self._experts[bid][ename_to_retrieve] - - data_torch = torch.stack(datas, dim=0) - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - yield from ModelBase.modify_tensors(self, data_torch, merged_name, bid) - - return - else: - return - - yield from ModelBase.modify_tensors(self, data_torch, name, bid) - - -@ModelBase.register( - "LlavaForConditionalGeneration", # pixtral - "Mistral3ForConditionalGeneration", # mistral small 3.1 -) -class LlavaVisionModel(MmprojModel): - img_break_tok_id = -1 - use_break_tok = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.hparams.get("model_type") == "pixtral": - # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py - self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5) - if self.use_break_tok: - self.img_break_tok_id = self.get_token_id("[IMG_BREAK]") - elif self.is_mistral_format: - # hparams is already vision config here so norm_eps is only defined in global_config. - self.hparams["norm_eps"] = self.global_config.get("norm_eps", None) - assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json" - if self.use_break_tok: - self.img_break_tok_id = self.find_vparam(["image_break_token_id"]) - - # params.json may ship -1 placeholders (Mistral Medium 3.5) - # resolve the real id from the bundled tokenizer in that case - if self.img_break_tok_id < 0: - self.img_break_tok_id = self.get_mistral_token_id("[IMG_BREAK]") - else: - raise ValueError(f"Unsupported model type: {self.hparams['model_type']}") - logger.info(f"Image break token id: {self.img_break_tok_id}") - - def get_token_id(self, token: str) -> int: - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - added_tokens_decoder = json.load(f).get('added_tokens_decoder') or {} - for id_, token_data in added_tokens_decoder.items(): - if token_data.get("content") == token: - return int(id_) - # fallthrough to tokenizer.json - with open(self.dir_model / "tokenizer.json", "r", encoding="utf-8") as f: - tokenizer_json = json.load(f) - for token_data in tokenizer_json["added_tokens"]: - if token_data["content"] == token: - return int(token_data["id"]) - raise ValueError(f"Token '{token}' not found in tokenizer config.") - - def get_mistral_token_id(self, token: str) -> int: - # mistral native format ships tekken.json or a versioned spm tokenizer - tekken_file = self.dir_model / "tekken.json" - if tekken_file.is_file(): - with open(tekken_file, "r", encoding="utf-8") as f: - data = json.load(f) - for entry in data.get("special_tokens", []): - if entry.get("token_str") == token: - return int(entry["rank"]) - tokenizer_json_file = self.dir_model / "tokenizer.json" - if tokenizer_json_file.is_file(): - with open(tokenizer_json_file, "r", encoding="utf-8") as f: - data = json.load(f) - for entry in data.get("added_tokens", []): - if entry.get("content") == token: - return int(entry["id"]) - raise ValueError(f"Token '{token}' not found in mistral tokenizer files.") - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - if hparams.get("model_type") == "pixtral": - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL) - self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) - - # hidden_act - if hparams["hidden_act"] == "silu": - self.gguf_writer.add_vision_use_silu(True) - elif hparams["hidden_act"] == "gelu": - self.gguf_writer.add_vision_use_gelu(True) - else: - raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}") - - # spatial_merge_size - if "spatial_merge_size" in self.global_config: - self.gguf_writer.add_vision_spatial_merge_size(self.global_config["spatial_merge_size"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = ( - self.hparams["num_attention_heads"] if not self.is_mistral_format else self.find_vparam(["num_attention_heads"]) - ) - n_kv_head = n_head - - valid_prefixes = ( - "multi_modal_projector.", - "vision_tower.", - "vision_encoder.", - "vision_language_adapter.", - "patch_merger.", - "pre_mm_projector_norm", - ) - - if any(name.startswith(prefix) for prefix in valid_prefixes): - # process vision tensors - if name.endswith(("q_proj.weight", "q_proj.bias")) and not self.is_mistral_format: - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")) and not self.is_mistral_format: - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - yield from super().modify_tensors(data_torch, name, bid) - return - - embed_key = "embed_tokens.weight" if not self.is_mistral_format else "tok_embeddings.weight" - if self.img_break_tok_id > 0 and embed_key in name: - logger.info(f"Extracting [IMG_BREAK] token embedding from {name}") - # for pixtral model, we need to extract the [IMG_BREAK] token embedding - img_break_embd = data_torch[self.img_break_tok_id] - name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK] - yield from super().modify_tensors(img_break_embd, name, bid) - - return # skip other tensors - - -@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration") -class SmolVLMModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.hparams["model_type"] == "smolvlm_vision": - # fix for SmolVLM2, missing some keys in config.json - # default values are taken from transformers code - self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152) - self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16) - self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.IDEFICS3) - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) - self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2)) - self.gguf_writer.add_vision_use_gelu(True) - - # Add the preprocessor longest edge size - preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size) - self.gguf_writer.add_vision_preproc_image_size(preproc_image_size) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".embeddings." in name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name - - if not is_vision_tensor: - return None - - return super().filter_tensors(item) - - -@ModelBase.register( - "Llama4ForConditionalGeneration", - "Llama4ForCausalLM", -) -class Llama4Model(LlamaModel): - model_arch = gguf.MODEL_ARCH.LLAMA4 - undo_permute = False - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this - self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"] - self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"] - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"]) - self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"]) - if "layer_types" in self.hparams: - if all(lt == "full_attention" for lt in self.hparams["layer_types"]): - # all layers are full attention (for MobileLLM), disable swa - self.gguf_writer.add_sliding_window(0) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): - # split the gate_up into gate and up - if "gate_up_proj" in name: - name_up = name.replace("gate_up_proj", "up_proj.weight") - name_gate = name.replace("gate_up_proj", "gate_proj.weight") - dim_half = data_torch.shape[-1] // 2 - gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2) - yield from super().modify_tensors(gate_proj_weight, name_gate, bid) - yield from super().modify_tensors(up_proj_weight, name_up, bid) - return - - if name.endswith("down_proj"): - name += ".weight" - data_torch = data_torch.transpose(-1, -2) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Llama4ForConditionalGeneration") -class Llama4VisionModel(MmprojModel): - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LLAMA4) - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"]) - self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"])) - assert self.hparams["hidden_act"] == "gelu" - self.gguf_writer.add_vision_use_gelu(True) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if "multi_modal_projector" not in name and "vision_model" not in name: - return None - - if "positional_embedding_vlm" in name and ".weight" not in name: - name += ".weight" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "multi_modal_projector.linear_1" in name: - # despite the name with number postfix, this is a single fully connected layer - yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch) - else: - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("DeciLMForCausalLM") -class DeciModel(TextModel): - model_arch = gguf.MODEL_ARCH.DECI - - @staticmethod - def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int: - # DeciLM-specific code - intermediate_size = int(2 * ffn_mult * n_embd / 3) - return DeciModel._find_multiple(intermediate_size, 256) - - @staticmethod - def _find_multiple(n: int, k: int) -> int: - # DeciLM-specific code - if n % k == 0: - return n - return n + k - (n % k) - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B - _block_configs: list[dict[str,Any]] = self.hparams["block_configs"] - assert self.block_count == len(_block_configs) - self._num_kv_heads = list() - self._num_heads = list() - _ffn_multipliers = list() - # ***linear attention layer*** - # if n_heads_in_group is None and replace_with_linear is True - # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads - # ***attention-free layer*** - # if n_heads_in_group is None and replace_with_linear is False - # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 - # ***normal attention-layer*** - # if n_heads_in_group is not None, then - # _num_kv_heads[il] is num_attention_head // n_heads_in_group and - # _num_heads[il] is num_attention_head - # ***dummy layer*** for nemotron 253B - # if n_heads_in_group is None and ffn_mult is None - # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 and _ffn_dims is 0 - for il in range(len(_block_configs)): - if _block_configs[il]["attention"]["n_heads_in_group"] is None: - if _block_configs[il]["attention"]["replace_with_linear"] is True: - self._num_kv_heads.append(0) - self._num_heads.append(self.hparams["num_attention_heads"]) - else: - self._num_kv_heads.append(0) - self._num_heads.append(0) - else: - self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"]) - self._num_heads.append(self.hparams["num_attention_heads"]) - if _block_configs[il]["ffn"]["ffn_mult"] is None: # dummy layer - _ffn_multipliers.append(0.0) - else: - _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"]) - assert self.block_count == len(self._num_kv_heads) - assert self.block_count == len(self._num_heads) - assert self.block_count == len(_ffn_multipliers) - assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) - assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int) - assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float) - self._ffn_dims: list[int] = [ - DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"]) - for multiplier in _ffn_multipliers - ] - - def set_vocab(self): - # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's - # eos_token from '|eot_id|' to '|end_of_text|' - if self.hparams.get("vocab_size", 128256) == 128256: - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - else: - # DeciLM-7B - self._set_vocab_llama_hf() - - def set_gguf_parameters(self): - if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B - assert self.block_count == len(self._num_kv_heads) - assert self.block_count == len(self._num_heads) - assert self.block_count == len(self._ffn_dims) - if (rope_theta := self.rope_parameters.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) - self.gguf_writer.add_head_count_kv(self._num_kv_heads) - self.gguf_writer.add_head_count(self._num_heads) - self.gguf_writer.add_feed_forward_length(self._ffn_dims) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_file_type(self.ftype) - else: # DeciLM-7B - super().set_gguf_parameters() - if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B - self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"] - assert self.block_count == len(self._num_kv_heads) - self.gguf_writer.add_head_count_kv(self._num_kv_heads) - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - if (rope_dim := hparams.get("head_dim")) is None: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(rope_dim) - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - if bid is not None: - if "num_key_value_heads_per_layer" in self.hparams: - n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid] - elif "block_configs" in self.hparams: - n_kv_head = self._num_kv_heads[bid] - n_head = self._num_heads[bid] - else: - n_kv_head = self.hparams.get("num_key_value_heads") - else: - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = DeciModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = DeciModel.permute(data_torch, n_head, n_kv_head) - yield from super().modify_tensors(data_torch, name, bid) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): - if rope_params.get("rope_type", '').lower() == "llama3": - base = rope_params.get("rope_theta", 10000.0) - if (dim := self.hparams.get("head_dim")) is None: - dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - - factor = rope_params.get("factor", 8.0) - low_freq_factor = rope_params.get("low_freq_factor", 1.0) - high_freq_factor = rope_params.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) - - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor - assert low_freq_wavelen != high_freq_wavelen - - rope_factors = [] - for freq in freqs: - wavelen = 2 * math.pi / freq - if wavelen < high_freq_wavelen: - rope_factors.append(1) - elif wavelen > low_freq_wavelen: - rope_factors.append(factor) - else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) - - def prepare_tensors(self): - super().prepare_tensors() - - -@ModelBase.register("BitnetForCausalLM") -class BitnetModel(TextModel): - model_arch = gguf.MODEL_ARCH.BITNET - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(1.0) - - def weight_quant(self, weight: Tensor) -> Tensor: - dtype = weight.dtype - weight = weight.float() - scale = weight.abs().mean().clamp(min=1e-5) - iscale = 1 / scale - # TODO: multiply by the scale directly instead of inverting it twice - # (this is also unnecessarily doubly inverted upstream) - # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10 - result = (weight * iscale).round().clamp(-1, 1) / iscale - return result.type(dtype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - new_name = self.map_tensor_name(name) - - if any(self.match_model_tensor_name(new_name, key, bid) for key in [ - gguf.MODEL_TENSOR.ATTN_Q, - gguf.MODEL_TENSOR.ATTN_K, - gguf.MODEL_TENSOR.ATTN_V, - gguf.MODEL_TENSOR.ATTN_OUT, - gguf.MODEL_TENSOR.FFN_UP, - gguf.MODEL_TENSOR.FFN_DOWN, - gguf.MODEL_TENSOR.FFN_GATE, - ]): - # transform weight into 1/0/-1 (in fp32) - data_torch = self.weight_quant(data_torch) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM") -class GrokModel(TextModel): - model_arch = gguf.MODEL_ARCH.GROK - - def set_vocab(self): - if (self.dir_model / 'tokenizer.model').is_file(): - self._set_vocab_sentencepiece() - return - - if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file(): - logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer') - sys.exit(1) - - self._set_vocab_gpt2() - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - self.gguf_writer.add_attn_logit_softcapping(self.hparams.get("attn_logit_softcapping", 30.0)) - self.gguf_writer.add_router_logit_softcapping(self.hparams.get("router_logit_softcapping", 30.0)) - if (final_logit_softcap := self.hparams.get("final_logit_softcapping")): - self.gguf_writer.add_final_logit_softcapping(final_logit_softcap) - - if (rope_dim := self.hparams.get("head_dim")) is None: - rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - - if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - - # Treat "original" as "yarn", seems to have been a mistake - if self.hparams.get("rope_type") in ("yarn", "original"): - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"]) - self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"]) - self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"]) - self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"]) - self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"]) - - if temp_len := self.hparams.get("attn_temperature_len"): - self.gguf_writer.add_attn_temperature_length(temp_len) - - self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5)) - self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"]) - self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"]) - - _experts: list[dict[str, list[Tensor]]] | None = None - _cur_expert = "" - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - deferred: list[tuple[Tensor, str, int | None]] = [] - is_expert = ".moe." in name or ".block_sparse_moe.experts." in name - - if not is_expert: - deferred.append((data_torch, name, bid)) - - # process the experts separately - if is_expert or self._cur_expert: - n_experts = self.hparams["num_local_experts"] - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - # concatenate split tensors - if name in self._experts[bid]: - self._cur_expert = name - self._experts[bid][name].append(data_torch) - return - elif is_expert: - self._cur_expert = name - self._experts[bid][name] = [data_torch] - return - else: - self._cur_expert = "" - - for bid in range(self.block_count): - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for wid in [("linear", "w1", 0), ("linear_1", "w2", 1), ("linear_v", "w3", 0)]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight" - if ename not in self._experts[bid]: - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight" - tensor_list = self._experts[bid][ename] - datas.append(torch.cat(tensor_list, dim=wid[2]) if len(tensor_list) > 1 else tensor_list[0]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - - for t in deferred: - yield from super().modify_tensors(*t) - - -@ModelBase.register("DbrxForCausalLM") -class DbrxModel(TextModel): - model_arch = gguf.MODEL_ARCH.DBRX - - def set_gguf_parameters(self): - ffn_config = self.hparams["ffn_config"] - attn_config = self.hparams["attn_config"] - self.gguf_writer.add_block_count(self.block_count) - - self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"]) - - self.gguf_writer.add_head_count(self.hparams["n_heads"]) - self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"]) - - self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"]) - - self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"]) - - self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) - self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"]) - - self.gguf_writer.add_layer_norm_eps(1e-5) - - self.gguf_writer.add_file_type(self.ftype) - logger.info(f"gguf: file type = {self.ftype}") - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_expert = self.hparams["ffn_config"]["moe_num_experts"] - n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] - n_embd = self.hparams["d_model"] - - # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose - # original implementation expects (n_expert, n_ff, n_embd) for all experts weights - # But llama.cpp moe graph works differently - # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions - # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor - exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} - "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} - "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} - experts = False - - for exp_tensor_name in exp_tensor_names.keys(): - if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: - experts = True - data_torch = data_torch.view(n_expert, n_ff, n_embd) - if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None: - data_torch = data_torch.permute(*permute_tensor) - break - - # map tensor names - # In MoE models the ffn tensors are typically most of the model weights, - # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. - # Every other model has the weight names ending in .weight, - # let's assume that is the convention which is not the case for dbrx: - # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 - new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) - - yield from super().modify_tensors(data_torch, new_name, bid) - - def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: - del name, new_name, bid # unused - - return n_dims > 1 - - -@ModelBase.register("MiniCPMForCausalLM") -class MiniCPMModel(TextModel): - model_arch = gguf.MODEL_ARCH.MINICPM - - def set_gguf_parameters(self): - super().set_gguf_parameters() - embedding_scale = float(self.hparams["scale_emb"]) - self.gguf_writer.add_embedding_scale(embedding_scale) - logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}") - residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 - self.gguf_writer.add_residual_scale(residual_scale) - logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}") - logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] - self.gguf_writer.add_logit_scale(logit_scale) - logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is not None: - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - - if long_factors is None or short_factors is None: - raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - - if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - # HF models permute some of the tensors, so we need to undo that - if name.endswith(("q_proj.weight")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("MiniCPM3ForCausalLM") -class MiniCPM3Model(TextModel): - model_arch = gguf.MODEL_ARCH.MINICPM3 - - def set_gguf_parameters(self): - hparams = self.hparams - - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: - self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) - self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) - self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is not None: - rope_dims = self.hparams["qk_rope_head_dim"] - - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - - if long_factors is None or short_factors is None: - raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - - if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - -@ModelBase.register("QWenLMHeadModel") -class QwenModel(TextModel): - model_arch = gguf.MODEL_ARCH.QWEN - - @staticmethod - def token_bytes_to_string(b): - from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode # ty: ignore[unresolved-import] - byte_encoder = bytes_to_unicode() - return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) - - @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: - parts = [bytes([b]) for b in token] - while True: - min_idx = None - min_rank = None - for i, pair in enumerate(zip(parts[:-1], parts[1:])): - rank = mergeable_ranks.get(pair[0] + pair[1]) - if rank is not None and (min_rank is None or rank < min_rank): - min_idx = i - min_rank = rank - if min_rank is None or (max_rank is not None and min_rank >= max_rank): - break - assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] - return parts - - def set_vocab(self): - self._set_vocab_qwen() - - -@ModelBase.register( - "Qwen2Model", - "Qwen2ForCausalLM", - "Qwen2AudioForConditionalGeneration", - "KORMoForCausalLM", - "AudioFlamingo3ForConditionalGeneration", - "DotsOCRForCausalLM", -) -class Qwen2Model(TextModel): - model_arch = gguf.MODEL_ARCH.QWEN2 - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self._try_set_pooling_type() - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if self.hf_arch == "Qwen2Model": - name = f"model.{name}" # map to Qwen2ForCausalLM tensors - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("DreamModel") -class DreamModel(TextModel): - model_arch = gguf.MODEL_ARCH.DREAM - - def get_vocab_base(self) -> tuple[list[str], list[int], str]: - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - - vocab_dict = tokenizer.get_vocab() # ty: ignore[unresolved-attribute] - vocab_size = self.hparams.get("vocab_size", len(vocab_dict)) - assert max(vocab_dict.values()) < vocab_size - - tokpre = self.get_vocab_base_pre(tokenizer) - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()} - added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - # Check if it's a special token - treat special tokens as CONTROL tokens - if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder: - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|> - toktypes.append(gguf.TokenType.CONTROL) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - return tokens, toktypes, tokpre - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self._try_set_pooling_type() - - # Dream models use non-causal attention for diffusion - self.gguf_writer.add_causal_attention(False) - - # Add Dream-specific parameters - mask_token_id = self.hparams.get("mask_token_id") - if mask_token_id is not None: - self.gguf_writer.add_mask_token_id(mask_token_id) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Dream model tensors should be mapped directly since it's the base model - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("LLaDAModelLM") -class LLaDAModel(TextModel): - model_arch = gguf.MODEL_ARCH.LLADA - undo_permute = True - - def get_vocab_base(self) -> tuple[list[str], list[int], str]: - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - - vocab_dict = tokenizer.get_vocab() # ty: ignore[unresolved-attribute] - vocab_size = self.hparams.get("vocab_size", len(vocab_dict)) - assert max(vocab_dict.values()) < vocab_size - - tokpre = self.get_vocab_base_pre(tokenizer) - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()} - added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - # Check if it's a special token - treat special tokens as CONTROL tokens - if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder: - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|> - toktypes.append(gguf.TokenType.CONTROL) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - return tokens, toktypes, tokpre - - def set_vocab(self): - self._set_vocab_gpt2() - - # LLaDA specific parameters - self.gguf_writer.add_add_bos_token(True) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self._try_set_pooling_type() - - # Add parameters similar to LlamaModel - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - if (rope_dim := hparams.get("head_dim")) is None: - n_heads = hparams.get("num_attention_heads", hparams.get("n_heads")) - assert n_heads is not None - rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads - self.gguf_writer.add_rope_dimension_count(rope_dim) - - # Set context length for LLaDA - context_length = self.hparams.get("max_sequence_length", 4096) - self.gguf_writer.add_context_length(context_length) - - # Set embedding length (dimension size) - embedding_length = self.hparams.get("d_model", 4096) - self.gguf_writer.add_embedding_length(embedding_length) - - # Set feed forward length (MLP hidden size) - feed_forward_length = self.hparams.get("mlp_hidden_size", 12288) - self.gguf_writer.add_feed_forward_length(feed_forward_length) - - # LLaDA models use non-causal attention for diffusion, similar to Dream - self.gguf_writer.add_causal_attention(False) - - # LLaDA models don't shift their logits - self.gguf_writer.add_diffusion_shift_logits(False) - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads")) - assert n_head is not None - n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads")) - - if self.undo_permute: - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LLaDAModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LLaDAModel.permute(data_torch, n_head, n_kv_head) - - # LLaDA model tensors should be mapped directly since it's the base model - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Ernie4_5_ForCausalLM", "Ernie4_5ForCausalLM") -class Ernie4_5Model(TextModel): - model_arch = gguf.MODEL_ARCH.ERNIE4_5 - - def set_vocab(self): - self._set_vocab_sentencepiece() - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if "ernie." in name: - name = name.replace("ernie.", "model.") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - num_heads = self.hparams["num_attention_heads"] - num_kv_heads = self.hparams["num_key_value_heads"] - if (head_dim := self.hparams.get("head_dim")) is None: - head_dim = self.hparams["hidden_size"] // num_heads - - # split the qkv weights - # qkv_proj shape: [(num_heads + 2 * num_kv_heads) * head_dim, hidden_size] - if "qkv_proj" in name: - name_q = name.replace("qkv_proj.weight", "q_proj.weight") - name_k = name.replace("qkv_proj.weight", "k_proj.weight") - name_v = name.replace("qkv_proj.weight", "v_proj.weight") - total_q_dim = num_heads * head_dim - total_k_dim = num_kv_heads * head_dim - total_v_dim = num_kv_heads * head_dim - q_proj_weight, k_proj_weight, v_proj_weight = data_torch.split([total_q_dim, total_k_dim, total_v_dim], dim=0) - yield from super().modify_tensors(q_proj_weight, name_q, bid) - yield from super().modify_tensors(k_proj_weight, name_k, bid) - yield from super().modify_tensors(v_proj_weight, name_v, bid) - # split the up_gate_proj into gate and up - # up_gate_proj shape: [2 * intermediate_size, hidden_size] - elif "up_gate_proj" in name: - name_up = name.replace("up_gate_proj.weight", "up_proj.weight") - name_gate = name.replace("up_gate_proj.weight", "gate_proj.weight") - dim_half = data_torch.shape[0] // 2 - gate_proj_weight, up_proj_weight = data_torch.split(dim_half, dim=0) - yield from super().modify_tensors(gate_proj_weight, name_gate, bid) - yield from super().modify_tensors(up_proj_weight, name_up, bid) - else: - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Ernie4_5_MoeForCausalLM") -class Ernie4_5MoeModel(Ernie4_5Model): - model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE - _experts: list[dict[str, Tensor]] | None = None - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._experts = [{} for _ in range(self.block_count)] - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"]) - self.gguf_writer.add_expert_used_count(self.hparams["moe_k"]) - self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"]) - self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"]) - if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - if (shared_expert_count := self.hparams.get('moe_num_shared_experts')) is not None: - self.gguf_writer.add_expert_shared_count(shared_expert_count) - if shared_expert_count > 0 and (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None: - self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # skip Multi-Token Prediction (MTP) layers (again, same as DeepseekV2) - match = re.match(r"model.mtp_block.(\d+)", name) - if match: - return None - - # skip all other MTP tensors for now - match = re.match(r"model.mtp_emb_norm.(\d+)", name) - if match: - return None - - match = re.match(r"model.mtp_hidden_norm.(\d+)", name) - if match: - return None - - match = re.match(r"model.mtp_linear_proj.(\d+)", name) - if match: - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("mlp.experts") != -1: - n_experts = self.hparams["moe_num_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["gate_proj", "up_proj", "down_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename_to_retrieve]) - del self._experts[bid][ename_to_retrieve] - - data_torch = torch.stack(datas, dim=0) - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - yield from super().modify_tensors(data_torch, merged_name, bid) - else: - yield from ModelBase.modify_tensors(self, data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("PaddleOCRVLForConditionalGeneration") -class PaddleOCRModel(Ernie4_5Model): - model_arch = gguf.MODEL_ARCH.PADDLEOCR - - -@ModelBase.register("PaddleOCRVisionModel") -class PaddleOCRVisionModel(MmprojModel): - # PaddleOCR-VL uses a modified version of Siglip - min_pixels: int = 0 - max_pixels: int = 0 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - self.min_pixels = self.preprocessor_config["min_pixels"] - self.max_pixels = self.preprocessor_config["max_pixels"] - self.hparams_vision["image_size"] = int(math.sqrt(self.max_pixels)) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - assert self.hparams_vision is not None - hparams = self.hparams_vision - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PADDLEOCR) - self.gguf_writer.add_vision_max_pixels(self.max_pixels) - self.gguf_writer.add_vision_min_pixels(self.min_pixels) - self.gguf_writer.add_vision_use_gelu(True) - self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-6)) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if "vision_model" not in name and "mlp_AR" not in name: - return None - name = name.replace("visual.", "model.") - if "packing_position_embedding" in name: - # unused - return None - if "vision_model.head" in name: - # we don't yet support image embeddings for this model - return None - - return super().filter_tensors((name, gen)) - - -@ModelBase.register( - "Qwen2VLModel", - "Qwen2VLForConditionalGeneration", - "Qwen2_5_VLForConditionalGeneration", - "Qwen2_5OmniModel", -) -class Qwen2VLModel(TextModel): - model_arch = gguf.MODEL_ARCH.QWEN2VL - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("thinker."): - name = name.replace("thinker.", "") - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") -class Qwen2VLVisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560) - # rename config.json values - self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads") - self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth") - if "embed_dim" in self.hparams_vision: # qwen2vl - self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size") - self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim") - - def set_gguf_parameters(self): - super().set_gguf_parameters() - assert self.hparams_vision is not None - hparams = self.hparams_vision - model_type = self.global_config['model_type'] - if model_type == 'qwen2_vl': - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL) - elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni': - if model_type == 'qwen2_5_omni': - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O) - else: - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL) - self.gguf_writer.add_vision_use_silu(True) - # find n_wa_pattern (window attention pattern) - fullatt_block_indexes = hparams.get("fullatt_block_indexes") - assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl" - n_wa_pattern = fullatt_block_indexes[0] + 1 - # validate n_wa_pattern - for i in range(1, len(fullatt_block_indexes)): - if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern: - raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}") - self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern) - else: - raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}") - # default values below are taken from HF tranformers code - self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6)) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".position_embd." in new_name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if not name.startswith("visual."): - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # split QKV tensors if needed - if ".qkv." in name: - if data_torch.ndim == 2: # weight - c3, _ = data_torch.shape - else: # bias - c3 = data_torch.shape[0] - assert c3 % 3 == 0 - c = c3 // 3 - wq = data_torch[:c] - wk = data_torch[c: c * 2] - wv = data_torch[c * 2:] - yield from super().modify_tensors(wq, name.replace("qkv", "q"), bid) - yield from super().modify_tensors(wk, name.replace("qkv", "k"), bid) - yield from super().modify_tensors(wv, name.replace("qkv", "v"), bid) - elif 'patch_embed.proj.weight' in name: - # split Conv3D into Conv2Ds - c1, c2, kt, kh, kw = data_torch.shape - del c1, c2, kh, kw # unused - assert kt == 2, "Current implementation only support temporal_patch_size of 2" - yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...]) - yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]) - else: - yield from super().modify_tensors(data_torch, name, bid) - - -class Qwen25AudioModel(MmprojModel): - has_audio_encoder = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_audio is not None - self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"] - self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"] - self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"] - - def set_gguf_parameters(self): - super().set_gguf_parameters() - assert self.hparams_audio is not None - self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"]) - self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5)) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - # SinusoidsPositionEmbedding - assert self.hparams_audio is not None - max_timescale = 10000 - length = 1500 - channels = self.hparams_audio["hidden_size"] - log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1) - inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float()) - scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :] - pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32) - yield ("audio_tower.embed_positions.weight", pos_embd) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".conv" in name and ".weight" in name: - return gguf.GGMLQuantizationType.F16 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "conv1.bias" in name or "conv2.bias" in name: - # transpose conv1 and conv2 bias - data_torch = data_torch.unsqueeze(-1) - - yield from MmprojModel.modify_tensors(self, data_torch, name, bid) - - -@ModelBase.register("Qwen2_5OmniModel") -class Qwen25OmniModel(Qwen2VLVisionModel, Qwen25AudioModel): - has_audio_encoder = True - has_vision_encoder = True - - def get_vision_config(self) -> dict[str, Any] | None: - return self.global_config["thinker_config"].get("vision_config") - - def get_audio_config(self) -> dict[str, Any] | None: - return self.global_config["thinker_config"].get("audio_config") - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if not name.startswith("visual.") and not name.startswith("audio_tower."): - return None - - if name.startswith("thinker."): - name = name.replace("thinker.", "") - - if "audio_bos_eos_token" in name: - # this tensor is left unused in transformers code - # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809 - return None - - return MmprojModel.filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "visual." in name: - yield from Qwen2VLVisionModel.modify_tensors(self, data_torch, name, bid) - elif "audio_tower." in name: - yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid) - return # skip other tensors - - -@ModelBase.register("InternVisionModel") -class InternVisionModel(MmprojModel): - - min_dynamic_tiles: int = 0 - max_dynamic_tiles: int = 0 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - self.min_dynamic_tiles = self.global_config.get("min_dynamic_patch", 0) - self.max_dynamic_tiles = self.global_config.get("max_dynamic_patch", 0) - - def set_gguf_parameters(self): - assert self.hparams_vision is not None - if isinstance(self.hparams_vision['image_size'], list): - self.hparams_vision['image_size'] = self.hparams_vision['image_size'][0] - if isinstance(self.hparams_vision['patch_size'], list): - self.hparams_vision['patch_size'] = self.hparams_vision['patch_size'][0] - super().set_gguf_parameters() - - hparams = self.hparams - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL) - self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) - # hidden_act - if hparams["hidden_act"] == "silu": - self.gguf_writer.add_vision_use_silu(True) - elif hparams["hidden_act"] == "gelu": - self.gguf_writer.add_vision_use_gelu(True) - else: - raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}") - # downsample_ratio - downsample_ratio = self.global_config.get("downsample_ratio") - assert downsample_ratio is not None - self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio)) - # older models may not have min/max_dynamic_patch in config - if self.min_dynamic_tiles > 0: - self.gguf_writer.add_vision_preproc_min_tiles(self.min_dynamic_tiles) - if self.max_dynamic_tiles > 0: - self.gguf_writer.add_vision_preproc_max_tiles(self.max_dynamic_tiles) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".position_embd." in new_name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - vision_prefix = ['vision_model', 'mlp', 'model.vision_tower', 'model.multi_modal_projector'] - if not any([name.startswith(prefix) for prefix in vision_prefix]): - return None - # deal with intern-s1 special case - names_map = { - "model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias", - "model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight", - "model.multi_modal_projector.linear_1.bias": "mlp1.1.bias", - "model.multi_modal_projector.linear_1.weight": "mlp1.1.weight", - "model.multi_modal_projector.linear_2.bias": "mlp1.3.bias", - "model.multi_modal_projector.linear_2.weight": "mlp1.3.weight", - } - if name in names_map: - name = names_map[name] - # correct name - if name.startswith("vision_model"): - name = "vision_tower." + name - if (".ls" in name or ".lambda_" in name or "position_embedding" in name) and not name.endswith(".weight"): - name += ".weight" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # split QKV tensors if needed - if ".qkv." in name: - if data_torch.ndim == 2: # weight - c3, _ = data_torch.shape - else: # bias - c3 = data_torch.shape[0] - assert c3 % 3 == 0 - c = c3 // 3 - wq = data_torch[:c] - wk = data_torch[c: c * 2] - wv = data_torch[c * 2:] - yield from super().modify_tensors(wq, name.replace("attn.qkv", "self_attn.q_proj"), bid) - yield from super().modify_tensors(wk, name.replace("attn.qkv", "self_attn.k_proj"), bid) - yield from super().modify_tensors(wv, name.replace("attn.qkv", "self_attn.v_proj"), bid) - else: - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register( - "NemotronH_Nano_VL_V2", - "RADIOModel", -) -class NemotronNanoV2VLModel(MmprojModel): - # ViT-Huge architecture parameters for RADIO v2.5-h - _vit_hidden_size = 1280 - _vit_intermediate_size = 5120 - _vit_num_layers = 32 - _vit_num_heads = 16 - - def get_vision_config(self) -> dict[str, Any] | None: - # RADIO config doesn't have standard ViT parameters, so they need to be constructed manually - vision_config = self.global_config.get("vision_config") - if vision_config is None: - return None - # Add ViT-H parameters - vision_config = { - **vision_config, - "hidden_size": self._vit_hidden_size, - "intermediate_size": self._vit_intermediate_size, - "num_hidden_layers": self._vit_num_layers, - "num_attention_heads": self._vit_num_heads, - "image_size": self.global_config.get("force_image_size", 512), - } - return vision_config - - def set_gguf_parameters(self): - if "image_mean" not in self.preprocessor_config: - self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406] - if "image_std" not in self.preprocessor_config: - self.preprocessor_config["image_std"] = [0.229, 0.224, 0.225] - - super().set_gguf_parameters() - hparams = self.global_config - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL) - self.gguf_writer.add_vision_attention_layernorm_eps(1e-6) - self.gguf_writer.add_vision_use_gelu(True) - downsample_ratio = hparams.get("downsample_ratio", 0.5) - self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio)) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".position_embd." in new_name or "pos_embed" in new_name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if "input_conditioner" in name: - return None - - # mtmd does not support video yet so skip tensors related to video. - if "radio_model.model.patch_generator.video_embedder" in name: - return None - - if not name.startswith("vision_model.radio_model.model.") and not name.startswith("mlp1."): - return None - - if "patch_generator.pos_embed" in name: - if not name.endswith(".weight"): - name += ".weight" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # RADIO's pos_embed doesn't have .weight suffix, but clip.cpp expects it - if "patch_generator.pos_embed" in name: - # Downsample position embeddings for fixed 512x512 image size - import torch.nn.functional as F - n_embd = self.hparams["hidden_size"] - image_size = self.global_config.get("force_image_size", 512) - patch_size = self.hparams["patch_size"] - target_patches_per_side = image_size // patch_size # 32 - max_patches_per_side = int((data_torch.shape[1]) ** 0.5) # 128 - if target_patches_per_side != max_patches_per_side: - # Reshape to grid, interpolate, flatten back - data_torch = data_torch.reshape(1, max_patches_per_side, max_patches_per_side, n_embd) - data_torch = data_torch.permute(0, 3, 1, 2).float() # [1, n_embd, 128, 128] - data_torch = F.interpolate(data_torch, size=(target_patches_per_side, target_patches_per_side), - mode='bilinear', align_corners=True) - data_torch = data_torch.permute(0, 2, 3, 1) # [1, 32, 32, n_embd] - data_torch = data_torch.reshape(1, target_patches_per_side * target_patches_per_side, n_embd) - - # Reshape linear patch embedding to conv2d format for ggml_conv_2d - # From [n_embd, patch_size*patch_size*3] to [n_embd, 3, patch_size, patch_size] - if "patch_generator.embedder" in name: - patch_size = self.hparams["patch_size"] - n_embd = self.hparams["hidden_size"] - data_torch = data_torch.reshape(n_embd, 3, patch_size, patch_size) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("WavTokenizerDec") -class WavTokenizerDecModel(TextModel): - model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if \ - name.endswith("codebook.cluster_size") or \ - name.endswith("codebook.embed_avg") or \ - name.endswith("codebook.inited"): - logger.debug(f"Skipping {name!r}") - return None - - return super().filter_tensors(item) - - def set_vocab(self): - self._set_vocab_none() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_vocab_size (self.hparams["vocab_size"]) - self.gguf_writer.add_features_length (self.hparams["n_embd_features"]) - self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"]) - self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"]) - self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"]) - - self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"]) - self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"]) - - self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"]) - self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"]) - - self.gguf_writer.add_causal_attention(False) - - -@ModelBase.register("Qwen2MoeForCausalLM") -class Qwen2MoeModel(TextModel): - model_arch = gguf.MODEL_ARCH.QWEN2MOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") - if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None: - self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size) - logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}") - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # handle aggregated expert tensors - # GGUF stores dimensions reversed from PyTorch, so: - # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A} - # Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp) - # Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down - if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"): - mapped = f"{name}.weight" if not name.endswith(".weight") else name - # HF: [n_expert, n_embd, n_ff] -> GGML: {n_ff, n_embd, n_expert} - yield from super().modify_tensors(data_torch, mapped, bid) - return - - if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"): - if data_torch.ndim < 3 or data_torch.shape[-2] % 2 != 0: - raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}") - # HF: [n_expert, 2*n_ff, n_embd] -> split on dim=-2 - n_ff = data_torch.shape[-2] // 2 - gate = data_torch[..., :n_ff, :].contiguous() - up = data_torch[..., n_ff:, :].contiguous() - # gate/up: [n_expert, n_ff, n_embd] -> GGML: {n_embd, n_ff, n_expert} - base_name = name.removesuffix(".weight").removesuffix(".gate_up_proj") - mapped_gate = f"{base_name}.gate_proj.weight" - mapped_up = f"{base_name}.up_proj.weight" - yield from super().modify_tensors(gate, mapped_gate, bid) - yield from super().modify_tensors(up, mapped_up, bid) - return - - if name.find("experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("Qwen3ForCausalLM", "Qwen3Model") -class Qwen3Model(Qwen2Model): - model_arch = gguf.MODEL_ARCH.QWEN3 - - # extra logic for rerank models - is_rerank: bool = False - is_tied_embeddings: bool = False - token_false_id: int | None = None - token_true_id: int | None = None - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # track for intern-s1-mini - hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) - self.origin_hf_arch = hparams.get('architectures', [None])[0] - - if self._is_qwen3_reranker(): - self._find_rerank_config() - - def _is_qwen3_reranker(self) -> bool: - readme_path = self.dir_model / "README.md" - readme_text = "" - if readme_path.exists(): - with readme_path.open("r", encoding="utf-8") as f: - readme_text = f.read() - - name_hints = [ - str(self.dir_model.name), - str(self.hparams.get("_name_or_path", "")), - str(self.hparams.get("model_type", "")), - str(self.origin_hf_arch or ""), - ] - name_hints = [hint.lower() for hint in name_hints if hint] - - if "# qwen3-reranker" in readme_text.lower() or "# qwen3-vl-reranker" in readme_text.lower(): - return True - - if any("qwen3-reranker" in hint or "qwen3-vl-reranker" in hint for hint in name_hints): - return True - - return "sequenceclassification" in (self.origin_hf_arch or "").lower() - - def set_vocab(self): - # deal with intern-s1-mini - if self.origin_hf_arch == 'InternS1ForConditionalGeneration': - self._set_vocab_interns1() - return - - super().set_vocab() - - def _find_rerank_config(self): - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - - self.is_rerank = True - self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False) - self.token_false_id = tokenizer.convert_tokens_to_ids("no") # ty: ignore[unresolved-attribute, invalid-assignment] - self.token_true_id = tokenizer.convert_tokens_to_ids("yes") # ty: ignore[unresolved-attribute, invalid-assignment] - self.sep_token_id = tokenizer.convert_tokens_to_ids("|") # ty: ignore[unresolved-attribute] - - assert self.token_false_id is not None and self.token_true_id is not None - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if self.is_rerank: - self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK) - self.gguf_writer.add_classifier_output_labels(["yes", "no"]) - self.gguf_writer.add_chat_template([{ - "name": "rerank", - "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n" - "<|im_start|>user\n: Given a web search query, retrieve relevant passages that answer the query\n: {query}\n: {document}<|im_end|>\n" - "<|im_start|>assistant\n\n\n\n\n" - }]) - - def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor: - # extract "yes" and "no" tokens from the output lm_head tensor - false_row = data_torch[self.token_false_id] - true_row = data_torch[self.token_true_id] - return torch.stack([true_row, false_row], dim=0) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if self.is_rerank: - is_tied_head = self.is_tied_embeddings and "embed_tokens" in name - is_real_head = not self.is_tied_embeddings and "lm_head" in name - if is_tied_head or is_real_head: - cls_out_head = ( - gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight", - self._get_cls_out_tensor(data_torch), - ) - yield cls_out_head - if is_tied_head: - yield from super().modify_tensors(data_torch, name, bid) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Qwen3MoeForCausalLM") -class Qwen3MoeModel(Qwen2MoeModel): - model_arch = gguf.MODEL_ARCH.QWEN3MOE - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - hparams = ModelBase.load_hparams(self.dir_model, False) - self.origin_hf_arch = hparams.get('architectures', [None])[0] - - def set_vocab(self): - # deal with intern-s1 - if self.origin_hf_arch == 'InternS1ForConditionalGeneration': - self._set_vocab_interns1() - return - - super().set_vocab() - - -@ModelBase.register("Qwen3NextForCausalLM") -class Qwen3NextModel(Qwen2MoeModel): - model_arch = gguf.MODEL_ARCH.QWEN3NEXT - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_ssm_conv_kernel(self.hparams["linear_conv_kernel_dim"]) - self.gguf_writer.add_ssm_state_size(self.hparams["linear_key_head_dim"]) - self.gguf_writer.add_ssm_group_count(self.hparams["linear_num_key_heads"]) - self.gguf_writer.add_ssm_time_step_rank(self.hparams["linear_num_value_heads"]) - self.gguf_writer.add_ssm_inner_size(self.hparams["linear_value_head_dim"] * self.hparams["linear_num_value_heads"]) - self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4)) - if (rope_dim := self.hparams.get("head_dim")) is None: - rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25))) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("mtp"): - # ignore MTP layers for now - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.endswith(".A_log"): - data_torch = -torch.exp(data_torch) - elif name.endswith(".dt_bias"): - name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" - elif "conv1d" in name: - data_torch = data_torch.squeeze() - elif name.endswith("norm.weight") and not name.endswith("linear_attn.norm.weight"): - data_torch = data_torch + 1 - - if "in_proj_qkvz.weight" in name: - # original order: [q, k, v, z] * head_count - # corrected order: [q * head_count, k * head_count, v * head_count, z * head_count] - head_k_dim = self.hparams["linear_key_head_dim"] - head_v_dim = self.hparams["linear_value_head_dim"] - num_v_heads = self.hparams["linear_num_value_heads"] - num_k_heads = self.hparams["linear_num_key_heads"] - hidden_size = self.hparams["hidden_size"] - split_arg_list_qkvz = [ - head_k_dim, # q partition - head_k_dim, # k partition - (num_v_heads // num_k_heads * head_v_dim), # v partition - (num_v_heads // num_k_heads * head_v_dim), # z partition - ] - # view as (n_embd, head_count, [q+k+v+z]) - data_torch = data_torch.permute(1, 0).contiguous() - data_torch = data_torch.view(-1, num_k_heads, sum(split_arg_list_qkvz)) - # split into q, k, v, z - q, k, v, z = torch.split(data_torch, split_arg_list_qkvz, dim=-1) - # flatten dim + head_count - q = q.contiguous().view(hidden_size, -1) - k = k.contiguous().view(hidden_size, -1) - v = v.contiguous().view(hidden_size, -1) - z = z.contiguous().view(hidden_size, -1) - # stack back - qkv = torch.cat([q, k, v], dim=-1).permute(1, 0).contiguous() - z = z.permute(1, 0).contiguous() - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, ".weight"), qkv) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_GATE, bid, ".weight"), z) - else: - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("RND1") -class RND1Model(Qwen2MoeModel): - model_arch = gguf.MODEL_ARCH.RND1 - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # RND1 specific parameters - # RND1 uses bidirectional attention - self.gguf_writer.add_causal_attention(False) - - if (mask_token_id := self.hparams.get("mask_token_id")) is not None: - self.gguf_writer.add_mask_token_id(mask_token_id) - - -@ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration", "Qwen3_5ForConditionalGeneration", "Qwen3_5MoeForConditionalGeneration") -class Qwen3VLVisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.hparams_vision is None: - logger.info("No vision config found, skipping vision tensor processing") - return - - # Compute image_size if not present - if "image_size" not in self.hparams_vision: - # For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings - num_pos = self.hparams_vision.get("num_position_embeddings", 2304) - patch_size = self.hparams_vision.get("patch_size", 16) - # num_position_embeddings = (image_size / patch_size) ** 2 - # So image_size = sqrt(num_position_embeddings) * patch_size - image_size = int(num_pos**0.5 * patch_size) - self.hparams_vision["image_size"] = image_size - - # Rename config values for compatibility - self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads") - self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth") - - self.is_deepstack_layers = [False] * int(self.hparams_vision["num_hidden_layers"] or 0) - for idx in self.hparams_vision.get("deepstack_visual_indexes", []): - self.is_deepstack_layers[idx] = True - - def set_gguf_parameters(self): - super().set_gguf_parameters() - # in case mixed modalities, the arch will be handled by subclass - if not self.has_audio_encoder: - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL) - self.gguf_writer.add_vision_use_gelu(True) - - if self.hparams_vision is not None: - merge_size = self.hparams_vision.get("spatial_merge_size") - if merge_size is not None: - self.gguf_writer.add_vision_spatial_merge_size(int(merge_size)) - - # Use text config's rms_norm_eps for vision attention layernorm eps - rms_norm_eps = self.global_config.get("text_config", {}).get("rms_norm_eps", 1e-6) - self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps) - - if self.is_deepstack_layers: - self.gguf_writer.add_vision_is_deepstack_layers(self.is_deepstack_layers) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Skip text model tensors - if name.startswith("lm_head."): - return None - - # Skip MTP tensors - if name.startswith("mtp."): - return None - - if name.startswith("model.visual."): - name = name.replace("model.visual.", "visual.", 1) - - if not name.startswith("visual."): - return None - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - assert self.hparams_vision is not None - - if name.startswith("visual.deepstack_merger_list."): - prefix, rest = name.split(".", maxsplit=3)[2:] - # prefix is the layer index, convert to absolute clip layer index! - idx = self.hparams_vision.get("deepstack_visual_indexes", [])[int(prefix)] - target = rest - - tensor_type: gguf.MODEL_TENSOR - if target.startswith("norm."): - tensor_type = gguf.MODEL_TENSOR.V_DS_NORM - suffix = target.split(".", 1)[1] - elif target.startswith("linear_fc1."): - tensor_type = gguf.MODEL_TENSOR.V_DS_FC1 - suffix = target.split(".", 1)[1] - elif target.startswith("linear_fc2."): - tensor_type = gguf.MODEL_TENSOR.V_DS_FC2 - suffix = target.split(".", 1)[1] - else: - raise ValueError(f"Unexpected deepstack tensor: {name}") - - new_name = self.format_tensor_name(tensor_type, idx, suffix=f".{suffix}") - yield from super().modify_tensors(data_torch, new_name, bid) - return - - if name.startswith("visual.merger."): - suffix = name.split(".", 2)[2] - if suffix.startswith("linear_fc"): - fc_idx_str, tail = suffix.split(".", 1) - fc_num = int(fc_idx_str.replace("linear_fc", "")) - # Qwen3VL has linear_fc1 and linear_fc2 - # Map to indices 0 and 2 (matching Qwen2VL which uses indices 0 and 2) - if fc_num == 1: - fc_idx = 0 - elif fc_num == 2: - fc_idx = 2 - else: - raise ValueError(f"unexpected fc index {fc_num} in {name}") - new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, fc_idx, suffix=f".{tail}") - elif suffix.startswith("norm."): - new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{suffix.split('.', 1)[1]}") - else: - raise ValueError(f"Unexpected merger tensor: {name}") - yield (new_name, data_torch) - return - - if name == "visual.patch_embed.proj.weight": - # split Conv3D into Conv2Ds along temporal dimension - c1, c2, kt, _, _ = data_torch.shape - del c1, c2 - if kt != 2: - raise ValueError("Current implementation only supports temporal_patch_size of 2") - yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]) - yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]) - return - - if name == "visual.patch_embed.proj.bias": - # Include the bias - it's used by the C++ code - yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".bias", data_torch) - return - - yield from MmprojModel.modify_tensors(self, data_torch, name, bid) - - -@ModelBase.register("Qwen3OmniMoeForConditionalGeneration") -class Qwen3OmniMmprojModel(Qwen3VLVisionModel, Qwen25AudioModel): - has_audio_encoder = True - has_vision_encoder = True - - def get_vision_config(self) -> dict[str, Any] | None: - if self.has_vision_encoder: - return self.global_config["thinker_config"].get("vision_config") - else: - return None - - def get_audio_config(self) -> dict[str, Any] | None: - if self.has_audio_encoder: - return self.global_config["thinker_config"].get("audio_config") - else: - return None - - def set_gguf_parameters(self): - if self.has_vision_encoder: - Qwen3VLVisionModel.set_gguf_parameters(self) - self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.QWEN3VL) - if self.has_audio_encoder: - Qwen25AudioModel.set_gguf_parameters(self) - self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.QWEN3A) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Skip text model tensors - if name.startswith("lm_head."): - return None - - # Skip MTP tensors - if name.startswith("mtp."): - return None - - if name.startswith("model.visual."): - name = name.replace("model.visual.", "visual.", 1) - - if "visual." not in name and "audio_tower." not in name: - return None - - return MmprojModel.filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "visual." in name: - if not self.has_vision_encoder: - raise ValueError(f"Model does not have vision encoder, but found tensor {name}") - # need to transform vision tensor naming, so that modify_tensors() logic can be used correctly - name = name.replace("thinker.visual.", "model.visual.") - if ".merger_list." in name: - name = name.replace(".merger_list.", ".deepstack_merger_list.") - name = name.replace(".ln_q", ".norm") - name = name.replace(".mlp.0", ".linear_fc1") - name = name.replace(".mlp.2", ".linear_fc2") - elif ".merger." in name: - name = name.replace(".ln_q", ".norm") - name = name.replace(".mlp.0", ".linear_fc1") - name = name.replace(".mlp.2", ".linear_fc2") - yield from Qwen3VLVisionModel.modify_tensors(self, data_torch, name, bid) - elif "audio_tower." in name: - if not self.has_audio_encoder: - raise ValueError(f"Model does not have audio encoder, but found tensor {name}") - if "conv2d" in name and name.endswith(".bias"): - # transform conv2d bias [n_embd] --> [1, 1, n_embd] - data_torch = data_torch.unsqueeze(-1).unsqueeze(-1) - yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid) - - -@ModelBase.register("Qwen3ASRForConditionalGeneration") -class Qwen3ASRMmprojModel(Qwen3OmniMmprojModel): - has_audio_encoder = True - has_vision_encoder = False - - -@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration", "GlmOcrForConditionalGeneration") -class Glm4VVisionModel(Qwen3VLVisionModel): - def set_gguf_parameters(self): - MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters - assert self.hparams_vision is not None - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) - - hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower() - if hidden_act == "gelu": - self.gguf_writer.add_vision_use_gelu(True) - elif hidden_act == "silu": - self.gguf_writer.add_vision_use_silu(True) - - rms_norm_eps = self.hparams_vision.get("rms_norm_eps", 1e-5) - self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.startswith("visual.merger."): - yield from ModelBase.modify_tensors(self, data_torch, name, bid) - return - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("StepVLForConditionalGeneration") -class Step3VLVisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - - if not self.hparams_vision.get("intermediate_size"): - hidden_size = self.hparams_vision.get("hidden_size") or self.hparams_vision.get("width") or 0 - assert hidden_size > 0 - mlp_ratio = float(self.hparams_vision.get("mlp_ratio", 8960 / 1536)) - self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio)) - - self.preprocessor_config.setdefault("image_mean", list(_MISTRAL_COMMON_DATASET_MEAN)) - self.preprocessor_config.setdefault("image_std", list(_MISTRAL_COMMON_DATASET_STD)) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - assert self.hparams_vision is not None - - projector_stride = int(self.global_config.get("understand_projector_stride", -1)) - hidden_size = int(self.hparams_vision.get("hidden_size", self.hparams_vision.get("width", -1))) - num_layers = int(self.hparams_vision.get("num_hidden_layers", self.hparams_vision.get("layers", -1))) - assert (projector_stride, int(self.hparams_vision.get("image_size", -1)), hidden_size, num_layers) == (2, 728, 1536, 47), ( - "current Step3-VL conversion path is only validated for Step3-VL-10B" - ) - - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.STEP3VL) - self.gguf_writer.add_vision_attention_layernorm_eps(float(self.hparams_vision.get("layer_norm_eps", 1e-5))) - self.gguf_writer.add_vision_projector_scale_factor(projector_stride ** 2) - # 3024 max resize comes from step3-vl-10b processing_step3.py. - self.gguf_writer.add_vision_preproc_image_size(3024) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".position_embd." in new_name: - return gguf.GGMLQuantizationType.F32 - if ("mm.0." in new_name or "mm.1." in new_name) and new_name.endswith(".weight"): - return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith(("model.", "lm_head.")): - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.startswith("vision_model.vit_downsampler"): - match = re.match(r"vision_model\.vit_downsampler(\d+)\.(weight|bias)", name) - if match is None: - raise ValueError(f"Unexpected Step3-VL projector tensor {name!r}") - - proj_id = int(match.group(1)) - 1 - suffix = f".{match.group(2)}" - yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, proj_id, suffix=suffix), data_torch) - return - - if name == "vit_large_projector.weight": - yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ_FC), data_torch) - return - - if name.startswith("vision_model."): - if name == "vision_model.positional_embedding": - name += ".weight" - elif name.endswith(".gamma") and ".ls_" in name: - name = name.removesuffix(".gamma") + ".weight" - - name = name.replace("attn.in_proj_weight", "attn.in_proj.weight") - name = name.replace("attn.in_proj_bias", "attn.in_proj.bias") - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Qwen3VLForConditionalGeneration") -class Qwen3VLTextModel(Qwen3Model): - model_arch = gguf.MODEL_ARCH.QWEN3VL - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if "thinker_config" in self.hparams: - vision_config = self.hparams["thinker_config"].get("vision_config", {}) - else: - vision_config = self.hparams.get("vision_config", {}) - deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) - self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - name = name.replace("thinker.", "") - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("StepVLForConditionalGeneration") -class Step3VLTextModel(Qwen3Model): - model_arch = gguf.MODEL_ARCH.QWEN3 - - -@ModelBase.register("Qwen3VLMoeForConditionalGeneration") -class Qwen3VLMoeTextModel(Qwen3MoeModel): - model_arch = gguf.MODEL_ARCH.QWEN3VLMOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - vision_config = self.hparams.get("vision_config", {}) - deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) - self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - name = name.replace("thinker.", "") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Qwen3VL has transposed packed tensors, so we treat it differently from general Qwen2MoE packed tensors - if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"): - mapped = f"{name}.weight" if not name.endswith(".weight") else name - permuted = data_torch.permute(0, 2, 1).contiguous() - yield from ModelBase.modify_tensors(self, permuted, mapped, bid) - return - - if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"): - if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0: - raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}") - split_dim = data_torch.shape[-1] // 2 - gate = data_torch[..., :split_dim].contiguous() - up = data_torch[..., split_dim:].contiguous() - # Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768) - # Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128} - # Need PyTorch: (128, 768, 2048) [reversed of GGML] - # So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048) - base_name = name.removesuffix(".weight") - base = base_name.rsplit('.', 1)[0] - mapped_gate = f"{base}.gate_proj.weight" - mapped_up = f"{base}.up_proj.weight" - perm_gate = gate.permute(0, 2, 1).contiguous() - perm_up = up.permute(0, 2, 1).contiguous() - yield from ModelBase.modify_tensors(self, perm_gate, mapped_gate, bid) - yield from ModelBase.modify_tensors(self, perm_up, mapped_up, bid) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Qwen3OmniMoeForConditionalGeneration") -class Qwen3OmniMoeTextModel(Qwen3VLMoeTextModel): - model_arch = gguf.MODEL_ARCH.QWEN3VLMOE - - def set_vocab(self): - super().set_vocab() - # correct BOS/EOS tokens - with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: - tokenizer_config = json.load(f) - added_tokens = tokenizer_config.get("added_tokens_decoder", {}) - for token_id, data in added_tokens.items(): - if data.get("content") == "<|im_end|>": - self.gguf_writer.add_bos_token_id(int(token_id)) - self.gguf_writer.add_eos_token_id(int(token_id)) - break - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_num_deepstack_layers(0) - - -@ModelBase.register("Qwen3ASRForConditionalGeneration") -class Qwen3ASRTextModel(Qwen3VLTextModel): - model_arch = gguf.MODEL_ARCH.QWEN3VL - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_num_deepstack_layers(0) - - def set_vocab(self): - super().set_vocab() - # fix chat template, use correct chatml format - self.gguf_writer.add_chat_template("{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}") - # correct BOS/EOS tokens - with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: - tokenizer_config = json.load(f) - added_tokens = tokenizer_config.get("added_tokens_decoder", {}) - for token_id, data in added_tokens.items(): - if data.get("content") == "<|im_end|>": - self.gguf_writer.add_bos_token_id(int(token_id)) - self.gguf_writer.add_eos_token_id(int(token_id)) - break - - -class _LinearAttentionVReorderBase(Qwen3NextModel): - model_arch = gguf.MODEL_ARCH.QWEN3NEXT # overridden by subclasses - """reorders V heads from grouped to tiled order for ggml broadcast - - see https://github.com/ggml-org/llama.cpp/pull/19468#discussion_r2786394306 - - Linear attention may has num_k_heads < num_v_heads. The HF weights store - V heads grouped by K head: [G0_v0..v{r-1}, G1_v0..v{r-1}, ...]. - ggml binary ops use tiled broadcast: [K0, K1, ..., K0, K1, ...]. - We reorder V heads to tiled order so ggml_repeat can replace the expensive - interleaved repeat: [G0_v0, G1_v0, ..., G0_v1, G1_v1, ...]. - """ - - @staticmethod - def _reorder_v_heads(tensor: Tensor, dim: int, num_k_heads: int, num_v_per_k: int, head_dim: int) -> Tensor: - """Reorder V heads from grouped (by K head) to tiled order along the given dimension.""" - shape = list(tensor.shape) - if dim < 0: - dim += len(shape) - new_shape = shape[:dim] + [num_k_heads, num_v_per_k, head_dim] + shape[dim + 1:] - tensor = tensor.reshape(*new_shape) - perm = list(range(len(new_shape))) - perm[dim], perm[dim + 1] = perm[dim + 1], perm[dim] - return tensor.permute(*perm).contiguous().reshape(*shape) - - def _transform_nvfp4_weight(self, name: str, weight: Tensor, scale: Tensor) -> tuple[Tensor, Tensor]: - if not name.endswith(( - ".linear_attn.in_proj_qkv.weight", - ".linear_attn.in_proj_z.weight", - ".linear_attn.in_proj_a.weight", - ".linear_attn.in_proj_b.weight", - ".linear_attn.out_proj.weight", - )): - return weight, scale - - num_k_heads = self.hparams["linear_num_key_heads"] - num_v_heads = self.hparams["linear_num_value_heads"] - head_k_dim = self.hparams["linear_key_head_dim"] - head_v_dim = self.hparams["linear_value_head_dim"] - num_v_per_k = num_v_heads // num_k_heads - - def unpack_nibbles(qs: Tensor) -> Tensor: - lo = torch.bitwise_and(qs, 0x0F) - hi = torch.bitwise_right_shift(qs, 4) - return torch.stack((lo, hi), dim=-1).reshape(*qs.shape[:-1], qs.shape[-1] * 2) - - def pack_nibbles(codes: Tensor) -> Tensor: - codes = codes.reshape(*codes.shape[:-1], codes.shape[-1] // 2, 2) - lo = torch.bitwise_and(codes[..., 0], 0x0F) - hi = torch.bitwise_left_shift(torch.bitwise_and(codes[..., 1], 0x0F), 4) - return torch.bitwise_or(lo, hi).contiguous() - - def apply_col_perm(qs: Tensor, scales: Tensor, col_perm: Tensor) -> tuple[Tensor, Tensor]: - assert qs.ndim >= 2 - assert scales.ndim >= 2 - - k = qs.shape[-1] * 2 - assert col_perm.numel() == k - assert k % 16 == 0 - - group_cols = col_perm.reshape(-1, 16) - group_starts = group_cols[:, 0] - expected = group_starts.unsqueeze(1) + torch.arange(16, dtype=col_perm.dtype) - assert torch.equal(group_cols, expected) - assert torch.all(group_starts % 16 == 0) - - group_perm = (group_starts // 16).to(dtype=torch.long) - expected_groups = torch.arange(scales.shape[-1], dtype=torch.long) - assert group_perm.numel() == scales.shape[-1] - assert torch.equal(torch.sort(group_perm).values, expected_groups) - - codes = unpack_nibbles(qs) - codes = codes.index_select(-1, col_perm.to(device=qs.device, dtype=torch.long)) - qs = pack_nibbles(codes) - scales = scales.index_select(-1, group_perm.to(device=scales.device)) - return qs, scales - - def reorder_rows(qs: Tensor, scales: Tensor, head_dim: int) -> tuple[Tensor, Tensor]: - row_perm = self._reorder_v_heads( - torch.arange(num_v_heads * head_dim, dtype=torch.long).unsqueeze(-1), - 0, num_k_heads, num_v_per_k, head_dim, - ).squeeze(-1) - return ( - qs.index_select(0, row_perm.to(device=qs.device)), - scales.index_select(0, row_perm.to(device=scales.device)), - ) - - if name.endswith(".linear_attn.in_proj_qkv.weight"): - q_dim = head_k_dim * num_k_heads - k_dim = head_k_dim * num_k_heads - q = weight[:q_dim] - k = weight[q_dim:q_dim + k_dim] - v = weight[q_dim + k_dim:] - q_scale = scale[:q_dim] - k_scale = scale[q_dim:q_dim + k_dim] - v_scale = scale[q_dim + k_dim:] - v, v_scale = reorder_rows(v, v_scale, head_v_dim) - return torch.cat([q, k, v], dim=0), torch.cat([q_scale, k_scale, v_scale], dim=0) - - if name.endswith(".linear_attn.in_proj_z.weight"): - weight, scale = reorder_rows(weight, scale, head_v_dim) - elif name.endswith((".linear_attn.in_proj_a.weight", ".linear_attn.in_proj_b.weight")): - weight, scale = reorder_rows(weight, scale, 1) - elif name.endswith(".linear_attn.out_proj.weight"): - col_perm = self._reorder_v_heads( - torch.arange(num_v_heads * head_v_dim, dtype=torch.long).unsqueeze(0), - 1, num_k_heads, num_v_per_k, head_v_dim, - ).squeeze(0) - weight, scale = apply_col_perm(weight, scale, col_perm) - - return weight, scale - - def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor): - weight, scale = self._transform_nvfp4_weight(name, weight, scale) - super()._repack_nvfp4(name, weight, scale, scale2, input_scale) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - num_k_heads = self.hparams.get("linear_num_key_heads", 0) - num_v_heads = self.hparams.get("linear_num_value_heads", 0) - - if num_k_heads > 0 and num_v_heads > 0 and num_k_heads != num_v_heads and "linear_attn." in name: - head_k_dim = self.hparams["linear_key_head_dim"] - head_v_dim = self.hparams["linear_value_head_dim"] - num_v_per_k = num_v_heads // num_k_heads - - if ".in_proj_qkv." in name: - # QKV weight: reorder only the V rows - q_dim = head_k_dim * num_k_heads - k_dim = head_k_dim * num_k_heads - q = data_torch[:q_dim] - k = data_torch[q_dim:q_dim + k_dim] - v = data_torch[q_dim + k_dim:] - v = self._reorder_v_heads(v, 0, num_k_heads, num_v_per_k, head_v_dim) - data_torch = torch.cat([q, k, v], dim=0) - - elif ".in_proj_z." in name: - # Z gate weight: reorder rows (num_v_heads * head_v_dim) - data_torch = self._reorder_v_heads(data_torch, 0, num_k_heads, num_v_per_k, head_v_dim) - - elif ".in_proj_b." in name or ".in_proj_a." in name: - # Beta/Alpha weight: reorder rows (num_v_heads, head_dim=1) - data_torch = self._reorder_v_heads(data_torch, 0, num_k_heads, num_v_per_k, 1) - - elif ".A_log" in name or ".dt_bias" in name or ".dt_proj" in name: - # A_log / dt_bias: 1D parameters with num_v_heads elements - if data_torch.ndim == 1: - data_torch = self._reorder_v_heads( - data_torch.unsqueeze(-1), 0, num_k_heads, num_v_per_k, 1 - ).squeeze(-1) - else: - data_torch = self._reorder_v_heads(data_torch, -1, num_k_heads, num_v_per_k, 1) - - elif ".conv1d" in name: - # Conv1d kernel: reorder only the V channel portion - data = data_torch.squeeze() - qk_channels = head_k_dim * num_k_heads * 2 - qk_part = data[:qk_channels] - v_part = data[qk_channels:] - v_part = self._reorder_v_heads(v_part, 0, num_k_heads, num_v_per_k, head_v_dim) - data_torch = torch.cat([qk_part, v_part], dim=0) - - elif ".out_proj." in name: - # Out projection weight: reorder columns (input dimension) - data_torch = self._reorder_v_heads(data_torch, 1, num_k_heads, num_v_per_k, head_v_dim) - - yield from super().modify_tensors(data_torch, name, bid) - - -class _Qwen35MRopeMixin: - # Qwen3.5 always applies interleaved MRoPE (see Qwen3_5RotaryEmbedding in transformers); - # the upstream default mrope_section is [11, 11, 10] and llama.cpp's QWEN35 / QWEN35MOE - # loaders treat qwen35.rope.dimension_sections as required, so make sure it is always - # written even when a particular checkpoint omits the field in `rope_parameters`. - _QWEN35_DEFAULT_MROPE_SECTION = [11, 11, 10, 0] - - gguf_writer: gguf.GGUFWriter - rope_parameters: dict - - def set_gguf_parameters(self): - super().set_gguf_parameters() # ty: ignore[unresolved-attribute] - if "mrope_section" not in self.rope_parameters: - self.gguf_writer.add_rope_dimension_sections(self._QWEN35_DEFAULT_MROPE_SECTION) - - -@ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM") -class Qwen3_5TextModel(_Qwen35MRopeMixin, _LinearAttentionVReorderBase): - model_arch = gguf.MODEL_ARCH.QWEN35 - - -@ModelBase.register("Qwen3_5MoeForConditionalGeneration", "Qwen3_5MoeForCausalLM") -class Qwen3_5MoeTextModel(_Qwen35MRopeMixin, _LinearAttentionVReorderBase): - model_arch = gguf.MODEL_ARCH.QWEN35MOE - - -# MiniCPM-V 4.6: text tower is Qwen3.5 (linear+full hybrid attention) wrapped under -# `model.language_model.*`; vision tower is SigLIP + a window-attention ViT merger -# + a final DownsampleMLP merger. The same HF arch is registered twice below: once as -# the LM (text mode) and once as the mmproj (vision mode), mirroring the Qwen3-VL setup. - -@ModelBase.register("MiniCPMV4_6ForConditionalGeneration") -class MiniCPMV4_6TextModel(Qwen3_5TextModel): - model_arch = gguf.MODEL_ARCH.QWEN35 - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("model.merger."): - return None - # MTP tensors are not used at inference yet; align with Qwen3Next behaviour - if name.startswith("mtp"): - return None - - return super().filter_tensors(item) - - -@ModelBase.register("MiniCPMV4_6ForConditionalGeneration") -class MiniCPMV4_6VisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.hparams_vision is not None: - # In MiniCPM-V 4.6 `vision_config.image_size` (980) describes the SigLIP - # positional embedding bucket grid (70 x 70), while the per-slice processing - # resolution is the preprocessor's `scale_resolution` (typically 448). - # The CLIP loader in tools/mtmd/clip.cpp consumes `clip.vision.image_size` - # as the slice size and warmup resolution, so report `scale_resolution` there - # to match the upstream MiniCPMV4_6ImageProcessorPil slicing rules. - scale_resolution = self.preprocessor_config.get("scale_resolution") - if scale_resolution is not None: - self.hparams_vision["image_size"] = int(scale_resolution) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - assert self.hparams_vision is not None - - # projector type string is consumed by clip_projector_type_from_string() in clip.cpp - # (mapped to PROJECTOR_TYPE_MINICPMV4_6). - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MINICPMV4_6) - - # ViT merger 2x2 + final merger 2x2 = 4x spatial merge per dimension; used for slice alignment - self.gguf_writer.add_vision_projector_scale_factor(4) - - # borrow wa_layer_indexes for vit_merger insertion point - insert_layer_id = int(self.global_config.get( - "insert_layer_id", self.hparams_vision.get("insert_layer_id", 6))) - self.gguf_writer.add_vision_wa_layer_indexes([insert_layer_id]) - - # SigLIP vision body uses gelu_pytorch_tanh, which matches ggml_gelu (tanh approx). - self.gguf_writer.add_vision_use_gelu(True) - self.gguf_writer.add_vision_attention_layernorm_eps( - self.hparams_vision.get("layer_norm_eps", 1e-6)) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # lm_head / MTP -> belong to the LM file - if name.startswith(("lm_head.", "mtp")): - return None - - return super().filter_tensors(item) - - -@ModelBase.register("GPT2LMHeadModel") -class GPT2Model(TextModel): - model_arch = gguf.MODEL_ARCH.GPT2 - - def set_gguf_parameters(self): - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.hparams["n_ctx"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # we don't need these - if name.endswith((".attn.bias", ".attn.masked_bias")): - yield from super().modify_tensors(data_torch, name, bid) - return - - if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): - data_torch = data_torch.transpose(1, 0) - - new_name = self.map_tensor_name(name) - - yield from super().modify_tensors(data_torch, new_name, bid) - - -@ModelBase.register("RuGPT3XLForCausalLM") -class RuGPT3XLModel(TextModel): - model_arch = gguf.MODEL_ARCH.GPT2 - - _qkv_parts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Fuse separate Q, K, V projections into a single QKV tensor - if ".self_attn.q_proj." in name or ".self_attn.k_proj." in name or ".self_attn.v_proj." in name: - suffix = "weight" if name.endswith(".weight") else "bias" - part = "q" if ".q_proj." in name else ("k" if ".k_proj." in name else "v") - key = f"{part}.{suffix}" - - assert bid is not None - if self._qkv_parts is None: - self._qkv_parts = [{} for _ in range(self.block_count)] - self._qkv_parts[bid][key] = data_torch - - q_key, k_key, v_key = f"q.{suffix}", f"k.{suffix}", f"v.{suffix}" - if all(k in self._qkv_parts[bid] for k in [q_key, k_key, v_key]): - q = self._qkv_parts[bid].pop(q_key) - k = self._qkv_parts[bid].pop(k_key) - v = self._qkv_parts[bid].pop(v_key) - data_torch = torch.cat([q, k, v], dim=0) - name = self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, f".{suffix}") - logger.debug(f"Fused Q/K/V {suffix} for layer {bid} -> {name}") - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._qkv_parts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - parts = [f"({i}){k}" for i, d in enumerate(self._qkv_parts) for k in d.keys()] - if len(parts) > 0: - raise ValueError(f"Unprocessed Q/K/V parts: {parts}") - - -@ModelBase.register("PhiForCausalLM") -class Phi2Model(TextModel): - model_arch = gguf.MODEL_ARCH.PHI2 - - def set_gguf_parameters(self): - rot_pct = self.find_hparam(["partial_rotary_factor"]) - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - - self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) - - self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(4 * n_embd) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_add_bos_token(False) - - -@ModelBase.register("Phi3ForCausalLM", "Phi4ForCausalLMV") -class Phi3MiniModel(TextModel): - model_arch = gguf.MODEL_ARCH.PHI3 - - def set_vocab(self): - # Phi-4 model uses GPT2Tokenizer - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - tokenizer_class = tokenizer_config_json['tokenizer_class'] - if tokenizer_class == 'GPT2Tokenizer': - return self._set_vocab_gpt2() - - from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - if not tokenizer_path.is_file(): - raise ValueError(f'Error: Missing {tokenizer_path}') - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - - for key in added_tokens_json: - token_id = added_tokens_json[key] - if token_id >= vocab_size: - logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) - for token_id, foken_data in added_tokens_decoder.items(): - token_id = int(token_id) - token = foken_data["content"].encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - if tokens[token_id] != token: - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') - tokens[token_id] = token - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - if foken_data.get("special"): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - - tokenizer_file = self.dir_model / 'tokenizer.json' - if tokenizer_file.is_file(): - with open(tokenizer_file, "r", encoding="utf-8") as f: - tokenizer_json = json.load(f) - added_tokens = tokenizer_json.get("added_tokens", []) - for foken_data in added_tokens: - token_id = int(foken_data["id"]) - token = foken_data["content"].encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - if tokens[token_id] != token: - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') - tokens[token_id] = token - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - if foken_data.get("special"): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) - rms_eps = self.find_hparam(["rms_norm_eps"]) - max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) - orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rot_pct = self.hparams.get("partial_rotary_factor", 1.0) - rope_dims = int(rot_pct * n_embd) // n_head - - self.gguf_writer.add_context_length(max_pos_embds) - self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) - self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"])) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_rms_eps(rms_eps) - self.gguf_writer.add_rope_dimension_count(rope_dims) - self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters)["rope_theta"]) - self.gguf_writer.add_file_type(self.ftype) - sliding_window = self.hparams.get("sliding_window") - # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models - if sliding_window is None: - sliding_window = 0 - self.gguf_writer.add_sliding_window(sliding_window) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) - orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rot_pct = self.hparams.get("partial_rotary_factor", 1.0) - rope_dims = int(rot_pct * n_embd) // n_head - - # write rope scaling for long context (128k) model - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is None: - return - - scale = max_pos_embds / orig_max_pos_embds - - rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower() - if len(rope_scaling_type) == 0: - raise KeyError('Missing the required key rope_scaling.type') - - if rope_scaling_type == 'su' or rope_scaling_type == 'longrope': - attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 - elif rope_scaling_type == 'yarn': - attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 - else: - raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet') - - self.gguf_writer.add_rope_scaling_attn_factors(attn_factor) - - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - - if long_factors is None or short_factors is None: - raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - - if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.') - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) - - -@ModelBase.register("Phi4ForCausalLMV") -class Phi4VisionMmprojModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - - self.vision_total_layers = int(self.find_vparam(self.n_block_keys)) - if self.vision_total_layers < 2: - raise ValueError( - f"Phi-4 vision mmproj conversion requires at least 2 vision layers, got {self.vision_total_layers}" - ) - - # Phi-4 uses SigLIP2 hidden_states[-2], so export one fewer encoder block and - # drop post-layernorm/head weights. This makes the GGUF runtime output match - # the feature map consumed by the patched siglip.cpp Phi-4 projector path. - self.vision_export_layers = self.vision_total_layers - 1 - self.vision_last_layer_idx = self.vision_total_layers - 1 - - for key in self.n_block_keys: - if key in self.hparams_vision: - self.hparams_vision[key] = self.vision_export_layers - break - - self.block_count = self.vision_export_layers - self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count) - - patch_size = self.preprocessor_config.get("patch_size") - if patch_size is None: - raise KeyError("Phi-4 vision mmproj conversion requires patch_size in preprocessor_config.json") - - self.hparams_vision["patch_size"] = patch_size - - pos_emb_name = next( - ( - name for name in self.model_tensors - if name.endswith("vision_model.embeddings.position_embedding.weight") - ), - None, - ) - if pos_emb_name is None: - raise KeyError("Phi-4 vision mmproj conversion could not find position_embedding.weight") - - pos_emb_shape = self.model_tensors[pos_emb_name]().shape - base_grid_tokens = int(pos_emb_shape[0]) - grid_side = math.isqrt(base_grid_tokens) - if grid_side * grid_side != base_grid_tokens: - raise ValueError(f"Unexpected Phi-4 position embedding shape: {tuple(pos_emb_shape)}") - - self.hparams_vision["image_size"] = grid_side * patch_size - - min_num_patches = self.preprocessor_config.get("min_num_patches", self.global_config.get("min_num_patches")) - max_num_patches = self.preprocessor_config.get("max_num_patches", self.global_config.get("max_num_patches")) - if min_num_patches is None or max_num_patches is None: - raise KeyError("Phi-4 vision mmproj conversion requires min_num_patches and max_num_patches") - - self.min_pixels = int(min_num_patches) * patch_size * patch_size - self.max_pixels = int(max_num_patches) * patch_size * patch_size - - def set_gguf_parameters(self): - super().set_gguf_parameters() - assert self.hparams_vision is not None - - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PHI4) - self.gguf_writer.add_vision_min_pixels(self.min_pixels) - self.gguf_writer.add_vision_max_pixels(self.max_pixels) - self.gguf_writer.add_vision_use_gelu(True) - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6)) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - name = name.replace("model.vision_tower.vision_tower.", "vision_tower.") - - if not name.startswith(("vision_tower.", "model.mm_projector.", "mm_projector.")): - return None - - if ".vision_model.head." in name: - return None - - if ".vision_model.post_layernorm." in name: - return None - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.startswith("vision_tower."): - if bid is not None and bid == self.vision_last_layer_idx: - return - - if name.endswith("vision_model.embeddings.patch_embedding.weight"): - assert self.hparams_vision is not None - if data_torch.ndim != 2: - raise ValueError(f"Unexpected Phi-4 patch embedding shape: {tuple(data_torch.shape)}") - - patch_area = self.hparams_vision["patch_size"] ** 2 - in_features = data_torch.shape[1] - if in_features % patch_area != 0: - raise ValueError( - f"Phi-4 patch embedding input dim {in_features} is not divisible by patch area {patch_area}" - ) - - num_channels = in_features // patch_area - patch_size = self.hparams_vision["patch_size"] - data_torch = data_torch.view(data_torch.shape[0], patch_size, patch_size, num_channels) - data_torch = data_torch.permute(0, 3, 1, 2) - - yield from super().modify_tensors(data_torch, name, bid) - return - - if name.startswith(("model.mm_projector.", "mm_projector.")): - local_name = name - local_name = local_name.replace("model.mm_projector.", "") - local_name = local_name.replace("mm_projector.", "") - - if not (local_name.startswith("0.") or local_name.startswith("2.")): - return - - suffix = ".bias" if local_name.endswith(".bias") else ".weight" - mm_idx = int(local_name.split(".", maxsplit=1)[0]) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_idx, suffix=suffix), data_torch) - return - - return - - -@ModelBase.register("PhiMoEForCausalLM") -class PhiMoeModel(Phi3MiniModel): - model_arch = gguf.MODEL_ARCH.PHIMOE - - _experts: list[dict[str, Tensor]] | None = None - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_expert_used_count(self.find_hparam(["num_experts_per_tok", "num_experts_per_token"])) - self.gguf_writer.add_expert_count(self.find_hparam(["num_local_experts", "num_experts"])) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("block_sparse_moe.experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["w1", "w2", "w3"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("PlamoForCausalLM") -class PlamoModel(TextModel): - model_arch = gguf.MODEL_ARCH.PLAMO - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - hparams = self.hparams - - self.gguf_writer.add_context_length(4096) # not in config.json - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong - self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - def shuffle_attn_q_weight(self, data_torch): - assert data_torch.size() == (5120, 5120) - data_torch = data_torch.reshape(8, 5, 128, 5120) - data_torch = torch.permute(data_torch, (1, 0, 2, 3)) - data_torch = torch.reshape(data_torch, (5120, 5120)) - return data_torch - - def shuffle_attn_output_weight(self, data_torch): - assert data_torch.size() == (5120, 5120) - data_torch = data_torch.reshape(5120, 8, 5, 128) - data_torch = torch.permute(data_torch, (0, 2, 1, 3)) - data_torch = torch.reshape(data_torch, (5120, 5120)) - return data_torch - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - new_name = self.map_tensor_name(name) - - # shuffle for broadcasting of gqa in ggml_mul_mat - if new_name.endswith("attn_q.weight"): - data_torch = self.shuffle_attn_q_weight(data_torch) - elif new_name.endswith("attn_output.weight"): - data_torch = self.shuffle_attn_output_weight(data_torch) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Plamo2ForCausalLM", "PLaMo2ForCausalLM") -class Plamo2Model(TextModel): - model_arch = gguf.MODEL_ARCH.PLAMO2 - - def set_vocab(self): - self._set_vocab_plamo() - - def set_gguf_parameters(self): - hparams = self.hparams - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - - # Which layers are Mamba layers - # PLaMo 2 uses mamba_step to indicate the pattern (e.g., 2 means every other layer) - # This logic matches modeling_plamo.py's is_mamba function - mamba_step = hparams.get("mamba_step", 2) - mamba_enabled = hparams.get("mamba_enabled", True) - num_key_value_heads = [] - num_attention_heads = [] - - if mamba_enabled: - for i in range(self.block_count): - if self.block_count <= (mamba_step // 2): - # use attention in last layer - is_mamba = (i != self.block_count - 1) - else: - is_mamba = (i % mamba_step) != (mamba_step // 2) - if is_mamba: - num_key_value_heads.append(0) - num_attention_heads.append(0) - else: - num_key_value_heads.append(hparams.get("num_key_value_heads", 4)) - num_attention_heads.append(hparams.get("num_attention_heads", 32)) - - if num_key_value_heads and num_attention_heads: - self.gguf_writer.add_head_count_kv(num_key_value_heads) - self.gguf_writer.add_head_count(num_attention_heads) - - self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048)) - self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096)) - self.gguf_writer.add_key_length(hparams.get("hidden_size_per_head", 128)) - self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128)) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06)) - self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("rope_theta", 10000)) - - # Mamba parameters - self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64)) - self.gguf_writer.add_ssm_conv_kernel(hparams.get("mamba_d_conv", 4)) - self.gguf_writer.add_ssm_time_step_rank(hparams.get("mamba_num_heads", 64)) - intermediate_size = hparams.get("mamba_num_heads", 64) * hparams.get("hidden_size_per_head", 128) - self.gguf_writer.add_ssm_inner_size(intermediate_size) - self.gguf_writer.add_ssm_group_count(0) - - # MLP feed forward parameters (for attention layers) - self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312)) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.endswith(".A_log"): - data_torch = -torch.exp(data_torch) - elif name.endswith(".dt_bias"): - name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" - elif name.endswith(".dt_norm_weight"): - name = name.rpartition(".dt_norm_weight")[0] + ".dt_norm.weight" - elif name.endswith(".B_norm_weight"): - name = name.rpartition(".B_norm_weight")[0] + ".B_norm.weight" - elif name.endswith(".C_norm_weight"): - name = name.rpartition(".C_norm_weight")[0] + ".C_norm.weight" - elif name.endswith(".k_weight"): - name = name.rpartition(".k_weight")[0] + ".k.weight" - elif name.endswith(".q_weight"): - name = name.rpartition(".q_weight")[0] + ".q.weight" - elif name.endswith(".conv1d.weight"): - data_torch = torch.squeeze(data_torch) # remove (, 1, ) - assert data_torch.ndim == 2 - elif name.endswith(".pre_mixer_norm.weight"): - data_torch += 1.0 - elif name.endswith(".post_mixer_norm.weight"): - data_torch += 1.0 / 5 - elif name.endswith(".pre_mlp_norm.weight"): - data_torch += 1.0 - elif name.endswith(".post_mlp_norm.weight"): - data_torch += 1.0 / (5**1.5) - elif name.endswith(".norm.weight"): - data_torch += 1.0 - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Plamo3ForCausalLM", "PLaMo3ForCausalLM") -class Plamo3Model(TextModel): - model_arch = gguf.MODEL_ARCH.PLAMO3 - - def set_vocab(self): - self._set_vocab_plamo() - - tokenizer_config_path = self.dir_model / "tokenizer_config.json" - tokenizer_config = {} - - if tokenizer_config_path.is_file(): - with open(tokenizer_config_path, encoding="utf-8") as f: - tokenizer_config = json.load(f) - - chat_template = tokenizer_config.get("chat_template") - chat_template_jinja = self.dir_model / "chat_template.jinja" - - if chat_template_jinja.is_file(): - with open(chat_template_jinja, encoding="utf-8") as f: - chat_template = f.read() - - if chat_template: - self.gguf_writer.add_chat_template(chat_template) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - if (sliding_window := self.find_hparam(["window_size", "sliding_window"], optional=True)) is not None: - self.gguf_writer.add_sliding_window(sliding_window) - self.gguf_writer.add_sliding_window_pattern(self.hparams["sliding_window_pattern"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - - if name.endswith(".pre_mixer_norm.weight"): - data_torch = data_torch + 1.0 - elif name.endswith(".post_mixer_norm.weight"): - data_torch = data_torch + 1.0 / 5 - elif name.endswith(".pre_mlp_norm.weight"): - data_torch = data_torch + 1.0 - elif name.endswith(".post_mlp_norm.weight"): - data_torch = data_torch + 1.0 / (5**1.5) - elif name.endswith((".mixer.q_norm.weight", ".mixer.k_norm.weight")): - data_torch = data_torch + 1.0 - elif name.endswith(".norm.weight"): - data_torch = data_torch + 1.0 - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("CodeShellForCausalLM") -class CodeShellModel(TextModel): - model_arch = gguf.MODEL_ARCH.CODESHELL - - def set_gguf_parameters(self): - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_rope_freq_base(10000.0) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(1.0) - - -@ModelBase.register("KimiLinearModel", "KimiLinearForCausalLM") -class KimiLinearModel(TextModel): - """Kimi-Linear model with hybrid MLA+KDA architecture""" - model_arch = gguf.MODEL_ARCH.KIMI_LINEAR - - _experts: list[dict[str, Tensor]] | None = None - - def set_vocab(self): - try: - self._set_vocab_gpt2() - return - except Exception: - pass - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - tokpre = self.get_vocab_base_pre(tokenizer) - - if tokpre == "kimi-k2": - # Build merges list using the approach similar to HunYuanMoE - merges = [] - vocab = {} - mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute] - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - if len(merged) == 2: - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - # Build token list - vocab_size = self.hparams["vocab_size"] - special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute] - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} - tokens: list[str] = [] - toktypes: list[int] = [] - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token = reverse_vocab[i] - tokens.append(token) - if i in special_tokens.values(): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_token_merges(merges) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.add_to_gguf(self.gguf_writer) - # override eos id in config.json with tiktoken eos id - self.gguf_writer.add_eos_token_id(tokenizer.eos_id) # ty: ignore[unresolved-attribute] - else: - raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") - - def set_gguf_parameters(self): - # note: To enable MLA KV cache, attention needs to be converted into MQA (ie: GQA with 1 group) - self.hparams["num_key_value_heads"] = 1 - - super().set_gguf_parameters() - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - - # KDA & MLA params - # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv - linear_attn_config = self.hparams["linear_attn_config"] - # n_head == 0 for KDA layers, n_head > 0 for MLA layers - # full_attention_layers list will be used to distinguish layer type - _num_kv_heads = list() - _full_attn_layers = linear_attn_config["full_attn_layers"] - for il in range(self.hparams["num_hidden_layers"]): - if il + 1 in _full_attn_layers: - _num_kv_heads.append(self.hparams["num_key_value_heads"]) - else: - _num_kv_heads.append(0) - assert len(_num_kv_heads) == self.hparams["num_hidden_layers"] - self.gguf_writer.add_head_count_kv(_num_kv_heads) - - if (ssm_d_conv := linear_attn_config.get("short_conv_kernel_size")) is not None: - self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) - if (kda_head_dim := linear_attn_config.get("head_dim")) is not None: - self.gguf_writer.add_kda_head_dim(kda_head_dim) - - # MLA params - use add_* methods that handle arch substitution - # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv) - if (q_lora_rank := self.find_hparam(["q_lora_rank", "n_lora_q"], optional=True)) is not None: - self.gguf_writer.add_q_lora_rank(q_lora_rank) - # To enable MLA KV cache, MLA needs to be converted into MQA with larger heads, then decompresses to MHA - kv_lora_rank = self.find_hparam(["kv_lora_rank", "n_lora_kv"], optional=False) - self.gguf_writer.add_kv_lora_rank(kv_lora_rank) - - # MLA head dimensions - # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim - qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") - # Rotation - use qk_rope_head_dim for Kimi - qk_rope_head_dim = self.find_hparam(["qk_rope_head_dim", "n_rot"], optional=False) - self.gguf_writer.add_rope_dimension_count(qk_rope_head_dim) - self.gguf_writer.add_key_length(kv_lora_rank + qk_rope_head_dim) - v_head_dim = self.hparams.get("v_head_dim") - - # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim - if (n_embd_head_k_mla := self.find_hparam(["n_embd_head_k_mla"], optional=True)) is not None: - self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) - elif qk_nope_head_dim is not None: - n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim - self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) - - # n_embd_head_v_mla = v_head_dim - if (n_embd_head_v_mla := self.hparams.get("n_embd_head_v_mla")) is not None: - self.gguf_writer.add_value_length_mla(n_embd_head_v_mla) - elif v_head_dim is not None: - self.gguf_writer.add_value_length_mla(v_head_dim) - - # moe_intermediate_size (1024 for Kimi) - self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) - # num_shared_experts (1 for Kimi) - self.gguf_writer.add_expert_shared_count(self.hparams["num_shared_experts"]) - # first_k_dense_replace (1 for Kimi - first layer uses dense MLP) - self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"]) - # Routed scaling factor (expert_weights_scale = 2.446 for Kimi) - self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) - - def prepare_tensors(self): - super().prepare_tensors() - if self._experts is not None: - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - logger.info(f"Processing {name}: shape before = {tuple(data_torch.shape)}") - - # Handle KDA conv1d weights - # HuggingFace/vLLM stores as [d_inner, d_conv] (2D), memory layout: conv_step changes fastest - # llama.cpp expects ggml ne = [d_conv, 1, d_inner, 1], memory layout: ne[0]=d_conv changes fastest - # GGUF reverses numpy shape when writing, so numpy (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1] - # Memory layouts match: both have conv_step (d_conv) changing fastest - if name.endswith((".q_conv1d.weight", ".k_conv1d.weight", ".v_conv1d.weight")): - # HF shape: [d_inner, d_conv] e.g. [4096, 4] - # Target numpy shape: (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1] - if data_torch.ndim == 2: - d_inner, d_conv = data_torch.shape - # Reshape to (1, d_inner, 1, d_conv) - memory layout preserved (d_conv fastest) - data_torch = data_torch.reshape(1, d_inner, 1, d_conv) - logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") - elif data_torch.ndim == 3: - # Already 3D [d_inner, 1, d_conv] from unsqueeze - d_inner, _, d_conv = data_torch.shape - data_torch = data_torch.reshape(1, d_inner, 1, d_conv) - logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, 1, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") - - # Handle A_log: iHF stores as [1, 1, num_heads, 1] - # llama.cpp expects ggml ne = [1, num_heads, 1, 1] - # GGUF reverses numpy shape: numpy (1, 1, num_heads, 1) -> ggml ne = [1, num_heads, 1, 1] - if name.endswith(".A_log"): - data_torch = -torch.exp(data_torch) - if name.endswith(".dt_bias"): - name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" - logger.info("Changed dt_bias to dt_proj.bias") - - # process the experts separately - if name.find("block_sparse_moe.experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - # w1: gate, w2: down, w3: up - for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP), - ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP), - ("w3", gguf.MODEL_TENSOR.FFN_UP_EXP)]: - datas: list[Tensor] = [] - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - data_torch = torch.stack(datas, dim=0) - new_name = self.format_tensor_name(tname, bid) - yield from super().modify_tensors(data_torch, new_name, bid) - return - - # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed - if name.endswith("kv_b_proj.weight"): - name_kb = name.replace("kv_b_proj", "k_b_proj") - name_vb = name.replace("kv_b_proj", "v_b_proj") - n_head_kv = self.hparams["num_key_value_heads"] - v_head_dim = self.find_hparam(["n_embd_head_v_mla", "v_head_dim"], optional=False) - qk_nope_head_dim = self.hparams["qk_nope_head_dim"] - logger.info("Split kv_b n_head_kv %d\n" % n_head_kv) - assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) - kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) - k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) - k_b = k_b.transpose(1, 2) - yield from super().modify_tensors(k_b, name_kb, bid) - yield from super().modify_tensors(v_b, name_vb, bid) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("InternLM2ForCausalLM") -class InternLM2Model(TextModel): - model_arch = gguf.MODEL_ARCH.INTERNLM2 - - def set_vocab(self): - # (TODO): Is there a better way? - # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character - # \x00 specially and convert it into an emoji character to prevent it from being mistakenly - # recognized as an empty string in C++. - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'tokenizer.model' - - tokens: list[bytes] = [] - scores: list[float] = [] - toktypes: list[int] = [] - - if not tokenizer_path.is_file(): - logger.error(f'Error: Missing {tokenizer_path}') - sys.exit(1) - - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - for token_id in range(vocab_size): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - if text == b"\x00": - # (TODO): fixme - # Hack here and replace the \x00 characters. - logger.warning(f"InternLM2 convert token '{text}' to '🐉'!") - text = "🐉".encode("utf-8") - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - # take care of ununsed raw token - if piece.startswith('[UNUSED'): - toktype = SentencePieceTokenTypes.UNUSED - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - - for key in added_tokens_json: - tokens.append(key.encode("utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.USER_DEFINED) - - chat_eos_token = '<|im_end|>' - chat_eos_token_id = None - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) - for token_id, foken_data in added_tokens_decoder.items(): - token_id = int(token_id) - token = foken_data["content"] - if token == chat_eos_token: - chat_eos_token_id = token_id - token = token.encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - if tokens[token_id] != token: - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') - tokens[token_id] = token - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - if foken_data.get("special"): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - - tokenizer_file = self.dir_model / 'tokenizer.json' - if tokenizer_file.is_file(): - with open(tokenizer_file, "r", encoding="utf-8") as f: - tokenizer_json = json.load(f) - added_tokens = tokenizer_json.get("added_tokens", []) - for foken_data in added_tokens: - token_id = int(foken_data["id"]) - token = foken_data["content"] - if token == chat_eos_token: - chat_eos_token_id = token_id - token = token.encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - if tokens[token_id] != token: - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') - tokens[token_id] = token - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - if foken_data.get("special"): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - old_eos = special_vocab.special_token_ids["eos"] - if chat_eos_token_id is not None: - # For the chat model, we replace the eos with '<|im_end|>'. - # TODO: this is a hack, should be fixed - # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048 - special_vocab.special_token_ids["eos"] = chat_eos_token_id - logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" - " in chat mode so that the conversation can end normally.") - - special_vocab.add_to_gguf(self.gguf_writer) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - num_heads = self.hparams["num_attention_heads"] - num_kv_heads = self.hparams["num_key_value_heads"] - n_embd = self.hparams["hidden_size"] - q_per_kv = num_heads // num_kv_heads - head_dim = n_embd // num_heads - num_groups = num_heads // q_per_kv - - if bid is not None and f"model.layers.{bid}.attention.wqkv" in name: - qkv = data_torch - - qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd)) - q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1] - - # The model weights of q and k equire additional reshape. - q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) - k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) - v = v.reshape((-1, v.shape[-1])) - - yield from super().modify_tensors(q, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) - yield from super().modify_tensors(k, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) - yield from super().modify_tensors(v, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) - else: - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("InternLM3ForCausalLM") -class InternLM3Model(TextModel): - model_arch = gguf.MODEL_ARCH.LLAMA - - def set_vocab(self): - tokens, scores, toktypes = self._create_vocab_sentencepiece() - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - if "added_tokens_decoder" in tokenizer_config_json: - for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items(): - if token_data.get("special"): - token_id = int(token_id) - token = token_data["content"] - special_vocab._set_special_token(token, token_id) - # update eos token - if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids: - special_vocab.special_token_ids["eos"] = token_id - - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - if (rope_dim := hparams.get("head_dim")) is None: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(rope_dim) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith(("mlp", "vision_model")): - # skip visual tensors - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification") -class BertModel(TextModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.vocab_size = None - - if cls_out_labels := self.hparams.get("id2label"): - if len(cls_out_labels) == 2 and cls_out_labels[0] == "LABEL_0": - # Remove dummy labels added by AutoConfig - cls_out_labels = None - self.cls_out_labels = cls_out_labels - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_causal_attention(False) - self._try_set_pooling_type() - - if self.cls_out_labels: - self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())]) - - def set_vocab(self): - tokens, toktypes, tokpre = self.get_vocab_base() - self.vocab_size = len(tokens) - - # we need this to validate the size of the token_type embeddings - # though currently we are passing all zeros to the token_type embeddings - # "Sequence A" or "Sequence B" - self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) - - # convert to phantom space vocab - def phantom(tok, toktype): - if toktype == gguf.TokenType.CONTROL: - return tok - if tok.startswith("##"): - return tok[2:] - return "\u2581" + tok - assert len(tokens) == len(toktypes) - tokens = list(map(phantom, tokens, toktypes)) - - # add vocab to gguf - self.gguf_writer.add_tokenizer_model("bert") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - # handle special tokens - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("bert."): - name = name[5:] - - if name.endswith(".gamma"): - name = name[:-6] + ".weight" - - if name.endswith(".beta"): - name = name[:-5] + ".bias" - - # we are only using BERT for embeddings so we don't need the pooling layer - if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): - return None - - if name.startswith("cls.predictions"): - return None - - if name.startswith("cls.seq_relationship"): - return None - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if self.cls_out_labels: - # For BertForSequenceClassification (direct projection layer) - if name == "classifier.weight": - name = "classifier.out_proj.weight" - - if name == "classifier.bias": - name = "classifier.out_proj.bias" - - yield from super().modify_tensors(data_torch, name, bid) - - def _xlmroberta_tokenizer_init(self) -> None: - # we need the pad_token_id to know how to chop down position_embd matrix - if (pad_token_id := self.hparams.get("pad_token_id")) is not None: - self._position_offset = 1 + pad_token_id - if "max_position_embeddings" in self.hparams: - self.hparams["max_position_embeddings"] -= self._position_offset - else: - self._position_offset = None - - def _xlmroberta_set_vocab(self) -> None: - # to avoid TypeError: Descriptors cannot be created directly - # exception when importing sentencepiece_model_pb2 - os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'sentencepiece.bpe.model' - - tokenizer_json = {} - tokenizer_config_json = {} - if not tokenizer_path.is_file(): - tokenizer_path = self.dir_model / 'tokenizer.json' - tokenizer_config_path = self.dir_model / 'tokenizer_config.json' - - if not tokenizer_path.is_file(): - raise FileNotFoundError(f"File not found: {tokenizer_path}") - - from base64 import b64decode - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - - with open(tokenizer_path, "r", encoding="utf-8") as fp: - tokenizer_json = json.load(fp) - - if tokenizer_config_path.is_file(): - with open(tokenizer_config_path, "r", encoding="utf-8") as fp: - tokenizer_config_json = json.load(fp) - - add_prefix = tokenizer.add_prefix_space # ty: ignore[unresolved-attribute] - remove_whitespaces = tokenizer.clean_up_tokenization_spaces # ty: ignore[unresolved-attribute] - precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"]) - - vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size) # ty: ignore[unresolved-attribute] - else: - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM - - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces - precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - if isinstance(tokenizer, SentencePieceProcessor): - for token_id in range(tokenizer.vocab_size()): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - else: - added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] - unk_token = tokenizer_config_json.get("unk_token") - unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3)) # ty: ignore[no-matching-overload] - - for token_id in range(tokenizer.vocab_size): # ty: ignore[unresolved-attribute] - piece = tokenizer._convert_id_to_token(token_id) # ty: ignore[unresolved-attribute] - if (piece := tokenizer._convert_id_to_token(token_id)) is not None: # ty: ignore[unresolved-attribute] - text = piece.encode("utf-8") - score = tokenizer_json["model"]["vocab"][token_id][1] - - toktype = SentencePieceTokenTypes.NORMAL - if token_id == unk_token_id: - toktype = SentencePieceTokenTypes.UNKNOWN - elif token_id in tokenizer.all_special_ids: # ty: ignore[unresolved-attribute] - toktype = SentencePieceTokenTypes.CONTROL - elif token_id in added_vocab.values(): - toktype = SentencePieceTokenTypes.USER_DEFINED - # No reliable way to detect this, but jina doesn't have any - # elif tokenizer.IsByte(token_id): - # toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - if isinstance(tokenizer, SentencePieceProcessor): - # realign tokens (see HF tokenizer code) - tokens = [b'', b'', b'', b''] + tokens[3:-1] - scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1] - toktypes = [ - SentencePieceTokenTypes.CONTROL, - SentencePieceTokenTypes.CONTROL, - SentencePieceTokenTypes.CONTROL, - SentencePieceTokenTypes.UNKNOWN, - ] + toktypes[3:-1] - - if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE: - # Add mask token missing from sentencepiece.bpe.model - tokens[250001] = b'' - scores[250001] = 0.0 - toktypes[250001] = SentencePieceTokenTypes.CONTROL - - self.gguf_writer.add_tokenizer_model("t5") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) - self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) - if precompiled_charsmap: - self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - -@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification") -class DistilBertModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def set_gguf_parameters(self): - self.gguf_writer.add_layer_norm_eps(1e-12) - logger.info("gguf: layer norm epsilon = 1e-12") - super().set_gguf_parameters() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("distilbert."): - name = name[11:] - - # These layers act as MLM head, so we don't need them - if name.startswith("vocab_"): - return None - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("RobertaModel", "RobertaForSequenceClassification") -class RobertaModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # we need the pad_token_id to know how to chop down position_embd matrix - if (pad_token_id := self.hparams.get("pad_token_id")) is not None: - self._position_offset = 1 + pad_token_id - if "max_position_embeddings" in self.hparams: - self.hparams["max_position_embeddings"] -= self._position_offset - else: - self._position_offset = None - - def set_vocab(self): - """Support BPE tokenizers for roberta models""" - bpe_tok_path = self.dir_model / "tokenizer.json" - if bpe_tok_path.exists(): - self._set_vocab_gpt2() - - # we need this to validate the size of the token_type embeddings - # though currently we are passing all zeros to the token_type embeddings - # "Sequence A" or "Sequence B" - self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) - - else: - return super().set_vocab() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # if name starts with "roberta.", remove the prefix - # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main - if name.startswith("roberta."): - name = name[8:] - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # position embeddings start at pad_token_id + 1, so just chop down the weight tensor - if name == "embeddings.position_embeddings.weight": - if self._position_offset is not None: - data_torch = data_torch[self._position_offset:,:] - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("NomicBertModel") -class NomicBertModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): - hparams = kwargs.pop("hparams", None) - if hparams is None: - hparams = ModelBase.load_hparams(dir_model, False) - - self.is_moe = bool(hparams.get("moe_every_n_layers")) - self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT - - super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) - - self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta() - if self._tokenizer_is_xlmroberta: - self._xlmroberta_tokenizer_init() - - npos, mtp = self.hparams["n_positions"], self.hparams.get("max_trained_positions", 2048) - if npos == 8192 and mtp == 2048: - self.hparams["n_positions"] = 2048 # nomic-embed-text v1 and v1.5 are trained for 2048 tokens. - elif npos == 2048 and mtp == 2048: - self.hparams["n_positions"] = 512 # nomic-embed-text-v2-moe is trained for 512 tokens. - else: - raise ValueError(f"unrecognized parameters: n_positions={npos}, max_trained_positions={mtp}") - - assert self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu" - - # this doesn't do anything in the HF version - assert self.hparams["causal"] is False - # no bias tensors unless MoE - assert self.hparams["qkv_proj_bias"] == self.is_moe - assert self.hparams["mlp_fc1_bias"] == self.is_moe - assert self.hparams["mlp_fc2_bias"] == self.is_moe - - # norm at end of layer - assert self.hparams["prenorm"] is False - # standard RoPE - assert self.hparams["rotary_emb_fraction"] == 1.0 - assert self.hparams["rotary_emb_interleaved"] is False - assert self.hparams["rotary_emb_scale_base"] is None - - def set_vocab(self) -> None: - if self._tokenizer_is_xlmroberta: - return self._xlmroberta_set_vocab() - return super().set_vocab() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # If the tensor is an experts bias tensor, skip it. - if "mlp.experts.bias" in name: - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - if "mlp.experts.mlp.w1" in name: - data_torch = data_torch.view(n_experts, self.hparams["n_inner"], self.hparams["n_embd"]) - name += ".weight" - - if "mlp.experts.mlp.w2" in name: - data_torch = data_torch.view(n_experts, self.hparams["n_inner"], self.hparams["n_embd"]) - data_torch = data_torch.transpose(1, 2) - name += ".weight" - - yield from super().modify_tensors(data_torch, name, bid) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if self.is_moe: - self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"]) - self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"]) - - def _is_tokenizer_xlmroberta(self) -> bool: - with open(self.dir_model / "tokenizer.json") as f: - tokenizer_json = json.load(f) - toktyp = tokenizer_json["model"]["type"] - if toktyp == "Unigram": - return True - if toktyp == "WordPiece": - return False - raise ValueError(f"unknown tokenizer: {toktyp}") - - -@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification") -class NeoBert(BertModel): - model_arch = gguf.MODEL_ARCH.NEO_BERT - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # NeoBERT uses 2/3 of the intermediate size as feed forward length - self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3)) - self.gguf_writer.add_rope_freq_base(10000.0) # default value for NeoBERT - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - - f_rms_eps = self.hparams.get("norm_eps", 1e-6) # default value for NeoBERT - self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) - logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") - - self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("decoder."): - return None - - if name.startswith("model."): - name = name[6:] - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("EuroBertModel", "JinaEmbeddingsV5Model") -class EuroBertModel(TextModel): - model_arch = gguf.MODEL_ARCH.EUROBERT - - def set_vocab(self): - self.gguf_writer.add_add_bos_token(False) - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # EuroBert is bidirectional (encoder) - self.gguf_writer.add_causal_attention(False) - - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - - self._try_set_pooling_type() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("model."): - name = name[6:] - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") -class XLMRobertaModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - _lora_files = {} - _lora_names = [] - - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): - hparams = kwargs.pop("hparams", None) - if hparams is None: - hparams = ModelBase.load_hparams(dir_model, False) - - if lora_names := hparams.get("lora_adaptations"): - self._lora_names = lora_names - self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3 - - super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) - self._xlmroberta_tokenizer_init() - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if self._lora_names: - for name in self._lora_names: - fname = self.add_prefix_to_filename(self.fname_out, f"lora-{name}-") - self._lora_files[name] = gguf.GGUFWriter(fname, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, dry_run=self.dry_run) - - return super().generate_extra_tensors() - - def set_type(self): - for lora_writer in self._lora_files.values(): - lora_writer.add_type(gguf.GGUFType.ADAPTER) - lora_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") - super().set_type() - - def set_vocab(self): - self._xlmroberta_set_vocab() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # if name starts with "roberta.", remove the prefix - # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main - if name.startswith("roberta."): - name = name[8:] - - # jina-embeddings-v3 - if ".parametrizations." in name: - name = name.replace(".parametrizations.", ".") - if name.endswith(".original"): - name = name[:-9] - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # position embeddings start at pad_token_id + 1, so just chop down the weight tensor - if name == "embeddings.position_embeddings.weight": - if self._position_offset is not None: - data_torch = data_torch[self._position_offset:,:] - - if name.endswith(".0.lora_A") or name.endswith(".0.lora_B"): - if name.startswith("pooler.dense"): - return - - num_loras = data_torch.size(0) - assert num_loras == len(self._lora_names) - - # Split out each LoRA in their own GGUF - for i, lora_writer in enumerate(self._lora_files.values()): - new_name = self.map_tensor_name(name[:-9]) + name[-7:].lower() - data = data_torch[i, :, :] - # Transpose/flip token_embd/types into correct shape - if new_name == "token_embd.weight.lora_b": - data = data.T - elif new_name.startswith("token_types.weight."): - new_name = new_name[:-1] + ("a" if new_name[-1:] == "b" else "b") - lora_writer.add_tensor(new_name, data.float().numpy(), raw_dtype=gguf.GGMLQuantizationType.F32) - - return - - yield from super().modify_tensors(data_torch, name, bid) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # jina-embeddings-v3 - lora_alpha = self.hparams.get("lora_alpha") - if lora_prompt_prefixes := self.hparams.get("task_instructions"): - assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys()) - for lora_name, lora_writer in self._lora_files.items(): - lora_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, lora_alpha if lora_alpha is not None else 1.0) - lora_writer.add_string(gguf.Keys.Adapter.LORA_TASK_NAME, lora_name) - if lora_prompt_prefixes: - lora_writer.add_string(gguf.Keys.Adapter.LORA_PROMPT_PREFIX, lora_prompt_prefixes[lora_name]) - - def write(self): - super().write() - for lora_writer in self._lora_files.values(): - lora_writer.write_header_to_file() - lora_writer.write_kv_data_to_file() - lora_writer.write_tensors_to_file(progress=True) - lora_writer.close() - - -@ModelBase.register("GemmaForCausalLM") -class GemmaModel(TextModel): - model_arch = gguf.MODEL_ARCH.GEMMA - - def set_vocab(self): - self._set_vocab_sentencepiece() - - # TODO: these special tokens should be exported only for the CodeGemma family - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) - special_vocab._set_special_token("prefix", 67) - special_vocab._set_special_token("suffix", 69) - special_vocab._set_special_token("middle", 68) - special_vocab._set_special_token("fsep", 70) - special_vocab._set_special_token("eot", 107) - special_vocab.chat_template = None # do not add it twice - special_vocab.add_to_gguf(self.gguf_writer) - - self.gguf_writer.add_add_space_prefix(False) - - def set_gguf_parameters(self): - hparams = self.hparams - - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_key_length(hparams["head_dim"]) - self.gguf_writer.add_value_length(hparams["head_dim"]) - self.gguf_writer.add_file_type(self.ftype) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model - # To prevent errors, skip loading lm_head.weight. - if name == "lm_head.weight": - logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 - if name.endswith("norm.weight"): - data_torch = data_torch + 1 - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Gemma2ForCausalLM") -class Gemma2Model(TextModel): - model_arch = gguf.MODEL_ARCH.GEMMA2 - - def set_vocab(self): - self._set_vocab_sentencepiece() - - self.gguf_writer.add_add_space_prefix(False) - - def set_gguf_parameters(self): - hparams = self.hparams - - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_key_length(hparams["head_dim"]) - self.gguf_writer.add_value_length(hparams["head_dim"]) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_attn_logit_softcapping( - self.hparams["attn_logit_softcapping"] - ) - self.gguf_writer.add_final_logit_softcapping( - self.hparams["final_logit_softcapping"] - ) - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model - # To prevent errors, skip loading lm_head.weight. - if name == "lm_head.weight": - logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 - if name.endswith("norm.weight"): - data_torch = data_torch + 1 - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") -class Gemma3Model(TextModel): - model_arch = gguf.MODEL_ARCH.GEMMA3 - - def norm_shift(self, name: str) -> float: - return 1.0 if name.endswith("norm.weight") else 0.0 # Gemma3RMSNorm adds 1.0 to the norm value - - def set_vocab(self): - if (self.dir_model / "tokenizer.model").is_file(): - self._set_vocab_sentencepiece() - self.gguf_writer.add_add_space_prefix(False) - else: - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - - # some default values are not specified in the hparams - self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072)) - self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8)) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6)) - self.gguf_writer.add_key_length(hparams.get("head_dim", 256)) - self.gguf_writer.add_value_length(hparams.get("head_dim", 256)) - self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters).get("rope_theta", 1_000_000.0)) # for global layers - # attn_logit_softcapping is removed in Gemma3 - assert hparams.get("attn_logit_softcapping") is None - if (final_logit_softcap := hparams.get("final_logit_softcapping")): - self.gguf_writer.add_final_logit_softcapping(final_logit_softcap) - if hparams.get("sliding_window_pattern") != 1: - self.gguf_writer.add_sliding_window(hparams["sliding_window"]) - self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # remove OOV (out-of-vocabulary) rows in token_embd - if "embed_tokens.weight" in name: - n_vocab_real = -1 - if (self.dir_model / "tokenizer.model").is_file(): - tokens = self._create_vocab_sentencepiece()[0] - n_vocab_real = len(tokens) - else: - with open(self.dir_model / "tokenizer.json", "r", encoding="utf-8") as f: - tokenizer_json = json.load(f) - n_vocab_real = len(tokenizer_json["model"]["vocab"]) + len(tokenizer_json["added_tokens"]) - data_torch = data_torch[:n_vocab_real] - - # ref code in Gemma3RMSNorm - # output = output * (1.0 + self.weight.float()) - # note: this is not the case on gemma3n - f_shift = self.norm_shift(name) - if f_shift != 0.0: - data_torch = data_torch + f_shift - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Gemma3TextModel") -class EmbeddingGemma(Gemma3Model): - model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING - module_paths = [] - dense_features_dims = {} - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.sentence_transformers_dense_modules: - # read modules.json to determine if model has Dense layers - modules_file = self.dir_model / "modules.json" - if modules_file.is_file(): - with open(modules_file, encoding="utf-8") as modules_json_file: - mods = json.load(modules_json_file) - for mod in mods: - if mod["type"].endswith("Dense"): - mod_path = mod["path"] - # check if model.safetensors file for Dense layer exists - model_tensors_file = self.dir_model / mod_path / "model.safetensors" - if model_tensors_file.is_file(): - self.module_paths.append(mod_path) - # read config.json of the Dense layer to get in/out features - mod_conf_file = self.dir_model / mod_path / "config.json" - if mod_conf_file.is_file(): - with open(mod_conf_file, encoding="utf-8") as mod_conf_json_file: - mod_conf = json.load(mod_conf_json_file) - # hparams dense_2_feat_out and dense_3_feat_in are required when loading model's dense weights - prefix = self._get_dense_prefix(mod_path) - if mod_conf["in_features"] is not None and mod_conf["out_features"] is not None: - self.dense_features_dims[prefix] = (mod_conf["in_features"], mod_conf["out_features"]) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - from safetensors.torch import load_file - module_paths = list(self.module_paths) - for i, module_path in enumerate(module_paths): - tensors_file = self.dir_model / module_path / "model.safetensors" - local_tensors = load_file(tensors_file) - tensor_name = self._get_dense_prefix(module_path) - for name, local_tensor in local_tensors.items(): - if not name.endswith(".weight"): - continue - orig_name = name.replace("linear", tensor_name) - name = self.map_tensor_name(orig_name) - yield name, local_tensor.clone() - - @staticmethod - def _get_dense_prefix(module_path) -> str: - """Get the tensor name prefix for the Dense layer from module path.""" - tensor_name = "dense_2" if module_path == "2_Dense" else "dense_3" - return tensor_name - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # Override the sliding window size as it gets adjusted by the Gemma3TextConfig - # constructor. We want to use the value from the original model's config.json. - # ref: https://github.com/huggingface/transformers/pull/40700 - with open(self.dir_model / "config.json", "r", encoding="utf-8") as f: - config = json.load(f) - orig_sliding_window = config.get("sliding_window") - if orig_sliding_window is None: - raise ValueError("sliding_window not found in model config - this is required for the model") - - logger.info(f"Using original sliding_window from config: {orig_sliding_window} " - f"instead of {self.hparams['sliding_window']}") - self.gguf_writer.add_sliding_window(orig_sliding_window) - if self.sentence_transformers_dense_modules: - for dense, dims in self.dense_features_dims.items(): - logger.info(f"Setting dense layer {dense} in/out features to {dims}") - self.gguf_writer.add_dense_features_dims(dense, dims[0], dims[1]) - - self._try_set_pooling_type() - - -@ModelBase.register("Gemma3ForConditionalGeneration") -class Gemma3VisionModel(MmprojModel): - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3) - # default values below are taken from HF transformers code - self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) - self.gguf_writer.add_vision_use_gelu(True) - # calculate proj_scale_factor (used by tinygemma3 test model) - image_seq_length = self.preprocessor_config.get("image_seq_length", 256) - n_per_side = int(image_seq_length ** 0.5) - image_size = self.hparams["image_size"] - patch_size = self.hparams["patch_size"] - proj_scale_factor = (image_size // patch_size) // n_per_side - if proj_scale_factor > 0 and proj_scale_factor != 4: - # we only need to write this if it's not the default value - # in this case, we are converting a test model - self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - # related to https://github.com/ggml-org/llama.cpp/issues/13025 - if "input_projection" in name: - return gguf.GGMLQuantizationType.F16 - if ".embeddings." in name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if "vision_model.head." in name: - # skip redundant tensors for tinygemma3 - return None - - if not name.startswith(("multi_modal_projector.", "vision_tower.", "multimodal_projector.", "vision_model.")): - return None - - name = name.replace("_weight", ".weight") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # correct norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector - # the other norm values are part of SigLIP model, and they are already correct - # ref code: Gemma3RMSNorm - if "soft_emb_norm.weight" in name: - logger.info(f"Correcting norm value for '{name}'") - data_torch = data_torch + 1 - - yield from super().modify_tensors(data_torch, name, bid) - - -class ConformerAudioModel(MmprojModel): - _batch_norm_tensors: list[dict[str, Tensor]] | None = None - - @staticmethod - def is_audio_tensor(name: str): - return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"]) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ConformerAudioModel.is_audio_tensor(name): - if ".conv" in name or "_conv" in name and ".weight" in name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # fold running_mean, running_var and eps into weight and bias for batch_norm - if "batch_norm" in name: - if self._batch_norm_tensors is None: - self._batch_norm_tensors = [{} for _ in range(self.block_count)] - assert bid is not None - self._batch_norm_tensors[bid][name] = data_torch - - if len(self._batch_norm_tensors[bid]) < 5: - return - - weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"] - bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"] - running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"] - running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"] - eps = 1e-5 # default value - - a = weight / torch.sqrt(running_var + eps) - b = bias - running_mean * a - yield from super().modify_tensors(a, f"conformer.layers.{bid}.conv.batch_norm.weight", bid) - yield from super().modify_tensors(b, f"conformer.layers.{bid}.conv.batch_norm.bias", bid) - return - - # reshape conv weights - if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"): - data_torch = data_torch[:, None, None] - if "conv.depthwise_conv" in name and name.endswith(".weight"): - assert data_torch.shape[1] == 1 - data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2]) - if "conv.pointwise_conv" in name and name.endswith(".weight"): - assert data_torch.shape[2] == 1 - data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1]) - - mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min")) - yield (mapped_name, data_torch) - - -@ModelBase.register("DeepseekOCRForCausalLM") -class DeepseekOCRVisionModel(MmprojModel): - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR) - # default values below are taken from HF tranformers code - self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) - self.gguf_writer.add_vision_use_gelu(True) - # calculate proj_scale_factor (used by tinygemma3 test model) - image_seq_length = self.preprocessor_config.get("image_seq_length", 256) - n_per_side = int(image_seq_length ** 0.5) - image_size = self.hparams["image_size"] - patch_size = self.hparams["patch_size"] - proj_scale_factor = (image_size // patch_size) // n_per_side - if proj_scale_factor > 0 and proj_scale_factor != 4: - # we only need to write this if it's not the default value - # in this case, we are converting a test model - self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor) - # @bluebread: there's no window_size in config but just add it here anyway - self.gguf_writer.add_vision_window_size(self.hparams.get("window_size", 14)) - - # SAM configuration - sam_hparams = hparams['sam'] - self.gguf_writer.add_vision_sam_layers_count(sam_hparams['layers']) - self.gguf_writer.add_vision_sam_embedding_length(sam_hparams['width']) - self.gguf_writer.add_vision_sam_head_count(sam_hparams['heads']) - - def get_vision_config(self) -> dict[str, Any]: - vision_config: dict[str, Any] | None = self.global_config.get("vision_config") - - if not vision_config: - raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found") - - vision_config['sam'] = vision_config['width']['sam_vit_b'] - vision_config.update(vision_config['width']['clip-l-14-224']) - vision_config['hidden_size'] = vision_config['width'] - vision_config['num_heads'] = vision_config['heads'] - vision_config['intermediate_size'] = vision_config['heads'] * 4 - - return vision_config - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".embeddings." in name or 'pos_embed' in name: - return gguf.GGMLQuantizationType.F32 - if ".rel_pos_h" in name or '.rel_pos_w' in name: - return gguf.GGMLQuantizationType.F32 - if ".neck." in name or ".net_" in name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Only process vision-related tensors, skip language model tensors - # Vision components: sam_model, vision_model, projector, image_newline, view_seperator - # Language model components to skip: lm_head, embed_tokens, layers, norm - if name.startswith(("lm_head.", "model.embed_tokens.", "model.layers.", "model.norm.")): - return None - - if name.endswith("pos_embed") or name.endswith("rel_pos_h") or name.endswith("rel_pos_w"): - name += ".weight" - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("Gemma3nForConditionalGeneration") -class Gemma3nVisionAudioModel(ConformerAudioModel): - has_audio_encoder = True - has_vision_encoder = True - - # Double indexed mapping for MobileNetV5 blocks (not supported by tensor_mapping.py) - # This is the only known model having this, so we prefer implementing it outside of tensor_mapping.py - block_tensor_mapping = { - "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.blk.{bid}.{sid}.conv_exp.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.blk.{bid}.{sid}.bn1.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight": "v.blk.{bid}.{sid}.conv_pwl.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight": "v.blk.{bid}.{sid}.bn2.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight": "v.blk.{bid}.{sid}.dw_start.conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight": "v.blk.{bid}.{sid}.dw_start.bn.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight": "v.blk.{bid}.{sid}.dw_mid.conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight": "v.blk.{bid}.{sid}.dw_mid.bn.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight": "v.blk.{bid}.{sid}.pw_exp.conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight": "v.blk.{bid}.{sid}.pw_exp.bn.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight": "v.blk.{bid}.{sid}.pw_proj.conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight": "v.blk.{bid}.{sid}.pw_proj.bn.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma": "v.blk.{bid}.{sid}.layer_scale.gamma", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight": "v.blk.{bid}.{sid}.attn.query.proj.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight": "v.blk.{bid}.{sid}.attn.key.proj.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight": "v.blk.{bid}.{sid}.attn.value.proj.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight": "v.blk.{bid}.{sid}.attn.output.proj.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight": "v.blk.{bid}.{sid}.attn.key.down_conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight": "v.blk.{bid}.{sid}.attn.key.norm.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.blk.{bid}.{sid}.attn.value.down_conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight": "v.blk.{bid}.{sid}.attn.value.norm.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.blk.{bid}.{sid}.norm.weight", - } - - def __init__(self, *args, **kwargs): - # Parent init will call find_hparam which now returns 0 for empty keys - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - self.hparams_vision["n_layers"] = 128 # fake value for audio encoder, vision encoder doesn't use it - self.hparams_vision["intermediate_size"] = self.hparams_vision.get("intermediate_size", 2048) * 4 - self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_attention_heads", 8) - - # MobileNetV5 does not use image_mean/std - self.preprocessor_config["image_mean"] = [0.0 ,0.0 , 0.0] - self.preprocessor_config["image_std"] = [1.0 ,1.0 ,1.0] - self.hparams_vision["image_size"] = self.preprocessor_config.get( - "size", {"height": 768, "width": 768} - )["height"] - - # Image sequence length (256 tokens = 16x16 for Gemma3n) - image_seq_length = self.preprocessor_config.get("image_seq_length", 256) - image_size = self.hparams_vision["image_size"] - self.hparams_vision["patch_size"] = image_size // image_seq_length - - # remap audio hparams - assert self.hparams_audio is not None - self.hparams_audio["n_layers"] = self.hparams_audio["conf_num_hidden_layers"] - self.hparams_audio["num_attention_heads"] = self.hparams_audio["conf_num_attention_heads"] - self.hparams_audio["feat_in"] = self.hparams_audio["input_feat_size"] - self.hparams_audio["intermediate_size"] = self.hparams_audio.get("intermediate_size", 6144) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # vision params - self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA3NV) - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) - - # audio params - assert self.hparams_audio is not None - self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA3NA) - self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) - self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - # Force quantization settings for specific tensor types - if "input_projection" in name or "input_proj" in name: - return gguf.GGMLQuantizationType.F16 - if ".embeddings." in name or "stem" in name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - def custom_map(self, name: str) -> str: - """Parses names like model.vision_tower.timm_model.blocks.1.2.suffix and applies template mapping.""" - parts = name.split(".") - # MobileNet blocks have at least 7 parts: model, vision_tower, timm_model, blocks, bid, sid, and suffix - if len(parts) >= 7: - bid, sid = parts[4], parts[5] - suffix = ".".join(parts[6:]) - template = f"model.vision_tower.timm_model.blocks.{{bid}}.{{sid}}.{suffix}" - if template in self.block_tensor_mapping: - return self.block_tensor_mapping[template].format(bid=bid, sid=sid) - - raise ValueError(f"Unknown name: {name}") - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if (ConformerAudioModel.is_audio_tensor(name)): - name = name.replace("model.audio_tower.conformer.", "conformer.layers.") - yield from super().modify_tensors(data_torch, name, bid) - - # Gemma3n uses - # - model.embed_vision.* for projection layers - # - model.vision_tower.* for vision encoder - # Skip non-vision tensors - if not (name.startswith("model.embed_vision.") or name.startswith("model.vision_tower.")): - return - - if name.startswith("model.vision_tower.timm_model.blocks."): - # Double-indexed block tensors through custom logic - yield (self.custom_map(name), data_torch) - return - else: - # Route non-repeating (conv_stem, msfa, embedding, etc.) and un-catched through tensor_mapping.py - new_name = self.map_tensor_name(name) - - if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"): - data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1] - - yield from ModelBase.modify_tensors(self, data_torch, new_name, bid) - - -@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration") -class Gemma3NModel(Gemma3Model): - model_arch = gguf.MODEL_ARCH.GEMMA3N - - _altup_proj: list[Tensor] = [] - _altup_unembd: list[Tensor] = [] - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs" - self._altup_proj = [ - torch.Tensor(), # to be replaced - torch.Tensor(), # to be replaced - torch.Tensor(), # to be replaced - ] - self._altup_unembd = [ - torch.Tensor(), # to be replaced - torch.Tensor(), # to be replaced - torch.Tensor(), # to be replaced - ] - - def norm_shift(self, name: str) -> float: - del name - return 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code - - def set_vocab(self): - # For Gemma3n multimodal models, we need the FULL vocab_size (262400) - # which includes special tokens from 262144-262399 for vision/audio. - # The vocab_size_per_layer_input (262144) is only the embedding size per layer. - # Temporarily override the hparams lookup order to prioritize vocab_size. - - # Store original vocab_size_per_layer_input if it exists - vocab_size_per_layer_input = self.hparams.get("vocab_size_per_layer_input") - - # Temporarily remove vocab_size_per_layer_input to force using vocab_size - if vocab_size_per_layer_input is not None: - del self.hparams["vocab_size_per_layer_input"] - - # Call parent set_vocab which will now use vocab_size (262400) - super().set_vocab() - - # Restore vocab_size_per_layer_input for later use - if vocab_size_per_layer_input is not None: - self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"]) - self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"]) - self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"]) - self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"]) - - activation_sparsity_scale = [] - for s in self.hparams["activation_sparsity_pattern"]: - normal_dist = torch.distributions.normal.Normal(0, 1) - std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32)) - activation_sparsity_scale.append(std_multiplier.item()) - self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale) - - sliding_window_pattern = [] - for t in self.hparams["layer_types"]: - sliding_window_pattern.append(t == "sliding_attention") - self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) - - def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None: - has_all = all(m.numel() > 0 for m in matrices) - if not has_all: - return None - else: - return torch.stack(matrices, dim=0) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.endswith("_scale"): - name = name + ".weight" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # TODO: implement self.prediction_coefs.weight.clamp_(...) - - # Pad token embeddings for vision/audio special tokens (262144-262399) - if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name: - # Move to CPU to avoid meta device issues during padding - data_torch = data_torch.to(device="cpu") - - vocab_size = self.hparams.get("vocab_size", 262400) - current_size = data_torch.shape[0] # First dimension is vocab_size - - if current_size < vocab_size: - # Pad with zeros for vision/audio tokens (they get embeddings from vision tower) - padding_size = vocab_size - current_size - tensor_type = "per-layer embeddings" if "per_layer" in name else "token embeddings" - logger.info(f"Padding {tensor_type} shape {list(data_torch.shape)} from {current_size} to {vocab_size} (adding {padding_size} vision/audio token slots)") - - # Create padding with zeros (vision tokens won't use these embeddings) - padding = torch.zeros((padding_size, data_torch.shape[1]), dtype=data_torch.dtype, device=data_torch.device) - data_torch = torch.cat([data_torch, padding], dim=0) - - # Continue with normal processing - yield from ModelBase.modify_tensors(self, data_torch, name, bid) - return - - if "altup_unembed_projections" in name: - data_torch = data_torch.to(device="cpu") - # altup_unembed matrices are [hidden_size, hidden_size], NOT vocab-based - # They should NOT be padded - if ".0." in name: - self._altup_unembd[0] = data_torch - elif ".1." in name: - self._altup_unembd[1] = data_torch - elif ".2." in name: - self._altup_unembd[2] = data_torch - else: - raise ValueError(f"Unknown name: {name}") - out = self._stack_matrices(self._altup_unembd) - if out is not None: - yield from ModelBase.modify_tensors(self, out, "model.altup_unembed_projections.weight", bid) - return - else: - return - - if "altup_projections" in name: - data_torch = data_torch.to(device="cpu") - if ".0." in name: - self._altup_proj[0] = data_torch - elif ".1." in name: - self._altup_proj[1] = data_torch - elif ".2." in name: - self._altup_proj[2] = data_torch - else: - raise ValueError(f"Unknown name: {name}") - out = self._stack_matrices(self._altup_proj) - if out is not None: - yield from ModelBase.modify_tensors(self, out, "model.altup_projections.weight", bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Gemma4ForConditionalGeneration") -class Gemma4Model(Gemma3Model): - model_arch = gguf.MODEL_ARCH.GEMMA4 - - def norm_shift(self, name: str) -> float: - del name # unused - return 0.0 - - def set_vocab(self): - vocab = gguf.LlamaHfVocab(self.dir_model) - tokens = [] - scores = [] - toktypes = [] - visible_tokens = {"<|channel>", "", "<|tool_call>", "", "<|tool_response>", "", "<|\"|>"} - - for text, score, toktype in vocab.all_tokens(): - tokens.append(text) - scores.append(score) - text_str = text.decode() - if text_str in visible_tokens: - # always render these tokens, so that the chat parser can read them - toktypes.append(gguf.TokenType.USER_DEFINED) - logger.info(f"Token '{text_str}' is set to USER_DEFINED") - else: - toktypes.append(toktype) - - assert len(tokens) == vocab.vocab_size - - self.gguf_writer.add_tokenizer_model("gemma4") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - self.gguf_writer.add_add_space_prefix(False) - self.gguf_writer.add_add_bos_token(True) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - num_kv_shared_layers = self.hparams["num_kv_shared_layers"] - self.gguf_writer.add_shared_kv_layers(num_kv_shared_layers) - - # per-layer embedding is optional - n_pl_embd = self.hparams.get("hidden_size_per_layer_input") or 0 - self.gguf_writer.add_embedding_length_per_layer_input(n_pl_embd) - - swa_layers = [t == "sliding_attention" for t in self.hparams["layer_types"]] - self.gguf_writer.add_sliding_window_pattern(swa_layers) - - head_dim_full = self.hparams["global_head_dim"] - head_dim_swa = self.hparams["head_dim"] - # correct the head dim for global/swa layers - self.gguf_writer.add_key_length(head_dim_full) - self.gguf_writer.add_value_length(head_dim_full) - self.gguf_writer.add_key_length_swa(head_dim_swa) - self.gguf_writer.add_value_length_swa(head_dim_swa) - - expert_intermediate_size = self.find_hparam(["expert_intermediate_size", "moe_intermediate_size"]) - if expert_intermediate_size is not None: - self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size) - - # if use_double_wide_mlp is set, we need to adjust the value for kv shared layers - use_double_wide_mlp = self.hparams.get("use_double_wide_mlp", False) - first_kv_shared_layer_idx = self.block_count - num_kv_shared_layers - if use_double_wide_mlp: - n_ff = self.hparams["intermediate_size"] - n_ff_arr = [n_ff if il < first_kv_shared_layer_idx else n_ff * 2 for il in range(self.block_count)] - self.gguf_writer.add_feed_forward_length(n_ff_arr) - - # handle num_global_key_value_heads - num_key_value_heads_full = self.hparams.get("num_global_key_value_heads") - num_key_value_heads_swa = self.hparams.get("num_key_value_heads") - if num_key_value_heads_full is not None and num_key_value_heads_swa is not None: - value_arr = [num_key_value_heads_swa if is_swa else num_key_value_heads_full for is_swa in swa_layers] - self.gguf_writer.add_head_count_kv(value_arr) - - # handle n_rot differently for global vs swa layers - partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0) - n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors - n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa) - self.gguf_writer.add_rope_dimension_count(n_rot_full) - self.gguf_writer.add_rope_dimension_count_swa(n_rot_swa) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - # full layer uses "proportional" rope with partial_rotary_factor=0.25 - # the expected ordering is cc000000ss000000 (c = cos, s = sin, 0 = unrotated), - # but ggml neox only supports ccss000000000000, and we cannot rearrange the head because that will break use_alternative_attention - # solution is to set specific freq_factors for the unrotated dims - - # IMPORTANT: this ROPE_FREQS tensor is ONLY used by the full_attention layers - rope_params_full = self.hparams["rope_parameters"]["full_attention"] - assert rope_params_full["rope_type"] == "proportional" - head_dim_full = (self.hparams["global_head_dim"]) - partial_rotary_factor_full = rope_params_full["partial_rotary_factor"] - n_rot_full = int(head_dim_full * partial_rotary_factor_full / 2) - n_unrot_full = int(head_dim_full / 2) - n_rot_full - values = [1.0] * n_rot_full + [1e30] * n_unrot_full - rope_freqs_full = torch.tensor(values, dtype=torch.float32) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), rope_freqs_full) - - def _generate_nvfp4_tensors(self): - # Gemma-4 stores a per-layer router.per_expert_scale ([n_expert]) that scales - # each expert's contribution. It's mathematically equivalent to a per-expert - # scalar on the down_proj output, which is exactly where ffn_down_exps_s is - # applied at inference. Fold it into each expert's NVFP4 weight_scale_2 so the - # existing NVFP4 path produces the right scales. - n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0 - for name in [n for n in self.model_tensors if n.endswith(".router.per_expert_scale")]: - bid_match = re.search(r"\.layers\.(\d+)\.", name) - if bid_match is None: - continue - bid = bid_match.group(1) - prefix = name[: name.index(f".layers.{bid}.") + len(f".layers.{bid}.")] - w2_targets = [f"{prefix}experts.{e}.down_proj.weight_scale_2" for e in range(n_experts)] - present = [w2 in self.model_tensors for w2 in w2_targets] - if not any(present): - continue - assert all(present), f"layer {bid}: partial NVFP4 quantization across experts" - r = self.model_tensors.pop(name) - for e, w2 in enumerate(w2_targets): - s = self.model_tensors[w2] - self.model_tensors[w2] = lambda s=s, r=r, i=e: s() * r()[i] - super()._generate_nvfp4_tensors() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.endswith("per_dim_scale") or name.endswith("layer_scalar"): - name = name + ".weight" - if ".experts." in name and not name.endswith((".weight", ".weight_scale", ".weight_scale_2", ".input_scale")): - name += ".weight" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.endswith("router.scale"): - name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid, ".scale") - yield (name, data_torch) - return - if ".per_expert_scale" in name: - # convert per-expert scale to FFN down scale - name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN_EXP, bid, ".scale") - yield (name, data_torch) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Gemma4ForConditionalGeneration") -class Gemma4VisionAudioModel(MmprojModel): - has_audio_encoder = True - has_vision_encoder = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - self.hparams_vision["image_size"] = 224 # unused, but set to avoid error - - # remap audio hparams - if self.hparams_audio: - self.hparams_audio["feat_in"] = self.hparams_audio.get("input_feat_size", 128) - self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4 - else: - self.has_audio_encoder = False - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # vision params - self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4V) - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) - - # audio params - if self.hparams_audio: - self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A) - self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) - self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) - - def is_audio_tensor(self, name: str) -> bool: - return "audio_tower" in name or "embed_audio" in name - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if self.is_audio_tensor(name): - if ".conv" in name or "_conv" in name and ".weight" in name: - return gguf.GGMLQuantizationType.F32 - if "position_embedding_table" in name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - if len(data_torch.shape) == 0: - # convert scalar tensors (input/output_mix/max) to 1D tensors - data_torch = data_torch.unsqueeze(0) - - if self.is_audio_tensor(name): - assert self.hparams_audio is not None - name = name.replace("model.audio_tower.", "conformer.") - name = name.replace(".linear.", ".") - if name.endswith("per_dim_key_scale") or name.endswith("per_dim_scale"): - name = name + ".weight" - data_torch = torch.nn.functional.softplus(data_torch) - if "lconv1d.depthwise_conv1d" in name and name.endswith(".weight"): - assert data_torch.shape[1] == 1 - data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2]) - mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min")) - yield (mapped_name, data_torch) - - else: - name = name.replace("model.vision_tower.encoder.", "vision_model.model.") - name = name.replace(".linear.weight", ".weight") - if name.endswith("layer_scalar") or name.endswith("position_embedding_table"): - name = name + ".weight" - if name.endswith("patch_embedder.input_proj.weight"): - n_embd, ksize_sq_c = data_torch.shape - patch_size = int((ksize_sq_c // 3) ** 0.5) - data_torch = data_torch.reshape(n_embd, patch_size, patch_size, 3) - data_torch = data_torch.permute(0, 3, 1, 2).contiguous() - mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min")) - yield (mapped_name, data_torch) - - -@ModelBase.register("Starcoder2ForCausalLM") -class StarCoder2Model(TextModel): - model_arch = gguf.MODEL_ARCH.STARCODER2 - - -@ModelBase.register("Rwkv6ForCausalLM") -class Rwkv6Model(TextModel): - model_arch = gguf.MODEL_ARCH.RWKV6 - - def set_vocab(self): - self._set_vocab_rwkv_world() - - def set_gguf_parameters(self): - head_size = self.hparams["head_size"] - hidden_size = self.hparams["hidden_size"] - layer_norm_eps = self.hparams["layer_norm_epsilon"] - rescale_every_n_layers = self.hparams["rescale_every"] - intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32) - time_mix_extra_dim = 64 if hidden_size == 4096 else 32 - time_decay_extra_dim = 128 if hidden_size == 4096 else 64 - - # RWKV isn't context limited - self.gguf_writer.add_context_length(1048576) - self.gguf_writer.add_embedding_length(hidden_size) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_layer_norm_eps(layer_norm_eps) - self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers) - self.gguf_writer.add_wkv_head_size(head_size) - self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) - self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) - self.gguf_writer.add_feed_forward_length(intermediate_size) - self.gguf_writer.add_file_type(self.ftype) - - # required by llama.cpp, unused - self.gguf_writer.add_head_count(0) - - lerp_weights: dict[int, dict[str, Tensor]] = {} - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - new_name = self.map_tensor_name(name) - - if not (new_name.endswith(".weight") or new_name.endswith(".bias")): - new_name += ".weight" - - if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"): - data_torch = data_torch.transpose(0, 1) - - if new_name.endswith("time_mix_w2.weight"): - data_torch = data_torch.permute(0, 2, 1) - - if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name: - data_torch = data_torch.squeeze() - - try: - rescale_every_n_layers = self.hparams["rescale_every"] - if rescale_every_n_layers > 0: - if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): - data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers)) - except KeyError: - pass - - # concat time_mix_lerp weights to reduce some cpu overhead - # also reduces the number of tensors in the model - if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name: - try: - self.lerp_weights[bid][new_name] = data_torch - except KeyError: - self.lerp_weights[bid] = {new_name: data_torch} - if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]): - new_name = f"blk.{bid}.time_mix_lerp_fused.weight" - data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1) - yield (new_name, data) - return - - yield (new_name, data_torch) - - -@ModelBase.register("RWKV6Qwen2ForCausalLM") -class RWKV6Qwen2Model(Rwkv6Model): - model_arch = gguf.MODEL_ARCH.RWKV6QWEN2 - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - num_attention_heads = self.hparams["num_attention_heads"] - num_key_value_heads = self.hparams["num_key_value_heads"] - hidden_size = self.hparams["hidden_size"] - head_size = hidden_size // num_attention_heads - rms_norm_eps = self.hparams["rms_norm_eps"] - intermediate_size = self.hparams["intermediate_size"] - time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32) - time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64) - - # RWKV isn't context limited - self.gguf_writer.add_context_length(1048576) - self.gguf_writer.add_embedding_length(hidden_size) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_wkv_head_size(head_size) - self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) - self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) - self.gguf_writer.add_feed_forward_length(intermediate_size) - self.gguf_writer.add_file_type(self.ftype) - - # special parameters for time_mixing in RWKV6QWEN2 - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_token_shift_count(1) - # RWKV6QWEN2 use grouped key/value like GQA - self.gguf_writer.add_head_count_kv(num_key_value_heads) - - # required by llama.cpp, unused - self.gguf_writer.add_head_count(0) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - for new_name, data in super().modify_tensors(data_torch, name, bid): - if "time_mix_w1" in new_name or "time_mix_w2" in new_name: - data = data.view(5, -1, data.shape[-1]) - # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg - # permute them here to avoid code changes - data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1]) - if "w2" in new_name: - data = data.view(5, -1, data.shape[-1]) - yield (new_name, data) - continue - yield (new_name, data) - - -@ModelBase.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") -class Rwkv7Model(TextModel): - model_arch = gguf.MODEL_ARCH.RWKV7 - - def set_vocab(self): - self._set_vocab_rwkv_world() - - def calc_lora_rank(self, hidden_size, exponent, multiplier): - return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32 - - def set_gguf_parameters(self): - try: - head_size = self.hparams["head_size"] - layer_norm_eps = self.hparams["layer_norm_epsilon"] - except KeyError: - head_size = self.hparams["head_dim"] - layer_norm_eps = self.hparams["norm_eps"] - hidden_size = self.hparams["hidden_size"] - intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else (hidden_size * 4) - - # ICLR: In-Context-Learning-Rate - try: - lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) - lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) - lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3) - lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6) - except KeyError: - lora_rank_decay = self.hparams["decay_low_rank_dim"] if self.hparams["decay_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) - lora_rank_iclr = self.hparams["a_low_rank_dim"] if self.hparams["a_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) - lora_rank_value_residual_mix = self.hparams["v_low_rank_dim"] if self.hparams["v_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3) - lora_rank_gate = self.hparams["gate_low_rank_dim"] if self.hparams["gate_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6) - - # RWKV isn't context limited - self.gguf_writer.add_context_length(1048576) - self.gguf_writer.add_embedding_length(hidden_size) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_layer_norm_eps(layer_norm_eps) - self.gguf_writer.add_wkv_head_size(head_size) - self.gguf_writer.add_decay_lora_rank(lora_rank_decay) - self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) - self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) - self.gguf_writer.add_gate_lora_rank(lora_rank_gate) - self.gguf_writer.add_feed_forward_length(intermediate_size) - self.gguf_writer.add_file_type(self.ftype) - - # required by llama.cpp, unused - self.gguf_writer.add_head_count(0) - - lerp_weights: dict[int, dict[str, Tensor]] = {} - lora_needs_transpose: bool = True - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # unify tensor names here to make life easier - name = name.replace("blocks", "layers").replace("ffn", "feed_forward") - name = name.replace("self_attn", "attention").replace("attn", "attention") - name = name.replace("time_mixer.", "") - - name = name.replace("feed_forward_norm", "ln2") - name = name.replace("g_norm", "ln_x") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # lora layer names in fla-hub's impl - if "_lora.lora" in name: - self.lora_needs_transpose = False - name = name.replace("_lora.lora.0.weight", "1.weight") - name = name.replace("_lora.lora.2.weight", "2.weight") - name = name.replace("_lora.lora.2.bias", "0.weight") - - if "attention.v" in name and "value" not in self.map_tensor_name(name) and bid == 0: - # some models have dummy v0/v1/v2 on first layer while others don't - # ignore them all since they are not used - return - - wkv_has_gate = self.hparams.get("wkv_has_gate", True) - lerp_list = ["r", "w", "k", "v", "a", "g"] if wkv_has_gate else ["r", "w", "k", "v", "a"] - - if bid is not None and "attention.x_" in name: - if "attention.x_x" in name: - # already concatenated - new_name = f"blk.{bid}.time_mix_lerp_fused.weight" - data = data_torch.reshape(len(lerp_list), 1, 1, -1) - yield (new_name, data) - else: - try: - self.lerp_weights[bid][name] = data_torch - except KeyError: - self.lerp_weights[bid] = {name: data_torch} - if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in lerp_list): - new_name = f"blk.{bid}.time_mix_lerp_fused.weight" - data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"] for i in lerp_list], dim=0) - yield (new_name, data) - return - else: - data_torch = data_torch.squeeze() - new_name = self.map_tensor_name(name) - - if not (new_name.endswith(".weight") or new_name.endswith(".bias")): - new_name += ".weight" - - if self.lora_needs_transpose and any( - new_name.endswith(t) for t in [ - "time_mix_w1.weight", "time_mix_w2.weight", - "time_mix_a1.weight", "time_mix_a2.weight", - "time_mix_v1.weight", "time_mix_v2.weight", - "time_mix_g1.weight", "time_mix_g2.weight", - ] - ): - data_torch = data_torch.transpose(0, 1) - - if 'r_k' in new_name: - data_torch = data_torch.flatten() - - if bid == 0 and "time_mix_a" in new_name: - # dummy v0/v1/v2 on first layer - # easiest way to make llama happy - yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch) - - yield (new_name, data_torch) - - -@ModelBase.register("RwkvHybridForCausalLM") -class ARwkv7Model(Rwkv7Model): - model_arch = gguf.MODEL_ARCH.ARWKV7 - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - hidden_size = self.hparams["hidden_size"] - head_size = self.hparams["head_size"] - rms_norm_eps = self.hparams["rms_norm_eps"] - intermediate_size = self.hparams["intermediate_size"] - wkv_has_gate = self.hparams["wkv_has_gate"] - assert self.hparams["wkv_version"] == 7 - - # ICLR: In-Context-Learning-Rate - lora_rank_decay = 64 - lora_rank_iclr = 64 - lora_rank_value_residual_mix = 32 - lora_rank_gate = 128 if wkv_has_gate else 0 - - # RWKV isn't context limited - self.gguf_writer.add_context_length(1048576) - self.gguf_writer.add_embedding_length(hidden_size) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_wkv_head_size(head_size) - self.gguf_writer.add_decay_lora_rank(lora_rank_decay) - self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) - self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) - self.gguf_writer.add_gate_lora_rank(lora_rank_gate) - self.gguf_writer.add_feed_forward_length(intermediate_size) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_token_shift_count(1) - - # required by llama.cpp, unused - self.gguf_writer.add_head_count(0) - - -@ModelBase.register("MaincoderForCausalLM") -class MaincoderModel(TextModel): - model_arch = gguf.MODEL_ARCH.MAINCODER - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - if (head_dim := self.hparams.get("head_dim")) is not None: - self.gguf_writer.add_rope_dimension_count(head_dim) - - -@ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") -class MambaModel(TextModel): - model_arch = gguf.MODEL_ARCH.MAMBA - - def __init__(self, dir_model: Path, *args, **kwargs): - # Avoid using AutoConfig for hparams - hparams = kwargs.pop("hparams", None) - if hparams is None: - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - super().__init__(dir_model, *args, hparams=hparams, **kwargs) - - def set_vocab(self): - vocab_size = self.hparams["vocab_size"] - # Round vocab size to next multiple of 8 - pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) - # pad using ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - vocab_size = -(vocab_size // -pad_vocab) * pad_vocab - self.hparams["vocab_size"] = vocab_size - - if (self.dir_model / "tokenizer.json").is_file(): - self._set_vocab_gpt2() - elif (self.dir_model / "tokenizer.model").is_file(): - self._set_vocab_sentencepiece() - else: - # Use the GPT-NeoX tokenizer when no tokenizer files are present - self._set_vocab_builtin("gpt-neox", vocab_size) - - def set_gguf_parameters(self): - d_model = self.find_hparam(["hidden_size", "d_model"]) - d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 - d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model - d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 - # ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 - dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) - rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 - use_dt_b_c_norm = False - # For falconmamba we do apply RMS norm on B / DT and C layers - if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",): - use_dt_b_c_norm = True - # Fail early for models which don't have a block expansion factor of 2 - assert d_inner == 2 * d_model - - self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default - self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading - self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_ssm_conv_kernel(d_conv) - self.gguf_writer.add_ssm_inner_size(d_inner) - self.gguf_writer.add_ssm_state_size(d_state) - self.gguf_writer.add_ssm_time_step_rank(dt_rank) - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers - self.gguf_writer.add_file_type(self.ftype) - - _tok_embd = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) - tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) - - new_name = self.map_tensor_name(name) - - if name.endswith(".A_log"): - logger.debug("A_log --> A ==> " + new_name) - data_torch = -torch.exp(data_torch) - - # [4 1 8192 1] -> [4 8192 1 1] - if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid): - data_torch = data_torch.squeeze() - - # assuming token_embd.weight is seen before output.weight - if self._tok_embd is not None and new_name == output_name: - if torch.equal(self._tok_embd, data_torch): - logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") - return - elif new_name == tok_embd_name: - self._tok_embd = data_torch - - yield from super().modify_tensors(data_torch, new_name, bid) - - -@ModelBase.register("Mamba2ForCausalLM") -class Mamba2Model(TextModel): - model_arch = gguf.MODEL_ARCH.MAMBA2 - - def __init__(self, dir_model: Path, *args, **kwargs): - # Avoid using AutoConfig for hparams - # It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1 - hparams = kwargs.pop("hparams", None) - if hparams is None: - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - if "llm_config" in hparams: - hparams["text_config"] = hparams["llm_config"] - super().__init__(dir_model, *args, hparams=hparams, **kwargs) - self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"]) - self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model - self.n_group = self.find_hparam(["n_groups"], optional=True) or 1 - - def set_vocab(self): - vocab_size = self.hparams["vocab_size"] - # Round vocab size to next multiple of 16 - pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16) - # pad using ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - vocab_size = -(vocab_size // -pad_vocab) * pad_vocab - self.hparams["vocab_size"] = vocab_size - - if (self.dir_model / "tokenizer.model").is_file(): - self._set_vocab_sentencepiece() - elif (self.dir_model / "tokenizer.model.v3").is_file(): - # mamba-codestral - raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}") - elif (self.dir_model / "tokenizer.json").is_file(): - self._set_vocab_gpt2() - else: - # Use the GPT-NeoX tokenizer when no tokenizer files are present - self._set_vocab_builtin("gpt-neox", vocab_size) - - def set_gguf_parameters(self): - d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 - d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128 - head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64 - - rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 - - # Fail early for models which don't have a block expansion factor of 2 - # TODO: does this really matter? - # skip the assertion for FalconH1 Model - if self.model_arch != gguf.MODEL_ARCH.FALCON_H1: - assert self.d_inner == 2 * self.d_model - assert self.d_inner % head_dim == 0 - - self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default - self.gguf_writer.add_embedding_length(self.d_model) - self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading - self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_ssm_conv_kernel(d_conv) - self.gguf_writer.add_ssm_inner_size(self.d_inner) - self.gguf_writer.add_ssm_state_size(d_state) - self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim) - self.gguf_writer.add_ssm_group_count(self.n_group) - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_file_type(self.ftype) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith(("model.backbone", "model.lm_head")): - # map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2 - name = name.removeprefix("model.") - - if name.endswith(".dt_bias"): - name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - new_name = self.map_tensor_name(name) - - if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid): - data_torch = data_torch.squeeze() - elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [ - gguf.MODEL_TENSOR.SSM_A, - gguf.MODEL_TENSOR.SSM_D, - ]): - # unsqueeze A to use similar shape semantics as Mamba-1 - # (D is also unsqueezed, but for more straightforward broadcast internally) - data_torch = data_torch.reshape((*data_torch.shape, 1)) - elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid): - data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group)) - - if name.endswith(".A_log"): - logger.debug("A_log --> A ==> " + new_name) - data_torch = -torch.exp(data_torch) - - yield (new_name, data_torch) - - -@ModelBase.register("JambaForCausalLM") -class JambaModel(TextModel): - model_arch = gguf.MODEL_ARCH.JAMBA - - def set_vocab(self): - if (self.dir_model / "tokenizer.model").is_file(): - self._set_vocab_sentencepiece() - else: - self._set_vocab_llama_hf() - self.gguf_writer.add_add_space_prefix(False) - - def set_gguf_parameters(self): - d_model = self.find_hparam(["hidden_size", "mamba_d_model"]) - d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4 - d_inner = self.hparams["mamba_expand"] * d_model - d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16 - # ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 - dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16) - rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6 - n_kv_head = self.hparams["num_key_value_heads"] - attn_offset = self.hparams["attn_layer_offset"] - attn_period = self.hparams["attn_layer_period"] - n_kv_vec = [0 for _ in range(attn_offset)] + [ - n_kv_head if (i - attn_offset) % attn_period == 0 else 0 for i in range(attn_offset, self.block_count) - ] - - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"])) - self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(n_kv_vec) - self.gguf_writer.add_ssm_conv_kernel(d_conv) - self.gguf_writer.add_ssm_inner_size(d_inner) - self.gguf_writer.add_ssm_state_size(d_state) - self.gguf_writer.add_ssm_time_step_rank(dt_rank) - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_expert_count(self.find_hparam(["num_local_experts", "num_experts"])) - self.gguf_writer.add_expert_used_count(self.find_hparam(["num_experts_per_tok", "num_experts_per_token"])) - self.gguf_writer.add_file_type(self.ftype) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - - # Mini-Jamba - name = name.replace(".moe.", ".feed_forward.") - if bid is not None: - moe_offset = self.hparams["expert_layer_offset"] - moe_period = self.hparams["expert_layer_period"] - - if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0): - name = name.replace(".experts.0.", ".") - - # process the experts separately - if ".feed_forward.experts." in name: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - - # merge the experts into a single 3d tensor - for wid in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - # using the same merged name as qwen2moe - merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight" - - new_name = self.map_tensor_name(merged_name) - - yield new_name, data_torch - return - - new_name = self.map_tensor_name(name) - - if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid): - data_torch = data_torch.squeeze() - - if name.endswith(".A_log"): - logger.debug("A_log --> A ==> " + new_name) - data_torch = -torch.exp(data_torch) - - yield (new_name, data_torch) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("CohereForCausalLM") -class CommandR2Model(TextModel): - model_arch = gguf.MODEL_ARCH.COMMAND_R - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # max_position_embeddings = 8192 in config.json but model was actually - # trained on 128k context length - # aya-23 models don't have model_max_length specified - self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"]) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - - -@ModelBase.register("Cohere2ForCausalLM") -class Cohere2Model(TextModel): - model_arch = gguf.MODEL_ARCH.COHERE2 - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - - rotary_pct = self.hparams["rotary_pct"] - hidden_size = self.hparams["hidden_size"] - num_attention_heads = self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads))) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Cohere2 runtime in llama.cpp expects no bias tensors; - # the actual weight only contains 0-value tensors as bias, we can skip them - if name.endswith(".bias"): - if torch.any(data_torch != 0): - raise ValueError(f"Bias tensor {name!r} is not zero.") - logger.debug(f"Skipping bias tensor {name!r} for Cohere2 conversion.") - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("OlmoForCausalLM") -@ModelBase.register("OLMoForCausalLM") -class OlmoModel(TextModel): - model_arch = gguf.MODEL_ARCH.OLMO - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_layer_norm_eps(1e-5) - clip_qkv = self.hparams.get("clip_qkv") - if clip_qkv is not None: - self.gguf_writer.add_clamp_kqv(clip_qkv) - - # Same as super class, but permuting q_proj, k_proj - # Copied from: LlamaModel - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith("q_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith("k_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("SeedOssForCausalLM") -class SeedOssModel(TextModel): - model_arch = gguf.MODEL_ARCH.SEED_OSS - - -@ModelBase.register("Olmo2ForCausalLM") -@ModelBase.register("Olmo3ForCausalLM") -class Olmo2Model(TextModel): - model_arch = gguf.MODEL_ARCH.OLMO2 - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - if "sliding_window" in self.hparams: - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - - sliding_window_pattern = [] - if "layer_types" in self.hparams: - sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]] - else: - # Olmo2 does not use sliding window attention. - # Olmo3 defaults to using sliding window for all layers except every 4th. - for i in range(self.hparams["num_hidden_layers"]): - sliding_window_pattern.append((i + 1) % 4 != 0) - - self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) - - -@ModelBase.register("OlmoeForCausalLM") -class OlmoeModel(TextModel): - model_arch = gguf.MODEL_ARCH.OLMOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_layer_norm_rms_eps(1e-5) - - _experts: list[dict[str, Tensor]] | None = None - - # Copied from: Qwen2MoeModel - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - # Copied from: Qwen2MoeModel - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("JinaBertModel", "JinaBertForMaskedLM") -class JinaBertV2Model(BertModel): - model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 - - def set_vocab(self): - tokenizer_class = 'BertTokenizer' - with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: - tokenizer_class = json.load(f)['tokenizer_class'] - - if tokenizer_class == 'BertTokenizer': - super().set_vocab() - elif tokenizer_class == 'RobertaTokenizer': - self._set_vocab_gpt2() - self.gguf_writer.add_token_type_count(2) - else: - raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') - - -@ModelBase.register("OpenELMForCausalLM") -class OpenELMModel(TextModel): - model_arch = gguf.MODEL_ARCH.OPENELM - - @staticmethod - def _make_divisible(v: float | int, divisor: int) -> int: - # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38 - new_v = max(divisor, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. - if new_v < 0.9 * v: - new_v += divisor - return new_v - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - ffn_multipliers: list[float] = self.hparams["ffn_multipliers"] - ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"] - self._n_embd: int = self.hparams["model_dim"] - self._num_kv_heads: list[int] = self.hparams["num_kv_heads"] - self._num_query_heads: list[int] = self.hparams["num_query_heads"] - self._ffn_dims: list[int] = [ - OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor) - for multiplier in ffn_multipliers - ] - assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) - assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int) - - # Uses the tokenizer from meta-llama/Llama-2-7b-hf - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"]) - - def set_gguf_parameters(self): - n_embd = self._n_embd - head_dim = self.hparams["head_dim"] - rot_pct = 1.0 - assert self.block_count == len(self._num_kv_heads) - assert self.block_count == len(self._num_query_heads) - assert self.block_count == len(self._ffn_dims) - - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.hparams["max_context_length"]) - self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(self._ffn_dims) - self.gguf_writer.add_head_count(self._num_query_heads) - self.gguf_writer.add_head_count_kv(self._num_kv_heads) - self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"]) - # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30 - self.gguf_writer.add_layer_norm_rms_eps(1e-6) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim)) - self.gguf_writer.add_key_length(head_dim) - self.gguf_writer.add_value_length(head_dim) - self.gguf_writer.add_file_type(self.ftype) - - def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: - if "n_layers" in keys: - return self.hparams["num_transformer_layers"] - - return super().find_hparam(keys, optional) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - - # split ff - if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight": - ff_dim = self._ffn_dims[bid] - yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim]) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:]) - return - - yield (self.map_tensor_name(name), data_torch) - - -@ModelBase.register("ArcticForCausalLM") -class ArcticModel(TextModel): - model_arch = gguf.MODEL_ARCH.ARCTIC - - def set_vocab(self): - # The reason for using a custom implementation here is that the - # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from - # tokenizer.model and used them as BOS and EOS instead of adding new tokens. - from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - if not tokenizer_path.is_file(): - logger.error(f'Error: Missing {tokenizer_path}') - sys.exit(1) - - # Read the whole vocabulary from the tokenizer.model file - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - # Use the added_tokens_decoder field from tokeniser_config.json as the source - # of information about added/redefined tokens and modify them accordingly. - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - - if "added_tokens_decoder" in tokenizer_config_json: - added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"] - for token_id, token_json in added_tokens_decoder.items(): - token_id = int(token_id) - if token_id >= vocab_size: - logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - token_content = token_json["content"] - token_type = SentencePieceTokenTypes.USER_DEFINED - token_score = -10000.0 - - # Map unk_token to UNKNOWN, other special tokens to CONTROL - # Set the score to 0.0 as in the original tokenizer.model - if ("special" in token_json) and token_json["special"]: - if token_content == tokenizer_config_json["unk_token"]: - token_type = SentencePieceTokenTypes.UNKNOWN - else: - token_type = SentencePieceTokenTypes.CONTROL - token_score = 0.0 - - logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})") - tokens[token_id] = token_content.encode("utf-8") - toktypes[token_id] = token_type - scores[token_id] = token_score - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith("q_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith("k_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - # process the experts separately - if name.find("block_sparse_moe.experts") != -1: - n_experts = self.hparams["num_local_experts"] - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for wid in ["w1", "w2", "w3"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("DeepseekForCausalLM") -class DeepseekModel(TextModel): - model_arch = gguf.MODEL_ARCH.DEEPSEEK - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - if (rope_dim := hparams.get("head_dim")) is None: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - - self.gguf_writer.add_rope_dimension_count(rope_dim) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_weights_scale(1.0) - self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) - self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) - - _experts: list[dict[str, Tensor]] | None = None - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = DeepseekModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head) - - # process the experts separately - if name.find("mlp.experts") != -1: - n_experts = self.hparams["n_routed_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register( - "DeepseekV2ForCausalLM", - "DeepseekV3ForCausalLM", - "KimiVLForConditionalGeneration", - "KimiK25ForConditionalGeneration", - "YoutuForCausalLM", - "YoutuVLForConditionalGeneration", -) -class DeepseekV2Model(TextModel): - model_arch = gguf.MODEL_ARCH.DEEPSEEK2 - - # TODO @ngxson : remove this when we support MTP for deepseek models - skip_mtp = True - - merge_expert = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - hparams: dict = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) - self.origin_hf_arch = hparams.get('architectures', [None])[0] - - # special handling for Deepseek OCR - if self.origin_hf_arch == "DeepseekOCRForCausalLM": - self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR - self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] - self.gguf_writer.add_architecture() - # default jinja template - self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}") - - def set_vocab(self): - try: - self._set_vocab_gpt2() - return - except Exception: - pass - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - tokpre = self.get_vocab_base_pre(tokenizer) - - if tokpre == "kimi-k2": - # Build merges list using the approach similar to HunYuanMoE - merges = [] - vocab = {} - mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute] - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - if len(merged) == 2: - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - - # Build token list - vocab_size = self.hparams["vocab_size"] - special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute] - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} - tokens: list[str] = [] - toktypes: list[int] = [] - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token = reverse_vocab[i] - tokens.append(token) - if i in special_tokens.values(): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_token_merges(merges) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.add_to_gguf(self.gguf_writer) - else: - raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") - - def set_gguf_parameters(self): - is_ocr = (self.model_arch == gguf.MODEL_ARCH.DEEPSEEK2OCR) - - if is_ocr: - self.hparams['rope_theta'] = self.hparams.get('rope_theta', 10000.0) - else: - # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group) - self.hparams["num_key_value_heads"] = 1 - - self.hparams['rms_norm_eps'] = self.hparams.get('rms_norm_eps', 1e-6) - - super().set_gguf_parameters() - hparams = self.hparams - - # first_k_dense_replace: number of leading layers using dense FFN instead of MoE - # For non-MoE models (like Youtu), set to n_layer to use dense FFN for all layers - # For MoE models (like DeepSeek-V2), this is the number of leading non-MoE layers - has_moe = hparams.get("n_routed_experts") is not None - first_k_dense_replace = hparams.get("first_k_dense_replace") - if first_k_dense_replace is None: - # Default: if no MoE, all layers are dense; if MoE, none are dense - first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0 - self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) - kv_lora_rank = hparams.get("kv_lora_rank", 512) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: - self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) - - # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA - if not is_ocr: - self.gguf_writer.add_kv_lora_rank(kv_lora_rank) - self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length(kv_lora_rank) - self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length_mla(hparams["v_head_dim"]) - - # MoE parameters (required by C++ code for DEEPSEEK2 arch) - # For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length - moe_intermediate_size = self.find_hparam(["moe_intermediate_size", "intermediate_size"], optional=False) - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - - if (n_routed_experts := hparams.get("n_routed_experts")) is not None: - self.gguf_writer.add_expert_count(n_routed_experts) - - # expert_shared_count is required by C++ code, default to 0 for non-MoE models - n_shared_experts = hparams.get("n_shared_experts", 0) - self.gguf_writer.add_expert_shared_count(n_shared_experts) - - # When not set, C++ code will use scale_w = false to skip the no-op scaling - if (routed_scaling_factor := hparams.get("routed_scaling_factor")) is not None: - self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) - - if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob: - self.gguf_writer.add_expert_weights_norm(norm_topk_prob) - - self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - - if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None: - # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] - # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul - # ref https://github.com/ggml-org/llama.cpp/pull/17945 - self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # skip lm_head.weight if tie_word_embeddings is True - if self.hparams.get("tie_word_embeddings", False): - if name == "lm_head.weight" or name == "model.lm_head.weight": - logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)") - return - - # skip Multi-Token Prediction (MTP) layers - if self.skip_mtp: - block_count = self.hparams["num_hidden_layers"] - match = re.match(r"model.layers.(\d+)", name) - if match and int(match.group(1)) >= block_count: - return - - # process the experts separately - if self.merge_expert and name.find("mlp.experts") != -1: - n_experts = self.hparams["n_routed_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed - if name.endswith("kv_b_proj.weight"): - name_kb = name.replace("kv_b_proj", "k_b_proj") - name_vb = name.replace("kv_b_proj", "v_b_proj") - - n_head_kv = self.hparams["num_key_value_heads"] - v_head_dim = self.hparams["v_head_dim"] - qk_nope_head_dim = self.hparams["qk_nope_head_dim"] - - assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) - - kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) - k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) - k_b = k_b.transpose(1, 2) - - yield from super().modify_tensors(k_b, name_kb, bid) - yield from super().modify_tensors(v_b, name_vb, bid) - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register( - "Mistral3ForConditionalGeneration", - "Ministral3ForCausalLM", -) -class Mistral3Model(TextModel): - class Ministral3Model(LlamaModel): - model_arch = gguf.MODEL_ARCH.MISTRAL3 - - def set_gguf_parameters(self): - super().set_gguf_parameters() - rope_params = self.rope_parameters - if self.hparams.get("model_type") == "ministral3": - assert rope_params, "ministral3 must have 'rope_parameters' config" - assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'" - self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"]) - self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"]) - - class Mistral4Model(DeepseekV2Model): - model_arch = gguf.MODEL_ARCH.MISTRAL4 - skip_mtp = False # model contains no MTP layers, so no need to skip - merge_expert = False # experts are already stacked as 3D - - def modify_tensors(self, data_torch, name, bid): - if name.endswith(".down_proj") or name.endswith(".gate_up_proj"): - name = name + ".weight" - yield from super().modify_tensors(data_torch, name, bid) - - model_arch = gguf.MODEL_ARCH.MISTRAL3 # unused - impl: TextModel - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.hparams.get("model_type") == "mistral4": - self.impl = Mistral3Model.Mistral4Model(*args, **kwargs) - else: - self.impl = Mistral3Model.Ministral3Model(*args, **kwargs) - - def set_vocab(self): - self.impl.set_vocab() - - def set_gguf_parameters(self): - self.impl.set_gguf_parameters() - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): - yield from self.impl.modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - self.impl.prepare_tensors() - - def write_vocab(self): - self.impl.write_vocab() - - def write(self): - self.impl.write() - - -@ModelBase.register("MiniMaxM2ForCausalLM") -class MiniMaxM2Model(TextModel): - model_arch = gguf.MODEL_ARCH.MINIMAXM2 - _experts_cache: dict[int, dict[str, Tensor]] = {} - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - self.gguf_writer.add_expert_feed_forward_length(self.find_hparam(["intermediate_size"])) - self.gguf_writer.add_rope_dimension_count(self.find_hparam(["rotary_dim"])) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): - # merge expert weights - if 'experts' in name: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - expert_cache = self._experts_cache.setdefault(bid, {}) - expert_cache[name] = data_torch - expert_weights = ["w1", "w2", "w3"] - - # not enough expert weights to merge - if len(expert_cache) < n_experts * len(expert_weights): - return - - for w_name in expert_weights: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" - datas.append(expert_cache[ename]) - del expert_cache[ename] - - data_torch = torch.stack(datas, dim=0) - merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - yield from super().modify_tensors(data_torch, new_name, bid) - - del self._experts_cache[bid] - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("MiMoV2FlashForCausalLM", "MiMoV2ForCausalLM") -class MimoV2Model(TextModel): - model_arch = gguf.MODEL_ARCH.MIMO2 - - # MiMo V2-Flash, V2.5 and V2.5-Pro all ship 3 trained MTP layers under model.mtp.layers.{0,1,2}. - # The HF config does not expose the count, so it's hardcoded to match the count found in the safetensors. - _n_nextn = 3 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - self.block_count = self.hparams["num_hidden_layers"] + self._n_nextn - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - @staticmethod - def _tp_aware_qkv_dequant(weight: Tensor, scale_inv: Tensor, - n_q: int, n_kv: int, hd: int, vhd: int, - bs: int = 128) -> Tensor: - # MiMo-V2.5 (TP=4) and V2.5-Pro (TP=8) ship qkv_proj sharded across TP - # ranks; per rank, rows are stacked as [Q_per | K_per | V_per]. - # weight_scale_inv has ceil(rows_per_rank/bs) block-rows per rank (last - # may extend past rows_per_rank with phantom rows not in the weight). - # Naive repeat_interleave aligns rank 0 only and mis-applies scales to - # later ranks once rows_per_rank isn't a multiple of bs. - # Re-group the per-rank [Q_per|K_per|V_per] rows into a single fused - # [Q | K | V] tensor matching the un-sharded original layout. - q_size = n_q * hd - k_size = n_kv * hd - v_size = n_kv * vhd - total_rows = q_size + k_size + v_size - if weight.shape[0] != total_rows: - raise ValueError(f"qkv_proj weight rows {weight.shape[0]} != q+k+v {total_rows}") - - # detect TP from scale_inv block count, descending order so larger matches first - tp = None - for cand in (8, 4): - if total_rows % cand != 0: - continue - rpr = total_rows // cand - bpr = (rpr + bs - 1) // bs - if scale_inv.shape[0] == cand * bpr: - tp = cand - break - if tp is None: - raise ValueError( - f"qkv_proj: cannot detect TP - scale_inv rows {scale_inv.shape[0]}, " - f"q+k+v {total_rows}") - - q_per = q_size // tp - k_per = k_size // tp - v_per = v_size // tp - rows_per_rank = q_per + k_per + v_per - blocks_per_rank = (rows_per_rank + bs - 1) // bs - - scale_inv = scale_inv.float() - # per-row scale-row index: rank * blocks_per_rank + (rr_in_rank // bs) - row_idx = torch.arange(total_rows) - rr = row_idx % rows_per_rank - rank = row_idx // rows_per_rank - scale_row_idx = rank * blocks_per_rank + (rr // bs) - # gather: (total_rows, n_col_blocks) - scale_per_row_block = scale_inv[scale_row_idx] - # expand col-blocks -> cols: each block-col covers `bs` weight cols - scale_full = scale_per_row_block.repeat_interleave(bs, dim=1) - # crop to weight col count (in case last col-block isn't full) - scale_full = scale_full[:, : weight.shape[1]] - dequant = weight.float() * scale_full - - if tp == 1: - return dequant - - # Re-group per-rank [Q_per|K_per|V_per] rows into unified [Q | K | V] - qs, ks, vs = [], [], [] - for r in range(tp): - base = r * rows_per_rank - qs.append(dequant[base : base + q_per]) - ks.append(dequant[base + q_per : base + q_per + k_per]) - vs.append(dequant[base + q_per + k_per : base + rows_per_rank]) - return torch.cat(qs + ks + vs, dim=0) - - def dequant_model(self): - # Capture raw FP8 (weight, scale_inv) lambdas for qkv_proj BEFORE super - # rewrites them with the existing dequant. Replace super's lambda after - # it runs so scale_inv removal still happens via the standard path. - qkv_overrides: dict[str, tuple[Callable, Callable, int]] = {} - qc = self.hparams.get("quantization_config") - if isinstance(qc, dict) and qc.get("quant_method") == "fp8": - pat = re.compile(r"^model\.layers\.(\d+)\.self_attn\.qkv_proj\.weight_scale_inv$") - for name in list(self.model_tensors.keys()): - m = pat.match(name) - if not m: - continue - weight_name = name.removesuffix("_scale_inv") - if weight_name not in self.model_tensors: - continue - qkv_overrides[weight_name] = ( - self.model_tensors[weight_name], - self.model_tensors[name], - int(m.group(1)), - ) - - super().dequant_model() - - if not qkv_overrides: - return - - n_q = self.hparams["num_attention_heads"] - hd = self.hparams["head_dim"] - vhd = self.hparams["v_head_dim"] - hybrid = self.hparams["hybrid_layer_pattern"] - n_layer_text = self.hparams["num_hidden_layers"] - for weight_name, (w_fn, s_fn, bid) in qkv_overrides.items(): - # MTP layers (bid >= n_layer_text) use SWA-style attention dims - is_swa = True if bid >= n_layer_text else hybrid[bid] == 1 - n_kv = self.hparams["swa_num_key_value_heads" if is_swa else "num_key_value_heads"] - self.model_tensors[weight_name] = ( - lambda w_fn=w_fn, s_fn=s_fn, n_q=n_q, n_kv=n_kv, hd=hd, vhd=vhd: - MimoV2Model._tp_aware_qkv_dequant(w_fn(), s_fn(), n_q, n_kv, hd, vhd) - ) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - assert self.hparams["swa_head_dim"] == self.hparams["head_dim"] - assert self.hparams["swa_num_attention_heads"] == self.hparams["num_attention_heads"] - assert self.hparams["swa_v_head_dim"] == self.hparams["v_head_dim"] - assert self.hparams["topk_method"] == "noaux_tc" - - n_head_kv = self.hparams["num_key_value_heads"] - n_head_kv_swa = self.hparams["swa_num_key_value_heads"] - # Extend the per-layer pattern with SWA entries for the MTP blocks so the - # runtime arrays (sized to extended block_count) are fully populated. - hybrid = list(self.hparams["hybrid_layer_pattern"]) + [1] * self._n_nextn - n_head_kv_arr = [n_head_kv_swa if use_swa == 1 else n_head_kv for use_swa in hybrid] - self.gguf_writer.add_head_count_kv(n_head_kv_arr) - - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - self.gguf_writer.add_sliding_window_pattern(hybrid) - self.gguf_writer.add_value_length(self.hparams["v_head_dim"]) - self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"]) - self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) - - rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"]) - self.gguf_writer.add_rope_dimension_count(rope_dim) - - self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5)) - - v_scale = self.hparams.get("attention_value_scale") - if v_scale is not None: - self.gguf_writer.add_attn_value_scale(float(v_scale)) - - self.gguf_writer.add_nextn_predict_layers(self._n_nextn) - - _experts: list[dict[str, Tensor]] | None = None - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if "attention_sink" in name and not name.endswith(".weight"): - name += ".weight" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch, name, bid): - # Remap MTP/NextN tensors to additional layer slots so the standard tensor map handles them. - # HF: model.mtp.layers.{i}.foo -> model.layers.{n_layer_text + i}.foo - m = re.match(r"^model\.mtp\.layers\.(\d+)\.(.*)$", name) - if m is not None: - mtp_idx = int(m.group(1)) - assert mtp_idx < self._n_nextn, f"MTP layer index {mtp_idx} >= _n_nextn ({self._n_nextn})" - rest = m.group(2) - n_layer_text = self.hparams["num_hidden_layers"] - new_bid = n_layer_text + mtp_idx - name = f"model.layers.{new_bid}.{rest}" - bid = new_bid - - # process the experts separately - if name.find("mlp.experts") != -1: - n_experts = self.hparams["n_routed_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["gate_proj", "up_proj", "down_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename_to_retrieve]) - del self._experts[bid][ename_to_retrieve] - - data_torch = torch.stack(datas, dim=0) - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("MiMoV2ForCausalLM") -class MiMoV2VisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - hp = self.hparams_vision - - hp["image_size"] = hp.get("image_size", 560) - hp["num_attention_heads"] = hp.get("num_heads", 32) - hp["num_hidden_layers"] = hp.get("depth", 28) - - self.n_q_heads = int(hp["num_heads"]) - self.num_kv_heads = int(hp.get("num_key_value_heads", 8)) - self.head_dim = int(hp.get("qk_channels", 64)) - self.spatial_merge_size = int(hp["spatial_merge_size"]) - # MiMoV2 vision RMSNorm: HF uses getattr(config, "rms_norm_eps", 1e-6) and the - # field is absent from MiMo-V2.5's vision_config - self.rms_norm_eps = float(hp.get("rms_norm_eps", 1e-6)) - - # fullatt_block_indexes are also reflected in vit_window_attn_types as -1 - self.fullatt_block_indexes = list(hp.get("fullatt_block_indexes") or []) - self.vit_window_attn_types = list(hp.get("vit_window_attn_types") or []) - self.visual_token_window_size = int(hp.get("visual_token_window_size", -1)) - self.use_sink = bool(hp.get("use_sink", False)) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MIMOVL) - self.gguf_writer.add_vision_use_silu(True) - self.gguf_writer.add_vision_head_count_kv(self.num_kv_heads) - self.gguf_writer.add_vision_spatial_merge_size(self.spatial_merge_size) - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.WINDOW_SIZE, self.visual_token_window_size) - self.gguf_writer.add_vision_wa_pattern_mode(self.vit_window_attn_types) - self.gguf_writer.add_vision_attention_layernorm_eps(self.rms_norm_eps) - self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"])) - self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"])) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - # Sinks must be F32: any sink-style softmax/mask add in ggml requires - # F32, and we fold sinks into a host-built F32 mask at encode time. - if new_name.endswith(".attn_sinks"): - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, _ = item - if not name.startswith("visual."): - return None - return super().filter_tensors(item) - - def modify_tensors(self, data_torch, name, bid): - # Conv3D patch embed: split along the temporal axis (kt=2) into two Conv2D - # weights that the existing qwen2vl-style two-Conv2D path consumes. - if name == "visual.patch_embed.proj.weight": - _, _, kt, _, _ = data_torch.shape - if kt != 2: - raise ValueError(f"unexpected temporal_patch_size: {kt}") - embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] - yield (embd_name + ".weight", data_torch[:, :, 0, ...]) - yield (embd_name + ".weight.1", data_torch[:, :, 1, ...]) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Step3p5ForCausalLM") -class Step35Model(TextModel): - model_arch = gguf.MODEL_ARCH.STEP35 - - def set_gguf_parameters(self): - rope_theta = self.hparams.get("rope_theta") - if isinstance(rope_theta, list): - self.hparams["rope_theta"] = float(rope_theta[0]) - self.hparams["local_rope_theta"] = float(rope_theta[1]) - self.rope_parameters["rope_theta"] = self.hparams["rope_theta"] - self.rope_parameters["sliding_attention"] = {"rope_theta": self.hparams["local_rope_theta"]} - - super().set_gguf_parameters() - - layer_types = self.hparams.get("layer_types") or [] - partial_rotary_factors = self.hparams.get("partial_rotary_factors") or [] - attn_other = self.hparams.get("attention_other_setting") or {} - - n_head_base = self.hparams["num_attention_heads"] - n_kv_base = self.hparams["num_attention_groups"] - - n_head_swa = attn_other.get("num_attention_heads", n_head_base) - n_kv_swa = attn_other.get("num_attention_groups", n_kv_base) - - layer_types = layer_types[: self.block_count] - partial_rotary_factors = partial_rotary_factors[: self.block_count] - assert [1.0 if lt == "sliding_attention" else 0.5 for lt in layer_types] == partial_rotary_factors - head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types] - kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types] - swa_pat = [lt == "sliding_attention" for lt in layer_types] - - self.gguf_writer.add_head_count(head_arr) - self.gguf_writer.add_head_count_kv(kv_arr) - - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - self.gguf_writer.add_sliding_window_pattern(swa_pat) - - self.gguf_writer.add_value_length(self.hparams["head_dim"]) - - # MoE params - self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"]) - self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"]) - self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["share_expert_dim"]) - - if (moe_router_scaling_factor := self.hparams.get("moe_router_scaling_factor")) is not None: - self.gguf_writer.add_expert_weights_scale(moe_router_scaling_factor) - if (norm_expert_weight := self.hparams.get("norm_expert_weight")) is not None: - self.gguf_writer.add_expert_weights_norm(norm_expert_weight) - - # leading dense blocks - leading_dense = 0 - moe_layers_enum = self.hparams.get("moe_layers_enum") - if isinstance(moe_layers_enum, str) and moe_layers_enum.strip(): - moe_layers = sorted(int(i) for i in moe_layers_enum.strip().split(",")) - if moe_layers: - leading_dense = max(0, moe_layers[0]) - self.gguf_writer.add_leading_dense_block_count(leading_dense) - self.gguf_writer.add_moe_every_n_layers(int(self.hparams.get("moe_every_n_layer", 1))) - - self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5)) - - # Optional per-layer SwiGLU clamps. - if (limits := self.hparams.get("swiglu_limits")) is not None: - limits_f = [0.0 if v is None else float(v) for v in limits[: self.block_count]] - self.gguf_writer.add_swiglu_clamp_exp(limits_f) - if (limits_shared := self.hparams.get("swiglu_limits_shared")) is not None: - limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self.block_count]] - self.gguf_writer.add_swiglu_clamp_shexp(limits_shared_f) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Map router bias (expert selection bias) to a GGUF bias tensor - if name.endswith(".moe.router_bias"): - name += ".bias" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): - # remove mtp layers - if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None: - il = int(m.group(1)) - n_main = int(self.hparams.get("num_hidden_layers", self.block_count)) - if il >= n_main: - return - if name.endswith("norm.weight"): - data_torch += 1.0 - - if name.endswith((".self_attn.g_proj.weight", ".moe.gate.weight", ".moe.up_proj.weight", ".moe.gate_proj.weight", ".moe.down_proj.weight")): - data_torch = data_torch.squeeze().contiguous() - - yield from super().modify_tensors(data_torch, name, bid) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - # Step35 can optionally use Llama-3 style RoPE scaling (HF: rope_scaling.rope_type == "llama3"). - # llama.cpp represents this via a single extra tensor: "rope_freqs.weight" (aka MODEL_TENSOR.ROPE_FREQS). - rope_params = self.rope_parameters.get("full_attention", self.rope_parameters) - rope_type = rope_params.get("rope_type") or "" - if rope_type.lower() != "llama3": - return - - # Step35 configs can carry per-layer rope_theta as a list; for llama3 rope factors we use the base value. - rope_theta = self.hparams.get("rope_theta", 10000.0) - if isinstance(rope_theta, list): - rope_theta = rope_theta[0] - base = float(rope_theta) - if (dim := self.hparams.get("head_dim")) is None: - dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - dim = int(dim) - - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - - factor = float(rope_params.get("factor", 8.0)) - low_freq_factor = float(rope_params.get("low_freq_factor", 1.0)) - high_freq_factor = float(rope_params.get("high_freq_factor", 4.0)) - old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192))) - - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor - - rope_factors: list[float] = [] - for freq in freqs: - wavelen = 2 * math.pi / float(freq) - if wavelen < high_freq_wavelen: - rope_factors.append(1.0) - elif wavelen > low_freq_wavelen: - rope_factors.append(factor) - else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - rope_factors.append(1.0 / ((1.0 - smooth) / factor + smooth)) - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) - - -@ModelBase.register("PanguEmbeddedForCausalLM") -class PanguEmbeddedModel(TextModel): - model_arch = gguf.MODEL_ARCH.PANGU_EMBED - - def set_vocab(self): - self._set_vocab_sentencepiece() - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - # PanguEmbedded's hparam loaded from config.json without head_dim - if (rope_dim := hparams.get("head_dim")) is None: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(rope_dim) - - if hparams.get("head_dim") is None: - self.gguf_writer.add_key_length(rope_dim) - self.gguf_writer.add_value_length(rope_dim) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name == "lm_head.weight": - if self.hparams.get("tie_word_embeddings", False): - logger.info("Skipping tied output layer 'lm_head.weight'") - return - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Dots1ForCausalLM") -class Dots1Model(Qwen2MoeModel): - model_arch = gguf.MODEL_ARCH.DOTS1 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.hparams["num_experts"] = self.hparams["n_routed_experts"] - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"]) - self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"]) - self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) - self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): - if "shared_experts" in name: - yield from ModelBase.modify_tensors(self, data_torch, name, bid) - else: - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("PLMForCausalLM") -class PLMModel(TextModel): - model_arch = gguf.MODEL_ARCH.PLM - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) - self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length(hparams["v_head_dim"]) - self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - - def prepare_tensors(self): - super().prepare_tensors() - - -@ModelBase.register("T5WithLMHeadModel") -@ModelBase.register("T5ForConditionalGeneration") -@ModelBase.register("MT5ForConditionalGeneration") -@ModelBase.register("UMT5ForConditionalGeneration") -@ModelBase.register("UMT5Model") -class T5Model(TextModel): - model_arch = gguf.MODEL_ARCH.T5 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.shared_token_embeddings_found = False - - def set_vocab(self): - # to avoid TypeError: Descriptors cannot be created directly - # exception when importing sentencepiece_model_pb2 - os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'tokenizer.model' - - # many older models use spiece.model tokenizer model filename - if not tokenizer_path.is_file(): - tokenizer_path = self.dir_model / 'spiece.model' - - if not tokenizer_path.is_file(): - raise FileNotFoundError(f"File not found: {tokenizer_path}") - - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - - # some models like Pile-T5 family use BPE tokenizer instead of Unigram - if sentencepiece_model.trainer_spec.model_type == 2: # BPE - # assure the tokenizer model file name is correct - assert tokenizer_path.name == 'tokenizer.model' - return self._set_vocab_sentencepiece() - else: - assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM - - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces - precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - for key in added_tokens_json: - token_id = added_tokens_json[key] - if token_id >= vocab_size: - logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.UNUSED) - - self.gguf_writer.add_tokenizer_model("t5") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) - if precompiled_charsmap: - self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: - logger.warning("Couldn't find context length in config.json, assuming default value of 512") - n_ctx = 512 - self.gguf_writer.add_context_length(n_ctx) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) - self.gguf_writer.add_block_count(self.block_count) - if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None: - self.gguf_writer.add_decoder_block_count(dec_n_layer) - self.gguf_writer.add_head_count(self.hparams["num_heads"]) - self.gguf_writer.add_key_length(self.hparams["d_kv"]) - self.gguf_writer.add_value_length(self.hparams["d_kv"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", - # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored - # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder - # and decoder and ignore the remaining ones. - if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: - if not self.shared_token_embeddings_found: - name = "shared.weight" - self.shared_token_embeddings_found = True - else: - logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("T5EncoderModel") -class T5EncoderModel(TextModel): - model_arch = gguf.MODEL_ARCH.T5ENCODER - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.shared_token_embeddings_found = False - - def set_vocab(self): - # to avoid TypeError: Descriptors cannot be created directly - # exception when importing sentencepiece_model_pb2 - os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'tokenizer.model' - - # many older models use spiece.model tokenizer model filename - if not tokenizer_path.is_file(): - tokenizer_path = self.dir_model / 'spiece.model' - - if not tokenizer_path.is_file(): - raise FileNotFoundError(f"File not found: {tokenizer_path}") - - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - - # some models like Pile-T5 family use BPE tokenizer instead of Unigram - if sentencepiece_model.trainer_spec.model_type == 2: # BPE - # assure the tokenizer model file name is correct - assert tokenizer_path.name == 'tokenizer.model' - return self._set_vocab_sentencepiece() - else: - assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM - - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces - precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - for key in added_tokens_json: - token_id = added_tokens_json[key] - if token_id >= vocab_size: - logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.UNUSED) - - self.gguf_writer.add_tokenizer_model("t5") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) - if precompiled_charsmap: - self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: - logger.warning("Couldn't find context length in config.json, assuming default value of 512") - n_ctx = 512 - self.gguf_writer.add_context_length(n_ctx) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(self.hparams["num_heads"]) - self.gguf_writer.add_key_length(self.hparams["d_kv"]) - self.gguf_writer.add_value_length(self.hparams["d_kv"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", - # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored - # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder - # and decoder and ignore the remaining ones. - if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: - if not self.shared_token_embeddings_found: - name = "shared.weight" - self.shared_token_embeddings_found = True - else: - logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Jais2ForCausalLM") -class Jais2Model(TextModel): - model_arch = gguf.MODEL_ARCH.JAIS2 - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - head_dim = hparams.get("head_dim", hparams["hidden_size"] // hparams["num_attention_heads"]) - self.gguf_writer.add_rope_dimension_count(head_dim) - - -@ModelBase.register("JAISLMHeadModel") -class JaisModel(TextModel): - model_arch = gguf.MODEL_ARCH.JAIS - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # SwigLU activation - assert self.hparams["activation_function"] == "swiglu" - # ALiBi position embedding - assert self.hparams["position_embedding_type"] == "alibi" - - # Embeddings scale - self.embeddings_scale = 1.0 - if 'mup_embeddings_scale' in self.hparams: - self.embeddings_scale = self.hparams['mup_embeddings_scale'] - elif 'embeddings_scale' in self.hparams: - self.embeddings_scale = self.hparams['embeddings_scale'] - else: - assert False - - self.width_scale = 1.0 - if 'mup_output_alpha' in self.hparams: - assert 'mup_width_scale' in self.hparams - self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale'] - elif 'width_scale' in self.hparams: - self.width_scale = self.hparams['width_scale'] - else: - assert False - - self.max_alibi_bias = 8.0 - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"]) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # we don't need these - if name.endswith((".attn.bias")): - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.endswith(("relative_pe.slopes")): - # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) - # Some other models has max_alibi_bias spelled out explicitly in the hyperparams, - # but Jais's PyTorch model simply precalculates the slope values and places them - # in relative_pes.slopes - n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"])) - first_val = float(data_torch[0].item()) - self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) - - return - - if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): - data_torch = data_torch.transpose(1, 0) - - new_name = self.map_tensor_name(name) - - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - yield from super().modify_tensors(data_torch * self.embeddings_scale, new_name, bid) - elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): - yield from super().modify_tensors(data_torch * self.width_scale, new_name, bid) - else: - yield from super().modify_tensors(data_torch, new_name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) - - -@ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration") -class Glm4Model(TextModel): - model_arch = gguf.MODEL_ARCH.GLM4 - use_mrope = False - partial_rotary_factor = 0.5 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 0.5) - if "mrope_section" in self.rope_parameters: - self.use_mrope = True - logger.info("Q/K weight will need to be permuted for M-RoPE") - - def set_vocab(self): - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if (rope_dim := self.hparams.get("head_dim")) is None: - rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.partial_rotary_factor)) - - @staticmethod - def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, partial_rotary_factor: float) -> Tensor: - orig_shape = weights.shape - if len(orig_shape) == 1: - weights = weights.unsqueeze(1) # [out_dim, 1] - if len(weights.shape) != 2: - raise ValueError("Only 1D and 2D tensors are supported.") - n_effective_heads = weights.shape[0] // head_dim - if n_head_kv is not None and n_effective_heads != n_head: - if n_effective_heads != n_head_kv: - raise AssertionError(f"Mismatch in effective heads: computed {n_effective_heads}, expected {n_head} or {n_head_kv}") - rotary_dim = int(head_dim * partial_rotary_factor) - if rotary_dim % 2 != 0: - raise ValueError("rotary_dim must be even.") - reshaped = weights.reshape(n_effective_heads, head_dim, -1) - rot_part = reshaped[:, :rotary_dim, :] - non_rot_part = reshaped[:, rotary_dim:, :] - permuted_rot = torch.cat((rot_part[:, ::2, :], rot_part[:, 1::2, :]), dim=1) - combined = torch.cat((permuted_rot, non_rot_part), dim=1) - result = combined.reshape(weights.shape) - return result if len(orig_shape) != 1 else result.squeeze(1) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if self.use_mrope: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams["num_key_value_heads"] - n_embd = self.hparams["hidden_size"] - head_dim = self.hparams.get("head_dim", n_embd // n_head) - # because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_kv_head, head_dim, self.partial_rotary_factor) - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("GlmOcrForConditionalGeneration") -class GlmOCRModel(Glm4Model): - model_arch = gguf.MODEL_ARCH.GLM4 - use_mrope = False - partial_rotary_factor = 0.5 - - # Note: GLM-OCR is the same as GLM4, but with an extra NextN/MTP prediction layer - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # GLM-OCR has num_hidden_layers + 1 actual layers (including NextN layer) - self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - # NextN/MTP prediction layers - if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None: - self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) - - -@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration") -class Glm4MoeModel(TextModel): - model_arch = gguf.MODEL_ARCH.GLM4_MOE - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # GLM4_MOE has num_hidden_layers + 1 actual layers (including NextN layer) - self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - def set_vocab(self): - return self._set_vocab_glm() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if (rope_dim := self.hparams.get("head_dim")) is None: - rope_dim = ( - self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - ) - self.gguf_writer.add_rope_dimension_count( - int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)) - ) - - # MoE parameters - Use only routed expert count (shared experts handled separately) - if (n_routed_experts := self.hparams.get("n_routed_experts")) is not None: - self.gguf_writer.add_expert_count(n_routed_experts) - if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None: - self.gguf_writer.add_expert_shared_count(n_shared_experts) - if (first_k_dense_replace := self.hparams.get("first_k_dense_replace")) is not None: - self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) - - # Expert gating function (sigmoid for GLM4_MOE) - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - - # Routed scaling factor - if (routed_scaling_factor := self.hparams.get("routed_scaling_factor")) is not None: - self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) - - # Normalise topk probabilities - if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None: - self.gguf_writer.add_expert_weights_norm(norm_topk_prob) - - # NextN/MTP prediction layers - if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None: - self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) - - _experts: list[dict[str, Tensor]] | None = None - - # note: unlike GLM4V non-MoE, we don't need to permute Q/K here since GLM4V_MOE uses Neox ordering already - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Handle main token embedding (but not layer-specific NextN embeddings) - if name == "model.embed_tokens.weight" and ".layers." not in name: - yield from super().modify_tensors(data_torch, "token_embd.weight", bid) - return - - # Handle routed experts - if name.find("mlp.experts") != -1: - n_experts = self.hparams["n_routed_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("Glm4MoeLiteForCausalLM") -class Glm4MoeLiteModel(DeepseekV2Model): - model_arch = gguf.MODEL_ARCH.DEEPSEEK2 - - def set_vocab(self): - return self._set_vocab_glm() - - -@ModelBase.register("GlmMoeDsaForCausalLM") -class GlmMoeDsaModel(DeepseekV2Model): - model_arch = gguf.MODEL_ARCH.GLM_DSA - skip_mtp = False - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - def set_vocab(self): - return self._set_vocab_glm() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - rope_dim = self.hparams["qk_rope_head_dim"] - partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0) - self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor)) - - # NextN/MTP prediction layers - if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None: - self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) - - # DSA indexer parameters - self.gguf_writer.add_indexer_head_count(self.hparams["index_n_heads"]) - self.gguf_writer.add_indexer_key_length(self.hparams["index_head_dim"]) - self.gguf_writer.add_indexer_top_k(self.hparams["index_topk"]) - - -@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") -class ChatGLMModel(TextModel): - model_arch = gguf.MODEL_ARCH.CHATGLM - - def set_vocab_chatglm3(self): - dir_model = self.dir_model - hparams = self.hparams - tokens: list[bytes] = [] - toktypes: list[int] = [] - scores: list[float] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab())) # ty: ignore[unresolved-attribute] - assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute] - role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] - special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens - for token_id in range(vocab_size): - piece = tokenizer._convert_id_to_token(token_id) # ty: ignore[unresolved-attribute] - if token_id == 0: - piece = "" - elif token_id == 1: - piece = "" - elif token_id == 2: - piece = "" - - text = piece.encode("utf-8") # ty: ignore[unresolved-attribute] - score = 0.0 - # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py), - # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size() - if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size(): # ty: ignore[unresolved-attribute, invalid-argument-type] - score = tokenizer.tokenizer.sp_model.get_score(token_id) # ty: ignore[unresolved-attribute] - - if token_id >= tokenizer.tokenizer.sp_model.vocab_size(): # ty: ignore[unresolved-attribute] - if piece in special_tokens: - toktype = SentencePieceTokenTypes.CONTROL - elif len(piece) == 0: # ty: ignore[invalid-argument-type] - text = f"[PAD{token_id}]".encode("utf-8") - toktype = SentencePieceTokenTypes.UNUSED - else: - toktype = SentencePieceTokenTypes.USER_DEFINED - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - continue - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.tokenizer.sp_model.is_unknown(token_id): # ty: ignore[unresolved-attribute] - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.tokenizer.sp_model.is_control(token_id): # ty: ignore[unresolved-attribute] - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.tokenizer.sp_model.is_unused(token_id): # ty: ignore[unresolved-attribute] - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.tokenizer.sp_model.is_byte(token_id): # ty: ignore[unresolved-attribute] - toktype = SentencePieceTokenTypes.BYTE - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - self.gguf_writer.add_tokenizer_model("llama") - # glm3 needs prefix and suffix formatted as: - # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>" - self.gguf_writer.add_tokenizer_pre("chatglm-spm") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - @staticmethod - def token_bytes_to_string(b): - from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode # ty: ignore[unresolved-import] - byte_encoder = bytes_to_unicode() - return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) - - @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: - parts = [bytes([b]) for b in token] - while True: - min_idx = None - min_rank = None - for i, pair in enumerate(zip(parts[:-1], parts[1:])): - rank = mergeable_ranks.get(pair[0] + pair[1]) - if rank is not None and (min_rank is None or rank < min_rank): - min_idx = i - min_rank = rank - if min_rank is None or (max_rank is not None and min_rank >= max_rank): - break - assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] - return parts - - def set_vocab(self): - if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""): - self.set_vocab_chatglm3() - return - - dir_model = self.dir_model - hparams = self.hparams - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"]) - assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute] - - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - # only add special tokens when they were not already loaded from config.json - special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] - # this one is usually not in config.json anyway - special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - assert n_embed is not None - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - assert n_head is not None - n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head)) - self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) - self.gguf_writer.add_embedding_length(n_embed) - self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed))) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5)) - self.gguf_writer.add_file_type(self.ftype) - if "attention_dim" in self.hparams: - rope_dim = self.hparams["attention_dim"] - else: - rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) - self.gguf_writer.add_add_bos_token(False) - rope_freq = 10000 - if "rope_ratio" in self.hparams: - rope_freq = rope_freq * self.hparams["rope_ratio"] - self.gguf_writer.add_rope_freq_base(rope_freq) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.endswith(".rotary_pos_emb.inv_freq"): - return None - - name = name.removeprefix("transformer.") - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("NemotronForCausalLM") -class NemotronModel(TextModel): - model_arch = gguf.MODEL_ARCH.NEMOTRON - - def set_vocab(self): - self._set_vocab_sentencepiece() - self.gguf_writer.add_pad_token_id(0) - self.gguf_writer.add_unk_token_id(1) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"]) - self.gguf_writer.add_layer_norm_eps(f_norm_eps) - - # * Partial RoPE - rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"]) - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) - - # * RopeScaling for Nemotron - if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - else: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side - # model.layers.{l}.input_layernorm.weight - # model.layers.{l}.post_attention_layernorm.weight - # model.norm.weight - if name.endswith("norm.weight"): - data_torch = data_torch + 1 - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("ExaoneForCausalLM") -class ExaoneModel(TextModel): - model_arch = gguf.MODEL_ARCH.EXAONE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - - assert (hparams["activation_function"] == "silu") - - rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True) - rotary_factor = rotary_factor if rotary_factor is not None else 1.0 - self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): - if rope_params.get("rope_type", '').lower() == "llama3": - base = self.rope_parameters.get("rope_theta", 10000.0) - if (dim := self.hparams.get("head_dim")) is None: - dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - - factor = rope_params.get("factor", 8.0) - low_freq_factor = rope_params.get("low_freq_factor", 1.0) - high_freq_factor = rope_params.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) - - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor - assert low_freq_wavelen != high_freq_wavelen - - rope_factors = [] - for freq in freqs: - wavelen = 2 * math.pi / freq - if wavelen < high_freq_wavelen: - rope_factors.append(1) - elif wavelen > low_freq_wavelen: - rope_factors.append(factor) - else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) - - -@ModelBase.register("Exaone4ForCausalLM") -class Exaone4Model(TextModel): - model_arch = gguf.MODEL_ARCH.EXAONE4 - - def set_vocab(self): - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - if hparams.get("sliding_window") is not None: - self.gguf_writer.add_sliding_window(hparams["sliding_window"]) - if "layer_types" in hparams: - self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]]) - elif "sliding_window_pattern" in hparams: - sliding_window_pattern = [] - if isinstance(hparams["sliding_window_pattern"], str): # e.g. LLLG - for i in range(hparams["num_hidden_layers"]): - sliding_window_pattern.append(hparams["sliding_window_pattern"][i % len(hparams["sliding_window_pattern"])] == "L") - if isinstance(hparams["sliding_window_pattern"], int): # e.g. 4 - for i in range(hparams["num_hidden_layers"]): - sliding_window_pattern.append((i + 1) % hparams["sliding_window_pattern"] != 0) - if len(sliding_window_pattern) == hparams["num_hidden_layers"]: - self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): - if rope_params.get("rope_type", '').lower() == "llama3": - base = rope_params.get("rope_theta", 10_000.0) - if (dim := self.hparams.get("head_dim")) is None: - dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - - factor = rope_params.get("factor", 16.0) - low_freq_factor = rope_params.get("low_freq_factor", 1.0) - high_freq_factor = rope_params.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) - - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor - - rope_factors = [] - for freq in freqs: - wavelen = 2 * math.pi / freq - if wavelen < high_freq_wavelen: - rope_factors.append(1) - elif wavelen > low_freq_wavelen: - rope_factors.append(factor) - else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) - - -@ModelBase.register("ExaoneMoEForCausalLM") -class ExaoneMoEModel(Exaone4Model): - model_arch = gguf.MODEL_ARCH.EXAONE_MOE - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - moe_intermediate_size = self.hparams["moe_intermediate_size"] - num_shared_experts = self.hparams["num_shared_experts"] - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - self.gguf_writer.add_expert_shared_count(num_shared_experts) - self.gguf_writer.add_expert_shared_feed_forward_length(moe_intermediate_size * num_shared_experts) - self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) - self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) - n_dense_layer = self.hparams.get("first_k_dense_replace", self.hparams.get("first_last_k_dense_replace", 0)) - self.gguf_writer.add_leading_dense_block_count(n_dense_layer) - self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 0)) - - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.startswith("mtp."): - if name.find("layers.") != -1: - # `mtp.layers.0.[module_name]` format - name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + self.hparams['num_hidden_layers']}") - else: - # mtp fc/norm weights - remapper = { - "mtp.fc": "model.layers.{bid}.eh_proj", - "mtp.pre_fc_norm_embedding": "model.layers.{bid}.enorm", - "mtp.pre_fc_norm_hidden": "model.layers.{bid}.hnorm", - "mtp.norm": "model.layers.{bid}.shared_head.norm", - } - _n = Path(name) - new_name = remapper[_n.stem] + _n.suffix - - # set shared weights for all NextN/MTP layers - for bid in range(self.hparams['num_hidden_layers'], self.block_count): - yield from super().modify_tensors(data_torch, new_name.format(bid=bid), bid) - return - - if name.find("mlp.experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - yield from super().modify_tensors(data_torch, new_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration") -class GraniteModel(LlamaModel): - """Conversion for IBM's GraniteForCausalLM""" - model_arch = gguf.MODEL_ARCH.GRANITE - - def set_gguf_parameters(self): - """Granite uses standard llama parameters with the following differences: - - - No head_dim support - - New multiplier params: - - attention_scale - - embedding_scale - - residual_scale - - logits_scaling - """ - if head_dim := self.hparams.pop("head_dim", None): - logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim) - super().set_gguf_parameters() - # NOTE: Convert _multiplier params to _scale params for naming - # consistency - if attention_scale := self.hparams.get("attention_multiplier"): - self.gguf_writer.add_attention_scale(attention_scale) - logger.info("gguf: (granite) attention_scale = %s", attention_scale) - if embedding_scale := self.hparams.get("embedding_multiplier"): - self.gguf_writer.add_embedding_scale(embedding_scale) - logger.info("gguf: (granite) embedding_scale = %s", embedding_scale) - if residual_scale := self.hparams.get("residual_multiplier"): - self.gguf_writer.add_residual_scale(residual_scale) - logger.info("gguf: (granite) residual_scale = %s", residual_scale) - if logits_scale := self.hparams.get("logits_scaling"): - self.gguf_writer.add_logit_scale(logits_scale) - logger.info("gguf: (granite) logits_scale = %s", logits_scale) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - if name.startswith("encoder."): - return None - return super().filter_tensors(item) - - -@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM") -class GraniteMoeModel(GraniteModel): - """Conversion for IBM's GraniteMoeForCausalLM""" - model_arch = gguf.MODEL_ARCH.GRANITE_MOE - - def set_gguf_parameters(self): - """GraniteMoeShared uses GraniteMoe parameters plus the following: - - shared_intermediate_size - """ - super().set_gguf_parameters() - if shared_feed_forward_length := self.hparams.get("shared_intermediate_size"): - self.gguf_writer.add_expert_shared_feed_forward_length(shared_feed_forward_length) - logger.info("gguf: (granitemoeshared) shared_feed_forward_length = %s", shared_feed_forward_length) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - """In modeling_granitemoe, the JetMoe implementation of parallel experts - is used. This essentially merges w1 and w3 into a single tensor with 2x - the hidden size that is then split during forward. To keep compatibility - with existing mixtral support, we pull them apart here. - """ - - if name.endswith("block_sparse_moe.input_linear.weight"): - ffn_dim = self.hparams["intermediate_size"] - assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size" - gate, up = data_torch.split(ffn_dim, dim=-2) - yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid) - yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid) - return - - has_experts = bool(self.hparams.get('num_local_experts')) - - if name.endswith("shared_mlp.input_linear.weight"): - ffn_dim = self.hparams["shared_intermediate_size"] - assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size" - gate, up = data_torch.split(ffn_dim, dim=-2) - if has_experts: - yield from ModelBase.modify_tensors(self, gate,self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), bid) - yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), bid) - return - yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid) - yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid) - return - - if not has_experts and name.endswith("shared_mlp.output_linear.weight"): - yield from ModelBase.modify_tensors(self, data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), bid) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM") -class GraniteHybridModel(Mamba2Model, GraniteMoeModel): - """GraniteHybrid is a hybrid SSM + Attention model that uses Mamba2 SSM - layers and optionally uses MoE w/ a shared expert""" - model_arch = gguf.MODEL_ARCH.GRANITE_HYBRID - undo_permute = True - - def __init__(self, *args, **kwargs): - - # Hybrid mamba models use a prefix for the mamba-specific params. - # TODO: Extend this if the prefix(es) need to be configurable - self.hparam_prefixes = ["mamba"] - - super().__init__(*args, **kwargs) - - # Lists of which layers use ssm vs attention - self._attn_layers = self.get_attn_layers() - self._ssm_layers = [ - i for i in range(self.block_count) - if i not in self._attn_layers - ] - - # There are some models in this family that are non-hybrid, but keep the - # same parent class by setting all layers to "attention." If this is the - # case, the model architecture needs to be updated to a standard - # "granite" or "granitemoe" model - if not self._ssm_layers: - has_experts = self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True) - new_arch = ( - gguf.MODEL_ARCH.GRANITE_MOE - if has_experts else - gguf.MODEL_ARCH.GRANITE - ) - self.model_arch = new_arch - self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch] - self.gguf_writer.add_architecture() - - # n_group and d_inner are used during reshape_tensors for mamba2 - # NOTE: Explicitly include hparam prefix prefix for d_model to - # disambiguate with top-level head_dim - # NOTE 2: If needed for future models, this can be isolated in a method - # to separate the prefix setting and the keys used - self.d_model = self.find_hparam([f"{self.hparam_prefixes[0]}_head_dim", "hidden_size", "d_model"]) - self.n_group = self.find_hparam(["n_groups", "num_groups"]) - self.d_inner = self.find_hparam(["expand", "num_heads"]) * self.d_model - - def get_attn_layers(self): - # Explicit list of layer type names - if layer_types := self.hparams.get("layer_types"): - return [ - i for i, typ in enumerate(layer_types) - if typ == "attention" - ] - - # Layer types indicated by index or period - attn_layers = self.hparams.get("attn_layer_indices", []) - if not attn_layers: - attn_period = self.hparams.get("attn_layer_period") - assert attn_period, "Didn't find attn_layer_indices or attn_layer_period" - attn_offset = self.hparams.get("attn_layer_offset") - assert attn_offset is not None, "No attention layer offset set with attn_layer_period" - attn_layers = [ - i for i in range(self.block_count) - if i % attn_period == attn_offset - ] - return attn_layers - - def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any: - prefixed = [] - for pfx in self.hparam_prefixes: - prefixed.extend( - "_".join([pfx, k]) - for k in keys - ) - keys = list(keys) + prefixed - return Mamba2Model.find_hparam(self, keys, *args, **kwargs) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if ( - name.endswith("block_sparse_moe.input_linear.weight") - or "shared_mlp" in name - ): - yield from GraniteMoeModel.modify_tensors(self, data_torch, name, bid) - return - - # Determine whether this is a mamba layer or an attention layer - if bid in self._ssm_layers: - yield from Mamba2Model.modify_tensors(self, data_torch, name, bid) - return - elif bid in self._attn_layers: - yield from GraniteMoeModel.modify_tensors(self, data_torch, name, bid) - return - yield from ModelBase.modify_tensors(self, data_torch, name, bid) - - def set_gguf_parameters(self): - """This method merges params from both parents and some that are - specific to this model. The result is some duplication of how the params - get set. The following warnings are expected during conversion: - - WARNING:Duplicated key name 'granitehybrid.attention.head_count_kv' - WARNING:Duplicated key name 'granitehybrid.context_length' - """ - GraniteMoeModel.set_gguf_parameters(self) - - ## Mamba mixer params ## - self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"])) - self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state", "state_dim", "ssm_state_size"])) - self.gguf_writer.add_ssm_group_count(self.n_group) - self.gguf_writer.add_ssm_inner_size(self.d_inner) - # NOTE: The mamba_dt_rank is _not_ the right field for how this is used - # in llama.cpp - self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads", "num_heads"])) - - ## Attention params ## - head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) - head_count_kv_vec = [ - head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count) - ] - if rope_dim := self.hparams.get("attn_rotary_emb"): - self.gguf_writer.add_rope_dimension_count(rope_dim) - self.gguf_writer.add_head_count_kv(head_count_kv_vec) - - ## If Bamba or non-hybrid, use rope, otherwise don't - use_rope = ( - "BambaForCausalLM" in self.hparams["architectures"] - or not self._ssm_layers - ) - self.gguf_writer.add_rope_scaling_finetuned(use_rope) - if not use_rope: - self.gguf_writer.add_context_length(2**20) - - ## Validation ## - d_head = self.find_hparam(["d_head"], optional=True) or 64 - assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" - assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}" - - def set_vocab(self): - self.hparams["pad_vocab_size_multiple"] = 8 - Mamba2Model.set_vocab(self) - - -@ModelBase.register("NemotronHForCausalLM") -class NemotronHModel(GraniteHybridModel): - """Hybrid mamba2/attention model from NVIDIA""" - model_arch = gguf.MODEL_ARCH.NEMOTRON_H - is_moe: bool = False - - def __init__(self, *args, **kwargs): - # We have to determine the correct model architecture (MoE vs non-MoE) before - # calling the parent __init__. This is because the parent constructor - # uses self.model_arch to build the tensor name map, and all MoE-specific - # mappings would be missed if it were called with the default non-MoE arch. - hparams = ModelBase.load_hparams(args[0], self.is_mistral_format) - has_moe_params = ( - "num_experts_per_tok" in hparams - or (isinstance(hparams.get("llm_config"), dict) and "num_experts_per_tok" in hparams["llm_config"]) - ) - if has_moe_params: - self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE - self.is_moe = True - - super().__init__(*args, **kwargs) - - # Save the top-level head_dim for later - self.head_dim = self.hparams.get("head_dim", self.hparams.get("attention_head_dim")) - assert self.head_dim is not None, "Could not find the attention head dim in config" - - # Don't use expand to calculate d_inner - self.d_inner = self.find_hparam(["num_heads"]) * self.d_model - - # Update the ssm / attn / mlp layers - # M: Mamba2, *: Attention, -: MLP - # MoE: - # M: Mamba2, *: Attention, E: Expert - pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type") - if pattern is None: - self._ssm_layers = [] - self._mlp_layers = [] - elif isinstance(pattern, str): - self._ssm_layers = [i for i, val in enumerate(pattern) if val == "M"] - self._mlp_layers = [i for i, val in enumerate(pattern) if val == ("E" if self.is_moe else "-")] - else: - self._ssm_layers = [i for i, val in enumerate(pattern) if val == "mamba"] - self._mlp_layers = [i for i, val in enumerate(pattern) if val == "moe"] - - def get_attn_layers(self): - pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type") - if pattern is None: - return [] - assert len(pattern) == self.block_count, f"Mismatch between pattern ({len(pattern)}) and block_count ({self.block_count})!" - if isinstance(pattern, str): - return [i for i, val in enumerate(pattern) if val == "*"] - - return [i for i, val in enumerate(pattern) if val == "attention"] - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - head_dim = self.head_dim - if head_dim is None: - raise ValueError("Could not find the attention head dim in config") - self.gguf_writer.add_key_length(head_dim) - self.gguf_writer.add_value_length(head_dim) - - # Set feed_forward_length - # NOTE: This will trigger an override warning. This is preferable to - # duplicating all the parent logic - if not self.is_moe: - n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"]) - self.gguf_writer.add_feed_forward_length([ - n_ff if i in self._mlp_layers else 0 for i in range(self.block_count) - ]) - else: - moe_intermediate_size = self.hparams["moe_intermediate_size"] - self.gguf_writer.add_feed_forward_length([ - moe_intermediate_size if i in self._mlp_layers else 0 for i in range(self.block_count) - ]) - self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"]) - self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["moe_shared_expert_intermediate_size"]) - self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"]) - self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"]) - self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) - self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) - self.gguf_writer.add_expert_group_count(self.hparams["n_group"]) - - # number of experts used per token (top-k) - if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - - if (latent_size := self.hparams.get("moe_latent_size")) is not None: - self.gguf_writer.add_moe_latent_size(latent_size) - - def set_vocab(self): - # The NemotronH config uses pattern characters (e.g. '-') that may not - # be supported by the installed transformers version. AutoTokenizer - # internally calls AutoConfig which triggers this parsing failure. - # Using trust_remote_code=True to load the model's own config class. - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - - # Pad vocab size (from Mamba2Model/GraniteHybridModel) - self.hparams["pad_vocab_size_multiple"] = 8 # Setting this here since GraniteHybridModel.set_vocab() isn't being invoked now. - # From Mamba2Model.set_vocab(): - vocab_size = self.hparams["vocab_size"] - pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16) - # ref: https://stackoverflow.com/a/17511341/22827863 - vocab_size = -(vocab_size // -pad_vocab) * pad_vocab - self.hparams["vocab_size"] = vocab_size - - assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute] - - tokpre = self.get_vocab_base_pre(tokenizer) - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] - added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] - - added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token: str = reverse_vocab[i] - if token in added_vocab: - if not added_tokens_decoder[i].normalized: - previous_token = token - token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment] - if previous_token != token: - logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") - - if added_tokens_decoder[i].special or self.does_token_look_special(token): - toktypes.append(gguf.TokenType.CONTROL) - else: - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - toktypes.append(gguf.TokenType.NORMAL) - tokens.append(token) - - # From TextModel.set_vocab_gpt2(): - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - - # The tokenizer _does_ add a BOS token (via post_processor type - # TemplateProcessing) but does not set add_bos_token to true in the - # config, so we need to explicitly override it here. - if not self.is_moe: - self.gguf_writer.add_add_bos_token(True) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if self.is_moe and bid is not None: - # Skip Multi-Token Prediction (MTP) tensors. These are used for - # for speculative decoding but we don't include them in this model - # conversion. See https://github.com/ggml-org/llama.cpp/pull/18886 - if name.startswith("mtp."): - logger.info(f"gguf: Skipping MTP (Speculative) layer: {name}") - return - - if name.endswith("mixer.gate.e_score_correction.bias"): - yield from ModelBase.modify_tensors(self, data_torch, name, bid) - return - - if name.endswith("mixer.dt_bias"): - new_name = name.replace("dt_bias", "dt.bias") - yield from ModelBase.modify_tensors(self, data_torch, new_name, bid) - return - - if name.endswith("mixer.conv1d.weight"): - squeezed_data = data_torch.squeeze() - yield from ModelBase.modify_tensors(self, squeezed_data, name, bid) - return - - if name.endswith("mixer.A_log"): - transformed_data = -torch.exp(data_torch) - reshaped_data = transformed_data.squeeze().reshape(-1, 1) - yield from ModelBase.modify_tensors(self, reshaped_data, name, bid) - return - - if name.endswith("mixer.D"): - reshaped_data = data_torch.squeeze().reshape(-1, 1) - yield from ModelBase.modify_tensors(self, reshaped_data, name, bid) - return - - if name.endswith("mixer.norm.weight"): - reshaped_data = data_torch.reshape(self.n_group, -1) - yield from ModelBase.modify_tensors(self, reshaped_data, name, bid) - return - - if name.find("mixer.experts") != -1: - n_experts = self.hparams["n_routed_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 2: - # merge the experts into a single tensor - for w_name in ["down_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"backbone.layers.{bid}.mixer.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from ModelBase.modify_tensors(self, data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("LlamaBidirectionalModel") -class LlamaEmbedNemotronModel(LlamaModel): - model_arch = gguf.MODEL_ARCH.LLAMA_EMBED - - -@ModelBase.register("BailingMoeForCausalLM") -class BailingMoeModel(TextModel): - model_arch = gguf.MODEL_ARCH.BAILINGMOE - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - if (rope_dim := hparams.get("head_dim")) is None: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - - self.gguf_writer.add_rope_dimension_count(rope_dim) - self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_weights_scale(1.0) - self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"]) - self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) - - _experts: list[dict[str, Tensor]] | None = None - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - n_embd = self.hparams["hidden_size"] - if (head_dim := self.hparams.get("head_dim")) is None: - head_dim = n_embd // n_head - - output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) - - if name.endswith("attention.dense.weight"): - yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), bid) - return - elif name.endswith("query_key_value.weight"): - q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2) - - yield from super().modify_tensors(BailingMoeModel.permute(q, n_head, n_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) - yield from super().modify_tensors(BailingMoeModel.permute(k, n_head, n_kv_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) - yield from super().modify_tensors(v,self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) - return - elif name.find("mlp.experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - yield from super().modify_tensors(data_torch, new_name, bid) - - return - - new_name = self.map_tensor_name(name) - - if new_name == output_name and self.hparams.get("norm_head"): - data_torch = data_torch.float() - data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7 - - yield from super().modify_tensors(data_torch, new_name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("BailingMoeV2ForCausalLM") -class BailingMoeV2Model(TextModel): - model_arch = gguf.MODEL_ARCH.BAILINGMOE2 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if nextn_layers := self.hparams.get("num_nextn_predict_layers", 0): - self.block_count = self.hparams["num_hidden_layers"] + nextn_layers - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - if (rope_dim := hparams.get("head_dim")) is None: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) - self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_shared_feed_forward_length(hparams.get("moe_shared_expert_intermediate_size", hparams["moe_intermediate_size"] * hparams["num_shared_experts"])) - self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) - self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"]) - self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) - - if (nextn_layers := self.hparams.get("num_nextn_predict_layers")) is not None: - self.gguf_writer.add_nextn_predict_layers(nextn_layers) - - _experts: list[dict[str, Tensor]] | None = None - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.endswith(".expert_bias"): - name = name.replace(".expert_bias", ".expert_bias.bias") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "mlp.experts" in name: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("SarvamMoEForCausalLM", "modeling_sarvam_moe.SarvamMoEForCausalLM") -class SarvamMoEModel(BailingMoeV2Model): - model_arch = gguf.MODEL_ARCH.BAILINGMOE2 - # Sarvam-MoE shares the BailingMoeV2 architecture; only differences: - # - full rotary (no partial_rotary_factor) - # - expert bias is zero-mean normalized at load time - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - if (rope_dim := hparams.get("head_dim")) is None: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - # Override the partial-rotary value written by BailingMoeV2 with the full rotary dim - self.gguf_writer.add_rope_dimension_count(rope_dim) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - if name.endswith(".expert_bias"): - # Sarvam normalizes expert bias to zero mean - inner = gen - - def gen(): - t = inner() - return t - t.mean() - return super().filter_tensors((name, gen)) - - -@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM") -class GroveMoeModel(TextModel): - model_arch = gguf.MODEL_ARCH.GROVEMOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") - # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299 - self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128) - # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298 - self.gguf_writer.add_experts_per_group(2) - # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376 - self.gguf_writer.add_expert_group_scale(0.05) - - _experts: list[dict[str, Tensor]] | None = None - _chunk_experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.endswith(".expert_bias"): - # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303 - return - - # process the experts separately - if name.find("chunk_experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) // 2 # see add_experts_per_group - assert bid is not None - - if self._chunk_experts is None: - self._chunk_experts = [{} for _ in range(self.block_count)] - - self._chunk_experts[bid][name] = data_torch - - if len(self._chunk_experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.chunk_experts.{xid}.{w_name}.weight" - datas.append(self._chunk_experts[bid][ename]) - del self._chunk_experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - elif name.find("experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._chunk_experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - chunk_experts = [k for d in self._chunk_experts for k in d.keys()] - if len(chunk_experts) > 0: - raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}") - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("ChameleonForConditionalGeneration") -@ModelBase.register("ChameleonForCausalLM") # obsolete -class ChameleonModel(TextModel): - model_arch = gguf.MODEL_ARCH.CHAMELEON - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False)) - - def set_vocab(self): - self._set_vocab_gpt2() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # ignore image tokenizer for now - # TODO: image support for Chameleon - if name.startswith("model.vqmodel"): - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - hidden_dim = self.hparams.get("hidden_size") - - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - if name.endswith(("q_norm.weight", "q_norm.bias")): - data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim) - if name.endswith(("k_norm.weight", "k_norm.bias")): - data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim) - - yield from super().modify_tensors(data_torch, name, bid) - - # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203 - @staticmethod - def _reverse_hf_permute(data_torch, n_heads, hidden_dim): - head_dim = hidden_dim // n_heads - data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1) - data_torch = data_torch.repeat_interleave(n_heads, 0) - return data_torch - - -@ModelBase.register("UltravoxModel") -class UltravoxModel(TextModel): - model_arch = gguf.MODEL_ARCH.LLAMA # dummy - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument") - - -@ModelBase.register("GlmasrModel") -class GlmASRWhisperEncoderModel(MmprojModel): - has_vision_encoder = False - has_audio_encoder = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams: - self.hparams["hidden_size"] = self.hparams["d_model"] - self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"] - self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"] - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLMA) - self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"]) - self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) - self.gguf_writer.add_audio_stack_factor(self.global_config["merge_factor"]) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".conv" in name and ".weight" in name: - return gguf.GGMLQuantizationType.F16 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith(("model.", "lm_head.")): - # skip language model tensors - return None - - if name.startswith("audio_encoder.whisper."): - name = name.replace("audio_encoder.whisper.","audio_tower.") - if "audio_encoder.layer_norm." in name or "audio_encoder.proj." in name: - name = name.replace("audio_encoder.", "audio_encoder.adapting.") - if name.startswith("audio_encoder.adapting."): - name = name.replace("audio_encoder.adapting.","audio.multi_modal_projector.") - if ".layer_norm." in name: - name = name.replace(".layer_norm.", ".ln_pre.") - if ".0." in name: - name = name.replace(".0.", ".linear_1.") - if ".2." in name: - name = name.replace(".2.", ".linear_2.") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.startswith("audio_encoder.audio_bos_eos_token."): - yield from super().modify_tensors(data_torch[0], "model.vision.boi", bid) - yield from super().modify_tensors(data_torch[1], "model.vision.eoi", bid) - return - - if name.startswith("audio_encoder.adapting."): - if ".proj." in name: - return - - if "conv1.bias" in name or "conv2.bias" in name: - # transpose conv1 and conv2 bias - data_torch = data_torch.unsqueeze(-1) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Qwen2AudioForConditionalGeneration") -class WhisperEncoderModel(MmprojModel): - has_vision_encoder = False # no vision encoder - has_audio_encoder = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams: - self.hparams["hidden_size"] = self.hparams["d_model"] - self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"] - self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"] - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2A) - self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"]) - self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".conv" in name and ".weight" in name: - return gguf.GGMLQuantizationType.F16 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # prevent clash naming with vision tensors - if name.startswith("multi_modal_projector"): - name = "audio." + name - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "conv1.bias" in name or "conv2.bias" in name: - # transpose conv1 and conv2 bias - data_torch = data_torch.unsqueeze(-1) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("UltravoxModel") -class UltravoxWhisperEncoderModel(WhisperEncoderModel): - has_vision_encoder = False # no vision encoder - has_audio_encoder = True - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX) - self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"]) - - -@ModelBase.register("MERaLiON2ForConditionalGeneration") -class MERaLiONWhisperEncoderModel(WhisperEncoderModel): - has_vision_encoder = False - has_audio_encoder = True - - def get_audio_config(self) -> dict[str, Any] | None: - return self.global_config.get("speech_config") - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MERALION) - self.gguf_writer.add_audio_stack_factor(self.global_config.get("speech_mlp_scale_factor", 15)) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("text_decoder."): - return None - - if name.startswith("speech_encoder."): - name = name.replace("speech_encoder.", "audio_tower.") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - suffix = "." + name.rsplit(".", 1)[-1] - - if name.startswith("ln_speech."): - yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MM_NORM_PRE, suffix=suffix), data_torch) - return - - if name.startswith("speech_audio_adapter."): - if ".mlp_adapter.0." in name: - yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 0, suffix=suffix), data_torch) - elif ".gate_proj." in name: - yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 1, suffix=suffix), data_torch) - elif ".pool_proj." in name: - yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 2, suffix=suffix), data_torch) - elif ".out_proj." in name: - yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 3, suffix=suffix), data_torch) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("VoxtralForConditionalGeneration") -class VoxtralWhisperEncoderModel(WhisperEncoderModel): - has_vision_encoder = False # no vision encoder - has_audio_encoder = True - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.VOXTRAL) - self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size - - -@ModelBase.register("AudioFlamingo3ForConditionalGeneration") -class AudioFlamingo3WhisperEncoderModel(WhisperEncoderModel): - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MUSIC_FLAMINGO) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".conv" in name and ".weight" in name: - # Was trained in BF16, being safe, avoiding quantizing to FP16 - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - -@ModelBase.register("FalconH1ForCausalLM") -class FalconH1Model(Mamba2Model): - model_arch = gguf.MODEL_ARCH.FALCON_H1 - - def __init__(self, *args, **kwargs): - # Set the hparam prefixes for Falcon Mamba2 - self.hparam_prefixes = ["mamba"] - - # Initialize the base Mamba2Model - super().__init__(*args, **kwargs) - - # Use Llama conversion for attention - self._transformer_model_class = LlamaModel - - # n_group and d_inner are used during reshape_tensors for mamba2 - self.n_group = self.find_hparam(["n_groups"]) - self.d_inner = self.find_hparam(["mamba_d_ssm"]) - self.d_head = self.find_hparam(["d_head"]) - - # Initialize any Falcon Mamba2 specific attributes - self.has_attention = True # Falcon Mamba2 has attention components - - # Load Falcon-H1 multipliers from hyperparameters - self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True) - self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True) - self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True) - self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True) - self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True) - self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True) - self.intermediate_size = self.find_hparam(["intermediate_size"]) - self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True) - - def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any: - prefixed = [] - for pfx in self.hparam_prefixes: - prefixed.extend( - "_".join([pfx, k]) - for k in keys - ) - keys = list(keys) + prefixed - return super().find_hparam(keys, *args, **kwargs) - - def set_vocab(self): - self._set_vocab_gpt2() - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - tensors = list(super().modify_tensors(data_torch, name, bid)) - tensor = tensors[0][1] - - if "down_proj" in name: - tensor = tensor * self.mlp_multipliers[1] - elif "gate_proj" in name: - tensor = tensor * self.mlp_multipliers[0] - elif "k_proj" in name: - tensor = tensor * self.key_multiplier * self.attention_in_multiplier - elif "q_proj" in name: - tensor = tensor * self.attention_in_multiplier - elif "v_proj" in name: - tensor = tensor * self.attention_in_multiplier - elif "o_proj" in name: - tensor = tensor * self.attention_out_multiplier - elif "out_proj" in name: - tensor = tensor * self.ssm_out_multiplier - elif "in_proj" in name: - tensor = tensor * self.ssm_in_multiplier - zxbcdt_multipliers = self.hparams["ssm_multipliers"] - intermediate_size = self.hparams["mamba_d_ssm"] - groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"] - tensor[:intermediate_size, :] *= zxbcdt_multipliers[0] - tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1] - tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2] - tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3] - tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4] - elif "lm_head" in name: - tensor = tensor * self.hparams["lm_head_multiplier"] - elif "embed_tokens" in name: - tensor = tensor * self.hparams["embedding_multiplier"] - elif "mamba.norm" in name: - tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group) - - tensors = [(tensors[0][0], tensor)] - return tensors - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - ## General Params ## - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - # Override some Mamba2 defaults - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0)) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - - ## Attention params ## - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) # Override value 0 from Mamba2 - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_key_length(self.hparams["head_dim"]) - self.gguf_writer.add_value_length(self.hparams["head_dim"]) - - ## Validation ## - assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" - assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}" - - # Add any other Falcon Mamba2 specific configuration - self.gguf_writer.add_rope_freq_base(self.rope_parameters["rope_theta"]) - - -@ModelBase.register("HunYuanMoEV1ForCausalLM") -class HunYuanMoEModel(TextModel): - model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE - - def set_vocab(self): - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - - # 1. Get the pre-tokenizer identifier hash - tokpre = self.get_vocab_base_pre(tokenizer) - - # 2. Reverse-engineer the merges list from mergeable_ranks - merges = [] - vocab = {} - mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute] - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - if len(merged) == 2: # todo this is an assert in Qwen, why? - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - - # 3. Generate the tokens and toktypes lists - vocab_size = self.hparams["vocab_size"] - assert tokenizer.vocab_size == vocab_size # ty: ignore[unresolved-attribute] - special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute] - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} - tokens: list[str] = [] - toktypes: list[int] = [] - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token = reverse_vocab[i] - tokens.append(token) - if i in special_tokens.values(): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.NORMAL) - - # 4. Write all vocab-related fields to the GGUF writer - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_token_merges(merges) - - # 5. Add special tokens and chat templates - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.add_to_gguf(self.gguf_writer) - # FIX for BOS token: Overwrite incorrect id read from config.json - self.gguf_writer.add_bos_token_id(127959) # <|bos|> - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - - self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"]) - - moe_intermediate_size = hparams["moe_intermediate_size"] - assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size) - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0]) - - moe_topk = hparams["moe_topk"] - assert all(topk == moe_topk[0] for topk in moe_topk) - self.gguf_writer.add_expert_used_count(moe_topk[0]) - - moe_shared_expert = hparams["num_shared_expert"] - assert all(n == moe_shared_expert[0] for n in moe_shared_expert) - self.gguf_writer.add_expert_shared_count(moe_shared_expert[0]) - - # Rope - if self.rope_parameters.get("rope_type") == "dynamic": - # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ - # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) - alpha = self.rope_parameters.get("alpha", 1000) - base = self.rope_parameters.get("rope_theta", 10000.0) - dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128 - scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251 - self.gguf_writer.add_rope_freq_base(scaled_base) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - self.gguf_writer.add_rope_scaling_factor(1) - # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k - self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length - self.gguf_writer.add_context_length(256 * 1024) # 256k context length - - # if any of our assumptions about the values are wrong, something has changed and this may need to be updated - assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \ - "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually" - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name == "lm_head.weight": - if self.hparams.get("tie_word_embeddings", False): - logger.info("Skipping tied output layer 'lm_head.weight'") - return - - if name.find("mlp.experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - if self._experts is not None: - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM") -class LLaDAMoEModel(TextModel): - model_arch = gguf.MODEL_ARCH.LLADA_MOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size) - - self.gguf_writer.add_mask_token_id(156895) - self.gguf_writer.add_causal_attention(False) - self.gguf_writer.add_diffusion_shift_logits(False) - - _experts: list[dict[str, Tensor]] | None = None - - # Copied from: Qwen2MoeModel - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - # Copied from: Qwen2MoeModel - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("HunYuanDenseV1ForCausalLM") -class HunYuanModel(TextModel): - model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE - - def _get_eod_token_id(self) -> int | None: - """Get the actual end-of-generation token from config (eod_token_id).""" - return self.hparams.get("eod_token_id") - - def _get_eot_token_id(self) -> int | None: - """Get the end-of-turn token from generation_config.json. - This is the first entry in eos_token_id when it's a list.""" - gen_cfg_path = self.dir_model / "generation_config.json" - if gen_cfg_path.is_file(): - with open(gen_cfg_path, encoding="utf-8") as f: - gen_cfg = json.load(f) - eos = gen_cfg.get("eos_token_id") - if isinstance(eos, list) and len(eos) >= 2: - return eos[0] - return None - - def _fix_special_tokens(self): - """Fix EOS/EOT tokens that are incorrect in upstream configs.""" - eod_id = self._get_eod_token_id() - if eod_id is not None: - self.gguf_writer.add_eos_token_id(eod_id) - eot_id = self._get_eot_token_id() - if eot_id is not None: - self.gguf_writer.add_eot_token_id(eot_id) - - def set_vocab(self): - if (self.dir_model / "tokenizer.json").is_file(): - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - # HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab - token_types = None - if (self.hparams.get("pad_token_id") or 0) < 0: - token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask') - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True, special_token_types=token_types) - special_vocab.add_to_gguf(self.gguf_writer) - self._fix_special_tokens() - else: - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - - # 1. Get the pre-tokenizer identifier hash - tokpre = self.get_vocab_base_pre(tokenizer) - - # 2. Reverse-engineer the merges list from mergeable_ranks - merges = [] - vocab = {} - mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute] - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - if len(merged) == 2: - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - - # 3. Generate the tokens and toktypes lists - vocab_size = self.hparams["vocab_size"] - assert tokenizer.vocab_size == vocab_size # ty: ignore[unresolved-attribute] - special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute] - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} - tokens: list[str] = [] - toktypes: list[int] = [] - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token = reverse_vocab[i] - tokens.append(token) - if i in special_tokens.values(): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.NORMAL) - - # 4. Write all vocab-related fields to the GGUF writer - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_token_merges(merges) - - # 5. Add special tokens and chat templates - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.add_to_gguf(self.gguf_writer) - # FIX for BOS token: Overwrite incorrect id read from config.json - if self.hparams['hidden_size'] == 4096: - self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token - self._fix_special_tokens() - - def set_gguf_parameters(self): - # HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it - saved_num_experts = self.hparams.pop("num_experts", None) - super().set_gguf_parameters() - if saved_num_experts is not None and saved_num_experts > 1: - self.hparams["num_experts"] = saved_num_experts - hparams = self.hparams - - # Rope - if self.rope_parameters.get("rope_type") in ("dynamic", "xdrope"): - # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ - # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) - alpha = self.rope_parameters.get("alpha", 50) - base = self.rope_parameters.get("rope_theta", 10000.0) - dim = hparams["head_dim"] - scaled_base = base * (alpha ** (dim / (dim - 2))) - self.gguf_writer.add_rope_freq_base(scaled_base) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - self.gguf_writer.add_rope_scaling_factor(1) - if self.rope_parameters.get("rope_type") == "dynamic": - # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k - self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length - self.gguf_writer.add_context_length(256 * 1024) # 256k context length - - # if any of our assumptions about the values are wrong, something has changed and this may need to be updated - assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \ - "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually" - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name == "lm_head.weight": - if self.hparams.get("tie_word_embeddings", False): - logger.info("Skipping tied output layer 'lm_head.weight'") - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("HunYuanVLForConditionalGeneration") -class HunyuanVLVisionModel(MmprojModel): - # Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name - # "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout. - # Each variant maps to a different projector type in clip.cpp so image - # preprocessing follows the correct code path. - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - # HunyuanOCR / HunyuanVL uses max_image_size instead of image_size - if "image_size" not in self.hparams_vision: - self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048) - - @staticmethod - def is_ocr_variant(hparams: dict) -> bool: - """Return True for HunyuanOCR, False for HunyuanVL. - - The projector's output dim must equal the text model's hidden_size by - construction (that's what "projector" means). HunyuanOCR pairs a 1B text - backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the - ViT -> LLM projection dim is a hard architectural signature, not a - magic number. - """ - vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) - return vision_out == 1024 - - def set_gguf_parameters(self): - super().set_gguf_parameters() - assert self.hparams_vision is not None - vcfg = self.hparams_vision - - if self.is_ocr_variant(self.global_config): - # --- HunyuanOCR --- - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR) - self.gguf_writer.add_vision_use_gelu(True) - self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5)) - self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2)) - self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"]) - self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"]) - return - - # --- HunyuanVL --- - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL) - self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu") - self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"])) - self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"])) - self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"])) - self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"])) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if not name.startswith("vit."): - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # strip CLS token (row 0) from position embeddings so resize_position_embeddings works - if "position_embedding" in name: - data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd] - yield from super().modify_tensors(data_torch, name, bid) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal - # Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2. - if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"): - return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - -@ModelBase.register("HunYuanVLForConditionalGeneration") -class HunyuanVLTextModel(HunYuanModel): - # The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR - # and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE), - # while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from - # the config and pick the matching GGUF architecture. - model_arch = gguf.MODEL_ARCH.HUNYUAN_VL - - @staticmethod - def _is_ocr_config(hparams: dict) -> bool: - # OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that - # outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with - # HunyuanVLVisionModel.is_ocr_variant. - return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024 - - def __init__(self, dir_model: Path, *args, **kwargs): - raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False) - if self._is_ocr_config(raw_hparams): - self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE - else: - self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL - super().__init__(dir_model, *args, **kwargs) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses - # the HunYuan-Dense arch which already handles standard rope in super(). - if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL: - return - - if self.rope_parameters.get("rope_type") != "xdrope": - return - - # defaults for HunyuanVL. The C++ side later computes: - # freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2)) - self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"])) - self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"])) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - self.gguf_writer.add_rope_scaling_factor(float(self.rope_parameters.get("factor", 1))) - - ctx_len = int(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(ctx_len) - self.gguf_writer.add_context_length(ctx_len) - - self.gguf_writer.add_rope_dimension_sections(list(self.rope_parameters["xdrope_section"])) - - -@ModelBase.register("SmolLM3ForCausalLM") -class SmolLM3Model(LlamaModel): - model_arch = gguf.MODEL_ARCH.SMOLLM3 - - -@ModelBase.register("GptOssForCausalLM") -class GptOssModel(TextModel): - model_arch = gguf.MODEL_ARCH.GPT_OSS - - # TODO: remove once MXFP4 is supported more generally - def dequant_model(self): - if self._is_mxfp4: - return - return super().dequant_model() - - def transform_nibble_layout(self, tensor): - assert tensor.dtype == torch.uint8 - assert tensor.shape[-1] == 16 - # swap nibbles - t_lo = tensor & 0x0F - t_hi = tensor & 0xF0 - t_swapped = (t_lo << 4) | (t_hi >> 4) - tensor = t_swapped - # transform aaaa...bbbb... to abababab... - blk_a, blk_b = tensor.chunk(2, dim=-1) - # get a_ - blk_a0 = (blk_a & 0xF0).view(-1, 1) - blk_a1 = (blk_a << 4).view(-1, 1) - blk_a = torch.stack((blk_a0, blk_a1), dim=2).view(tensor.shape) - # get _b - blk_b0 = (blk_b >> 4).view(-1, 1) - blk_b1 = (blk_b & 0x0F).view(-1, 1) - blk_b = torch.stack((blk_b0, blk_b1), dim=2).view(tensor.shape) - # swap once more - out = blk_a | blk_b - out_h = out & 0xF0 - out_l = out & 0x0F - out = (out_h >> 4) | (out_l << 4) - return out - - def repack_mxfp4(self, new_name: str, blocks: Tensor, scales: Tensor): - assert blocks.dtype == torch.uint8 - assert scales.dtype == torch.uint8 - scales = scales.unsqueeze(-1) - assert len(blocks.shape) == 4 - assert len(scales.shape) == 4 - blocks = self.transform_nibble_layout(blocks) - new_data = torch.concat((scales, blocks), dim=-1) - new_shape = [new_data.shape[0], new_data.shape[1], new_data.shape[2] * 32] - logger.info(f"Repacked {new_name} with shape {new_shape} and quantization MXFP4") - # flatten last dim - new_data = new_data.view(new_data.shape[0], new_data.shape[1], new_data.shape[2] * new_data.shape[3]) - new_data = new_data.numpy() - self.gguf_writer.add_tensor(new_name, new_data, raw_dtype=gguf.GGMLQuantizationType.MXFP4) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - blocks0: Tensor = torch.zeros(1) - blocks1: Tensor = torch.zeros(1) - # we assume that tensors are loaded in the correct order - for name, data_torch in self.get_tensors(): - if "mlp.experts.down_proj_blocks" in name: - blocks0 = data_torch - elif "mlp.experts.down_proj_scales" in name: - new_name = self.map_tensor_name(name.replace("_scales", ".weight")) - self.repack_mxfp4(new_name, blocks0, data_torch) - elif "mlp.experts.gate_up_proj_blocks" in name: - blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :] - elif "mlp.experts.gate_up_proj_scales" in name: - scales0, scales1 = data_torch[:, ::2, :], data_torch[:, 1::2, :] - new_name_gate = self.map_tensor_name(name.replace("gate_up_proj_scales", "gate_proj.weight")) - new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight")) - self.repack_mxfp4(new_name_gate, blocks0, scales0) - self.repack_mxfp4(new_name_up, blocks1, scales1) - return [] - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if "sinks" in name: - name += ".weight" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # correct naming for down_proj - if "down_proj" in name: - if name.endswith("_bias"): - name = name.replace("down_proj_bias", "down_proj.bias") - elif "_blocks" not in name and "_scales" not in name: - logger.warning(f"{name} is not in MXFP4, performance may be degraded") - name = name.replace("down_proj", "down_proj.weight") - data_torch = data_torch.transpose(-1, -2) - else: - # otherwise, it should already be repacked to ggml MXFP4 format - return - - # split the gate_up into gate and up - if "gate_up_proj" in name: - if name.endswith("_bias"): - name_up = name.replace("gate_up_proj_bias", "up_proj.bias") - name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias") - gate_proj_bias, up_proj_bias = data_torch[..., ::2], data_torch[..., 1::2] - yield from super().modify_tensors(gate_proj_bias, name_gate, bid) - yield from super().modify_tensors(up_proj_bias, name_up, bid) - elif "_blocks" not in name and "_scales" not in name: - logger.warning(f"{name} is not in MXFP4, performance may be degraded") - name_up = name.replace("gate_up_proj", "up_proj.weight") - name_gate = name.replace("gate_up_proj", "gate_proj.weight") - data_torch = data_torch.transpose(-1, -2) - gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :] - yield from super().modify_tensors(gate_proj_weight, name_gate, bid) - yield from super().modify_tensors(up_proj_weight, name_up, bid) - else: - yield from super().modify_tensors(data_torch, name, bid) - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"]) - - -@ModelBase.register("Lfm2ForCausalLM", "LFM2ForCausalLM") -class LFM2Model(TextModel): - model_arch = gguf.MODEL_ARCH.LFM2 - - def _add_feed_forward_length(self): - ff_dim = self.find_hparam(["block_ff_dim", "intermediate_size"]) - auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"] - ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"] - multiple_of = self.hparams["block_multiple_of"] - - if auto_adjust_ff_dim: - ff_dim = int(2 * ff_dim / 3) - # custom dim factor multiplier - if ffn_dim_multiplier is not None: - ff_dim = int(ffn_dim_multiplier * ff_dim) - ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of) - - self.gguf_writer.add_feed_forward_length(ff_dim) - - def set_gguf_parameters(self): - # set num_key_value_heads only for attention layers - self.hparams["num_key_value_heads"] = [ - self.hparams["num_key_value_heads"] if layer_type != "conv" else 0 - for layer_type in self.hparams["layer_types"] - ] - - super().set_gguf_parameters() - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"]) - self._add_feed_forward_length() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if ConformerAudioModel.is_audio_tensor(name): - # skip multimodal tensors - return None - - name = name.replace("lfm.", "model.") # audio - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # conv op requires 2d tensor - if 'conv.conv' in name: - data_torch = data_torch.squeeze(1) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Lfm2Model") -class LFM2ColBertModel(LFM2Model): - model_arch = gguf.MODEL_ARCH.LFM2 - dense_tensor_name = "dense_2" - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if not name.startswith(self.dense_tensor_name): - name = "model." + name - - yield from super().modify_tensors(data_torch, name, bid) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - # dense tensor is stored in a separate safetensors file - from safetensors.torch import load_file - tensors_file = self.dir_model / "1_Dense" / "model.safetensors" - assert tensors_file.is_file() - tensor = load_file(tensors_file)["linear.weight"] - self.gguf_writer.add_embedding_length_out(tensor.shape[0]) - yield f"{self.dense_tensor_name}.weight", tensor.clone() - - -@ModelBase.register("Lfm2MoeForCausalLM") -class LFM2MoeModel(TextModel): - model_arch = gguf.MODEL_ARCH.LFM2MOE - - def set_gguf_parameters(self): - # set num_key_value_heads only for attention layers - self.hparams["num_key_value_heads"] = [ - self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0 - for layer_type in self.hparams["layer_types"] - ] - - super().set_gguf_parameters() - - self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) - self.gguf_writer.add_leading_dense_block_count(self.hparams["num_dense_layers"]) - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"]) - - # cache for experts weights for merging - _experts_cache: dict[int, dict[str, Tensor]] = {} - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.endswith(".expert_bias"): - name = name.replace(".expert_bias", ".expert_bias.bias") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # conv op requires 2d tensor - if 'conv.conv' in name: - data_torch = data_torch.squeeze(1) - - # merge expert weights - if 'experts' in name: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - expert_cache = self._experts_cache.setdefault(bid, {}) - expert_cache[name] = data_torch - expert_weights = ["w1", "w2", "w3"] - - # not enough expert weights to merge - if len(expert_cache) < n_experts * len(expert_weights): - return - - for w_name in expert_weights: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{w_name}.weight" - datas.append(expert_cache[ename]) - del expert_cache[ename] - - data_torch = torch.stack(datas, dim=0) - merged_name = f"layers.{bid}.feed_forward.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - - del self._experts_cache[bid] - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - assert not self._experts_cache - - -@ModelBase.register("Lfm2VlForConditionalGeneration") -class LFM2VLModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - # TODO(tarek): for dynamic resolution image_size is not specified, setting here for compatibility - self.hparams_vision["image_size"] = 256 - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2) - self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["layer_norm_eps"])) - self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("downsample_factor", 2)) - self.gguf_writer.add_vision_use_gelu(True) - # python notation, e.g. for vision_feature_layer == -1, we pick last layer -> vision_feature_layers_to_drop = 0 - vision_feature_layers_to_drop = -(self.global_config.get("vision_feature_layer", -1) + 1) - self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys) - vision_feature_layers_to_drop) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - name = name.replace("model.vision_tower.", "vision_tower.") - name = name.replace("model.multi_modal_projector.", "multi_modal_projector.") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "patch_embedding.weight" in name: - data_torch = data_torch.view(data_torch.shape[0], 16, 16, 3).permute(0, 3, 1, 2) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Lfm2AudioForConditionalGeneration") -class LFM2AudioModel(ConformerAudioModel): - has_vision_encoder = False - has_audio_encoder = True - model_name = "Lfm2AudioEncoder" - - def get_audio_config(self) -> dict[str, Any] | None: - return self.global_config.get("encoder") - - def set_gguf_parameters(self): - assert self.hparams_audio is not None - self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"] - self.hparams_audio["intermediate_size"] = self.hparams_audio["d_model"] - self.hparams_audio["num_attention_heads"] = self.hparams_audio["n_heads"] - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2A) - self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) - self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # skip language model tensors - if name.startswith("lfm."): - return None - - # for training only - if any(p in name for p in ["audio_loss_weight"]): - return None - - # for audio output - if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]): - return None - - return super().filter_tensors(item) - - -@ModelBase.register("GraniteSpeechForConditionalGeneration") -class GraniteSpeechMmprojModel(MmprojModel): - has_vision_encoder = False - has_audio_encoder = True - - _batch_norm_tensors: list[dict[str, Tensor]] | None = None - - def get_audio_config(self) -> dict[str, Any] | None: - return self.global_config.get("encoder_config") - - def set_gguf_parameters(self): - assert self.hparams_audio is not None - a = self.hparams_audio - a["hidden_size"] = a["hidden_dim"] - a["intermediate_size"] = a["hidden_dim"] * a["feedforward_mult"] - a["num_attention_heads"] = a["num_heads"] - a["num_hidden_layers"] = a["num_layers"] - - super().set_gguf_parameters() - - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE_SPEECH) - self.gguf_writer.add_audio_num_mel_bins(a["input_dim"]) - self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) - self.gguf_writer.add_audio_chunk_size(a["context_size"]) - self.gguf_writer.add_audio_conv_kernel_size(a["conv_kernel_size"]) - self.gguf_writer.add_audio_max_pos_emb(a["max_pos_emb"]) - - p = self.global_config - self.gguf_writer.add_audio_projector_window_size(p["window_size"]) - self.gguf_writer.add_audio_projector_downsample_rate(p["downsample_rate"]) - self.gguf_writer.add_audio_projector_head_count(p["projector_config"]["num_attention_heads"]) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if "encoder" in name or "projector" in name: - if ".conv" in name and ".weight" in name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - if "attention_dists" in name or "num_batches_tracked" in name: - return None - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # fold running_mean, running_var and eps into weight and bias for batch_norm - if "batch_norm" in name and "encoder.layers." in name: - if self._batch_norm_tensors is None: - self._batch_norm_tensors = [{} for _ in range(self.block_count)] - assert bid is not None - self._batch_norm_tensors[bid][name] = data_torch - if len(self._batch_norm_tensors[bid]) < 4: - return - prefix = f"encoder.layers.{bid}.conv.batch_norm" - weight = self._batch_norm_tensors[bid][f"{prefix}.weight"] - bias = self._batch_norm_tensors[bid][f"{prefix}.bias"] - running_mean = self._batch_norm_tensors[bid][f"{prefix}.running_mean"] - running_var = self._batch_norm_tensors[bid][f"{prefix}.running_var"] - eps = 1e-5 - a = weight / torch.sqrt(running_var + eps) - b = bias - running_mean * a - yield from super().modify_tensors(a, f"encoder.layers.{bid}.conv.batch_norm.weight", bid) - yield from super().modify_tensors(b, f"encoder.layers.{bid}.conv.batch_norm.bias", bid) - return - - if ".attn.to_kv.weight" in name: - k_weight, v_weight = data_torch.chunk(2, dim=0) - yield from super().modify_tensors(k_weight, name.replace("to_kv", "to_k"), bid) - yield from super().modify_tensors(v_weight, name.replace("to_kv", "to_v"), bid) - return - - if ("up_conv" in name or "down_conv" in name) and name.endswith(".weight"): - if data_torch.ndim == 3 and data_torch.shape[2] == 1: - data_torch = data_torch.squeeze(2) - - if "depth_conv" in name and name.endswith(".weight"): - if data_torch.ndim == 3 and data_torch.shape[1] == 1: - data_torch = data_torch.squeeze(1) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Lfm25AudioTokenizer") -class LFM25AudioTokenizer(LFM2Model): - model_arch = gguf.MODEL_ARCH.LFM2 - - def set_vocab(self): - self._set_vocab_none() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - self.gguf_writer.add_embedding_length_out(self.hparams["output_size"]) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # skip language model tensors - if name == "istft.window" or name.startswith("emb.emb"): - return None - - if name.startswith("lin"): - name = name.replace("lin", "dense_2_out") - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("SmallThinkerForCausalLM") -class SmallThinkerModel(TextModel): - model_arch = gguf.MODEL_ARCH.SMALLTHINKER - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if (n_experts := self.hparams.get("moe_num_primary_experts")) is not None: - self.gguf_writer.add_expert_count(n_experts) - if (n_experts_used := self.hparams.get("moe_num_active_primary_experts")) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - if (moe_intermediate_size := self.hparams.get("moe_ffn_hidden_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - self.gguf_writer.add_feed_forward_length(moe_intermediate_size) - logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") - if (self.hparams.get('moe_primary_router_apply_softmax')): - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) - else: - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - - sliding_window_layout = self.hparams.get("sliding_window_layout") - if sliding_window_layout: - for i in sliding_window_layout: - if i != 0: - sliding_window = self.hparams.get("sliding_window_size") - if sliding_window: - self.gguf_writer.add_sliding_window(sliding_window) - break - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("experts") != -1: - n_experts = self.hparams.get("moe_num_primary_experts") or self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down", "gate", "up"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("ModernBertModel", "ModernBertForMaskedLM", "ModernBertForSequenceClassification") -class ModernBertModel(BertModel): - model_arch = gguf.MODEL_ARCH.MODERN_BERT - - def set_vocab(self): - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) - self.gguf_writer.add_add_sep_token(True) - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_sliding_window(self.hparams["local_attention"]) - if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None: - self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("model."): - name = name[6:] - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if self.cls_out_labels: - # For BertForSequenceClassification (direct projection layer) - if name == "classifier.weight": - name = "classifier.out_proj.weight" - - if name == "classifier.bias": - name = "classifier.out_proj.bias" - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("ApertusForCausalLM") -class ApertusModel(LlamaModel): - model_arch = gguf.MODEL_ARCH.APERTUS - undo_permute = False - - _alpha_n = {} - _alpha_p = {} - _beta = {} - _eps = {} - - def modify_tensors(self, data_torch, name, bid): - # Handle xIELU activation parameters - n_layers = self.hparams["num_hidden_layers"] - if name.endswith(".act_fn.alpha_n"): - self._alpha_n[bid] = data_torch.to("cpu").float().item() - if (len(self._alpha_n) == n_layers): - self.gguf_writer.add_xielu_alpha_n([self._alpha_n[k] for k in sorted(self._alpha_n)]) - return - if name.endswith(".act_fn.alpha_p"): - self._alpha_p[bid] = data_torch.to("cpu").float().item() - if (len(self._alpha_p) == n_layers): - self.gguf_writer.add_xielu_alpha_p([self._alpha_p[k] for k in sorted(self._alpha_p)]) - return - if name.endswith(".act_fn.beta"): - self._beta[bid] = data_torch.to("cpu").float().item() - if (len(self._beta) == n_layers): - self.gguf_writer.add_xielu_beta([self._beta[k] for k in sorted(self._beta)]) - return - if name.endswith(".act_fn.eps"): - self._eps[bid] = data_torch.to("cpu").float().item() - if (len(self._eps) == n_layers): - self.gguf_writer.add_xielu_eps([self._eps[k] for k in sorted(self._eps)]) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -class MistralModel(LlamaModel): - model_arch = gguf.MODEL_ARCH.MISTRAL3 - model_name = "Mistral" - hf_arch = "" - is_mistral_format = True - undo_permute = False - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # for compatibility, we use LLAMA arch for older models - # TODO: remove this once everyone migrates to newer version of llama.cpp - if "llama_4_scaling" not in self.hparams: - self.model_arch = gguf.MODEL_ARCH.LLAMA - self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] - self.gguf_writer.add_architecture() - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - def dequant_model(self): - # transform quantization config into HF format - quant_config = self.hparams.get("quantization") - if quant_config is not None: - assert quant_config["qformat_weight"] == "fp8_e4m3" - self.hparams["quantization_config"] = { - "activation_scheme": "static", - "quant_method": "fp8", - "weight_block_size": None, - } - return super().dequant_model() - - @staticmethod - def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool): - assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg - assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), ( - f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}" - ) - - if vocab.tokenizer.version == TokenizerVersion.v1: - return "mistral-v1" - elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.spm: - return "mistral-v3" - elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.tekken: - return "mistral-v3-tekken" - elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.spm: - return "mistral-v7" - elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.tekken: - return "mistral-v7-tekken" - elif vocab.tokenizer.version == TokenizerVersion.v11: - template_file = "Mistral-Small-3.2-24B-Instruct-2506.jinja" - elif vocab.tokenizer.version == TokenizerVersion.v13: - template_file = "unsloth-mistral-Devstral-Small-2507.jinja" - else: - err_message = f"Unknown tokenizer type: {vocab.tokenizer_type} and version {vocab.tokenizer.version}" - if is_mistral_format: - err_message += ( - " . Please pass --disable-mistral-community-chat-template argument to the CLI " - "if you want to skip this error and use the Mistral official `mistral-common` pre-processing library." - ) - raise ValueError(err_message) - - template_path = templates_dir / template_file - if not template_path.exists(): - raise FileNotFoundError(f"Template file not found: {template_path}") - - with open(template_path, "r", encoding="utf-8") as f: - template = f.read() - - return template - - def set_gguf_parameters(self): - super().set_gguf_parameters() - MistralModel.set_mistral_config(self.gguf_writer, self.hparams) - - @staticmethod - def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict): - if "yarn" in hparams: - yarn_params = hparams["yarn"] - mscale_all_dim = 1.0 if not yarn_params["apply_scale"] else 0.0 - gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - gguf_writer.add_rope_scaling_factor(yarn_params["factor"]) - gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"]) - gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"]) - gguf_writer.add_rope_scaling_yarn_log_mul(mscale_all_dim) - gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"]) - - if "llama_4_scaling" in hparams: - gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"]) - - -class MistralMoeModel(DeepseekV2Model): - model_arch = gguf.MODEL_ARCH.DEEPSEEK2 - model_name = "Mistral" - hf_arch = "" - is_mistral_format = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - logger.info("Using MistralMoeModel") - # remap hparams from Mistral MoE format to DeepseekV2 format - # we do this way to be able to reuse DeepseekV2Model set_gguf_parameters logic - # ref: https://github.com/vllm-project/vllm/blob/b294e28db2c5dee61bc25157664edcada8b90b31/vllm/transformers_utils/configs/mistral.py - config = self.hparams - # Mistral key -> HF key - config_mapping = { - "dim": "hidden_size", - "norm_eps": "rms_norm_eps", - "n_kv_heads": "num_key_value_heads", - "n_layers": "num_hidden_layers", - "n_heads": "num_attention_heads", - "hidden_dim": "intermediate_size", - } - # HF key -> (Mistral key, default value) - top_level_mapping_with_default = { - "model_type": ("model_type", "transformer"), - "hidden_act": ("activation", "silu"), - "tie_word_embeddings": ("tied_embeddings", False), - "max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)), - "max_position_embeddings": ("max_position_embeddings", 128_000), - } - # mapping top-level keys - for key, new_key in config_mapping.items(): - if key in config: - config[new_key] = config[key] - for new_key, (key, default_value) in top_level_mapping_with_default.items(): - config[new_key] = config.get(key, default_value) - # mapping MoE-specific keys - moe_config_map = { - "route_every_n": "moe_layer_freq", - "first_k_dense_replace": "first_k_dense_replace", - "num_experts_per_tok": "num_experts_per_tok", - "num_experts": "n_routed_experts", - "expert_hidden_dim": "moe_intermediate_size", - "routed_scale": "routed_scaling_factor", - "num_shared_experts": "n_shared_experts", - "num_expert_groups": "n_group", - "num_expert_groups_per_tok": "topk_group", - } - moe = config["moe"] - for key, new_key in moe_config_map.items(): - if key in moe: - config[new_key] = moe[key] - # provide missing values - config["topk_method"] = None - config["norm_topk_prob"] = True - config["scoring_func"] = "softmax" - - def set_vocab(self): - self._set_vocab_mistral() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - MistralModel.set_mistral_config(self.gguf_writer, self.hparams) - yarn_params = self.hparams["yarn"] - self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"]) - - # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] - # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul - # ref https://github.com/ggml-org/llama.cpp/pull/17945 - self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1 - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # rename certain tensors so that we can reuse DeepseekV2Model modify_tensors logic - if name.endswith(".qscale_act"): - name = name.replace(".qscale_act", ".input_scale") - if name.endswith(".qscale_weight"): - name = name.replace(".qscale_weight", ".weight_scale") - if ".wkv_b." in name: - name = name.replace(".wkv_b.", ".kv_b_proj.") - if ".experts." in name: - name = name.replace(".experts.", ".mlp.experts.") - name = name.replace(".w1.", ".gate_proj.") - name = name.replace(".w2.", ".down_proj.") - name = name.replace(".w3.", ".up_proj.") - name = "model." + name - - return super().filter_tensors((name, gen)) - - -class PixtralModel(LlavaVisionModel): - model_name = "Pixtral" - hf_arch = "" - is_mistral_format = True - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL) - - self.gguf_writer.add_vision_attention_layernorm_eps( - self.find_hparam(["norm_eps"]) - ) - self.gguf_writer.add_rope_freq_base(self.find_vparam(["rope_theta"])) - - self.gguf_writer.add_vision_use_silu(True) - - # spatial_merge_size - if self.find_vparam(["mm_projector_id"], optional=True) == "patch_merge": - self.gguf_writer.add_vision_spatial_merge_size( - self.find_vparam(["spatial_merge_size"]) - ) - - def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: - if name == "vision_language_adapter.w_in.weight": - return "mm.1.weight" - elif name == "vision_language_adapter.w_in.bias": - return "mm.1.bias" - elif name == "vision_language_adapter.w_out.weight": - return "mm.2.weight" - elif name == "vision_language_adapter.w_out.bias": - return "mm.2.bias" - return super().map_tensor_name(name, try_suffixes) - - -@ModelBase.register("LightOnOCRForConditionalGeneration") -class LightOnOCRVisionModel(LlavaVisionModel): - is_mistral_format = False - use_break_tok = False - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LIGHTONOCR) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - name = name.replace("model.vision_encoder.", "vision_tower.") - name = name.replace("model.vision_projection.", "multi_modal_projector.") - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("KimiVLForConditionalGeneration") -class KimiVLModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - self.hparams_vision["image_size"] = 64 * 14 # for compatibility - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIVL) - self.gguf_writer.add_vision_use_gelu(True) - self.gguf_writer.add_vision_projector_scale_factor(2) - # eps is the same as pytorch's default value - assert self.hparams_vision is not None - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-5)) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name - - if not is_vision_tensor: - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "pos_emb.weight" in name: - data_torch = data_torch.view(data_torch.shape[0] * data_torch.shape[1], data_torch.shape[2]) - - if "wqkv" in name: - split_dim = 0 if "weight" in name else -1 - wq, wk, wv = data_torch.chunk(3, dim=split_dim) - yield from super().modify_tensors(wq, name.replace("wqkv", "wq"), bid) - yield from super().modify_tensors(wk, name.replace("wqkv", "wk"), bid) - yield from super().modify_tensors(wv, name.replace("wqkv", "wv"), bid) - else: - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("KimiK25ForConditionalGeneration") -class KimiK25Model(MmprojModel): - """Kimi-K2.5 with MoonViT3d vision encoder""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - assert self.hparams_vision is not None, "Kimi-K2.5 requires vision_config in model config" - - self.merge_kernel_size = tuple(self.hparams_vision.get("merge_kernel_size", [2, 2])) - self.patch_size = self.hparams_vision.get("patch_size", 14) - - # Set image_size for compatibility with base class - # Use position embedding dimensions as image_size reference - pos_emb_h = self.hparams_vision.get("init_pos_emb_height", 64) - self.hparams_vision["image_size"] = pos_emb_h * self.patch_size - - def set_gguf_parameters(self): - # Base class MmprojModel.set_gguf_parameters() already writes: - # - vision_block_count, vision_head_count, vision_embedding_length - # - vision_feed_forward_length, vision_patch_size, image_mean, image_std - # via find_vparam() which handles the vt_* prefixed keys in Kimi-K2.5's config - super().set_gguf_parameters() - assert self.hparams_vision is not None - - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIK25) - - # Position embedding parameters (for interpolation) - self.gguf_writer.add_uint32("vision.pos_emb_height", self.hparams_vision.get("init_pos_emb_height", 64)) - self.gguf_writer.add_uint32("vision.pos_emb_width", self.hparams_vision.get("init_pos_emb_width", 64)) - self.gguf_writer.add_uint32("vision.pos_emb_time", self.hparams_vision.get("init_pos_emb_time", 4)) - - # Projector parameters - self.gguf_writer.add_vision_use_gelu(self.hparams_vision.get("projector_hidden_act", "gelu") == "gelu") - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("projector_ln_eps", 1e-5)) - self.gguf_writer.add_vision_projector_scale_factor(self.merge_kernel_size[0]) - - # Image size limits - # Note: in_patch_limit is for images, in_patch_limit_each_frame is for video (not supported yet) - in_patch_limit = self.preprocessor_config.get("in_patch_limit", 16384) - min_patches = 8 # reasonable minimum - pixels_per_patch = self.patch_size ** 2 - self.gguf_writer.add_vision_min_pixels(min_patches * pixels_per_patch) - self.gguf_writer.add_vision_max_pixels(in_patch_limit * pixels_per_patch) - - @staticmethod - def permute(weights: Tensor, n_head: int) -> Tensor: - out_dim, in_dim = weights.shape - head_dim = out_dim // n_head - w = weights.reshape(n_head, head_dim // 4, 2, 2, in_dim) - w = w.permute(0, 2, 1, 3, 4) - return w.reshape(out_dim, in_dim) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Only process vision and projector tensors - is_vision = any(x in name for x in ["vision_tower", "mm_projector"]) - - if not is_vision: - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - assert self.hparams_vision is not None - n_head = self.hparams_vision.get("num_attention_heads", 16) - - # Permute Q/K weights/biases from interleaved to split RoPE format - # This allows using build_rope_2d at runtime without post-permutation. - if "wqkv" in name: - out_dim = data_torch.shape[0] - qkv_dim = out_dim // 3 - head_dim = qkv_dim // n_head - - if "weight" in name: - wq, wk, wv = data_torch[:qkv_dim, :], data_torch[qkv_dim:2 * qkv_dim, :], data_torch[2 * qkv_dim:, :] - wq = self.permute(wq, n_head) - wk = self.permute(wk, n_head) - data_torch = torch.cat([wq, wk, wv], dim=0) - elif "bias" in name: - bq, bk, bv = data_torch[:qkv_dim], data_torch[qkv_dim:2 * qkv_dim], data_torch[2 * qkv_dim:] - bq = bq.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1) - bk = bk.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1) - data_torch = torch.cat([bq, bk, bv], dim=0) - - # Temporal embeddings: (T, 1, C) → (T, C) - if "pos_emb.time_weight" in name: - T, _, C = data_torch.shape - data_torch = data_torch.reshape(T, C) - - # PatchMergerMLP tensor name mapping - # proj.0.weight → proj.linear_1.weight - # proj.2.weight → proj.linear_2.weight - if "mm_projector.proj.0." in name: - name = name.replace(".proj.0.", ".proj.linear_1.") - elif "mm_projector.proj.2." in name: - name = name.replace(".proj.2.", ".proj.linear_2.") - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("CogVLMForCausalLM") -class CogVLMVisionModel(MmprojModel): - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if not name.startswith("model.vision."): - return None - - return super().filter_tensors(item) - - -@ModelBase.register("CogVLMForCausalLM") -class CogVLMModel(LlamaModel): - model_arch = gguf.MODEL_ARCH.COGVLM - - -@ModelBase.register("JanusForConditionalGeneration") -class JanusProModel(LlamaModel): - model_arch = gguf.MODEL_ARCH.LLAMA # reuse Llama arch - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Skip vision, aligner, and generation tensors - skip_prefixes = ( - 'model.vision_model.', - 'model.aligner.', - 'model.vqmodel.', - 'model.generation_embeddings.', - 'model.generation_aligner.', - 'model.generation_head.', - ) - if name.startswith(skip_prefixes): - return None - - return super().filter_tensors(item) - - -@ModelBase.register("JanusForConditionalGeneration") -class JanusProVisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - if "intermediate_size" not in self.hparams_vision: - mlp_ratio = self.hparams_vision.get("mlp_ratio") - hidden_size = self.hparams_vision.get("hidden_size") - if mlp_ratio is not None and hidden_size is not None: - self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio)) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - assert self.hparams_vision is not None - - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO) - - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6)) - - hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower() - if hidden_act == "gelu": - self.gguf_writer.add_vision_use_gelu(True) - elif hidden_act == "silu": - self.gguf_writer.add_vision_use_silu(True) - - def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]: - """Map aligner tensors to projector format""" - suffix = ".bias" if name.endswith(".bias") else ".weight" - - if name.startswith("model.aligner."): - local_name = name[len("model.aligner."):] - elif name.startswith("aligner."): - local_name = name[len("aligner."):] - else: - raise ValueError(f"Unsupported Janus aligner prefix: {name}") - - if local_name.startswith("fc1."): - mm_index = 0 - elif local_name.startswith("hidden_layers."): - parts = local_name.split(".", 2) - if len(parts) < 3: - raise ValueError(f"Unexpected Janus aligner tensor name: {name}") - mm_index = int(parts[1]) + 1 - else: - raise ValueError(f"Unsupported Janus aligner tensor: {name}") - - tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix) - return [(tensor_name, data_torch)] - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Skip generation-related components - skip_generation_prefixes = ( - 'model.vqmodel.', - 'vqmodel.', - 'model.generation_embeddings.', - 'generation_embeddings.', - 'model.generation_aligner.', - 'generation_aligner.', - 'model.generation_head.', - 'generation_head.', - ) - if name.startswith(skip_generation_prefixes): - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Handle aligner tensors - if name.startswith(('model.aligner.', 'aligner.')): - yield from self._map_aligner_tensor(data_torch, name) - return - - # Handle vision tensors - if name.startswith(('model.vision_model.', 'vision_model.')): - yield from super().modify_tensors(data_torch, name, bid) - return - - return - - -@ModelBase.register("YoutuVLForConditionalGeneration") -class YoutuVLVisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.YOUTUVL) - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) - - # Handle activation function - hidden_act = str(self.hparams.get("hidden_act", "gelu_pytorch_tanh")).lower() - if hidden_act in ("gelu", "gelu_pytorch_tanh", "gelu_fast", "gelu_new", "gelu_accurate"): - self.gguf_writer.add_vision_use_gelu(True) - elif hidden_act == "silu": - self.gguf_writer.add_vision_use_silu(True) - else: - raise ValueError(f"Unsupported activation function for YOUTUVL: {hidden_act}") - - self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2)) - - window_size = self.hparams.get("window_size") - if window_size is not None: - self.gguf_writer.add_vision_window_size(window_size) - # fullatt_block_indexes contains explicit layer indices that use full attention - # e.g., [2, 5, 8, 11] means layers 2, 5, 8, 11 use full attention - # All other layers use window attention - fullatt_block_indexes = self.hparams.get("fullatt_block_indexes") - assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for youtuvl" - # Store the explicit layer indices for YoutuVL (irregular pattern approach) - self.gguf_writer.add_vision_wa_layer_indexes(layers=fullatt_block_indexes) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Skip language model tensors - skip_prefixes = ('lm_head.', 'model.layers.', 'model.embed_tokens.', 'model.norm.') - if name.startswith(skip_prefixes): - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Try to map the tensor using TensorNameMap (handles vision encoder and projector) - try: - yield from super().modify_tensors(data_torch, name, bid) - except ValueError: - # If mapping fails, log warning and skip - logger.warning(f"Cannot map tensor: {name}") - return - - -@ModelBase.register("SolarOpenForCausalLM") -class SolarOpenModel(Glm4MoeModel): - model_arch = gguf.MODEL_ARCH.GLM4_MOE - - def set_vocab(self): - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("unk", tokenizer.get_added_vocab()[""]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab.add_to_gguf(self.gguf_writer) - - -@ModelBase.register("DotsOCRForCausalLM") -class DotsOCRVisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - self.hparams_vision["image_size"] = 0 # dynamic resolution - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DOTSOCR) - self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"]) - self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"]) - self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["rms_norm_eps"])) - self.gguf_writer.add_vision_projector_scale_factor(self.find_vparam(["spatial_merge_size"])) - self.gguf_writer.add_vision_use_silu(True) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if not name.startswith("vision_tower."): - return None - - if "vision_tower.blocks." in name and ".mlp." in name: - # note: to avoid naming conflicts in tensor_mapping.py, we need to handle FFN renaming here - # x = F.silu(self.fc1(x)) * self.fc3(x) - # x = self.fc2(x) - # fc1 -> gate, fc2 -> down, fc3 -> up - # mapping original names to Qwen2.5 naming scheme - name = name.replace("vision_tower.blocks.", "visual.blocks.") - name = name.replace(".fc1", ".gate_proj") - name = name.replace(".fc2", ".down_proj") - name = name.replace(".fc3", ".up_proj") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Sarashina2VisionForCausalLM") -class Sarashina2VLTextModel(LlamaModel): - model_arch = gguf.MODEL_ARCH.LLAMA - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - if name.startswith("llm."): - name = name.replace("llm.", "", 1) - elif name.startswith("norm."): - return None - return super().filter_tensors((name, gen)) - - -@ModelBase.register("Sarashina2VisionForCausalLM") -class Sarashina2VLVisionModel(Qwen2VLVisionModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.global_config['model_type'] = "qwen2_vl" - - -###### CONVERSION LOGIC ###### - - -# tree of lazy tensors -class LazyTorchTensor(gguf.LazyBase): - _tensor_type = torch.Tensor - # to keep the type-checker happy - dtype: torch.dtype - shape: torch.Size - - # only used when converting a torch.Tensor to a np.ndarray - _dtype_map: dict[torch.dtype, type] = { - torch.float16: np.float16, - torch.float32: np.float32, - torch.uint8: np.uint8, - } - - # only used when byteswapping data. Only correct size is needed - # TODO: uncomment uint64, uint32, and uint16, ref: https://github.com/pytorch/pytorch/issues/58734 - _dtype_byteswap_map: dict[torch.dtype, type] = { - torch.float64: np.float64, - torch.float32: np.float32, - torch.bfloat16: np.float16, - torch.float16: np.float16, - torch.int64: np.int64, - # torch.uint64: np.uint64, - torch.int32: np.int32, - # torch.uint32: np.uint32, - torch.int16: np.int16, - # torch.uint16: np.uint16, - torch.int8: np.int8, - torch.uint8: np.uint8, - torch.bool: np.uint8, - torch.float8_e4m3fn: np.uint8, - torch.float8_e5m2: np.uint8, - } - - # used for safetensors slices - # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 - # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 - _dtype_str_map: dict[str, torch.dtype] = { - "F64": torch.float64, - "F32": torch.float32, - "BF16": torch.bfloat16, - "F16": torch.float16, - # "U64": torch.uint64, - "I64": torch.int64, - # "U32": torch.uint32, - "I32": torch.int32, - # "U16": torch.uint16, - "I16": torch.int16, - "U8": torch.uint8, - "I8": torch.int8, - "BOOL": torch.bool, - "F8_E4M3": torch.float8_e4m3fn, - "F8_E5M2": torch.float8_e5m2, - } - - def numpy(self) -> gguf.LazyNumpyTensor: - dtype = self._dtype_map[self.dtype] - return gguf.LazyNumpyTensor( - meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), - args=(self,), - func=(lambda s: s.numpy()) - ) - - @classmethod - def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor: - return torch.empty(size=shape, dtype=dtype, device="meta") +import argparse +import logging +import os +import sys +from pathlib import Path - @classmethod - def from_safetensors_slice(cls, st_slice: Any) -> Tensor: - dtype = cls._dtype_str_map[st_slice.get_dtype()] - shape: tuple[int, ...] = tuple(st_slice.get_shape()) - lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[...] if len(s.get_shape()) == 0 else s[:]) - return cast(torch.Tensor, lazy) +import torch - @classmethod - def from_local_tensor(cls, t: gguf.utility.LocalTensor) -> Tensor: - def load_tensor(tensor: gguf.utility.LocalTensor) -> Tensor: - def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray: - if sys.byteorder == 'big': - # switch data back to big endian - tensor = tensor.view(dtype).byteswap(inplace=False) - return tensor - dtype = cls._dtype_str_map[tensor.dtype] - numpy_dtype = cls._dtype_byteswap_map[dtype] - return torch.from_numpy(byteswap_tensor(tensor.mmap_bytes(), numpy_dtype)).view(dtype).reshape(tensor.shape) - dtype = cls._dtype_str_map[t.dtype] - shape = t.shape - lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(t,), func=lambda r: load_tensor(r)) - return cast(torch.Tensor, lazy) +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +import gguf - @classmethod - def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor): - def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray: - if sys.byteorder == 'big': - # switch data back to big endian - tensor = tensor.view(dtype).byteswap(inplace=False) - return tensor - dtype = cls._dtype_str_map[remote_tensor.dtype] - numpy_dtype = cls._dtype_byteswap_map[dtype] - shape = remote_tensor.shape - meta = cls.meta_with_dtype_and_shape(dtype, shape) - lazy = cls(meta=meta, args=(remote_tensor,), func=lambda r: torch.from_numpy(byteswap_tensor(np.frombuffer(r.data(), dtype=numpy_dtype), numpy_dtype)).view(dtype).reshape(shape)) - return cast(torch.Tensor, lazy) +from conversion import ( + ModelBase, + ModelType, + get_model_architecture, + get_model_class, + logger, + print_registered_models, + _mistral_common_installed, + _mistral_import_error_msg, +) - @classmethod - def __torch_function__(cls, func, types, args=(), kwargs=None): - del types # unused - if kwargs is None: - kwargs = {} +def split_str_to_n_bytes(split_str: str) -> int: + if split_str.endswith("K"): + n = int(split_str[:-1]) * 1000 + elif split_str.endswith("M"): + n = int(split_str[:-1]) * 1000 * 1000 + elif split_str.endswith("G"): + n = int(split_str[:-1]) * 1000 * 1000 * 1000 + elif split_str.isnumeric(): + n = int(split_str) + else: + raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") - if func is torch.Tensor.numpy: - assert len(args) - return args[0].numpy() + if n < 0: + raise ValueError(f"Invalid split size: {split_str}, must be positive") - return cls._wrap_fn(func)(*args, **kwargs) + return n def parse_args() -> argparse.Namespace: @@ -14082,58 +147,12 @@ def parse_args() -> argparse.Namespace: return args -def split_str_to_n_bytes(split_str: str) -> int: - if split_str.endswith("K"): - n = int(split_str[:-1]) * 1000 - elif split_str.endswith("M"): - n = int(split_str[:-1]) * 1000 * 1000 - elif split_str.endswith("G"): - n = int(split_str[:-1]) * 1000 * 1000 * 1000 - elif split_str.isnumeric(): - n = int(split_str) - else: - raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") - - if n < 0: - raise ValueError(f"Invalid split size: {split_str}, must be positive") - - return n - - -def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str: - # TODO @ngxson : this won't work correctly if the model has both audio & vision encoders - # maybe we should fallback to text model's arch in that case, since not many models have both - text_config = hparams.get("text_config", {}) - vision_config = hparams.get("vision_config", {}) - arch = None - if (arches := hparams.get("architectures")) is not None and len(arches) > 0: - arch = arches[0] - elif "ssm_cfg" in hparams: - # For non-hf Mamba and Mamba2 models - arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM" - - # Step3-VL keeps text config under text_config but uses a custom top-level architecture. - # For text conversion we route to a dedicated text-only class. - # TODO: refactor this later to avoid adding exception here - if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM"): - return arch - - # if "architectures" is found in the sub-config, use that instead - if model_type == ModelType.TEXT and text_config.get("architectures") is not None: - arch = text_config["architectures"][0] - elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None: - arch = vision_config["architectures"][0] - if arch is None: - raise ValueError("Failed to detect model architecture") - return arch - - def main() -> None: args = parse_args() if args.print_supported_models: logger.error("Supported models:") - ModelBase.print_registered_models() + print_registered_models() sys.exit(0) if args.verbose: @@ -14199,16 +218,19 @@ def main() -> None: model_architecture = get_model_architecture(hparams, model_type) logger.info(f"Model architecture: {model_architecture}") try: - model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type) + model_class = get_model_class(model_architecture, mmproj=(model_type == ModelType.MMPROJ)) except NotImplementedError: logger.error(f"Model {model_architecture} is not supported") sys.exit(1) elif args.mmproj: assert hparams.get("vision_encoder") is not None, "This model does not support multimodal" + from conversion.pixtral import PixtralModel model_class = PixtralModel elif "moe" in hparams: + from conversion.mistral import MistralMoeModel model_class = MistralMoeModel else: + from conversion.mistral import MistralModel model_class = MistralModel model_instance = model_class(dir_model, output_type, fname_out, diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 8d73b1f5546..8b2a9454f98 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -19,7 +19,7 @@ logger = logging.getLogger("convert_hf_to_gguf_update") sess = requests.Session() -convert_py_pth = pathlib.Path("convert_hf_to_gguf.py") +convert_py_pth = pathlib.Path("conversion/base.py") convert_py = convert_py_pth.read_text(encoding="utf-8") hf_token_pth = pathlib.Path.home() / ".cache" / "huggingface" / "token" hf_token = hf_token_pth.read_text(encoding="utf-8").strip() if hf_token_pth.exists() else None @@ -374,7 +374,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: convert_py_pth.write_text(convert_py, encoding="utf-8") -logger.info("+++ convert_hf_to_gguf.py was updated") +logger.info(f"+++ {convert_py_pth} was updated") # generate tests for each tokenizer model diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index ad4751bb96e..1b7334617d1 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -22,12 +22,11 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf - -# reuse model definitions from convert_hf_to_gguf.py -from convert_hf_to_gguf import LazyTorchTensor, ModelBase - from gguf.constants import GGUFValueType +# reuse model definitions from the conversion/ package +from conversion import LazyTorchTensor, ModelBase, get_model_class + logger = logging.getLogger("lora-to-gguf") @@ -384,7 +383,7 @@ def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None] with torch.inference_mode(): try: - model_class = ModelBase.from_model_architecture(hparams["architectures"][0]) + model_class = get_model_class(hparams["architectures"][0]) except NotImplementedError: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) From 18d1717d623c7f83e35ebb48ee6ba507eb12cc3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 15 May 2026 18:38:39 +0200 Subject: [PATCH 007/309] convert : fix Qwen3 ASR conversion (#23081) * fix qwen3asr * fix qwen3asr --- conversion/qwen3vl.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/conversion/qwen3vl.py b/conversion/qwen3vl.py index 2d20b30dbde..9f11757697f 100644 --- a/conversion/qwen3vl.py +++ b/conversion/qwen3vl.py @@ -183,6 +183,9 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca if name.startswith("model.visual."): name = name.replace("model.visual.", "visual.", 1) + if name.startswith("thinker.audio_tower."): + name = name.replace("thinker.audio_tower.", "audio_tower.", 1) + if "visual." not in name and "audio_tower." not in name: return None From 8be1786707cdac011b90bf0d6ddb67c3eb6877f3 Mon Sep 17 00:00:00 2001 From: Pascal Date: Fri, 15 May 2026 19:25:38 +0200 Subject: [PATCH 008/309] webui: fix theme from --webui-config-file not applied on first load (fresh localStorage) (#22902) --- tools/server/webui/src/lib/stores/settings.svelte.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/server/webui/src/lib/stores/settings.svelte.ts b/tools/server/webui/src/lib/stores/settings.svelte.ts index 0177d79af87..da5e4024e8a 100644 --- a/tools/server/webui/src/lib/stores/settings.svelte.ts +++ b/tools/server/webui/src/lib/stores/settings.svelte.ts @@ -357,6 +357,11 @@ class SettingsStore { for (const [key, value] of Object.entries(webuiSettings)) { if (!this.userOverrides.has(key) && value !== undefined) { setConfigValue(this.config, key, value); + + // theme lives in mode-watcher, not just in config -> propagate + if (key === SETTINGS_KEYS.THEME) { + setMode(value as ColorMode); + } } } } From 72e60f500d739eb9dc26e51cda5fdfe062faf766 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Fri, 15 May 2026 19:32:47 +0200 Subject: [PATCH 009/309] mtmd: add chunks and fix preproc for qwen3a (#23073) * mtmd: add chunks and fix preproc for qwen3a * add attn_mask * limit mtmd_chunk size (avoid blow up memory) * correct audio tokens * re-order the set_input case * remove attn_mask --- tools/mtmd/clip-graph.h | 7 ++- tools/mtmd/clip.cpp | 33 ++++------- tools/mtmd/clip.h | 1 - tools/mtmd/models/qwen3a.cpp | 104 +++++++++++++++++++++-------------- tools/mtmd/mtmd-audio.cpp | 104 +++++++++++++++++++++++++++++++++++ tools/mtmd/mtmd-audio.h | 9 +++ tools/mtmd/mtmd.cpp | 7 ++- 7 files changed, 197 insertions(+), 68 deletions(-) diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 39f069501bc..c5e880c71ec 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -11,6 +11,10 @@ #define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS) +struct build_vit_opts { + ggml_tensor * attn_mask = nullptr; +}; + struct clip_graph { const clip_model & model; const clip_hparams & hparams; @@ -63,7 +67,8 @@ struct clip_graph { norm_type norm_t, ffn_op_type ffn_t, ggml_tensor * learned_pos_embd, - std::function add_pos); + std::function add_pos, + const build_vit_opts & opts = {}); // build the input after conv2d (inp_raw --> patches) // returns tensor with shape [n_embd, n_patches] diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 02f0982797e..9727a738ed8 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -300,7 +300,8 @@ ggml_tensor * clip_graph::build_vit( norm_type norm_t, ffn_op_type ffn_t, ggml_tensor * learned_pos_embd, - std::function add_pos + std::function add_pos, + const build_vit_opts & opts ) { if (learned_pos_embd) { inp = ggml_add(ctx0, inp, learned_pos_embd); @@ -427,7 +428,7 @@ ggml_tensor * clip_graph::build_vit( } cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, opts.attn_mask, kq_scale, il); cb(cur, "attn_out", il); } @@ -663,6 +664,9 @@ ggml_tensor * clip_graph::build_attn( k = ggml_cast(ctx0, k, GGML_TYPE_F16); v = ggml_cast(ctx0, v, GGML_TYPE_F16); + if (kq_mask) { + kq_mask = ggml_cast(ctx0, kq_mask, GGML_TYPE_F16); + } cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f); ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); @@ -3244,12 +3248,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } break; case PROJECTOR_TYPE_QWEN3A: { - // 3x stride-2 conv2d: each step is floor((n-1)/2)+1 - int n = img->nx; - n = (n - 1) / 2 + 1; - n = (n - 1) / 2 + 1; - n = (n - 1) / 2 + 1; - n_patches = n; + // chunk_size=100 frames --> 3x stride-2 conv2d --> 13 tokens per chunk + const int chunk_size = 100; + const int tokens_per_chunk = 13; + n_patches = (img->nx / chunk_size) * tokens_per_chunk; } break; case PROJECTOR_TYPE_GLMA: { @@ -4292,21 +4294,6 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) { return ctx->model.modality == CLIP_MODALITY_AUDIO; } -bool clip_has_whisper_encoder(const struct clip_ctx * ctx) { - switch (ctx->proj_type()) { - case PROJECTOR_TYPE_ULTRAVOX: - case PROJECTOR_TYPE_QWEN2A: - case PROJECTOR_TYPE_QWEN3A: - case PROJECTOR_TYPE_GLMA: - case PROJECTOR_TYPE_VOXTRAL: - case PROJECTOR_TYPE_MERALION: - case PROJECTOR_TYPE_MUSIC_FLAMINGO: - return true; - default: - return false; - } -} - bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { clip_image_f32 clip_img; clip_img.buf.resize(h * w * 3); diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 5f9dc93c6b0..f643ed6e979 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -115,7 +115,6 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel bool clip_has_vision_encoder(const struct clip_ctx * ctx); bool clip_has_audio_encoder(const struct clip_ctx * ctx); -bool clip_has_whisper_encoder(const struct clip_ctx * ctx); struct clip_cap { bool has_vision; diff --git a/tools/mtmd/models/qwen3a.cpp b/tools/mtmd/models/qwen3a.cpp index 1384e5155ee..4de96955d96 100644 --- a/tools/mtmd/models/qwen3a.cpp +++ b/tools/mtmd/models/qwen3a.cpp @@ -1,68 +1,88 @@ #include "models.h" ggml_cgraph * clip_graph_qwen3a::build() { + // Ref implementation: https://github.com/QwenLM/Qwen3-ASR/blob/main/qwen_asr/core/transformers_backend/modeling_qwen3_asr.py + + // inp_raw: [n_frames, n_mel, 1] (nx=n_frames, ny=n_mel) ggml_tensor * inp = build_inp_raw(1); - // conv2d block - // TODO: do we need to split by chunks of n_window each like on transformers impl? - { - inp = ggml_conv_2d(ctx0, model.conv2d_1_w, inp, 2, 2, 1, 1, 1, 1); - inp = ggml_add(ctx0, inp, model.conv2d_1_b); - inp = ggml_gelu_erf(ctx0, inp); + const int64_t n_frames = inp->ne[0]; // total frames, padded to multiple of chunk_size + const int64_t n_mel = inp->ne[1]; // 128 + const int64_t chunk_size = 100; // n_window * 2 (n_window=50 from model config) + const int64_t n_chunks = n_frames / chunk_size; - inp = ggml_conv_2d(ctx0, model.conv2d_2_w, inp, 2, 2, 1, 1, 1, 1); - inp = ggml_add(ctx0, inp, model.conv2d_2_b); - inp = ggml_gelu_erf(ctx0, inp); + GGML_ASSERT(n_frames % chunk_size == 0); // preprocessor should already pad the input + GGML_ASSERT(inp->type == GGML_TYPE_F32); - inp = ggml_conv_2d(ctx0, model.conv2d_3_w, inp, 2, 2, 1, 1, 1, 1); - inp = ggml_add(ctx0, inp, model.conv2d_3_b); - inp = ggml_gelu_erf(ctx0, inp); + // View mel spectrogram as batched 100-frame chunks: [chunk_size, n_mel, 1, n_chunks] + inp = ggml_view_4d(ctx0, inp, + chunk_size, n_mel, 1, n_chunks, + n_frames * (int64_t)sizeof(float), // nb[1]: stride over mel bins + chunk_size * (int64_t)sizeof(float), // nb[2]: stride for C=1 (unused) + chunk_size * (int64_t)sizeof(float), // nb[3]: stride over chunks + 0); + inp = ggml_cont(ctx0, inp); + cb(inp, "inp_chunks", -1); - // inp [n_pos, n_mels/8, channels, 1] (W, H, C, N) - cb(inp, "after_conv_blocks", -1); + // 3 x conv2d + gelu + { + // conv output [OW, OH, C_out, n_chunks] + auto conv_block = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) { + x = ggml_conv_2d(ctx0, w, x, 2, 2, 1, 1, 1, 1); + if (b) { + x = ggml_add(ctx0, x, ggml_reshape_4d(ctx0, b, 1, 1, x->ne[2], 1)); + } + return ggml_gelu_erf(ctx0, x); + }; - const int64_t n_pos_after_conv = inp->ne[0]; - const int64_t n_mel_after_conv = inp->ne[1]; // 128/8 = 16 + inp = conv_block(inp, model.conv2d_1_w, model.conv2d_1_b); + inp = conv_block(inp, model.conv2d_2_w, model.conv2d_2_b); + inp = conv_block(inp, model.conv2d_3_w, model.conv2d_3_b); + // inp: [OW=13, OH=16, OC=480, n_chunks] + cb(inp, "after_conv_blocks", -1); + } - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 3, 1)); - inp = ggml_reshape_2d(ctx0, inp, n_pos_after_conv, n_mel_after_conv * inp->ne[3]); // [n_pos, 7680] - inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); // [7680, n_pos] + // permute [OW=25, OH=16, OC=480, n_chunks] -> [OH=16, OC=480, OW=25, n_chunks] + // reshape to [OH*OC=7680, OW*n_chunks] + // feature index h+16*c = c*16+f (matches python code) + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 2, 0, 1, 3)); + inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2] * inp->ne[3]); - // project to n_embd - inp = ggml_mul_mat(ctx0, model.conv_out_w, inp); - if (model.conv_out_b) { - inp = ggml_add(ctx0, inp, model.conv_out_b); - } - cb(inp, "after_conv_out", -1); + // Project to d_model: [d_model, 25*n_chunks] + inp = ggml_mul_mat(ctx0, model.conv_out_w, inp); + if (model.conv_out_b) { + inp = ggml_add(ctx0, inp, model.conv_out_b); } + cb(inp, "after_conv_out", -1); - auto n_pos = inp->ne[1]; + const int64_t n_pos = inp->ne[1]; // 25 * n_chunks - ggml_tensor * pos_embd_selected = ggml_view_2d( - ctx0, model.position_embeddings, - model.position_embeddings->ne[0], n_pos, - model.position_embeddings->nb[1], 0 - ); - ggml_tensor * cur = build_vit( - inp, n_pos, - NORM_TYPE_NORMAL, - hparams.ffn_op, - pos_embd_selected, - nullptr); + // Per-chunk positional embeddings: repeat pos[0:13] for each chunk + // (position indices reset 0..12 per chunk, not sequential across chunks) + { + const int64_t tokens_per_chunk = n_pos / n_chunks; // 13 + ggml_tensor * pos_tmp = ggml_view_2d(ctx0, model.position_embeddings, + model.position_embeddings->ne[0], tokens_per_chunk, + model.position_embeddings->nb[1], 0); + ggml_tensor * tgt = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, + model.position_embeddings->ne[0], n_pos); + inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, pos_tmp, tgt)); + } + ggml_tensor * cur = build_vit(inp, n_pos, + NORM_TYPE_NORMAL, hparams.ffn_op, + nullptr, // pos embd already added above + nullptr); cb(cur, "after_transformer", -1); - // projector + // MLP projector cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, - FFN_GELU_ERF, - -1); - + FFN_GELU_ERF, -1); cb(cur, "projected", -1); ggml_build_forward_expand(gf, cur); - return gf; } diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index 8f0a9875b10..85352904739 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -609,6 +609,110 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s return true; } +// +// mtmd_audio_preprocessor_qwen3a +// +// Matches the Python WhisperFeatureExtractor called with truncation=False: +// - reflection padding of n_fft/2 samples at each end (center=True) +// - Whisper-style log10 + (max-8)/4 normalization applied to full audio +// - output split into ≤30s (3000 mel frames) windows, each padded to a +// multiple of 200 frames (n_window * 2) for the cgraph batch view +// + +void mtmd_audio_preprocessor_qwen3a::initialize() { + cache.fill_sin_cos_table(hparams.audio_n_fft); + cache.fill_hann_window(hparams.audio_window_len, true); + cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate); +} + +bool mtmd_audio_preprocessor_qwen3a::preprocess(const float * samples, + size_t n_samples, + std::vector & output) { + if (n_samples == 0) { + return false; + } + + GGML_ASSERT(!cache.sin_vals.empty()); + GGML_ASSERT(!cache.cos_vals.empty()); + GGML_ASSERT(!cache.filters.data.empty()); + + // Reflection-pad n_fft/2 samples at each end, matching WhisperFeatureExtractor center=True + const int pad = hparams.audio_n_fft / 2; // = 200 + + std::vector padded(n_samples + 2 * pad, 0.0f); + // Reflect start: padded[0..pad-1] = samples[pad..1] (reversed) + for (int i = 0; i < pad; i++) { + int src = pad - i; // samples[pad], samples[pad-1], ..., samples[1] + padded[i] = (src < (int)n_samples) ? samples[src] : 0.0f; + } + std::copy(samples, samples + n_samples, padded.begin() + pad); + // Reflect end: padded[n+pad..n+2*pad-1] = samples[n-2..n-pad-1] (reversed) + for (int i = 0; i < pad; i++) { + int src = (int)n_samples - 2 - i; // samples[n-2], samples[n-3], ... + padded[n_samples + pad + i] = (src >= 0) ? samples[src] : 0.0f; + } + + filter_params params; + params.n_mel = hparams.n_mel_bins; + params.n_fft_bins = 1 + (hparams.audio_n_fft / 2); + params.hann_window_size = hparams.audio_window_len; + params.hop_length = hparams.audio_hop_len; + params.sample_rate = hparams.audio_sample_rate; + params.no_padding = true; // reflection padding already applied above + params.use_natural_log = false; // log10 + + mtmd_audio_mel mel_full; + bool ok = log_mel_spectrogram(padded.data(), (int)padded.size(), 4, params, cache, mel_full); + if (!ok) { + return false; + } + + // Whisper-style normalization: clamp to (max - 8), scale to [-1, 1] + { + double mmax = -1e20; + for (float v : mel_full.data) { + if (v > mmax) mmax = v; + } + mmax -= 8.0; + for (float & v : mel_full.data) { + v = (std::max((double)v, mmax) + 4.0) / 4.0; + } + } + + // The effective frame count: center-padded STFT gives ~n_samples/hop_length frames. + // We take min(mel_full.n_len, n_samples/hop + 1) to avoid including excess frames. + const int n_eff = std::min(mel_full.n_len, + (int)(n_samples / hparams.audio_hop_len) + 1); + + // Split into inference windows matching n_window_infer=800 from model config. + // Each window is padded to the next multiple of chunk_size for the cgraph. + // The mtmd caller loops over output entries, so long audio is handled automatically. + const int chunk_size = 100; // conv sub-chunk size (n_window * 2, n_window=50) + const int window_size = 800; // mel frames per forward pass (n_window_infer=800) + + for (int off = 0; off < n_eff; off += window_size) { + const int win_eff = std::min(window_size, n_eff - off); + const int n_chunks = (win_eff + chunk_size - 1) / chunk_size; + const int n_padded = n_chunks * chunk_size; + + mtmd_audio_mel out; + out.n_mel = mel_full.n_mel; + out.n_len = n_padded; + out.n_len_org = win_eff; + out.data.assign(out.n_mel * out.n_len, 0.0f); + for (int m = 0; m < out.n_mel; m++) { + const int copy_len = std::min(win_eff, mel_full.n_len - off); + if (copy_len > 0) { + std::copy(mel_full.data.begin() + (size_t)m * mel_full.n_len + off, + mel_full.data.begin() + (size_t)m * mel_full.n_len + off + copy_len, + out.data.begin() + (size_t)m * out.n_len); + } + } + output.push_back(std::move(out)); + } + return true; +} + // // mtmd_audio_preprocessor_conformer // diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index c1a705de522..98ccb642415 100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -96,6 +96,15 @@ struct mtmd_audio_preprocessor_gemma4a : mtmd_audio_preprocessor { mtmd_audio_cache cache; }; +struct mtmd_audio_preprocessor_qwen3a : mtmd_audio_preprocessor { + mtmd_audio_preprocessor_qwen3a(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} + void initialize() override; + bool preprocess(const float * samples, size_t n_samples, std::vector & output) override; + + private: + mtmd_audio_cache cache; +}; + // // streaming ISTFT - converts spectrogram frames back to audio one frame at a time // diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 1ab8a4c0464..8f12d0b43ea 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -515,7 +515,6 @@ struct mtmd_context { // set preprocessor switch (proj) { case PROJECTOR_TYPE_QWEN2A: - case PROJECTOR_TYPE_QWEN3A: case PROJECTOR_TYPE_QWEN25O: { // <|audio_bos|> ... (embeddings) ... <|audio_eos|> @@ -523,6 +522,12 @@ struct mtmd_context { aud_end = "<|audio_eos|>"; audio_preproc = std::make_unique(ctx_a); } break; + case PROJECTOR_TYPE_QWEN3A: + { + aud_beg = "<|audio_start|>"; + aud_end = "<|audio_end|>"; + audio_preproc = std::make_unique(ctx_a); + } break; case PROJECTOR_TYPE_VOXTRAL: { // [BEGIN_AUDIO] ... (embeddings) ... From 6831fe470cdfbeeebcf5645e8d5d9acae3a4ddf0 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Fri, 15 May 2026 19:33:12 +0200 Subject: [PATCH 010/309] docs: document `usage` object in server timings response (#23110) * docs: document `usage` object in server timings response Co-Authored-By: julien-agent * Apply suggestion from @julien-c --------- Co-authored-by: julien-agent --- tools/server/README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/server/README.md b/tools/server/README.md index 5ba1a58f8c1..2b3a2b1687d 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1322,6 +1322,22 @@ This provides information on the performance of the server. It also allows calcu The total number of tokens in context is equal to `prompt_n + cache_n + predicted_n` +The response also includes a standard `usage` object: + +```js +{ + // ... + "usage": { + "completion_tokens": 48, + "prompt_tokens": 44, + "total_tokens": 92, + "prompt_tokens_details": { + "cached_tokens": 0 + } + } +} +``` + *Reasoning support* The server supports parsing and returning reasoning via the `reasoning_content` field, similar to Deepseek API. From cfabeb1bad9d789fefa853ed7e336ec98d981b9b Mon Sep 17 00:00:00 2001 From: Pascal Date: Fri, 15 May 2026 19:35:05 +0200 Subject: [PATCH 011/309] tests: add BF16 non-contig coverage for MUL_MAT permutations (#22689) The MUL_MAT test loop iterates over base_types[] to generate non-contig permutation cases (3 standard permutations across n in {1, 8, 16}). BF16 is absent from base_types[], so these 9 cases were never generated for BF16 even though every other type covered by base_types[] tests them. Add the missing 9 cases explicitly: BF16 x F32, m=16, k=256, bs=[2,3], permutations {0,2,1,3}, {0,1,3,2}, {0,3,2,1}, with n in {1, 8, 16}. Suggested-by: @jeffbolznv --- tests/test-backend-ops.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index c60f6d2c2b0..8a561c038a1 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -8337,6 +8337,18 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {1536, 1}, {1, 1})); } } + + // BF16 is absent from base_types: add the 3 standard non-contig permutations explicitly + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 1, 256, {2, 3}, {1, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 1, 256, {2, 3}, {1, 1}, {0, 1, 3, 2})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 1, 256, {2, 3}, {1, 1}, {0, 3, 2, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 8, 256, {2, 3}, {1, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 8, 256, {2, 3}, {1, 1}, {0, 1, 3, 2})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 8, 256, {2, 3}, {1, 1}, {0, 3, 2, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 16, 256, {2, 3}, {1, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1})); + for (ggml_type type_a : other_types) { for (ggml_type type_b : {GGML_TYPE_F32}) { if (ggml_blck_size(type_a) != 256) { From 1348f67c58f561808136e8a152a9eddec168f221 Mon Sep 17 00:00:00 2001 From: Omer Ozarslan Date: Fri, 15 May 2026 10:38:16 -0700 Subject: [PATCH 012/309] webui: Use lowercase hash for HF checksum check (#23107) --- scripts/webui-download.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/webui-download.cmake b/scripts/webui-download.cmake index 344ba5ecf53..6695c018035 100644 --- a/scripts/webui-download.cmake +++ b/scripts/webui-download.cmake @@ -184,8 +184,8 @@ if(NOT PROVISION_SUCCESS AND HF_ENABLED) foreach(asset ${ASSETS}) set(download_path "${PUBLIC_DIR}/${asset}") file(SHA256 "${download_path}" asset_hash) - string(TOUPPER "${asset_hash}" EXPECTED_HASH_UPPER) - string(REGEX MATCH "${EXPECTED_HASH_UPPER}[ \\t]+${asset}" CHECKSUM_LINE "${CHECKSUMS_CONTENT}") + string(TOLOWER "${asset_hash}" EXPECTED_HASH_LOWER) + string(REGEX MATCH "${EXPECTED_HASH_LOWER}[ \\t]+${asset}" CHECKSUM_LINE "${CHECKSUMS_CONTENT}") if(NOT CHECKSUM_LINE) message(WARNING "WebUI: checksum verification failed for ${asset}") message(WARNING " downloaded file may not match expected checksum, but will be used") From 49d1701bd24e4cedf6dfec9e50e185111203946b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 16 May 2026 01:09:28 +0200 Subject: [PATCH 013/309] ci : fix release symlinks (#23119) --- .github/workflows/release.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 218d24296be..bb512fd875c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -104,7 +104,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz -s ",^.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz -s ",^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -182,7 +182,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,^.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -259,7 +259,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz --transform "s,^.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -337,7 +337,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz --transform "s,^.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -426,7 +426,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/ReleaseOV/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^.,llama-${{ steps.tag.outputs.name }}," -C ./build/ReleaseOV/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/ReleaseOV/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -867,7 +867,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -979,7 +979,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz --transform "s,^.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -1240,7 +1240,7 @@ jobs: - name: Pack artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,^.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 From 59778f0196a82db32580bb649d5d839355d6d7bf Mon Sep 17 00:00:00 2001 From: Aleksander Grygier Date: Sat, 16 May 2026 02:02:40 +0200 Subject: [PATCH 014/309] ui: Restructure repo to use `tools/ui` folder and `ui` / `UI` / `llama-ui` / `LLAMA_UI` naming (#23064) * webui: Move static build output from `tools/server/public` to `build/ui` directory * refactor: Move to `tools/ui` * refactor: rename CMake variables and preprocessor defines - Rename LLAMA_BUILD_WEBUI -> LLAMA_BUILD_UI (old kept as deprecated) - Rename LLAMA_USE_PREBUILT_WEBUI -> LLAMA_USE_PREBUILT_UI (old kept as deprecated) - Backward compat: old vars auto-forward to new ones with DEPRECATION warning - Rename internal vars: WEBUI_SOURCE -> UI_SOURCE, WEBUI_SOURCE_DIR -> UI_SOURCE_DIR, etc. - Rename HF bucket: LLAMA_WEBUI_HF_BUCKET -> LLAMA_UI_HF_BUCKET - Emit both LLAMA_BUILD_WEBUI and LLAMA_BUILD_UI preprocessor defines - Emit both LLAMA_WEBUI_DEFAULT_ENABLED and LLAMA_UI_DEFAULT_ENABLED * refactor: rename CLI flags (--webui -> --ui) with backward compat - Add --ui/--no-ui (old --webui/--no-webui kept as deprecated aliases) - Add --ui-config (old --webui-config kept as deprecated alias) - Add --ui-config-file (old --webui-config-file kept as deprecated alias) - Add --ui-mcp-proxy/--no-ui-mcp-proxy (old --webui-mcp-proxy kept as deprecated) - Add new env vars: LLAMA_ARG_UI, LLAMA_ARG_UI_CONFIG, LLAMA_ARG_UI_CONFIG_FILE, LLAMA_ARG_UI_MCP_PROXY - C++ struct fields: params.ui, params.ui_config_json, params.ui_mcp_proxy added alongside old fields - Backward compat: old fields synced to new ones in g_params_to_internals * refactor: update C++ server internals with backward compat - Rename json_webui_settings -> json_ui_settings (both kept in server_context_meta) - Rename params.webui usage -> params.ui (both synced, old still works) - JSON API emits both "ui"/"ui_settings" and "webui"/"webui_settings" keys - Server routes use params.ui_mcp_proxy || params.webui_mcp_proxy - Preprocessor guards use #if defined(LLAMA_BUILD_UI) || defined(LLAMA_BUILD_WEBUI) * refactor: rename CI/CD workflows, artifacts, and build script - Rename webui-build.yml -> ui-build.yml; artifact webui-build -> ui-build - Rename webui-publish.yml -> ui-publish.yml; var HF_BUCKET_WEBUI_STATIC_OUTPUT -> HF_BUCKET_UI_STATIC_OUTPUT - Rename server-webui.yml -> server-ui.yml; job webui-build/checks -> ui-build/checks - Update server.yml: job/artifact refs webui-build -> ui-build - Update release.yml: all webui-build/publish refs -> ui-build/publish; HF_TOKEN_WEBUI_STATIC_OUTPUT -> HF_TOKEN_UI_STATIC_OUTPUT - Update server-self-hosted.yml: webui-build -> ui-build - Update build-self-hosted.yml: HF_WEBUI_VERSION -> HF_UI_VERSION - Rename webui-download.cmake -> ui-download.cmake (internal refs updated) - Update labeler.yml: server/webui -> server/ui path label * docs: update CODEOWNERS and server README docs - Update CODEOWNERS: team ggml-org/llama-webui -> ggml-org/llama-ui, path /tools/server/webui/ -> /tools/ui/ - Update server README.md: CLI tables show --ui flags with deprecated --webui aliases - Update server README-dev.md: "WebUI" -> "UI", paths updated to tools/ui/ * fix: Small fixes for UI build * fix: CMake.txt syntax * chore: Formatting * fix: `.editorconfig` for llama-ui * chore: Formatting * refactor: Use `APP_NAME` in Error route * refactor: Cleanup * refactor: Single migration service * make llama-ui a linkable target * fix: UI Build output * fix: Missing change * fix: separate llama-ui npm build output into build/tools/ui/dist subfolder + use cmake npm build instead of downloading ui-build.yml artifacts in CI * refactor: UI workflows cleanup --------- Co-authored-by: Xuan Son Nguyen --- .editorconfig | 2 +- .github/labeler.yml | 4 +- .github/workflows/build-self-hosted.yml | 20 +- .github/workflows/release.yml | 142 +++-- .github/workflows/server-sanitize.yml | 7 + .github/workflows/server-self-hosted.yml | 14 +- .github/workflows/server.yml | 22 +- .../{webui-build.yml => ui-build.yml} | 18 +- .../workflows/{server-webui.yml => ui-ci.yml} | 52 +- .../{webui-publish.yml => ui-publish.yml} | 16 +- .gitignore | 5 +- CMakeLists.txt | 19 +- CODEOWNERS | 4 +- common/arg.cpp | 58 +- common/common.h | 16 +- ...webui-download.cmake => ui-download.cmake} | 57 +- scripts/xxd.cmake | 2 +- tools/CMakeLists.txt | 1 + tools/server/CMakeLists.txt | 127 +---- tools/server/README-dev.md | 6 +- tools/server/README.md | 16 +- tools/server/server-context.cpp | 27 +- tools/server/server-context.h | 3 +- tools/server/server-http.cpp | 23 +- tools/server/server-models.cpp | 7 +- tools/server/server-models.h | 15 +- tools/server/server.cpp | 3 +- tools/server/webui/scripts/post-build.sh | 3 - .../src/lib/constants/localstorage-keys.ts | 6 - tools/{server/webui => ui}/.gitignore | 0 tools/{server/webui => ui}/.npmrc | 0 tools/{server/webui => ui}/.prettierignore | 0 tools/{server/webui => ui}/.prettierrc | 0 .../decorators/ModeWatcherDecorator.svelte | 0 .../TooltipProviderDecorator.svelte | 0 tools/{server/webui => ui}/.storybook/main.ts | 0 .../webui => ui}/.storybook/preview.ts | 0 .../webui => ui}/.storybook/vitest.setup.ts | 0 tools/ui/CMakeLists.txt | 157 ++++++ tools/{server/webui => ui}/README.md | 27 +- tools/{server/webui => ui}/components.json | 0 .../high-level-architecture-simplified.md | 0 .../architecture/high-level-architecture.md | 0 .../webui => ui}/docs/flows/chat-flow.md | 0 .../docs/flows/conversations-flow.md | 0 .../flows/data-flow-simplified-model-mode.md | 0 .../flows/data-flow-simplified-router-mode.md | 0 .../webui => ui}/docs/flows/database-flow.md | 0 .../webui => ui}/docs/flows/mcp-flow.md | 0 .../webui => ui}/docs/flows/models-flow.md | 0 .../webui => ui}/docs/flows/server-flow.md | 0 .../webui => ui}/docs/flows/settings-flow.md | 4 +- tools/{server/webui => ui}/eslint.config.js | 4 +- tools/{server/webui => ui}/package-lock.json | 0 tools/{server/webui => ui}/package.json | 2 +- .../{server/webui => ui}/playwright.config.ts | 2 +- tools/{server/webui => ui}/scripts/dev.sh | 10 +- .../webui => ui}/scripts/install-git-hooks.sh | 28 +- .../scripts/vite-plugin-llama-cpp-build.ts | 51 +- tools/{server/webui => ui}/src/app.css | 0 tools/{server/webui => ui}/src/app.d.ts | 0 tools/{server/webui => ui}/src/app.html | 0 .../src/lib/actions/fade-in-view.svelte.ts | 0 .../src/lib/components/app/SKILL.md | 0 .../components/app/actions/ActionIcon.svelte | 0 .../actions/ActionIconCopyToClipboard.svelte | 0 .../src/lib/components/app/actions/index.ts | 0 .../components/app/badges/BadgeInfo.svelte | 0 .../app/badges/BadgesModality.svelte | 0 .../src/lib/components/app/badges/index.ts | 0 .../ChatAttachmentsList.svelte | 0 .../ChatAttachmentsListItem.svelte | 0 .../ChatAttachmentsListItemMcpPrompt.svelte | 0 .../ChatAttachmentsListItemMcpResource.svelte | 0 ...hatAttachmentsListItemThumbnailFile.svelte | 0 ...atAttachmentsListItemThumbnailImage.svelte | 0 .../ChatAttachmentsPreview.svelte | 0 .../ChatAttachmentsPreviewCurrentItem.svelte | 0 ...tAttachmentsPreviewCurrentItemAudio.svelte | 0 ...tAttachmentsPreviewCurrentItemImage.svelte | 0 ...hatAttachmentsPreviewCurrentItemPdf.svelte | 0 ...atAttachmentsPreviewCurrentItemText.svelte | 0 ...hmentsPreviewCurrentItemUnavailable.svelte | 0 .../ChatAttachmentsPreviewFileInfo.svelte | 0 .../ChatAttachmentsPreviewNavButtons.svelte | 0 ...hatAttachmentsPreviewThumbnailStrip.svelte | 0 .../app/chat/ChatForm/ChatForm.svelte | 0 .../ChatFormActionAddButton.svelte | 0 .../ChatFormActionAddDropdown.svelte | 7 +- .../ChatFormActionAddMcpServersSubmenu.svelte | 0 .../ChatFormActionAddSheet.svelte | 7 +- .../ChatFormActionAddToolsSubmenu.svelte | 3 +- .../ChatFormActionsAdd.svelte | 0 .../ChatFormActionModels.svelte | 0 .../ChatFormActionRecord.svelte | 0 .../ChatFormActionSubmit.svelte | 0 .../ChatFormActions/ChatFormActions.svelte | 0 .../ChatFormFileInputInvisible.svelte | 0 .../ChatForm/ChatFormMcpResourcesList.svelte | 0 .../ChatFormPickerItemHeader.svelte | 0 .../ChatFormPicker/ChatFormPickerList.svelte | 0 .../ChatFormPickerListItem.svelte | 0 .../ChatFormPickerListItemSkeleton.svelte | 0 .../ChatFormPickerPopover.svelte | 0 .../ChatFormPickerMcpPrompts.svelte | 0 .../ChatFormPromptPickerArgumentForm.svelte | 0 .../ChatFormPromptPickerArgumentInput.svelte | 0 .../ChatFormPickerMcpResources.svelte | 0 .../ChatFormPickers/ChatFormPickers.svelte | 0 .../app/chat/ChatForm/ChatFormTextarea.svelte | 0 .../ChatMessage/ChatMessage.svelte | 0 .../ChatMessageAssistant.svelte | 0 .../ChatMessageMcpPrompt.svelte | 0 .../ChatMessageMcpPromptContent.svelte | 0 .../ChatMessageSystem.svelte | 0 .../ChatMessageUser/ChatMessageUser.svelte | 0 .../ChatMessageUserBubble.svelte | 0 .../ChatMessageUserPending.svelte | 0 .../ChatMessageActionCard.svelte | 0 ...hatMessageActionCardContinueRequest.svelte | 0 ...tMessageActionCardPermissionRequest.svelte | 0 .../ChatMessageActionIcons.svelte | 0 ...MessageActionIconsBranchingControls.svelte | 0 .../ChatMessageAgenticContent.svelte | 4 +- .../ChatMessages/ChatMessageEditForm.svelte | 0 .../ChatMessageStatistics.svelte | 0 .../ChatMessageStatisticsBadge.svelte | 0 .../app/chat/ChatMessages/ChatMessages.svelte | 0 .../app/chat/ChatScreen/ChatScreen.svelte | 0 .../ChatScreen/ChatScreenDragOverlay.svelte | 0 .../app/chat/ChatScreen/ChatScreenForm.svelte | 0 .../ChatScreenProcessingInfo.svelte | 0 .../src/lib/components/app/chat/index.ts | 0 .../content/CollapsibleContentBlock.svelte | 0 .../MarkdownContent/MarkdownContent.svelte | 0 .../plugins/rehype/enhance-code-blocks.ts | 0 .../plugins/rehype/enhance-links.ts | 0 .../plugins/rehype/rehype-rtl-support.ts | 0 .../rehype/resolve-attachment-images.ts | 0 .../plugins/rehype/table-html-restorer.ts | 0 .../plugins/remark/literal-html.ts | 0 .../app/content/SyntaxHighlightedCode.svelte | 0 .../src/lib/components/app/content/index.ts | 0 .../DialogChatAttachmentsPreview.svelte | 0 .../app/dialogs/DialogChatError.svelte | 0 .../app/dialogs/DialogCodePreview.svelte | 0 .../app/dialogs/DialogConfirmation.svelte | 0 .../DialogConversationSelection.svelte | 0 .../DialogConversationTitleUpdate.svelte | 0 .../app/dialogs/DialogEmptyFileAlert.svelte | 0 .../app/dialogs/DialogExportSettings.svelte | 0 .../app/dialogs/DialogFileUploadError.svelte | 0 .../dialogs/DialogMcpResourcePreview.svelte | 0 .../dialogs/DialogMcpResourcesBrowser.svelte | 0 .../app/dialogs/DialogMcpServerAddNew.svelte | 0 .../app/dialogs/DialogModelInformation.svelte | 0 .../dialogs/DialogModelNotAvailable.svelte | 0 .../src/lib/components/app/dialogs/index.ts | 0 .../app/forms/InputWithSuggestions.svelte | 0 .../components/app/forms/KeyValuePairs.svelte | 0 .../components/app/forms/SearchInput.svelte | 0 .../src/lib/components/app/forms/index.ts | 0 .../src/lib/components/app/index.ts | 0 .../app/mcp/McpActiveServersAvatars.svelte | 0 .../app/mcp/McpCapabilitiesBadges.svelte | 0 .../app/mcp/McpConnectionLogs.svelte | 0 .../src/lib/components/app/mcp/McpLogo.svelte | 0 .../app/mcp/McpResourcePreview.svelte | 0 .../app/mcp/McpResourceTemplateForm.svelte | 0 .../McpResourcesBrowser.svelte | 0 .../McpResourcesBrowserEmptyState.svelte | 0 .../McpResourcesBrowserHeader.svelte | 0 .../McpResourcesBrowserServerItem.svelte | 0 .../mcp-resources-browser.ts | 0 .../mcp/McpServerCard/McpServerCard.svelte | 0 .../McpServerCard/McpServerCardActions.svelte | 0 .../McpServerCardDeleteDialog.svelte | 0 .../McpServerCardEditForm.svelte | 0 .../McpServerCard/McpServerCardHeader.svelte | 0 .../McpServerCardToolsList.svelte | 0 .../app/mcp/McpServerCardSkeleton.svelte | 0 .../components/app/mcp/McpServerForm.svelte | 3 +- .../app/mcp/McpServerIdentity.svelte | 0 .../components/app/mcp/McpServerInfo.svelte | 0 .../src/lib/components/app/mcp/index.ts | 0 .../app/misc/CodeBlockActions.svelte | 0 .../app/misc/ConversationSelection.svelte | 0 .../app/misc/HorizontalScrollCarousel.svelte | 0 .../app/misc/KeyboardShortcutInfo.svelte | 0 .../components/app/misc/TruncatedText.svelte | 0 .../src/lib/components/app/misc/index.ts | 0 .../components/app/models/ModelBadge.svelte | 0 .../lib/components/app/models/ModelId.svelte | 0 .../app/models/ModelsSelectorDropdown.svelte | 0 .../app/models/ModelsSelectorList.svelte | 0 .../app/models/ModelsSelectorOption.svelte | 0 .../app/models/ModelsSelectorSheet.svelte | 0 .../src/lib/components/app/models/index.ts | 0 .../src/lib/components/app/models/utils.ts | 0 .../app/navigation/DesktopIconStrip.svelte | 0 .../app/navigation/DropdownMenuActions.svelte | 0 .../navigation/DropdownMenuSearchable.svelte | 0 .../SidebarNavigation.svelte | 4 +- .../SidebarNavigationActions.svelte | 0 .../SidebarNavigationConversationItem.svelte | 0 .../SidebarNavigationSearch.svelte | 0 .../lib/components/app/navigation/index.ts | 0 .../app/server/ServerErrorSplash.svelte | 0 .../app/server/ServerLoadingSplash.svelte | 0 .../components/app/server/ServerStatus.svelte | 0 .../src/lib/components/app/server/index.ts | 0 .../settings/SettingsChat/SettingsChat.svelte | 0 .../SettingsChat/SettingsChatFields.svelte | 0 .../SettingsChatImportExportSection.svelte | 0 .../SettingsChatImportExportTab.svelte | 0 ...ettingsChatParameterSourceIndicator.svelte | 0 .../SettingsChat/SettingsChatToolsTab.svelte | 0 .../SettingsChatDesktopSidebar.svelte | 0 .../settings/SettingsChatMobileHeader.svelte | 0 .../app/settings/SettingsFooter.svelte | 0 .../app/settings/SettingsGroup.svelte | 0 .../app/settings/SettingsMcpServers.svelte | 0 .../src/lib/components/app/settings/index.ts | 0 .../alert-dialog/alert-dialog-action.svelte | 0 .../alert-dialog/alert-dialog-cancel.svelte | 0 .../alert-dialog/alert-dialog-content.svelte | 0 .../alert-dialog-description.svelte | 0 .../alert-dialog/alert-dialog-footer.svelte | 0 .../alert-dialog/alert-dialog-header.svelte | 0 .../alert-dialog/alert-dialog-overlay.svelte | 0 .../ui/alert-dialog/alert-dialog-title.svelte | 0 .../alert-dialog/alert-dialog-trigger.svelte | 0 .../lib/components/ui/alert-dialog/index.ts | 0 .../ui/alert/alert-description.svelte | 0 .../components/ui/alert/alert-title.svelte | 0 .../src/lib/components/ui/alert/alert.svelte | 0 .../src/lib/components/ui/alert/index.ts | 0 .../src/lib/components/ui/badge/badge.svelte | 0 .../src/lib/components/ui/badge/index.ts | 0 .../ui/button-group/button-group-root.svelte | 0 .../button-group-separator.svelte | 0 .../lib/components/ui/button-group/index.ts | 0 .../lib/components/ui/button/button.svelte | 0 .../src/lib/components/ui/button/index.ts | 0 .../lib/components/ui/card/card-action.svelte | 0 .../components/ui/card/card-content.svelte | 0 .../ui/card/card-description.svelte | 0 .../lib/components/ui/card/card-footer.svelte | 0 .../lib/components/ui/card/card-header.svelte | 0 .../lib/components/ui/card/card-title.svelte | 0 .../src/lib/components/ui/card/card.svelte | 0 .../src/lib/components/ui/card/index.ts | 0 .../components/ui/checkbox/checkbox.svelte | 0 .../src/lib/components/ui/checkbox/index.ts | 0 .../ui/collapsible/collapsible-content.svelte | 0 .../ui/collapsible/collapsible-trigger.svelte | 0 .../ui/collapsible/collapsible.svelte | 0 .../lib/components/ui/collapsible/index.ts | 0 .../components/ui/dialog/dialog-close.svelte | 0 .../ui/dialog/dialog-content.svelte | 0 .../ui/dialog/dialog-description.svelte | 0 .../components/ui/dialog/dialog-footer.svelte | 0 .../components/ui/dialog/dialog-header.svelte | 0 .../ui/dialog/dialog-overlay.svelte | 0 .../components/ui/dialog/dialog-title.svelte | 0 .../ui/dialog/dialog-trigger.svelte | 0 .../src/lib/components/ui/dialog/index.ts | 0 .../dropdown-menu-checkbox-item.svelte | 0 .../dropdown-menu-content.svelte | 0 .../dropdown-menu-group-heading.svelte | 0 .../dropdown-menu/dropdown-menu-group.svelte | 0 .../dropdown-menu/dropdown-menu-item.svelte | 0 .../dropdown-menu/dropdown-menu-label.svelte | 0 .../dropdown-menu-radio-group.svelte | 0 .../dropdown-menu-radio-item.svelte | 0 .../dropdown-menu-separator.svelte | 0 .../dropdown-menu-shortcut.svelte | 0 .../dropdown-menu-sub-content.svelte | 0 .../dropdown-menu-sub-trigger.svelte | 0 .../dropdown-menu-trigger.svelte | 0 .../lib/components/ui/dropdown-menu/index.ts | 0 .../src/lib/components/ui/input/index.ts | 0 .../src/lib/components/ui/input/input.svelte | 0 .../src/lib/components/ui/label/index.ts | 0 .../src/lib/components/ui/label/label.svelte | 0 .../src/lib/components/ui/popover/index.ts | 0 .../ui/popover/popover-close.svelte | 0 .../ui/popover/popover-content.svelte | 0 .../ui/popover/popover-portal.svelte | 0 .../ui/popover/popover-trigger.svelte | 0 .../lib/components/ui/popover/popover.svelte | 0 .../lib/components/ui/scroll-area/index.ts | 0 .../scroll-area/scroll-area-scrollbar.svelte | 0 .../ui/scroll-area/scroll-area.svelte | 0 .../src/lib/components/ui/select/index.ts | 0 .../ui/select/select-content.svelte | 0 .../ui/select/select-group-heading.svelte | 0 .../components/ui/select/select-group.svelte | 0 .../components/ui/select/select-item.svelte | 0 .../components/ui/select/select-label.svelte | 0 .../select/select-scroll-down-button.svelte | 0 .../ui/select/select-scroll-up-button.svelte | 0 .../ui/select/select-separator.svelte | 0 .../ui/select/select-trigger.svelte | 0 .../src/lib/components/ui/separator/index.ts | 0 .../components/ui/separator/separator.svelte | 0 .../src/lib/components/ui/sheet/index.ts | 0 .../components/ui/sheet/sheet-close.svelte | 0 .../components/ui/sheet/sheet-content.svelte | 0 .../ui/sheet/sheet-description.svelte | 0 .../components/ui/sheet/sheet-footer.svelte | 0 .../components/ui/sheet/sheet-header.svelte | 0 .../components/ui/sheet/sheet-overlay.svelte | 0 .../components/ui/sheet/sheet-title.svelte | 0 .../components/ui/sheet/sheet-trigger.svelte | 0 .../lib/components/ui/sidebar/constants.ts | 0 .../components/ui/sidebar/context.svelte.ts | 0 .../src/lib/components/ui/sidebar/index.ts | 0 .../ui/sidebar/sidebar-content.svelte | 0 .../ui/sidebar/sidebar-footer.svelte | 0 .../ui/sidebar/sidebar-group-action.svelte | 0 .../ui/sidebar/sidebar-group-content.svelte | 0 .../ui/sidebar/sidebar-group-label.svelte | 0 .../ui/sidebar/sidebar-group.svelte | 0 .../ui/sidebar/sidebar-header.svelte | 0 .../ui/sidebar/sidebar-input.svelte | 0 .../ui/sidebar/sidebar-inset.svelte | 0 .../ui/sidebar/sidebar-menu-action.svelte | 0 .../ui/sidebar/sidebar-menu-badge.svelte | 0 .../ui/sidebar/sidebar-menu-button.svelte | 0 .../ui/sidebar/sidebar-menu-item.svelte | 0 .../ui/sidebar/sidebar-menu-skeleton.svelte | 0 .../ui/sidebar/sidebar-menu-sub-button.svelte | 0 .../ui/sidebar/sidebar-menu-sub-item.svelte | 0 .../ui/sidebar/sidebar-menu-sub.svelte | 0 .../components/ui/sidebar/sidebar-menu.svelte | 0 .../ui/sidebar/sidebar-provider.svelte | 0 .../components/ui/sidebar/sidebar-rail.svelte | 0 .../ui/sidebar/sidebar-separator.svelte | 0 .../ui/sidebar/sidebar-trigger.svelte | 0 .../lib/components/ui/sidebar/sidebar.svelte | 0 .../src/lib/components/ui/skeleton/index.ts | 0 .../components/ui/skeleton/skeleton.svelte | 0 .../src/lib/components/ui/switch/index.ts | 0 .../lib/components/ui/switch/switch.svelte | 0 .../src/lib/components/ui/table/index.ts | 0 .../lib/components/ui/table/table-body.svelte | 0 .../components/ui/table/table-caption.svelte | 0 .../lib/components/ui/table/table-cell.svelte | 0 .../components/ui/table/table-footer.svelte | 0 .../lib/components/ui/table/table-head.svelte | 0 .../components/ui/table/table-header.svelte | 0 .../lib/components/ui/table/table-row.svelte | 0 .../src/lib/components/ui/table/table.svelte | 0 .../src/lib/components/ui/textarea/index.ts | 0 .../components/ui/textarea/textarea.svelte | 0 .../src/lib/components/ui/tooltip/index.ts | 0 .../ui/tooltip/tooltip-content.svelte | 0 .../ui/tooltip/tooltip-trigger.svelte | 0 .../src/lib/components/ui/utils.ts | 0 .../webui => ui}/src/lib/constants/agentic.ts | 0 .../src/lib/constants/api-endpoints.ts | 0 .../src/lib/constants/attachment-labels.ts | 0 .../src/lib/constants/attachment-menu.ts | 0 .../src/lib/constants/auto-scroll.ts | 0 .../src/lib/constants/binary-detection.ts | 0 .../webui => ui}/src/lib/constants/cache.ts | 0 .../src/lib/constants/chat-form.ts | 0 tools/ui/src/lib/constants/cli-flags.ts | 6 + .../src/lib/constants/code-blocks.ts | 0 .../webui => ui}/src/lib/constants/code.ts | 0 .../src/lib/constants/context-keys.ts | 0 .../src/lib/constants/css-classes.ts | 0 tools/ui/src/lib/constants/database.ts | 29 + .../lib/constants/floating-ui-constraints.ts | 0 .../src/lib/constants/formatters.ts | 0 .../webui => ui}/src/lib/constants/icons.ts | 0 .../webui => ui}/src/lib/constants/index.ts | 4 +- .../src/lib/constants/key-value-pairs.ts | 0 .../src/lib/constants/latex-protection.ts | 0 .../src/lib/constants/literal-html.ts | 0 .../src/lib/constants/markdown.ts | 0 .../src/lib/constants/max-bundle-size.ts | 0 .../src/lib/constants/mcp-form.ts | 0 .../src/lib/constants/mcp-resource.ts | 0 .../webui => ui}/src/lib/constants/mcp.ts | 5 +- .../src/lib/constants/message-export.ts | 0 .../src/lib/constants/model-id.ts | 0 .../src/lib/constants/precision.ts | 0 .../src/lib/constants/processing-info.ts | 0 .../webui => ui}/src/lib/constants/routes.ts | 0 .../src/lib/constants/settings-keys.ts | 0 .../src/lib/constants/settings-registry.ts | 68 ++- tools/ui/src/lib/constants/storage.ts | 46 ++ .../src/lib/constants/supported-file-types.ts | 2 +- .../src/lib/constants/table-html-restorer.ts | 0 .../src/lib/constants/title-generation.ts | 0 .../webui => ui}/src/lib/constants/tools.ts | 0 .../src/lib/constants/tooltip-config.ts | 0 .../webui => ui}/src/lib/constants/ui.ts | 0 .../src/lib/constants/uri-template.ts | 0 .../webui => ui}/src/lib/constants/url.ts | 0 .../src/lib/constants/viewport.ts | 0 .../src/lib/contexts/chat-actions.context.ts | 0 .../contexts/chat-settings-config.context.ts | 0 .../webui => ui}/src/lib/contexts/index.ts | 0 .../src/lib/contexts/message-edit.context.ts | 0 .../lib/contexts/processing-info.context.ts | 0 .../webui => ui}/src/lib/enums/agentic.ts | 0 .../webui => ui}/src/lib/enums/attachment.ts | 0 .../webui => ui}/src/lib/enums/chat.ts | 0 .../webui => ui}/src/lib/enums/files.ts | 2 +- .../webui => ui}/src/lib/enums/index.ts | 0 .../webui => ui}/src/lib/enums/keyboard.ts | 0 .../{server/webui => ui}/src/lib/enums/mcp.ts | 0 .../webui => ui}/src/lib/enums/model.ts | 0 .../webui => ui}/src/lib/enums/server.ts | 0 .../webui => ui}/src/lib/enums/settings.ts | 0 .../webui => ui}/src/lib/enums/tools.ts | 0 .../{server/webui => ui}/src/lib/enums/ui.ts | 0 .../src/lib/hooks/is-mobile.svelte.ts | 0 .../lib/hooks/use-attachment-menu.svelte.ts | 0 .../src/lib/hooks/use-auto-scroll.svelte.ts | 0 .../lib/hooks/use-draft-messages.svelte.ts | 0 .../hooks/use-keyboard-shortcuts.svelte.ts | 0 .../hooks/use-message-edit-context.svelte.ts | 0 .../lib/hooks/use-models-selector.svelte.ts | 3 +- .../lib/hooks/use-processing-state.svelte.ts | 0 .../lib/hooks/use-scroll-carousel.svelte.ts | 0 .../hooks/use-settings-navigation.svelte.ts | 0 .../src/lib/hooks/use-tools-panel.svelte.ts | 5 +- .../src/lib/services/chat.service.ts | 2 +- .../src/lib/services/database.service.ts | 350 ++++++------ .../webui => ui}/src/lib/services/index.ts | 30 +- .../src/lib/services/mcp.service.ts | 0 .../ui/src/lib/services/migration.service.ts | 524 ++++++++++++++++++ .../src/lib/services/models.service.ts | 0 .../services/parameter-sync.service.spec.ts | 2 +- .../lib/services/parameter-sync.service.ts | 13 +- .../src/lib/services/props.service.ts | 0 .../src/lib/services/router.service.ts | 0 .../src/lib/services/tools.service.ts | 0 .../src/lib/stores/agentic.svelte.ts | 14 +- .../src/lib/stores/chat.svelte.ts | 24 +- .../src/lib/stores/conversations.svelte.ts | 13 +- .../src/lib/stores/draft-messages.svelte.ts | 0 .../src/lib/stores/mcp-resources.svelte.ts | 0 .../webui => ui}/src/lib/stores/mcp.svelte.ts | 9 +- .../src/lib/stores/models.svelte.ts | 16 +- .../src/lib/stores/permissions.svelte.ts | 1 + .../src/lib/stores/persisted.svelte.ts | 0 .../src/lib/stores/server.svelte.ts | 4 +- .../lib/stores/settings-referrer.svelte.ts | 0 .../src/lib/stores/settings.svelte.ts | 37 +- .../src/lib/stores/tools.svelte.ts | 7 +- .../webui => ui}/src/lib/types/agentic.d.ts | 0 .../webui => ui}/src/lib/types/api.d.ts | 2 + .../webui => ui}/src/lib/types/chat.d.ts | 0 .../webui => ui}/src/lib/types/common.d.ts | 0 .../webui => ui}/src/lib/types/database.d.ts | 2 +- .../webui => ui}/src/lib/types/index.ts | 0 .../webui => ui}/src/lib/types/mcp.d.ts | 0 .../webui => ui}/src/lib/types/models.d.ts | 0 .../webui => ui}/src/lib/types/settings.d.ts | 0 .../webui => ui}/src/lib/types/tools.d.ts | 0 .../webui => ui}/src/lib/utils/abort.ts | 0 .../webui => ui}/src/lib/utils/agentic.ts | 0 .../webui => ui}/src/lib/utils/api-fetch.ts | 0 .../webui => ui}/src/lib/utils/api-headers.ts | 0 .../src/lib/utils/api-key-validation.ts | 0 .../src/lib/utils/attachment-display.ts | 0 .../src/lib/utils/attachment-type.ts | 0 .../src/lib/utils/audio-recording.ts | 0 .../src/lib/utils/autoresize-textarea.ts | 0 .../webui => ui}/src/lib/utils/branching.ts | 0 .../src/lib/utils/browser-only.ts | 0 .../webui => ui}/src/lib/utils/cache-ttl.ts | 0 .../webui => ui}/src/lib/utils/clipboard.ts | 0 .../webui => ui}/src/lib/utils/code.ts | 0 .../src/lib/utils/config-helpers.ts | 0 .../src/lib/utils/conversation-utils.ts | 0 .../src/lib/utils/convert-files-to-extra.ts | 0 .../webui => ui}/src/lib/utils/cors-proxy.ts | 0 .../{server/webui => ui}/src/lib/utils/css.ts | 0 .../webui => ui}/src/lib/utils/data-url.ts | 0 .../webui => ui}/src/lib/utils/debounce.ts | 0 .../src/lib/utils/file-preview.ts | 0 .../webui => ui}/src/lib/utils/file-type.ts | 0 .../webui => ui}/src/lib/utils/formatters.ts | 0 .../webui => ui}/src/lib/utils/headers.ts | 0 .../src/lib/utils/image-error-fallback.ts | 0 .../webui => ui}/src/lib/utils/index.ts | 3 - .../src/lib/utils/is-ime-composing.ts | 0 .../src/lib/utils/latex-protection.ts | 0 .../src/lib/utils/legacy-migration.ts | 24 +- .../{server/webui => ui}/src/lib/utils/mcp.ts | 0 .../src/lib/utils/modality-file-validation.ts | 0 .../webui => ui}/src/lib/utils/model-names.ts | 0 .../src/lib/utils/pdf-processing.ts | 0 .../src/lib/utils/portal-to-body.ts | 0 .../webui => ui}/src/lib/utils/precision.ts | 0 .../src/lib/utils/process-uploaded-files.ts | 0 .../webui => ui}/src/lib/utils/redact.ts | 0 .../src/lib/utils/request-helpers.ts | 0 .../webui => ui}/src/lib/utils/sanitize.ts | 0 .../webui => ui}/src/lib/utils/svg-to-png.ts | 0 .../lib/utils/syntax-highlight-language.ts | 0 .../webui => ui}/src/lib/utils/text-files.ts | 0 .../webui => ui}/src/lib/utils/text.ts | 0 .../src/lib/utils/uri-template.ts | 0 .../{server/webui => ui}/src/lib/utils/url.ts | 0 .../webui => ui}/src/lib/utils/uuid.ts | 0 .../webui => ui}/src/lib/utils/viewport.ts | 0 .../webui => ui}/src/lib/utils/webp-to-png.ts | 0 .../src/routes/(chat)/+layout.svelte | 0 .../src/routes/(chat)/+page.svelte | 0 .../webui => ui}/src/routes/(chat)/+page.ts | 0 .../src/routes/(chat)/chat/[id]/+page.svelte | 0 .../src/routes/(chat)/chat/[id]/+page.ts | 0 .../webui => ui}/src/routes/+error.svelte | 3 +- .../webui => ui}/src/routes/+layout.svelte | 0 .../src/routes/mcp-servers/+page.svelte | 0 .../src/routes/settings/+layout.svelte | 0 .../routes/settings/[[section]]/+page.svelte | 0 .../webui => ui}/src/styles/katex-custom.scss | 0 tools/{server/webui => ui}/static/favicon.svg | 0 .../{server/webui => ui}/static/loading.html | 0 tools/{server/webui => ui}/svelte.config.js | 4 +- .../client/components/TestWrapper.svelte | 0 .../tests/client/page.svelte.test.ts | 0 .../webui => ui}/tests/e2e/demo.test.ts | 0 .../tests/stories/ChatMessage.stories.svelte | 0 .../stories/ChatScreenForm.stories.svelte | 0 .../tests/stories/Introduction.mdx | 2 +- .../stories/MarkdownContent.stories.svelte | 0 .../stories/SidebarNavigation.stories.svelte | 0 .../tests/stories/fixtures/ai-tutorial.ts | 0 .../tests/stories/fixtures/api-docs.ts | 0 .../tests/stories/fixtures/assets/1.jpg | Bin .../assets/beautiful-flowers-lotus.webp | Bin .../tests/stories/fixtures/assets/example.pdf | Bin .../tests/stories/fixtures/assets/hf-logo.svg | 0 .../tests/stories/fixtures/blog-post.ts | 0 .../tests/stories/fixtures/data-analysis.ts | 0 .../tests/stories/fixtures/empty.ts | 0 .../tests/stories/fixtures/math-formulas.ts | 0 .../tests/stories/fixtures/readme.ts | 0 .../tests/stories/fixtures/storybook-mocks.ts | 0 .../tests/unit/agentic-sections.test.ts | 24 +- .../tests/unit/agentic-strip.test.ts | 0 .../webui => ui}/tests/unit/clipboard.test.ts | 0 .../tests/unit/latex-protection.test.ts | 0 .../tests/unit/mcp-service.test.ts | 0 .../tests/unit/model-id-parser.test.ts | 0 .../tests/unit/model-names.test.ts | 0 .../tests/unit/reasoning-context.test.ts | 0 .../webui => ui}/tests/unit/redact.test.ts | 0 .../tests/unit/request-helpers.test.ts | 0 .../tests/unit/sanitize-headers.test.ts | 0 .../tests/unit/uri-template.test.ts | 0 tools/{server/webui => ui}/tsconfig.json | 0 tools/ui/ui.cpp | 7 + tools/ui/ui.h | 17 + tools/{server/webui => ui}/vite.config.ts | 0 .../webui => ui}/vitest-setup-client.ts | 0 565 files changed, 1614 insertions(+), 698 deletions(-) rename .github/workflows/{webui-build.yml => ui-build.yml} (67%) rename .github/workflows/{server-webui.yml => ui-ci.yml} (75%) rename .github/workflows/{webui-publish.yml => ui-publish.yml} (84%) rename scripts/{webui-download.cmake => ui-download.cmake} (76%) delete mode 100755 tools/server/webui/scripts/post-build.sh delete mode 100644 tools/server/webui/src/lib/constants/localstorage-keys.ts rename tools/{server/webui => ui}/.gitignore (100%) rename tools/{server/webui => ui}/.npmrc (100%) rename tools/{server/webui => ui}/.prettierignore (100%) rename tools/{server/webui => ui}/.prettierrc (100%) rename tools/{server/webui => ui}/.storybook/decorators/ModeWatcherDecorator.svelte (100%) rename tools/{server/webui => ui}/.storybook/decorators/TooltipProviderDecorator.svelte (100%) rename tools/{server/webui => ui}/.storybook/main.ts (100%) rename tools/{server/webui => ui}/.storybook/preview.ts (100%) rename tools/{server/webui => ui}/.storybook/vitest.setup.ts (100%) create mode 100644 tools/ui/CMakeLists.txt rename tools/{server/webui => ui}/README.md (96%) rename tools/{server/webui => ui}/components.json (100%) rename tools/{server/webui => ui}/docs/architecture/high-level-architecture-simplified.md (100%) rename tools/{server/webui => ui}/docs/architecture/high-level-architecture.md (100%) rename tools/{server/webui => ui}/docs/flows/chat-flow.md (100%) rename tools/{server/webui => ui}/docs/flows/conversations-flow.md (100%) rename tools/{server/webui => ui}/docs/flows/data-flow-simplified-model-mode.md (100%) rename tools/{server/webui => ui}/docs/flows/data-flow-simplified-router-mode.md (100%) rename tools/{server/webui => ui}/docs/flows/database-flow.md (100%) rename tools/{server/webui => ui}/docs/flows/mcp-flow.md (100%) rename tools/{server/webui => ui}/docs/flows/models-flow.md (100%) rename tools/{server/webui => ui}/docs/flows/server-flow.md (100%) rename tools/{server/webui => ui}/docs/flows/settings-flow.md (98%) rename tools/{server/webui => ui}/eslint.config.js (93%) rename tools/{server/webui => ui}/package-lock.json (100%) rename tools/{server/webui => ui}/package.json (98%) rename tools/{server/webui => ui}/playwright.config.ts (70%) rename tools/{server/webui => ui}/scripts/dev.sh (89%) rename tools/{server/webui => ui}/scripts/install-git-hooks.sh (62%) rename tools/{server/webui => ui}/scripts/vite-plugin-llama-cpp-build.ts (64%) rename tools/{server/webui => ui}/src/app.css (100%) rename tools/{server/webui => ui}/src/app.d.ts (100%) rename tools/{server/webui => ui}/src/app.html (100%) rename tools/{server/webui => ui}/src/lib/actions/fade-in-view.svelte.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/SKILL.md (100%) rename tools/{server/webui => ui}/src/lib/components/app/actions/ActionIcon.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/actions/ActionIconCopyToClipboard.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/actions/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/badges/BadgeInfo.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/badges/BadgesModality.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/badges/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsList.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItem.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpPrompt.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpResource.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailFile.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailImage.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItem.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemAudio.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemImage.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemPdf.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemText.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemUnavailable.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewFileInfo.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewNavButtons.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewThumbnailStrip.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatForm.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddButton.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddDropdown.svelte (97%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddMcpServersSubmenu.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte (98%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte (97%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionRecord.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionSubmit.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormFileInputInvisible.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormMcpResourcesList.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerItemHeader.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerList.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItem.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItemSkeleton.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerPopover.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPickerMcpPrompts.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentForm.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentInput.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpResources.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickers.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatForm/ChatFormTextarea.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessage.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageAssistant/ChatMessageAssistant.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPrompt.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPromptContent.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageSystem/ChatMessageSystem.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUser.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserBubble.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserPending.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCard.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardContinueRequest.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardPermissionRequest.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIcons.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIconsBranchingControls.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte (99%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessageEditForm.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics/ChatMessageStatistics.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics/ChatMessageStatisticsBadge.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatMessages/ChatMessages.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatScreen/ChatScreenDragOverlay.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatScreen/ChatScreenForm.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/chat/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/content/CollapsibleContentBlock.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/content/MarkdownContent/plugins/rehype/enhance-code-blocks.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/content/MarkdownContent/plugins/rehype/enhance-links.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/content/MarkdownContent/plugins/rehype/rehype-rtl-support.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/content/MarkdownContent/plugins/rehype/resolve-attachment-images.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/content/MarkdownContent/plugins/rehype/table-html-restorer.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/content/MarkdownContent/plugins/remark/literal-html.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/content/SyntaxHighlightedCode.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/content/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/dialogs/DialogChatAttachmentsPreview.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/dialogs/DialogChatError.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/dialogs/DialogCodePreview.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/dialogs/DialogConfirmation.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/dialogs/DialogConversationSelection.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/dialogs/DialogConversationTitleUpdate.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/dialogs/DialogEmptyFileAlert.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/dialogs/DialogExportSettings.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/dialogs/DialogFileUploadError.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/dialogs/DialogMcpResourcePreview.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/dialogs/DialogMcpResourcesBrowser.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/dialogs/DialogMcpServerAddNew.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/dialogs/DialogModelInformation.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/dialogs/DialogModelNotAvailable.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/dialogs/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/forms/InputWithSuggestions.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/forms/KeyValuePairs.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/forms/SearchInput.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/forms/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpActiveServersAvatars.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpCapabilitiesBadges.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpConnectionLogs.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpLogo.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpResourcePreview.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpResourceTemplateForm.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpResourcesBrowser/McpResourcesBrowser.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpResourcesBrowser/McpResourcesBrowserEmptyState.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpResourcesBrowser/McpResourcesBrowserHeader.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpResourcesBrowser/McpResourcesBrowserServerItem.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpResourcesBrowser/mcp-resources-browser.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpServerCard/McpServerCard.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpServerCard/McpServerCardActions.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpServerCard/McpServerCardDeleteDialog.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpServerCard/McpServerCardEditForm.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpServerCard/McpServerCardHeader.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpServerCard/McpServerCardToolsList.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpServerCardSkeleton.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpServerForm.svelte (96%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpServerIdentity.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/McpServerInfo.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/mcp/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/misc/CodeBlockActions.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/misc/ConversationSelection.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/misc/HorizontalScrollCarousel.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/misc/KeyboardShortcutInfo.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/misc/TruncatedText.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/misc/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/models/ModelBadge.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/models/ModelId.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/models/ModelsSelectorDropdown.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/models/ModelsSelectorList.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/models/ModelsSelectorOption.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/models/ModelsSelectorSheet.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/models/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/models/utils.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/navigation/DesktopIconStrip.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/navigation/DropdownMenuActions.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/navigation/DropdownMenuSearchable.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte (99%) rename tools/{server/webui => ui}/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationActions.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationConversationItem.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationSearch.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/navigation/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/server/ServerErrorSplash.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/server/ServerLoadingSplash.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/server/ServerStatus.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/server/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/settings/SettingsChat/SettingsChatImportExportSection.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/settings/SettingsChat/SettingsChatImportExportTab.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/settings/SettingsChat/SettingsChatParameterSourceIndicator.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/settings/SettingsChat/SettingsChatToolsTab.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/settings/SettingsChatDesktopSidebar.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/settings/SettingsChatMobileHeader.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/settings/SettingsFooter.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/settings/SettingsGroup.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/settings/SettingsMcpServers.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/app/settings/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/alert-dialog/alert-dialog-action.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/alert-dialog/alert-dialog-cancel.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/alert-dialog/alert-dialog-content.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/alert-dialog/alert-dialog-description.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/alert-dialog/alert-dialog-footer.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/alert-dialog/alert-dialog-header.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/alert-dialog/alert-dialog-overlay.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/alert-dialog/alert-dialog-title.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/alert-dialog/alert-dialog-trigger.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/alert-dialog/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/alert/alert-description.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/alert/alert-title.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/alert/alert.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/alert/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/badge/badge.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/badge/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/button-group/button-group-root.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/button-group/button-group-separator.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/button-group/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/button/button.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/button/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/card/card-action.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/card/card-content.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/card/card-description.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/card/card-footer.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/card/card-header.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/card/card-title.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/card/card.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/card/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/checkbox/checkbox.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/checkbox/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/collapsible/collapsible-content.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/collapsible/collapsible-trigger.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/collapsible/collapsible.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/collapsible/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dialog/dialog-close.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dialog/dialog-content.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dialog/dialog-description.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dialog/dialog-footer.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dialog/dialog-header.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dialog/dialog-overlay.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dialog/dialog-title.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dialog/dialog-trigger.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dialog/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dropdown-menu/dropdown-menu-checkbox-item.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dropdown-menu/dropdown-menu-content.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dropdown-menu/dropdown-menu-group-heading.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dropdown-menu/dropdown-menu-group.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dropdown-menu/dropdown-menu-item.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dropdown-menu/dropdown-menu-label.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dropdown-menu/dropdown-menu-radio-group.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dropdown-menu/dropdown-menu-radio-item.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dropdown-menu/dropdown-menu-separator.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dropdown-menu/dropdown-menu-shortcut.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dropdown-menu/dropdown-menu-sub-content.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dropdown-menu/dropdown-menu-sub-trigger.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dropdown-menu/dropdown-menu-trigger.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/dropdown-menu/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/input/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/input/input.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/label/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/label/label.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/popover/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/popover/popover-close.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/popover/popover-content.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/popover/popover-portal.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/popover/popover-trigger.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/popover/popover.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/scroll-area/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/scroll-area/scroll-area-scrollbar.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/scroll-area/scroll-area.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/select/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/select/select-content.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/select/select-group-heading.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/select/select-group.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/select/select-item.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/select/select-label.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/select/select-scroll-down-button.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/select/select-scroll-up-button.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/select/select-separator.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/select/select-trigger.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/separator/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/separator/separator.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sheet/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sheet/sheet-close.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sheet/sheet-content.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sheet/sheet-description.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sheet/sheet-footer.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sheet/sheet-header.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sheet/sheet-overlay.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sheet/sheet-title.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sheet/sheet-trigger.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/constants.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/context.svelte.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-content.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-footer.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-group-action.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-group-content.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-group-label.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-group.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-header.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-input.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-inset.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-menu-action.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-menu-badge.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-menu-button.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-menu-item.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-menu-skeleton.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-menu-sub-button.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-menu-sub-item.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-menu-sub.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-menu.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-provider.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-rail.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-separator.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar-trigger.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/sidebar/sidebar.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/skeleton/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/skeleton/skeleton.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/switch/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/switch/switch.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/table/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/table/table-body.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/table/table-caption.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/table/table-cell.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/table/table-footer.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/table/table-head.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/table/table-header.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/table/table-row.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/table/table.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/textarea/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/textarea/textarea.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/tooltip/index.ts (100%) rename tools/{server/webui => ui}/src/lib/components/ui/tooltip/tooltip-content.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/tooltip/tooltip-trigger.svelte (100%) rename tools/{server/webui => ui}/src/lib/components/ui/utils.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/agentic.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/api-endpoints.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/attachment-labels.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/attachment-menu.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/auto-scroll.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/binary-detection.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/cache.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/chat-form.ts (100%) create mode 100644 tools/ui/src/lib/constants/cli-flags.ts rename tools/{server/webui => ui}/src/lib/constants/code-blocks.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/code.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/context-keys.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/css-classes.ts (100%) create mode 100644 tools/ui/src/lib/constants/database.ts rename tools/{server/webui => ui}/src/lib/constants/floating-ui-constraints.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/formatters.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/icons.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/index.ts (93%) rename tools/{server/webui => ui}/src/lib/constants/key-value-pairs.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/latex-protection.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/literal-html.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/markdown.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/max-bundle-size.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/mcp-form.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/mcp-resource.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/mcp.ts (93%) rename tools/{server/webui => ui}/src/lib/constants/message-export.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/model-id.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/precision.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/processing-info.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/routes.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/settings-keys.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/settings-registry.ts (94%) create mode 100644 tools/ui/src/lib/constants/storage.ts rename tools/{server/webui => ui}/src/lib/constants/supported-file-types.ts (98%) rename tools/{server/webui => ui}/src/lib/constants/table-html-restorer.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/title-generation.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/tools.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/tooltip-config.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/ui.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/uri-template.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/url.ts (100%) rename tools/{server/webui => ui}/src/lib/constants/viewport.ts (100%) rename tools/{server/webui => ui}/src/lib/contexts/chat-actions.context.ts (100%) rename tools/{server/webui => ui}/src/lib/contexts/chat-settings-config.context.ts (100%) rename tools/{server/webui => ui}/src/lib/contexts/index.ts (100%) rename tools/{server/webui => ui}/src/lib/contexts/message-edit.context.ts (100%) rename tools/{server/webui => ui}/src/lib/contexts/processing-info.context.ts (100%) rename tools/{server/webui => ui}/src/lib/enums/agentic.ts (100%) rename tools/{server/webui => ui}/src/lib/enums/attachment.ts (100%) rename tools/{server/webui => ui}/src/lib/enums/chat.ts (100%) rename tools/{server/webui => ui}/src/lib/enums/files.ts (98%) rename tools/{server/webui => ui}/src/lib/enums/index.ts (100%) rename tools/{server/webui => ui}/src/lib/enums/keyboard.ts (100%) rename tools/{server/webui => ui}/src/lib/enums/mcp.ts (100%) rename tools/{server/webui => ui}/src/lib/enums/model.ts (100%) rename tools/{server/webui => ui}/src/lib/enums/server.ts (100%) rename tools/{server/webui => ui}/src/lib/enums/settings.ts (100%) rename tools/{server/webui => ui}/src/lib/enums/tools.ts (100%) rename tools/{server/webui => ui}/src/lib/enums/ui.ts (100%) rename tools/{server/webui => ui}/src/lib/hooks/is-mobile.svelte.ts (100%) rename tools/{server/webui => ui}/src/lib/hooks/use-attachment-menu.svelte.ts (100%) rename tools/{server/webui => ui}/src/lib/hooks/use-auto-scroll.svelte.ts (100%) rename tools/{server/webui => ui}/src/lib/hooks/use-draft-messages.svelte.ts (100%) rename tools/{server/webui => ui}/src/lib/hooks/use-keyboard-shortcuts.svelte.ts (100%) rename tools/{server/webui => ui}/src/lib/hooks/use-message-edit-context.svelte.ts (100%) rename tools/{server/webui => ui}/src/lib/hooks/use-models-selector.svelte.ts (99%) rename tools/{server/webui => ui}/src/lib/hooks/use-processing-state.svelte.ts (100%) rename tools/{server/webui => ui}/src/lib/hooks/use-scroll-carousel.svelte.ts (100%) rename tools/{server/webui => ui}/src/lib/hooks/use-settings-navigation.svelte.ts (100%) rename tools/{server/webui => ui}/src/lib/hooks/use-tools-panel.svelte.ts (89%) rename tools/{server/webui => ui}/src/lib/services/chat.service.ts (99%) rename tools/{server/webui => ui}/src/lib/services/database.service.ts (51%) rename tools/{server/webui => ui}/src/lib/services/index.ts (92%) rename tools/{server/webui => ui}/src/lib/services/mcp.service.ts (100%) create mode 100644 tools/ui/src/lib/services/migration.service.ts rename tools/{server/webui => ui}/src/lib/services/models.service.ts (100%) rename tools/{server/webui => ui}/src/lib/services/parameter-sync.service.spec.ts (98%) rename tools/{server/webui => ui}/src/lib/services/parameter-sync.service.ts (95%) rename tools/{server/webui => ui}/src/lib/services/props.service.ts (100%) rename tools/{server/webui => ui}/src/lib/services/router.service.ts (100%) rename tools/{server/webui => ui}/src/lib/services/tools.service.ts (100%) rename tools/{server/webui => ui}/src/lib/stores/agentic.svelte.ts (99%) rename tools/{server/webui => ui}/src/lib/stores/chat.svelte.ts (99%) rename tools/{server/webui => ui}/src/lib/stores/conversations.svelte.ts (99%) rename tools/{server/webui => ui}/src/lib/stores/draft-messages.svelte.ts (100%) rename tools/{server/webui => ui}/src/lib/stores/mcp-resources.svelte.ts (100%) rename tools/{server/webui => ui}/src/lib/stores/mcp.svelte.ts (99%) rename tools/{server/webui => ui}/src/lib/stores/models.svelte.ts (98%) rename tools/{server/webui => ui}/src/lib/stores/permissions.svelte.ts (99%) rename tools/{server/webui => ui}/src/lib/stores/persisted.svelte.ts (100%) rename tools/{server/webui => ui}/src/lib/stores/server.svelte.ts (96%) rename tools/{server/webui => ui}/src/lib/stores/settings-referrer.svelte.ts (100%) rename tools/{server/webui => ui}/src/lib/stores/settings.svelte.ts (93%) rename tools/{server/webui => ui}/src/lib/stores/tools.svelte.ts (99%) rename tools/{server/webui => ui}/src/lib/types/agentic.d.ts (100%) rename tools/{server/webui => ui}/src/lib/types/api.d.ts (98%) rename tools/{server/webui => ui}/src/lib/types/chat.d.ts (100%) rename tools/{server/webui => ui}/src/lib/types/common.d.ts (100%) rename tools/{server/webui => ui}/src/lib/types/database.d.ts (97%) rename tools/{server/webui => ui}/src/lib/types/index.ts (100%) rename tools/{server/webui => ui}/src/lib/types/mcp.d.ts (100%) rename tools/{server/webui => ui}/src/lib/types/models.d.ts (100%) rename tools/{server/webui => ui}/src/lib/types/settings.d.ts (100%) rename tools/{server/webui => ui}/src/lib/types/tools.d.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/abort.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/agentic.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/api-fetch.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/api-headers.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/api-key-validation.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/attachment-display.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/attachment-type.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/audio-recording.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/autoresize-textarea.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/branching.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/browser-only.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/cache-ttl.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/clipboard.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/code.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/config-helpers.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/conversation-utils.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/convert-files-to-extra.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/cors-proxy.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/css.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/data-url.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/debounce.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/file-preview.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/file-type.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/formatters.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/headers.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/image-error-fallback.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/index.ts (97%) rename tools/{server/webui => ui}/src/lib/utils/is-ime-composing.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/latex-protection.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/legacy-migration.ts (93%) rename tools/{server/webui => ui}/src/lib/utils/mcp.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/modality-file-validation.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/model-names.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/pdf-processing.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/portal-to-body.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/precision.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/process-uploaded-files.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/redact.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/request-helpers.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/sanitize.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/svg-to-png.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/syntax-highlight-language.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/text-files.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/text.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/uri-template.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/url.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/uuid.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/viewport.ts (100%) rename tools/{server/webui => ui}/src/lib/utils/webp-to-png.ts (100%) rename tools/{server/webui => ui}/src/routes/(chat)/+layout.svelte (100%) rename tools/{server/webui => ui}/src/routes/(chat)/+page.svelte (100%) rename tools/{server/webui => ui}/src/routes/(chat)/+page.ts (100%) rename tools/{server/webui => ui}/src/routes/(chat)/chat/[id]/+page.svelte (100%) rename tools/{server/webui => ui}/src/routes/(chat)/chat/[id]/+page.ts (100%) rename tools/{server/webui => ui}/src/routes/+error.svelte (95%) rename tools/{server/webui => ui}/src/routes/+layout.svelte (100%) rename tools/{server/webui => ui}/src/routes/mcp-servers/+page.svelte (100%) rename tools/{server/webui => ui}/src/routes/settings/+layout.svelte (100%) rename tools/{server/webui => ui}/src/routes/settings/[[section]]/+page.svelte (100%) rename tools/{server/webui => ui}/src/styles/katex-custom.scss (100%) rename tools/{server/webui => ui}/static/favicon.svg (100%) rename tools/{server/webui => ui}/static/loading.html (100%) rename tools/{server/webui => ui}/svelte.config.js (89%) rename tools/{server/webui => ui}/tests/client/components/TestWrapper.svelte (100%) rename tools/{server/webui => ui}/tests/client/page.svelte.test.ts (100%) rename tools/{server/webui => ui}/tests/e2e/demo.test.ts (100%) rename tools/{server/webui => ui}/tests/stories/ChatMessage.stories.svelte (100%) rename tools/{server/webui => ui}/tests/stories/ChatScreenForm.stories.svelte (100%) rename tools/{server/webui => ui}/tests/stories/Introduction.mdx (93%) rename tools/{server/webui => ui}/tests/stories/MarkdownContent.stories.svelte (100%) rename tools/{server/webui => ui}/tests/stories/SidebarNavigation.stories.svelte (100%) rename tools/{server/webui => ui}/tests/stories/fixtures/ai-tutorial.ts (100%) rename tools/{server/webui => ui}/tests/stories/fixtures/api-docs.ts (100%) rename tools/{server/webui => ui}/tests/stories/fixtures/assets/1.jpg (100%) rename tools/{server/webui => ui}/tests/stories/fixtures/assets/beautiful-flowers-lotus.webp (100%) rename tools/{server/webui => ui}/tests/stories/fixtures/assets/example.pdf (100%) rename tools/{server/webui => ui}/tests/stories/fixtures/assets/hf-logo.svg (100%) rename tools/{server/webui => ui}/tests/stories/fixtures/blog-post.ts (100%) rename tools/{server/webui => ui}/tests/stories/fixtures/data-analysis.ts (100%) rename tools/{server/webui => ui}/tests/stories/fixtures/empty.ts (100%) rename tools/{server/webui => ui}/tests/stories/fixtures/math-formulas.ts (100%) rename tools/{server/webui => ui}/tests/stories/fixtures/readme.ts (100%) rename tools/{server/webui => ui}/tests/stories/fixtures/storybook-mocks.ts (100%) rename tools/{server/webui => ui}/tests/unit/agentic-sections.test.ts (94%) rename tools/{server/webui => ui}/tests/unit/agentic-strip.test.ts (100%) rename tools/{server/webui => ui}/tests/unit/clipboard.test.ts (100%) rename tools/{server/webui => ui}/tests/unit/latex-protection.test.ts (100%) rename tools/{server/webui => ui}/tests/unit/mcp-service.test.ts (100%) rename tools/{server/webui => ui}/tests/unit/model-id-parser.test.ts (100%) rename tools/{server/webui => ui}/tests/unit/model-names.test.ts (100%) rename tools/{server/webui => ui}/tests/unit/reasoning-context.test.ts (100%) rename tools/{server/webui => ui}/tests/unit/redact.test.ts (100%) rename tools/{server/webui => ui}/tests/unit/request-helpers.test.ts (100%) rename tools/{server/webui => ui}/tests/unit/sanitize-headers.test.ts (100%) rename tools/{server/webui => ui}/tests/unit/uri-template.test.ts (100%) rename tools/{server/webui => ui}/tsconfig.json (100%) create mode 100644 tools/ui/ui.cpp create mode 100644 tools/ui/ui.h rename tools/{server/webui => ui}/vite.config.ts (100%) rename tools/{server/webui => ui}/vitest-setup-client.ts (100%) diff --git a/.editorconfig b/.editorconfig index 7c1af01a1d8..5663b8fdbfc 100644 --- a/.editorconfig +++ b/.editorconfig @@ -45,7 +45,7 @@ insert_final_newline = unset trim_trailing_whitespace = unset insert_final_newline = unset -[tools/server/webui/**] +[tools/ui/**] indent_style = unset indent_size = unset end_of_line = unset diff --git a/.github/labeler.yml b/.github/labeler.yml index 2120672eee5..60aa51d2cc2 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -73,10 +73,10 @@ android: - changed-files: - any-glob-to-any-file: - examples/llama.android/** -server/webui: +server/ui: - changed-files: - any-glob-to-any-file: - - tools/server/webui/** + - tools/ui/** server: - changed-files: - any-glob-to-any-file: diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml index ee8fd41d7c7..2851c45601f 100644 --- a/.github/workflows/build-self-hosted.yml +++ b/.github/workflows/build-self-hosted.yml @@ -68,6 +68,8 @@ jobs: - name: Determine tag name id: tag uses: ./.github/actions/get-tag-name + env: + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} ggml-ci-nvidia-cuda: needs: determine-tag @@ -81,7 +83,7 @@ jobs: - name: Test id: ggml-ci env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | nvidia-smi GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp @@ -98,7 +100,7 @@ jobs: - name: Test id: ggml-ci env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp @@ -115,7 +117,7 @@ jobs: - name: Test id: ggml-ci env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp @@ -205,7 +207,7 @@ jobs: - name: Test id: ggml-ci env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp @@ -234,7 +236,7 @@ jobs: - name: Test id: ggml-ci env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \ bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp @@ -251,7 +253,7 @@ jobs: - name: Test id: ggml-ci env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp @@ -270,7 +272,7 @@ jobs: - name: Test id: ggml-ci env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp @@ -291,7 +293,7 @@ jobs: MSYSTEM: UCRT64 CHERE_INVOKING: 1 PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }} - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary # Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create @@ -332,7 +334,7 @@ jobs: - name: Test id: ggml-ci env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} + HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | source ./openvino_toolkit/setupvars.sh GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index bb512fd875c..1880c155c85 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -36,13 +36,8 @@ env: CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON" jobs: - webui-build: - name: Build WebUI - uses: ./.github/workflows/webui-build.yml macOS-cpu: - needs: - - webui-build strategy: matrix: @@ -71,11 +66,12 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: ccache uses: ggml-org/ccache-action@v1.2.21 @@ -113,8 +109,6 @@ jobs: name: llama-bin-macos-${{ matrix.build }}.tar.gz ubuntu-cpu: - needs: - - webui-build strategy: matrix: @@ -135,11 +129,12 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: ccache if: ${{ matrix.build != 's390x' }} @@ -191,8 +186,6 @@ jobs: name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz ubuntu-vulkan: - needs: - - webui-build strategy: matrix: @@ -211,11 +204,12 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: ccache uses: ggml-org/ccache-action@v1.2.21 @@ -268,8 +262,6 @@ jobs: name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz android-arm64: - needs: - - webui-build runs-on: ubuntu-latest @@ -283,11 +275,12 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: ccache uses: ggml-org/ccache-action@v1.2.21 @@ -346,8 +339,6 @@ jobs: name: llama-bin-android-arm64.tar.gz ubuntu-24-openvino: - needs: - - webui-build runs-on: ubuntu-24.04 @@ -370,11 +361,12 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: ccache uses: ggml-org/ccache-action@v1.2.21 @@ -435,8 +427,6 @@ jobs: name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz windows-cpu: - needs: - - webui-build runs-on: windows-2025 @@ -452,11 +442,12 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: ccache uses: ggml-org/ccache-action@v1.2.21 @@ -496,8 +487,6 @@ jobs: name: llama-bin-win-cpu-${{ matrix.arch }}.zip windows: - needs: - - webui-build runs-on: windows-2025 @@ -522,11 +511,12 @@ jobs: id: checkout uses: actions/checkout@v6 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: ccache uses: ggml-org/ccache-action@v1.2.21 @@ -587,8 +577,6 @@ jobs: name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip windows-cuda: - needs: - - webui-build runs-on: windows-2022 @@ -601,11 +589,12 @@ jobs: id: checkout uses: actions/checkout@v6 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: Install ccache uses: ggml-org/ccache-action@v1.2.21 @@ -667,8 +656,6 @@ jobs: name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip windows-sycl: - needs: - - webui-build runs-on: windows-2022 @@ -708,11 +695,12 @@ jobs: Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: ccache uses: ggml-org/ccache-action@v1.2.21 @@ -781,8 +769,6 @@ jobs: name: llama-bin-win-sycl-x64.zip ubuntu-24-sycl: - needs: - - webui-build strategy: matrix: @@ -831,11 +817,12 @@ jobs: wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: ccache uses: ggml-org/ccache-action@v1.2.21 @@ -876,8 +863,6 @@ jobs: name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz ubuntu-22-rocm: - needs: - - webui-build runs-on: ubuntu-22.04 @@ -895,11 +880,12 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: Free up disk space uses: ggml-org/free-disk-space@v1.3.1 @@ -988,8 +974,6 @@ jobs: name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz windows-hip: - needs: - - webui-build runs-on: windows-2022 @@ -1007,11 +991,12 @@ jobs: id: checkout uses: actions/checkout@v6 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: Grab rocWMMA package id: grab_rocwmma @@ -1259,7 +1244,6 @@ jobs: runs-on: ubuntu-slim needs: - - webui-build - windows - windows-cpu - windows-cuda @@ -1404,14 +1388,14 @@ jobs: } } - webui-publish: + ui-publish: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} needs: - release - uses: ./.github/workflows/webui-publish.yml + uses: ./.github/workflows/ui-publish.yml with: version_tag: ${{ needs.release.outputs.tag_name }} secrets: - hf_token: ${{ secrets.HF_TOKEN_WEBUI_STATIC_OUTPUT }} + hf_token: ${{ secrets.HF_TOKEN_UI_STATIC_OUTPUT }} diff --git a/.github/workflows/server-sanitize.yml b/.github/workflows/server-sanitize.yml index 4c9f447cf8b..53c9968ee96 100644 --- a/.github/workflows/server-sanitize.yml +++ b/.github/workflows/server-sanitize.yml @@ -67,6 +67,13 @@ jobs: fetch-depth: 0 ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + - name: Setup Node.js + uses: actions/setup-node@v6 + with: + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" + - name: Build id: cmake_build run: | diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml index 117c79c2f87..d3b3e4cc7e8 100644 --- a/.github/workflows/server-self-hosted.yml +++ b/.github/workflows/server-self-hosted.yml @@ -39,12 +39,7 @@ concurrency: cancel-in-progress: true jobs: - webui-build: - name: Build WebUI - uses: ./.github/workflows/webui-build.yml - server-metal: - needs: webui-build runs-on: [self-hosted, llama-server, macOS, ARM64] name: server-metal (${{ matrix.wf_name }}) @@ -72,11 +67,12 @@ jobs: fetch-depth: 0 ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: Build id: cmake_build diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index f12e7b7366f..7b9c5a3a3d8 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -54,12 +54,7 @@ concurrency: cancel-in-progress: true jobs: - webui-build: - name: Build WebUI - uses: ./.github/workflows/webui-build.yml - server: - needs: webui-build runs-on: ubuntu-latest name: server (${{ matrix.wf_name }}) @@ -98,11 +93,12 @@ jobs: fetch-depth: 0 ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: Build id: cmake_build @@ -136,7 +132,6 @@ jobs: SLOW_TESTS=1 pytest -v -x server-windows: - needs: webui-build runs-on: windows-2022 steps: @@ -147,11 +142,10 @@ jobs: fetch-depth: 0 ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" - name: Build id: cmake_build diff --git a/.github/workflows/webui-build.yml b/.github/workflows/ui-build.yml similarity index 67% rename from .github/workflows/webui-build.yml rename to .github/workflows/ui-build.yml index 7f512a69d3d..511c96fb6fc 100644 --- a/.github/workflows/webui-build.yml +++ b/.github/workflows/ui-build.yml @@ -1,11 +1,11 @@ -name: Build WebUI +name: UI Build on: workflow_call: jobs: build: - name: Build WebUI + name: Build static output runs-on: ubuntu-slim env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} @@ -19,26 +19,26 @@ jobs: with: node-version: "24" cache: "npm" - cache-dependency-path: "tools/server/webui/package-lock.json" + cache-dependency-path: "tools/ui/package-lock.json" - name: Install dependencies run: npm ci - working-directory: tools/server/webui + working-directory: tools/ui - name: Build application run: npm run build - working-directory: tools/server/webui + working-directory: tools/ui - name: Generate checksums run: | - cd tools/server/public + cd build/tools/ui/dist for f in *; do sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt done - - name: Upload built webui + - name: Upload built UI uses: actions/upload-artifact@v6 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/dist/ retention-days: 1 diff --git a/.github/workflows/server-webui.yml b/.github/workflows/ui-ci.yml similarity index 75% rename from .github/workflows/server-webui.yml rename to .github/workflows/ui-ci.yml index 72c2016efe7..43d6e125687 100644 --- a/.github/workflows/server-webui.yml +++ b/.github/workflows/ui-ci.yml @@ -1,4 +1,4 @@ -name: Server WebUI +name: CI (UI) on: workflow_dispatch: @@ -11,15 +11,15 @@ on: branches: - master paths: [ - '.github/workflows/server-webui.yml', - 'tools/server/webui/**.*', + '.github/workflows/ui-ci.yml', + 'tools/ui/**.*', 'tools/server/tests/**.*' ] pull_request: types: [opened, synchronize, reopened] paths: [ - '.github/workflows/server-webui.yml', - 'tools/server/webui/**.*', + '.github/workflows/ui-ci.yml', + 'tools/ui/**.*', 'tools/server/tests/**.*' ] @@ -34,13 +34,13 @@ concurrency: cancel-in-progress: true jobs: - webui-build: - name: Build WebUI - uses: ./.github/workflows/webui-build.yml + ui-build: + name: Build static output + uses: ./.github/workflows/ui-build.yml - webui-checks: - name: WebUI Checks - needs: webui-build + ui-checks: + name: UI Checks + needs: ui-build runs-on: ubuntu-24.04-arm continue-on-error: true steps: @@ -56,43 +56,43 @@ jobs: with: node-version: "24" cache: "npm" - cache-dependency-path: "tools/server/webui/package-lock.json" + cache-dependency-path: "tools/ui/package-lock.json" - name: Install dependencies id: setup if: ${{ steps.node.conclusion == 'success' }} run: npm ci - working-directory: tools/server/webui + working-directory: tools/ui - name: Run type checking if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run check - working-directory: tools/server/webui + working-directory: tools/ui - name: Run linting if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run lint - working-directory: tools/server/webui + working-directory: tools/ui - name: Install Playwright browsers id: playwright if: ${{ always() && steps.setup.conclusion == 'success' }} run: npx playwright install --with-deps - working-directory: tools/server/webui + working-directory: tools/ui - name: Run Client tests if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:client - working-directory: tools/server/webui + working-directory: tools/ui - name: Run Unit tests if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:unit - working-directory: tools/server/webui + working-directory: tools/ui e2e-tests: name: E2E Tests - needs: webui-build + needs: ui-build runs-on: ubuntu-24.04-arm steps: - name: Checkout code @@ -107,36 +107,36 @@ jobs: with: node-version: "24" cache: "npm" - cache-dependency-path: "tools/server/webui/package-lock.json" + cache-dependency-path: "tools/ui/package-lock.json" - name: Install dependencies id: setup if: ${{ steps.node.conclusion == 'success' }} run: npm ci - working-directory: tools/server/webui + working-directory: tools/ui - name: Build application if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run build - working-directory: tools/server/webui + working-directory: tools/ui - name: Install Playwright browsers id: playwright if: ${{ always() && steps.setup.conclusion == 'success' }} run: npx playwright install --with-deps - working-directory: tools/server/webui + working-directory: tools/ui - name: Build Storybook if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run build-storybook - working-directory: tools/server/webui + working-directory: tools/ui - name: Run UI tests if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:ui -- --testTimeout=60000 - working-directory: tools/server/webui + working-directory: tools/ui - name: Run E2E tests if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:e2e - working-directory: tools/server/webui + working-directory: tools/ui diff --git a/.github/workflows/webui-publish.yml b/.github/workflows/ui-publish.yml similarity index 84% rename from .github/workflows/webui-publish.yml rename to .github/workflows/ui-publish.yml index bf0d707a23e..33d7415c9c4 100644 --- a/.github/workflows/webui-publish.yml +++ b/.github/workflows/ui-publish.yml @@ -1,4 +1,4 @@ -name: WebUI Publish +name: UI Publish on: workflow_call: @@ -14,14 +14,14 @@ on: jobs: publish: - name: Publish WebUI Static Output + name: Publish UI Static Output runs-on: ubuntu-24.04-arm permissions: contents: read env: - HF_BUCKET_NAME: ${{ vars.HF_BUCKET_WEBUI_STATIC_OUTPUT }} + HF_BUCKET_NAME: ${{ vars.HF_BUCKET_UI_STATIC_OUTPUT }} steps: - name: Checkout code @@ -29,11 +29,11 @@ jobs: with: fetch-depth: 1 - - name: Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: build/tools/ui/dist/ - name: Install Hugging Face Hub CLI run: pip install -U huggingface_hub @@ -44,12 +44,12 @@ jobs: - name: Sync built files to Hugging Face bucket (version tag) run: | # Upload the built files to the Hugging Face bucket under the release version - hf buckets sync tools/server/public hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet + hf buckets sync build/tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet - name: Sync built files to Hugging Face bucket (latest) run: | # Also upload to the 'latest' directory for fallback downloads - hf buckets sync tools/server/public hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet + hf buckets sync build/tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet - name: Verify upload run: | diff --git a/.gitignore b/.gitignore index 5f53fbf7a61..8dc9d7d0b8d 100644 --- a/.gitignore +++ b/.gitignore @@ -54,7 +54,6 @@ /tmp/ /autogen-*.md /common/build-info.cpp -/tools/server/public # Deprecated @@ -93,10 +92,12 @@ !/examples/sycl/*.bat !/examples/sycl/*.sh -# Server Web UI temporary files +# Server Web UI temporary files (+ legacy directory) /tools/server/webui/node_modules /tools/server/webui/dist +/tools/ui/node_modules +/tools/ui/dist # Python diff --git a/CMakeLists.txt b/CMakeLists.txt index 244f4cb4982..44746072313 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,8 +108,23 @@ option(LLAMA_BUILD_TESTS "llama: build tests" option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_WEBUI "llama: build the embedded Web UI for server" ON) -option(LLAMA_USE_PREBUILT_WEBUI "llama: use prebuilt WebUI from HF Bucket when available (requires LLAMA_BUILD_WEBUI=ON)" ON) +# Deprecated: use LLAMA_BUILD_UI instead (kept for backward compat) +option(LLAMA_BUILD_WEBUI "llama: build the embedded Web UI for server (deprecated: use LLAMA_BUILD_UI)" ON) +option(LLAMA_USE_PREBUILT_WEBUI "llama: use prebuilt WebUI from HF Bucket when available (deprecated: use LLAMA_USE_PREBUILT_UI)" ON) + +# New option names +option(LLAMA_BUILD_UI "llama: build the embedded Web UI for server" ON) +option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON) + +# Backward compat: when old var is set but new one isn't, forward the value +if(DEFINED LLAMA_BUILD_WEBUI AND NOT DEFINED LLAMA_BUILD_UI) + set(LLAMA_BUILD_UI ${LLAMA_BUILD_WEBUI}) + message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead") +endif() +if(DEFINED LLAMA_USE_PREBUILT_WEBUI AND NOT DEFINED LLAMA_USE_PREBUILT_UI) + set(LLAMA_USE_PREBUILT_UI ${LLAMA_USE_PREBUILT_WEBUI}) + message(DEPRECATION "LLAMA_USE_PREBUILT_WEBUI is deprecated, use LLAMA_USE_PREBUILT_UI instead") +endif() option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT}) option(LLAMA_TESTS_INSTALL "llama: install tests" ON) diff --git a/CODEOWNERS b/CODEOWNERS index a4395969fe3..f58f0f830fa 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -15,7 +15,7 @@ # ggml-org/llama-common : ggerganov, aldehir, angt, danbev, ngxson, pwilkin # ggml-org/llama-mtmd : ngxson # ggml-org/llama-server : ggerganov, ngxson, allozaur, angt, ServeurpersoCom -# ggml-org/llama-webui : allozaur +# ggml-org/llama-ui : allozaur /.devops/*.Dockerfile @ngxson /.github/actions/ @ggml-org/ci @@ -107,7 +107,7 @@ /tools/rpc/ @ggml-org/ggml-rpc /tools/server/* @ggml-org/llama-server # no subdir /tools/server/tests/ @ggml-org/llama-server -/tools/server/webui/ @ggml-org/llama-webui +/tools/ui/ @ggml-org/llama-ui /tools/tokenize/ @ggerganov /tools/tts/ @ggerganov /vendor/ @ggerganov diff --git a/common/arg.cpp b/common/arg.cpp index 15d5ad77af1..2129a9c7266 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2844,28 +2844,64 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.api_prefix = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX")); + // Deprecated: use --ui-config instead (kept for backward compat) add_opt(common_arg( {"--webui-config"}, "JSON", - "JSON that provides default WebUI settings (overrides WebUI defaults)", + "[DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)", [](common_params & params, const std::string & value) { + params.ui_config_json = value; params.webui_config_json = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG")); + + add_opt(common_arg( + {"--ui-config"}, "JSON", + "JSON that provides default UI settings (overrides UI defaults)", + [](common_params & params, const std::string & value) { + params.ui_config_json = value; + params.webui_config_json = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG")); + + // Deprecated: use --ui-config-file instead (kept for backward compat) add_opt(common_arg( {"--webui-config-file"}, "PATH", - "JSON file that provides default WebUI settings (overrides WebUI defaults)", + "[DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)", [](common_params & params, const std::string & value) { - params.webui_config_json = read_file(value); + params.ui_config_json = read_file(value); + params.webui_config_json = params.ui_config_json; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE")); + + add_opt(common_arg( + {"--ui-config-file"}, "PATH", + "JSON file that provides default UI settings (overrides UI defaults)", + [](common_params & params, const std::string & value) { + params.ui_config_json = read_file(value); + params.webui_config_json = params.ui_config_json; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG_FILE")); + + // Deprecated: use --ui-mcp-proxy instead (kept for backward compat) add_opt(common_arg( {"--webui-mcp-proxy"}, {"--no-webui-mcp-proxy"}, - string_format("experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: %s)", params.webui_mcp_proxy ? "enabled" : "disabled"), + "[DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy", [](common_params & params, bool value) { + params.ui_mcp_proxy = value; params.webui_mcp_proxy = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY")); + + add_opt(common_arg( + {"--ui-mcp-proxy"}, + {"--no-ui-mcp-proxy"}, + "experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)", + [](common_params & params, bool value) { + params.ui_mcp_proxy = value; + params.webui_mcp_proxy = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_MCP_PROXY")); add_opt(common_arg( {"--tools"}, "TOOL1,TOOL2,...", "experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)\n" @@ -2875,14 +2911,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.server_tools = parse_csv_row(value); } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS")); + // Deprecated: use --ui/--no-ui instead (kept for backward compat) add_opt(common_arg( {"--webui"}, {"--no-webui"}, - string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"), + "[DEPRECATED: use --ui/--no-ui] whether to enable the Web UI", [](common_params & params, bool value) { + params.ui = value; params.webui = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI")); + + add_opt(common_arg( + {"--ui"}, + {"--no-ui"}, + string_format("whether to enable the Web UI (default: %s)", params.ui ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.ui = value; + params.webui = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI")); add_opt(common_arg( {"--embedding", "--embeddings"}, string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), diff --git a/common/common.h b/common/common.h index 764574e23a0..c6223c4b515 100644 --- a/common/common.h +++ b/common/common.h @@ -604,15 +604,23 @@ struct common_params { std::map default_template_kwargs; - // webui configs -#ifdef LLAMA_WEBUI_DEFAULT_ENABLED - bool webui = LLAMA_WEBUI_DEFAULT_ENABLED != 0; + // UI configs +#ifdef LLAMA_UI_DEFAULT_ENABLED + bool ui = LLAMA_UI_DEFAULT_ENABLED != 0; +#elif defined(LLAMA_WEBUI_DEFAULT_ENABLED) + bool ui = LLAMA_WEBUI_DEFAULT_ENABLED != 0; #else - bool webui = true; // default to enabled when not set + bool ui = true; // default to enabled when not set #endif + + // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead + bool webui = ui; bool webui_mcp_proxy = false; std::string webui_config_json; + bool ui_mcp_proxy = false; + std::string ui_config_json; + // "advanced" endpoints are disabled by default for better security bool endpoint_slots = true; bool endpoint_props = false; // only control POST requests, not GET diff --git a/scripts/webui-download.cmake b/scripts/ui-download.cmake similarity index 76% rename from scripts/webui-download.cmake rename to scripts/ui-download.cmake index 6695c018035..65143642a2c 100644 --- a/scripts/webui-download.cmake +++ b/scripts/ui-download.cmake @@ -1,5 +1,5 @@ -# Download webui assets from Hugging Face Bucket at build time -# Usage: cmake -DPUBLIC_DIR=... -DHF_BUCKET=... -DHF_VERSION=... -DASSETS="a;b;c" -P scripts/webui-download.cmake +# Download UI assets from Hugging Face Bucket at build time +# Usage: cmake -DPUBLIC_DIR=... -DHF_BUCKET=... -DHF_VERSION=... -DASSETS="a;b;c" -P scripts/ui-download.cmake # # Asset provisioning priority: # 1. Pre-built assets already in PUBLIC_DIR (cached from a previous run) @@ -14,7 +14,7 @@ set(HF_VERSION "" CACHE STRING "Version to download (empty = resolve from git) set(ASSETS "" CACHE STRING "Plus-separated list of asset filenames (+)") set(STAMP_FILE "" CACHE STRING "Stamp file to create on success (optional)") set(SOURCE_DIR "" CACHE STRING "Project source root (to resolve version from git)") -set(NPM_DIR "" CACHE STRING "WebUI source directory (to run npm build)") +set(NPM_DIR "" CACHE STRING "UI source directory (to run npm build)") set(HF_ENABLED "" CACHE STRING "Whether to allow HF Bucket download (ON/OFF)") # --------------------------------------------------------------------------- @@ -25,8 +25,8 @@ if("${RESOLVED_VERSION}" STREQUAL "" AND NOT "${SOURCE_DIR}" STREQUAL "") if(EXISTS "${SOURCE_DIR}/cmake/build-info.cmake") include("${SOURCE_DIR}/cmake/build-info.cmake") if(NOT "${BUILD_NUMBER}" STREQUAL "" AND NOT BUILD_NUMBER EQUAL 0) - set(RESOLVED_VERSION "${BUILD_NUMBER}") - message(STATUS "WebUI: resolved version from git: ${RESOLVED_VERSION}") + set(RESOLVED_VERSION "b${BUILD_NUMBER}") + message(STATUS "UI: resolved version from git: ${RESOLVED_VERSION}") endif() endif() endif() @@ -43,7 +43,7 @@ if(NOT "${STAMP_FILE}" STREQUAL "" AND EXISTS "${STAMP_FILE}") file(READ "${STAMP_FILE}" STAMPED_VERSION) string(STRIP "${STAMPED_VERSION}" STAMPED_VERSION) if(NOT "${STAMPED_VERSION}" STREQUAL "${RESOLVED_VERSION}") - message(STATUS "WebUI: version changed (${STAMPED_VERSION} -> ${RESOLVED_VERSION}), re-building") + message(STATUS "UI: version changed (${STAMPED_VERSION} -> ${RESOLVED_VERSION}), re-building") set(FORCE_REBUILD TRUE) endif() endif() @@ -60,7 +60,7 @@ foreach(asset ${ASSETS}) endforeach() if(ALL_EXISTS AND NOT FORCE_REBUILD) - message(STATUS "WebUI: all assets already exist in ${PUBLIC_DIR}, skipping") + message(STATUS "UI: all assets already exist in ${PUBLIC_DIR}, skipping") return() endif() @@ -76,27 +76,27 @@ if(NOT PROVISION_SUCCESS AND NOT "${NPM_DIR}" STREQUAL "") # Check if npm is available before attempting npm build find_program(NPM_EXECUTABLE npm) if(NPM_EXECUTABLE) - message(STATUS "WebUI: building from source in ${NPM_DIR}") + message(STATUS "UI: building from source in ${NPM_DIR}") # Run npm install if node_modules is missing if(NOT EXISTS "${NPM_DIR}/node_modules") - message(STATUS "WebUI: running npm install (first time)") + message(STATUS "UI: running npm install (first time)") execute_process( - COMMAND npm install + COMMAND ${NPM_EXECUTABLE} install WORKING_DIRECTORY "${NPM_DIR}" RESULT_VARIABLE NPM_INSTALL_RESULT OUTPUT_VARIABLE NPM_OUT ERROR_VARIABLE NPM_ERR ) if(NOT NPM_INSTALL_RESULT EQUAL 0) - message(STATUS "WebUI: npm install failed (${NPM_INSTALL_RESULT}), falling back to download") + message(STATUS "UI: npm install failed (${NPM_INSTALL_RESULT}), falling back to download") message(STATUS " stderr: ${NPM_ERR}") endif() endif() # Run the build execute_process( - COMMAND npm run build + COMMAND ${NPM_EXECUTABLE} run build WORKING_DIRECTORY "${NPM_DIR}" RESULT_VARIABLE NPM_BUILD_RESULT OUTPUT_VARIABLE NPM_OUT @@ -114,20 +114,20 @@ if(NOT PROVISION_SUCCESS AND NOT "${NPM_DIR}" STREQUAL "") endforeach() if(ALL_BUILT) - message(STATUS "WebUI: local npm build succeeded") + message(STATUS "UI: local npm build succeeded") set(PROVISION_SUCCESS TRUE) else() - message(STATUS "WebUI: npm build completed but assets missing from ${PUBLIC_DIR}, falling back to download") + message(STATUS "UI: npm build completed but assets missing from ${PUBLIC_DIR}, falling back to download") endif() else() - message(STATUS "WebUI: npm build failed (${NPM_BUILD_RESULT}), falling back to download") + message(STATUS "UI: npm build failed (${NPM_BUILD_RESULT}), falling back to download") message(STATUS " stderr: ${NPM_ERR}") endif() else() - message(STATUS "WebUI: npm not found, skipping npm build and trying HF Bucket download") + message(STATUS "UI: npm not found, skipping npm build and trying HF Bucket download") endif() else() - message(STATUS "WebUI: NPM_DIR (${NPM_DIR}) has no package.json, skipping npm build") + message(STATUS "UI: NPM_DIR (${NPM_DIR}) has no package.json, skipping npm build") endif() endif() @@ -148,7 +148,7 @@ if(NOT PROVISION_SUCCESS AND HF_ENABLED) string(REGEX REPLACE "^([^:]+):.*$" "\\1" url_label "${entry}") string(REGEX REPLACE "^[^:]+:(.*)$" "\\1" base_url "${entry}") - message(STATUS "WebUI: downloading assets from ${url_label}: ${base_url}") + message(STATUS "UI: downloading assets from ${url_label}: ${base_url}") # Download each asset set(ALL_OK TRUE) @@ -161,11 +161,11 @@ if(NOT PROVISION_SUCCESS AND HF_ENABLED) list(GET download_status 0 download_result) if(NOT download_result EQUAL 0) list(GET download_status 1 error_message) - message(STATUS "WebUI: failed to download ${asset} from ${url_label}: ${error_message}") + message(STATUS "UI: failed to download ${asset} from ${url_label}: ${error_message}") set(ALL_OK FALSE) break() endif() - message(STATUS "WebUI: downloaded ${asset}") + message(STATUS "UI: downloaded ${asset}") endforeach() if(NOT ALL_OK) @@ -179,7 +179,7 @@ if(NOT PROVISION_SUCCESS AND HF_ENABLED) ) list(GET checksum_status 0 checksum_result) if(checksum_result EQUAL 0) - message(STATUS "WebUI: verifying checksums...") + message(STATUS "UI: verifying checksums...") file(STRINGS "${PUBLIC_DIR}/checksums.txt" CHECKSUMS_CONTENT) foreach(asset ${ASSETS}) set(download_path "${PUBLIC_DIR}/${asset}") @@ -187,12 +187,13 @@ if(NOT PROVISION_SUCCESS AND HF_ENABLED) string(TOLOWER "${asset_hash}" EXPECTED_HASH_LOWER) string(REGEX MATCH "${EXPECTED_HASH_LOWER}[ \\t]+${asset}" CHECKSUM_LINE "${CHECKSUMS_CONTENT}") if(NOT CHECKSUM_LINE) - message(WARNING "WebUI: checksum verification failed for ${asset}") - message(WARNING " downloaded file may not match expected checksum, but will be used") + message(WARNING "UI: checksum verification failed for ${asset}") + set(ALL_OK FALSE) + break() endif() endforeach() if(ALL_OK) - message(STATUS "WebUI: all checksums verified") + message(STATUS "UI: all checksums verified") endif() endif() @@ -203,9 +204,9 @@ if(NOT PROVISION_SUCCESS AND HF_ENABLED) endforeach() if(PROVISION_SUCCESS) - message(STATUS "WebUI: provisioning complete") + message(STATUS "UI: provisioning complete") else() - message(WARNING "WebUI: failed to download assets from HF Bucket (${HF_BUCKET})") + message(WARNING "UI: failed to download assets from HF Bucket (${HF_BUCKET})") endif() endif() @@ -217,6 +218,6 @@ if(PROVISION_SUCCESS) file(WRITE "${STAMP_FILE}" "${RESOLVED_VERSION}") endif() else() - message(WARNING "WebUI: no source available. Neither local build (${NPM_DIR}) nor HF Bucket download succeeded.") - message(WARNING "WebUI: building server without embedded WebUI. Set LLAMA_BUILD_WEBUI=OFF to suppress this warning.") + message(WARNING "UI: no source available. Neither local build (${NPM_DIR}) nor HF Bucket download succeeded.") + message(WARNING "UI: building server without embedded UI. Set LLAMA_BUILD_UI=OFF to suppress this warning.") endif() diff --git a/scripts/xxd.cmake b/scripts/xxd.cmake index 14d2753808a..73f6cfff7f2 100644 --- a/scripts/xxd.cmake +++ b/scripts/xxd.cmake @@ -1,5 +1,5 @@ # CMake equivalent of `xxd -i ${INPUT} ${OUTPUT}` -# Usage: cmake -DINPUT=tools/server/public/index.html -DOUTPUT=tools/server/index.html.hpp -P scripts/xxd.cmake +# Usage: cmake -DINPUT=build/tools/ui/dist/index.html -DOUTPUT=build/tools/ui/dist/index.html.hpp -P scripts/xxd.cmake SET(INPUT "" CACHE STRING "Input File") SET(OUTPUT "" CACHE STRING "Output File") diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index b433c91d85e..a60d3dab469 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -22,6 +22,7 @@ else() add_subdirectory(perplexity) add_subdirectory(quantize) if (LLAMA_BUILD_SERVER) + add_subdirectory(ui) add_subdirectory(cli) add_subdirectory(server) endif() diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index 20e4eb37697..57d3e871d99 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -40,136 +40,11 @@ set(TARGET_SRCS server-models.h ) -# Option to specify custom HF bucket for webui (defaults to llama-ui) -# Usage: cmake -B build -DLLAMA_WEBUI_HF_BUCKET=llama-ui -set(LLAMA_WEBUI_HF_BUCKET "llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt webui assets") - -if (LLAMA_BUILD_WEBUI) - set(PUBLIC_ASSETS - index.html - bundle.js - bundle.css - loading.html - ) - - # Determine source of webui assets (priority: local > HF Bucket) - set(WEBUI_SOURCE "") - set(WEBUI_SOURCE_DIR "") - - # Priority 1: Check for local webui build output - set(LOCAL_WEBUI_DIR "${CMAKE_CURRENT_SOURCE_DIR}/public") - - # Verify all required assets exist before declaring local source valid - set(ALL_ASSETS_PRESENT TRUE) - foreach(asset ${PUBLIC_ASSETS}) - if(NOT EXISTS "${LOCAL_WEBUI_DIR}/${asset}") - set(ALL_ASSETS_PRESENT FALSE) - break() - endif() - endforeach() - - if(ALL_ASSETS_PRESENT) - set(WEBUI_SOURCE "local") - set(WEBUI_SOURCE_DIR "${LOCAL_WEBUI_DIR}") - message(STATUS "WebUI: using local build from ${WEBUI_SOURCE_DIR}") - endif() - - # Priority 2: Build-time asset provisioning (npm build → HF Bucket fallback) - if(NOT WEBUI_SOURCE_DIR) - # Environment variable takes precedence (e.g., from CI workflows) - if(DEFINED ENV{HF_WEBUI_VERSION}) - set(HF_WEBUI_VERSION "$ENV{HF_WEBUI_VERSION}") - # Validate against allowed characters to prevent CMake list separator - # or path-traversal issues in stamp filenames and download URLs - if(NOT HF_WEBUI_VERSION MATCHES "^[A-Za-z0-9._-]+$") - message(FATAL_ERROR "WebUI: invalid HF_WEBUI_VERSION='${HF_WEBUI_VERSION}' - must match ^[A-Za-z0-9._-]+$") - endif() - message(STATUS "WebUI: using HF_WEBUI_VERSION from environment=${HF_WEBUI_VERSION}") - elseif(DEFINED LLAMA_BUILD_NUMBER) - set(HF_WEBUI_VERSION "b${LLAMA_BUILD_NUMBER}") - message(STATUS "WebUI: using LLAMA_BUILD_NUMBER=${HF_WEBUI_VERSION}") - else() - set(HF_WEBUI_VERSION "") - message(STATUS "WebUI: version not specified (will use HF 'latest')") - endif() - - # Stamp file embeds the version tag so a changed build number triggers - # a fresh provision run on the next `cmake --build` without reconfiguring. - if("${HF_WEBUI_VERSION}" STREQUAL "") - set(WEBUI_VERSION_TAG "provisioned") - else() - set(WEBUI_VERSION_TAG "${HF_WEBUI_VERSION}") - endif() - set(WEBUI_STAMP "${CMAKE_CURRENT_BINARY_DIR}/.webui-${WEBUI_VERSION_TAG}.stamp") - - # Join assets with + separator (safe across all platforms, unlike ; and |) - string(REPLACE ";" "+" PUBLIC_ASSETS_JOINED "${PUBLIC_ASSETS}") - - add_custom_command( - OUTPUT ${WEBUI_STAMP} - COMMAND ${CMAKE_COMMAND} - "-DSOURCE_DIR=${PROJECT_SOURCE_DIR}" - "-DPUBLIC_DIR=${CMAKE_CURRENT_SOURCE_DIR}/public" - "-DHF_BUCKET=${LLAMA_WEBUI_HF_BUCKET}" - "-DHF_VERSION=${HF_WEBUI_VERSION}" - "-DHF_ENABLED=${LLAMA_USE_PREBUILT_WEBUI}" - "-DASSETS=${PUBLIC_ASSETS_JOINED}" - "-DSTAMP_FILE=${WEBUI_STAMP}" - "-DNPM_DIR=${CMAKE_CURRENT_SOURCE_DIR}/webui" - -P ${PROJECT_SOURCE_DIR}/scripts/webui-download.cmake - COMMENT "Building/provisioning WebUI assets (npm build -> HF Bucket fallback)" - ) - - set(WEBUI_SOURCE "provisioned") - set(WEBUI_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/public") - endif() - - # Process assets from the determined source - if(WEBUI_SOURCE_DIR) - foreach(asset ${PUBLIC_ASSETS}) - set(input "${WEBUI_SOURCE_DIR}/${asset}") - set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp") - list(APPEND TARGET_SRCS ${output}) - - if(WEBUI_SOURCE STREQUAL "local") - # Local build: files exist at configure time - if(NOT EXISTS "${input}") - message(FATAL_ERROR "WebUI asset not found: ${input}") - endif() - set(dependency "${input}") - else() - # HF Bucket: files are downloaded at build time - set(dependency "${WEBUI_STAMP}") - endif() - - add_custom_command( - DEPENDS ${dependency} - OUTPUT "${output}" - COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake" - ) - set_source_files_properties(${output} PROPERTIES GENERATED TRUE) - endforeach() - - add_definitions(-DLLAMA_BUILD_WEBUI) - add_definitions(-DLLAMA_WEBUI_DEFAULT_ENABLED=1) - message(STATUS "WebUI: embedded with source: ${WEBUI_SOURCE}") - else() - # WebUI source not found - issue warning but don't fail the build - # The server will still build but without webui embedded - message(WARNING "WebUI: no source available. Neither local build (tools/server/public/) nor HF Bucket download succeeded.") - message(WARNING "WebUI: building server without embedded WebUI. Set LLAMA_BUILD_WEBUI=OFF to suppress this warning.") - add_definitions(-DLLAMA_WEBUI_DEFAULT_ENABLED=0) - endif() -else() - # WebUI is disabled at build time - add_definitions(-DLLAMA_WEBUI_DEFAULT_ENABLED=0) -endif() - add_executable(${TARGET} ${TARGET_SRCS}) install(TARGETS ${TARGET} RUNTIME) target_include_directories(${TARGET} PRIVATE ../mtmd) target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}) -target_link_libraries(${TARGET} PRIVATE server-context PUBLIC llama-common cpp-httplib ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE server-context llama-ui PUBLIC llama-common cpp-httplib ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/server/README-dev.md b/tools/server/README-dev.md index a9c1e7385fc..0ff334724a3 100644 --- a/tools/server/README-dev.md +++ b/tools/server/README-dev.md @@ -224,7 +224,7 @@ The SvelteKit-based Web UI is introduced in this PR: https://github.com/ggml-org ### Architecture -The WebUI follows a layered architecture: +The UI follows a layered architecture: ``` Routes → Components → Hooks → Stores → Services → Storage/API @@ -234,7 +234,7 @@ Routes → Components → Hooks → Stores → Services → Storage/API - **Services** - stateless API/database communication (`ChatService`, `ModelsService`, `PropsService`, `DatabaseService`) - **Hooks** - reusable logic (`useModelChangeValidation`, `useProcessingState`) -For detailed architecture diagrams, see [`tools/server/webui/docs/`](webui/docs/): +For detailed architecture diagrams, see [`tools/ui/docs/`](../ui/docs/): - `high-level-architecture.mmd` - full architecture with all modules - `high-level-architecture-simplified.mmd` - simplified overview @@ -246,7 +246,7 @@ For detailed architecture diagrams, see [`tools/server/webui/docs/`](webui/docs/ ```sh # make sure you have Node.js installed -cd tools/server/webui +cd tools/ui npm i # run dev server (with hot reload) diff --git a/tools/server/README.md b/tools/server/README.md index 2b3a2b1687d..2ed7fe16ee2 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -189,11 +189,11 @@ For the full list of features, please refer to [server's changelog](https://gith | `--reuse-port` | allow multiple sockets to bind to the same port (default: disabled)
(env: LLAMA_ARG_REUSE_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | | `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )
(env: LLAMA_ARG_API_PREFIX) | -| `--webui-config JSON` | JSON that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG) | -| `--webui-config-file PATH` | JSON file that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG_FILE) | -| `--webui-mcp-proxy, --no-webui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)
(env: LLAMA_ARG_WEBUI_MCP_PROXY) | +| `--ui-config JSON` / `--webui-config JSON` (deprecated) | JSON that provides default UI settings (overrides UI defaults)
(env: LLAMA_ARG_UI_CONFIG / LLAMA_ARG_WEBUI_CONFIG) | +| `--ui-config-file PATH` / `--webui-config-file PATH` (deprecated) | JSON file that provides default UI settings (overrides UI defaults)
(env: LLAMA_ARG_UI_CONFIG_FILE / LLAMA_ARG_WEBUI_CONFIG_FILE) | +| `--ui-mcp-proxy, --no-ui-mcp-proxy` / `--webui-mcp-proxy, --no-webui-mcp-proxy` (deprecated) | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)
(env: LLAMA_ARG_UI_MCP_PROXY / LLAMA_ARG_WEBUI_MCP_PROXY) | | `--tools TOOL1,TOOL2,...` | experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)
specify "all" to enable all tools
available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff, get_datetime
(env: LLAMA_ARG_TOOLS) | -| `--webui, --no-webui` | whether to enable the Web UI (default: enabled)
(env: LLAMA_ARG_WEBUI) | +| `--ui, --no-ui` / `--webui, --no-webui` (deprecated) | whether to enable the Web UI (default: enabled)
(env: LLAMA_ARG_UI / LLAMA_ARG_WEBUI) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | | `--api-key KEY` | API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)
(env: LLAMA_API_KEY) | @@ -1831,10 +1831,12 @@ Apart from error types supported by OAI, we also have custom types that are spec ### Custom default Web UI preferences -You can specify default preferences for the web UI using `--webui-config ` or `--webui-config-file `. For example, you can disable pasting long text as attachments and enable rendering Markdown in user messages with this command: +You can specify default preferences for the web UI using `--ui-config ` or `--ui-config-file `. For example, you can disable pasting long text as attachments and enable rendering Markdown in user messages with this command: ```bash -./llama-server -m model.gguf --webui-config '{"pasteLongTextToFileLen": 0, "renderUserContentAsMarkdown": true}' +./llama-server -m model.gguf --ui-config '{"pasteLongTextToFileLen": 0, "renderUserContentAsMarkdown": true}' ``` -You may find available preferences in [settings-config.ts](webui/src/lib/constants/settings-config.ts). +> **Note:** The old flags `--webui-config` and `--webui-config-file` are deprecated but still work as aliases. + +You may find available preferences in [settings-config.ts](../ui/src/lib/constants/settings-config.ts). diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index d49c986feef..1dc19536866 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -671,7 +671,8 @@ struct server_context_impl { server_metrics metrics; - json json_webui_settings = json::object(); + json json_ui_settings = json::object(); // Primary: new name + json json_webui_settings = json::object(); // Deprecated: use json_ui_settings instead (kept for compat) // Necessary similarity of prompt for slot selection float slot_prompt_similarity = 0.0f; @@ -996,13 +997,18 @@ struct server_context_impl { } } - // populate webui settings + // populate UI settings (from either new ui_config_json or deprecated webui_config_json) { - if (!params_base.webui_config_json.empty()) { + const std::string & cfg = !params_base.ui_config_json.empty() + ? params_base.ui_config_json + : params_base.webui_config_json; + if (!cfg.empty()) { try { - json_webui_settings = json::parse(params_base.webui_config_json); + json json_settings = json::parse(cfg); + json_ui_settings = json_settings; + json_webui_settings = json_settings; // deprecated: keep in sync } catch (const std::exception & e) { - SRV_ERR("%s: failed to parse webui config: %s\n", __func__, e.what()); + SRV_ERR("%s: failed to parse UI config: %s\n", __func__, e.what()); return false; } } @@ -3292,7 +3298,8 @@ server_context_meta server_context::get_meta() const { /* has_mtmd */ impl->mctx != nullptr, /* has_inp_image */ impl->chat_params.allow_image, /* has_inp_audio */ impl->chat_params.allow_audio, - /* json_webui_settings */ impl->json_webui_settings, + /* json_ui_settings */ impl->json_ui_settings, + /* json_webui_settings */ impl->json_webui_settings, // Deprecated /* slot_n_ctx */ impl->get_slot_n_ctx(), /* pooling_type */ llama_pooling_type(impl->ctx_tgt), @@ -3814,8 +3821,12 @@ void server_routes::init_routes() { { "endpoint_slots", params.endpoint_slots }, { "endpoint_props", params.endpoint_props }, { "endpoint_metrics", params.endpoint_metrics }, - { "webui", params.webui }, - { "webui_settings", meta->json_webui_settings }, + // New keys + { "ui", params.ui }, + { "ui_settings", meta->json_ui_settings }, + // Deprecated: use ui/ui_settings instead (kept for backward compat) + { "webui", params.webui }, + { "webui_settings", meta->json_webui_settings }, { "chat_template", tmpl_default }, { "chat_template_caps", meta->chat_template_caps }, { "bos_token", meta->bos_token_str }, diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 58dda891441..65853438c93 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -21,7 +21,8 @@ struct server_context_meta { bool has_mtmd; bool has_inp_image; bool has_inp_audio; - json json_webui_settings; + json json_ui_settings; // Primary: new name + json json_webui_settings; // Deprecated: use json_ui_settings instead (kept for backward compat) int slot_n_ctx; enum llama_pooling_type pooling_type; diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index af4536fdd77..39a21f4ecc7 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -1,6 +1,7 @@ #include "common.h" #include "server-http.h" #include "server-common.h" +#include "ui.h" #include @@ -10,14 +11,6 @@ #include #include -#ifdef LLAMA_BUILD_WEBUI -// auto generated files (see README.md for details) -#include "index.html.hpp" -#include "bundle.js.hpp" -#include "bundle.css.hpp" -#include "loading.html.hpp" -#endif - // // HTTP implementation using cpp-httplib // @@ -238,10 +231,11 @@ bool server_http_context::init(const common_params & params) { }; auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) { - (void)req; // suppress unused parameter warning when LLAMA_BUILD_WEBUI is not defined + (void)req; // suppress unused parameter warning when LLAMA_BUILD_UI / LLAMA_BUILD_WEBUI is not defined bool ready = is_ready.load(); if (!ready) { -#ifdef LLAMA_BUILD_WEBUI +// Support both old and new preprocessor defines +#if defined(LLAMA_BUILD_UI) || defined(LLAMA_BUILD_WEBUI) auto tmp = string_split(req.path, '.'); if (req.path == "/" || (tmp.size() > 0 && tmp.back() == "html")) { res.status = 503; @@ -305,8 +299,10 @@ bool server_http_context::init(const common_params & params) { // Web UI setup // - if (!params.webui) { - SRV_INF("%s", "the WebUI is disabled\n"); + // Use new `params.ui` field (backed by old `params.webui` for compat) + if (!params.ui) { + SRV_INF("%s", "The UI is disabled\n"); + SRV_INF("%s", "Use --ui/--no-ui (or deprecated --webui/--no-webui) to enable/disable\n"); } else { // register static assets routes if (!params.public_path.empty()) { @@ -317,7 +313,8 @@ bool server_http_context::init(const common_params & params) { return 1; } } else { -#ifdef LLAMA_BUILD_WEBUI +// Support both old and new preprocessor defines +#if defined(LLAMA_BUILD_UI) || defined(LLAMA_BUILD_WEBUI) // using embedded static index.html srv->Get(params.api_prefix + "/", [](const httplib::Request & /*req*/, httplib::Response & res) { // COEP and COOP headers, required by pyodide (python interpreter) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 698489a11e2..433d2d8f04e 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -1152,14 +1152,17 @@ void server_models_routes::init_routes() { {"role", "router"}, {"max_instances", params.models_max}, {"models_autoload", params.models_autoload}, - // this is a dummy response to make sure webui doesn't break + // this is a dummy response to make sure the UI doesn't break {"model_alias", "llama-server"}, {"model_path", "none"}, {"default_generation_settings", { {"params", json{}}, {"n_ctx", 0}, }}, - {"webui_settings", webui_settings}, + // New key + {"ui_settings", ui_settings}, + // Deprecated: use ui_settings instead (kept for backward compat) + {"webui_settings", webui_settings}, {"build_info", std::string(llama_build_info())}, }); return res; diff --git a/tools/server/server-models.h b/tools/server/server-models.h index f1206c71448..e96d76c9169 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -175,15 +175,22 @@ struct server_models { struct server_models_routes { common_params params; - json webui_settings = json::object(); + json ui_settings = json::object(); // Primary: new name + json webui_settings = json::object(); // Deprecated: use ui_settings (kept for compat) server_models models; server_models_routes(const common_params & params, int argc, char ** argv) : params(params), models(params, argc, argv) { - if (!this->params.webui_config_json.empty()) { + // Support both new ui_config_json and deprecated webui_config_json + const std::string & cfg = !this->params.ui_config_json.empty() + ? this->params.ui_config_json + : this->params.webui_config_json; + if (!cfg.empty()) { try { - webui_settings = json::parse(this->params.webui_config_json); + json json_settings = json::parse(cfg); + ui_settings = json_settings; + webui_settings = json_settings; // Deprecated: keep in sync } catch (const std::exception & e) { - LOG_ERR("%s: failed to parse webui config: %s\n", __func__, e.what()); + LOG_ERR("%s: failed to parse UI config: %s\n", __func__, e.what()); throw; } } diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 823ae5bda36..a232550789c 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -208,7 +208,8 @@ int main(int argc, char ** argv) { ctx_http.register_gcp_compat(); // CORS proxy (EXPERIMENTAL, only used by the Web UI for MCP) - if (params.webui_mcp_proxy) { + // Supports both new ui_mcp_proxy and deprecated webui_mcp_proxy fields + if (params.ui_mcp_proxy || params.webui_mcp_proxy) { SRV_WRN("%s", "-----------------\n"); SRV_WRN("%s", "CORS proxy is enabled, do not expose server to untrusted environments\n"); SRV_WRN("%s", "This feature is EXPERIMENTAL and may be removed or changed in future versions\n"); diff --git a/tools/server/webui/scripts/post-build.sh b/tools/server/webui/scripts/post-build.sh deleted file mode 100755 index 55e46d5d5c6..00000000000 --- a/tools/server/webui/scripts/post-build.sh +++ /dev/null @@ -1,3 +0,0 @@ -rm -rf ../public/_app; -rm ../public/favicon.svg; -rm -f ../public/index.html.gz; # deprecated, but may still be generated by older versions of the build process diff --git a/tools/server/webui/src/lib/constants/localstorage-keys.ts b/tools/server/webui/src/lib/constants/localstorage-keys.ts deleted file mode 100644 index a04194c46f6..00000000000 --- a/tools/server/webui/src/lib/constants/localstorage-keys.ts +++ /dev/null @@ -1,6 +0,0 @@ -export const ALWAYS_ALLOWED_TOOLS_LOCALSTORAGE_KEY = 'LlamaCppWebui.alwaysAllowedTools'; -export const CONFIG_LOCALSTORAGE_KEY = 'LlamaCppWebui.config'; -export const DISABLED_TOOLS_LOCALSTORAGE_KEY = 'LlamaCppWebui.disabledTools'; -export const FAVORITE_MODELS_LOCALSTORAGE_KEY = 'LlamaCppWebui.favoriteModels'; -export const MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY = 'LlamaCppWebui.mcpDefaultEnabled'; -export const USER_OVERRIDES_LOCALSTORAGE_KEY = 'LlamaCppWebui.userOverrides'; diff --git a/tools/server/webui/.gitignore b/tools/ui/.gitignore similarity index 100% rename from tools/server/webui/.gitignore rename to tools/ui/.gitignore diff --git a/tools/server/webui/.npmrc b/tools/ui/.npmrc similarity index 100% rename from tools/server/webui/.npmrc rename to tools/ui/.npmrc diff --git a/tools/server/webui/.prettierignore b/tools/ui/.prettierignore similarity index 100% rename from tools/server/webui/.prettierignore rename to tools/ui/.prettierignore diff --git a/tools/server/webui/.prettierrc b/tools/ui/.prettierrc similarity index 100% rename from tools/server/webui/.prettierrc rename to tools/ui/.prettierrc diff --git a/tools/server/webui/.storybook/decorators/ModeWatcherDecorator.svelte b/tools/ui/.storybook/decorators/ModeWatcherDecorator.svelte similarity index 100% rename from tools/server/webui/.storybook/decorators/ModeWatcherDecorator.svelte rename to tools/ui/.storybook/decorators/ModeWatcherDecorator.svelte diff --git a/tools/server/webui/.storybook/decorators/TooltipProviderDecorator.svelte b/tools/ui/.storybook/decorators/TooltipProviderDecorator.svelte similarity index 100% rename from tools/server/webui/.storybook/decorators/TooltipProviderDecorator.svelte rename to tools/ui/.storybook/decorators/TooltipProviderDecorator.svelte diff --git a/tools/server/webui/.storybook/main.ts b/tools/ui/.storybook/main.ts similarity index 100% rename from tools/server/webui/.storybook/main.ts rename to tools/ui/.storybook/main.ts diff --git a/tools/server/webui/.storybook/preview.ts b/tools/ui/.storybook/preview.ts similarity index 100% rename from tools/server/webui/.storybook/preview.ts rename to tools/ui/.storybook/preview.ts diff --git a/tools/server/webui/.storybook/vitest.setup.ts b/tools/ui/.storybook/vitest.setup.ts similarity index 100% rename from tools/server/webui/.storybook/vitest.setup.ts rename to tools/ui/.storybook/vitest.setup.ts diff --git a/tools/ui/CMakeLists.txt b/tools/ui/CMakeLists.txt new file mode 100644 index 00000000000..9687ca92e55 --- /dev/null +++ b/tools/ui/CMakeLists.txt @@ -0,0 +1,157 @@ +set(TARGET llama-ui) + +# Deprecated: use LLAMA_UI_HF_BUCKET instead +set(LLAMA_WEBUI_HF_BUCKET "llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt webui assets (deprecated: use LLAMA_UI_HF_BUCKET)") +set(LLAMA_UI_HF_BUCKET "llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt UI assets") + +# Backward compat: forward old var to new one +if(DEFINED LLAMA_WEBUI_HF_BUCKET AND NOT DEFINED LLAMA_UI_HF_BUCKET) + set(LLAMA_UI_HF_BUCKET ${LLAMA_WEBUI_HF_BUCKET}) +elseif(DEFINED LLAMA_WEBUI_HF_BUCKET AND NOT "${LLAMA_WEBUI_HF_BUCKET}" STREQUAL "${LLAMA_UI_HF_BUCKET}") + message(DEPRECATION "LLAMA_WEBUI_HF_BUCKET is deprecated, use LLAMA_UI_HF_BUCKET instead") +endif() + +set(TARGET_SRCS "") +set(UI_COMPILE_DEFS "") + +# Support both old (LLAMA_BUILD_WEBUI) and new (LLAMA_BUILD_UI) option names +if(LLAMA_BUILD_WEBUI OR LLAMA_BUILD_UI) + if(LLAMA_BUILD_WEBUI AND NOT LLAMA_BUILD_UI) + message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead") + endif() + + set(PUBLIC_ASSETS + index.html + bundle.js + bundle.css + loading.html + ) + + # Determine source of UI assets (priority: local > HF Bucket) + set(UI_SOURCE "") + set(UI_SOURCE_DIR "") + + # Priority 1: Check for local build output + set(LOCAL_UI_DIR "${PROJECT_SOURCE_DIR}/build/tools/ui/dist") + + # Verify all required assets exist before declaring local source valid + set(ALL_ASSETS_PRESENT TRUE) + foreach(asset ${PUBLIC_ASSETS}) + if(NOT EXISTS "${LOCAL_UI_DIR}/${asset}") + set(ALL_ASSETS_PRESENT FALSE) + break() + endif() + endforeach() + + if(ALL_ASSETS_PRESENT) + set(UI_SOURCE "local") + set(UI_SOURCE_DIR "${LOCAL_UI_DIR}") + message(STATUS "UI: using local build from ${UI_SOURCE_DIR}") + endif() + + # Priority 2: Build-time asset provisioning (npm build → HF Bucket fallback) + if(NOT UI_SOURCE_DIR) + # Environment variable takes precedence (e.g., from CI workflows) + # Deprecated: use HF_UI_VERSION instead + if(DEFINED ENV{HF_WEBUI_VERSION}) + set(HF_UI_VERSION "$ENV{HF_WEBUI_VERSION}") + message(DEPRECATION "HF_WEBUI_VERSION env var is deprecated, use HF_UI_VERSION instead") + if(NOT HF_UI_VERSION MATCHES "^[A-Za-z0-9._-]+$") + message(FATAL_ERROR "UI: invalid HF_WEBUI_VERSION='${HF_UI_VERSION}' - must match ^[A-Za-z0-9._-]+$") + endif() + elseif(DEFINED ENV{HF_UI_VERSION}) + set(HF_UI_VERSION "$ENV{HF_UI_VERSION}") + if(NOT HF_UI_VERSION MATCHES "^[A-Za-z0-9._-]+$") + message(FATAL_ERROR "UI: invalid HF_UI_VERSION='${HF_UI_VERSION}' - must match ^[A-Za-z0-9._-]+$") + endif() + elseif(DEFINED LLAMA_BUILD_NUMBER) + set(HF_UI_VERSION "b${LLAMA_BUILD_NUMBER}") + message(STATUS "UI: derived HF_UI_VERSION=b${LLAMA_BUILD_NUMBER}") + else() + set(HF_UI_VERSION "") + message(STATUS "UI: version not specified (will use HF 'latest')") + endif() + + if("${HF_UI_VERSION}" STREQUAL "") + set(UI_VERSION_TAG "provisioned") + else() + set(UI_VERSION_TAG "${HF_UI_VERSION}") + endif() + set(UI_STAMP "${CMAKE_CURRENT_BINARY_DIR}/.ui-${UI_VERSION_TAG}.stamp") + + string(REPLACE ";" "+" PUBLIC_ASSETS_JOINED "${PUBLIC_ASSETS}") + + add_custom_command( + OUTPUT ${UI_STAMP} + COMMAND ${CMAKE_COMMAND} + "-DSOURCE_DIR=${PROJECT_SOURCE_DIR}" + "-DPUBLIC_DIR=${PROJECT_SOURCE_DIR}/build/tools/ui/dist" + "-DHF_BUCKET=${LLAMA_UI_HF_BUCKET}" + "-DHF_VERSION=${HF_UI_VERSION}" + "-DHF_ENABLED=${LLAMA_USE_PREBUILT_UI}" + "-DASSETS=${PUBLIC_ASSETS_JOINED}" + "-DSTAMP_FILE=${UI_STAMP}" + "-DNPM_DIR=${PROJECT_SOURCE_DIR}/tools/ui" + -P ${PROJECT_SOURCE_DIR}/scripts/ui-download.cmake + COMMENT "Building/provisioning UI assets (npm build -> HF Bucket fallback)" + ) + + set(UI_SOURCE "provisioned") + set(UI_SOURCE_DIR "${PROJECT_SOURCE_DIR}/build/tools/ui/dist") + endif() + + # Process assets from the determined source + if(UI_SOURCE_DIR) + foreach(asset ${PUBLIC_ASSETS}) + set(input "${UI_SOURCE_DIR}/${asset}") + set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp") + list(APPEND TARGET_SRCS ${output}) + + if(UI_SOURCE STREQUAL "local") + if(NOT EXISTS "${input}") + message(FATAL_ERROR "UI asset not found: ${input}") + endif() + set(dependency "${input}") + else() + set(dependency "${UI_STAMP}") + endif() + + add_custom_command( + DEPENDS ${dependency} + OUTPUT "${output}" + COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake" + ) + set_source_files_properties(${output} PROPERTIES GENERATED TRUE) + endforeach() + + list(APPEND UI_COMPILE_DEFS + LLAMA_BUILD_WEBUI # Deprecated: use LLAMA_BUILD_UI + LLAMA_BUILD_UI + LLAMA_WEBUI_DEFAULT_ENABLED=1 # Deprecated: use LLAMA_UI_DEFAULT_ENABLED + LLAMA_UI_DEFAULT_ENABLED=1 + ) + message(STATUS "UI: embedded with source: ${UI_SOURCE}") + else() + message(WARNING "UI: no source available. Neither local build (build/tools/ui/dist/) nor HF Bucket download succeeded.") + message(WARNING "UI: building server without embedded UI. Set LLAMA_BUILD_UI=OFF to suppress this warning.") + list(APPEND UI_COMPILE_DEFS LLAMA_WEBUI_DEFAULT_ENABLED=0 LLAMA_UI_DEFAULT_ENABLED=0) + endif() +else() + list(APPEND UI_COMPILE_DEFS LLAMA_WEBUI_DEFAULT_ENABLED=0 LLAMA_UI_DEFAULT_ENABLED=0) +endif() + +# Build the static library +add_library(${TARGET} STATIC ui.cpp) + +target_include_directories(${TARGET} PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR} +) + +target_compile_definitions(${TARGET} PUBLIC ${UI_COMPILE_DEFS}) + +if(TARGET_SRCS) + # List generated .hpp files as sources so CMake tracks them as build dependencies + target_sources(${TARGET} PRIVATE ${TARGET_SRCS}) + set_source_files_properties(${TARGET_SRCS} PROPERTIES HEADER_FILE_ONLY TRUE) +endif() diff --git a/tools/server/webui/README.md b/tools/ui/README.md similarity index 96% rename from tools/server/webui/README.md rename to tools/ui/README.md index 40742b00e1a..abbbabe923d 100644 --- a/tools/server/webui/README.md +++ b/tools/ui/README.md @@ -2,7 +2,7 @@ A modern, feature-rich web interface for llama-server built with SvelteKit. This UI provides an intuitive chat interface with advanced file handling, conversation management, and comprehensive model interaction capabilities. -The WebUI supports two server operation modes: +Llama UI supports two server operation modes: - **MODEL mode** - Single model operation (standard llama-server) - **ROUTER mode** - Multi-model operation with dynamic model loading/unloading @@ -88,7 +88,7 @@ The WebUI supports two server operation modes: ### 1. Install Dependencies ```bash -cd tools/server/webui +cd tools/ui npm install ``` @@ -112,7 +112,7 @@ npm run dev This starts: -- **Vite dev server** at `http://localhost:5173` - The main WebUI +- **Vite dev server** at `http://localhost:5173` - The main UI frontend app - **Storybook** at `http://localhost:6006` - Component documentation The Vite dev server proxies API requests to `http://localhost:8080` (default llama-server port): @@ -186,7 +186,7 @@ npm run build The build process: 1. **Vite Build** - Bundles all TypeScript, Svelte, and CSS -2. **Static Adapter** - Outputs to `../public` (llama-server's static file directory) +2. **Static Adapter** - Outputs to `../../build/tools/ui/dist` (llama-server's static file directory) 3. **Post-Build Script** - Cleans up intermediate files 4. **Custom Plugin** - Creates `index.html` with: - Inlined favicon as base64 @@ -194,7 +194,7 @@ The build process: - Deterministic output (zeroed timestamps) ```text -tools/server/webui/ → build → tools/server/public/ +tools/ui/ → build → build/tools/ui/dist/ ├── src/ ├── index.html (served by llama-server) ├── static/ └── (favicon inlined) └── ... @@ -205,8 +205,8 @@ tools/server/webui/ → build → tools/server/public/ ```javascript // svelte.config.js adapter: adapter({ - pages: '../public', // Output directory - assets: '../public', // Static assets + pages: '../../build/tools/ui/dist', // Output directory + assets: '../../build/tools/ui/dist', // Static assets fallback: 'index.html', // SPA fallback strict: true }), @@ -217,20 +217,19 @@ output: { ### Integration with llama-server -The WebUI is embedded directly into the llama-server binary: +llama-ui is embedded directly into the llama-server binary: -1. `npm run build` outputs `index.html` to `tools/server/public/` +1. `npm run build` outputs `index.html` to `build/tools/ui/dist/` 2. llama-server compiles this into the binary at build time -3. When accessing `/`, llama-server serves the gzipped HTML -4. All assets are inlined (CSS, JS, fonts, favicon) +3. When accessing `/`, llama-server serves the bundled HTML -This results in a **single portable binary** with the full WebUI included. +This results in a **single portable binary** with the full Llama UI included. --- ## Architecture -The WebUI follows a layered architecture with unidirectional data flow: +Llama UI follows a layered architecture with unidirectional data flow: ```text Routes → Components → Hooks → Stores → Services → Storage/API @@ -659,7 +658,7 @@ npm run check # TypeScript type checking ## Project Structure ```text -tools/server/webui/ +tools/ui/ ├── src/ │ ├── lib/ │ │ ├── components/ # UI components (app/, ui/) diff --git a/tools/server/webui/components.json b/tools/ui/components.json similarity index 100% rename from tools/server/webui/components.json rename to tools/ui/components.json diff --git a/tools/server/webui/docs/architecture/high-level-architecture-simplified.md b/tools/ui/docs/architecture/high-level-architecture-simplified.md similarity index 100% rename from tools/server/webui/docs/architecture/high-level-architecture-simplified.md rename to tools/ui/docs/architecture/high-level-architecture-simplified.md diff --git a/tools/server/webui/docs/architecture/high-level-architecture.md b/tools/ui/docs/architecture/high-level-architecture.md similarity index 100% rename from tools/server/webui/docs/architecture/high-level-architecture.md rename to tools/ui/docs/architecture/high-level-architecture.md diff --git a/tools/server/webui/docs/flows/chat-flow.md b/tools/ui/docs/flows/chat-flow.md similarity index 100% rename from tools/server/webui/docs/flows/chat-flow.md rename to tools/ui/docs/flows/chat-flow.md diff --git a/tools/server/webui/docs/flows/conversations-flow.md b/tools/ui/docs/flows/conversations-flow.md similarity index 100% rename from tools/server/webui/docs/flows/conversations-flow.md rename to tools/ui/docs/flows/conversations-flow.md diff --git a/tools/server/webui/docs/flows/data-flow-simplified-model-mode.md b/tools/ui/docs/flows/data-flow-simplified-model-mode.md similarity index 100% rename from tools/server/webui/docs/flows/data-flow-simplified-model-mode.md rename to tools/ui/docs/flows/data-flow-simplified-model-mode.md diff --git a/tools/server/webui/docs/flows/data-flow-simplified-router-mode.md b/tools/ui/docs/flows/data-flow-simplified-router-mode.md similarity index 100% rename from tools/server/webui/docs/flows/data-flow-simplified-router-mode.md rename to tools/ui/docs/flows/data-flow-simplified-router-mode.md diff --git a/tools/server/webui/docs/flows/database-flow.md b/tools/ui/docs/flows/database-flow.md similarity index 100% rename from tools/server/webui/docs/flows/database-flow.md rename to tools/ui/docs/flows/database-flow.md diff --git a/tools/server/webui/docs/flows/mcp-flow.md b/tools/ui/docs/flows/mcp-flow.md similarity index 100% rename from tools/server/webui/docs/flows/mcp-flow.md rename to tools/ui/docs/flows/mcp-flow.md diff --git a/tools/server/webui/docs/flows/models-flow.md b/tools/ui/docs/flows/models-flow.md similarity index 100% rename from tools/server/webui/docs/flows/models-flow.md rename to tools/ui/docs/flows/models-flow.md diff --git a/tools/server/webui/docs/flows/server-flow.md b/tools/ui/docs/flows/server-flow.md similarity index 100% rename from tools/server/webui/docs/flows/server-flow.md rename to tools/ui/docs/flows/server-flow.md diff --git a/tools/server/webui/docs/flows/settings-flow.md b/tools/ui/docs/flows/settings-flow.md similarity index 98% rename from tools/server/webui/docs/flows/settings-flow.md rename to tools/ui/docs/flows/settings-flow.md index 40ad3bd94d7..260713a17b8 100644 --- a/tools/server/webui/docs/flows/settings-flow.md +++ b/tools/ui/docs/flows/settings-flow.md @@ -58,8 +58,8 @@ sequenceDiagram end end - alt serverStore.props has webuiSettings - settingsStore->>settingsStore: Apply webuiSettings from server + alt serverStore.props has uiSettings + settingsStore->>settingsStore: Apply uiSettings from server Note right of settingsStore: Server-provided UI settings
(e.g. showRawOutputSwitch) end diff --git a/tools/server/webui/eslint.config.js b/tools/ui/eslint.config.js similarity index 93% rename from tools/server/webui/eslint.config.js rename to tools/ui/eslint.config.js index cd20fb383a4..185da1dabbe 100644 --- a/tools/server/webui/eslint.config.js +++ b/tools/ui/eslint.config.js @@ -29,7 +29,9 @@ export default ts.config( 'no-undef': 'off', 'svelte/no-at-html-tags': 'off', // This app uses hash-based routing (#/) where resolve() from $app/paths does not apply - 'svelte/no-navigation-without-resolve': 'off' + 'svelte/no-navigation-without-resolve': 'off', + // Enforce empty line at end of file + 'eol-last': 'error' } }, { diff --git a/tools/server/webui/package-lock.json b/tools/ui/package-lock.json similarity index 100% rename from tools/server/webui/package-lock.json rename to tools/ui/package-lock.json diff --git a/tools/server/webui/package.json b/tools/ui/package.json similarity index 98% rename from tools/server/webui/package.json rename to tools/ui/package.json index 2338c38402a..5a1cec66645 100644 --- a/tools/server/webui/package.json +++ b/tools/ui/package.json @@ -5,7 +5,7 @@ "type": "module", "scripts": { "dev": "bash scripts/dev.sh", - "build": "vite build && ./scripts/post-build.sh", + "build": "vite build", "preview": "vite preview", "prepare": "svelte-kit sync || echo ''", "check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json", diff --git a/tools/server/webui/playwright.config.ts b/tools/ui/playwright.config.ts similarity index 70% rename from tools/server/webui/playwright.config.ts rename to tools/ui/playwright.config.ts index 26d3be535d1..178fd7ba856 100644 --- a/tools/server/webui/playwright.config.ts +++ b/tools/ui/playwright.config.ts @@ -2,7 +2,7 @@ import { defineConfig } from '@playwright/test'; export default defineConfig({ webServer: { - command: 'npm run build && http-server ../public -p 8181', + command: 'npm run build && http-server ../../build/tools/ui/dist -p 8181', port: 8181, timeout: 120000, reuseExistingServer: false diff --git a/tools/server/webui/scripts/dev.sh b/tools/ui/scripts/dev.sh similarity index 89% rename from tools/server/webui/scripts/dev.sh rename to tools/ui/scripts/dev.sh index 97c14b87328..9256f255aec 100644 --- a/tools/server/webui/scripts/dev.sh +++ b/tools/ui/scripts/dev.sh @@ -2,14 +2,14 @@ # Development script for llama-ui # -# This script starts the webui development servers (Storybook and Vite). +# This script starts the llama-ui development servers (Storybook and Vite). # Note: You need to start llama-server separately. # # Usage: # bash scripts/dev.sh # npm run dev -cd ../../../ +cd ../../ # Check and install git hooks if missing check_and_install_hooks() { @@ -22,13 +22,13 @@ check_and_install_hooks() { if [ "$hooks_missing" = true ]; then echo "🔧 Git hooks missing, installing them..." - cd tools/server/webui + cd tools/ui if bash scripts/install-git-hooks.sh; then echo "✅ Git hooks installed successfully" else echo "⚠️ Failed to install git hooks, continuing anyway..." fi - cd ../../../ + cd ../../ else echo "✅ Git hooks already installed" fi @@ -48,7 +48,7 @@ trap cleanup SIGINT SIGTERM echo "🚀 Starting development servers..." echo "📝 Note: Make sure to start llama-server separately if needed" -cd tools/server/webui +cd tools/ui # Use --insecure-http-parser to handle malformed HTTP responses from llama-server # (some responses have both Content-Length and Transfer-Encoding headers) storybook dev -p 6006 --ci & NODE_OPTIONS="--insecure-http-parser" vite dev --host 0.0.0.0 & diff --git a/tools/server/webui/scripts/install-git-hooks.sh b/tools/ui/scripts/install-git-hooks.sh similarity index 62% rename from tools/server/webui/scripts/install-git-hooks.sh rename to tools/ui/scripts/install-git-hooks.sh index 8aa1014ba36..213feb08dcf 100755 --- a/tools/server/webui/scripts/install-git-hooks.sh +++ b/tools/ui/scripts/install-git-hooks.sh @@ -1,29 +1,29 @@ #!/bin/bash -# Script to install pre-commit hook for webui -# Pre-commit: formats, checks, and builds webui +# Script to install pre-commit hook for llama-ui +# Pre-commit: formats, checks, and builds the UI app REPO_ROOT=$(git rev-parse --show-toplevel) PRE_COMMIT_HOOK="$REPO_ROOT/.git/hooks/pre-commit" -echo "Installing pre-commit hook for webui..." +echo "Installing pre-commit hook for llama-ui..." # Create the pre-commit hook cat > "$PRE_COMMIT_HOOK" << 'EOF' #!/bin/bash -# Check if there are any changes in the webui directory -if git diff --cached --name-only | grep -q "^tools/server/webui/"; then +# Check if there are any changes in the tools/ui directory +if git diff --cached --name-only | grep -q "^tools/ui/"; then REPO_ROOT=$(git rev-parse --show-toplevel) - cd "$REPO_ROOT/tools/server/webui" + cd "$REPO_ROOT/tools/ui" # Check if package.json exists if [ ! -f "package.json" ]; then - echo "Error: package.json not found in tools/server/webui" + echo "Error: package.json not found in tools/ui" exit 1 fi - echo "Formatting and checking webui code..." + echo "Formatting and checking llama-ui code..." # Run the format command npm run format @@ -46,17 +46,17 @@ if git diff --cached --name-only | grep -q "^tools/server/webui/"; then exit 1 fi - echo "✅ Webui code formatted and checked successfully" + echo "✅ llama-ui code formatted and checked successfully" - # Build the webui - echo "Building webui..." + # Build the llama-ui + echo "Building llama-ui..." npm run build if [ $? -ne 0 ]; then echo "❌ npm run build failed" exit 1 fi - echo "✅ Webui built successfully" + echo "✅ llama-ui built successfully" fi exit 0 @@ -70,8 +70,8 @@ if [ $? -eq 0 ]; then echo " Pre-commit: $PRE_COMMIT_HOOK" echo "" echo "The hook will automatically:" - echo " • Format, lint and check webui code before commits" - echo " • Build webui" + echo " • Format, lint and check llama-ui code before commits" + echo " • Build llama-ui" else echo "❌ Failed to make hook executable" exit 1 diff --git a/tools/server/webui/scripts/vite-plugin-llama-cpp-build.ts b/tools/ui/scripts/vite-plugin-llama-cpp-build.ts similarity index 64% rename from tools/server/webui/scripts/vite-plugin-llama-cpp-build.ts rename to tools/ui/scripts/vite-plugin-llama-cpp-build.ts index 0330a1dda12..ddf6fa1e56e 100644 --- a/tools/server/webui/scripts/vite-plugin-llama-cpp-build.ts +++ b/tools/ui/scripts/vite-plugin-llama-cpp-build.ts @@ -1,4 +1,12 @@ -import { readFileSync, writeFileSync, existsSync, readdirSync, copyFileSync } from 'fs'; +import { + readFileSync, + writeFileSync, + existsSync, + readdirSync, + copyFileSync, + rmSync, + unlinkSync +} from 'fs'; import { resolve } from 'path'; import type { Plugin } from 'vite'; @@ -11,28 +19,28 @@ const GUIDE_FOR_FRONTEND = ` --> `.trim(); +const OUTPUT_DIR = '../../build/tools/ui/dist'; + export function llamaCppBuildPlugin(): Plugin { return { name: 'llamacpp:build', apply: 'build', closeBundle() { - // Ensure the SvelteKit adapter has finished writing to ../public setTimeout(() => { try { - const indexPath = resolve('../public/index.html'); + const outDir = resolve(OUTPUT_DIR); + const indexPath = resolve(outDir, 'index.html'); if (!existsSync(indexPath)) return; let content = readFileSync(indexPath, 'utf-8'); + // Inline favicon as base64 data URL const faviconPath = resolve('static/favicon.svg'); - if (existsSync(faviconPath)) { const faviconContent = readFileSync(faviconPath, 'utf-8'); const faviconBase64 = Buffer.from(faviconContent).toString('base64'); const faviconDataUrl = `data:image/svg+xml;base64,${faviconBase64}`; - content = content.replace(/href="[^"]*favicon\.svg"/g, `href="${faviconDataUrl}"`); - console.log('✓ Inlined favicon.svg as base64 data URL'); } @@ -48,17 +56,16 @@ export function llamaCppBuildPlugin(): Plugin { writeFileSync(indexPath, content, 'utf-8'); console.log('✓ Updated index.html'); - // Copy bundle.*.js -> ../public/bundle.js - const immutableDir = resolve('../public/_app/immutable'); - const bundleDir = resolve('../public/_app/immutable/assets'); + // Copy bundle.*.js -> bundle.js at output root + const immutableDir = resolve(outDir, '_app/immutable'); + const bundleDir = resolve(outDir, '_app/immutable/assets'); if (existsSync(immutableDir)) { const jsFiles = readdirSync(immutableDir).filter((f) => f.match(/^bundle\..+\.js$/)); - if (jsFiles.length > 0) { - copyFileSync(resolve(immutableDir, jsFiles[0]), resolve('../public/bundle.js')); + copyFileSync(resolve(immutableDir, jsFiles[0]), resolve(outDir, 'bundle.js')); // Normalize __sveltekit_ to __sveltekit__ in bundle.js - const bundleJsPath = resolve('../public/bundle.js'); + const bundleJsPath = resolve(outDir, 'bundle.js'); let bundleJs = readFileSync(bundleJsPath, 'utf-8'); bundleJs = bundleJs.replace(/__sveltekit_[a-z0-9]+/g, '__sveltekit__'); writeFileSync(bundleJsPath, bundleJs, 'utf-8'); @@ -66,17 +73,29 @@ export function llamaCppBuildPlugin(): Plugin { } } - // Copy bundle.*.css -> ../public/bundle.css + // Copy bundle.*.css -> bundle.css at output root if (existsSync(bundleDir)) { const cssFiles = readdirSync(bundleDir).filter((f) => f.match(/^bundle\..+\.css$/)); - if (cssFiles.length > 0) { - copyFileSync(resolve(bundleDir, cssFiles[0]), resolve('../public/bundle.css')); + copyFileSync(resolve(bundleDir, cssFiles[0]), resolve(outDir, 'bundle.css')); console.log(`✓ Copied ${cssFiles[0]} -> bundle.css`); } } + + // Cleanup: remove _app directory, favicon.svg, and legacy index.html.gz + const appDir = resolve(outDir, '_app'); + if (existsSync(appDir)) { + rmSync(appDir, { recursive: true, force: true }); + console.log('✓ Removed _app directory'); + } + + const faviconOut = resolve(outDir, 'favicon.svg'); + if (existsSync(faviconOut)) { + unlinkSync(faviconOut); + console.log('✓ Removed favicon.svg'); + } } catch (error) { - console.error('Failed to update index.html:', error); + console.error('Failed to process build output:', error); } }, 100); } diff --git a/tools/server/webui/src/app.css b/tools/ui/src/app.css similarity index 100% rename from tools/server/webui/src/app.css rename to tools/ui/src/app.css diff --git a/tools/server/webui/src/app.d.ts b/tools/ui/src/app.d.ts similarity index 100% rename from tools/server/webui/src/app.d.ts rename to tools/ui/src/app.d.ts diff --git a/tools/server/webui/src/app.html b/tools/ui/src/app.html similarity index 100% rename from tools/server/webui/src/app.html rename to tools/ui/src/app.html diff --git a/tools/server/webui/src/lib/actions/fade-in-view.svelte.ts b/tools/ui/src/lib/actions/fade-in-view.svelte.ts similarity index 100% rename from tools/server/webui/src/lib/actions/fade-in-view.svelte.ts rename to tools/ui/src/lib/actions/fade-in-view.svelte.ts diff --git a/tools/server/webui/src/lib/components/app/SKILL.md b/tools/ui/src/lib/components/app/SKILL.md similarity index 100% rename from tools/server/webui/src/lib/components/app/SKILL.md rename to tools/ui/src/lib/components/app/SKILL.md diff --git a/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte b/tools/ui/src/lib/components/app/actions/ActionIcon.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte rename to tools/ui/src/lib/components/app/actions/ActionIcon.svelte diff --git a/tools/server/webui/src/lib/components/app/actions/ActionIconCopyToClipboard.svelte b/tools/ui/src/lib/components/app/actions/ActionIconCopyToClipboard.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/actions/ActionIconCopyToClipboard.svelte rename to tools/ui/src/lib/components/app/actions/ActionIconCopyToClipboard.svelte diff --git a/tools/server/webui/src/lib/components/app/actions/index.ts b/tools/ui/src/lib/components/app/actions/index.ts similarity index 100% rename from tools/server/webui/src/lib/components/app/actions/index.ts rename to tools/ui/src/lib/components/app/actions/index.ts diff --git a/tools/server/webui/src/lib/components/app/badges/BadgeInfo.svelte b/tools/ui/src/lib/components/app/badges/BadgeInfo.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/badges/BadgeInfo.svelte rename to tools/ui/src/lib/components/app/badges/BadgeInfo.svelte diff --git a/tools/server/webui/src/lib/components/app/badges/BadgesModality.svelte b/tools/ui/src/lib/components/app/badges/BadgesModality.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/badges/BadgesModality.svelte rename to tools/ui/src/lib/components/app/badges/BadgesModality.svelte diff --git a/tools/server/webui/src/lib/components/app/badges/index.ts b/tools/ui/src/lib/components/app/badges/index.ts similarity index 100% rename from tools/server/webui/src/lib/components/app/badges/index.ts rename to tools/ui/src/lib/components/app/badges/index.ts diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsList.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsList.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsList.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsList.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItem.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItem.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItem.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItem.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpPrompt.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpPrompt.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpPrompt.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpPrompt.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpResource.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpResource.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpResource.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpResource.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailFile.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailFile.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailFile.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailFile.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailImage.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailImage.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailImage.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailImage.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItem.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItem.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItem.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItem.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemAudio.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemAudio.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemAudio.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemAudio.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemImage.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemImage.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemImage.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemImage.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemPdf.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemPdf.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemPdf.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemPdf.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemText.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemText.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemText.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemText.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemUnavailable.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemUnavailable.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemUnavailable.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemUnavailable.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewFileInfo.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewFileInfo.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewFileInfo.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewFileInfo.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewNavButtons.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewNavButtons.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewNavButtons.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewNavButtons.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewThumbnailStrip.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewThumbnailStrip.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewThumbnailStrip.svelte rename to tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewThumbnailStrip.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddButton.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddButton.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddButton.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddButton.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddDropdown.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddDropdown.svelte similarity index 97% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddDropdown.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddDropdown.svelte index 175eb3c8c7f..e053e6f836b 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddDropdown.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddDropdown.svelte @@ -54,7 +54,12 @@ } const attachmentMenu = useAttachmentMenu( - () => ({ hasVisionModality, hasAudioModality, hasMcpPromptsSupport, hasMcpResourcesSupport }), + () => ({ + hasVisionModality, + hasAudioModality, + hasMcpPromptsSupport, + hasMcpResourcesSupport + }), () => ({ onFileUpload, onSystemPromptClick, onMcpPromptClick, onMcpResourcesClick }), () => { dropdownOpen = false; diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddMcpServersSubmenu.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddMcpServersSubmenu.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddMcpServersSubmenu.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddMcpServersSubmenu.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte similarity index 98% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte index 99daa10e3e2..4713ec47758 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte @@ -46,7 +46,12 @@ let sheetOpen = $state(false); const attachmentMenu = useAttachmentMenu( - () => ({ hasVisionModality, hasAudioModality, hasMcpPromptsSupport, hasMcpResourcesSupport }), + () => ({ + hasVisionModality, + hasAudioModality, + hasMcpPromptsSupport, + hasMcpResourcesSupport + }), () => ({ onFileUpload, onSystemPromptClick, onMcpPromptClick, onMcpResourcesClick }), () => { sheetOpen = false; diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte similarity index 97% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte index ccc35d98f87..b11467da823 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte @@ -5,6 +5,7 @@ import * as DropdownMenu from '$lib/components/ui/dropdown-menu'; import * as Tooltip from '$lib/components/ui/tooltip'; import { toolsStore } from '$lib/stores/tools.svelte'; + import { CLI_FLAGS } from '$lib/constants'; import { mcpStore } from '$lib/stores/mcp.svelte'; import { useToolsPanel } from '$lib/hooks/use-tools-panel.svelte'; @@ -33,7 +34,7 @@ - Run llama-server with --tools flag to enable + Run llama-server with {CLI_FLAGS.TOOLS} flag to enable Built-in Tools. diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionsAdd.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionRecord.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionRecord.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionRecord.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionRecord.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionSubmit.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionSubmit.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionSubmit.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionSubmit.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormFileInputInvisible.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormFileInputInvisible.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormFileInputInvisible.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormFileInputInvisible.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormMcpResourcesList.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormMcpResourcesList.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormMcpResourcesList.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormMcpResourcesList.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerItemHeader.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerItemHeader.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerItemHeader.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerItemHeader.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerList.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerList.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerList.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerList.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItem.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItem.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItem.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItem.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItemSkeleton.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItemSkeleton.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItemSkeleton.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerListItemSkeleton.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerPopover.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerPopover.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerPopover.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPicker/ChatFormPickerPopover.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPickerMcpPrompts.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPickerMcpPrompts.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPickerMcpPrompts.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPickerMcpPrompts.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentForm.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentForm.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentForm.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentForm.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentInput.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentInput.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentInput.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPromptPickerArgumentInput.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpResources.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpResources.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpResources.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpResources.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickers.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickers.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickers.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickers.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormTextarea.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormTextarea.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormTextarea.svelte rename to tools/ui/src/lib/components/app/chat/ChatForm/ChatFormTextarea.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessage.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessage.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessage.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessage.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageAssistant/ChatMessageAssistant.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageAssistant/ChatMessageAssistant.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageAssistant/ChatMessageAssistant.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageAssistant/ChatMessageAssistant.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPrompt.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPrompt.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPrompt.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPrompt.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPromptContent.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPromptContent.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPromptContent.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageMcpPrompt/ChatMessageMcpPromptContent.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageSystem/ChatMessageSystem.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageSystem/ChatMessageSystem.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageSystem/ChatMessageSystem.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageSystem/ChatMessageSystem.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUser.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUser.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUser.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUser.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserBubble.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserBubble.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserBubble.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserBubble.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserPending.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserPending.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserPending.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessage/ChatMessageUser/ChatMessageUserPending.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCard.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCard.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCard.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCard.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardContinueRequest.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardContinueRequest.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardContinueRequest.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardContinueRequest.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardPermissionRequest.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardPermissionRequest.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardPermissionRequest.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionCard/ChatMessageActionCardPermissionRequest.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIcons.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIcons.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIcons.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIcons.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIconsBranchingControls.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIconsBranchingControls.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIconsBranchingControls.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageActions/ChatMessageActionIcons/ChatMessageActionIconsBranchingControls.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte similarity index 99% rename from tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte rename to tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte index e9b77ba2f4f..3a9cc7e9356 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte @@ -274,7 +274,9 @@ {:else if section.toolResult}
{#each section.parsedLines as line, i (i)} -
{line.text}
+
+ {line.text} +
{#if line.image} (Run
llama-server
with -
--webui-mcp-proxy
+
{CLI_FLAGS.MCP_PROXY}
flag) {/if} diff --git a/tools/server/webui/src/lib/components/app/mcp/McpServerIdentity.svelte b/tools/ui/src/lib/components/app/mcp/McpServerIdentity.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/mcp/McpServerIdentity.svelte rename to tools/ui/src/lib/components/app/mcp/McpServerIdentity.svelte diff --git a/tools/server/webui/src/lib/components/app/mcp/McpServerInfo.svelte b/tools/ui/src/lib/components/app/mcp/McpServerInfo.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/mcp/McpServerInfo.svelte rename to tools/ui/src/lib/components/app/mcp/McpServerInfo.svelte diff --git a/tools/server/webui/src/lib/components/app/mcp/index.ts b/tools/ui/src/lib/components/app/mcp/index.ts similarity index 100% rename from tools/server/webui/src/lib/components/app/mcp/index.ts rename to tools/ui/src/lib/components/app/mcp/index.ts diff --git a/tools/server/webui/src/lib/components/app/misc/CodeBlockActions.svelte b/tools/ui/src/lib/components/app/misc/CodeBlockActions.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/misc/CodeBlockActions.svelte rename to tools/ui/src/lib/components/app/misc/CodeBlockActions.svelte diff --git a/tools/server/webui/src/lib/components/app/misc/ConversationSelection.svelte b/tools/ui/src/lib/components/app/misc/ConversationSelection.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/misc/ConversationSelection.svelte rename to tools/ui/src/lib/components/app/misc/ConversationSelection.svelte diff --git a/tools/server/webui/src/lib/components/app/misc/HorizontalScrollCarousel.svelte b/tools/ui/src/lib/components/app/misc/HorizontalScrollCarousel.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/misc/HorizontalScrollCarousel.svelte rename to tools/ui/src/lib/components/app/misc/HorizontalScrollCarousel.svelte diff --git a/tools/server/webui/src/lib/components/app/misc/KeyboardShortcutInfo.svelte b/tools/ui/src/lib/components/app/misc/KeyboardShortcutInfo.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/misc/KeyboardShortcutInfo.svelte rename to tools/ui/src/lib/components/app/misc/KeyboardShortcutInfo.svelte diff --git a/tools/server/webui/src/lib/components/app/misc/TruncatedText.svelte b/tools/ui/src/lib/components/app/misc/TruncatedText.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/misc/TruncatedText.svelte rename to tools/ui/src/lib/components/app/misc/TruncatedText.svelte diff --git a/tools/server/webui/src/lib/components/app/misc/index.ts b/tools/ui/src/lib/components/app/misc/index.ts similarity index 100% rename from tools/server/webui/src/lib/components/app/misc/index.ts rename to tools/ui/src/lib/components/app/misc/index.ts diff --git a/tools/server/webui/src/lib/components/app/models/ModelBadge.svelte b/tools/ui/src/lib/components/app/models/ModelBadge.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/models/ModelBadge.svelte rename to tools/ui/src/lib/components/app/models/ModelBadge.svelte diff --git a/tools/server/webui/src/lib/components/app/models/ModelId.svelte b/tools/ui/src/lib/components/app/models/ModelId.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/models/ModelId.svelte rename to tools/ui/src/lib/components/app/models/ModelId.svelte diff --git a/tools/server/webui/src/lib/components/app/models/ModelsSelectorDropdown.svelte b/tools/ui/src/lib/components/app/models/ModelsSelectorDropdown.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/models/ModelsSelectorDropdown.svelte rename to tools/ui/src/lib/components/app/models/ModelsSelectorDropdown.svelte diff --git a/tools/server/webui/src/lib/components/app/models/ModelsSelectorList.svelte b/tools/ui/src/lib/components/app/models/ModelsSelectorList.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/models/ModelsSelectorList.svelte rename to tools/ui/src/lib/components/app/models/ModelsSelectorList.svelte diff --git a/tools/server/webui/src/lib/components/app/models/ModelsSelectorOption.svelte b/tools/ui/src/lib/components/app/models/ModelsSelectorOption.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/models/ModelsSelectorOption.svelte rename to tools/ui/src/lib/components/app/models/ModelsSelectorOption.svelte diff --git a/tools/server/webui/src/lib/components/app/models/ModelsSelectorSheet.svelte b/tools/ui/src/lib/components/app/models/ModelsSelectorSheet.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/models/ModelsSelectorSheet.svelte rename to tools/ui/src/lib/components/app/models/ModelsSelectorSheet.svelte diff --git a/tools/server/webui/src/lib/components/app/models/index.ts b/tools/ui/src/lib/components/app/models/index.ts similarity index 100% rename from tools/server/webui/src/lib/components/app/models/index.ts rename to tools/ui/src/lib/components/app/models/index.ts diff --git a/tools/server/webui/src/lib/components/app/models/utils.ts b/tools/ui/src/lib/components/app/models/utils.ts similarity index 100% rename from tools/server/webui/src/lib/components/app/models/utils.ts rename to tools/ui/src/lib/components/app/models/utils.ts diff --git a/tools/server/webui/src/lib/components/app/navigation/DesktopIconStrip.svelte b/tools/ui/src/lib/components/app/navigation/DesktopIconStrip.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/navigation/DesktopIconStrip.svelte rename to tools/ui/src/lib/components/app/navigation/DesktopIconStrip.svelte diff --git a/tools/server/webui/src/lib/components/app/navigation/DropdownMenuActions.svelte b/tools/ui/src/lib/components/app/navigation/DropdownMenuActions.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/navigation/DropdownMenuActions.svelte rename to tools/ui/src/lib/components/app/navigation/DropdownMenuActions.svelte diff --git a/tools/server/webui/src/lib/components/app/navigation/DropdownMenuSearchable.svelte b/tools/ui/src/lib/components/app/navigation/DropdownMenuSearchable.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/navigation/DropdownMenuSearchable.svelte rename to tools/ui/src/lib/components/app/navigation/DropdownMenuSearchable.svelte diff --git a/tools/server/webui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte b/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte similarity index 99% rename from tools/server/webui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte rename to tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte index 105576bb4f5..ddaf4d5b875 100644 --- a/tools/server/webui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte +++ b/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte @@ -174,7 +174,9 @@
-

{APP_NAME}

+

+ {APP_NAME} +

+
diff --git a/tools/ui/src/lib/components/app/chat/index.ts b/tools/ui/src/lib/components/app/chat/index.ts index 5f65979803c..9c7ce864e21 100644 --- a/tools/ui/src/lib/components/app/chat/index.ts +++ b/tools/ui/src/lib/components/app/chat/index.ts @@ -667,3 +667,10 @@ export { default as ChatScreenForm } from './ChatScreen/ChatScreenForm.svelte'; * Only visible when `isCurrentConversationLoading` is true. */ export { default as ChatScreenProcessingInfo } from './ChatScreen/ChatScreenProcessingInfo.svelte'; + +/** + * Scroll-to-bottom action button. Displays a floating button when the user + * has scrolled up more than half a viewport height from the bottom. + * Takes the chat container element as a prop to manage scroll state internally. + */ +export { default as ChatScreenActionScrollDown } from './ChatScreen/ChatScreenActionScrollDown.svelte'; diff --git a/tools/ui/src/lib/hooks/use-auto-scroll.svelte.ts b/tools/ui/src/lib/hooks/use-auto-scroll.svelte.ts index f59e3ed4b72..7bac452e4e9 100644 --- a/tools/ui/src/lib/hooks/use-auto-scroll.svelte.ts +++ b/tools/ui/src/lib/hooks/use-auto-scroll.svelte.ts @@ -100,6 +100,14 @@ export class AutoScrollController { this._autoScrollEnabled = true; } + /** + * Resets scroll state when switching conversations. + */ + resetScrollState(): void { + this._userScrolledUp = false; + this._autoScrollEnabled = true; + } + /** * Starts the auto-scroll interval for continuous scrolling during streaming. */ From 3a9c1b854dc9290be7cf8e4c89bd404770020161 Mon Sep 17 00:00:00 2001 From: Aleksander Grygier Date: Mon, 18 May 2026 16:26:01 +0200 Subject: [PATCH 058/309] ui: Update KaTeX package and clean up logs from `sass` warnings (#23275) * ui: migrate katex imports to @use to resolve SCSS deprecation warnings * ci: Use `ubuntu-slim` for CI (UI) workflow --- .github/workflows/ui-ci.yml | 4 ++-- tools/ui/package-lock.json | 6 +++--- tools/ui/src/styles/katex-custom.scss | 9 ++++++--- tools/ui/vite.config.ts | 12 ------------ 4 files changed, 11 insertions(+), 20 deletions(-) diff --git a/.github/workflows/ui-ci.yml b/.github/workflows/ui-ci.yml index 43d6e125687..7f6f467ddaa 100644 --- a/.github/workflows/ui-ci.yml +++ b/.github/workflows/ui-ci.yml @@ -41,7 +41,7 @@ jobs: ui-checks: name: UI Checks needs: ui-build - runs-on: ubuntu-24.04-arm + runs-on: ubuntu-slim continue-on-error: true steps: - name: Checkout code @@ -93,7 +93,7 @@ jobs: e2e-tests: name: E2E Tests needs: ui-build - runs-on: ubuntu-24.04-arm + runs-on: ubuntu-slim steps: - name: Checkout code uses: actions/checkout@v6 diff --git a/tools/ui/package-lock.json b/tools/ui/package-lock.json index bf23307b82c..3686eb3261e 100644 --- a/tools/ui/package-lock.json +++ b/tools/ui/package-lock.json @@ -6008,9 +6008,9 @@ } }, "node_modules/katex": { - "version": "0.16.22", - "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.22.tgz", - "integrity": "sha512-XCHRdUw4lf3SKBaJe4EvgqIuWwkPSo9XoeO8GjQW94Bp7TWv9hNhzZjZ+OH9yf1UmLygb7DIT5GSFQiyt16zYg==", + "version": "0.16.47", + "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.47.tgz", + "integrity": "sha512-Eeo8Ys1doU1z+x8AZsPpQu+p/QcZBI5PeOo7QGQdy2x2m0MU/hYagBbGOmXwr5KVbEfVuWv9LpnQWeehogurjg==", "dev": true, "funding": [ "https://opencollective.com/katex", diff --git a/tools/ui/src/styles/katex-custom.scss b/tools/ui/src/styles/katex-custom.scss index 9c8b96ed52f..0e385844a18 100644 --- a/tools/ui/src/styles/katex-custom.scss +++ b/tools/ui/src/styles/katex-custom.scss @@ -8,6 +8,9 @@ $use-ttf: false; $font-folder: 'katex-fonts'; // Import KaTeX SCSS with overridden variables -// Note: @import is deprecated but required because KaTeX uses @import internally -// The deprecation warnings are from KaTeX's code and cannot be avoided -@import 'katex/src/styles/katex.scss'; +@use 'katex/src/styles/katex.scss' with ( + $use-woff2: true, + $use-woff: false, + $use-ttf: false, + $font-folder: 'katex-fonts' +); diff --git a/tools/ui/vite.config.ts b/tools/ui/vite.config.ts index d3db24bf2bf..f89a689d5e6 100644 --- a/tools/ui/vite.config.ts +++ b/tools/ui/vite.config.ts @@ -23,18 +23,6 @@ export default defineConfig({ minify: true }, - css: { - preprocessorOptions: { - scss: { - additionalData: ` - $use-woff2: true; - $use-woff: false; - $use-ttf: false; - ` - } - } - }, - plugins: [tailwindcss(), sveltekit(), devtoolsJson(), llamaCppBuildPlugin()], test: { From 45b455e66fc09abed65b7d52d42a4a29ba0d45d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Mon, 18 May 2026 17:11:47 +0200 Subject: [PATCH 059/309] common : remove hf cache migration (#23266) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- common/arg.cpp | 7 -- common/hf-cache.cpp | 274 -------------------------------------------- common/hf-cache.h | 4 - 3 files changed, 285 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index d7a935fc179..ab23b77e021 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -4,7 +4,6 @@ #include "chat.h" #include "common.h" #include "download.h" -#include "hf-cache.h" #include "json-schema-to-grammar.h" #include "log.h" #include "sampling.h" @@ -586,12 +585,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // parse the first time to get -hf option (used for remote preset) parse_cli_args(); - // TODO: Remove later - try { - hf_cache::migrate_old_cache_to_hf_cache(params.hf_token, params.offline); - } catch (const std::exception & e) { - LOG_WRN("HF cache migration failed: %s\n", e.what()); - } // export_graph_ops loads only metadata const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS; diff --git a/common/hf-cache.cpp b/common/hf-cache.cpp index 20f33e4c7f4..ba7417a12bb 100644 --- a/common/hf-cache.cpp +++ b/common/hf-cache.cpp @@ -11,7 +11,6 @@ #include #include #include -#include // migration only #include #include #include @@ -336,15 +335,9 @@ hf_files get_repo_files(const std::string & repo_id, if (item["lfs"].contains("oid") && item["lfs"]["oid"].is_string()) { file.oid = item["lfs"]["oid"].get(); } - if (item["lfs"].contains("size") && item["lfs"]["size"].is_number()) { - file.size = item["lfs"]["size"].get(); - } } else if (item.contains("oid") && item["oid"].is_string()) { file.oid = item["oid"].get(); } - if (file.size == 0 && item.contains("size") && item["size"].is_number()) { - file.size = item["size"].get(); - } if (!file.oid.empty() && !is_valid_oid(file.oid)) { LOG_WRN("%s: skip invalid oid: %s\n", __func__, file.oid.c_str()); @@ -502,271 +495,4 @@ std::string finalize_file(const hf_file & file) { return file.final_path; } -// delete everything after this line, one day - -// copied from download.cpp without the tag part -struct gguf_split_info { - std::string prefix; // tag included - int index; - int count; -}; - -static gguf_split_info get_gguf_split_info(const std::string & path) { - static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase); - std::smatch m; - - std::string prefix = path; - if (!string_remove_suffix(prefix, ".gguf")) { - return {}; - } - - int index = 1; - int count = 1; - - if (std::regex_match(prefix, m, re_split)) { - index = std::stoi(m[2].str()); - count = std::stoi(m[3].str()); - prefix = m[1].str(); - } - - return {std::move(prefix), index, count}; -} - -static std::pair parse_manifest_name(std::string & filename) { - static const std::regex re(R"(^manifest=([^=]+)=([^=]+)=.*\.json$)"); - std::smatch match; - if (std::regex_match(filename, match, re)) { - return {match[1].str(), match[2].str()}; - } - return {}; -} - -static std::string make_old_cache_filename(const std::string & owner, - const std::string & repo, - const std::string & filename) { - auto result = owner + "_" + repo + "_" + filename; - string_replace_all(result, "/", "_"); - return result; -} - -struct migrate_file { - std::string path; - std::string sha256; - size_t size; - fs::path old_path; - fs::path etag_path; - const hf_file * file; -}; - -using migrate_files = std::vector; - -static bool collect_file(const fs::path & old_cache, - const std::string & owner, - const std::string & repo, - const std::string & path, - const std::string & sha256, - const hf_files & files, - migrate_files & to_migrate) { - - const hf_file * file = nullptr; - - for (const auto & f : files) { - if (f.path == path) { - file = &f; - break; - } - } - - std::string old_filename = make_old_cache_filename(owner, repo, path); - fs::path old_path = old_cache / old_filename; - fs::path etag_path = old_path.string() + ".etag"; - - if (!fs::exists(old_path)) { - if (file && fs::exists(file->final_path)) { - return true; - } - LOG_WRN("%s: %s not found in old cache or HF cache\n", __func__, old_filename.c_str()); - return false; - } - - if (!file) { - LOG_WRN("%s: %s not found in current repo\n", __func__, old_filename.c_str()); - return false; - } - - if (!sha256.empty() && !file->oid.empty() && sha256 != file->oid) { - LOG_WRN("%s: %s is not up to date (sha256 mismatch)\n", __func__, old_filename.c_str()); - return false; - } - - if (file->size > 0) { - size_t size = fs::file_size(old_path); - if (size != file->size) { - LOG_WRN("%s: %s has wrong size %zu (expected %zu)\n", __func__, old_filename.c_str(), size, file->size); - return false; - } - } - - to_migrate.push_back({path, sha256, file->size, old_path, etag_path, file}); - return true; -} - -static bool collect_files(const fs::path & old_cache, - const std::string & owner, - const std::string & repo, - const nl::json & node, - const hf_files & files, - migrate_files & to_migrate) { - - if (!node.contains("rfilename") || - !node.contains("lfs") || - !node["lfs"].contains("sha256")) { - return true; - } - - std::string path = node["rfilename"]; - std::string sha256 = node["lfs"]["sha256"]; - - auto split = get_gguf_split_info(path); - - if (split.count <= 1) { - return collect_file(old_cache, owner, repo, path, sha256, files, to_migrate); - } - - std::vector> splits; - - for (const auto & f : files) { - auto split_f = get_gguf_split_info(f.path); - if (split_f.count == split.count && split_f.prefix == split.prefix) { - // sadly the manifest only provides the sha256 of the first file (index == 1) - // the rest will be verified using the size... - std::string f_sha256 = (split_f.index == 1) ? sha256 : ""; - splits.emplace_back(f.path, f_sha256); - } - } - - if ((int)splits.size() != split.count) { - LOG_WRN("%s: expected %d split files but found %d in repo\n", __func__, split.count, (int)splits.size()); - return false; - } - - for (const auto & [f_path, f_sha256] : splits) { - if (!collect_file(old_cache, owner, repo, f_path, f_sha256, files, to_migrate)) { - return false; - } - } - - return true; -} - -static bool migrate_file(const migrate_file & file) { - std::error_code ec; - - fs::path new_path(file.file->local_path); - fs::create_directories(new_path.parent_path(), ec); - - if (!fs::exists(new_path, ec)) { - fs::rename(file.old_path, new_path, ec); - if (ec) { - fs::copy_file(file.old_path, new_path, ec); - if (ec) { - LOG_ERR("%s: failed to move/copy %s: %s\n", __func__, file.old_path.string().c_str(), ec.message().c_str()); - return false; - } - } - fs::remove(file.old_path, ec); - } - fs::remove(file.etag_path, ec); - - std::string filename = finalize_file(*file.file); - LOG_INF("%s: migrated %s -> %s\n", __func__, file.old_path.filename().string().c_str(), filename.c_str()); - return true; -} - -void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) { - fs::path old_cache = fs_get_cache_directory(); - if (!fs::exists(old_cache)) { - return; - } - - if (offline) { - LOG_WRN("%s: skipping migration in offline mode (will run when online)\n", __func__); - return; // -hf is not going to work - } - - bool warned = false; - - for (const auto & entry : fs::directory_iterator(old_cache)) { - if (!entry.is_regular_file()) { - continue; - } - auto filename = entry.path().filename().string(); - auto [owner, repo] = parse_manifest_name(filename); - - if (owner.empty() || repo.empty()) { - continue; - } - - if (!warned) { - warned = true; - LOG_WRN("================================================================================\n" - "WARNING: Migrating cache to HuggingFace cache directory\n" - " Old cache: %s\n" - " New cache: %s\n" - "This one-time migration moves models previously downloaded with -hf\n" - "from the legacy llama.cpp cache to the standard HuggingFace cache.\n" - "Models downloaded with --model-url are not affected.\n" - "================================================================================\n", - old_cache.string().c_str(), get_cache_directory().string().c_str()); - } - - auto repo_id = owner + "/" + repo; - auto files = get_repo_files(repo_id, token); - - if (files.empty()) { - LOG_WRN("%s: could not get repo files for %s, skipping\n", __func__, repo_id.c_str()); - continue; - } - - migrate_files to_migrate; - bool ok = true; - - try { - std::ifstream manifest(entry.path()); - auto json = nl::json::parse(manifest); - for (const char * key : {"ggufFile", "mmprojFile"}) { - if (json.contains(key)) { - if (!collect_files(old_cache, owner, repo, json[key], files, to_migrate)) { - ok = false; - break; - } - } - } - } catch (const std::exception & e) { - LOG_WRN("%s: failed to parse manifest %s: %s\n", __func__, filename.c_str(), e.what()); - continue; - } - - if (!ok) { - LOG_WRN("%s: migration skipped: one or more files failed validation\n", __func__); - continue; - } - - for (const auto & file : to_migrate) { - if (!migrate_file(file)) { - ok = false; - break; - } - } - - if (!ok) { - LOG_WRN("%s: migration failed: could not migrate all files\n", __func__); - continue; - } - - LOG_INF("%s: migration complete, deleting manifest: %s\n", __func__, entry.path().string().c_str()); - fs::remove(entry.path()); - } -} - } // namespace hf_cache diff --git a/common/hf-cache.h b/common/hf-cache.h index 9e46f977437..23fa0adb729 100644 --- a/common/hf-cache.h +++ b/common/hf-cache.h @@ -14,7 +14,6 @@ struct hf_file { std::string final_path; std::string oid; std::string repo_id; - size_t size = 0; // only for the migration }; using hf_files = std::vector; @@ -30,7 +29,4 @@ hf_files get_cached_files(const std::string & repo_id = {}); // Create snapshot path (link or move/copy) and return it std::string finalize_file(const hf_file & file); -// TODO: Remove later -void migrate_old_cache_to_hf_cache(const std::string & token, bool offline = false); - } // namespace hf_cache From 5cbaa5e69e09bde3334cd8c355570553a0dca027 Mon Sep 17 00:00:00 2001 From: SamareshSingh <97642706+ssam18@users.noreply.github.com> Date: Mon, 18 May 2026 15:14:45 -0500 Subject: [PATCH 060/309] docker : add OCI image labels for version and build date (#21653) * docker: add OCI image labels to all published images * docker: propagate OCI labels as manifest and index annotations * docker: drop hardcoded org URL and revert accidental intel version bump The OCI image url and source are now driven by build args with a sensible default. The workflow passes the actual repository url so fork builds get labels pointing at the fork instead of upstream. Also restores the IGC, compute runtime, and IGDGMM versions in the intel Dockerfile labeled stage which I accidentally bumped in the first commit. * docker: add skip_s390x workflow_dispatch input for fast test runs Lets maintainers and PR authors trigger the docker workflow without the s390x build target, which depends on the IBM Z runner and is by far the slowest job in the matrix. The flag filters the s390x row out of the build matrix before merge_matrix is derived, so the merge job sees a consistent shape too. Signed-off-by: Samaresh Kumar Singh --------- Signed-off-by: Samaresh Kumar Singh --- .devops/cann.Dockerfile | 16 ++++++ .devops/cpu.Dockerfile | 16 ++++++ .devops/cuda.Dockerfile | 17 +++++++ .devops/intel.Dockerfile | 16 ++++++ .devops/llama-cli-cann.Dockerfile | 17 +++++++ .devops/musa.Dockerfile | 17 +++++++ .devops/openvino.Dockerfile | 16 ++++++ .devops/rocm.Dockerfile | 17 +++++++ .devops/s390x.Dockerfile | 16 ++++++ .devops/vulkan.Dockerfile | 16 ++++++ .github/workflows/docker.yml | 83 +++++++++++++++++++++++++++++-- 11 files changed, 242 insertions(+), 5 deletions(-) diff --git a/.devops/cann.Dockerfile b/.devops/cann.Dockerfile index 843fe37d062..acd1e26bcec 100644 --- a/.devops/cann.Dockerfile +++ b/.devops/cann.Dockerfile @@ -5,6 +5,9 @@ # Define the CANN base image for easier version updates later ARG CHIP_TYPE=910b ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A # ============================================================================== # BUILD STAGE @@ -67,6 +70,19 @@ RUN mkdir -p /app/full && \ # ============================================================================== FROM ${CANN_BASE_IMAGE} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + # -- Install runtime dependencies -- RUN yum install -y libgomp curl && \ yum clean all && \ diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile index d6579ecf1ad..c8f32235d16 100644 --- a/.devops/cpu.Dockerfile +++ b/.devops/cpu.Dockerfile @@ -1,4 +1,7 @@ ARG UBUNTU_VERSION=24.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A FROM ubuntu:$UBUNTU_VERSION AS build @@ -35,6 +38,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ubuntu:$UBUNTU_VERSION AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl \ && apt autoremove -y \ diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile index b3f6ccfc984..3805ea3a009 100644 --- a/.devops/cuda.Dockerfile +++ b/.devops/cuda.Dockerfile @@ -6,6 +6,10 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + FROM ${BASE_CUDA_DEV_CONTAINER} AS build # CUDA architecture to build for (defaults to all supported archs) @@ -40,6 +44,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ${BASE_CUDA_RUN_CONTAINER} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl \ && apt autoremove -y \ diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index da164dcfa5b..218418b80b6 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -1,4 +1,7 @@ ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A ## Build Image @@ -40,6 +43,19 @@ RUN mkdir -p /app/full \ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + ARG IGC_VERSION=v2.20.5 ARG IGC_VERSION_FULL=2_2.20.5+19972 ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10 diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile index d54e70838f2..447d871ac4f 100644 --- a/.devops/llama-cli-cann.Dockerfile +++ b/.devops/llama-cli-cann.Dockerfile @@ -1,4 +1,7 @@ ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A FROM ascendai/cann:$ASCEND_VERSION AS build @@ -28,6 +31,20 @@ RUN echo "Building with static libs" && \ # TODO: use image with NNRT FROM ascendai/cann:$ASCEND_VERSION AS runtime + +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion / ENV LC_ALL=C.utf8 diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile index 665a76f58ce..a7f70b5f0df 100644 --- a/.devops/musa.Dockerfile +++ b/.devops/musa.Dockerfile @@ -6,6 +6,10 @@ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_V ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + FROM ${BASE_MUSA_DEV_CONTAINER} AS build # MUSA architecture to build for (defaults to all supported archs) @@ -45,6 +49,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ${BASE_MUSA_RUN_CONTAINER} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl \ && apt autoremove -y \ diff --git a/.devops/openvino.Dockerfile b/.devops/openvino.Dockerfile index 31b58736d7e..1266713f378 100644 --- a/.devops/openvino.Dockerfile +++ b/.devops/openvino.Dockerfile @@ -18,6 +18,10 @@ ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2 ARG http_proxy= ARG https_proxy= +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + ## Build Image FROM ubuntu:${UBUNTU_VERSION} AS build @@ -88,6 +92,18 @@ FROM ubuntu:${UBUNTU_VERSION} AS base # Pass proxy args to runtime stage ARG http_proxy ARG https_proxy +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE RUN apt-get update \ && apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \ diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index 525ddc79051..2da15975d13 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -7,6 +7,10 @@ ARG AMDGPU_VERSION=7.2.1 # Target the ROCm build image ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + ### Build image FROM ${BASE_ROCM_DEV_CONTAINER} AS build @@ -57,6 +61,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ${BASE_ROCM_DEV_CONTAINER} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl \ && apt autoremove -y \ diff --git a/.devops/s390x.Dockerfile b/.devops/s390x.Dockerfile index 757cd97cd4c..d36f5f3ccc5 100644 --- a/.devops/s390x.Dockerfile +++ b/.devops/s390x.Dockerfile @@ -1,5 +1,8 @@ ARG GCC_VERSION=15.2.0 ARG UBUNTU_VERSION=24.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A ### Build Llama.cpp stage FROM gcc:${GCC_VERSION} AS build @@ -52,6 +55,19 @@ COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py ### Base image FROM ubuntu:${UBUNTU_VERSION} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \ apt update -y && \ diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index f4d199ed426..464ccfef1ce 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -1,4 +1,7 @@ ARG UBUNTU_VERSION=26.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A FROM ubuntu:$UBUNTU_VERSION AS build @@ -31,6 +34,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ubuntu:$UBUNTU_VERSION AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \ libglvnd0 libgl1 libglx0 libegl1 libgles2 \ diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index a5bae7141fe..6f1f2721e45 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -11,6 +11,11 @@ name: Publish Docker image on: workflow_dispatch: # allows manual triggering + inputs: + skip_s390x: + description: "Skip the s390x build target (useful for fast test runs that do not need the IBM Z runner)" + type: boolean + default: false schedule: # Rebuild daily rather than on every push because it is expensive - cron: '12 4 * * *' @@ -64,6 +69,8 @@ jobs: - name: Generate build and merge matrices id: matrices shell: bash + env: + SKIP_S390X: ${{ inputs.skip_s390x || 'false' }} run: | set -euo pipefail @@ -86,6 +93,11 @@ jobs: ] JSON + if [ "${SKIP_S390X}" = "true" ]; then + jq 'map(select(.platforms != "linux/s390x"))' build-matrix.json > build-matrix.json.tmp + mv build-matrix.json.tmp build-matrix.json + fi + BUILD_MATRIX="$(jq -c . build-matrix.json)" MERGE_MATRIX="$(jq -c ' reduce .[] as $entry ({}; .[$entry.tag] |= ( @@ -132,6 +144,7 @@ jobs: config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }} steps: - name: Check out the repo + id: checkout uses: actions/checkout@v6 with: fetch-depth: 0 @@ -187,6 +200,10 @@ jobs: env: GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' + - name: Get build date + id: build_date + run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT + - name: Free Disk Space (Ubuntu) if: ${{ matrix.config.free_disk_space == true }} uses: ggml-org/free-disk-space@v1.3.1 @@ -211,13 +228,26 @@ jobs: with: context: . platforms: ${{ matrix.config.platforms }} - outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true + outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true file: ${{ matrix.config.dockerfile }} target: full provenance: false build-args: | + BUILD_DATE=${{ steps.build_date.outputs.date }} + APP_VERSION=${{ needs.create_tag.outputs.source_tag }} + APP_REVISION=${{ steps.checkout.outputs.commit }} + IMAGE_URL=${{ github.server_url }}/${{ github.repository }} + IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }} ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} + annotations: | + manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }} + manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }} + manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }} + manifest:org.opencontainers.image.title=llama.cpp + manifest:org.opencontainers.image.description=LLM inference in C/C++ + manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }} + manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max @@ -235,13 +265,26 @@ jobs: with: context: . platforms: ${{ matrix.config.platforms }} - outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true + outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true file: ${{ matrix.config.dockerfile }} target: light provenance: false build-args: | + BUILD_DATE=${{ steps.build_date.outputs.date }} + APP_VERSION=${{ needs.create_tag.outputs.source_tag }} + APP_REVISION=${{ steps.checkout.outputs.commit }} + IMAGE_URL=${{ github.server_url }}/${{ github.repository }} + IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }} ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} + annotations: | + manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }} + manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }} + manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }} + manifest:org.opencontainers.image.title=llama.cpp + manifest:org.opencontainers.image.description=LLM inference in C/C++ + manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }} + manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max @@ -259,13 +302,26 @@ jobs: with: context: . platforms: ${{ matrix.config.platforms }} - outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true + outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true file: ${{ matrix.config.dockerfile }} target: server provenance: false build-args: | + BUILD_DATE=${{ steps.build_date.outputs.date }} + APP_VERSION=${{ needs.create_tag.outputs.source_tag }} + APP_REVISION=${{ steps.checkout.outputs.commit }} + IMAGE_URL=${{ github.server_url }}/${{ github.repository }} + IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }} ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} + annotations: | + manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }} + manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }} + manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }} + manifest:org.opencontainers.image.title=llama.cpp + manifest:org.opencontainers.image.description=LLM inference in C/C++ + manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }} + manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max @@ -330,10 +386,15 @@ jobs: steps: - name: Check out the repo + id: checkout uses: actions/checkout@v6 with: fetch-depth: 0 + - name: Get build date + id: build_date + run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT + - name: Download digest metadata uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8 with: @@ -361,6 +422,8 @@ jobs: IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}" PREFIX="${IMAGE_REPO}:" SRC_TAG="${{ needs.create_tag.outputs.source_tag }}" + BUILD_DATE="${{ steps.build_date.outputs.date }}" + COMMIT_SHA="${{ steps.checkout.outputs.commit }}" TAGS="${{ matrix.config.tag }}" ARCHES="${{ matrix.config.arches }}" DIGEST_GLOB="/tmp/digests/*.tsv" @@ -412,11 +475,21 @@ jobs: refs+=("${IMAGE_REPO}@${digest}") done + local annotations=( + --annotation "index:org.opencontainers.image.created=${BUILD_DATE}" + --annotation "index:org.opencontainers.image.version=${SRC_TAG}" + --annotation "index:org.opencontainers.image.revision=${COMMIT_SHA}" + --annotation "index:org.opencontainers.image.title=llama.cpp" + --annotation "index:org.opencontainers.image.description=LLM inference in C/C++" + --annotation "index:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}" + --annotation "index:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}" + ) + echo "Creating ${merged_tag} from ${refs[*]}" - docker buildx imagetools create --tag "${merged_tag}" "${refs[@]}" + docker buildx imagetools create "${annotations[@]}" --tag "${merged_tag}" "${refs[@]}" echo "Creating ${merged_versioned_tag} from ${refs[*]}" - docker buildx imagetools create --tag "${merged_versioned_tag}" "${refs[@]}" + docker buildx imagetools create "${annotations[@]}" --tag "${merged_versioned_tag}" "${refs[@]}" } for tag in $TAGS; do From b7340443d4cf6f89e8616de87ec6b3557055e581 Mon Sep 17 00:00:00 2001 From: Pranav Dhinakar Date: Mon, 18 May 2026 13:39:36 -0700 Subject: [PATCH 061/309] ggml-hexagon: add PAD op HVX kernel (#23078) * ggml-hexagon: add PAD op HVX kernel Implements GGML_OP_PAD on the Hexagon HTP backend using HVX vectorized kernels. Supports zero-padding and circular padding across all 4 tensor dimensions. * hex-ggml: remove duplicate op cases (merge conflict) * hex-pad: fix editorconfig checks and macro alignment --------- Co-authored-by: Max Krasnyansky --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 18 + ggml/src/ggml-hexagon/htp/CMakeLists.txt | 1 + ggml/src/ggml-hexagon/htp/htp-ctx.h | 1 + ggml/src/ggml-hexagon/htp/htp-ops.h | 1 + ggml/src/ggml-hexagon/htp/main.c | 3 + ggml/src/ggml-hexagon/htp/pad-ops.c | 545 +++++++++++++++++++++++ 6 files changed, 569 insertions(+) create mode 100644 ggml/src/ggml-hexagon/htp/pad-ops.c diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 3d1c9da8329..c24a2305e4c 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2744,6 +2744,18 @@ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session * return true; } +static bool ggml_hexagon_supported_pad(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * dst = op; + + if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) { + return false; + } + + GGML_UNUSED(sess); + return true; +} + static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * dst = op; @@ -2857,6 +2869,8 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) { case GGML_OP_FILL: return HTP_OP_FILL; case GGML_OP_DIAG: return HTP_OP_DIAG; case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI; + case GGML_OP_PAD: return HTP_OP_PAD; + case GGML_OP_UNARY: switch (ggml_get_unary_op(t)) { case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU; @@ -3416,6 +3430,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons supp = ggml_hexagon_supported_solve_tri(sess, op); break; + case GGML_OP_PAD: + supp = ggml_hexagon_supported_pad(sess, op); + break; + default: break; } diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt index bcadac11f95..36f923243cd 100644 --- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt @@ -38,6 +38,7 @@ add_library(${HTP_LIB} SHARED diag-ops.c solve-tri-ops.c gated-delta-net-ops.c + pad-ops.c ) target_compile_definitions(${HTP_LIB} PRIVATE diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h index 92f02eac6e3..e500ce46212 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ctx.h +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -107,5 +107,6 @@ int op_fill(struct htp_ops_context * octx); int op_diag(struct htp_ops_context * octx); int op_solve_tri(struct htp_ops_context * octx); int op_gated_delta_net(struct htp_ops_context * octx); +int op_pad(struct htp_ops_context * octx); #endif /* HTP_CTX_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index 98db864dd42..985ded6f299 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -86,6 +86,7 @@ enum htp_op_code { HTP_OP_SOLVE_TRI, HTP_OP_L2_NORM, HTP_OP_GATED_DELTA_NET, + HTP_OP_PAD, HTP_OP_INVALID }; diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 883a31d6163..85569f07289 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -595,6 +595,9 @@ static int execute_op(struct htp_ops_context * octx) { case HTP_OP_SOLVE_TRI: return op_solve_tri(octx); + case HTP_OP_PAD: + return op_pad(octx); + case HTP_OP_GATED_DELTA_NET: return op_gated_delta_net(octx); diff --git a/ggml/src/ggml-hexagon/htp/pad-ops.c b/ggml/src/ggml-hexagon/htp/pad-ops.c new file mode 100644 index 00000000000..3abc3c2ead1 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/pad-ops.c @@ -0,0 +1,545 @@ +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" + +#include +#include + +#include + +#include "hex-dma.h" +#include "hvx-utils.h" + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "htp-ctx.h" +#include "htp-ops.h" + +/* Circular wrap: maps any integer x into [0, n) */ +static inline uint32_t wrap_around(int32_t x, uint32_t n) { + return (uint32_t)(((x % (int32_t)n) + (int32_t)n) % (int32_t)n); +} + +/* Decompose a flat dst row index into (i1, i2, i3) */ +static inline void pad_decompose_row(uint32_t ir, uint32_t ne1, uint32_t ne2, + uint32_t *i1, uint32_t *i2, uint32_t *i3) { + *i1 = ir % ne1; + *i2 = (ir / ne1) % ne2; + *i3 = ir / (ne1 * ne2); +} + +/* Return non-zero if row (i1,i2,i3) falls in the non-padded interior */ +static inline int pad_is_interior(uint32_t i1, uint32_t i2, uint32_t i3, + int32_t lp1, int32_t rp1, uint32_t ne1, + int32_t lp2, int32_t rp2, uint32_t ne2, + int32_t lp3, int32_t rp3, uint32_t ne3) { + return ((int32_t)i1 >= lp1 && (int32_t)i1 < (int32_t)ne1 - rp1) && + ((int32_t)i2 >= lp2 && (int32_t)i2 < (int32_t)ne2 - rp2) && + ((int32_t)i3 >= lp3 && (int32_t)i3 < (int32_t)ne3 - rp3); +} + +/* Compute the DDR src row pointer for a zero-pad interior row */ +static inline const uint8_t * pad_src_row_ptr(const struct htp_tensor * src, + uint32_t i1, uint32_t i2, uint32_t i3, + int32_t lp1, int32_t lp2, int32_t lp3) { + return (const uint8_t *) src->data + + (i1 - (uint32_t)lp1) * src->nb[1] + + (i2 - (uint32_t)lp2) * src->nb[2] + + (i3 - (uint32_t)lp3) * src->nb[3]; +} + +/* Compute the DDR src row pointer for a circular row (wrap-around indexing) */ +static inline const uint8_t * pad_circ_src_row_ptr(const struct htp_tensor * src, + uint32_t i1, uint32_t i2, uint32_t i3, + int32_t lp1, int32_t lp2, int32_t lp3) { + return (const uint8_t *) src->data + + wrap_around((int32_t)i1 - lp1, src->ne[1]) * src->nb[1] + + wrap_around((int32_t)i2 - lp2, src->ne[2]) * src->nb[2] + + wrap_around((int32_t)i3 - lp3, src->ne[3]) * src->nb[3]; +} + +struct htp_pad_context { + struct htp_ops_context * octx; + + int32_t lp0, rp0; + int32_t lp1, rp1; + int32_t lp2, rp2; + int32_t lp3, rp3; + + uint32_t nrows_per_thread; + uint32_t total_dst_rows; + + size_t type_size; + + // Row sizes for DMA kernel (populated when VTCM is available) + size_t src_row_size; + size_t src_row_size_aligned; + size_t dst_row_size; + size_t dst_row_size_aligned; +}; + +#define htp_pad_preamble \ + const struct htp_tensor * src = octx->src[0]; \ + const struct htp_tensor * dst = octx->dst; \ + \ + const uint32_t ne00 = src->ne[0]; \ + const uint32_t nb00 = src->nb[0]; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ + const uint32_t nb3 = dst->nb[3]; \ + \ + const int32_t lp0 = pctx->lp0, rp0 = pctx->rp0; \ + const int32_t lp1 = pctx->lp1, rp1 = pctx->rp1; \ + const int32_t lp2 = pctx->lp2, rp2 = pctx->rp2; \ + const int32_t lp3 = pctx->lp3, rp3 = pctx->rp3; \ + \ + const size_t type_size = pctx->type_size; \ + \ + const uint32_t row_start = pctx->nrows_per_thread * ith; \ + const uint32_t row_end = MIN(row_start + pctx->nrows_per_thread, pctx->total_dst_rows); + + +#define htp_pad_dma_preamble \ + const size_t src_row_size = pctx->src_row_size; \ + const size_t src_row_size_aligned = pctx->src_row_size_aligned; \ + const size_t dst_row_size = pctx->dst_row_size; \ + const size_t dst_row_size_aligned = pctx->dst_row_size_aligned; \ + \ + uint8_t * src_spad_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread; \ + uint8_t * dst_spad_base = octx->dst_spad.data + ith * octx->dst_spad.size_per_thread; \ + \ + dma_queue * dma = octx->ctx->dma[ith]; + +// --------------------------------------------------------------------------- +// HVX vectorized PAD kernel +// --------------------------------------------------------------------------- + +static void pad_job_per_thread_hvx(unsigned int nth, unsigned int ith, void * data) { + const struct htp_pad_context * pctx = (const struct htp_pad_context *) data; + struct htp_ops_context * octx = pctx->octx; + htp_pad_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + for (uint32_t dst_row = row_start; dst_row < row_end; dst_row++) { + uint32_t i1, i2, i3; + pad_decompose_row(dst_row, ne1, ne2, &i1, &i2, &i3); + + uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3; + + const int interior = pad_is_interior(i1, i2, i3, + lp1, rp1, ne1, + lp2, rp2, ne2, + lp3, rp3, ne3); + + if (!interior) { + hvx_splat_f32_u(dst_ptr, 0.0f, ne0); + } else { + const uint8_t * src_ptr = pad_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3); + + if (lp0 > 0) { + hvx_splat_f32_u(dst_ptr, 0.0f, (uint32_t)lp0); + } + + uint8_t * dst_row_start = dst_ptr + (size_t)lp0 * type_size; + if (nb00 == type_size) { + hvx_copy_f32_uu(dst_row_start, src_ptr, ne00); + } else { + for (uint32_t i = 0; i < ne00; i++) { + memcpy(dst_row_start + i * type_size, + src_ptr + (size_t)i * nb00, + type_size); + } + } + + if (rp0 > 0) { + hvx_splat_f32_u(dst_ptr + ((size_t)lp0 + ne00) * type_size, 0.0f, (uint32_t)rp0); + } + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "pad-hvx %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n", + ith, nth, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + row_start, row_end, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// --------------------------------------------------------------------------- +// HVX + DMA PAD kernel — aligned, double-buffered +// --------------------------------------------------------------------------- + +static void pad_job_per_thread_hvx_dma(unsigned int nth, unsigned int ith, void * data) { + const struct htp_pad_context * pctx = (const struct htp_pad_context *) data; + struct htp_ops_context * octx = pctx->octx; + htp_pad_preamble; + htp_pad_dma_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + // ----------------------------------------------------------------------- + // Priming phase: push 2 pairs of (dummy_dst_DMA, src_DMA) to seed the + // double-buffer pipeline before the main loop begins. + // ----------------------------------------------------------------------- + for (uint32_t ir = row_start, spad_idx = 0; ir < row_end && spad_idx < 2; ir++, spad_idx++) { + uint8_t * src_spad_cur = src_spad_base + spad_idx * src_row_size_aligned; + uint8_t * dst_spad_cur = dst_spad_base + spad_idx * dst_row_size_aligned; + + dma_queue_push_vtcm_to_ddr(dma, + dma_make_ptr((uint8_t *)dst->data, dst_spad_cur), + dst_row_size, dst_row_size_aligned, 0); + + uint32_t i1, i2, i3; + pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3); + const int interior = pad_is_interior(i1, i2, i3, + lp1, rp1, ne1, + lp2, rp2, ne2, + lp3, rp3, ne3); + + const uint8_t * src_ptr = interior + ? pad_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3) : NULL; + + // Interior row: real DMA (1 row) from DDR to VTCM. + // Border row: null DMA (nrows=0) + dma_queue_push_ddr_to_vtcm(dma, + dma_make_ptr(src_spad_cur, + src_ptr ? src_ptr : (const uint8_t *)src_spad_cur), + src_row_size_aligned, src_row_size, src_ptr ? 1 : 0); + } + + // ----------------------------------------------------------------------- + // Main loop: pop completed DMAs, compute in VTCM with aligned HVX ops, + // push dst DMA and prefetch src for the next+1 row. + // ----------------------------------------------------------------------- + for (uint32_t ir = row_start; ir < row_end; ir++) { + uint8_t * dst_spad_cur = (uint8_t *) dma_queue_pop(dma).src; + uint8_t * src_spad_cur = (uint8_t *) dma_queue_pop(dma).dst; + + uint32_t i1, i2, i3; + pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3); + + uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3; + + const int interior = pad_is_interior(i1, i2, i3, + lp1, rp1, ne1, + lp2, rp2, ne2, + lp3, rp3, ne3); + + if (!interior) { + hvx_splat_f32_a(dst_spad_cur, 0.0f, ne0); + } else { + hvx_splat_f32_a(dst_spad_cur, 0.0f, ne0); + + uint8_t * dst_interior = dst_spad_cur + (size_t)lp0 * type_size; + + if ((uintptr_t)dst_interior % VLEN == 0) { + hvx_copy_f32_aa(dst_interior, src_spad_cur, ne00); + } else { + hvx_copy_f32_ua(dst_interior, src_spad_cur, ne00); + } + } + + dma_queue_push_vtcm_to_ddr(dma, + dma_make_ptr(dst_ptr, dst_spad_cur), + dst_row_size, dst_row_size_aligned, 1); + + const uint32_t next_row = ir + 2; + if (next_row < row_end) { + uint32_t ni1, ni2, ni3; + pad_decompose_row(next_row, ne1, ne2, &ni1, &ni2, &ni3); + const int next_interior = pad_is_interior(ni1, ni2, ni3, + lp1, rp1, ne1, + lp2, rp2, ne2, + lp3, rp3, ne3); + const uint8_t * next_src_ptr = next_interior + ? pad_src_row_ptr(src, ni1, ni2, ni3, lp1, lp2, lp3) : NULL; + + dma_queue_push_ddr_to_vtcm(dma, + dma_make_ptr(src_spad_cur, + next_src_ptr ? next_src_ptr : (const uint8_t *)src_spad_cur), + src_row_size_aligned, src_row_size, next_src_ptr ? 1 : 0); + } + } + + dma_queue_flush(dma); + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "pad-hvx-dma %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n", + ith, nth, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + row_start, row_end, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// --------------------------------------------------------------------------- +// HVX circular PAD kernel +// --------------------------------------------------------------------------- + +static void pad_job_per_thread_hvx_circular(unsigned int nth, unsigned int ith, void * data) { + const struct htp_pad_context * pctx = (const struct htp_pad_context *) data; + struct htp_ops_context * octx = pctx->octx; + htp_pad_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + for (uint32_t dst_row = row_start; dst_row < row_end; dst_row++) { + uint32_t i1, i2, i3; + pad_decompose_row(dst_row, ne1, ne2, &i1, &i2, &i3); + + uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3; + const uint8_t * src_row = pad_circ_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3); + + if (nb00 == type_size) { + + if (lp0 > 0) { + if ((uint32_t)lp0 < 32) { + memcpy(dst_ptr, + src_row + (size_t)(ne00 - (uint32_t)lp0) * type_size, + (size_t)lp0 * type_size); + } else { + hvx_copy_f32_uu(dst_ptr, + src_row + (size_t)(ne00 - (uint32_t)lp0) * type_size, + (uint32_t)lp0); + } + } + hvx_copy_f32_uu(dst_ptr + (size_t)lp0 * type_size, src_row, ne00); + if (rp0 > 0) { + if ((uint32_t)rp0 < 32) { + memcpy(dst_ptr + ((size_t)lp0 + ne00) * type_size, + src_row, + (size_t)rp0 * type_size); + } else { + hvx_copy_f32_uu(dst_ptr + ((size_t)lp0 + ne00) * type_size, + src_row, + (uint32_t)rp0); + } + } + } else { + for (uint32_t i = 0; i < (uint32_t)lp0; i++) { + *(float *)(dst_ptr + i * type_size) = + *(const float *)(src_row + (size_t)(ne00 - (uint32_t)lp0 + i) * nb00); + } + for (uint32_t i = 0; i < ne00; i++) { + *(float *)(dst_ptr + ((size_t)lp0 + i) * type_size) = + *(const float *)(src_row + (size_t)i * nb00); + } + for (uint32_t i = 0; i < (uint32_t)rp0; i++) { + *(float *)(dst_ptr + ((size_t)lp0 + ne00 + i) * type_size) = + *(const float *)(src_row + (size_t)i * nb00); + } + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "pad-hvx-circ %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n", + ith, nth, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + row_start, row_end, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// --------------------------------------------------------------------------- +// HVX + DMA circular PAD kernel — aligned, double-buffered +// --------------------------------------------------------------------------- + +static void pad_job_per_thread_hvx_circular_dma(unsigned int nth, unsigned int ith, void * data) { + const struct htp_pad_context * pctx = (const struct htp_pad_context *) data; + struct htp_ops_context * octx = pctx->octx; + htp_pad_preamble; + htp_pad_dma_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + // ----------------------------------------------------------------------- + // Priming phase: push 2 pairs of (dummy_dst_DMA, src_DMA) to seed the + // double-buffer pipeline. Every row is a real src DMA (no null DMAs). + // ----------------------------------------------------------------------- + for (uint32_t ir = row_start, spad_idx = 0; ir < row_end && spad_idx < 2; ir++, spad_idx++) { + uint8_t * src_spad_cur = src_spad_base + spad_idx * src_row_size_aligned; + uint8_t * dst_spad_cur = dst_spad_base + spad_idx * dst_row_size_aligned; + + dma_queue_push_vtcm_to_ddr(dma, + dma_make_ptr((uint8_t *)dst->data, dst_spad_cur), + dst_row_size, dst_row_size_aligned, 0); + + uint32_t pi1, pi2, pi3; + pad_decompose_row(ir, ne1, ne2, &pi1, &pi2, &pi3); + dma_queue_push_ddr_to_vtcm(dma, + dma_make_ptr(src_spad_cur, pad_circ_src_row_ptr(src, pi1, pi2, pi3, lp1, lp2, lp3)), + src_row_size_aligned, src_row_size, 1); + } + + // ----------------------------------------------------------------------- + // Main loop: pop completed DMAs, assemble circular row in VTCM with + // aligned HVX ops, push dst DMA and prefetch src for the next+1 row. + // ----------------------------------------------------------------------- + for (uint32_t ir = row_start; ir < row_end; ir++) { + uint8_t * dst_spad_cur = (uint8_t *) dma_queue_pop(dma).src; + uint8_t * src_spad_cur = (uint8_t *) dma_queue_pop(dma).dst; + + uint32_t i1, i2, i3; + pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3); + uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3; + + + if (lp0 > 0) { + uint8_t * dst_left = dst_spad_cur; + const uint8_t * src_left = src_spad_cur + (size_t)(ne00 - (uint32_t)lp0) * type_size; + if ((uint32_t)lp0 < 32) { + memcpy(dst_left, src_left, (size_t)lp0 * type_size); + } else { + hvx_copy_f32_uu(dst_left, src_left, (uint32_t)lp0); + } + } + + { + uint8_t * dst_mid = dst_spad_cur + (size_t)lp0 * type_size; + if ((uintptr_t)dst_mid % VLEN == 0) { + hvx_copy_f32_aa(dst_mid, src_spad_cur, ne00); + } else { + hvx_copy_f32_ua(dst_mid, src_spad_cur, ne00); + } + } + + if (rp0 > 0) { + uint8_t * dst_right = dst_spad_cur + ((size_t)lp0 + ne00) * type_size; + if ((uint32_t)rp0 < 32) { + memcpy(dst_right, src_spad_cur, (size_t)rp0 * type_size); + } else { + if ((uintptr_t)dst_right % VLEN == 0) { + hvx_copy_f32_aa(dst_right, src_spad_cur, (uint32_t)rp0); + } else { + hvx_copy_f32_ua(dst_right, src_spad_cur, (uint32_t)rp0); + } + } + } + + dma_queue_push_vtcm_to_ddr(dma, + dma_make_ptr(dst_ptr, dst_spad_cur), + dst_row_size, dst_row_size_aligned, 1); + + const uint32_t next_row = ir + 2; + if (next_row < row_end) { + uint32_t nri1, nri2, nri3; + pad_decompose_row(next_row, ne1, ne2, &nri1, &nri2, &nri3); + dma_queue_push_ddr_to_vtcm(dma, + dma_make_ptr(src_spad_cur, + pad_circ_src_row_ptr(src, nri1, nri2, nri3, lp1, lp2, lp3)), + src_row_size_aligned, src_row_size, 1); + } + } + + dma_queue_flush(dma); + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "pad-hvx-circ-dma %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n", + ith, nth, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + row_start, row_end, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +int op_pad(struct htp_ops_context * octx) { + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * dst = octx->dst; + + // Only F32 supported + size_t type_size; + switch (src0->type) { + case HTP_TYPE_F32: type_size = 4; break; + default: + FARF(ERROR, "pad-hvx: unsupported type %u\n", src0->type); + return HTP_STATUS_NO_SUPPORT; + } + + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) { + return HTP_STATUS_OK; + } + + const int32_t lp0 = octx->op_params[0]; + const int32_t rp0 = octx->op_params[1]; + const int32_t lp1 = octx->op_params[2]; + const int32_t rp1 = octx->op_params[3]; + const int32_t lp2 = octx->op_params[4]; + const int32_t rp2 = octx->op_params[5]; + const int32_t lp3 = octx->op_params[6]; + const int32_t rp3 = octx->op_params[7]; + const int32_t circular = octx->op_params[8]; + + const uint32_t ne0 = dst->ne[0]; + const uint32_t ne00 = src0->ne[0]; + + const uint32_t total_dst_rows = dst->ne[1] * dst->ne[2] * dst->ne[3]; + const uint32_t n_threads = MIN(octx->n_threads, total_dst_rows > 0 ? total_dst_rows : 1); + + const size_t src_row_size = (size_t)ne00 * type_size; + const size_t dst_row_size = (size_t)ne0 * type_size; + const size_t src_row_size_aligned = hex_round_up(src_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + + // Total VTCM needed: 2 buffers (ping+pong) for src and dst, per thread + const size_t vtcm_needed = (size_t)n_threads * 2 * (src_row_size_aligned + dst_row_size_aligned); + + const int use_dma = (src0->nb[0] == (uint32_t)type_size) && + (ne00 >= 512) && + (octx->ctx->vtcm_base != NULL) && + (octx->ctx->vtcm_size >= vtcm_needed); + + if (use_dma) { + octx->src0_spad.size_per_thread = 2 * src_row_size_aligned; + octx->dst_spad.size_per_thread = 2 * dst_row_size_aligned; + octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread; + octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread; + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + } + + struct htp_pad_context pctx = { + .octx = octx, + .lp0 = lp0, .rp0 = rp0, + .lp1 = lp1, .rp1 = rp1, + .lp2 = lp2, .rp2 = rp2, + .lp3 = lp3, .rp3 = rp3, + .nrows_per_thread = (total_dst_rows + n_threads - 1) / n_threads, + .total_dst_rows = total_dst_rows, + .type_size = type_size, + .src_row_size = src_row_size, + .src_row_size_aligned = src_row_size_aligned, + .dst_row_size = dst_row_size, + .dst_row_size_aligned = dst_row_size_aligned, + }; + + FARF(HIGH, "pad-hvx%s%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) pads=(%d,%d,%d,%d,%d,%d,%d,%d)\n", + circular ? "-circ" : "", + use_dma ? "-dma" : "", + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + + if (circular && use_dma) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_circular_dma, &pctx, n_threads); } + else if (circular) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_circular, &pctx, n_threads); } + else if (use_dma) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_dma, &pctx, n_threads); } + else { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx, &pctx, n_threads); } + + return HTP_STATUS_OK; +} + From 9a532ae4bab1b164052ce60a738f78538b421c66 Mon Sep 17 00:00:00 2001 From: Pranav Dhinakar Date: Mon, 18 May 2026 14:04:57 -0700 Subject: [PATCH 062/309] hexagon: add support for TRI op (#22822) * Hexagon: TRI HVX Kernel addition to ggml hexagon HTP ops and context * addressed PR review comments for TRI op * hexagon: clang format * hex-unary: remove merge conflict markers * hex-ggml: remove duplicate op cases (merge conflict) * hex-ggml: fix editor config errors --------- Co-authored-by: Todor Boinovski Co-authored-by: Max Krasnyansky --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 20 +++++ ggml/src/ggml-hexagon/htp/htp-ctx.h | 1 + ggml/src/ggml-hexagon/htp/htp-ops.h | 1 + ggml/src/ggml-hexagon/htp/main.c | 3 + ggml/src/ggml-hexagon/htp/unary-ops.c | 113 ++++++++++++++++++++++++- 5 files changed, 137 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index c24a2305e4c..2f75e97ac66 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2828,6 +2828,21 @@ static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * return true; } +static bool ggml_hexagon_supported_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { + + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * dst = op; + + if (src0->type != GGML_TYPE_F32) { return false; } + if (dst->type != GGML_TYPE_F32) { return false; } + if (!ggml_are_same_shape(src0, dst)) { return false; } + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) { return false; } + + return true; + + GGML_UNUSED(sess); +} + static const char * ggml_backend_hexagon_name(ggml_backend_t backend) { auto sess = static_cast(backend->context); return sess->c_name(); @@ -2869,6 +2884,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) { case GGML_OP_FILL: return HTP_OP_FILL; case GGML_OP_DIAG: return HTP_OP_DIAG; case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI; + case GGML_OP_TRI: return HTP_OP_TRI; case GGML_OP_PAD: return HTP_OP_PAD; case GGML_OP_UNARY: @@ -3430,6 +3446,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons supp = ggml_hexagon_supported_solve_tri(sess, op); break; + case GGML_OP_TRI: + supp = ggml_hexagon_supported_tri(sess, op); + break; + case GGML_OP_PAD: supp = ggml_hexagon_supported_pad(sess, op); break; diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h index e500ce46212..6fe3e6c7d85 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ctx.h +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -107,6 +107,7 @@ int op_fill(struct htp_ops_context * octx); int op_diag(struct htp_ops_context * octx); int op_solve_tri(struct htp_ops_context * octx); int op_gated_delta_net(struct htp_ops_context * octx); +int op_tri(struct htp_ops_context * octx); int op_pad(struct htp_ops_context * octx); #endif /* HTP_CTX_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index 985ded6f299..676e948a439 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -86,6 +86,7 @@ enum htp_op_code { HTP_OP_SOLVE_TRI, HTP_OP_L2_NORM, HTP_OP_GATED_DELTA_NET, + HTP_OP_TRI, HTP_OP_PAD, HTP_OP_INVALID diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 85569f07289..12003c1fd8a 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -601,6 +601,9 @@ static int execute_op(struct htp_ops_context * octx) { case HTP_OP_GATED_DELTA_NET: return op_gated_delta_net(octx); + case HTP_OP_TRI: + return op_tri(octx); + case HTP_OP_INVALID: break; diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c index d4ae89ee6f0..1ce881353ec 100644 --- a/ggml/src/ggml-hexagon/htp/unary-ops.c +++ b/ggml/src/ggml-hexagon/htp/unary-ops.c @@ -17,7 +17,6 @@ #include "ggml-common.h" #include "htp-ctx.h" #include "htp-ops.h" -#include "htp-ops.h" struct htp_unary_context { struct htp_ops_context * octx; @@ -277,6 +276,95 @@ static void sigmoid_f32(const float * restrict src, } } +static void tri_f32(const float * restrict src, + float * restrict dst, + uint8_t * restrict spad, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + int32_t * op_params, + const uint32_t ir, + const struct htp_unary_context * uctx) { + + const int32_t ttype = op_params[0]; + const HVX_Vector zero = hvx_vec_splat_f32(0.0f); + const uint32_t nvec = row_elems / VLEN_FP32; + const uint32_t nloe = row_elems % VLEN_FP32; + + const uint32_t ne01 = uctx->octx->src[0]->ne[1]; + + for (uint32_t b = 0; b < num_rows; b++) { + const uint32_t abs_row = ir + b; + const uint32_t i01 = abs_row % ne01; + + const HVX_Vector * restrict v_src = (const HVX_Vector *) ((const uint8_t *) src + b * row_size); + HVX_Vector * restrict v_dst = (HVX_Vector *) ((uint8_t *) dst + b * row_size); + + uint32_t boundary; + int keep_left; + switch (ttype) { + case 0: boundary = i01; keep_left = 0; break; // keep col >= row + case 1: boundary = i01 + 1; keep_left = 0; break; // keep col > row + case 2: boundary = i01 + 1; keep_left = 1; break; // keep col <= row + case 3: boundary = i01; keep_left = 1; break; // keep col < row + default: boundary = 0; keep_left = 0; break; + } + if (boundary > row_elems) boundary = row_elems; + + // Full HVX vectors — each starts at a 128-byte aligned offset + for (uint32_t i = 0; i < nvec; i++) { + const uint32_t vec_start = i * VLEN_FP32; + const uint32_t vec_end = vec_start + VLEN_FP32; + if (keep_left) { + if (vec_end <= boundary) { + v_dst[i] = v_src[i]; + } else if (vec_start >= boundary) { + v_dst[i] = zero; + } else { + HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float)); + v_dst[i] = Q6_V_vmux_QVV(mask, v_src[i], zero); + } + } else { + if (vec_end <= boundary) { + v_dst[i] = zero; + } else if (vec_start >= boundary) { + v_dst[i] = v_src[i]; + } else { + HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float)); + v_dst[i] = Q6_V_vmux_QVV(mask, zero, v_src[i]); + } + } + } + + // Tail elements (row_elems not a multiple of VLEN_FP32) + if (nloe > 0) { + const uint32_t vec_start = nvec * VLEN_FP32; + const uint32_t vec_end = vec_start + nloe; + HVX_Vector tail_val; + if (keep_left) { + if (vec_end <= boundary) { + tail_val = v_src[nvec]; + } else if (vec_start >= boundary) { + tail_val = zero; + } else { + HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float)); + tail_val = Q6_V_vmux_QVV(mask, v_src[nvec], zero); + } + } else { + if (vec_end <= boundary) { + tail_val = zero; + } else if (vec_start >= boundary) { + tail_val = v_src[nvec]; + } else { + HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float)); + tail_val = Q6_V_vmux_QVV(mask, zero, v_src[nvec]); + } + } + hvx_vec_store_a(&v_dst[nvec], nloe * sizeof(float), tail_val); + } + } +} + static void softplus_f32(const float * restrict src, float * restrict dst, uint8_t * restrict spad, @@ -498,6 +586,9 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * case HTP_OP_L2_NORM: l2_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); break; + case HTP_OP_TRI: + tri_f32(src0_spad, dst_spad, NULL, block_size, ne00, src0_row_size_aligned, op_params, ir, uctx); + break; default: break; } @@ -571,6 +662,10 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { case HTP_OP_L2_NORM: op_type = "l2norm-f32"; break; + case HTP_OP_TRI: + op_type = "tri-f32"; + break; + default: FARF(ERROR, "Unsupported unary Op %u\n", octx->op); return HTP_STATUS_NO_SUPPORT; @@ -640,6 +735,22 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { return err; } +int op_tri(struct htp_ops_context * octx) { + int err = HTP_STATUS_OK; + + switch (octx->src[0]->type) { + case HTP_TYPE_F32: + err = execute_op_unary_f32(octx); + break; + + default: + err = HTP_STATUS_NO_SUPPORT; + break; + } + + return err; +} + int op_unary(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; From c3e9ade6dd3ff2a1ceafd2d59062634715b472c4 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Tue, 19 May 2026 09:42:36 +0300 Subject: [PATCH 063/309] rpc : keep last_graph_uid in the device context (#23273) With the introduction of MTP we can have multiple compute contexts for the same RPC device. In this case last_graph_uid is not updated properly when contexts are being switched. This patch fixes this by moving last_graph_uid to the device context, making sure it is always updated. closes: #23242 --- ggml/src/ggml-rpc/ggml-rpc.cpp | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 1cb8f563d85..d3805772183 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -199,6 +199,14 @@ static ggml_guid_t ggml_backend_rpc_guid() { return &guid; } +struct ggml_backend_rpc_device_context { + std::string endpoint; + uint32_t device; + std::string name; + std::string description; + uint64_t last_graph_uid; +}; + struct ggml_backend_rpc_buffer_type_context { std::string endpoint; uint32_t device; @@ -211,7 +219,6 @@ struct ggml_backend_rpc_context { std::string endpoint; uint32_t device; std::string name; - uint64_t last_graph_uid; }; struct ggml_backend_rpc_buffer_context { @@ -691,9 +698,11 @@ static void serialize_graph(uint32_t device, const ggml_cgraph * cgraph, std::ve static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; + ggml_backend_dev_t rpc_dev = ggml_backend_get_device(backend); + ggml_backend_rpc_device_context * rpc_dev_ctx = (ggml_backend_rpc_device_context *)rpc_dev->context; GGML_ASSERT(cgraph->n_nodes > 0); - bool reuse = cgraph->uid != 0 && rpc_ctx->last_graph_uid == cgraph->uid; + bool reuse = cgraph->uid != 0 && rpc_dev_ctx->last_graph_uid == cgraph->uid; if (reuse) { rpc_msg_graph_recompute_req request; request.device = rpc_ctx->device; @@ -701,7 +710,7 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_RECOMPUTE, &request, sizeof(request)); RPC_STATUS_ASSERT(status); } else { - rpc_ctx->last_graph_uid = cgraph->uid; + rpc_dev_ctx->last_graph_uid = cgraph->uid; std::vector input; serialize_graph(rpc_ctx->device, cgraph, input); auto sock = get_socket(rpc_ctx->endpoint); @@ -770,7 +779,6 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) { /* .endpoint = */ endpoint, /* .device = */ device, /* .name = */ dev_name, - /* .last_graph_uid = */ 0, }; auto reg = ggml_backend_rpc_add_server(endpoint); ggml_backend_t backend = new ggml_backend { @@ -1757,15 +1765,6 @@ void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir } } -// device interface - -struct ggml_backend_rpc_device_context { - std::string endpoint; - uint32_t device; - std::string name; - std::string description; -}; - static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) { ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context; @@ -1947,10 +1946,11 @@ ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint) { std::string dev_name = "RPC" + std::to_string(dev_id); std::string dev_desc = std::string(endpoint); ggml_backend_rpc_device_context * dev_ctx = new ggml_backend_rpc_device_context { - /* .endpoint = */ endpoint, - /* .device = */ ind, - /* .name = */ dev_name, - /* .description = */ dev_desc + /* .endpoint = */ endpoint, + /* .device = */ ind, + /* .name = */ dev_name, + /* .description = */ dev_desc, + /* .last_graph_uid = */ 0, }; ggml_backend_dev_t dev = new ggml_backend_device { From 439f1b193d2d7d8db4d2b70cbf63e3afcbb38df8 Mon Sep 17 00:00:00 2001 From: Intel AI Get-to Market Customer Success and Solutions Date: Mon, 18 May 2026 23:44:02 -0700 Subject: [PATCH 064/309] sycl: add GGML_SYCL_USE_ASYNC_MEM_OP env toggle (#22153) * sycl: add GGML_SYCL_USE_ASYNC_MEM_OP env toggle Signed-off-by: Chun Tao * Use async mem ops for correctness when SYCL graphs are explicitly on. Signed-off-by: Tao, Chun --------- Signed-off-by: Chun Tao Signed-off-by: Tao, Chun Co-authored-by: Chun Tao --- ggml/src/ggml-sycl/ggml-sycl.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index ebe7c5b351c..2ea47f7153a 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -72,6 +72,7 @@ int g_ggml_sycl_disable_graph = 0; int g_ggml_sycl_disable_dnn = 0; int g_ggml_sycl_prioritize_dmmv = 0; int g_ggml_sycl_use_async_mem_op = 0; +int g_ggml_sycl_use_async_mem_op_requested = 1; int g_ggml_sycl_enable_level_zero = 0; int g_ggml_sycl_enable_flash_attention = 1; @@ -304,6 +305,8 @@ static void ggml_check_sycl() try { GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n"); #endif GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv); + g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1); + GGML_LOG_INFO(" GGML_SYCL_USE_ASYNC_MEM_OP: %d\n", g_ggml_sycl_use_async_mem_op_requested); #ifdef SYCL_FLASH_ATTN GGML_LOG_INFO(" GGML_SYCL_ENABLE_FLASH_ATTN: %d\n", g_ggml_sycl_enable_flash_attention); @@ -319,11 +322,11 @@ static void ggml_check_sycl() try { fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); #endif */ - // Currently, we only use async malloc / free when graphs are enabled as it is required for the calls to be - // properly recorded. As this SYCL extension matures it may be beneficial to enable as the default path and in - // other places. + // Async USM allocation/free is also useful outside the graph path: it avoids the host waits in the reorder + // staging path while preserving queue ordering semantics. Graph support still depends on the extension being + // available, but it no longer needs to control the non-graph fast path. #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC - g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph; + g_ggml_sycl_use_async_mem_op = g_ggml_sycl_use_async_mem_op_requested || !g_ggml_sycl_disable_graph; if (g_ggml_sycl_use_async_mem_op) { for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) { if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) { From f1c1c5c057f047562b637db0ac7eac11485307bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 19 May 2026 08:44:25 +0200 Subject: [PATCH 065/309] convert : filter lora tensor names (#23077) --- convert_lora_to_gguf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 1b7334617d1..81658ba03d8 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -445,6 +445,11 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: if self.lazy: tensor = LazyTorchTensor.from_eager(tensor) base_name = get_base_tensor_name(name) + # filter base name, ignore tensor transformations for now + data_gen = lambda g=tensor: g # noqa: E731 + if (titem := self.filter_tensors((base_name, data_gen))) is None: + continue + base_name, _ = titem # note: mergekit-extract-lora also adds token embeddings to the adapter is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name From aabee047d8ebf7abe2750585a347aa19feced3b5 Mon Sep 17 00:00:00 2001 From: Neo Zhang Date: Tue, 19 May 2026 14:44:51 +0800 Subject: [PATCH 066/309] [SCYL] add chapter for performance reference in SYCL.md (#23315) * add chapter for performance reference * rm unsupported GPU --- README.md | 2 +- docs/backend/SYCL.md | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a0c14b9d7f0..71327e51453 100644 --- a/README.md +++ b/README.md @@ -280,7 +280,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo | [Metal](docs/build.md#metal-build) | Apple Silicon | | [BLAS](docs/build.md#blas-build) | All | | [BLIS](docs/backend/BLIS.md) | All | -| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU | +| [SYCL](docs/backend/SYCL.md) | Intel GPU | | [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs | | [MUSA](docs/build.md#musa) | Moore Threads GPU | | [CUDA](docs/build.md#cuda) | Nvidia GPU | diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 155f933b805..0c4660b541c 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -5,6 +5,7 @@ - [News](#news) - [OS](#os) - [Hardware](#hardware) +- [Performance Reference](#performance-reference) - [Docker](#docker) - [Linux](#linux) - [Windows](#windows) @@ -51,9 +52,8 @@ The packages for FP32 and FP16 would have different accuracy and performance on ## News -- 2026.04 - - - Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q_K, Q8_0. +- 2026.04-05 + - Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q6_K, Q8_0. - Fused MoE. - Upgrate CI and built package for oneAPI 2025.3.3, support Ubuntu 24.04 built package. @@ -150,6 +150,13 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the NA +## Performance Reference + + +To get the supported LLMs, GPUs, and performance reference, please check [Performance of llama.cpp on Intel GPU with SYCL backend](https://github.com/ggml-org/llama.cpp/discussions/23313). + +You could update your test result in it directly. + ## Docker The docker build option is currently limited to *Intel GPU* targets. From c85a242ed021ab6732e2973764437c3c5655102b Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 18 May 2026 23:45:41 -0700 Subject: [PATCH 067/309] ggml-webgpu : extend GDN for K>1 (#23299) --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 2 ++ .../wgsl-shaders/gated_delta_net.wgsl | 24 +++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 78cb02be06d..921c12b41ac 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -1234,6 +1234,7 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx, const uint32_t h = (uint32_t) src2->ne[1]; const uint32_t n_tokens = (uint32_t) src2->ne[2]; const uint32_t n_seqs = (uint32_t) src2->ne[3]; + const uint32_t K = (uint32_t) src5->ne[1]; const float scale = 1.0f / sqrtf((float) s_v); uint32_t scale_u32; memcpy(&scale_u32, &scale, sizeof(scale_u32)); @@ -1258,6 +1259,7 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx, (uint32_t) src0->ne[1], (uint32_t) (src2->ne[3] / src0->ne[3]), + K, scale_u32, }; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl index f9d98fda40b..d68520f8282 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl @@ -39,6 +39,7 @@ struct Params { neq1: u32, rq3: u32, + K: u32, scale: f32, }; @@ -62,11 +63,14 @@ fn main( let iq3 = seq_id / params.rq3; let state_size = S_V * S_V; - let state_base = (seq_id * params.h + head_id) * state_size; + let state_in_base = (seq_id * params.K * params.h + head_id) * state_size; + let state_out_base = (seq_id * params.h + head_id) * state_size; + let state_size_per_snap = state_size * params.h * params.n_seqs; + let shift = i32(params.n_tokens) - i32(params.K); var state: array; for (var i = 0u; i < S_V; i++) { - state[i] = src_state[state_base + col * S_V + i]; + state[i] = src_state[state_in_base + col * S_V + i]; } var attn_off = (seq_id * params.n_tokens * params.h + head_id) * S_V; @@ -123,10 +127,22 @@ fn main( dst[attn_off + col] = attn_col * params.scale; attn_off += S_V * params.h; + if (params.K > 1u) { + let target_slot = i32(t) - shift; + if (target_slot >= 0 && target_slot < i32(params.K)) { + let slot_base = params.s_off + u32(target_slot) * state_size_per_snap + state_out_base; + for (var i = 0u; i < S_V; i++) { + dst[slot_base + col * S_V + i] = state[i]; + } + } + } + workgroupBarrier(); } - for (var i = 0u; i < S_V; i++) { - dst[params.s_off + state_base + col * S_V + i] = state[i]; + if (params.K == 1u) { + for (var i = 0u; i < S_V; i++) { + dst[params.s_off + state_out_base + col * S_V + i] = state[i]; + } } } From d2e179a477fc1d1935b68422c1181ef2d62ed2ef Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 May 2026 09:46:05 +0300 Subject: [PATCH 068/309] llama-eval : add per-task summary stats (#23151) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * llama-eval : add per-problem summary table to HTML reports - Add chunk_idx and problem_idx to TaskState and saved case dicts - Group completed cases by problem_idx in dump_html() - Render per-problem summary table before individual task table - Columns: Problem (zero-padded), Runs, Correct (n/r), Tokens (min/avg/max), T/s (min/avg/max), Gen s (min/avg/max) - Sorted by problem index, monospace font, right-aligned numbers - Colspan headers for grouped stats, auto width - Simulator: add /v1/models endpoint, timings in response, template-aware question matching, --dataset arg (aime/aime2025) Assisted-by: llama.cpp:local pi * llama-eval : add tabs for Detailed and Summary tables, apply monospace font globally - Wrap Detailed and Summary tables in switchable tabs (Detailed active by default) - Remove summary-section wrapper, use tab labels instead - Apply monospace font to all tables and the top bar Assisted-by: llama.cpp:local pi * llama-eval : redesign top bar as CSS grid label/value pairs - Replace flat span list with 4-column grid layout (2 pairs per row) - Labels in muted color (#888), values in dark (#222) - Bold dataset name and model name - Removed media query, always uses 4 columns Assisted-by: llama.cpp:local pi * llama-eval : use realistic token counts and throughput in simulator - comp_tokens: [30, 80] → [10000, 60000] - tps_gen: derived → uniform [90.0, 110.0] - t_gen_ms: now computed from tokens/tps Assisted-by: llama.cpp:local pi * llama-eval : color Answer column green/red based on correctness Use the same .correct/.incorrect CSS classes on the Answer column to make correct answers green and incorrect answers red. Assisted-by: llama.cpp:local pi * llama-eval : fix pyright errors from max(..., key=len) type inference Use key=lambda x: len(x) instead of key=len so the type checker infers the return type as str instead of Sized, fixing: - unresolved-attribute: Object of type Sized has no attribute lower - not-subscriptable: Cannot subscript object of type Sized Assisted-by: llama.cpp:local pi --- examples/llama-eval/llama-eval.py | 189 ++++++++++++++---- examples/llama-eval/llama-server-simulator.py | 99 +++++++-- 2 files changed, 233 insertions(+), 55 deletions(-) diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py index e833070eee9..4bdd239c007 100755 --- a/examples/llama-eval/llama-eval.py +++ b/examples/llama-eval/llama-eval.py @@ -149,6 +149,8 @@ class TaskState: t_gen_ms: Optional[float] = None reasoning_content: Optional[str] = None server_name: Optional[str] = None + chunk_idx: int = 0 + problem_idx: int = 0 class EvalState: @@ -233,7 +235,9 @@ def add_result( tps_gen: Optional[float] = None, t_gen_ms: Optional[float] = None, reasoning_content: Optional[str] = None, - server_name: Optional[str] = None + server_name: Optional[str] = None, + chunk_idx: int = 0, + problem_idx: int = 0, ): with self._lock: if "cases" not in self.task_states: @@ -252,7 +256,9 @@ def add_result( "tps_gen": tps_gen, "t_gen_ms": t_gen_ms, "reasoning_content": reasoning_content, - "server_name": server_name + "server_name": server_name, + "chunk_idx": chunk_idx, + "problem_idx": problem_idx, } self.correct = sum(1 for c in self.task_states.get("cases", {}).values() if c.get("correct", False)) @@ -289,6 +295,9 @@ def dump(self): all_cases = {} for i, task_id in tasks_to_save: question_text, prompt, expected = self.get_case(i) + # Extract chunk_idx from task_id for pending cases + _parts = task_id.rsplit("_", 2) + _chunk_idx = int(_parts[-2]) if len(_parts) >= 3 else 0 if task_id in self.task_states.get("cases", {}): all_cases[task_id] = self.task_states["cases"][task_id] else: @@ -306,7 +315,9 @@ def dump(self): "tps_gen": None, "t_gen_ms": None, "reasoning_content": None, - "server_name": None + "server_name": None, + "chunk_idx": _chunk_idx, + "problem_idx": i, } ci_lower, ci_upper = self.accuracy_ci() @@ -382,11 +393,12 @@ def dump_html(self, tasks_to_save: List[Tuple[int, str]], all_cases: Dict[str, A grader_log_str = self._escape_html(json.dumps(grader_log, indent=2)) escaped_server = self._escape_html(server_name) + answer_class = status_class if status == "ok" else "" rows.append(f""" {task_id} {status_text} {self._escape_html(expected)} - {self._escape_html(answer)} + {self._escape_html(answer)} {tokens_str} {tps_str} {t_gen_str} @@ -405,6 +417,53 @@ def dump_html(self, tasks_to_save: List[Tuple[int, str]], all_cases: Dict[str, A rows_html = "\n".join(rows) + # ---- per-problem summary table ---- + problem_groups: Dict[int, List[Dict[str, Any]]] = {} + for _tid, _case in cases.items(): + if _case.get("status") != "ok": + continue + _pidx = _case.get("problem_idx") + if _pidx is None: + _p_parts = _tid.rsplit("_", 2) + _pidx = int(_p_parts[-1]) if len(_p_parts) >= 3 else 0 + problem_groups.setdefault(_pidx, []).append(_case) + + summary_rows_html = "" + if problem_groups: + def _stat(v, fmt=".1f", avg_fmt=None): + if not v: + return ("–", "–", "–") + af = fmt if avg_fmt is None else avg_fmt + return (f"{min(v):{fmt}}", f"{sum(v)/len(v):{af}}", f"{max(v):{fmt}}") + + summary_data = [] + for pidx, g in problem_groups.items(): + runs = len(g) + n_ok = sum(1 for c in g if c.get("correct", False)) + toks = [c["tokens"] for c in g if c.get("tokens") is not None] + tps = [c["tps_gen"] for c in g if c.get("tps_gen") is not None] + tg = [c["t_gen_ms"] / 1000 for c in g if c.get("t_gen_ms") is not None] + summary_data.append(( + pidx, runs, n_ok, + _stat(toks, "d", ".0f"), + _stat(tps), + _stat(tg), + )) + + summary_data.sort(key=lambda r: r[0]) # sort by problem index ascending + + summary_rows_html = "\n".join( + f""" + {p:03d} + {r} + {n}/{r} + {tk[0]}{tk[1]}{tk[2]} + {tp[0]}{tp[1]}{tp[2]} + {tg[0]}{tg[1]}{tg[2]} + """ + for p, r, n, tk, tp, tg in summary_data + ) + html_content = f""" @@ -412,10 +471,10 @@ def dump_html(self, tasks_to_save: List[Tuple[int, str]], all_cases: Dict[str, A {self.dataset_type.upper()} Eval
- {self.dataset_type.upper()} - Model: {self.model_name or 'N/A'} - Accuracy: {accuracy:.1f}% [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%] - Correct: {n_correct} / {len(completed)} - Pending: {n_pending} - Time: {self.total_time:.1f}s - Sampling: {sampling_str} +
Dataset
{self.dataset_type.upper()}
+
Model
{self.model_name or 'N/A'}
+
Accuracy
{accuracy:.1f}% [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%]
+
Correct
{n_correct} / {len(completed)}
+
Pending
{n_pending}
+
Time
{self.total_time:.1f}s
+
Sampling
{sampling_str}
+
+
+ + +
+
+ + + + + + + + + + + + + + + {rows_html} + +
IDGoldAnswerTokensT/sGen sServer
+
+
+ + + + + + + + + + + + + + + + + + + + + {summary_rows_html} + +
ProblemRunsCorrectTokensT/sGen s
minavgmaxminavgmaxminavgmax
- - - - - - - - - - - - - - - {rows_html} - -
IDGoldAnswerTokensT/sGen sServer
""" @@ -1062,12 +1172,19 @@ def _process_single_case( ) -> TaskState: question_text, prompt, expected = eval_state.get_case(i) + # Extract chunk_idx from task_id: "{dataset_type}_{chunk_idx:03d}_{index:03d}" + _parts = task_id.rsplit("_", 2) + chunk_idx = int(_parts[-2]) if len(_parts) >= 3 else 0 + problem_idx = i + task_state = TaskState( task_id=task_id, prompt=prompt, expected=expected, question_text=question_text, - server_name=server_config.name + server_name=server_config.name, + chunk_idx=chunk_idx, + problem_idx=problem_idx, ) try: @@ -1085,7 +1202,8 @@ def _process_single_case( eval_state.add_result( task_id, prompt, expected, result, None, {"finish_reason": finish_reason}, False, task_state.status, - tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name + tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name, + chunk_idx, problem_idx, ) eval_state.dump() return task_state @@ -1108,7 +1226,8 @@ def _process_single_case( eval_state.add_result( task_id, prompt, expected, result, answer, grader_log, is_correct, "ok", - tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name + tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name, + chunk_idx, problem_idx, ) eval_state.dump() diff --git a/examples/llama-eval/llama-server-simulator.py b/examples/llama-eval/llama-server-simulator.py index 2f9cdc5450d..e64ba89335d 100755 --- a/examples/llama-eval/llama-server-simulator.py +++ b/examples/llama-eval/llama-server-simulator.py @@ -65,34 +65,70 @@ def normalize_number(s: str) -> Optional[int]: return int(match.group(0)) class AimeDataset: - def __init__(self, split: str = "train"): + def __init__(self, split: str = "train", dataset_type: str = "aime"): self.split = split + self.dataset_type = dataset_type self.questions: List[Dict] = [] self._load_dataset() - def _load_dataset(self): - print(f"Loading AIME dataset (split: {self.split})...") + def _get_question_text(self, question: Dict) -> str: + """Get question text, handling different dataset field names.""" + return question.get("problem", question.get("question", "")) - cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0" - if cache_path.exists(): - print(f"Using cached dataset from {cache_path}") - ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path)) + def _load_dataset(self): + if self.dataset_type == "aime": + print(f"Loading AIME dataset (split: {self.split})...") + cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path)) + else: + ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split) + elif self.dataset_type == "aime2025": + print(f"Loading AIME2025 dataset...") + ds_list = [] + for config_name in ["AIME2025-I", "AIME2025-II"]: + cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "opencompass___AIME2025" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = datasets.load_dataset("opencompass/AIME2025", config_name, split="test", cache_dir=str(cache_path)) + else: + ds = datasets.load_dataset("opencompass/AIME2025", config_name, split="test") + ds_list.extend(ds) + ds = ds_list else: - ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split) + raise ValueError(f"Unknown dataset type: {self.dataset_type}") self.questions = list(ds) - print(f"AIME dataset loaded: {len(self.questions)} questions") + print(f"{self.dataset_type} dataset loaded: {len(self.questions)} questions") def find_question(self, request_text: str) -> Optional[Dict]: + # Strip common template prefixes to get the actual question text + # Templates include things like "Solve the following math problem step by step..." + # The actual question usually follows a blank line or after the template instruction + cleaned = request_text + # Split on double newline and take the part that looks like the problem + parts = cleaned.split('\n\n') + if len(parts) > 1: + # Find the part that's longest (likely the actual problem text) + problem_parts = [p for p in parts if len(p.strip()) > 100] + if problem_parts: + cleaned = max(problem_parts, key=lambda x: len(x)) + best_match = None best_distance = -1 best_index = -1 for i, question in enumerate(self.questions): - question_text = question["problem"] - request_lower = request_text.lower() + question_text = self._get_question_text(question) + request_lower = cleaned.lower() question_lower = question_text.lower() + # Check if question text is contained in the cleaned request + if question_lower in request_lower or request_lower in question_lower: + debug_log(f"DEBUG: Found substring match at index {i}") + return question + # Exact match if question_lower == request_lower: debug_log(f"DEBUG: Found exact match at index {i}") @@ -118,7 +154,7 @@ def find_question(self, request_text: str) -> Optional[Dict]: debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}") return best_match - debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...") + debug_log(f"DEBUG: No matching question found for cleaned: {cleaned[:100]}...") return None def get_answer(self, question: Dict) -> str: @@ -134,15 +170,16 @@ def __init__( port: int = 8033, host: str = "localhost", success_rate: float = 0.8, - dataset_split: str = "train" + dataset_split: str = "train", + dataset_type: str = "aime" ): self.port = port self.host = host self.success_rate = success_rate - self.dataset = AimeDataset(dataset_split) + self.dataset = AimeDataset(dataset_split, dataset_type) self.eval_state = EvalState( - id="aime-2025", - tasks=["aime"], + id=dataset_type, + tasks=[dataset_type], task_states={}, sampling_config={"temperature": 0, "max_tokens": 2048} ) @@ -159,6 +196,10 @@ def _generate_response( else: response_text = self._generate_wrong_answer(question) + comp_tokens = random.randint(10000, 60000) + tps_gen = random.uniform(90.0, 110.0) + t_gen_ms = comp_tokens / tps_gen * 1000 + return { "id": f"chatcmpl-{int(time.time())}", "object": "chat.completion", @@ -176,8 +217,12 @@ def _generate_response( ], "usage": { "prompt_tokens": 100, - "completion_tokens": 50, - "total_tokens": 150 + "completion_tokens": comp_tokens, + "total_tokens": 100 + comp_tokens + }, + "timings": { + "predicted_ms": t_gen_ms, + "predicted_per_second": tps_gen } } @@ -218,6 +263,12 @@ def _process_request(self, request_data: Dict) -> Dict: return response class RequestHandler(BaseHTTPRequestHandler): + def do_GET(self): + if self.path == "/v1/models": + self._send_json({"data": [{"id": "llama", "object": "model"}]}, 200) + return + self._send_json({"error": "Not found"}, 404) + def do_POST(self): if self.path != "/v1/chat/completions": self._send_json({"error": "Not found"}, 404) @@ -280,6 +331,13 @@ def main(): default=0.8, help="Success rate 0-1 (default: 0.8)" ) + parser.add_argument( + "--dataset", + type=str, + default="aime", + choices=["aime", "aime2025"], + help="Dataset type (default: aime)" + ) parser.add_argument( "--dataset-split", type=str, @@ -294,7 +352,8 @@ def main(): port=args.port, host=args.host, success_rate=args.success_rate, - dataset_split=args.dataset_split + dataset_split=args.dataset_split, + dataset_type=args.dataset ) server = HTTPServer((args.host, args.port), RequestHandler) @@ -304,7 +363,7 @@ def main(): print("\n=== llama-server-simulator ===") print(f"Server running on http://{args.host}:{args.port}") print(f"Success rate: {args.success_rate}") - print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions") + print(f"{args.dataset} dataset loaded: {len(simulator.dataset.questions)} questions") print("\nPress Ctrl+C to stop\n") try: From cd963fee6a86387d598ebe3888017376d6e9e8f6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 May 2026 09:46:34 +0300 Subject: [PATCH 069/309] save-load-state : refactor tests and improve readability (#23196) * save-load-state : refactor into separate phase functions - Split monolithic main() into 4 self-contained phase functions, each managing its own context/sampler/batch lifecycle - Each function tokenizes internally using its local ctx instance - main() is now a clean orchestrator: init -> run phases -> assert results - Proper resource cleanup on every exit path (return {} on error) Assisted-by: llama.cpp:local pi * save-load-state : use params.out_file instead of separate state_file - Remove state_file parameter from all phase functions - Each function accesses params.out_file directly - Initialize params.out_file in main alongside params.prompt Assisted-by: llama.cpp:local pi * save-load-state : use smart pointers for ctx and smpl - Replace raw llama_context* with llama_context_ptr - Replace raw llama_sampler* with llama_sampler_ptr - Remove all manual llama_free() and llama_sampler_free() calls - Keep llama_batch as raw (managed manually with llama_batch_free) Assisted-by: llama.cpp:local pi * save-load-state : add local llama_batch_ptr RAII wrapper - Add llama_batch_ptr struct holding llama_batch by value - Calls llama_batch_free() in destructor - Eliminates all manual llama_batch_free() calls Assisted-by: llama.cpp:local pi * save-load-state : replace printf/fprintf with logging macros - Add log.h include - Replace fprintf(stderr, ...) errors with LOG_ERR - Replace fprintf(stderr, ...) info with LOG_TRC - Replace printf output with LOG Assisted-by: llama.cpp:local pi * save-load-state : refactor tests to check results inline Each follow-up phase now accepts an expected result and performs the comparison internally instead of collecting results in main(). Assisted-by: llama.cpp:local pi * save-load-state : improve test output readability Add phase labels, remove redundant run prefixes, and show PASS after each test. Assisted-by: llama.cpp:local pi * pi : add rule about git signing * save-load-state : simplify llama_batch_ptr Change get() to return a reference and remove operator*(). Use batch.get() throughout for consistency. Assisted-by: llama.cpp:local pi * save-load-state : extract generate_tokens helper Factor out the repeated token generation loop into a shared helper function used by all phases. Assisted-by: llama.cpp:local pi * save-load-state : update comments to use test terminology Replace "Phase" with "Test" and list each test's steps as bullet points. Assisted-by: llama.cpp:local pi * save-load-state : rename test functions Rename to test_baseline, test_state_load, test_seq_cp_host, test_seq_cp_device. Update comments and logs accordingly. Assisted-by: llama.cpp:local pi * pi : add rule to never git push without confirmation Assisted-by: llama.cpp:local pi * common : add model_only option to common_init_from_params Add bool model_only parameter to skip context creation, sampler init, and context-dependent setup. Use in save-load-state to initialize only the model, with each test creating its own context. Assisted-by: llama.cpp:local pi --------- Co-authored-by: ggerganov --- .pi/gg/SYSTEM.md | 2 + common/common.cpp | 14 +- common/common.h | 4 +- examples/save-load-state/save-load-state.cpp | 453 ++++++++++--------- 4 files changed, 254 insertions(+), 219 deletions(-) diff --git a/.pi/gg/SYSTEM.md b/.pi/gg/SYSTEM.md index 727a850b183..b7597a4c3ae 100644 --- a/.pi/gg/SYSTEM.md +++ b/.pi/gg/SYSTEM.md @@ -22,6 +22,8 @@ Pull requests (PRs): Commits: - On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag - Do not explicitly set the git author in commits - rely on the default git config +- Always use `--no-gpg-sign` when committing +- Never `git push` without explicit confirmation from the user Resources (read on demand): - [CONTRIBUTING.md](CONTRIBUTING.md) diff --git a/common/common.cpp b/common/common.cpp index 9cf11ea9f5f..aef06263e3f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1160,7 +1160,7 @@ struct common_init_result::impl { std::vector samplers_seq_config; }; -common_init_result::common_init_result(common_params & params) : +common_init_result::common_init_result(common_params & params, bool model_only) : pimpl(new impl{}) { auto mparams = common_model_params_to_llama(params); auto cparams = common_context_params_to_llama(params); @@ -1183,6 +1183,10 @@ common_init_result::common_init_result(common_params & params) : pimpl->model.reset(model); + if (model_only) { + return; + } + const llama_vocab * vocab = llama_model_get_vocab(model); // load and optionally apply lora adapters @@ -1309,8 +1313,8 @@ std::vector & common_init_result::lora() { return pimpl->lora; } -common_init_result_ptr common_init_from_params(common_params & params) { - common_init_result_ptr res(new common_init_result(params)); +common_init_result_ptr common_init_from_params(common_params & params, bool model_only) { + common_init_result_ptr res(new common_init_result(params, model_only)); llama_model * model = res->model(); if (model == NULL) { @@ -1318,6 +1322,10 @@ common_init_result_ptr common_init_from_params(common_params & params) { return res; } + if (model_only) { + return res; + } + llama_context * lctx = res->context(); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str()); diff --git a/common/common.h b/common/common.h index 1d3d788b2de..e03f7037454 100644 --- a/common/common.h +++ b/common/common.h @@ -857,7 +857,7 @@ struct common_sampler; // note: defines the model, context, samplers, ets. lifetimes struct common_init_result { - common_init_result(common_params & params); + common_init_result(common_params & params, bool model_only = false); ~common_init_result(); llama_model * model(); @@ -875,7 +875,7 @@ struct common_init_result { using common_init_result_ptr = std::unique_ptr; -common_init_result_ptr common_init_from_params(common_params & params); +common_init_result_ptr common_init_from_params(common_params & params, bool model_only = false); struct llama_model_params common_model_params_to_llama ( common_params & params); struct llama_context_params common_context_params_to_llama(const common_params & params); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index e6f5e9802ab..97ab7c6de3b 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -1,320 +1,345 @@ #include "arg.h" #include "common.h" -#include "llama.h" +#include "log.h" +#include "llama-cpp.h" #include #include -#include +struct llama_batch_ptr { + llama_batch batch; -int main(int argc, char ** argv) { - std::setlocale(LC_NUMERIC, "C"); + llama_batch_ptr(int32_t n_tokens, int32_t embd, int32_t n_seq_max) + : batch{llama_batch_init(n_tokens, embd, n_seq_max)} {} - common_params params; + ~llama_batch_ptr() { llama_batch_free(batch); } - params.prompt = "The quick brown fox"; - params.sampling.seed = 1234; + llama_batch_ptr(const llama_batch_ptr &) = delete; + llama_batch_ptr & operator=(const llama_batch_ptr &) = delete; + llama_batch_ptr(llama_batch_ptr &&) = default; + llama_batch_ptr & operator=(llama_batch_ptr &&) = default; - const std::string_view state_file = "dump_state.bin"; + llama_batch & get() { return batch; } + const llama_batch & get() const { return batch; } +}; - common_init(); +static std::string generate_tokens(llama_context * ctx, llama_sampler * smpl, int & n_past, int32_t n_predict, llama_seq_id seq_id) { + std::string result; + llama_batch_ptr batch(1, 0, 1); - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { - return 1; - } + for (int i = 0; i < n_predict; i++) { + auto next_token = llama_sampler_sample(smpl, ctx, -1); + auto next_token_str = common_token_to_piece(ctx, next_token); - if (params.n_parallel == 1) { - // the example uses 2 sequences, so when n_parallel == 1, we need to enable unified kv cache - printf("%s: n_parallel == 1, enabling unified kv cache\n", __func__); - params.kv_unified = true; - } + LOG("%s", next_token_str.c_str()); + result += next_token_str; - if (params.n_predict < 0) { - params.n_predict = 16; + common_batch_clear(batch.get()); + common_batch_add(batch.get(), next_token, n_past, {seq_id}, true); + + if (llama_decode(ctx, batch.get())) { + LOG_ERR("\n%s: failed to evaluate\n", __func__); + return {}; + } + n_past++; } - auto n_past = 0; + return result; +} - std::string result0; - std::string result1; - std::string result2; - std::string result3; +// Test 1: baseline +// - tokenize the prompt +// - decode all but the last token +// - save state to disk +// - decode the last token +// - generate n_predict tokens +static std::string test_baseline(struct llama_model * model, const struct common_params & params) { + auto ctx = llama_context_ptr{llama_init_from_model(model, common_context_params_to_llama(params))}; - // init + auto sparams = llama_sampler_chain_default_params(); + auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)}; + llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed)); - ggml_backend_load_all(); + auto tokens = common_tokenize(ctx.get(), params.prompt, true); - auto llama_init = common_init_from_params(params); + auto n_past = 0; + if (!common_prompt_batch_decode(ctx.get(), tokens, n_past, params.n_batch, params.out_file, true)) { + LOG_ERR("%s: failed to decode prompt\n", __func__); + return {}; + } - auto * model = llama_init->model(); - auto * ctx = llama_init->context(); + LOG("\n=== Test 1: baseline ===\n"); + LOG("%s", params.prompt.c_str()); - if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); - return 1; + auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 0); + if (result.empty()) { + return {}; } - auto sparams = llama_sampler_chain_default_params(); + LOG("\n"); - llama_sampler * smpl = llama_sampler_chain_init(sparams); + return result; +} - llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed)); - // tokenize prompt - auto tokens = common_tokenize(ctx, params.prompt, true); +// Test 2: state load +// - create a new context +// - load state from file +// - replay the last prompt token +// - generate n_predict tokens and compare against expected result +static bool test_state_load(struct llama_model * model, const struct common_params & params, const std::string & expected_result) { + auto ctx = llama_context_ptr{llama_init_from_model(model, common_context_params_to_llama(params))}; - const bool save_state = true; - if (!common_prompt_batch_decode(ctx, tokens, n_past, params.n_batch, state_file, save_state)) { - return 1; - } + auto sparams = llama_sampler_chain_default_params(); + auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)}; + llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed)); - // first run - printf("\nfirst run: %s", params.prompt.c_str()); + auto tokens = common_tokenize(ctx.get(), params.prompt, true); - llama_batch batch = llama_batch_init(1, 0, 1); + LOG("\n=== Test 2: state load ===\n"); + LOG("%s", params.prompt.c_str()); - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl, ctx, -1); - auto next_token_str = common_token_to_piece(ctx, next_token); + // Load state from file + std::vector unused_sts(tokens.size()); + size_t n_token_count_out = 0; - printf("%s", next_token_str.c_str()); - result0 += next_token_str; + if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { + LOG_ERR("\n%s: failed to load state\n", __func__); + return false; + } - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {0}, true); + LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out); - if (llama_decode(ctx, batch)) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); - return 1; - } - n_past += 1; + // Replay last token + int n_past = (int) n_token_count_out; + if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) { + return false; } + n_past++; + + // Generate tokens + auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 0); + if (result.empty()) { + return false; + } + + if (result != expected_result) { + LOG_ERR("\n%s: error: generation differs from expected\n", __func__); + return false; + } + + LOG("\nPASS\n"); + return true; +} - printf("\n\n"); - // make new context - llama_context * ctx2 = llama_init_from_model(model, common_context_params_to_llama(params)); +// Test 3: seq copy (host) +// - create a multi-seq context +// - load state from file +// - replay the last prompt token +// - migrate KV cache from seq 0 to seq 1 via the CPU path +// - generate n_predict tokens on seq 1 and compare against expected result +static bool test_seq_cp_host(struct llama_model * model, const struct common_params & params, const std::string & expected_result) { + auto params_ctx = common_context_params_to_llama(params); + params_ctx.n_seq_max = 2; + auto ctx = llama_context_ptr{llama_init_from_model(model, params_ctx)}; - llama_sampler * smpl2 = llama_sampler_chain_init(sparams); + auto sparams = llama_sampler_chain_default_params(); + auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)}; + llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed)); - llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed)); + auto tokens = common_tokenize(ctx.get(), params.prompt, true); - printf("\nsecond run: %s", params.prompt.c_str()); + LOG("\n=== Test 3: seq copy (host) ===\n"); + LOG("%s", params.prompt.c_str()); - // load state from file - std::vector unused_sts(tokens.size()); // unused session tokens. + // Load state from file + std::vector unused_sts(tokens.size()); size_t n_token_count_out = 0; - if (!llama_state_load_file(ctx2, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { - fprintf(stderr, "\n%s : failed to load state\n", __func__); - return 1; + if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { + LOG_ERR("\n%s: failed to load state\n", __func__); + return false; } - fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out); + LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out); - // restore state (last tokens) - n_past = n_token_count_out; - if (!common_replay_last_token(ctx2, tokens.back(), n_past)) { - return 1; + // Replay last token + int n_past = (int) n_token_count_out; + if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) { + return false; } - ++n_past; + n_past++; - // second run - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl2, ctx2, -1); - auto next_token_str = common_token_to_piece(ctx2, next_token); - - printf("%s", next_token_str.c_str()); - result1 += next_token_str; + // Migrate KV cache from seq 0 to seq 1 (CPU path) + { + std::vector seq_store(llama_state_seq_get_size(ctx.get(), 0)); + const size_t ncopy = llama_state_seq_get_data(ctx.get(), seq_store.data(), seq_store.size(), 0); + if (ncopy != seq_store.size()) { + LOG_ERR("\n%s: seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); + return false; + } + LOG_TRC("%s: seq 0 copied, %zd bytes\n", __func__, ncopy); - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {0}, true); + llama_memory_clear(llama_get_memory(ctx.get()), true); + LOG_TRC("%s: kv cache cleared\n", __func__); - if (llama_decode(ctx2, batch)) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); - return 1; + const size_t nset = llama_state_seq_set_data(ctx.get(), seq_store.data(), seq_store.size(), 1); + if (nset != seq_store.size()) { + LOG_ERR("\n%s: seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); + return false; } - n_past += 1; + LOG_TRC("%s: seq 1 restored, %zd bytes\n", __func__, nset); } - printf("\n\n"); + // Generate tokens on seq 1 + auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 1); + if (result.empty()) { + return false; + } - if (result0 != result1) { - fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__); - return 1; + if (result != expected_result) { + LOG_ERR("\n%s: error: generation differs from expected\n", __func__); + return false; } - // make new context - auto params_ctx3 = common_context_params_to_llama(params); - params_ctx3.n_seq_max = 2; - llama_context * ctx3 = llama_init_from_model(model, params_ctx3); + LOG("\nPASS\n"); + return true; +} + - llama_sampler * smpl3 = llama_sampler_chain_init(sparams); +// Test 4: seq copy (device) +// - create a multi-seq context +// - load state from file +// - replay the last prompt token +// - migrate KV cache from seq 0 to seq 1 via the on-device path +// - generate n_predict tokens on seq 1 and compare against expected result +static bool test_seq_cp_device(struct llama_model * model, const struct common_params & params, const std::string & expected_result) { + auto params_ctx = common_context_params_to_llama(params); + params_ctx.n_seq_max = 2; + auto ctx = llama_context_ptr{llama_init_from_model(model, params_ctx)}; - llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed)); + auto sparams = llama_sampler_chain_default_params(); + auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)}; + llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed)); - printf("\nsingle seq run: %s", params.prompt.c_str()); + auto tokens = common_tokenize(ctx.get(), params.prompt, true); - // load state (rng, logits, embedding and kv_cache) from file - n_token_count_out = 0; + LOG("\n=== Test 4: seq copy (device) ===\n"); + LOG("%s", params.prompt.c_str()); - if (!llama_state_load_file(ctx3, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { - fprintf(stderr, "\n%s : failed to load state\n", __func__); - return 1; + // Load state from file + std::vector unused_sts(tokens.size()); + size_t n_token_count_out = 0; + + if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { + LOG_ERR("\n%s: failed to load state\n", __func__); + return false; } - fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out); + LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out); - // restore state (last tokens) - n_past = n_token_count_out; - if (!common_replay_last_token(ctx3, tokens.back(), n_past)) { - return 1; + // Replay last token + int n_past = (int) n_token_count_out; + if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) { + return false; } - ++n_past; + n_past++; - // save seq 0 and load into seq 1 + // Migrate KV cache from seq 0 to seq 1 (on-device path) { - // save kv of seq 0 - std::vector seq_store(llama_state_seq_get_size(ctx3, 0)); - const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0); + std::vector seq_store(llama_state_seq_get_size_ext(ctx.get(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE)); + const size_t ncopy = llama_state_seq_get_data_ext(ctx.get(), seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); if (ncopy != seq_store.size()) { - fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); - return 1; + LOG_ERR("\n%s: seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); + return false; } - fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); + LOG_TRC("%s: seq 0 copied, %zd bytes\n", __func__, ncopy); - // erase whole kv - llama_memory_clear(llama_get_memory(ctx3), true); - fprintf(stderr, "%s : kv cache cleared\n", __func__); + llama_memory_clear(llama_get_memory(ctx.get()), true); + LOG_TRC("%s: kv cache cleared\n", __func__); - // restore kv into seq 1 - const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1); + const size_t nset = llama_state_seq_set_data_ext(ctx.get(), seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); if (nset != seq_store.size()) { - fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); - return 1; + LOG_ERR("\n%s: seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); + return false; } - fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset); + LOG_TRC("%s: seq 1 restored, %zd bytes\n", __func__, nset); } - // third run with seq 1 instead of 0 - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl3, ctx3, -1); - auto next_token_str = common_token_to_piece(ctx3, next_token); - - printf("%s", next_token_str.c_str()); - result2 += next_token_str; - - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {1}, true); + // Generate tokens on seq 1 + auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 1); + if (result.empty()) { + return false; + } - if (llama_decode(ctx3, batch)) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); - return 1; - } - n_past += 1; + if (result != expected_result) { + LOG_ERR("\n%s: error: generation differs from expected\n", __func__); + return false; } - // test on-device state save/load - auto params_ctx4 = common_context_params_to_llama(params); - params_ctx4.n_seq_max = 2; - llama_context * ctx4 = llama_init_from_model(model, params_ctx4); + LOG("\nPASS\n"); + return true; +} - llama_sampler * smpl4 = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl4, llama_sampler_init_dist(params.sampling.seed)); +int main(int argc, char ** argv) { + std::setlocale(LC_NUMERIC, "C"); - printf("\nsingle seq run: %s", params.prompt.c_str()); + common_params params; + params.prompt = "The quick brown fox"; + params.out_file = "dump_state.bin"; + params.sampling.seed = 1234; - // load state (rng, logits, embedding and kv_cache) from file - n_token_count_out = 0; + common_init(); - if (!llama_state_load_file(ctx4, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { - fprintf(stderr, "\n%s : failed to load state\n", __func__); + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } - fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out); - - // restore state (last tokens) - n_past = n_token_count_out; - if (!common_replay_last_token(ctx4, tokens.back(), n_past)) { - return 1; + if (params.n_parallel == 1) { + LOG_TRC("%s: n_parallel == 1, enabling unified kv cache\n", __func__); + params.kv_unified = true; } - ++n_past; - - // save seq 0 and load into seq 1 - { - // save kv of seq 0 - std::vector seq_store(llama_state_seq_get_size_ext(ctx4, 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE)); - const size_t ncopy = llama_state_seq_get_data_ext(ctx4, seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); - if (ncopy != seq_store.size()) { - fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); - return 1; - } - fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); - - // erase whole kv - llama_memory_clear(llama_get_memory(ctx4), true); - fprintf(stderr, "%s : kv cache cleared\n", __func__); - // restore kv into seq 0 - const size_t nset = llama_state_seq_set_data_ext(ctx4, seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); - if (nset != seq_store.size()) { - fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); - return 1; - } - fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset); + if (params.n_predict < 0) { + params.n_predict = 16; } - // forth run - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl4, ctx4, -1); - auto next_token_str = common_token_to_piece(ctx4, next_token); - - printf("%s", next_token_str.c_str()); - result3 += next_token_str; + ggml_backend_load_all(); - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {1}, true); + auto llama_init = common_init_from_params(params, true); + auto * model = llama_init->model(); - if (llama_decode(ctx4, batch)) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); - return 1; - } - n_past += 1; + if (model == nullptr) { + LOG_ERR("%s: failed to init\n", __func__); + return 1; } - printf("\n"); - - llama_sampler_free(smpl); - llama_sampler_free(smpl2); - llama_sampler_free(smpl3); - llama_sampler_free(smpl4); + GGML_ASSERT(llama_init->context() == nullptr); - llama_batch_free(batch); - - // this one is managed by common_init_result - //llama_free(ctx); + // Test 1: baseline (saves state to disk) + auto result_baseline = test_baseline(model, params); + if (result_baseline.empty()) { + return 1; + } - llama_free(ctx2); - llama_free(ctx3); - llama_free(ctx4); + // Test 2: state load + if (!test_state_load(model, params, result_baseline)) { + return 1; + } - if (result0 != result2) { - fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__); + // Test 3: seq copy (host) + if (!test_seq_cp_host(model, params, result_baseline)) { return 1; } - if (result0 != result3) { - fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__); + // Test 4: seq copy (device) + if (!test_seq_cp_device(model, params, result_baseline)) { return 1; } - fprintf(stderr, "\n%s : success\n", __func__); + LOG("\nAll tests passed.\n"); return 0; } From 3c81c8deeabba01fa40869325ea80d07eef75fc6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 May 2026 09:46:58 +0300 Subject: [PATCH 070/309] server : print graphs reused in slot timings (#23279) Add graphs reused counter to the per-slot timing output, printed via llama_perf_context(). Assisted-by: llama.cpp:local pi Co-authored-by: ggerganov --- tools/server/server-context.cpp | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 6b16c6b4962..88b207ad556 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -467,20 +467,26 @@ struct server_slot { const double n_gen_second = 1e3 / t_token_generation * n_decoded; SLT_INF(*this, - "\n" - "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" - " eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" + "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second); + + SLT_INF(*this, + " eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + t_token_generation, n_decoded, t_gen, n_gen_second); + + SLT_INF(*this, " total time = %10.2f ms / %5d tokens\n", - t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second, - t_token_generation, n_decoded, t_gen, n_gen_second, t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); + SLT_INF(*this, + " graphs reused = %10d\n", + llama_perf_context(ctx_tgt).n_reused); + if (n_draft_total > 0) { const float draft_ratio = (float) n_draft_accepted / n_draft_total; - SLT_CNT(*this, - "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n", - draft_ratio, n_draft_accepted, n_draft_total - ); + SLT_INF(*this, + "draft acceptance = %0.5f (%5d accepted / %5d generated)\n", + draft_ratio, n_draft_accepted, n_draft_total); } common_speculative_print_stats(spec); From ccee42642677005555b28c6ef93760e2604348e8 Mon Sep 17 00:00:00 2001 From: Pascal Date: Tue, 19 May 2026 08:49:01 +0200 Subject: [PATCH 071/309] server-context: guarantee there is at least 1 token to decode (#23280) --- tools/server/server-context.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 88b207ad556..dc3189e1705 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2589,9 +2589,9 @@ struct server_context_impl { llama_pos pos_next = slot.prompt.tokens.pos_next(n_past); // the largest pos_min required for a checkpoint to be useful - const auto pos_min_thold = std::max(0, pos_next - n_swa); + const auto pos_min_thold = std::max(0, pos_next - n_swa - 1); - if (n_past > 0 && n_past < slot.prompt.n_tokens()) { + if (n_past > 0 && n_past <= slot.prompt.n_tokens()) { const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id); if (pos_min == -1) { SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min); From 00c461ce1a9deb238eed40a8f869a72729fa3d4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 19 May 2026 09:06:56 +0200 Subject: [PATCH 072/309] ci : install server kleidiai runner dependencies (#23259) --- .github/workflows/server-self-hosted.yml | 26 ++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml index d06ad3d24c5..3522681d9d1 100644 --- a/.github/workflows/server-self-hosted.yml +++ b/.github/workflows/server-self-hosted.yml @@ -152,6 +152,32 @@ jobs: fetch-depth: 0 ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + - name: Dependencies + id: depends + run: | + set -euxo pipefail + sudo apt-get update + sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \ + apt-get install -y \ + build-essential \ + python3-venv \ + gpg \ + wget \ + time \ + git-lfs + + git lfs install + + # install the latest cmake + sudo install -d /usr/share/keyrings + wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \ + | gpg --dearmor \ + | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null + echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \ + | sudo tee /etc/apt/sources.list.d/kitware.list + sudo apt-get update + sudo apt-get install -y cmake + - name: Build id: cmake_build run: | From 4b262ab662d46fd9dd1d53671b82c09d8b0af024 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 19 May 2026 10:11:04 +0200 Subject: [PATCH 073/309] ci : install libssl-dev (#23325) --- .github/workflows/server-self-hosted.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml index 3522681d9d1..857c72a4619 100644 --- a/.github/workflows/server-self-hosted.yml +++ b/.github/workflows/server-self-hosted.yml @@ -160,6 +160,7 @@ jobs: sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \ apt-get install -y \ build-essential \ + libssl-dev \ python3-venv \ gpg \ wget \ From 6db130445d29b243ee2171efb8cd61b84a1c5322 Mon Sep 17 00:00:00 2001 From: Aleksander Grygier Date: Tue, 19 May 2026 10:16:04 +0200 Subject: [PATCH 074/309] ui: Bump packages + address build warnings (#23300) * chore: Update vulnerable packages * chore: Formatting * refactor: Update Tailwind CSS imports * ci: Use `ubuntu-latest` for Unit/E2E UI tests * chore: Bump package * fix: Add missing tag * refactor: Enums files naming --- .github/workflows/ui-ci.yml | 4 +- tools/ui/.gitignore | 2 +- tools/ui/eslint.config.js | 5 +- tools/ui/package-lock.json | 54 ++++---- tools/ui/src/app.css | 5 +- ...tAttachmentsPreviewCurrentItemVideo.svelte | 1 + .../MarkdownContent/MarkdownContent.svelte | 2 +- .../settings/SettingsChat/SettingsChat.svelte | 2 +- .../SettingsChat/SettingsChatFields.svelte | 2 +- tools/ui/src/lib/constants/mcp.ts | 2 +- .../ui/src/lib/constants/settings-registry.ts | 4 +- .../src/lib/constants/supported-file-types.ts | 2 +- tools/ui/src/lib/constants/tools.ts | 2 +- .../enums/{agentic.ts => agentic.enums.ts} | 0 .../{attachment.ts => attachment.enums.ts} | 0 .../src/lib/enums/{chat.ts => chat.enums.ts} | 0 .../lib/enums/{files.ts => files.enums.ts} | 0 tools/ui/src/lib/enums/index.ts | 22 ++-- .../enums/{keyboard.ts => keyboard.enums.ts} | 0 .../ui/src/lib/enums/{mcp.ts => mcp.enums.ts} | 0 .../lib/enums/{model.ts => model.enums.ts} | 0 .../lib/enums/{server.ts => server.enums.ts} | 0 .../enums/{settings.ts => settings.enums.ts} | 0 .../lib/enums/{tools.ts => tools.enums.ts} | 0 tools/ui/src/lib/enums/{ui.ts => ui.enums.ts} | 0 tools/ui/src/lib/types/mcp.d.ts | 4 +- tools/ui/src/routes/+layout.svelte | 29 ++-- tools/ui/vitest-setup-client.ts | 124 +++++++++--------- tools/ui/vitest.shims.d.ts | 1 + 29 files changed, 138 insertions(+), 129 deletions(-) rename tools/ui/src/lib/enums/{agentic.ts => agentic.enums.ts} (100%) rename tools/ui/src/lib/enums/{attachment.ts => attachment.enums.ts} (100%) rename tools/ui/src/lib/enums/{chat.ts => chat.enums.ts} (100%) rename tools/ui/src/lib/enums/{files.ts => files.enums.ts} (100%) rename tools/ui/src/lib/enums/{keyboard.ts => keyboard.enums.ts} (100%) rename tools/ui/src/lib/enums/{mcp.ts => mcp.enums.ts} (100%) rename tools/ui/src/lib/enums/{model.ts => model.enums.ts} (100%) rename tools/ui/src/lib/enums/{server.ts => server.enums.ts} (100%) rename tools/ui/src/lib/enums/{settings.ts => settings.enums.ts} (100%) rename tools/ui/src/lib/enums/{tools.ts => tools.enums.ts} (100%) rename tools/ui/src/lib/enums/{ui.ts => ui.enums.ts} (100%) create mode 100644 tools/ui/vitest.shims.d.ts diff --git a/.github/workflows/ui-ci.yml b/.github/workflows/ui-ci.yml index 7f6f467ddaa..761a9319414 100644 --- a/.github/workflows/ui-ci.yml +++ b/.github/workflows/ui-ci.yml @@ -41,7 +41,7 @@ jobs: ui-checks: name: UI Checks needs: ui-build - runs-on: ubuntu-slim + runs-on: ubuntu-latest continue-on-error: true steps: - name: Checkout code @@ -93,7 +93,7 @@ jobs: e2e-tests: name: E2E Tests needs: ui-build - runs-on: ubuntu-slim + runs-on: ubuntu-latest steps: - name: Checkout code uses: actions/checkout@v6 diff --git a/tools/ui/.gitignore b/tools/ui/.gitignore index 051d884b08e..22ed6125f4c 100644 --- a/tools/ui/.gitignore +++ b/tools/ui/.gitignore @@ -25,4 +25,4 @@ vite.config.ts.timestamp-* *storybook.log storybook-static -*.code-workspace \ No newline at end of file +*.code-workspace diff --git a/tools/ui/eslint.config.js b/tools/ui/eslint.config.js index 185da1dabbe..4ed9dd7ca3a 100644 --- a/tools/ui/eslint.config.js +++ b/tools/ui/eslint.config.js @@ -20,9 +20,7 @@ export default ts.config( prettier, ...svelte.configs.prettier, { - languageOptions: { - globals: { ...globals.browser, ...globals.node } - }, + languageOptions: { globals: { ...globals.browser, ...globals.node } }, rules: { // typescript-eslint strongly recommend that you do not use the no-undef lint rule on TypeScript projects. // see: https://typescript-eslint.io/troubleshooting/faqs/eslint/#i-get-errors-from-the-no-undef-rule-about-global-variables-not-being-defined-even-though-there-are-no-typescript-errors @@ -30,6 +28,7 @@ export default ts.config( 'svelte/no-at-html-tags': 'off', // This app uses hash-based routing (#/) where resolve() from $app/paths does not apply 'svelte/no-navigation-without-resolve': 'off', + // Enforce empty line at end of file 'eol-last': 'error' } diff --git a/tools/ui/package-lock.json b/tools/ui/package-lock.json index 3686eb3261e..4d012c81990 100644 --- a/tools/ui/package-lock.json +++ b/tools/ui/package-lock.json @@ -2307,9 +2307,9 @@ } }, "node_modules/@sveltejs/kit": { - "version": "2.59.1", - "resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.59.1.tgz", - "integrity": "sha512-d8OON70AphLdDesuTIl//M2O6fRTIicX8aYv8vhCiYEhTTI2OboKqey0Hu1A4VFhqwgqtq0vKDmPFGkw8kKmgw==", + "version": "2.60.1", + "resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.60.1.tgz", + "integrity": "sha512-mQjlkNo+rJvpln7V2IGY2j99BqhcFbS4UN0AQNKNYfhBAFZTuCDAdW3a1sgf330mvtNvsBXn3HpAhcmvdJTcIQ==", "dev": true, "license": "MIT", "dependencies": { @@ -2318,7 +2318,7 @@ "@types/cookie": "^0.6.0", "acorn": "^8.14.1", "cookie": "^0.6.0", - "devalue": "^5.6.4", + "devalue": "^5.8.1", "esm-env": "^1.2.2", "kleur": "^4.1.5", "magic-string": "^0.30.5", @@ -4296,9 +4296,9 @@ } }, "node_modules/devalue": { - "version": "5.6.4", - "resolved": "https://registry.npmjs.org/devalue/-/devalue-5.6.4.tgz", - "integrity": "sha512-Gp6rDldRsFh/7XuouDbxMH3Mx8GMCcgzIb1pDTvNyn8pZGQ22u+Wa+lGV9dQCltFQ7uVw0MhRyb8XDskNFOReA==", + "version": "5.8.1", + "resolved": "https://registry.npmjs.org/devalue/-/devalue-5.8.1.tgz", + "integrity": "sha512-4CXDYRBGqN+57wVJkuXBYmpAVUSg3L6JAQa/DFqm238G73E1wuyc/JhGQJzN7vUf/CMphYau2zXbfWzDR5aTEw==", "license": "MIT" }, "node_modules/devlop": { @@ -4856,12 +4856,12 @@ } }, "node_modules/express-rate-limit": { - "version": "8.5.0", - "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.0.tgz", - "integrity": "sha512-XKhFohWaSBdVJNTi5TaHziqnPkv04I9UQV6q1Wy7Ui6GGQZVW12ojDFwqer14EvCXxjvPG0CyWXx7cAXpALB4Q==", + "version": "8.5.2", + "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.2.tgz", + "integrity": "sha512-5Kb34ipNX694DH48vN9irak1Qx30nb0PLYHXfJgw4YEjiC3ZEmZJhwOp+VfiCYwFzvFTdB9QkArYS5kXa2cx2A==", "license": "MIT", "dependencies": { - "ip-address": "10.1.0" + "ip-address": "^10.2.0" }, "engines": { "node": ">= 16" @@ -4909,9 +4909,9 @@ "license": "MIT" }, "node_modules/fast-uri": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz", - "integrity": "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==", + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.2.tgz", + "integrity": "sha512-rVjf7ArG3LTk+FS6Yw81V1DLuZl1bRbNrev6Tmd/9RaroeeRRJhAt7jg/6YFxbvAQXUCavSoZhPPj6oOx+5KjQ==", "funding": [ { "type": "github", @@ -5541,9 +5541,9 @@ } }, "node_modules/hono": { - "version": "4.12.14", - "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.14.tgz", - "integrity": "sha512-am5zfg3yu6sqn5yjKBNqhnTX7Cv+m00ox+7jbaKkrLMRJ4rAdldd1xPd/JzbBWspqaQv6RSTrgFN95EsfhC+7w==", + "version": "4.12.19", + "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.19.tgz", + "integrity": "sha512-xa3eYXYXx68XTT4hZ7dRzsXBhaq85ToSrlUJNoR0gwz/1Ap/CNwX47wfvV7pc/xWhjKVVkLT7zBJy8chhNguqQ==", "license": "MIT", "engines": { "node": ">=16.9.0" @@ -5722,9 +5722,9 @@ "license": "MIT" }, "node_modules/ip-address": { - "version": "10.1.0", - "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz", - "integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==", + "version": "10.2.0", + "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.2.0.tgz", + "integrity": "sha512-/+S6j4E9AHvW9SWMSEY9Xfy66O5PWvVEJ08O0y5JGyEKQpojb0K0GKpz/v5HJ/G0vi3D2sjGK78119oXZeE0qA==", "license": "MIT", "engines": { "node": ">= 12" @@ -9245,9 +9245,9 @@ } }, "node_modules/svelte": { - "version": "5.55.1", - "resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.1.tgz", - "integrity": "sha512-QjvU7EFemf6mRzdMGlAFttMWtAAVXrax61SZYHdkD6yoVGQ89VeyKfZD4H1JrV1WLmJBxWhFch9H6ig/87VGjw==", + "version": "5.55.7", + "resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.7.tgz", + "integrity": "sha512-ymI5ykLPwIHW839E053FQbI1G+jnRFJEw3Kv5Y4njixVWywQBx+NUFpkkKyk5LIb36Fg9DVXSYpqiGekLD0hyw==", "license": "MIT", "dependencies": { "@jridgewell/remapping": "^2.3.4", @@ -9259,7 +9259,7 @@ "aria-query": "5.3.1", "axobject-query": "^4.1.0", "clsx": "^2.1.1", - "devalue": "^5.6.4", + "devalue": "^5.8.1", "esm-env": "^1.2.1", "esrap": "^2.2.4", "is-reference": "^3.0.3", @@ -10606,9 +10606,9 @@ "license": "ISC" }, "node_modules/ws": { - "version": "8.18.3", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz", - "integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==", + "version": "8.20.1", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.20.1.tgz", + "integrity": "sha512-It4dO0K5v//JtTXuPkfEOaI3uUN87iYPnqo/ZzqCoG3g8uhA66QUMs/SrM0YK7/NAu+r4LMh/9dq2A7k+rHs+w==", "dev": true, "license": "MIT", "engines": { diff --git a/tools/ui/src/app.css b/tools/ui/src/app.css index d6dc6670c0c..29b1d3c640b 100644 --- a/tools/ui/src/app.css +++ b/tools/ui/src/app.css @@ -1,6 +1,7 @@ @import 'tailwindcss'; -@source "."; - +@source '.'; +@plugin '@tailwindcss/forms'; +@plugin '@tailwindcss/typography'; @import 'tw-animate-css'; @custom-variant dark (&:is(.dark *)); diff --git a/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemVideo.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemVideo.svelte index 4ebbd592280..62040b36f9d 100644 --- a/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemVideo.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemVideo.svelte @@ -15,6 +15,7 @@ {#if videoSrc} {:else} diff --git a/tools/ui/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte b/tools/ui/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte index 3a11854b6e4..0412414ae39 100644 --- a/tools/ui/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte +++ b/tools/ui/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte @@ -28,7 +28,7 @@ SETTINGS_KEYS } from '$lib/constants'; import { ColorMode, UrlProtocol } from '$lib/enums'; - import { FileTypeText } from '$lib/enums/files'; + import { FileTypeText } from '$lib/enums/files.enums'; import { highlightCode, detectIncompleteCodeBlock, type IncompleteCodeBlock } from '$lib/utils'; import '$styles/katex-custom.scss'; import githubDarkCss from 'highlight.js/styles/github-dark.css?inline'; diff --git a/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte b/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte index 109c8ff9dac..d017fe20469 100644 --- a/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte +++ b/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte @@ -17,7 +17,7 @@ } from '$lib/constants'; import { RouterService } from '$lib/services/router.service'; import { setMode } from 'mode-watcher'; - import { ColorMode } from '$lib/enums/ui'; + import { ColorMode } from '$lib/enums/ui.enums'; import { fade } from 'svelte/transition'; import { goto } from '$app/navigation'; import { page } from '$app/state'; diff --git a/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte b/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte index 069855eebef..7c1c5c89776 100644 --- a/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte +++ b/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte @@ -6,7 +6,7 @@ import * as Select from '$lib/components/ui/select'; import { Textarea } from '$lib/components/ui/textarea'; import { SETTING_CONFIG_INFO, SETTINGS_KEYS } from '$lib/constants'; - import { SettingsFieldType } from '$lib/enums/settings'; + import { SettingsFieldType } from '$lib/enums/settings.enums'; import { settingsStore } from '$lib/stores/settings.svelte'; import { serverStore } from '$lib/stores/server.svelte'; import { modelsStore, selectedModelName, propsCacheVersion } from '$lib/stores/models.svelte'; diff --git a/tools/ui/src/lib/constants/mcp.ts b/tools/ui/src/lib/constants/mcp.ts index 19bdd92ea75..918eb9f94b7 100644 --- a/tools/ui/src/lib/constants/mcp.ts +++ b/tools/ui/src/lib/constants/mcp.ts @@ -2,7 +2,7 @@ import { Zap, Globe, Radio } from '@lucide/svelte'; import { MCPTransportType } from '$lib/enums'; import type { ClientCapabilities, Implementation } from '$lib/types'; import type { Component } from 'svelte'; -import { MimeTypeImage } from '$lib/enums/files'; +import { MimeTypeImage } from '$lib/enums/files.enums'; export const DEFAULT_CLIENT_VERSION = '1.0.0'; export const MCP_CLIENT_NAME = 'llama-ui-mcp'; diff --git a/tools/ui/src/lib/constants/settings-registry.ts b/tools/ui/src/lib/constants/settings-registry.ts index bdbb17d962c..93b3cd5edb5 100644 --- a/tools/ui/src/lib/constants/settings-registry.ts +++ b/tools/ui/src/lib/constants/settings-registry.ts @@ -1,5 +1,5 @@ -import { ColorMode } from '$lib/enums/ui'; -import { SettingsFieldType } from '$lib/enums/settings'; +import { ColorMode } from '$lib/enums/ui.enums'; +import { SettingsFieldType } from '$lib/enums/settings.enums'; import { SyncableParameterType } from '$lib/enums'; import { Funnel, diff --git a/tools/ui/src/lib/constants/supported-file-types.ts b/tools/ui/src/lib/constants/supported-file-types.ts index 34505438916..4141161548d 100644 --- a/tools/ui/src/lib/constants/supported-file-types.ts +++ b/tools/ui/src/lib/constants/supported-file-types.ts @@ -18,7 +18,7 @@ import { MimeTypeApplication, MimeTypeText } from '$lib/enums'; -import { FileExtensionVideo, FileTypeVideo } from '$lib/enums/files'; +import { FileExtensionVideo, FileTypeVideo } from '$lib/enums/files.enums'; // File type configuration using enums export const AUDIO_FILE_TYPES = { diff --git a/tools/ui/src/lib/constants/tools.ts b/tools/ui/src/lib/constants/tools.ts index 22b22309c39..efc3476cd79 100644 --- a/tools/ui/src/lib/constants/tools.ts +++ b/tools/ui/src/lib/constants/tools.ts @@ -1,4 +1,4 @@ -import { ToolSource } from '$lib/enums/tools'; +import { ToolSource } from '$lib/enums/tools.enums'; export const TOOL_GROUP_LABELS = { [ToolSource.BUILTIN]: 'Built-in', diff --git a/tools/ui/src/lib/enums/agentic.ts b/tools/ui/src/lib/enums/agentic.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/agentic.ts rename to tools/ui/src/lib/enums/agentic.enums.ts diff --git a/tools/ui/src/lib/enums/attachment.ts b/tools/ui/src/lib/enums/attachment.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/attachment.ts rename to tools/ui/src/lib/enums/attachment.enums.ts diff --git a/tools/ui/src/lib/enums/chat.ts b/tools/ui/src/lib/enums/chat.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/chat.ts rename to tools/ui/src/lib/enums/chat.enums.ts diff --git a/tools/ui/src/lib/enums/files.ts b/tools/ui/src/lib/enums/files.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/files.ts rename to tools/ui/src/lib/enums/files.enums.ts diff --git a/tools/ui/src/lib/enums/index.ts b/tools/ui/src/lib/enums/index.ts index 3cf81286bc1..a17cca1d8e1 100644 --- a/tools/ui/src/lib/enums/index.ts +++ b/tools/ui/src/lib/enums/index.ts @@ -4,9 +4,9 @@ export { AttachmentItemEnabledWhen, AttachmentAction, AttachmentItemVisibleWhen -} from './attachment'; +} from './attachment.enums'; -export { AgenticSectionType, ToolCallType } from './agentic'; +export { AgenticSectionType, ToolCallType } from './agentic.enums'; export { ChatMessageStatsView, @@ -17,7 +17,7 @@ export { MessageType, PdfViewMode, ReasoningFormat -} from './chat'; +} from './chat.enums'; export { FileTypeCategory, @@ -38,7 +38,7 @@ export { MimeTypeImage, MimeTypeText, SpecialFileType -} from './files'; +} from './files.enums'; export { MCPConnectionPhase, @@ -48,16 +48,16 @@ export { MCPContentType, MCPRefType, JsonSchemaType -} from './mcp'; +} from './mcp.enums'; -export { ModelModality } from './model'; +export { ModelModality } from './model.enums'; -export { ServerRole, ServerModelStatus } from './server'; +export { ServerRole, ServerModelStatus } from './server.enums'; -export { ParameterSource, SyncableParameterType, SettingsFieldType } from './settings'; +export { ParameterSource, SyncableParameterType, SettingsFieldType } from './settings.enums'; -export { ColorMode, HtmlInputType, McpPromptVariant, TooltipSide, UrlProtocol } from './ui'; +export { ColorMode, HtmlInputType, McpPromptVariant, TooltipSide, UrlProtocol } from './ui.enums'; -export { KeyboardKey } from './keyboard'; +export { KeyboardKey } from './keyboard.enums'; -export { ToolSource, ToolPermissionDecision, ToolResponseField } from './tools'; +export { ToolSource, ToolPermissionDecision, ToolResponseField } from './tools.enums'; diff --git a/tools/ui/src/lib/enums/keyboard.ts b/tools/ui/src/lib/enums/keyboard.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/keyboard.ts rename to tools/ui/src/lib/enums/keyboard.enums.ts diff --git a/tools/ui/src/lib/enums/mcp.ts b/tools/ui/src/lib/enums/mcp.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/mcp.ts rename to tools/ui/src/lib/enums/mcp.enums.ts diff --git a/tools/ui/src/lib/enums/model.ts b/tools/ui/src/lib/enums/model.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/model.ts rename to tools/ui/src/lib/enums/model.enums.ts diff --git a/tools/ui/src/lib/enums/server.ts b/tools/ui/src/lib/enums/server.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/server.ts rename to tools/ui/src/lib/enums/server.enums.ts diff --git a/tools/ui/src/lib/enums/settings.ts b/tools/ui/src/lib/enums/settings.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/settings.ts rename to tools/ui/src/lib/enums/settings.enums.ts diff --git a/tools/ui/src/lib/enums/tools.ts b/tools/ui/src/lib/enums/tools.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/tools.ts rename to tools/ui/src/lib/enums/tools.enums.ts diff --git a/tools/ui/src/lib/enums/ui.ts b/tools/ui/src/lib/enums/ui.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/ui.ts rename to tools/ui/src/lib/enums/ui.enums.ts diff --git a/tools/ui/src/lib/types/mcp.d.ts b/tools/ui/src/lib/types/mcp.d.ts index 7aa050cdfa7..2a292614203 100644 --- a/tools/ui/src/lib/types/mcp.d.ts +++ b/tools/ui/src/lib/types/mcp.d.ts @@ -1,5 +1,5 @@ -import type { MCPConnectionPhase, MCPLogLevel, HealthCheckStatus } from '$lib/enums/mcp'; -import type { ToolSource } from '$lib/enums/tools'; +import type { MCPConnectionPhase, MCPLogLevel, HealthCheckStatus } from '$lib/enums/mcp.enums'; +import type { ToolSource } from '$lib/enums/tools.enums'; import type { Client, ClientCapabilities as SDKClientCapabilities, diff --git a/tools/ui/src/routes/+layout.svelte b/tools/ui/src/routes/+layout.svelte index b35d20a5cd5..78227df3ce7 100644 --- a/tools/ui/src/routes/+layout.svelte +++ b/tools/ui/src/routes/+layout.svelte @@ -7,11 +7,13 @@ import { untrack } from 'svelte'; import { onMount } from 'svelte'; import { fade } from 'svelte/transition'; + import { DesktopIconStrip, DialogConversationTitleUpdate, SidebarNavigation } from '$lib/components/app'; + import { conversationsStore } from '$lib/stores/conversations.svelte'; import * as Sidebar from '$lib/components/ui/sidebar/index.js'; import * as Tooltip from '$lib/components/ui/tooltip'; @@ -30,26 +32,29 @@ import { conversations } from '$lib/stores/conversations.svelte'; let { children } = $props(); - let alwaysShowSidebarOnDesktop = $derived(config().alwaysShowSidebarOnDesktop); let isMobile = new IsMobile(); let isDesktop = $derived(!isMobile.current); let sidebarOpen = $state(false); let mounted = $state(false); let innerHeight = $state(); + let chatSidebar: - | { activateSearchMode?: () => void; editActiveConversation?: () => void } + | { + activateSearchMode?: () => void; + editActiveConversation?: () => void; + } | undefined = $state(); let titleUpdateDialogOpen = $state(false); let titleUpdateCurrentTitle = $state(''); let titleUpdateNewTitle = $state(''); let titleUpdateResolve: ((value: boolean) => void) | null = null; - const panelNav = useSettingsNavigation(); function navigateToConversation(direction: -1 | 1) { const allConvs = conversations(); + if (allConvs.length === 0) return; const currentId = page.params.id; @@ -61,6 +66,7 @@ } const idx = allConvs.findIndex((c) => c.id === currentId); + if (idx === -1) return; const targetIdx = idx + direction; @@ -75,9 +81,7 @@ // Global keyboard shortcuts const { handleKeydown } = useKeyboardShortcuts({ editActiveConversation: () => chatSidebar?.editActiveConversation?.(), - navigateToPrevConversation: () => navigateToConversation(-1), - navigateToNextConversation: () => navigateToConversation(1) }); @@ -139,6 +143,7 @@ $effect(() => { if (alwaysShowSidebarOnDesktop && isDesktop) { sidebarOpen = true; + return; } }); @@ -175,6 +180,7 @@ // Only fetch router models once when we have models loaded and in router mode if (isRouter && modelsCount > 0 && !routerModelsFetched) { routerModelsFetched = true; + untrack(() => { modelsStore.fetchRouterModels(); }); @@ -223,7 +229,6 @@ -
- - - + {#if !(alwaysShowSidebarOnDesktop && isDesktop) && !(panelNav.isSettingsRoute && !isDesktop)} {#if mounted} @@ -266,9 +271,9 @@ /> {/if} - - {@render children?.()} - + {@render children?.()}
diff --git a/tools/ui/vitest-setup-client.ts b/tools/ui/vitest-setup-client.ts index 0b753db02b1..90994442eb2 100644 --- a/tools/ui/vitest-setup-client.ts +++ b/tools/ui/vitest-setup-client.ts @@ -9,70 +9,72 @@ import { beforeEach, vi } from 'vitest'; beforeEach(() => { const originalFetch = globalThis.fetch; - vi.spyOn(globalThis, 'fetch').mockImplementation(async (input: RequestInfo | URL, init?: RequestInit) => { - const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url; + vi.spyOn(globalThis, 'fetch').mockImplementation( + async (input: RequestInfo | URL, init?: RequestInit) => { + const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url; - // Mock server props endpoint - if (url.includes('/server')) { - return new Response( - JSON.stringify({ - mode: 'router', - version: 'test', - git_commit: 'test', - git_branch: 'test' - }), - { status: 200, headers: { 'Content-Type': 'application/json' } } - ); - } + // Mock server props endpoint + if (url.includes('/server')) { + return new Response( + JSON.stringify({ + mode: 'router', + version: 'test', + git_commit: 'test', + git_branch: 'test' + }), + { status: 200, headers: { 'Content-Type': 'application/json' } } + ); + } - // Mock models list endpoint - if (/\/v1\/models|\/models\b/.test(url)) { - return new Response( - JSON.stringify({ - object: 'list', - data: [ - { - id: 'test-model.gguf', - object: 'model', - owned_by: 'llamacpp', - created: 0, - in_cache: false, - path: 'models/test-model.gguf', - status: { value: 'unloaded' }, - meta: {} - } - ], - models: [ - { - model: 'test-model.gguf', - name: 'Test Model', - details: {} - } - ] - }), - { status: 200, headers: { 'Content-Type': 'application/json' } } - ); - } + // Mock models list endpoint + if (/\/v1\/models|\/models\b/.test(url)) { + return new Response( + JSON.stringify({ + object: 'list', + data: [ + { + id: 'test-model.gguf', + object: 'model', + owned_by: 'llamacpp', + created: 0, + in_cache: false, + path: 'models/test-model.gguf', + status: { value: 'unloaded' }, + meta: {} + } + ], + models: [ + { + model: 'test-model.gguf', + name: 'Test Model', + details: {} + } + ] + }), + { status: 200, headers: { 'Content-Type': 'application/json' } } + ); + } - // Mock /props endpoint (used for modalities) - if (url.includes('/props')) { - return new Response( - JSON.stringify({ - default_generation_settings: { n_ctx: 2048 } - }), - { status: 200, headers: { 'Content-Type': 'application/json' } } - ); - } + // Mock /props endpoint (used for modalities) + if (url.includes('/props')) { + return new Response( + JSON.stringify({ + default_generation_settings: { n_ctx: 2048 } + }), + { status: 200, headers: { 'Content-Type': 'application/json' } } + ); + } - // Mock /tools endpoint (used for built-in tools list) - if (url.includes('/tools')) { - return new Response(JSON.stringify([]), { - status: 200, - headers: { 'Content-Type': 'application/json' } - }); - } + // Mock /tools endpoint (used for built-in tools list) + if (url.includes('/tools')) { + return new Response(JSON.stringify([]), { + status: 200, + headers: { 'Content-Type': 'application/json' } + }); + } - // Default: use real fetch - return originalFetch(input, init); - }); + // Default: use real fetch + return originalFetch(input, init); + } + ); }); diff --git a/tools/ui/vitest.shims.d.ts b/tools/ui/vitest.shims.d.ts new file mode 100644 index 00000000000..03b1801a60c --- /dev/null +++ b/tools/ui/vitest.shims.d.ts @@ -0,0 +1 @@ +/// From d14ce3dab4de197adec5166faa54ac5db8262f26 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 May 2026 15:32:58 +0300 Subject: [PATCH 075/309] llama : MTP clean-up (#23269) * llama : disable equal splits for recurrent memory with partial rollback * spec : re-enable p-min with MTP drafts * spec : re-enable ngram spec in combination with RS rollback * spec : fix ngram-map-* params * spec : fix acceptance logic in combined ngram + draft configs * graph : fix reuse for combined `token` + `embd` batches * spec : log parameters for each speculative implementation - add LOG_INF in each constructor with implementation type and parameters - extract device string logic into common_speculative_get_devices_str() - move 'adding speculative implementation' log from init into constructors Assisted-by: llama.cpp:local pi * spec : extend --spec-default with ngram-map-k4v Assisted-by: llama.cpp:local pi * minor : fix n_embd log * args : update draft.n_max == 3 + regen docs * spec : relax ngram-mod rejection thold to 0.25 @ 5 low * logs : improve * docs : update speculative decoding CLI argument documentation - Add missing draft model CPU scheduling and tensor override parameters - Update --spec-type to include all available types (excluding draft-eagle3 WIP) - Fix default values to match implementation (n_max=3, n_min=0, p_min=0.0) - Remove deprecated options (spec-draft-ctx-size, spec-draft-replace) - Add environment variables for new parameters Assisted-by: llama.cpp:local pi * arg : step-back on adding k4v to the default spec config * cont : fix name --- common/arg.cpp | 18 +++- common/common.cpp | 23 ----- common/common.h | 8 +- common/ngram-map.cpp | 2 +- common/speculative.cpp | 150 +++++++++++++++++++++++-------- docs/speculative.md | 77 +++++++++++++--- src/llama-graph.h | 3 +- src/llama-memory-hybrid-iswa.cpp | 12 ++- src/llama-memory-hybrid.cpp | 12 ++- src/llama-memory-recurrent.cpp | 12 ++- src/llama-memory-recurrent.h | 1 + src/models/delta-net-base.cpp | 97 ++++++++++++-------- src/models/models.h | 3 - tools/cli/README.md | 4 +- tools/server/README.md | 5 +- 15 files changed, 293 insertions(+), 134 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index ab23b77e021..13dfd413562 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -536,7 +536,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); } if (!seen_args.insert(arg).second) { - LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str()); + const bool skip = (arg == "--spec-type"); + + if (!skip) { + LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str()); + } } auto & tmp = arg_to_options[arg]; auto opt = *tmp.first; @@ -893,7 +897,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::mapsamplers_seq_config.size(); } - // [TAG_RS_STATE_ROLLBACK_SUPPORT] - // TODO: ngram speculative methods require checkpointing in addition to partial RS rollback - // currently this is not supported. so we disable the partial rollback - if (cparams.n_rs_seq > 0 && (llama_model_is_recurrent(model) || llama_model_is_hybrid(model))) { - auto & types = params.speculative.types; - - for (int i = 0; i < (int) types.size(); i++) { - if (types[i] == COMMON_SPECULATIVE_TYPE_NONE) { - continue; - } - if (types[i] == COMMON_SPECULATIVE_TYPE_DRAFT_MTP) { - continue; - } - - cparams.n_rs_seq = 0; - - LOG_WRN("%s: recurrent state rollback is not compatible with '%s' - disabling rollback support\n", __func__, - common_speculative_type_to_str(types[i]).c_str()); - - break; - } - } - llama_context * lctx = llama_init_from_model(model, cparams); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str()); diff --git a/common/common.h b/common/common.h index e03f7037454..53c689bc11d 100644 --- a/common/common.h +++ b/common/common.h @@ -299,11 +299,11 @@ struct common_params_model { // draft-model-based speculative decoding parameters struct common_params_speculative_draft { - int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding - int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding + int32_t n_max = 3; // maximum number of tokens to draft during speculative decoding + int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding - float p_split = 0.1f; // speculative decoding split probability - float p_min = 0.75f; // minimum speculative decoding probability (greedy) // TODO: change default to 0.0f + float p_split = 0.1f; // speculative decoding split probability + float p_min = 0.0f; // minimum speculative decoding probability (greedy) common_params_model mparams; diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp index 02bc482fe84..9364159767e 100644 --- a/common/ngram-map.cpp +++ b/common/ngram-map.cpp @@ -500,7 +500,7 @@ void common_ngram_map_draft(common_ngram_map & map, draft.push_back(inp[match_pos + n + i]); } - LOG_INF("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__, + LOG_DBG("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__, key_offset, slot_max, curr_key.key_num, draft.size()); diff --git a/common/speculative.cpp b/common/speculative.cpp index e591bab875d..4d1b61a13ad 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -32,6 +32,19 @@ const std::map common_speculative_type_fro {"ngram-cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE} }; +static std::string common_speculative_get_devices_str(const std::vector & devices) { + if (devices.empty()) { + return "default"; + } + + std::string result; + for (size_t i = 0; i < devices.size(); i++) { + if (i > 0) result += ", "; + result += ggml_backend_dev_name(devices[i]); + } + return result; +} + struct common_speculative_config { common_speculative_type type; common_params_speculative params; @@ -144,7 +157,7 @@ struct common_speculative_impl { virtual void draft(common_speculative_draft_params_vec & dparams) = 0; - virtual void accept(llama_seq_id seq_id, uint16_t n_accepted) = 0; + virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0; // true if this implementation requires the target context to extract post-norm embeddings virtual bool need_embd() const = 0; @@ -167,6 +180,16 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { auto * ctx_dft = this->params.ctx_dft; auto * ctx_tgt = this->params.ctx_tgt; + LOG_INF("%s: adding speculative implementation 'draft-simple'\n", __func__); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min); + LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__, + this->params.n_gpu_layers, + ggml_type_name(this->params.cache_type_k), + ggml_type_name(this->params.cache_type_v), + ctx_tgt ? "yes" : "no", + ctx_dft ? "yes" : "no", + common_speculative_get_devices_str(this->params.devices).c_str()); + batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1); // TODO: optimize or pass from outside? @@ -343,7 +366,7 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } @@ -355,8 +378,12 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { //common_params_speculative_eagle3 params; - common_speculative_impl_draft_eagle3(const common_params_speculative & /*params*/, uint32_t n_seq) - : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) {} + common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq) + : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) + { + LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min); + } void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override { // noop @@ -371,7 +398,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { // TODO: implement } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } @@ -380,7 +407,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { } }; -struct common_speculative_state_draft_mtp : public common_speculative_impl { +struct common_speculative_impl_draft_mtp : public common_speculative_impl { common_params_speculative_draft params; // reuses the draft-model params slot (ctx_tgt/ctx_dft) llama_batch batch; @@ -407,7 +434,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { // pre-advancement before process() mirrored the verify batch. std::vector last_n_drafted; - common_speculative_state_draft_mtp(const common_params_speculative & params, uint32_t n_seq) + common_speculative_impl_draft_mtp(const common_params_speculative & params, uint32_t n_seq) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, n_seq) , params(params.draft) { @@ -417,6 +444,16 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { n_embd = llama_model_n_embd(llama_get_model(ctx_dft)); + LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd); + LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__, + this->params.n_gpu_layers, + ggml_type_name(this->params.cache_type_k), + ggml_type_name(this->params.cache_type_v), + ctx_tgt ? "yes" : "no", + ctx_dft ? "yes" : "no", + common_speculative_get_devices_str(this->params.devices).c_str()); + const int32_t n_b = (int32_t) llama_n_batch(ctx_dft); batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd, /*n_seq_max=*/ 1); // llama_batch_init allocates only one of token/embd; MTP needs both. @@ -427,7 +464,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { for (auto & s : smpls) { common_params_sampling sparams; sparams.no_perf = false; - sparams.top_k = 1; // TODO: re-enable top_k == 10 and utilize `p_min` spec param + sparams.top_k = 10; sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K }; s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams)); } @@ -446,7 +483,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { last_n_drafted.assign(n_seq, 0); } - ~common_speculative_state_draft_mtp() override { + ~common_speculative_impl_draft_mtp() override { if (batch.token != nullptr) { free(batch.token); batch.token = nullptr; @@ -462,7 +499,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { auto * ctx_dft = this->params.ctx_dft; const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id); if (pos_max < N - 1) { - LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d — " + LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - " "process() hook may not have run on every prefill ubatch " "(need_embd / logits=1 on every prompt position?). " "Drafts may degrade.\n", @@ -633,6 +670,14 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { // add drafted token for each sequence const llama_token id = cur_p->data[0].id; + // only collect very high-confidence draft tokens + if (cur_p->data[0].p < params.p_min) { + drafting[seq_id] = false; + n_drafting--; + + continue; + } + common_sampler_accept(smpl, id, true); auto & dp = dparams.at(seq_id); @@ -678,7 +723,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { } } - void accept(llama_seq_id seq_id, uint16_t n_accepted) override { + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override { if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) { return; } @@ -714,7 +759,12 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl { common_ngram_simple_config config) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, n_seq) , params(params.ngram_simple) - , config(config) {} + , config(config) + { + LOG_INF("%s: adding speculative implementation 'ngram-simple'\n", __func__); + LOG_INF("%s: - size_n=%d, size_m=%d, min_hits=%d\n", __func__, + this->params.size_n, this->params.size_m, this->params.min_hits); + } void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override { // noop @@ -738,7 +788,7 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl { } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } @@ -748,20 +798,21 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl { }; struct common_speculative_impl_ngram_map_k : public common_speculative_impl { - common_params_speculative_ngram_map params; - // n_seq configs std::vector config; common_speculative_impl_ngram_map_k( - const common_params_speculative & params, const common_ngram_map & config, uint32_t n_seq) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, n_seq) - , params(params.ngram_map_k) { + { for (uint32_t i = 0; i < n_seq; i++) { this->config.push_back(config); } + + LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(this->type).c_str()); + LOG_INF("%s: - size_key=%d, size_value=%d, key_only=%d, min_hits=%d\n", __func__, + config.size_key, config.size_value, config.key_only, config.min_hits); } void begin(llama_seq_id seq_id, const llama_tokens & prompt) override { @@ -788,9 +839,13 @@ struct common_speculative_impl_ngram_map_k : public common_speculative_impl { } } - void accept(llama_seq_id seq_id, uint16_t n_accepted) override { + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) override { GGML_ASSERT((seq_id < (llama_seq_id) config.size())); + if (is_other) { + return; + } + common_ngram_map_accept(config[seq_id], n_accepted); } @@ -812,7 +867,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { // the last position in the prompt that was added to the ngram container size_t i_last = 0; - // length of the last drafted n‑gram (number of tokens returned by draft) + // length of the last drafted n-gram (number of tokens returned by draft) size_t n_draft_last = 0; // consecutive accept rounds with low acceptance fraction (< 0.5) @@ -830,8 +885,11 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { , verbose(std::getenv("LLAMA_TRACE") != nullptr) { static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t)); - LOG_INF("%s: initialized ngram_mod with n_match=%d, size=%zu (%.3f MB)\n", __func__, - this->params.n_match, mod.size(), (float)(mod.size_bytes())/1024/1024); + LOG_INF("%s: adding speculative implementation 'ngram-mod'\n", __func__); + LOG_INF("%s: - n_match=%d, n_max=%d, n_min=%d\n", __func__, + this->params.n_match, this->params.n_max, this->params.n_min); + LOG_INF("%s: - mod size=%zu (%.3f MB)\n", __func__, + mod.size(), (float)(mod.size_bytes())/1024/1024); if (this->params.n_match < 16) { LOG_WRN("%s: ngram_mod n_match=%d is too small - poor quality is possible, " @@ -921,7 +979,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { } result.resize(result.size() - n); - // store length of drafted n‑gram for later acceptance analysis + // store length of drafted n-gram for later acceptance analysis sinfo.n_draft_last = result.size(); } @@ -943,17 +1001,21 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { } } - void accept(llama_seq_id seq_id, uint16_t n_accepted) override { + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) override { + if (is_other) { + return; + } + auto & sinfo = sinfos[seq_id]; // compute acceptance fraction if we have a recorded draft length if (sinfo.n_draft_last > 0) { const double f_acc = (double)n_accepted / (double)sinfo.n_draft_last; - if (f_acc < 0.5) { + if (f_acc < 0.25) { sinfo.n_low++; - if (sinfo.n_low >= 3) { + if (sinfo.n_low >= 5) { if (verbose) { - LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, sinfo.n_low); + LOG_WRN("%s: low acceptance streak (%d) - resetting ngram_mod\n", __func__, sinfo.n_low); } mod.reset(); @@ -1003,6 +1065,12 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl { , save_dynamic(save_dynamic) , save_static(save_static) { + LOG_INF("%s: adding speculative implementation 'ngram-cache'\n", __func__); + LOG_INF("%s: - n_draft=%d, cache_static=%s, cache_dynamic=%s\n", __func__, + n_draft, + path_static.empty() ? "none" : path_static.c_str(), + path_dynamic.empty() ? "none" : path_dynamic.c_str()); + sinfos.resize(n_seq); if (!path_static.empty()) { @@ -1099,7 +1167,7 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl { } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } @@ -1285,7 +1353,6 @@ common_speculative * common_speculative_init(common_params_speculative & params, std::vector> impls = {}; for (const common_speculative_config & config : configs) { - LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(config.type).c_str()); switch (config.type) { case COMMON_SPECULATIVE_TYPE_NONE: break; @@ -1298,7 +1365,7 @@ common_speculative * common_speculative_init(common_params_speculative & params, break; } case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: { - impls.push_back(std::make_unique(config.params, n_seq)); + impls.push_back(std::make_unique(config.params, n_seq)); break; } case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: { @@ -1319,11 +1386,16 @@ common_speculative * common_speculative_init(common_params_speculative & params, impls.push_back(std::move(state)); break; } - case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: + case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: { + impls.push_back( + std::make_unique( + get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq)); + break; + } case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: { impls.push_back( std::make_unique( - config.params, get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq)); + get_common_ngram_map(config.type, config.params.ngram_map_k4v), n_seq)); break; } case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: { @@ -1515,11 +1587,6 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u GGML_ASSERT(impl); - // TODO: currently only the implementation that generated the draft is used to accept it - // however, some implementations (such as MTP) need to also "see" the accepted tokens - // extend `common_speculative_impl::accept()` with an extra argument `bool is_other` to - // inform the implementation if the accepted tokens are from another implementation and - // pass the accepted tokens to all remaining implementations using `is_other == true` { common_time_meas tm(impl->t_accept_us, !impl->gen_perf); if (n_accepted > 0) { @@ -1527,9 +1594,16 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u impl->n_acc_tokens += n_accepted; } - impl->accept(seq_id, n_accepted); + impl->accept(seq_id, n_accepted, false); impl->n_call_accept++; } + + // accept with the rest of the implementations, using is_other == true + for (auto & impl_other : spec->impls) { + if (impl_other.get() != impl) { + impl_other->accept(seq_id, n_accepted, true); + } + } } void common_speculative_print_stats(const common_speculative * spec) { @@ -1549,7 +1623,7 @@ void common_speculative_print_stats(const common_speculative * spec) { str_perf = ""; } - LOG_INF("statistics %s: #calls(b,g,a) = %zu %zu %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n", + LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n", common_speculative_type_to_str(impl->type).c_str(), impl->n_call_begin, impl->n_call_draft, impl->n_call_accept, impl->n_gen_drafts, diff --git a/docs/speculative.md b/docs/speculative.md index fb6ef03067d..45e42d42a43 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -108,11 +108,12 @@ If a draft model is combined with a draftless decoding the draftless decoding ha ### General Speculative Parameters ``` ---spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] - type of speculative decoding to use when no draft model is provided +--spec-type [none|draft-simple|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] + comma-separated list of types of speculative decoding to use (default: none) (env: LLAMA_ARG_SPEC_TYPE) ---spec-default use default speculative decoding +--spec-default use default speculative decoding config + (enables ngram-mod) ``` ### Draft Model Parameters @@ -123,8 +124,9 @@ If a draft model is combined with a draftless decoding the draftless decoding ha (env: LLAMA_ARG_SPEC_DRAFT_MODEL) --spec-draft-hf, -hfd, -hfrd, --hf-repo-draft /[:quant] HuggingFace repository for the draft model + (env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) --spec-draft-n-max N - number of tokens to draft for speculative decoding (default: 16) + number of tokens to draft for speculative decoding (default: 3) (env: LLAMA_ARG_SPEC_DRAFT_N_MAX) --spec-draft-n-min N minimum number of draft tokens to use for speculative decoding (default: 0) @@ -133,18 +135,64 @@ If a draft model is combined with a draftless decoding the draftless decoding ha speculative decoding split probability (default: 0.10) (env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) --spec-draft-p-min, --draft-p-min P - minimum speculative decoding probability (greedy) (default: 0.75) + minimum speculative decoding probability (greedy) (default: 0.00) (env: LLAMA_ARG_SPEC_DRAFT_P_MIN) ---spec-draft-ctx-size, -cd, --ctx-size-draft N - size of the prompt context for the draft model (default: 0, 0 = loaded from model) - (env: LLAMA_ARG_SPEC_DRAFT_CTX_SIZE) --spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto) (env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) --spec-draft-device, -devd, --device-draft comma-separated list of devices to use for offloading the draft model ---spec-draft-replace, --spec-replace TARGET DRAFT - translate the string in TARGET into DRAFT if the draft model and main model are not compatible + (use --list-devices to see available devices) +``` + +### Draft Model CPU Scheduling Parameters + +``` +--spec-draft-threads, -td, --threads-draft N + number of CPU threads to use during generation +--spec-draft-threads-batch, -tbd, --threads-batch-draft N + number of threads to use during batch and prompt processing (default: same as --threads-draft) +--spec-draft-cpu-mask, -Cd, --cpu-mask-draft M + Draft model CPU affinity mask. Complements cpu-range-draft +--spec-draft-cpu-range, -Crd, --cpu-range-draft lo-hi + Ranges of CPUs for affinity. Complements --cpu-mask-draft +--spec-draft-cpu-strict, --cpu-strict-draft <0|1> + Use strict CPU placement for draft model (default: same as --cpu-strict) +--spec-draft-prio, --prio-draft N + set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime +--spec-draft-poll, --poll-draft <0|1> + Use polling to wait for draft model work (default: same as --poll) +--spec-draft-cpu-mask-batch, -Cbd, --cpu-mask-batch-draft M + Draft model CPU affinity mask for batch. Complements cpu-range-batch-draft +--spec-draft-cpu-range-batch, -Crbd, --cpu-range-batch-draft lo-hi + Ranges of CPUs for affinity for batch. Complements --cpu-mask-batch-draft +--spec-draft-cpu-strict-batch, --cpu-strict-batch-draft <0|1> + Use strict CPU placement for draft model batch (default: --cpu-strict-draft) +--spec-draft-prio-batch, --prio-batch-draft N + set draft process/thread priority for batch : 0-normal, 1-medium, 2-high, 3-realtime +--spec-draft-poll-batch, --poll-batch-draft <0|1> + Use polling to wait for draft model work for batch (default: --poll-draft) +``` + +### Draft Model KV Cache and Tensor Override Parameters + +``` +--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE + KV cache data type for K for the draft model + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) +--spec-draft-type-v, -ctvd, --cache-type-v-draft TYPE + KV cache data type for V for the draft model + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_V) +--spec-draft-override-tensor, -otd, --override-tensor-draft =,... + override tensor buffer type for draft model +--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft + keep all Mixture of Experts (MoE) weights in the CPU for the draft model + (env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) +--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N + keep the MoE weights of the first N layers in the CPU for the draft model + (env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) ``` ### n-gram Mod Parameters @@ -193,11 +241,13 @@ If a draft model is combined with a draftless decoding the draftless decoding ha ### `--spec-type TYPE` -Specifies a type of speculative decoding without draft model. +Specifies a comma-separated list of speculative decoding types to use. | Type | Description | |------|-------------| | `none` | No speculative decoding (default) | +| `draft-simple` | Use a simple draft model for speculation | +| `draft-mtp` | Use Masked Token Prediction (MTP) heads from the main model | | `ngram-cache` | Use n-gram cache lookup | | `ngram-simple` | Use simple n-gram pattern matching | | `ngram-map-k` | Use n-gram pattern matching with n-gram-keys | @@ -209,6 +259,11 @@ Specifies a type of speculative decoding without draft model. ./llama-server [...] --spec-type ngram-simple ``` +**Example:** Multiple speculative implementations. +```bash +./llama-server [...] --spec-type ngram-mod,ngram-map-k4v +``` + ### `--spec-ngram-*-size-n N` Sets the size N of the lookup n-gram for n-gram map based speculative decoding. diff --git a/src/llama-graph.h b/src/llama-graph.h index 9e55d0a675e..bf6778237e6 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -581,7 +581,8 @@ struct llm_graph_params { ubatch.n_seqs_unq == other.ubatch.n_seqs_unq && ( (!ubatch.token && !other.ubatch.token) || - (!ubatch.embd && !other.ubatch.embd) + (!ubatch.embd && !other.ubatch.embd) || + (ubatch.token && other.ubatch.token && ubatch.embd && other.ubatch.embd) ); // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same diff --git a/src/llama-memory-hybrid-iswa.cpp b/src/llama-memory-hybrid-iswa.cpp index a59561ea54d..72f5c2fea72 100644 --- a/src/llama-memory-hybrid-iswa.cpp +++ b/src/llama-memory-hybrid-iswa.cpp @@ -75,9 +75,15 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) - const bool unified = (mem_attn->get_base()->get_n_stream() == 1); - ubatch = balloc.split_equal(n_ubatch, !unified); + if (mem_recr->n_rs_seq > 0) { + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: recurrent state rollback does not support equal splits + ubatch = balloc.split_seq(n_ubatch); + } else { + // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) + const bool unified = (mem_attn->get_base()->get_n_stream() == 1); + ubatch = balloc.split_equal(n_ubatch, !unified); + } } if (ubatch.n_tokens == 0) { diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index fd305cab79c..33b3b395e0c 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -75,9 +75,15 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) - const bool unified = (mem_attn->get_n_stream() == 1); - ubatch = balloc.split_equal(n_ubatch, !unified); + if (mem_recr->n_rs_seq > 0) { + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: recurrent state rollback does not support equal splits + ubatch = balloc.split_seq(n_ubatch); + } else { + // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) + const bool unified = (mem_attn->get_n_stream() == 1); + ubatch = balloc.split_equal(n_ubatch, !unified); + } } if (ubatch.n_tokens == 0) { diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index aeb866657f2..ec5dc5835dd 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -416,9 +416,15 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - // TODO: non-sequential equal split can be done if using unified KV cache - // for simplicity, we always use sequential equal split for now - ubatch = balloc.split_equal(n_ubatch, true); + if (n_rs_seq > 0) { + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: recurrent state rollback does not support equal splits + ubatch = balloc.split_seq(n_ubatch); + } else { + // TODO: non-sequential equal split can be done if using unified KV cache + // for simplicity, we always use sequential equal split for now + ubatch = balloc.split_equal(n_ubatch, true); + } } if (ubatch.n_tokens == 0) { diff --git a/src/llama-memory-recurrent.h b/src/llama-memory-recurrent.h index 29c58afc9c2..b13b7b748f5 100644 --- a/src/llama-memory-recurrent.h +++ b/src/llama-memory-recurrent.h @@ -72,6 +72,7 @@ class llama_memory_recurrent : public llama_memory_i { // number of recurrent-state snapshots per seq for rollback; tensors are widened to (1 + n_rs_seq) groups uint32_t n_rs_seq = 0; + // per-seq rollback index std::vector rs_idx; diff --git a/src/models/delta-net-base.cpp b/src/models/delta-net-base.cpp index 2a4e00384e9..a67238383ed 100644 --- a/src/models/delta-net-base.cpp +++ b/src/models/delta-net-base.cpp @@ -447,13 +447,6 @@ std::pair llm_build_delta_net_base::build_delta_ne return build_delta_net_chunking(q, k, v, g, b, s, il); } -bool llm_build_delta_net_base::keep_rs() const { - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - return cparams.n_rs_seq > 0 - && n_seq_tokens > 1 - && (uint32_t) n_seq_tokens <= 1 + cparams.n_rs_seq; -} - ggml_tensor * llm_build_delta_net_base::build_conv_state( llm_graph_input_rs * inp, ggml_tensor * conv_states_all, @@ -461,12 +454,12 @@ ggml_tensor * llm_build_delta_net_base::build_conv_state( int64_t conv_kernel_size, int64_t conv_channels, int il) { - const auto * mctx_cur = inp->mctx; - const auto kv_head = mctx_cur->get_head(); - const uint32_t mem_size = mctx_cur->get_size(); - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const bool keep = keep_rs(); + const auto * mctx_cur = inp->mctx; + + const auto kv_head = mctx_cur->get_head(); + const auto mem_size = mctx_cur->get_size(); + + const int64_t n_seqs = ubatch.n_seqs; ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); cb(conv_states, "conv_states", il); @@ -480,32 +473,52 @@ ggml_tensor * llm_build_delta_net_base::build_conv_state( ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0); cb(conv_input, "conv_input", il); - if (!keep) { - ggml_tensor * last_conv_states = - ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1], - conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input)); - cb(last_conv_states, "last_conv_states", il); + const int64_t row_count = (conv_kernel_size - 1) * conv_channels; + + const size_t row_size = ggml_row_size(conv_states_all->type, row_count); + + if (cparams.n_rs_seq == 0) { + const int64_t s_idx = conv_input->ne[0] - conv_states->ne[0]; + const int64_t s_slot = 0; + + ggml_tensor * conv_state_last = + ggml_view_3d(ctx0, conv_input, + conv_kernel_size - 1, conv_channels, n_seqs, + conv_input->nb[1], conv_input->nb[2], + ggml_row_size(conv_input->type, s_idx)); + cb(conv_state_last, "conv_state_last", il); - ggml_tensor * state_update_target = - ggml_view_2d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels, n_seqs, conv_states_all->nb[1], - kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all)); - cb(state_update_target, "state_update_target", il); + ggml_tensor * conv_state_update = + ggml_view_2d(ctx0, conv_states_all, + row_count, n_seqs, conv_states_all->nb[1], + (s_slot * mem_size + kv_head) * row_size); + cb(conv_state_update, "conv_state_update", il); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update)); } else { - const int64_t row_count = (conv_kernel_size - 1) * conv_channels; - const size_t row_size = row_count * ggml_element_size(conv_states_all); - for (int64_t t = 1; t <= n_seq_tokens; ++t) { - const uint32_t slot = (uint32_t)(n_seq_tokens - t); - ggml_tensor * src = - ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, - conv_input->nb[1], conv_input->nb[2], - t * ggml_element_size(conv_input)); - ggml_tensor * dst = - ggml_view_2d(ctx0, conv_states_all, row_count, n_seqs, - conv_states_all->nb[1], - ((size_t) slot * mem_size + kv_head) * row_size); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst)); + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: this logic incorrectly assumes that the last (n_rs_seq + 1) tokens of a sequence in a batch are + // inside the same ubatch. currently with `split_equal()` this is not correct + + const int64_t K = (int64_t) cparams.n_rs_seq + 1; + + for (int64_t t = 1; t <= K; ++t) { + const int64_t s_idx = std::max(0, conv_input->ne[0] - conv_states->ne[0] - K + t); + const int64_t s_slot = K - t; + + ggml_tensor * conv_state_last = + ggml_view_3d(ctx0, conv_input, + conv_kernel_size - 1, conv_channels, n_seqs, + conv_input->nb[1], conv_input->nb[2], + ggml_row_size(conv_input->type, s_idx)); + + ggml_tensor * conv_state_update = + ggml_view_2d(ctx0, + conv_states_all, row_count, n_seqs, + conv_states_all->nb[1], + (s_slot * mem_size + kv_head) * row_size); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update)); } } @@ -531,7 +544,9 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( const int64_t n_seqs = s->ne[3]; const int64_t n_seq_tokens = q->ne[2]; - if (!keep_rs()) { + const bool keep = cparams.n_rs_seq > 0; + + if (!keep) { auto attn_out = build_delta_net(q, k, v, g, b, s, il); ggml_tensor * output = attn_out.first; ggml_tensor * new_state = attn_out.second; @@ -554,7 +569,11 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( ggml_tensor * state_3d = ggml_pad(ctx0, state_in_3d, 0, K - 1, 0, 0); ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, state_3d); - cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il); + if (n_seq_tokens > 1) { + cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il); + } else { + cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_AR, il); + } const int64_t attn_score_elems = S_v * H_v * n_seq_tokens * n_seqs; const int64_t state_size_per_snap = S_v * S_v * H_v * n_seqs; @@ -576,9 +595,11 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( ggml_row_size(gdn_out->type, S_v * S_v), ggml_row_size(gdn_out->type, S_v * S_v * H_v), ggml_row_size(gdn_out->type, attn_score_elems + k_i * state_size_per_snap)); + ggml_tensor * dst = ggml_view_2d(ctx0, ssm_states_all, hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1], ((size_t) cache_slot * mem_size + kv_head) * row_size); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst)); } diff --git a/src/models/models.h b/src/models/models.h index 4e40536a5ea..7e551eb965b 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -66,9 +66,6 @@ struct llm_build_delta_net_base : public llm_graph_context { ggml_tensor * s, int il); - // true when speculative rollback is enabled and the batch fits in the rs cache - bool keep_rs() const; - // read conv state from cache, concat with qkv_mixed, write back (single slot or per-token) // qkv_mixed: (qkv_dim, n_seq_tokens, n_seqs); returns conv_input: (kernel_size + n_seq_tokens - 1, channels, n_seqs) ggml_tensor * build_conv_state( diff --git a/tools/cli/README.md b/tools/cli/README.md index c40b5a21cc0..38bc78a3fdf 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -191,10 +191,10 @@ | `--spec-draft-override-tensor, -otd, --override-tensor-draft =,...` | override tensor buffer type for draft model | | `--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) | | `--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) | -| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | +| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 3)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | | `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) | | `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)
(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) | -| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | +| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | | `--spec-draft-device, -devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | | `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_MODEL) | diff --git a/tools/server/README.md b/tools/server/README.md index 11098af2883..9b413423975 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -183,6 +183,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | | `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)
(env: LLAMA_ARG_ALIAS) | | `--tags STRING` | set model tags, comma-separated (informational, not used for routing)
(env: LLAMA_ARG_TAGS) | +| `--embd-normalize N` | normalisation for embeddings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) | | `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--reuse-port` | allow multiple sockets to bind to the same port (default: disabled)
(env: LLAMA_ARG_REUSE_PORT) | @@ -244,10 +245,10 @@ For the full list of features, please refer to [server's changelog](https://gith | `--spec-draft-override-tensor, -otd, --override-tensor-draft =,...` | override tensor buffer type for draft model | | `--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) | | `--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) | -| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | +| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 3)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | | `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) | | `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)
(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) | -| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | +| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | | `--spec-draft-device, -devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | | `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_MODEL) | From baf3cc6e1d70ce73f66a0665811e5c2228cddc5d Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 19 May 2026 18:41:44 +0200 Subject: [PATCH 076/309] model : clarify MTP layer comment in qwen35.cpp [no ci] (#23338) This commit attempts to clarify a code comment in graph_mtp regarding where the MTP layer is stored. The motivation for this is that it was not obvious to me what the original comment meant and hopefully this makes it clearer. --- src/models/qwen35.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 361d7538a03..35a0158e854 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -496,7 +496,8 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr const int64_t n_embd_head = hparams.n_embd_head_v(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); - // The MTP block lives at the source file's original layer index. + // hparams.n_layer includes both main model layers and MTP layers. The MTP + // layer is stored immediately after the main layers in model.layers[]. const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; const auto & layer = model.layers[il]; From ac76808e4db7bbb4082b86e7fbd615934b44ac6e Mon Sep 17 00:00:00 2001 From: Aparna M P Date: Tue, 19 May 2026 22:18:21 +0530 Subject: [PATCH 077/309] hexagon: enable support for NORM op (#23319) --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 5 +- ggml/src/ggml-hexagon/htp/htp-ops.h | 1 + ggml/src/ggml-hexagon/htp/main.c | 1 + ggml/src/ggml-hexagon/htp/unary-ops.c | 97 ++++++++++++++++++++++++++ 4 files changed, 101 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 2f75e97ac66..ebeef3bdbaf 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2870,6 +2870,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) { case GGML_OP_SET_ROWS: return HTP_OP_SET_ROWS; case GGML_OP_SUM_ROWS: return HTP_OP_SUM_ROWS; case GGML_OP_ARGSORT: return HTP_OP_ARGSORT; + case GGML_OP_NORM: return HTP_OP_NORM; case GGML_OP_L2_NORM: return HTP_OP_L2_NORM; case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM; case GGML_OP_SCALE: return HTP_OP_SCALE; @@ -3338,10 +3339,8 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons supp = ggml_hexagon_supported_add_id(sess, op); break; + case GGML_OP_NORM: case GGML_OP_L2_NORM: - supp = ggml_hexagon_supported_unary(sess, op); - break; - case GGML_OP_RMS_NORM: case GGML_OP_SCALE: supp = ggml_hexagon_supported_unary(sess, op); diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index 676e948a439..9d905a30133 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -88,6 +88,7 @@ enum htp_op_code { HTP_OP_GATED_DELTA_NET, HTP_OP_TRI, HTP_OP_PAD, + HTP_OP_NORM, HTP_OP_INVALID }; diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 12003c1fd8a..8e54536f619 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -534,6 +534,7 @@ static int execute_op(struct htp_ops_context * octx) { case HTP_OP_ADD_ID: return op_binary(octx); + case HTP_OP_NORM: case HTP_OP_RMS_NORM: case HTP_OP_SCALE: case HTP_OP_SQR: diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c index 1ce881353ec..40d2d60153a 100644 --- a/ggml/src/ggml-hexagon/htp/unary-ops.c +++ b/ggml/src/ggml-hexagon/htp/unary-ops.c @@ -158,6 +158,79 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src, } } +static void hvx_fast_norm_f32(const uint8_t * restrict src, + uint8_t * restrict dst, + uint8_t * restrict pad, + const int num_elems, + float epsilon) { + (void)pad; + + const HVX_Vector * restrict v_src = (HVX_Vector *) src; + HVX_Vector * restrict v_dst = (HVX_Vector *) dst; + + const int nvec = num_elems / VLEN_FP32; // number of full vectors + const int nloe = num_elems % VLEN_FP32; // leftover elements + + // Compute sum of squares and sum of values for full vectors + HVX_Vector sum_sq_v = Q6_V_vsplat_R(0x00000000); + HVX_Vector sum_x_v = Q6_V_vsplat_R(0x00000000); + HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon); + + #pragma unroll(4) + for (int i = 0; i < nvec; i++) { + HVX_Vector v1 = v_src[i]; + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1); + sum_sq_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_sq_v, v2); + sum_x_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_x_v, Q6_Vqf32_vadd_VsfVsf(v1, Q6_V_vzero())); + } + + // Handle tail elements using vectorized ops with masking + if (nloe > 0) { + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4); + HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]); + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1); + sum_sq_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_sq_v, v2); + sum_x_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_x_v, Q6_Vqf32_vadd_VsfVsf(v1, Q6_V_vzero())); + } + + // Reduce HVX sums + sum_sq_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_sq_v)); + sum_x_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_x_v)); + + HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems); + HVX_Vector denom_v = hvx_vec_inverse_f32(t_v); + HVX_Vector mean_sq_v = Q6_Vqf32_vmpy_VsfVsf(sum_sq_v, denom_v); + HVX_Vector mean_x_v = Q6_Vqf32_vmpy_VsfVsf(sum_x_v, denom_v); + HVX_Vector mean_x_sq_v = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(mean_x_v), Q6_Vsf_equals_Vqf32(mean_x_v)); + HVX_Vector var_v = Q6_Vqf32_vsub_Vqf32Vqf32(mean_sq_v, mean_x_sq_v); + HVX_Vector var_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(var_v, epsilon_v); + + // scale = rsqrt(variance + epsilon), mean_x broadcast for subtraction + HVX_Vector scale_v = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(var_epsilon_v)); + HVX_Vector mean_x_b = hvx_vec_splat_f32(hvx_vec_get_f32(Q6_Vsf_equals_Vqf32(mean_x_v))); + + #pragma unroll(4) + for (int i = 0; i < nvec; i++) { + HVX_Vector v1 = v_src[i]; + HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, mean_x_b); + HVX_Vector v3 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v2), scale_v); + v_dst[i] = Q6_Vsf_equals_Vqf32(v3); + } + + // Handle tail elements using vectorized ops with masking + if (nloe > 0) { + + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4); + HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]); + HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, mean_x_b); + HVX_Vector v3 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v2), scale_v); + HVX_Vector result = Q6_Vsf_equals_Vqf32(v3); + + // Store with masking to avoid overwriting memory beyond the tensor + hvx_vec_store_a(&v_dst[nvec], nloe * 4, result); + } +} + static void scale_f32(const float * restrict src, float * restrict dst, uint8_t * restrict spad, @@ -196,6 +269,24 @@ static void rms_norm_f32(const float * restrict src, } } +static void norm_f32(const float * restrict src, + float * restrict dst, + uint8_t * restrict spad, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + int32_t * op_params) { + float epsilon = 0.f; + memcpy(&epsilon, op_params, sizeof(float)); + + for (uint32_t ir = 0; ir < num_rows; ir++) { + const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size); + uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size); + + hvx_fast_norm_f32((const uint8_t *) src_local, (uint8_t *) dst_local, spad, row_elems, epsilon); + } +} + static void sqr_f32(const float * restrict src, float * restrict dst, uint8_t * restrict spad, @@ -556,6 +647,9 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * // Process block in VTCM switch (htp_op) { + case HTP_OP_NORM: + norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); + break; case HTP_OP_RMS_NORM: rms_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); break; @@ -632,6 +726,9 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { const char * op_type = NULL; switch (octx->op) { + case HTP_OP_NORM: + op_type = "norm-f32"; + break; case HTP_OP_RMS_NORM: op_type = "rmsnorm-f32"; break; From b7393a4d198011190ecf6420893febb997b84520 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 19 May 2026 21:16:58 +0200 Subject: [PATCH 078/309] convert : update mtp related help (#23334) * update mtp related help * remove outdated experimental text --- convert_hf_to_gguf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ff840050861..1d18a1bf91f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -115,15 +115,15 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( "--mmproj", action="store_true", - help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.", + help="Export multimodal projector (mmproj) for vision models. This will only work on some vision models. An 'mmproj-' prefix will be added to the output file name.", ) parser.add_argument( "--mtp", action="store_true", - help="(Experimental) Export only the multi-token prediction (MTP) head as a separate GGUF, suitable for use as a speculative draft. Output file name will get a '-MTP' suffix.", + help="Export only the multi-token prediction (MTP) head as a separate GGUF, suitable for use as a speculative draft. An 'mtp-' prefix will be added to the output file name.", ) parser.add_argument( "--no-mtp", action="store_true", - help="(Experimental) Exclude the multi-token prediction (MTP) head from the converted GGUF. Pair with --mtp on a second run to publish trunk and MTP as two files. Note: the split form duplicates embeddings, so the bundled default is more space-efficient overall.", + help="Exclude the multi-token prediction (MTP) head from the converted GGUF. Pair with --mtp on a second run to publish trunk and MTP as two files. Note: the split form duplicates embeddings, but even though the bundled default is more space-efficient overall, this allows differing quantization which may be more performant.", ) parser.add_argument( "--mistral-format", action="store_true", From 7256fce047b4fda9a4d82e659e3ff7ebb11e3bbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 19 May 2026 21:33:23 +0200 Subject: [PATCH 079/309] common: fix --fit verbosity with --verbosity 4 (#23282) --- common/common.cpp | 4 ++-- tools/fit-params/fit-params.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index b6fdec3ce05..d77ddeda10e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1173,7 +1173,7 @@ common_init_result::common_init_result(common_params & params, bool model_only) params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx, - params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); + params.verbosity >= LOG_LEVEL_DEBUG ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); } llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); @@ -1366,7 +1366,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode } if (params.warmup) { - LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); + LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); llama_set_warmup(lctx, true); diff --git a/tools/fit-params/fit-params.cpp b/tools/fit-params/fit-params.cpp index bcdf4404016..20a5ff1ebd0 100644 --- a/tools/fit-params/fit-params.cpp +++ b/tools/fit-params/fit-params.cpp @@ -30,7 +30,7 @@ int main(int argc, char ** argv) { if (!params.fit_params_print) { const common_params_fit_status status = common_fit_params(params.model.path.c_str(), &mparams, &cparams, params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx, - params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); + params.verbosity >= LOG_LEVEL_DEBUG ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); if (status != COMMON_PARAMS_FIT_STATUS_SUCCESS) { LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__); exit(1); From 57cb35c8867fce383140d9b34a811920daed8c46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 19 May 2026 21:34:04 +0200 Subject: [PATCH 080/309] common: fix --help for --verbosity (#23278) --- common/arg.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index 13dfd413562..87462f49e76 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3364,7 +3364,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex " - 1: error\n" " - 2: warning\n" " - 3: info\n" - " - 4: debug\n" + " - 4: trace (more info)\n" + " - 5: debug\n" "(default: %d)\n", params.verbosity), [](common_params & params, int value) { params.verbosity = value; From a8078675a6a06584954a74beb654ae1b4b80041d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 19 May 2026 21:35:10 +0200 Subject: [PATCH 081/309] github: mention --log-file in issue templates (#23277) --- .github/ISSUE_TEMPLATE/011-bug-results.yml | 4 ++-- .github/ISSUE_TEMPLATE/019-bug-misc.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml index c7001edf050..23150d0b619 100644 --- a/.github/ISSUE_TEMPLATE/011-bug-results.yml +++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml @@ -100,8 +100,8 @@ body: label: Relevant log output description: > Please copy and paste any relevant log output, including the command that you entered and any generated text. - For very long logs (thousands of lines), preferably upload them as files instead. - On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command. + For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose. + On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command. value: |
Logs diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml index 831c98eb637..041a7cdb2ee 100644 --- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml +++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml @@ -88,8 +88,8 @@ body: description: > If applicable, please copy and paste any relevant log output, including any generated text. If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well. - For very long logs (thousands of lines), please upload them as files instead. - On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command. + For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose. + On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command. value: |
Logs From 67ace021da905e27ecbdf1176b0eef578a5288c0 Mon Sep 17 00:00:00 2001 From: Aleksander Grygier Date: Tue, 19 May 2026 22:38:42 +0200 Subject: [PATCH 082/309] refactor: Chat Screen UI rendering (#23333) --- .../app/chat/ChatScreen/ChatScreen.svelte | 75 +++++++------------ .../ChatScreenActionScrollDown.svelte | 15 +++- .../chat/ChatScreen/ChatScreenGreeting.svelte | 25 +++++++ .../ChatScreenProcessingInfo.svelte | 16 +++- .../ChatScreen/ChatScreenServerError.svelte | 34 +++++++++ tools/ui/src/lib/components/app/chat/index.ts | 7 ++ tools/ui/src/routes/+layout.svelte | 2 +- 7 files changed, 119 insertions(+), 55 deletions(-) create mode 100644 tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenGreeting.svelte create mode 100644 tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenServerError.svelte diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte index bd93a569ce6..e733a64a97a 100644 --- a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte @@ -1,8 +1,7 @@ + + diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte index b5979db13c2..f38f3519c34 100644 --- a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte @@ -6,6 +6,7 @@ import { activeMessages, activeConversation } from '$lib/stores/conversations.svelte'; import { config } from '$lib/stores/settings.svelte'; import { getProcessingInfoContext } from '$lib/contexts'; + import { page } from '$app/state'; const processingState = useProcessingState(); const processingInfoCtx = getProcessingInfoContext(); @@ -16,6 +17,14 @@ let isStreaming = $derived(isChatStreaming()); let processingDetails = $derived(processingState.getTechnicalDetails()); + let processingVisible = $derived(processingDetails.length > 0); + + let { onVisibilityChange }: { onVisibilityChange?: (visible: boolean) => void } = $props(); + + $effect(() => { + onVisibilityChange?.(processingVisible); + }); + $effect(() => { const conversation = activeConversation(); @@ -60,9 +69,12 @@
-
+
{#each processingDetails as detail (detail)} {detail} {/each} diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenServerError.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenServerError.svelte new file mode 100644 index 00000000000..2a998dbebfa --- /dev/null +++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenServerError.svelte @@ -0,0 +1,34 @@ + + +{#if hasError} +
+ + + + + Server unavailable + + + + + {serverError()} + +
+{/if} diff --git a/tools/ui/src/lib/components/app/chat/index.ts b/tools/ui/src/lib/components/app/chat/index.ts index 9c7ce864e21..be553596024 100644 --- a/tools/ui/src/lib/components/app/chat/index.ts +++ b/tools/ui/src/lib/components/app/chat/index.ts @@ -674,3 +674,10 @@ export { default as ChatScreenProcessingInfo } from './ChatScreen/ChatScreenProc * Takes the chat container element as a prop to manage scroll state internally. */ export { default as ChatScreenActionScrollDown } from './ChatScreen/ChatScreenActionScrollDown.svelte'; + +/** + * Server error alert displayed when the server is unreachable. + * Shows the error message with a retry button. + * Rendered inside ChatScreen when `serverError` store has a value. + */ +export { default as ChatScreenServerError } from './ChatScreen/ChatScreenServerError.svelte'; diff --git a/tools/ui/src/routes/+layout.svelte b/tools/ui/src/routes/+layout.svelte index 78227df3ce7..0610b07ae77 100644 --- a/tools/ui/src/routes/+layout.svelte +++ b/tools/ui/src/routes/+layout.svelte @@ -240,7 +240,7 @@ /> -
+
From 17d22a35b268fe997f0f9551d6e39e576bada7fa Mon Sep 17 00:00:00 2001 From: Aparna M P Date: Wed, 20 May 2026 02:40:13 +0530 Subject: [PATCH 083/309] hexagon: add MROPE and IMROPE support in HTP rope op (#23317) --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 2 +- ggml/src/ggml-hexagon/htp/rope-ops.c | 115 +++++++++++++++++++++---- 2 files changed, 98 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index ebeef3bdbaf..080fb7f47e3 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2661,7 +2661,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess int mode = op_params[2]; - if ((mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) { + if (mode == GGML_ROPE_TYPE_VISION) { return false; } if (mode & 1) { diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 1d8b0796bc9..9901453e91e 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -18,9 +18,11 @@ #include "htp-ops.h" #include "htp-ops.h" -// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we can't include ggml.h +// Redefined the rope type constants as we can't include ggml.h #define HTP_ROPE_TYPE_NORMAL 0 #define HTP_ROPE_TYPE_NEOX 2 +#define HTP_ROPE_TYPE_MROPE 8 +#define HTP_ROPE_TYPE_IMROPE 40 #define HTP_ROPE_SPAD_NROWS 16 #define HTP_ROPE_SPAD_BLOCK (HTP_ROPE_SPAD_NROWS/2) @@ -82,6 +84,29 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) { return (1 - MIN(1, MAX(0, y))); } +// Compute one (cos, sin) pair into cache[i0], cache[i0+1] applying YaRN scaling. +static inline void rope_yarn_one(float theta, float freq_scale, float * corr_dims, + uint32_t i0, float ext_factor, float mscale, + float * cache) { + float theta_extrap = theta; + + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta_final = theta_interp; + float mscale_final = mscale; + + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta_final = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } + + cache[i0 + 0] = cosf(theta_final) * mscale_final; + cache[i0 + 1] = sinf(theta_final) * mscale_final; +} + static void rope_cache_init(const float theta_base, const float freq_scale, const float * freq_factors, @@ -96,26 +121,62 @@ static void rope_cache_init(const float theta_base, for (uint32_t i0 = 0; i0 < ne0; i0 += 2) { const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f; + rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache); - float theta_extrap = theta / ff; - - // Get n-d rotational scaling corrected for extrapolation - float theta_interp = freq_scale * theta_extrap; - float theta_final = theta_interp; - float mscale_final = mscale; + theta *= theta_scale; + } +} - if (ext_factor != 0.0f) { - float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; - theta_final = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; +// pos_t/h/w/e: the four position ids for this sequence step (t=time, h=height, w=width, e=extra). +// sections[4]: number of head dims assigned to each position component. +static void mrope_cache_init(const float pos_t, + const float pos_h, + const float pos_w, + const float pos_e, + const int32_t sections[4], + const bool is_imrope, + const float freq_scale, + const float * freq_factors, + float * corr_dims, + const uint32_t ne0, + const float ext_factor, + const float mscale, + float * cache, + const float theta_scale) { + const int sect_dims = sections[0] + sections[1] + sections[2] + sections[3]; + const int sec_w = sections[0] + sections[1]; + const int sec_e = sec_w + sections[2]; + + float theta_t = pos_t; + float theta_h = pos_h; + float theta_w = pos_w; + float theta_e = pos_e; - // Get n-d magnitude scaling corrected for interpolation - mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale); + for (uint32_t i0 = 0; i0 < ne0; i0 += 2) { + const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f; + const int sector = (i0 / 2) % sect_dims; + + float theta; + if (is_imrope) { + // Interleaved: sector mod 3 selects component + if (sector % 3 == 0 && sector < 3 * sections[0]) { theta = theta_t; } + else if (sector % 3 == 1 && sector < 3 * sections[1]) { theta = theta_h; } + else if (sector % 3 == 2 && sector < 3 * sections[2]) { theta = theta_w; } + else { theta = theta_e; } + } else { + // Contiguous sections + if (sector < sections[0]) { theta = theta_t; } + else if (sector < sec_w) { theta = theta_h; } + else if (sector < sec_e) { theta = theta_w; } + else { theta = theta_e; } } - cache[i0 + 0] = cosf(theta_final) * mscale_final; - cache[i0 + 1] = sinf(theta_final) * mscale_final; + rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache); - theta *= theta_scale; + theta_t *= theta_scale; + theta_h *= theta_scale; + theta_w *= theta_scale; + theta_e *= theta_scale; } } @@ -274,7 +335,8 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { uint64_t tt = HAP_perf_get_qtimer_count(); const int32_t mode = rctx->mode; - const bool is_neox = mode & HTP_ROPE_TYPE_NEOX; + // MROPE and IMROPE use NEOX-style pairing for the rotation + const bool is_neox = (mode & HTP_ROPE_TYPE_NEOX) || (mode & HTP_ROPE_TYPE_MROPE); // VTCM setup uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread); @@ -326,8 +388,25 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { if (i2 != prev_i2) { prev_i2 = i2; - const int32_t p = pos[i2]; - rope_cache_init(p, rctx->freq_scale, freq_factors, rctx->corr_dims, ne0, rctx->ext_factor, rctx->attn_factor, theta_cache, rctx->theta_scale); + const bool is_mrope = (rctx->mode & HTP_ROPE_TYPE_MROPE) != 0; + if (is_mrope) { + // src1 holds four position arrays stacked along ne0: + // pos[i2], pos[i2+ne2], pos[i2+ne2*2], pos[i2+ne2*3] + const bool is_imrope = (rctx->mode == HTP_ROPE_TYPE_IMROPE); + mrope_cache_init( + (float) pos[i2], + (float) pos[i2 + ne2], + (float) pos[i2 + ne2 * 2], + (float) pos[i2 + ne2 * 3], + rctx->sections, is_imrope, + rctx->freq_scale, freq_factors, rctx->corr_dims, + ne0, rctx->ext_factor, rctx->attn_factor, + theta_cache, rctx->theta_scale); + } else { + rope_cache_init(pos[i2], rctx->freq_scale, freq_factors, rctx->corr_dims, + ne0, rctx->ext_factor, rctx->attn_factor, + theta_cache, rctx->theta_scale); + } // FARF(HIGH, "rope-theta %u: ir %u i1 %u i2 %u i3 %u cache %p : usec %u", ith, ir, i1, i2, i3, theta_cache, // (unsigned) HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - rctx->t_start)); From b28a2f372a4a470a90ad10f93654e5dc33e78949 Mon Sep 17 00:00:00 2001 From: shaofeiqi Date: Tue, 19 May 2026 14:29:00 -0700 Subject: [PATCH 084/309] opencl: add MoE support for q4_k, q5_k, q6_k on Adreno (#23303) * opencl: add q4_k moe support * opencl: add q5_k moe support * opencl: add q6_k moe support * opencl: adjust format --------- Co-authored-by: Li He --- ggml/src/ggml-opencl/CMakeLists.txt | 6 + ggml/src/ggml-opencl/ggml-opencl.cpp | 948 +++++++++++++++++- ggml/src/ggml-opencl/kernels/cvt.cl | 385 +++++++ .../kernels/gemm_moe_q4_k_f32_ns.cl | 279 ++++++ .../kernels/gemm_moe_q5_k_f32_ns.cl | 284 ++++++ .../kernels/gemm_moe_q6_k_f32_ns.cl | 263 +++++ .../kernels/gemv_moe_q4_k_f32_ns.cl | 151 +++ .../kernels/gemv_moe_q5_k_f32_ns.cl | 156 +++ .../kernels/gemv_moe_q6_k_f32_ns.cl | 137 +++ 9 files changed, 2601 insertions(+), 8 deletions(-) create mode 100644 ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl create mode 100644 ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl create mode 100644 ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl create mode 100644 ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl create mode 100644 ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl create mode 100644 ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index c6aba608736..f75d089b574 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -110,6 +110,12 @@ set(GGML_OPENCL_KERNELS gemv_moe_q5_0_f32_ns gemm_moe_q5_1_f32_ns gemv_moe_q5_1_f32_ns + gemm_moe_q4_k_f32_ns + gemv_moe_q4_k_f32_ns + gemm_moe_q5_k_f32_ns + gemv_moe_q5_k_f32_ns + gemm_moe_q6_k_f32_ns + gemv_moe_q6_k_f32_ns gemm_moe_mxfp4_f32 gemv_moe_mxfp4_f32 gemm_moe_mxfp4_f32_ns diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 0e511592d53..a3af8c2da41 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -558,6 +558,9 @@ struct ggml_backend_opencl_context { cl_kernel kernel_convert_block_q4_1_trans4_ns, kernel_restore_block_q4_1_trans4_ns; cl_kernel kernel_convert_block_q5_0_trans4_ns, kernel_restore_block_q5_0_trans4_ns; cl_kernel kernel_convert_block_q5_1_trans4_ns, kernel_restore_block_q5_1_trans4_ns; + cl_kernel kernel_convert_block_q4_k_trans4_ns, kernel_restore_block_q4_k_trans4_ns; + cl_kernel kernel_convert_block_q5_k_trans4_ns, kernel_restore_block_q5_k_trans4_ns; + cl_kernel kernel_convert_block_q6_k_trans4_ns, kernel_restore_block_q6_k_trans4_ns; cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans; cl_kernel kernel_convert_block_mxfp4_trans4_ns, kernel_restore_block_mxfp4_trans4_ns; cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans; @@ -619,6 +622,9 @@ struct ggml_backend_opencl_context { cl_kernel kernel_gemv_moe_q4_1_f32_ns, kernel_gemm_moe_q4_1_f32_ns; cl_kernel kernel_gemv_moe_q5_0_f32_ns, kernel_gemm_moe_q5_0_f32_ns; cl_kernel kernel_gemv_moe_q5_1_f32_ns, kernel_gemm_moe_q5_1_f32_ns; + cl_kernel kernel_gemv_moe_q4_k_f32_ns, kernel_gemm_moe_q4_k_f32_ns; + cl_kernel kernel_gemv_moe_q5_k_f32_ns, kernel_gemm_moe_q5_k_f32_ns; + cl_kernel kernel_gemv_moe_q6_k_f32_ns, kernel_gemm_moe_q6_k_f32_ns; cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32; cl_kernel kernel_gemv_moe_mxfp4_f32_ns, kernel_gemm_moe_mxfp4_f32_ns; cl_kernel kernel_moe_reorder_b; @@ -981,6 +987,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve CL_CHECK((backend_ctx->kernel_restore_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0_trans4_ns", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q5_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1_trans4_ns", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q5_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_1_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q4_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_k_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q4_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_k_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q5_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_k_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q5_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_k_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q6_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_k_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q6_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_k_trans4_ns", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans4_ns", &err), err)); @@ -3071,6 +3083,108 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // gemv_moe_q4_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemv_moe_q4_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemv_moe_q4_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemv_moe_q4_k_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q4_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemm_moe_q4_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemm_moe_q4_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemm_moe_q4_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemm_moe_q4_k_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q4_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemv_moe_q5_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemv_moe_q5_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemv_moe_q5_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemv_moe_q5_k_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q5_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemm_moe_q5_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemm_moe_q5_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemm_moe_q5_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemm_moe_q5_k_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q5_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemv_moe_q6_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemv_moe_q6_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemv_moe_q6_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemv_moe_q6_k_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q6_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemm_moe_q6_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemm_moe_q6_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemm_moe_q6_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemm_moe_q6_k_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q6_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + // gemv_moe_mxfp4_f32_ns { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -4148,6 +4262,8 @@ struct ggml_tensor_extra_cl_iq4_nl { struct ggml_tensor_extra_cl_q4_K { // Quantized values cl_mem q = nullptr; + // Quantized values in image1d_buffer_t. + cl_mem q_img = nullptr; // Scales for each super block. cl_mem s = nullptr; // Scales @@ -4176,12 +4292,18 @@ struct ggml_tensor_extra_cl_q4_K { CL_CHECK(clReleaseMemObject(dm)); dm = nullptr; } + if (q_img != nullptr) { + CL_CHECK(clReleaseMemObject(q_img)); + q_img = nullptr; + } } }; struct ggml_tensor_extra_cl_q5_K { // Lower 4 bits of quantized weights. cl_mem q = nullptr; + // Quantized values in image1d_buffer_t. + cl_mem q_img = nullptr; // Upper 1 bit of quantized weights. cl_mem qh = nullptr; // Scales for each block. @@ -4222,6 +4344,10 @@ struct ggml_tensor_extra_cl_q5_K { CL_CHECK(clReleaseMemObject(dm)); dm = nullptr; } + if (q_img != nullptr) { + CL_CHECK(clReleaseMemObject(q_img)); + q_img = nullptr; + } size_q = 0; size_qh = 0; @@ -4234,6 +4360,8 @@ struct ggml_tensor_extra_cl_q5_K { struct ggml_tensor_extra_cl_q6_K { // Lower 4 bits of quantized weights. cl_mem ql = nullptr; + // Lower 4 bits as image1d_buffer_t + cl_mem ql_img = nullptr; // Upper 2 bits of quantized weights. cl_mem qh = nullptr; // Scales for each block. @@ -4267,6 +4395,10 @@ struct ggml_tensor_extra_cl_q6_K { CL_CHECK(clReleaseMemObject(d)); d = nullptr; } + if (ql_img != nullptr) { + CL_CHECK(clReleaseMemObject(ql_img)); + ql_img = nullptr; + } size_ql = 0; size_qh = 0; @@ -4700,7 +4832,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te // the quantizations here currently do not - they are only supported by Adreno with certain shapes if (op->src[0]->type == GGML_TYPE_Q4_1 || op->src[0]->type == GGML_TYPE_Q5_0 || - op->src[0]->type == GGML_TYPE_Q5_1) { + op->src[0]->type == GGML_TYPE_Q5_1 || + op->src[0]->type == GGML_TYPE_Q4_K || + op->src[0]->type == GGML_TYPE_Q5_K || + op->src[0]->type == GGML_TYPE_Q6_K) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS if (op->src[1]->type == GGML_TYPE_F32) { return use_adreno_moe_kernels(backend_ctx, op->src[0]) @@ -6047,14 +6182,57 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); CL_CHECK(err); - #ifdef GGML_OPENCL_USE_ADRENO_KERNELS +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_kernel kernel = backend_ctx->kernel_convert_block_q4_k_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + + cl_uchar mask_0F = 0x0F; + cl_uchar mask_F0 = 0xF0; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->dm)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + cl_image_format img_format_q = {CL_R, CL_UNSIGNED_INT32}; + cl_image_desc img_desc_q = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast(ggml_nelements(tensor) / 8), + 0, 0, 0, 0, 0, 0, 0, + { extra->q } + }; + extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err); + CL_CHECK(err); + tensor->extra = extra; + + return; + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K; if (use_adreno_kernels(backend_ctx, tensor)) { kernel = backend_ctx->kernel_convert_block_q4_K_noshuffle; } - #else +#else cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K; - #endif +#endif // GGML_OPENCL_USE_ADRENO_KERNELS cl_uchar mask_0F = 0x0F; cl_uchar mask_F0 = 0xF0; @@ -6157,14 +6335,58 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, CL_CHECK((extra->qh = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); CL_CHECK(err); - #ifdef GGML_OPENCL_USE_ADRENO_KERNELS +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_kernel kernel = backend_ctx->kernel_convert_block_q5_k_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + + cl_uchar mask_0F = 0x0F; + cl_uchar mask_F0 = 0xF0; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->dm)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + cl_image_format img_format_q = {CL_R, CL_UNSIGNED_INT32}; + cl_image_desc img_desc_q = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast(ggml_nelements(tensor) / 8), + 0, 0, 0, 0, 0, 0, 0, + { extra->q } + }; + extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err); + CL_CHECK(err); + tensor->extra = extra; + + return; + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS cl_kernel kernel = backend_ctx->kernel_convert_block_q5_K; if (use_adreno_kernels(backend_ctx, tensor)) { kernel = backend_ctx->kernel_convert_block_q5_K_noshuffle; } - #else +#else cl_kernel kernel = backend_ctx->kernel_convert_block_q5_K; - #endif +#endif cl_uchar mask_0F = 0x0F; cl_uchar mask_F0 = 0xF0; @@ -6232,6 +6454,79 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, cl_buffer_region region; + cl_uchar mask_0F = 0x0F; + cl_uchar mask_F0 = 0xF0; + +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + // Adreno MoE Q6_K kernel needs special transposed layout + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + size_t moe_size_ql = (size_t)(ggml_nelements(tensor) / 8) * sizeof(uint32_t); // 4 bits per element + size_t moe_size_qh = (size_t)(ggml_nelements(tensor) / 16) * sizeof(uint32_t); // 2 bits per element + size_t moe_size_s = size_s; + size_t moe_size_d = size_d; + + // Subbuffer for ql + region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); + region.size = moe_size_ql; + CL_CHECK((extra->ql = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + auto previous_origin = region.origin; + + // Subbuffer for qh + region.origin = align_to(previous_origin + moe_size_ql, backend_ctx->alignment); + region.size = moe_size_qh; + CL_CHECK((extra->qh = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + previous_origin = region.origin; + + // Subbuffer for scales + region.origin = align_to(previous_origin + moe_size_qh, backend_ctx->alignment); + region.size = moe_size_s; + CL_CHECK((extra->s = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + previous_origin = region.origin; + + // Subbuffer for d + region.origin = align_to(previous_origin + moe_size_s, backend_ctx->alignment); + region.size = moe_size_d; + CL_CHECK((extra->d = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + + cl_kernel kernel = backend_ctx->kernel_convert_block_q6_k_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->ql)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + // Create image for ql + cl_image_format img_format_ql = {CL_R, CL_UNSIGNED_INT32}; + cl_image_desc img_desc_ql = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast(ggml_nelements(tensor) / 8), + 0, 0, 0, 0, 0, 0, 0, + { extra->ql } + }; + extra->ql_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_ql, &img_desc_ql, NULL, &err); + tensor->extra = extra; + + return; + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + // Subbuffer for ql region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); region.size = size_ql; @@ -6825,6 +7120,40 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, cl_uchar mask_F0 = 0xF0; #ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + + cl_kernel kernel = backend_ctx->kernel_restore_block_q4_k_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->dm)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + return; + } if (use_adreno_kernels(backend_ctx, tensor)) { int M = tensor->ne[1]; int K = tensor->ne[0]; @@ -6901,6 +7230,40 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, cl_uchar mask_F0 = 0xF0; #ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + cl_kernel kernel = backend_ctx->kernel_restore_block_q5_k_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->dm)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + return; + } if (use_adreno_kernels(backend_ctx, tensor)) { int M = tensor->ne[1]; int K = tensor->ne[0]; @@ -6974,7 +7337,44 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, if (tensor->type == GGML_TYPE_Q6_K) { ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra; -#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + cl_uchar mask_0F = 0x0F; + cl_uchar mask_F0 = 0xF0; + +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + + cl_kernel kernel = backend_ctx->kernel_restore_block_q6_k_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->ql)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + return; + } if (use_adreno_kernels(backend_ctx, tensor)) { static ggml_cl_buffer buf_trans_ql; static ggml_cl_buffer buf_trans_qh; @@ -13733,6 +14133,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra; ggml_tensor_extra_cl_q5_0 * extra0_q5_0 = (ggml_tensor_extra_cl_q5_0 *)src0->extra; ggml_tensor_extra_cl_q5_1 * extra0_q5_1 = (ggml_tensor_extra_cl_q5_1 *)src0->extra; + ggml_tensor_extra_cl_q4_K * extra0_q4_K = (ggml_tensor_extra_cl_q4_K *)src0->extra; + ggml_tensor_extra_cl_q5_K * extra0_q5_K = (ggml_tensor_extra_cl_q5_K *)src0->extra; + ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra; ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra; ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra; #endif @@ -13741,6 +14144,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, (void)extra0_q4_1; (void)extra0_q5_0; (void)extra0_q5_1; + (void)extra0_q4_K; + (void)extra0_q5_K; + (void)extra0_q6_K; const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; @@ -14612,6 +15018,532 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, #endif // GGML_OPENCL_SOA_Q break; } + case GGML_TYPE_Q4_K: { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, src0)) { + cl_int status; + + size_t local_size[3] = {64, 2, 1}; + size_t global_size[3] = {64, 2, 1}; + + if (ne12 == 1) { // for gemv + kernel = backend_ctx->kernel_gemv_moe_q4_k_f32_ns; + + cl_mem src1_sub_buffer, buf_src1_image, buf_src2; + + // create a sub_buffer for src2 + cl_buffer_region region; + region.origin = offset2; + region.size = ne20 * ne21 * sizeof(int); + buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // set thread grid + global_size[0] = static_cast(ne01); + global_size[1] = 4; + global_size[2] = static_cast(ne20); + local_size[1] = 4; + + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // create image for src1 + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}}; + buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->q)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->dm)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->s)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11)); + + // launch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + // deallocate sub buffers and images + CL_CHECK(clReleaseMemObject(src1_sub_buffer)); + CL_CHECK(clReleaseMemObject(buf_src1_image)); + CL_CHECK(clReleaseMemObject(buf_src2)); + + } else { // for gemm + kernel = backend_ctx->kernel_gemm_moe_q4_k_f32_ns; + + // Reorder router if called from test-backend-ops or when new router is generated. + // Otherwise reuse the reordered result from previous mul_mat_id call. + if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) { + moe_router_reoerder(backend, src2, ne20); + backend_ctx->toggle_reorder = false; + } + + cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image; + cl_mem buf_src2, buf_src2_emap; + + cl_buffer_region region; + region.origin = 0; + region.size = sizeof(int) * max_post_router_tile * n_tile_size; + buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + region.origin = 0; + region.size = sizeof(short) * max_post_router_tile; + buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Reorder activations + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Create image for reordered src1 + region.origin = 0; + region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float); + backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size); + buf_src1_reordered = clCreateSubBuffer( + backend_ctx->prealloc_act_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; + image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + unsigned short map_ratio = ne20 / ne11; + GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n"); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem), &sub_buf_src1_pre)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem), &buf_src1_reordered)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int), &ne00)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short), &map_ratio)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int), &n_tile_size)); + + size_t reorder_b_local_size[3] = {256, 1, 1}; + size_t reorder_b_global_size[3] = {static_cast(((ne00 / 4) + 255) / 256 * 256), static_cast(max_post_router_tile * n_tile_size), 1}; + + // Dispatch reorder kernel + backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst); + + // MoE kernel prepare + region.origin = offsetd; + region.size = ne0 * ne1 * ne2 * sizeof(float); + sub_buf_dst = clCreateSubBuffer( + extrad->data_device, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + // Create image for dst + cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT}; + cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}}; + buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->q_img)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->dm)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->s)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_dst_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + + // set thread grid + global_size[1] = static_cast((ne01 + 63) / 64); + global_size[2] = static_cast(max_post_router_tile); + local_size[1] = 1; + local_size[2] = 1; + + // Dispatch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + clReleaseMemObject(sub_buf_src1_pre); + clReleaseMemObject(buf_src1_reordered); + clReleaseMemObject(image_src1_reordered); + clReleaseMemObject(buf_src2); + clReleaseMemObject(buf_src2_emap); + clReleaseMemObject(sub_buf_dst); + clReleaseMemObject(buf_dst_image); + } + return; + } +#endif //GGML_OPENCL_USE_ADRENO_KERNELS + } + case GGML_TYPE_Q5_K: { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, src0)) { + cl_int status; + + size_t local_size[3] = {64, 2, 1}; + size_t global_size[3] = {64, 2, 1}; + + if (ne12 == 1) { // for gemv + kernel = backend_ctx->kernel_gemv_moe_q5_k_f32_ns; + + cl_mem src1_sub_buffer, buf_src1_image, buf_src2; + + // create a sub_buffer for src2 + cl_buffer_region region; + region.origin = offset2; + region.size = ne20 * ne21 * sizeof(int); + buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // set thread grid + global_size[0] = static_cast(ne01); + global_size[1] = 4; + global_size[2] = static_cast(ne20); + local_size[1] = 4; + + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // create image for src1 + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}}; + buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->q)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->dm)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->s)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11)); + + // launch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + // deallocate sub buffers and images + CL_CHECK(clReleaseMemObject(src1_sub_buffer)); + CL_CHECK(clReleaseMemObject(buf_src1_image)); + CL_CHECK(clReleaseMemObject(buf_src2)); + + } else { // for gemm + kernel = backend_ctx->kernel_gemm_moe_q5_k_f32_ns; + + // Reorder router if called from test-backend-ops or when new router is generated. + // Otherwise reuse the reordered result from previous mul_mat_id call. + if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) { + moe_router_reoerder(backend, src2, ne20); + backend_ctx->toggle_reorder = false; + } + + cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image; + cl_mem buf_src2, buf_src2_emap; + + cl_buffer_region region; + region.origin = 0; + region.size = sizeof(int) * max_post_router_tile * n_tile_size; + buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + region.origin = 0; + region.size = sizeof(short) * max_post_router_tile; + buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Reorder activations + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Create image for reordered src1 + // Use pre-allocated placeholder + region.origin = 0; + region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float); + backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size); + buf_src1_reordered = clCreateSubBuffer( + backend_ctx->prealloc_act_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; + image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + unsigned short map_ratio = ne20 / ne11; + GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n"); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem), &sub_buf_src1_pre)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem), &buf_src1_reordered)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int), &ne00)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short), &map_ratio)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int), &n_tile_size)); + + size_t reorder_b_local_size[3] = {256, 1, 1}; + size_t reorder_b_global_size[3] = {static_cast(((ne00 / 4) + 255) / 256 * 256), static_cast(max_post_router_tile * n_tile_size), 1}; + + // Dispatch reorder kernel + backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst); + + // MoE kernel prepare + // Create sub buffer for dst + region.origin = offsetd; + region.size = ne0 * ne1 * ne2 * sizeof(float); + sub_buf_dst = clCreateSubBuffer( + extrad->data_device, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + // Create image for dst + cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT}; + cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}}; + buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->q_img)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->s)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->dm)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_dst_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + + // set thread grid + global_size[1] = static_cast((ne01 + 63) / 64); + global_size[2] = static_cast(max_post_router_tile); + local_size[1] = 1; + local_size[2] = 1; + + // Dispatch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + clReleaseMemObject(sub_buf_src1_pre); + clReleaseMemObject(buf_src1_reordered); + clReleaseMemObject(image_src1_reordered); + clReleaseMemObject(buf_src2); + clReleaseMemObject(buf_src2_emap); + clReleaseMemObject(sub_buf_dst); + clReleaseMemObject(buf_dst_image); + } + return; + } +#endif //GGML_OPENCL_USE_ADRENO_KERNELS + } + case GGML_TYPE_Q6_K: { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, src0)) { + cl_int status; + + size_t local_size[3] = {64, 2, 1}; + size_t global_size[3] = {64, 2, 1}; + + if (ne12 == 1) { // for gemv + kernel = backend_ctx->kernel_gemv_moe_q6_k_f32_ns; + + cl_mem src1_sub_buffer, buf_src1_image, buf_src2; + + // create a sub_buffer for src2 + cl_buffer_region region; + region.origin = offset2; + region.size = ne20 * ne21 * sizeof(int); + buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // set thread grid + global_size[0] = static_cast(ne01); + global_size[1] = 4; + global_size[2] = static_cast(ne20); + local_size[1] = 4; + + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // create image for src1 + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}}; + buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->ql)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->s)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11)); + + // launch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + // deallocate sub buffers and images + CL_CHECK(clReleaseMemObject(src1_sub_buffer)); + CL_CHECK(clReleaseMemObject(buf_src1_image)); + CL_CHECK(clReleaseMemObject(buf_src2)); + + } else { // for gemm + kernel = backend_ctx->kernel_gemm_moe_q6_k_f32_ns; + + // Reorder router if called from test-backend-ops or when new router is generated. + // Otherwise reuse the reordered result from previous mul_mat_id call. + if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) { + moe_router_reoerder(backend, src2, ne20); + backend_ctx->toggle_reorder = false; + } + + cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image; + cl_mem buf_src2, buf_src2_emap; + + cl_buffer_region region; + region.origin = 0; + region.size = sizeof(int) * max_post_router_tile * n_tile_size; + buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + region.origin = 0; + region.size = sizeof(short) * max_post_router_tile; + buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Reorder activations + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Create image for reordered src1 + region.origin = 0; + region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float); + backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size); + buf_src1_reordered = clCreateSubBuffer( + backend_ctx->prealloc_act_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; + image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + unsigned short map_ratio = ne20 / ne11; + GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n"); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem), &sub_buf_src1_pre)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem), &buf_src1_reordered)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int), &ne00)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short), &map_ratio)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int), &n_tile_size)); + + size_t reorder_b_local_size[3] = {256, 1, 1}; + size_t reorder_b_global_size[3] = {static_cast(((ne00 / 4) + 255) / 256 * 256), static_cast(max_post_router_tile * n_tile_size), 1}; + + // Dispatch reorder kernel + backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst); + + // MoE kernel prepare + // Create sub buffer for dst + region.origin = offsetd; + region.size = ne0 * ne1 * ne2 * sizeof(float); + sub_buf_dst = clCreateSubBuffer( + extrad->data_device, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + // Create image for dst + cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT}; + cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}}; + buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->ql_img)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->s)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_dst_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + + // set thread grid + global_size[1] = static_cast((ne01 + 63) / 64); + global_size[2] = static_cast(max_post_router_tile); + local_size[1] = 1; + local_size[2] = 1; + + // Dispatch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + clReleaseMemObject(sub_buf_src1_pre); + clReleaseMemObject(buf_src1_reordered); + clReleaseMemObject(image_src1_reordered); + clReleaseMemObject(buf_src2); + clReleaseMemObject(buf_src2_emap); + clReleaseMemObject(sub_buf_dst); + clReleaseMemObject(buf_dst_image); + } + return; + } +#endif //GGML_OPENCL_USE_ADRENO_KERNELS + } case GGML_TYPE_MXFP4: { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS if (use_adreno_moe_kernels(backend_ctx, src0)) { diff --git a/ggml/src/ggml-opencl/kernels/cvt.cl b/ggml/src/ggml-opencl/kernels/cvt.cl index 8f06d570587..312366984b6 100644 --- a/ggml/src/ggml-opencl/kernels/cvt.cl +++ b/ggml/src/ggml-opencl/kernels/cvt.cl @@ -664,6 +664,391 @@ kernel void kernel_restore_block_q5_1_trans4_ns( ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block; } +kernel void kernel_convert_block_q4_k_trans4_ns( + __global struct block_q4_K * src0, + __global uint * dst_q, + __global half * dst_d, + __global half * dst_dm, + __global uchar * dst_s, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); + uint i01 = get_global_id(0); + uint i02 = get_global_id(2); + + uint ne00_blk = ne00 / QK_K; + uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + + __global struct block_q4_K * b = src0 + src_blk_offset; + + dst_d [dst_blk_offset] = b->d; + dst_dm[dst_blk_offset] = b->dm; + + uint4 qv[8]; + uchar * qv_bytes = (uchar *)qv; + for (int i = 0; i < QK_K / 64; ++i) { + for (int j = 0; j < 16; ++j) { + uchar x0 = b->q[i*32 + 2*j]; + uchar x1 = b->q[i*32 + 2*j + 1]; + + qv_bytes[i*32 + j ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4); + qv_bytes[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0); + } + } + + uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + #pragma unroll + for (int p = 0; p < 8; ++p) { + uint4 v = qv[p]; + dst_q[base + (p * 4 + 0) * ne01] = v.x; + dst_q[base + (p * 4 + 1) * ne01] = v.y; + dst_q[base + (p * 4 + 2) * ne01] = v.z; + dst_q[base + (p * 4 + 3) * ne01] = v.w; + } + + __global uchar * s_dst = dst_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE; + #pragma unroll + for (int i = 0; i < K_SCALE_SIZE; ++i) { + s_dst[i] = b->s[i]; + } +} + +kernel void kernel_restore_block_q4_k_trans4_ns( + __global uint * src_q, + __global half * src_d, + __global half * src_dm, + __global uchar * src_s, + __global struct block_q4_K * dst0, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); // block index along K + uint i01 = get_global_id(0); // row index + uint i02 = get_global_id(2); // batch index + + uint ne00_blk = ne00 / QK_K; + + uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + + __global struct block_q4_K * b = dst0 + dst_blk_offset; + + b->d = src_d[src_blk_offset]; + b->dm = src_dm[src_blk_offset]; + + __global uchar * s_src = src_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE; + for (int i = 0; i < K_SCALE_SIZE; ++i) { + b->s[i] = s_src[i]; + } + + uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + + uint4 qv[8]; + for (int p = 0; p < 8; ++p) { + qv[p].x = src_q[base + (p * 4 + 0) * ne01]; + qv[p].y = src_q[base + (p * 4 + 1) * ne01]; + qv[p].z = src_q[base + (p * 4 + 2) * ne01]; + qv[p].w = src_q[base + (p * 4 + 3) * ne01]; + } + + uchar * qv_bytes = (uchar *)qv; + for (int i = 0; i < QK_K / 64; ++i) { + for (int j = 0; j < 16; ++j) { + uchar lo = qv_bytes[i*32 + j]; + uchar hi = qv_bytes[i*32 + j + 16]; + b->q[i*32 + 2*j] = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4)); + b->q[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0)); + } + } +} + +kernel void kernel_convert_block_q5_k_trans4_ns( + __global struct block_q5_K * src0, + __global uint * dst_qs, + __global uint * dst_qh, + __global half * dst_d, + __global half * dst_dm, + __global uchar * dst_s, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); + uint i01 = get_global_id(0); + uint i02 = get_global_id(2); + + uint ne00_blk = ne00 / QK_K; + uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + + __global struct block_q5_K * b = src0 + src_blk_offset; + + dst_d [dst_blk_offset] = b->d; + dst_dm[dst_blk_offset] = b->dm; + + for (int k = 0; k < 8; k++) { + uchar b0 = 0, b1 = 0, b2 = 0, b3 = 0; + for (int bit = 0; bit < 8; bit++) { + b0 |= (uchar)(((b->qh[bit] >> k) & 1) << bit); + b1 |= (uchar)(((b->qh[8 + bit] >> k) & 1) << bit); + b2 |= (uchar)(((b->qh[16 + bit] >> k) & 1) << bit); + b3 |= (uchar)(((b->qh[24 + bit] >> k) & 1) << bit); + } + uint packed = (uint)b0 | ((uint)b1 << 8) | ((uint)b2 << 16) | ((uint)b3 << 24); + dst_qh[i01 + (i00 * 8 + k) * ne01 + i02 * ne00_blk * 8 * ne01] = packed; + } + + uint4 qv[8]; + uchar * qv_bytes = (uchar *)qv; + for (int i = 0; i < QK_K / 64; ++i) { + for (int j = 0; j < 16; ++j) { + uchar x0 = b->qs[i*32 + 2*j]; + uchar x1 = b->qs[i*32 + 2*j + 1]; + + qv_bytes[i*32 + j ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4); + qv_bytes[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0); + } + } + + uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + #pragma unroll + for (int p = 0; p < 8; ++p) { + uint4 v = qv[p]; + dst_qs[base + (p * 4 + 0) * ne01] = v.x; + dst_qs[base + (p * 4 + 1) * ne01] = v.y; + dst_qs[base + (p * 4 + 2) * ne01] = v.z; + dst_qs[base + (p * 4 + 3) * ne01] = v.w; + } + + __global uchar * s_dst = dst_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE; + #pragma unroll + for (int i = 0; i < K_SCALE_SIZE; ++i) { + s_dst[i] = b->s[i]; + } +} + +kernel void kernel_restore_block_q5_k_trans4_ns( + __global uint * src_qs, + __global uint * src_qh, + __global half * src_d, + __global half * src_dm, + __global uchar * src_s, + __global struct block_q5_K * dst0, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); // block index along K + uint i01 = get_global_id(0); // row index + uint i02 = get_global_id(2); // batch index + + uint ne00_blk = ne00 / QK_K; + + uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + + __global struct block_q5_K * b = dst0 + dst_blk_offset; + + b->d = src_d[src_blk_offset]; + b->dm = src_dm[src_blk_offset]; + + for (int j = 0; j < 32; j++) b->qh[j] = 0; + for (int k = 0; k < 8; k++) { + uint packed = src_qh[i01 + (i00 * 8 + k) * ne01 + i02 * ne00_blk * 8 * ne01]; + uchar b0 = (uchar)(packed & 0xFF); + uchar b1 = (uchar)((packed >> 8) & 0xFF); + uchar b2 = (uchar)((packed >> 16) & 0xFF); + uchar b3 = (uchar)((packed >> 24) & 0xFF); + for (int bit = 0; bit < 8; bit++) { + b->qh[bit] |= (uchar)(((b0 >> bit) & 1) << k); + b->qh[8 + bit] |= (uchar)(((b1 >> bit) & 1) << k); + b->qh[16 + bit] |= (uchar)(((b2 >> bit) & 1) << k); + b->qh[24 + bit] |= (uchar)(((b3 >> bit) & 1) << k); + } + } + + __global uchar * s_src = src_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE; + for (int i = 0; i < K_SCALE_SIZE; ++i) { + b->s[i] = s_src[i]; + } + + uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + + uint4 qv[8]; + for (int p = 0; p < 8; ++p) { + qv[p].x = src_qs[base + (p * 4 + 0) * ne01]; + qv[p].y = src_qs[base + (p * 4 + 1) * ne01]; + qv[p].z = src_qs[base + (p * 4 + 2) * ne01]; + qv[p].w = src_qs[base + (p * 4 + 3) * ne01]; + } + + uchar * qv_bytes = (uchar *)qv; + for (int i = 0; i < QK_K / 64; ++i) { + for (int j = 0; j < 16; ++j) { + uchar lo = qv_bytes[i*32 + j]; + uchar hi = qv_bytes[i*32 + j + 16]; + b->qs[i*32 + 2*j] = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4)); + b->qs[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0)); + } + } +} + +kernel void kernel_convert_block_q6_k_trans4_ns( + __global struct block_q6_K * src0, + __global uint * dst_ql, + __global uint * dst_qh, + __global half * dst_d, + __global char * dst_s, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); + uint i01 = get_global_id(0); + uint i02 = get_global_id(2); + + uint ne00_blk = ne00 / QK_K; + + uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + + __global struct block_q6_K * b = src0 + src_blk_offset; + + dst_d[dst_blk_offset] = b->d; + + uint4 qlv[8]; + uchar * qlv_bytes = (uchar *)qlv; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 16; ++j) { + uchar x0 = b->ql[i*64 + 2*j]; + uchar x1 = b->ql[i*64 + 2*j + 1]; + uchar x2 = b->ql[i*64 + 32 + 2*j]; + uchar x3 = b->ql[i*64 + 32 + 2*j + 1]; + qlv_bytes[i*64 + j ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4); + qlv_bytes[i*64 + j + 16] = convert_uchar(x2 & mask_0F) | convert_uchar((x3 & mask_0F) << 4); + qlv_bytes[i*64 + j + 32] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0); + qlv_bytes[i*64 + j + 48] = convert_uchar((x2 & mask_F0) >> 4) | convert_uchar(x3 & mask_F0); + } + } + + uint ql_base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + + #pragma unroll + for (int p = 0; p < 8; ++p) { + uint4 v = qlv[p]; + dst_ql[ql_base + (p * 4 + 0) * ne01] = v.x; + dst_ql[ql_base + (p * 4 + 1) * ne01] = v.y; + dst_ql[ql_base + (p * 4 + 2) * ne01] = v.z; + dst_ql[ql_base + (p * 4 + 3) * ne01] = v.w; + } + + uint qhv[16] = {0}; + + for (int n = 0; n < 2; ++n) { + for (int l = 0; l < 32; ++l) { + uchar h = b->qh[n*32 + l]; + int u = l / 16; + int bit_pos = (l % 16) * 2; + qhv[(n*4 + 0)*2 + u] |= ((uint)((h >> 0) & 0x03)) << bit_pos; + qhv[(n*4 + 1)*2 + u] |= ((uint)((h >> 2) & 0x03)) << bit_pos; + qhv[(n*4 + 2)*2 + u] |= ((uint)((h >> 4) & 0x03)) << bit_pos; + qhv[(n*4 + 3)*2 + u] |= ((uint)((h >> 6) & 0x03)) << bit_pos; + } + } + + uint qh_base = i02 * ne00_blk * ne01 * 16 + i00 * ne01 * 16 + i01; + + for (int p = 0; p < 16; ++p) { + dst_qh[qh_base + p * ne01] = qhv[p]; + } + + __global char * s_dst = dst_s + (i02 * ne01 + i01) * ne00_blk * 16 + i00 * 16; + #pragma unroll + for (int i = 0; i < 16; ++i) { + s_dst[i] = b->scales[i]; + } +} + +kernel void kernel_restore_block_q6_k_trans4_ns( + __global uint * src_ql, + __global uint * src_qh, + __global half * src_d, + __global char * src_s, + __global struct block_q6_K * dst0, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); // block index along K + uint i01 = get_global_id(0); // row index + uint i02 = get_global_id(2); // batch index + + uint ne00_blk = ne00 / QK_K; + + uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + + __global struct block_q6_K * b = dst0 + dst_blk_offset; + + b->d = src_d[src_blk_offset]; + + uint ql_base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + uint4 qlv[8]; + for (int p = 0; p < 8; ++p) { + qlv[p].x = src_ql[ql_base + (p * 4 + 0) * ne01]; + qlv[p].y = src_ql[ql_base + (p * 4 + 1) * ne01]; + qlv[p].z = src_ql[ql_base + (p * 4 + 2) * ne01]; + qlv[p].w = src_ql[ql_base + (p * 4 + 3) * ne01]; + } + + uchar * qlv_bytes = (uchar *)qlv; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 16; ++j) { + uchar lo_02 = qlv_bytes[i*64 + j]; + uchar lo_13 = qlv_bytes[i*64 + j + 16]; + uchar hi_02 = qlv_bytes[i*64 + j + 32]; + uchar hi_13 = qlv_bytes[i*64 + j + 48]; + b->ql[i*64 + 2*j] = convert_uchar((lo_02 & mask_0F) | ((hi_02 & mask_0F) << 4)); + b->ql[i*64 + 2*j + 1] = convert_uchar(((lo_02 & mask_F0) >> 4) | (hi_02 & mask_F0)); + b->ql[i*64 + 32 + 2*j] = convert_uchar((lo_13 & mask_0F) | ((hi_13 & mask_0F) << 4)); + b->ql[i*64 + 32 + 2*j + 1] = convert_uchar(((lo_13 & mask_F0) >> 4) | (hi_13 & mask_F0)); + } + } + + uint qh_base = i02 * ne00_blk * ne01 * 16 + i00 * ne01 * 16 + i01; + uint qhv[16]; + for (int p = 0; p < 16; ++p) { + qhv[p] = src_qh[qh_base + p * ne01]; + } + + for (int n = 0; n < 2; ++n) { + for (int l = 0; l < 32; ++l) { + int u = l / 16; + int bit_pos = (l % 16) * 2; + uchar v0 = (uchar)((qhv[(n*4 + 0)*2 + u] >> bit_pos) & 0x03); + uchar v1 = (uchar)((qhv[(n*4 + 1)*2 + u] >> bit_pos) & 0x03); + uchar v2 = (uchar)((qhv[(n*4 + 2)*2 + u] >> bit_pos) & 0x03); + uchar v3 = (uchar)((qhv[(n*4 + 3)*2 + u] >> bit_pos) & 0x03); + b->qh[n*32 + l] = v0 | (v1 << 2) | (v2 << 4) | (v3 << 6); + } + } + + __global char * s_src = src_s + (i02 * ne01 + i01) * ne00_blk * 16 + i00 * 16; + for (int i = 0; i < 16; ++i) { + b->scales[i] = s_src[i]; + } +} + //------------------------------------------------------------------------------ // block_mxfp4 //------------------------------------------------------------------------------ diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl new file mode 100644 index 00000000000..9d24aff6a20 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl @@ -0,0 +1,279 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable +#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable + +#define TILESIZE_K 16 +#define TILESIZE_M 64 +#define TILESIZE_N 32 +#define QK_K 256 +#define K_SCALE_SIZE 12 + +inline void get_scale_min_k4( + int j, + global const uchar * q, + uchar * d, + uchar * m +) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j+4] & 63; + } else { + *d = (q[j+4] & 0x0F) | ((q[j-4] & 0xC0) >> 2); + *m = ((q[j+4] >> 4) & 0x0F) | ((q[j] & 0xC0) >> 2); + } +} + +#define dequantize_q4_k(q4, a_f16, scale, minv) \ + a_f16.s0 = (half)((float)(q4.s0 & 0x000F) * scale - minv); \ + a_f16.s1 = (half)((float)((q4.s0 & 0x00F0) >> 4) * scale - minv); \ + a_f16.s2 = (half)((float)((q4.s0 & 0x0F00) >> 8) * scale - minv); \ + a_f16.s3 = (half)((float)((q4.s0 & 0xF000) >> 12) * scale - minv); \ + a_f16.s4 = (half)((float)(q4.s1 & 0x000F) * scale - minv); \ + a_f16.s5 = (half)((float)((q4.s1 & 0x00F0) >> 4) * scale - minv); \ + a_f16.s6 = (half)((float)((q4.s1 & 0x0F00) >> 8) * scale - minv); \ + a_f16.s7 = (half)((float)((q4.s1 & 0xF000) >> 12) * scale - minv); \ + a_f16.s8 = (half)((float)(q4.s2 & 0x000F) * scale - minv); \ + a_f16.s9 = (half)((float)((q4.s2 & 0x00F0) >> 4) * scale - minv); \ + a_f16.sa = (half)((float)((q4.s2 & 0x0F00) >> 8) * scale - minv); \ + a_f16.sb = (half)((float)((q4.s2 & 0xF000) >> 12) * scale - minv); \ + a_f16.sc = (half)((float)(q4.s3 & 0x000F) * scale - minv); \ + a_f16.sd = (half)((float)((q4.s3 & 0x00F0) >> 4) * scale - minv); \ + a_f16.se = (half)((float)((q4.s3 & 0x0F00) >> 8) * scale - minv); \ + a_f16.sf = (half)((float)((q4.s3 & 0xF000) >> 12) * scale - minv); \ + + +#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \ + acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \ + acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \ + acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \ + acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \ + acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \ + acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \ + acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \ + acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \ + acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \ + acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \ + acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \ + acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \ + acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \ + acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \ + acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \ + acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \ + acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \ + acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \ + acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \ + acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \ + acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \ + acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \ + acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \ + acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \ + acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \ + acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \ + acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \ + acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \ + acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \ + acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \ + acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \ + acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \ + acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \ + acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \ + acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \ + acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \ + acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \ + acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \ + acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \ + acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \ + acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \ + acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \ + acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \ + acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \ + acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \ + acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \ + acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \ + acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \ + acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \ + acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \ + acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \ + acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \ + acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \ + acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \ + acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \ + acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \ + acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \ + acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \ + acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \ + acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \ + acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \ + acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \ + acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + + +__attribute__((qcom_wave_pair_mode(1))) +kernel void kernel_gemm_moe_q4_k_f32_ns( + __read_only image1d_buffer_t src0_q, + __global half * src0_d, + __global half * src0_dm, + __global uchar * src0_s, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global ushort * src2_emap, + __write_only image1d_buffer_t dst, + __global int * total_tiles, + uint ne00, + uint ne01 +) { + uint block_id_m = get_global_id(1); // m_tile + uint block_id_n = get_global_id(2); // n_tile + + // Boundary check + if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) { + return; + } + + __private half16 reg_a; + __private float32 reg_c = (float32)(0); + __local half4 shared_b[128]; + + const ushort expert_id = src2_emap[block_id_n]; + + const uint row = block_id_m * TILESIZE_M; + const uint col = block_id_n * TILESIZE_N; + + uint sub_block_id_m = get_local_id(0); + uint2 b_global_offset; + b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00; + b_global_offset.y = b_global_offset.x + (16 * ne00); + uint2 b_local_offset; + b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2); + b_local_offset.y = b_local_offset.x + 16; + + uint num_superblocks = ne00 / QK_K; + uint scales_per_row = num_superblocks * K_SCALE_SIZE; + uint row_idx = row + get_global_id(0); + + // Loop along K axis, 32 elements per iteration (one sub-block), divided into 2 halves of 16 + for (uint step = 0; step < ne00; step += TILESIZE_K * 2) { + uint sub = step / 32; + uint sb = sub / 8; + uint j = sub % 8; + + // Load d and dm for super-block + uint d_offset = row + sb * ne01 + expert_id * num_superblocks * ne01 + get_global_id(0); + half d_val = src0_d[d_offset]; + half dm_val = src0_dm[d_offset]; + + // Load sub-block scale and min + global const uchar * sc = src0_s + (expert_id * ne01 + row_idx) * scales_per_row + sb * K_SCALE_SIZE; + uchar sv, mn; + get_scale_min_k4(j, sc, &sv, &mn); + + float scale = (float)d_val * (float)sv; + float minv = (float)dm_val * (float)mn; + + // First sub-block (16 elements) + uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + uint b_sub_offset = col * ne00 + step; + + // Load 16 q (64-bits) in transposed layout + uint2 q4x16; + q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x; + + // Load 16x32 floats from matrix B + float8 bx8_f32; + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + half8 bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + // Dequantization + dequantize_q4_k(as_ushort4(q4x16), reg_a, scale, minv); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + half16 acc; + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + + // Second half (next 16 elements, same sub-block scale) + uint half_step = step + TILESIZE_K; + q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + b_sub_offset = col * ne00 + half_step; + + q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x; + + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + dequantize_q4_k(as_ushort4(q4x16), reg_a, scale, minv); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + } + + // Load post router and share in LM + __local uint out_idx[TILESIZE_N]; + + if (get_local_id(0) < TILESIZE_N) { + uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)]; + if (idx == 0xFFFFFFFF) { + idx = src2[block_id_n * TILESIZE_N + 0]; + } + out_idx[get_local_id(0)] = idx * ne01; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // Scatter results back to original position in output grid + uint m_offset = row + get_local_id(0); + + write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1)); + write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2)); + write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3)); + write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4)); + write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5)); + write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6)); + write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7)); + write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8)); + write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9)); + write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa)); + write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb)); + write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc)); + write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd)); + write_imagef(dst, out_idx[14] + m_offset, (reg_c.se)); + write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf)); + write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg)); + write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh)); + write_imagef(dst, out_idx[18] + m_offset, (reg_c.si)); + write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj)); + write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk)); + write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl)); + write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm)); + write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn)); + write_imagef(dst, out_idx[24] + m_offset, (reg_c.so)); + write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp)); + write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq)); + write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr)); + write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss)); + write_imagef(dst, out_idx[29] + m_offset, (reg_c.st)); + write_imagef(dst, out_idx[30] + m_offset, (reg_c.su)); + write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv)); + + // Store zero padding parts to the index of first output in tile + barrier(CLK_GLOBAL_MEM_FENCE); + write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0)); +} diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl new file mode 100644 index 00000000000..808a0c7db6a --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl @@ -0,0 +1,284 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable +#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable + +#define TILESIZE_K 16 +#define TILESIZE_M 64 +#define TILESIZE_N 32 +#define QK_K 256 +#define K_SCALE_SIZE 12 + +inline void get_scale_min_k4( + int j, + global const uchar * q, + uchar * d, + uchar * m +) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j+4] & 63; + } else { + *d = (q[j+4] & 0x0F) | ((q[j-4] & 0xC0) >> 2); + *m = ((q[j+4] >> 4) & 0x0F) | ((q[j] & 0xC0) >> 2); + } +} + +#define dequantize_q5_k(qs5x16, qh5x16, a_f16, scale, m) \ + a_f16.s0 = (half)((float)(( qs5x16.s0 & 0x000F) | (( qh5x16.s0 & 0x01) << 4)) * scale + m); \ + a_f16.s1 = (half)((float)((((qs5x16.s0 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 1) & 0x01) << 4)) * scale + m)); \ + a_f16.s2 = (half)((float)((((qs5x16.s0 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 2) & 0x01) << 4)) * scale + m)); \ + a_f16.s3 = (half)((float)((((qs5x16.s0 & 0xF000) >> 12) | (((qh5x16.s0 >> 3) & 0x01) << 4)) * scale + m)); \ + a_f16.s4 = (half)((float)((( qs5x16.s1 & 0x000F) | (((qh5x16.s0 >> 4) & 0x01) << 4)) * scale + m)); \ + a_f16.s5 = (half)((float)((((qs5x16.s1 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 5) & 0x01) << 4)) * scale + m)); \ + a_f16.s6 = (half)((float)(((qs5x16.s1 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 6) & 0x01) << 4)) * scale + m); \ + a_f16.s7 = (half)((float)((((qs5x16.s1 & 0xF000) >> 12) | (((qh5x16.s0 >> 7) & 0x01) << 4)) * scale + m)); \ + a_f16.s8 = (half)((float)((( qs5x16.s2 & 0x000F) | (( qh5x16.s1 & 0x01) << 4)) * scale + m)); \ + a_f16.s9 = (half)((float)((((qs5x16.s2 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 1) & 0x01) << 4)) * scale + m)); \ + a_f16.sa = (half)((float)((((qs5x16.s2 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 2) & 0x01) << 4)) * scale + m)); \ + a_f16.sb = (half)((float)((((qs5x16.s2 & 0xF000) >> 12) | (((qh5x16.s1 >> 3) & 0x01) << 4)) * scale + m)); \ + a_f16.sc = (half)((float)((( qs5x16.s3 & 0x000F) | (((qh5x16.s1 >> 4) & 0x01) << 4)) * scale + m)); \ + a_f16.sd = (half)((float)((((qs5x16.s3 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 5) & 0x01) << 4)) * scale + m)); \ + a_f16.se = (half)((float)((((qs5x16.s3 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 6) & 0x01) << 4)) * scale + m)); \ + a_f16.sf = (half)((float)((((qs5x16.s3 & 0xF000) >> 12) | (((qh5x16.s1 >> 7) & 0x01) << 4)) * scale + m)); \ + + +#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \ + acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \ + acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \ + acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \ + acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \ + acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \ + acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \ + acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \ + acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \ + acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \ + acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \ + acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \ + acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \ + acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \ + acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \ + acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \ + acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \ + acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \ + acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \ + acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \ + acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \ + acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \ + acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \ + acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \ + acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \ + acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \ + acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \ + acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \ + acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \ + acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \ + acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \ + acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \ + acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \ + acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \ + acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \ + acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \ + acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \ + acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \ + acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \ + acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \ + acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \ + acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \ + acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \ + acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \ + acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \ + acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \ + acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \ + acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \ + acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \ + acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \ + acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \ + acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \ + acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \ + acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \ + acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \ + acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \ + acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \ + acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \ + acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \ + acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \ + acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \ + acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \ + acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \ + acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + + +__attribute__((qcom_wave_pair_mode(1))) +kernel void kernel_gemm_moe_q5_k_f32_ns( + __read_only image1d_buffer_t src0_q, + __global uint * src0_qh, + __global uchar * src0_s, + __global half * src0_d, + __global half * src0_dm, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global ushort * src2_emap, + __write_only image1d_buffer_t dst, + __global int * total_tiles, + uint ne00, + uint ne01 +) { + uint block_id_m = get_global_id(1); // m_tile + uint block_id_n = get_global_id(2); // n_tile + + // Boundary check + if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) { + return; + } + + __private half16 reg_a; + __private float32 reg_c = (float32)(0); + __local half4 shared_b[128]; + + const ushort expert_id = src2_emap[block_id_n]; + + const uint row = block_id_m * TILESIZE_M; + const uint col = block_id_n * TILESIZE_N; + + uint sub_block_id_m = get_local_id(0); + uint2 b_global_offset; + b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00; + b_global_offset.y = b_global_offset.x + (16 * ne00); + uint2 b_local_offset; + b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2); + b_local_offset.y = b_local_offset.x + 16; + + uint num_superblocks = ne00 / QK_K; + uint scales_per_row = num_superblocks * K_SCALE_SIZE; + uint row_idx = row + get_global_id(0); + + // Loop along K axis, 32 elements per iteration (one sub-block), divided into 2 halves of 16 + for (uint step = 0; step < ne00; step += TILESIZE_K * 2) { + uint sub = step / 32; + uint sb = sub / 8; + uint j = sub % 8; + + // Load d and dm for super-block + uint d_offset = row + sb * ne01 + expert_id * num_superblocks * ne01 + get_global_id(0); + half d_val = src0_d[d_offset]; + half dm_val = src0_dm[d_offset]; + + // Load sub-block scale and min + global const uchar * sc = src0_s + (expert_id * ne01 + row_idx) * scales_per_row + sb * K_SCALE_SIZE; + uchar sv, mn; + get_scale_min_k4(j, sc, &sv, &mn); + + float scale = (float)d_val * (float)sv; + float minv = -(float)dm_val * (float)mn; + + // qh is stored at sub-block granularity + uint qh_offset = row + sub * ne01 + expert_id * num_superblocks * 8 * ne01 + get_global_id(0); + uchar4 qhx32 = as_uchar4(src0_qh[qh_offset]); + + // First sub-block (16 elements) + uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + uint b_sub_offset = col * ne00 + step; + + // Load 16 q (64-bits) in transposed layout + uint2 q4x16; + q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x; + + // Load 16x32 floats from matrix B + float8 bx8_f32; + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + half8 bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + // Dequantization + dequantize_q5_k(as_ushort4(q4x16), qhx32.lo, reg_a, scale, minv); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + half16 acc; + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + + // Second half + uint half_step = step + TILESIZE_K; + q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + b_sub_offset = col * ne00 + half_step; + + q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x; + + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + dequantize_q5_k(as_ushort4(q4x16), qhx32.hi, reg_a, scale, minv); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + } + + // Load post router and share in LM + __local uint out_idx[TILESIZE_N]; + + if (get_local_id(0) < TILESIZE_N) { + uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)]; + if (idx == 0xFFFFFFFF) { + idx = src2[block_id_n * TILESIZE_N + 0]; + } + out_idx[get_local_id(0)] = idx * ne01; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // Scatter results back to original position in output grid + uint m_offset = row + get_local_id(0); + + write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1)); + write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2)); + write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3)); + write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4)); + write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5)); + write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6)); + write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7)); + write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8)); + write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9)); + write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa)); + write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb)); + write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc)); + write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd)); + write_imagef(dst, out_idx[14] + m_offset, (reg_c.se)); + write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf)); + write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg)); + write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh)); + write_imagef(dst, out_idx[18] + m_offset, (reg_c.si)); + write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj)); + write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk)); + write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl)); + write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm)); + write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn)); + write_imagef(dst, out_idx[24] + m_offset, (reg_c.so)); + write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp)); + write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq)); + write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr)); + write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss)); + write_imagef(dst, out_idx[29] + m_offset, (reg_c.st)); + write_imagef(dst, out_idx[30] + m_offset, (reg_c.su)); + write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv)); + + // Store zero padding parts to the index of first output in tile + barrier(CLK_GLOBAL_MEM_FENCE); + write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0)); +} diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl new file mode 100644 index 00000000000..a040335adfa --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl @@ -0,0 +1,263 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable +#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable + +#define TILESIZE_K 16 +#define TILESIZE_M 64 +#define TILESIZE_N 32 +#define QK_K 256 + +#define dequantize_q6_k(qs16, qh16, a_f16, scale) \ + a_f16.s0 = (half)(((float)(( qs16.s0 & 0x000F) | ((uint)(( qh16 ) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s1 = (half)(((float)((( qs16.s0 >> 4) & 0x000F) | ((uint)(( qh16 >> 2) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s2 = (half)(((float)((( qs16.s0 >> 8) & 0x000F) | ((uint)(( qh16 >> 4) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s3 = (half)(((float)((( qs16.s0 >>12) & 0x000F) | ((uint)(( qh16 >> 6) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s4 = (half)(((float)(( qs16.s1 & 0x000F) | ((uint)(( qh16 >> 8) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s5 = (half)(((float)((( qs16.s1 >> 4) & 0x000F) | ((uint)(( qh16 >> 10) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s6 = (half)(((float)((( qs16.s1 >> 8) & 0x000F) | ((uint)(( qh16 >> 12) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s7 = (half)(((float)((( qs16.s1 >>12) & 0x000F) | ((uint)(( qh16 >> 14) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s8 = (half)(((float)(( qs16.s2 & 0x000F) | ((uint)(( qh16 >> 16) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s9 = (half)(((float)((( qs16.s2 >> 4) & 0x000F) | ((uint)(( qh16 >> 18) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.sa = (half)(((float)((( qs16.s2 >> 8) & 0x000F) | ((uint)(( qh16 >> 20) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.sb = (half)(((float)((( qs16.s2 >>12) & 0x000F) | ((uint)(( qh16 >> 22) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.sc = (half)(((float)(( qs16.s3 & 0x000F) | ((uint)(( qh16 >> 24) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.sd = (half)(((float)((( qs16.s3 >> 4) & 0x000F) | ((uint)(( qh16 >> 26) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.se = (half)(((float)((( qs16.s3 >> 8) & 0x000F) | ((uint)(( qh16 >> 28) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.sf = (half)(((float)((( qs16.s3 >>12) & 0x000F) | ((uint)(( qh16 >> 30) & 0x3) << 4)) - 32.f) * scale); \ + + +#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \ + acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \ + acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \ + acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \ + acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \ + acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \ + acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \ + acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \ + acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \ + acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \ + acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \ + acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \ + acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \ + acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \ + acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \ + acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \ + acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \ + acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \ + acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \ + acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \ + acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \ + acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \ + acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \ + acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \ + acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \ + acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \ + acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \ + acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \ + acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \ + acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \ + acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \ + acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \ + acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \ + acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \ + acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \ + acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \ + acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \ + acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \ + acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \ + acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \ + acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \ + acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \ + acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \ + acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \ + acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \ + acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \ + acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \ + acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \ + acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \ + acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \ + acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \ + acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \ + acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \ + acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \ + acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \ + acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \ + acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \ + acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \ + acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \ + acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \ + acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \ + acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \ + acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \ + acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + + +__attribute__((qcom_wave_pair_mode(1))) +kernel void kernel_gemm_moe_q6_k_f32_ns( + __read_only image1d_buffer_t src0_ql, + __global uint * src0_qh, + __global char * src0_s, + __global half * src0_d, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global ushort * src2_emap, + __write_only image1d_buffer_t dst, + __global int * total_tiles, + uint ne00, + uint ne01 +) { + uint block_id_m = get_global_id(1); // m_tile + uint block_id_n = get_global_id(2); // n_tile + + // Boundary check + if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) { + return; + } + + __private half16 reg_a; + __private float32 reg_c = (float32)(0); + __local half4 shared_b[128]; + + const ushort expert_id = src2_emap[block_id_n]; + + const uint row = block_id_m * TILESIZE_M; + const uint col = block_id_n * TILESIZE_N; + + uint sub_block_id_m = get_local_id(0); + uint2 b_global_offset; + b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00; + b_global_offset.y = b_global_offset.x + (16 * ne00); + uint2 b_local_offset; + b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2); + b_local_offset.y = b_local_offset.x + 16; + + uint num_superblocks = ne00 / QK_K; + uint scales_per_row = num_superblocks * 16; + uint row_idx = row + get_global_id(0); + + // Loop along K axis, 32 elements per iteration (one sub-block), divided into 2 halves of 16 + for (uint step = 0; step < ne00; step += TILESIZE_K * 2) { + uint sub = step / 32; // 32-element group index + uint sb = sub / 8; // super-block index + uint j = sub % 8; // group within super-block + + // Load d for super-block + uint d_offset = row + sb * ne01 + expert_id * num_superblocks * ne01 + get_global_id(0); + half d_val = src0_d[d_offset]; + + // Load sub-block scales + global const char * sc = src0_s + (expert_id * ne01 + row_idx) * scales_per_row + sb * 16; + float scale0 = (float)d_val * (float)sc[j * 2]; + float scale1 = (float)d_val * (float)sc[j * 2 + 1]; + + uint qh_base = row + (sub * 2) * ne01 + expert_id * (num_superblocks * 16) * ne01 + get_global_id(0); + uint qh_first16 = src0_qh[qh_base]; + uint qh_second16 = src0_qh[qh_base + ne01]; + + // First half (16 elements) + uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + uint b_sub_offset = col * ne00 + step; + + // Load 16 ql nibbles (2 uints) from image + uint2 q4x16; + q4x16.x = read_imageui(src0_ql, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_ql, q_sub_offset + sub_block_id_m + ne01).x; + + // Load 16x32 floats from matrix B + float8 bx8_f32; + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + half8 bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + // Dequantize first 16 elements (scale0) + dequantize_q6_k(as_ushort4(q4x16), qh_first16, reg_a, scale0); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + half16 acc; + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + + // Second half + uint half_step = step + TILESIZE_K; + q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + b_sub_offset = col * ne00 + half_step; + + q4x16.x = read_imageui(src0_ql, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_ql, q_sub_offset + sub_block_id_m + ne01).x; + + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + dequantize_q6_k(as_ushort4(q4x16), qh_second16, reg_a, scale1); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + } + + // Load post router and share in LM + __local uint out_idx[TILESIZE_N]; + + if (get_local_id(0) < TILESIZE_N) { + uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)]; + if (idx == 0xFFFFFFFF) { + idx = src2[block_id_n * TILESIZE_N + 0]; + } + out_idx[get_local_id(0)] = idx * ne01; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // Scatter results back to original position in output grid + uint m_offset = row + get_local_id(0); + + write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1)); + write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2)); + write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3)); + write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4)); + write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5)); + write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6)); + write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7)); + write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8)); + write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9)); + write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa)); + write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb)); + write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc)); + write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd)); + write_imagef(dst, out_idx[14] + m_offset, (reg_c.se)); + write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf)); + write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg)); + write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh)); + write_imagef(dst, out_idx[18] + m_offset, (reg_c.si)); + write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj)); + write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk)); + write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl)); + write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm)); + write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn)); + write_imagef(dst, out_idx[24] + m_offset, (reg_c.so)); + write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp)); + write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq)); + write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr)); + write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss)); + write_imagef(dst, out_idx[29] + m_offset, (reg_c.st)); + write_imagef(dst, out_idx[30] + m_offset, (reg_c.su)); + write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv)); + + // Store zero padding parts to the index of first output in tile + barrier(CLK_GLOBAL_MEM_FENCE); + write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0)); +} diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl new file mode 100644 index 00000000000..13d79f2526f --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl @@ -0,0 +1,151 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable + +#define QK_K 256 +#define K_SCALE_SIZE 12 +#define N_SIMDGROUP 4 +#define SIMDGROUP_WIDTH 64 + +inline void get_scale_min_k4( + int j, + global const uchar * q, + uchar * d, + uchar * m +) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j+4] & 63; + } else { + *d = (q[j+4] & 0x0F) | ((q[j-4] & 0xC0) >> 2); + *m = ((q[j+4] >> 4) & 0x0F) | ((q[j] & 0xC0) >> 2); + } +} + +static inline float8 q4_k_to_fp32_packed8(ushort2 q4x8, float scale, float minv) { + float8 fp32x8; + fp32x8.s0 = (q4x8.s0 & 0x000F) * scale - minv; + fp32x8.s1 = ((q4x8.s0 & 0x00F0) >> 4) * scale - minv; + fp32x8.s2 = ((q4x8.s0 & 0x0F00) >> 8) * scale - minv; + fp32x8.s3 = ((q4x8.s0 & 0xF000) >> 12) * scale - minv; + fp32x8.s4 = (q4x8.s1 & 0x000F) * scale - minv; + fp32x8.s5 = ((q4x8.s1 & 0x00F0) >> 4) * scale - minv; + fp32x8.s6 = ((q4x8.s1 & 0x0F00) >> 8) * scale - minv; + fp32x8.s7 = ((q4x8.s1 & 0xF000) >> 12) * scale - minv; + return fp32x8; +} + +__attribute__((qcom_reqd_sub_group_size("half"))) +__kernel void kernel_gemv_moe_q4_k_f32_ns( + __global uint * src0_q, + __global half * src0_d, + __global half * src0_dm, + __global uchar * src0_s, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne11 +) { + uint i01 = get_global_id(0); + uint i20 = get_global_id(2); + uint sgid = get_local_id(1); + uint slid = get_sub_group_local_id(); + + uint i11 = i20 % ne11; + + uint expert_id = src2[i20]; + + int num_superblocks = ne00 / QK_K; + int num_subblocks = ne00 / 32; + int scales_per_row = num_superblocks * K_SCALE_SIZE; + + // Expert offsets in the transposed noshuffle layout + uint expert_q_offset = expert_id * (ne00 / 8) * ne01; + uint expert_d_offset = expert_id * num_superblocks * ne01; + + __private float sum = 0.0f; + + // Loop over sub-blocks of 32 elements, N_SIMDGROUP sub-blocks per iter + for (uint ib = sgid; ib < num_subblocks; ib += N_SIMDGROUP) { + uint sb = ib / 8; + uint j = ib % 8; + + // Load d and dmin for this super-block + half d_val = src0_d[expert_d_offset + sb * ne01 + i01]; + half dm_val = src0_dm[expert_d_offset + sb * ne01 + i01]; + + // Load sub-block scale and min + global const uchar * sc = src0_s + (expert_id * ne01 + i01) * scales_per_row + sb * K_SCALE_SIZE; + uchar sv, mn; + get_scale_min_k4(j, sc, &sv, &mn); + + float scale = (float)d_val * (float)sv; + float minv = (float)dm_val * (float)mn; + + // Load 4 uints of quants (32 nibbles = 32 elements) + uint q_base = expert_q_offset + ib * ne01 * 4 + i01; + + uint4 regQ; + regQ.s0 = src0_q[q_base]; + regQ.s1 = src0_q[q_base + ne01]; + regQ.s2 = src0_q[q_base + ne01 * 2]; + regQ.s3 = src0_q[q_base + ne01 * 3]; + + // Load activations: 32 floats = 8 float4s + uint y_offset = i11 * ne00 / 4 + ib * 8; + + float8 fp32x8 = q4_k_to_fp32_packed8(as_ushort2(regQ.s0), scale, minv); + + float4 shared_y4; + shared_y4 = read_imagef(src1, (y_offset + 0)); + float4 acc = shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 1)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q4_k_to_fp32_packed8(as_ushort2(regQ.s1), scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 2)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 3)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q4_k_to_fp32_packed8(as_ushort2(regQ.s2), scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 4)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 5)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q4_k_to_fp32_packed8(as_ushort2(regQ.s3), scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 6)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 7)); + acc += shared_y4 * fp32x8.hi; + + sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3)); + } + + // reduction in local memory, assumes #subgroups=4 + __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)]; + if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum; + if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum; + if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid]; + + // 1 output per thread in subgroup 0 + if (sgid == 0) { + dst = dst + (offsetd >> 2); + dst[i01 + i20 * ne01] = sum; + } +} diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl new file mode 100644 index 00000000000..f128d44340a --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl @@ -0,0 +1,156 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable + +#define QK_K 256 +#define K_SCALE_SIZE 12 +#define N_SIMDGROUP 4 +#define SIMDGROUP_WIDTH 64 + +inline void get_scale_min_k4( + int j, + global const uchar * q, + uchar * d, + uchar * m +) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j+4] & 63; + } else { + *d = (q[j+4] & 0x0F) | ((q[j-4] & 0xC0) >> 2); + *m = ((q[j+4] >> 4) & 0x0F) | ((q[j] & 0xC0) >> 2); + } +} + +static inline float8 q5_k_to_fp32_packed8(ushort2 qs5x8, uchar qh5x8, half s, half m) { + float8 fp32x8; + fp32x8.s0 = (float)((( qs5x8.s0 & 0x000F) | (( qh5x8 & 0x01) << 4)) * s + m); + fp32x8.s1 = (float)((((qs5x8.s0 & 0x00F0) >> 4 ) | (((qh5x8 >> 1) & 0x01) << 4)) * s + m); + fp32x8.s2 = (float)((((qs5x8.s0 & 0x0F00) >> 8 ) | (((qh5x8 >> 2) & 0x01) << 4)) * s + m); + fp32x8.s3 = (float)((((qs5x8.s0 & 0xF000) >> 12) | (((qh5x8 >> 3) & 0x01) << 4)) * s + m); + fp32x8.s4 = (float)((( qs5x8.s1 & 0x000F) | (((qh5x8 >> 4) & 0x01) << 4)) * s + m); + fp32x8.s5 = (float)((((qs5x8.s1 & 0x00F0) >> 4 ) | (((qh5x8 >> 5) & 0x01) << 4)) * s + m); + fp32x8.s6 = (float)((((qs5x8.s1 & 0x0F00) >> 8 ) | (((qh5x8 >> 6) & 0x01) << 4)) * s + m); + fp32x8.s7 = (float)((((qs5x8.s1 & 0xF000) >> 12) | (((qh5x8 >> 7) & 0x01) << 4)) * s + m); + return fp32x8; +} + +__attribute__((qcom_reqd_sub_group_size("half"))) +__kernel void kernel_gemv_moe_q5_k_f32_ns( + __global uint * src0_q, + __global uint * src0_qh, + __global half * src0_d, + __global half * src0_dm, + __global uchar * src0_s, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne11 +) { + uint i01 = get_global_id(0); + uint i20 = get_global_id(2); + uint sgid = get_local_id(1); + uint slid = get_sub_group_local_id(); + + uint i11 = i20 % ne11; + + uint expert_id = src2[i20]; + + int num_superblocks = ne00 / QK_K; + int num_subblocks = ne00 / 32; + int scales_per_row = num_superblocks * K_SCALE_SIZE; + + // Expert offsets in the transposed noshuffle layout + uint expert_q_offset = expert_id * (ne00 / 8) * ne01; + uint expert_d_offset = expert_id * num_superblocks * ne01; + + __private float sum = 0.0f; + + // Loop over sub-blocks of 32 elements, N_SIMDGROUP sub-blocks per iter + for (uint ib = sgid; ib < num_subblocks; ib += N_SIMDGROUP) { + uint sb = ib / 8; + uint j = ib % 8; + + // Load d and dmin for this super-block + half d_val = src0_d[expert_d_offset + sb * ne01 + i01]; + half dm_val = src0_dm[expert_d_offset + sb * ne01 + i01]; + + // sub_block index = sb * 8 + j + uint expert_qh_offset = expert_id * num_superblocks * 8 * ne01; + uchar4 regQh = as_uchar4(src0_qh[expert_qh_offset + (sb * 8 + j) * ne01 + i01]); + + // Load sub-block scale and min + global const uchar * sc = src0_s + (expert_id * ne01 + i01) * scales_per_row + sb * K_SCALE_SIZE; + uchar sv, mn; + get_scale_min_k4(j, sc, &sv, &mn); + + float scale = (float)d_val * (float)sv; + float minv = -(float)dm_val * (float)mn; + + // Load 4 uints of quants (32 nibbles = 32 elements) + uint q_base = expert_q_offset + ib * ne01 * 4 + i01; + + uint4 regQ; + regQ.s0 = src0_q[q_base]; + regQ.s1 = src0_q[q_base + ne01]; + regQ.s2 = src0_q[q_base + ne01 * 2]; + regQ.s3 = src0_q[q_base + ne01 * 3]; + + // Load activations: 32 floats = 8 float4s + uint y_offset = i11 * ne00 / 4 + ib * 8; + + float8 fp32x8 = q5_k_to_fp32_packed8(as_ushort2(regQ.s0), regQh.s0, scale, minv); + + float4 shared_y4; + shared_y4 = read_imagef(src1, (y_offset + 0)); + float4 acc = shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 1)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q5_k_to_fp32_packed8(as_ushort2(regQ.s1), regQh.s1, scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 2)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 3)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q5_k_to_fp32_packed8(as_ushort2(regQ.s2), regQh.s2, scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 4)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 5)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q5_k_to_fp32_packed8(as_ushort2(regQ.s3), regQh.s3, scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 6)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 7)); + acc += shared_y4 * fp32x8.hi; + + sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3)); + } + + // reduction in local memory, assumes #subgroups=4 + __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)]; + if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum; + if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum; + if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid]; + + // 1 output per thread in subgroup 0 + if (sgid == 0) { + dst = dst + (offsetd >> 2); + dst[i01 + i20 * ne01] = sum; + } +} diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl new file mode 100644 index 00000000000..526e609dc3a --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl @@ -0,0 +1,137 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable + +#define QK_K 256 +#define N_SIMDGROUP 4 +#define SIMDGROUP_WIDTH 64 + +static inline float8 q6_k_to_fp32_packed8(ushort2 ql8, ushort qh8, float d_scale) { + float8 fp32x8; + fp32x8.s0 = ((float)(( ql8.s0 & 0x000F) | ((uint)((qh8 ) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s1 = ((float)((( ql8.s0 >> 4) & 0x000F) | ((uint)((qh8 >> 2) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s2 = ((float)((( ql8.s0 >> 8) & 0x000F) | ((uint)((qh8 >> 4) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s3 = ((float)((( ql8.s0 >> 12)& 0x000F) | ((uint)((qh8 >> 6) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s4 = ((float)(( ql8.s1 & 0x000F) | ((uint)((qh8 >> 8) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s5 = ((float)((( ql8.s1 >> 4) & 0x000F) | ((uint)((qh8 >>10) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s6 = ((float)((( ql8.s1 >> 8) & 0x000F) | ((uint)((qh8 >>12) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s7 = ((float)((( ql8.s1 >> 12)& 0x000F) | ((uint)((qh8 >>14) & 0x3) << 4)) - 32.f) * d_scale; + return fp32x8; +} + +__attribute__((qcom_reqd_sub_group_size("half"))) +__kernel void kernel_gemv_moe_q6_k_f32_ns( + __global uint * src0_ql, + __global uint * src0_qh, + __global char * src0_s, + __global half * src0_d, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne11 +) { + uint i01 = get_global_id(0); + uint i20 = get_global_id(2); + uint sgid = get_local_id(1); + uint slid = get_sub_group_local_id(); + + uint i11 = i20 % ne11; + + uint expert_id = src2[i20]; + + int num_superblocks = ne00 / QK_K; + int num_subblocks = ne00 / 32; // 8 sub-blocks of 32 per super-block + int scales_per_row = num_superblocks * 16; + + // Expert offsets in the transposed noshuffle layout + uint expert_ql_offset = expert_id * (ne00 / 8) * ne01; // 32 uints per super-block + uint expert_qh_offset = expert_id * (ne00 / 16) * ne01; // 16 uints per super-block + uint expert_d_offset = expert_id * num_superblocks * ne01; + + __private float sum = 0.0f; + + // Loop over sub-blocks of 32 elements, N_SIMDGROUP sub-blocks per iter + for (uint ib = sgid; ib < num_subblocks; ib += N_SIMDGROUP) { + uint sb = ib / 8; // super-block index + uint j = ib % 8; // 32-element group within super-block + + // Load d for this super-block + half d_val = src0_d[expert_d_offset + sb * ne01 + i01]; + + // Load 2 sub-block scales + global const char * sc = src0_s + (expert_id * ne01 + i01) * scales_per_row + sb * 16; + float scale0 = (float)d_val * (float)sc[j * 2]; + float scale1 = (float)d_val * (float)sc[j * 2 + 1]; + + // Load 4 uints of ql + uint ql_base = expert_ql_offset + (ib * 4) * ne01 + i01; + uint4 regQL; + regQL.s0 = src0_ql[ql_base]; + regQL.s1 = src0_ql[ql_base + ne01]; + regQL.s2 = src0_ql[ql_base + ne01 * 2]; + regQL.s3 = src0_ql[ql_base + ne01 * 3]; + + // Load 2 uints of qh + uint qh_base = expert_qh_offset + (ib * 2) * ne01 + i01; + uint2 regQH; + regQH.s0 = src0_qh[qh_base]; + regQH.s1 = src0_qh[qh_base + ne01]; + + // Load activations: 32 floats = 8 float4s + uint y_offset = i11 * ne00 / 4 + ib * 8; + + float8 fp32x8 = q6_k_to_fp32_packed8(as_ushort2(regQL.s0), (ushort)(regQH.s0 & 0xFFFF), scale0); + + float4 shared_y4; + shared_y4 = read_imagef(src1, (y_offset + 0)); + float4 acc = shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 1)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q6_k_to_fp32_packed8(as_ushort2(regQL.s1), (ushort)(regQH.s0 >> 16), scale0); + + shared_y4 = read_imagef(src1, (y_offset + 2)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 3)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q6_k_to_fp32_packed8(as_ushort2(regQL.s2), (ushort)(regQH.s1 & 0xFFFF), scale1); + + shared_y4 = read_imagef(src1, (y_offset + 4)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 5)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q6_k_to_fp32_packed8(as_ushort2(regQL.s3), (ushort)(regQH.s1 >> 16), scale1); + + shared_y4 = read_imagef(src1, (y_offset + 6)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 7)); + acc += shared_y4 * fp32x8.hi; + + sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3)); + } + + // reduction in local memory, assumes #subgroups=4 + __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)]; + if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum; + if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum; + if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid]; + + // 1 output per thread in subgroup 0 + if (sgid == 0) { + dst = dst + (offsetd >> 2); + dst[i01 + i20 * ne01] = sum; + } +} From b39a7bf1b038e8d1c3d58a313228dfc4f3993f5c Mon Sep 17 00:00:00 2001 From: ravel7524 <58877666+ravel7524@users.noreply.github.com> Date: Wed, 20 May 2026 03:52:21 +0200 Subject: [PATCH 085/309] ggml-cuda: tune RDNA3 Q6_K MMVQ nwarps (#23349) --- ggml/src/ggml-cuda/mmvq.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index da48f313a38..73a0991e206 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -359,7 +359,9 @@ static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_d case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_K: + return 8; case GGML_TYPE_Q6_K: + return 2; case GGML_TYPE_IQ4_NL: return 8; default: From 871b0b70f81d26494613ad7a9dcb933b1aec4611 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 19 May 2026 22:04:04 -0700 Subject: [PATCH 086/309] snapdragon: update toolchain to v0.6 (#23369) * snapdragon: update compiler flags to enable all CPU features * snapdragon: update readme to point to toolchain v0.6 * snapdragon: bump toolchain docker to v0.6 --- .github/workflows/build-and-test-snapdragon.yml | 4 ++-- docs/backend/snapdragon/CMakeUserPresets.json | 8 ++++---- docs/backend/snapdragon/README.md | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-and-test-snapdragon.yml b/.github/workflows/build-and-test-snapdragon.yml index ef3fe502fa7..84613b4c830 100644 --- a/.github/workflows/build-and-test-snapdragon.yml +++ b/.github/workflows/build-and-test-snapdragon.yml @@ -31,7 +31,7 @@ jobs: android-ndk-snapdragon: runs-on: ubuntu-latest container: - image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3' + image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.6' defaults: run: shell: bash @@ -61,7 +61,7 @@ jobs: linux-iot-snapdragon: runs-on: ubuntu-latest container: - image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.1' + image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.6' defaults: run: shell: bash diff --git a/docs/backend/snapdragon/CMakeUserPresets.json b/docs/backend/snapdragon/CMakeUserPresets.json index c07bf5ca0c6..d2629fc4de9 100644 --- a/docs/backend/snapdragon/CMakeUserPresets.json +++ b/docs/backend/snapdragon/CMakeUserPresets.json @@ -10,8 +10,8 @@ "ANDROID_ABI": "arm64-v8a", "ANDROID_PLATFORM": "android-31", "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake", - "CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", - "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_C_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", @@ -59,8 +59,8 @@ "toolset": { "value": "host=x86_64", "strategy": "external" }, "cacheVariables": { "CMAKE_TOOLCHAIN_FILE": "cmake/arm64-linux-clang.cmake", - "CMAKE_C_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE", - "CMAKE_CXX_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_C_FLAGS": "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_CXX_FLAGS": "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE", "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", diff --git a/docs/backend/snapdragon/README.md b/docs/backend/snapdragon/README.md index 2414eeaf6a4..f5bb3d11c48 100644 --- a/docs/backend/snapdragon/README.md +++ b/docs/backend/snapdragon/README.md @@ -10,7 +10,7 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc. This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop. ``` -~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3 +~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.6 [d]/> cd /workspace ``` From 57ebaf4edd99ea675f256ae2286cd99206dbfcd1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 May 2026 09:42:00 +0300 Subject: [PATCH 087/309] metal : optimize pad + cpy (#23354) * metal : optimize pad * metal : optinmize cpy * cont : better row packing in threadgroup --- ggml/src/ggml-metal/ggml-metal-device.cpp | 8 +- ggml/src/ggml-metal/ggml-metal-ops.cpp | 17 ++- ggml/src/ggml-metal/ggml-metal.metal | 128 ++++++++++++---------- src/models/delta-net-base.cpp | 8 +- 4 files changed, 94 insertions(+), 67 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index e288a27f992..ba006d9b31a 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -1897,7 +1897,11 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad(ggml_metal_l char base[256]; char name[256]; - snprintf(base, 256, "kernel_pad_%s", ggml_type_name(op->src[0]->type)); + // note: this is slower + //const bool is_c4 = op->src[0]->ne[0] % 4 == 0 && op->ne[0] % 4 == 0; + const bool is_c4 = false; + + snprintf(base, 256, "kernel_pad_%s%s", ggml_type_name(op->src[0]->type), is_c4 ? "_4" : ""); snprintf(name, 256, "%s", base); ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); @@ -1907,6 +1911,8 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad(ggml_metal_l res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + res.c4 = is_c4; + return res; } diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index a114391c2e8..8506000b6c0 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -816,9 +816,7 @@ int ggml_metal_op_unary(ggml_metal_op_t ctx, int idx) { ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1); } else { const int nth_max = MIN(256, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); - const int nth = MIN(args.ne00, nth_max); - const int nk0 = (args.ne00 + nth - 1)/nth; ggml_metal_encoder_dispatch_threadgroups(enc, nk0*ne01, ne02, ne03, nth, 1, 1); @@ -1863,7 +1861,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) { nk0 = ne00/ggml_blck_size(op->type); } - int nth = std::min(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + int nth = std::min(nk0*ne01, 256); // when rows are small, we can batch them together in a single threadgroup int nrptg = 1; @@ -1874,7 +1872,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) { nrptg = (nth + nk0 - 1)/nk0; nth = nk0; - if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { + if (nrptg*nth > 256) { nrptg--; } } @@ -4039,14 +4037,21 @@ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) { auto pipeline = ggml_metal_library_get_pipeline_pad(lib, op); - const int nth = std::min(1024, ne0); + if (pipeline.c4) { + args.ne00 = ne00/4; + args.ne0 = ne0/4; + } + + const int nth_max = MIN(64, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + const int nth = MIN(args.ne0, nth_max); + const int nk0 = (args.ne0 + 1024 - 1)/1024; // note: 1024 is hardcoded in the kernel! ggml_metal_encoder_set_pipeline(enc, pipeline); ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2); - ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1); + ggml_metal_encoder_dispatch_threadgroups(enc, nk0*ne1, ne2, ne3, nth, 1, 1); return 1; } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index f6ffb2b3a1c..4cf9dbea946 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -2643,7 +2643,7 @@ kernel void kernel_gated_delta_net_impl( b_ptr += args.ne21; g_ptr += args.ne21*G; - if (K > 1u) { + if (K > 1) { const int target_slot = (int)t - shift; if (target_slot >= 0 && target_slot < (int)K) { device float * dst_state = (device float *) (dst) + attn_size + (uint)target_slot * state_size_per_snap + state_out_base; @@ -2655,7 +2655,7 @@ kernel void kernel_gated_delta_net_impl( } } - if (K == 1u) { + if (K == 1) { device float * dst_state = (device float *) (dst) + attn_size + state_out_base; FOR_UNROLL (short j = 0; j < NSG; j++) { const short is = tx*NSG + j; @@ -5104,7 +5104,7 @@ kernel void kernel_upscale_bilinear_f32( for (int64_t sx = x_min; sx < x_max; ++sx) { const float wx = MAX(0.0f, 1.0f - fabs((float)sx - f00) * invscale0); const float w = wx * wy; - const device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00); + device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00); sum += (*src_ptr) * w; wsum += w; } @@ -5286,7 +5286,7 @@ kernel void kernel_upscale_bicubic_f32( const int64_t ix = MAX(0, MIN(args.ne00 - 1, i00 + dx)); const float wx = (dx == -1) ? w_x0 : (dx == 0) ? w_x1 : (dx == 1) ? w_x2 : w_x3; - const device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00); + device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00); sum += (*src_ptr) * wx * wy; } } @@ -5329,42 +5329,46 @@ kernel void kernel_roll_f32( } } -kernel void kernel_pad_f32( +template +kernel void kernel_pad_impl( constant ggml_metal_kargs_pad & args, device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]) { + const int32_t i3 = tgpig.z; + const int32_t i2 = tgpig.y; + const int32_t k0 = tgpig.x/args.ne1; + const int32_t i1 = tgpig.x - k0*args.ne1; - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; + const int32_t i03 = i3; + const int32_t i02 = i2; + const int32_t i01 = i1; - const int64_t i03 = i3; - const int64_t i02 = i2; - const int64_t i01 = i1; - - device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); - device float * dst_ptr = (device float *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1); + device const T * src0_ptr = (device const T *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); + device T * dst_ptr = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1); - if (i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) { - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - if (i0 < args.ne00) { - dst_ptr[i0] = src0_ptr[i0]; - } else { - dst_ptr[i0] = 0.0f; - } + for (int32_t l0 = 0; l0 < 1024; l0 += ntg.x) { + const int32_t i0 = k0*1024 + tpitg.x + l0; + if (i0 >= args.ne0) { + break; } - return; - } - - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - dst_ptr[i0] = 0.0f; + if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) { + dst_ptr[i0] = src0_ptr[i0]; + } else { + dst_ptr[i0] = 0.0f; + } } } +typedef decltype(kernel_pad_impl) kernel_pad_t; + +template [[host_name("kernel_pad_f32")]] kernel kernel_pad_t kernel_pad_impl; +template [[host_name("kernel_pad_f32_4")]] kernel kernel_pad_t kernel_pad_impl; + +// TODO: this is slow - optimize kernel void kernel_pad_reflect_1d_f32( constant ggml_metal_kargs_pad_reflect_1d & args, device const char * src0, @@ -7328,23 +7332,27 @@ kernel void kernel_cpy_t_t( device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tpitg[[thread_position_in_threadgroup]], ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig[2]; - const int i02 = tgpig[1]; - const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0]; - const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + const int32_t i03 = tgpig[2]; + const int32_t i02 = tgpig[1]; + const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; + const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + + if (i01 >= args.ne01) { + return; + } const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; - const int64_t i3 = n/(args.ne2*args.ne1*args.ne0); - const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); - const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; - const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); + const int32_t i3 = n/(args.ne2*args.ne1*args.ne0); + const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); + const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; + const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); device T1 * dst_data = (device T1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.ne00; ) { + for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.ne00;) { device const T0 * src = (device T0 *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); dst_data[i00] = (T1) src[0]; break; @@ -7376,23 +7384,27 @@ kernel void kernel_cpy_f32_q( device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tpitg[[thread_position_in_threadgroup]], ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig[2]; - const int i02 = tgpig[1]; - const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0]; - const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + const int32_t i03 = tgpig[2]; + const int32_t i02 = tgpig[1]; + const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; + const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + + if (i01 >= args.ne01) { + return; + } const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; - const int64_t i3 = n / (args.ne2*args.ne1*args.ne0); - const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0); - const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0; - const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK; + const int32_t i3 = n / (args.ne2*args.ne1*args.ne0); + const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0); + const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0; + const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK; device block_q * dst_data = (device block_q *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.nk0; ) { + for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) { device const float * src = (device const float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + (i00*QK)*args.nb00); quantize_func(src, dst_data[i00]); @@ -7417,24 +7429,28 @@ kernel void kernel_cpy_q_f32( device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tpitg[[thread_position_in_threadgroup]], ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig[2]; - const int i02 = tgpig[1]; - const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0]; - const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + const int32_t i03 = tgpig[2]; + const int32_t i02 = tgpig[1]; + const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; + const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + + if (i01 >= args.ne01) { + return; + } const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; - const int64_t i3 = n/(args.ne2*args.ne1*args.ne0); - const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); - const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; - const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); + const int32_t i3 = n/(args.ne2*args.ne1*args.ne0); + const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); + const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; + const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); device const block_q * src_data = (device const block_q *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); device T4x4 * dst_data = (device T4x4 *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.nk0; ) { + for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) { T4x4 temp; dequantize_func(src_data + i00/nl, i00%nl, temp); dst_data[i00] = temp; diff --git a/src/models/delta-net-base.cpp b/src/models/delta-net-base.cpp index a67238383ed..4f4c7cac7a8 100644 --- a/src/models/delta-net-base.cpp +++ b/src/models/delta-net-base.cpp @@ -562,13 +562,13 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( } const int64_t D = S_v * S_v * H_v; - const int64_t K = (int64_t) cparams.n_rs_seq + 1; + const int64_t K = cparams.n_rs_seq + 1; // TODO: remove pad + simplify - ggml_tensor * state_in_3d = ggml_reshape_3d(ctx0, s, D, 1, n_seqs); - ggml_tensor * state_3d = ggml_pad(ctx0, state_in_3d, 0, K - 1, 0, 0); + ggml_tensor * s_3d = ggml_reshape_3d(ctx0, s, D, 1, n_seqs); + ggml_tensor * s_3d_pad = ggml_pad (ctx0, s_3d, 0, K - 1, 0, 0); - ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, state_3d); + ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, s_3d_pad); if (n_seq_tokens > 1) { cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il); } else { From 585080d3103fe1091987a4576780991ff88f802e Mon Sep 17 00:00:00 2001 From: Aleksander Grygier Date: Wed, 20 May 2026 09:46:31 +0200 Subject: [PATCH 088/309] fix: Div wrapper no pointer events on hidden (#23390) --- .../app/chat/ChatScreen/ChatScreenActionScrollDown.svelte | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte index bbfed95e948..c43bee3e3c3 100644 --- a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte @@ -41,12 +41,16 @@ }); -
+