Changes from all commits (1015 commits)
f486ce9
(webui) REFACTOR: UI primitives and polish (#19551)
allozaur Feb 12, 2026
ff59903
scripts : add support for forks in pr2wt.sh (#19540)
danbev Feb 12, 2026
4d688f9
(webui) FEATURE: Enable adding or injecting System Message into chat …
allozaur Feb 12, 2026
f488429
llama : update outdated comment in llama.h (#19428)
MonkeybreadSoftware Feb 12, 2026
4b385bf
vendor : update cpp-httplib (#19537)
angt Feb 12, 2026
4c61875
webui: Add switcher to Chat Message UI to show raw LLM output (#19571)
allozaur Feb 12, 2026
338085c
args : add -kvu to llama-parallel (#19577)
ggerganov Feb 12, 2026
79cc0f2
opencl: add basic support for q4_1 (#19534)
lhez Feb 12, 2026
3bb7813
hexagon: fix typo in vtcm_needs_release (#19545)
FanShupei Feb 12, 2026
490eb96
metal : support GGML_OP_SET (#19548)
ggerganov Feb 13, 2026
0644bae
metal : improve concurrency (#19555)
ggerganov Feb 13, 2026
bb96bfd
memory : fix kv cache size for hybrid models (#19559)
ggerganov Feb 13, 2026
2f5d8f8
vendor : update BoringSSL to 0.20260211.0 (#19562)
angt Feb 13, 2026
25224c8
llama : remove deprecated codecvt (#19565)
angt Feb 13, 2026
33a56f9
model : Kimi Linear fix conv state update (#19531)
ymcki Feb 13, 2026
423cf0b
docs : fix broken link and typo (#19560)
pavan-sh Feb 13, 2026
43919b7
CUDA: Do not mutate cgraph for fused ADDs (#19566)
ORippler Feb 13, 2026
5174d72
webui: UI and routing fixes (#19586)
allozaur Feb 13, 2026
5065da5
CUDA: loop over ne2*ne3 in case it overflows (#19538)
am17an Feb 13, 2026
b2ecc0c
support --verbose-prompt (#19576)
CISC Feb 13, 2026
0e21991
fix vulkan ggml_acc only works in 3d but not 4d (#19426)
ymcki Feb 13, 2026
cc2aa81
Fix wrong memcpy length for block_interleave == 4 (#19575)
Alcpz Feb 13, 2026
752584d
model: support GLM MoE DSA arch (NOTE: indexer is not yet supported) …
ngxson Feb 13, 2026
b48e80f
common : update download code (#19573)
angt Feb 13, 2026
05a6f0e
vulkan: restore -inf check in FA shaders (#19582)
jeffbolznv Feb 13, 2026
94a602d
github : add missing backends to issue templates (#19603)
mengshengwu Feb 13, 2026
0ccbfde
hexagon: further optimizations and refactoring for flash attention (#…
max-krasnyansky Feb 14, 2026
2dec548
vulkan: Add vendor id for Qualcomm drivers (#19569)
strongtz Feb 14, 2026
53aef25
vulkan: support GGML_OP_SET (#19584)
jeffbolznv Feb 14, 2026
dbb0233
vulkan: support L2_NORM with contiguous rows (#19604)
jeffbolznv Feb 14, 2026
91ea5d6
build : fix libtool call in build-xcframework.sh (#19605)
angt Feb 14, 2026
0d00ef6
convert : store ffn_gate_inp_shexp as F32 (#19606)
CISC Feb 14, 2026
c7db95f
scripts : use official split.py for cpp-httplib (#19588)
angt Feb 14, 2026
6e473fb
metal : fix ACC op (#19427)
ggerganov Feb 14, 2026
eb145c0
mmap: Fix Windows handle lifetime (#19598)
noctrex Feb 14, 2026
2d8015e
llama : update LoRA API. + fix excessive graph reserves (#19280)
agent-enemy-2 Feb 14, 2026
baa12f3
webui: Architecture and UI improvements (#19596)
allozaur Feb 14, 2026
badba89
NetBSD build support (#19589)
iMilnb Feb 14, 2026
b7742cf
ggml : fix GGML_DEBUG with OpenMP (#19599)
angt Feb 14, 2026
1725e31
models : optimize qwen3next graph (#19375)
ggerganov Feb 14, 2026
01d8eaa
mtmd : Add Nemotron Nano 12B v2 VL support (#19547)
anavp-nvidia Feb 14, 2026
079feab
convert : ensure all models handle new experts count (#19621)
CISC Feb 14, 2026
3a00c98
cmake : fix KleidiAI install target failure with EXCLUDE_FROM_ALL (#1…
ssam18 Feb 15, 2026
684b361
ggml-cpu: FA add GEMM microkernel (#19422)
am17an Feb 15, 2026
184c694
ggml-cpu: optimize ggml_vec_dot_bf16 for s390x (#19399)
taronaeo Feb 15, 2026
08e6d91
ggml : avoid UB in gemm ukernel (#19642)
ggerganov Feb 15, 2026
341bc7d
context : fix output reorder with backend sampling (#19638)
ggerganov Feb 15, 2026
5708827
cmake : check if KleidiAI API has been fetched (#19640)
danbev Feb 15, 2026
9e118b9
build : remove LLAMA_HTTPLIB option (#19623)
angt Feb 15, 2026
6e67fd2
docs: update s390x build docs (#19643)
taronaeo Feb 15, 2026
27b93cb
cuda: optimize iq2xxs/iq2xs/iq3xxs dequantization (#19624)
dfriehs Feb 15, 2026
1a8c700
ggml : bump version to 0.9.6 (ggml/1423)
ggerganov Feb 7, 2026
55d5859
ggml : bump version to 0.9.7 (ggml/1425)
ggerganov Feb 15, 2026
ff4affb
sync : ggml
ggerganov Feb 15, 2026
267ba5a
ggml: aarch64: Implement SVE in Gemm q4_k 8x8 q8_k Kernel (#19132)
abhijain1204fujitsu Feb 16, 2026
d5dfc33
graph : fix KQ mask, lora, cvec reuse checks (#19644)
ggerganov Feb 16, 2026
cc45f2a
models : deduplicate delta-net graphs for Qwen family (#19597)
ggerganov Feb 16, 2026
2ba9adc
Adjust workaround for ROCWMMA_FATTN/GFX9 to only newer ROCm versions…
superm1 Feb 16, 2026
4408494
build : rework llama_option_depr to handle LLAMA_CURL (#19658)
angt Feb 16, 2026
5f28c53
model: Add support for Tiny Aya Models (#19611)
saurabhdash2512 Feb 16, 2026
d23a559
ggml : make `ggml_is_view` as API (#19539)
foldl Feb 16, 2026
cceb1b4
common : inline functions (#18639)
Nekotekina Feb 16, 2026
d612901
perplexity: add proper batching (#19661)
AesSedai Feb 16, 2026
05fa625
convert : add JoyAI-LLM-Flash (#19651)
dranger003 Feb 16, 2026
65cede7
build : cleanup library linking logic (#19665)
angt Feb 17, 2026
ae46a61
build : link ws2_32 as PUBLIC on Windows (#19666)
angt Feb 17, 2026
e48349a
ci : bump komac version (#19682)
CISC Feb 17, 2026
667b694
model-conversion : make printing of config values optional (#19681)
danbev Feb 17, 2026
ad8207a
cuda : enable CUDA graphs for MMID 1 <= BS <= 4 (#19645)
ggerganov Feb 17, 2026
ae2d3f2
ggml: ggml-cpu: force-no-lto-for-cpu-feats (#19609)
talhaHavadar Feb 17, 2026
afa6bfe
Pre-MCP UI and architecture cleanup (#19685)
allozaur Feb 17, 2026
2b089c7
model-conversion : add option to print tensor values (#19692)
danbev Feb 17, 2026
983559d
opencl: optimize mean and sum_row kernels (#19614)
shaofeiqi Feb 17, 2026
e2f19b3
opencl: refactor expm1 and softplus (#19404)
shaofeiqi Feb 17, 2026
a569bda
common : make small string helpers as inline functions (#19693)
angt Feb 18, 2026
d0061be
vulkan: split mul_mat into multiple dispatches to avoid overflow (#19…
jeffbolznv Feb 18, 2026
ea00322
Pre-MCP UI and architecture cleanup (#19689)
allozaur Feb 18, 2026
238856e
ggml webgpu: shader library organization (#19530)
reeselevine Feb 18, 2026
e99f108
docs: Fix broken links for preparing models in Backends (#19684)
MaciejDromin Feb 18, 2026
eeef3cf
model: support GLM-OCR (#19677)
ngxson Feb 18, 2026
b55dcde
server: save generated text for the /slots endpoint (for LLAMA_SERVER…
matteoserva Feb 18, 2026
e7f2f95
ggml webgpu: Fix bug in dispatching large matrix-vector multiplicatio…
reeselevine Feb 18, 2026
8a70973
Add Jinja support for "indent" string filter (#19529)
pwilkin Feb 18, 2026
ad9f692
models : dedup Kimi Linear delta net implementation (#19668)
ymcki Feb 19, 2026
27326bf
models : dedup qwen35 graphs (#19660)
ggerganov Feb 19, 2026
3bb2fcc
llamafile: powerpc: add FP16 MMA path for Q4/Q8 matmul (#19709)
shalinib-ibm Feb 19, 2026
c0d0430
model : full modern bert support (#18330)
ryan-mangeno Feb 19, 2026
eacb4b6
llama : use output_resolve_row() in get_logits_ith/get_embeddings_ith…
danbev Feb 19, 2026
8004f3a
model : add tokenizer from LFM2.5-Audio-1.5B (#19687)
tdakhran Feb 19, 2026
03fd9d3
webui: Fix Attachments not being included in completion request (#19731)
allozaur Feb 19, 2026
c589799
mtmd : chat : Fix extra \n between text and media marker (#19595)
tdakhran Feb 19, 2026
c78e682
CUDA: fix kernel selection logic for tile FA (#19686)
JohannesGaessler Feb 19, 2026
2bf318f
model : add JAIS-2 architecture support (#19488)
alielfilali01 Feb 19, 2026
e6267a9
mtmd: build_attn modified, flash_attn on/off via ctx_params (#19729)
sfallah Feb 19, 2026
da348c9
models : fix qwen3.5 beta/gate shapes (#19730)
ggerganov Feb 19, 2026
abb9f3c
vulkan: fix MMQ shader push constants and multi-dispatch (#19732)
0cc4m Feb 19, 2026
237958d
model: Add PaddleOCR-VL model support (#18825)
megemini Feb 19, 2026
11c325c
ggml-webgpu: Add unary op (SQR, SQRT, SIN, COS) support. (#19700)
yomaytk Feb 19, 2026
39e4b1d
common : fix gpt-oss Jinja error when assistant message has both cont…
abhijitb11 Feb 19, 2026
3dadc88
common : fix Step-3.5-Flash format detection and thinking support (#1…
jesseposner Feb 19, 2026
10b26ee
WebUI hide models in router mode (#19374)
crsawyer Feb 19, 2026
77d6ae4
test: mul_mat tests with huge batch size (#19519)
jeffbolznv Feb 20, 2026
492bc31
quantize : add --dry-run option (#19526)
ddh0 Feb 20, 2026
b908baf
ggml-cpu: add RVV vec dot kernels for quantization types (#18784)
taimur-10x Feb 20, 2026
94b0200
common : merge qwen3-coder and nemotron nano 3 parsers (#19765)
aldehir Feb 20, 2026
ba3b9c8
hexagon : fix build release (#19444) (#19587)
mengshengwu Feb 21, 2026
07968d5
fix: UI single model selection in router mode (#19767)
crsawyer Feb 21, 2026
a0c91e8
Improve CUDA graph capture (#19754)
gaugarg-nv Feb 21, 2026
99156f3
vendor : update cpp-httplib to 0.33.1 (#19778)
angt Feb 21, 2026
f75c4e8
Add a build target to generate ROCm artifacts using ROCm 7.2 (#19433)
superm1 Feb 21, 2026
3571565
Update ROCm docker container to 7.2 release (#19418)
superm1 Feb 21, 2026
e877ad8
ci : fix rocm release path [no ci] (#19784)
CISC Feb 22, 2026
34ec1c3
server : merge contiguous Responses input items into a single assista…
aldehir Feb 22, 2026
9f0684f
ci : fix rocm archive name [no ci] (#19808)
CISC Feb 22, 2026
ae2368e
model : add Kanana-2 model support (#19803)
HelloKS Feb 22, 2026
cacc371
Fix wrong cli-argument in documentation (#19804)
Menkalian Feb 22, 2026
ed48378
common : fix improper trimming in XML parser on complete message (#19…
aldehir Feb 22, 2026
5452d73
jinja: correct stats for tojson and string filters (#19785)
ngxson Feb 22, 2026
e8e2616
cli : provide model with text filename (#19783)
CISC Feb 22, 2026
2b6dfe8
llama : remove write/read of output ids/logits/embeddings (#18862)
danbev Feb 23, 2026
bc160d3
ggml-cpu: arm64: q5_K repack gemm and gemv (and generic) implementati…
Alcpz Feb 23, 2026
72b44c0
model-conversion : merge inspect-org-model.py with tensor-info.py (#1…
danbev Feb 23, 2026
9051663
webui: Add setting to have full height Code Blocks in Chat Messages (…
allozaur Feb 23, 2026
d8aeb65
tests : fix typos in comments in test-backend-sampler [no ci] (#19824)
danbev Feb 23, 2026
b68a83e
vendor : update cpp-httplib to 0.34.0 (#19830)
angt Feb 23, 2026
5eb0ea3
feat: Add code blocks full height setting to parameter sync service (…
allozaur Feb 23, 2026
39fb81f
hexagon refactor all Ops to use local context struct (#19819)
max-krasnyansky Feb 24, 2026
3ea5360
vulkan: fix data race in mul_mat_id shader (#19790)
jeffbolznv Feb 24, 2026
8c2c010
vulkan: fix coopmat1 without bf16 support (#19793)
jeffbolznv Feb 24, 2026
aa6f918
Vulkan Scalar Flash Attention Refactor (#19625)
0cc4m Feb 24, 2026
c830f99
server : support max_completion_tokens request property (#19831)
rgerganov Feb 24, 2026
da426cb
model : update label for LFM2-24B-A2B (#19848)
tdakhran Feb 24, 2026
418dea3
ggml/gguf : prevent integer overflows (#19856)
ggerganov Feb 24, 2026
47eb12b
server: fix query params lost when proxying requests in multi-model r…
ServeurpersoCom Feb 24, 2026
2446419
models : fix graph splits (#19866)
ggerganov Feb 24, 2026
a96a112
gguf : fix ftell/fseek for Windows (#19870)
aldehir Feb 25, 2026
8fdf269
ci : update Windows ROCm build to 26.Q1 [no ci] (#19810)
superm1 Feb 25, 2026
c747294
scripts: update corpus of compare-logprobs (#19326)
ngxson Feb 25, 2026
d7d826b
server : support multi-modal context checkpoints (#19849)
ggerganov Feb 25, 2026
f20469d
server : enable multi-modal prompt caching (#19877)
ggerganov Feb 25, 2026
3af34b9
ci : update the ROCm/HIP toolchain versions [no ci] (#19891)
slojosic-amd Feb 25, 2026
832aa94
common : add more aliases for sampler CLI params (#19797)
ddh0 Feb 25, 2026
3769fe6
vulkan: check for memory overlap before doing fusion (#19768)
jeffbolznv Feb 25, 2026
2943210
support permuted, remove check s0/s10 (#19889)
arthw Feb 26, 2026
bd72300
server : fix typo in server README.md (#19900)
yggdrasil75 Feb 26, 2026
1ca3d1d
gguf : avoid too many file size calls (#19919)
ggerganov Feb 26, 2026
66287bd
model : add Jina Embeddings v5 Nano (partial EuroBERT) support (#19826)
maximilianwerk Feb 26, 2026
9b62913
jinja : correct default size for string slices (#19913)
EZForever Feb 26, 2026
efba35a
server: fix load-on-startup not respected in ini file (#19897)
drrros Feb 26, 2026
ffaafde
ggml-virtgpu: improve the reliability of the code (#19846)
kpouget Feb 26, 2026
b68d751
llama: Add option to merge gate and exp weights (#19139)
am17an Feb 26, 2026
99bd67c
kv-cache : fix can_shift() check to take into account M-RoPE (#19928)
ggerganov Feb 26, 2026
01cd448
server : fix ctx checkpoint restore logic (#19924)
ggerganov Feb 26, 2026
37964f4
mtmd : fix padding of n_tokens (#19930)
ggerganov Feb 26, 2026
723c710
vulkan: fix fp16 Flash Attention on Windows AMD RDNA2 and below (#19921)
0cc4m Feb 26, 2026
4e76d24
ggml : fix AMX and add batched support (#19925)
angt Feb 26, 2026
88cf781
ggml-zendnn: update code for latest ZenDNN API (#19923)
z-vishal Feb 27, 2026
c17dce4
replace the magic number 768 by max work group size to support iGPU (…
arthw Feb 27, 2026
a8b192b
tests : enable test-chat out of tree build (#19558)
jplehr Feb 27, 2026
2e7e638
server : support multiple model aliases via comma-separated --alias (…
ServeurpersoCom Feb 27, 2026
8387ffb
gguf-py : dump version to 0.18.0 (#19950)
danbev Feb 27, 2026
d903f30
ggml-cpu: add repack for mxfp4 (#19738)
am17an Feb 27, 2026
8d3b962
ci : use ubuntu-latest for gguf-publish workflow (#19951)
danbev Feb 27, 2026
5596a35
server: Mirroring /v1/responses to /responses to match /v1/chat/compl…
samikama Feb 27, 2026
3e6ab24
server: Add pragma once to server-context.h (#19944)
roj234 Feb 27, 2026
ecbcb7e
CUDA: add CDNA3 MFMA support for flash attention MMA kernel (#19806)
Jayluci4 Feb 27, 2026
d979f2b
tests : model metadata loading from huggingface (#19796)
bartowski1182 Feb 28, 2026
4720819
vendor : update cpp-httplib to 0.35.0 (#19969)
angt Feb 28, 2026
05728db
vendors : update miniaudio library to 0.11.24 (#19914)
data-man Feb 28, 2026
66d65ec
cuda: cap grid.y at 65535 in non-contiguous dequantize/convert kernel…
oobabooga Mar 1, 2026
3191462
vulkan: improve partial offloading performance on AMD (#19976)
0cc4m Mar 1, 2026
2afcdb9
ggml-cpu: optimise s390x multiply extend instructions (#20032)
taronaeo Mar 2, 2026
ec88c3c
scripts : improve get-wikitext-2.sh (#19952)
angt Mar 2, 2026
feefb92
vulkan: tune MMVQ for Intel Windows (#19988)
0cc4m Mar 2, 2026
36a7a65
ggml-webgpu: Support non-contiguous `src0` and overlapping `src0/src1…
yomaytk Mar 2, 2026
4d828bd
ggml webgpu: Clean up per-thread parameter buffer pool and job submis…
nikhilJain17 Mar 2, 2026
ff67808
Applied port patches
jpgaribotti Aug 13, 2025
a277275
Export mtmd target
jpgaribotti Aug 13, 2025
28067a9
Rebase temp-load-from-buffer and merge into master (#7)
jesusmb1995 Aug 28, 2025
7d274e5
Add option to build only mtmd library (#8)
jpgaribotti Aug 28, 2025
2ca6ebb
Add approval-check-worker workflow
kapilsingh421 Aug 29, 2025
4209150
Add CODEOWNERS file (#9)
chetasr Sep 4, 2025
1807ceb
Create merging_strategy.md
olyasir Sep 11, 2025
5396ad3
Tune python scripts
jesusmb1995 Sep 17, 2025
88180d3
remove_unneeded_script
jesusmb1995 Sep 17, 2025
3e194c8
Fix CMakeLists to support building with LLAMA_MTMD on or off
jpgaribotti Sep 23, 2025
927b80f
Corrected build interface for libmtmd
jpgaribotti Sep 23, 2025
63f2075
Make LLAMA_MTMD dependent on LLAMA_BUILD_TOOLS if not specified
jpgaribotti Sep 23, 2025
8e3bdc0
char_buff_stream
jesusmb1995 Sep 19, 2025
6311a2c
fix_include
jesusmb1995 Sep 30, 2025
f4ad598
remove_check
jesusmb1995 Sep 30, 2025
35649a7
Fix CI: macos13_x64 (disable blas)
jesusmb1995 Oct 1, 2025
bc0dd52
fix_ci_cmake_pkg
jesusmb1995 Oct 1, 2025
a7d2a34
fix_cmake_linking
jesusmb1995 Oct 1, 2025
faf8412
fix_ci_android
jesusmb1995 Oct 1, 2025
1856ee3
implement logging redirection
ogad-tether Oct 7, 2025
1249b92
rerouting common logs to callback
ogad-tether Oct 16, 2025
2ea33e9
Add compile flag to force vk performance logging
jpgaribotti Oct 9, 2025
2d38d89
Enable the FORCE_GGML_VK_PERF_LOGGER flag through an option
jpgaribotti Oct 9, 2025
faffd2c
Fix for hanging on model load on Windows
DmitryMalishev Oct 20, 2025
2de6117
fix multimodal backend
gianni-cor Oct 21, 2025
7aaa75d
Redirected performance logging to GGML_LOG
jpgaribotti Oct 23, 2025
25a6a7c
add debug info vulkan
gianni-cor Sep 8, 2025
902e614
enable sched debug
gianni-cor Sep 5, 2025
17cc72c
vk profiling triplet
gianni-cor Sep 12, 2025
1ebbb22
add python script to analyze log profilings
Sep 12, 2025
f1984fa
Amend vk profiling triplet
gianni-cor Sep 12, 2025
dbd6bb0
Install vulkan profiling analyzer
jpgaribotti Nov 4, 2025
0deaebd
default_dl_search
jesusmb1995 Oct 30, 2025
08464c9
log
jesusmb1995 Oct 30, 2025
326035f
force_SHARD_LIBSON
jesusmb1995 Oct 30, 2025
7de6b8c
adreno_logic
jesusmb1995 Oct 31, 2025
609049e
optimizedCpuEntries
jesusmb1995 Nov 4, 2025
0338cad
backend_selection
jesusmb1995 Nov 4, 2025
ffebe76
disable_ci_test_for_cmake
jesusmb1995 Nov 5, 2025
bf781dd
disable_flaky_shift_disabled_short_prompt
jesusmb1995 Nov 6, 2025
4d7deeb
disable_tests_failing_repositories
jesusmb1995 Nov 6, 2025
ae6847e
supress_test_sanitizer
jesusmb1995 Nov 6, 2025
5f488b2
fix_flake8_linter
jesusmb1995 Nov 6, 2025
d8ab589
fix_absolute_install
jesusmb1995 Nov 7, 2025
6cbe9d9
fix_absolute_install
jesusmb1995 Nov 7, 2025
8cb57b3
remove_cache
jesusmb1995 Nov 10, 2025
7450cef
Refactored llama-model to adapt upstream's unique_ptr vs raw pointer …
iancris Nov 13, 2025
001ba07
Code clean-up: removed unused functions
iancris Nov 14, 2025
7262d11
Handled scenario: build common but curl is off
iancris Nov 14, 2025
30dc60a
fixup clip_logger_state
jesusmb1995 Dec 3, 2025
99bdc1e
Remove unused parameter
jesusmb1995 Dec 4, 2025
8ea1255
Vulkan: Add build option for Adreno-specific fixes
Oct 10, 2025
5a263d5
Vulkan: Add QUALCOMM_ADRENO to vk_device_architecture list
Oct 10, 2025
6b578bb
Vulkan: disable subgroups on Adreno
Oct 10, 2025
ac2e673
Vulkan: disable mul_mat_l on adreno
Oct 10, 2025
1115953
Vulkan: disable rms_norm fusion on Adreno
Oct 10, 2025
36c3d52
Vulkan: generate Adreno-specific shader variants
Aug 22, 2025
39cb8db
Vulkan: Add Q4_K Adreno variant for mul_mat_vec
Oct 21, 2025
e4d4f39
vendor: add Vulkan Memory Allocator 3.3.0
bl4ckb0ne Nov 14, 2025
bc577e5
vulkan: use VMA for buffer allocation
bl4ckb0ne Dec 12, 2025
100afe2
vulkan: drop uma check for buffer read/write
bl4ckb0ne Jan 6, 2026
3689308
vulkan: add eShaderDeviceAddress usage to device buffer creation
bl4ckb0ne Jan 9, 2026
cf948e3
VMA create aligned buffers (#89)
bl4ckb0ne Jan 14, 2026
9ee95e3
Fix lib name resolution (#95)
gianni-cor Jan 27, 2026
45b346c
Update README.md
gianni-cor Jan 27, 2026
e88aa08
QVAC-13378: Model Metadata from .gguf without full loading (#100)
jesusmb1995 Feb 27, 2026
60adba7
meta_get_str (#101)
jesusmb1995 Mar 5, 2026
be3c216
Update linux runner for ubuntu-24-cmake-vulkan
gianni-cor Mar 11, 2026
bb7791b
Update build.yml
gianni-cor Mar 11, 2026
a54d618
LoRA Finetuning (#99)
zoq Mar 11, 2026
c087c62
resume patch removed (#103)
dev-nid Mar 12, 2026
10ac180
Update Readme (#104)
gianni-cor Mar 12, 2026
f2b0687
CI Fix (temp-7248): Corrupted .gguf, incorrectly cached native CPU bu…
jesusmb1995 Mar 25, 2026
88be00e
Align call sites with API changes.
zoq Mar 29, 2026
.devops/cann.Dockerfile (9 changes: 5 additions & 4 deletions)

@@ -4,7 +4,7 @@

 # Define the CANN base image for easier version updates later
 ARG CHIP_TYPE=910b
-ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc1.alpha001-${CHIP_TYPE}-openeuler22.03-py3.11
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11

 # ==============================================================================
 # BUILD STAGE
@@ -13,7 +13,7 @@ ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc1.alpha001-${CHIP_TYPE}-openeuler2
 FROM ${CANN_BASE_IMAGE} AS build

 # -- Install build dependencies --
-RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
+RUN yum install -y gcc g++ cmake make git openssl-devel python3 python3-pip && \
     yum clean all && \
     rm -rf /var/cache/yum
@@ -42,6 +42,7 @@ RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
     -DGGML_CANN=ON \
     -DCMAKE_BUILD_TYPE=Release \
     -DSOC_TYPE=ascend${CHIP_TYPE} \
+    -DUSE_ACL_GRAPH=ON \
     . && \
     cmake --build build --config Release -j$(nproc)
@@ -107,11 +108,11 @@ ENTRYPOINT ["/app/tools.sh"]
 # ENTRYPOINT ["/app/llama-server"]

 ### Target: light
-# Lightweight image containing only llama-cli
+# Lightweight image containing only llama-cli and llama-completion
 # ==============================================================================
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 ENTRYPOINT [ "/app/llama-cli" ]
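For illustration, a minimal sketch of building this image; the CHIP_TYPE build arg and the light target come from the Dockerfile above, while the image tag is a placeholder:

    # Build for the default 910b SoC and keep only the lightweight stage,
    # which now ships llama-cli together with llama-completion.
    docker build -f .devops/cann.Dockerfile \
        --build-arg CHIP_TYPE=910b \
        --target light \
        -t llama-cann-light .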
.devops/cpu.Dockerfile (4 changes: 2 additions & 2 deletions)

@@ -5,7 +5,7 @@ FROM ubuntu:$UBUNTU_VERSION AS build
 ARG TARGETARCH

 RUN apt-get update && \
-    apt-get install -y build-essential git cmake libcurl4-openssl-dev
+    apt-get install -y build-essential git cmake libssl-dev

 WORKDIR /app
@@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app
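A quick way to verify that the light stage now carries both binaries; a sketch in which the tag is arbitrary and ls is only used as an ad-hoc entrypoint for inspection:

    docker build -f .devops/cpu.Dockerfile --target light -t llama-cpu-light .
    # Expect both llama-cli and llama-completion to be listed under /app.
    docker run --rm --entrypoint ls llama-cpu-light /app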
.devops/cuda-new.Dockerfile (new file: 95 additions & 0 deletions)

@@ -0,0 +1,95 @@
+ARG UBUNTU_VERSION=24.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=13.1.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+    find build -name "*.so*" -exec cp -P {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ${BASE_CUDA_RUN_CONTAINER} AS base
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 curl \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+        git \
+        python3 \
+        python3-pip \
+        python3-wheel \
+    && pip install --break-system-packages --upgrade setuptools \
+    && pip install --break-system-packages -r requirements.txt \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
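A sketch of how this new Dockerfile might be exercised on a host with the NVIDIA container toolkit; CUDA_DOCKER_ARCH, the server stage, and port 8080 come from the file above, while the image tag and model path are placeholders:

    # Restrict the build to compute capability 8.6 instead of all supported archs.
    docker build -f .devops/cuda-new.Dockerfile \
        --build-arg CUDA_DOCKER_ARCH=86 \
        --target server \
        -t llama-cuda-server .

    # LLAMA_ARG_HOST is already set to 0.0.0.0 in the image.
    docker run --rm --gpus all -p 8080:8080 \
        -v /path/to/models:/models \
        llama-cuda-server -m /models/model.gguf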
.devops/cuda.Dockerfile (4 changes: 2 additions & 2 deletions)

@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+    apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1

 WORKDIR /app
@@ -74,7 +74,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app
.devops/intel.Dockerfile (4 changes: 2 additions & 2 deletions)

@@ -6,7 +6,7 @@ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
-    apt-get install -y git libcurl4-openssl-dev
+    apt-get install -y git libssl-dev

 WORKDIR /app
@@ -73,7 +73,7 @@ ENTRYPOINT ["/app/tools.sh"]
 FROM base AS light

 COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app
.devops/llama-cli-cann.Dockerfile (7 changes: 4 additions & 3 deletions)

@@ -6,7 +6,7 @@ WORKDIR /app

 COPY . .

-RUN yum install -y gcc g++ cmake make libcurl-devel
+RUN yum install -y gcc g++ cmake make openssl-devel
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
 ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
 ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
@@ -23,11 +23,12 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 RUN echo "Building with static libs" && \
     source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
     cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
-    cmake --build build --config Release --target llama-cli
+    cmake --build build --config Release --target llama-cli && \
+    cmake --build build --config Release --target llama-completion

 # TODO: use image with NNRT
 FROM ascendai/cann:$ASCEND_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /

 ENV LC_ALL=C.utf8
.devops/llama-cpp-cuda.srpm.spec (2 changes: 2 additions & 0 deletions)

@@ -37,6 +37,7 @@ make -j GGML_CUDA=1
 %install
 mkdir -p %{buildroot}%{_bindir}/
 cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
 cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
 cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
@@ -68,6 +69,7 @@ rm -rf %{_builddir}/*

 %files
 %{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-completion
 %{_bindir}/llama-cuda-server
 %{_bindir}/llama-cuda-simple
 /usr/lib/systemd/system/llamacuda.service
.devops/llama-cpp.srpm.spec (2 changes: 2 additions & 0 deletions)

@@ -39,6 +39,7 @@ make -j
 %install
 mkdir -p %{buildroot}%{_bindir}/
 cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
 cp -p llama-server %{buildroot}%{_bindir}/llama-server
 cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
@@ -70,6 +71,7 @@ rm -rf %{_builddir}/*

 %files
 %{_bindir}/llama-cli
+%{_bindir}/llama-completion
 %{_bindir}/llama-server
 %{_bindir}/llama-simple
 /usr/lib/systemd/system/llama.service
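For reference, a hedged sketch of exercising the updated spec with rpmbuild; the exact topdir layout and source tarball setup depend on the packaging environment:

    # Build binary and source RPMs, then confirm the package now carries
    # the llama-completion binary alongside llama-cli.
    rpmbuild -ba .devops/llama-cpp.srpm.spec
    rpm -qlp ~/rpmbuild/RPMS/*/llama-cpp-*.rpm | grep llama-completion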
.devops/musa.Dockerfile (4 changes: 2 additions & 2 deletions)

@@ -18,7 +18,7 @@ RUN apt-get update && \
     python3 \
     python3-pip \
     git \
-    libcurl4-openssl-dev \
+    libssl-dev \
     libgomp1

 WORKDIR /app
@@ -81,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app

 WORKDIR /app
.devops/nix/nixpkgs-instances.nix (4 changes: 2 additions & 2 deletions)

@@ -4,7 +4,7 @@
 # the module `{ pkgs ... }: { /* config */ }` implicitly uses
 # `_module.args.pkgs` (defined in this case by flake-parts).
 perSystem =
-  { system, ... }:
+  { lib, system, ... }:
   {
     _module.args = {
       # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
@@ -33,7 +33,7 @@
         "CUDA EULA"
         "cuDNN EULA"
       ]
-    ) (p.meta.licenses or [ p.meta.license ]);
+    ) (p.meta.licenses or (lib.toList p.meta.license));
 };
 # Ensure dependencies use ROCm consistently
 pkgsRocm = import inputs.nixpkgs {
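The switch to lib.toList above makes the license fallback work whether meta.license is a single value or already a list; a quick sketch of the behavior, evaluated outside the flake:

    # lib.toList wraps a non-list value in a singleton list and leaves lists
    # as-is, so `p.meta.licenses or (lib.toList p.meta.license)` always yields a list.
    nix-instantiate --eval --strict --expr '(import <nixpkgs> {}).lib.toList "mit"'
    # => [ "mit" ]
    nix-instantiate --eval --strict --expr '(import <nixpkgs> {}).lib.toList [ "mit" "bsd3" ]'
    # => [ "mit" "bsd3" ]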
.devops/nix/package-gguf-py.nix (2 changes: 2 additions & 0 deletions)

@@ -3,6 +3,7 @@
   llamaVersion,
   numpy,
   tqdm,
+  requests,
   sentencepiece,
   pyyaml,
   poetry-core,
@@ -20,6 +21,7 @@ buildPythonPackage {
     tqdm
     sentencepiece
     pyyaml
+    requests
   ];
   src = lib.cleanSource ../../gguf-py;
   pythonImportsCheck = [
.devops/nix/package.nix (5 changes: 1 addition & 4 deletions)

@@ -32,7 +32,6 @@
   useMpi ? false,
   useRocm ? config.rocmSupport,
   rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
-  enableCurl ? true,
   useVulkan ? false,
   useRpc ? false,
   llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@@ -160,15 +159,13 @@ effectiveStdenv.mkDerivation (finalAttrs: {
     ++ optionals useMpi [ mpi ]
     ++ optionals useRocm rocmBuildInputs
     ++ optionals useBlas [ blas ]
-    ++ optionals useVulkan vulkanBuildInputs
-    ++ optionals enableCurl [ curl ];
+    ++ optionals useVulkan vulkanBuildInputs;

   cmakeFlags =
     [
       (cmakeBool "LLAMA_BUILD_SERVER" true)
       (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
       (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-      (cmakeBool "LLAMA_CURL" enableCurl)
       (cmakeBool "GGML_NATIVE" false)
       (cmakeBool "GGML_BLAS" useBlas)
       (cmakeBool "GGML_CUDA" useCuda)
.devops/nix/scope.nix (16 changes: 5 additions & 11 deletions)

@@ -7,13 +7,6 @@

 let
   pythonPackages = python3.pkgs;
-  buildPythonPackage = pythonPackages.buildPythonPackage;
-  numpy = pythonPackages.numpy;
-  tqdm = pythonPackages.tqdm;
-  sentencepiece = pythonPackages.sentencepiece;
-  pyyaml = pythonPackages.pyyaml;
-  poetry-core = pythonPackages.poetry-core;
-  pytestCheckHook = pythonPackages.pytestCheckHook;
 in

 # We're using `makeScope` instead of just writing out an attrset
@@ -23,17 +16,18 @@ in
 lib.makeScope newScope (self: {
   inherit llamaVersion;
   gguf-py = self.callPackage ./package-gguf-py.nix {
-    inherit
-      buildPythonPackage
+    inherit (pythonPackages)
       numpy
       tqdm
       sentencepiece
-      poetry-core
       pyyaml
       pytestCheckHook
+      requests
+      buildPythonPackage
+      poetry-core
       ;
   };
-  python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
+  python-scripts = self.callPackage ./python-scripts.nix { inherit (pythonPackages) buildPythonPackage poetry-core; };
   llama-cpp = self.callPackage ./package.nix { };
   docker = self.callPackage ./docker.nix { };
   docker-min = self.callPackage ./docker.nix { interactive = false; };
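With the scope cleaned up, the packages should still be reachable through the flake; a sketch, where the attribute paths are assumptions based on the names defined above:

    # Build the main package and the gguf-py library from the repository root.
    nix build .#llama-cpp
    nix build .#gguf-py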