diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index 78dd7cdeb..11de03505 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -84,17 +84,30 @@ class ModelSettings(BaseSettings):
         description="The number of threads to use when batch processing. Use -1 for max cpu threads",
     )
     rope_scaling_type: int = Field(
-        default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
+        default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
+        description="RoPE frequency scaling method. Defaults to the type defined by the model (unspecified).",
     )
     rope_freq_base: float = Field(default=0.0, description="RoPE base frequency")
     rope_freq_scale: float = Field(
         default=0.0, description="RoPE frequency scaling factor"
     )
-    yarn_ext_factor: float = Field(default=-1.0)
-    yarn_attn_factor: float = Field(default=1.0)
-    yarn_beta_fast: float = Field(default=32.0)
-    yarn_beta_slow: float = Field(default=1.0)
-    yarn_orig_ctx: int = Field(default=0)
+    yarn_ext_factor: float = Field(
+        default=-1.0,
+        description="YaRN extrapolation mix factor. -1.0 uses the value from the model.",
+    )
+    yarn_attn_factor: float = Field(
+        default=1.0, description="YaRN magnitude scaling factor for attention."
+    )
+    yarn_beta_fast: float = Field(
+        default=32.0, description="YaRN low correction dim (beta fast)."
+    )
+    yarn_beta_slow: float = Field(
+        default=1.0, description="YaRN high correction dim (beta slow)."
+    )
+    yarn_orig_ctx: int = Field(
+        default=0,
+        description="YaRN original context size of the model. 0 uses the model's training context size.",
+    )
     mul_mat_q: bool = Field(
         default=True, description="if true, use experimental mul_mat_q kernels"
     )