XyLearningProgramming · XyLearningProgramming · Jul 21, 2025 · Jul 21, 2025 · Jul 21, 2025 · Jul 21, 2025
diff --git a/.github/actions/helm-deploy/action.yml b/.github/actions/helm-deploy/action.yml
@@ -24,47 +24,62 @@ inputs:
     description: 'Kubernetes namespace'
     required: true
     default: 'backend'
+  github_token:
+    description: 'GitHub token for Helm installation'
+    required: true
 
 runs:
   using: 'composite'
-  steps:
-    - name: Set up Kubernetes config
-      shell: bash
-      run: |
-        mkdir -p $HOME/.kube
-        echo "${{ inputs.kube_config_data }}" | base64 -d > $HOME/.kube/config
-        chmod 600 $HOME/.kube/config
-
+  steps:        
     - name: Install Helm
       uses: azure/setup-helm@v3
       with:
         version: 'latest'
+      env:
+        GITHUB_TOKEN: ${{ inputs.github_token }}
 
+    - name: Set up Kubernetes config
+      shell: bash
+      run: |
+        mkdir -p $HOME/.kube
+        echo "${{ inputs.kube_config_data }}" > $HOME/.kube/config
+        chmod 600 $HOME/.kube/config
+
     - name: Parse environment variables
       id: parse_env
       shell: bash
       run: |
         if [ -n "${{ inputs.helm_values_env }}" ]; then
-          echo "helm_env_values<<EOF" >> $GITHUB_OUTPUT
+          # Create temporary file to avoid exposing secrets in logs
+          temp_file=$(mktemp)
           echo "${{ inputs.helm_values_env }}" | while IFS='=' read -r key value; do
             # Skip commented lines and empty lines
             if [[ "$key" =~ ^#.*$ ]] || [ -z "$key" ]; then
               continue
             fi
             if [ -n "$key" ] && [ -n "$value" ]; then
-              echo "  $key: \"$value\""
+              echo "::add-mask::$value"
+              echo "  $key: \"$value\"" >> "$temp_file"
             fi
-          done >> $GITHUB_OUTPUT
+          done
+
+          # Output the parsed values without exposing them in logs
+          echo "helm_env_values<<EOF" >> $GITHUB_OUTPUT
+          cat "$temp_file" >> $GITHUB_OUTPUT
           echo "EOF" >> $GITHUB_OUTPUT
+          rm "$temp_file"
         else
           echo "helm_env_values=" >> $GITHUB_OUTPUT
         fi
-
+         
     - name: Deploy with Helm
       shell: bash
       run: |
-        # Create temporary values file
-        cat > /tmp/override-values.yaml << EOF
+        # Create temporary values file with restricted permissions
+        temp_values=$(mktemp)
+        chmod 600 "$temp_values"
+
+        cat > "$temp_values" << EOF
         image:
           repository: ${{ inputs.registry_repository }}
           tag: "${{ inputs.image_tag }}"
@@ -77,13 +92,23 @@ runs:
         ${{ steps.parse_env.outputs.helm_env_values }}
         EOF
 
-        # Deploy using Helm
+        # Deploy using Helm (values are passed via the temp file, so they never appear on the helm command line)
         helm upgrade --install slm-server ./deploy/helm \
           --namespace ${{ inputs.namespace }} \
           --create-namespace \
-          --values /tmp/override-values.yaml \
+          --values "$temp_values" \
           --wait \
           --timeout 10m
+
+        # Clean up temporary file
+        rm "$temp_values"
+
+    - name: Cleanup on cancellation
+      if: cancelled()
+      shell: bash
+      run: |
+        echo "Workflow cancelled, attempting helm rollback..."
+        helm rollback slm-server 0 -n ${{ inputs.namespace }} --wait --timeout 5m || true
 
     - name: Verify deployment
       shell: bash

diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
@@ -76,3 +76,4 @@ jobs:
         helm_values_persistence_hostpath: ${{ secrets.HELM_VALUES_PERSISTENCE_HOSTPATH }}
         helm_values_persistence_nodename: ${{ secrets.HELM_VALUES_PERSISTENCE_NODENAME }}
         namespace: ${{ env.NAMESPACE }}
+        github_token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -36,12 +36,12 @@ jobs:
         uv run pytest tests/ --ignore=tests/e2e/ --cov=slm_server --cov-report=xml --cov-report=term-missing
 
     - name: Upload coverage to Codecov
-      uses: codecov/codecov-action@v4
+      uses: codecov/codecov-action@v5
       with:
         file: ./coverage.xml
         flags: unittests
-        name: codecov-umbrella
         fail_ci_if_error: false
+        token: ${{ secrets.CODECOV_TOKEN }}
 
     - name: Lint with ruff
       run: |

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -30,3 +30,4 @@ jobs:
         helm_values_persistence_hostpath: ${{ secrets.HELM_VALUES_PERSISTENCE_HOSTPATH }}
         helm_values_persistence_nodename: ${{ secrets.HELM_VALUES_PERSISTENCE_NODENAME }}
         namespace: ${{ env.NAMESPACE }}
+        github_token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/README.md b/README.md
@@ -1,16 +1,18 @@
-# 🤖 SLM Server
+# Small-Language-Model Server
 
 [![CI Pipeline](https://github.com/XyLearningProgramming/slm_server/actions/workflows/ci.yml/badge.svg)](https://github.com/XyLearningProgramming/slm_server/actions/workflows/ci.yml)
 [![codecov](https://codecov.io/gh/XyLearningProgramming/slm_server/branch/main/graph/badge.svg)](https://codecov.io/gh/XyLearningProgramming/slm_server)
 [![Docker](https://img.shields.io/badge/docker-ready-blue.svg)](https://hub.docker.com/r/x3huang/slm_server)
 [![License](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
 
-> 🚀 **Production-ready FastAPI model server** for small language models with OpenAI-compatible API, built-in observability, and enterprise-grade deployment tools.
+🚀 A lightweight model server that serves small language models (default: `Qwen3-0.6B-GGUF`) as a **thin wrapper** around `llama-cpp`, exposing the OpenAI-compatible `/chat/completions` API. The core logic is fewer than 100 lines, in `./slm_server/app.py`!
 
-A light model server that serves small language models (default: `Qwen3-0.6B-GGUF`) using `llama-cpp` via the OpenAI-compatible `/chat/completions` API. Designed for resource-constrained environments with comprehensive monitoring and deployment automation.
+> This is still a WIP project. Issues and pull requests are welcome. I mainly use this repo to deploy an SLM as part of the backend of my own site [x3huang.dev](https://x3huang.dev/), while trying my best to keep the repo model-agnostic.
 
 ## ✨ Features
 
+![Thin wrapper around llama cpp](./docs/20250712_slm_img1.jpg)
+
 - 🔌 **OpenAI-compatible API** - Drop-in replacement with `/chat/completions` endpoint and streaming support
 - ⚡ **Llama.cpp integration** - High-performance inference optimized for limited CPU and memory resources
 - 📊 **Production observability** - Built-in logging, Prometheus metrics, and OpenTelemetry tracing (all configurable)
@@ -50,7 +52,7 @@ docker run -p 8000:8000 -v $(pwd)/models:/app/models slm_server
 ### Test the API
 
 ```bash
-curl -X POST http://localhost:8000/chat/completions \
+curl -X POST http://localhost:8000/api/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
     "model": "qwen",

diff --git a/deploy/helm/templates/NOTES.txt b/deploy/helm/templates/NOTES.txt
@@ -6,16 +6,16 @@
   {{- end }}
 {{- end }}
 {{- else if contains "NodePort" .Values.service.type }}
-  export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "slm_server.fullname" . }})
+  export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "slm-server.fullname" . }})
   export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
   echo http://$NODE_IP:$NODE_PORT
 {{- else if contains "LoadBalancer" .Values.service.type }}
      NOTE: It may take a few minutes for the LoadBalancer IP to be available.
-           You can watch its status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "slm_server.fullname" . }}'
-  export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "slm_server.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}")
+           You can watch its status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "slm-server.fullname" . }}'
+  export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "slm-server.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}")
   echo http://$SERVICE_IP:{{ .Values.service.port }}
 {{- else if contains "ClusterIP" .Values.service.type }}
-  export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "slm_server.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
+  export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "slm-server.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
   export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
   echo "Visit http://127.0.0.1:8080 to use your application"
   kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT

diff --git a/deploy/helm/templates/hpa.yaml b/deploy/helm/templates/hpa.yaml
@@ -2,14 +2,14 @@
 apiVersion: autoscaling/v2
 kind: HorizontalPodAutoscaler
 metadata:
-  name: {{ include "slm_server.fullname" . }}
+  name: {{ include "slm-server.fullname" . }}
   labels:
-    {{- include "slm_server.labels" . | nindent 4 }}
+    {{- include "slm-server.labels" . | nindent 4 }}
 spec:
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
-    name: {{ include "slm_server.fullname" . }}
+    name: {{ include "slm-server.fullname" . }}
   minReplicas: {{ .Values.autoscaling.minReplicas }}
   maxReplicas: {{ .Values.autoscaling.maxReplicas }}
   metrics:

diff --git a/deploy/helm/templates/ingress.yaml b/deploy/helm/templates/ingress.yaml
@@ -2,9 +2,9 @@
 apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
-  name: {{ include "slm_server.fullname" . }}
+  name: {{ include "slm-server.fullname" . }}
   labels:
-    {{- include "slm_server.labels" . | nindent 4 }}
+    {{- include "slm-server.labels" . | nindent 4 }}
   {{- with .Values.ingress.annotations }}
   annotations:
     {{- toYaml . | nindent 4 }}
@@ -35,7 +35,7 @@ spec:
             {{- end }}
             backend:
               service:
-                name: {{ include "slm_server.fullname" $ }}
+                name: {{ include "slm-server.fullname" $ }}
                 port:
                   number: {{ $.Values.service.port }}
           {{- end }}

diff --git a/deploy/helm/templates/pv.yaml b/deploy/helm/templates/pv.yaml
@@ -10,7 +10,7 @@ spec:
     storage: {{ .Values.persistence.size }}
   accessModes:
     - {{ .Values.persistence.accessMode }}
-  hostPath:
+  local:
     path: {{ .Values.persistence.hostPath }}
   nodeAffinity:
     required:

diff --git a/deploy/helm/templates/tests/test-connection.yaml b/deploy/helm/templates/tests/test-connection.yaml
@@ -1,15 +1,15 @@
 apiVersion: v1
 kind: Pod
 metadata:
-  name: "{{ include "slm_server.fullname" . }}-test-connection"
+  name: "{{ include "slm-server.fullname" . }}-test-connection"
   labels:
-    {{- include "slm_server.labels" . | nindent 4 }}
+    {{- include "slm-server.labels" . | nindent 4 }}
   annotations:
     "helm.sh/hook": test
 spec:
   containers:
     - name: wget
       image: busybox
       command: ['wget']
-      args: ['{{ include "slm_server.fullname" . }}:{{ .Values.service.port }}']
+      args: ['{{ include "slm-server.fullname" . }}:{{ .Values.service.port }}']
   restartPolicy: Never
diff --git a/deploy/helm/values.yaml b/deploy/helm/values.yaml
@@ -50,6 +50,14 @@ ingress:
 hpa:
   enabled: false
 
+# This section configures autoscaling. More information can be found here: https://kubernetes.io/docs/concepts/workloads/autoscaling/
+autoscaling:
+  enabled: false
+  minReplicas: 1
+  maxReplicas: 100
+  targetCPUUtilizationPercentage: 80
+  # targetMemoryUtilizationPercentage: 80
+
 # Environment variables to inject into the container
 # Example configuration for SLM server settings
 env: {}
@@ -73,7 +81,7 @@ env: {}
 # See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
 resources:
   limits:
-    cpu: 2.5
+    cpu: 1500m
     memory: 800Mi
   # requests:
   #   cpu: 1
@@ -85,15 +93,15 @@ probes:
     enabled: true
     path: /health
     initialDelaySeconds: 10
-    periodSeconds: 10
+    periodSeconds: 70
     timeoutSeconds: 5
     successThreshold: 1
     failureThreshold: 3
   liveness:
     enabled: true
     path: /health
     initialDelaySeconds: 30
-    periodSeconds: 30
+    periodSeconds: 70
     timeoutSeconds: 5
     successThreshold: 1
     failureThreshold: 3