Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cmd/api/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,11 @@ func run() error {
}); ok {
reconciler.StartTAPGCReconciler(ctx)
}
if reconciler, ok := app.InstanceManager.(interface {
StartRuntimeOrphanReconciler(context.Context)
}); ok {
reconciler.StartRuntimeOrphanReconciler(ctx)
}

// Log OTel status
if cfg.Otel.Enabled {
Expand Down
3 changes: 2 additions & 1 deletion lib/instances/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,8 @@ type manager struct {
admissionReconcileOnce sync.Once

// Periodic TAP garbage collection reconciler.
tapGCOnce sync.Once
tapGCOnce sync.Once
runtimeOrphanGCOnce sync.Once

// Hypervisor support
vmStarters map[hypervisor.Type]hypervisor.VMStarter
Expand Down
102 changes: 102 additions & 0 deletions lib/instances/runtime_orphan.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package instances

import (
"context"
"errors"
"time"

"github.com/kernel/hypeman/lib/logger"
)

const (
// runtimeOrphanGCInterval is the period of the ticker that drives repeated
// orphan reconciliation passes after the initial one.
runtimeOrphanGCInterval = 60 * time.Second
// runtimeOrphanMinAge is the minimum age an orphaned runtime process with no
// instance metadata must reach before the reconciler terminates it; younger
// processes are left alone for that pass.
runtimeOrphanMinAge = 5 * time.Minute
)

// orphanRuntimeProcess describes one hypervisor process reported by
// scanOrphanRuntimeProcesses as a candidate for adoption or termination.
type orphanRuntimeProcess struct {
// PID is the process ID of the runtime process.
PID int
// InstanceID is the instance identifier the process was associated with;
// it is used as the key when loading instance metadata.
InstanceID string
// Age is how long the process has been running. It is compared against
// runtimeOrphanMinAge before an unowned process is terminated.
Age time.Duration
// Command is the command line of the process.
Command string
}

// StartRuntimeOrphanReconciler launches the background loop that adopts or
// removes hypervisor runtimes left behind after hypeman-api restarts. It guards
// hosts running systemd KillMode=process, where qemu/firecracker children can
// outlive the API process and get reparented to PID 1.
//
// The loop is started at most once per manager; later calls are no-ops. One
// reconciliation pass runs immediately, then one per runtimeOrphanGCInterval
// until ctx is cancelled. A nil ctx is replaced with context.Background().
func (m *manager) StartRuntimeOrphanReconciler(ctx context.Context) {
	if ctx == nil {
		ctx = context.Background()
	}

	loop := func() {
		// First pass right away so orphans are handled without waiting a full interval.
		m.reconcileRuntimeOrphans(ctx)

		tick := time.NewTicker(runtimeOrphanGCInterval)
		defer tick.Stop()

		for {
			select {
			case <-tick.C:
				m.reconcileRuntimeOrphans(ctx)
			case <-ctx.Done():
				return
			}
		}
	}

	m.runtimeOrphanGCOnce.Do(func() {
		go loop()
	})
}

// reconcileRuntimeOrphans performs one reconciliation pass over the processes
// returned by scanOrphanRuntimeProcesses for the manager's guests directory.
//
// For every reported process:
//   - If metadata for its instance ID loads, the process is adopted: its PID is
//     stored in the metadata's HypervisorPID (written only when it differs from
//     the recorded value).
//   - If loading fails with an error other than ErrNotFound, the process is
//     left untouched for this pass.
//   - If the metadata does not exist and the process is at least
//     runtimeOrphanMinAge old, the process is terminated.
//
// Nothing is returned; every failure is logged as a warning. The pass stops
// early once ctx is cancelled, because a single termination can block for
// several seconds and the loop should not keep adopting or killing processes
// after shutdown has been requested.
//
// NOTE(review): adoption overwrites a differing recorded PID without checking
// whether that recorded process is still alive, and no per-instance locking is
// visible here — confirm this cannot race with a concurrent start/stop.
func (m *manager) reconcileRuntimeOrphans(ctx context.Context) {
	log := logger.FromContext(ctx)

	orphaned, err := scanOrphanRuntimeProcesses(m.paths.GuestsDir())
	if err != nil {
		log.WarnContext(ctx, "runtime orphan GC: failed to scan processes", "error", err)
		return
	}
	for _, proc := range orphaned {
		// Respect cancellation between processes.
		if ctx.Err() != nil {
			return
		}

		meta, err := m.loadMetadata(proc.InstanceID)
		if err == nil {
			// The instance is known: make sure its metadata points at this process.
			if meta.HypervisorPID == nil || *meta.HypervisorPID != proc.PID {
				pid := proc.PID
				meta.HypervisorPID = &pid
				if saveErr := m.saveMetadata(meta); saveErr != nil {
					log.WarnContext(ctx, "runtime orphan GC: failed to adopt runtime process",
						"instance_id", proc.InstanceID,
						"pid", proc.PID,
						"error", saveErr,
					)
					continue
				}
				log.InfoContext(ctx, "runtime orphan GC: adopted runtime process",
					"instance_id", proc.InstanceID,
					"pid", proc.PID,
				)
			}
			continue
		}
		if !errors.Is(err, ErrNotFound) {
			// Ownership is unknown, so do not risk killing a live instance.
			log.WarnContext(ctx, "runtime orphan GC: failed to load metadata",
				"instance_id", proc.InstanceID,
				"pid", proc.PID,
				"error", err,
			)
			continue
		}
		// No metadata exists for this instance. Skip processes younger than the
		// grace period before terminating.
		if proc.Age < runtimeOrphanMinAge {
			continue
		}
		if err := terminateRuntimeProcess(proc.PID); err != nil {
			log.WarnContext(ctx, "runtime orphan GC: failed to terminate unowned runtime process",
				"instance_id", proc.InstanceID,
				"pid", proc.PID,
				"age", proc.Age,
				"error", err,
			)
			continue
		}
		log.InfoContext(ctx, "runtime orphan GC: terminated unowned runtime process",
			"instance_id", proc.InstanceID,
			"pid", proc.PID,
			"age", proc.Age,
		)
	}
}
151 changes: 151 additions & 0 deletions lib/instances/runtime_orphan_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
//go:build linux

package instances

import (
"bytes"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"syscall"
"time"
)

// scanOrphanRuntimeProcesses walks /proc and returns every process that
//   - reports a comm of "qemu-system-x86" or "firecracker",
//   - has a parent PID of 1, and
//   - has a command line containing a path below guestsDir, from which the
//     instance ID is taken.
//
// An error is returned only when /proc itself cannot be listed. Processes whose
// per-PID files cannot be read are skipped.
func scanOrphanRuntimeProcesses(guestsDir string) ([]orphanRuntimeProcess, error) {
entries, err := os.ReadDir("/proc")
if err != nil {
return nil, err
}
// The boot-time error is deliberately dropped: when boot time is unavailable
// bootTime stays zero and every reported Age is left at zero below.
bootTime, _ := linuxBootTime()
now := time.Now()
var out []orphanRuntimeProcess
for _, entry := range entries {
if !entry.IsDir() {
continue
}
// Only numeric directory names are PIDs.
pid, err := strconv.Atoi(entry.Name())
if err != nil {
continue
}
comm, err := os.ReadFile(filepath.Join("/proc", entry.Name(), "comm"))
if err != nil {
continue
}
// /proc/<pid>/comm holds at most 15 visible bytes (TASK_COMM_LEN is 16
// including the terminating NUL), so a qemu-system-x86_64 binary shows up
// as the 15-character string "qemu-system-x86" matched here.
name := strings.TrimSpace(string(comm))
if name != "qemu-system-x86" && name != "firecracker" {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

QEMU comm name exceeds TASK_COMM_LEN, never matches

High Severity

The filter checks name != "qemu-system-x86" (16 characters), but Linux's TASK_COMM_LEN is 16 bytes including the null terminator, so /proc/PID/comm truncates process names to 15 characters. The actual QEMU binary is qemu-system-x86_64 (per qemuBinaryName() in process.go), which the kernel truncates to "qemu-system-x8" in comm. Since "qemu-system-x86" is 16 chars and can never appear in /proc/PID/comm, the reconciler silently skips all orphaned QEMU processes — it can neither adopt nor terminate them.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 11739f4. Configure here.

continue
}
// Only processes whose parent is PID 1 are treated as orphans.
ppid, startTime, err := readProcStatusAndStart(pid)
if err != nil || ppid != 1 {
continue
}
cmdlineBytes, err := os.ReadFile(filepath.Join("/proc", entry.Name(), "cmdline"))
if err != nil {
continue
}
// cmdline arguments are NUL-separated; turn them into one space-separated string.
cmdline := string(bytes.ReplaceAll(cmdlineBytes, []byte{0}, []byte{' '}))
instanceID := instanceIDFromRuntimeCmdline(guestsDir, cmdline)
if instanceID == "" {
continue
}
// Age stays zero unless both the boot time and the process start offset are known.
age := time.Duration(0)
if !bootTime.IsZero() && startTime > 0 {
age = now.Sub(bootTime.Add(startTime))
}
out = append(out, orphanRuntimeProcess{
PID: pid,
InstanceID: instanceID,
Age: age,
Command: cmdline,
})
}
return out, nil
}

// instanceIDFromRuntimeCmdline extracts an instance ID from a space-separated
// command line. It looks for the first occurrence of the cleaned guestsDir
// followed by a path separator and returns the text after it, up to the next
// path separator or space (or the end of the string). It returns "" when
// guestsDir does not occur in cmdline.
func instanceIDFromRuntimeCmdline(guestsDir, cmdline string) string {
	sep := string(filepath.Separator)
	marker := filepath.Clean(guestsDir) + sep

	_, tail, found := strings.Cut(cmdline, marker)
	if !found {
		return ""
	}
	if stop := strings.IndexAny(tail, sep+" "); stop >= 0 {
		return tail[:stop]
	}
	return tail
}

// readProcStatusAndStart reads the parent PID and start offset of a process
// from /proc.
//
// ppid comes from the "PPid:" line of /proc/<pid>/status and is 0 when that
// line is missing or malformed. start is the process start time relative to
// boot, taken from the starttime field (field 22) of /proc/<pid>/stat; it is 0
// when the stat file cannot be read or parsed.
//
// An error is returned only when /proc/<pid>/status cannot be read. A failure
// to read or parse the stat file is not an error: ppid is still returned with
// start left at 0.
func readProcStatusAndStart(pid int) (ppid int, start time.Duration, err error) {
	const (
		// userHZ is the number of clock ticks per second used by /proc/<pid>/stat.
		// It is assumed to be 100 (the usual USER_HZ); the standard library
		// offers no sysconf(_SC_CLK_TCK) lookup.
		userHZ = 100
		// startTimeIdx is the index of starttime (field 22) once the leading
		// "pid (comm)" part has been removed, i.e. counting from field 3.
		startTimeIdx = 19
	)

	procDir := filepath.Join("/proc", strconv.Itoa(pid))

	status, err := os.ReadFile(filepath.Join(procDir, "status"))
	if err != nil {
		return 0, 0, err
	}
	for _, line := range strings.Split(string(status), "\n") {
		if !strings.HasPrefix(line, "PPid:") {
			continue
		}
		if fields := strings.Fields(line); len(fields) == 2 {
			// A malformed value leaves ppid at 0, which callers treat as "not an orphan".
			ppid, _ = strconv.Atoi(fields[1])
		}
		break
	}

	stat, err := os.ReadFile(filepath.Join(procDir, "stat"))
	if err != nil {
		return ppid, 0, nil
	}
	// The comm field is wrapped in parentheses and may itself contain spaces or
	// parentheses, so splitting the whole line on whitespace can shift every
	// later field. Parse only what follows the last ')'.
	statLine := string(stat)
	commEnd := strings.LastIndexByte(statLine, ')')
	if commEnd < 0 {
		return ppid, 0, nil
	}
	fields := strings.Fields(statLine[commEnd+1:])
	if len(fields) > startTimeIdx {
		ticks, parseErr := strconv.ParseInt(fields[startTimeIdx], 10, 64)
		if parseErr == nil && ticks > 0 {
			// Convert whole seconds and the remainder separately so that large
			// tick counts do not overflow when scaled to nanoseconds.
			start = time.Duration(ticks/userHZ)*time.Second +
				time.Duration(ticks%userHZ)*time.Second/userHZ
		}
	}
	return ppid, start, nil
}

// linuxBootTime returns the system boot time as recorded on the "btime" line
// of /proc/stat (seconds since the Unix epoch).
//
// It returns an error when /proc/stat cannot be read, when the btime value is
// not an integer, or when no well-formed btime line is present.
func linuxBootTime() (time.Time, error) {
	var zero time.Time

	raw, err := os.ReadFile("/proc/stat")
	if err != nil {
		return zero, err
	}
	for _, row := range strings.Split(string(raw), "\n") {
		if strings.HasPrefix(row, "btime ") {
			parts := strings.Fields(row)
			if len(parts) != 2 {
				// Malformed btime line: give up and report it as missing.
				break
			}
			epoch, parseErr := strconv.ParseInt(parts[1], 10, 64)
			if parseErr != nil {
				return zero, parseErr
			}
			return time.Unix(epoch, 0), nil
		}
	}
	return zero, fmt.Errorf("btime not found")
}

// terminateRuntimeProcess stops the process with the given PID. It sends
// SIGTERM, then polls every 100ms for up to 10 seconds for the process to
// disappear, and finally sends SIGKILL if it is still present.
//
// A process that no longer exists (ESRCH) at any step counts as success. Any
// other signalling error is returned as is.
func terminateRuntimeProcess(pid int) error {
	switch err := syscall.Kill(pid, syscall.SIGTERM); err {
	case nil:
		// Signal delivered; wait for the process to exit below.
	case syscall.ESRCH:
		return nil
	default:
		return err
	}

	// Signal 0 only probes for existence without delivering anything.
	for giveUp := time.Now().Add(10 * time.Second); time.Now().Before(giveUp); time.Sleep(100 * time.Millisecond) {
		probeErr := syscall.Kill(pid, 0)
		if probeErr == syscall.ESRCH {
			return nil
		}
		if probeErr != nil {
			return probeErr
		}
	}

	switch err := syscall.Kill(pid, syscall.SIGKILL); err {
	case nil, syscall.ESRCH:
		return nil
	default:
		return err
	}
}
25 changes: 25 additions & 0 deletions lib/instances/runtime_orphan_linux_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
//go:build linux

package instances

import (
"testing"

"github.com/stretchr/testify/require"
)

func TestInstanceIDFromRuntimeCmdline(t *testing.T) {
t.Parallel()

require.Equal(t,
"abc123",
instanceIDFromRuntimeCmdline("/var/lib/hypeman/guests", "/usr/bin/qemu-system-x86_64 -chardev socket,path=/var/lib/hypeman/guests/abc123/qemu.sock"),
)
require.Equal(t,
"fc456",
instanceIDFromRuntimeCmdline("/var/lib/hypeman/guests", "/var/lib/hypeman/system/binaries/firecracker --api-sock /var/lib/hypeman/guests/fc456/fc.sock"),
)
require.Empty(t,
instanceIDFromRuntimeCmdline("/var/lib/hypeman/guests", "/usr/bin/qemu-system-x86_64 -monitor /tmp/qemu.sock"),
)
}
11 changes: 11 additions & 0 deletions lib/instances/runtime_orphan_unsupported.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
//go:build !linux

package instances

// scanOrphanRuntimeProcesses is the non-Linux stub: it performs no scan and
// always reports no orphaned processes and no error.
func scanOrphanRuntimeProcesses(_ string) ([]orphanRuntimeProcess, error) {
	var none []orphanRuntimeProcess
	return none, nil
}

// terminateRuntimeProcess is the non-Linux stub: it sends no signal and always
// reports success.
func terminateRuntimeProcess(_ int) error {
	var err error
	return err
}
Loading