-
Notifications
You must be signed in to change notification settings - Fork 31
devices: drop cilium/ebpf{,link} deps #64
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,21 +1,138 @@ | ||
| package devices | ||
|
|
||
| import ( | ||
| "bytes" | ||
| "errors" | ||
| "fmt" | ||
| "os" | ||
| "runtime" | ||
| "sync" | ||
| "unsafe" | ||
|
|
||
| "github.com/cilium/ebpf" | ||
| "github.com/cilium/ebpf/asm" | ||
| "github.com/cilium/ebpf/link" | ||
| "github.com/sirupsen/logrus" | ||
| "golang.org/x/sys/unix" | ||
| ) | ||
|
|
||
| func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error) { | ||
| func bpf(cmd uintptr, attr unsafe.Pointer, size uintptr) (uintptr, error) { | ||
| r1, _, err := unix.Syscall(unix.SYS_BPF, cmd, uintptr(attr), size) | ||
| runtime.KeepAlive(attr) | ||
| if err != 0 { | ||
| return r1, err | ||
| } | ||
| return r1, nil | ||
| } | ||
|
|
||
| // bpfProgLoad loads a BPF_PROG_TYPE_CGROUP_DEVICE program and returns its fd. | ||
| func bpfProgLoad(insns asm.Instructions, license string) (int, error) { | ||
| buf := bytes.NewBuffer(make([]byte, 0, insns.Size())) | ||
| if err := insns.Marshal(buf, nativeEndian); err != nil { | ||
| return -1, err | ||
| } | ||
| insnsBytes := buf.Bytes() | ||
|
|
||
| licensePtr, err := unix.BytePtrFromString(license) | ||
| if err != nil { | ||
| return -1, err | ||
| } | ||
|
|
||
| // Subset of struct bpf_attr for BPF_PROG_LOAD. Fields past the ones we set | ||
| // are left zero; the kernel zero-fills any part of bpf_attr beyond the size | ||
| // we pass. | ||
| attr := struct { | ||
| progType uint32 | ||
| insnCnt uint32 | ||
| insns uint64 // pointer | ||
| license uint64 // pointer | ||
| logLevel uint32 | ||
| logSize uint32 | ||
| logBuf uint64 // pointer | ||
| }{ | ||
| progType: unix.BPF_PROG_TYPE_CGROUP_DEVICE, | ||
| insnCnt: uint32(len(insnsBytes) / asm.InstructionSize), | ||
| insns: uint64(uintptr(unsafe.Pointer(&insnsBytes[0]))), | ||
| license: uint64(uintptr(unsafe.Pointer(licensePtr))), | ||
| } | ||
|
|
||
| fd, err := bpf(unix.BPF_PROG_LOAD, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) | ||
| // attr holds the pointers as integers, so the GC can't see them; keep the | ||
| // referenced objects alive until the syscall returns. | ||
| runtime.KeepAlive(insnsBytes) | ||
| runtime.KeepAlive(licensePtr) | ||
| if err == nil { | ||
| return int(fd), nil | ||
| } | ||
|
|
||
| // The load failed. Retry with the verifier log enabled so we can include | ||
| // it in the error (the first attempt skips it, as it is the fast path). | ||
| log := make([]byte, 64*1024) | ||
| attr.logLevel = 1 | ||
| attr.logSize = uint32(len(log)) | ||
| attr.logBuf = uint64(uintptr(unsafe.Pointer(&log[0]))) | ||
|
|
||
| fd, err = bpf(unix.BPF_PROG_LOAD, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) | ||
| runtime.KeepAlive(insnsBytes) | ||
| runtime.KeepAlive(licensePtr) | ||
| runtime.KeepAlive(log) | ||
| if err == nil { | ||
| return int(fd), nil | ||
| } | ||
| if n := bytes.IndexByte(log, 0); n > 0 { | ||
| return -1, fmt.Errorf("%w: %s", err, bytes.TrimRight(log[:n], "\n")) | ||
| } | ||
| return -1, err | ||
| } | ||
|
|
||
| // bpfProgGetFdByID returns the fd for the BPF program with the given ID. | ||
| func bpfProgGetFdByID(id uint32) (int, error) { | ||
| // The kernel zero-fills the rest of bpf_attr beyond the size we pass. | ||
| attr := struct{ id uint32 }{id} | ||
| fd, err := bpf(unix.BPF_PROG_GET_FD_BY_ID, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) | ||
| if err != nil { | ||
| return -1, err | ||
| } | ||
| return int(fd), nil | ||
| } | ||
|
|
||
| // bpfProgAttach attaches progFd to cgroupFd with the given flags. If replaceFd | ||
| // is >= 0, its fd is set in replaceBpfFd (for BPF_F_REPLACE semantics). | ||
| func bpfProgAttach(cgroupFd, progFd int, attachFlags uint32, replaceFd int) error { | ||
| attr := struct { | ||
| targetFd uint32 | ||
| attachBpfFd uint32 | ||
| attachType uint32 | ||
| attachFlags uint32 | ||
| replaceBpfFd uint32 | ||
| }{ | ||
| targetFd: uint32(cgroupFd), | ||
| attachBpfFd: uint32(progFd), | ||
| attachType: uint32(unix.BPF_CGROUP_DEVICE), | ||
| attachFlags: attachFlags, | ||
| } | ||
| if replaceFd >= 0 { | ||
| attr.replaceBpfFd = uint32(replaceFd) | ||
| } | ||
| _, err := bpf(unix.BPF_PROG_ATTACH, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) | ||
| return err | ||
| } | ||
|
|
||
| // bpfProgDetach detaches progFd from cgroupFd. | ||
| func bpfProgDetach(cgroupFd, progFd int) error { | ||
| // The kernel zero-fills the rest of bpf_attr beyond the size we pass. | ||
| attr := struct { | ||
| targetFd uint32 | ||
| attachBpfFd uint32 | ||
| attachType uint32 | ||
| }{ | ||
| targetFd: uint32(cgroupFd), | ||
| attachBpfFd: uint32(progFd), | ||
| attachType: uint32(unix.BPF_CGROUP_DEVICE), | ||
| } | ||
| _, err := bpf(unix.BPF_PROG_DETACH, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) | ||
| return err | ||
| } | ||
|
|
||
| func findAttachedCgroupDeviceFilters(dirFd int) (_ []int, retErr error) { | ||
| type bpfAttrQuery struct { | ||
| TargetFd uint32 | ||
| AttachType uint32 | ||
|
|
@@ -37,36 +154,34 @@ func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error | |
| ProgCnt: uint32(len(progIds)), | ||
| } | ||
|
|
||
| // Fetch the list of program ids. | ||
| _, _, errno := unix.Syscall(unix.SYS_BPF, | ||
| uintptr(unix.BPF_PROG_QUERY), | ||
| uintptr(unsafe.Pointer(&query)), | ||
| unsafe.Sizeof(query)) | ||
| // Fetch the list of program ids. bpf() keeps &query alive for the | ||
| // duration of the syscall, and query.ProgCnt is read right after. | ||
| _, err := bpf(unix.BPF_PROG_QUERY, unsafe.Pointer(&query), unsafe.Sizeof(query)) | ||
| runtime.KeepAlive(progIds) | ||
| size = int(query.ProgCnt) | ||
| runtime.KeepAlive(query) | ||
| if errno != 0 { | ||
| if err != nil { | ||
| // On ENOSPC we get the correct number of programs. | ||
| if errno == unix.ENOSPC { | ||
| if errors.Is(err, unix.ENOSPC) { | ||
| retries++ | ||
| continue | ||
| } | ||
| return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno) | ||
| return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", err) | ||
| } | ||
|
|
||
| // Convert the ids to program handles. | ||
| // On error we don't return the programs slice, so close the fds stored there. | ||
| // Convert the ids to program fds. | ||
| // On error we don't return the fds slice, so close the fds stored there. | ||
| progIds = progIds[:size] | ||
| programs := make([]*ebpf.Program, 0, len(progIds)) | ||
| fds := make([]int, 0, len(progIds)) | ||
| defer func() { | ||
| if retErr != nil { | ||
| for _, p := range programs { | ||
| p.Close() | ||
| for _, fd := range fds { | ||
| unix.Close(fd) | ||
| } | ||
| } | ||
| }() | ||
|
|
||
| for _, progId := range progIds { | ||
| program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId)) | ||
| fd, err := bpfProgGetFdByID(progId) | ||
| if err != nil { | ||
| // We skip over programs that give us -EACCES or -EPERM. This | ||
| // is necessary because there may be BPF programs that have | ||
|
|
@@ -83,10 +198,10 @@ func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error | |
| } | ||
| return nil, fmt.Errorf("cannot fetch program from id: %w", err) | ||
| } | ||
| programs = append(programs, program) | ||
| fds = append(fds, fd) | ||
| } | ||
| runtime.KeepAlive(progIds) | ||
| return programs, nil | ||
| return fds, nil | ||
| } | ||
|
|
||
| return nil, errors.New("could not get complete list of CGROUP_DEVICE programs") | ||
|
|
@@ -99,23 +214,17 @@ var ( | |
|
|
||
| // Loosely based on the BPF_F_REPLACE support check in | ||
| // https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go. | ||
| // | ||
| // TODO: move this logic to cilium/ebpf | ||
|
Comment on lines
-102
to
-103
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Was there anything in our code that would still be worth upstreaming (even if we don't use it)? Not sure how generic our changes are.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This code was/is there, just not as a public API. I guess the only "unique" code we have here is how we use BPF_F_REPLACE. Also, it's not too much code. See also opencontainers/runc#5218 (comment) |
||
| func haveBpfProgReplace() bool { | ||
| haveBpfProgReplaceOnce.Do(func() { | ||
| prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ | ||
| Type: ebpf.CGroupDevice, | ||
| License: "MIT", | ||
| Instructions: asm.Instructions{ | ||
| asm.Mov.Imm(asm.R0, 0), | ||
| asm.Return(), | ||
| }, | ||
| }) | ||
| progFd, err := bpfProgLoad(asm.Instructions{ | ||
| asm.Mov.Imm(asm.R0, 0), | ||
| asm.Return(), | ||
| }, "MIT") | ||
| if err != nil { | ||
| logrus.Warnf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err) | ||
| logrus.Warnf("checking for BPF_F_REPLACE support: bpfProgLoad failed: %v", err) | ||
| return | ||
| } | ||
| defer prog.Close() | ||
| defer unix.Close(progFd) | ||
|
|
||
| devnull, err := os.Open("/dev/null") | ||
| if err != nil { | ||
|
|
@@ -127,24 +236,19 @@ func haveBpfProgReplace() bool { | |
| // We know that we have BPF_PROG_ATTACH since we can load | ||
| // BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL | ||
| // we know that the feature isn't present. | ||
| err = link.RawAttachProgram(link.RawAttachProgramOptions{ | ||
| // We rely on this fd being checked after attachFlags in the kernel. | ||
| Target: int(devnull.Fd()), | ||
| // Attempt to "replace" our BPF program with itself. This will | ||
| // always fail, but we should get -EINVAL if BPF_F_REPLACE is not | ||
| // supported. | ||
| Anchor: link.ReplaceProgram(prog), | ||
| Program: prog, | ||
| Attach: ebpf.AttachCGroupDevice, | ||
| Flags: unix.BPF_F_ALLOW_MULTI, | ||
| }) | ||
| if errors.Is(err, ebpf.ErrNotSupported) || errors.Is(err, unix.EINVAL) { | ||
| // | ||
| // We rely on the target fd being checked after attachFlags in the | ||
| // kernel. Attempting to "replace" our BPF program with itself always | ||
| // fails, but we should get -EINVAL if BPF_F_REPLACE is not supported, | ||
| // and -EBADF (from the dummy target fd) if it is. | ||
| err = bpfProgAttach(int(devnull.Fd()), progFd, unix.BPF_F_ALLOW_MULTI|unix.BPF_F_REPLACE, progFd) | ||
| if errors.Is(err, unix.EINVAL) { | ||
| // not supported | ||
| return | ||
| } | ||
| if !errors.Is(err, unix.EBADF) { | ||
| // If we see any new errors here, it's possible that there is a | ||
| // regression due to a cilium/ebpf update and the above EINVAL | ||
| // regression due to a kernel update and the above EINVAL | ||
| // checks are not working. So, be loud about it so someone notices | ||
| // and we can get the issue fixed quicker. | ||
| logrus.Warnf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err) | ||
|
|
@@ -169,83 +273,58 @@ func loadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd | |
| _ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit) | ||
|
|
||
| // Get the list of existing programs. | ||
| oldProgs, err := findAttachedCgroupDeviceFilters(dirFd) | ||
| oldFds, err := findAttachedCgroupDeviceFilters(dirFd) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| defer func() { | ||
| for _, p := range oldProgs { | ||
| p.Close() | ||
| for _, fd := range oldFds { | ||
| unix.Close(fd) | ||
| } | ||
| }() | ||
|
|
||
| useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1 | ||
| useReplaceProg := haveBpfProgReplace() && len(oldFds) == 1 | ||
|
|
||
| // Generate new program. | ||
| spec := &ebpf.ProgramSpec{ | ||
| Type: ebpf.CGroupDevice, | ||
| Instructions: insts, | ||
| License: license, | ||
| } | ||
| prog, err := ebpf.NewProgram(spec) | ||
| progFd, err := bpfProgLoad(insts, license) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| defer prog.Close() | ||
| // Once the program is attached, the kernel keeps it alive via the cgroup | ||
| // attachment, so we no longer need our own fd; we also don't need it if the | ||
| // attach below fails. Either way, close it on return. | ||
| defer unix.Close(progFd) | ||
|
|
||
| // If there is only one old program, we can just replace it directly. | ||
|
|
||
| attachProgramOptions := link.RawAttachProgramOptions{ | ||
| Target: dirFd, | ||
| Program: prog, | ||
| Attach: ebpf.AttachCGroupDevice, | ||
| Flags: unix.BPF_F_ALLOW_MULTI, | ||
| } | ||
|
|
||
| replaceFd := -1 | ||
| attachFlags := uint32(unix.BPF_F_ALLOW_MULTI) | ||
| if useReplaceProg { | ||
| attachProgramOptions.Anchor = link.ReplaceProgram(oldProgs[0]) | ||
| replaceFd = oldFds[0] | ||
| attachFlags |= unix.BPF_F_REPLACE | ||
| } | ||
| err = link.RawAttachProgram(attachProgramOptions) | ||
| err = bpfProgAttach(dirFd, progFd, attachFlags, replaceFd) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err) | ||
| } | ||
|
|
||
| if !useReplaceProg { | ||
| logLevel := logrus.DebugLevel | ||
| // If there was more than one old program, give a warning (since this | ||
| // really shouldn't happen with runc-managed cgroups) and then detach | ||
| // all the old programs. | ||
| if len(oldProgs) > 1 { | ||
| if len(oldFds) > 1 { | ||
| // NOTE: Ideally this should be a warning but it turns out that | ||
| // systemd-managed cgroups trigger this warning (apparently | ||
| // systemd doesn't delete old non-systemd programs when | ||
| // setting properties). | ||
| logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs)) | ||
| logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldFds)) | ||
| logLevel = logrus.InfoLevel | ||
| } | ||
| for idx, oldProg := range oldProgs { | ||
| // Output some extra debug info. | ||
| if info, err := oldProg.Info(); err == nil { | ||
| fields := logrus.Fields{ | ||
| "type": info.Type.String(), | ||
| "tag": info.Tag, | ||
| "name": info.Name, | ||
| } | ||
| if id, ok := info.ID(); ok { | ||
| fields["id"] = id | ||
| } | ||
| if runCount, ok := info.RunCount(); ok { | ||
| fields["run_count"] = runCount | ||
| } | ||
| if runtime, ok := info.Runtime(); ok { | ||
| fields["runtime"] = runtime.String() | ||
| } | ||
| logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx) | ||
| } | ||
| err = link.RawDetachProgram(link.RawDetachProgramOptions{ | ||
| Target: dirFd, | ||
| Program: oldProg, | ||
| Attach: ebpf.AttachCGroupDevice, | ||
| }) | ||
| for idx, oldFd := range oldFds { | ||
| logrus.WithFields(logrus.Fields{ | ||
| "fd": oldFd, | ||
| }).Logf(logLevel, "removing old filter %d from cgroup", idx) | ||
| err = bpfProgDetach(dirFd, oldFd) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err) | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| //go:build armbe || arm64be || mips || mips64 || mips64p32 || ppc64 || s390 || s390x || sparc || sparc64 | ||
|
|
||
| package devices | ||
|
|
||
| import "encoding/binary" | ||
|
|
||
| // nativeEndian is used as a workaround for cilium/ebpf/asm | ||
| // which does not accept binary.NativeEndian. | ||
| var nativeEndian = binary.BigEndian | ||
|
Comment on lines
+7
to
+9
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is unfortunate; it makes our code more brittle (if more build tags are ever added). Would this be something that could be fixed in ebpf? My AI-buddy suggested something like this could work (but I haven't verified it):
func newBPFRegisters(dst, src Register, bo binary.ByteOrder) (bpfRegisters, error) {
var b [2]byte
bo.PutUint16(b[:], 0x0102)
switch b {
case [2]byte{0x02, 0x01}: // little
return bpfRegisters((src << 4) | (dst & 0xf)), nil
case [2]byte{0x01, 0x02}: // big
return bpfRegisters((dst << 4) | (src & 0xf)), nil
default:
return 0, fmt.Errorf("unrecognized ByteOrder %T", bo)
}
}
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In that case, I guess we could do this locally as well if we don't want to depend on our build tags being complete:
var nativeEndian = detectNativeEndian()
func detectNativeEndian() binary.ByteOrder {
var b [2]byte
binary.NativeEndian.PutUint16(b[:], 0x0102)
switch b {
case [2]byte{0x02, 0x01}:
return binary.LittleEndian
case [2]byte{0x01, 0x02}:
return binary.BigEndian
default:
panic("unreachable: invalid native byte order")
}
}
or
var nativeEndian = detectNativeEndian()
// detectNativeEndian reports the byte order of the machine the program is
// running on, as either binary.LittleEndian or binary.BigEndian.
func detectNativeEndian() binary.ByteOrder {
	probe := uint16(0x0102)
	// Look at the probe's bytes as they are laid out in memory: on a
	// little-endian machine the low-order byte (0x02) comes first.
	raw := (*[2]byte)(unsafe.Pointer(&probe))
	if raw[0] != 0x02 {
		return binary.BigEndian
	}
	return binary.LittleEndian
}
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Initially I had similar code, but I prefer to determine that at compile time rather than at run time.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, if a whole new architecture is to be added to Golang, fixing this code is trivial.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But, we can have a runtime test to ensure our endian-ness is correct. Added.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, generally agreed. I wish there was a built-in endianness build tag or something (and, yes, initially I thought: why not use binary.NativeEndian?). The long list of platforms made me hesitate a bit (easy to miss one!).
Works for me (for now); still would be great if upstream cilium didn't require us to do this 😅
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I tried: cilium/ebpf#1523
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Why? The overhead should be negligible? |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| //go:build 386 || amd64 || amd64p32 || arm || arm64 || loong64 || mipsle || mips64le || mips64p32le || ppc64le || riscv64 || wasm | ||
|
|
||
| package devices | ||
|
|
||
| import "encoding/binary" | ||
|
|
||
| // nativeEndian is used as a workaround for cilium/ebpf/asm | ||
| // which does not accept binary.NativeEndian. | ||
| var nativeEndian = binary.LittleEndian |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Doesn't the cgroups pkg have this already?