Skip to content

Commit 7c45843

Browse files
authored
add oom checker (#23)
* add oom checker * add some future tasks;add oom_score_adj;add help link * add another oom log pattern Co-authored-by: Binjie Qian <biqian@microsoft.com>
1 parent b99282e commit 7c45843

3 files changed

Lines changed: 170 additions & 0 deletions

File tree

pkg/checkers/oom/oom.go

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
package oom
2+
3+
import (
4+
"bufio"
5+
"fmt"
6+
"github.com/Azure/kdebug/pkg/base"
7+
"github.com/Azure/kdebug/pkg/env"
8+
"os"
9+
"regexp"
10+
"strings"
11+
)
12+
13+
// Defaults and markers used when looking for OOM-killer records.
const (
	// logPath is the kernel log file that New configures the checker to read.
	logPath = "/var/log/kern.log"

	// outOfMemoryKey and cgroupOOMKeyStr are the substrings that mark a log
	// line as OOM-related; a line containing either one is handed to the
	// parser.
	outOfMemoryKey  = "Out of memory"
	cgroupOOMKeyStr = "Memory cgroup out of memory"
)
18+
19+
// helpLink lists background reading that is attached to a check result when
// an OOM kill has been detected.
var helpLink = []string{
	// Kernel OOM-killer internals.
	"https://www.kernel.org/doc/gorman/html/understand/understand016.html",
	// Meaning of the anon-rss / total-vm fields printed in the kill record.
	"https://stackoverflow.com/questions/18845857/what-does-anon-rss-and-total-vm-mean",
	// How OOMKilled works in Kubernetes.
	"https://medium.com/tailwinds-navigator/kubernetes-tip-how-does-oomkilled-work-ba71b135993b",
}
24+
25+
// oomRegex pulls the interesting fields out of a single OOM kill record.
// Capture groups, in order:
//
//	1: leading timestamp (everything up to and including the hh:mm:ss part)
//	2: the token following "process " (the pid in the kernel's kill record)
//	3: the text inside the parentheses that follow (the command name)
//	4: the anon-rss value
//	5: the oom_score_adj value
var oomRegex = regexp.MustCompile(`^(.*:.{2}:.{2}) .* process (.*) \((.*)\) .* anon-rss:(.*), file-rss.* oom_score_adj:(.*)`)
26+
27+
// OOMChecker reports processes that were OOM-killed according to a kernel
// log file.
type OOMChecker struct {
	// kernLogPath is the path of the kernel log file that gets scanned.
	kernLogPath string
}
30+
31+
func (c *OOMChecker) Name() string {
32+
return "OOM"
33+
}
34+
35+
func New() *OOMChecker {
36+
//todo: support other logpath
37+
return &OOMChecker{
38+
kernLogPath: logPath,
39+
}
40+
}
41+
42+
func (c *OOMChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) {
43+
var results []*base.CheckResult
44+
oomResult, err := c.checkOOM(ctx)
45+
if err != nil {
46+
return nil, err
47+
}
48+
results = append(results, oomResult)
49+
return results, nil
50+
}
51+
52+
func (c *OOMChecker) checkOOM(ctx *base.CheckContext) (*base.CheckResult, error) {
53+
result := &base.CheckResult{
54+
Checker: c.Name(),
55+
}
56+
if !envCheck(ctx.Environment) {
57+
result.Description = fmt.Sprint("Skip oom check in non-linux os")
58+
return result, nil
59+
}
60+
oomInfos, err := c.getAndParseOOMLog()
61+
if err != nil {
62+
return nil, err
63+
} else if len(oomInfos) > 0 {
64+
result.Error = strings.Join(oomInfos, "\n")
65+
result.Description = "Detect process oom killed"
66+
result.HelpLinks = helpLink
67+
} else {
68+
result.Description = "No OOM found in recent kernlog."
69+
}
70+
return result, nil
71+
}
72+
func (c *OOMChecker) getAndParseOOMLog() ([]string, error) {
73+
file, err := os.Open(c.kernLogPath)
74+
if err != nil {
75+
return nil, err
76+
}
77+
defer file.Close()
78+
79+
var oomInfos []string
80+
scanner := bufio.NewScanner(file)
81+
for scanner.Scan() {
82+
tmp := scanner.Text()
83+
//todo: more sophisticated OOM context
84+
//pattern match. https://github.com/torvalds/linux/blob/551acdc3c3d2b6bc97f11e31dcf960bc36343bfc/mm/oom_kill.c#L1120, https://github.com/torvalds/linux/blob/551acdc3c3d2b6bc97f11e31dcf960bc36343bfc/mm/oom_kill.c#L895
85+
if strings.Contains(tmp, cgroupOOMKeyStr) || strings.Contains(tmp, outOfMemoryKey) {
86+
oomInfo, err := parseOOMContent(tmp)
87+
if err != nil {
88+
return nil, err
89+
} else {
90+
oomInfos = append(oomInfos, oomInfo)
91+
}
92+
}
93+
}
94+
95+
if err := scanner.Err(); err != nil {
96+
return nil, err
97+
}
98+
return oomInfos, nil
99+
}
100+
101+
func parseOOMContent(content string) (string, error) {
102+
match := oomRegex.FindStringSubmatch(content)
103+
if len(match) != 6 {
104+
err := fmt.Errorf("Can't parse oom content:%s \n", content)
105+
return "", err
106+
} else {
107+
return fmt.Sprintf("progress:[%s %s] is OOM kill at time [%s]. [rss:%s] [oom_score_adj:%s]\n", match[2], match[3], match[1], match[4], match[5]), nil
108+
}
109+
}
110+
111+
func envCheck(environment env.Environment) bool {
112+
//todo:support other os
113+
return environment.HasFlag("ubuntu")
114+
}

pkg/checkers/oom/oom_test.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package oom
2+
3+
import (
4+
"fmt"
5+
"github.com/Azure/kdebug/pkg/base"
6+
"github.com/Azure/kdebug/pkg/env"
7+
"io/ioutil"
8+
"os"
9+
"testing"
10+
)
11+
12+
// testStrings holds sample kernel log lines, one per OOM marker the checker
// recognises. Both describe the same kill of pid 3841 (nginx).
var testStrings = []string{
	// Memory-cgroup OOM record.
	"Feb 22 16:15:02 k8s-ingress-11186066-z1-vmss0000B3 kernel: [989751.247878] Memory cgroup out of memory: Killed process 3841 (nginx) total-vm:240652kB, anon-rss:130344kB, file-rss:5212kB, shmem-rss:208kB, UID:101 pgtables:332kB oom_score_adj:986\n",
	// Plain "Out of memory" record.
	"Feb 22 16:15:02 k8s-ingress-11186066-z1-vmss0000B3 kernel: [989751.247878] Out of memory: Killed process 3841 (nginx) total-vm:240652kB, anon-rss:130344kB, file-rss:5212kB, shmem-rss:208kB, UID:101 pgtables:332kB oom_score_adj:986\n",
}
16+
17+
func TestCheckOOMLogWhenOOM(t *testing.T) {
18+
environment := &env.StaticEnvironment{
19+
Flags: []string{"ubuntu"},
20+
}
21+
if !envCheck(env.GetEnvironment()) {
22+
fmt.Println("skip oom test")
23+
return
24+
}
25+
for _, testString := range testStrings {
26+
27+
tmp, err := ioutil.TempFile("", "kernlog")
28+
if err != nil {
29+
t.Fatalf("error creating tmp file:%v", err)
30+
}
31+
check := OOMChecker{kernLogPath: tmp.Name()}
32+
defer func() {
33+
e := os.Remove(check.kernLogPath)
34+
if e != nil {
35+
t.Errorf(e.Error())
36+
}
37+
}()
38+
//should be 600. But it fails in 600
39+
err = os.WriteFile(check.kernLogPath, []byte(testString), 777)
40+
if err != nil {
41+
t.Errorf("Create tmp file error:%v", err)
42+
}
43+
result, _ := check.Check(&base.CheckContext{
44+
Environment: environment,
45+
})
46+
if len(result) != 1 {
47+
t.Errorf("Get unexpected OOM result length %v", len(result))
48+
}
49+
checkErr := result[0].Error
50+
if checkErr != "progress:[3841 nginx] is OOM kill at time [Feb 22 16:15:02]. [rss:130344kB] [oom_score_adj:986]\n" {
51+
t.Errorf("Unexpected check result:\n %v \n %v", result[0].Description, checkErr)
52+
}
53+
}
54+
}

pkg/checkers/registry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,14 @@ import (
77
"github.com/Azure/kdebug/pkg/checkers/dns"
88
"github.com/Azure/kdebug/pkg/checkers/dummy"
99
kubeobjectsize "github.com/Azure/kdebug/pkg/checkers/kube/objectsize"
10+
"github.com/Azure/kdebug/pkg/checkers/oom"
1011
"github.com/Azure/kdebug/pkg/checkers/kube/pod"
1112
)
1213

1314
var allCheckers = map[string]Checker{
1415
"dummy": &dummy.DummyChecker{},
1516
"dns": dns.New(),
17+
"oom": oom.New(),
1618
"kubeobjectsize": kubeobjectsize.New(),
1719
"diskusage": diskusage.New(),
1820
"kubepod": pod.New(),

0 commit comments

Comments
 (0)