Skip to content

Commit

Permalink
add oom checker (#23)
Browse files Browse the repository at this point in the history
* add oom checker

* add some future tasks;add oom_score_adj;add help link

* add another oom log pattern

Co-authored-by: Binjie Qian <[email protected]>
  • Loading branch information
bjqian and bjqian authored Mar 18, 2022
1 parent b99282e commit 7c45843
Show file tree
Hide file tree
Showing 3 changed files with 170 additions and 0 deletions.
114 changes: 114 additions & 0 deletions pkg/checkers/oom/oom.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package oom

import (
"bufio"
"fmt"
"github.com/Azure/kdebug/pkg/base"
"github.com/Azure/kdebug/pkg/env"
"os"
"regexp"
"strings"
)

const (
	// logPath is the kernel log file that New configures the checker to read.
	logPath = "/var/log/kern.log"
	// cgroupOOMKeyStr is one of the two substrings that mark a kernel log
	// line as an OOM report worth parsing.
	cgroupOOMKeyStr = "Memory cgroup out of memory"
	// outOfMemoryKey is the other substring that marks a kernel log line as
	// an OOM report worth parsing.
	outOfMemoryKey = "Out of memory"
)

// helpLink lists the reference URLs attached to the check result when at
// least one OOM kill is found in the log.
var helpLink = []string{
	"https://www.kernel.org/doc/gorman/html/understand/understand016.html",
	"https://stackoverflow.com/questions/18845857/what-does-anon-rss-and-total-vm-mean",
	"https://medium.com/tailwinds-navigator/kubernetes-tip-how-does-oomkilled-work-ba71b135993b",
}

// oomRegex decodes one OOM kill line from the kernel log. Its five capture
// groups are, in order: the leading timestamp, the process id, the process
// name, the anon-rss value and the oom_score_adj value.
var oomRegex = regexp.MustCompile(`^(.*:.{2}:.{2}) .* process (.*) \((.*)\) .* anon-rss:(.*), file-rss.* oom_score_adj:(.*)`)

// OOMChecker looks for out-of-memory kill reports in a kernel log file.
type OOMChecker struct {
	// kernLogPath is the path of the log file opened by getAndParseOOMLog.
	kernLogPath string
}

// Name returns the identifier this checker puts into the Checker field of
// its results.
func (c *OOMChecker) Name() string {
	const checkerName = "OOM"
	return checkerName
}

// New returns an OOMChecker that reads the default kernel log (logPath).
func New() *OOMChecker {
	// TODO: support other log paths.
	checker := new(OOMChecker)
	checker.kernLogPath = logPath
	return checker
}

// Check runs the OOM check and returns its single result wrapped in a slice.
// When the underlying check fails, the error is returned and the slice is nil.
func (c *OOMChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) {
	res, err := c.checkOOM(ctx)
	if err != nil {
		return nil, err
	}
	return []*base.CheckResult{res}, nil
}

// checkOOM builds the one result this checker reports.
//
// If envCheck rejects the environment, the result only carries a "skipped"
// description. Otherwise the kernel log is scanned: every OOM entry found is
// joined into the result's Error field together with the help links, and an
// empty scan yields a plain "nothing found" description. A failure to read
// the log is returned as an error with a nil result.
func (c *OOMChecker) checkOOM(ctx *base.CheckContext) (*base.CheckResult, error) {
	report := &base.CheckResult{Checker: c.Name()}

	// Unsupported environments are reported as skipped, not as failures.
	if !envCheck(ctx.Environment) {
		report.Description = "Skip oom check in non-linux os"
		return report, nil
	}

	kills, err := c.getAndParseOOMLog()
	if err != nil {
		return nil, err
	}

	if len(kills) == 0 {
		report.Description = "No OOM found in recent kernlog."
		return report, nil
	}

	report.Error = strings.Join(kills, "\n")
	report.Description = "Detect process oom killed"
	report.HelpLinks = helpLink
	return report, nil
}
// getAndParseOOMLog scans the kernel log at c.kernLogPath line by line and
// returns one entry per line that mentions an OOM kill.
//
// A line is selected when it contains cgroupOOMKeyStr or outOfMemoryKey.
// Selected lines are decoded with parseOOMContent. A selected line that
// cannot be decoded is reported verbatim (with a trailing newline, like the
// decoded entries) instead of aborting the scan, so a single unfamiliar
// message format (for example the "Kill process ... or sacrifice child" form
// printed by older kernels) does not hide the OOM kills found elsewhere in
// the log.
//
// An error is returned only when the log cannot be opened or read. The
// returned slice is nil when no line was selected.
func (c *OOMChecker) getAndParseOOMLog() ([]string, error) {
	file, err := os.Open(c.kernLogPath)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	var oomInfos []string
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := scanner.Text()
		// TODO: more sophisticated OOM context pattern match. See
		// https://github.com/torvalds/linux/blob/551acdc3c3d2b6bc97f11e31dcf960bc36343bfc/mm/oom_kill.c#L1120
		// https://github.com/torvalds/linux/blob/551acdc3c3d2b6bc97f11e31dcf960bc36343bfc/mm/oom_kill.c#L895
		if !strings.Contains(line, cgroupOOMKeyStr) && !strings.Contains(line, outOfMemoryKey) {
			continue
		}

		oomInfo, err := parseOOMContent(line)
		if err != nil {
			// Keep the evidence that an OOM happened even though the
			// details could not be extracted.
			oomInfo = line + "\n"
		}
		oomInfos = append(oomInfos, oomInfo)
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("reading kernel log %q: %w", c.kernLogPath, err)
	}
	return oomInfos, nil
}

// parseOOMContent decodes one OOM kill line from the kernel log with oomRegex
// and renders it as a one-line summary containing the process id and name,
// the timestamp, the anon-rss value and the oom_score_adj value.
//
// It returns an error when the line does not match oomRegex. The wording of
// the summary (including its trailing newline) is pinned by the unit test and
// is therefore left untouched.
func parseOOMContent(content string) (string, error) {
	match := oomRegex.FindStringSubmatch(content)
	// A successful match always yields the full match plus five groups.
	if len(match) != 6 {
		return "", fmt.Errorf("can't parse oom content: %q", content)
	}
	return fmt.Sprintf("progress:[%s %s] is OOM kill at time [%s]. [rss:%s] [oom_score_adj:%s]\n", match[2], match[3], match[1], match[4], match[5]), nil
}

// envCheck reports whether the OOM check can run in the given environment,
// which currently means the environment carries the "ubuntu" flag.
func envCheck(environment env.Environment) bool {
	// TODO: support other operating systems.
	const requiredFlag = "ubuntu"
	return environment.HasFlag(requiredFlag)
}
54 changes: 54 additions & 0 deletions pkg/checkers/oom/oom_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package oom

import (
"fmt"
"github.com/Azure/kdebug/pkg/base"
"github.com/Azure/kdebug/pkg/env"
"io/ioutil"
"os"
"testing"
)

// testStrings holds sample kernel log lines, one using the memory-cgroup
// wording and one using the plain "Out of memory" wording. Both describe the
// same kill, so the test expects the same summary for each.
var testStrings = []string{
	"Feb 22 16:15:02 k8s-ingress-11186066-z1-vmss0000B3 kernel: [989751.247878] Memory cgroup out of memory: Killed process 3841 (nginx) total-vm:240652kB, anon-rss:130344kB, file-rss:5212kB, shmem-rss:208kB, UID:101 pgtables:332kB oom_score_adj:986\n",
	"Feb 22 16:15:02 k8s-ingress-11186066-z1-vmss0000B3 kernel: [989751.247878] Out of memory: Killed process 3841 (nginx) total-vm:240652kB, anon-rss:130344kB, file-rss:5212kB, shmem-rss:208kB, UID:101 pgtables:332kB oom_score_adj:986\n",
}

// TestCheckOOMLogWhenOOM writes each sample kernel log line to its own
// temporary file, runs the checker against that file and verifies that
// exactly one result carrying the expected OOM summary is returned.
func TestCheckOOMLogWhenOOM(t *testing.T) {
	environment := &env.StaticEnvironment{
		Flags: []string{"ubuntu"},
	}
	// NOTE(review): this guard inspects the real host environment while the
	// checker below receives the static one; confirm that skipping on
	// non-ubuntu hosts is intended.
	if !envCheck(env.GetEnvironment()) {
		t.Skip("skip oom test")
	}

	const want = "progress:[3841 nginx] is OOM kill at time [Feb 22 16:15:02]. [rss:130344kB] [oom_score_adj:986]\n"

	for i, testString := range testStrings {
		testString := testString
		// A subtest per sample makes the deferred cleanup run at the end
		// of each iteration instead of piling up until the test returns.
		t.Run(fmt.Sprintf("sample-%d", i), func(t *testing.T) {
			tmp, err := ioutil.TempFile("", "kernlog")
			if err != nil {
				t.Fatalf("error creating tmp file:%v", err)
			}
			check := OOMChecker{kernLogPath: tmp.Name()}
			defer func() {
				if e := os.Remove(check.kernLogPath); e != nil {
					t.Errorf("remove tmp file: %v", e)
				}
			}()
			// The content is written by path below, so release the
			// handle returned by TempFile right away.
			if err := tmp.Close(); err != nil {
				t.Fatalf("error closing tmp file:%v", err)
			}

			// File modes are octal: decimal literals such as 600 or 777
			// set unrelated permission bits.
			if err := os.WriteFile(check.kernLogPath, []byte(testString), 0600); err != nil {
				t.Fatalf("Create tmp file error:%v", err)
			}

			result, err := check.Check(&base.CheckContext{
				Environment: environment,
			})
			if err != nil {
				t.Fatalf("Check returned error: %v", err)
			}
			// Fatal, because indexing result below would panic otherwise.
			if len(result) != 1 {
				t.Fatalf("Get unexpected OOM result length %v", len(result))
			}
			if checkErr := result[0].Error; checkErr != want {
				t.Errorf("Unexpected check result:\n %v \n %v", result[0].Description, checkErr)
			}
		})
	}
}
2 changes: 2 additions & 0 deletions pkg/checkers/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ import (
"github.com/Azure/kdebug/pkg/checkers/dns"
"github.com/Azure/kdebug/pkg/checkers/dummy"
kubeobjectsize "github.com/Azure/kdebug/pkg/checkers/kube/objectsize"
"github.com/Azure/kdebug/pkg/checkers/oom"
"github.com/Azure/kdebug/pkg/checkers/kube/pod"
)

var allCheckers = map[string]Checker{
"dummy": &dummy.DummyChecker{},
"dns": dns.New(),
"oom": oom.New(),
"kubeobjectsize": kubeobjectsize.New(),
"diskusage": diskusage.New(),
"kubepod": pod.New(),
Expand Down

0 comments on commit 7c45843

Please sign in to comment.