diff --git a/pkg/yaml/lexer/lexer.go b/pkg/yaml/lexer/lexer.go
new file mode 100644
index 0000000..ae7998a
--- /dev/null
+++ b/pkg/yaml/lexer/lexer.go
@@ -0,0 +1,302 @@
+package lexer
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"io"
+	"strings"
+
+	"github.com/screw-coding/yaml/token"
+)
+
+// Tokenizer is the lexical scanner for YAML input.
+type Tokenizer struct {
+	reader      *bufio.Reader
+	currentLine []rune // rune instead of byte, so Unicode input is handled correctly
+	line        int    // number of lines read from the document so far
+
+	position     int  // current position within the line (points at the current character)
+	readPosition int  // current read position within the line (one past the current character)
+	ch           rune // character currently being examined
+
+	tokens []*token.Token // all tokens produced so far
+}
+
+func New(reader io.Reader) *Tokenizer {
+	t := &Tokenizer{reader: bufio.NewReader(reader)}
+	if err := t.readLine(); err != nil {
+		panic(fmt.Sprintf("lexer: failed to read the first line: %v", err))
+	}
+	return t
+}
+
+// readLine reads the next non-blank line and strips the trailing line break.
+func (t *Tokenizer) readLine() error {
+	for {
+		line, err := t.reader.ReadBytes('\n')
+		// a final line without a trailing '\n' arrives together with io.EOF;
+		// keep its content instead of dropping it
+		if err != nil && (err != io.EOF || len(line) == 0) {
+			return err
+		}
+		line = bytes.TrimRight(line, "\r\n")
+		t.line++
+		t.currentLine = []rune(string(line))
+		if !t.isBlankLine() {
+			break
+		}
+		if err == io.EOF {
+			return io.EOF
+		}
+	}
+
+	// reset the positions and load the first character of the new line
+	t.position = 0
+	t.readPosition = 0
+	t.readRune()
+	return nil
+}
+
+// readRune advances to the next character.
+func (t *Tokenizer) readRune() {
+	if t.readPosition >= len(t.currentLine) {
+		t.ch = 0 // NUL, meaning "nothing read yet" or "end of line"
+	} else {
+		t.ch = t.currentLine[t.readPosition]
+	}
+	t.position = t.readPosition
+	t.readPosition++
+}
+
+// peekRune returns the next character without consuming it.
+func (t *Tokenizer) peekRune() rune {
+	if t.readPosition >= len(t.currentLine) {
+		return 0
+	}
+	return t.currentLine[t.readPosition]
+}
+
+func (t *Tokenizer) getPosition() *token.Position {
+	// count the leading space characters (the indentation)
+	pos := 0
+	for pos < len(t.currentLine) && t.currentLine[pos] == ' ' {
+		pos++
+	}
+
+	return &token.Position{
+		Line:      t.line,
+		Offset:    t.position,
+		IndentNum: pos,
+	}
+}
+
+func (t *Tokenizer) nextToken() (*token.Token, error) {
+	var tok *token.Token
+
+	// the current line is exhausted: advance to the next one
+	if t.ch == 0 {
+		err := t.readLine()
+		if err != nil {
+			if err == io.EOF {
+				// end of input: close the document
+				tok = token.DocumentEnd(nil)
+				t.appendToken(tok)
+				return tok, err
+			}
+			return nil, err
+		}
+	}
+
+	currentLineString := string(t.currentLine)
+
+	// a leading "---" is either a document start or a document end,
+	// depending on what came before it
+	// TODO: the end of input also needs a DocumentEnd marker
+	if strings.HasPrefix(currentLineString, "---") {
+		if t.line == 1 {
+			// the first line always starts a document
+			tok = token.DocumentStart(t.getPosition())
+			t.skipRunes(3)
+		} else if t.tokens[len(t.tokens)-1].Type == token.DocumentEndType {
+			tok = token.DocumentStart(t.getPosition())
+			t.skipRunes(3)
+		} else {
+			tok = token.DocumentEnd(t.getPosition())
+		}
+		t.appendToken(tok)
+		return tok, nil
+	} else if len(t.tokens) == 0 {
+		// the first line is not "---" and nothing has been emitted yet:
+		// treat it as an implicit document start
+		tok = token.DocumentStart(t.getPosition())
+		t.appendToken(tok)
+		return tok, nil
+	}
+
+	switch t.ch {
+	case '#':
+		// consume everything from '#' to the end of the line
+		pos := t.getPosition()
+		tok = token.Comment(t.readUntilLineBreak(), pos)
+	case ':':
+		pos := t.getPosition()
+		tok = token.MappingValue(pos)
+		t.readRune()
+	default:
+		// a line containing ':' but no "- " is a mapping line, so the text
+		// before the ':' must be a key
+		if strings.ContainsRune(currentLineString, ':') && !strings.Contains(currentLineString, "- ") {
+			if t.position == 0 {
+				// at the start of the line: skip the indentation, then read
+				// the key up to the ':'
+				t.skipBlankUntilLetter()
+				position := t.getPosition()
+				mappingKey := t.readUntilMappingValueCharacter()
+				tok = token.MappingKey(mappingKey, position)
+			} else {
+				// past the ':': skip to the first non-space character and
+				// read the scalar value
+				t.skipBlankUntilNoneBlank()
+				position := t.getPosition()
+				tok = token.ScalarValue(t.readUntilBlank(), position)
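+				// skip the trailing blanks so the next call starts at the
+				// following token (for example an inline '#' comment)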
+				t.skipBlankUntilNoneBlank()
+			}
+		}
+		// a line containing "- " but no ':' is a sequence entry
+		if strings.Contains(currentLineString, "- ") && !strings.ContainsRune(currentLineString, ':') {
+			if t.position == 0 {
+				t.skipBlankUntilDash()
+				position := t.getPosition()
+				tok = token.SequenceEntry(position)
+				t.skipRunes(2)
+			} else {
+				// skip to the first non-space character and read the scalar
+				t.skipBlankUntilNoneBlank()
+				position := t.getPosition()
+				tok = token.ScalarValue(t.readUntilBlank(), position)
+				t.skipBlankUntilNoneBlank()
+			}
+		}
+
+		// the line contains both "- " and ':' (for example "- name: a1")
+		if strings.Contains(currentLineString, "- ") && strings.ContainsRune(currentLineString, ':') {
+			if t.position == 0 {
+				t.skipBlankUntilDash()
+				position := t.getPosition()
+				tok = token.SequenceEntry(position)
+				t.skipRunes(2)
+			} else if t.tokens[len(t.tokens)-1].Type == token.SequenceEntryType {
+				// right after the "- ": read the key up to the ':'
+				t.skipBlankUntilLetter()
+				position := t.getPosition()
+				mappingKey := t.readUntilMappingValueCharacter()
+				tok = token.MappingKey(mappingKey, position)
+			} else {
+				// otherwise this is a scalar value
+				t.skipBlankUntilNoneBlank()
+				position := t.getPosition()
+				tok = token.ScalarValue(t.readUntilBlank(), position)
+				t.skipBlankUntilNoneBlank()
+			}
+		}
+	}
+
+	if tok == nil {
+		tok = token.Unknown(t.getPosition())
+	}
+
+	t.appendToken(tok)
+	return tok, nil
+}
+
+// skipWhiteLine skips white space characters.
+func (t *Tokenizer) skipWhiteLine() {
+	for t.ch == rune(token.SpaceCharacter) ||
+		t.ch == rune(token.TabCharacter) ||
+		t.ch == rune(token.LineBreakCharacter) ||
+		t.ch == rune(token.ReturnCharacter) {
+		t.readRune()
+	}
+}
+
+// skipRunes consumes the next num characters.
+func (t *Tokenizer) skipRunes(num int) {
+	for i := 0; i < num; i++ {
+		t.readRune()
+	}
+}
+
+func (t *Tokenizer) appendToken(tok *token.Token) {
+	t.tokens = append(t.tokens, tok)
+}
+
+func (t *Tokenizer) isBlankLine() bool {
+	for _, r := range t.currentLine {
+		if r != rune(token.SpaceCharacter) &&
+			r != rune(token.TabCharacter) &&
+			r != rune(token.LineBreakCharacter) &&
+			r != rune(token.ReturnCharacter) {
+			return false
+		}
+	}
+	return true
+}
+
+// readUntilLineBreak reads from the current position to the end of the line.
+func (t *Tokenizer) readUntilLineBreak() string {
+	position := t.position
+	for t.ch != 0 {
+		t.readRune()
+	}
+	return string(t.currentLine[position:t.position])
+}
+
+func (t *Tokenizer) readUntilMappingValueCharacter() string {
+	position := t.position
+	for t.ch != rune(token.MappingValueCharacter) && t.ch != 0 {
+		t.readRune()
+	}
+	return strings.TrimSpace(string(t.currentLine[position:t.position]))
+}
+
+func (t *Tokenizer) readUntilBlank() string {
+	position := t.position
+	for t.ch != rune(token.SpaceCharacter) && t.ch != 0 {
+		t.readRune()
+	}
+	return strings.TrimSpace(string(t.currentLine[position:t.position]))
+}
+
+// skipBlankUntilLetter skips space characters until a non-space
+// (normally a letter) is reached.
+func (t *Tokenizer) skipBlankUntilLetter() {
+	for t.ch == ' ' {
+		t.readRune()
+	}
+}
+
+// skipBlankUntilDash skips ahead until the current character is a dash.
+func (t *Tokenizer) skipBlankUntilDash() {
+	for t.ch != '-' && t.ch != 0 {
+		t.readRune()
+	}
+}
+
+// skipBlankUntilNoneBlank skips space characters until a non-space
+// character is reached.
+func (t *Tokenizer) skipBlankUntilNoneBlank() {
+	for t.ch == ' ' {
+		t.readRune()
+	}
+}
+
+func isLetter(ch rune) bool {
+	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z'
+}
diff --git a/pkg/yaml/lexer/lexer_test.go b/pkg/yaml/lexer/lexer_test.go
new file mode 100644
index 0000000..0a8fd8b
--- /dev/null
+++ b/pkg/yaml/lexer/lexer_test.go
@@ -0,0 +1,141 @@
+package lexer
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/screw-coding/yaml/token"
+)
+
+func TestNextToken_comment_and_multi_documents(t *testing.T) {
+	input := `---
+# this is comment
+
+# comment2
+---
+# comment3
+---
+# comment3
+---
+`
+	tests := []struct {
+		expectedType token.TokenType
+	}{
+		{token.DocumentStartType},
+		{token.CommentType},
+		{token.CommentType},
+		{token.DocumentEndType},
+		{token.DocumentStartType},
+		{token.CommentType},
+		{token.DocumentEndType},
+		{token.DocumentStartType},
+		{token.CommentType},
+		{token.DocumentEndType},
+	}
+
+	l := New(strings.NewReader(input))
+	for i, test := range tests {
+		tok, _ := l.nextToken()
+		if test.expectedType != tok.Type {
+			t.Fatalf("tests[%d] - tokentype wrong: expected=%q got=%q", i, test.expectedType, tok.Type)
+		}
+	}
+}
+
+func TestNextToken_mapping(t *testing.T) {
+	input := `
+a: b # comment a = b
+c: d
+e:
+  f:
+    g: xx # comment xxxx
+
+`
+	tests := []struct {
+		expectedType token.TokenType
+	}{
+		{token.DocumentStartType},
+		{token.MappingKeyType},
+		{token.MappingValueType},
+		{token.ScalarType},
+		{token.CommentType},
+		{token.MappingKeyType},
+		{token.MappingValueType},
+		{token.ScalarType},
+		{token.MappingKeyType},   // e
+		{token.MappingValueType}, // :
+		{token.MappingKeyType},   // f
+		{token.MappingValueType}, // :
+		{token.MappingKeyType},   // g
+		{token.MappingValueType}, // :
+		{token.ScalarType},       // xx
+		{token.CommentType},
+		{token.DocumentEndType},
+	}
+
+	l := New(strings.NewReader(input))
+	for i, test := range tests {
+		tok, _ := l.nextToken()
+		if test.expectedType != tok.Type {
+			t.Fatalf("tests[%d] - tokentype wrong: expected=%q got=%q", i, test.expectedType, tok.Type)
+		}
+	}
+}
+
+func TestNextToken_sequence(t *testing.T) {
+	input := `---
+a:
+  - a1
+  - a2
+b:
+  - name: a1
+    age: 18
+  - name: a2
+    age: 20
+`
+	tests := []struct {
+		expectedType token.TokenType
+	}{
+		{token.DocumentStartType},
+		{token.MappingKeyType},
+		{token.MappingValueType},
+
+		{token.SequenceEntryType}, // '-'
+		{token.ScalarType},        // a1
+		{token.SequenceEntryType},
+		{token.ScalarType},
+
+		{token.MappingKeyType},   // b
+		{token.MappingValueType}, // :
+
+		{token.SequenceEntryType}, // '-'
+
+		{token.MappingKeyType},   // name
+		{token.MappingValueType}, // :
+		{token.ScalarType},       // a1
+
+		{token.MappingKeyType},   // age
+		{token.MappingValueType}, // :
+		{token.ScalarType},       // 18
+
+		{token.SequenceEntryType}, // '-'
+
+		{token.MappingKeyType},   // name
+		{token.MappingValueType}, // :
+		{token.ScalarType},       // a2
+
+		{token.MappingKeyType},   // age
+		{token.MappingValueType}, // :
+		{token.ScalarType},       // 20
+
+		{token.DocumentEndType},
+	}
+
+	l := New(strings.NewReader(input))
+	for i, test := range tests {
+		tok, _ := l.nextToken()
+		if test.expectedType != tok.Type {
+			t.Fatalf("tests[%d] - tokentype wrong: expected=%q got=%q", i, test.expectedType, tok.Type)
+		}
+	}
+}
diff --git a/pkg/yaml/token/token.go b/pkg/yaml/token/token.go
new file mode 100644
index 0000000..4b1ad30
--- /dev/null
+++ b/pkg/yaml/token/token.go
@@ -0,0 +1,181 @@
+package token
+
+type Character rune
+
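+// The values below are the YAML indicator characters, plus the white space
+// and line break characters the lexer needs to recognize.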
+const (
+	// SequenceEntryCharacter character for sequence entry
+	SequenceEntryCharacter Character = '-'
+	// MappingKeyCharacter character for mapping key
+	MappingKeyCharacter Character = '?'
+	// MappingValueCharacter character for mapping value
+	MappingValueCharacter Character = ':'
+	// CollectEntryCharacter character for collect entry
+	CollectEntryCharacter Character = ','
+	// SequenceStartCharacter character for sequence start
+	SequenceStartCharacter Character = '['
+	// SequenceEndCharacter character for sequence end
+	SequenceEndCharacter Character = ']'
+	// MappingStartCharacter character for mapping start
+	MappingStartCharacter Character = '{'
+	// MappingEndCharacter character for mapping end
+	MappingEndCharacter Character = '}'
+	// CommentCharacter character for comment
+	CommentCharacter Character = '#'
+	// AnchorCharacter character for anchor
+	AnchorCharacter Character = '&'
+	// AliasCharacter character for alias
+	AliasCharacter Character = '*'
+	// TagCharacter character for tag
+	TagCharacter Character = '!'
+	// LiteralCharacter character for literal
+	LiteralCharacter Character = '|'
+	// FoldedCharacter character for folded
+	FoldedCharacter Character = '>'
+	// SingleQuoteCharacter character for single quote
+	SingleQuoteCharacter Character = '\''
+	// DoubleQuoteCharacter character for double quote
+	DoubleQuoteCharacter Character = '"'
+	// DirectiveCharacter character for directive
+	DirectiveCharacter Character = '%'
+	// SpaceCharacter character for space
+	SpaceCharacter Character = ' '
+	// LineBreakCharacter character for line break
+	LineBreakCharacter Character = '\n'
+	// TabCharacter character for tab
+	TabCharacter Character = '\t'
+	// ReturnCharacter character for carriage return
+	ReturnCharacter Character = '\r'
+)
+
+type TokenType int
+
+const (
+	// UnknownType reserved for invalid tokens
+	UnknownType TokenType = iota
+	// DocumentStartType type for DocumentStart token
+	DocumentStartType
+	// DocumentEndType type for DocumentEnd token
+	DocumentEndType
+	// SequenceEntryType type for SequenceEntry token
+	SequenceEntryType
+	// MappingKeyType type for MappingKey token
+	MappingKeyType
+	// MappingValueType type for MappingValue token
+	MappingValueType
+	// MergeKeyType type for MergeKey token
+	MergeKeyType
+	// CollectEntryType type for CollectEntry token
+	CollectEntryType
+	// SequenceStartType type for SequenceStart token
+	SequenceStartType
+	// SequenceEndType type for SequenceEnd token
+	SequenceEndType
+	// MappingStartType type for MappingStart token
+	MappingStartType
+	// MappingEndType type for MappingEnd token
+	MappingEndType
+	// CommentType type for Comment token
+	CommentType
+	// ScalarType type for Scalar (literal value) token
+	ScalarType
+)
+
+func (t TokenType) String() string {
+	switch t {
+	case UnknownType:
+		return "UnknownType"
+	case DocumentStartType:
+		return "DocumentStartType"
+	case DocumentEndType:
+		return "DocumentEndType"
+	case SequenceEntryType:
+		return "SequenceEntryType"
+	case MappingKeyType:
+		return "MappingKeyType"
+	case MappingValueType:
+		return "MappingValueType"
+	case MergeKeyType:
+		return "MergeKeyType"
+	case CollectEntryType:
+		return "CollectEntryType"
+	case SequenceStartType:
+		return "SequenceStartType"
+	case SequenceEndType:
+		return "SequenceEndType"
+	case MappingStartType:
+		return "MappingStartType"
+	case MappingEndType:
+		return "MappingEndType"
+	case CommentType:
+		return "CommentType"
+	case ScalarType:
+		return "ScalarType"
+	}
+	return ""
+}
+
+// Position type for position in YAML document
+type Position struct {
+	Line      int // line number (1-based)
+	Offset    int // character offset within the current line
+	IndentNum int // indentation, i.e. the number of leading space characters
+}
+
+// Token type for token
+type Token struct {
+	Type     TokenType
+	Value    string // literal text of the token
+	Position *Position
+}
+
+// MappingKey create token for MappingKey
+func MappingKey(value string, pos *Position) *Token {
+	return &Token{
+		Type:     MappingKeyType,
+		Value:    value,
+		Position: pos,
+	}
+}
+
+// MappingValue create token for MappingValue
+func MappingValue(pos *Position) *Token {
+	return &Token{
+		Type:     MappingValueType,
+		Value:    string(MappingValueCharacter),
+		Position: pos,
+	}
+}
+
+// ScalarValue create token for Scalar
+func ScalarValue(source string, pos *Position) *Token {
+	return &Token{
+		Type:     ScalarType,
+		Value:    source,
+		Position: pos,
+	}
+}
+
+// SequenceEntry create token for SequenceEntry
+func SequenceEntry(pos *Position) *Token {
+	return &Token{
+		Type:     SequenceEntryType,
+		Value:    "- ",
+		Position: pos,
+	}
+}
+
+// DocumentStart create token for DocumentStart
+func DocumentStart(pos *Position) *Token {
+	return &Token{
+		Type:     DocumentStartType,
+		Value:    "---",
+		Position: pos,
+	}
+}
+
+// DocumentEnd create token for DocumentEnd
+func DocumentEnd(pos *Position) *Token {
+	return &Token{
+		Type:     DocumentEndType,
+		Value:    "",
+		Position: pos,
+	}
+}
+
+// Comment create token for Comment
+func Comment(value string, pos *Position) *Token {
+	return &Token{
+		Type:     CommentType,
+		Value:    value,
+		Position: pos,
+	}
+}
+
+// Unknown create token for Unknown
+func Unknown(pos *Position) *Token {
+	return &Token{
+		Type:     UnknownType,
+		Value:    "",
+		Position: pos,
+	}
+}
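
Usage sketch (illustrative, not part of the changeset): the tests drive the lexer by calling the package-private nextToken in a loop, and a hypothetical helper that drains a whole input could look like the code below. The tokenize name is an assumption for illustration; it relies on nextToken appending the closing DocumentEnd before it reports io.EOF.

package lexer

import (
	"io"

	"github.com/screw-coding/yaml/token"
)

// tokenize is a hypothetical helper (not in the diff): it drains the
// Tokenizer until io.EOF and returns every token produced.
func tokenize(r io.Reader) ([]*token.Token, error) {
	t := New(r)
	for {
		if _, err := t.nextToken(); err != nil {
			if err == io.EOF {
				// nextToken has already appended the closing DocumentEnd
				return t.tokens, nil
			}
			return nil, err
		}
	}
}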