基于 Go 实现一个 Markdown 解析器

    
        标签：
        
            个人项目
        
            Go
        
    发布于：2020-12-06 21:59:01
    编辑于：2022-11-15 13:01:28
    浏览量：3670

概述

仓库链接：https://github.com/songquanpeng/md2html 。

简单说一下流程：

1. 把 Markdown 字符串分解为 Token 流。
2. 从 Token 流中构建出抽象语法树（AST）。
3. 遍历抽象语法树，生成 HTML 文档。

词法分析部分

核心代码：

// nextToken scans input from the current pos and returns two tokens at once:
// textToken holds the plain characters collected before a markup symbol was
// reached (its Value may be empty), and otherToken is the markup token that
// ended the scan. When pos reaches the end of input, otherToken is an EofToken.
//
// Block-level symbols (title, list, task, quote, code block, dividing line,
// leading whitespace) are only examined while no text has been collected and
// lastTokenType is NewlineToken or TabToken. Inline symbols are examined on
// every iteration.
func nextToken() (textToken, otherToken Token) {
    textToken.Type = TextToken
    for {
        if pos >= len(input) {
            otherToken.Type = EofToken
            return
        }
        c := input[pos]
        if len(textToken.Value) == 0 && (lastTokenType == NewlineToken || lastTokenType == TabToken) {
            switch c {
            case '#':
                n := countSymbol(c)
                otherToken.Type = TitleToken
                // The count returned by countSymbol is stored as a single rune in Value.
                otherToken.Value = append(otherToken.Value, rune(n))
                pos += n
                // Skip one separating space. The bounds check matters when the
                // run of '#' is the last thing in input.
                if pos < len(input) && input[pos] == ' ' {
                    pos++
                }
                return
            case '\t':
                otherToken.Type = TabToken
                pos++
                return
            case '\n':
                otherToken.Type = NewlineToken
                pos++
                return
            case '-':
                fallthrough
            case '+':
                fallthrough
            case '*':
                if isSpaceBehind() {
                    otherToken.Type = UnorderedListToken
                    // Consume the list symbol and the space after it.
                    pos += 2
                    yes, completed := isTaskSymbol()
                    if yes {
                        pos += 2
                        if isSpaceBehind() {
                            pos += 2
                            if completed {
                                otherToken.Type = CompletedTaskToken
                            } else {
                                otherToken.Type = UncompletedTaskToken
                            }
                            return
                        }
                        // Not followed by a space: undo the advance and report
                        // a plain unordered list item.
                        pos -= 2
                    }
                    return
                }
                // Consider if this is a dividing line: the same symbol three
                // times in a row.
                if nextIsSameTo(c) {
                    pos++
                    if nextIsSameTo(c) {
                        pos += 2
                        otherToken.Type = DividingLineToken
                        return
                    }
                    pos--
                }
            case '>':
                if isSpaceBehind() {
                    otherToken.Type = QuoteToken
                    pos += 2
                    return
                }
            case '`':
                // Three backticks in a row open a code block.
                if nextIsSameTo(c) {
                    pos++
                    if nextIsSameTo(c) {
                        pos += 2
                        otherToken.Type = CodeBlockToken
                        start, end := getCodeBlockStartEnd()
                        otherToken.Value = input[start:end]
                        // Continue three positions past end.
                        pos = end + 3
                        return
                    }
                    pos--
                }
            case '\r':
                fallthrough
            case ' ':
                n := countSymbol(c)
                if n >= 2 {
                    // A run of two or more is reported as a TabToken.
                    pos += n
                    otherToken.Type = TabToken
                    return
                }
                // A single leading blank is skipped without being recorded.
                pos++
            }
            // The single-character skip above can move pos to the end of
            // input; report EOF instead of indexing out of range below.
            if pos >= len(input) {
                otherToken.Type = EofToken
                return
            }
            if isNumDotSpace() {
                otherToken.Type = OrderedListToken
                return
            }
        }
        // Update c because pos may have been updated due to a blank symbol.
        c = input[pos]

        // Now we have to return the text token before the below token.
        switch c {
        case '*':
            if nextIsSameTo(c) {
                pos += 2
                otherToken.Type = DoubleStarToken
                otherToken.Value = []rune("**")
            } else {
                pos += 1
                otherToken.Type = SingleStarToken
                otherToken.Value = []rune("*")
            }
            return
        case '_':
            if nextIsSameTo(c) {
                pos += 2
                otherToken.Type = DoubleUnderscoreToken
                otherToken.Value = []rune("__")
            } else {
                pos += 1
                otherToken.Type = SingleUnderscoreToken
                otherToken.Value = []rune("_")
            }
            return
        case '~':
            // A single '~' is ordinary text; only "~~" is a token.
            if nextIsSameTo(c) {
                pos += 2
                otherToken.Type = DoubleTildeToken
                otherToken.Value = []rune("~~")
                return
            }
        case '`':
            otherToken.Type = SingleBacktickToken
            otherToken.Value = []rune("`")
            pos++
            return
        case '!':
            if nextIsSameTo('[') {
                pos += 2
                otherToken.Type = ImageHeadToken
                return
            }
        case '[':
            pos++
            otherToken.Type = LinkHeadToken
            return
        case ']':
            if nextIsSameTo('(') {
                pos += 2
                // Look for the closing ')' on the same line; everything
                // between the parentheses becomes the token value.
                for i := pos; i < len(input) && input[i] != '\n'; i++ {
                    if input[i] == ')' {
                        otherToken.Type = LinkBodyToken
                        otherToken.Value = input[pos:i]
                        pos = i + 1
                        return
                    }
                }
                // No closing ')' found: rewind and treat ']' as text.
                pos -= 2
            }
        case '\n':
            otherToken.Type = NewlineToken
            pos++
            return
        case '\t':
            otherToken.Type = TabToken
            pos++
            return
        }
        // Ordinary character: collect it as text, dropping carriage returns.
        pos++
        if c != '\r' {
            textToken.Value = append(textToken.Value, c)
        }
    }
}

语法分析部分

核心代码：

// parseSectionList builds the root node by repeatedly peeking at the next
// token and dispatching to the parser for that kind of section. Newline and
// tab tokens are consumed here: a newline resets tabCounter and a tab
// increments it. Parsing stops at EofToken.
func parseSectionList() (root *Node) {
    root = &Node{}
    for {
        // Peek: read the token, then hand it back so that the section parser
        // chosen below sees it again.
        next := getToken()
        restoreToken()

        var section *Node
        switch next.Type {
        case lexer.EofToken:
            return root
        case lexer.NewlineToken:
            _ = getToken()
            tabCounter = 0
            continue
        case lexer.TabToken:
            tabCounter++
            _ = getToken()
            continue
        case lexer.TitleToken:
            section = parseTitle()
        case lexer.DividingLineToken:
            section = parseDividingLine()
        case lexer.CodeBlockToken:
            section = parseCodeBlock()
        case lexer.QuoteToken:
            section = parseQuote()
        case lexer.UncompletedTaskToken,
            lexer.CompletedTaskToken,
            lexer.UnorderedListToken,
            lexer.OrderedListToken:
            // Task items and both list kinds share one parser.
            section = parseList()
        default:
            section = parseContent(false)
        }
        root.Children = append(root.Children, section)
    }
}

HTML 代码生成部分

这部分最好玩也最简单，主要代码：

// Convert parses markdown and returns the generated HTML. When fullPage is
// true the result is embedded into HtmlTemplate together with Style; otherwise
// only the article markup is returned. If the MODE environment variable is
// "debug", the parsed tree is printed via parser.PrintAST before rendering.
func Convert(markdown string, fullPage bool) (html string) {
    tree := parser.Parse(markdown)
    if mode := os.Getenv("MODE"); mode == "debug" {
        parser.PrintAST(tree)
    }

    article := processArticleNode(tree)
    if !fullPage {
        return article
    }
    return fmt.Sprintf(HtmlTemplate, Style, article)
}

// processArticleNode renders each child of node in order and wraps the
// concatenated output in a <div class='article'> element. A child whose Type
// matches none of the cases below contributes nothing to the output.
func processArticleNode(node *parser.Node) (html string) {
    var body string
    for i := range node.Children {
        section := node.Children[i]
        switch section.Type {
        case parser.TitleNode:
            body += processTitleNode(section)
        case parser.DividingLineNode:
            body += processDividingLineNode(section)
        case parser.ContentNode:
            // Top-level content is wrapped in its own <div>.
            body += fmt.Sprintf("<div>%s</div>\n", processContentNode(section))
        case parser.ListNode:
            body += processListNode(section)
        case parser.QuoteNode:
            body += processQuoteNode(section)
        case parser.CodeBlockNode:
            body += processCodeBlockNode(section)
        }
    }
    return fmt.Sprintf("<div class='article'>\n%s\n</div>", body)
}

// processTitleNode renders a title node as an <hN> element followed by a
// newline. N is the integer value of the first element of node.Value, and the
// element body is the rendered first child of node.
func processTitleNode(node *parser.Node) (html string) {
    inner := processContentNode(node.Children[0])
    depth := int(node.Value[0])
    // Explicit argument indexes let the opening and closing tag share depth.
    return fmt.Sprintf("<h%[1]d>%[2]s</h%[1]d>\n", depth, inner)
}

Links: md2html