From c0303d92260fc053770cd5f1c02ff7aabe25d60d Mon Sep 17 00:00:00 2001 From: Stephen Young Date: Sun, 22 Feb 2026 15:31:42 -0500 Subject: [PATCH] no more regexp converter --- converter.go | 5 - errors.go | 7 ++ regexp.go | 318 -------------------------------------------------- setup_test.go | 27 +---- textplain.go | 21 +--- 5 files changed, 16 insertions(+), 362 deletions(-) delete mode 100644 converter.go create mode 100644 errors.go delete mode 100644 regexp.go diff --git a/converter.go b/converter.go deleted file mode 100644 index fdd6224..0000000 --- a/converter.go +++ /dev/null @@ -1,5 +0,0 @@ -package textplain - -type Converter interface { - Convert(string, int) (string, error) -} diff --git a/errors.go b/errors.go new file mode 100644 index 0000000..b4ec036 --- /dev/null +++ b/errors.go @@ -0,0 +1,7 @@ +package textplain + +import "errors" + +var ( + ErrBodyNotFound = errors.New("could not find a `body` element in your html document") +) diff --git a/regexp.go b/regexp.go deleted file mode 100644 index bb0da3d..0000000 --- a/regexp.go +++ /dev/null @@ -1,318 +0,0 @@ -package textplain - -import ( - "bytes" - "fmt" - "regexp" - "strconv" - "strings" - - "golang.org/x/net/html" - "golang.org/x/net/html/atom" -) - -type RegexpConverter struct { - ignoredHTML *regexp.Regexp - comments *regexp.Regexp - imgAltDoubleQuotes submatchReplacer - imgAltSingleQuotes submatchReplacer - links submatchReplacer - headerClose submatchReplacer - headerBlockBr *regexp.Regexp - headerBlockTags *regexp.Regexp - headerBlock submatchReplacer - wrapSpans submatchReplacer - lists *regexp.Regexp - listsNoNewline *regexp.Regexp - paragraphs *regexp.Regexp - lineBreaks *regexp.Regexp - remainingTags *regexp.Regexp - shortenSpaces *regexp.Regexp - lineFeeds *regexp.Regexp - nonBreakingSpaces *regexp.Regexp - extraSpaceStartOfLine *regexp.Regexp - extraSpaceEndOfLine *regexp.Regexp - consecutiveNewlines *regexp.Regexp - fixWordWrappedParens submatchReplacer -} - -// New textplain converter object -func NewRegexpConverter() Converter { - - headerBlockBr := regexp.MustCompile(`(?i)`) - headerBlockTags := regexp.MustCompile(`(?i)<\/?[^>]*>`) - - return &RegexpConverter{ - ignoredHTML: regexp.MustCompile(`(?ms).*?`), - - comments: regexp.MustCompile(`(?ms)`), - - // imgAltDoubleQuotes replaces images with their alt tag when it is double quoted - imgAltDoubleQuotes: submatchReplacer{ - regexp: regexp.MustCompile(`(?i)]*\>`), - handler: func(t string, submatch []int) string { - return t[submatch[2]:submatch[3]] - }, - }, - - // imgAltSingleQuotes replaces images with their alt tag when it is single quoted - imgAltSingleQuotes: submatchReplacer{ - regexp: regexp.MustCompile(`(?i)]*\>`), - handler: func(t string, submatch []int) string { - return t[submatch[2]:submatch[3]] - }, - }, - - // links replaces anchor links with one of "href" or "content ( href )" - links: submatchReplacer{ - regexp: regexp.MustCompile(`(?i)]*>((.|\s)*?)<\/a>`), - handler: func(t string, submatch []int) string { - href, value := strings.TrimSpace(t[submatch[4]:submatch[5]]), strings.TrimSpace(t[submatch[6]:submatch[7]]) - var replace string - if strings.EqualFold(href, value) { - replace = value - } else if value != "" { - replace = fmt.Sprintf("%s ( %s )", value, href) - } - return replace - }, - }, - - // headerClose moves `` tags to their own line as a preprocessing step for headerBlock - headerClose: submatchReplacer{ - regexp: regexp.MustCompile(`(?i)(<\/h[1-6]>)`), - handler: func(t string, submatch []int) string { - return "\n" + t[submatch[2]:submatch[3]] - }, - }, - - // used in headerBlock to do some content replacement - headerBlockBr: headerBlockBr, - headerBlockTags: headerBlockTags, - - // headerBlock converts a `` block to plaintext - headerBlock: submatchReplacer{ - regexp: regexp.MustCompile(`(?imsU)[\s]*]*>[\s]*(.*)[\s]*<\/h[1-6]+>`), - handler: func(t string, submatch []int) string { - headerLevel, _ := strconv.Atoi(t[submatch[2]:submatch[3]]) - headerText := t[submatch[4]:submatch[5]] - - headerText = headerBlockBr.ReplaceAllString(headerText, "\n") - headerText = headerBlockTags.ReplaceAllString(headerText, "") - - var maxLength int - var headerLines []string - for _, line := range strings.Split(headerText, "\n") { - if trimmed := strings.TrimSpace(line); len(trimmed) > 0 { - headerLines = append(headerLines, trimmed) - if l := len(headerLines[len(headerLines)-1]); l > maxLength { - maxLength = l - } - } - } - - headerText = strings.Join(headerLines, "\n") - var header string - - // special case headers - switch headerLevel { - case 1: - header = strings.Repeat("*", maxLength) + "\n" + headerText + "\n" + strings.Repeat("*", maxLength) - case 2: - header = strings.Repeat("-", maxLength) + "\n" + headerText + "\n" + strings.Repeat("-", maxLength) - default: - header = headerText + "\n" + strings.Repeat("-", maxLength) - } - - return "\n\n" + header + "\n\n" - }, - }, - - // wrapSpans merges together contiguous span tags into a single line - wrapSpans: submatchReplacer{ - regexp: regexp.MustCompile(`(?msi)(<\/span>)[\s]+(]*>)[\s]*`), - listsNoNewline: regexp.MustCompile(`(?i)<\/li>[\s]*([\n]?)`), - paragraphs: regexp.MustCompile(`(?i)<\/p>`), - lineBreaks: regexp.MustCompile(`(?i)`), - remainingTags: regexp.MustCompile(`<\/?[^>]*>`), - shortenSpaces: regexp.MustCompile(` {2,}`), - lineFeeds: regexp.MustCompile(`\r\n?`), - nonBreakingSpaces: regexp.MustCompile(`[ \t]*\302\240+[ \t]*`), - extraSpaceStartOfLine: regexp.MustCompile(`\n[ \t]+`), - extraSpaceEndOfLine: regexp.MustCompile(`[ \t]+\n`), - consecutiveNewlines: regexp.MustCompile(`[\n]{3,}`), - - // fixWordWrappedParens searches for links that got broken by word wrap and moves them - // into a single line - fixWordWrappedParens: submatchReplacer{ - regexp: regexp.MustCompile(`\(([ \n])([^)]+)([\n ])\)`), - handler: func(t string, submatch []int) string { - leadingSpace, content, trailingSpace := t[submatch[2]:submatch[3]], t[submatch[4]:submatch[5]], t[submatch[6]:submatch[7]] - var out string - if leadingSpace == "\n" { - out += leadingSpace - } - out += "( " + content + " )" - if trailingSpace == "\n" { - out += leadingSpace - } - return out - }, - }, - } -} - -// XXX: based on premailer/premailer@7c94e7a5a457b6710bada8186c6a41fccbfa08d1 -// https://github.com/premailer/premailer/tree/7c94e7a5a457b6710bada8186c6a41fccbfa08d1 - -type submatchReplacer struct { - regexp *regexp.Regexp - handler func(string, []int) string -} - -func (s *submatchReplacer) Replace(text string) string { - var start int - var finalText string - for _, submatch := range s.regexp.FindAllStringSubmatchIndex(text, -1) { - finalText += text[start:submatch[0]] + s.handler(text, submatch) - start = submatch[1] - } - return finalText + text[start:] -} - -// Convert returns a text-only version of supplied document in UTF-8 format with all HTML tags removed -func (t *RegexpConverter) Convert(document string, lineLength int) (string, error) { - // Brutish way to get a fully formed html document - doc, err := html.Parse(strings.NewReader(document)) - if err != nil { - return "", err - } - - // Find the tag within the document - var bodyElement *html.Node - if doc.Type == html.ElementNode && doc.Data == "body" { - bodyElement = doc - } else { - var scanForBody func(n *html.Node, depth int) - scanForBody = func(n *html.Node, depth int) { - if n == nil { - return - } - for c := n.FirstChild; c != nil; c = c.NextSibling { - if n.Type == html.ElementNode && n.Data == "body" { - bodyElement = n - return - } - if depth < 5 { - scanForBody(c, depth+1) - } - } - } - scanForBody(doc, 0) - } - if bodyElement == nil { - return "", ErrBodyNotFound - } - - var dropNonContentTags func(*html.Node) - dropNonContentTags = func(n *html.Node) { - if n == nil { - return - } - var toRemove []*html.Node - for c := n.FirstChild; c != nil; c = c.NextSibling { - if c.DataAtom == atom.Script || c.DataAtom == atom.Style { - toRemove = append(toRemove, c) - } else { - dropNonContentTags(c) - } - } - for _, r := range toRemove { - n.RemoveChild(r) - } - } - dropNonContentTags(bodyElement) - - // Reconstitute the cleaned HTML document for application - // of plaintext-conversion logic - var clean bytes.Buffer - err = html.Render(&clean, bodyElement) - if err != nil { - return "", err - } - - // strip text ignored html. Useful for removing - // headers and footers that aren't needed in the - // text version - txt := t.ignoredHTML.ReplaceAllString(clean.String(), "") - - // strip out html comments - txt = t.comments.ReplaceAllString(txt, "") - - // replace images with their alt attributes for img tags with "" for attribute quotes - // eg. the following formats: - // - // - txt = t.imgAltDoubleQuotes.Replace(txt) - - // replace images with their alt attributes for img tags with '' for attribute quotes - // eg. the following formats: - // - // - txt = t.imgAltSingleQuotes.Replace(txt) - - // links - txt = t.links.Replace(txt) - - // handle headings (H1-H6) - txt = t.headerClose.Replace(txt) - txt = t.headerBlock.Replace(txt) - - // wrap spans - txt = t.wrapSpans.Replace(txt) - - // lists -- TODO: should handle ordered lists - txt = t.lists.ReplaceAllString(txt, "* ") - - // list not followed by a newline - txt = t.listsNoNewline.ReplaceAllString(txt, "\n") - - // paragraphs and line breaks - txt = t.paragraphs.ReplaceAllString(txt, "\n\n") - txt = t.lineBreaks.ReplaceAllString(txt, "\n") - - // strip remaining tags - txt = t.remainingTags.ReplaceAllString(txt, "") - - // decode HTML entities - txt = html.UnescapeString(txt) - - // no more than two consecutive spaces - txt = t.shortenSpaces.ReplaceAllString(txt, " ") - - // apply word wrapping - txt = WordWrap(txt, lineLength) - - // remove linefeeds (\r\n and \r -> \n) - txt = t.lineFeeds.ReplaceAllString(txt, "\n") - - // strip extra spaces - txt = t.nonBreakingSpaces.ReplaceAllString(txt, " ") - txt = t.extraSpaceStartOfLine.ReplaceAllString(txt, "\n") - txt = t.extraSpaceEndOfLine.ReplaceAllString(txt, "\n") - - // no more than two consecutive newlines - txt = t.consecutiveNewlines.ReplaceAllString(txt, "\n\n") - - // wordWrap messes up the parens - txt = t.fixWordWrappedParens.Replace(txt) - - return strings.TrimSpace(txt), nil -} diff --git a/setup_test.go b/setup_test.go index 2c5a364..04ba62c 100644 --- a/setup_test.go +++ b/setup_test.go @@ -1,7 +1,6 @@ package textplain_test import ( - "reflect" "testing" "github.com/mailproto/textplain" @@ -18,23 +17,12 @@ func runTestCases(t *testing.T, testCases []testCase) { } } -func runTestCase(t *testing.T, tc testCase, converters ...textplain.Converter) { +func runTestCase(t *testing.T, tc testCase) { t.Helper() - if len(converters) == 0 { - converters = []textplain.Converter{textplain.NewRegexpConverter(), textplain.NewTreeConverter()} - } - - for _, converter := range converters { - if tc.skipRegexp && reflect.TypeOf(converter) == reflect.TypeOf(&textplain.RegexpConverter{}) { - continue - } - t.Run(reflect.TypeOf(converter).Elem().Name(), func(tt *testing.T) { - result, err := converter.Convert(tc.body, textplain.DefaultLineLength) - assert.Nil(tt, err) - assert.Equal(tt, tc.expect, result) - }) - } + result, err := textplain.Convert(tc.body, textplain.DefaultLineLength) + assert.Nil(t, err) + assert.Equal(t, tc.expect, result) } const html = ` @@ -63,13 +51,6 @@ const html = `` -func BenchmarkRegexp(b *testing.B) { - converter := textplain.NewRegexpConverter() - for i := 0; i < b.N; i++ { - _, _ = converter.Convert(html, textplain.DefaultLineLength) - } -} - func BenchmarkTree(b *testing.B) { converter := textplain.NewTreeConverter() for i := 0; i < b.N; i++ { diff --git a/textplain.go b/textplain.go index cbadaea..1b19676 100644 --- a/textplain.go +++ b/textplain.go @@ -1,29 +1,18 @@ package textplain -import ( - "errors" -) - // Defaults const ( DefaultLineLength = 65 ) -// Well-defined errors -var ( - ErrBodyNotFound = errors.New("could not find a `body` element in your html document") -) + +type Converter interface { + Convert(string, int) (string, error) +} var defaultConverter = NewTreeConverter() -// Convert is a convenience method so the library can be used without initializing a converter -// because this library relies heavily on regexp objects, it may act as a bottlneck to concurrency -// due to thread-safety mutexes in *regexp.Regexp internals +// Convert is a wrapper around the default converter singleton func Convert(document string, lineLength int) (string, error) { return defaultConverter.Convert(document, lineLength) } - -func MustConvert(document string, lineLength int) string { - result, _ := Convert(document, lineLength) - return result -}