diff --git a/converter.go b/converter.go
deleted file mode 100644
index fdd6224..0000000
--- a/converter.go
+++ /dev/null
@@ -1,5 +0,0 @@
-package textplain
-
-type Converter interface {
- Convert(string, int) (string, error)
-}
diff --git a/errors.go b/errors.go
new file mode 100644
index 0000000..b4ec036
--- /dev/null
+++ b/errors.go
@@ -0,0 +1,7 @@
+package textplain
+
+import "errors"
+
+var (
+ ErrBodyNotFound = errors.New("could not find a `body` element in your html document")
+)
diff --git a/regexp.go b/regexp.go
deleted file mode 100644
index bb0da3d..0000000
--- a/regexp.go
+++ /dev/null
@@ -1,318 +0,0 @@
-package textplain
-
-import (
- "bytes"
- "fmt"
- "regexp"
- "strconv"
- "strings"
-
- "golang.org/x/net/html"
- "golang.org/x/net/html/atom"
-)
-
-type RegexpConverter struct {
- ignoredHTML *regexp.Regexp
- comments *regexp.Regexp
- imgAltDoubleQuotes submatchReplacer
- imgAltSingleQuotes submatchReplacer
- links submatchReplacer
- headerClose submatchReplacer
- headerBlockBr *regexp.Regexp
- headerBlockTags *regexp.Regexp
- headerBlock submatchReplacer
- wrapSpans submatchReplacer
- lists *regexp.Regexp
- listsNoNewline *regexp.Regexp
- paragraphs *regexp.Regexp
- lineBreaks *regexp.Regexp
- remainingTags *regexp.Regexp
- shortenSpaces *regexp.Regexp
- lineFeeds *regexp.Regexp
- nonBreakingSpaces *regexp.Regexp
- extraSpaceStartOfLine *regexp.Regexp
- extraSpaceEndOfLine *regexp.Regexp
- consecutiveNewlines *regexp.Regexp
- fixWordWrappedParens submatchReplacer
-}
-
-// New textplain converter object
-func NewRegexpConverter() Converter {
-
- headerBlockBr := regexp.MustCompile(`(?i)
`)
- headerBlockTags := regexp.MustCompile(`(?i)<\/?[^>]*>`)
-
- return &RegexpConverter{
- ignoredHTML: regexp.MustCompile(`(?ms).*?`),
-
- comments: regexp.MustCompile(`(?ms)`),
-
- // imgAltDoubleQuotes replaces images with their alt tag when it is double quoted
- imgAltDoubleQuotes: submatchReplacer{
- regexp: regexp.MustCompile(`(?i)]*\>`),
- handler: func(t string, submatch []int) string {
- return t[submatch[2]:submatch[3]]
- },
- },
-
- // imgAltSingleQuotes replaces images with their alt tag when it is single quoted
- imgAltSingleQuotes: submatchReplacer{
- regexp: regexp.MustCompile(`(?i)]*\>`),
- handler: func(t string, submatch []int) string {
- return t[submatch[2]:submatch[3]]
- },
- },
-
- // links replaces anchor links with one of "href" or "content ( href )"
- links: submatchReplacer{
- regexp: regexp.MustCompile(`(?i)]*>((.|\s)*?)<\/a>`),
- handler: func(t string, submatch []int) string {
- href, value := strings.TrimSpace(t[submatch[4]:submatch[5]]), strings.TrimSpace(t[submatch[6]:submatch[7]])
- var replace string
- if strings.EqualFold(href, value) {
- replace = value
- } else if value != "" {
- replace = fmt.Sprintf("%s ( %s )", value, href)
- }
- return replace
- },
- },
-
- // headerClose moves `` tags to their own line as a preprocessing step for headerBlock
- headerClose: submatchReplacer{
- regexp: regexp.MustCompile(`(?i)(<\/h[1-6]>)`),
- handler: func(t string, submatch []int) string {
- return "\n" + t[submatch[2]:submatch[3]]
- },
- },
-
- // used in headerBlock to do some content replacement
- headerBlockBr: headerBlockBr,
- headerBlockTags: headerBlockTags,
-
- // headerBlock converts a `` block to plaintext
- headerBlock: submatchReplacer{
- regexp: regexp.MustCompile(`(?imsU)[\s]*]*>[\s]*(.*)[\s]*<\/h[1-6]+>`),
- handler: func(t string, submatch []int) string {
- headerLevel, _ := strconv.Atoi(t[submatch[2]:submatch[3]])
- headerText := t[submatch[4]:submatch[5]]
-
- headerText = headerBlockBr.ReplaceAllString(headerText, "\n")
- headerText = headerBlockTags.ReplaceAllString(headerText, "")
-
- var maxLength int
- var headerLines []string
- for _, line := range strings.Split(headerText, "\n") {
- if trimmed := strings.TrimSpace(line); len(trimmed) > 0 {
- headerLines = append(headerLines, trimmed)
- if l := len(headerLines[len(headerLines)-1]); l > maxLength {
- maxLength = l
- }
- }
- }
-
- headerText = strings.Join(headerLines, "\n")
- var header string
-
- // special case headers
- switch headerLevel {
- case 1:
- header = strings.Repeat("*", maxLength) + "\n" + headerText + "\n" + strings.Repeat("*", maxLength)
- case 2:
- header = strings.Repeat("-", maxLength) + "\n" + headerText + "\n" + strings.Repeat("-", maxLength)
- default:
- header = headerText + "\n" + strings.Repeat("-", maxLength)
- }
-
- return "\n\n" + header + "\n\n"
- },
- },
-
- // wrapSpans merges together contiguous span tags into a single line
- wrapSpans: submatchReplacer{
- regexp: regexp.MustCompile(`(?msi)(<\/span>)[\s]+(]*>)[\s]*`),
- listsNoNewline: regexp.MustCompile(`(?i)<\/li>[\s]*([\n]?)`),
- paragraphs: regexp.MustCompile(`(?i)<\/p>`),
- lineBreaks: regexp.MustCompile(`(?i)
`),
- remainingTags: regexp.MustCompile(`<\/?[^>]*>`),
- shortenSpaces: regexp.MustCompile(` {2,}`),
- lineFeeds: regexp.MustCompile(`\r\n?`),
- nonBreakingSpaces: regexp.MustCompile(`[ \t]*\302\240+[ \t]*`),
- extraSpaceStartOfLine: regexp.MustCompile(`\n[ \t]+`),
- extraSpaceEndOfLine: regexp.MustCompile(`[ \t]+\n`),
- consecutiveNewlines: regexp.MustCompile(`[\n]{3,}`),
-
- // fixWordWrappedParens searches for links that got broken by word wrap and moves them
- // into a single line
- fixWordWrappedParens: submatchReplacer{
- regexp: regexp.MustCompile(`\(([ \n])([^)]+)([\n ])\)`),
- handler: func(t string, submatch []int) string {
- leadingSpace, content, trailingSpace := t[submatch[2]:submatch[3]], t[submatch[4]:submatch[5]], t[submatch[6]:submatch[7]]
- var out string
- if leadingSpace == "\n" {
- out += leadingSpace
- }
- out += "( " + content + " )"
- if trailingSpace == "\n" {
- out += leadingSpace
- }
- return out
- },
- },
- }
-}
-
-// XXX: based on premailer/premailer@7c94e7a5a457b6710bada8186c6a41fccbfa08d1
-// https://github.com/premailer/premailer/tree/7c94e7a5a457b6710bada8186c6a41fccbfa08d1
-
-type submatchReplacer struct {
- regexp *regexp.Regexp
- handler func(string, []int) string
-}
-
-func (s *submatchReplacer) Replace(text string) string {
- var start int
- var finalText string
- for _, submatch := range s.regexp.FindAllStringSubmatchIndex(text, -1) {
- finalText += text[start:submatch[0]] + s.handler(text, submatch)
- start = submatch[1]
- }
- return finalText + text[start:]
-}
-
-// Convert returns a text-only version of supplied document in UTF-8 format with all HTML tags removed
-func (t *RegexpConverter) Convert(document string, lineLength int) (string, error) {
- // Brutish way to get a fully formed html document
- doc, err := html.Parse(strings.NewReader(document))
- if err != nil {
- return "", err
- }
-
- // Find the tag within the document
- var bodyElement *html.Node
- if doc.Type == html.ElementNode && doc.Data == "body" {
- bodyElement = doc
- } else {
- var scanForBody func(n *html.Node, depth int)
- scanForBody = func(n *html.Node, depth int) {
- if n == nil {
- return
- }
- for c := n.FirstChild; c != nil; c = c.NextSibling {
- if n.Type == html.ElementNode && n.Data == "body" {
- bodyElement = n
- return
- }
- if depth < 5 {
- scanForBody(c, depth+1)
- }
- }
- }
- scanForBody(doc, 0)
- }
- if bodyElement == nil {
- return "", ErrBodyNotFound
- }
-
- var dropNonContentTags func(*html.Node)
- dropNonContentTags = func(n *html.Node) {
- if n == nil {
- return
- }
- var toRemove []*html.Node
- for c := n.FirstChild; c != nil; c = c.NextSibling {
- if c.DataAtom == atom.Script || c.DataAtom == atom.Style {
- toRemove = append(toRemove, c)
- } else {
- dropNonContentTags(c)
- }
- }
- for _, r := range toRemove {
- n.RemoveChild(r)
- }
- }
- dropNonContentTags(bodyElement)
-
- // Reconstitute the cleaned HTML document for application
- // of plaintext-conversion logic
- var clean bytes.Buffer
- err = html.Render(&clean, bodyElement)
- if err != nil {
- return "", err
- }
-
- // strip text ignored html. Useful for removing
- // headers and footers that aren't needed in the
- // text version
- txt := t.ignoredHTML.ReplaceAllString(clean.String(), "")
-
- // strip out html comments
- txt = t.comments.ReplaceAllString(txt, "")
-
- // replace images with their alt attributes for img tags with "" for attribute quotes
- // eg. the following formats:
- //
- //
- txt = t.imgAltDoubleQuotes.Replace(txt)
-
- // replace images with their alt attributes for img tags with '' for attribute quotes
- // eg. the following formats:
- //
- //
- txt = t.imgAltSingleQuotes.Replace(txt)
-
- // links
- txt = t.links.Replace(txt)
-
- // handle headings (H1-H6)
- txt = t.headerClose.Replace(txt)
- txt = t.headerBlock.Replace(txt)
-
- // wrap spans
- txt = t.wrapSpans.Replace(txt)
-
- // lists -- TODO: should handle ordered lists
- txt = t.lists.ReplaceAllString(txt, "* ")
-
- // list not followed by a newline
- txt = t.listsNoNewline.ReplaceAllString(txt, "\n")
-
- // paragraphs and line breaks
- txt = t.paragraphs.ReplaceAllString(txt, "\n\n")
- txt = t.lineBreaks.ReplaceAllString(txt, "\n")
-
- // strip remaining tags
- txt = t.remainingTags.ReplaceAllString(txt, "")
-
- // decode HTML entities
- txt = html.UnescapeString(txt)
-
- // no more than two consecutive spaces
- txt = t.shortenSpaces.ReplaceAllString(txt, " ")
-
- // apply word wrapping
- txt = WordWrap(txt, lineLength)
-
- // remove linefeeds (\r\n and \r -> \n)
- txt = t.lineFeeds.ReplaceAllString(txt, "\n")
-
- // strip extra spaces
- txt = t.nonBreakingSpaces.ReplaceAllString(txt, " ")
- txt = t.extraSpaceStartOfLine.ReplaceAllString(txt, "\n")
- txt = t.extraSpaceEndOfLine.ReplaceAllString(txt, "\n")
-
- // no more than two consecutive newlines
- txt = t.consecutiveNewlines.ReplaceAllString(txt, "\n\n")
-
- // wordWrap messes up the parens
- txt = t.fixWordWrappedParens.Replace(txt)
-
- return strings.TrimSpace(txt), nil
-}
diff --git a/setup_test.go b/setup_test.go
index 2c5a364..04ba62c 100644
--- a/setup_test.go
+++ b/setup_test.go
@@ -1,7 +1,6 @@
package textplain_test
import (
- "reflect"
"testing"
"github.com/mailproto/textplain"
@@ -18,23 +17,12 @@ func runTestCases(t *testing.T, testCases []testCase) {
}
}
-func runTestCase(t *testing.T, tc testCase, converters ...textplain.Converter) {
+func runTestCase(t *testing.T, tc testCase) {
t.Helper()
- if len(converters) == 0 {
- converters = []textplain.Converter{textplain.NewRegexpConverter(), textplain.NewTreeConverter()}
- }
-
- for _, converter := range converters {
- if tc.skipRegexp && reflect.TypeOf(converter) == reflect.TypeOf(&textplain.RegexpConverter{}) {
- continue
- }
- t.Run(reflect.TypeOf(converter).Elem().Name(), func(tt *testing.T) {
- result, err := converter.Convert(tc.body, textplain.DefaultLineLength)
- assert.Nil(tt, err)
- assert.Equal(tt, tc.expect, result)
- })
- }
+ result, err := textplain.Convert(tc.body, textplain.DefaultLineLength)
+ assert.Nil(t, err)
+ assert.Equal(t, tc.expect, result)
}
const html = `
@@ -63,13 +51,6 @@ const html = ``
-func BenchmarkRegexp(b *testing.B) {
- converter := textplain.NewRegexpConverter()
- for i := 0; i < b.N; i++ {
- _, _ = converter.Convert(html, textplain.DefaultLineLength)
- }
-}
-
func BenchmarkTree(b *testing.B) {
converter := textplain.NewTreeConverter()
for i := 0; i < b.N; i++ {
diff --git a/textplain.go b/textplain.go
index cbadaea..1b19676 100644
--- a/textplain.go
+++ b/textplain.go
@@ -1,29 +1,18 @@
package textplain
-import (
- "errors"
-)
-
// Defaults
const (
DefaultLineLength = 65
)
-// Well-defined errors
-var (
- ErrBodyNotFound = errors.New("could not find a `body` element in your html document")
-)
+
+type Converter interface {
+ Convert(string, int) (string, error)
+}
var defaultConverter = NewTreeConverter()
-// Convert is a convenience method so the library can be used without initializing a converter
-// because this library relies heavily on regexp objects, it may act as a bottlneck to concurrency
-// due to thread-safety mutexes in *regexp.Regexp internals
+// Convert is a wrapper around the default converter singleton
func Convert(document string, lineLength int) (string, error) {
return defaultConverter.Convert(document, lineLength)
}
-
-func MustConvert(document string, lineLength int) string {
- result, _ := Convert(document, lineLength)
- return result
-}