package html2text
import (
"bytes"
"fmt"
"io/ioutil"
"os"
"path"
"regexp"
"strings"
"testing"
)
// destPath names the directory holding the HTML fixture files that the tests
// in this file read.
const destPath = "testdata"

// EnableExtraLogging switches on additional log output from the test helpers.
// It is turned on during package initialisation when the environment variable
// HTML2TEXT_EXTRA_LOGGING is set to "1" or "true".
var EnableExtraLogging bool

// init inspects HTML2TEXT_EXTRA_LOGGING once and enables extra logging for the
// two accepted values; any other value leaves the flag at its zero value.
func init() {
	switch os.Getenv("HTML2TEXT_EXTRA_LOGGING") {
	case "1", "true":
		EnableExtraLogging = true
	}
}
// TODO: Add tests for FromHTMLNode, and dedicated tests for FromReader
// (currently it is only exercised indirectly by TestParseUTF8).
// TestParseUTF8 converts each UTF-8 fixture under destPath with FromReader and
// checks the produced text: keywordShouldExist must be present and
// keywordShouldNotExist must be absent.
//
// Each fixture runs as its own subtest so that a failure in one file does not
// prevent the remaining files from being checked, and both keyword checks are
// reported for a file even when the first one fails.
func TestParseUTF8(t *testing.T) {
	htmlFiles := []struct {
		file                  string
		keywordShouldNotExist string // presumably text from the document <title> — TODO confirm against the fixtures
		keywordShouldExist    string
	}{
		{
			"utf8.html",
			"学习之道:美国公认学习第一书title",
			"次世界冠军赛上,我几近疯狂",
		},
		{
			"utf8_with_bom.xhtml",
			"1892年波兰文版序言title",
			"种新的波兰文本已成为必要",
		},
	}

	for _, htmlFile := range htmlFiles {
		htmlFile := htmlFile // pin the value for the closure on pre-1.22 toolchains
		t.Run(htmlFile.file, func(t *testing.T) {
			bs, err := ioutil.ReadFile(path.Join(destPath, htmlFile.file))
			if err != nil {
				// Without the fixture there is nothing left to check for this file.
				t.Fatal(err)
			}
			text, err := FromReader(bytes.NewReader(bs))
			if err != nil {
				t.Fatal(err)
			}
			if !strings.Contains(text, htmlFile.keywordShouldExist) {
				t.Errorf("keyword %q should exist in file %s", htmlFile.keywordShouldExist, htmlFile.file)
			}
			if strings.Contains(text, htmlFile.keywordShouldNotExist) {
				t.Errorf("keyword %q should not exist in file %s", htmlFile.keywordShouldNotExist, htmlFile.file)
			}
		})
	}
}
// TestStrippingWhitespace passes each input through wantString and expects the
// paired output exactly. A mismatch is reported with t.Error so the remaining
// cases still run; any non-empty message returned on success is logged.
func TestStrippingWhitespace(t *testing.T) {
	cases := []struct {
		input  string
		output string
	}{
		{input: "test text", output: "test text"},
		{input: " \ttext\ntext\n", output: "text text"},
		{input: " \na \n\t \n \n a \t", output: "a a"},
		{input: "test text", output: "test text"},
		{input: "test text ", output: "test text"},
	}

	for i := range cases {
		logLine, err := wantString(cases[i].input, cases[i].output)
		if err != nil {
			t.Error(err)
			continue
		}
		if logLine != "" {
			t.Log(logLine)
		}
	}
}
// TestParagraphsAndBreaks declares a table of input/output string pairs; the
// loop that follows the table hands each pair to wantString, reports a
// returned error with t.Error and logs any non-empty message.
//
// NOTE(review): in this copy of the file the case table appears to have lost
// its HTML markup: string literals span several lines and the text of what
// look like later table and link tests, plus the start of TestImageAltTags,
// is run together with this function. Confirm the individual cases against
// the upstream source before relying on them.
func TestParagraphsAndBreaks(t *testing.T) {
testCases := []struct {
input string
output string
}{
{
"Test text",
"Test text",
},
{
"Test text
",
"Test text",
},
{
"Test text
Test",
"Test text\nTest",
},
{
"
Test text
", "Test text", }, { "Test text
Test text
", "Test text\n\nTest text", }, { "\nTest text
\n\n\n\tTest text
\n", "Test text\n\nTest text", }, { "\nTest text
Test text
Test text
\tTest text
| cell1 | cell2 |
| row1 |
| row2 |
Row-1-Col-1-Msg1 Row-1-Col-1-Msg2 | Row-1-Col-2 |
| Row-2-Col-1 | Row-2-Col-2 |
| cell1-1 | cell1-2 |
| cell2-1 | cell2-2 |
| Header 1 | Header 2 |
|---|---|
| Footer 1 | Footer 2 |
| Row 1 Col 1 | Row 1 Col 2 |
| Row 2 Col 1 | Row 2 Col 2 |
| Table 1 Header 1 | Table 1 Header 2 |
|---|---|
| Table 1 Footer 1 | Table 1 Footer 2 |
| Table 1 Row 1 Col 1 | Table 1 Row 1 Col 2 |
| Table 1 Row 2 Col 1 | Table 1 Row 2 Col 2 |
| Table 2 Header 1 | Table 2 Header 2 |
|---|---|
| Table 2 Footer 1 | Table 2 Footer 2 |
| Table 2 Row 1 Col 1 | Table 2 Row 1 Col 2 |
| Table 2 Row 2 Col 1 | Table 2 Row 2 Col 2 |
| cell |
| Item | Description | Price |
|---|---|---|
| Golang | Open source programming language that makes it easy to build simple, reliable, and efficient software | $10.99 |
| Hermes | Programmatically create beautiful e-mails using Golang. | $1.99 |
This is link1 and link2 is next.
", `This is link1 ( http://www.google.com ) and link2 ( http://www.google.com ) is next.`, }, { "http://www.google.com", `http://www.google.com`, }, } for _, testCase := range testCases { if msg, err := wantString(testCase.input, testCase.output); err != nil { t.Error(err) } else if len(msg) > 0 { t.Log(msg) } } } func TestImageAltTags(t *testing.T) { testCases := []struct { input string output string }{ { `
`,
``,
},
{
`
`,
``,
},
// Images do matter if they are in a link.
{
`
`,
`Example ( http://example.com/ )`,
},
{
`
`,
`Example ( http://example.com/ )`,
},
{
`
`,
`Example ( http://example.com/ )`,
},
{
`
`,
`Example ( http://example.com/ )`,
},
}
for _, testCase := range testCases {
if msg, err := wantString(testCase.input, testCase.output); err != nil {
t.Error(err)
} else if len(msg) > 0 {
t.Log(msg)
}
}
}
// TestHeadings declares a table of input/output string pairs; the loop that
// follows the table hands each pair to wantString, reports a returned error
// with t.Error and logs any non-empty message.
//
// NOTE(review): in this copy of the file the inputs appear to have lost their
// HTML markup and several later definitions (TestIgnoreStylesScriptsHead,
// TestPeriod, the StringMatcher helpers, wantRegExp, wantString, match and the
// start of Example) are run together on the lines below. Confirm the cases
// and the helper bodies against the upstream source before relying on them.
func TestHeadings(t *testing.T) {
testCases := []struct {
input string
output string
}{
{
"level 1level 2level 1
TestTest", "> \n> Test\n\nTest", }, { "\t
\nTest", "> \n> Test\n>", }, { "\t
\nTest line 1", "> \n> Test line 1\n> Test 2", }, { "
Test 2
Test
TestOther Test", "> \n> Test\n\n> \n> Test\n\nOther Test", }, { "
Lorem ipsum Commodo id consectetur pariatur ea occaecat minim aliqua ad sit consequat quis ex commodo Duis incididunt eu mollit consectetur fugiat voluptate dolore in pariatur in commodo occaecat Ut occaecat velit esse labore aute quis commodo non sit dolore officia Excepteur cillum amet cupidatat culpa velit labore ullamco dolore mollit elit in aliqua dolor irure do", "> \n> Lorem ipsum Commodo id consectetur pariatur ea occaecat minim aliqua ad\n> sit consequat quis ex commodo Duis incididunt eu mollit consectetur fugiat\n> voluptate dolore in pariatur in commodo occaecat Ut occaecat velit esse\n> labore aute quis commodo non sit dolore officia Excepteur cillum amet\n> cupidatat culpa velit labore ullamco dolore mollit elit in aliqua dolor\n> irure do", }, { "
LoremipsumCommodoidconsecteturpariatureaoccaecatminimaliquaadsitconsequatquisexcommodoDuisincididunteumollitconsecteturfugiatvoluptatedoloreinpariaturincommodooccaecatUtoccaecatvelitesselaboreautequiscommodononsitdoloreofficiaExcepteurcillumametcupidatatculpavelitlaboreullamcodoloremollitelitinaliquadoloriruredo", "> \n> Lorem *ipsum* *Commodo* *id* *consectetur* *pariatur* *ea* *occaecat* *minim*\n> *aliqua* *ad* *sit* *consequat* *quis* *ex* *commodo* *Duis* *incididunt* *eu*\n> *mollit* *consectetur* *fugiat* *voluptate* *dolore* *in* *pariatur* *in* *commodo*\n> *occaecat* *Ut* *occaecat* *velit* *esse* *labore* *aute* *quis* *commodo*\n> *non* *sit* *dolore* *officia* *Excepteur* *cillum* *amet* *cupidatat* *culpa*\n> *velit* *labore* *ullamco* *dolore* *mollit* *elit* *in* *aliqua* *dolor* *irure*\n> *do*", }, } for _, testCase := range testCases { if msg, err := wantString(testCase.input, testCase.output); err != nil { t.Error(err) } else if len(msg) > 0 { t.Log(msg) } } } func TestIgnoreStylesScriptsHead(t *testing.T) { testCases := []struct { input string output string }{ { "", "", }, { "", "", }, { "", "", }, { "", "", }, { "", "", }, { "", "", }, { "", "", }, { "", "", }, { "", "", }, { `
List:
`, `hi hello google \( https://google.com \) test List: \* Foo \( foo \) \* Barsoap \( http://www.microshwhat.com/bar/soapy \) \* Baz`, }, // Malformed input html. { `hi hello google testList:
`, `hi hello google \( https://google.com \) test List: \* Foo \( foo \) \* Bar \( /\n[ \t]+bar/baz \) \* Baz`, }, } for _, testCase := range testCases { if msg, err := wantRegExp(testCase.input, testCase.expr); err != nil { t.Error(err) } else if len(msg) > 0 { t.Log(msg) } } } func TestPeriod(t *testing.T) { testCases := []struct { input string expr string }{ { `Lorem ipsum test.
`, `Lorem ipsum test\.`, }, { `Lorem ipsum test.
`, `Lorem ipsum test\.`, }, } for _, testCase := range testCases { if msg, err := wantRegExp(testCase.input, testCase.expr); err != nil { t.Error(err) } else if len(msg) > 0 { t.Log(msg) } } } type StringMatcher interface { MatchString(string) bool String() string } type RegexpStringMatcher string func (m RegexpStringMatcher) MatchString(str string) bool { return regexp.MustCompile(string(m)).MatchString(str) } func (m RegexpStringMatcher) String() string { return string(m) } type ExactStringMatcher string func (m ExactStringMatcher) MatchString(str string) bool { return string(m) == str } func (m ExactStringMatcher) String() string { return string(m) } func wantRegExp(input string, outputRE string, options ...Options) (string, error) { return match(input, RegexpStringMatcher(outputRE), options...) } func wantString(input string, output string, options ...Options) (string, error) { return match(input, ExactStringMatcher(output), options...) } func match(input string, matcher StringMatcher, options ...Options) (string, error) { text, err := FromString(input, options...) if err != nil { return "", err } if !matcher.MatchString(text) { return "", fmt.Errorf(`error: input did not match specified expression Input: >>>> %v <<<< Output: >>>> %v <<<< Expected: >>>> %v <<<<`, input, text, matcher.String(), ) } var msg string if EnableExtraLogging { msg = fmt.Sprintf( ` input: %v output: %v `, input, text, ) } return msg, nil } func Example() { inputHTML := `Here is some more information:
| Header 1 | Header 2 |
|---|---|
| Footer 1 | Footer 2 |
| Row 1 Col 1 | Row 1 Col 2 |
| Row 2 Col 1 | Row 2 Col 2 |