package sanitize
import (
"strings"
"testing"
)
// TestStripBidiAndZeroWidthRe is a smoke test for the re-exported
// StripBidiAndZeroWidth: a zero-width space (U+200B) placed between
// two letters must be removed from the result.
func TestStripBidiAndZeroWidthRe(t *testing.T) {
	const want = "ab"
	got := StripBidiAndZeroWidth("a\u200Bb")
	if got == want {
		return
	}
	t.Errorf("re-export StripBidiAndZeroWidth failed: got %q", got)
}
// TestNormalizeBreaksRe is a smoke test for the re-exported
// NormalizeBreaks: a CRLF pair must come back as a single "\n".
func TestNormalizeBreaksRe(t *testing.T) {
	const want = "a\nb"
	got := NormalizeBreaks("a\r\nb")
	if got == want {
		return
	}
	t.Errorf("re-export NormalizeBreaks failed: got %q", got)
}
// TestInlineText pins InlineText's output for a table of inputs:
// which characters get a backslash, which are left alone, and how
// line-break characters are folded into a single space.
func TestInlineText(t *testing.T) {
	type row struct{ in, want string }
	table := []row{
		{"plain", "plain"},
		{"a*b", `a\*b`},
		// `|` is NOT escaped.
		{"a|b", "a|b"},
		// `=` is NOT escaped.
		{"a=b", "a=b"},
		// A newline is folded into a space.
		{"line1\nline2", "line1 line2"},
		// CRLF is normalized first, then folded.
		{"line1\r\nline2", "line1 line2"},
		// U+2028 is folded.
		{"a\u2028b", "a b"},
		// NEL (U+0085) is folded.
		{"a\u0085b", "a b"},
		// ZWSP (U+200B) is stripped.
		{"a\u200Bb", "ab"},
		{"hello *world*", `hello \*world\*`},
	}
	for i := range table {
		in, want := table[i].in, table[i].want
		got := InlineText(in)
		if got == want {
			continue
		}
		t.Errorf("InlineText(%q) = %q, want %q", in, got, want)
	}
}
func TestBlock(t *testing.T) {
// Block wraps every non-empty output with "\n\n" — CM §4.8 blank
// lines — on BOTH sides, so user content is paragraph-isolated
// from any realm chrome that precedes OR follows it. Bounds CM
// §4.6 HTML block types 6/7 (`
`, `
`, …) which are not
// escaped in any mode and would otherwise consume appended chrome.
cases := []struct{ in, want string }{
{"hello world\n", "\n\nhello world\n\n"},
{"# heading\n", "\n\n\\# heading\n\n"},
{"> quote\n", "\n\n\\> quote\n\n"},
{"| a | b |\n", "\n\n\\| a | b |\n\n"}, // GFM table-row escaped in strict mode
// CM §4.6 HTML block types 1-5 — escaped (blank-line-NON-terminating).
{"\n", "\n\n\\\n\n"},
{"\n", "\n\n\\\n\n"},
{"\n", "\n\n\\\n\n"},
{"\n", "\n\n\\\n\n"},
{"```\ncode\n", "\n\n```\ncode\n```\n\n"}, // fence auto-close at EOF
// Empty / strip-to-empty inputs short-circuit (no stray blank line).
{"[x]: y\n", ""},
{"", ""},
}
for _, c := range cases {
got := Block(c.in)
if got != c.want {
t.Errorf("Block(%q) = %q, want %q", c.in, got, c.want)
}
// Idempotency: Block(Block(in)) must be byte-identical to
// Block(in) for every input in the table.
twice := Block(got)
if twice != got {
t.Errorf("Block not idempotent for %q: Block(once)=%q, Block(twice)=%q", c.in, got, twice)
}
}
}
// TestBlockRich pins BlockRich's output for a table of inputs and
// checks that a second pass over each output is a no-op.
func TestBlockRich(t *testing.T) {
	// Every non-empty BlockRich result carries "\n\n" (a CM §4.8
	// blank line) at BOTH ends, which keeps user content in its own
	// paragraph regardless of the realm chrome placed before or
	// after it.
	type row struct{ in, want string }
	table := []row{
		// Block-level markdown (escaped by Block) passes through BlockRich.
		{"hello world\n", "\n\nhello world\n\n"},
		{"# heading\n", "\n\n# heading\n\n"},
		{"> quote\n", "\n\n> quote\n\n"},
		{"- item\n", "\n\n- item\n\n"},
		{"1. item\n", "\n\n1. item\n\n"},
		// A first-line setext-h2 underline is escaped by the
		// qualifying-setext pre-pass.
		{"---\n", "\n\n\\---\n\n"},
		// A `***` thematic break is not a setext-h2 and is preserved.
		{"***\n", "\n\n***\n\n"},
		// A deeper setext underline is preserved (user text sits above it).
		{"text\n===\n", "\n\ntext\n===\n\n"},
		// Cross-paragraph backward attack: in realm output shaped like
		// `chrome\n\nbody\n===\nmore` the chrome stays in its own paragraph.
		{"body\n===\nmore\n", "\n\nbody\n===\nmore\n\n"},
		// GFM tables are PRESERVED in Rich mode.
		{"| a | b |\n", "\n\n| a | b |\n\n"},
		// A full header/delimiter/body table is kept intact.
		{"| H |\n|---|\n| a |\n", "\n\n| H |\n|---|\n| a |\n\n"},
		// Realm-binding defenses are STILL ON.
		// NOTE(review): this row and the four under the next comment
		// are byte-identical and contain no markup; the literals they
		// were meant to exercise look like they were lost — confirm
		// against version control.
		{"\n", "\n\n\\\n\n"},
		// CM §4.6 HTML block types 1-5 are escaped in Rich mode as
		// well (the defense does not depend on the mode).
		{"\n", "\n\n\\\n\n"},
		{"\n", "\n\n\\\n\n"},
		{"\n", "\n\n\\\n\n"},
		{"\n", "\n\n\\\n\n"},
		{"[t][l]\n", "\n\n\\[t\\]\\[l\\]\n\n"},
		{"[^name]\n", "\n\n\\[^name\\]\n\n"},
		// Input consisting only of an LRD strips to nothing; the
		// empty-after-escape case returns "" so realm concatenation
		// does not leak a stray blank line.
		{"[x]: y\n", ""},
		// Empty input gives empty output, again with no stray blank line.
		{"", ""},
		{"```\ncode\n", "\n\n```\ncode\n```\n\n"},
	}
	for i := range table {
		in, want := table[i].in, table[i].want
		once := BlockRich(in)
		if once != want {
			t.Errorf("BlockRich(%q) = %q, want %q", in, once, want)
		}
		// Idempotency: a second BlockRich pass must reproduce the
		// first result byte for byte.
		again := BlockRich(once)
		if again != once {
			t.Errorf("BlockRich not idempotent for %q: BlockRich(once)=%q, BlockRich(twice)=%q", in, once, again)
		}
	}
}
// TestBlockRich_LeadingBlankLineGuaranteed checks that every non-empty
// BlockRich result starts with "\n\n". With that prefix,
// `chrome + BlockRich(user)` cannot put the user's first line in the
// same paragraph as the realm's last line, where a deeper `===` or
// `|---|` in user content could retroactively promote realm chrome.
func TestBlockRich_LeadingBlankLineGuaranteed(t *testing.T) {
	inputs := []string{"x", "x\n", "# h", "- i\n- j", "| a |\n|---|\n| 1 |"}
	for _, in := range inputs {
		got := BlockRich(in)
		// Empty results are exempt; anything else must have the prefix.
		if got == "" || strings.HasPrefix(got, "\n\n") {
			continue
		}
		t.Errorf("BlockRich(%q) = %q; expected leading '\\n\\n'", in, got)
	}
}
// TestBlockRich_TrailingBlankLineGuaranteed checks that every
// non-empty BlockRich result ends with "\n\n". With that suffix,
// `BlockRich(user) + chrome` cannot extend the user's last paragraph
// into the realm's next line (CM §5.2 lazy continuation, or a
// realm-supplied `|---|` row retroactively promoting the user's last
// line into a table header).
func TestBlockRich_TrailingBlankLineGuaranteed(t *testing.T) {
	inputs := []string{"x", "x\n", "# h", "- i\n- j", "| H |\n|---|\n| a |"}
	for _, in := range inputs {
		got := BlockRich(in)
		// Empty results are exempt; anything else must have the suffix.
		if got == "" || strings.HasSuffix(got, "\n\n") {
			continue
		}
		t.Errorf("BlockRich(%q) = %q; expected trailing '\\n\\n'", in, got)
	}
}
// TestBlockquoteRich pins BlockquoteRich's output for a table of named
// inputs.
func TestBlockquoteRich(t *testing.T) {
	// BlockquoteRich strips BlockRich's leading "\n", prefixes each
	// remaining line with "> ", and emits a leading "\n" plus a
	// trailing "\n\n": the leading "\n" prevents
	// `chrome + BlockquoteRich(user)` from landing the first `>`
	// mid-line; the trailing "\n\n" (blank line) prevents
	// `BlockquoteRich(user) + chrome` from pulling chrome into the
	// quote via CM §5.2 lazy continuation.
	cases := []struct{ name, in, want string }{
		{"plain text", "hello world\n", "\n> hello world\n\n"},
		{"atx heading preserved", "# heading\n", "\n> # heading\n\n"},
		{"nested blockquote", "> nested\n", "\n> > nested\n\n"},
		{"list item preserved", "- item\n", "\n> - item\n\n"},
		{"ordered list preserved", "1. item\n", "\n> 1. item\n\n"},
		{"first-line setext escaped", "---\n", "\n> \\---\n\n"},
		{"deeper setext preserved", "text\n===\n", "\n> text\n> ===\n\n"},
		{"thematic break asterisks preserved", "***\n", "\n> ***\n\n"},
		// Realm-binding defenses still on.
		// NOTE(review): the "gno extension escaped" row contains no
		// extension markup; its literal looks like it was lost —
		// confirm against version control.
		{"gno extension escaped", "\n", "\n> \\\n\n"},
		{"ref-link use escaped", "[t][l]\n", "\n> \\[t\\]\\[l\\]\n\n"},
		{"footnote escaped", "[^name]\n", "\n> \\[^name\\]\n\n"},
		// LRDs are stripped by BlockRich entirely; trimming the
		// resulting bare "\n" leaves empty input and the helper
		// returns "" without emitting a blockquote.
		{"lrd alone strips to empty", "[x]: y\n", ""},
		// Code fence autoclose at EOF lands inside the quote.
		{"unclosed fence autoclosed", "```\ncode\n", "\n> ```\n> code\n> ```\n\n"},
		// Multi-paragraph preserves the inner blank line (rendered
		// as a quoted blank line `> \n`).
		{"multi-paragraph preserves blank", "a\n\nb\n", "\n> a\n> \n> b\n\n"},
		// Empty / blank-only input collapses cleanly to "".
		{"empty input", "", ""},
		// CRLF normalized through BlockRich.
		{"crlf normalized", "a\r\nb\n", "\n> a\n> b\n\n"},
		// NUL replaced with U+FFFD by BlockRich.
		{"nul replaced", "x\x00y\n", "\n> x\uFFFDy\n\n"},
		// Multiple trailing newlines collapse to the single trailing
		// blank line shape — output never ends with more than `\n\n`.
		{"multiple trailing newlines collapse", "foo\n\n\n", "\n> foo\n\n"},
		// Internal tabs are preserved (BlockRich does not touch them).
		{"internal tab preserved", "a\tb\n", "\n> a\tb\n\n"},
		// Whitespace-only input still yields a blockquote (the user
		// wrote literal spaces). The line-prefix loop emits `> ` plus
		// the original two spaces, so the quoted line is `>` followed
		// by three spaces.
		{"whitespace-only input", "  ", "\n>   \n\n"},
	}
	for _, c := range cases {
		got := BlockquoteRich(c.in)
		if got != c.want {
			t.Errorf("%s: BlockquoteRich(%q) = %q, want %q", c.name, c.in, got, c.want)
		}
	}
}
// TestBlockquoteRich_DoubleWrapNestsQuote documents that
// BlockquoteRich is NOT idempotent: a second application nests the
// quote one level deeper. The aggressive TrimRight in BlockquoteRich
// also collapses the inner trailing blank line, so the result is a
// clean `> > foo` (no quoted blank `> ` in the middle) followed by
// the outer trailing blank line.
func TestBlockquoteRich_DoubleWrapNestsQuote(t *testing.T) {
	const (
		in   = "foo\n"
		want = "\n> > foo\n\n"
	)
	inner := BlockquoteRich(in)
	outer := BlockquoteRich(inner)
	if outer == want {
		return
	}
	t.Errorf("BlockquoteRich(BlockquoteRich(%q)) = %q, want %q", in, outer, want)
}
// TestBlockquoteRich_LeadingNewlineGuaranteed checks that every
// non-empty BlockquoteRich result begins with "\n", so a realm
// concatenation such as `chrome + BlockquoteRich(user)` never leaves
// the `>` marker in the middle of a line.
func TestBlockquoteRich_LeadingNewlineGuaranteed(t *testing.T) {
	inputs := []string{"x", "x\n", "# h", "- i\n- j"}
	for _, in := range inputs {
		got := BlockquoteRich(in)
		// Empty results are exempt; anything else must start with "\n".
		if got == "" || strings.HasPrefix(got, "\n") {
			continue
		}
		t.Errorf("BlockquoteRich(%q) = %q; expected leading '\\n'", in, got)
	}
}
// TestBlockquoteRich_TrailingBlankLineGuaranteed checks that every
// non-empty BlockquoteRich result ends with "\n\n", so a realm
// concatenation such as `BlockquoteRich(user) + "more text"` cannot
// drag "more text" into the quote through CM §5.2 lazy continuation.
func TestBlockquoteRich_TrailingBlankLineGuaranteed(t *testing.T) {
	inputs := []string{"x", "x\n", "# h", "- i\n- j", "para\n\nmore"}
	for _, in := range inputs {
		got := BlockquoteRich(in)
		// Empty results are exempt; anything else must have the suffix.
		if got == "" || strings.HasSuffix(got, "\n\n") {
			continue
		}
		t.Errorf("BlockquoteRich(%q) = %q; expected trailing '\\n\\n'", in, got)
	}
}
// TestBlockquoteRich_NoStrayEmptyQuotedLine guards against
// BlockRich's own leading "\n" surviving into the line-prefix step,
// which would make the output open with a useless empty quoted line
// (`> \n`).
func TestBlockquoteRich_NoStrayEmptyQuotedLine(t *testing.T) {
	// Shape the output would take if the leading newline leaked through.
	const leaked = "\n> \n"
	got := BlockquoteRich("foo\n")
	if !strings.HasPrefix(got, leaked) {
		return
	}
	t.Errorf("BlockquoteRich leaked BlockRich's leading '\\n' as `> \\n`: %q", got)
}
// TestNeuterLeadingSetextIfQualifying pins neuterLeadingSetextIfQualifying
// for a table of named inputs: a setext underline (`===` or `---`) on
// the first non-blank line gets a backslash in front of it, while
// everything else in the table is expected back unchanged.
func TestNeuterLeadingSetextIfQualifying(t *testing.T) {
	cases := []struct{ name, in, want string }{
		{"leading-setext-h1", "===\nbody\n", "\\===\nbody\n"},
		{"leading-setext-h2", "---\nbody\n", "\\---\nbody\n"},
		{"leading-setext-indented", " ===\nbody\n", " \\===\nbody\n"},
		// Four spaces of indentation make the line indented code, not
		// a setext underline, so it must come back unchanged. The
		// input needs its full four-space indent to be distinct from
		// the row above.
		{"leading-setext-indented-4", "    ===\nbody\n", "    ===\nbody\n"},
		{"leading-setext-trailing-ws", "=== \nbody\n", "\\=== \nbody\n"},
		// Mixed `=`/`-` characters are not a setext underline.
		{"leading-setext-mixed-chars", "=-=-\nbody\n", "=-=-\nbody\n"},
		// Underline characters followed by other content do not qualify.
		{"leading-setext-has-content", "=== text\nbody\n", "=== text\nbody\n"},
		{"setext-deeper-untouched", "title\n===\nfoo\n", "title\n===\nfoo\n"},
		// Thematic breaks are not setext underlines and stay untouched.
		{"leading-thematic-asterisk", "***\nbody\n", "***\nbody\n"},
		{"leading-thematic-underscore", "___\nbody\n", "___\nbody\n"},
		{"leading-blank-then-setext", "\n===\nbody\n", "\n\\===\nbody\n"},
		{"leading-blank-ws-then-setext", " \n===\nbody\n", " \n\\===\nbody\n"},
		// Text before `===` means the heading is user-authored.
		{"text-first", "hello\n===\n", "hello\n===\n"},
		{"empty", "", ""},
		{"all-blank", " \n\n", " \n\n"},
		{"only-equals", "===", "\\==="},
	}
	for _, c := range cases {
		if got := neuterLeadingSetextIfQualifying(c.in); got != c.want {
			t.Errorf("%s: neuterLeadingSetextIfQualifying(%q) = %q, want %q", c.name, c.in, got, c.want)
		}
	}
}
// TestLinkTitle pins LinkTitle's output for a table of inputs: double
// and single quotes gain a backslash, and a newline becomes a space.
func TestLinkTitle(t *testing.T) {
	type row struct{ in, want string }
	table := []row{
		{`he said "hi"`, `he said \"hi\"`},
		{`it's nice`, `it\'s nice`},
		{"line1\nline2", "line1 line2"},
	}
	for i := range table {
		in, want := table[i].in, table[i].want
		got := LinkTitle(in)
		if got == want {
			continue
		}
		t.Errorf("LinkTitle(%q) = %q, want %q", in, got, want)
	}
}
// TestTableCell pins TableCell's output for a table of inputs: `|`
// and `*` gain a backslash, and a tab becomes a space.
func TestTableCell(t *testing.T) {
	type row struct{ in, want string }
	table := []row{
		{"plain cell", "plain cell"},
		{"a|b", `a\|b`},
		{"a\tb", "a b"},
		{"a*b|c", `a\*b\|c`},
	}
	for i := range table {
		in, want := table[i].in, table[i].want
		got := TableCell(in)
		if got == want {
			continue
		}
		t.Errorf("TableCell(%q) = %q, want %q", in, got, want)
	}
}
func TestHTMLEscape(t *testing.T) {
cases := []struct{ in, want string }{
{"plain", "plain"},
{"