sanitize.gno

75.71 Kb · 1753 lines
   1// Package sanitize provides input-cleaning primitives and safe-emit
   2// builders for each markdown lexical slot. Realm authors wrap user-
   3// supplied strings with these helpers before flowing them into rendered
   4// markdown output. Each helper targets one specific slot (link text,
   5// heading text, URL href, table cell, HTML attribute, fenced code block,
   6// blockquote, footnote definition, link-reference definition, etc.) and
   7// neutralizes the bytes that would otherwise let user content break out
   8// of that slot or inject new top-level structure.
   9//
  10// Pick the right helper from the table under "Picking the right helper"
  11// below, then wrap each user-supplied argument exactly once at the call
  12// site (see "The audit rule").
  13//
  14// # Wrap once
  15//
  16// Most escapers and safe-emit builders in this package are NOT
  17// idempotent — applying them twice re-escapes bytes the first pass
  18// added (`\*` becomes `\\\*`, `&amp;` becomes `&amp;amp;`, a fenced
  19// block gets re-fenced). Wrap each user-derived string with at most
  20// one sanitize.* call. Block and BlockRich are exceptions —
  21// idempotent by design — but the at-most-once rule is still the
  22// safest default. See the "Idempotence classes" enumeration below
  23// for the full breakdown.
  24//
  25// Some markdown-builder packages (e.g. p/moul/md) sanitize the args of
  26// specific helpers internally — see each builder's package doc for the
  27// per-helper contract. If the builder sanitizes for you, pass the raw
  28// user input; if it doesn't, wrap the input with the right sanitize.*
  29// helper at the call site.
  30//
  31// # Picking the right helper
  32//
  33// Match the helper to the slot the user content lands in:
  34//
  35//	slot                                       helper
  36//	-------------------------------------------------------------
  37//	[text](url)                                InlineText (text)
  38//	# Heading text                             InlineText
  39//	**bold** _italic_                          InlineText
  40//	![alt](src)                                InlineText (alt)
  41//	> [!NOTE] one-line title                   InlineText
  42//	multi-paragraph post body                  Block
  43//	multi-paragraph post body w/ rich block    BlockRich
  44//	  structure (headings, lists, tables, etc.)
  45//	multi-line blockquote (`> ` prefixed)      Blockquote
  46//	multi-line blockquote w/ rich block body   BlockquoteRich
  47//	[text](url "title")                        LinkTitle (title)
  48//	| cell |                                   TableCell
  49//	<gno-card caption="X">                     HTMLEscape
  50//	<h5>X</h5>                                 HTMLEscape
  51//	any URL going into ](X)                    URL
  52//	any image src going into (X)               ImageURL
  53//	`inline code` inside running prose         InlineCode
  54//	multi-line fenced code block               CodeBlock
  55//	multi-line fenced code with language tag   LanguageCodeBlock
  56//	[^name]: footnote body                     FootnoteDefinition
  57//	[label]: url "title" reference def         LinkReferenceDefinition
  58//	r/sys/users handle                         UserName       (validator)
  59//	g1.../gpub1... etc.                        BechString     (validator)
  60//	footnote / LRD label / {#id} anchor name   FootnoteLabel  (validator)
  61//	fenced-code language tag                   LanguageName   (validator)
  62//	prefix arg to md.Nested                    NestedPrefix   (validator)
  63//
  64// # Invariants
  65//
  66// All helpers in this package are panic-free for any string input and
  67// run in O(len(input)) time with bounded allocation.
  68//
  69// Idempotence classes:
  70//
  71//	Idempotent (calling twice == calling once):
  72//	  StripBidiAndZeroWidth, NormalizeBreaks
  73//	  UserName, BechString, FootnoteLabel, LanguageName, NestedPrefix
  74//	  URL, ImageURL              (accept→identity; reject→"")
  75//	  Block                      (bracket walker treats \[/\] as ordinary;
  76//	                              line-leader escapes don't re-fire on
  77//	                              already-escaped `\#` etc.)
  78//	  BlockRich                  (TrimLeft/TrimRight + "\n\n" wrap is stable)
  79//
  80//	NOT idempotent — never wrap an already-sanitized string:
  81//	  InlineText, LinkTitle, TableCell   (re-escape backslashes)
  82//	  HTMLEscape                         (re-escapes `&` → `&amp;`)
  83//	  Blockquote, BlockquoteRich         (re-prefixes `> `, nesting the quote each pass)
  84//	  InlineCode, CodeBlock,
  85//	  LanguageCodeBlock                  (wrap with a fence — calling twice double-wraps)
  86//	  FootnoteDefinition,
  87//	  LinkReferenceDefinition            (compose Block/InlineText/URL internally —
  88//	                                      passing already-sanitized strings double-escapes)
  89//
  90//	CodeFence is pure: same inputs always give the same output.
  91//
  92// Validators (UserName / BechString / FootnoteLabel / LanguageName /
  93// NestedPrefix) return either the cleaned input verbatim or "". They
  94// never partially-sanitize: if the input doesn't match the slot's
  95// charset/shape, the answer is rejection.
  96//
  97// # Composition rules
  98//
  99// Direct sanitize use (when emitting markdown without a builder package):
 100//
 101//	out := "# " + sanitize.InlineText(userTitle) + "\n\n" +
 102//	       sanitize.Block(userBody)
 103//	out += sanitize.Blockquote(userQuote)
 104//	out += sanitize.LanguageCodeBlock(realmLang, userCode)
 105//
 106// Use with a builder package (e.g. p/moul/md): pass raw user input to
 107// the builder helpers that sanitize internally — do NOT pre-wrap with
 108// sanitize.*, or the input gets double-escaped (escapers are not
 109// idempotent). See the builder's package doc for the per-helper
 110// contract. For example, with p/moul/md:
 111//
 112//	md.Blockquote(userProse)                 // good — md.Blockquote sanitizes
 113//	md.LanguageCodeBlock(realmLang, userCode) // good — sanitizes both args
 114//	md.Link(userText, userURL)               // good — sanitizes both slots
 115//
 116//	md.Blockquote(sanitize.Block(userProse))         // BAD: double-wrap
 117//	md.Link(sanitize.InlineText(t), sanitize.URL(u)) // BAD: double-wrap
 118//
 119// Wrong (across all callers):
 120//
 121//	sanitize.InlineText(sanitize.InlineText(s))   double-wrap (re-escape)
 122//	sanitize.TableCell(sanitize.InlineText(s))    TableCell already calls InlineText
 123//	sanitize.URL(sanitize.InlineText(href))       inline-escape backslash-escapes `.` `-` `_`
 124//	                                              inside the URL, corrupting the host/path
 125//	sanitize.Blockquote(sanitize.Blockquote(s))   double-wrap — outer would escape the
 126//	                                              inner `> ` prefixes
 127//	sanitize.Block(sanitize.BlockRich(s))         double-sanitize — strict Block re-escapes
 128//	                                              the markers BlockRich preserved (headings,
 129//	                                              lists, tables); BlockRich's rich structure
 130//	                                              renders as literal text after Block escapes
 131//	                                              its line-leaders
 132//	sanitize.BlockRich(sanitize.Block(s))         pointless double-sanitize — Block already
 133//	                                              escaped every line-leader to `\#`/`\>`/etc.;
 134//	                                              BlockRich preserves the backslash escapes
 135//	                                              as visible artifacts in user prose
 136//	sanitize.Blockquote(sanitize.BlockRich(s))    double-sanitize — Blockquote's Block step
 137//	                                              re-escapes the markers BlockRich preserved
 138//	sanitize.BlockRich(sanitize.Blockquote(s))    nonsense — Blockquote already line-prefixed
 139//	                                              with `> `; BlockRich expects raw user content
 140//	sanitize.BlockquoteRich(sanitize.BlockRich(s)) double-wrap — Rich + Rich nests twice
 141//	sanitize.BlockRich(sanitize.TableCell(s))     wrong slot — use TableCell for cell content,
 142//	                                              BlockRich for multi-paragraph block content
 143//	sanitize.TableCell(multiParagraphProse)       newlines fold to space silently; use a
 144//	                                              non-table layout for multi-paragraph text
 145//
 146// # Threat model
 147//
 148// Sanitizers in this package defend against:
 149//
 150//   - bidi/zero-width injection: invisible characters that make
 151//     displayed text disagree with stored bytes (e.g. an address `g1abc...`
 152//     that renders as `g1xyz...`, or a username that visually collides
 153//     with another). Stripped by StripBidiAndZeroWidth, which runs as
 154//     the first step of every text-shaped helper.
 155//   - line-ending homoglyphs: CR-only and Unicode separators
 156//     (U+0085 NEL, U+2028, U+2029) that some renderers treat as line
 157//     breaks. Folded uniformly.
 158//   - markdown-structure injection: user content opening a heading,
 159//     blockquote, list, code fence, link-reference def, setext underline,
 160//     gnoweb extension delimiter, or GFM table row at document level.
 161//     Strict Block escapes the line-leading `|` of any GFM table row so
 162//     user content cannot inject `<table>`-shaped structure; permissive
 163//     BlockRich preserves table rows so authors can compose `<table>`
 164//     elements (gnoweb loads extension.Table per render_config.go).
 165//   - HTML block type 1-5 absorption: CommonMark §4.6 HTML block types 1
 166//     (`<script>`, `<pre>`, `<style>`, `<textarea>`), 2 (`<!--`), 3
 167//     (`<?`), 4 (`<!UPPER`), and 5 (`<![CDATA[`) do NOT close on a blank
 168//     line — they only close on a type-specific token (`</tag>`, `-->`,
 169//     `?>`, `>`, `]]>`) or EOF. Without a defense, user content opening
 170//     any of these would swallow realm chrome appended afterward. Both
 171//     Block and BlockRich line-escape the openers (prepend `\`) so the
 172//     block never opens; this defense is unconditional in both modes.
 173//     Types 6 and 7 close on a blank line, so BlockRich's `\n\n`
 174//     paragraph envelope already bounds them and no escape is needed.
 175//   - realm-discipline boundary (caller's responsibility, not enforced):
 176//     callers should emit realm chrome at flush-left column 0 around
 177//     `BlockRich(user)`. Indented chrome (4+ leading spaces, list-item
 178//     continuations, footnote-definition body, or an unclosed Type 1
 179//     HTML tag in realm chrome before the call) can extend across blank
 180//     lines into user content or vice versa. The sanitizer cannot
 181//     defend against malformed realm chrome — only against user input.
 182//   - footnote / link-reference namespace pollution: user content
 183//     containing `[^name]` or `[text][label]` syntax that would otherwise
 184//     resolve against realm-defined footnote definitions or link
 185//     reference definitions elsewhere on the page. Block escapes the
 186//     opening `[` in both shapes.
 187//   - reference-link / footnote-ref / shortcut-ref collisions:
 188//     `[text][label]`, `[^name]`, and bare `[label]` shortcut forms
 189//     are ALL neutralized by Block's bracket walk, which preserves
 190//     only inline `[text](url)` and `![alt](src)` syntax — everything
 191//     else has both `[` and `]` backslash-escaped, so the parser sees
 192//     literal text and can't resolve against realm-defined LRDs or
 193//     footnote definitions.
 194//   - multi-line LRD evasion: Block's walker recognises `[lab\nel]: url`
 195//     across newlines (single `\n` OK, blank line aborts) and strips
 196//     the whole region. `\]` inside the label is honored as an escaped
 197//     literal, so `[label\]: url` is NOT treated as an LRD (renders as
 198//     literal text).
 199//   - URL scheme abuse: javascript:, data:text/html, vbscript:, blob:,
 200//     protocol-relative //, mailto: with prefill phishing parameters.
 201//     Allowlist-only (URL / ImageURL).
 202//   - HTML attribute / element breakout: `"`, `<`, `>`, `&`, `'` inside
 203//     HTML lexical slots. Handled by HTMLEscape.
 204//   - CommonMark §2.3 NUL: replaced with U+FFFD by Block, InlineText,
 205//     LinkTitle, TableCell, HTMLEscape, InlineCode, CodeBlock, and
 206//     LanguageCodeBlock.
//   - code-fence leakage: a user-opened code fence (e.g. three backticks) that runs to EOF
 208//     with no closing fence, which would otherwise swallow every realm-
 209//     emitted line that follows. Block auto-closes any open fence at EOF.
 210//   - table-alignment drift: tabs inside table cells expanding to variable
 211//     widths (1-4 spaces depending on column position) and shifting cell
 212//     boundaries unpredictably. TableCell replaces tabs with single spaces.
 213//
 214// What this package does NOT do:
 215//
 216//   - It does not store state. Every helper is a pure function.
 217//   - It does not validate semantic correctness. sanitize.URL accepts
 218//     a syntactically valid https:// URL even if the host is malicious;
 219//     URL reputation is a separate layer.
 220//   - It does not enforce CSS containment. ImageURL admits data:image/*
 221//     URIs on the assumption that the deploying gnoweb instance caps
 222//     rendered image dimensions via CSS. Without that cap, a malicious
 223//     image can blow out the page layout or exhaust memory.
 224//   - It does not perform structural sandboxing of foreign markdown.
 225//     If a realm concatenates an opaque markdown blob returned from a
 226//     polymorphic interface (`someThing.Render()`), it needs a structural
 227//     sandbox primitive (e.g. a `<gno-card>` extension), not just leaf
 228//     sanitization.
 229//
 230// # When to use Block vs BlockRich
 231//
 232// Both are safe sanitizers; both run identical realm-binding defenses.
 233// They differ in what user-authored block structure survives:
 234//
 235//   - Block — paragraph-shaped only. Escapes `#`, `>`, list markers,
 236//     `---`/`***`/`___` thematic breaks, and `===`/`---` setext
//     underlines. Use for leaf slots — footnote definition bodies,
//     blockquote bodies (Blockquote uses Block), single-paragraph
//     prose, any slot where richer structure has no benefit or where
//     richer structure could visually impersonate realm chrome. Table
//     cells are NOT a Block slot: Block's "\n\n" envelope would break
//     the row, so use TableCell for cell content.
 241//
 242//   - BlockRich — full-richness. Preserves user-authored headings,
 243//     lists, quotes, HR, setext. Use for user content the realm intends
 244//     to compose with full block-level structure, typically inside a
 245//     sandbox container (`<gno-card>`, `<gno-foreign>`) or a CSS-demoted
 246//     region. BlockRich's qualifying-setext defense prevents the
 247//     cross-boundary attack (user content reaching back to promote
 248//     realm chrome to a heading), but inner-heading visual containment
 249//     is the realm's CSS responsibility. gnoweb does not yet ship CSS
 250//     rules that demote headings inside sandbox containers — until they
 251//     land, BlockRich + sandbox renders inner headings at literal size.
 252//
 253// Do NOT compose Block and BlockRich in either direction. Pick one
 254// helper at the right level.
 255//
 256// # Extending
 257//
 258// A new helper added to this package MUST:
 259//
 260//  1. Be panic-free for any string input.
 261//  2. Strip bidi+zero-width before any other transform (so display
 262//     equals storage end-to-end).
 263//  3. Declare its idempotence class in the table above.
 264//  4. Document the markdown / HTML lexical slot it targets.
 265//  5. Reject rather than partially-sanitize when input is structurally
 266//     invalid (return "" — never half-process an address or URL).
 267//  6. Pick exactly one of the two return-value contracts and stick to
 268//     it: escapers always return a transformed string and never reject
 269//     (any input is OK — the transformation makes it safe); validators
 270//     return the cleaned input verbatim on accept or "" on reject and
 271//     never half-process. Mixing the contracts within one helper is a
 272//     bug — callers can't reason about whether "" means "input was
 273//     already empty" or "input was rejected".
 274package sanitize
 275
 276import (
 277	"chain/markdown"
 278	"html"
 279	"strings"
 280)
 281
 282// ----- Re-exports of the public chain/markdown natives -----
 283//
 284// These are general-purpose data-hygiene primitives, not markdown-specific.
 285// The other helpers in this package call them internally, so realms emitting
 286// markdown rarely need to call them directly. Reach for these when you have
 287// a non-markdown use case — e.g. normalizing a username before storage,
 288// canonicalizing a search query, or stripping invisible characters from
 289// any user string that will be displayed or compared.
 290
 291// StripBidiAndZeroWidth removes Unicode bidi controls and zero-width
 292// characters (U+200B-D, U+200E-F, U+202A-E, U+2066-9, U+FEFF) from s.
 293// Use it when storing or comparing user-supplied strings outside of a
 294// markdown context — for example, before saving a display name to state,
 295// or before hashing a search query. Idempotent: calling twice gives the
 296// same result.
 297//
 298// Thin wrapper over chain/markdown.StripBidiAndZeroWidth.
 299func StripBidiAndZeroWidth(s string) string {
 300	return markdown.StripBidiAndZeroWidth(s)
 301}
 302
 303// NormalizeBreaks unifies CR-LF and lone CR to LF (CommonMark §2.2 line
 304// endings only — does NOT touch U+2028/U+2029). Use it when comparing
 305// or hashing user input that may have been authored on different
 306// platforms (Windows CRLF vs. Unix LF), so equivalent strings normalize
 307// to the same bytes. Idempotent.
 308//
 309// Thin wrapper over chain/markdown.NormalizeBreaks.
 310func NormalizeBreaks(s string) string {
 311	return markdown.NormalizeBreaks(s)
 312}
 313
 314// ----- Escapers -----
 315
 316// InlineText prepares an arbitrary user string for an INLINE markdown
 317// slot — anywhere the rendered output stays on a single line and lives
 318// inside a larger markdown construct.
 319//
 320// Use for:
 321//   - link text:        [InlineText(label)](url)
 322//   - heading text:     # InlineText(title)
 323//   - bold/italic body: **InlineText(name)**
 324//   - image alt text:   ![InlineText(alt)](src)
 325//   - single-line block-context slots:
 326//     > [!NOTE] InlineText(title)
 327//     > Author: InlineText(name)
 328//
 329// Multi-paragraph prose belongs in Block, not InlineText. InlineText
 330// folds every newline to a single space (so paragraph structure is
 331// erased) and escapes inline-active CommonMark punctuation:
 332//
 333//	\ * _ [ ] ( ) ~ > - + . ! ` # < &
 334//
 335// Two characters are intentionally NOT escaped:
 336//
 337//   - `|` — only meaningful in GFM table rows. Leaving it literal here
 338//     lets TableCell (which calls InlineText then escapes `|` itself)
 339//     avoid double-escaping pipes into `\\|`.
 340//   - `=` — only meaningful as a setext heading underline, which is a
 341//     line-level construct. Escaping `=` inline would mangle expressions
 342//     like `x = 1` for no benefit.
 343//
 344// Not idempotent (see package doc).
 345func InlineText(s string) string {
 346	s = markdown.StripBidiAndZeroWidth(s)
 347	s = markdown.NormalizeBreaks(s)       // CM §2.2 \r\n / \r → \n
 348	s = foldNewlinesAndSeparators(s, ' ') // \n + NEL + U+2028/U+2029 → space
 349	return markdown.EscapeInline(s)
 350}
 351
 352// Block prepares user content for a top-level BLOCK markdown context
 353// where paragraphs, line breaks, code blocks, and other block structure
 354// should survive — but where the content must NOT be able to inject
 355// new top-level constructs (headings, lists, blockquotes,
 356// link-reference definitions, setext underlines, gnoweb extension
 357// delimiters, GFM table rows).
 358//
 359// Output shape: every non-empty result begins AND ends with "\n\n" —
 360// CM §4.8 blank lines on both sides — so user content is guaranteed
 361// to occupy its own paragraph(s), isolated from any realm chrome that
 362// precedes OR follows it. This bounds CM §4.6 HTML block types 6 and
 363// 7 (`<div>`, `<table>`, `<form>`, arbitrary `<foo>` tags) which
 364// close on a blank line and are NOT escaped in any mode, and it
 365// defeats first-line setext promotion (`===`/`---`) that strict-mode
 366// escapes miss when the previous line is blank in the user input but
 367// non-blank in the concatenated realm output. Empty input (or input
 368// that strips entirely, e.g. a lone LRD) returns "" — no envelope is
 369// emitted.
 370//
 371// Use for any multi-paragraph user-supplied prose that the realm
 372// concatenates into its rendered output:
 373//   - post bodies, comments, replies
 374//   - profile bios, About sections
 375//   - proposal descriptions, governance motions
 376//   - changelog entries, release notes
 377//
 378// What Block does with each kind of attacker input:
 379//
 380//	User attempt                                          | Block's response
 381//	------------------------------------------------------|----------------------------------------------------
 382//	  --- preserved verbatim ---                          |
 383//	[text](url) inline link, ![alt](src) image            | preserved verbatim
 384//	------------------------------------------------------|----------------------------------------------------
 385//	  --- escaped / stripped / folded ---                 |
 386//	# heading at line-start                               | escaped → literal `# heading`
 387//	> quoted at line-start                                | escaped → literal `>`
 388//	- item, * item, + item, 1. item at line-start         | escaped
 389//	---, ***, ___ (3+) at line-start                      | escaped
 390//	=== or --- on its own line after non-blank text       | escaped (no setext promotion of the line above)
 391//	<gno-card>, <gno-columns>, any <gno-…>/</gno-…> at    | escaped (wildcard match) → literal text
 392//	  line-start                                          |
 393//	| a | b | GFM table row (line-leading `|`)            | escaped → literal `| a | b |`
 394//	<!--, <script>, <pre>, <style>, <textarea>, <?…?>,    | escaped (\<…) → literal text;
 395//	  <!DOCTYPE…>, <![CDATA[…]]> at line-start            |   blocks goldmark from opening a
 396//	  (CM §4.6 HTML block types 1-5)                      |   blank-line-NON-terminating HTML block
 397//	[text][realm-label] ref-link USE                      | both bracket pairs escaped → \[text\]\[realm-label\]
 398//	[^name] footnote-ref                                  | both brackets escaped → \[^name\]
 399//	[label] bare shortcut-ref                             | both brackets escaped → \[label\]
 400//	[label]: url link-reference definition                | whole region stripped (incl. multi-line label
 401//	  (incl. [lab\nel]: url multi-line)                   |   `[lab\nel]: url` and any title continuation)
 402//	[label\]: url (backslash-escaped `]`)                 | NOT stripped; brackets escaped → paragraph text
 403//	code fence opened without close                       | autoclosed at end of input
 404//	NUL byte (\x00)                                       | replaced with U+FFFD
 405//	U+2028 / U+2029 / U+0085 (NEL)                        | folded to `\n`
 406//	bidi/zero-width controls                              | stripped
 407//
 408// COMPOSITION GOTCHA: Block's EOF fence-autoclose appends a final
 409// fence line. If you wrap Block's output with a line-prefixing
 410// builder like md.Blockquote (which prepends `> ` per line) or
 411// md.Nested, that closing fence becomes a prefixed line. The output
 412// is still safe (the fence still closes correctly) but may render
 413// awkwardly. If pixel-perfect output matters, strip a trailing blank
 414// fence line after Block.
 415//
 416// Why backslash and not a space for `<gno-…>` lines: gnoweb's
 417// extension parsers call `util.TrimLeftSpace` on the line before tag
 418// matching, which would strip a leading space and let the tag match
 419// anyway. A leading `\` survives the trim (only ASCII whitespace +
 420// form-feed are stripped) and is consumed by the inline escape phase
 421// before Type-7 HTML block detection can fire (Type-7 requires the
 422// first non-whitespace char to be `<`).
 423//
 424// Inline emphasis, code spans, inline links, and soft line breaks
 425// within a paragraph are PRESERVED — users can format. Pipes that
 426// are NOT at line-start stay literal so prose can still write things
 427// like `a | b`.
 428//
 429// Idempotent: Block(Block(s)) is byte-identical to Block(s). The
 430// bracket walker strips LRDs on the first pass; remaining `[`/`]`
 431// outside inline-link/image spans are escaped to `\[`/`\]`, and
 432// already-escaped brackets are preserved on subsequent passes
 433// (pass-2 backslash-parity tracking). Still, wrap each user-supplied
 434// string exactly once — chained sanitization adds no value and
 435// burns gas.
 436func Block(s string) string {
 437	s = markdown.NormalizeBreaks(s)
 438	s = markdown.StripBidiAndZeroWidth(s)
 439	s = replaceNULWithFFFD(s)
 440	s = markdown.EscapeBlockHazards(s)
 441	// Symmetric "\n\n" envelope — same pattern BlockRich uses for the
 442	// same reasons (see BlockRich docstring "Cross-paragraph safety").
 443	// Strict mode escapes most line-leading hazards (setext, GFM table
 444	// row, CM §4.6 HTML types 1-5, list/heading/HR markers), but two
 445	// hazards remain that only a blank-line break can close:
 446	//
 447	//   - CM §4.6 HTML block types 6 and 7 (`<div>`, `<table>`,
 448	//     `<form>`, arbitrary `<foo>` tags) are NOT escaped in any mode
 449	//     — they close on a blank line per CM. Without a trailing
 450	//     "\n\n", a `<div>` at the end of user content extends into
 451	//     appended realm chrome.
 452	//
 453	//   - First-line setext: strict mode's setext escape only fires
 454	//     when the previous line is non-blank IN THE USER'S INPUT.
 455	//     A user whose first line is `===` slips past, and concatenated
 456	//     after `chrome\n` would promote chrome to H1. The leading
 457	//     "\n\n" forces a paragraph break so chrome cannot be merged.
 458	//
 459	// TrimLeft/TrimRight + fixed wrap is idempotent: Block(Block(s)) is
 460	// byte-identical to Block(s). Empty post-escape result short-
 461	// circuits to "" so realm concatenation doesn't leak stray blank
 462	// lines for trivially empty inputs (e.g. lone LRD that strips
 463	// entirely).
 464	s = strings.TrimLeft(s, "\n")
 465	s = strings.TrimRight(s, "\n")
 466	if s == "" {
 467		return ""
 468	}
 469	return "\n\n" + s + "\n\n"
 470}
 471
 472// BlockRich is the permissive counterpart of Block. Both are safe
 473// sanitizers — the distinction is what markdown structure survives:
 474//
 475//   - Block escapes line-leading block markers (`#`, `>`, `-`, `*`,
 476//     `+`, `1.`), thematic breaks (`---`/`***`/`___`), and setext
 477//     underlines (`===`/`---`). User content becomes paragraph-shaped.
 478//   - BlockRich PRESERVES all of those, so user content can compose
 479//     headings, lists, quotes, horizontal rules, and setext-styled
 480//     headings. Realm-binding defenses stay on (extension delimiters
 481//     `<gno-…>`, the bracket walker for link / LRD / ref /
 482//     footnote / shortcut, fence autoclose, NUL / bidi /
 483//     Unicode-separator folding). GFM table-row openers are
 484//     PRESERVED (see "Tables" below).
 485//
 486// Cross-paragraph safety: BlockRich's output begins with "\n\n"
 487// AND ends with "\n\n" — CM §4.8 blank lines on both sides — so
 488// user content is guaranteed to occupy its own paragraph(s),
 489// isolated from anything the realm emits before OR after. Symmetric
 490// isolation closes four distinct attacks:
 491//
 492//   - Cross-paragraph setext promotion (backward). User content
 493//     `body\n===\nmore` concatenated after realm chrome (no
 494//     trailing `\n`) would, without paragraph isolation, place
 495//     "chrome\nbody" in one paragraph; the `===` setext underline
 496//     would then promote that merged paragraph to H1, hijacking
 497//     realm chrome. The leading "\n\n" forces a paragraph break.
 498//
 499//   - Cross-paragraph GFM table promotion (backward). User content
 500//     beginning with `|---|---|` (a table delimiter row) would,
 501//     without a blank-line break, retroactively turn the preceding
 502//     realm line into a `<thead>`. Paragraph isolation prevents
 503//     the table-detection scan from crossing the boundary.
 504//
 505//   - Cross-paragraph GFM table promotion (forward). Realm chrome
 506//     appended immediately after BlockRich(user) that begins with
 507//     `|---|` would, without a trailing blank line, extend user's
 508//     last line into a table header and pull realm chrome into the
 509//     body row. The trailing "\n\n" prevents the merge.
 510//
 511//   - Lazy paragraph continuation (forward). Paragraph-shaped realm
 512//     chrome appended immediately after BlockRich(user) would, via
 513//     CM §5.2, merge into user's trailing paragraph and inherit any
 514//     block-level decoration it carries.
 515//
 516// First-line qualifying-setext escape (the
 517// `neuterLeadingSetextIfQualifying` pre-pass) remains in place as
 518// belt-and-suspenders: if the first non-blank line of user input
 519// matches the CM §4.3 setext-underline pattern (run of `=` or `-`
 520// with 0-3 leading spaces and only trailing whitespace), BlockRich
 521// inserts `\` before the first `=`/`-`. This is redundant given
 522// paragraph isolation but harmless and inexpensive.
 523//
 524// # Tables
 525//
 526// BlockRich preserves line-leading `|` so user content can
 527// compose GFM tables:
 528//
 529//	| Header A | Header B |
 530//	|----------|----------|
 531//	| cell a   | cell b   |
 532//
 533// renders as a real `<table>` element. Strict Block continues to
 534// escape line-leading `|` (each row becomes literal `\| a | b |`
 535// text). When the realm authors the table itself and inserts user
 536// content into a specific cell, use TableCell — NOT BlockRich —
 537// to sanitize that cell value.
 538//
 539// What attacker input produces what (full table, same rows as Block
 540// except where marked CHANGED):
 541//
 542//	User attempt                                  | BlockRich response
 543//	----------------------------------------------|--------------------------------------------------
 544//	  --- preserved (compose freely) ---          |
 545//	# heading at line-start                       | preserved [CHANGED from Block]
 546//	> quoted at line-start                        | preserved [CHANGED]
 547//	- item, * item, + item, 1. item               | preserved [CHANGED]
 548//	---, ***, ___ thematic break                  | preserved [CHANGED]
 549//	=== or --- setext underline                   | preserved when preceded by user text;
 550//	                                              | escaped (\===/\---) if the first non-blank
 551//	                                              | line of input [CHANGED]
 552//	| a | b | GFM table row (line-leading |)      | preserved → renders as <table> when followed by
 553//	                                              | a delimiter row [CHANGED]
 554//	[text](url), ![alt](src)                      | preserved verbatim [SAME]
 555//	----------------------------------------------|--------------------------------------------------
 556//	  --- escaped / stripped / folded ---         |
 557//	<gno-card>, any <gno-…>/</gno-…> at line-start| escaped (wildcard match) [SAME]
 558//	<!--, <script>, <pre>, <style>, <textarea>,   | escaped (\<…) [SAME] — Types 1-5 don't close
 559//	  <?…?>, <!DOCTYPE…>, <![CDATA[…]]>           |   on blank lines, so `\n\n` envelope
 560//	  at line-start (CM §4.6 HTML block types 1-5)|   doesn't isolate them; explicit escape
 561//	[text][realm-label] ref-link USE              | both pairs escaped [SAME]
 562//	[^name] footnote-ref                          | both brackets escaped [SAME]
 563//	[label] bare shortcut-ref                     | both brackets escaped [SAME]
 564//	[label]: url link-reference definition        | whole region stripped [SAME]
 565//	[label\]: url (escaped `]`)                   | not stripped; brackets escaped [SAME]
 566//	code fence opened without close               | autoclosed at end of input [SAME]
 567//	NUL byte (\x00)                               | replaced with U+FFFD [SAME]
 568//	U+2028 / U+2029 / U+0085 (NEL)                | folded to `\n` [SAME]
 569//	bidi/zero-width controls                      | stripped [SAME]
 570//
 571// Use BlockRich for user content the realm intends to compose with
 572// full block-level richness — typically inside a sandbox container
 573// (`<gno-card>`, `<gno-foreign>`) or a CSS-demoted region where inner
 574// headings render visually distinct from realm chrome. The realm
 575// must own the visual containment: concatenating BlockRich's output
 576// directly into a top-level page still lets the user write `# heading`
 577// at document level. BlockRich's cross-boundary setext defense prevents
 578// the worst case (reaching backwards into realm bytes), but visual
 579// containment of inner headings is the realm's CSS responsibility.
 580// gnoweb does not yet ship CSS rules that demote inner headings inside
 581// `<gno-card>` / `<gno-foreign>` — until those rules land, realms
 582// using BlockRich + a sandbox should be aware that inner headings
 583// render at their literal level.
 584//
// Idempotent: BlockRich(BlockRich(s)) is byte-identical to
// BlockRich(s). The trim-then-wrap pattern strips every leading
// and trailing newline and reapplies exactly two on each side, so
// the envelope is stable across passes; the qualifying-setext escape is
 589// stable (a line beginning with `\` no longer matches the setext
 590// pattern); and the bracket walker treats already-escaped
 591// `\[`/`\]` as ordinary bytes. Empty input (or input that strips
 592// to empty, e.g. a lone link-reference definition) returns "" —
 593// realm concatenation doesn't get a stray blank line.
 594// Still, wrap each user-supplied string exactly once — chained
 595// sanitization adds no value and burns gas.
 596//
 597// Realm-discipline boundary: BlockRich defends user input against
 598// every cross-paragraph attack listed above, but it CANNOT defend
 599// against malformed REALM chrome. Specifically, callers should emit
 600// realm chrome at flush-left column 0 around `BlockRich(user)`. If
 601// the realm chrome BEFORE the call contains an unclosed CM §4.6
 602// Type 1 HTML tag (`<script>`, `<pre>`, `<style>`, `<textarea>`),
 603// the `\n\n` envelope does NOT close it (Type 1 closes only on the
 604// matching close tag), and user-controlled `</tag>` content can
 605// then prematurely terminate it. Indented chrome (4+ leading
 606// spaces, list-item continuations, footnote-definition body) can
 607// likewise extend across the envelope into user content. Keep
 608// chrome flush-left and Type 1 tags closed within the chrome.
 609//
 610// PREVIEW: BlockquoteRich is currently the only in-tree caller of
 611// BlockRich; the API and the `"\n\n"` output shape may evolve once
 612// direct callers emerge.
 613func BlockRich(s string) string {
 614	s = markdown.NormalizeBreaks(s)
 615	s = markdown.StripBidiAndZeroWidth(s)
 616	s = replaceNULWithFFFD(s)
 617	// Fold Unicode separators (U+2028, U+2029, U+0085 NEL) to '\n'
 618	// BEFORE the setext-qualifying check. The native
 619	// EscapeBlockHazardsRich also folds them internally, but the Gno
 620	// helper below needs to see the folded form to correctly identify
 621	// the first non-blank line — otherwise an attacker can hide the
 622	// `===` setext underline behind a U+2028 / U+2029 / U+0085 and
 623	// reach back to promote realm chrome above it.
 624	s = foldSeparatorsToNewline(s)
 625	s = neuterLeadingSetextIfQualifying(s)
 626	s = markdown.EscapeBlockHazardsRich(s)
 627	// Ensure the output BOTH starts AND ends with "\n\n" — CM §4.8
 628	// blank lines on each side — so user content is GUARANTEED to
 629	// occupy its own paragraph(s), isolated from anything the realm
 630	// emits before OR after. Symmetric isolation closes four attacks:
 631	//
 632	//  Backward (closed by leading "\n\n"):
 633	//   1. Deeper-setext: user content `body\n===\nmore` concatenated
 634	//      after realm chrome (no trailing `\n`) would otherwise place
 635	//      "chrome\nbody" in one paragraph; the `===` setext underline
 636	//      would then promote that merged paragraph to H1, hijacking
 637	//      realm chrome.
 638	//   2. GFM table-row promotion: user content beginning with
 639	//      `|---|---|` (a table delimiter row) would, without a blank-
 640	//      line break, retroactively promote the preceding realm line
 641	//      into a `<thead>` cell.
 642	//
 643	//  Forward (closed by trailing "\n\n"):
 644	//   3. GFM table-row promotion in reverse: realm appending its own
 645	//      chrome immediately after BlockRich(user), where chrome
 646	//      starts with `|---|`, would extend user's last line into a
 647	//      table header and pull realm chrome into the body row.
 648	//   4. Lazy paragraph continuation: realm appending paragraph-
 649	//      shaped chrome immediately after BlockRich(user) would, via
 650	//      CM §5.2 lazy-continuation, merge into user's trailing
 651	//      paragraph and inherit any block-level decoration it carries.
 652	//
 653	// `neuterLeadingSetextIfQualifying` above is now belt-and-
 654	// suspenders for the first-line setext case: even if the blank-
 655	// line guarantee were somehow defeated by an exotic CM consumer,
 656	// the first-line escape still blocks the simplest setext shape.
 657	//
 658	// Empty post-escape result short-circuits to "" so realm
 659	// concatenation doesn't leak stray blank lines for trivially empty
 660	// inputs (e.g. a lone link-reference definition that strips
 661	// entirely).
 662	//
 663	// Idempotency: TrimLeft and TrimRight strip ALL leading/trailing
 664	// "\n"s, then the wrap adds exactly two on each side. Stable
 665	// across passes.
 666	s = strings.TrimLeft(s, "\n")
 667	s = strings.TrimRight(s, "\n")
 668	if s == "" {
 669		return ""
 670	}
 671	return "\n\n" + s + "\n\n"
 672}
 673
 674// foldSeparatorsToNewline replaces U+0085 NEL (0xC2 0x85),
 675// U+2028 (0xE2 0x80 0xA8), and U+2029 (0xE2 0x80 0xA9) with '\n'.
 676// Leaves '\n' bytes alone. Used by BlockRich so the qualifying-setext
 677// pre-pass and the native both see the same line structure.
 678func foldSeparatorsToNewline(s string) string {
 679	// Cheap pre-check: only the 0xC2 / 0xE2 lead bytes can trigger.
 680	if !containsAnyByteForFold(s) {
 681		return s
 682	}
 683	out := make([]byte, 0, len(s))
 684	for i := 0; i < len(s); {
 685		c := s[i]
 686		if c == 0xC2 && i+1 < len(s) && s[i+1] == 0x85 {
 687			out = append(out, '\n')
 688			i += 2
 689			continue
 690		}
 691		if c == 0xE2 && i+2 < len(s) && s[i+1] == 0x80 && (s[i+2] == 0xA8 || s[i+2] == 0xA9) {
 692			out = append(out, '\n')
 693			i += 3
 694			continue
 695		}
 696		out = append(out, c)
 697		i++
 698	}
 699	return string(out)
 700}
 701
 702func containsAnyByteForFold(s string) bool {
 703	for i := 0; i < len(s); i++ {
 704		if s[i] == 0xC2 || s[i] == 0xE2 {
 705			return true
 706		}
 707	}
 708	return false
 709}
 710
 711// neuterLeadingSetextIfQualifying scans s for the first non-blank
 712// line. If that line matches the CommonMark §4.3 setext-underline
 713// pattern (0-3 leading spaces, then a run of all `=` or all `-`,
 714// then optional trailing whitespace, then `\n` or EOF), the function
 715// returns s with a `\` inserted before the first `=`/`-`. Otherwise
 716// returns s unchanged. The escape prevents a realm-emitted line above
 717// BlockRich's output from being retroactively promoted to a heading.
 718func neuterLeadingSetextIfQualifying(s string) string {
 719	pos := 0
 720	for pos < len(s) {
 721		// Walk to the first non-whitespace byte of the current line.
 722		lineStart := pos
 723		i := pos
 724		for i < len(s) && (s[i] == ' ' || s[i] == '\t') {
 725			i++
 726		}
 727		if i >= len(s) || s[i] == '\n' {
 728			// Blank line; advance to next line.
 729			if i >= len(s) {
 730				return s
 731			}
 732			pos = i + 1
 733			continue
 734		}
 735		// First non-blank line. Check setext-underline shape.
 736		if i-lineStart > 3 {
 737			return s // 4+ leading spaces = indented code, not setext
 738		}
 739		c := s[i]
 740		if c != '=' && c != '-' {
 741			return s // not a setext underline candidate
 742		}
 743		j := i + 1
 744		for j < len(s) && s[j] == c {
 745			j++
 746		}
 747		for j < len(s) && (s[j] == ' ' || s[j] == '\t') {
 748			j++
 749		}
 750		if j < len(s) && s[j] != '\n' {
 751			return s // mixed content on the line — not setext
 752		}
 753		return s[:i] + "\\" + s[i:]
 754	}
 755	return s
 756}
 757
 758// Blockquote wraps user content as a CommonMark blockquote: each line
 759// of the cleaned content gets a "> " prefix so the renderer displays
 760// it inside a `<blockquote>` element.
 761//
 762// Use for any multi-paragraph user-supplied text that the realm wants
 763// to render as a quotation: cited posts, attached responses, error
 764// snapshots that should visually stand out.
 765//
 766// The content is first cleaned by Block (bidi-strip, line-ending
 767// normalize, NUL→U+FFFD, bracket walker for link/image/LRD spans,
 768// block-marker escape, code-fence auto-close at EOF, Unicode-separator
 769// fold). Block's "\n\n" cross-paragraph envelope is then stripped —
 770// the `> ` marker creates the container boundary, so the envelope
 771// would only line-prefix to empty `> ` lines top and bottom — and
 772// every remaining line is prefixed with "> ". The user content can
 773// still use inline emphasis, code spans, and nested fenced code blocks
 774// inside the quote; what it cannot do is open new top-level structure
 775// (heading, list, blockquote, GFM table row, etc.) or escape the
 776// quote.
 777//
 778// Output shape — every non-empty result begins with "\n" and ends
 779// with "\n\n" (same shape as BlockquoteRich):
 780//
 781//   - Leading "\n" guarantees a clean blockquote opener even when the
 782//     realm concatenates `chrome + Blockquote(user)` without its own
 783//     newline separator.
 784//   - Trailing "\n\n" (blank line) cleanly ends the blockquote so a
 785//     realm appending `Blockquote(user) + chrome` cannot pull chrome
 786//     bytes into the quote via CommonMark §5.2 lazy continuation.
 787//
 788// Empty input (or input that strips entirely, e.g. a lone LRD)
 789// returns "" — no blockquote is emitted.
 790//
 791// Composition gotcha: Block's EOF code-fence auto-close (added when
 792// user content opens a ``` ``` ``` fence without closing it) becomes a
 793// "> ```" line at the end of the blockquote. Goldmark parses this
 794// correctly as the close of a fenced block inside the quote — the
 795// output is structurally safe — but the markdown source looks unusual
 796// to a human reviewer. If aesthetic output matters, ensure user
 797// content closes its own fences.
 798//
 799// Not idempotent (see package doc): wraps with `> ` per line, so
 800// calling twice double-wraps and the outer call's Block step escapes
 801// the inner `>` prefixes.
 802//
 803// Do NOT compose with BlockRich in either direction:
 804//   - Blockquote(BlockRich(s)) double-sanitizes: BlockRich preserves
 805//     `#`/`>`/etc., then Blockquote's Block step escapes them again.
 806//   - BlockRich(Blockquote(s)) doesn't make sense: Blockquote already
 807//     line-prefixed with `> `; BlockRich expects raw user content.
 808//
 809// For a quoted body that can contain headings, lists, nested quotes,
 810// or thematic breaks, use BlockquoteRich.
 811func Blockquote(text string) string {
 812	text = Block(text)
 813	// Block wraps its output with "\n\n" on each side for cross-
 814	// paragraph isolation. Inside a blockquote both wraps are redundant
 815	// — the `> ` marker creates the container boundary — and they
 816	// would line-prefix to two useless `> ` empty quoted lines top and
 817	// bottom. Strip ALL leading and trailing "\n"s so the body starts
 818	// and ends clean; this helper re-wraps with `\n` + body + `\n\n`
 819	// below (same shape as BlockquoteRich).
 820	text = strings.TrimLeft(text, "\n")
 821	if text == "" {
 822		return ""
 823	}
 824	text = strings.TrimRight(text, "\n")
 825	if text == "" {
 826		return ""
 827	}
 828	var sb strings.Builder
 829	sb.WriteByte('\n')
 830	for _, line := range strings.Split(text, "\n") {
 831		sb.WriteString("> ")
 832		sb.WriteString(line)
 833		sb.WriteByte('\n')
 834	}
 835	sb.WriteByte('\n')
 836	return sb.String()
 837}
 838
 839// BlockquoteRich is the permissive counterpart of Blockquote. Both
 840// wrap user content as a CommonMark blockquote (each line prefixed
 841// with `> `), but they differ in what block-level structure inside
 842// the quote survives:
 843//
 844//   - Blockquote escapes line-leading block markers, so the quoted
 845//     body is paragraph-shaped — `# x` inside a Blockquote stays a
 846//     literal `#`.
//   - BlockquoteRich PRESERVES line-leading block markers, so the
//     quoted body can compose ATX headings, lists, thematic breaks,
//     nested blockquotes (`> > nested`), GFM table rows, and other
//     block-level structure. Realm-binding defenses stay on
//     (extension delimiters, bracket walker, fence autoclose,
//     NUL / bidi / Unicode-separator folding).
 853//
 854// Output shape — every non-empty result begins with "\n" and ends
 855// with "\n\n":
 856//
 857//   - Leading "\n" guarantees a clean blockquote opener even when the
 858//     realm concatenates `chrome + BlockquoteRich(user)` without its
 859//     own newline separator. Without the leading "\n", chrome ending
 860//     mid-line followed by "> quoted" would render `>` as literal
 861//     paragraph text instead of opening a blockquote.
 862//   - Trailing "\n\n" (blank line) cleanly ends the blockquote so a
 863//     realm appending `BlockquoteRich(user) + chrome` cannot pull
 864//     chrome bytes into the quote via CommonMark §5.2 lazy
 865//     continuation. Without the trailing blank line, paragraph chrome
 866//     immediately after BlockquoteRich would render inside the quote.
 867//   - BlockRich's own leading "\n\n" (paragraph-isolation blank line)
 868//     is stripped before line-prefixing — otherwise the output would
 869//     carry one or two redundant empty `> ` quoted lines at the top.
 870//     A single "\n" is then re-prepended at the BlockquoteRich
 871//     boundary so `chrome + BlockquoteRich(user)` still lands the
 872//     first `>` at column 0.
 873//   - The cross-boundary setext defense BlockRich provides is
 874//     redundant inside a blockquote: a setext underline inside `> `
 875//     content can only promote a line in the same blockquote, never
 876//     reach realm bytes (different CM container). BlockRich still
 877//     applies it, harmlessly.
 878//
 879// What attacker input produces what (rows that differ from
 880// Blockquote are marked CHANGED):
 881//
 882//	User attempt                                  | BlockquoteRich response
 883//	----------------------------------------------|------------------------------------------------
 884//	  --- preserved inside `> ` quote ---         |
 885//	# heading                                     | preserved as `> # heading` [CHANGED]
 886//	> nested quote                                | preserved as `> > nested quote` [CHANGED]
 887//	- item, * item, + item, 1. item               | preserved as `> - item` etc. [CHANGED]
 888//	---, ***, ___ thematic break                  | preserved [CHANGED]
 889//	=== or --- setext underline                   | preserved when preceded by user text;
 890//	                                              | escaped (\===/\---) if first non-blank
 891//	                                              | line of input [CHANGED]
 892//	| a | b | GFM table row (line-leading |)      | preserved → renders as <table> inside the
 893//	                                              | blockquote when followed by a delimiter row [CHANGED]
 894//	[text](url), ![alt](src)                      | preserved verbatim [SAME]
 895//	----------------------------------------------|------------------------------------------------
 896//	  --- escaped / stripped / folded ---         |
 897//	<gno-card>, any <gno-…>/</gno-…> at line-start| escaped (wildcard match) [SAME]
 898//	<!--, <script>, <pre>, <style>, <textarea>,   | escaped (\<…) [SAME] — CM §4.6 Types 1-5
 899//	  <?…?>, <!DOCTYPE…>, <![CDATA[…]]>           |   don't close on blank lines; without escape
 900//	  at line-start                               |   they would swallow chrome past the `> ` quote
 901//	[text][realm-label] ref-link USE              | both pairs escaped [SAME]
 902//	[^name] footnote-ref                          | both brackets escaped [SAME]
 903//	[label] bare shortcut-ref                     | both brackets escaped [SAME]
 904//	[label]: url link-reference definition        | whole region stripped [SAME]
 905//	code fence opened without close               | autoclosed at end of input [SAME]
 906//	NUL byte (\x00)                               | replaced with U+FFFD [SAME]
 907//	U+2028 / U+2029 / U+0085 (NEL)                | folded to `\n` [SAME]
 908//	bidi/zero-width controls                      | stripped [SAME]
 909//
 910// Use BlockquoteRich when the realm wants to render user content as
 911// a quotation that itself reads like authored markdown — the visual
 912// CSS containment of `<blockquote>` already demotes inner headings
 913// relative to realm chrome, so the "inner headings need a sandbox"
 914// caveat that applies to BlockRich at top level does not apply here.
 915//
 916// Not idempotent: like Blockquote, calling twice double-wraps —
 917// `BlockquoteRich(BlockquoteRich(s))` produces `> > content`,
 918// nesting the quote a level deeper each pass.
 919//
 920// Empty input (or input that reduces to nothing after BlockRich,
 921// e.g. a lone link-reference definition) returns "" — no blockquote
 922// is emitted and neither the leading "\n" nor the trailing "\n\n"
 923// shape applies.
 924func BlockquoteRich(text string) string {
 925	text = BlockRich(text)
 926	// BlockRich wraps user content with "\n\n" on each side for
 927	// cross-paragraph isolation. Inside a blockquote both wraps are
 928	// redundant — the `> ` marker creates the container boundary —
 929	// and they would line-prefix to two useless `> ` empty quoted
 930	// lines top and bottom. Strip ALL leading and trailing "\n"s so
 931	// the body starts and ends clean; this helper re-wraps with `\n`
 932	// + body + `\n\n` below.
 933	text = strings.TrimLeft(text, "\n")
 934	if text == "" {
 935		return ""
 936	}
 937	// Strip ALL trailing newlines so the loop produces exactly one
 938	// `> line` per content line, then append `\n\n` at the end so the
 939	// blockquote terminates cleanly (see "Output shape" above).
 940	text = strings.TrimRight(text, "\n")
 941	if text == "" {
 942		return ""
 943	}
 944	var sb strings.Builder
 945	// Leading "\n" so `chrome + BlockquoteRich(user)` cannot land the
 946	// first `>` mid-line.
 947	sb.WriteByte('\n')
 948	for _, line := range strings.Split(text, "\n") {
 949		sb.WriteString("> ")
 950		sb.WriteString(line)
 951		sb.WriteByte('\n')
 952	}
 953	// Trailing blank line so `BlockquoteRich(user) + chrome` cannot
 954	// pull chrome into the quote via lazy continuation.
 955	sb.WriteByte('\n')
 956	return sb.String()
 957}
 958
 959// LinkTitle prepares user content for a CommonMark link-title or
 960// image-title slot — the optional quoted text after the URL in any of
 961// these forms:
 962//
 963//	[text](url "TITLE")
 964//	![alt](src "TITLE")
 965//	[label]: url "TITLE"
 966//
 967// Escapes the inline-active set plus `"` and `'` (the title delimiters
 968// that aren't already in the inline set; `(` and `)` are), so the
 969// caller can choose any of the three title-quote styles safely.
 970//
 971// Pick the right helper for the slot — markdown title and HTML
 972// attribute share the look but use different escape rules:
 973//
 974//	[text](url "X")              → LinkTitle      (markdown title)
 975//	<a title="X">                → HTMLEscape     (HTML attribute)
 976//	<h5>X</h5>                   → HTMLEscape     (HTML element body)
 977//
 978// Swapping HTMLEscape for LinkTitle is wrong: HTML's `&amp;` written
 979// inside a markdown title renders as the literal characters `&amp;`.
 980// Swapping LinkTitle for HTMLEscape is wrong: markdown's `\"` survives
 981// into the rendered HTML as a literal backslash-quote.
 982//
 983// Not idempotent (see package doc).
 984func LinkTitle(s string) string {
 985	s = markdown.StripBidiAndZeroWidth(s)
 986	s = markdown.NormalizeBreaks(s)
 987	s = foldNewlinesAndSeparators(s, ' ')
 988	return markdown.EscapeTitle(s)
 989}
 990
 991// TableCell prepares user content for a GFM table cell — the bytes
 992// between two `|` column delimiters in a table row like
 993// `| cell-a | cell-b | cell-c |`. An unescaped `|` inside cell
 994// content would open a new column, letting a malicious user shift
 995// every column to its right.
 996//
 997// On top of InlineText's behavior, TableCell:
 998//   - escapes `|` to `\|` so user content can't end the cell early.
 999//   - replaces tabs with single spaces. CommonMark expands tabs to
1000//     the next multiple-of-4 column boundary (variable 1-4 spaces),
1001//     which would shift the displayed cell-content width unpredictably
1002//     and confuse table alignment.
1003//
1004// Not idempotent (see package doc).
1005func TableCell(s string) string {
1006	s = InlineText(s)
1007	s = strings.ReplaceAll(s, "\t", " ")
1008	s = strings.ReplaceAll(s, "|", `\|`)
1009	return s
1010}
1011
1012// HTMLEscape prepares user content for an HTML lexical slot inside
1013// markdown — covers attribute values, element bodies, and HTML
1014// comment bodies:
1015//
1016//	<gno-card type="..." caption="X">         attribute value
1017//	<gno-alert title="X">                     attribute value
1018//	<h5>X</h5>                                element body
1019//	<details><summary>X</summary>...          element body
1020//	<!-- X -->                                comment body (safe: `>`
1021//	                                          becomes `&gt;`, so user
1022//	                                          cannot inject `-->`)
1023//
1024// HTMLEscape escapes the union of attribute-breaking and body-breaking
1025// characters (`<`, `>`, `&`, `"`, `'`), so one function safely serves
1026// every HTML lexical context. Callers don't have to remember which
1027// subset to use for which slot.
1028//
1029// Pick the right helper — markdown title and HTML attribute share
1030// the look but use different escape rules:
1031//
1032//	[text](url "X")              → LinkTitle      (markdown title)
1033//	<span title="X">             → HTMLEscape     (HTML attribute)
1034//	<h5>X</h5>                   → HTMLEscape     (HTML element body)
1035//
// Using InlineText where HTMLEscape is needed is wrong: markdown's
// backslash escapes survive into the rendered HTML as literal `\*`.
// Using LinkTitle there is wrong for the same reason (`\"` survives
// as a literal backslash-quote). The reverse swap — HTMLEscape in a
// markdown title slot — is also wrong: `&amp;` written inside a
// markdown title renders as the literal characters `&amp;`.
1040//
1041// Not idempotent (see package doc): calling twice produces
1042// `&amp;` → `&amp;amp;`.
1043func HTMLEscape(s string) string {
1044	s = markdown.StripBidiAndZeroWidth(s)
1045	s = markdown.NormalizeBreaks(s)
1046	s = foldNewlinesAndSeparators(s, ' ')
1047	s = replaceNULWithFFFD(s)
1048	return html.EscapeString(s)
1049}
1050
1051// ----- URL filters -----
1052
1053// URL validates a URL for use as a link href, percent-encodes unsafe
1054// bytes, and rejects anything outside the allowlist of schemes.
1055//
1056// Allowlist:
1057//   - http, https
1058//   - mailto (rejected if contains ?body= or &body= — prefill phishing)
1059//   - any URL WITHOUT a scheme — relative paths (`/path`, `./rel`,
1060//     `bare-path`), query-only (`?q=v`), fragment-only (`#anchor`).
1061//     A `:` appearing inside the URL (e.g. `/path:foo`, `?q=a:b`) is
1062//     NOT a scheme separator per RFC 3986 — only `:` immediately after
1063//     a leading `[a-zA-Z][a-zA-Z0-9+.-]*` counts.
1064//
// Rejected:
//   - any other scheme: javascript:, data:, vbscript:, blob:, file:, etc.
//   - `//host/...` (protocol-relative — tracking-pixel vector)
1068//
1069// Returns "" if the URL is empty after trim or fails the allowlist.
1070func URL(s string) string {
1071	s = strings.TrimSpace(s)
1072	if s == "" {
1073		return ""
1074	}
1075	if !linkSchemeAllowed(s) {
1076		return ""
1077	}
1078	return markdown.PercentEncodeURL(s)
1079}
1080
1081// ImageURL validates a URL for use as an image src. Kept separate from
1082// URL — not a parameterized variant — because the allowlist shapes
1083// differ qualitatively (data:image/* vs. mailto:) and a single boolean
1084// flag would invite callers to pass the wrong default.
1085//
1086// Allowlist:
1087//   - http, https
1088//   - schemeless relative URLs starting with /, ./, or ..
1089//     (rejects // protocol-relative — tracking-pixel vector)
1090//   - data:image/svg+xml, data:image/png, data:image/jpeg,
1091//     data:image/gif, data:image/webp
1092//
1093// Any other data: subtype is rejected — data:text/html etc. would
1094// render as inline HTML and execute embedded scripts.
1095//
1096// DEPLOYMENT PRECONDITION: data: URIs encode the bytes of the image
1097// directly into the markup, so a malicious sender can construct an
1098// image whose pixel dimensions are arbitrarily large at minimal byte
1099// cost. The deploying gnoweb instance MUST clamp rendered image
1100// dimensions via CSS (e.g. `max-width: 100%; max-height: <bound>`).
1101// Without that cap, a single image can blow out the page layout or
1102// exhaust the browser's memory.
1103//
1104// Returns "" if the URL is empty after trim or fails the allowlist.
1105func ImageURL(s string) string {
1106	s = strings.TrimSpace(s)
1107	if s == "" {
1108		return ""
1109	}
1110	if !imageSchemeAllowed(s) {
1111		return ""
1112	}
1113	return markdown.PercentEncodeURL(s)
1114}
1115
1116// ----- Validators -----
1117
// Charset bitmaps used by the validators in this file. Each charset
// is an ASCII membership set split over two uint64 words: the Lo word
// carries bytes 0-63 and the Hi word bytes 64-127 (see setBit). Every
// validator has a "First" pair for the leading byte and a "Rest" pair
// for the bytes after it. All words start at zero and are filled in
// once by init.
var (
	// r/sys/users name charset.
	userNameFirstLo, userNameFirstHi, userNameRestLo, userNameRestHi uint64
	// Footnote label charset.
	footnoteLabelFirstLo, footnoteLabelFirstHi, footnoteLabelRestLo, footnoteLabelRestHi uint64
	// Language-name charset.
	langFirstLo, langFirstHi, langRestLo, langRestHi uint64
	// Bech human-readable prefix charset.
	bechHrpFirstLo, bechHrpFirstHi, bechHrpRestLo, bechHrpRestHi uint64
	// Bech data-part charset.
	bechDataFirstLo, bechDataFirstHi, bechDataRestLo, bechDataRestHi uint64
)
1133
1134func init() {
1135	// UserName: first [a-z], rest [a-z0-9_-].
1136	for c := byte('a'); c <= 'z'; c++ {
1137		setBit(&userNameFirstLo, &userNameFirstHi, c)
1138		setBit(&userNameRestLo, &userNameRestHi, c)
1139	}
1140	for c := byte('0'); c <= '9'; c++ {
1141		setBit(&userNameRestLo, &userNameRestHi, c)
1142	}
1143	setBit(&userNameRestLo, &userNameRestHi, '_')
1144	setBit(&userNameRestLo, &userNameRestHi, '-')
1145
1146	// FootnoteLabel: [A-Za-z0-9_-] for both first and rest.
1147	for c := byte('A'); c <= 'Z'; c++ {
1148		setBit(&footnoteLabelFirstLo, &footnoteLabelFirstHi, c)
1149		setBit(&footnoteLabelRestLo, &footnoteLabelRestHi, c)
1150	}
1151	for c := byte('a'); c <= 'z'; c++ {
1152		setBit(&footnoteLabelFirstLo, &footnoteLabelFirstHi, c)
1153		setBit(&footnoteLabelRestLo, &footnoteLabelRestHi, c)
1154	}
1155	for c := byte('0'); c <= '9'; c++ {
1156		setBit(&footnoteLabelFirstLo, &footnoteLabelFirstHi, c)
1157		setBit(&footnoteLabelRestLo, &footnoteLabelRestHi, c)
1158	}
1159	for _, c := range []byte{'_', '-'} {
1160		setBit(&footnoteLabelFirstLo, &footnoteLabelFirstHi, c)
1161		setBit(&footnoteLabelRestLo, &footnoteLabelRestHi, c)
1162	}
1163
1164	// LanguageName: [a-zA-Z0-9_+-] for both first and rest.
1165	for c := byte('A'); c <= 'Z'; c++ {
1166		setBit(&langFirstLo, &langFirstHi, c)
1167		setBit(&langRestLo, &langRestHi, c)
1168	}
1169	for c := byte('a'); c <= 'z'; c++ {
1170		setBit(&langFirstLo, &langFirstHi, c)
1171		setBit(&langRestLo, &langRestHi, c)
1172	}
1173	for c := byte('0'); c <= '9'; c++ {
1174		setBit(&langFirstLo, &langFirstHi, c)
1175		setBit(&langRestLo, &langRestHi, c)
1176	}
1177	for _, c := range []byte{'_', '+', '-'} {
1178		setBit(&langFirstLo, &langFirstHi, c)
1179		setBit(&langRestLo, &langRestHi, c)
1180	}
1181
1182	// Bech HRP (when prefix==""): [a-z], 1-16 chars.
1183	for c := byte('a'); c <= 'z'; c++ {
1184		setBit(&bechHrpFirstLo, &bechHrpFirstHi, c)
1185		setBit(&bechHrpRestLo, &bechHrpRestHi, c)
1186	}
1187
1188	// Bech data part: [a-z0-9], 6-90 chars.
1189	for c := byte('a'); c <= 'z'; c++ {
1190		setBit(&bechDataFirstLo, &bechDataFirstHi, c)
1191		setBit(&bechDataRestLo, &bechDataRestHi, c)
1192	}
1193	for c := byte('0'); c <= '9'; c++ {
1194		setBit(&bechDataFirstLo, &bechDataFirstHi, c)
1195		setBit(&bechDataRestLo, &bechDataRestHi, c)
1196	}
1197}
1198
// setBit records byte c in a two-word bitmap: values 0-63 set bit c
// of *lo, values 64 and up set bit c-64 of *hi. For c >= 128 the
// shift count reaches 64 or more, the shifted value is 0, and neither
// word changes.
func setBit(lo, hi *uint64, c byte) {
	word, shift := lo, c
	if c >= 64 {
		word, shift = hi, c-64
	}
	*word |= uint64(1) << shift
}
1206
1207// UserName validates the r/sys/users-registration charset:
1208// ^[a-z][a-z0-9]*([_-][a-z0-9]+)*$ length ≤ 64.
1209//
1210// The native MatchCharsetN enforces the leading-letter + tail-charset
1211// shape and length bound; this helper also performs the bidi-strip
1212// pre-pass. The "no consecutive [_-]" rule from r/sys/users is NOT
1213// enforced here (it's a registration-policy rule, not a sanitization
1214// concern — registrations go through r/sys/users itself).
1215//
1216// Returns the (bidi-stripped) input if valid, "" otherwise. On a ""
1217// return, do not emit the user-mention markup at all (e.g. skip the
1218// `[@user](/u/user)` link); falling back to the raw user-supplied
1219// string would defeat the validation.
1220func UserName(s string) string {
1221	s = markdown.StripBidiAndZeroWidth(s)
1222	if markdown.MatchCharsetN(s, userNameFirstLo, userNameFirstHi, userNameRestLo, userNameRestHi, 1, 64) {
1223		return s
1224	}
1225	return ""
1226}
1227
1228// BechString validates a bech32-style address-like string.
1229//
1230// A bech32 string has the shape `<hrp>1<data>`: a human-readable
1231// prefix (HRP) that names the family (e.g. `g` for gno addresses,
1232// `gpub` for gno pubkeys, `cosmos` for cosmos addresses), the
1233// separator character `1`, then a data part carrying the encoded
1234// payload as lowercase alphanumerics.
1235//
1236// If prefix != "", requires s to start with prefix+"1" exactly, and the
1237// data part to match ^[a-z0-9]{6,90}$. Use this when you know the
1238// expected family:
1239//
1240//	sanitize.BechString(addr, "g")     // only g1...     (addresses)
1241//	sanitize.BechString(pk,   "gpub")  // only gpub1...  (pubkeys)
1242//
1243// If prefix == "", accepts any reasonable bech32 shape:
1244// ^[a-z]{1,16}1[a-z0-9]{6,90}$.
1245//
1246// Syntactic only — does NOT verify the bech32 checksum. Use a true
1247// bech32 decoder if you need that. Returns the cleaned input on
1248// accept, "" on reject; on "" return, do not emit the address-link
1249// markup (the user-supplied bytes have failed shape validation and
1250// should not appear unmodified in output).
1251func BechString(s, prefix string) string {
1252	s = markdown.StripBidiAndZeroWidth(s)
1253	if s == "" {
1254		return ""
1255	}
1256	if prefix != "" {
1257		// HRP must be lowercase ASCII letters.
1258		for i := 0; i < len(prefix); i++ {
1259			c := prefix[i]
1260			if c < 'a' || c > 'z' {
1261				return ""
1262			}
1263		}
1264		need := prefix + "1"
1265		if !strings.HasPrefix(s, need) {
1266			return ""
1267		}
1268		data := s[len(need):]
1269		if markdown.MatchCharsetN(data, bechDataFirstLo, bechDataFirstHi, bechDataRestLo, bechDataRestHi, 6, 90) {
1270			return s
1271		}
1272		return ""
1273	}
1274	// prefix == "" — accept any 1-16 char lowercase HRP, then '1', then data.
1275	sep := strings.IndexByte(s, '1')
1276	if sep < 1 || sep > 16 {
1277		return ""
1278	}
1279	hrp := s[:sep]
1280	if !markdown.MatchCharsetN(hrp, bechHrpFirstLo, bechHrpFirstHi, bechHrpRestLo, bechHrpRestHi, 1, 16) {
1281		return ""
1282	}
1283	data := s[sep+1:]
1284	if markdown.MatchCharsetN(data, bechDataFirstLo, bechDataFirstHi, bechDataRestLo, bechDataRestHi, 6, 90) {
1285		return s
1286	}
1287	return ""
1288}
1289
1290// FootnoteLabel validates an identifier used as a footnote name, link-
1291// reference-definition label, or {#id} anchor: ^[A-Za-z0-9_-]{1,64}$.
1292// Strips bidi/zero-width first. Returns s if valid, "" otherwise.
1293//
1294// Use for every shape where a markdown identifier is treated as an
1295// opaque key by the parser:
1296//
1297//   - footnote-definition labels:        [^FootnoteLabel(name)]: body
1298//   - footnote-reference labels:         see [^FootnoteLabel(name)]
1299//   - link-reference-definition labels:  [FootnoteLabel(label)]: url
1300//   - reference-link USE labels:         [text][FootnoteLabel(label)]
1301//   - goldmark auto-anchor {#id}:        # Heading {#FootnoteLabel(id)}
1302//
1303// The shared validator name reflects the shared charset and shared
1304// security goal — keep untrusted bytes out of any parser-managed
1305// identifier slot.
1306//
1307// On "" return, omit the footnote / LRD / anchor entirely rather than
1308// emitting it with raw user bytes.
1309func FootnoteLabel(s string) string {
1310	s = markdown.StripBidiAndZeroWidth(s)
1311	if markdown.MatchCharsetN(s, footnoteLabelFirstLo, footnoteLabelFirstHi, footnoteLabelRestLo, footnoteLabelRestHi, 1, 64) {
1312		return s
1313	}
1314	return ""
1315}
1316
1317// LanguageName validates the language tag (a.k.a. "info string") for
1318// a fenced code block — the `go` in:
1319//
1320//	```go
1321//	fmt.Println("hi")
1322//	```
1323//
1324// Charset: ^[a-zA-Z0-9_+-]{1,32}$ — letters, digits, `_`, `+`, `-`,
1325// up to 32 bytes. Strips bidi/zero-width first.
1326//
1327// Returns the cleaned input if valid, "" otherwise. A "" return means
1328// the caller should emit a language-less fence (``` without a tag)
1329// rather than letting the user pick the syntax highlighter — which
1330// could otherwise be used to inject newlines or block markers into
1331// what becomes the opening fence line.
1332func LanguageName(s string) string {
1333	s = markdown.StripBidiAndZeroWidth(s)
1334	if markdown.MatchCharsetN(s, langFirstLo, langFirstHi, langRestLo, langRestHi, 1, 32) {
1335		return s
1336	}
1337	return ""
1338}
1339
1340// NestedPrefix validates a prefix string for line-prefixing builders
1341// like md.Nested, which prepends `prefix` to every line of content
1342// to render the content as a nested/indented sub-block.
1343//
1344// Allowed: any string matching `^[ \t>]*$` — spaces, tabs, blockquote
1345// `>` chars only. Anything else (a `#`, a `-`, a letter) would let a
1346// caller turn benign sub-content into a heading, list, or paragraph
1347// at the wrong nesting level.
1348//
1349// Returns s if valid, "" otherwise. Strips bidi/zero-width first —
1350// otherwise an invisible character hidden inside a `>` prefix would
1351// be replicated on every nested content line, producing per-line
1352// display-vs-storage divergence.
1353//
1354// On "" return, fall back to a known-safe prefix literal (e.g.
1355// `"> "`) or skip the nesting entirely. Do not emit the raw
1356// user-supplied prefix.
1357func NestedPrefix(s string) string {
1358	s = markdown.StripBidiAndZeroWidth(s)
1359	for i := 0; i < len(s); i++ {
1360		c := s[i]
1361		if c != ' ' && c != '\t' && c != '>' {
1362			return ""
1363		}
1364	}
1365	return s
1366}
1367
1368// ----- Primitive -----
1369
1370// CodeFence returns a string of backticks long enough to wrap content
1371// as a CommonMark fenced code block without the content's own backticks
1372// closing the fence prematurely.
1373//
1374// Returned length N = max(minCount, longestBacktickRunInContent + 1).
1375// Use N backticks both before and after the content:
1376//
1377//	fence := sanitize.CodeFence(userCode, 3)
1378//	out += fence + "\n" + userCode + "\n" + fence + "\n"
1379//
1380// Typical minCount values:
1381//   - 1 for inline code spans (`x`)
1382//   - 3 for block fenced code (CommonMark §4.5 requires ≥3)
1383//
1384// `minCount < 1` is clamped to 1. Empty content returns
1385// strings.Repeat("`", max(minCount, 1)). Never panics.
1386//
1387// Most realms should reach for InlineCode / CodeBlock /
1388// LanguageCodeBlock below, which call CodeFence internally and emit
1389// the full code block for you. Call CodeFence directly only when
1390// you're rolling a custom fence emitter (e.g. a renderer that needs
1391// the fence length but emits the body differently).
1392func CodeFence(content string, minCount int) string {
1393	return markdown.CodeFence(content, minCount)
1394}
1395
1396// InlineCode wraps user content as a CommonMark inline code span — the
1397// `code` in “ `code` “. Use for any user-derived token, identifier,
1398// or short literal that should render in monospace inside running
1399// prose: variable names, hashes, hex addresses, token symbols, error
1400// codes, package paths, transaction IDs.
1401//
1402// Inline code spans cannot span lines (a `\n` inside the content would
1403// end the span and leave the surrounding backticks as literal text),
1404// so all line breaks — CR / CRLF / LF, NEL (U+0085), U+2028, U+2029 —
1405// are folded to a single space. If you want each line of user content
1406// on its own row, use CodeBlock instead.
1407//
1408// Behavior:
1409//   - Bidi/zero-width controls are stripped (browsers honor bidi marks
1410//     inside `<code>`, so leaving them would let stored bytes display
1411//     as something different).
1412//   - NUL is replaced with U+FFFD.
1413//   - The wrapping fence is one backtick longer than the longest
1414//     backtick run in the content, so internal backticks can never
1415//     close the span prematurely.
1416//   - A single space pad is added on each side when content starts or
1417//     ends with “ ` “ or space, so leading/trailing backticks render
1418//     literally rather than fusing with the fence (the renderer
1419//     strips one space from each side per CommonMark spec).
1420//
1421// Empty input returns "" rather than a literal two-backtick string
1422// (which CommonMark parses as text, not as an empty code span). If
1423// you use InlineCode as link text and it returns "", omit the link
1424// entirely.
1425//
1426// Not idempotent (see package doc): wraps with a fence, so calling
1427// twice double-wraps.
1428func InlineCode(content string) string {
1429	content = markdown.StripBidiAndZeroWidth(content)
1430	content = markdown.NormalizeBreaks(content)
1431	content = foldNewlinesAndSeparators(content, ' ')
1432	content = replaceNULWithFFFD(content)
1433	if content == "" {
1434		return ""
1435	}
1436	fence := markdown.CodeFence(content, 1)
1437	pad := ""
1438	if content[0] == '`' || content[0] == ' ' ||
1439		content[len(content)-1] == '`' || content[len(content)-1] == ' ' {
1440		pad = " "
1441	}
1442	return fence + pad + content + pad + fence
1443}
1444
1445// CodeBlock wraps user content as a CommonMark fenced code block.
1446// Use for any user-derived multi-line snippet that should render as a
1447// code block: log excerpts, JSON dumps, error backtraces, config
1448// snippets, posted code samples.
1449//
1450// Behavior:
1451//   - Bidi/zero-width controls are stripped.
1452//   - CR/CRLF line endings are normalized to LF; Unicode separators
1453//     (NEL U+0085, U+2028, U+2029) are folded to LF for line-count
1454//     consistency.
1455//   - NUL is replaced with U+FFFD per CM §2.3.
1456//   - The wrapping fence is at least 3 backticks (CM §4.5 minimum) and
1457//     sized to outscan internal backticks — an attacker cannot embed
1458//     a closing fence in the content.
1459//
1460// Empty content emits an empty fenced block ("```\n\n```\n"), which is
1461// valid CommonMark and renders as an empty `<pre><code></code></pre>`.
1462//
1463// Not idempotent (see package doc).
1464func CodeBlock(content string) string {
1465	content = markdown.StripBidiAndZeroWidth(content)
1466	content = markdown.NormalizeBreaks(content)
1467	content = foldNewlinesAndSeparators(content, '\n')
1468	content = replaceNULWithFFFD(content)
1469	fence := markdown.CodeFence(content, 3)
1470	return fence + "\n" + content + "\n" + fence + "\n"
1471}
1472
1473// LanguageCodeBlock wraps user content as a fenced code block tagged
1474// with a programming-language hint (the "info string" after the
1475// opening fence, e.g. `go` in ```` ```go ````) so the renderer can
1476// apply syntax highlighting.
1477//
1478// An invalid `language` tag silently falls back to a tagless fence —
1479// the helper never returns an error or panics. If a realm author is
1480// debugging "why is my Go highlighting gone?", the input failed the
1481// language validator (charset ^[a-zA-Z0-9_+-]{1,32}$ after bidi-strip).
1482// This fallback exists because an unvalidated tag could contain a
1483// newline that injects content (e.g. a heading) onto what becomes the
1484// opening fence line.
1485//
1486// Content is cleaned exactly as in CodeBlock (bidi-strip, CR/CRLF
1487// normalize to LF, NEL/U+2028/U+2029 fold to LF, NUL→U+FFFD, fence
1488// sized to outscan internal backticks).
1489//
1490// Not idempotent (see package doc).
1491func LanguageCodeBlock(language, content string) string {
1492	content = markdown.StripBidiAndZeroWidth(content)
1493	content = markdown.NormalizeBreaks(content)
1494	content = foldNewlinesAndSeparators(content, '\n')
1495	content = replaceNULWithFFFD(content)
1496	fence := markdown.CodeFence(content, 3)
1497	lang := LanguageName(language) // "" on reject
1498	return fence + lang + "\n" + content + "\n" + fence + "\n"
1499}
1500
1501// ----- Reference-style definitions -----
1502
1503// FootnoteDefinition emits a GFM footnote definition — the
1504// `[^name]: body` form that introduces a footnote whose body is rendered
1505// in the page footer (or wherever the renderer chooses to place it).
1506// Other parts of the markdown reference the footnote by writing
1507// `[^name]` inline.
1508//
1509// Use for any realm-rendered footnote where the body text comes from
1510// user input. The realm picks the footnote name (passed as `name`,
1511// validated by FootnoteLabel — failure here returns ""); the user's
1512// content goes in `text`, which is sanitized via Block.
1513//
1514// Contract:
1515//   - `name`: passed raw, validated as a FootnoteLabel
1516//     (^[A-Za-z0-9_-]{1,64}$). Reject → return "".
1517//   - `text`: passed raw multi-paragraph user prose, cleaned via Block
1518//     (bidi-strip, line-ending normalize, LRD strip, block-marker
1519//     escape, ref-link USE escape, fence auto-close).
1520//
1521// Empty body → returns "" (a label without body is not a valid
1522// footnote definition; the markdown would parse as a paragraph
1523// containing the label).
1524//
1525// Output shape:
1526//
1527//	[^name]:
1528//	    line 1 of body
1529//	    line 2 of body
1530//	    ...
1531//
1532// The label sits on its own line and each body line gets a 4-space
1533// indent — the GFM continuation rule that keeps multi-paragraph body
1534// text bound to the footnote rather than detaching as a new paragraph.
1535//
1536// Not idempotent (see package doc): composes Block internally; passing
1537// already-sanitized body text double-escapes.
1538func FootnoteDefinition(name, text string) string {
1539	label := FootnoteLabel(name)
1540	if label == "" {
1541		return ""
1542	}
1543	// Block now wraps with "\n\n" on both sides for cross-paragraph
1544	// isolation; inside a footnote-definition's 4-space-indented body
1545	// the wrap would line-prefix to blank padding lines, so strip ALL
1546	// leading and trailing "\n"s before continuation-indenting.
1547	body := strings.Trim(Block(text), "\n")
1548	if body == "" {
1549		return ""
1550	}
1551	var b strings.Builder
1552	b.WriteString("[^")
1553	b.WriteString(label)
1554	b.WriteString("]:\n")
1555	for _, line := range strings.Split(body, "\n") {
1556		if line == "" {
1557			b.WriteByte('\n')
1558		} else {
1559			b.WriteString("    ")
1560			b.WriteString(line)
1561			b.WriteByte('\n')
1562		}
1563	}
1564	return b.String()
1565}
1566
1567// LinkReferenceDefinition emits a CommonMark link reference definition
1568// (CM §4.7) — the `[label]: url "title"` form that other parts of the
1569// markdown reference by writing `[text][label]` or `[label]` (shortcut).
1570//
1571// Use for any realm-rendered LRD where the realm owns the label but
1572// any of the URL or title come from user input. The user content for
1573// the URL goes through URL (allowlist-based — reject → ""); the title
1574// goes through LinkTitle (escape).
1575//
1576// Contract:
1577//   - `label`: passed raw, validated as a FootnoteLabel
1578//     (^[A-Za-z0-9_-]{1,64}$). Realms should choose a namespaced label
1579//     using dashes (e.g. `r-myrealm-help`) so shortcut-reference
1580//     invocations from user content can't collide with bare prose
1581//     (`[help]`, `[click here]`). `/` is not in the FootnoteLabel
1582//     charset; reject → return "".
1583//   - `url`: passed raw, sanitized via URL. If URL rejects, the LRD is
1584//     skipped (return "").
1585//   - `title`: passed raw, sanitized via LinkTitle. Empty title → no
1586//     title clause emitted.
1587//
1588// The output is framed with leading and trailing blank lines so that
1589// the definition cannot accidentally fuse with adjacent paragraph
1590// content into a setext underline or a continuation line.
1591//
1592// Not idempotent (see package doc).
1593func LinkReferenceDefinition(label, url, title string) string {
1594	lbl := FootnoteLabel(label)
1595	if lbl == "" {
1596		return ""
1597	}
1598	safeURL := URL(url)
1599	if safeURL == "" {
1600		return ""
1601	}
1602	var b strings.Builder
1603	b.WriteString("\n\n[")
1604	b.WriteString(lbl)
1605	b.WriteString("]: ")
1606	b.WriteString(safeURL)
1607	if title != "" {
1608		b.WriteString(" \"")
1609		b.WriteString(LinkTitle(title))
1610		b.WriteString("\"")
1611	}
1612	b.WriteString("\n\n")
1613	return b.String()
1614}
1615
1616// ----- internal helpers -----
1617
1618// linkSchemeAllowed returns true if s passes the URL helper's scheme
1619// allowlist. See URL's doc for the policy.
1620func linkSchemeAllowed(s string) bool {
1621	if strings.HasPrefix(s, "http://") || strings.HasPrefix(s, "https://") {
1622		return true
1623	}
1624	if strings.HasPrefix(s, "mailto:") {
1625		// Reject prefill phishing via ?body= or &body=.
1626		if strings.Contains(s, "?body=") || strings.Contains(s, "&body=") {
1627			return false
1628		}
1629		return true
1630	}
1631	if strings.HasPrefix(s, "//") {
1632		// Protocol-relative — reject (tracking-pixel vector).
1633		return false
1634	}
1635	// Any URL with an unknown scheme (RFC 3986: `^[a-zA-Z][a-zA-Z0-9+.-]*:`)
1636	// is rejected — this blocks `javascript:`, `data:`, `vbscript:`, `blob:`,
1637	// and anything else not handled above. URLs without a scheme are
1638	// treated as relative and accepted (bare path, query-only, fragment).
1639	if hasURLScheme(s) {
1640		return false
1641	}
1642	return true
1643}
1644
// hasURLScheme reports whether s starts with an RFC 3986 scheme
// followed by ':' (^[a-zA-Z][a-zA-Z0-9+.-]*:). The scan stops at the
// first byte that cannot belong to a scheme, so a ':' that appears
// later in the string (`/path:foo`, `?q=a:b`) does not count.
func hasURLScheme(s string) bool {
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
			// Letters are valid in every position.
		case i == 0:
			// The first byte must be a letter.
			return false
		case c == ':':
			return true
		case '0' <= c && c <= '9', c == '+', c == '.', c == '-':
			// Valid after the first byte.
		default:
			return false
		}
	}
	return false
}
1668
// imageSchemeAllowed reports whether s passes the ImageURL helper's
// scheme allowlist. Checks run in this order:
//
//   - `http://` and `https://` are accepted.
//   - protocol-relative `//host/...` is rejected.
//   - paths starting with `/`, `./` or `../` are accepted.
//   - `data:` URLs are accepted only when they start with one of the
//     curated image types (svg+xml, png, jpeg, gif, webp); sizing has
//     to be enforced by CSS.
//   - everything else — other schemes and bare relative paths alike —
//     is rejected.
//
// All comparisons are case-sensitive prefix matches.
func imageSchemeAllowed(s string) bool {
	startsWithAny := func(prefixes ...string) bool {
		for _, p := range prefixes {
			if strings.HasPrefix(s, p) {
				return true
			}
		}
		return false
	}

	switch {
	case startsWithAny("http://", "https://"):
		return true
	case strings.HasPrefix(s, "//"):
		return false
	case startsWithAny("/", "./", "../"):
		return true
	case strings.HasPrefix(s, "data:"):
		return startsWithAny(
			"data:image/svg+xml",
			"data:image/png",
			"data:image/jpeg",
			"data:image/gif",
			"data:image/webp",
		)
	}
	return false
}
1699
1700// foldNewlinesAndSeparators replaces \n, U+0085 NEL, U+2028 LINE SEPARATOR,
1701// U+2029 PARAGRAPH SEPARATOR with the given replacement byte (typically
1702// space for inline-context helpers).
1703//
1704// NormalizeBreaks has already folded \r\n and \r to \n before this runs,
1705// so \n is the canonical break byte to substitute.
1706func foldNewlinesAndSeparators(s string, replacement byte) string {
1707	if !needsSeparatorFold(s) {
1708		return s
1709	}
1710	out := make([]byte, 0, len(s))
1711	for i := 0; i < len(s); {
1712		c := s[i]
1713		if c == '\n' {
1714			out = append(out, replacement)
1715			i++
1716			continue
1717		}
1718		// U+0085 NEL: 0xC2 0x85
1719		if c == 0xC2 && i+1 < len(s) && s[i+1] == 0x85 {
1720			out = append(out, replacement)
1721			i += 2
1722			continue
1723		}
1724		// U+2028 (0xE2 0x80 0xA8) or U+2029 (0xE2 0x80 0xA9)
1725		if c == 0xE2 && i+2 < len(s) && s[i+1] == 0x80 && (s[i+2] == 0xA8 || s[i+2] == 0xA9) {
1726			out = append(out, replacement)
1727			i += 3
1728			continue
1729		}
1730		out = append(out, c)
1731		i++
1732	}
1733	return string(out)
1734}
1735
// needsSeparatorFold is the cheap pre-check for
// foldNewlinesAndSeparators: it reports whether s contains an LF or one
// of the lead bytes 0xC2 / 0xE2 that start the UTF-8 encodings of
// U+0085, U+2028 and U+2029. A true result may be a false positive
// (other characters share those lead bytes); a false result guarantees
// there is nothing to fold.
func needsSeparatorFold(s string) bool {
	for _, b := range []byte(s) {
		switch b {
		case '\n', 0xC2, 0xE2:
			return true
		}
	}
	return false
}
1745
// replaceNULWithFFFD returns s with every NUL byte replaced by the
// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER, as CM §2.3 requires.
// A string without NUL is returned unchanged.
func replaceNULWithFFFD(s string) string {
	first := strings.IndexByte(s, 0)
	if first < 0 {
		return s
	}
	var b strings.Builder
	b.Grow(len(s) + 2)
	b.WriteString(s[:first])
	for i := first; i < len(s); i++ {
		if s[i] == 0 {
			b.WriteString("\ufffd")
			continue
		}
		b.WriteByte(s[i])
	}
	return b.String()
}