sanitize.gno
75.71 Kb · 1753 lines
1// Package sanitize provides input-cleaning primitives and safe-emit
2// builders for each markdown lexical slot. Realm authors wrap user-
3// supplied strings with these helpers before flowing them into rendered
4// markdown output. Each helper targets one specific slot (link text,
5// heading text, URL href, table cell, HTML attribute, fenced code block,
6// blockquote, footnote definition, link-reference definition, etc.) and
7// neutralizes the bytes that would otherwise let user content break out
8// of that slot or inject new top-level structure.
9//
10// Pick the right helper from the table under "Picking the right helper"
11// below, then wrap each user-supplied argument exactly once at the call
12// site (see "The audit rule").
13//
14// # Wrap once
15//
16// Most escapers and safe-emit builders in this package are NOT
17// idempotent — applying them twice re-escapes bytes the first pass
18// added (`\*` becomes `\\\*`, `&` becomes `&amp;`, a fenced
19// block gets re-fenced). Wrap each user-derived string with at most
20// one sanitize.* call. Block and BlockRich are exceptions —
21// idempotent by design — but the at-most-once rule is still the
22// safest default. See the "Idempotence classes" enumeration below
23// for the full breakdown.
24//
25// Some markdown-builder packages (e.g. p/moul/md) sanitize the args of
26// specific helpers internally — see each builder's package doc for the
27// per-helper contract. If the builder sanitizes for you, pass the raw
28// user input; if it doesn't, wrap the input with the right sanitize.*
29// helper at the call site.
30//
31// # Picking the right helper
32//
33// Match the helper to the slot the user content lands in:
34//
35// slot helper
36// -------------------------------------------------------------
37// [text](url) InlineText (text)
38// # Heading text InlineText
39// **bold** _italic_ InlineText
40// ![alt](src) InlineText (alt)
41// > [!NOTE] one-line title InlineText
42// multi-paragraph post body Block
43// multi-paragraph post body w/ rich block BlockRich
44// structure (headings, lists, tables, etc.)
45// multi-line blockquote (`> ` prefixed) Blockquote
46// multi-line blockquote w/ rich block body BlockquoteRich
47// [text](url "title") LinkTitle (title)
48// | cell | TableCell
49// <gno-card caption="X"> HTMLEscape
50// <h5>X</h5> HTMLEscape
51// any URL going into ](X) URL
52// any image src going into ![alt](X) ImageURL
53// `inline code` inside running prose InlineCode
54// multi-line fenced code block CodeBlock
55// multi-line fenced code with language tag LanguageCodeBlock
56// [^name]: footnote body FootnoteDefinition
57// [label]: url "title" reference def LinkReferenceDefinition
58// r/sys/users handle UserName (validator)
59// g1.../gpub1... etc. BechString (validator)
60// footnote / LRD label / {#id} anchor name FootnoteLabel (validator)
61// fenced-code language tag LanguageName (validator)
62// prefix arg to md.Nested NestedPrefix (validator)
63//
64// # Invariants
65//
66// All helpers in this package are panic-free for any string input and
67// run in O(len(input)) time with bounded allocation.
68//
69// Idempotence classes:
70//
71// Idempotent (calling twice == calling once):
72// StripBidiAndZeroWidth, NormalizeBreaks
73// UserName, BechString, FootnoteLabel, LanguageName, NestedPrefix
74// URL, ImageURL (accept→identity; reject→"")
75// Block (bracket walker treats \[/\] as ordinary;
76// line-leader escapes don't re-fire on
77// already-escaped `\#` etc.)
78// BlockRich (TrimLeft/TrimRight + "\n\n" wrap is stable)
79//
80// NOT idempotent — never wrap an already-sanitized string:
81// InlineText, LinkTitle, TableCell (re-escape backslashes)
82// HTMLEscape (re-escapes `&` → `&amp;`)
83// Blockquote, BlockquoteRich (re-prefixes `> `, nesting the quote each pass)
84// InlineCode, CodeBlock,
85// LanguageCodeBlock (wrap with a fence — calling twice double-wraps)
86// FootnoteDefinition,
87// LinkReferenceDefinition (compose Block/InlineText/URL internally —
88// passing already-sanitized strings double-escapes)
89//
90// CodeFence is pure: same inputs always give the same output.
91//
92// Validators (UserName / BechString / FootnoteLabel / LanguageName /
93// NestedPrefix) return either the cleaned input verbatim or "". They
94// never partially-sanitize: if the input doesn't match the slot's
95// charset/shape, the answer is rejection.
96//
97// # Composition rules
98//
99// Direct sanitize use (when emitting markdown without a builder package):
100//
101// out := "# " + sanitize.InlineText(userTitle) + "\n\n" +
102// sanitize.Block(userBody)
103// out += sanitize.Blockquote(userQuote)
104// out += sanitize.LanguageCodeBlock(realmLang, userCode)
105//
106// Use with a builder package (e.g. p/moul/md): pass raw user input to
107// the builder helpers that sanitize internally — do NOT pre-wrap with
108// sanitize.*, or the input gets double-escaped (escapers are not
109// idempotent). See the builder's package doc for the per-helper
110// contract. For example, with p/moul/md:
111//
112// md.Blockquote(userProse) // good — md.Blockquote sanitizes
113// md.LanguageCodeBlock(realmLang, userCode) // good — sanitizes both args
114// md.Link(userText, userURL) // good — sanitizes both slots
115//
116// md.Blockquote(sanitize.Block(userProse)) // BAD: double-wrap
117// md.Link(sanitize.InlineText(t), sanitize.URL(u)) // BAD: double-wrap
118//
119// Wrong (across all callers):
120//
121// sanitize.InlineText(sanitize.InlineText(s)) double-wrap (re-escape)
122// sanitize.TableCell(sanitize.InlineText(s)) TableCell already calls InlineText
123// sanitize.URL(sanitize.InlineText(href)) inline-escape backslash-escapes `.` `-` `_`
124// inside the URL, corrupting the host/path
125// sanitize.Blockquote(sanitize.Blockquote(s)) double-wrap — outer would escape the
126// inner `> ` prefixes
127// sanitize.Block(sanitize.BlockRich(s)) double-sanitize — strict Block re-escapes
128// the markers BlockRich preserved (headings,
129// lists, tables); BlockRich's rich structure
130// renders as literal text after Block escapes
131// its line-leaders
132// sanitize.BlockRich(sanitize.Block(s)) pointless double-sanitize — Block already
133// escaped every line-leader to `\#`/`\>`/etc.;
134// BlockRich preserves the backslash escapes
135// as visible artifacts in user prose
136// sanitize.Blockquote(sanitize.BlockRich(s)) double-sanitize — Blockquote's Block step
137// re-escapes the markers BlockRich preserved
138// sanitize.BlockRich(sanitize.Blockquote(s)) nonsense — Blockquote already line-prefixed
139// with `> `; BlockRich expects raw user content
140// sanitize.BlockquoteRich(sanitize.BlockRich(s)) double-wrap — Rich + Rich nests twice
141// sanitize.BlockRich(sanitize.TableCell(s)) wrong slot — use TableCell for cell content,
142// BlockRich for multi-paragraph block content
143// sanitize.TableCell(multiParagraphProse) newlines fold to space silently; use a
144// non-table layout for multi-paragraph text
145//
146// # Threat model
147//
148// Sanitizers in this package defend against:
149//
150// - bidi/zero-width injection: invisible characters that make
151// displayed text disagree with stored bytes (e.g. an address `g1abc...`
152// that renders as `g1xyz...`, or a username that visually collides
153// with another). Stripped by StripBidiAndZeroWidth, which runs as
154// the first step of every text-shaped helper.
155// - line-ending homoglyphs: CR-only and Unicode separators
156// (U+0085 NEL, U+2028, U+2029) that some renderers treat as line
157// breaks. Folded uniformly.
158// - markdown-structure injection: user content opening a heading,
159// blockquote, list, code fence, link-reference def, setext underline,
160// gnoweb extension delimiter, or GFM table row at document level.
161// Strict Block escapes the line-leading `|` of any GFM table row so
162// user content cannot inject `<table>`-shaped structure; permissive
163// BlockRich preserves table rows so authors can compose `<table>`
164// elements (gnoweb loads extension.Table per render_config.go).
165// - HTML block type 1-5 absorption: CommonMark §4.6 HTML block types 1
166// (`<script>`, `<pre>`, `<style>`, `<textarea>`), 2 (`<!--`), 3
167// (`<?`), 4 (`<!UPPER`), and 5 (`<![CDATA[`) do NOT close on a blank
168// line — they only close on a type-specific token (`</tag>`, `-->`,
169// `?>`, `>`, `]]>`) or EOF. Without a defense, user content opening
170// any of these would swallow realm chrome appended afterward. Both
171// Block and BlockRich line-escape the openers (prepend `\`) so the
172// block never opens; this defense is unconditional in both modes.
173// Types 6 and 7 close on a blank line, so BlockRich's `\n\n`
174// paragraph envelope already bounds them and no escape is needed.
175// - realm-discipline boundary (caller's responsibility, not enforced):
176// callers should emit realm chrome at flush-left column 0 around
177// `BlockRich(user)`. Indented chrome (4+ leading spaces, list-item
178// continuations, footnote-definition body, or an unclosed Type 1
179// HTML tag in realm chrome before the call) can extend across blank
180// lines into user content or vice versa. The sanitizer cannot
181// defend against malformed realm chrome — only against user input.
182// - footnote / link-reference namespace pollution: user content
183// containing `[^name]` or `[text][label]` syntax that would otherwise
184// resolve against realm-defined footnote definitions or link
185// reference definitions elsewhere on the page. Block escapes the
186// opening `[` in both shapes.
187// - reference-link / footnote-ref / shortcut-ref collisions:
188// `[text][label]`, `[^name]`, and bare `[label]` shortcut forms
189// are ALL neutralized by Block's bracket walk, which preserves
190// only inline `[text](url)` and `![alt](src)` syntax — everything
191// else has both `[` and `]` backslash-escaped, so the parser sees
192// literal text and can't resolve against realm-defined LRDs or
193// footnote definitions.
194// - multi-line LRD evasion: Block's walker recognises `[lab\nel]: url`
195// across newlines (single `\n` OK, blank line aborts) and strips
196// the whole region. `\]` inside the label is honored as an escaped
197// literal, so `[label\]: url` is NOT treated as an LRD (renders as
198// literal text).
199// - URL scheme abuse: javascript:, data:text/html, vbscript:, blob:,
200// protocol-relative //, mailto: with prefill phishing parameters.
201// Allowlist-only (URL / ImageURL).
202// - HTML attribute / element breakout: `"`, `<`, `>`, `&`, `'` inside
203// HTML lexical slots. Handled by HTMLEscape.
204// - CommonMark §2.3 NUL: replaced with U+FFFD by Block, InlineText,
205// LinkTitle, TableCell, HTMLEscape, InlineCode, CodeBlock, and
206// LanguageCodeBlock.
207// - code-fence leakage: a user-opened `` ``` `` fence that runs to EOF
208// with no closing fence, which would otherwise swallow every realm-
209// emitted line that follows. Block auto-closes any open fence at EOF.
210// - table-alignment drift: tabs inside table cells expanding to variable
211// widths (1-4 spaces depending on column position) and shifting cell
212// boundaries unpredictably. TableCell replaces tabs with single spaces.
213//
214// What this package does NOT do:
215//
216// - It does not store state. Every helper is a pure function.
217// - It does not validate semantic correctness. sanitize.URL accepts
218// a syntactically valid https:// URL even if the host is malicious;
219// URL reputation is a separate layer.
220// - It does not enforce CSS containment. ImageURL admits data:image/*
221// URIs on the assumption that the deploying gnoweb instance caps
222// rendered image dimensions via CSS. Without that cap, a malicious
223// image can blow out the page layout or exhaust memory.
224// - It does not perform structural sandboxing of foreign markdown.
225// If a realm concatenates an opaque markdown blob returned from a
226// polymorphic interface (`someThing.Render()`), it needs a structural
227// sandbox primitive (e.g. a `<gno-card>` extension), not just leaf
228// sanitization.
229//
230// # When to use Block vs BlockRich
231//
232// Both are safe sanitizers; both run identical realm-binding defenses.
233// They differ in what user-authored block structure survives:
234//
235// - Block — paragraph-shaped only. Escapes `#`, `>`, list markers,
236// `---`/`***`/`___` thematic breaks, and `===`/`---` setext
237// underlines. Use for leaf slots — footnote definition bodies,
238// table cells, blockquote bodies (Blockquote uses Block), single-
239// paragraph prose, any slot where richer structure has no benefit
240// or where richer structure could visually impersonate realm chrome.
241//
242// - BlockRich — full-richness. Preserves user-authored headings,
243// lists, quotes, HR, setext. Use for user content the realm intends
244// to compose with full block-level structure, typically inside a
245// sandbox container (`<gno-card>`, `<gno-foreign>`) or a CSS-demoted
246// region. BlockRich's qualifying-setext defense prevents the
247// cross-boundary attack (user content reaching back to promote
248// realm chrome to a heading), but inner-heading visual containment
249// is the realm's CSS responsibility. gnoweb does not yet ship CSS
250// rules that demote headings inside sandbox containers — until they
251// land, BlockRich + sandbox renders inner headings at literal size.
252//
253// Do NOT compose Block and BlockRich in either direction. Pick one
254// helper at the right level.
255//
256// # Extending
257//
258// A new helper added to this package MUST:
259//
260// 1. Be panic-free for any string input.
261// 2. Strip bidi+zero-width before any other transform (so display
262// equals storage end-to-end).
263// 3. Declare its idempotence class in the table above.
264// 4. Document the markdown / HTML lexical slot it targets.
265// 5. Reject rather than partially-sanitize when input is structurally
266// invalid (return "" — never half-process an address or URL).
267// 6. Pick exactly one of the two return-value contracts and stick to
268// it: escapers always return a transformed string and never reject
269// (any input is OK — the transformation makes it safe); validators
270// return the cleaned input verbatim on accept or "" on reject and
271// never half-process. Mixing the contracts within one helper is a
272// bug — callers can't reason about whether "" means "input was
273// already empty" or "input was rejected".
274package sanitize
275
276import (
277 "chain/markdown"
278 "html"
279 "strings"
280)
281
282// ----- Re-exports of the public chain/markdown natives -----
283//
284// These are general-purpose data-hygiene primitives, not markdown-specific.
285// The other helpers in this package call them internally, so realms emitting
286// markdown rarely need to call them directly. Reach for these when you have
287// a non-markdown use case — e.g. normalizing a username before storage,
288// canonicalizing a search query, or stripping invisible characters from
289// any user string that will be displayed or compared.
290
291// StripBidiAndZeroWidth removes Unicode bidi controls and zero-width
292// characters (U+200B-D, U+200E-F, U+202A-E, U+2066-9, U+FEFF) from s.
293// Use it when storing or comparing user-supplied strings outside of a
294// markdown context — for example, before saving a display name to state,
295// or before hashing a search query. Idempotent: calling twice gives the
296// same result.
297//
298// Thin wrapper over chain/markdown.StripBidiAndZeroWidth.
299func StripBidiAndZeroWidth(s string) string {
300 return markdown.StripBidiAndZeroWidth(s)
301}
302
303// NormalizeBreaks unifies CR-LF and lone CR to LF (CommonMark §2.2 line
304// endings only — does NOT touch U+2028/U+2029). Use it when comparing
305// or hashing user input that may have been authored on different
306// platforms (Windows CRLF vs. Unix LF), so equivalent strings normalize
307// to the same bytes. Idempotent.
308//
309// Thin wrapper over chain/markdown.NormalizeBreaks.
310func NormalizeBreaks(s string) string {
311 return markdown.NormalizeBreaks(s)
312}
313
314// ----- Escapers -----
315
316// InlineText prepares an arbitrary user string for an INLINE markdown
317// slot — anywhere the rendered output stays on a single line and lives
318// inside a larger markdown construct.
319//
320// Use for:
321// - link text: [InlineText(label)](url)
322// - heading text: # InlineText(title)
323// - bold/italic body: **InlineText(name)**
324// - image alt text: 
325// - single-line block-context slots:
326// > [!NOTE] InlineText(title)
327// > Author: InlineText(name)
328//
329// Multi-paragraph prose belongs in Block, not InlineText. InlineText
330// folds every newline to a single space (so paragraph structure is
331// erased) and escapes inline-active CommonMark punctuation:
332//
333// \ * _ [ ] ( ) ~ > - + . ! ` # < &
334//
335// Two characters are intentionally NOT escaped:
336//
337// - `|` — only meaningful in GFM table rows. Leaving it literal here
338// lets TableCell (which calls InlineText then escapes `|` itself)
339// avoid double-escaping pipes into `\\|`.
340// - `=` — only meaningful as a setext heading underline, which is a
341// line-level construct. Escaping `=` inline would mangle expressions
342// like `x = 1` for no benefit.
343//
344// Not idempotent (see package doc).
345func InlineText(s string) string {
346 s = markdown.StripBidiAndZeroWidth(s)
347 s = markdown.NormalizeBreaks(s) // CM §2.2 \r\n / \r → \n
348 s = foldNewlinesAndSeparators(s, ' ') // \n + NEL + U+2028/U+2029 → space
349 return markdown.EscapeInline(s)
350}
351
352// Block prepares user content for a top-level BLOCK markdown context
353// where paragraphs, line breaks, code blocks, and other block structure
354// should survive — but where the content must NOT be able to inject
355// new top-level constructs (headings, lists, blockquotes,
356// link-reference definitions, setext underlines, gnoweb extension
357// delimiters, GFM table rows).
358//
359// Output shape: every non-empty result begins AND ends with "\n\n" —
360// CM §4.8 blank lines on both sides — so user content is guaranteed
361// to occupy its own paragraph(s), isolated from any realm chrome that
362// precedes OR follows it. This bounds CM §4.6 HTML block types 6 and
363// 7 (`<div>`, `<table>`, `<form>`, arbitrary `<foo>` tags) which
364// close on a blank line and are NOT escaped in any mode, and it
365// defeats first-line setext promotion (`===`/`---`) that strict-mode
366// escapes miss when the previous line is blank in the user input but
367// non-blank in the concatenated realm output. Empty input (or input
368// that strips entirely, e.g. a lone LRD) returns "" — no envelope is
369// emitted.
370//
371// Use for any multi-paragraph user-supplied prose that the realm
372// concatenates into its rendered output:
373// - post bodies, comments, replies
374// - profile bios, About sections
375// - proposal descriptions, governance motions
376// - changelog entries, release notes
377//
378// What Block does with each kind of attacker input:
379//
380// User attempt | Block's response
381// ------------------------------------------------------|----------------------------------------------------
382// --- preserved verbatim --- |
383// [text](url) inline link, ![alt](src) image | preserved verbatim
384// ------------------------------------------------------|----------------------------------------------------
385// --- escaped / stripped / folded --- |
386// # heading at line-start | escaped → literal `# heading`
387// > quoted at line-start | escaped → literal `>`
388// - item, * item, + item, 1. item at line-start | escaped
389// ---, ***, ___ (3+) at line-start | escaped
390// === or --- on its own line after non-blank text | escaped (no setext promotion of the line above)
391// <gno-card>, <gno-columns>, any <gno-…>/</gno-…> at | escaped (wildcard match) → literal text
392// line-start |
393// | a | b | GFM table row (line-leading `|`) | escaped → literal `| a | b |`
394// <!--, <script>, <pre>, <style>, <textarea>, <?…?>, | escaped (\<…) → literal text;
395// <!DOCTYPE…>, <![CDATA[…]]> at line-start | blocks goldmark from opening a
396// (CM §4.6 HTML block types 1-5) | blank-line-NON-terminating HTML block
397// [text][realm-label] ref-link USE | both bracket pairs escaped → \[text\]\[realm-label\]
398// [^name] footnote-ref | both brackets escaped → \[^name\]
399// [label] bare shortcut-ref | both brackets escaped → \[label\]
400// [label]: url link-reference definition | whole region stripped (incl. multi-line label
401// (incl. [lab\nel]: url multi-line) | `[lab\nel]: url` and any title continuation)
402// [label\]: url (backslash-escaped `]`) | NOT stripped; brackets escaped → paragraph text
403// code fence opened without close | autoclosed at end of input
404// NUL byte (\x00) | replaced with U+FFFD
405// U+2028 / U+2029 / U+0085 (NEL) | folded to `\n`
406// bidi/zero-width controls | stripped
407//
408// COMPOSITION GOTCHA: Block's EOF fence-autoclose appends a final
409// fence line. If you wrap Block's output with a line-prefixing
410// builder like md.Blockquote (which prepends `> ` per line) or
411// md.Nested, that closing fence becomes a prefixed line. The output
412// is still safe (the fence still closes correctly) but may render
413// awkwardly. If pixel-perfect output matters, strip a trailing blank
414// fence line after Block.
415//
416// Why backslash and not a space for `<gno-…>` lines: gnoweb's
417// extension parsers call `util.TrimLeftSpace` on the line before tag
418// matching, which would strip a leading space and let the tag match
419// anyway. A leading `\` survives the trim (only ASCII whitespace +
420// form-feed are stripped) and is consumed by the inline escape phase
421// before Type-7 HTML block detection can fire (Type-7 requires the
422// first non-whitespace char to be `<`).
423//
424// Inline emphasis, code spans, inline links, and soft line breaks
425// within a paragraph are PRESERVED — users can format. Pipes that
426// are NOT at line-start stay literal so prose can still write things
427// like `a | b`.
428//
429// Idempotent: Block(Block(s)) is byte-identical to Block(s). The
430// bracket walker strips LRDs on the first pass; remaining `[`/`]`
431// outside inline-link/image spans are escaped to `\[`/`\]`, and
432// already-escaped brackets are preserved on subsequent passes
433// (pass-2 backslash-parity tracking). Still, wrap each user-supplied
434// string exactly once — chained sanitization adds no value and
435// burns gas.
436func Block(s string) string {
437 s = markdown.NormalizeBreaks(s)
438 s = markdown.StripBidiAndZeroWidth(s)
439 s = replaceNULWithFFFD(s)
440 s = markdown.EscapeBlockHazards(s)
441 // Symmetric "\n\n" envelope — same pattern BlockRich uses for the
442 // same reasons (see BlockRich docstring "Cross-paragraph safety").
443 // Strict mode escapes most line-leading hazards (setext, GFM table
444 // row, CM §4.6 HTML types 1-5, list/heading/HR markers), but two
445 // hazards remain that only a blank-line break can close:
446 //
447 // - CM §4.6 HTML block types 6 and 7 (`<div>`, `<table>`,
448 // `<form>`, arbitrary `<foo>` tags) are NOT escaped in any mode
449 // — they close on a blank line per CM. Without a trailing
450 // "\n\n", a `<div>` at the end of user content extends into
451 // appended realm chrome.
452 //
453 // - First-line setext: strict mode's setext escape only fires
454 // when the previous line is non-blank IN THE USER'S INPUT.
455 // A user whose first line is `===` slips past, and concatenated
456 // after `chrome\n` would promote chrome to H1. The leading
457 // "\n\n" forces a paragraph break so chrome cannot be merged.
458 //
459 // TrimLeft/TrimRight + fixed wrap is idempotent: Block(Block(s)) is
460 // byte-identical to Block(s). Empty post-escape result short-
461 // circuits to "" so realm concatenation doesn't leak stray blank
462 // lines for trivially empty inputs (e.g. lone LRD that strips
463 // entirely).
464 s = strings.TrimLeft(s, "\n")
465 s = strings.TrimRight(s, "\n")
466 if s == "" {
467 return ""
468 }
469 return "\n\n" + s + "\n\n"
470}
471
472// BlockRich is the permissive counterpart of Block. Both are safe
473// sanitizers — the distinction is what markdown structure survives:
474//
475// - Block escapes line-leading block markers (`#`, `>`, `-`, `*`,
476// `+`, `1.`), thematic breaks (`---`/`***`/`___`), and setext
477// underlines (`===`/`---`). User content becomes paragraph-shaped.
478// - BlockRich PRESERVES all of those, so user content can compose
479// headings, lists, quotes, horizontal rules, and setext-styled
480// headings. Realm-binding defenses stay on (extension delimiters
481// `<gno-…>`, the bracket walker for link / LRD / ref /
482// footnote / shortcut, fence autoclose, NUL / bidi /
483// Unicode-separator folding). GFM table-row openers are
484// PRESERVED (see "Tables" below).
485//
486// Cross-paragraph safety: BlockRich's output begins with "\n\n"
487// AND ends with "\n\n" — CM §4.8 blank lines on both sides — so
488// user content is guaranteed to occupy its own paragraph(s),
489// isolated from anything the realm emits before OR after. Symmetric
490// isolation closes four distinct attacks:
491//
492// - Cross-paragraph setext promotion (backward). User content
493// `body\n===\nmore` concatenated after realm chrome (no
494// trailing `\n`) would, without paragraph isolation, place
495// "chrome\nbody" in one paragraph; the `===` setext underline
496// would then promote that merged paragraph to H1, hijacking
497// realm chrome. The leading "\n\n" forces a paragraph break.
498//
499// - Cross-paragraph GFM table promotion (backward). User content
500// beginning with `|---|---|` (a table delimiter row) would,
501// without a blank-line break, retroactively turn the preceding
502// realm line into a `<thead>`. Paragraph isolation prevents
503// the table-detection scan from crossing the boundary.
504//
505// - Cross-paragraph GFM table promotion (forward). Realm chrome
506// appended immediately after BlockRich(user) that begins with
507// `|---|` would, without a trailing blank line, extend user's
508// last line into a table header and pull realm chrome into the
509// body row. The trailing "\n\n" prevents the merge.
510//
511// - Lazy paragraph continuation (forward). Paragraph-shaped realm
512// chrome appended immediately after BlockRich(user) would, via
513// CM §5.2, merge into user's trailing paragraph and inherit any
514// block-level decoration it carries.
515//
516// First-line qualifying-setext escape (the
517// `neuterLeadingSetextIfQualifying` pre-pass) remains in place as
518// belt-and-suspenders: if the first non-blank line of user input
519// matches the CM §4.3 setext-underline pattern (run of `=` or `-`
520// with 0-3 leading spaces and only trailing whitespace), BlockRich
521// inserts `\` before the first `=`/`-`. This is redundant given
522// paragraph isolation but harmless and inexpensive.
523//
524// # Tables
525//
526// BlockRich preserves line-leading `|` so user content can
527// compose GFM tables:
528//
529// | Header A | Header B |
530// |----------|----------|
531// | cell a | cell b |
532//
533// renders as a real `<table>` element. Strict Block continues to
534// escape line-leading `|` (each row becomes literal `\| a | b |`
535// text). When the realm authors the table itself and inserts user
536// content into a specific cell, use TableCell — NOT BlockRich —
537// to sanitize that cell value.
538//
539// What attacker input produces what (full table, same rows as Block
540// except where marked CHANGED):
541//
542// User attempt | BlockRich response
543// ----------------------------------------------|--------------------------------------------------
544// --- preserved (compose freely) --- |
545// # heading at line-start | preserved [CHANGED from Block]
546// > quoted at line-start | preserved [CHANGED]
547// - item, * item, + item, 1. item | preserved [CHANGED]
548// ---, ***, ___ thematic break | preserved [CHANGED]
549// === or --- setext underline | preserved when preceded by user text;
550// | escaped (\===/\---) if the first non-blank
551// | line of input [CHANGED]
552// | a | b | GFM table row (line-leading |) | preserved → renders as <table> when followed by
553// | a delimiter row [CHANGED]
554// [text](url), ![alt](src) | preserved verbatim [SAME]
555// ----------------------------------------------|--------------------------------------------------
556// --- escaped / stripped / folded --- |
557// <gno-card>, any <gno-…>/</gno-…> at line-start| escaped (wildcard match) [SAME]
558// <!--, <script>, <pre>, <style>, <textarea>, | escaped (\<…) [SAME] — Types 1-5 don't close
559// <?…?>, <!DOCTYPE…>, <![CDATA[…]]> | on blank lines, so `\n\n` envelope
560// at line-start (CM §4.6 HTML block types 1-5)| doesn't isolate them; explicit escape
561// [text][realm-label] ref-link USE | both pairs escaped [SAME]
562// [^name] footnote-ref | both brackets escaped [SAME]
563// [label] bare shortcut-ref | both brackets escaped [SAME]
564// [label]: url link-reference definition | whole region stripped [SAME]
565// [label\]: url (escaped `]`) | not stripped; brackets escaped [SAME]
566// code fence opened without close | autoclosed at end of input [SAME]
567// NUL byte (\x00) | replaced with U+FFFD [SAME]
568// U+2028 / U+2029 / U+0085 (NEL) | folded to `\n` [SAME]
569// bidi/zero-width controls | stripped [SAME]
570//
571// Use BlockRich for user content the realm intends to compose with
572// full block-level richness — typically inside a sandbox container
573// (`<gno-card>`, `<gno-foreign>`) or a CSS-demoted region where inner
574// headings render visually distinct from realm chrome. The realm
575// must own the visual containment: concatenating BlockRich's output
576// directly into a top-level page still lets the user write `# heading`
577// at document level. BlockRich's cross-boundary setext defense prevents
578// the worst case (reaching backwards into realm bytes), but visual
579// containment of inner headings is the realm's CSS responsibility.
580// gnoweb does not yet ship CSS rules that demote inner headings inside
581// `<gno-card>` / `<gno-foreign>` — until those rules land, realms
582// using BlockRich + a sandbox should be aware that inner headings
583// render at their literal level.
584//
585// Idempotent: BlockRich(BlockRich(s)) is byte-identical to
586// BlockRich(s). The TrimLeft-then-"\n\n"-prepend pattern strips
587// any leading newlines and reapplies exactly two, so the leading
588// shape is stable across passes; the qualifying-setext escape is
589// stable (a line beginning with `\` no longer matches the setext
590// pattern); and the bracket walker treats already-escaped
591// `\[`/`\]` as ordinary bytes. Empty input (or input that strips
592// to empty, e.g. a lone link-reference definition) returns "" —
593// realm concatenation doesn't get a stray blank line.
594// Still, wrap each user-supplied string exactly once — chained
595// sanitization adds no value and burns gas.
596//
597// Realm-discipline boundary: BlockRich defends user input against
598// every cross-paragraph attack listed above, but it CANNOT defend
599// against malformed REALM chrome. Specifically, callers should emit
600// realm chrome at flush-left column 0 around `BlockRich(user)`. If
601// the realm chrome BEFORE the call contains an unclosed CM §4.6
602// Type 1 HTML tag (`<script>`, `<pre>`, `<style>`, `<textarea>`),
603// the `\n\n` envelope does NOT close it (Type 1 closes only on the
604// matching close tag), and user-controlled `</tag>` content can
605// then prematurely terminate it. Indented chrome (4+ leading
606// spaces, list-item continuations, footnote-definition body) can
607// likewise extend across the envelope into user content. Keep
608// chrome flush-left and Type 1 tags closed within the chrome.
609//
610// PREVIEW: BlockquoteRich is currently the only in-tree caller of
611// BlockRich; the API and the `"\n\n"` output shape may evolve once
612// direct callers emerge.
613func BlockRich(s string) string {
614 s = markdown.NormalizeBreaks(s)
615 s = markdown.StripBidiAndZeroWidth(s)
616 s = replaceNULWithFFFD(s)
617 // Fold Unicode separators (U+2028, U+2029, U+0085 NEL) to '\n'
618 // BEFORE the setext-qualifying check. The native
619 // EscapeBlockHazardsRich also folds them internally, but the Gno
620 // helper below needs to see the folded form to correctly identify
621 // the first non-blank line — otherwise an attacker can hide the
622 // `===` setext underline behind a U+2028 / U+2029 / U+0085 and
623 // reach back to promote realm chrome above it.
624 s = foldSeparatorsToNewline(s)
625 s = neuterLeadingSetextIfQualifying(s)
626 s = markdown.EscapeBlockHazardsRich(s)
627 // Ensure the output BOTH starts AND ends with "\n\n" — CM §4.8
628 // blank lines on each side — so user content is GUARANTEED to
629 // occupy its own paragraph(s), isolated from anything the realm
630 // emits before OR after. Symmetric isolation closes four attacks:
631 //
632 // Backward (closed by leading "\n\n"):
633 // 1. Deeper-setext: user content `body\n===\nmore` concatenated
634 // after realm chrome (no trailing `\n`) would otherwise place
635 // "chrome\nbody" in one paragraph; the `===` setext underline
636 // would then promote that merged paragraph to H1, hijacking
637 // realm chrome.
638 // 2. GFM table-row promotion: user content beginning with
639 // `|---|---|` (a table delimiter row) would, without a blank-
640 // line break, retroactively promote the preceding realm line
641 // into a `<thead>` cell.
642 //
643 // Forward (closed by trailing "\n\n"):
644 // 3. GFM table-row promotion in reverse: realm appending its own
645 // chrome immediately after BlockRich(user), where chrome
646 // starts with `|---|`, would extend user's last line into a
647 // table header and pull realm chrome into the body row.
648 // 4. Lazy paragraph continuation: realm appending paragraph-
649 // shaped chrome immediately after BlockRich(user) would, via
650 // CM §5.2 lazy-continuation, merge into user's trailing
651 // paragraph and inherit any block-level decoration it carries.
652 //
653 // `neuterLeadingSetextIfQualifying` above is now belt-and-
654 // suspenders for the first-line setext case: even if the blank-
655 // line guarantee were somehow defeated by an exotic CM consumer,
656 // the first-line escape still blocks the simplest setext shape.
657 //
658 // Empty post-escape result short-circuits to "" so realm
659 // concatenation doesn't leak stray blank lines for trivially empty
660 // inputs (e.g. a lone link-reference definition that strips
661 // entirely).
662 //
663 // Idempotency: TrimLeft and TrimRight strip ALL leading/trailing
664 // "\n"s, then the wrap adds exactly two on each side. Stable
665 // across passes.
666 s = strings.TrimLeft(s, "\n")
667 s = strings.TrimRight(s, "\n")
668 if s == "" {
669 return ""
670 }
671 return "\n\n" + s + "\n\n"
672}
673
// foldSeparatorsToNewline rewrites each Unicode line separator —
// U+0085 NEL (bytes C2 85), U+2028 (E2 80 A8), U+2029 (E2 80 A9) —
// as a single '\n'. Every other byte, existing '\n' included, passes
// through untouched. BlockRich calls it so the qualifying-setext
// pre-pass and the native escaper both see the same line structure.
func foldSeparatorsToNewline(s string) string {
	// Skip the rebuild entirely when no candidate lead byte is present.
	if !containsAnyByteForFold(s) {
		return s
	}
	n := len(s)
	folded := make([]byte, 0, n)
	for i := 0; i < n; {
		// width is the byte length of a separator starting at i, or 0
		// when the bytes at i are not one of the three separators.
		width := 0
		switch s[i] {
		case 0xC2:
			if i+1 < n && s[i+1] == 0x85 {
				width = 2
			}
		case 0xE2:
			if i+2 < n && s[i+1] == 0x80 && (s[i+2] == 0xA8 || s[i+2] == 0xA9) {
				width = 3
			}
		}
		if width > 0 {
			folded = append(folded, '\n')
			i += width
			continue
		}
		folded = append(folded, s[i])
		i++
	}
	return string(folded)
}

// containsAnyByteForFold reports whether s holds a 0xC2 or 0xE2 byte,
// the only lead bytes that can begin a sequence rewritten by
// foldSeparatorsToNewline.
func containsAnyByteForFold(s string) bool {
	for i := 0; i != len(s); i++ {
		switch s[i] {
		case 0xC2, 0xE2:
			return true
		}
	}
	return false
}
710
// neuterLeadingSetextIfQualifying inspects the first non-blank line
// of s (a blank line holds only spaces and tabs). When that line has
// the shape of a CommonMark §4.3 setext underline — at most three
// leading space/tab bytes, a run made solely of `=` or solely of `-`,
// optional trailing spaces/tabs, then `\n` or end of input — a `\` is
// inserted in front of the run's first byte. Any other input is
// returned unchanged. The escape keeps a realm-emitted line above
// BlockRich's output from being retroactively promoted to a heading.
func neuterLeadingSetextIfQualifying(s string) string {
	// Find the first byte that is neither padding nor a newline, and
	// remember where the line containing it begins.
	lineStart, first := 0, -1
	for k := 0; k < len(s) && first < 0; k++ {
		switch s[k] {
		case '\n':
			lineStart = k + 1
		case ' ', '\t':
			// padding — keep scanning
		default:
			first = k
		}
	}
	if first < 0 {
		return s // only blank lines (or empty input)
	}
	if first-lineStart > 3 {
		return s // 4+ bytes of indentation: indented code, not setext
	}
	marker := s[first]
	if marker != '=' && marker != '-' {
		return s // line does not start with an underline character
	}
	// Consume the marker run, then any trailing padding.
	end := first + 1
	for end < len(s) && s[end] == marker {
		end++
	}
	for end < len(s) && (s[end] == ' ' || s[end] == '\t') {
		end++
	}
	// Only a line that ends right here qualifies; anything else on the
	// line means it is ordinary content.
	if end == len(s) || s[end] == '\n' {
		return s[:first] + `\` + s[first:]
	}
	return s
}
757
758// Blockquote wraps user content as a CommonMark blockquote: each line
759// of the cleaned content gets a "> " prefix so the renderer displays
760// it inside a `<blockquote>` element.
761//
762// Use for any multi-paragraph user-supplied text that the realm wants
763// to render as a quotation: cited posts, attached responses, error
764// snapshots that should visually stand out.
765//
766// The content is first cleaned by Block (bidi-strip, line-ending
767// normalize, NUL→U+FFFD, bracket walker for link/image/LRD spans,
768// block-marker escape, code-fence auto-close at EOF, Unicode-separator
769// fold). Block's "\n\n" cross-paragraph envelope is then stripped —
770// the `> ` marker creates the container boundary, so the envelope
771// would only line-prefix to empty `> ` lines top and bottom — and
772// every remaining line is prefixed with "> ". The user content can
773// still use inline emphasis, code spans, and nested fenced code blocks
774// inside the quote; what it cannot do is open new top-level structure
775// (heading, list, blockquote, GFM table row, etc.) or escape the
776// quote.
777//
778// Output shape — every non-empty result begins with "\n" and ends
779// with "\n\n" (same shape as BlockquoteRich):
780//
781// - Leading "\n" guarantees a clean blockquote opener even when the
782// realm concatenates `chrome + Blockquote(user)` without its own
783// newline separator.
784// - Trailing "\n\n" (blank line) cleanly ends the blockquote so a
785// realm appending `Blockquote(user) + chrome` cannot pull chrome
786// bytes into the quote via CommonMark §5.2 lazy continuation.
787//
788// Empty input (or input that strips entirely, e.g. a lone LRD)
789// returns "" — no blockquote is emitted.
790//
791// Composition gotcha: Block's EOF code-fence auto-close (added when
792// user content opens a ``` ``` ``` fence without closing it) becomes a
793// "> ```" line at the end of the blockquote. Goldmark parses this
794// correctly as the close of a fenced block inside the quote — the
795// output is structurally safe — but the markdown source looks unusual
796// to a human reviewer. If aesthetic output matters, ensure user
797// content closes its own fences.
798//
799// Not idempotent (see package doc): wraps with `> ` per line, so
800// calling twice double-wraps and the outer call's Block step escapes
801// the inner `>` prefixes.
802//
803// Do NOT compose with BlockRich in either direction:
804// - Blockquote(BlockRich(s)) double-sanitizes: BlockRich preserves
805// `#`/`>`/etc., then Blockquote's Block step escapes them again.
806// - BlockRich(Blockquote(s)) doesn't make sense: Blockquote already
807// line-prefixed with `> `; BlockRich expects raw user content.
808//
809// For a quoted body that can contain headings, lists, nested quotes,
810// or thematic breaks, use BlockquoteRich.
811func Blockquote(text string) string {
812 text = Block(text)
813 // Block wraps its output with "\n\n" on each side for cross-
814 // paragraph isolation. Inside a blockquote both wraps are redundant
815 // — the `> ` marker creates the container boundary — and they
816 // would line-prefix to two useless `> ` empty quoted lines top and
817 // bottom. Strip ALL leading and trailing "\n"s so the body starts
818 // and ends clean; this helper re-wraps with `\n` + body + `\n\n`
819 // below (same shape as BlockquoteRich).
820 text = strings.TrimLeft(text, "\n")
821 if text == "" {
822 return ""
823 }
824 text = strings.TrimRight(text, "\n")
825 if text == "" {
826 return ""
827 }
828 var sb strings.Builder
829 sb.WriteByte('\n')
830 for _, line := range strings.Split(text, "\n") {
831 sb.WriteString("> ")
832 sb.WriteString(line)
833 sb.WriteByte('\n')
834 }
835 sb.WriteByte('\n')
836 return sb.String()
837}
838
839// BlockquoteRich is the permissive counterpart of Blockquote. Both
840// wrap user content as a CommonMark blockquote (each line prefixed
841// with `> `), but they differ in what block-level structure inside
842// the quote survives:
843//
844// - Blockquote escapes line-leading block markers, so the quoted
845// body is paragraph-shaped — `# x` inside a Blockquote stays a
846// literal `#`.
847// - BlockquoteRich PRESERVES line-leading block markers, so the
848// quoted body can compose ATX headings, lists, thematic breaks,
849// nested blockquotes (`> > nested`), and other block-level
850// structure. Realm-binding defenses stay on (extension delimiters,
851// GFM table-row openers, bracket walker, fence autoclose,
852// NUL / bidi / Unicode-separator folding).
853//
854// Output shape — every non-empty result begins with "\n" and ends
855// with "\n\n":
856//
857// - Leading "\n" guarantees a clean blockquote opener even when the
858// realm concatenates `chrome + BlockquoteRich(user)` without its
859// own newline separator. Without the leading "\n", chrome ending
860// mid-line followed by "> quoted" would render `>` as literal
861// paragraph text instead of opening a blockquote.
862// - Trailing "\n\n" (blank line) cleanly ends the blockquote so a
863// realm appending `BlockquoteRich(user) + chrome` cannot pull
864// chrome bytes into the quote via CommonMark §5.2 lazy
865// continuation. Without the trailing blank line, paragraph chrome
866// immediately after BlockquoteRich would render inside the quote.
867// - BlockRich's own leading "\n\n" (paragraph-isolation blank line)
868// is stripped before line-prefixing — otherwise the output would
869// carry one or two redundant empty `> ` quoted lines at the top.
870// A single "\n" is then re-prepended at the BlockquoteRich
871// boundary so `chrome + BlockquoteRich(user)` still lands the
872// first `>` at column 0.
873// - The cross-boundary setext defense BlockRich provides is
874// redundant inside a blockquote: a setext underline inside `> `
875// content can only promote a line in the same blockquote, never
876// reach realm bytes (different CM container). BlockRich still
877// applies it, harmlessly.
878//
879// What attacker input produces what (rows that differ from
880// Blockquote are marked CHANGED):
881//
882// User attempt | BlockquoteRich response
883// ----------------------------------------------|------------------------------------------------
884// --- preserved inside `> ` quote --- |
885// # heading | preserved as `> # heading` [CHANGED]
886// > nested quote | preserved as `> > nested quote` [CHANGED]
887// - item, * item, + item, 1. item | preserved as `> - item` etc. [CHANGED]
888// ---, ***, ___ thematic break | preserved [CHANGED]
889// === or --- setext underline | preserved when preceded by user text;
890// | escaped (\===/\---) if first non-blank
891// | line of input [CHANGED]
892// | a | b | GFM table row (line-leading |) | preserved → renders as <table> inside the
893// | blockquote when followed by a delimiter row [CHANGED]
// [text](url), ![alt](url) | preserved verbatim [SAME]
895// ----------------------------------------------|------------------------------------------------
896// --- escaped / stripped / folded --- |
897// <gno-card>, any <gno-…>/</gno-…> at line-start| escaped (wildcard match) [SAME]
898// <!--, <script>, <pre>, <style>, <textarea>, | escaped (\<…) [SAME] — CM §4.6 Types 1-5
899// <?…?>, <!DOCTYPE…>, <![CDATA[…]]> | don't close on blank lines; without escape
900// at line-start | they would swallow chrome past the `> ` quote
901// [text][realm-label] ref-link USE | both pairs escaped [SAME]
902// [^name] footnote-ref | both brackets escaped [SAME]
903// [label] bare shortcut-ref | both brackets escaped [SAME]
904// [label]: url link-reference definition | whole region stripped [SAME]
905// code fence opened without close | autoclosed at end of input [SAME]
906// NUL byte (\x00) | replaced with U+FFFD [SAME]
907// U+2028 / U+2029 / U+0085 (NEL) | folded to `\n` [SAME]
908// bidi/zero-width controls | stripped [SAME]
909//
910// Use BlockquoteRich when the realm wants to render user content as
911// a quotation that itself reads like authored markdown — the visual
912// CSS containment of `<blockquote>` already demotes inner headings
913// relative to realm chrome, so the "inner headings need a sandbox"
914// caveat that applies to BlockRich at top level does not apply here.
915//
916// Not idempotent: like Blockquote, calling twice double-wraps —
917// `BlockquoteRich(BlockquoteRich(s))` produces `> > content`,
918// nesting the quote a level deeper each pass.
919//
920// Empty input (or input that reduces to nothing after BlockRich,
921// e.g. a lone link-reference definition) returns "" — no blockquote
922// is emitted and neither the leading "\n" nor the trailing "\n\n"
923// shape applies.
924func BlockquoteRich(text string) string {
925 text = BlockRich(text)
926 // BlockRich wraps user content with "\n\n" on each side for
927 // cross-paragraph isolation. Inside a blockquote both wraps are
928 // redundant — the `> ` marker creates the container boundary —
929 // and they would line-prefix to two useless `> ` empty quoted
930 // lines top and bottom. Strip ALL leading and trailing "\n"s so
931 // the body starts and ends clean; this helper re-wraps with `\n`
932 // + body + `\n\n` below.
933 text = strings.TrimLeft(text, "\n")
934 if text == "" {
935 return ""
936 }
937 // Strip ALL trailing newlines so the loop produces exactly one
938 // `> line` per content line, then append `\n\n` at the end so the
939 // blockquote terminates cleanly (see "Output shape" above).
940 text = strings.TrimRight(text, "\n")
941 if text == "" {
942 return ""
943 }
944 var sb strings.Builder
945 // Leading "\n" so `chrome + BlockquoteRich(user)` cannot land the
946 // first `>` mid-line.
947 sb.WriteByte('\n')
948 for _, line := range strings.Split(text, "\n") {
949 sb.WriteString("> ")
950 sb.WriteString(line)
951 sb.WriteByte('\n')
952 }
953 // Trailing blank line so `BlockquoteRich(user) + chrome` cannot
954 // pull chrome into the quote via lazy continuation.
955 sb.WriteByte('\n')
956 return sb.String()
957}
958
959// LinkTitle prepares user content for a CommonMark link-title or
960// image-title slot — the optional quoted text after the URL in any of
961// these forms:
962//
//	[text](url "TITLE")
//	![alt](url "TITLE")
//	[label]: url "TITLE"
966//
967// Escapes the inline-active set plus `"` and `'` (the title delimiters
968// that aren't already in the inline set; `(` and `)` are), so the
969// caller can choose any of the three title-quote styles safely.
970//
971// Pick the right helper for the slot — markdown title and HTML
972// attribute share the look but use different escape rules:
973//
974// [text](url "X") → LinkTitle (markdown title)
975// <a title="X"> → HTMLEscape (HTML attribute)
976// <h5>X</h5> → HTMLEscape (HTML element body)
977//
// Swapping HTMLEscape for LinkTitle is wrong: HTML's `&amp;` written
// inside a markdown title renders as the literal characters `&amp;`.
980// Swapping LinkTitle for HTMLEscape is wrong: markdown's `\"` survives
981// into the rendered HTML as a literal backslash-quote.
982//
983// Not idempotent (see package doc).
984func LinkTitle(s string) string {
985 s = markdown.StripBidiAndZeroWidth(s)
986 s = markdown.NormalizeBreaks(s)
987 s = foldNewlinesAndSeparators(s, ' ')
988 return markdown.EscapeTitle(s)
989}
990
991// TableCell prepares user content for a GFM table cell — the bytes
992// between two `|` column delimiters in a table row like
993// `| cell-a | cell-b | cell-c |`. An unescaped `|` inside cell
994// content would open a new column, letting a malicious user shift
995// every column to its right.
996//
997// On top of InlineText's behavior, TableCell:
998// - escapes `|` to `\|` so user content can't end the cell early.
999// - replaces tabs with single spaces. CommonMark expands tabs to
1000// the next multiple-of-4 column boundary (variable 1-4 spaces),
1001// which would shift the displayed cell-content width unpredictably
1002// and confuse table alignment.
1003//
1004// Not idempotent (see package doc).
1005func TableCell(s string) string {
1006 s = InlineText(s)
1007 s = strings.ReplaceAll(s, "\t", " ")
1008 s = strings.ReplaceAll(s, "|", `\|`)
1009 return s
1010}
1011
1012// HTMLEscape prepares user content for an HTML lexical slot inside
1013// markdown — covers attribute values, element bodies, and HTML
1014// comment bodies:
1015//
1016// <gno-card type="..." caption="X"> attribute value
1017// <gno-alert title="X"> attribute value
1018// <h5>X</h5> element body
1019// <details><summary>X</summary>... element body
1020// <!-- X --> comment body (safe: `>`
// becomes `&gt;`, so user
1022// cannot inject `-->`)
1023//
1024// HTMLEscape escapes the union of attribute-breaking and body-breaking
1025// characters (`<`, `>`, `&`, `"`, `'`), so one function safely serves
1026// every HTML lexical context. Callers don't have to remember which
1027// subset to use for which slot.
1028//
1029// Pick the right helper — markdown title and HTML attribute share
1030// the look but use different escape rules:
1031//
1032// [text](url "X") → LinkTitle (markdown title)
1033// <span title="X"> → HTMLEscape (HTML attribute)
1034// <h5>X</h5> → HTMLEscape (HTML element body)
1035//
// Swapping InlineText for HTMLEscape is wrong: markdown's backslash
// escapes survive into the rendered HTML as literal `\*`. Swapping
// HTMLEscape for LinkTitle is also wrong: `&amp;` written inside a
// markdown title renders as the literal characters `&amp;`.
1040//
// Not idempotent (see package doc): a second call re-escapes the
// first call's output, `&amp;` → `&amp;amp;`.
1043func HTMLEscape(s string) string {
1044 s = markdown.StripBidiAndZeroWidth(s)
1045 s = markdown.NormalizeBreaks(s)
1046 s = foldNewlinesAndSeparators(s, ' ')
1047 s = replaceNULWithFFFD(s)
1048 return html.EscapeString(s)
1049}
1050
1051// ----- URL filters -----
1052
1053// URL validates a URL for use as a link href, percent-encodes unsafe
1054// bytes, and rejects anything outside the allowlist of schemes.
1055//
1056// Allowlist:
1057// - http, https
1058// - mailto (rejected if contains ?body= or &body= — prefill phishing)
1059// - any URL WITHOUT a scheme — relative paths (`/path`, `./rel`,
1060// `bare-path`), query-only (`?q=v`), fragment-only (`#anchor`).
1061// A `:` appearing inside the URL (e.g. `/path:foo`, `?q=a:b`) is
1062// NOT a scheme separator per RFC 3986 — only `:` immediately after
1063// a leading `[a-zA-Z][a-zA-Z0-9+.-]*` counts.
1064//
1065// Rejected (have an unknown scheme):
1066// - javascript:, data:, vbscript:, blob:, file:, etc.
1067// - `//host/...` (protocol-relative — tracking-pixel vector)
1068//
1069// Returns "" if the URL is empty after trim or fails the allowlist.
1070func URL(s string) string {
1071 s = strings.TrimSpace(s)
1072 if s == "" {
1073 return ""
1074 }
1075 if !linkSchemeAllowed(s) {
1076 return ""
1077 }
1078 return markdown.PercentEncodeURL(s)
1079}
1080
1081// ImageURL validates a URL for use as an image src. Kept separate from
1082// URL — not a parameterized variant — because the allowlist shapes
1083// differ qualitatively (data:image/* vs. mailto:) and a single boolean
1084// flag would invite callers to pass the wrong default.
1085//
1086// Allowlist:
1087// - http, https
1088// - schemeless relative URLs starting with /, ./, or ..
1089// (rejects // protocol-relative — tracking-pixel vector)
1090// - data:image/svg+xml, data:image/png, data:image/jpeg,
1091// data:image/gif, data:image/webp
1092//
1093// Any other data: subtype is rejected — data:text/html etc. would
1094// render as inline HTML and execute embedded scripts.
1095//
1096// DEPLOYMENT PRECONDITION: data: URIs encode the bytes of the image
1097// directly into the markup, so a malicious sender can construct an
1098// image whose pixel dimensions are arbitrarily large at minimal byte
1099// cost. The deploying gnoweb instance MUST clamp rendered image
1100// dimensions via CSS (e.g. `max-width: 100%; max-height: <bound>`).
1101// Without that cap, a single image can blow out the page layout or
1102// exhaust the browser's memory.
1103//
1104// Returns "" if the URL is empty after trim or fails the allowlist.
1105func ImageURL(s string) string {
1106 s = strings.TrimSpace(s)
1107 if s == "" {
1108 return ""
1109 }
1110 if !imageSchemeAllowed(s) {
1111 return ""
1112 }
1113 return markdown.PercentEncodeURL(s)
1114}
1115
1116// ----- Validators -----
1117
// Charset bitmaps passed to markdown.MatchCharsetN by the validators
// below. Each validator owns a "first byte" set and a "remaining
// bytes" set; each set is split across two words, Lo for byte values
// 0-63 and Hi for 64-127 (see setBit). All of them are filled in once
// by init.
var (
	userNameFirstLo, userNameFirstHi, userNameRestLo, userNameRestHi                     uint64
	footnoteLabelFirstLo, footnoteLabelFirstHi, footnoteLabelRestLo, footnoteLabelRestHi uint64
	langFirstLo, langFirstHi, langRestLo, langRestHi                                     uint64
	bechHrpFirstLo, bechHrpFirstHi, bechHrpRestLo, bechHrpRestHi                         uint64
	bechDataFirstLo, bechDataFirstHi, bechDataRestLo, bechDataRestHi                     uint64
)

// init populates every charset bitmap declared above.
func init() {
	const (
		lower  = "abcdefghijklmnopqrstuvwxyz"
		upper  = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		digits = "0123456789"
	)
	// mark sets the bit of every byte of chars in the (lo, hi) bitmap.
	mark := func(lo, hi *uint64, chars string) {
		for i := 0; i < len(chars); i++ {
			setBit(lo, hi, chars[i])
		}
	}

	// UserName: first [a-z], rest [a-z0-9_-].
	mark(&userNameFirstLo, &userNameFirstHi, lower)
	mark(&userNameRestLo, &userNameRestHi, lower+digits+"_-")

	// FootnoteLabel: [A-Za-z0-9_-] for both first and rest.
	const labelChars = upper + lower + digits + "_-"
	mark(&footnoteLabelFirstLo, &footnoteLabelFirstHi, labelChars)
	mark(&footnoteLabelRestLo, &footnoteLabelRestHi, labelChars)

	// LanguageName: [a-zA-Z0-9_+-] for both first and rest.
	const langChars = upper + lower + digits + "_+-"
	mark(&langFirstLo, &langFirstHi, langChars)
	mark(&langRestLo, &langRestHi, langChars)

	// Bech HRP (when prefix==""): [a-z], 1-16 chars.
	mark(&bechHrpFirstLo, &bechHrpFirstHi, lower)
	mark(&bechHrpRestLo, &bechHrpRestHi, lower)

	// Bech data part: [a-z0-9], 6-90 chars.
	mark(&bechDataFirstLo, &bechDataFirstHi, lower+digits)
	mark(&bechDataRestLo, &bechDataRestHi, lower+digits)
}

// setBit turns on the bit for byte c in a bitmap split across two
// words: values below 64 land in *lo at position c, all others in *hi
// at position c-64.
func setBit(lo, hi *uint64, c byte) {
	word, pos := hi, c-64
	if c < 64 {
		word, pos = lo, c
	}
	*word |= uint64(1) << pos
}
1206
1207// UserName validates the r/sys/users-registration charset:
1208// ^[a-z][a-z0-9]*([_-][a-z0-9]+)*$ length ≤ 64.
1209//
1210// The native MatchCharsetN enforces the leading-letter + tail-charset
1211// shape and length bound; this helper also performs the bidi-strip
1212// pre-pass. The "no consecutive [_-]" rule from r/sys/users is NOT
1213// enforced here (it's a registration-policy rule, not a sanitization
1214// concern — registrations go through r/sys/users itself).
1215//
1216// Returns the (bidi-stripped) input if valid, "" otherwise. On a ""
1217// return, do not emit the user-mention markup at all (e.g. skip the
1218// `[@user](/u/user)` link); falling back to the raw user-supplied
1219// string would defeat the validation.
1220func UserName(s string) string {
1221 s = markdown.StripBidiAndZeroWidth(s)
1222 if markdown.MatchCharsetN(s, userNameFirstLo, userNameFirstHi, userNameRestLo, userNameRestHi, 1, 64) {
1223 return s
1224 }
1225 return ""
1226}
1227
1228// BechString validates a bech32-style address-like string.
1229//
1230// A bech32 string has the shape `<hrp>1<data>`: a human-readable
1231// prefix (HRP) that names the family (e.g. `g` for gno addresses,
1232// `gpub` for gno pubkeys, `cosmos` for cosmos addresses), the
1233// separator character `1`, then a data part carrying the encoded
1234// payload as lowercase alphanumerics.
1235//
1236// If prefix != "", requires s to start with prefix+"1" exactly, and the
1237// data part to match ^[a-z0-9]{6,90}$. Use this when you know the
1238// expected family:
1239//
1240// sanitize.BechString(addr, "g") // only g1... (addresses)
1241// sanitize.BechString(pk, "gpub") // only gpub1... (pubkeys)
1242//
1243// If prefix == "", accepts any reasonable bech32 shape:
1244// ^[a-z]{1,16}1[a-z0-9]{6,90}$.
1245//
1246// Syntactic only — does NOT verify the bech32 checksum. Use a true
1247// bech32 decoder if you need that. Returns the cleaned input on
1248// accept, "" on reject; on "" return, do not emit the address-link
1249// markup (the user-supplied bytes have failed shape validation and
1250// should not appear unmodified in output).
1251func BechString(s, prefix string) string {
1252 s = markdown.StripBidiAndZeroWidth(s)
1253 if s == "" {
1254 return ""
1255 }
1256 if prefix != "" {
1257 // HRP must be lowercase ASCII letters.
1258 for i := 0; i < len(prefix); i++ {
1259 c := prefix[i]
1260 if c < 'a' || c > 'z' {
1261 return ""
1262 }
1263 }
1264 need := prefix + "1"
1265 if !strings.HasPrefix(s, need) {
1266 return ""
1267 }
1268 data := s[len(need):]
1269 if markdown.MatchCharsetN(data, bechDataFirstLo, bechDataFirstHi, bechDataRestLo, bechDataRestHi, 6, 90) {
1270 return s
1271 }
1272 return ""
1273 }
1274 // prefix == "" — accept any 1-16 char lowercase HRP, then '1', then data.
1275 sep := strings.IndexByte(s, '1')
1276 if sep < 1 || sep > 16 {
1277 return ""
1278 }
1279 hrp := s[:sep]
1280 if !markdown.MatchCharsetN(hrp, bechHrpFirstLo, bechHrpFirstHi, bechHrpRestLo, bechHrpRestHi, 1, 16) {
1281 return ""
1282 }
1283 data := s[sep+1:]
1284 if markdown.MatchCharsetN(data, bechDataFirstLo, bechDataFirstHi, bechDataRestLo, bechDataRestHi, 6, 90) {
1285 return s
1286 }
1287 return ""
1288}
1289
1290// FootnoteLabel validates an identifier used as a footnote name, link-
1291// reference-definition label, or {#id} anchor: ^[A-Za-z0-9_-]{1,64}$.
1292// Strips bidi/zero-width first. Returns s if valid, "" otherwise.
1293//
1294// Use for every shape where a markdown identifier is treated as an
1295// opaque key by the parser:
1296//
1297// - footnote-definition labels: [^FootnoteLabel(name)]: body
1298// - footnote-reference labels: see [^FootnoteLabel(name)]
1299// - link-reference-definition labels: [FootnoteLabel(label)]: url
1300// - reference-link USE labels: [text][FootnoteLabel(label)]
1301// - goldmark auto-anchor {#id}: # Heading {#FootnoteLabel(id)}
1302//
1303// The shared validator name reflects the shared charset and shared
1304// security goal — keep untrusted bytes out of any parser-managed
1305// identifier slot.
1306//
1307// On "" return, omit the footnote / LRD / anchor entirely rather than
1308// emitting it with raw user bytes.
1309func FootnoteLabel(s string) string {
1310 s = markdown.StripBidiAndZeroWidth(s)
1311 if markdown.MatchCharsetN(s, footnoteLabelFirstLo, footnoteLabelFirstHi, footnoteLabelRestLo, footnoteLabelRestHi, 1, 64) {
1312 return s
1313 }
1314 return ""
1315}
1316
1317// LanguageName validates the language tag (a.k.a. "info string") for
1318// a fenced code block — the `go` in:
1319//
1320// ```go
1321// fmt.Println("hi")
1322// ```
1323//
1324// Charset: ^[a-zA-Z0-9_+-]{1,32}$ — letters, digits, `_`, `+`, `-`,
1325// up to 32 bytes. Strips bidi/zero-width first.
1326//
1327// Returns the cleaned input if valid, "" otherwise. A "" return means
1328// the caller should emit a language-less fence (``` without a tag)
1329// rather than letting the user pick the syntax highlighter — which
1330// could otherwise be used to inject newlines or block markers into
1331// what becomes the opening fence line.
1332func LanguageName(s string) string {
1333 s = markdown.StripBidiAndZeroWidth(s)
1334 if markdown.MatchCharsetN(s, langFirstLo, langFirstHi, langRestLo, langRestHi, 1, 32) {
1335 return s
1336 }
1337 return ""
1338}
1339
1340// NestedPrefix validates a prefix string for line-prefixing builders
1341// like md.Nested, which prepends `prefix` to every line of content
1342// to render the content as a nested/indented sub-block.
1343//
1344// Allowed: any string matching `^[ \t>]*$` — spaces, tabs, blockquote
1345// `>` chars only. Anything else (a `#`, a `-`, a letter) would let a
1346// caller turn benign sub-content into a heading, list, or paragraph
1347// at the wrong nesting level.
1348//
1349// Returns s if valid, "" otherwise. Strips bidi/zero-width first —
1350// otherwise an invisible character hidden inside a `>` prefix would
1351// be replicated on every nested content line, producing per-line
1352// display-vs-storage divergence.
1353//
1354// On "" return, fall back to a known-safe prefix literal (e.g.
1355// `"> "`) or skip the nesting entirely. Do not emit the raw
1356// user-supplied prefix.
1357func NestedPrefix(s string) string {
1358 s = markdown.StripBidiAndZeroWidth(s)
1359 for i := 0; i < len(s); i++ {
1360 c := s[i]
1361 if c != ' ' && c != '\t' && c != '>' {
1362 return ""
1363 }
1364 }
1365 return s
1366}
1367
1368// ----- Primitive -----
1369
1370// CodeFence returns a string of backticks long enough to wrap content
1371// as a CommonMark fenced code block without the content's own backticks
1372// closing the fence prematurely.
1373//
1374// Returned length N = max(minCount, longestBacktickRunInContent + 1).
1375// Use N backticks both before and after the content:
1376//
1377// fence := sanitize.CodeFence(userCode, 3)
1378// out += fence + "\n" + userCode + "\n" + fence + "\n"
1379//
1380// Typical minCount values:
1381// - 1 for inline code spans (`x`)
1382// - 3 for block fenced code (CommonMark §4.5 requires ≥3)
1383//
1384// `minCount < 1` is clamped to 1. Empty content returns
1385// strings.Repeat("`", max(minCount, 1)). Never panics.
1386//
1387// Most realms should reach for InlineCode / CodeBlock /
1388// LanguageCodeBlock below, which call CodeFence internally and emit
1389// the full code block for you. Call CodeFence directly only when
1390// you're rolling a custom fence emitter (e.g. a renderer that needs
1391// the fence length but emits the body differently).
1392func CodeFence(content string, minCount int) string {
1393 return markdown.CodeFence(content, minCount)
1394}
1395
1396// InlineCode wraps user content as a CommonMark inline code span — the
1397// `code` in “ `code` “. Use for any user-derived token, identifier,
1398// or short literal that should render in monospace inside running
1399// prose: variable names, hashes, hex addresses, token symbols, error
1400// codes, package paths, transaction IDs.
1401//
1402// Inline code spans cannot span lines (a `\n` inside the content would
1403// end the span and leave the surrounding backticks as literal text),
1404// so all line breaks — CR / CRLF / LF, NEL (U+0085), U+2028, U+2029 —
1405// are folded to a single space. If you want each line of user content
1406// on its own row, use CodeBlock instead.
1407//
1408// Behavior:
1409// - Bidi/zero-width controls are stripped (browsers honor bidi marks
1410// inside `<code>`, so leaving them would let stored bytes display
1411// as something different).
1412// - NUL is replaced with U+FFFD.
1413// - The wrapping fence is one backtick longer than the longest
1414// backtick run in the content, so internal backticks can never
1415// close the span prematurely.
1416// - A single space pad is added on each side when content starts or
1417// ends with “ ` “ or space, so leading/trailing backticks render
1418// literally rather than fusing with the fence (the renderer
1419// strips one space from each side per CommonMark spec).
1420//
1421// Empty input returns "" rather than a literal two-backtick string
1422// (which CommonMark parses as text, not as an empty code span). If
1423// you use InlineCode as link text and it returns "", omit the link
1424// entirely.
1425//
1426// Not idempotent (see package doc): wraps with a fence, so calling
1427// twice double-wraps.
1428func InlineCode(content string) string {
1429 content = markdown.StripBidiAndZeroWidth(content)
1430 content = markdown.NormalizeBreaks(content)
1431 content = foldNewlinesAndSeparators(content, ' ')
1432 content = replaceNULWithFFFD(content)
1433 if content == "" {
1434 return ""
1435 }
1436 fence := markdown.CodeFence(content, 1)
1437 pad := ""
1438 if content[0] == '`' || content[0] == ' ' ||
1439 content[len(content)-1] == '`' || content[len(content)-1] == ' ' {
1440 pad = " "
1441 }
1442 return fence + pad + content + pad + fence
1443}
1444
1445// CodeBlock wraps user content as a CommonMark fenced code block.
1446// Use for any user-derived multi-line snippet that should render as a
1447// code block: log excerpts, JSON dumps, error backtraces, config
1448// snippets, posted code samples.
1449//
1450// Behavior:
1451// - Bidi/zero-width controls are stripped.
1452// - CR/CRLF line endings are normalized to LF; Unicode separators
1453// (NEL U+0085, U+2028, U+2029) are folded to LF for line-count
1454// consistency.
1455// - NUL is replaced with U+FFFD per CM §2.3.
1456// - The wrapping fence is at least 3 backticks (CM §4.5 minimum) and
1457// sized to outscan internal backticks — an attacker cannot embed
1458// a closing fence in the content.
1459//
1460// Empty content emits an empty fenced block ("```\n\n```\n"), which is
1461// valid CommonMark and renders as an empty `<pre><code></code></pre>`.
1462//
1463// Not idempotent (see package doc).
1464func CodeBlock(content string) string {
1465 content = markdown.StripBidiAndZeroWidth(content)
1466 content = markdown.NormalizeBreaks(content)
1467 content = foldNewlinesAndSeparators(content, '\n')
1468 content = replaceNULWithFFFD(content)
1469 fence := markdown.CodeFence(content, 3)
1470 return fence + "\n" + content + "\n" + fence + "\n"
1471}
1472
1473// LanguageCodeBlock wraps user content as a fenced code block tagged
1474// with a programming-language hint (the "info string" after the
1475// opening fence, e.g. `go` in ```` ```go ````) so the renderer can
1476// apply syntax highlighting.
1477//
1478// An invalid `language` tag silently falls back to a tagless fence —
1479// the helper never returns an error or panics. If a realm author is
1480// debugging "why is my Go highlighting gone?", the input failed the
1481// language validator (charset ^[a-zA-Z0-9_+-]{1,32}$ after bidi-strip).
1482// This fallback exists because an unvalidated tag could contain a
1483// newline that injects content (e.g. a heading) onto what becomes the
1484// opening fence line.
1485//
1486// Content is cleaned exactly as in CodeBlock (bidi-strip, CR/CRLF
1487// normalize to LF, NEL/U+2028/U+2029 fold to LF, NUL→U+FFFD, fence
1488// sized to outscan internal backticks).
1489//
1490// Not idempotent (see package doc).
1491func LanguageCodeBlock(language, content string) string {
1492 content = markdown.StripBidiAndZeroWidth(content)
1493 content = markdown.NormalizeBreaks(content)
1494 content = foldNewlinesAndSeparators(content, '\n')
1495 content = replaceNULWithFFFD(content)
1496 fence := markdown.CodeFence(content, 3)
1497 lang := LanguageName(language) // "" on reject
1498 return fence + lang + "\n" + content + "\n" + fence + "\n"
1499}
1500
1501// ----- Reference-style definitions -----
1502
1503// FootnoteDefinition emits a GFM footnote definition — the
1504// `[^name]: body` form that introduces a footnote whose body is rendered
1505// in the page footer (or wherever the renderer chooses to place it).
1506// Other parts of the markdown reference the footnote by writing
1507// `[^name]` inline.
1508//
1509// Use for any realm-rendered footnote where the body text comes from
1510// user input. The realm picks the footnote name (passed as `name`,
1511// validated by FootnoteLabel — failure here returns ""); the user's
1512// content goes in `text`, which is sanitized via Block.
1513//
1514// Contract:
1515// - `name`: passed raw, validated as a FootnoteLabel
1516// (^[A-Za-z0-9_-]{1,64}$). Reject → return "".
1517// - `text`: passed raw multi-paragraph user prose, cleaned via Block
1518// (bidi-strip, line-ending normalize, LRD strip, block-marker
1519// escape, ref-link USE escape, fence auto-close).
1520//
1521// Empty body → returns "" (a label without body is not a valid
1522// footnote definition; the markdown would parse as a paragraph
1523// containing the label).
1524//
1525// Output shape:
1526//
1527// [^name]:
1528// line 1 of body
1529// line 2 of body
1530// ...
1531//
1532// The label sits on its own line and each body line gets a 4-space
1533// indent — the GFM continuation rule that keeps multi-paragraph body
1534// text bound to the footnote rather than detaching as a new paragraph.
1535//
1536// Not idempotent (see package doc): composes Block internally; passing
1537// already-sanitized body text double-escapes.
1538func FootnoteDefinition(name, text string) string {
1539 label := FootnoteLabel(name)
1540 if label == "" {
1541 return ""
1542 }
1543 // Block now wraps with "\n\n" on both sides for cross-paragraph
1544 // isolation; inside a footnote-definition's 4-space-indented body
1545 // the wrap would line-prefix to blank padding lines, so strip ALL
1546 // leading and trailing "\n"s before continuation-indenting.
1547 body := strings.Trim(Block(text), "\n")
1548 if body == "" {
1549 return ""
1550 }
1551 var b strings.Builder
1552 b.WriteString("[^")
1553 b.WriteString(label)
1554 b.WriteString("]:\n")
1555 for _, line := range strings.Split(body, "\n") {
1556 if line == "" {
1557 b.WriteByte('\n')
1558 } else {
1559 b.WriteString(" ")
1560 b.WriteString(line)
1561 b.WriteByte('\n')
1562 }
1563 }
1564 return b.String()
1565}
1566
1567// LinkReferenceDefinition emits a CommonMark link reference definition
1568// (CM §4.7) — the `[label]: url "title"` form that other parts of the
1569// markdown reference by writing `[text][label]` or `[label]` (shortcut).
1570//
1571// Use for any realm-rendered LRD where the realm owns the label but
1572// any of the URL or title come from user input. The user content for
1573// the URL goes through URL (allowlist-based — reject → ""); the title
1574// goes through LinkTitle (escape).
1575//
1576// Contract:
1577// - `label`: passed raw, validated as a FootnoteLabel
1578// (^[A-Za-z0-9_-]{1,64}$). Realms should choose a namespaced label
1579// using dashes (e.g. `r-myrealm-help`) so shortcut-reference
1580// invocations from user content can't collide with bare prose
1581// (`[help]`, `[click here]`). `/` is not in the FootnoteLabel
1582// charset; reject → return "".
1583// - `url`: passed raw, sanitized via URL. If URL rejects, the LRD is
1584// skipped (return "").
1585// - `title`: passed raw, sanitized via LinkTitle. Empty title → no
1586// title clause emitted.
1587//
1588// The output is framed with leading and trailing blank lines so that
1589// the definition cannot accidentally fuse with adjacent paragraph
1590// content into a setext underline or a continuation line.
1591//
1592// Not idempotent (see package doc).
1593func LinkReferenceDefinition(label, url, title string) string {
1594 lbl := FootnoteLabel(label)
1595 if lbl == "" {
1596 return ""
1597 }
1598 safeURL := URL(url)
1599 if safeURL == "" {
1600 return ""
1601 }
1602 var b strings.Builder
1603 b.WriteString("\n\n[")
1604 b.WriteString(lbl)
1605 b.WriteString("]: ")
1606 b.WriteString(safeURL)
1607 if title != "" {
1608 b.WriteString(" \"")
1609 b.WriteString(LinkTitle(title))
1610 b.WriteString("\"")
1611 }
1612 b.WriteString("\n\n")
1613 return b.String()
1614}
1615
1616// ----- internal helpers -----
1617
// linkSchemeAllowed reports whether s passes the scheme allowlist used by
// the URL helper.
//
// Decision order:
//   - `http://` and `https://` prefixes are accepted.
//   - `mailto:` is accepted unless the URL carries a `body` header field
//     introduced by `?` or `&` (body prefill is a phishing vector). The
//     field name is matched case-insensitively: mail header field names
//     are case-insensitive, so `?Body=` or `&BODY=` must be rejected the
//     same way as `?body=`.
//   - Protocol-relative URLs (`//host/...`) are rejected.
//   - Anything else that begins with an RFC 3986 scheme is rejected
//     (`javascript:`, `data:`, `vbscript:`, `blob:`, and any scheme not
//     listed above, including upper-case spellings such as `HTTP:`).
//   - Strings with no scheme are treated as relative references and
//     accepted (bare path, query-only, fragment).
//
// NOTE(review): a percent-encoded field name (e.g. `?%62ody=`) is not
// decoded here and is therefore not caught by the body check.
// NOTE(review): this function does not trim leading whitespace or control
// bytes; it presumably relies on the caller to have removed them before
// the scheme check. Confirm against the URL helper.
func linkSchemeAllowed(s string) bool {
	switch {
	case strings.HasPrefix(s, "http://"), strings.HasPrefix(s, "https://"):
		return true
	case strings.HasPrefix(s, "mailto:"):
		// Fold case so every spelling of the `body` field is matched.
		folded := strings.ToLower(s)
		prefillsBody := strings.Contains(folded, "?body=") || strings.Contains(folded, "&body=")
		return !prefillsBody
	case strings.HasPrefix(s, "//"):
		// Protocol-relative: rejected (tracking-pixel vector).
		return false
	}
	// Unknown scheme: reject. No scheme: relative reference, accept.
	return !hasURLScheme(s)
}

// hasURLScheme reports whether s starts with an RFC 3986 scheme followed
// by ':' (^[a-zA-Z][a-zA-Z0-9+.-]*:). A ':' that appears only after a
// non-scheme character, as in `/path:foo` or `?q=a:b`, does not count.
func hasURLScheme(s string) bool {
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'):
			// Letters are valid in every scheme position.
		case i > 0 && c == ':':
			// Terminator after at least one scheme character.
			return true
		case i > 0 && ((c >= '0' && c <= '9') || c == '+' || c == '.' || c == '-'):
			// Digits and "+.-" are valid only after the first character.
		default:
			return false
		}
	}
	// Reached the end (or s is empty) without finding ':'.
	return false
}
1668
// imageSchemeAllowed reports whether s passes the scheme allowlist used
// by the ImageURL helper. It is stricter than linkSchemeAllowed: mailto
// is not accepted, scheme-less references must start with `/`, `./` or
// `../`, and `data:` is limited to a fixed set of image media types.
//
// Decision order:
//   - `http://` and `https://` prefixes are accepted.
//   - Protocol-relative URLs (`//host/...`) are rejected.
//   - Paths starting with `/`, `./` or `../` are accepted.
//   - `data:` URLs are accepted only when they start with one of the
//     listed `data:image/...` prefixes.
//   - Everything else is rejected.
func imageSchemeAllowed(s string) bool {
	switch {
	case strings.HasPrefix(s, "http://"), strings.HasPrefix(s, "https://"):
		return true
	case strings.HasPrefix(s, "//"):
		// Must be tested before the single-slash case below.
		return false
	case strings.HasPrefix(s, "/"), strings.HasPrefix(s, "./"), strings.HasPrefix(s, "../"):
		return true
	case !strings.HasPrefix(s, "data:"):
		return false
	}

	// Only the curated image/* subset. CSS must enforce sizing.
	imagePrefixes := []string{
		"data:image/svg+xml",
		"data:image/png",
		"data:image/jpeg",
		"data:image/gif",
		"data:image/webp",
	}
	for _, prefix := range imagePrefixes {
		if strings.HasPrefix(s, prefix) {
			return true
		}
	}
	return false
}
1699
1700// foldNewlinesAndSeparators replaces \n, U+0085 NEL, U+2028 LINE SEPARATOR,
1701// U+2029 PARAGRAPH SEPARATOR with the given replacement byte (typically
1702// space for inline-context helpers).
1703//
1704// NormalizeBreaks has already folded \r\n and \r to \n before this runs,
1705// so \n is the canonical break byte to substitute.
1706func foldNewlinesAndSeparators(s string, replacement byte) string {
1707 if !needsSeparatorFold(s) {
1708 return s
1709 }
1710 out := make([]byte, 0, len(s))
1711 for i := 0; i < len(s); {
1712 c := s[i]
1713 if c == '\n' {
1714 out = append(out, replacement)
1715 i++
1716 continue
1717 }
1718 // U+0085 NEL: 0xC2 0x85
1719 if c == 0xC2 && i+1 < len(s) && s[i+1] == 0x85 {
1720 out = append(out, replacement)
1721 i += 2
1722 continue
1723 }
1724 // U+2028 (0xE2 0x80 0xA8) or U+2029 (0xE2 0x80 0xA9)
1725 if c == 0xE2 && i+2 < len(s) && s[i+1] == 0x80 && (s[i+2] == 0xA8 || s[i+2] == 0xA9) {
1726 out = append(out, replacement)
1727 i += 3
1728 continue
1729 }
1730 out = append(out, c)
1731 i++
1732 }
1733 return string(out)
1734}
1735
// needsSeparatorFold is a cheap pre-check for foldNewlinesAndSeparators.
// It reports whether s contains "\n" or one of the UTF-8 lead bytes 0xC2
// or 0xE2, which begin the encodings of U+0085, U+2028 and U+2029. Those
// lead bytes also begin other characters, so a true result means "may
// need folding", not "contains a separator".
func needsSeparatorFold(s string) bool {
	return strings.IndexByte(s, '\n') >= 0 ||
		strings.IndexByte(s, 0xC2) >= 0 ||
		strings.IndexByte(s, 0xE2) >= 0
}
1745
// replaceNULWithFFFD returns s with every NUL byte replaced by the UTF-8
// encoding of U+FFFD REPLACEMENT CHARACTER, as CommonMark §2.3 specifies.
// A string without NUL is returned unchanged.
func replaceNULWithFFFD(s string) string {
	if strings.IndexByte(s, 0) < 0 {
		return s
	}

	var out strings.Builder
	out.Grow(len(s) + 2)
	for i := 0; i < len(s); i++ {
		if s[i] == 0 {
			out.WriteString("\ufffd")
			continue
		}
		out.WriteByte(s[i])
	}
	return out.String()
}