foreign.gno
9.96 Kb · 241 lines
1// Package foreign provides the realm-side helper for emitting the
2// gno-foreign sandbox block. Realm authors wrap externally-built
3// markdown (markdown returned by an interface method on a foreign
4// realm, fetched from chain storage owned by another realm, etc.) in
5// Foreign before flowing it into rendered output, so gnoweb renders
6// the body inside its own goldmark sub-instance with structural
7// extensions selectively loaded.
8//
9// The renderer-side contract lives in
10// gno.land/pkg/gnoweb/markdown/ext_foreign.go. This helper produces
11// bytes that satisfy the parser's opener requirements (CommonMark
12// §4.6 Type-7 HTML block, no attribute fall-through) and neutralizes
13// any literal sentinel lines in the body so the foreign markdown
14// cannot terminate the outer block prematurely.
15package foreign
16
17import (
18 "chain/markdown"
19 "strings"
20)
21
22// Foreign wraps body in a `<gno-foreign>` ... `</gno-foreign>` sandbox
23// block. The returned string is ready to concatenate into a larger
24// markdown document.
25//
26// Three normalization steps apply to body:
27//
28// 1. \r\n and bare \r line endings are normalized to \n. The parser
29// uses byte-equal matching against the sentinel close tag, so
30// mixed line endings would otherwise change the match boundary.
31//
32// 2. Any line whose trimmed content looks like a gno-foreign tag
33// opener OR closer — bare (`<gno-foreign>`, `</gno-foreign>`) or
34// attribute-bearing (`<gno-foreign label="x">`, `</gno-foreign
35// attr="…">`, etc.) — is neutralized by HTML-escaping the leading
36// `<` to `&lt;`. The parser tokenizes line bytes literally, so the
37// escaped form is seen as text and cannot terminate the outer
38// block or open an unintended inner block.
39//
40// Crucially, BOTH open-tag and close-tag attribute-bearing forms
41// are neutralized. The parser recognizes a bare `<gno-foreign>`
42// opener, a labeled `<gno-foreign label="x">` opener, and ANY
43// `</gno-foreign…>` closer (golang.org/x/net/html drops attrs on
44// end tags before our recognizer sees them, so attr-bearing
45// closers are sentinel-equivalent). Leaving any of those forms
46// un-neutralized in body bytes would let attacker-supplied
47// markdown adjust the parser's framing-depth counter and either
48// consume the helper's own close (capturing trailing realm
49// content into the sandbox) or close the outer block early
50// (escaping the sandbox entirely).
51//
52// There is therefore NO nesting via the helper: Foreign(Foreign(x))
53// escapes the inner call's own `<gno-foreign>`/`</gno-foreign>`
54// lines, so the inner block renders as visible literal text inside
55// one sandbox, not as a nested sandbox. This is intended — wrapping
56// foreign-built markdown that itself contains gno-foreign sentinels
57// must neutralize them, not honor them.
58//
59// 3. A leading and trailing blank line are emitted around the
60// opener / closer. CommonMark §4.6 forbids Type-7 HTML blocks
61// from interrupting a paragraph; without the blank line, an
62// opener following a non-blank line is absorbed into the
63// preceding paragraph instead of opening a sandbox.
64//
65// The renderer caps cross-family nesting at 4 levels and per-Convert
66// foreign blocks at 256. Beyond those caps, the opener falls through
67// to raw HTML and is stripped by the renderer's safe mode.
68func Foreign(body string) string {
69 return wrapForeign("", body)
70}
71
72// ForeignWithLabel wraps body like Foreign but emits an explicit
73// `label="…"` attribute on the opener so the rendered sandbox carries
74// a caller-supplied label (e.g., "Pulled from /r/foo") shown as a
75// strip above the body. The label is sanitized so it cannot inject
76// HTML or break out of the attribute value:
77//
78// - NUL bytes are dropped.
79// - Other control characters (U+0000–U+001F, U+007F) become spaces.
80// - `&`, `<`, `>`, and `"` are replaced with their HTML entities.
81// - Leading/trailing whitespace is trimmed.
82//
83// A label that is empty after sanitization behaves identically to
84// Foreign: no attribute is emitted, and the renderer shows the sandbox
85// box with NO label strip (there is no default label text).
86func ForeignWithLabel(label, body string) string {
87 return wrapForeign(label, body)
88}
89
90// MaxBlocksPerRender is gnoweb's per-render cap on the number of
91// <gno-foreign> blocks a single page render admits; beyond it, later
92// blocks fall through to raw HTML and are dropped. A realm emitting
93// many foreign blocks (e.g. one per comment) should keep its rendered
94// total under this. Re-exports chain/markdown.MaxForeignBlocksPerConvert
95// — the single source of truth the gnoweb renderer also reads — so
96// callers get the cap without importing chain/markdown directly.
97func MaxBlocksPerRender() int {
98 return markdown.MaxForeignBlocksPerConvert()
99}
100
101func wrapForeign(rawLabel, body string) string {
102 label := sanitizeLabel(rawLabel)
103
104 // Normalize line endings (CR/CRLF → LF). The parser matches the
105 // sentinel close against \n-delimited lines, so mixed line endings
106 // would otherwise shift the match boundary. CR/CRLF → LF ONLY: do
107 // not fold Unicode separators here — they must stay verbatim in the
108 // body so the inner renderer sees the foreign markdown unaltered.
109 body = markdown.NormalizeBreaks(body)
110
111 // Mangle any line that would terminate the outer block or open
112 // an inner one. Covers bare and attribute-bearing forms of both
113 // the opener and the closer (see step 2 in the package doc).
114 var b strings.Builder
115 // b accumulates only the body lines (the opener/closer envelope is
116 // concatenated separately below), so len(body) is the exact size in
117 // the common case. Sentinel lines that expand `<`→`<` may force
118 // one growth — rare enough not to pre-size for.
119 b.Grow(len(body))
120 lines := strings.Split(body, "\n")
121 for i, line := range lines {
122 if isForeignSentinelLine(trimSentinel(line)) {
123 // Escape just the leading `<` so the html tokenizer
124 // sees this as text instead of a tag. Preserve any 0-3
125 // leading spaces the parser's trim would have stripped.
126 idx := strings.Index(line, "<")
127 if idx >= 0 {
128 line = line[:idx] + "<" + line[idx+1:]
129 }
130 }
131 b.WriteString(line)
132 if i < len(lines)-1 {
133 b.WriteByte('\n')
134 }
135 }
136
137 opener := "<gno-foreign>"
138 if label != "" {
139 opener = `<gno-foreign label="` + label + `">`
140 }
141 return "\n\n" + opener + "\n" + b.String() + "\n</gno-foreign>\n\n"
142}
143
144// sanitizeLabel makes a user-supplied label safe to splice into an
145// HTML attribute value on the gno-foreign opener line.
146func sanitizeLabel(s string) string {
147 // Strip bidi-override and zero-width controls FIRST — same ordering
148 // as the sanitize package's HTMLEscape — so invisible reordering or
149 // zero-width payloads can't survive into the rendered label.
150 s = markdown.StripBidiAndZeroWidth(s)
151 // Drop NUL; map other ASCII controls AND the Unicode line/paragraph
152 // separators (U+2028, U+2029, U+0085 NEL) to spaces. The opener is a
153 // single line, so any of these surviving in the label would either
154 // add a control payload or, for the separators, render as a stray
155 // line break inside the attribute.
156 s = strings.Map(func(r rune) rune {
157 if r == 0 {
158 return -1
159 }
160 if r < 0x20 || r == 0x7f || r == 0x2028 || r == 0x2029 || r == 0x0085 {
161 return ' '
162 }
163 return r
164 }, s)
165 // Escape `&` first so subsequent entity bytes don't get
166 // re-escaped.
167 s = strings.ReplaceAll(s, "&", "&")
168 s = strings.ReplaceAll(s, `"`, """)
169 s = strings.ReplaceAll(s, "<", "<")
170 s = strings.ReplaceAll(s, ">", ">")
171 return strings.TrimSpace(s)
172}
173
174// isForeignSentinelLine reports whether s (already trimmed via
175// trimSentinel) begins with the gno-foreign tag prefix and so must be
176// neutralized before it can reach the renderer-side parser.
177//
178// Deliberately OVER-INCLUSIVE: it matches any line whose trimmed form
179// starts (case-INSENSITIVELY) with `<gno-foreign` or `</gno-foreign`,
180// regardless of what follows. This is a strict superset of every line
181// goldmark's html.Tokenizer can recognize as a <gno-foreign> opener or
182// closer, which is what makes it safe:
183//
184// - The tokenizer lowercases tag names, so `<GNO-FOREIGN>` etc. are
185// sentinels; the prefix match is case-folded to mirror that.
186// - The tokenizer ends a tag name at ANY of several terminators
187// (`>`, space, tab, form-feed, `/`). A precise check that
188// enumerates terminators keeps missing variants — e.g.
189// `</gno-foreign/>` and `</gno-foreign\f>` are both recognized as
190// closers by the parser. Matching on the prefix alone cannot miss
191// one: if a body line could be parsed as a sentinel, it starts with
192// this prefix and is escaped here.
193//
194// The only cost is that an unrelated longer tag like `<gno-foreignx>`
195// (a different tag name, not a sentinel) is also escaped — rendered as
196// visible literal text instead of being raw-HTML-stripped — which is
197// harmless for foreign body bytes.
198func isForeignSentinelLine(s string) bool {
199 return hasASCIIFoldPrefix(s, "</gno-foreign") || hasASCIIFoldPrefix(s, "<gno-foreign")
200}
201
202// hasASCIIFoldPrefix reports whether s begins with prefix, comparing
203// ASCII letters case-insensitively. prefix must be lowercase ASCII;
204// folding is ASCII-only on purpose (Unicode case folding would
205// over-match, and the sentinel envelope is pure ASCII anyway).
func hasASCIIFoldPrefix(s, prefix string) bool {
	n := len(prefix)
	if n > len(s) {
		// s is too short to contain prefix at all.
		return false
	}
	head := s[:n]
	for i := 0; i < n; i++ {
		b := head[i]
		// Fold only s's bytes, and only A-Z; prefix is compared as-is,
		// which is why it has to be lowercase already.
		if 'A' <= b && b <= 'Z' {
			b = b - 'A' + 'a'
		}
		if b != prefix[i] {
			return false
		}
	}
	return true
}
221
222// trimSentinel returns line with the leading 0-3 spaces and trailing
223// ASCII whitespace that the parser's trimForeignLine strips. Mirrors
224// the byte-level trim the parser performs so this helper detects the
225// same sentinel match the parser would.
func trimSentinel(s string) string {
	// Skip at most three leading spaces; a fourth space (or a tab) is
	// left in place.
	start := 0
	for start < 3 && start < len(s) && s[start] == ' ' {
		start++
	}

	// Walk back from the end over ASCII whitespace bytes.
	end := len(s)
	for end > start {
		switch s[end-1] {
		case ' ', '\t', '\n', '\v', '\f', '\r':
			end--
			continue
		}
		break
	}
	return s[start:end]
}