vendor/github.com/google/safehtml/html.go


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

// Copyright (c) 2017 The Go Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file or at
// https://developers.google.com/open-source/licenses/bsd

package safehtml

import (
	"bytes"
	"html"
	"unicode"

	"golang.org/x/text/unicode/rangetable"
)

// An HTML is an immutable string-like type that is safe to use in HTML
// contexts in DOM APIs and HTML documents.
//
// HTML guarantees that its value as a string will not cause untrusted script
// execution when evaluated as HTML in a browser.
//
// Values of this type are guaranteed to be safe to use in HTML contexts,
// such as assignment to the innerHTML DOM property, or interpolation into an
// HTML template in HTML PC_DATA context, in the sense that the use will not
// result in a Cross-site Scripting (XSS) vulnerability.
//
// Within this file, HTML values are produced by HTMLEscaped and HTMLConcat and
// are read back through the String method.
type HTML struct {
	// We declare an HTML not as a string but as a struct wrapping a string
	// to prevent construction of HTML values through string conversion.
	//
	// str holds the text returned by String. The field is unexported, so code
	// outside this package cannot set it directly.
	str string
}

// HTMLer is implemented by any value that has an HTML method, which defines the
// safe HTML format for that value.
type HTMLer interface {
	// HTML returns the value's representation as an HTML.
	HTML() HTML
}

// HTMLEscaped wraps text in an HTML after escaping the characters [&<>"'].
//
// Before escaping, text is coerced to interchange-valid UTF-8, so the
// resulting HTML contains only valid UTF-8 characters which are legal in HTML
// and XML.
func HTMLEscaped(text string) HTML {
	escaped := escapeAndCoerceToInterchangeValid(text)
	return HTML{str: escaped}
}

// HTMLConcat returns an HTML which contains, in order, the string representations
// of the given htmls.
//
// Calling it with no arguments yields an HTML whose string form is empty.
func HTMLConcat(htmls ...HTML) HTML {
	var b bytes.Buffer
	// The loop variable is deliberately not called "html": that name would
	// shadow the imported html package for the rest of the loop body.
	for _, h := range htmls {
		b.WriteString(h.String())
	}
	return HTML{str: b.String()}
}

// String returns the string form of the HTML.
//
// The wrapped value is returned exactly as stored; no escaping or other
// transformation is applied here.
func (h HTML) String() string {
	return h.str
}

// escapeAndCoerceToInterchangeValid first coerces str to interchange-valid
// UTF-8 and then HTML-escapes the coerced text.
func escapeAndCoerceToInterchangeValid(str string) string {
	coerced := coerceToUTF8InterchangeValid(str)
	return html.EscapeString(coerced)
}

// coerceToUTF8InterchangeValid coerces a string to interchange-valid UTF-8.
// Illegal UTF-8 bytes are replaced with the Unicode replacement character
// ('\uFFFD'). C0 and C1 control codes (other than CR LF HT FF), DEL, and
// non-characters are also replaced with the Unicode replacement character.
//
// The result is always a freshly built string, even when s needed no changes.
func coerceToUTF8InterchangeValid(s string) string {
	// TODO: Replace this entire function with stdlib function if https://golang.org/issue/25805 gets addressed.

	// Build the output as UTF-8 bytes directly instead of collecting a []rune
	// (4 bytes per input byte) and re-encoding it at the end.
	var b bytes.Buffer
	b.Grow(len(s))
	// If s contains any invalid UTF-8 byte sequences, range will have r
	// contain the Unicode replacement character and there's no need to call
	// utf8.ValidRune. I.e. iteration over the string implements
	// CoerceToStructurallyValid() from C++/Java.
	// See https://blog.golang.org/strings.
	for _, r := range s {
		if unicode.Is(controlAndNonCharacter, r) {
			r = unicode.ReplacementChar
		}
		b.WriteRune(r)
	}
	return b.String()
}

// controlAndNonCharacter contains the non-interchange-valid codepoints: the
// merge of unicode.Noncharacter_Code_Point and controlChar.
//
// See http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
//
// safehtml functions do a lot of lookups on these tables, so merging them is probably
// worth it to avoid comparing against both tables each time.
//
// controlChar is declared further down in this file; Go initializes
// package-level variables in dependency order, so it is ready when Merge runs.
var controlAndNonCharacter = rangetable.Merge(unicode.Noncharacter_Code_Point, controlChar)

// controlChar lists the Unicode control characters that are not permitted in
// interchange-valid UTF-8. It is slightly different from unicode.Cc:
// - NUL stays disallowed.
// - HT (U+0009), LF (U+000A), FF (U+000C), and CR (U+000D) are allowed.
//
// unicode.IsControl mentions unicode.C, which holds "special" characters:
// at least control characters, surrogate code points, and formatting
// codepoints (e.g. word joiner). Not all of those need to be excluded here.
// In particular, surrogates are already taken care of by the for loop that
// turns invalid UTF-8 byte sequences into the Unicode replacement character.
var controlChar = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: 0x0000, Hi: 0x0008, Stride: 1}, // NUL through BS
		{Lo: 0x000B, Hi: 0x000B, Stride: 1}, // VT
		{Lo: 0x000E, Hi: 0x001F, Stride: 1}, // SO through US
		{Lo: 0x007F, Hi: 0x009F, Stride: 1}, // DEL and the C1 controls
	},
	// All four ranges end at or below unicode.MaxLatin1.
	LatinOffset: 4,
}