diff options
Diffstat (limited to 'vendor/github.com/google/safehtml/html.go')
| -rw-r--r-- | vendor/github.com/google/safehtml/html.go | 117 |
1 files changed, 117 insertions, 0 deletions
diff --git a/vendor/github.com/google/safehtml/html.go b/vendor/github.com/google/safehtml/html.go new file mode 100644 index 000000000..27c0f337d --- /dev/null +++ b/vendor/github.com/google/safehtml/html.go @@ -0,0 +1,117 @@ +// Copyright (c) 2017 The Go Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +package safehtml + +import ( + "bytes" + "html" + "unicode" + + "golang.org/x/text/unicode/rangetable" +) + +// An HTML is an immutable string-like type that is safe to use in HTML +// contexts in DOM APIs and HTML documents. +// +// HTML guarantees that its value as a string will not cause untrusted script +// execution when evaluated as HTML in a browser. +// +// Values of this type are guaranteed to be safe to use in HTML contexts, +// such as assignment to the innerHTML DOM property, or interpolation into an +// HTML template in HTML PC_DATA context, in the sense that the use will not +// result in a Cross-site Scripting (XSS) vulnerability. +type HTML struct { + // We declare an HTML not as a string but as a struct wrapping a string + // to prevent construction of HTML values through string conversion. + str string +} + +// HTMLer is implemented by any value that has an HTML method, which defines the +// safe HTML format for that value. +type HTMLer interface { + HTML() HTML +} + +// HTMLEscaped returns an HTML whose value is text, with the characters [&<>"'] escaped. +// +// text is coerced to interchange valid, so the resulting HTML contains only +// valid UTF-8 characters which are legal in HTML and XML. +// +func HTMLEscaped(text string) HTML { + return HTML{escapeAndCoerceToInterchangeValid(text)} +} + +// HTMLConcat returns an HTML which contains, in order, the string representations +// of the given htmls. +func HTMLConcat(htmls ...HTML) HTML { + var b bytes.Buffer + for _, html := range htmls { + b.WriteString(html.String()) + } + return HTML{b.String()} +} + +// String returns the string form of the HTML. +func (h HTML) String() string { + return h.str +} + +// escapeAndCoerceToInterchangeValid coerces the string to interchange-valid +// UTF-8 and then HTML-escapes it. +func escapeAndCoerceToInterchangeValid(str string) string { + return html.EscapeString(coerceToUTF8InterchangeValid(str)) +} + +// coerceToUTF8InterchangeValid coerces a string to interchange-valid UTF-8. +// Illegal UTF-8 bytes are replaced with the Unicode replacement character +// ('\uFFFD'). C0 and C1 control codes (other than CR LF HT FF) and +// non-characters are also replaced with the Unicode replacement character. +func coerceToUTF8InterchangeValid(s string) string { + // TODO: Replace this entire function with stdlib function if https://golang.org/issue/25805 gets addressed. + runes := make([]rune, 0, len(s)) + // If s contains any invalid UTF-8 byte sequences, range will have rune + // contain the Unicode replacement character and there's no need to call + // utf8.ValidRune. I.e. iteration over the string implements + // CoerceToStructurallyValid() from C++/Java. + // See https://blog.golang.org/strings. + for _, rune := range s { + if unicode.Is(controlAndNonCharacter, rune) { + runes = append(runes, unicode.ReplacementChar) + } else { + runes = append(runes, rune) + } + } + return string(runes) +} + +// controlAndNonCharacters contains the non-interchange-valid codepoints. +// +// See http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream +// +// safehtml functions do a lot of lookups on these tables, so merging them is probably +// worth it to avoid comparing against both tables each time. +var controlAndNonCharacter = rangetable.Merge(unicode.Noncharacter_Code_Point, controlChar) + +// controlChar contains Unicode control characters disallowed in interchange +// valid UTF-8. This table is slightly different from unicode.Cc: +// - Disallows null. +// - Allows LF, CR, HT, and FF. +// +// unicode.C is mentioned in unicode.IsControl; it contains "special" characters +// which includes at least control characters, surrogate code points, and +// formatting codepoints (e.g. word joiner). We don't need to exclude all of +// those. In particular, surrogates are handled by the for loop converting +// invalid UTF-8 byte sequences to the Unicode replacement character. +var controlChar = &unicode.RangeTable{ + R16: []unicode.Range16{ + {0x0000, 0x0008, 1}, + {0x000B, 0x000B, 1}, + {0x000E, 0x001F, 1}, + {0x007F, 0x009F, 1}, + }, + LatinOffset: 4, +} |
