aboutsummaryrefslogtreecommitdiffstats
path: root/vendor/github.com/google/safehtml/urlset.go
blob: 8d74a7732b13a54b10ea3818dcad2d2832f99ce0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
// Copyright (c) 2017 The Go Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file or at
// https://developers.google.com/open-source/licenses/bsd

package safehtml

import (
	"bytes"
	"strconv"
)

// asciiWhitespace marks the bytes that the WHATWG Infra standard classifies
// as ASCII whitespace: U+0009 TAB, U+000A LF, U+000C FF, U+000D CR and
// U+0020 SPACE.
//
// https://infra.spec.whatwg.org/#ascii-whitespace
//
// The table is written as a composite literal instead of being filled in by
// an init function so that it is already populated while package-level
// variables are being initialized, which happens before any init function.
var asciiWhitespace = [256]bool{
	'\t': true,
	'\n': true,
	'\f': true,
	'\r': true,
	' ':  true,
}

// srcsetMetachars marks the metacharacters that affect parsing of srcset
// values: every ASCII whitespace byte plus the comma.
var srcsetMetachars = [256]bool{
	'\t': true,
	'\n': true,
	'\f': true,
	'\r': true,
	' ':  true,
	',':  true,
}

// URLSetSanitized builds a safe srcset value from str by vetting every image
// candidate on its own and keeping only the candidates that pass.
//
// Each candidate is read as a URL (a run of non-whitespace bytes) followed by
// optional metadata (a run of bytes that are neither ASCII whitespace nor a
// comma). A candidate is kept when its URL is non-empty, isSafeURL accepts
// the URL, and isOptionalSrcMetadataWellFormed accepts the metadata. Parsing
// stops at the first candidate that is not followed by a comma. When nothing
// has been kept, the returned URLSet wraps InnocuousURL.
//
// https://html.spec.whatwg.org/multipage/images.html#srcset-attributes
func URLSetSanitized(str string) URLSet {
	var out bytes.Buffer

	rest := str
	for rest != "" {
		// Split off one image candidate: [ws] URL [ws] metadata [ws].
		_, afterLeadingWS := consumeIn(rest, asciiWhitespace)
		candidateURL, afterURL := consumeNotIn(afterLeadingWS, asciiWhitespace)
		_, beforeDescriptor := consumeIn(afterURL, asciiWhitespace)
		descriptor, afterDescriptor := consumeNotIn(beforeDescriptor, srcsetMetachars)
		_, rest = consumeIn(afterDescriptor, asciiWhitespace)

		// Emit the candidate only if both of its parts pass vetting.
		if candidateURL != "" && isSafeURL(candidateURL) && isOptionalSrcMetadataWellFormed(descriptor) {
			if out.Len() > 0 {
				// Keep a space in front of the comma: a comma that
				// touches a URL would be treated as part of that URL.
				out.WriteString(" , ")
			}
			// The URL itself may contain commas, so write it in a form
			// that cannot be confused with a separator.
			appendURLToSet(candidateURL, &out)
			if descriptor != "" {
				out.WriteByte(' ')
				out.WriteString(descriptor)
			}
		}

		// Another candidate follows only when a comma comes next.
		if rest == "" || rest[0] != ',' {
			break
		}
		rest = rest[1:]
	}

	if out.Len() > 0 {
		return URLSet{out.String()}
	}
	return URLSet{InnocuousURL}
}

// appendURLToSet writes url to buffer in a form that neither starts nor ends
// with a comma: a comma in the first position and a comma in the last
// position are each written as "%2c", and every other byte is copied as is.
// An empty url writes nothing.
//
// https://html.spec.whatwg.org/multipage/images.html#srcset-attributes
// parsing step 2 which says:
// """
// A valid non-empty URL that does not start or end with a U+002C COMMA character (,),
// referencing a non-interactive, optionally animated, image resource that is neither
// paged nor scripted
// """
//
// Simply replacing all commas would break data:image/png;base64,IMAGECONTENT
// Note: This breaks data URLs with empty content since they end with a comma.
// We could handle that case by appending a '#'.
func appendURLToSet(url string, buffer *bytes.Buffer) {
	if url == "" {
		// Nothing to write; this also keeps the index expressions below
		// from running on an empty string.
		return
	}

	const escapedComma = "%2c"

	middle := url
	if middle[0] == ',' {
		buffer.WriteString(escapedComma)
		middle = middle[1:]
	}

	// A lone "," has already been handled as the leading comma, so only look
	// for a trailing comma when something is left.
	trailingComma := middle != "" && middle[len(middle)-1] == ','
	if trailingComma {
		middle = middle[:len(middle)-1]
	}

	buffer.WriteString(middle)
	if trailingComma {
		buffer.WriteString(escapedComma)
	}
}

// consumeNotIn splits str at its first byte that is set in mask. consumed is
// the leading run of bytes that are not set in mask, and rest is everything
// from the first set byte onward. When no byte of str is set in mask, the
// whole string is consumed and rest is empty.
func consumeNotIn(str string, mask [256]bool) (consumed, rest string) {
	cut := 0
	for cut < len(str) && !mask[str[cut]] {
		cut++
	}
	return str[:cut], str[cut:]
}

// consumeIn splits str at its first byte that is not set in mask. consumed is
// the leading run of bytes that are set in mask, and rest is everything from
// the first unset byte onward. When every byte of str is set in mask, the
// whole string is consumed and rest is empty.
func consumeIn(str string, mask [256]bool) (consumed, rest string) {
	cut := 0
	for cut < len(str) && mask[str[cut]] {
		cut++
	}
	return str[:cut], str[cut:]
}

// isOptionalSrcMetadataWellFormed reports whether metadata is acceptable
// srcset metadata: either the empty string, or a floating point number that
// may be followed by a single ASCII letter.
func isOptionalSrcMetadataWellFormed(metadata string) bool {
	// Both image candidates (<img srcset>) and the proposal for script
	// allow a number with an optional letter after it.
	if metadata == "" {
		// Metadata may be left out entirely.
		return true
	}

	// Drop one trailing ASCII letter, if present. OR-ing with 0x20 maps an
	// upper-case ASCII letter onto its lower-case form, so a single range
	// check covers both cases.
	number := metadata
	lastIdx := len(metadata) - 1
	if folded := metadata[lastIdx] | 0x20; folded >= 'a' && folded <= 'z' {
		number = metadata[:lastIdx]
	}

	// strconv.ParseFloat accepts more than
	// html.spec.whatwg.org/multipage/common-microsyntaxes.html#valid-floating-point-number
	// does, but that is sufficient here.
	if _, err := strconv.ParseFloat(number, 64); err != nil {
		return false
	}
	return true
}

// URLSet holds the value of a srcset attribute for use outside a
// TrustedResourceURL context.
type URLSet struct {
	// The value is kept inside a struct, rather than URLSet being declared
	// as a string type, so that a URLSet cannot be constructed through a
	// plain string conversion.
	str string
}

// String returns the string content of the URLSet.
func (u URLSet) String() string {
	return u.str
}