From 18847f55bb3fe9db41e46a2e9e49a9f7c28143af Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Mon, 10 Feb 2020 14:45:20 +0100
Subject: pkg/ast: introduce hex-encoded string literals
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The stringnozescapes type does not make sense with filenames,
and we may also need similar escaping for string flags.
Handle escaped strings on ast level instead.
This avoids introducing a new type and works seamlessly with flags.

As an alternative I've also tried using strconv.Quote/Unquote
but it leads to ugly half-escaped strings:
"\xb0\x80s\xe8\xd4N\x91\xe3ڒ,\"C\x82D\xbb\x88\\i\xe2i\xc8\xe9\xd85\xb1\x14):M\xdcn"

Make hex-encoded strings a separate string format instead.
---
 pkg/ast/ast.go                    |  9 ++++++
 pkg/ast/clone.go                  |  2 ++
 pkg/ast/format.go                 | 15 ++++++++--
 pkg/ast/parser.go                 | 19 +++++++++++--
 pkg/ast/scanner.go                | 59 ++++++++++++++++++---------------------
 pkg/ast/testdata/all.txt          |  7 ++---
 pkg/compiler/testdata/all.txt     |  3 ++
 pkg/compiler/testdata/errors2.txt |  1 +
 pkg/compiler/types.go             | 21 ++++----------
 9 files changed, 79 insertions(+), 57 deletions(-)

(limited to 'pkg')

diff --git a/pkg/ast/ast.go b/pkg/ast/ast.go
index 13ff2b40e..f1f74e61b 100644
--- a/pkg/ast/ast.go
+++ b/pkg/ast/ast.go
@@ -163,6 +163,7 @@ func (n *Ident) Info() (Pos, string, string) {
 type String struct {
 	Pos   Pos
 	Value string
+	Fmt   StrFmt
 }
 
 func (n *String) Info() (Pos, string, string) {
@@ -178,6 +179,13 @@ const (
 	IntFmtChar
 )
 
+type StrFmt int
+
+const (
+	StrFmtRaw StrFmt = iota
+	StrFmtHex
+)
+
 type Int struct {
 	Pos Pos
 	// Only one of Value, Ident, CExpr is filled.
@@ -198,6 +206,7 @@ type Type struct {
 	ValueFmt  IntFmt
 	Ident     string
 	String    string
+	StringFmt StrFmt
 	HasString bool
 	// Parts after COLON (for ranges and bitfields).
 	Colon []*Type
diff --git a/pkg/ast/clone.go b/pkg/ast/clone.go
index a594e41f6..54bf8250c 100644
--- a/pkg/ast/clone.go
+++ b/pkg/ast/clone.go
@@ -134,6 +134,7 @@ func (n *String) Clone() Node {
 	return &String{
 		Pos:   n.Pos,
 		Value: n.Value,
+		Fmt:   n.Fmt,
 	}
 }
 
@@ -154,6 +155,7 @@ func (n *Type) Clone() Node {
 		ValueFmt:  n.ValueFmt,
 		Ident:     n.Ident,
 		String:    n.String,
+		StringFmt: n.StringFmt,
 		HasString: n.HasString,
 		Colon:     cloneTypes(n.Colon),
 		Args:      cloneTypes(n.Args),
diff --git a/pkg/ast/format.go b/pkg/ast/format.go
index c3d931706..a2ead06f2 100644
--- a/pkg/ast/format.go
+++ b/pkg/ast/format.go
@@ -50,6 +50,17 @@ func FormatInt(v uint64, format IntFmt) string {
 	}
 }
 
+func FormatStr(v string, format StrFmt) string {
+	switch format {
+	case StrFmtRaw:
+		return fmt.Sprintf(`"%v"`, v)
+	case StrFmtHex:
+		return fmt.Sprintf("`%x`", v)
+	default:
+		panic(fmt.Sprintf("unknown str format %v", format))
+	}
+}
+
 type serializer interface {
 	serialize(w io.Writer)
 }
@@ -153,7 +164,7 @@ func (flags *IntFlags) serialize(w io.Writer) {
 func (flags *StrFlags) serialize(w io.Writer) {
 	fmt.Fprintf(w, "%v = ", flags.Name.Name)
 	for i, v := range flags.Values {
-		fmt.Fprintf(w, "%v\"%v\"", comma(i, ""), v.Value)
+		fmt.Fprintf(w, "%v%v", comma(i, ""), FormatStr(v.Value, v.Fmt))
 	}
 	fmt.Fprintf(w, "\n")
 }
@@ -172,7 +183,7 @@ func fmtType(t *Type) string {
 	case t.Ident != "":
 		v = t.Ident
 	case t.HasString:
-		v = fmt.Sprintf("\"%v\"", t.String)
+		v = FormatStr(t.String, t.StringFmt)
 	default:
 		v = FormatInt(t.Value, t.ValueFmt)
 	}
diff --git a/pkg/ast/parser.go b/pkg/ast/parser.go
index b8d22fd88..7b46f6611 100644
--- a/pkg/ast/parser.go
+++ b/pkg/ast/parser.go
@@ -314,7 +314,7 @@ func (p *parser) parseFlags(name *Ident) Node {
 	switch p.tok {
 	case tokInt, tokIdent:
 		return p.parseIntFlags(name)
-	case tokString:
+	case tokString, tokStringHex:
 		return p.parseStrFlags(name)
 	default:
 		p.expect(tokInt, tokIdent, tokString)
@@ -417,9 +417,10 @@ func (p *parser) parseType() *Type {
 	case tokIdent:
 		allowColon = true
 		arg.Ident = p.lit
-	case tokString:
+	case tokString, tokStringHex:
 		arg.String = p.lit
 		arg.HasString = true
+		arg.StringFmt = strTokToFmt(p.tok)
 	default:
 		p.expect(tokInt, tokIdent, tokString)
 	}
@@ -468,15 +469,27 @@ func (p *parser) parseIdent() *Ident {
 }
 
 func (p *parser) parseString() *String {
-	p.expect(tokString)
+	p.expect(tokString, tokStringHex)
 	str := &String{
 		Pos:   p.pos,
 		Value: p.lit,
+		Fmt:   strTokToFmt(p.tok),
 	}
 	p.next()
 	return str
 }
 
+func strTokToFmt(tok token) StrFmt {
+	switch tok {
+	case tokString:
+		return StrFmtRaw
+	case tokStringHex:
+		return StrFmtHex
+	default:
+		panic("bad string token")
+	}
+}
+
 func (p *parser) parseInt() *Int {
 	i := &Int{
 		Pos: p.pos,
diff --git a/pkg/ast/scanner.go b/pkg/ast/scanner.go
index a9448b34f..3a6ba9d98 100644
--- a/pkg/ast/scanner.go
+++ b/pkg/ast/scanner.go
@@ -4,6 +4,7 @@
 package ast
 
 import (
+	"encoding/hex"
 	"fmt"
 	"os"
 	"strconv"
@@ -20,6 +21,7 @@ const (
 	tokDefine
 	tokResource
 	tokString
+	tokStringHex
 	tokCExpr
 	tokInt
 
@@ -51,18 +53,19 @@ var punctuation = [256]token{
 }
 
 var tok2str = [...]string{
-	tokIllegal:  "ILLEGAL",
-	tokComment:  "comment",
-	tokIdent:    "identifier",
-	tokInclude:  "include",
-	tokIncdir:   "incdir",
-	tokDefine:   "define",
-	tokResource: "resource",
-	tokString:   "string",
-	tokCExpr:    "CEXPR",
-	tokInt:      "int",
-	tokNewLine:  "NEWLINE",
-	tokEOF:      "EOF",
+	tokIllegal:   "ILLEGAL",
+	tokComment:   "comment",
+	tokIdent:     "identifier",
+	tokInclude:   "include",
+	tokIncdir:    "incdir",
+	tokDefine:    "define",
+	tokResource:  "resource",
+	tokString:    "string",
+	tokStringHex: "hex string",
+	tokCExpr:     "CEXPR",
+	tokInt:       "int",
+	tokNewLine:   "NEWLINE",
+	tokEOF:       "EOF",
 }
 
 func init() {
@@ -132,12 +135,7 @@ func (s *scanner) Scan() (tok token, lit string, pos Pos) {
 	case s.ch == 0:
 		tok = tokEOF
 		s.next()
-	case s.ch == '`':
-		tok = tokCExpr
-		lit = s.scanCExpr(pos)
 	case s.prev2 == tokDefine && s.prev1 == tokIdent:
-		// Note: the old form for C expressions, not really lexable.
-		// TODO(dvyukov): get rid of this eventually.
 		tok = tokCExpr
 		for ; s.ch != '\n'; s.next() {
 		}
@@ -150,6 +148,9 @@ func (s *scanner) Scan() (tok token, lit string, pos Pos) {
 	case s.ch == '"' || s.ch == '<':
 		tok = tokString
 		lit = s.scanStr(pos)
+	case s.ch == '`':
+		tok = tokStringHex
+		lit = s.scanStr(pos)
 	case s.ch >= '0' && s.ch <= '9' || s.ch == '-':
 		tok = tokInt
 		lit = s.scanInt(pos)
@@ -170,21 +171,9 @@ func (s *scanner) Scan() (tok token, lit string, pos Pos) {
 	return
 }
 
-func (s *scanner) scanCExpr(pos Pos) string {
-	for s.next(); s.ch != '`' && s.ch != '\n'; s.next() {
-	}
-	if s.ch == '\n' {
-		s.Error(pos, "C expression is not terminated")
-		return ""
-	}
-	lit := string(s.data[pos.Off+1 : s.off])
-	s.next()
-	return lit
-}
-
 func (s *scanner) scanStr(pos Pos) string {
 	// TODO(dvyukov): get rid of <...> strings, that's only includes
-	closing := byte('"')
+	closing := s.ch
 	if s.ch == '<' {
 		closing = '>'
 	}
@@ -196,7 +185,6 @@ func (s *scanner) scanStr(pos Pos) string {
 	}
 	lit := string(s.data[pos.Off+1 : s.off])
 	for i := 0; i < len(lit); i++ {
-		//lit[i]
 		if lit[i] < 0x20 || lit[i] >= 0x80 {
 			pos1 := pos
 			pos1.Col += i + 1
@@ -206,7 +194,14 @@ func (s *scanner) scanStr(pos Pos) string {
 		}
 	}
 	s.next()
-	return lit
+	if closing != '`' {
+		return lit
+	}
+	decoded, err := hex.DecodeString(lit)
+	if err != nil {
+		s.Error(pos, "bad hex string literal: %v", err)
+	}
+	return string(decoded)
 }
 
 func (s *scanner) scanInt(pos Pos) string {
diff --git a/pkg/ast/testdata/all.txt b/pkg/ast/testdata/all.txt
index 392796254..122fa514b 100644
--- a/pkg/ast/testdata/all.txt
+++ b/pkg/ast/testdata/all.txt
@@ -17,15 +17,14 @@ int_flags4 = 1, -2-			### bad integer "-2-"
 str_flags0 = "foo", "bar"
 str_flags1 = "non terminated		### string literal is not terminated
 str_flags2 = "bad chars здесь"		### illegal character U+00D0 'Ð' in string literal
-str_flags3 = "string", not a string	### unexpected identifier, expecting string
-str_flags4 = "string", 42		### unexpected int, expecting string
+str_flags3 = "string", not a string	### unexpected identifier, expecting string, hex string
+str_flags4 = "string", 42		### unexpected int, expecting string, hex string
 
 call(foo ,int32 , bar int32)		### unexpected ',', expecting int, identifier, string
 call(foo int32:"bar")			### unexpected string, expecting int, identifier
 call(a int32, b len[a:"bar"])		### unexpected string, expecting int, identifier
 
-define FOO `bar`
-define FOO `bar				### C expression is not terminated
+define FOO bar
 
 foo(x int32[1:2:3, opt])
 foo2(x int32[1[2]:2])			### unexpected ':', expecting ']'
diff --git a/pkg/compiler/testdata/all.txt b/pkg/compiler/testdata/all.txt
index 9d94a81a1..789071964 100644
--- a/pkg/compiler/testdata/all.txt
+++ b/pkg/compiler/testdata/all.txt
@@ -40,10 +40,13 @@ strings {
 	f11	stringnoz[string_flags1]
 	f12	string[string_flags2]
 	f13	stringnoz[string_flags2]
+	f14	string[`abcdef`, 4]
+	f15	string[string_flags3, 4]
 } [packed]
 
 string_flags1 = "foo", "barbaz"
 string_flags2 = ""
+string_flags3 = "ab", `010203`, `de`
 int_flags = 0, 1, 0xabc, 'x', -11
 _ = 1, 2
 _ = C1, C2
diff --git a/pkg/compiler/testdata/errors2.txt b/pkg/compiler/testdata/errors2.txt
index 47d76ab89..b5ab19ebf 100644
--- a/pkg/compiler/testdata/errors2.txt
+++ b/pkg/compiler/testdata/errors2.txt
@@ -267,6 +267,7 @@ foo$525(a int8[-256:256])		### int range [18446744073709551360:256] is too large
 foo$526(a int8[-255:255])		### int range [18446744073709551361:255] is too large for base type of size 8
 foo$527(a int16[-40000:40000])		### int range [18446744073709511616:40000] is too large for base type of size 16
 foo$528(a ptr[in, s405])
+foo$529(a ptr[in, string[`abcdde`, 3]])	### string value "\xab\xcd\xde\x00" exceeds buffer length 3
 
 type type500 proc[C1, 8, int8]	### values starting from 1 with step 8 overflow base type for 32 procs
 type type501 int8		### unused type type501
diff --git a/pkg/compiler/types.go b/pkg/compiler/types.go
index 251e0fcaa..43efee202 100644
--- a/pkg/compiler/types.go
+++ b/pkg/compiler/types.go
@@ -514,12 +514,11 @@ func genTextType(t *ast.Type) prog.TextKind {
 }
 
 const (
-	stringnoz        = "stringnoz"
-	stringnozescapes = "stringnozescapes"
+	stringnoz = "stringnoz"
 )
 
 var typeString = &typeDesc{
-	Names:        []string{"string", stringnoz, stringnozescapes},
+	Names:        []string{"string", stringnoz},
 	CanBeTypedef: true,
 	OptArgs:      2,
 	Args: []namedArg{
@@ -527,7 +526,7 @@ var typeString = &typeDesc{
 		{Name: "size", Type: typeArgInt},
 	},
 	Check: func(comp *compiler, t *ast.Type, args []*ast.Type, base prog.IntTypeCommon) {
-		if (t.Ident == stringnoz || t.Ident == stringnozescapes) && len(args) > 1 {
+		if t.Ident == stringnoz && len(args) > 1 {
 			comp.error(args[0].Pos, "fixed-size string can't be non-zero-terminated")
 		}
 	},
@@ -559,7 +558,7 @@ var typeString = &typeDesc{
 			return &prog.BufferType{
 				TypeCommon: base.TypeCommon,
 				Kind:       prog.BufferFilename,
-				NoZ:        t.Ident == stringnoz || t.Ident == stringnozescapes,
+				NoZ:        t.Ident == stringnoz,
 			}
 		}
 		subkind := ""
@@ -576,7 +575,7 @@ var typeString = &typeDesc{
 			Kind:       prog.BufferString,
 			SubKind:    subkind,
 			Values:     vals,
-			NoZ:        t.Ident == stringnoz || t.Ident == stringnozescapes,
+			NoZ:        t.Ident == stringnoz,
 		}
 	},
 }
@@ -592,16 +591,6 @@ func (comp *compiler) genStrings(t *ast.Type, args []*ast.Type) []string {
 	}
 	if t.Ident == stringnoz {
 		return vals
-	} else if t.Ident == stringnozescapes {
-		for i := range vals {
-			unquote, err := strconv.Unquote(`"` + vals[i] + `"`)
-			if err != nil {
-				comp.error(args[0].Pos, fmt.Sprintf("unable to unquote stringnozescapes %q: %v", vals[i], err))
-			} else {
-				vals[i] = unquote
-			}
-		}
-		return vals
 	}
 	var size uint64
 	if len(args) > 1 {
-- 
cgit mrf-deployment