implement json curly bracket attributes (for v0.2.0)
sawka committed Nov 2, 2024
1 parent 29db9c1 commit 068b78c
Showing 3 changed files with 142 additions and 8 deletions.
20 changes: 20 additions & 0 deletions README.md
@@ -1,5 +1,25 @@
This is a fork of part of the golang.org/x/net/html package.

## v0.2.0

For v0.2.0 we made a more radical change to the [Tokenizer](https://pkg.go.dev/golang.org/x/net/html#Tokenizer).

We added new syntax that allows attribute values to be set with curly
brackets (`{}`). Any valid JSON expression is allowed within the curly
brackets, which more closely matches JSX syntax.

```html
<div data-num={5}></div>
```

To support proper decoding in the client, attributes now have an `IsJson bool` field,
which is set to true when the attribute was parsed with the new `{}` syntax.
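
Here is a minimal sketch of reading the new field. The package is `htmltoken`; the import path below is a placeholder, so point it at wherever you vendor this fork:

```go
package main

import (
	"fmt"
	"strings"

	// Hypothetical import path -- substitute the real location of this fork.
	htmltoken "github.com/example/htmltoken"
)

func main() {
	input := `<div data-num={5} data-obj={{"a": [1, 2]}} class="x"></div>`
	z := htmltoken.NewTokenizer(strings.NewReader(input))
	for {
		tt := z.Next()
		if tt == htmltoken.ErrorToken {
			break // io.EOF: no more tokens
		}
		if tt == htmltoken.StartTagToken {
			for _, a := range z.Token().Attr {
				fmt.Printf("%s = %q (IsJson=%v)\n", a.Key, a.Val, a.IsJson)
			}
		}
	}
	// Output:
	//   data-num = "5" (IsJson=true)
	//   data-obj = "{\"a\": [1, 2]}" (IsJson=true)
	//   class = "x" (IsJson=false)
}
```

The tokenizer returns the raw JSON text between the brackets verbatim; decoding it (e.g. with `encoding/json`) is left to the caller.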

If you only need case-sensitive tokenization for tags/attributes, we
recommend using v0.1.0 rather than v0.2.0.

## v0.1.0

It is not a complete fork: we only want to modify https://pkg.go.dev/golang.org/x/net/html#Tokenizer, so this is the minimal amount of code needed to get html.Tokenizer working.

The reason for the fork is to allow returning case-sensitive tag names and attribute names. The upstream package normalizes tag names and attribute names by calling (the equivalent of) strings.ToLower on them before returning them to the caller. We made a very small two-line change in token.go to remove those ToLower calls. The other changes involve copying enough code from other files to satisfy all the dependencies and get it compiling again.
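
As a quick sketch of the difference (same placeholder import path as above):

```go
package main

import (
	"fmt"
	"strings"

	// Hypothetical import path -- substitute the real location of this fork.
	htmltoken "github.com/example/htmltoken"
)

func main() {
	z := htmltoken.NewTokenizer(strings.NewReader(`<MyWidget dataValue="1"></MyWidget>`))
	z.Next() // StartTagToken
	name, _ := z.TagName()
	fmt.Println(string(name)) // prints "MyWidget"; the upstream package prints "mywidget"
}
```
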
97 changes: 97 additions & 0 deletions parsebraceattr.go
@@ -0,0 +1,97 @@
// Copyright 2024 Command Line Inc. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package htmltoken

import "fmt"

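// parseBraceAttr consumes a brace-enclosed attribute value. It is entered
// just after the opening '{' has been read; it tracks nested braces and
// double-quoted strings (honoring backslash escapes, so braces inside
// strings do not count) and records the value span excluding the outer
// braces. On a read error the span extends to the end of the input.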
func (z *Tokenizer) parseBraceAttr() {
    braceCount := 1
    inString := false
    prevStrBackslash := false

    z.pendingAttr[1].start = z.raw.end
    for {
        ch := z.readByte()
        if z.err != nil {
            z.pendingAttr[1].end = z.raw.end
            return
        }
        if inString {
            if prevStrBackslash {
                prevStrBackslash = false
                continue
            }
            if ch == '\\' {
                prevStrBackslash = true
                continue
            }
            if ch == '"' {
                inString = false
                continue
            }
            continue
        }
        if ch == '{' {
            braceCount++
            continue
        }
        if ch == '"' {
            inString = true
            continue
        }
        if ch == '}' {
            braceCount--
            if braceCount == 0 {
                z.pendingAttr[1].end = z.raw.end - 1
                return
            }
            continue
        }
    }
}

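// parseBraceAttrEx extracts the contents of a brace-enclosed expression from
// input, honoring nested braces and double-quoted strings with backslash
// escapes. It returns the text between the outer braces, or an error if the
// braces are unbalanced.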
func (z *Tokenizer) parseBraceAttrEx(input string) (string, error) {
    var result []byte // scan byte-wise so multi-byte UTF-8 sequences pass through intact
    braceCount := 0
    inString := false

    for i := 0; i < len(input); i++ {
        ch := input[i]

        if inString {
            // Handle string escape sequences
            if ch == '\\' && i+1 < len(input) {
                result = append(result, ch, input[i+1])
                i++
                continue
            }
            if ch == '"' {
                inString = false
            }
            result = append(result, ch)
            continue
        }

        switch ch {
        case '{':
            braceCount++
            if braceCount == 1 {
                // Skip the outer opening brace so the result matches the
                // span produced by parseBraceAttr (content only, no braces).
                continue
            }
        case '}':
            braceCount--
            if braceCount == 0 {
                return string(result), nil
            }
        case '"':
            inString = true
        }

        result = append(result, ch)
    }

    if braceCount != 0 {
        return "", fmt.Errorf("unbalanced braces")
    }

    return string(result), nil
}
33 changes: 25 additions & 8 deletions token.go
@@ -6,6 +6,7 @@
// and modified to be used in the vdom package
// we are producing a JSX-like parser, which requires us to have case sensitivity for attributes and tags
// the sole changes in this package are to remove the lower() calls.
// modifications are marked with a comment starting with "MOD"

package htmltoken

@@ -72,6 +73,7 @@ func (t TokenType) String() string {
// Namespace is only used by the parser, not the tokenizer.
type Attribute struct {
    Namespace, Key, Val string
    IsJson              bool // MOD - added to support json attributes
}

// A Token consists of a TokenType and some Data (tag name for start and end
@@ -297,9 +299,11 @@ type Tokenizer struct {
    // pendingAttr is the attribute key and value currently being tokenized.
    // When complete, pendingAttr is pushed onto attr. nAttrReturned is
    // incremented on each call to TagAttr.
    pendingAttr   [2]span
    attr          [][2]span
    nAttrReturned int
    pendingAttr       [2]span
    pendingAttrIsJson bool // MOD - added to support json attributes
    attr              [][2]span
    jsonAttr          []bool // MOD - added to support json attributes (made parallel to reduce code changes)
    nAttrReturned     int
    // rawTag is the "script" in "</script>" that closes the next token. If
    // non-empty, the subsequent call to Next will return a raw or RCDATA text
    // token: one that treats "<p>" as text instead of an element.
@@ -995,6 +999,7 @@ func (z *Tokenizer) readStartTag() TokenType {
// in [A-Za-z].
func (z *Tokenizer) readTag(saveAttr bool) {
    z.attr = z.attr[:0]
    z.jsonAttr = z.jsonAttr[:0]
    z.nAttrReturned = 0
    // Read the tag name and attribute key/value pairs.
    z.readTagName()
@@ -1006,12 +1011,14 @@ func (z *Tokenizer) readTag(saveAttr bool) {
        if z.err != nil || c == '>' {
            break
        }
        z.pendingAttrIsJson = false
        z.raw.end--
        z.readTagAttrKey()
        z.readTagAttrVal()
        // Save pendingAttr if saveAttr and that attribute has a non-empty key.
        if saveAttr && z.pendingAttr[0].start != z.pendingAttr[0].end {
            z.attr = append(z.attr, z.pendingAttr)
            z.jsonAttr = append(z.jsonAttr, z.pendingAttrIsJson)
        }
        if z.skipWhiteSpace(); z.err != nil {
            break
@@ -1116,6 +1123,12 @@ func (z *Tokenizer) readTagAttrVal() {
            }
        }

    case '{':
        // MOD -- added support for brace-enclosed JSON attributes
        z.pendingAttrIsJson = true
        z.parseBraceAttr()
        return

    default:
        z.pendingAttr[1].start = z.raw.end - 1
        for {
@@ -1345,20 +1358,22 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
// TagAttr returns the key and unescaped value of the next unparsed attribute
// for the current tag token, whether the value was given with the JSON {}
// syntax, and whether there are more attributes. Unlike upstream, the key is
// not lower-cased. The contents of the returned slices may change on the
// next call to Next.
func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
// MOD -- added isJson bool return value
func (z *Tokenizer) TagAttr() (key, val []byte, isJson bool, moreAttr bool) {
    if z.nAttrReturned < len(z.attr) {
        switch z.tt {
        case StartTagToken, SelfClosingTagToken:
            x := z.attr[z.nAttrReturned]
            isJson := z.jsonAttr[z.nAttrReturned]
            z.nAttrReturned++
            key = z.buf[x[0].start:x[0].end]
            val = z.buf[x[1].start:x[1].end]
            // MOD -- remove lower(s)
            return key, unescape(convertNewlines(val), true), z.nAttrReturned < len(z.attr)
            return key, unescape(convertNewlines(val), true), isJson, z.nAttrReturned < len(z.attr)
            // return lower(key), unescape(convertNewlines(val), true), z.nAttrReturned < len(z.attr)
        }
    }
    return nil, nil, false
    return nil, nil, false, false
}

// Token returns the current Token. The result's Data and Attr values remain
@@ -1372,8 +1387,10 @@ func (z *Tokenizer) Token() Token {
        name, moreAttr := z.TagName()
        for moreAttr {
            var key, val []byte
            key, val, moreAttr = z.TagAttr()
            t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)})
            var isJson bool
            // MOD -- added isJson
            key, val, isJson, moreAttr = z.TagAttr()
            t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val), isJson})
        }
        if a := atom.Lookup(name); a != 0 {
            t.DataAtom, t.Data = a, a.String()