ref: bee6a69846f476013e78c32d55f6c508ed8f1a99
parent: be6482603ee8ac3516ae19ceebb35662f038b25e
author: bep <bjorn.erik.pedersen@gmail.com>
date: Sun May 3 15:54:17 EDT 2015
canonifyurls in srcset Speed is about the same as before, uses slightly less memory: ``` benchmark old ns/op new ns/op delta BenchmarkAbsURL 17302 17713 +2.38% BenchmarkXMLAbsURL 9463 9470 +0.07% benchmark old allocs new allocs delta BenchmarkAbsURL 28 24 -14.29% BenchmarkXMLAbsURL 14 12 -14.29% benchmark old bytes new bytes delta BenchmarkAbsURL 3422 3144 -8.12% BenchmarkXMLAbsURL 1985 1864 -6.10% ``` Fixes #1059
--- a/transform/absurlreplacer.go
+++ b/transform/absurlreplacer.go
@@ -29,119 +29,207 @@
start int // item start position
width int // width of last element
- matchers []absURLMatcher
- state stateFunc
- prefixLookup *prefixes
+ matchers []absURLMatcher
+ state stateFunc
+ ms matchState
+ matches [3]bool // track matches of the 3 prefixes
+ i int // last index in matches checked
+
w io.Writer
}
type stateFunc func(*contentlexer) stateFunc
-type prefixRunes []rune
+type prefix struct {
+	r []rune
+	f func(l *contentlexer)
+}
-type prefixes struct {
-	pr []prefixRunes
-	curr prefixRunes // current prefix lookup table
-	i    int         // current index
+var prefixes = []*prefix{
+	&prefix{r: []rune{'s', 'r', 'c', '='}, f: checkCandidateSrc},
+	&prefix{r: []rune{'s', 'r', 'c', 's', 'e', 't', '='}, f: checkCandidateSrcset},
+	&prefix{r: []rune{'h', 'r', 'e', 'f', '='}, f: checkCandidateHref}}

-	// first rune in potential match
-	first rune
-
-	// match-state:
-	// none, whitespace, partial, full
-	ms matchState
+type absURLMatcher struct {
+	prefix         int
+	match          []byte
+	quote          []byte
+	replacementURL []byte
 }
-// match returns partial and full match for the prefix in play
-// - it's a full match if all prefix runes has checked out in row
-// - it's a partial match if it's on its way towards a full match
 func (l *contentlexer) match(r rune) {
-	p := l.prefixLookup
-	if p.curr == nil {
-		// assumes prefixes all start off on a different rune
- // works in this special case: href, src
- p.i = 0
- for _, pr := range p.pr {- if pr[p.i] == r {- fullMatch := len(p.pr) == 1
- p.first = r
- if !fullMatch {- p.curr = pr
- l.prefixLookup.ms = matchStatePartial
- } else {- l.prefixLookup.ms = matchStateFull
+
+ var found bool
+
+ // note, the prefixes can start off on the same foot, i.e.
+ // src and srcset.
+ if l.ms == matchStateWhitespace {+ l.i = 0
+ for j, p := range prefixes {+ if r == p.r[l.i] {+ l.matches[j] = true
+ found = true
+ if l.checkMatchState(r, j) {+ return
}
- return
+ } else {+ l.matches[j] = false
}
}
- } else {- p.i++
- if p.curr[p.i] == r {- fullMatch := len(p.curr) == p.i+1
- if fullMatch {- p.curr = nil
- l.prefixLookup.ms = matchStateFull
+
+ if !found {+ l.ms = matchStateNone
+ }
+
+ return
+ }
+
+ l.i++
+ for j, m := range l.matches {+ // still a match?
+ if m {+ if prefixes[j].r[l.i] == r {+ found = true
+ if l.checkMatchState(r, j) {+ return
+ }
} else {- l.prefixLookup.ms = matchStatePartial
+ l.matches[j] = false
}
- return
}
+ }
- p.curr = nil
+ if found {+ return
}
- l.prefixLookup.ms = matchStateNone
+ l.ms = matchStateNone
}
+func (l *contentlexer) checkMatchState(r rune, idx int) bool {
+	if r == '=' {
+		l.ms = matchStateFull
+		for k := range l.matches {
+			if k != idx {
+				l.matches[k] = false
+			}
+		}
+		return true
+	}
+
+	l.ms = matchStatePartial
+
+	return false
+}
+
 func (l *contentlexer) emit() {
 	l.w.Write(l.content[l.start:l.pos])
 	l.start = l.pos
 }
-var mainPrefixRunes = []prefixRunes{{'s', 'r', 'c', '='}, {'h', 'r', 'e', 'f', '='}}

+func (a absURLMatcher) isSourceType() bool {
+	return a.prefix == matchPrefixSrc
+}
-type absURLMatcher struct {- prefix int
- match []byte
- replacement []byte
+func checkCandidateSrc(l *contentlexer) {
+	for _, m := range l.matchers {
+		if !m.isSourceType() {
+			continue
+		}
+		l.replaceSimple(m)
+	}
}
-func (a absURLMatcher) isSourceType() bool {
-	return a.prefix == matchPrefixSrc
+func checkCandidateHref(l *contentlexer) {
+	for _, m := range l.matchers {
+		if m.isSourceType() {
+			continue
+		}
+		l.replaceSimple(m)
+	}
}
-func checkCandidate(l *contentlexer) {- isSource := l.prefixLookup.first == 's'
+func checkCandidateSrcset(l *contentlexer) {+ // special case, not frequent (me think)
for _, m := range l.matchers {+ if m.isSourceType() {+ continue
+ }
- if isSource && !m.isSourceType() || !isSource && m.isSourceType() {+ if !bytes.HasPrefix(l.content[l.pos:], m.match) {continue
}
- if bytes.HasPrefix(l.content[l.pos:], m.match) {- // check for schemaless URLs
- posAfter := l.pos + len(m.match)
- if posAfter >= len(l.content) {- return
- }
- r, _ := utf8.DecodeRune(l.content[posAfter:])
- if r == '/' {- // schemaless: skip
- return
- }
- if l.pos > l.start {- l.emit()
- }
- l.pos += len(m.match)
- l.w.Write(m.replacement)
- l.start = l.pos
+ // check for schemaless URLs
+ posAfter := l.pos + len(m.match)
+ if posAfter >= len(l.content) {return
+ }
+ r, _ := utf8.DecodeRune(l.content[posAfter:])
+ if r == '/' {+ // schemaless: skip
+ continue
+ }
+ posLastQuote := bytes.Index(l.content[l.pos+1:], m.quote)
+
+ // safe guard
+ if posLastQuote < 0 || posLastQuote > 2000 {+ return
}
+
+ if l.pos > l.start {+ l.emit()
+ }
+
+ section := l.content[l.pos+len(m.quote) : l.pos+posLastQuote+1]
+
+ fields := bytes.Fields(section)
+ l.w.Write([]byte(m.quote))
+ for i, f := range fields {+ if f[0] == '/' {+ l.w.Write(m.replacementURL)
+ l.w.Write(f[1:])
+
+ } else {+ l.w.Write(f)
+ }
+
+ if i < len(fields)-1 {+ l.w.Write([]byte(" "))+ }
+ }
+
+ l.w.Write(m.quote)
+ l.pos += len(section) + (len(m.quote) * 2)
+ l.start = l.pos
}
}
+func (l *contentlexer) replaceSimple(m absURLMatcher) {
+	if !bytes.HasPrefix(l.content[l.pos:], m.match) {
+		return
+	}
+	// check for schemaless URLs
+	posAfter := l.pos + len(m.match)
+	if posAfter >= len(l.content) {
+		return
+	}
+	r, _ := utf8.DecodeRune(l.content[posAfter:])
+	if r == '/' {
+		// schemaless: skip
+		return
+	}
+	if l.pos > l.start {
+		l.emit()
+	}
+	l.pos += len(m.match)
+	l.w.Write(m.quote)
+	l.w.Write(m.replacementURL)
+	l.start = l.pos
+}
+
 func (l *contentlexer) replace() {
 	contentLength := len(l.content)
var r rune
@@ -152,7 +240,7 @@
break
}
- var width int = 1
+ var width = 1
r = rune(l.content[l.pos])
if r >= utf8.RuneSelf {r, width = utf8.DecodeRune(l.content[l.pos:])
@@ -160,14 +248,24 @@
l.width = width
l.pos += l.width
if r == ' ' {- l.prefixLookup.ms = matchStateWhitespace
- } else if l.prefixLookup.ms != matchStateNone {+ l.ms = matchStateWhitespace
+ } else if l.ms != matchStateNone {l.match(r)
- if l.prefixLookup.ms == matchStateFull {- checkCandidate(l)
+ if l.ms == matchStateFull {+ var p *prefix
+ for i, m := range l.matches {+ if m {+ p = prefixes[i]
+ }
+ l.matches[i] = false
+ }
+ if p == nil {+ panic("illegal state: curr is nil when state is full")+ }
+ l.ms = matchStateNone
+ p.f(l)
}
}
-
}
// Done!
@@ -177,15 +275,12 @@
}
 func doReplace(ct contentTransformer, matchers []absURLMatcher) {
-
 	lexer := &contentlexer{
-		content:      ct.Content(),
-		w:            ct,
-		prefixLookup: &prefixes{pr: mainPrefixRunes},
-		matchers:     matchers}
+		content:  ct.Content(),
+		w:        ct,
+		matchers: matchers}
lexer.replace()
-
}
 type absURLReplacer struct {
@@ -195,7 +290,7 @@

 func newAbsURLReplacer(baseURL string) *absURLReplacer {
 	u, _ := url.Parse(baseURL)
- base := strings.TrimRight(u.String(), "/")
+ base := []byte(strings.TrimRight(u.String(), "/") + "/")
// HTML
 	dqHTMLMatch := []byte("\"/")
@@ -205,23 +300,23 @@
 	dqXMLMatch := []byte("&#34;/")
 	sqXMLMatch := []byte("&#39;/")

-	dqHTML := []byte("\"" + base + "/")
-	sqHTML := []byte("'" + base + "/")
+	dqHTML := []byte("\"")
+	sqHTML := []byte("'")

-	dqXML := []byte("&#34;" + base + "/")
-	sqXML := []byte("&#39;" + base + "/")
+	dqXML := []byte("&#34;")
+	sqXML := []byte("&#39;")

 	return &absURLReplacer{
 		htmlMatchers: []absURLMatcher{
-			{matchPrefixSrc, dqHTMLMatch, dqHTML},
-			{matchPrefixSrc, sqHTMLMatch, sqHTML},
-			{matchPrefixHref, dqHTMLMatch, dqHTML},
-			{matchPrefixHref, sqHTMLMatch, sqHTML}},
+			{matchPrefixSrc, dqHTMLMatch, dqHTML, base},
+			{matchPrefixSrc, sqHTMLMatch, sqHTML, base},
+			{matchPrefixHref, dqHTMLMatch, dqHTML, base},
+			{matchPrefixHref, sqHTMLMatch, sqHTML, base}},
 		xmlMatchers: []absURLMatcher{
-			{matchPrefixSrc, dqXMLMatch, dqXML},
-			{matchPrefixSrc, sqXMLMatch, sqXML},
-			{matchPrefixHref, dqXMLMatch, dqXML},
-			{matchPrefixHref, sqXMLMatch, sqXML},
+			{matchPrefixSrc, dqXMLMatch, dqXML, base},
+			{matchPrefixSrc, sqXMLMatch, sqXML, base},
+			{matchPrefixHref, dqXMLMatch, dqXML, base},
+			{matchPrefixHref, sqXMLMatch, sqXML, base},
 		}}
}
--- a/transform/chain_test.go
+++ b/transform/chain_test.go
@@ -25,9 +25,43 @@
// Issue: 816, schemaless links combined with others
const REPLACE_SCHEMALESS_HTML = `Pre. src='//schemaless' src='/normal' <a href="//schemaless">Schemaless</a>. <a href="/normal">normal</a>. Post.`
const REPLACE_SCHEMALESS_HTML_CORRECT = `Pre. src='//schemaless' src='http://base/normal' <a href="//schemaless">Schemaless</a>. <a href="http://base/normal">normal</a>. Post.`
-const REPLACE_SCHEMALESS_XML = `Pre. src="//schemaless" src="/normal" <a href='//schemaless'>Schemaless</a>. <a href='/normal'>normal</a>. Post.`
-const REPLACE_SCHEMALESS_XML_CORRECT = `Pre. src="//schemaless" src="http://base/normal" <a href='//schemaless'>Schemaless</a>. <a href='http://base/normal'>normal</a>. Post.`
+const REPLACE_SCHEMALESS_XML = `Pre. src='//schemaless' src='/normal' <a href='//schemaless'>Schemaless</a>. <a href='/normal'>normal</a>. Post.`
+const REPLACE_SCHEMALESS_XML_CORRECT = `Pre. src='//schemaless' src='http://base/normal' <a href='//schemaless'>Schemaless</a>. <a href='http://base/normal'>normal</a>. Post.`
+// srcset=
+const SRCSET_BASIC = `Pre. <img srcset="/img/small.jpg 200w /img/big.jpg 700w" alt="text" src="/img/foo.jpg">`
+const SRCSET_BASIC_CORRECT = `Pre. <img srcset="http://base/img/small.jpg 200w http://base/img/big.jpg 700w" alt="text" src="http://base/img/foo.jpg">`
+const SRCSET_SINGLE_QUOTE = `Pre. <img srcset='/img/small.jpg 200w /img/big.jpg 700w' alt="text" src="/img/foo.jpg"> POST.`
+const SRCSET_SINGLE_QUOTE_CORRECT = `Pre. <img srcset='http://base/img/small.jpg 200w http://base/img/big.jpg 700w' alt="text" src="http://base/img/foo.jpg"> POST.`
+const SRCSET_XML_BASIC = `Pre. <img srcset="/img/small.jpg 200w /img/big.jpg 700w" alt="text" src="/img/foo.jpg">`
+const SRCSET_XML_BASIC_CORRECT = `Pre. <img srcset="http://base/img/small.jpg 200w http://base/img/big.jpg 700w" alt="text" src="http://base/img/foo.jpg">`
+const SRCSET_XML_SINGLE_QUOTE = `Pre. <img srcset="/img/small.jpg 200w /img/big.jpg 700w" alt="text" src="/img/foo.jpg">`
+const SRCSET_XML_SINGLE_QUOTE_CORRECT = `Pre. <img srcset="http://base/img/small.jpg 200w http://base/img/big.jpg 700w" alt="text" src="http://base/img/foo.jpg">`
+const SRCSET_VARIATIONS = `Pre.
+Missing start quote: <img srcset=/img/small.jpg 200w /img/big.jpg 700w" alt="text"> src='/img/foo.jpg'> FOO.
+<img srcset='/img.jpg'>
+schemaless: <img srcset='//img.jpg' src='//basic.jpg'>
+schemaless2: <img srcset="//img.jpg" src="//basic.jpg2> POST
+`
+const SRCSET_VARIATIONS_CORRECT = `Pre.
+Missing start quote: <img srcset=/img/small.jpg 200w /img/big.jpg 700w" alt="text"> src='http://base/img/foo.jpg'> FOO.
+<img srcset='http://base/img.jpg'>
+schemaless: <img srcset='//img.jpg' src='//basic.jpg'>
+schemaless2: <img srcset="//img.jpg" src="//basic.jpg2> POST
+`
+const SRCSET_XML_VARIATIONS = `Pre.
+Missing start quote: <img srcset=/img/small.jpg 200w /img/big.jpg 700w" alt="text"> src='/img/foo.jpg'> FOO.
+<img srcset='/img.jpg'>
+schemaless: <img srcset='//img.jpg' src='//basic.jpg'>
+schemaless2: <img srcset="//img.jpg" src="//basic.jpg2> POST
+`
+const SRCSET_XML_VARIATIONS_CORRECT = `Pre.
+Missing start quote: <img srcset=/img/small.jpg 200w /img/big.jpg 700w" alt="text"> src='http://base/img/foo.jpg'> FOO.
+<img srcset='http://base/img.jpg'>
+schemaless: <img srcset='//img.jpg' src='//basic.jpg'>
+schemaless2: <img srcset="//img.jpg" src="//basic.jpg2> POST
+`
+
 var abs_url_bench_tests = []test{
 	{H5_JS_CONTENT_DOUBLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_DQ},
 	{H5_JS_CONTENT_SINGLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_SQ},
@@ -46,6 +80,12 @@
 var extra_tests_xml = []test{{REPLACE_SCHEMALESS_XML, REPLACE_SCHEMALESS_XML_CORRECT}}

 var xml_abs_url_tests = append(xml_abs_url_bench_tests, append(sanity_tests, extra_tests_xml...)...)

+var srcset_tests = []test{{SRCSET_BASIC, SRCSET_BASIC_CORRECT},
+	{SRCSET_SINGLE_QUOTE, SRCSET_SINGLE_QUOTE_CORRECT},
+	{SRCSET_VARIATIONS, SRCSET_VARIATIONS_CORRECT}}
+
+var srcset_xml_tests = []test{
+	{SRCSET_XML_BASIC, SRCSET_XML_BASIC_CORRECT},
+	{SRCSET_XML_SINGLE_QUOTE, SRCSET_XML_SINGLE_QUOTE_CORRECT},
+	{SRCSET_XML_VARIATIONS, SRCSET_XML_VARIATIONS_CORRECT}}
+
 func TestChainZeroTransformers(t *testing.T) {
 	tr := NewChain()
in := new(bytes.Buffer)
@@ -99,6 +139,21 @@
tr := NewChain(absURL...)
apply(t.Errorf, tr, abs_url_tests)
+
+}
+
+func TestAbsURLSrcSet(t *testing.T) {
+	absURL, _ := absURLFromURL("http://base")
+	tr := NewChain(absURL...)
+
+ apply(t.Errorf, tr, srcset_tests)
+}
+
+func TestAbsXMLURLSrcSet(t *testing.T) {
+	absURLInXML, _ := absURLInXMLFromURL("http://base")
+	tr := NewChain(absURLInXML...)
+
+ apply(t.Errorf, tr, srcset_xml_tests)
}
--
⑨