From 41251715adafaa3a6a5b0e624d92fc53fee525fa Mon Sep 17 00:00:00 2001
From: Martin Probst <martin@probst.io>
Date: Sun, 27 Apr 2014 23:40:44 +0200
Subject: [PATCH] Use go.net/html's parser to sanitize HTML.

Run the rendered Markdown output through an HTML5-compliant tokenizer, which
interprets HTML the way a browser would, and sanitize based on its tokens.
Unrecognized and disallowed HTML is escaped in the result.
Currently works with a hard-coded whitelist of safe HTML tags and attributes.
---
 html.go        |  71 ---------------------------
 inline.go      |   1 +
 inline_test.go | 104 ++++++++++++++++++++--------------------
 markdown.go    |   2 +-
 sanitize.go    | 127 +++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 181 insertions(+), 124 deletions(-)
 create mode 100644 sanitize.go
diff --git a/html.go b/html.go
index 54744c2..0c85530 100644
--- a/html.go
+++ b/html.go
@@ -43,52 +43,12 @@ const (
 )
 
 var (
-	tags = []string{
-		"b",
-		"blockquote",
-		"code",
-		"del",
-		"dd",
-		"dl",
-		"dt",
-		"em",
-		"h1",
-		"h2",
-		"h3",
-		"h4",
-		"h5",
-		"h6",
-		"i",
-		"kbd",
-		"li",
-		"ol",
-		"p",
-		"pre",
-		"s",
-		"sup",
-		"sub",
-		"strong",
-		"strike",
-		"ul",		
-		"table",
-		"tr",
-		"td",
-		"th",
-		"thead",
-		"tbody",
-		
-	}
-	
 	alignments = []string{
 		"left",
 		"right",
 		"center",
 	}
 
-	urlRe        = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`
-	tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)(\salign="(` + strings.Join(alignments, "|") + `)")?>|<(br|hr)\s?\/?>)$`)
-	anchorClean  = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`)
-	imgClean     = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`)
 	// TODO: improve this regexp to catch all possible entities:
 	htmlEntity = regexp.MustCompile(`&[a-z]{2,5};`)
 )
@@ -820,24 +780,6 @@ func findHtmlTagPos(tag []byte, tagname string) (bool, int) {
 	return false, -1
 }
 
-func sanitizeHtml(html []byte) []byte {
-	var result []byte
-	for string(html) != "" {
-		skip, tag, rest := findHtmlTag(html)
-		html = rest
-		result = append(result, skip...)
-		result = append(result, sanitizeTag(tag)...)
-	}
-	return append(result, []byte("\n")...)
-}
-
-func sanitizeTag(tag []byte) []byte {
-	if tagWhitelist.Match(tag) || anchorClean.Match(tag) || imgClean.Match(tag) {
-		return tag
-	}
-	return []byte("")
-}
-
 func skipUntilChar(text []byte, start int, char byte) int {
 	i := start
 	for i < len(text) && text[i] != char {
@@ -846,19 +788,6 @@ func skipUntilChar(text []byte, start int, char byte) int {
 	return i
 }
 
-func findHtmlTag(html []byte) (skip, tag, rest []byte) {
-	start := skipUntilChar(html, 0, '<')
-	rightAngle := skipUntilCharIgnoreQuotes(html, start, '>')
-	if rightAngle > start {
-		skip = html[0:start]
-		tag = html[start : rightAngle+1]
-		rest = html[rightAngle+1:]
-		return
-	}
-
-	return []byte(""), []byte(""), []byte("")
-}
-
 func skipSpace(tag []byte, i int) int {
 	for i < len(tag) && isspace(tag[i]) {
 		i++
diff --git a/inline.go b/inline.go
index 41225ce..b3aaf0f 100644
--- a/inline.go
+++ b/inline.go
@@ -20,6 +20,7 @@ import (
 )
 
 var (
+	urlRe    = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`
 	anchorRe = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>` + urlRe + `<\/a>)`)
 )
 
diff --git a/inline_test.go b/inline_test.go
index 478fbe0..1937021 100644
--- a/inline_test.go
+++ b/inline_test.go
@@ -72,135 +72,135 @@ func doTestsInlineParam(t *testing.T, tests []string, extensions, htmlFlags int)
 func TestRawHtmlTag(t *testing.T) {
 	tests := []string{
 		"zz <style>p {}</style>\n",
-		"<p>zz p {}</p>\n",
+		"<p>zz &lt;style&gt;p {}&lt;/style&gt;</p>\n",
 
 		"zz <STYLE>p {}</STYLE>\n",
-		"<p>zz p {}</p>\n",
+		"<p>zz &lt;style&gt;p {}&lt;/style&gt;</p>\n",
 
 		"<SCRIPT>alert()</SCRIPT>\n",
-		"<p>alert()</p>\n",
+		"<p>&lt;script&gt;alert()&lt;/script&gt;</p>\n",
 
 		"zz <SCRIPT>alert()</SCRIPT>\n",
-		"<p>zz alert()</p>\n",
+		"<p>zz &lt;script&gt;alert()&lt;/script&gt;</p>\n",
 
 		"zz <script>alert()</script>\n",
-		"<p>zz alert()</p>\n",
+		"<p>zz &lt;script&gt;alert()&lt;/script&gt;</p>\n",
 
 		" <script>alert()</script>\n",
-		"<p>alert()</p>\n",
+		"<p>&lt;script&gt;alert()&lt;/script&gt;</p>\n",
 
 		"<script>alert()</script>\n",
-		"alert()\n",
+		"&lt;script&gt;alert()&lt;/script&gt;\n",
 
 		"<script src='foo'></script>\n",
-		"\n",
+		"&lt;script src=&#39;foo&#39;&gt;&lt;/script&gt;\n",
 
 		"<script src='a>b'></script>\n",
-		"\n",
+		"&lt;script src=&#39;a&gt;b&#39;&gt;&lt;/script&gt;\n",
 
 		"zz <script src='foo'></script>\n",
-		"<p>zz </p>\n",
+		"<p>zz &lt;script src=&#39;foo&#39;&gt;&lt;/script&gt;</p>\n",
 
 		"zz <script src=foo></script>\n",
-		"<p>zz </p>\n",
+		"<p>zz &lt;script src=foo&gt;&lt;/script&gt;</p>\n",
 
 		`<script><script src="http://example.com/exploit.js"></SCRIPT></script>`,
-		"\n",
+		"&lt;script&gt;&lt;script src=&#34;http://example.com/exploit.js&#34;&gt;&lt;/script&gt;&lt;/script&gt;\n",
 
 		`'';!--"<XSS>=&{()}`,
-		"<p>'';!--&quot;=&amp;{()}</p>\n",
+		"<p>&#39;&#39;;!--&#34;&lt;xss&gt;=&amp;{()}</p>\n",
 
 		"<SCRIPT SRC=http://ha.ckers.org/xss.js></SCRIPT>",
-		"<p></p>\n",
+		"<p>&lt;script SRC=http://ha.ckers.org/xss.js&gt;&lt;/script&gt;</p>\n",
 
 		"<SCRIPT \nSRC=http://ha.ckers.org/xss.js></SCRIPT>",
-		"<p></p>\n",
+		"<p>&lt;script \nSRC=http://ha.ckers.org/xss.js&gt;&lt;/script&gt;</p>\n",
 
 		`<IMG SRC="javascript:alert('XSS');">`,
-		"<p></p>\n",
+		"<p><img></p>\n",
 
 		"<IMG SRC=javascript:alert('XSS')>",
-		"<p></p>\n",
+		"<p><img></p>\n",
 
 		"<IMG SRC=JaVaScRiPt:alert('XSS')>",
-		"<p></p>\n",
+		"<p><img></p>\n",
 
 		"<IMG SRC=`javascript:alert(\"RSnake says, 'XSS'\")`>",
-		"<p></p>\n",
+		"<p><img></p>\n",
 
 		`<a onmouseover="alert(document.cookie)">xss link</a>`,
-		"<p>xss link</a></p>\n",
+		"<p><a>xss link</a></p>\n",
 
 		"<a onmouseover=alert(document.cookie)>xss link</a>",
-		"<p>xss link</a></p>\n",
+		"<p><a>xss link</a></p>\n",
 
-		// XXX: this doesn't pass yet
-		//`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`,
-		//"<p></p>\n",
+		`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`,
+		"<p><img>&lt;script&gt;alert(&amp;quot;XSS&amp;quot;)&lt;/script&gt;&#34;&gt;</p>\n",
 
 		"<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>",
-		"<p></p>\n",
+		"<p><img></p>\n",
 
 		`<IMG SRC=# onmouseover="alert('xxs')">`,
-		"<p></p>\n",
+		"<p><img></p>\n",
 
 		`<IMG SRC= onmouseover="alert('xxs')">`,
-		"<p></p>\n",
+		"<p><img></p>\n",
 
 		`<IMG onmouseover="alert('xxs')">`,
-		"<p></p>\n",
+		"<p><img></p>\n",
 
 		"<IMG SRC=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>",
-		"<p></p>\n",
+		"<p><img></p>\n",
 
 		"<IMG SRC=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>",
-		"<p></p>\n",
+		"<p><img></p>\n",
 
 		"<IMG SRC=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>",
-		"<p></p>\n",
+		"<p><img></p>\n",
 
 		`<IMG SRC="javascriptascript:alert('XSS');">`,
-		"<p></p>\n",
+		"<p><img></p>\n",
 
 		`<IMG SRC="jav&#x09;ascript:alert('XSS');">`,
-		"<p></p>\n",
+		"<p><img></p>\n",
 
 		`<IMG SRC="jav&#x0A;ascript:alert('XSS');">`,
-		"<p></p>\n",
+		"<p><img></p>\n",
 
 		`<IMG SRC="jav&#x0D;ascript:alert('XSS');">`,
-		"<p></p>\n",
+		"<p><img></p>\n",
 
 		`<IMG SRC=" &#14;  javascript:alert('XSS');">`,
-		"<p></p>\n",
+		"<p><img></p>\n",
 
 		`<SCRIPT/XSS SRC="http://ha.ckers.org/xss.js"></SCRIPT>`,
-		"<p></p>\n",
+		"<p>&lt;script/XSS SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n",
 
-		// XXX: this doesn't pass yet
-		//"<BODY onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>",
-		//"\n",
+		"<BODY onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>",
+		"<p>&lt;body onload!#$%&amp;()*~+-_.,:;?@[/|\\]^`=alert(&#34;XSS&#34;)&gt;</p>\n",
 
 		`<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT>`,
-		"<p></p>\n",
+		"<p>&lt;script/SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n",
 
-		// XXX: this doesn't pass yet
-		//`<<SCRIPT>alert("XSS");//<</SCRIPT>`,
-		//"",
+		// HTML5 interprets the <script> tag contents as raw text, thus the end
+		// result has double-escaped &amp;quot;
+		`<<SCRIPT>alert("XSS");//<</SCRIPT>`,
+		"<p>&lt;&lt;script&gt;alert(&amp;quot;XSS&amp;quot;);//&amp;lt;&lt;/script&gt;</p>\n",
 
+		// HTML5 parses the </p> within an unclosed <script> tag as text.
+		// Same for the following tests.
 		"<SCRIPT SRC=http://ha.ckers.org/xss.js?< B >",
-		"<p></p>\n",
+		"<p>&lt;script SRC=http://ha.ckers.org/xss.js?&lt; B &gt;&lt;/p&gt;\n",
 
 		"<SCRIPT SRC=//ha.ckers.org/.j>",
-		"<p></p>\n",
+		"<p>&lt;script SRC=//ha.ckers.org/.j&gt;&lt;/p&gt;\n",
 
-		// XXX: this doesn't pass yet
-		//`<IMG SRC="javascript:alert('XSS')"`,
-		//"",
+		`<IMG SRC="javascript:alert('XSS')"`,
+		"<p>&lt;IMG SRC=&#34;javascript:alert(&#39;XSS&#39;)&#34;</p>\n",
 
-		// XXX: this doesn't pass yet
-		//"<iframe src=http://ha.ckers.org/scriptlet.html <",
-		//"",
+		"<iframe src=http://ha.ckers.org/scriptlet.html <",
+		// The hyperlink gets linkified, the <iframe> gets escaped
+		"<p>&lt;iframe src=<a href=\"http://ha.ckers.org/scriptlet.html\">http://ha.ckers.org/scriptlet.html</a> &lt;</p>\n",
 	}
 	doTestsInlineParam(t, tests, 0, HTML_SKIP_STYLE|HTML_SANITIZE_OUTPUT)
 }
diff --git a/markdown.go b/markdown.go
index 02e486d..dc849f9 100644
--- a/markdown.go
+++ b/markdown.go
@@ -298,7 +298,7 @@ func Markdown(input []byte, renderer Renderer, extensions int) []byte {
 	second := secondPass(p, first)
 
 	if renderer.GetFlags()&HTML_SANITIZE_OUTPUT != 0 {
-		second = sanitizeHtml(second)
+		second = sanitizeHtmlSafe(second)
 	}
 
 	return second
diff --git a/sanitize.go b/sanitize.go
new file mode 100644
index 0000000..6e72fa8
--- /dev/null
+++ b/sanitize.go
@@ -0,0 +1,127 @@
+package blackfriday
+
+import (
+	"bufio"
+	"bytes"
+	"code.google.com/p/go.net/html"
+	"fmt"
+	"io"
+)
+
+// Whitelists consulted by sanitizeHtmlSafe and protocolAllowed. Each one is
+// built by its package-level initializer below.
+var (
+	// whitelistTags is the set of element names that are emitted as real markup.
+	whitelistTags = toSet([]string{
+		"a", "b", "blockquote", "br", "caption", "cite", "code", "col",
+		"colgroup", "dd", "div", "dl", "dt", "em",
+		"h1", "h2", "h3", "h4", "h5", "h6",
+		"i", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong",
+		"sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "u",
+		"ul"})
+
+	// whitelistAttrs maps a tag name to the attributes that are kept on that tag.
+	whitelistAttrs = map[string]map[string]bool{
+		"a":   toSet([]string{"href", "title"}),
+		"img": toSet([]string{"src", "alt", "title"}),
+	}
+
+	// protocolAttrs maps a tag name to its attributes checked by protocolAllowed.
+	protocolAttrs = map[string]map[string]bool{
+		"a":   toSet([]string{"href"}),
+		"img": toSet([]string{"src"}),
+	}
+
+	// whitelistProtocols lists the prefixes a protocol-checked value may start with.
+	whitelistProtocols = [][]byte{
+		[]byte("http://"),
+		[]byte("https://"),
+		[]byte("ftp://"),
+		[]byte("mailto:"),
+	}
+)
+
+func toSet(keys []string) map[string]bool {
+	set := make(map[string]bool, len(keys))
+	for i := range keys {
+		set[keys[i]] = true // membership marker; absent keys read back as false
+	}
+	return set
+}
+
+// sanitizeHtmlSafe sanitizes input by tokenizing it as HTML5 and re-emitting
+// only whitelisted elements and attributes. All other markup, including
+// comments and doctypes, is escaped; unsafe attributes are stripped.
+func sanitizeHtmlSafe(input []byte) []byte {
+	var w bytes.Buffer
+	wr := bufio.NewWriter(&w)
+	tokenizer := html.NewTokenizer(bytes.NewReader(input))
+
+	// Iterate through all tokens in the input stream and sanitize them.
+	for t := tokenizer.Next(); t != html.ErrorToken; t = tokenizer.Next() {
+		switch t {
+		case html.TextToken:
+			// Text is written escaped.
+			wr.WriteString(tokenizer.Token().String())
+		case html.StartTagToken, html.SelfClosingTagToken:
+			// Tags are escaped unless whitelisted; self-closing tags keep their "/".
+			tag, hasAttributes := tokenizer.TagName()
+			tagName := string(tag)
+			if !whitelistTags[tagName] {
+				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
+				break
+			}
+			wr.WriteString("<")
+			wr.Write(tag)
+			for hasAttributes {
+				var key, val []byte
+				key, val, hasAttributes = tokenizer.TagAttr()
+				attrName := string(key)
+				// Only include whitelisted attributes for the given tagName.
+				if !whitelistAttrs[tagName][attrName] {
+					continue
+				}
+				// Protocol-checked attributes are stripped unless known to be safe.
+				if protocolAttrs[tagName][attrName] && !protocolAllowed(val) {
+					continue
+				}
+				wr.WriteByte(' ')
+				wr.Write(key)
+				wr.WriteString(`="`)
+				wr.WriteString(html.EscapeString(string(val)))
+				wr.WriteByte('"')
+			}
+			if t == html.SelfClosingTagToken {
+				wr.WriteString("/")
+			}
+			wr.WriteString(">")
+		case html.EndTagToken:
+			// Whitelisted tokens can be written in raw.
+			tag, _ := tokenizer.TagName()
+			if whitelistTags[string(tag)] {
+				wr.Write(tokenizer.Raw())
+			} else {
+				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
+			}
+		case html.CommentToken, html.DoctypeToken:
+			// Never whitelisted: escape them instead of hitting the panic below.
+			wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
+		default:
+			panic(fmt.Errorf("Unexpected token type %v", t))
+		}
+	}
+	if err := tokenizer.Err(); err != nil && err != io.EOF {
+		panic(err)
+	}
+	wr.Flush()
+	return w.Bytes()
+}
+
+func protocolAllowed(attr []byte) bool {
+	for _, prefix := range whitelistProtocols {
+		if n := len(prefix); len(attr) >= n && bytes.EqualFold(attr[:n], prefix) {
+			return true // URL schemes are case-insensitive, so "HTTP://" matches too
+		}
+	}
+	return false
+}