From 41251715adafaa3a6a5b0e624d92fc53fee525fa Mon Sep 17 00:00:00 2001 From: Martin Probst Date: Sun, 27 Apr 2014 23:40:44 +0200 Subject: [PATCH] Use go.net/html's parser to sanitize HTML. Use an HTML5 compliant parser that interprets HTML as a browser would to parse the Markdown result and then sanitize based on the result. Escape unrecognized and disallowed HTML in the result. Currently works with a hard coded whitelist of safe HTML tags and attributes. --- html.go | 71 --------------------------- inline.go | 1 + inline_test.go | 104 ++++++++++++++++++++-------------------- markdown.go | 2 +- sanitize.go | 127 +++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 181 insertions(+), 124 deletions(-) create mode 100644 sanitize.go diff --git a/html.go b/html.go index 54744c2..0c85530 100644 --- a/html.go +++ b/html.go @@ -43,52 +43,12 @@ const ( ) var ( - tags = []string{ - "b", - "blockquote", - "code", - "del", - "dd", - "dl", - "dt", - "em", - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - "i", - "kbd", - "li", - "ol", - "p", - "pre", - "s", - "sup", - "sub", - "strong", - "strike", - "ul", - "table", - "tr", - "td", - "th", - "thead", - "tbody", - - } - alignments = []string{ "left", "right", "center", } - urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+` - tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)(\salign="(` + strings.Join(alignments, "|") + `)")?>|<(br|hr)\s?\/?>)$`) - anchorClean = regexp.MustCompile(`^(]+")?\s?>|<\/a>)$`) - imgClean = regexp.MustCompile(`^(]*")?(\stitle="[^"<>]*")?\s?\/?>)$`) // TODO: improve this regexp to catch all possible entities: htmlEntity = regexp.MustCompile(`&[a-z]{2,5};`) ) @@ -820,24 +780,6 @@ func findHtmlTagPos(tag []byte, tagname string) (bool, int) { return false, -1 } -func sanitizeHtml(html []byte) []byte { - var result []byte - for string(html) != "" { - skip, tag, rest := findHtmlTag(html) - html = rest - result = append(result, skip...) - result = append(result, sanitizeTag(tag)...) - } - return append(result, []byte("\n")...) -} - -func sanitizeTag(tag []byte) []byte { - if tagWhitelist.Match(tag) || anchorClean.Match(tag) || imgClean.Match(tag) { - return tag - } - return []byte("") -} - func skipUntilChar(text []byte, start int, char byte) int { i := start for i < len(text) && text[i] != char { @@ -846,19 +788,6 @@ func skipUntilChar(text []byte, start int, char byte) int { return i } -func findHtmlTag(html []byte) (skip, tag, rest []byte) { - start := skipUntilChar(html, 0, '<') - rightAngle := skipUntilCharIgnoreQuotes(html, start, '>') - if rightAngle > start { - skip = html[0:start] - tag = html[start : rightAngle+1] - rest = html[rightAngle+1:] - return - } - - return []byte(""), []byte(""), []byte("") -} - func skipSpace(tag []byte, i int) int { for i < len(tag) && isspace(tag[i]) { i++ diff --git a/inline.go b/inline.go index 41225ce..b3aaf0f 100644 --- a/inline.go +++ b/inline.go @@ -20,6 +20,7 @@ import ( ) var ( + urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+` anchorRe = regexp.MustCompile(`^(]+")?\s?>` + urlRe + `<\/a>)`) ) diff --git a/inline_test.go b/inline_test.go index 478fbe0..1937021 100644 --- a/inline_test.go +++ b/inline_test.go @@ -72,135 +72,135 @@ func doTestsInlineParam(t *testing.T, tests []string, extensions, htmlFlags int) func TestRawHtmlTag(t *testing.T) { tests := []string{ "zz \n", - "

zz p {}

\n", + "

zz <style>p {}</style>

\n", "zz \n", - "

zz p {}

\n", + "

zz <style>p {}</style>

\n", "\n", - "

alert()

\n", + "

<script>alert()</script>

\n", "zz \n", - "

zz alert()

\n", + "

zz <script>alert()</script>

\n", "zz \n", - "

zz alert()

\n", + "

zz <script>alert()</script>

\n", " \n", - "

alert()

\n", + "

<script>alert()</script>

\n", "\n", - "alert()\n", + "<script>alert()</script>\n", "\n", - "\n", + "<script src='foo'></script>\n", "\n", - "\n", + "<script src='a>b'></script>\n", "zz \n", - "

zz

\n", + "

zz <script src='foo'></script>

\n", "zz \n", - "

zz

\n", + "

zz <script src=foo></script>

\n", ``, - "\n", + "<script><script src="http://example.com/exploit.js"></script></script>\n", `'';!--"=&{()}`, - "

'';!--"=&{()}

\n", + "

'';!--"<xss>=&{()}

\n", "", - "

\n", + "

<script SRC=http://ha.ckers.org/xss.js></script>

\n", "", - "

\n", + "

<script \nSRC=http://ha.ckers.org/xss.js></script>

\n", ``, - "

\n", + "

\n", "", - "

\n", + "

\n", "", - "

\n", + "

\n", "", - "

\n", + "

\n", `xss link`, - "

xss link

\n", + "

xss link

\n", "xss link", - "

xss link

\n", + "

xss link

\n", - // XXX: this doesn't pass yet - //`">`, - //"

\n", + `">`, + "

<script>alert(&quot;XSS&quot;)</script>">

\n", "", - "

\n", + "

\n", ``, - "

\n", + "

\n", ``, - "

\n", + "

\n", ``, - "

\n", + "

\n", "", - "

\n", + "

\n", "", - "

\n", + "

\n", "", - "

\n", + "

\n", ``, - "

\n", + "

\n", ``, - "

\n", + "

\n", ``, - "

\n", + "

\n", ``, - "

\n", + "

\n", ``, - "

\n", + "

\n", ``, - "

\n", + "

<script/XSS SRC="http://ha.ckers.org/xss.js"></script>

\n", - // XXX: this doesn't pass yet - //"", - //"\n", + "", + "

<body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")>

\n", ``, - "

\n", + "

<script/SRC="http://ha.ckers.org/xss.js"></script>

\n", - // XXX: this doesn't pass yet - //`<`, - //"", + // HTML5 interprets the `, + "

<<script>alert(&quot;XSS&quot;);//&lt;</script>

\n", + // HTML5 parses the

within an unclosed